diff mbox

[1/7] drm/i915/guc: Reset GuC and retry on firmware load failure

Message ID 1458555400-22859-2-git-send-email-david.s.gordon@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Dave Gordon March 21, 2016, 10:16 a.m. UTC
From: Arun Siluvery <arun.siluvery@linux.intel.com>

Due to timing issues in the HW some of the status bits required for GuC
authentication doesn't get set occassionally, when that happens, GuC cannot
be initialized and we will be left with a wedged GPU. The WA suggested is
to perform a soft reset of GuC and attempt to reload the fw again for few
times before giving up.

As the failure is dependent on timing, tests performed by triggering manual
full gpu reset (i915_wedged) showed that we could sometimes hit this after
several thousand iterations but sometimes tests ran even longer without any
issues. Reset and reload mechanism proved helpful when we indeed hit fw
load failure so it is better to include this to improve driver stability.

This change implements the following WA,

WaEnableuKernelHeaderValidFix:skl,bxt
WaEnableGuCBootHashCheckNotSet:skl,bxt

Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
Cc: Alex Dai <yu.dai@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h         |  1 +
 drivers/gpu/drm/i915/i915_guc_reg.h     |  1 +
 drivers/gpu/drm/i915/i915_reg.h         |  1 +
 drivers/gpu/drm/i915/intel_guc_loader.c | 49 +++++++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/intel_uncore.c     | 19 +++++++++++++
 5 files changed, 69 insertions(+), 2 deletions(-)

Comments

arun.siluvery@linux.intel.com March 21, 2016, 4:58 p.m. UTC | #1
On 21/03/2016 10:16, Dave Gordon wrote:
> From: Arun Siluvery <arun.siluvery@linux.intel.com>
>
> Due to timing issues in the HW some of the status bits required for GuC
> authentication doesn't get set occassionally, when that happens, GuC cannot
> be initialized and we will be left with a wedged GPU. The WA suggested is
> to perform a soft reset of GuC and attempt to reload the fw again for few
> times before giving up.
>
> As the failure is dependent on timing, tests performed by triggering manual
> full gpu reset (i915_wedged) showed that we could sometimes hit this after
> several thousand iterations but sometimes tests ran even longer without any
> issues. Reset and reload mechanism proved helpful when we indeed hit fw
> load failure so it is better to include this to improve driver stability.
>
> This change implements the following WA,
>
> WaEnableuKernelHeaderValidFix:skl,bxt
> WaEnableGuCBootHashCheckNotSet:skl,bxt
>
> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
> Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
> Cc: Alex Dai <yu.dai@intel.com>
> ---
This patch was previously reviewed by Alex,

https://patchwork.freedesktop.org/patch/76122/

>   drivers/gpu/drm/i915/i915_drv.h         |  1 +
>   drivers/gpu/drm/i915/i915_guc_reg.h     |  1 +
>   drivers/gpu/drm/i915/i915_reg.h         |  1 +
>   drivers/gpu/drm/i915/intel_guc_loader.c | 49 +++++++++++++++++++++++++++++++--
>   drivers/gpu/drm/i915/intel_uncore.c     | 19 +++++++++++++
>   5 files changed, 69 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 00c41a4..5418850 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2711,6 +2711,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
>   extern int intel_gpu_reset(struct drm_device *dev, u32 engine_mask);
>   extern bool intel_has_gpu_reset(struct drm_device *dev);
>   extern int i915_reset(struct drm_device *dev);
> +extern int intel_guc_reset(struct drm_i915_private *dev_priv);
>   extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
>   extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv);
>   extern unsigned long i915_gfx_val(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_guc_reg.h b/drivers/gpu/drm/i915/i915_guc_reg.h
> index e4ba582..94ceee5 100644
> --- a/drivers/gpu/drm/i915/i915_guc_reg.h
> +++ b/drivers/gpu/drm/i915/i915_guc_reg.h
> @@ -27,6 +27,7 @@
>   /* Definitions of GuC H/W registers, bits, etc */
>
>   #define GUC_STATUS			_MMIO(0xc000)
> +#define   GS_MIA_IN_RESET		(1 << 0)
>   #define   GS_BOOTROM_SHIFT		1
>   #define   GS_BOOTROM_MASK		  (0x7F << GS_BOOTROM_SHIFT)
>   #define   GS_BOOTROM_RSA_FAILED		  (0x50 << GS_BOOTROM_SHIFT)
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index 07e0449..cc71ca2 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -165,6 +165,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
>   #define  GEN6_GRDOM_MEDIA		(1 << 2)
>   #define  GEN6_GRDOM_BLT			(1 << 3)
>   #define  GEN6_GRDOM_VECS		(1 << 4)
> +#define  GEN9_GRDOM_GUC			(1 << 5)
>   #define  GEN8_GRDOM_MEDIA2		(1 << 7)
In the original patch GEN9_GRDOM_GUC was defined like above but during 
tdr patch review, option of arranging them according to gen was 
explored. In that case GEN9_GRDOM_GUC will be at the end.

regards
Arun

>
>   #define RING_PP_DIR_BASE(ring)		_MMIO((ring)->mmio_base+0x228)
> diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c b/drivers/gpu/drm/i915/intel_guc_loader.c
> index e1aff62..a07c228 100644
> --- a/drivers/gpu/drm/i915/intel_guc_loader.c
> +++ b/drivers/gpu/drm/i915/intel_guc_loader.c
> @@ -353,6 +353,24 @@ static int guc_ucode_xfer(struct drm_i915_private *dev_priv)
>   	return ret;
>   }
>
> +static int i915_reset_guc(struct drm_i915_private *dev_priv)
> +{
> +	int ret;
> +	u32 guc_status;
> +
> +	ret = intel_guc_reset(dev_priv);
> +	if (ret) {
> +		DRM_ERROR("GuC reset failed, ret = %d\n", ret);
> +		return ret;
> +	}
> +
> +	guc_status = I915_READ(GUC_STATUS);
> +	WARN(!(guc_status & GS_MIA_IN_RESET),
> +	     "GuC status: 0x%x, MIA core expected to be in reset\n", guc_status);
> +
> +	return ret;
> +}
> +
>   /**
>    * intel_guc_ucode_load() - load GuC uCode into the device
>    * @dev:	drm device
> @@ -417,9 +435,36 @@ int intel_guc_ucode_load(struct drm_device *dev)
>   	if (err)
>   		goto fail;
>
> +	/*
> +	 * WaEnableuKernelHeaderValidFix:skl,bxt
> +	 * For BXT, this is only upto B0 but below WA is required for later
> +	 * steppings also so this is extended as well.
> +	 */
> +	/* WaEnableGuCBootHashCheckNotSet:skl,bxt */
>   	err = guc_ucode_xfer(dev_priv);
> -	if (err)
> -		goto fail;
> +	if (err) {
> +		int retries = 3;
> +
> +		DRM_ERROR("GuC fw load failed, err=%d, attempting reset and retry\n", err);
> +
> +		while (retries--) {
> +			err = i915_reset_guc(dev_priv);
> +			if (err)
> +				break;
> +
> +			err = guc_ucode_xfer(dev_priv);
> +			if (!err) {
> +				DRM_DEBUG_DRIVER("GuC fw reload succeeded after reset\n");
> +				break;
> +			}
> +			DRM_DEBUG_DRIVER("GuC fw reload retries left: %d\n", retries);
> +		}
> +
> +		if (err) {
> +			DRM_ERROR("GuC fw reload attempt failed, ret=%d\n", err);
> +			goto fail;
> +		}
> +	}
>
>   	guc_fw->guc_fw_load_status = GUC_FIRMWARE_SUCCESS;
>
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index 512b7fa..d44e07e 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1673,6 +1673,25 @@ bool intel_has_gpu_reset(struct drm_device *dev)
>   	return intel_get_gpu_reset(dev) != NULL;
>   }
>
> +int intel_guc_reset(struct drm_i915_private *dev_priv)
> +{
> +	int ret;
> +	unsigned long irqflags;
> +
> +	if (!i915.enable_guc_submission)
> +		return -EINVAL;
> +
> +	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
> +	spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
> +
> +	ret = gen6_hw_domain_reset(dev_priv, GEN9_GRDOM_GUC);
> +
> +	spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
> +	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
> +
> +	return ret;
> +}
> +
>   bool intel_uncore_unclaimed_mmio(struct drm_i915_private *dev_priv)
>   {
>   	return check_for_unclaimed_mmio(dev_priv);
>
Dave Gordon March 23, 2016, 4:03 p.m. UTC | #2
On 21/03/16 16:58, Arun Siluvery wrote:
> On 21/03/2016 10:16, Dave Gordon wrote:
>> From: Arun Siluvery <arun.siluvery@linux.intel.com>
>>
>> Due to timing issues in the HW some of the status bits required for GuC
>> authentication doesn't get set occassionally, when that happens, GuC
>> cannot
>> be initialized and we will be left with a wedged GPU. The WA suggested is
>> to perform a soft reset of GuC and attempt to reload the fw again for few
>> times before giving up.
>>
>> As the failure is dependent on timing, tests performed by triggering
>> manual
>> full gpu reset (i915_wedged) showed that we could sometimes hit this
>> after
>> several thousand iterations but sometimes tests ran even longer
>> without any
>> issues. Reset and reload mechanism proved helpful when we indeed hit fw
>> load failure so it is better to include this to improve driver stability.
>>
>> This change implements the following WA,
>>
>> WaEnableuKernelHeaderValidFix:skl,bxt
>> WaEnableGuCBootHashCheckNotSet:skl,bxt
>>
>> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com>
>> Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
>> Cc: Alex Dai <yu.dai@intel.com>
>> ---
> This patch was previously reviewed by Alex,
>
> https://patchwork.freedesktop.org/patch/76122/

OK, I'm going to repost just the first two from this set, tagged with 
R-Bs from Alex & you, and the Bugzilla reference; then the rest can form 
a new sequence for Alex to review when he gets back from vacation.

>> diff --git a/drivers/gpu/drm/i915/i915_reg.h
>> b/drivers/gpu/drm/i915/i915_reg.h
>> index 07e0449..cc71ca2 100644
>> --- a/drivers/gpu/drm/i915/i915_reg.h
>> +++ b/drivers/gpu/drm/i915/i915_reg.h
>> @@ -165,6 +165,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t
>> reg)
>>   #define  GEN6_GRDOM_MEDIA        (1 << 2)
>>   #define  GEN6_GRDOM_BLT            (1 << 3)
>>   #define  GEN6_GRDOM_VECS        (1 << 4)
>> +#define  GEN9_GRDOM_GUC            (1 << 5)
>>   #define  GEN8_GRDOM_MEDIA2        (1 << 7)
> In the original patch GEN9_GRDOM_GUC was defined like above but during
> tdr patch review, option of arranging them according to gen was
> explored. In that case GEN9_GRDOM_GUC will be at the end.
>
> regards
> Arun

I think they're better in bit-number order, because that way it's easier 
to see that you haven't accidentally got overlapping definitions.

.Dave.
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 00c41a4..5418850 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2711,6 +2711,7 @@  extern long i915_compat_ioctl(struct file *filp, unsigned int cmd,
 extern int intel_gpu_reset(struct drm_device *dev, u32 engine_mask);
 extern bool intel_has_gpu_reset(struct drm_device *dev);
 extern int i915_reset(struct drm_device *dev);
+extern int intel_guc_reset(struct drm_i915_private *dev_priv);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
 extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv);
 extern unsigned long i915_gfx_val(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_guc_reg.h b/drivers/gpu/drm/i915/i915_guc_reg.h
index e4ba582..94ceee5 100644
--- a/drivers/gpu/drm/i915/i915_guc_reg.h
+++ b/drivers/gpu/drm/i915/i915_guc_reg.h
@@ -27,6 +27,7 @@ 
 /* Definitions of GuC H/W registers, bits, etc */
 
 #define GUC_STATUS			_MMIO(0xc000)
+#define   GS_MIA_IN_RESET		(1 << 0)
 #define   GS_BOOTROM_SHIFT		1
 #define   GS_BOOTROM_MASK		  (0x7F << GS_BOOTROM_SHIFT)
 #define   GS_BOOTROM_RSA_FAILED		  (0x50 << GS_BOOTROM_SHIFT)
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 07e0449..cc71ca2 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -165,6 +165,7 @@  static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define  GEN6_GRDOM_MEDIA		(1 << 2)
 #define  GEN6_GRDOM_BLT			(1 << 3)
 #define  GEN6_GRDOM_VECS		(1 << 4)
+#define  GEN9_GRDOM_GUC			(1 << 5)
 #define  GEN8_GRDOM_MEDIA2		(1 << 7)
 
 #define RING_PP_DIR_BASE(ring)		_MMIO((ring)->mmio_base+0x228)
diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c b/drivers/gpu/drm/i915/intel_guc_loader.c
index e1aff62..a07c228 100644
--- a/drivers/gpu/drm/i915/intel_guc_loader.c
+++ b/drivers/gpu/drm/i915/intel_guc_loader.c
@@ -353,6 +353,24 @@  static int guc_ucode_xfer(struct drm_i915_private *dev_priv)
 	return ret;
 }
 
+static int i915_reset_guc(struct drm_i915_private *dev_priv)
+{
+	int ret;
+	u32 guc_status;
+
+	ret = intel_guc_reset(dev_priv);
+	if (ret) {
+		DRM_ERROR("GuC reset failed, ret = %d\n", ret);
+		return ret;
+	}
+
+	guc_status = I915_READ(GUC_STATUS);
+	WARN(!(guc_status & GS_MIA_IN_RESET),
+	     "GuC status: 0x%x, MIA core expected to be in reset\n", guc_status);
+
+	return ret;
+}
+
 /**
  * intel_guc_ucode_load() - load GuC uCode into the device
  * @dev:	drm device
@@ -417,9 +435,36 @@  int intel_guc_ucode_load(struct drm_device *dev)
 	if (err)
 		goto fail;
 
+	/*
+	 * WaEnableuKernelHeaderValidFix:skl,bxt
+	 * For BXT, this is only upto B0 but below WA is required for later
+	 * steppings also so this is extended as well.
+	 */
+	/* WaEnableGuCBootHashCheckNotSet:skl,bxt */
 	err = guc_ucode_xfer(dev_priv);
-	if (err)
-		goto fail;
+	if (err) {
+		int retries = 3;
+
+		DRM_ERROR("GuC fw load failed, err=%d, attempting reset and retry\n", err);
+
+		while (retries--) {
+			err = i915_reset_guc(dev_priv);
+			if (err)
+				break;
+
+			err = guc_ucode_xfer(dev_priv);
+			if (!err) {
+				DRM_DEBUG_DRIVER("GuC fw reload succeeded after reset\n");
+				break;
+			}
+			DRM_DEBUG_DRIVER("GuC fw reload retries left: %d\n", retries);
+		}
+
+		if (err) {
+			DRM_ERROR("GuC fw reload attempt failed, ret=%d\n", err);
+			goto fail;
+		}
+	}
 
 	guc_fw->guc_fw_load_status = GUC_FIRMWARE_SUCCESS;
 
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 512b7fa..d44e07e 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1673,6 +1673,25 @@  bool intel_has_gpu_reset(struct drm_device *dev)
 	return intel_get_gpu_reset(dev) != NULL;
 }
 
+int intel_guc_reset(struct drm_i915_private *dev_priv)
+{
+	int ret;
+	unsigned long irqflags;
+
+	if (!i915.enable_guc_submission)
+		return -EINVAL;
+
+	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
+	spin_lock_irqsave(&dev_priv->uncore.lock, irqflags);
+
+	ret = gen6_hw_domain_reset(dev_priv, GEN9_GRDOM_GUC);
+
+	spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags);
+	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+
+	return ret;
+}
+
 bool intel_uncore_unclaimed_mmio(struct drm_i915_private *dev_priv)
 {
 	return check_for_unclaimed_mmio(dev_priv);