Message ID | 1458555400-22859-2-git-send-email-david.s.gordon@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 21/03/2016 10:16, Dave Gordon wrote: > From: Arun Siluvery <arun.siluvery@linux.intel.com> > > Due to timing issues in the HW some of the status bits required for GuC > authentication doesn't get set occassionally, when that happens, GuC cannot > be initialized and we will be left with a wedged GPU. The WA suggested is > to perform a soft reset of GuC and attempt to reload the fw again for few > times before giving up. > > As the failure is dependent on timing, tests performed by triggering manual > full gpu reset (i915_wedged) showed that we could sometimes hit this after > several thousand iterations but sometimes tests ran even longer without any > issues. Reset and reload mechanism proved helpful when we indeed hit fw > load failure so it is better to include this to improve driver stability. > > This change implements the following WA, > > WaEnableuKernelHeaderValidFix:skl,bxt > WaEnableGuCBootHashCheckNotSet:skl,bxt > > Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com> > Signed-off-by: Dave Gordon <david.s.gordon@intel.com> > Cc: Alex Dai <yu.dai@intel.com> > --- This patch was previously reviewed by Alex, https://patchwork.freedesktop.org/patch/76122/ > drivers/gpu/drm/i915/i915_drv.h | 1 + > drivers/gpu/drm/i915/i915_guc_reg.h | 1 + > drivers/gpu/drm/i915/i915_reg.h | 1 + > drivers/gpu/drm/i915/intel_guc_loader.c | 49 +++++++++++++++++++++++++++++++-- > drivers/gpu/drm/i915/intel_uncore.c | 19 +++++++++++++ > 5 files changed, 69 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 00c41a4..5418850 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -2711,6 +2711,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd, > extern int intel_gpu_reset(struct drm_device *dev, u32 engine_mask); > extern bool intel_has_gpu_reset(struct drm_device *dev); > extern int i915_reset(struct drm_device *dev); > +extern int intel_guc_reset(struct drm_i915_private *dev_priv); > extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv); > extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv); > extern unsigned long i915_gfx_val(struct drm_i915_private *dev_priv); > diff --git a/drivers/gpu/drm/i915/i915_guc_reg.h b/drivers/gpu/drm/i915/i915_guc_reg.h > index e4ba582..94ceee5 100644 > --- a/drivers/gpu/drm/i915/i915_guc_reg.h > +++ b/drivers/gpu/drm/i915/i915_guc_reg.h > @@ -27,6 +27,7 @@ > /* Definitions of GuC H/W registers, bits, etc */ > > #define GUC_STATUS _MMIO(0xc000) > +#define GS_MIA_IN_RESET (1 << 0) > #define GS_BOOTROM_SHIFT 1 > #define GS_BOOTROM_MASK (0x7F << GS_BOOTROM_SHIFT) > #define GS_BOOTROM_RSA_FAILED (0x50 << GS_BOOTROM_SHIFT) > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h > index 07e0449..cc71ca2 100644 > --- a/drivers/gpu/drm/i915/i915_reg.h > +++ b/drivers/gpu/drm/i915/i915_reg.h > @@ -165,6 +165,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) > #define GEN6_GRDOM_MEDIA (1 << 2) > #define GEN6_GRDOM_BLT (1 << 3) > #define GEN6_GRDOM_VECS (1 << 4) > +#define GEN9_GRDOM_GUC (1 << 5) > #define GEN8_GRDOM_MEDIA2 (1 << 7) In the original patch GEN9_GRDOM_GUC was defined like above but during tdr patch review, option of arranging them according to gen was explored. In that case GEN9_GRDOM_GUC will be at the end. regards Arun > > #define RING_PP_DIR_BASE(ring) _MMIO((ring)->mmio_base+0x228) > diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c b/drivers/gpu/drm/i915/intel_guc_loader.c > index e1aff62..a07c228 100644 > --- a/drivers/gpu/drm/i915/intel_guc_loader.c > +++ b/drivers/gpu/drm/i915/intel_guc_loader.c > @@ -353,6 +353,24 @@ static int guc_ucode_xfer(struct drm_i915_private *dev_priv) > return ret; > } > > +static int i915_reset_guc(struct drm_i915_private *dev_priv) > +{ > + int ret; > + u32 guc_status; > + > + ret = intel_guc_reset(dev_priv); > + if (ret) { > + DRM_ERROR("GuC reset failed, ret = %d\n", ret); > + return ret; > + } > + > + guc_status = I915_READ(GUC_STATUS); > + WARN(!(guc_status & GS_MIA_IN_RESET), > + "GuC status: 0x%x, MIA core expected to be in reset\n", guc_status); > + > + return ret; > +} > + > /** > * intel_guc_ucode_load() - load GuC uCode into the device > * @dev: drm device > @@ -417,9 +435,36 @@ int intel_guc_ucode_load(struct drm_device *dev) > if (err) > goto fail; > > + /* > + * WaEnableuKernelHeaderValidFix:skl,bxt > + * For BXT, this is only upto B0 but below WA is required for later > + * steppings also so this is extended as well. > + */ > + /* WaEnableGuCBootHashCheckNotSet:skl,bxt */ > err = guc_ucode_xfer(dev_priv); > - if (err) > - goto fail; > + if (err) { > + int retries = 3; > + > + DRM_ERROR("GuC fw load failed, err=%d, attempting reset and retry\n", err); > + > + while (retries--) { > + err = i915_reset_guc(dev_priv); > + if (err) > + break; > + > + err = guc_ucode_xfer(dev_priv); > + if (!err) { > + DRM_DEBUG_DRIVER("GuC fw reload succeeded after reset\n"); > + break; > + } > + DRM_DEBUG_DRIVER("GuC fw reload retries left: %d\n", retries); > + } > + > + if (err) { > + DRM_ERROR("GuC fw reload attempt failed, ret=%d\n", err); > + goto fail; > + } > + } > > guc_fw->guc_fw_load_status = GUC_FIRMWARE_SUCCESS; > > diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c > index 512b7fa..d44e07e 100644 > --- a/drivers/gpu/drm/i915/intel_uncore.c > +++ b/drivers/gpu/drm/i915/intel_uncore.c > @@ -1673,6 +1673,25 @@ bool intel_has_gpu_reset(struct drm_device *dev) > return intel_get_gpu_reset(dev) != NULL; > } > > +int intel_guc_reset(struct drm_i915_private *dev_priv) > +{ > + int ret; > + unsigned long irqflags; > + > + if (!i915.enable_guc_submission) > + return -EINVAL; > + > + intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL); > + spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); > + > + ret = gen6_hw_domain_reset(dev_priv, GEN9_GRDOM_GUC); > + > + spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); > + intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); > + > + return ret; > +} > + > bool intel_uncore_unclaimed_mmio(struct drm_i915_private *dev_priv) > { > return check_for_unclaimed_mmio(dev_priv); >
On 21/03/16 16:58, Arun Siluvery wrote: > On 21/03/2016 10:16, Dave Gordon wrote: >> From: Arun Siluvery <arun.siluvery@linux.intel.com> >> >> Due to timing issues in the HW some of the status bits required for GuC >> authentication doesn't get set occassionally, when that happens, GuC >> cannot >> be initialized and we will be left with a wedged GPU. The WA suggested is >> to perform a soft reset of GuC and attempt to reload the fw again for few >> times before giving up. >> >> As the failure is dependent on timing, tests performed by triggering >> manual >> full gpu reset (i915_wedged) showed that we could sometimes hit this >> after >> several thousand iterations but sometimes tests ran even longer >> without any >> issues. Reset and reload mechanism proved helpful when we indeed hit fw >> load failure so it is better to include this to improve driver stability. >> >> This change implements the following WA, >> >> WaEnableuKernelHeaderValidFix:skl,bxt >> WaEnableGuCBootHashCheckNotSet:skl,bxt >> >> Signed-off-by: Arun Siluvery <arun.siluvery@linux.intel.com> >> Signed-off-by: Dave Gordon <david.s.gordon@intel.com> >> Cc: Alex Dai <yu.dai@intel.com> >> --- > This patch was previously reviewed by Alex, > > https://patchwork.freedesktop.org/patch/76122/ OK, I'm going to repost just the first two from this set, tagged with R-Bs from Alex & you, and the Bugzilla reference; then the rest can form a new sequence for Alex to review when he gets back from vacation. >> diff --git a/drivers/gpu/drm/i915/i915_reg.h >> b/drivers/gpu/drm/i915/i915_reg.h >> index 07e0449..cc71ca2 100644 >> --- a/drivers/gpu/drm/i915/i915_reg.h >> +++ b/drivers/gpu/drm/i915/i915_reg.h >> @@ -165,6 +165,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t >> reg) >> #define GEN6_GRDOM_MEDIA (1 << 2) >> #define GEN6_GRDOM_BLT (1 << 3) >> #define GEN6_GRDOM_VECS (1 << 4) >> +#define GEN9_GRDOM_GUC (1 << 5) >> #define GEN8_GRDOM_MEDIA2 (1 << 7) > In the original patch GEN9_GRDOM_GUC was defined like above but during > tdr patch review, option of arranging them according to gen was > explored. In that case GEN9_GRDOM_GUC will be at the end. > > regards > Arun I think they're better in bit-number order, because that way it's easier to see that you haven't accidentally got overlapping definitions. .Dave.
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 00c41a4..5418850 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2711,6 +2711,7 @@ extern long i915_compat_ioctl(struct file *filp, unsigned int cmd, extern int intel_gpu_reset(struct drm_device *dev, u32 engine_mask); extern bool intel_has_gpu_reset(struct drm_device *dev); extern int i915_reset(struct drm_device *dev); +extern int intel_guc_reset(struct drm_i915_private *dev_priv); extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv); extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv); extern unsigned long i915_gfx_val(struct drm_i915_private *dev_priv); diff --git a/drivers/gpu/drm/i915/i915_guc_reg.h b/drivers/gpu/drm/i915/i915_guc_reg.h index e4ba582..94ceee5 100644 --- a/drivers/gpu/drm/i915/i915_guc_reg.h +++ b/drivers/gpu/drm/i915/i915_guc_reg.h @@ -27,6 +27,7 @@ /* Definitions of GuC H/W registers, bits, etc */ #define GUC_STATUS _MMIO(0xc000) +#define GS_MIA_IN_RESET (1 << 0) #define GS_BOOTROM_SHIFT 1 #define GS_BOOTROM_MASK (0x7F << GS_BOOTROM_SHIFT) #define GS_BOOTROM_RSA_FAILED (0x50 << GS_BOOTROM_SHIFT) diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h index 07e0449..cc71ca2 100644 --- a/drivers/gpu/drm/i915/i915_reg.h +++ b/drivers/gpu/drm/i915/i915_reg.h @@ -165,6 +165,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg) #define GEN6_GRDOM_MEDIA (1 << 2) #define GEN6_GRDOM_BLT (1 << 3) #define GEN6_GRDOM_VECS (1 << 4) +#define GEN9_GRDOM_GUC (1 << 5) #define GEN8_GRDOM_MEDIA2 (1 << 7) #define RING_PP_DIR_BASE(ring) _MMIO((ring)->mmio_base+0x228) diff --git a/drivers/gpu/drm/i915/intel_guc_loader.c b/drivers/gpu/drm/i915/intel_guc_loader.c index e1aff62..a07c228 100644 --- a/drivers/gpu/drm/i915/intel_guc_loader.c +++ b/drivers/gpu/drm/i915/intel_guc_loader.c @@ -353,6 +353,24 @@ static int guc_ucode_xfer(struct drm_i915_private *dev_priv) return ret; } +static int i915_reset_guc(struct drm_i915_private *dev_priv) +{ + int ret; + u32 guc_status; + + ret = intel_guc_reset(dev_priv); + if (ret) { + DRM_ERROR("GuC reset failed, ret = %d\n", ret); + return ret; + } + + guc_status = I915_READ(GUC_STATUS); + WARN(!(guc_status & GS_MIA_IN_RESET), + "GuC status: 0x%x, MIA core expected to be in reset\n", guc_status); + + return ret; +} + /** * intel_guc_ucode_load() - load GuC uCode into the device * @dev: drm device @@ -417,9 +435,36 @@ int intel_guc_ucode_load(struct drm_device *dev) if (err) goto fail; + /* + * WaEnableuKernelHeaderValidFix:skl,bxt + * For BXT, this is only upto B0 but below WA is required for later + * steppings also so this is extended as well. + */ + /* WaEnableGuCBootHashCheckNotSet:skl,bxt */ err = guc_ucode_xfer(dev_priv); - if (err) - goto fail; + if (err) { + int retries = 3; + + DRM_ERROR("GuC fw load failed, err=%d, attempting reset and retry\n", err); + + while (retries--) { + err = i915_reset_guc(dev_priv); + if (err) + break; + + err = guc_ucode_xfer(dev_priv); + if (!err) { + DRM_DEBUG_DRIVER("GuC fw reload succeeded after reset\n"); + break; + } + DRM_DEBUG_DRIVER("GuC fw reload retries left: %d\n", retries); + } + + if (err) { + DRM_ERROR("GuC fw reload attempt failed, ret=%d\n", err); + goto fail; + } + } guc_fw->guc_fw_load_status = GUC_FIRMWARE_SUCCESS; diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index 512b7fa..d44e07e 100644 --- a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -1673,6 +1673,25 @@ bool intel_has_gpu_reset(struct drm_device *dev) return intel_get_gpu_reset(dev) != NULL; } +int intel_guc_reset(struct drm_i915_private *dev_priv) +{ + int ret; + unsigned long irqflags; + + if (!i915.enable_guc_submission) + return -EINVAL; + + intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL); + spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); + + ret = gen6_hw_domain_reset(dev_priv, GEN9_GRDOM_GUC); + + spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); + intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); + + return ret; +} + bool intel_uncore_unclaimed_mmio(struct drm_i915_private *dev_priv) { return check_for_unclaimed_mmio(dev_priv);