diff mbox

[v2,1/2] drm/i915/guc: Use correct error code for GuC initialization failure

Message ID 20180220225711.12524-1-michal.wajdeczko@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Michal Wajdeczko Feb. 20, 2018, 10:57 p.m. UTC
Since commit 6ca9a2beb54a ("drm/i915: Unwind i915_gem_init() failure")
we believed that we correctly handle all errors encountered during
GuC initialization, including special one that indicates request to
run driver with disabled GPU submission (-EIO).

Unfortunately since commit 121981fafe69 ("drm/i915/guc: Combine
enable_guc_loading|submission modparams") we stopped using that
error code to avoid unwanted fallback to execlist submission mode.

In result any GuC initialization failure was treated as non-recoverable
error leading to driver load abort, so we could not even read related
GuC error log to investigate cause of the problem.

Fix that by always returning -EIO on uC hardware related failure.

v2: don't allow -EIO from uc_init
    don't call uc_fini[_misc] on -EIO
    mark guc fw as failed on hw init failure
    prepare uc_fini_hw to run after earlier -EIO

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michal Winiarski <michal.winiarski@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Sagar Arun Kamble <sagar.a.kamble@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c    | 13 ++++++++-----
 drivers/gpu/drm/i915/intel_guc.h   |  5 +++++
 drivers/gpu/drm/i915/intel_uc.c    | 13 +++++++++----
 drivers/gpu/drm/i915/intel_uc_fw.h |  5 +++++
 4 files changed, 27 insertions(+), 9 deletions(-)

Comments

sagar.a.kamble@intel.com Feb. 21, 2018, 8:08 a.m. UTC | #1
On 2/21/2018 4:27 AM, Michal Wajdeczko wrote:
> Since commit 6ca9a2beb54a ("drm/i915: Unwind i915_gem_init() failure")
> we believed that we correctly handle all errors encountered during
> GuC initialization, including special one that indicates request to
> run driver with disabled GPU submission (-EIO).
>
> Unfortunately since commit 121981fafe69 ("drm/i915/guc: Combine
> enable_guc_loading|submission modparams") we stopped using that
> error code to avoid unwanted fallback to execlist submission mode.
>
> In result any GuC initialization failure was treated as non-recoverable
> error leading to driver load abort, so we could not even read related
> GuC error log to investigate cause of the problem.
>
> Fix that by always returning -EIO on uC hardware related failure.
>
> v2: don't allow -EIO from uc_init
>      don't call uc_fini[_misc] on -EIO
>      mark guc fw as failed on hw init failure
>      prepare uc_fini_hw to run after earlier -EIO
>
> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Michal Winiarski <michal.winiarski@intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> Cc: Sagar Arun Kamble <sagar.a.kamble@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem.c    | 13 ++++++++-----
>   drivers/gpu/drm/i915/intel_guc.h   |  5 +++++
>   drivers/gpu/drm/i915/intel_uc.c    | 13 +++++++++----
>   drivers/gpu/drm/i915/intel_uc_fw.h |  5 +++++
>   4 files changed, 27 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 631a2db..80f23a8 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -5324,8 +5324,10 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
>   	intel_init_gt_powersave(dev_priv);
>   
>   	ret = intel_uc_init(dev_priv);
> -	if (ret)
> +	if (ret) {
> +		GEM_BUG_ON(ret == -EIO);
>   		goto err_pm;
> +	}
>   
>   	ret = i915_gem_init_hw(dev_priv);
>   	if (ret)
> @@ -5372,7 +5374,8 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
>   	i915_gem_contexts_lost(dev_priv);
>   	intel_uc_fini_hw(dev_priv);
This uc_fini_hw should also be not called on -EIO?
>   err_uc_init:
> -	intel_uc_fini(dev_priv);
> +	if (ret != -EIO)
> +		intel_uc_fini(dev_priv);
>   err_pm:
>   	if (ret != -EIO) {
>   		intel_cleanup_gt_powersave(dev_priv);
> @@ -5386,10 +5389,10 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
>   	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
>   	mutex_unlock(&dev_priv->drm.struct_mutex);
>   
> -	intel_uc_fini_misc(dev_priv);
> -
> -	if (ret != -EIO)
> +	if (ret != -EIO) {
> +		intel_uc_fini_misc(dev_priv);
>   		i915_gem_cleanup_userptr(dev_priv);
> +	}
>   
>   	if (ret == -EIO) {
>   		/*
Comment here can be updated to say "Allow engines or uC initialization 
to fail ... "
> diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
> index 52856a9..512ff7b 100644
> --- a/drivers/gpu/drm/i915/intel_guc.h
> +++ b/drivers/gpu/drm/i915/intel_guc.h
> @@ -100,6 +100,11 @@ static inline void intel_guc_notify(struct intel_guc *guc)
>   	guc->notify(guc);
>   }
>   
> +static inline bool intel_guc_is_loaded(struct intel_guc *guc)
> +{
> +	return intel_uc_fw_is_loaded(&guc->fw);
> +}
> +
>   /*
>    * GuC does not allow any gfx GGTT address that falls into range [0, WOPCM_TOP),
>    * which is reserved for Boot ROM, SRAM and WOPCM. Currently this top address is
> diff --git a/drivers/gpu/drm/i915/intel_uc.c b/drivers/gpu/drm/i915/intel_uc.c
> index 9f1bac6..75d0eb9 100644
> --- a/drivers/gpu/drm/i915/intel_uc.c
> +++ b/drivers/gpu/drm/i915/intel_uc.c
> @@ -421,11 +421,13 @@ int intel_uc_init_hw(struct drm_i915_private *dev_priv)
>   	 * Note that there is no fallback as either user explicitly asked for
>   	 * the GuC or driver default option was to run with the GuC enabled.
>   	 */
> -	if (GEM_WARN_ON(ret == -EIO))
> -		ret = -EINVAL;
> -
>   	dev_err(dev_priv->drm.dev, "GuC initialization failed %d\n", ret);
> -	return ret;
> +
> +	/* Mark GuC firmware as failed to avoid redundant clean-up */
uc_fini_hw is not redundant. uc_fini[_misc] was redundant. May be we 
should say
"to avoid clean-up on wedged"
> +	guc->fw.load_status = INTEL_UC_FIRMWARE_FAIL;
> +
> +	/* We want to disable GPU submission but keep KMS alive */
> +	return -EIO;
>   }
>   
>   void intel_uc_fini_hw(struct drm_i915_private *dev_priv)
> @@ -437,6 +439,9 @@ void intel_uc_fini_hw(struct drm_i915_private *dev_priv)
>   
>   	GEM_BUG_ON(!HAS_GUC(dev_priv));
>   
> +	if (!intel_guc_is_loaded(guc))
> +		return;
> +
Can we skip based on i915_terminally_wedged instead?
Similarly wedged check is needed for invoking other portion of 
i915_gem_fini like gem_cleanup_engines, gem_contexts_fini since
we skipped them on -EIO during gem_init.
>   	if (USES_GUC_SUBMISSION(dev_priv))
>   		intel_guc_submission_disable(guc);
>   
> diff --git a/drivers/gpu/drm/i915/intel_uc_fw.h b/drivers/gpu/drm/i915/intel_uc_fw.h
> index d5fd460..0e3b237 100644
> --- a/drivers/gpu/drm/i915/intel_uc_fw.h
> +++ b/drivers/gpu/drm/i915/intel_uc_fw.h
> @@ -115,6 +115,11 @@ static inline bool intel_uc_fw_is_selected(struct intel_uc_fw *uc_fw)
>   	return uc_fw->path != NULL;
>   }
>   
> +static inline bool intel_uc_fw_is_loaded(struct intel_uc_fw *uc_fw)
> +{
> +	return uc_fw->load_status == INTEL_UC_FIRMWARE_SUCCESS;
> +}
> +
>   void intel_uc_fw_fetch(struct drm_i915_private *dev_priv,
>   		       struct intel_uc_fw *uc_fw);
>   int intel_uc_fw_upload(struct intel_uc_fw *uc_fw,
Chris Wilson Feb. 21, 2018, 8:27 a.m. UTC | #2
Quoting Michal Wajdeczko (2018-02-20 22:57:10)
> Since commit 6ca9a2beb54a ("drm/i915: Unwind i915_gem_init() failure")
> we believed that we correctly handle all errors encountered during
> GuC initialization, including special one that indicates request to
> run driver with disabled GPU submission (-EIO).
> 
> Unfortunately since commit 121981fafe69 ("drm/i915/guc: Combine
> enable_guc_loading|submission modparams") we stopped using that
> error code to avoid unwanted fallback to execlist submission mode.
> 
> In result any GuC initialization failure was treated as non-recoverable
> error leading to driver load abort, so we could not even read related
> GuC error log to investigate cause of the problem.
> 
> Fix that by always returning -EIO on uC hardware related failure.
> 
> v2: don't allow -EIO from uc_init
>     don't call uc_fini[_misc] on -EIO
>     mark guc fw as failed on hw init failure
>     prepare uc_fini_hw to run after earlier -EIO
> 
> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Michal Winiarski <michal.winiarski@intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> Cc: Sagar Arun Kamble <sagar.a.kamble@intel.com>

Whilst thinking about this, do you want to disable (or rather prevent
setup of) guc submission if the driver is already wedged?
-Chris
Michal Wajdeczko Feb. 21, 2018, 12:20 p.m. UTC | #3
On Wed, 21 Feb 2018 09:08:08 +0100, Sagar Arun Kamble  
<sagar.a.kamble@intel.com> wrote:

>
>
> On 2/21/2018 4:27 AM, Michal Wajdeczko wrote:
>> Since commit 6ca9a2beb54a ("drm/i915: Unwind i915_gem_init() failure")
>> we believed that we correctly handle all errors encountered during
>> GuC initialization, including special one that indicates request to
>> run driver with disabled GPU submission (-EIO).
>>
>> Unfortunately since commit 121981fafe69 ("drm/i915/guc: Combine
>> enable_guc_loading|submission modparams") we stopped using that
>> error code to avoid unwanted fallback to execlist submission mode.
>>
>> In result any GuC initialization failure was treated as non-recoverable
>> error leading to driver load abort, so we could not even read related
>> GuC error log to investigate cause of the problem.
>>
>> Fix that by always returning -EIO on uC hardware related failure.
>>
>> v2: don't allow -EIO from uc_init
>>      don't call uc_fini[_misc] on -EIO
>>      mark guc fw as failed on hw init failure
>>      prepare uc_fini_hw to run after earlier -EIO
>>
>> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Michal Winiarski <michal.winiarski@intel.com>
>> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
>> Cc: Sagar Arun Kamble <sagar.a.kamble@intel.com>
>> ---
>>   drivers/gpu/drm/i915/i915_gem.c    | 13 ++++++++-----
>>   drivers/gpu/drm/i915/intel_guc.h   |  5 +++++
>>   drivers/gpu/drm/i915/intel_uc.c    | 13 +++++++++----
>>   drivers/gpu/drm/i915/intel_uc_fw.h |  5 +++++
>>   4 files changed, 27 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_gem.c  
>> b/drivers/gpu/drm/i915/i915_gem.c
>> index 631a2db..80f23a8 100644
>> --- a/drivers/gpu/drm/i915/i915_gem.c
>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>> @@ -5324,8 +5324,10 @@ int i915_gem_init(struct drm_i915_private  
>> *dev_priv)
>>   	intel_init_gt_powersave(dev_priv);
>>     	ret = intel_uc_init(dev_priv);
>> -	if (ret)
>> +	if (ret) {
>> +		GEM_BUG_ON(ret == -EIO);
>>   		goto err_pm;
>> +	}
>>     	ret = i915_gem_init_hw(dev_priv);
>>   	if (ret)
>> @@ -5372,7 +5374,8 @@ int i915_gem_init(struct drm_i915_private  
>> *dev_priv)
>>   	i915_gem_contexts_lost(dev_priv);
>>   	intel_uc_fini_hw(dev_priv);
> This uc_fini_hw should also be not called on -EIO?

This one here is fine. But I need to clear guc->fw.load_status
there to make sure we will not try to perform full fini_hw() again.

>>   err_uc_init:
>> -	intel_uc_fini(dev_priv);
>> +	if (ret != -EIO)
>> +		intel_uc_fini(dev_priv);
>>   err_pm:
>>   	if (ret != -EIO) {
>>   		intel_cleanup_gt_powersave(dev_priv);
>> @@ -5386,10 +5389,10 @@ int i915_gem_init(struct drm_i915_private  
>> *dev_priv)
>>   	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
>>   	mutex_unlock(&dev_priv->drm.struct_mutex);
>>   -	intel_uc_fini_misc(dev_priv);
>> -
>> -	if (ret != -EIO)
>> +	if (ret != -EIO) {
>> +		intel_uc_fini_misc(dev_priv);
>>   		i915_gem_cleanup_userptr(dev_priv);
>> +	}
>>     	if (ret == -EIO) {
>>   		/*
> Comment here can be updated to say "Allow engines or uC initialization  
> to fail ... "

ok

>> diff --git a/drivers/gpu/drm/i915/intel_guc.h  
>> b/drivers/gpu/drm/i915/intel_guc.h
>> index 52856a9..512ff7b 100644
>> --- a/drivers/gpu/drm/i915/intel_guc.h
>> +++ b/drivers/gpu/drm/i915/intel_guc.h
>> @@ -100,6 +100,11 @@ static inline void intel_guc_notify(struct  
>> intel_guc *guc)
>>   	guc->notify(guc);
>>   }
>>   +static inline bool intel_guc_is_loaded(struct intel_guc *guc)
>> +{
>> +	return intel_uc_fw_is_loaded(&guc->fw);
>> +}
>> +
>>   /*
>>    * GuC does not allow any gfx GGTT address that falls into range [0,  
>> WOPCM_TOP),
>>    * which is reserved for Boot ROM, SRAM and WOPCM. Currently this top  
>> address is
>> diff --git a/drivers/gpu/drm/i915/intel_uc.c  
>> b/drivers/gpu/drm/i915/intel_uc.c
>> index 9f1bac6..75d0eb9 100644
>> --- a/drivers/gpu/drm/i915/intel_uc.c
>> +++ b/drivers/gpu/drm/i915/intel_uc.c
>> @@ -421,11 +421,13 @@ int intel_uc_init_hw(struct drm_i915_private  
>> *dev_priv)
>>   	 * Note that there is no fallback as either user explicitly asked for
>>   	 * the GuC or driver default option was to run with the GuC enabled.
>>   	 */
>> -	if (GEM_WARN_ON(ret == -EIO))
>> -		ret = -EINVAL;
>> -
>>   	dev_err(dev_priv->drm.dev, "GuC initialization failed %d\n", ret);
>> -	return ret;
>> +
>> +	/* Mark GuC firmware as failed to avoid redundant clean-up */
> uc_fini_hw is not redundant. uc_fini[_misc] was redundant. May be we  
> should say
> "to avoid clean-up on wedged"

ok

>> +	guc->fw.load_status = INTEL_UC_FIRMWARE_FAIL;
>> +
>> +	/* We want to disable GPU submission but keep KMS alive */
>> +	return -EIO;
>>   }
>>     void intel_uc_fini_hw(struct drm_i915_private *dev_priv)
>> @@ -437,6 +439,9 @@ void intel_uc_fini_hw(struct drm_i915_private  
>> *dev_priv)
>>     	GEM_BUG_ON(!HAS_GUC(dev_priv));
>>   +	if (!intel_guc_is_loaded(guc))
>> +		return;
>> +
> Can we skip based on i915_terminally_wedged instead?

I'm not sure, as we declare GPU as wedged not only during gem_init()

> Similarly wedged check is needed for invoking other portion of  
> i915_gem_fini like gem_cleanup_engines, gem_contexts_fini since
> we skipped them on -EIO during gem_init.

Hmm, we guarantee that special -EIO is not triggered *before*
i915_gem_init_hw() so we correctly cleanup these in i915_gem_init()
on error, or in i915_gem_fini() on success/wedged. There is no
need to use i915_terminally_wedged for them.

>>   	if (USES_GUC_SUBMISSION(dev_priv))
>>   		intel_guc_submission_disable(guc);
>>   diff --git a/drivers/gpu/drm/i915/intel_uc_fw.h  
>> b/drivers/gpu/drm/i915/intel_uc_fw.h
>> index d5fd460..0e3b237 100644
>> --- a/drivers/gpu/drm/i915/intel_uc_fw.h
>> +++ b/drivers/gpu/drm/i915/intel_uc_fw.h
>> @@ -115,6 +115,11 @@ static inline bool intel_uc_fw_is_selected(struct  
>> intel_uc_fw *uc_fw)
>>   	return uc_fw->path != NULL;
>>   }
>>   +static inline bool intel_uc_fw_is_loaded(struct intel_uc_fw *uc_fw)
>> +{
>> +	return uc_fw->load_status == INTEL_UC_FIRMWARE_SUCCESS;
>> +}
>> +
>>   void intel_uc_fw_fetch(struct drm_i915_private *dev_priv,
>>   		       struct intel_uc_fw *uc_fw);
>>   int intel_uc_fw_upload(struct intel_uc_fw *uc_fw,
Daniele Ceraolo Spurio Feb. 21, 2018, 4:54 p.m. UTC | #4
<snip>

> diff --git a/drivers/gpu/drm/i915/intel_uc_fw.h b/drivers/gpu/drm/i915/intel_uc_fw.h
> index d5fd460..0e3b237 100644
> --- a/drivers/gpu/drm/i915/intel_uc_fw.h
> +++ b/drivers/gpu/drm/i915/intel_uc_fw.h
> @@ -115,6 +115,11 @@ static inline bool intel_uc_fw_is_selected(struct intel_uc_fw *uc_fw)
>   	return uc_fw->path != NULL;
>   }
>   
> +static inline bool intel_uc_fw_is_loaded(struct intel_uc_fw *uc_fw)
> +{
> +	return uc_fw->load_status == INTEL_UC_FIRMWARE_SUCCESS;

Since we do not immediately update uc_fw->load_status after full GPU 
reset we have a small window of time during re-initialization where this 
function would falsely return true. We don't hit the issue in this 
patch, but I'd personally prefer not to add this function until 
uc_fw->load_status is correctly updated as we might inadvertently start 
to use it at the wrong time. Alternatively, if you want to merge this 
soon we could read the status from the HW as an initial version and then 
switch to uc_fw->load_status after we've fixed it.

Daniele

> +}
> +
>   void intel_uc_fw_fetch(struct drm_i915_private *dev_priv,
>   		       struct intel_uc_fw *uc_fw);
>   int intel_uc_fw_upload(struct intel_uc_fw *uc_fw,
>
sagar.a.kamble@intel.com Feb. 21, 2018, 4:55 p.m. UTC | #5
On 2/21/2018 5:50 PM, Michal Wajdeczko wrote:
> On Wed, 21 Feb 2018 09:08:08 +0100, Sagar Arun Kamble 
> <sagar.a.kamble@intel.com> wrote:
>
>>
>>
>> On 2/21/2018 4:27 AM, Michal Wajdeczko wrote:
>>> Since commit 6ca9a2beb54a ("drm/i915: Unwind i915_gem_init() failure")
>>> we believed that we correctly handle all errors encountered during
>>> GuC initialization, including special one that indicates request to
>>> run driver with disabled GPU submission (-EIO).
>>>
>>> Unfortunately since commit 121981fafe69 ("drm/i915/guc: Combine
>>> enable_guc_loading|submission modparams") we stopped using that
>>> error code to avoid unwanted fallback to execlist submission mode.
>>>
>>> In result any GuC initialization failure was treated as non-recoverable
>>> error leading to driver load abort, so we could not even read related
>>> GuC error log to investigate cause of the problem.
>>>
>>> Fix that by always returning -EIO on uC hardware related failure.
>>>
>>> v2: don't allow -EIO from uc_init
>>>      don't call uc_fini[_misc] on -EIO
>>>      mark guc fw as failed on hw init failure
>>>      prepare uc_fini_hw to run after earlier -EIO
>>>
>>> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Michal Winiarski <michal.winiarski@intel.com>
>>> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
>>> Cc: Sagar Arun Kamble <sagar.a.kamble@intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/i915_gem.c    | 13 ++++++++-----
>>>   drivers/gpu/drm/i915/intel_guc.h   |  5 +++++
>>>   drivers/gpu/drm/i915/intel_uc.c    | 13 +++++++++----
>>>   drivers/gpu/drm/i915/intel_uc_fw.h |  5 +++++
>>>   4 files changed, 27 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c 
>>> b/drivers/gpu/drm/i915/i915_gem.c
>>> index 631a2db..80f23a8 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>> @@ -5324,8 +5324,10 @@ int i915_gem_init(struct drm_i915_private 
>>> *dev_priv)
>>>       intel_init_gt_powersave(dev_priv);
>>>         ret = intel_uc_init(dev_priv);
>>> -    if (ret)
>>> +    if (ret) {
>>> +        GEM_BUG_ON(ret == -EIO);
>>>           goto err_pm;
>>> +    }
>>>         ret = i915_gem_init_hw(dev_priv);
>>>       if (ret)
>>> @@ -5372,7 +5374,8 @@ int i915_gem_init(struct drm_i915_private 
>>> *dev_priv)
>>>       i915_gem_contexts_lost(dev_priv);
>>>       intel_uc_fini_hw(dev_priv);
>> This uc_fini_hw should also be not called on -EIO?
>
> This one here is fine. But I need to clear guc->fw.load_status
> there to make sure we will not try to perform full fini_hw() again.
Yes. Will need to set load_status as FIRMWARE_FAIL.
>
>>>   err_uc_init:
>>> -    intel_uc_fini(dev_priv);
>>> +    if (ret != -EIO)
>>> +        intel_uc_fini(dev_priv);
>>>   err_pm:
>>>       if (ret != -EIO) {
>>>           intel_cleanup_gt_powersave(dev_priv);
>>> @@ -5386,10 +5389,10 @@ int i915_gem_init(struct drm_i915_private 
>>> *dev_priv)
>>>       intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
>>>       mutex_unlock(&dev_priv->drm.struct_mutex);
>>>   -    intel_uc_fini_misc(dev_priv);
>>> -
>>> -    if (ret != -EIO)
>>> +    if (ret != -EIO) {
>>> +        intel_uc_fini_misc(dev_priv);
>>>           i915_gem_cleanup_userptr(dev_priv);
>>> +    }
>>>         if (ret == -EIO) {
>>>           /*
>> Comment here can be updated to say "Allow engines or uC 
>> initialization to fail ... "
>
> ok
>
>>> diff --git a/drivers/gpu/drm/i915/intel_guc.h 
>>> b/drivers/gpu/drm/i915/intel_guc.h
>>> index 52856a9..512ff7b 100644
>>> --- a/drivers/gpu/drm/i915/intel_guc.h
>>> +++ b/drivers/gpu/drm/i915/intel_guc.h
>>> @@ -100,6 +100,11 @@ static inline void intel_guc_notify(struct 
>>> intel_guc *guc)
>>>       guc->notify(guc);
>>>   }
>>>   +static inline bool intel_guc_is_loaded(struct intel_guc *guc)
>>> +{
>>> +    return intel_uc_fw_is_loaded(&guc->fw);
>>> +}
>>> +
>>>   /*
>>>    * GuC does not allow any gfx GGTT address that falls into range 
>>> [0, WOPCM_TOP),
>>>    * which is reserved for Boot ROM, SRAM and WOPCM. Currently this 
>>> top address is
>>> diff --git a/drivers/gpu/drm/i915/intel_uc.c 
>>> b/drivers/gpu/drm/i915/intel_uc.c
>>> index 9f1bac6..75d0eb9 100644
>>> --- a/drivers/gpu/drm/i915/intel_uc.c
>>> +++ b/drivers/gpu/drm/i915/intel_uc.c
>>> @@ -421,11 +421,13 @@ int intel_uc_init_hw(struct drm_i915_private 
>>> *dev_priv)
>>>        * Note that there is no fallback as either user explicitly 
>>> asked for
>>>        * the GuC or driver default option was to run with the GuC 
>>> enabled.
>>>        */
>>> -    if (GEM_WARN_ON(ret == -EIO))
>>> -        ret = -EINVAL;
>>> -
>>>       dev_err(dev_priv->drm.dev, "GuC initialization failed %d\n", 
>>> ret);
>>> -    return ret;
>>> +
>>> +    /* Mark GuC firmware as failed to avoid redundant clean-up */
>> uc_fini_hw is not redundant. uc_fini[_misc] was redundant. May be we 
>> should say
>> "to avoid clean-up on wedged"
>
> ok
>
>>> +    guc->fw.load_status = INTEL_UC_FIRMWARE_FAIL;
>>> +
>>> +    /* We want to disable GPU submission but keep KMS alive */
>>> +    return -EIO;
>>>   }
>>>     void intel_uc_fini_hw(struct drm_i915_private *dev_priv)
>>> @@ -437,6 +439,9 @@ void intel_uc_fini_hw(struct drm_i915_private 
>>> *dev_priv)
>>>         GEM_BUG_ON(!HAS_GUC(dev_priv));
>>>   +    if (!intel_guc_is_loaded(guc))
>>> +        return;
>>> +
>> Can we skip based on i915_terminally_wedged instead?
>
> I'm not sure, as we declare GPU as wedged not only during gem_init()
>
>> Similarly wedged check is needed for invoking other portion of 
>> i915_gem_fini like gem_cleanup_engines, gem_contexts_fini since
>> we skipped them on -EIO during gem_init.
>
> Hmm, we guarantee that special -EIO is not triggered *before*
> i915_gem_init_hw() so we correctly cleanup these in i915_gem_init()
> on error, or in i915_gem_fini() on success/wedged. There is no
> need to use i915_terminally_wedged for them.
>
I thought these functions might touch the GPU when wedged (GTT and 
possibly engine states). But it looks like it does not
create issues, otherwise drv_module_reload would have highlighted. Agree 
that init counterparts not returning -EIO also means
these need not be gated by wedged check.
load_status based check looks good to me.
>>>       if (USES_GUC_SUBMISSION(dev_priv))
>>>           intel_guc_submission_disable(guc);
>>>   diff --git a/drivers/gpu/drm/i915/intel_uc_fw.h 
>>> b/drivers/gpu/drm/i915/intel_uc_fw.h
>>> index d5fd460..0e3b237 100644
>>> --- a/drivers/gpu/drm/i915/intel_uc_fw.h
>>> +++ b/drivers/gpu/drm/i915/intel_uc_fw.h
>>> @@ -115,6 +115,11 @@ static inline bool 
>>> intel_uc_fw_is_selected(struct intel_uc_fw *uc_fw)
>>>       return uc_fw->path != NULL;
>>>   }
>>>   +static inline bool intel_uc_fw_is_loaded(struct intel_uc_fw *uc_fw)
>>> +{
>>> +    return uc_fw->load_status == INTEL_UC_FIRMWARE_SUCCESS;
>>> +}
>>> +
>>>   void intel_uc_fw_fetch(struct drm_i915_private *dev_priv,
>>>                  struct intel_uc_fw *uc_fw);
>>>   int intel_uc_fw_upload(struct intel_uc_fw *uc_fw,
Michal Wajdeczko Feb. 22, 2018, 5:30 p.m. UTC | #6
On Wed, 21 Feb 2018 09:27:51 +0100, Chris Wilson  
<chris@chris-wilson.co.uk> wrote:

> Quoting Michal Wajdeczko (2018-02-20 22:57:10)
>> Since commit 6ca9a2beb54a ("drm/i915: Unwind i915_gem_init() failure")
>> we believed that we correctly handle all errors encountered during
>> GuC initialization, including special one that indicates request to
>> run driver with disabled GPU submission (-EIO).
>>
>> Unfortunately since commit 121981fafe69 ("drm/i915/guc: Combine
>> enable_guc_loading|submission modparams") we stopped using that
>> error code to avoid unwanted fallback to execlist submission mode.
>>
>> In result any GuC initialization failure was treated as non-recoverable
>> error leading to driver load abort, so we could not even read related
>> GuC error log to investigate cause of the problem.
>>
>> Fix that by always returning -EIO on uC hardware related failure.
>>
>> v2: don't allow -EIO from uc_init
>>     don't call uc_fini[_misc] on -EIO
>>     mark guc fw as failed on hw init failure
>>     prepare uc_fini_hw to run after earlier -EIO
>>
>> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Michal Winiarski <michal.winiarski@intel.com>
>> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
>> Cc: Sagar Arun Kamble <sagar.a.kamble@intel.com>
>
> Whilst thinking about this, do you want to disable (or rather prevent
> setup of) guc submission if the driver is already wedged?

We setup GuC submission in intel_uc_init_hw() and just before we call
it from i915_gem_init_hw() we already have a check for wedged driver.
So maybe we don't want to add redundant guard ?

/Michal
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 631a2db..80f23a8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -5324,8 +5324,10 @@  int i915_gem_init(struct drm_i915_private *dev_priv)
 	intel_init_gt_powersave(dev_priv);
 
 	ret = intel_uc_init(dev_priv);
-	if (ret)
+	if (ret) {
+		GEM_BUG_ON(ret == -EIO);
 		goto err_pm;
+	}
 
 	ret = i915_gem_init_hw(dev_priv);
 	if (ret)
@@ -5372,7 +5374,8 @@  int i915_gem_init(struct drm_i915_private *dev_priv)
 	i915_gem_contexts_lost(dev_priv);
 	intel_uc_fini_hw(dev_priv);
 err_uc_init:
-	intel_uc_fini(dev_priv);
+	if (ret != -EIO)
+		intel_uc_fini(dev_priv);
 err_pm:
 	if (ret != -EIO) {
 		intel_cleanup_gt_powersave(dev_priv);
@@ -5386,10 +5389,10 @@  int i915_gem_init(struct drm_i915_private *dev_priv)
 	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
-	intel_uc_fini_misc(dev_priv);
-
-	if (ret != -EIO)
+	if (ret != -EIO) {
+		intel_uc_fini_misc(dev_priv);
 		i915_gem_cleanup_userptr(dev_priv);
+	}
 
 	if (ret == -EIO) {
 		/*
diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
index 52856a9..512ff7b 100644
--- a/drivers/gpu/drm/i915/intel_guc.h
+++ b/drivers/gpu/drm/i915/intel_guc.h
@@ -100,6 +100,11 @@  static inline void intel_guc_notify(struct intel_guc *guc)
 	guc->notify(guc);
 }
 
+static inline bool intel_guc_is_loaded(struct intel_guc *guc)
+{
+	return intel_uc_fw_is_loaded(&guc->fw);
+}
+
 /*
  * GuC does not allow any gfx GGTT address that falls into range [0, WOPCM_TOP),
  * which is reserved for Boot ROM, SRAM and WOPCM. Currently this top address is
diff --git a/drivers/gpu/drm/i915/intel_uc.c b/drivers/gpu/drm/i915/intel_uc.c
index 9f1bac6..75d0eb9 100644
--- a/drivers/gpu/drm/i915/intel_uc.c
+++ b/drivers/gpu/drm/i915/intel_uc.c
@@ -421,11 +421,13 @@  int intel_uc_init_hw(struct drm_i915_private *dev_priv)
 	 * Note that there is no fallback as either user explicitly asked for
 	 * the GuC or driver default option was to run with the GuC enabled.
 	 */
-	if (GEM_WARN_ON(ret == -EIO))
-		ret = -EINVAL;
-
 	dev_err(dev_priv->drm.dev, "GuC initialization failed %d\n", ret);
-	return ret;
+
+	/* Mark GuC firmware as failed to avoid redundant clean-up */
+	guc->fw.load_status = INTEL_UC_FIRMWARE_FAIL;
+
+	/* We want to disable GPU submission but keep KMS alive */
+	return -EIO;
 }
 
 void intel_uc_fini_hw(struct drm_i915_private *dev_priv)
@@ -437,6 +439,9 @@  void intel_uc_fini_hw(struct drm_i915_private *dev_priv)
 
 	GEM_BUG_ON(!HAS_GUC(dev_priv));
 
+	if (!intel_guc_is_loaded(guc))
+		return;
+
 	if (USES_GUC_SUBMISSION(dev_priv))
 		intel_guc_submission_disable(guc);
 
diff --git a/drivers/gpu/drm/i915/intel_uc_fw.h b/drivers/gpu/drm/i915/intel_uc_fw.h
index d5fd460..0e3b237 100644
--- a/drivers/gpu/drm/i915/intel_uc_fw.h
+++ b/drivers/gpu/drm/i915/intel_uc_fw.h
@@ -115,6 +115,11 @@  static inline bool intel_uc_fw_is_selected(struct intel_uc_fw *uc_fw)
 	return uc_fw->path != NULL;
 }
 
+static inline bool intel_uc_fw_is_loaded(struct intel_uc_fw *uc_fw)
+{
+	return uc_fw->load_status == INTEL_UC_FIRMWARE_SUCCESS;
+}
+
 void intel_uc_fw_fetch(struct drm_i915_private *dev_priv,
 		       struct intel_uc_fw *uc_fw);
 int intel_uc_fw_upload(struct intel_uc_fw *uc_fw,