diff mbox

[08/17] drm/i915: Forcefully flush GuC log buffer on reset

Message ID 1468158084-22028-9-git-send-email-akash.goel@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

akash.goel@intel.com July 10, 2016, 1:41 p.m. UTC
From: Sagar Arun Kamble <sagar.a.kamble@intel.com>

If GuC logs are being captured, there should be a force log buffer flush
action sent to GuC before proceeding with GPU reset and re-initializing
GUC. Those logs would be useful to understand why the GPU reset was
initiated.

v2: Rebase.

Signed-off-by: Sagar Arun Kamble <sagar.a.kamble@intel.com>
Signed-off-by: Akash Goel <akash.goel@intel.com>
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 32 ++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_irq.c            |  2 ++
 drivers/gpu/drm/i915/intel_guc.h           |  1 +
 3 files changed, 35 insertions(+)

Comments

Tvrtko Ursulin July 19, 2016, 11:12 a.m. UTC | #1
On 10/07/16 14:41, akash.goel@intel.com wrote:
> From: Sagar Arun Kamble <sagar.a.kamble@intel.com>
>
> If GuC logs are being captured, there should be a force log buffer flush
> action sent to GuC before proceeding with GPU reset and re-initializing
> GUC. Those logs would be useful to understand why the GPU reset was
> initiated.
>
> v2: Rebase.
>
> Signed-off-by: Sagar Arun Kamble <sagar.a.kamble@intel.com>
> Signed-off-by: Akash Goel <akash.goel@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_guc_submission.c | 32 ++++++++++++++++++++++++++++++
>   drivers/gpu/drm/i915/i915_irq.c            |  2 ++
>   drivers/gpu/drm/i915/intel_guc.h           |  1 +
>   3 files changed, 35 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 9b436fa..8cc31c6 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -183,6 +183,16 @@ static int host2guc_logbuffer_flush_complete(struct intel_guc *guc)
>   	return host2guc_action(guc, data, 1);
>   }
>
> +static int host2guc_force_logbuffer_flush(struct intel_guc *guc)
> +{
> +	u32 data[2];
> +
> +	data[0] = HOST2GUC_ACTION_FORCE_LOG_BUFFER_FLUSH;
> +	data[1] = 0;
> +
> +	return host2guc_action(guc, data, 2);
> +}
> +
>   /*
>    * Initialise, update, or clear doorbell data shared with the GuC
>    *
> @@ -1404,6 +1414,28 @@ void i915_guc_capture_logs(struct drm_device *dev)
>   	intel_runtime_pm_put(dev_priv);
>   }
>
> +void i915_guc_capture_logs_on_reset(struct drm_device *dev)
> +{
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +
> +	mutex_lock(&dev->struct_mutex);

Not sure what are the repercussion of taking the mutex on the 
i915_reset_and_wakeup and path (error capture, hangcheck, dont' know 
this area well). Check with Chris and Mika I suppose (cc-ed)?

> +
> +	if (!i915.enable_guc_submission || (i915.guc_log_level < 0))

It would be cool if guc_log_level could be guaranteed to be disabled 
when guc submission is not on. I think it would be more maintainable.

> +		goto end;
> +
> +	/* First disable the interrupts, will be renabled after reset */
> +	gen9_disable_guc_interrupts(dev_priv);
> +
> +	/* Ask GuC to update the log buffer state */
> +	host2guc_force_logbuffer_flush(&dev_priv->guc);
> +
> +	/* GuC would have updated the log buffer by now, so capture it */
> +	i915_guc_capture_logs(dev);
> +
> +end:
> +	mutex_unlock(&dev->struct_mutex);
> +}
> +
>   void i915_guc_unregister(struct drm_device *dev)
>   {
>   	if (!i915.enable_guc_submission)
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index f90d3c6..bdd7a67 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2640,6 +2640,8 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>   		 */
>   		intel_runtime_pm_get(dev_priv);
>
> +		i915_guc_capture_logs_on_reset(&dev_priv->drm);
> +
>   		intel_prepare_reset(dev_priv);
>
>   		/*
> diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
> index a784cf8..ed773b5 100644
> --- a/drivers/gpu/drm/i915/intel_guc.h
> +++ b/drivers/gpu/drm/i915/intel_guc.h
> @@ -175,6 +175,7 @@ int i915_guc_submit(struct drm_i915_gem_request *rq);
>   void i915_guc_submission_disable(struct drm_i915_private *dev_priv);
>   void i915_guc_submission_fini(struct drm_i915_private *dev_priv);
>   void i915_guc_capture_logs(struct drm_device *dev);
> +void i915_guc_capture_logs_on_reset(struct drm_device *dev);
>   void i915_guc_register(struct drm_device *dev);
>   void i915_guc_unregister(struct drm_device *dev);
>
>

Regards,

Tvrtko
Chris Wilson July 19, 2016, 11:21 a.m. UTC | #2
On Tue, Jul 19, 2016 at 12:12:20PM +0100, Tvrtko Ursulin wrote:
> 
> On 10/07/16 14:41, akash.goel@intel.com wrote:
> >From: Sagar Arun Kamble <sagar.a.kamble@intel.com>
> >
> >If GuC logs are being captured, there should be a force log buffer flush
> >action sent to GuC before proceeding with GPU reset and re-initializing
> >GUC. Those logs would be useful to understand why the GPU reset was
> >initiated.
> >
> >v2: Rebase.
> >
> >Signed-off-by: Sagar Arun Kamble <sagar.a.kamble@intel.com>
> >Signed-off-by: Akash Goel <akash.goel@intel.com>
> >---
> >  drivers/gpu/drm/i915/i915_guc_submission.c | 32 ++++++++++++++++++++++++++++++
> >  drivers/gpu/drm/i915/i915_irq.c            |  2 ++
> >  drivers/gpu/drm/i915/intel_guc.h           |  1 +
> >  3 files changed, 35 insertions(+)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> >index 9b436fa..8cc31c6 100644
> >--- a/drivers/gpu/drm/i915/i915_guc_submission.c
> >+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> >@@ -183,6 +183,16 @@ static int host2guc_logbuffer_flush_complete(struct intel_guc *guc)
> >  	return host2guc_action(guc, data, 1);
> >  }
> >
> >+static int host2guc_force_logbuffer_flush(struct intel_guc *guc)
> >+{
> >+	u32 data[2];
> >+
> >+	data[0] = HOST2GUC_ACTION_FORCE_LOG_BUFFER_FLUSH;
> >+	data[1] = 0;
> >+
> >+	return host2guc_action(guc, data, 2);
> >+}
> >+
> >  /*
> >   * Initialise, update, or clear doorbell data shared with the GuC
> >   *
> >@@ -1404,6 +1414,28 @@ void i915_guc_capture_logs(struct drm_device *dev)
> >  	intel_runtime_pm_put(dev_priv);
> >  }
> >
> >+void i915_guc_capture_logs_on_reset(struct drm_device *dev)
> >+{
> >+	struct drm_i915_private *dev_priv = dev->dev_private;
> >+
> >+	mutex_lock(&dev->struct_mutex);
> 
> Not sure what are the repercussion of taking the mutex on the
> i915_reset_and_wakeup and path (error capture, hangcheck, dont' know
> this area well). Check with Chris and Mika I suppose (cc-ed)?

Flat out invalid to take struct_mutex on the error capture path, or any
lock at all really (just in case of driver bugs). Consider it to be an
atomic context that may preempt the driver at any point.
-Chris
akash.goel@intel.com July 20, 2016, 4:21 a.m. UTC | #3
On 7/19/2016 4:51 PM, Chris Wilson wrote:
> On Tue, Jul 19, 2016 at 12:12:20PM +0100, Tvrtko Ursulin wrote:
>>
>> On 10/07/16 14:41, akash.goel@intel.com wrote:
>>> From: Sagar Arun Kamble <sagar.a.kamble@intel.com>
>>>
>>> If GuC logs are being captured, there should be a force log buffer flush
>>> action sent to GuC before proceeding with GPU reset and re-initializing
>>> GUC. Those logs would be useful to understand why the GPU reset was
>>> initiated.
>>>
>>> v2: Rebase.
>>>
>>> Signed-off-by: Sagar Arun Kamble <sagar.a.kamble@intel.com>
>>> Signed-off-by: Akash Goel <akash.goel@intel.com>
>>> ---
>>>  drivers/gpu/drm/i915/i915_guc_submission.c | 32 ++++++++++++++++++++++++++++++
>>>  drivers/gpu/drm/i915/i915_irq.c            |  2 ++
>>>  drivers/gpu/drm/i915/intel_guc.h           |  1 +
>>>  3 files changed, 35 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
>>> index 9b436fa..8cc31c6 100644
>>> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
>>> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
>>> @@ -183,6 +183,16 @@ static int host2guc_logbuffer_flush_complete(struct intel_guc *guc)
>>>  	return host2guc_action(guc, data, 1);
>>>  }
>>>
>>> +static int host2guc_force_logbuffer_flush(struct intel_guc *guc)
>>> +{
>>> +	u32 data[2];
>>> +
>>> +	data[0] = HOST2GUC_ACTION_FORCE_LOG_BUFFER_FLUSH;
>>> +	data[1] = 0;
>>> +
>>> +	return host2guc_action(guc, data, 2);
>>> +}
>>> +
>>>  /*
>>>   * Initialise, update, or clear doorbell data shared with the GuC
>>>   *
>>> @@ -1404,6 +1414,28 @@ void i915_guc_capture_logs(struct drm_device *dev)
>>>  	intel_runtime_pm_put(dev_priv);
>>>  }
>>>
>>> +void i915_guc_capture_logs_on_reset(struct drm_device *dev)
>>> +{
>>> +	struct drm_i915_private *dev_priv = dev->dev_private;
>>> +
>>> +	mutex_lock(&dev->struct_mutex);
>>
>> Not sure what are the repercussion of taking the mutex on the
>> i915_reset_and_wakeup and path (error capture, hangcheck, dont' know
>> this area well). Check with Chris and Mika I suppose (cc-ed)?
>

Took the struct_mutex, just to avoid a very remote possibility where
i915_guc_capture_logs_on_reset & debugfs function i915_guc_log_control 
executes concurrently.

> Flat out invalid to take struct_mutex on the error capture path, or any
> lock at all really (just in case of driver bugs). Consider it to be an
> atomic context that may preempt the driver at any point.

Actually I see that i915_reset() too takes the struct_mutex right at the 
beginning and I have plugged the call to 
i915_guc_capture_logs_on_reset() just before that.

Also it is being called after i915_error_wake_up(), so any client 
waiting on a request would have backed off and any new attempt by 
clients to lock the struct_mutex should see i915_reset_in_progress as
true.

Best regards
Akash
> -Chris
>
Chris Wilson July 20, 2016, 9:12 a.m. UTC | #4
On Wed, Jul 20, 2016 at 09:51:45AM +0530, Goel, Akash wrote:
> 
> 
> On 7/19/2016 4:51 PM, Chris Wilson wrote:
> >On Tue, Jul 19, 2016 at 12:12:20PM +0100, Tvrtko Ursulin wrote:
> >>
> >>On 10/07/16 14:41, akash.goel@intel.com wrote:
> >>>From: Sagar Arun Kamble <sagar.a.kamble@intel.com>
> >>>
> >>>If GuC logs are being captured, there should be a force log buffer flush
> >>>action sent to GuC before proceeding with GPU reset and re-initializing
> >>>GUC. Those logs would be useful to understand why the GPU reset was
> >>>initiated.
> >>>
> >>>v2: Rebase.
> >>>
> >>>Signed-off-by: Sagar Arun Kamble <sagar.a.kamble@intel.com>
> >>>Signed-off-by: Akash Goel <akash.goel@intel.com>
> >>>---
> >>> drivers/gpu/drm/i915/i915_guc_submission.c | 32 ++++++++++++++++++++++++++++++
> >>> drivers/gpu/drm/i915/i915_irq.c            |  2 ++
> >>> drivers/gpu/drm/i915/intel_guc.h           |  1 +
> >>> 3 files changed, 35 insertions(+)
> >>>
> >>>diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> >>>index 9b436fa..8cc31c6 100644
> >>>--- a/drivers/gpu/drm/i915/i915_guc_submission.c
> >>>+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> >>>@@ -183,6 +183,16 @@ static int host2guc_logbuffer_flush_complete(struct intel_guc *guc)
> >>> 	return host2guc_action(guc, data, 1);
> >>> }
> >>>
> >>>+static int host2guc_force_logbuffer_flush(struct intel_guc *guc)
> >>>+{
> >>>+	u32 data[2];
> >>>+
> >>>+	data[0] = HOST2GUC_ACTION_FORCE_LOG_BUFFER_FLUSH;
> >>>+	data[1] = 0;
> >>>+
> >>>+	return host2guc_action(guc, data, 2);
> >>>+}
> >>>+
> >>> /*
> >>>  * Initialise, update, or clear doorbell data shared with the GuC
> >>>  *
> >>>@@ -1404,6 +1414,28 @@ void i915_guc_capture_logs(struct drm_device *dev)
> >>> 	intel_runtime_pm_put(dev_priv);
> >>> }
> >>>
> >>>+void i915_guc_capture_logs_on_reset(struct drm_device *dev)
> >>>+{
> >>>+	struct drm_i915_private *dev_priv = dev->dev_private;
> >>>+
> >>>+	mutex_lock(&dev->struct_mutex);
> >>
> >>Not sure what are the repercussion of taking the mutex on the
> >>i915_reset_and_wakeup and path (error capture, hangcheck, dont' know
> >>this area well). Check with Chris and Mika I suppose (cc-ed)?
> >
> 
> Took the struct_mutex, just to avoid a very remote possibility where
> i915_guc_capture_logs_on_reset & debugfs function
> i915_guc_log_control executes concurrently.
> 
> >Flat out invalid to take struct_mutex on the error capture path, or any
> >lock at all really (just in case of driver bugs). Consider it to be an
> >atomic context that may preempt the driver at any point.
> 
> Actually I see that i915_reset() too takes the struct_mutex right at
> the beginning and I have plugged the call to
> i915_guc_capture_logs_on_reset() just before that.

Postmortem state is captured from i915_capture_error_state(), and as I
recall one of the raison d'etre for this facility was to include the guc
log in the error state.
-Chris
akash.goel@intel.com July 20, 2016, 9:48 a.m. UTC | #5
On 7/20/2016 2:42 PM, Chris Wilson wrote:
> On Wed, Jul 20, 2016 at 09:51:45AM +0530, Goel, Akash wrote:
>>
>>
>> On 7/19/2016 4:51 PM, Chris Wilson wrote:
>>> On Tue, Jul 19, 2016 at 12:12:20PM +0100, Tvrtko Ursulin wrote:
>>>>
>>>> On 10/07/16 14:41, akash.goel@intel.com wrote:
>>>>> From: Sagar Arun Kamble <sagar.a.kamble@intel.com>
>>>>>
>>>>> If GuC logs are being captured, there should be a force log buffer flush
>>>>> action sent to GuC before proceeding with GPU reset and re-initializing
>>>>> GUC. Those logs would be useful to understand why the GPU reset was
>>>>> initiated.
>>>>>
>>>>> v2: Rebase.
>>>>>
>>>>> Signed-off-by: Sagar Arun Kamble <sagar.a.kamble@intel.com>
>>>>> Signed-off-by: Akash Goel <akash.goel@intel.com>
>>>>> ---
>>>>> drivers/gpu/drm/i915/i915_guc_submission.c | 32 ++++++++++++++++++++++++++++++
>>>>> drivers/gpu/drm/i915/i915_irq.c            |  2 ++
>>>>> drivers/gpu/drm/i915/intel_guc.h           |  1 +
>>>>> 3 files changed, 35 insertions(+)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
>>>>> index 9b436fa..8cc31c6 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
>>>>> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
>>>>> @@ -183,6 +183,16 @@ static int host2guc_logbuffer_flush_complete(struct intel_guc *guc)
>>>>> 	return host2guc_action(guc, data, 1);
>>>>> }
>>>>>
>>>>> +static int host2guc_force_logbuffer_flush(struct intel_guc *guc)
>>>>> +{
>>>>> +	u32 data[2];
>>>>> +
>>>>> +	data[0] = HOST2GUC_ACTION_FORCE_LOG_BUFFER_FLUSH;
>>>>> +	data[1] = 0;
>>>>> +
>>>>> +	return host2guc_action(guc, data, 2);
>>>>> +}
>>>>> +
>>>>> /*
>>>>>  * Initialise, update, or clear doorbell data shared with the GuC
>>>>>  *
>>>>> @@ -1404,6 +1414,28 @@ void i915_guc_capture_logs(struct drm_device *dev)
>>>>> 	intel_runtime_pm_put(dev_priv);
>>>>> }
>>>>>
>>>>> +void i915_guc_capture_logs_on_reset(struct drm_device *dev)
>>>>> +{
>>>>> +	struct drm_i915_private *dev_priv = dev->dev_private;
>>>>> +
>>>>> +	mutex_lock(&dev->struct_mutex);
>>>>
>>>> Not sure what are the repercussion of taking the mutex on the
>>>> i915_reset_and_wakeup and path (error capture, hangcheck, dont' know
>>>> this area well). Check with Chris and Mika I suppose (cc-ed)?
>>>
>>
>> Took the struct_mutex, just to avoid a very remote possibility where
>> i915_guc_capture_logs_on_reset & debugfs function
>> i915_guc_log_control executes concurrently.
>>
>>> Flat out invalid to take struct_mutex on the error capture path, or any
>>> lock at all really (just in case of driver bugs). Consider it to be an
>>> atomic context that may preempt the driver at any point.
>>
>> Actually I see that i915_reset() too takes the struct_mutex right at
>> the beginning and I have plugged the call to
>> i915_guc_capture_logs_on_reset() just before that.
>
> Postmortem state is captured from i915_capture_error_state(), and as I
> recall one of the raison d'etre for this facility was to include the guc
> log in the error state.

Sorry I missed augmenting the error state with guc firmware logs.
For that also a prior flush will be needed, will do the flush without 
acquiring the struct_mutex.

Best regards
Akash

> -Chris
>
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 9b436fa..8cc31c6 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -183,6 +183,16 @@  static int host2guc_logbuffer_flush_complete(struct intel_guc *guc)
 	return host2guc_action(guc, data, 1);
 }
 
+static int host2guc_force_logbuffer_flush(struct intel_guc *guc)
+{
+	u32 data[2];
+
+	data[0] = HOST2GUC_ACTION_FORCE_LOG_BUFFER_FLUSH;
+	data[1] = 0;
+
+	return host2guc_action(guc, data, 2);
+}
+
 /*
  * Initialise, update, or clear doorbell data shared with the GuC
  *
@@ -1404,6 +1414,28 @@  void i915_guc_capture_logs(struct drm_device *dev)
 	intel_runtime_pm_put(dev_priv);
 }
 
+void i915_guc_capture_logs_on_reset(struct drm_device *dev)
+{
+	struct drm_i915_private *dev_priv = dev->dev_private;
+
+	mutex_lock(&dev->struct_mutex);
+
+	if (!i915.enable_guc_submission || (i915.guc_log_level < 0))
+		goto end;
+
+	/* First disable the interrupts, will be renabled after reset */
+	gen9_disable_guc_interrupts(dev_priv);
+
+	/* Ask GuC to update the log buffer state */
+	host2guc_force_logbuffer_flush(&dev_priv->guc);
+
+	/* GuC would have updated the log buffer by now, so capture it */
+	i915_guc_capture_logs(dev);
+
+end:
+	mutex_unlock(&dev->struct_mutex);
+}
+
 void i915_guc_unregister(struct drm_device *dev)
 {
 	if (!i915.enable_guc_submission)
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index f90d3c6..bdd7a67 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2640,6 +2640,8 @@  static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 		 */
 		intel_runtime_pm_get(dev_priv);
 
+		i915_guc_capture_logs_on_reset(&dev_priv->drm);
+
 		intel_prepare_reset(dev_priv);
 
 		/*
diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
index a784cf8..ed773b5 100644
--- a/drivers/gpu/drm/i915/intel_guc.h
+++ b/drivers/gpu/drm/i915/intel_guc.h
@@ -175,6 +175,7 @@  int i915_guc_submit(struct drm_i915_gem_request *rq);
 void i915_guc_submission_disable(struct drm_i915_private *dev_priv);
 void i915_guc_submission_fini(struct drm_i915_private *dev_priv);
 void i915_guc_capture_logs(struct drm_device *dev);
+void i915_guc_capture_logs_on_reset(struct drm_device *dev);
 void i915_guc_register(struct drm_device *dev);
 void i915_guc_unregister(struct drm_device *dev);