Message ID | 1468158084-22028-9-git-send-email-akash.goel@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 10/07/16 14:41, akash.goel@intel.com wrote: > From: Sagar Arun Kamble <sagar.a.kamble@intel.com> > > If GuC logs are being captured, there should be a force log buffer flush > action sent to GuC before proceeding with GPU reset and re-initializing > GUC. Those logs would be useful to understand why the GPU reset was > initiated. > > v2: Rebase. > > Signed-off-by: Sagar Arun Kamble <sagar.a.kamble@intel.com> > Signed-off-by: Akash Goel <akash.goel@intel.com> > --- > drivers/gpu/drm/i915/i915_guc_submission.c | 32 ++++++++++++++++++++++++++++++ > drivers/gpu/drm/i915/i915_irq.c | 2 ++ > drivers/gpu/drm/i915/intel_guc.h | 1 + > 3 files changed, 35 insertions(+) > > diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c > index 9b436fa..8cc31c6 100644 > --- a/drivers/gpu/drm/i915/i915_guc_submission.c > +++ b/drivers/gpu/drm/i915/i915_guc_submission.c > @@ -183,6 +183,16 @@ static int host2guc_logbuffer_flush_complete(struct intel_guc *guc) > return host2guc_action(guc, data, 1); > } > > +static int host2guc_force_logbuffer_flush(struct intel_guc *guc) > +{ > + u32 data[2]; > + > + data[0] = HOST2GUC_ACTION_FORCE_LOG_BUFFER_FLUSH; > + data[1] = 0; > + > + return host2guc_action(guc, data, 2); > +} > + > /* > * Initialise, update, or clear doorbell data shared with the GuC > * > @@ -1404,6 +1414,28 @@ void i915_guc_capture_logs(struct drm_device *dev) > intel_runtime_pm_put(dev_priv); > } > > +void i915_guc_capture_logs_on_reset(struct drm_device *dev) > +{ > + struct drm_i915_private *dev_priv = dev->dev_private; > + > + mutex_lock(&dev->struct_mutex); Not sure what are the repercussion of taking the mutex on the i915_reset_and_wakeup and path (error capture, hangcheck, dont' know this area well). Check with Chris and Mika I suppose (cc-ed)? > + > + if (!i915.enable_guc_submission || (i915.guc_log_level < 0)) It would be cool if guc_log_level could be guaranteed to be disabled when guc submission is not on. I think it would be more maintainable. > + goto end; > + > + /* First disable the interrupts, will be renabled after reset */ > + gen9_disable_guc_interrupts(dev_priv); > + > + /* Ask GuC to update the log buffer state */ > + host2guc_force_logbuffer_flush(&dev_priv->guc); > + > + /* GuC would have updated the log buffer by now, so capture it */ > + i915_guc_capture_logs(dev); > + > +end: > + mutex_unlock(&dev->struct_mutex); > +} > + > void i915_guc_unregister(struct drm_device *dev) > { > if (!i915.enable_guc_submission) > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index f90d3c6..bdd7a67 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -2640,6 +2640,8 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv) > */ > intel_runtime_pm_get(dev_priv); > > + i915_guc_capture_logs_on_reset(&dev_priv->drm); > + > intel_prepare_reset(dev_priv); > > /* > diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h > index a784cf8..ed773b5 100644 > --- a/drivers/gpu/drm/i915/intel_guc.h > +++ b/drivers/gpu/drm/i915/intel_guc.h > @@ -175,6 +175,7 @@ int i915_guc_submit(struct drm_i915_gem_request *rq); > void i915_guc_submission_disable(struct drm_i915_private *dev_priv); > void i915_guc_submission_fini(struct drm_i915_private *dev_priv); > void i915_guc_capture_logs(struct drm_device *dev); > +void i915_guc_capture_logs_on_reset(struct drm_device *dev); > void i915_guc_register(struct drm_device *dev); > void i915_guc_unregister(struct drm_device *dev); > > Regards, Tvrtko
On Tue, Jul 19, 2016 at 12:12:20PM +0100, Tvrtko Ursulin wrote: > > On 10/07/16 14:41, akash.goel@intel.com wrote: > >From: Sagar Arun Kamble <sagar.a.kamble@intel.com> > > > >If GuC logs are being captured, there should be a force log buffer flush > >action sent to GuC before proceeding with GPU reset and re-initializing > >GUC. Those logs would be useful to understand why the GPU reset was > >initiated. > > > >v2: Rebase. > > > >Signed-off-by: Sagar Arun Kamble <sagar.a.kamble@intel.com> > >Signed-off-by: Akash Goel <akash.goel@intel.com> > >--- > > drivers/gpu/drm/i915/i915_guc_submission.c | 32 ++++++++++++++++++++++++++++++ > > drivers/gpu/drm/i915/i915_irq.c | 2 ++ > > drivers/gpu/drm/i915/intel_guc.h | 1 + > > 3 files changed, 35 insertions(+) > > > >diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c > >index 9b436fa..8cc31c6 100644 > >--- a/drivers/gpu/drm/i915/i915_guc_submission.c > >+++ b/drivers/gpu/drm/i915/i915_guc_submission.c > >@@ -183,6 +183,16 @@ static int host2guc_logbuffer_flush_complete(struct intel_guc *guc) > > return host2guc_action(guc, data, 1); > > } > > > >+static int host2guc_force_logbuffer_flush(struct intel_guc *guc) > >+{ > >+ u32 data[2]; > >+ > >+ data[0] = HOST2GUC_ACTION_FORCE_LOG_BUFFER_FLUSH; > >+ data[1] = 0; > >+ > >+ return host2guc_action(guc, data, 2); > >+} > >+ > > /* > > * Initialise, update, or clear doorbell data shared with the GuC > > * > >@@ -1404,6 +1414,28 @@ void i915_guc_capture_logs(struct drm_device *dev) > > intel_runtime_pm_put(dev_priv); > > } > > > >+void i915_guc_capture_logs_on_reset(struct drm_device *dev) > >+{ > >+ struct drm_i915_private *dev_priv = dev->dev_private; > >+ > >+ mutex_lock(&dev->struct_mutex); > > Not sure what are the repercussion of taking the mutex on the > i915_reset_and_wakeup and path (error capture, hangcheck, dont' know > this area well). Check with Chris and Mika I suppose (cc-ed)? Flat out invalid to take struct_mutex on the error capture path, or any lock at all really (just in case of driver bugs). Consider it to be an atomic context that may preempt the driver at any point. -Chris
On 7/19/2016 4:51 PM, Chris Wilson wrote: > On Tue, Jul 19, 2016 at 12:12:20PM +0100, Tvrtko Ursulin wrote: >> >> On 10/07/16 14:41, akash.goel@intel.com wrote: >>> From: Sagar Arun Kamble <sagar.a.kamble@intel.com> >>> >>> If GuC logs are being captured, there should be a force log buffer flush >>> action sent to GuC before proceeding with GPU reset and re-initializing >>> GUC. Those logs would be useful to understand why the GPU reset was >>> initiated. >>> >>> v2: Rebase. >>> >>> Signed-off-by: Sagar Arun Kamble <sagar.a.kamble@intel.com> >>> Signed-off-by: Akash Goel <akash.goel@intel.com> >>> --- >>> drivers/gpu/drm/i915/i915_guc_submission.c | 32 ++++++++++++++++++++++++++++++ >>> drivers/gpu/drm/i915/i915_irq.c | 2 ++ >>> drivers/gpu/drm/i915/intel_guc.h | 1 + >>> 3 files changed, 35 insertions(+) >>> >>> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c >>> index 9b436fa..8cc31c6 100644 >>> --- a/drivers/gpu/drm/i915/i915_guc_submission.c >>> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c >>> @@ -183,6 +183,16 @@ static int host2guc_logbuffer_flush_complete(struct intel_guc *guc) >>> return host2guc_action(guc, data, 1); >>> } >>> >>> +static int host2guc_force_logbuffer_flush(struct intel_guc *guc) >>> +{ >>> + u32 data[2]; >>> + >>> + data[0] = HOST2GUC_ACTION_FORCE_LOG_BUFFER_FLUSH; >>> + data[1] = 0; >>> + >>> + return host2guc_action(guc, data, 2); >>> +} >>> + >>> /* >>> * Initialise, update, or clear doorbell data shared with the GuC >>> * >>> @@ -1404,6 +1414,28 @@ void i915_guc_capture_logs(struct drm_device *dev) >>> intel_runtime_pm_put(dev_priv); >>> } >>> >>> +void i915_guc_capture_logs_on_reset(struct drm_device *dev) >>> +{ >>> + struct drm_i915_private *dev_priv = dev->dev_private; >>> + >>> + mutex_lock(&dev->struct_mutex); >> >> Not sure what are the repercussion of taking the mutex on the >> i915_reset_and_wakeup and path (error capture, hangcheck, dont' know >> this area well). Check with Chris and Mika I suppose (cc-ed)? > Took the struct_mutex, just to avoid a very remote possibility where i915_guc_capture_logs_on_reset & debugfs function i915_guc_log_control executes concurrently. > Flat out invalid to take struct_mutex on the error capture path, or any > lock at all really (just in case of driver bugs). Consider it to be an > atomic context that may preempt the driver at any point. Actually I see that i915_reset() too takes the struct_mutex right at the beginning and I have plugged the call to i915_guc_capture_logs_on_reset() just before that. Also it is being called after i915_error_wake_up(), so any client waiting on a request would have backed off and any new attempt by clients to lock the struct_mutex should see i915_reset_in_progress as true. Best regards Akash > -Chris >
On Wed, Jul 20, 2016 at 09:51:45AM +0530, Goel, Akash wrote: > > > On 7/19/2016 4:51 PM, Chris Wilson wrote: > >On Tue, Jul 19, 2016 at 12:12:20PM +0100, Tvrtko Ursulin wrote: > >> > >>On 10/07/16 14:41, akash.goel@intel.com wrote: > >>>From: Sagar Arun Kamble <sagar.a.kamble@intel.com> > >>> > >>>If GuC logs are being captured, there should be a force log buffer flush > >>>action sent to GuC before proceeding with GPU reset and re-initializing > >>>GUC. Those logs would be useful to understand why the GPU reset was > >>>initiated. > >>> > >>>v2: Rebase. > >>> > >>>Signed-off-by: Sagar Arun Kamble <sagar.a.kamble@intel.com> > >>>Signed-off-by: Akash Goel <akash.goel@intel.com> > >>>--- > >>> drivers/gpu/drm/i915/i915_guc_submission.c | 32 ++++++++++++++++++++++++++++++ > >>> drivers/gpu/drm/i915/i915_irq.c | 2 ++ > >>> drivers/gpu/drm/i915/intel_guc.h | 1 + > >>> 3 files changed, 35 insertions(+) > >>> > >>>diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c > >>>index 9b436fa..8cc31c6 100644 > >>>--- a/drivers/gpu/drm/i915/i915_guc_submission.c > >>>+++ b/drivers/gpu/drm/i915/i915_guc_submission.c > >>>@@ -183,6 +183,16 @@ static int host2guc_logbuffer_flush_complete(struct intel_guc *guc) > >>> return host2guc_action(guc, data, 1); > >>> } > >>> > >>>+static int host2guc_force_logbuffer_flush(struct intel_guc *guc) > >>>+{ > >>>+ u32 data[2]; > >>>+ > >>>+ data[0] = HOST2GUC_ACTION_FORCE_LOG_BUFFER_FLUSH; > >>>+ data[1] = 0; > >>>+ > >>>+ return host2guc_action(guc, data, 2); > >>>+} > >>>+ > >>> /* > >>> * Initialise, update, or clear doorbell data shared with the GuC > >>> * > >>>@@ -1404,6 +1414,28 @@ void i915_guc_capture_logs(struct drm_device *dev) > >>> intel_runtime_pm_put(dev_priv); > >>> } > >>> > >>>+void i915_guc_capture_logs_on_reset(struct drm_device *dev) > >>>+{ > >>>+ struct drm_i915_private *dev_priv = dev->dev_private; > >>>+ > >>>+ mutex_lock(&dev->struct_mutex); > >> > >>Not sure what are the repercussion of taking the mutex on the > >>i915_reset_and_wakeup and path (error capture, hangcheck, dont' know > >>this area well). Check with Chris and Mika I suppose (cc-ed)? > > > > Took the struct_mutex, just to avoid a very remote possibility where > i915_guc_capture_logs_on_reset & debugfs function > i915_guc_log_control executes concurrently. > > >Flat out invalid to take struct_mutex on the error capture path, or any > >lock at all really (just in case of driver bugs). Consider it to be an > >atomic context that may preempt the driver at any point. > > Actually I see that i915_reset() too takes the struct_mutex right at > the beginning and I have plugged the call to > i915_guc_capture_logs_on_reset() just before that. Postmortem state is captured from i915_capture_error_state(), and as I recall one of the raison d'etre for this facility was to include the guc log in the error state. -Chris
On 7/20/2016 2:42 PM, Chris Wilson wrote: > On Wed, Jul 20, 2016 at 09:51:45AM +0530, Goel, Akash wrote: >> >> >> On 7/19/2016 4:51 PM, Chris Wilson wrote: >>> On Tue, Jul 19, 2016 at 12:12:20PM +0100, Tvrtko Ursulin wrote: >>>> >>>> On 10/07/16 14:41, akash.goel@intel.com wrote: >>>>> From: Sagar Arun Kamble <sagar.a.kamble@intel.com> >>>>> >>>>> If GuC logs are being captured, there should be a force log buffer flush >>>>> action sent to GuC before proceeding with GPU reset and re-initializing >>>>> GUC. Those logs would be useful to understand why the GPU reset was >>>>> initiated. >>>>> >>>>> v2: Rebase. >>>>> >>>>> Signed-off-by: Sagar Arun Kamble <sagar.a.kamble@intel.com> >>>>> Signed-off-by: Akash Goel <akash.goel@intel.com> >>>>> --- >>>>> drivers/gpu/drm/i915/i915_guc_submission.c | 32 ++++++++++++++++++++++++++++++ >>>>> drivers/gpu/drm/i915/i915_irq.c | 2 ++ >>>>> drivers/gpu/drm/i915/intel_guc.h | 1 + >>>>> 3 files changed, 35 insertions(+) >>>>> >>>>> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c >>>>> index 9b436fa..8cc31c6 100644 >>>>> --- a/drivers/gpu/drm/i915/i915_guc_submission.c >>>>> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c >>>>> @@ -183,6 +183,16 @@ static int host2guc_logbuffer_flush_complete(struct intel_guc *guc) >>>>> return host2guc_action(guc, data, 1); >>>>> } >>>>> >>>>> +static int host2guc_force_logbuffer_flush(struct intel_guc *guc) >>>>> +{ >>>>> + u32 data[2]; >>>>> + >>>>> + data[0] = HOST2GUC_ACTION_FORCE_LOG_BUFFER_FLUSH; >>>>> + data[1] = 0; >>>>> + >>>>> + return host2guc_action(guc, data, 2); >>>>> +} >>>>> + >>>>> /* >>>>> * Initialise, update, or clear doorbell data shared with the GuC >>>>> * >>>>> @@ -1404,6 +1414,28 @@ void i915_guc_capture_logs(struct drm_device *dev) >>>>> intel_runtime_pm_put(dev_priv); >>>>> } >>>>> >>>>> +void i915_guc_capture_logs_on_reset(struct drm_device *dev) >>>>> +{ >>>>> + struct drm_i915_private *dev_priv = dev->dev_private; >>>>> + >>>>> + mutex_lock(&dev->struct_mutex); >>>> >>>> Not sure what are the repercussion of taking the mutex on the >>>> i915_reset_and_wakeup and path (error capture, hangcheck, dont' know >>>> this area well). Check with Chris and Mika I suppose (cc-ed)? >>> >> >> Took the struct_mutex, just to avoid a very remote possibility where >> i915_guc_capture_logs_on_reset & debugfs function >> i915_guc_log_control executes concurrently. >> >>> Flat out invalid to take struct_mutex on the error capture path, or any >>> lock at all really (just in case of driver bugs). Consider it to be an >>> atomic context that may preempt the driver at any point. >> >> Actually I see that i915_reset() too takes the struct_mutex right at >> the beginning and I have plugged the call to >> i915_guc_capture_logs_on_reset() just before that. > > Postmortem state is captured from i915_capture_error_state(), and as I > recall one of the raison d'etre for this facility was to include the guc > log in the error state. Sorry I missed augmenting the error state with guc firmware logs. For that also a prior flush will be needed, will do the flush without acquiring the struct_mutex. Best regards Akash > -Chris >
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c index 9b436fa..8cc31c6 100644 --- a/drivers/gpu/drm/i915/i915_guc_submission.c +++ b/drivers/gpu/drm/i915/i915_guc_submission.c @@ -183,6 +183,16 @@ static int host2guc_logbuffer_flush_complete(struct intel_guc *guc) return host2guc_action(guc, data, 1); } +static int host2guc_force_logbuffer_flush(struct intel_guc *guc) +{ + u32 data[2]; + + data[0] = HOST2GUC_ACTION_FORCE_LOG_BUFFER_FLUSH; + data[1] = 0; + + return host2guc_action(guc, data, 2); +} + /* * Initialise, update, or clear doorbell data shared with the GuC * @@ -1404,6 +1414,28 @@ void i915_guc_capture_logs(struct drm_device *dev) intel_runtime_pm_put(dev_priv); } +void i915_guc_capture_logs_on_reset(struct drm_device *dev) +{ + struct drm_i915_private *dev_priv = dev->dev_private; + + mutex_lock(&dev->struct_mutex); + + if (!i915.enable_guc_submission || (i915.guc_log_level < 0)) + goto end; + + /* First disable the interrupts, will be renabled after reset */ + gen9_disable_guc_interrupts(dev_priv); + + /* Ask GuC to update the log buffer state */ + host2guc_force_logbuffer_flush(&dev_priv->guc); + + /* GuC would have updated the log buffer by now, so capture it */ + i915_guc_capture_logs(dev); + +end: + mutex_unlock(&dev->struct_mutex); +} + void i915_guc_unregister(struct drm_device *dev) { if (!i915.enable_guc_submission) diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index f90d3c6..bdd7a67 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -2640,6 +2640,8 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv) */ intel_runtime_pm_get(dev_priv); + i915_guc_capture_logs_on_reset(&dev_priv->drm); + intel_prepare_reset(dev_priv); /* diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h index a784cf8..ed773b5 100644 --- a/drivers/gpu/drm/i915/intel_guc.h +++ b/drivers/gpu/drm/i915/intel_guc.h @@ -175,6 +175,7 @@ int i915_guc_submit(struct drm_i915_gem_request *rq); void i915_guc_submission_disable(struct drm_i915_private *dev_priv); void i915_guc_submission_fini(struct drm_i915_private *dev_priv); void i915_guc_capture_logs(struct drm_device *dev); +void i915_guc_capture_logs_on_reset(struct drm_device *dev); void i915_guc_register(struct drm_device *dev); void i915_guc_unregister(struct drm_device *dev);