diff mbox series

[v13,4/7] drm/i915: No TLB invalidation on suspended GT

Message ID 20231012223826.2556700-5-jonathan.cavitt@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915: Define and use GuC and CTB TLB invalidation routines | expand

Commit Message

Cavitt, Jonathan Oct. 12, 2023, 10:38 p.m. UTC
In case of GT is suspended, don't allow submission of new TLB invalidation
request and cancel all pending requests. The TLB entries will be
invalidated either during GuC reload or on system resume.

Signed-off-by: Fei Yang <fei.yang@intel.com>
Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
CC: John Harrison <john.c.harrison@intel.com>
Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Acked-by: Nirmoy Das <nirmoy.das@intel.com>
---
 drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  1 +
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 22 ++++++++++++-------
 drivers/gpu/drm/i915/gt/uc/intel_uc.c         |  7 ++++++
 3 files changed, 22 insertions(+), 8 deletions(-)

Comments

John Harrison Oct. 13, 2023, 1:08 a.m. UTC | #1
On 10/12/2023 15:38, Jonathan Cavitt wrote:
> In case of GT is suspended, don't allow submission of new TLB invalidation
> request and cancel all pending requests. The TLB entries will be
> invalidated either during GuC reload or on system resume.
>
> Signed-off-by: Fei Yang <fei.yang@intel.com>
> Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> CC: John Harrison <john.c.harrison@intel.com>
> Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com>
> Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Acked-by: Nirmoy Das <nirmoy.das@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  1 +
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 22 ++++++++++++-------
>   drivers/gpu/drm/i915/gt/uc/intel_uc.c         |  7 ++++++
>   3 files changed, 22 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> index 0949628d69f8b..2b6dfe62c8f2a 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> @@ -537,4 +537,5 @@ int intel_guc_invalidate_tlb_engines(struct intel_guc *guc);
>   int intel_guc_invalidate_tlb_guc(struct intel_guc *guc);
>   int intel_guc_tlb_invalidation_done(struct intel_guc *guc,
>   				    const u32 *payload, u32 len);
> +void wake_up_all_tlb_invalidate(struct intel_guc *guc);
>   #endif
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 1377398afcdfa..3a0d20064878a 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -1796,13 +1796,24 @@ static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t st
>   	intel_context_put(parent);
>   }
>   
> -void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stalled)
> +void wake_up_all_tlb_invalidate(struct intel_guc *guc)
>   {
>   	struct intel_guc_tlb_wait *wait;
> +	unsigned long i;
> +
> +	if (HAS_GUC_TLB_INVALIDATION(guc_to_gt(guc)->i915)) {
Why the change from 'if(!is_available) return' to 'if(HAS_) {doStuff}'?

John.

> +		xa_lock_irq(&guc->tlb_lookup);
> +		xa_for_each(&guc->tlb_lookup, i, wait)
> +			wake_up(&wait->wq);
> +		xa_unlock_irq(&guc->tlb_lookup);
> +	}
> +}
> +
> +void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stalled)
> +{
>   	struct intel_context *ce;
>   	unsigned long index;
>   	unsigned long flags;
> -	unsigned long i;
>   
>   	if (unlikely(!guc_submission_initialized(guc))) {
>   		/* Reset called during driver load? GuC not yet initialised! */
> @@ -1833,12 +1844,7 @@ void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stall
>   	 * The full GT reset will have cleared the TLB caches and flushed the
>   	 * G2H message queue; we can release all the blocked waiters.
>   	 */
> -	if (HAS_GUC_TLB_INVALIDATION(guc_to_gt(guc)->i915)) {
> -		xa_lock_irq(&guc->tlb_lookup);
> -		xa_for_each(&guc->tlb_lookup, i, wait)
> -			wake_up(&wait->wq);
> -		xa_unlock_irq(&guc->tlb_lookup);
> -	}
> +	wake_up_all_tlb_invalidate(guc);
>   }
>   
>   static void guc_cancel_context_requests(struct intel_context *ce)
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> index 98b103375b7ab..27f6561dd7319 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> @@ -688,6 +688,8 @@ void intel_uc_suspend(struct intel_uc *uc)
>   	/* flush the GSC worker */
>   	intel_gsc_uc_flush_work(&uc->gsc);
>   
> +	wake_up_all_tlb_invalidate(guc);
> +
>   	if (!intel_guc_is_ready(guc)) {
>   		guc->interrupts.enabled = false;
>   		return;
> @@ -736,6 +738,11 @@ static int __uc_resume(struct intel_uc *uc, bool enable_communication)
>   
>   	intel_gsc_uc_resume(&uc->gsc);
>   
> +	if (intel_guc_tlb_invalidation_is_available(guc)) {
> +		intel_guc_invalidate_tlb_engines(guc);
> +		intel_guc_invalidate_tlb_guc(guc);
> +	}
> +
>   	return 0;
>   }
>
Andi Shyti Oct. 13, 2023, 10:49 a.m. UTC | #2
Hi John,

On Thu, Oct 12, 2023 at 06:08:11PM -0700, John Harrison wrote:
> On 10/12/2023 15:38, Jonathan Cavitt wrote:
> > In case of GT is suspended, don't allow submission of new TLB invalidation
> > request and cancel all pending requests. The TLB entries will be
> > invalidated either during GuC reload or on system resume.
> > 
> > Signed-off-by: Fei Yang <fei.yang@intel.com>
> > Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> > CC: John Harrison <john.c.harrison@intel.com>
> > Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com>
> > Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > Acked-by: Nirmoy Das <nirmoy.das@intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  1 +
> >   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 22 ++++++++++++-------
> >   drivers/gpu/drm/i915/gt/uc/intel_uc.c         |  7 ++++++
> >   3 files changed, 22 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > index 0949628d69f8b..2b6dfe62c8f2a 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > @@ -537,4 +537,5 @@ int intel_guc_invalidate_tlb_engines(struct intel_guc *guc);
> >   int intel_guc_invalidate_tlb_guc(struct intel_guc *guc);
> >   int intel_guc_tlb_invalidation_done(struct intel_guc *guc,
> >   				    const u32 *payload, u32 len);
> > +void wake_up_all_tlb_invalidate(struct intel_guc *guc);
> >   #endif
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > index 1377398afcdfa..3a0d20064878a 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > @@ -1796,13 +1796,24 @@ static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t st
> >   	intel_context_put(parent);
> >   }
> > -void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stalled)
> > +void wake_up_all_tlb_invalidate(struct intel_guc *guc)
> >   {
> >   	struct intel_guc_tlb_wait *wait;
> > +	unsigned long i;
> > +
> > +	if (HAS_GUC_TLB_INVALIDATION(guc_to_gt(guc)->i915)) {
> Why the change from 'if(!is_available) return' to 'if(HAS_) {doStuff}'?

I believe the inversion of the logic is purely for aesthetic (I
preferred it how it was before so to save some level of
indentation, but this is pure taste).

While as for the statement, this was changed because at the
moment this function is called after reset, not all the
components have completed their reset. E.g., the GT is still in
the middle of it.

Another way to improve this is to call
wake_up_all_tlb_invalidate() at the end of gt_reset() in order to
clear up the queue of requests.

Andi
Cavitt, Jonathan Oct. 13, 2023, 2:42 p.m. UTC | #3
-----Original Message-----
From: Harrison, John C <john.c.harrison@intel.com> 
Sent: Thursday, October 12, 2023 6:08 PM
To: Cavitt, Jonathan <jonathan.cavitt@intel.com>; intel-gfx@lists.freedesktop.org
Cc: Gupta, saurabhg <saurabhg.gupta@intel.com>; chris.p.wilson@linux.intel.com; Iddamsetty, Aravind <aravind.iddamsetty@intel.com>; Yang, Fei <fei.yang@intel.com>; Shyti, Andi <andi.shyti@intel.com>; Das, Nirmoy <nirmoy.das@intel.com>; Krzysztofik, Janusz <janusz.krzysztofik@intel.com>; Roper, Matthew D <matthew.d.roper@intel.com>; tvrtko.ursulin@linux.intel.com; jani.nikula@linux.intel.com
Subject: Re: [PATCH v13 4/7] drm/i915: No TLB invalidation on suspended GT
> 
> On 10/12/2023 15:38, Jonathan Cavitt wrote:
> > In case of GT is suspended, don't allow submission of new TLB invalidation
> > request and cancel all pending requests. The TLB entries will be
> > invalidated either during GuC reload or on system resume.
> >
> > Signed-off-by: Fei Yang <fei.yang@intel.com>
> > Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
> > CC: John Harrison <john.c.harrison@intel.com>
> > Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com>
> > Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > Acked-by: Nirmoy Das <nirmoy.das@intel.com>
> > ---
> >   drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  1 +
> >   .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 22 ++++++++++++-------
> >   drivers/gpu/drm/i915/gt/uc/intel_uc.c         |  7 ++++++
> >   3 files changed, 22 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > index 0949628d69f8b..2b6dfe62c8f2a 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
> > @@ -537,4 +537,5 @@ int intel_guc_invalidate_tlb_engines(struct intel_guc *guc);
> >   int intel_guc_invalidate_tlb_guc(struct intel_guc *guc);
> >   int intel_guc_tlb_invalidation_done(struct intel_guc *guc,
> >   				    const u32 *payload, u32 len);
> > +void wake_up_all_tlb_invalidate(struct intel_guc *guc);
> >   #endif
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > index 1377398afcdfa..3a0d20064878a 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> > @@ -1796,13 +1796,24 @@ static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t st
> >   	intel_context_put(parent);
> >   }
> >   
> > -void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stalled)
> > +void wake_up_all_tlb_invalidate(struct intel_guc *guc)
> >   {
> >   	struct intel_guc_tlb_wait *wait;
> > +	unsigned long i;
> > +
> > +	if (HAS_GUC_TLB_INVALIDATION(guc_to_gt(guc)->i915)) {
> Why the change from 'if(!is_available) return' to 'if(HAS_) {doStuff}'?


I feel like this question has two parts, so I'll answer them separately:

1. Why HAS_GUC_TLB_INVALIDATION and not intel_guc_tlb_invalidation_is_available?

Wake_up_all_tlb_invalidate is called during the suspend/resume path, specifically in the
middle of suspend.  It's required for it to be called here to clean up any invalidations left
in the queue during the suspend/resume phase because they are no longer valid requests.
However, the suspend/resume phase also resets GuC, so intel_guc_is_ready returns false.
In short, using intel_guc_invalidation_is_available was causing us to skip this code section
incorrectly, resulting in spurious GuC TLB invalidation timeout errors during gt reset.


2. Why use a positive check to perform and not a negative check to skip?

In patch 3, wake_up_all_tlb_invalidate was originally called universally on all platforms
during intel_guc_submission_reset, which is incorrect and not how was reimplemented here.
I discovered this was the case and retroactively corrected it, as seen below.
Because of how intel_guc_submission_reset is structured, a negative check to skip wouldn't
make much sense there, so I used a positive check to perform instead.  This is a holdover from
that implementation, and was kept to maintain consistency between patches 3 and 4.  It's
probably not as big of a deal as I'm imagining, but I think it would be awkward if the initial
implementation in intel_guc_submission_reset and the reimplementation in
wake_up_all_tlb_invalidate weren't superficially the same, even if they were functionally
equivalent otherwise.


-Jonathan Cavitt


> 
> John.
> 
> > +		xa_lock_irq(&guc->tlb_lookup);
> > +		xa_for_each(&guc->tlb_lookup, i, wait)
> > +			wake_up(&wait->wq);
> > +		xa_unlock_irq(&guc->tlb_lookup);
> > +	}
> > +}
> > +
> > +void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stalled)
> > +{
> >   	struct intel_context *ce;
> >   	unsigned long index;
> >   	unsigned long flags;
> > -	unsigned long i;
> >   
> >   	if (unlikely(!guc_submission_initialized(guc))) {
> >   		/* Reset called during driver load? GuC not yet initialised! */
> > @@ -1833,12 +1844,7 @@ void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stall
> >   	 * The full GT reset will have cleared the TLB caches and flushed the
> >   	 * G2H message queue; we can release all the blocked waiters.
> >   	 */
> > -	if (HAS_GUC_TLB_INVALIDATION(guc_to_gt(guc)->i915)) {
> > -		xa_lock_irq(&guc->tlb_lookup);
> > -		xa_for_each(&guc->tlb_lookup, i, wait)
> > -			wake_up(&wait->wq);
> > -		xa_unlock_irq(&guc->tlb_lookup);
> > -	}
> > +	wake_up_all_tlb_invalidate(guc);
> >   }
> >   
> >   static void guc_cancel_context_requests(struct intel_context *ce)
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > index 98b103375b7ab..27f6561dd7319 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > @@ -688,6 +688,8 @@ void intel_uc_suspend(struct intel_uc *uc)
> >   	/* flush the GSC worker */
> >   	intel_gsc_uc_flush_work(&uc->gsc);
> >   
> > +	wake_up_all_tlb_invalidate(guc);
> > +
> >   	if (!intel_guc_is_ready(guc)) {
> >   		guc->interrupts.enabled = false;
> >   		return;
> > @@ -736,6 +738,11 @@ static int __uc_resume(struct intel_uc *uc, bool enable_communication)
> >   
> >   	intel_gsc_uc_resume(&uc->gsc);
> >   
> > +	if (intel_guc_tlb_invalidation_is_available(guc)) {
> > +		intel_guc_invalidate_tlb_engines(guc);
> > +		intel_guc_invalidate_tlb_guc(guc);
> > +	}
> > +
> >   	return 0;
> >   }
> >   
> 
>
John Harrison Oct. 13, 2023, 7:12 p.m. UTC | #4
On 10/13/2023 07:42, Cavitt, Jonathan wrote:
> -----Original Message-----
> From: Harrison, John C <john.c.harrison@intel.com>
> Sent: Thursday, October 12, 2023 6:08 PM
> To: Cavitt, Jonathan <jonathan.cavitt@intel.com>; intel-gfx@lists.freedesktop.org
> Cc: Gupta, saurabhg <saurabhg.gupta@intel.com>; chris.p.wilson@linux.intel.com; Iddamsetty, Aravind <aravind.iddamsetty@intel.com>; Yang, Fei <fei.yang@intel.com>; Shyti, Andi <andi.shyti@intel.com>; Das, Nirmoy <nirmoy.das@intel.com>; Krzysztofik, Janusz <janusz.krzysztofik@intel.com>; Roper, Matthew D <matthew.d.roper@intel.com>; tvrtko.ursulin@linux.intel.com; jani.nikula@linux.intel.com
> Subject: Re: [PATCH v13 4/7] drm/i915: No TLB invalidation on suspended GT
>> On 10/12/2023 15:38, Jonathan Cavitt wrote:
>>> In case of GT is suspended, don't allow submission of new TLB invalidation
>>> request and cancel all pending requests. The TLB entries will be
>>> invalidated either during GuC reload or on system resume.
>>>
>>> Signed-off-by: Fei Yang <fei.yang@intel.com>
>>> Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
>>> CC: John Harrison <john.c.harrison@intel.com>
>>> Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com>
>>> Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> Acked-by: Nirmoy Das <nirmoy.das@intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  1 +
>>>    .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 22 ++++++++++++-------
>>>    drivers/gpu/drm/i915/gt/uc/intel_uc.c         |  7 ++++++
>>>    3 files changed, 22 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>> index 0949628d69f8b..2b6dfe62c8f2a 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>> @@ -537,4 +537,5 @@ int intel_guc_invalidate_tlb_engines(struct intel_guc *guc);
>>>    int intel_guc_invalidate_tlb_guc(struct intel_guc *guc);
>>>    int intel_guc_tlb_invalidation_done(struct intel_guc *guc,
>>>    				    const u32 *payload, u32 len);
>>> +void wake_up_all_tlb_invalidate(struct intel_guc *guc);
>>>    #endif
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> index 1377398afcdfa..3a0d20064878a 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>> @@ -1796,13 +1796,24 @@ static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t st
>>>    	intel_context_put(parent);
>>>    }
>>>    
>>> -void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stalled)
>>> +void wake_up_all_tlb_invalidate(struct intel_guc *guc)
>>>    {
>>>    	struct intel_guc_tlb_wait *wait;
>>> +	unsigned long i;
>>> +
>>> +	if (HAS_GUC_TLB_INVALIDATION(guc_to_gt(guc)->i915)) {
>> Why the change from 'if(!is_available) return' to 'if(HAS_) {doStuff}'?
>
> I feel like this question has two parts, so I'll answer them separately:
>
> 1. Why HAS_GUC_TLB_INVALIDATION and not intel_guc_tlb_invalidation_is_available?
>
> Wake_up_all_tlb_invalidate is called during the suspend/resume path, specifically in the
> middle of suspend.  It's required for it to be called here to clean up any invalidations left
> in the queue during the suspend/resume phase because they are no longer valid requests.
> However, the suspend/resume phase also resets GuC, so intel_guc_is_ready returns false.
> In short, using intel_guc_invalidation_is_available was causing us to skip this code section
> incorrectly, resulting in spurious GuC TLB invalidation timeout errors during gt reset.
I'm not following this argument. If a reset is occurring then there is 
no need to issue the invalidate. And the previous version was skipping 
if GuC is in reset but this version does not. Which means it is now 
sending invalidate requests to GuC when GuC is not able to respond and 
therefore more likely to cause timeout errors not less likely.

>
>
> 2. Why use a positive check to perform and not a negative check to skip?
>
> In patch 3, wake_up_all_tlb_invalidate was originally called universally on all platforms
> during intel_guc_submission_reset, which is incorrect and not how was reimplemented here.
> I discovered this was the case and retroactively corrected it, as seen below.
> Because of how intel_guc_submission_reset is structured, a negative check to skip wouldn't
> make much sense there, so I used a positive check to perform instead.  This is a holdover from
> that implementation, and was kept to maintain consistency between patches 3 and 4.  It's
> probably not as big of a deal as I'm imagining, but I think it would be awkward if the initial
> implementation in intel_guc_submission_reset and the reimplementation in
> wake_up_all_tlb_invalidate weren't superficially the same, even if they were functionally
> equivalent otherwise.
I would argue that a bunch of early exit conditions at the start of a 
function is easy to read and maintain than adding nesting levels to the 
entire function.

John.


>
>
> -Jonathan Cavitt
>
>
>> John.
>>
>>> +		xa_lock_irq(&guc->tlb_lookup);
>>> +		xa_for_each(&guc->tlb_lookup, i, wait)
>>> +			wake_up(&wait->wq);
>>> +		xa_unlock_irq(&guc->tlb_lookup);
>>> +	}
>>> +}
>>> +
>>> +void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stalled)
>>> +{
>>>    	struct intel_context *ce;
>>>    	unsigned long index;
>>>    	unsigned long flags;
>>> -	unsigned long i;
>>>    
>>>    	if (unlikely(!guc_submission_initialized(guc))) {
>>>    		/* Reset called during driver load? GuC not yet initialised! */
>>> @@ -1833,12 +1844,7 @@ void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stall
>>>    	 * The full GT reset will have cleared the TLB caches and flushed the
>>>    	 * G2H message queue; we can release all the blocked waiters.
>>>    	 */
>>> -	if (HAS_GUC_TLB_INVALIDATION(guc_to_gt(guc)->i915)) {
>>> -		xa_lock_irq(&guc->tlb_lookup);
>>> -		xa_for_each(&guc->tlb_lookup, i, wait)
>>> -			wake_up(&wait->wq);
>>> -		xa_unlock_irq(&guc->tlb_lookup);
>>> -	}
>>> +	wake_up_all_tlb_invalidate(guc);
>>>    }
>>>    
>>>    static void guc_cancel_context_requests(struct intel_context *ce)
>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>>> index 98b103375b7ab..27f6561dd7319 100644
>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>>> @@ -688,6 +688,8 @@ void intel_uc_suspend(struct intel_uc *uc)
>>>    	/* flush the GSC worker */
>>>    	intel_gsc_uc_flush_work(&uc->gsc);
>>>    
>>> +	wake_up_all_tlb_invalidate(guc);
>>> +
>>>    	if (!intel_guc_is_ready(guc)) {
>>>    		guc->interrupts.enabled = false;
>>>    		return;
>>> @@ -736,6 +738,11 @@ static int __uc_resume(struct intel_uc *uc, bool enable_communication)
>>>    
>>>    	intel_gsc_uc_resume(&uc->gsc);
>>>    
>>> +	if (intel_guc_tlb_invalidation_is_available(guc)) {
>>> +		intel_guc_invalidate_tlb_engines(guc);
>>> +		intel_guc_invalidate_tlb_guc(guc);
>>> +	}
>>> +
>>>    	return 0;
>>>    }
>>>    
>>
John Harrison Oct. 13, 2023, 7:14 p.m. UTC | #5
On 10/13/2023 12:12, John Harrison wrote:
> On 10/13/2023 07:42, Cavitt, Jonathan wrote:
>> -----Original Message-----
>> From: Harrison, John C <john.c.harrison@intel.com>
>> Sent: Thursday, October 12, 2023 6:08 PM
>> To: Cavitt, Jonathan <jonathan.cavitt@intel.com>; 
>> intel-gfx@lists.freedesktop.org
>> Cc: Gupta, saurabhg <saurabhg.gupta@intel.com>; 
>> chris.p.wilson@linux.intel.com; Iddamsetty, Aravind 
>> <aravind.iddamsetty@intel.com>; Yang, Fei <fei.yang@intel.com>; 
>> Shyti, Andi <andi.shyti@intel.com>; Das, Nirmoy 
>> <nirmoy.das@intel.com>; Krzysztofik, Janusz 
>> <janusz.krzysztofik@intel.com>; Roper, Matthew D 
>> <matthew.d.roper@intel.com>; tvrtko.ursulin@linux.intel.com; 
>> jani.nikula@linux.intel.com
>> Subject: Re: [PATCH v13 4/7] drm/i915: No TLB invalidation on 
>> suspended GT
>>> On 10/12/2023 15:38, Jonathan Cavitt wrote:
>>>> In case of GT is suspended, don't allow submission of new TLB 
>>>> invalidation
>>>> request and cancel all pending requests. The TLB entries will be
>>>> invalidated either during GuC reload or on system resume.
>>>>
>>>> Signed-off-by: Fei Yang <fei.yang@intel.com>
>>>> Signed-off-by: Jonathan Cavitt <jonathan.cavitt@intel.com>
>>>> CC: John Harrison <john.c.harrison@intel.com>
>>>> Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com>
>>>> Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>> Acked-by: Nirmoy Das <nirmoy.das@intel.com>
>>>> ---
>>>>    drivers/gpu/drm/i915/gt/uc/intel_guc.h        |  1 +
>>>>    .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 22 
>>>> ++++++++++++-------
>>>>    drivers/gpu/drm/i915/gt/uc/intel_uc.c         |  7 ++++++
>>>>    3 files changed, 22 insertions(+), 8 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h 
>>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>>> index 0949628d69f8b..2b6dfe62c8f2a 100644
>>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
>>>> @@ -537,4 +537,5 @@ int intel_guc_invalidate_tlb_engines(struct 
>>>> intel_guc *guc);
>>>>    int intel_guc_invalidate_tlb_guc(struct intel_guc *guc);
>>>>    int intel_guc_tlb_invalidation_done(struct intel_guc *guc,
>>>>                        const u32 *payload, u32 len);
>>>> +void wake_up_all_tlb_invalidate(struct intel_guc *guc);
>>>>    #endif
>>>> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c 
>>>> b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>> index 1377398afcdfa..3a0d20064878a 100644
>>>> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
>>>> @@ -1796,13 +1796,24 @@ static void __guc_reset_context(struct 
>>>> intel_context *ce, intel_engine_mask_t st
>>>>        intel_context_put(parent);
>>>>    }
>>>>    -void intel_guc_submission_reset(struct intel_guc *guc, 
>>>> intel_engine_mask_t stalled)
>>>> +void wake_up_all_tlb_invalidate(struct intel_guc *guc)
>>>>    {
>>>>        struct intel_guc_tlb_wait *wait;
>>>> +    unsigned long i;
>>>> +
>>>> +    if (HAS_GUC_TLB_INVALIDATION(guc_to_gt(guc)->i915)) {
>>> Why the change from 'if(!is_available) return' to 'if(HAS_) {doStuff}'?
>>
>> I feel like this question has two parts, so I'll answer them separately:
>>
>> 1. Why HAS_GUC_TLB_INVALIDATION and not 
>> intel_guc_tlb_invalidation_is_available?
>>
>> Wake_up_all_tlb_invalidate is called during the suspend/resume path, 
>> specifically in the
>> middle of suspend.  It's required for it to be called here to clean 
>> up any invalidations left
>> in the queue during the suspend/resume phase because they are no 
>> longer valid requests.
>> However, the suspend/resume phase also resets GuC, so 
>> intel_guc_is_ready returns false.
>> In short, using intel_guc_invalidation_is_available was causing us to 
>> skip this code section
>> incorrectly, resulting in spurious GuC TLB invalidation timeout 
>> errors during gt reset.
> I'm not following this argument. If a reset is occurring then there is 
> no need to issue the invalidate. And the previous version was skipping 
> if GuC is in reset but this version does not. Which means it is now 
> sending invalidate requests to GuC when GuC is not able to respond and 
> therefore more likely to cause timeout errors not less likely.
Hang on. I'm getting confused between sending the request and waking up 
blocked threads. Apologies.

Okay, that makes sense now.

John.
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.h b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
index 0949628d69f8b..2b6dfe62c8f2a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc.h
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.h
@@ -537,4 +537,5 @@  int intel_guc_invalidate_tlb_engines(struct intel_guc *guc);
 int intel_guc_invalidate_tlb_guc(struct intel_guc *guc);
 int intel_guc_tlb_invalidation_done(struct intel_guc *guc,
 				    const u32 *payload, u32 len);
+void wake_up_all_tlb_invalidate(struct intel_guc *guc);
 #endif
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 1377398afcdfa..3a0d20064878a 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -1796,13 +1796,24 @@  static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t st
 	intel_context_put(parent);
 }
 
-void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stalled)
+void wake_up_all_tlb_invalidate(struct intel_guc *guc)
 {
 	struct intel_guc_tlb_wait *wait;
+	unsigned long i;
+
+	if (HAS_GUC_TLB_INVALIDATION(guc_to_gt(guc)->i915)) {
+		xa_lock_irq(&guc->tlb_lookup);
+		xa_for_each(&guc->tlb_lookup, i, wait)
+			wake_up(&wait->wq);
+		xa_unlock_irq(&guc->tlb_lookup);
+	}
+}
+
+void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stalled)
+{
 	struct intel_context *ce;
 	unsigned long index;
 	unsigned long flags;
-	unsigned long i;
 
 	if (unlikely(!guc_submission_initialized(guc))) {
 		/* Reset called during driver load? GuC not yet initialised! */
@@ -1833,12 +1844,7 @@  void intel_guc_submission_reset(struct intel_guc *guc, intel_engine_mask_t stall
 	 * The full GT reset will have cleared the TLB caches and flushed the
 	 * G2H message queue; we can release all the blocked waiters.
 	 */
-	if (HAS_GUC_TLB_INVALIDATION(guc_to_gt(guc)->i915)) {
-		xa_lock_irq(&guc->tlb_lookup);
-		xa_for_each(&guc->tlb_lookup, i, wait)
-			wake_up(&wait->wq);
-		xa_unlock_irq(&guc->tlb_lookup);
-	}
+	wake_up_all_tlb_invalidate(guc);
 }
 
 static void guc_cancel_context_requests(struct intel_context *ce)
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
index 98b103375b7ab..27f6561dd7319 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -688,6 +688,8 @@  void intel_uc_suspend(struct intel_uc *uc)
 	/* flush the GSC worker */
 	intel_gsc_uc_flush_work(&uc->gsc);
 
+	wake_up_all_tlb_invalidate(guc);
+
 	if (!intel_guc_is_ready(guc)) {
 		guc->interrupts.enabled = false;
 		return;
@@ -736,6 +738,11 @@  static int __uc_resume(struct intel_uc *uc, bool enable_communication)
 
 	intel_gsc_uc_resume(&uc->gsc);
 
+	if (intel_guc_tlb_invalidation_is_available(guc)) {
+		intel_guc_invalidate_tlb_engines(guc);
+		intel_guc_invalidate_tlb_guc(guc);
+	}
+
 	return 0;
 }