diff mbox series

[v2,01/21] drm/i915/gt: Ignore TLB invalidations on idle engines

Message ID c014a1d743fa46a6b57f02bffb7badf438136442.1657800199.git.mchehab@kernel.org (mailing list archive)
State New, archived
Headers show
Series Fix performance regressions with TLB and add GuC support | expand

Commit Message

Mauro Carvalho Chehab July 14, 2022, 12:06 p.m. UTC
From: Chris Wilson <chris.p.wilson@intel.com>

Check if the device is powered down prior to any engine activity,
as, in such cases, all the TLBs were already invalidated, so an
explicit TLB invalidation is not needed. Skipping it reduces the
impact of the performance regression caused by such invalidations.

This becomes more significant with GuC, as it can only do so when
the connection to the GuC is awake.

Cc: stable@vger.kernel.org
Fixes: 7938d61591d3 ("drm/i915: Flush TLBs before releasing backing store")
Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
Cc: Fei Yang <fei.yang@intel.com>
Cc: Andi Shyti <andi.shyti@linux.intel.com>
Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
---

To avoid mailbombing on a large number of people, only mailing lists were C/C on the cover.
See [PATCH v2 00/21] at: https://lore.kernel.org/all/cover.1657800199.git.mchehab@kernel.org/

 drivers/gpu/drm/i915/gem/i915_gem_pages.c | 10 ++++++----
 drivers/gpu/drm/i915/gt/intel_gt.c        | 17 ++++++++++-------
 drivers/gpu/drm/i915/gt/intel_gt_pm.h     |  3 +++
 3 files changed, 19 insertions(+), 11 deletions(-)

Comments

Tvrtko Ursulin July 18, 2022, 1:16 p.m. UTC | #1
On 14/07/2022 13:06, Mauro Carvalho Chehab wrote:
> From: Chris Wilson <chris.p.wilson@intel.com>
> 
> Check if the device is powered down prior to any engine activity,
> as, on such cases, all the TLBs were already invalidated, so an
> explicit TLB invalidation is not needed, thus reducing the
> performance regression impact due to it.
> 
> This becomes more significant with GuC, as it can only do so when
> the connection to the GuC is awake.
> 
> Cc: stable@vger.kernel.org
> Fixes: 7938d61591d3 ("drm/i915: Flush TLBs before releasing backing store")

Patch itself looks fine but I don't think we closed on the issue of 
stable/fixes on this patch?

My position here is that, if the functional issue is only with GuC 
invalidations, then the tags shouldn't be there (and the huge CC list).

Regards,

Tvrtko

> Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
> Cc: Fei Yang <fei.yang@intel.com>
> Cc: Andi Shyti <andi.shyti@linux.intel.com>
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
> ---
> 
> To avoid mailbombing on a large number of people, only mailing lists were C/C on the cover.
> See [PATCH v2 00/21] at: https://lore.kernel.org/all/cover.1657800199.git.mchehab@kernel.org/
> 
>   drivers/gpu/drm/i915/gem/i915_gem_pages.c | 10 ++++++----
>   drivers/gpu/drm/i915/gt/intel_gt.c        | 17 ++++++++++-------
>   drivers/gpu/drm/i915/gt/intel_gt_pm.h     |  3 +++
>   3 files changed, 19 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
> index 97c820eee115..6835279943df 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
> @@ -6,14 +6,15 @@
>   
>   #include <drm/drm_cache.h>
>   
> +#include "gt/intel_gt.h"
> +#include "gt/intel_gt_pm.h"
> +
>   #include "i915_drv.h"
>   #include "i915_gem_object.h"
>   #include "i915_scatterlist.h"
>   #include "i915_gem_lmem.h"
>   #include "i915_gem_mman.h"
>   
> -#include "gt/intel_gt.h"
> -
>   void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
>   				 struct sg_table *pages,
>   				 unsigned int sg_page_sizes)
> @@ -217,10 +218,11 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
>   
>   	if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) {
>   		struct drm_i915_private *i915 = to_i915(obj->base.dev);
> +		struct intel_gt *gt = to_gt(i915);
>   		intel_wakeref_t wakeref;
>   
> -		with_intel_runtime_pm_if_active(&i915->runtime_pm, wakeref)
> -			intel_gt_invalidate_tlbs(to_gt(i915));
> +		with_intel_gt_pm_if_awake(gt, wakeref)
> +			intel_gt_invalidate_tlbs(gt);
>   	}
>   
>   	return pages;
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> index 68c2b0d8f187..c4d43da84d8e 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> @@ -12,6 +12,7 @@
>   
>   #include "i915_drv.h"
>   #include "intel_context.h"
> +#include "intel_engine_pm.h"
>   #include "intel_engine_regs.h"
>   #include "intel_ggtt_gmch.h"
>   #include "intel_gt.h"
> @@ -924,6 +925,7 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
>   	struct drm_i915_private *i915 = gt->i915;
>   	struct intel_uncore *uncore = gt->uncore;
>   	struct intel_engine_cs *engine;
> +	intel_engine_mask_t awake, tmp;
>   	enum intel_engine_id id;
>   	const i915_reg_t *regs;
>   	unsigned int num = 0;
> @@ -947,26 +949,31 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
>   
>   	GEM_TRACE("\n");
>   
> -	assert_rpm_wakelock_held(&i915->runtime_pm);
> -
>   	mutex_lock(&gt->tlb_invalidate_lock);
>   	intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
>   
>   	spin_lock_irq(&uncore->lock); /* serialise invalidate with GT reset */
>   
> +	awake = 0;
>   	for_each_engine(engine, gt, id) {
>   		struct reg_and_bit rb;
>   
> +		if (!intel_engine_pm_is_awake(engine))
> +			continue;
> +
>   		rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
>   		if (!i915_mmio_reg_offset(rb.reg))
>   			continue;
>   
>   		intel_uncore_write_fw(uncore, rb.reg, rb.bit);
> +		awake |= engine->mask;
>   	}
>   
>   	spin_unlock_irq(&uncore->lock);
>   
> -	for_each_engine(engine, gt, id) {
> +	for_each_engine_masked(engine, gt, awake, tmp) {
> +		struct reg_and_bit rb;
> +
>   		/*
>   		 * HW architecture suggest typical invalidation time at 40us,
>   		 * with pessimistic cases up to 100us and a recommendation to
> @@ -974,12 +981,8 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
>   		 */
>   		const unsigned int timeout_us = 100;
>   		const unsigned int timeout_ms = 4;
> -		struct reg_and_bit rb;
>   
>   		rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
> -		if (!i915_mmio_reg_offset(rb.reg))
> -			continue;
> -
>   		if (__intel_wait_for_register_fw(uncore,
>   						 rb.reg, rb.bit, 0,
>   						 timeout_us, timeout_ms,
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> index bc898df7a48c..a334787a4939 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> @@ -55,6 +55,9 @@ static inline void intel_gt_pm_might_put(struct intel_gt *gt)
>   	for (tmp = 1, intel_gt_pm_get(gt); tmp; \
>   	     intel_gt_pm_put(gt), tmp = 0)
>   
> +#define with_intel_gt_pm_if_awake(gt, wf) \
> +	for (wf = intel_gt_pm_get_if_awake(gt); wf; intel_gt_pm_put_async(gt), wf = 0)
> +
>   static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt)
>   {
>   	return intel_wakeref_wait_for_idle(&gt->wakeref);
Mauro Carvalho Chehab July 18, 2022, 2:53 p.m. UTC | #2
On Mon, 18 Jul 2022 14:16:10 +0100
Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:

> On 14/07/2022 13:06, Mauro Carvalho Chehab wrote:
> > From: Chris Wilson <chris.p.wilson@intel.com>
> > 
> > Check if the device is powered down prior to any engine activity,
> > as, on such cases, all the TLBs were already invalidated, so an
> > explicit TLB invalidation is not needed, thus reducing the
> > performance regression impact due to it.
> > 
> > This becomes more significant with GuC, as it can only do so when
> > the connection to the GuC is awake.
> > 
> > Cc: stable@vger.kernel.org
> > Fixes: 7938d61591d3 ("drm/i915: Flush TLBs before releasing backing store")  
> 
> Patch itself looks fine but I don't think we closed on the issue of 
> stable/fixes on this patch?

No, because TLB cache invalidation takes time and causes timeouts, which
in turn affect applications and produce kernel warnings.

There are even open bugs due to TLB timeouts, like this one:

	[424.370996] i915 0000:00:02.0: [drm] *ERROR* rcs0 TLB invalidation did not complete in 4ms!

See:
	https://gitlab.freedesktop.org/drm/intel/-/issues/6424

So, while this is a performance regression, it ends up causing a
functional regression.

The first part of this series (patches 1-7) is meant to reduce the
risk of such timeouts by doing TLB invalidation in batch and only
when really needed (userspace-exposed TLBs for GTs that are powered-on
and non-wedged).

As they're fixing such regressions, it makes sense to Cc stable and to
have a Fixes tag.

> My position here is that, if the functional issue is only with GuC 
> invalidations, then the tags shouldn't be there (and the huge CC list).
> 
> Regards,
> 
> Tvrtko
> 
> > Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
> > Cc: Fei Yang <fei.yang@intel.com>
> > Cc: Andi Shyti <andi.shyti@linux.intel.com>
> > Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> > Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
> > ---
> > 
> > To avoid mailbombing on a large number of people, only mailing lists were C/C on the cover.
> > See [PATCH v2 00/21] at: https://lore.kernel.org/all/cover.1657800199.git.mchehab@kernel.org/
> > 
> >   drivers/gpu/drm/i915/gem/i915_gem_pages.c | 10 ++++++----
> >   drivers/gpu/drm/i915/gt/intel_gt.c        | 17 ++++++++++-------
> >   drivers/gpu/drm/i915/gt/intel_gt_pm.h     |  3 +++
> >   3 files changed, 19 insertions(+), 11 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
> > index 97c820eee115..6835279943df 100644
> > --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
> > +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
> > @@ -6,14 +6,15 @@
> >   
> >   #include <drm/drm_cache.h>
> >   
> > +#include "gt/intel_gt.h"
> > +#include "gt/intel_gt_pm.h"
> > +
> >   #include "i915_drv.h"
> >   #include "i915_gem_object.h"
> >   #include "i915_scatterlist.h"
> >   #include "i915_gem_lmem.h"
> >   #include "i915_gem_mman.h"
> >   
> > -#include "gt/intel_gt.h"
> > -
> >   void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
> >   				 struct sg_table *pages,
> >   				 unsigned int sg_page_sizes)
> > @@ -217,10 +218,11 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
> >   
> >   	if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) {
> >   		struct drm_i915_private *i915 = to_i915(obj->base.dev);
> > +		struct intel_gt *gt = to_gt(i915);
> >   		intel_wakeref_t wakeref;
> >   
> > -		with_intel_runtime_pm_if_active(&i915->runtime_pm, wakeref)
> > -			intel_gt_invalidate_tlbs(to_gt(i915));
> > +		with_intel_gt_pm_if_awake(gt, wakeref)
> > +			intel_gt_invalidate_tlbs(gt);
> >   	}
> >   
> >   	return pages;
> > diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> > index 68c2b0d8f187..c4d43da84d8e 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> > @@ -12,6 +12,7 @@
> >   
> >   #include "i915_drv.h"
> >   #include "intel_context.h"
> > +#include "intel_engine_pm.h"
> >   #include "intel_engine_regs.h"
> >   #include "intel_ggtt_gmch.h"
> >   #include "intel_gt.h"
> > @@ -924,6 +925,7 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
> >   	struct drm_i915_private *i915 = gt->i915;
> >   	struct intel_uncore *uncore = gt->uncore;
> >   	struct intel_engine_cs *engine;
> > +	intel_engine_mask_t awake, tmp;
> >   	enum intel_engine_id id;
> >   	const i915_reg_t *regs;
> >   	unsigned int num = 0;
> > @@ -947,26 +949,31 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
> >   
> >   	GEM_TRACE("\n");
> >   
> > -	assert_rpm_wakelock_held(&i915->runtime_pm);
> > -
> >   	mutex_lock(&gt->tlb_invalidate_lock);
> >   	intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
> >   
> >   	spin_lock_irq(&uncore->lock); /* serialise invalidate with GT reset */
> >   
> > +	awake = 0;
> >   	for_each_engine(engine, gt, id) {
> >   		struct reg_and_bit rb;
> >   
> > +		if (!intel_engine_pm_is_awake(engine))
> > +			continue;
> > +
> >   		rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
> >   		if (!i915_mmio_reg_offset(rb.reg))
> >   			continue;
> >   
> >   		intel_uncore_write_fw(uncore, rb.reg, rb.bit);
> > +		awake |= engine->mask;
> >   	}
> >   
> >   	spin_unlock_irq(&uncore->lock);
> >   
> > -	for_each_engine(engine, gt, id) {
> > +	for_each_engine_masked(engine, gt, awake, tmp) {
> > +		struct reg_and_bit rb;
> > +
> >   		/*
> >   		 * HW architecture suggest typical invalidation time at 40us,
> >   		 * with pessimistic cases up to 100us and a recommendation to
> > @@ -974,12 +981,8 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
> >   		 */
> >   		const unsigned int timeout_us = 100;
> >   		const unsigned int timeout_ms = 4;
> > -		struct reg_and_bit rb;
> >   
> >   		rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
> > -		if (!i915_mmio_reg_offset(rb.reg))
> > -			continue;
> > -
> >   		if (__intel_wait_for_register_fw(uncore,
> >   						 rb.reg, rb.bit, 0,
> >   						 timeout_us, timeout_ms,
> > diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> > index bc898df7a48c..a334787a4939 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> > @@ -55,6 +55,9 @@ static inline void intel_gt_pm_might_put(struct intel_gt *gt)
> >   	for (tmp = 1, intel_gt_pm_get(gt); tmp; \
> >   	     intel_gt_pm_put(gt), tmp = 0)
> >   
> > +#define with_intel_gt_pm_if_awake(gt, wf) \
> > +	for (wf = intel_gt_pm_get_if_awake(gt); wf; intel_gt_pm_put_async(gt), wf = 0)
> > +
> >   static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt)
> >   {
> >   	return intel_wakeref_wait_for_idle(&gt->wakeref);
Tvrtko Ursulin July 18, 2022, 3:01 p.m. UTC | #3
On 18/07/2022 15:53, Mauro Carvalho Chehab wrote:
> On Mon, 18 Jul 2022 14:16:10 +0100
> Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:
> 
>> On 14/07/2022 13:06, Mauro Carvalho Chehab wrote:
>>> From: Chris Wilson <chris.p.wilson@intel.com>
>>>
>>> Check if the device is powered down prior to any engine activity,
>>> as, on such cases, all the TLBs were already invalidated, so an
>>> explicit TLB invalidation is not needed, thus reducing the
>>> performance regression impact due to it.
>>>
>>> This becomes more significant with GuC, as it can only do so when
>>> the connection to the GuC is awake.
>>>
>>> Cc: stable@vger.kernel.org
>>> Fixes: 7938d61591d3 ("drm/i915: Flush TLBs before releasing backing store")
>>
>> Patch itself looks fine but I don't think we closed on the issue of
>> stable/fixes on this patch?
> 
> No, because TLB cache invalidation takes time and causes time outs, which
> in turn affects applications and produce Kernel warnings.
> 
> There's even open bugs due to TLB timeouts, like this one:
> 
> 	[424.370996] i915 0000:00:02.0: [drm] *ERROR* rcs0 TLB invalidation did not complete in 4ms!
> 
> See:
> 	https://gitlab.freedesktop.org/drm/intel/-/issues/6424
> 
> So, while this is a performance regression, it ends causing a
> functional regression.

This test is not even particularly stressful. Fair enough - thanks for 
the information.

Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Is skipping of the ggtt only bound flush the fix for this particular test?

Regards,

Tvrtko

> 
> The first part of this series (patches 1-7) are meant to reduce the
> risk of such timeouts by doing TLB invalidation in batch and only
> when really needed (userspace-exposed TLBs for GTs that are powered-on
> and non-edged).
> 
> As they're fixing such regressions, it makes sense c/c stable and having
> a fixes tag.
> 
>> My position here is that, if the functional issue is only with GuC
>> invalidations, then the tags shouldn't be there (and the huge CC list).
>>
>> Regards,
>>
>> Tvrtko
>>
>>> Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
>>> Cc: Fei Yang <fei.yang@intel.com>
>>> Cc: Andi Shyti <andi.shyti@linux.intel.com>
>>> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
>>> Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>
>>> ---
>>>
>>> To avoid mailbombing on a large number of people, only mailing lists were C/C on the cover.
>>> See [PATCH v2 00/21] at: https://lore.kernel.org/all/cover.1657800199.git.mchehab@kernel.org/
>>>
>>>    drivers/gpu/drm/i915/gem/i915_gem_pages.c | 10 ++++++----
>>>    drivers/gpu/drm/i915/gt/intel_gt.c        | 17 ++++++++++-------
>>>    drivers/gpu/drm/i915/gt/intel_gt_pm.h     |  3 +++
>>>    3 files changed, 19 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
>>> index 97c820eee115..6835279943df 100644
>>> --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
>>> +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
>>> @@ -6,14 +6,15 @@
>>>    
>>>    #include <drm/drm_cache.h>
>>>    
>>> +#include "gt/intel_gt.h"
>>> +#include "gt/intel_gt_pm.h"
>>> +
>>>    #include "i915_drv.h"
>>>    #include "i915_gem_object.h"
>>>    #include "i915_scatterlist.h"
>>>    #include "i915_gem_lmem.h"
>>>    #include "i915_gem_mman.h"
>>>    
>>> -#include "gt/intel_gt.h"
>>> -
>>>    void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
>>>    				 struct sg_table *pages,
>>>    				 unsigned int sg_page_sizes)
>>> @@ -217,10 +218,11 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
>>>    
>>>    	if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) {
>>>    		struct drm_i915_private *i915 = to_i915(obj->base.dev);
>>> +		struct intel_gt *gt = to_gt(i915);
>>>    		intel_wakeref_t wakeref;
>>>    
>>> -		with_intel_runtime_pm_if_active(&i915->runtime_pm, wakeref)
>>> -			intel_gt_invalidate_tlbs(to_gt(i915));
>>> +		with_intel_gt_pm_if_awake(gt, wakeref)
>>> +			intel_gt_invalidate_tlbs(gt);
>>>    	}
>>>    
>>>    	return pages;
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
>>> index 68c2b0d8f187..c4d43da84d8e 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
>>> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
>>> @@ -12,6 +12,7 @@
>>>    
>>>    #include "i915_drv.h"
>>>    #include "intel_context.h"
>>> +#include "intel_engine_pm.h"
>>>    #include "intel_engine_regs.h"
>>>    #include "intel_ggtt_gmch.h"
>>>    #include "intel_gt.h"
>>> @@ -924,6 +925,7 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
>>>    	struct drm_i915_private *i915 = gt->i915;
>>>    	struct intel_uncore *uncore = gt->uncore;
>>>    	struct intel_engine_cs *engine;
>>> +	intel_engine_mask_t awake, tmp;
>>>    	enum intel_engine_id id;
>>>    	const i915_reg_t *regs;
>>>    	unsigned int num = 0;
>>> @@ -947,26 +949,31 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
>>>    
>>>    	GEM_TRACE("\n");
>>>    
>>> -	assert_rpm_wakelock_held(&i915->runtime_pm);
>>> -
>>>    	mutex_lock(&gt->tlb_invalidate_lock);
>>>    	intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
>>>    
>>>    	spin_lock_irq(&uncore->lock); /* serialise invalidate with GT reset */
>>>    
>>> +	awake = 0;
>>>    	for_each_engine(engine, gt, id) {
>>>    		struct reg_and_bit rb;
>>>    
>>> +		if (!intel_engine_pm_is_awake(engine))
>>> +			continue;
>>> +
>>>    		rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
>>>    		if (!i915_mmio_reg_offset(rb.reg))
>>>    			continue;
>>>    
>>>    		intel_uncore_write_fw(uncore, rb.reg, rb.bit);
>>> +		awake |= engine->mask;
>>>    	}
>>>    
>>>    	spin_unlock_irq(&uncore->lock);
>>>    
>>> -	for_each_engine(engine, gt, id) {
>>> +	for_each_engine_masked(engine, gt, awake, tmp) {
>>> +		struct reg_and_bit rb;
>>> +
>>>    		/*
>>>    		 * HW architecture suggest typical invalidation time at 40us,
>>>    		 * with pessimistic cases up to 100us and a recommendation to
>>> @@ -974,12 +981,8 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
>>>    		 */
>>>    		const unsigned int timeout_us = 100;
>>>    		const unsigned int timeout_ms = 4;
>>> -		struct reg_and_bit rb;
>>>    
>>>    		rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
>>> -		if (!i915_mmio_reg_offset(rb.reg))
>>> -			continue;
>>> -
>>>    		if (__intel_wait_for_register_fw(uncore,
>>>    						 rb.reg, rb.bit, 0,
>>>    						 timeout_us, timeout_ms,
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
>>> index bc898df7a48c..a334787a4939 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.h
>>> +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
>>> @@ -55,6 +55,9 @@ static inline void intel_gt_pm_might_put(struct intel_gt *gt)
>>>    	for (tmp = 1, intel_gt_pm_get(gt); tmp; \
>>>    	     intel_gt_pm_put(gt), tmp = 0)
>>>    
>>> +#define with_intel_gt_pm_if_awake(gt, wf) \
>>> +	for (wf = intel_gt_pm_get_if_awake(gt); wf; intel_gt_pm_put_async(gt), wf = 0)
>>> +
>>>    static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt)
>>>    {
>>>    	return intel_wakeref_wait_for_idle(&gt->wakeref);
David Laight July 18, 2022, 3:50 p.m. UTC | #4
From: Mauro Carvalho Chehab
> Sent: 18 July 2022 15:54
> 
> On Mon, 18 Jul 2022 14:16:10 +0100
> Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:
> 
> > On 14/07/2022 13:06, Mauro Carvalho Chehab wrote:
> > > From: Chris Wilson <chris.p.wilson@intel.com>
> > >
> > > Check if the device is powered down prior to any engine activity,
> > > as, on such cases, all the TLBs were already invalidated, so an
> > > explicit TLB invalidation is not needed, thus reducing the
> > > performance regression impact due to it.
> > >
> > > This becomes more significant with GuC, as it can only do so when
> > > the connection to the GuC is awake.
> > >
> > > Cc: stable@vger.kernel.org
> > > Fixes: 7938d61591d3 ("drm/i915: Flush TLBs before releasing backing store")
> >
> > Patch itself looks fine but I don't think we closed on the issue of
> > stable/fixes on this patch?
> 
> No, because TLB cache invalidation takes time and causes time outs, which
> in turn affects applications and produce Kernel warnings.

It's not only the TLB flushes that cause grief.

There is a loop that forces a write-back of all the frame buffer pages.
With a large display and some cpu (like my Ivy bridge one) that
takes long enough with pre-emption disabled that wakeup of RT processes
(and any pinned to the cpu) takes far longer than one might have
wished for.

Since some X servers request a flush every few seconds this makes
the system unusable for some workloads.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Tvrtko Ursulin July 19, 2022, 7:24 a.m. UTC | #5
Hi David,

On 18/07/2022 16:50, David Laight wrote:
> From: Mauro Carvalho Chehab
>> Sent: 18 July 2022 15:54
>>
>> On Mon, 18 Jul 2022 14:16:10 +0100
>> Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> wrote:
>>
>>> On 14/07/2022 13:06, Mauro Carvalho Chehab wrote:
>>>> From: Chris Wilson <chris.p.wilson@intel.com>
>>>>
>>>> Check if the device is powered down prior to any engine activity,
>>>> as, on such cases, all the TLBs were already invalidated, so an
>>>> explicit TLB invalidation is not needed, thus reducing the
>>>> performance regression impact due to it.
>>>>
>>>> This becomes more significant with GuC, as it can only do so when
>>>> the connection to the GuC is awake.
>>>>
>>>> Cc: stable@vger.kernel.org
>>>> Fixes: 7938d61591d3 ("drm/i915: Flush TLBs before releasing backing store")
>>>
>>> Patch itself looks fine but I don't think we closed on the issue of
>>> stable/fixes on this patch?
>>
>> No, because TLB cache invalidation takes time and causes time outs, which
>> in turn affects applications and produce Kernel warnings.
> 
> It's not only the TLB flushes that cause grief.
> 
> There is a loop that forces a write-back of all the frame buffer pages.
> With a large display and some cpu (like my Ivy bridge one) that
> takes long enough with pre-emption disabled that wakeup of RT processes
> (and any pinned to the cpu) takes far longer than one might have
> wished for.
> 
> Since some X servers request a flush every few seconds this makes
> the system unusable for some workloads.

Ok, TLB invalidations as discussed in this patch do not apply to
Ivybridge. But what is the write-back loop you mention which is causing
you grief? What size frame buffers are we talking about here? If they
don't fit in the mappable area, we recently merged a patch* which
improves things in that situation, but I am not sure you are hitting
exactly that.

Regards,

Tvrtko

*) 230523ba24bd ("drm/i915/gem: Don't evict unmappable VMAs when pinning 
with PIN_MAPPABLE (v2)")
David Laight July 19, 2022, 7:45 a.m. UTC | #6
From: Tvrtko Ursulin
> Sent: 19 July 2022 08:25
...
> > It's not only the TLB flushes that cause grief.
> >
> > There is a loop that forces a write-back of all the frame buffer pages.
> > With a large display and some cpu (like my Ivy bridge one) that
> > takes long enough with pre-emption disabled that wakeup of RT processes
> > (and any pinned to the cpu) takes far longer than one might have
> > wished for.
> >
> > Since some X servers request a flush every few seconds this makes
> > the system unusable for some workloads.
> 
> Ok TLB invalidations as discussed in this patch does not apply to
> Ivybridge. But what is the write back loop you mention which is causing
> you grief? What size frame buffers are we talking about here? If they
> don't fit in the mappable area recently we merged a patch* which
> improves things in that situation but not sure you are hitting exactly that.

I found the old email:

What I've found is that the Intel i915 graphics driver uses the 'events_unbound'
kernel worker thread to periodically execute drm_cflush_sg().
(see https://github.com/torvalds/linux/blob/master/drivers/gpu/drm/drm_cache.c)

I'm guessing this is to ensure that any writes to graphics memory become
visible in a semi-timely manner.

This loop takes about 1us per iteration split fairly evenly between whatever is in
for_each_sg_page() and drm_cflush_page().
With a 2560x1440 display the loop count is 3600 (4 bytes/pixel) and the whole
function takes around 3.3ms.

IIRC the first few page flushes are quick (I bet they go into a fifo)
and then they all get slow.
The flushes are actually requested from userspace.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)
Andi Shyti July 22, 2022, 11:56 a.m. UTC | #7
Hi Mauro,

On Thu, Jul 14, 2022 at 01:06:06PM +0100, Mauro Carvalho Chehab wrote:
> From: Chris Wilson <chris.p.wilson@intel.com>
> 
> Check if the device is powered down prior to any engine activity,
> as, on such cases, all the TLBs were already invalidated, so an
> explicit TLB invalidation is not needed, thus reducing the
> performance regression impact due to it.
> 
> This becomes more significant with GuC, as it can only do so when
> the connection to the GuC is awake.
> 
> Cc: stable@vger.kernel.org
> Fixes: 7938d61591d3 ("drm/i915: Flush TLBs before releasing backing store")
> Signed-off-by: Chris Wilson <chris.p.wilson@intel.com>
> Cc: Fei Yang <fei.yang@intel.com>
> Cc: Andi Shyti <andi.shyti@linux.intel.com>
> Cc: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> Signed-off-by: Mauro Carvalho Chehab <mchehab@kernel.org>

For me it's good, but please, sort out with Tvrtko about his
doubts:

Reviewed-by: Andi Shyti <andi.shyti@linux.intel.com>

Andi

> ---
> 
> To avoid mailbombing on a large number of people, only mailing lists were C/C on the cover.
> See [PATCH v2 00/21] at: https://lore.kernel.org/all/cover.1657800199.git.mchehab@kernel.org/
> 
>  drivers/gpu/drm/i915/gem/i915_gem_pages.c | 10 ++++++----
>  drivers/gpu/drm/i915/gt/intel_gt.c        | 17 ++++++++++-------
>  drivers/gpu/drm/i915/gt/intel_gt_pm.h     |  3 +++
>  3 files changed, 19 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
> index 97c820eee115..6835279943df 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
> @@ -6,14 +6,15 @@
>  
>  #include <drm/drm_cache.h>
>  
> +#include "gt/intel_gt.h"
> +#include "gt/intel_gt_pm.h"
> +
>  #include "i915_drv.h"
>  #include "i915_gem_object.h"
>  #include "i915_scatterlist.h"
>  #include "i915_gem_lmem.h"
>  #include "i915_gem_mman.h"
>  
> -#include "gt/intel_gt.h"
> -
>  void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
>  				 struct sg_table *pages,
>  				 unsigned int sg_page_sizes)
> @@ -217,10 +218,11 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
>  
>  	if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) {
>  		struct drm_i915_private *i915 = to_i915(obj->base.dev);
> +		struct intel_gt *gt = to_gt(i915);
>  		intel_wakeref_t wakeref;
>  
> -		with_intel_runtime_pm_if_active(&i915->runtime_pm, wakeref)
> -			intel_gt_invalidate_tlbs(to_gt(i915));
> +		with_intel_gt_pm_if_awake(gt, wakeref)
> +			intel_gt_invalidate_tlbs(gt);
>  	}
>  
>  	return pages;
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
> index 68c2b0d8f187..c4d43da84d8e 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt.c
> +++ b/drivers/gpu/drm/i915/gt/intel_gt.c
> @@ -12,6 +12,7 @@
>  
>  #include "i915_drv.h"
>  #include "intel_context.h"
> +#include "intel_engine_pm.h"
>  #include "intel_engine_regs.h"
>  #include "intel_ggtt_gmch.h"
>  #include "intel_gt.h"
> @@ -924,6 +925,7 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
>  	struct drm_i915_private *i915 = gt->i915;
>  	struct intel_uncore *uncore = gt->uncore;
>  	struct intel_engine_cs *engine;
> +	intel_engine_mask_t awake, tmp;
>  	enum intel_engine_id id;
>  	const i915_reg_t *regs;
>  	unsigned int num = 0;
> @@ -947,26 +949,31 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
>  
>  	GEM_TRACE("\n");
>  
> -	assert_rpm_wakelock_held(&i915->runtime_pm);
> -
>  	mutex_lock(&gt->tlb_invalidate_lock);
>  	intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
>  
>  	spin_lock_irq(&uncore->lock); /* serialise invalidate with GT reset */
>  
> +	awake = 0;
>  	for_each_engine(engine, gt, id) {
>  		struct reg_and_bit rb;
>  
> +		if (!intel_engine_pm_is_awake(engine))
> +			continue;
> +
>  		rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
>  		if (!i915_mmio_reg_offset(rb.reg))
>  			continue;
>  
>  		intel_uncore_write_fw(uncore, rb.reg, rb.bit);
> +		awake |= engine->mask;
>  	}
>  
>  	spin_unlock_irq(&uncore->lock);
>  
> -	for_each_engine(engine, gt, id) {
> +	for_each_engine_masked(engine, gt, awake, tmp) {
> +		struct reg_and_bit rb;
> +
>  		/*
>  		 * HW architecture suggest typical invalidation time at 40us,
>  		 * with pessimistic cases up to 100us and a recommendation to
> @@ -974,12 +981,8 @@ void intel_gt_invalidate_tlbs(struct intel_gt *gt)
>  		 */
>  		const unsigned int timeout_us = 100;
>  		const unsigned int timeout_ms = 4;
> -		struct reg_and_bit rb;
>  
>  		rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
> -		if (!i915_mmio_reg_offset(rb.reg))
> -			continue;
> -
>  		if (__intel_wait_for_register_fw(uncore,
>  						 rb.reg, rb.bit, 0,
>  						 timeout_us, timeout_ms,
> diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> index bc898df7a48c..a334787a4939 100644
> --- a/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> +++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
> @@ -55,6 +55,9 @@ static inline void intel_gt_pm_might_put(struct intel_gt *gt)
>  	for (tmp = 1, intel_gt_pm_get(gt); tmp; \
>  	     intel_gt_pm_put(gt), tmp = 0)
>  
> +#define with_intel_gt_pm_if_awake(gt, wf) \
> +	for (wf = intel_gt_pm_get_if_awake(gt); wf; intel_gt_pm_put_async(gt), wf = 0)
> +
>  static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt)
>  {
>  	return intel_wakeref_wait_for_idle(&gt->wakeref);
> -- 
> 2.36.1
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
index 97c820eee115..6835279943df 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
@@ -6,14 +6,15 @@ 
 
 #include <drm/drm_cache.h>
 
+#include "gt/intel_gt.h"
+#include "gt/intel_gt_pm.h"
+
 #include "i915_drv.h"
 #include "i915_gem_object.h"
 #include "i915_scatterlist.h"
 #include "i915_gem_lmem.h"
 #include "i915_gem_mman.h"
 
-#include "gt/intel_gt.h"
-
 void __i915_gem_object_set_pages(struct drm_i915_gem_object *obj,
 				 struct sg_table *pages,
 				 unsigned int sg_page_sizes)
@@ -217,10 +218,11 @@  __i915_gem_object_unset_pages(struct drm_i915_gem_object *obj)
 
 	if (test_and_clear_bit(I915_BO_WAS_BOUND_BIT, &obj->flags)) {
 		struct drm_i915_private *i915 = to_i915(obj->base.dev);
+		struct intel_gt *gt = to_gt(i915);
 		intel_wakeref_t wakeref;
 
-		with_intel_runtime_pm_if_active(&i915->runtime_pm, wakeref)
-			intel_gt_invalidate_tlbs(to_gt(i915));
+		with_intel_gt_pm_if_awake(gt, wakeref)
+			intel_gt_invalidate_tlbs(gt);
 	}
 
 	return pages;
diff --git a/drivers/gpu/drm/i915/gt/intel_gt.c b/drivers/gpu/drm/i915/gt/intel_gt.c
index 68c2b0d8f187..c4d43da84d8e 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt.c
@@ -12,6 +12,7 @@ 
 
 #include "i915_drv.h"
 #include "intel_context.h"
+#include "intel_engine_pm.h"
 #include "intel_engine_regs.h"
 #include "intel_ggtt_gmch.h"
 #include "intel_gt.h"
@@ -924,6 +925,7 @@  void intel_gt_invalidate_tlbs(struct intel_gt *gt)
 	struct drm_i915_private *i915 = gt->i915;
 	struct intel_uncore *uncore = gt->uncore;
 	struct intel_engine_cs *engine;
+	intel_engine_mask_t awake, tmp;
 	enum intel_engine_id id;
 	const i915_reg_t *regs;
 	unsigned int num = 0;
@@ -947,26 +949,31 @@  void intel_gt_invalidate_tlbs(struct intel_gt *gt)
 
 	GEM_TRACE("\n");
 
-	assert_rpm_wakelock_held(&i915->runtime_pm);
-
 	mutex_lock(&gt->tlb_invalidate_lock);
 	intel_uncore_forcewake_get(uncore, FORCEWAKE_ALL);
 
 	spin_lock_irq(&uncore->lock); /* serialise invalidate with GT reset */
 
+	awake = 0;
 	for_each_engine(engine, gt, id) {
 		struct reg_and_bit rb;
 
+		if (!intel_engine_pm_is_awake(engine))
+			continue;
+
 		rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
 		if (!i915_mmio_reg_offset(rb.reg))
 			continue;
 
 		intel_uncore_write_fw(uncore, rb.reg, rb.bit);
+		awake |= engine->mask;
 	}
 
 	spin_unlock_irq(&uncore->lock);
 
-	for_each_engine(engine, gt, id) {
+	for_each_engine_masked(engine, gt, awake, tmp) {
+		struct reg_and_bit rb;
+
 		/*
 		 * HW architecture suggest typical invalidation time at 40us,
 		 * with pessimistic cases up to 100us and a recommendation to
@@ -974,12 +981,8 @@  void intel_gt_invalidate_tlbs(struct intel_gt *gt)
 		 */
 		const unsigned int timeout_us = 100;
 		const unsigned int timeout_ms = 4;
-		struct reg_and_bit rb;
 
 		rb = get_reg_and_bit(engine, regs == gen8_regs, regs, num);
-		if (!i915_mmio_reg_offset(rb.reg))
-			continue;
-
 		if (__intel_wait_for_register_fw(uncore,
 						 rb.reg, rb.bit, 0,
 						 timeout_us, timeout_ms,
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_pm.h b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
index bc898df7a48c..a334787a4939 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_pm.h
+++ b/drivers/gpu/drm/i915/gt/intel_gt_pm.h
@@ -55,6 +55,9 @@  static inline void intel_gt_pm_might_put(struct intel_gt *gt)
 	for (tmp = 1, intel_gt_pm_get(gt); tmp; \
 	     intel_gt_pm_put(gt), tmp = 0)
 
+#define with_intel_gt_pm_if_awake(gt, wf) \
+	for (wf = intel_gt_pm_get_if_awake(gt); wf; intel_gt_pm_put_async(gt), wf = 0)
+
 static inline int intel_gt_pm_wait_for_idle(struct intel_gt *gt)
 {
 	return intel_wakeref_wait_for_idle(&gt->wakeref);