diff mbox

[v5,2/2] drm/i915: Acquire PUNIT->PMIC bus for intel_uncore_forcewake_reset()

Message ID 20171019111620.26761-3-hdegoede@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Hans de Goede Oct. 19, 2017, 11:16 a.m. UTC
intel_uncore_forcewake_reset() does forcewake puts and gets as such
we need to make sure that no-one tries to access the PUNIT->PMIC bus
(on systems where this bus is shared) while it runs, otherwise bad
things happen.

Normally this is taken care of by the i915_pmic_bus_access_notifier()
which does an intel_uncore_forcewake_get(FORCEWAKE_ALL) when some other
driver tries to access the PMIC bus, so that later forcewake gets are
no-ops (for the duration of the bus access).

But intel_uncore_forcewake_reset gets called in 3 cases:
1) Before registering the pmic_bus_access_notifier
2) After unregistering the pmic_bus_access_notifier
3) To reset forcewake state on a GPU reset

In all 3 cases the i915_pmic_bus_access_notifier() protection is
insufficient.

This commit fixes this race by calling iosf_mbi_punit_acquire() before
calling intel_uncore_forcewake_reset(). In the case where it is called
directly after unregistering the pmic_bus_access_notifier, we need to
hold the punit-lock over both calls to avoid a race where
intel_uncore_fw_release_timer() may execute between the 2 calls.

Signed-off-by: Hans de Goede <hdegoede@redhat.com>
Reviewed-by: Imre Deak <imre.deak@intel.com>
---
Changes in v2:
-Rebase on current (July 6th 2017) drm-next

Changes in v3:
-Keep punit acquired / locked over the unregister + forcewake_reset
 call combo to avoid a race hitting between the 2 calls
-This requires modifying iosf_mbi_unregister_pmic_bus_access_notifier
 to not take the lock itself, since we are the only users this is done
 in this same commit

Changes in v4:
-Fix missing rename in doc-comment
-Add and use iosf_mbi_assert_punit_acquired() helper
-Add missing acquire surrounding intel_uncore_forcewake_reset() inside
 intel_uncore_check_forcewake_domains()
-Add Imre's Reviewed-by

Changes in v5:
-Separate out arch/x86 iosf_mbi changes into a separate patch
---
 drivers/gpu/drm/i915/intel_uncore.c           | 17 +++++++++++++----
 drivers/gpu/drm/i915/selftests/intel_uncore.c |  3 +++
 2 files changed, 16 insertions(+), 4 deletions(-)

Comments

Ingo Molnar Oct. 31, 2017, 9:50 a.m. UTC | #1
* Hans de Goede <j.w.r.degoede@gmail.com> wrote:

> intel_uncore_forcewake_reset() does forcewake puts and gets as such
> we need to make sure that no-one tries to access the PUNIT->PMIC bus
> (on systems where this bus is shared) while it runs, otherwise bad
> things happen.
> 
> Normally this is taken care of by the i915_pmic_bus_access_notifier()
> which does an intel_uncore_forcewake_get(FORCEWAKE_ALL) when some other
> driver tries to access the PMIC bus, so that later forcewake gets are
> no-ops (for the duration of the bus access).
> 
> But intel_uncore_forcewake_reset gets called in 3 cases:
> 1) Before registering the pmic_bus_access_notifier
> 2) After unregistering the pmic_bus_access_notifier
> 3) To reset forcewake state on a GPU reset
> 
> In all 3 cases the i915_pmic_bus_access_notifier() protection is
> insufficient.
> 
> This commit fixes this race by calling iosf_mbi_punit_acquire() before
> calling intel_uncore_forcewake_reset(). In the case where it is called
> directly after unregistering the pmic_bus_access_notifier, we need to
> hold the punit-lock over both calls to avoid a race where
> intel_uncore_fw_release_timer() may execute between the 2 calls.
> 
> Signed-off-by: Hans de Goede <hdegoede@redhat.com>
> Reviewed-by: Imre Deak <imre.deak@intel.com>
> ---
> Changes in v2:
> -Rebase on current (July 6th 2017) drm-next
> 
> Changes in v3:
> -Keep punit acquired / locked over the unregister + forcewake_reset
>  call combo to avoid a race hitting between the 2 calls
> -This requires modifying iosf_mbi_unregister_pmic_bus_access_notifier
>  to not take the lock itself, since we are the only users this is done
>  in this same commit
> 
> Changes in v4:
> -Fix missing rename in doc-comment
> -Add and use iosf_mbi_assert_punit_acquired() helper
> -Add missing acquire surrounding intel_uncore_forcewake_reset() inside
>  intel_uncore_check_forcewake_domains()
> -Add Imre's Reviewed-by
> 
> Changes in v5:
> -Separate out arch/x86 iosf_mbi changes into a separate patch
> ---
>  drivers/gpu/drm/i915/intel_uncore.c           | 17 +++++++++++++----
>  drivers/gpu/drm/i915/selftests/intel_uncore.c |  3 +++
>  2 files changed, 16 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index 8c2ce81f01c2..0da81faf3981 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -229,6 +229,7 @@ intel_uncore_fw_release_timer(struct hrtimer *timer)
>  	return HRTIMER_NORESTART;
>  }
>  
> +/* Note callers must have acquired the PUNIT->PMIC bus, before calling this. */
>  static void intel_uncore_forcewake_reset(struct drm_i915_private *dev_priv,
>  					 bool restore)
>  {
> @@ -237,6 +238,8 @@ static void intel_uncore_forcewake_reset(struct drm_i915_private *dev_priv,
>  	int retry_count = 100;
>  	enum forcewake_domains fw, active_domains;
>  
> +	iosf_mbi_assert_punit_acquired();
> +
>  	/* Hold uncore.lock across reset to prevent any register access
>  	 * with forcewake not set correctly. Wait until all pending
>  	 * timers are run before holding.
> @@ -416,14 +419,18 @@ static void __intel_uncore_early_sanitize(struct drm_i915_private *dev_priv,
>  				   GT_FIFO_CTL_RC6_POLICY_STALL);
>  	}
>  
> +	iosf_mbi_punit_acquire();
>  	intel_uncore_forcewake_reset(dev_priv, restore_forcewake);
> +	iosf_mbi_punit_release();
>  }
>  
>  void intel_uncore_suspend(struct drm_i915_private *dev_priv)
>  {
> -	iosf_mbi_unregister_pmic_bus_access_notifier(
> +	iosf_mbi_punit_acquire();
> +	iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(
>  		&dev_priv->uncore.pmic_bus_access_nb);
>  	intel_uncore_forcewake_reset(dev_priv, false);
> +	iosf_mbi_punit_release();
>  }
>  
>  void intel_uncore_resume_early(struct drm_i915_private *dev_priv)
> @@ -1315,12 +1322,14 @@ void intel_uncore_init(struct drm_i915_private *dev_priv)
>  
>  void intel_uncore_fini(struct drm_i915_private *dev_priv)
>  {
> -	iosf_mbi_unregister_pmic_bus_access_notifier(
> -		&dev_priv->uncore.pmic_bus_access_nb);
> -
>  	/* Paranoia: make sure we have disabled everything before we exit. */
>  	intel_uncore_sanitize(dev_priv);
> +
> +	iosf_mbi_punit_acquire();
> +	iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(
> +		&dev_priv->uncore.pmic_bus_access_nb);
>  	intel_uncore_forcewake_reset(dev_priv, false);
> +	iosf_mbi_punit_release();
>  }
>  
>  static const struct reg_whitelist {
> diff --git a/drivers/gpu/drm/i915/selftests/intel_uncore.c b/drivers/gpu/drm/i915/selftests/intel_uncore.c
> index 3cac22eb47ce..733d87fe7737 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_uncore.c
> @@ -148,7 +148,10 @@ static int intel_uncore_check_forcewake_domains(struct drm_i915_private *dev_pri
>  	for_each_set_bit(offset, valid, FW_RANGE) {
>  		i915_reg_t reg = { offset };
>  
> +		iosf_mbi_punit_acquire();
>  		intel_uncore_forcewake_reset(dev_priv, false);
> +		iosf_mbi_punit_release();
> +
>  		check_for_unclaimed_mmio(dev_priv);
>  
>  		(void)I915_READ(reg);

This patch looks like one massive layering violation. Why does the GPU code muck 
with the uncore hardware?

Thanks,

	Ingo
Daniel Vetter Oct. 31, 2017, 10:04 a.m. UTC | #2
On Tue, Oct 31, 2017 at 10:50:06AM +0100, Ingo Molnar wrote:
> 
> * Hans de Goede <j.w.r.degoede@gmail.com> wrote:
> 
> > intel_uncore_forcewake_reset() does forcewake puts and gets as such
> > we need to make sure that no-one tries to access the PUNIT->PMIC bus
> > (on systems where this bus is shared) while it runs, otherwise bad
> > things happen.
> > 
> > Normally this is taken care of by the i915_pmic_bus_access_notifier()
> > which does an intel_uncore_forcewake_get(FORCEWAKE_ALL) when some other
> > driver tries to access the PMIC bus, so that later forcewake gets are
> > no-ops (for the duration of the bus access).
> > 
> > But intel_uncore_forcewake_reset gets called in 3 cases:
> > 1) Before registering the pmic_bus_access_notifier
> > 2) After unregistering the pmic_bus_access_notifier
> > 3) To reset forcewake state on a GPU reset
> > 
> > In all 3 cases the i915_pmic_bus_access_notifier() protection is
> > insufficient.
> > 
> > This commit fixes this race by calling iosf_mbi_punit_acquire() before
> > calling intel_uncore_forcewake_reset(). In the case where it is called
> > directly after unregistering the pmic_bus_access_notifier, we need to
> > hold the punit-lock over both calls to avoid a race where
> > intel_uncore_fw_release_timer() may execute between the 2 calls.
> > 
> > Signed-off-by: Hans de Goede <hdegoede@redhat.com>
> > Reviewed-by: Imre Deak <imre.deak@intel.com>
> > ---
> > Changes in v2:
> > -Rebase on current (July 6th 2017) drm-next
> > 
> > Changes in v3:
> > -Keep punit acquired / locked over the unregister + forcewake_reset
> >  call combo to avoid a race hitting between the 2 calls
> > -This requires modifying iosf_mbi_unregister_pmic_bus_access_notifier
> >  to not take the lock itself, since we are the only users this is done
> >  in this same commit
> > 
> > Changes in v4:
> > -Fix missing rename in doc-comment
> > -Add and use iosf_mbi_assert_punit_acquired() helper
> > -Add missing acquire surrounding intel_uncore_forcewake_reset() inside
> >  intel_uncore_check_forcewake_domains()
> > -Add Imre's Reviewed-by
> > 
> > Changes in v5:
> > -Separate out arch/x86 iosf_mbi changes into a separate patch
> > ---
> >  drivers/gpu/drm/i915/intel_uncore.c           | 17 +++++++++++++----
> >  drivers/gpu/drm/i915/selftests/intel_uncore.c |  3 +++
> >  2 files changed, 16 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> > index 8c2ce81f01c2..0da81faf3981 100644
> > --- a/drivers/gpu/drm/i915/intel_uncore.c
> > +++ b/drivers/gpu/drm/i915/intel_uncore.c
> > @@ -229,6 +229,7 @@ intel_uncore_fw_release_timer(struct hrtimer *timer)
> >  	return HRTIMER_NORESTART;
> >  }
> >  
> > +/* Note callers must have acquired the PUNIT->PMIC bus, before calling this. */
> >  static void intel_uncore_forcewake_reset(struct drm_i915_private *dev_priv,
> >  					 bool restore)
> >  {
> > @@ -237,6 +238,8 @@ static void intel_uncore_forcewake_reset(struct drm_i915_private *dev_priv,
> >  	int retry_count = 100;
> >  	enum forcewake_domains fw, active_domains;
> >  
> > +	iosf_mbi_assert_punit_acquired();
> > +
> >  	/* Hold uncore.lock across reset to prevent any register access
> >  	 * with forcewake not set correctly. Wait until all pending
> >  	 * timers are run before holding.
> > @@ -416,14 +419,18 @@ static void __intel_uncore_early_sanitize(struct drm_i915_private *dev_priv,
> >  				   GT_FIFO_CTL_RC6_POLICY_STALL);
> >  	}
> >  
> > +	iosf_mbi_punit_acquire();
> >  	intel_uncore_forcewake_reset(dev_priv, restore_forcewake);
> > +	iosf_mbi_punit_release();
> >  }
> >  
> >  void intel_uncore_suspend(struct drm_i915_private *dev_priv)
> >  {
> > -	iosf_mbi_unregister_pmic_bus_access_notifier(
> > +	iosf_mbi_punit_acquire();
> > +	iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(
> >  		&dev_priv->uncore.pmic_bus_access_nb);
> >  	intel_uncore_forcewake_reset(dev_priv, false);
> > +	iosf_mbi_punit_release();
> >  }
> >  
> >  void intel_uncore_resume_early(struct drm_i915_private *dev_priv)
> > @@ -1315,12 +1322,14 @@ void intel_uncore_init(struct drm_i915_private *dev_priv)
> >  
> >  void intel_uncore_fini(struct drm_i915_private *dev_priv)
> >  {
> > -	iosf_mbi_unregister_pmic_bus_access_notifier(
> > -		&dev_priv->uncore.pmic_bus_access_nb);
> > -
> >  	/* Paranoia: make sure we have disabled everything before we exit. */
> >  	intel_uncore_sanitize(dev_priv);
> > +
> > +	iosf_mbi_punit_acquire();
> > +	iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(
> > +		&dev_priv->uncore.pmic_bus_access_nb);
> >  	intel_uncore_forcewake_reset(dev_priv, false);
> > +	iosf_mbi_punit_release();
> >  }
> >  
> >  static const struct reg_whitelist {
> > diff --git a/drivers/gpu/drm/i915/selftests/intel_uncore.c b/drivers/gpu/drm/i915/selftests/intel_uncore.c
> > index 3cac22eb47ce..733d87fe7737 100644
> > --- a/drivers/gpu/drm/i915/selftests/intel_uncore.c
> > +++ b/drivers/gpu/drm/i915/selftests/intel_uncore.c
> > @@ -148,7 +148,10 @@ static int intel_uncore_check_forcewake_domains(struct drm_i915_private *dev_pri
> >  	for_each_set_bit(offset, valid, FW_RANGE) {
> >  		i915_reg_t reg = { offset };
> >  
> > +		iosf_mbi_punit_acquire();
> >  		intel_uncore_forcewake_reset(dev_priv, false);
> > +		iosf_mbi_punit_release();
> > +
> >  		check_for_unclaimed_mmio(dev_priv);
> >  
> >  		(void)I915_READ(reg);
> 
> This patch looks like one massive layering violation. Why does the GPU code muck 
> with the uncore hardware?

Because the hw is an even worse layering violation. There's way too much
"magic" stuff going on in the background, which then sometimes doesn't
work and we end up implementing hacks in drivers to paper over it.

Slightly more details: The gpu can also get access to the pmic, through
its own register window, except the synchronization at the hw/fw level is
screwed up and doesn't work, breaking the illusion that the gpu is a
prefectly isolated pci device. In reality, at the SoC level, it's anything
but. But because those deps aren't clearly expressed (people hoped it
would work with the magic, which was all added to make running Windows on
top of this possible), so it all looks like horrible hacks instead of the
much cleaner design arm-soc platforms have with DT describing all these
deps much more explicitly.

Aside: There's a lot more mmio windows of other devices and special
backdoors to other stuff in the gpu "pci" mmio bar than just this. Mostly
they work as designed.
-Daniel
Hans de Goede Oct. 31, 2017, 10:10 a.m. UTC | #3
Hi,

On 31-10-17 10:50, Ingo Molnar wrote:
> 
> * Hans de Goede <j.w.r.degoede@gmail.com> wrote:
> 
>> intel_uncore_forcewake_reset() does forcewake puts and gets as such
>> we need to make sure that no-one tries to access the PUNIT->PMIC bus
>> (on systems where this bus is shared) while it runs, otherwise bad
>> things happen.
>>
>> Normally this is taken care of by the i915_pmic_bus_access_notifier()
>> which does an intel_uncore_forcewake_get(FORCEWAKE_ALL) when some other
>> driver tries to access the PMIC bus, so that later forcewake gets are
>> no-ops (for the duration of the bus access).
>>
>> But intel_uncore_forcewake_reset gets called in 3 cases:
>> 1) Before registering the pmic_bus_access_notifier
>> 2) After unregistering the pmic_bus_access_notifier
>> 3) To reset forcewake state on a GPU reset
>>
>> In all 3 cases the i915_pmic_bus_access_notifier() protection is
>> insufficient.
>>
>> This commit fixes this race by calling iosf_mbi_punit_acquire() before
>> calling intel_uncore_forcewake_reset(). In the case where it is called
>> directly after unregistering the pmic_bus_access_notifier, we need to
>> hold the punit-lock over both calls to avoid a race where
>> intel_uncore_fw_release_timer() may execute between the 2 calls.
>>
>> Signed-off-by: Hans de Goede <hdegoede@redhat.com>
>> Reviewed-by: Imre Deak <imre.deak@intel.com>
>> ---
>> Changes in v2:
>> -Rebase on current (July 6th 2017) drm-next
>>
>> Changes in v3:
>> -Keep punit acquired / locked over the unregister + forcewake_reset
>>   call combo to avoid a race hitting between the 2 calls
>> -This requires modifying iosf_mbi_unregister_pmic_bus_access_notifier
>>   to not take the lock itself, since we are the only users this is done
>>   in this same commit
>>
>> Changes in v4:
>> -Fix missing rename in doc-comment
>> -Add and use iosf_mbi_assert_punit_acquired() helper
>> -Add missing acquire surrounding intel_uncore_forcewake_reset() inside
>>   intel_uncore_check_forcewake_domains()
>> -Add Imre's Reviewed-by
>>
>> Changes in v5:
>> -Separate out arch/x86 iosf_mbi changes into a separate patch
>> ---
>>   drivers/gpu/drm/i915/intel_uncore.c           | 17 +++++++++++++----
>>   drivers/gpu/drm/i915/selftests/intel_uncore.c |  3 +++
>>   2 files changed, 16 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
>> index 8c2ce81f01c2..0da81faf3981 100644
>> --- a/drivers/gpu/drm/i915/intel_uncore.c
>> +++ b/drivers/gpu/drm/i915/intel_uncore.c
>> @@ -229,6 +229,7 @@ intel_uncore_fw_release_timer(struct hrtimer *timer)
>>   	return HRTIMER_NORESTART;
>>   }
>>   
>> +/* Note callers must have acquired the PUNIT->PMIC bus, before calling this. */
>>   static void intel_uncore_forcewake_reset(struct drm_i915_private *dev_priv,
>>   					 bool restore)
>>   {
>> @@ -237,6 +238,8 @@ static void intel_uncore_forcewake_reset(struct drm_i915_private *dev_priv,
>>   	int retry_count = 100;
>>   	enum forcewake_domains fw, active_domains;
>>   
>> +	iosf_mbi_assert_punit_acquired();
>> +
>>   	/* Hold uncore.lock across reset to prevent any register access
>>   	 * with forcewake not set correctly. Wait until all pending
>>   	 * timers are run before holding.
>> @@ -416,14 +419,18 @@ static void __intel_uncore_early_sanitize(struct drm_i915_private *dev_priv,
>>   				   GT_FIFO_CTL_RC6_POLICY_STALL);
>>   	}
>>   
>> +	iosf_mbi_punit_acquire();
>>   	intel_uncore_forcewake_reset(dev_priv, restore_forcewake);
>> +	iosf_mbi_punit_release();
>>   }
>>   
>>   void intel_uncore_suspend(struct drm_i915_private *dev_priv)
>>   {
>> -	iosf_mbi_unregister_pmic_bus_access_notifier(
>> +	iosf_mbi_punit_acquire();
>> +	iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(
>>   		&dev_priv->uncore.pmic_bus_access_nb);
>>   	intel_uncore_forcewake_reset(dev_priv, false);
>> +	iosf_mbi_punit_release();
>>   }
>>   
>>   void intel_uncore_resume_early(struct drm_i915_private *dev_priv)
>> @@ -1315,12 +1322,14 @@ void intel_uncore_init(struct drm_i915_private *dev_priv)
>>   
>>   void intel_uncore_fini(struct drm_i915_private *dev_priv)
>>   {
>> -	iosf_mbi_unregister_pmic_bus_access_notifier(
>> -		&dev_priv->uncore.pmic_bus_access_nb);
>> -
>>   	/* Paranoia: make sure we have disabled everything before we exit. */
>>   	intel_uncore_sanitize(dev_priv);
>> +
>> +	iosf_mbi_punit_acquire();
>> +	iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(
>> +		&dev_priv->uncore.pmic_bus_access_nb);
>>   	intel_uncore_forcewake_reset(dev_priv, false);
>> +	iosf_mbi_punit_release();
>>   }
>>   
>>   static const struct reg_whitelist {
>> diff --git a/drivers/gpu/drm/i915/selftests/intel_uncore.c b/drivers/gpu/drm/i915/selftests/intel_uncore.c
>> index 3cac22eb47ce..733d87fe7737 100644
>> --- a/drivers/gpu/drm/i915/selftests/intel_uncore.c
>> +++ b/drivers/gpu/drm/i915/selftests/intel_uncore.c
>> @@ -148,7 +148,10 @@ static int intel_uncore_check_forcewake_domains(struct drm_i915_private *dev_pri
>>   	for_each_set_bit(offset, valid, FW_RANGE) {
>>   		i915_reg_t reg = { offset };
>>   
>> +		iosf_mbi_punit_acquire();
>>   		intel_uncore_forcewake_reset(dev_priv, false);
>> +		iosf_mbi_punit_release();
>> +
>>   		check_for_unclaimed_mmio(dev_priv);
>>   
>>   		(void)I915_READ(reg);
> 
> This patch looks like one massive layering violation. Why does the GPU code muck
> with the uncore hardware?

AFAIK uncore stands for not-core, so all the peripheral stuff
we have on a CPU (or rather really SoC) nowadays, the GPU is part
of those peripherals and again AFAIK the GPU driver needs to make
sure certain power-domains are awake when accessing various parts
of the GPU. But perhaps one of the i915 devs can answer this
question better.

Now as for the specifics of this patch-set, in most Intel systems
the CPU/SoC communicates to the PMIC to set various power-domain
voltages through a dedicated SVID bus.

But the Bay Trail CR (cost reduced) and Cherry Trail platforms
which use an AXP288 PMIC are special. The AXP288 PMIC does not
support the SVID bus. Instead the PUnit inside the SoC
communicates over i2c with the PMIC. This i2c bus is shared
with regular in kernel i2c drivers, which unfortunately
is necessary to implement various ACPI opregions.

To solve the shared i2c bus access the PUnit has a PMIC bus
access semaphore which gets accessed via the iosf_mbi bus.
Unfortunately having the i2c-host controller acquire this
semaphore alone is not enough protection. It seems this only
stops the PUnit from doing power-transitions requiring the
i2c bus on itself. If some code running on the CPU, like
the GPU driver wants to change certain power-domains
explicitly the kernel needs to make sure that no i2c
accesses to the PMIC are happening at the same time.

One mechanism used here is a notification from the i2c-host
controller driver to the GPU code that it is going to use
the i2c bus, which this patch set is about.

Note that the layering issues you talk about are already in
place and do not get changed in anyway by these 2 patches.

All these 2 patches do is change the code to unregister the
notifier so that a single lock can be held over both
unregistering the notifier and making powerdomain changes,
fixing a race.

Regards,

Hans
Ingo Molnar Oct. 31, 2017, 11:54 a.m. UTC | #4
* Daniel Vetter <daniel@ffwll.ch> wrote:

> On Tue, Oct 31, 2017 at 10:50:06AM +0100, Ingo Molnar wrote:
> > 
> > * Hans de Goede <j.w.r.degoede@gmail.com> wrote:
> > 
> > > intel_uncore_forcewake_reset() does forcewake puts and gets as such
> > > we need to make sure that no-one tries to access the PUNIT->PMIC bus
> > > (on systems where this bus is shared) while it runs, otherwise bad
> > > things happen.
> > > 
> > > Normally this is taken care of by the i915_pmic_bus_access_notifier()
> > > which does an intel_uncore_forcewake_get(FORCEWAKE_ALL) when some other
> > > driver tries to access the PMIC bus, so that later forcewake gets are
> > > no-ops (for the duration of the bus access).
> > > 
> > > But intel_uncore_forcewake_reset gets called in 3 cases:
> > > 1) Before registering the pmic_bus_access_notifier
> > > 2) After unregistering the pmic_bus_access_notifier
> > > 3) To reset forcewake state on a GPU reset
> > > 
> > > In all 3 cases the i915_pmic_bus_access_notifier() protection is
> > > insufficient.
> > > 
> > > This commit fixes this race by calling iosf_mbi_punit_acquire() before
> > > calling intel_uncore_forcewake_reset(). In the case where it is called
> > > directly after unregistering the pmic_bus_access_notifier, we need to
> > > hold the punit-lock over both calls to avoid a race where
> > > intel_uncore_fw_release_timer() may execute between the 2 calls.
> > > 
> > > Signed-off-by: Hans de Goede <hdegoede@redhat.com>
> > > Reviewed-by: Imre Deak <imre.deak@intel.com>
> > > ---
> > > Changes in v2:
> > > -Rebase on current (July 6th 2017) drm-next
> > > 
> > > Changes in v3:
> > > -Keep punit acquired / locked over the unregister + forcewake_reset
> > >  call combo to avoid a race hitting between the 2 calls
> > > -This requires modifying iosf_mbi_unregister_pmic_bus_access_notifier
> > >  to not take the lock itself, since we are the only users this is done
> > >  in this same commit
> > > 
> > > Changes in v4:
> > > -Fix missing rename in doc-comment
> > > -Add and use iosf_mbi_assert_punit_acquired() helper
> > > -Add missing acquire surrounding intel_uncore_forcewake_reset() inside
> > >  intel_uncore_check_forcewake_domains()
> > > -Add Imre's Reviewed-by
> > > 
> > > Changes in v5:
> > > -Separate out arch/x86 iosf_mbi changes into a separate patch
> > > ---
> > >  drivers/gpu/drm/i915/intel_uncore.c           | 17 +++++++++++++----
> > >  drivers/gpu/drm/i915/selftests/intel_uncore.c |  3 +++
> > >  2 files changed, 16 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> > > index 8c2ce81f01c2..0da81faf3981 100644
> > > --- a/drivers/gpu/drm/i915/intel_uncore.c
> > > +++ b/drivers/gpu/drm/i915/intel_uncore.c
> > > @@ -229,6 +229,7 @@ intel_uncore_fw_release_timer(struct hrtimer *timer)
> > >  	return HRTIMER_NORESTART;
> > >  }
> > >  
> > > +/* Note callers must have acquired the PUNIT->PMIC bus, before calling this. */
> > >  static void intel_uncore_forcewake_reset(struct drm_i915_private *dev_priv,
> > >  					 bool restore)
> > >  {
> > > @@ -237,6 +238,8 @@ static void intel_uncore_forcewake_reset(struct drm_i915_private *dev_priv,
> > >  	int retry_count = 100;
> > >  	enum forcewake_domains fw, active_domains;
> > >  
> > > +	iosf_mbi_assert_punit_acquired();
> > > +
> > >  	/* Hold uncore.lock across reset to prevent any register access
> > >  	 * with forcewake not set correctly. Wait until all pending
> > >  	 * timers are run before holding.
> > > @@ -416,14 +419,18 @@ static void __intel_uncore_early_sanitize(struct drm_i915_private *dev_priv,
> > >  				   GT_FIFO_CTL_RC6_POLICY_STALL);
> > >  	}
> > >  
> > > +	iosf_mbi_punit_acquire();
> > >  	intel_uncore_forcewake_reset(dev_priv, restore_forcewake);
> > > +	iosf_mbi_punit_release();
> > >  }
> > >  
> > >  void intel_uncore_suspend(struct drm_i915_private *dev_priv)
> > >  {
> > > -	iosf_mbi_unregister_pmic_bus_access_notifier(
> > > +	iosf_mbi_punit_acquire();
> > > +	iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(
> > >  		&dev_priv->uncore.pmic_bus_access_nb);
> > >  	intel_uncore_forcewake_reset(dev_priv, false);
> > > +	iosf_mbi_punit_release();
> > >  }
> > >  
> > >  void intel_uncore_resume_early(struct drm_i915_private *dev_priv)
> > > @@ -1315,12 +1322,14 @@ void intel_uncore_init(struct drm_i915_private *dev_priv)
> > >  
> > >  void intel_uncore_fini(struct drm_i915_private *dev_priv)
> > >  {
> > > -	iosf_mbi_unregister_pmic_bus_access_notifier(
> > > -		&dev_priv->uncore.pmic_bus_access_nb);
> > > -
> > >  	/* Paranoia: make sure we have disabled everything before we exit. */
> > >  	intel_uncore_sanitize(dev_priv);
> > > +
> > > +	iosf_mbi_punit_acquire();
> > > +	iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(
> > > +		&dev_priv->uncore.pmic_bus_access_nb);
> > >  	intel_uncore_forcewake_reset(dev_priv, false);
> > > +	iosf_mbi_punit_release();
> > >  }
> > >  
> > >  static const struct reg_whitelist {
> > > diff --git a/drivers/gpu/drm/i915/selftests/intel_uncore.c b/drivers/gpu/drm/i915/selftests/intel_uncore.c
> > > index 3cac22eb47ce..733d87fe7737 100644
> > > --- a/drivers/gpu/drm/i915/selftests/intel_uncore.c
> > > +++ b/drivers/gpu/drm/i915/selftests/intel_uncore.c
> > > @@ -148,7 +148,10 @@ static int intel_uncore_check_forcewake_domains(struct drm_i915_private *dev_pri
> > >  	for_each_set_bit(offset, valid, FW_RANGE) {
> > >  		i915_reg_t reg = { offset };
> > >  
> > > +		iosf_mbi_punit_acquire();
> > >  		intel_uncore_forcewake_reset(dev_priv, false);
> > > +		iosf_mbi_punit_release();
> > > +
> > >  		check_for_unclaimed_mmio(dev_priv);
> > >  
> > >  		(void)I915_READ(reg);
> > 
> > This patch looks like one massive layering violation. Why does the GPU code muck 
> > with the uncore hardware?
> 
> Because the hw is an even worse layering violation. There's way too much
> "magic" stuff going on in the background, which then sometimes doesn't
> work and we end up implementing hacks in drivers to paper over it.

Ok, fair enough I guess:

  Acked-by: Ingo Molnar <mingo@kernel.org>

Thanks,

	Ingo
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 8c2ce81f01c2..0da81faf3981 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -229,6 +229,7 @@  intel_uncore_fw_release_timer(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
+/* Note callers must have acquired the PUNIT->PMIC bus, before calling this. */
 static void intel_uncore_forcewake_reset(struct drm_i915_private *dev_priv,
 					 bool restore)
 {
@@ -237,6 +238,8 @@  static void intel_uncore_forcewake_reset(struct drm_i915_private *dev_priv,
 	int retry_count = 100;
 	enum forcewake_domains fw, active_domains;
 
+	iosf_mbi_assert_punit_acquired();
+
 	/* Hold uncore.lock across reset to prevent any register access
 	 * with forcewake not set correctly. Wait until all pending
 	 * timers are run before holding.
@@ -416,14 +419,18 @@  static void __intel_uncore_early_sanitize(struct drm_i915_private *dev_priv,
 				   GT_FIFO_CTL_RC6_POLICY_STALL);
 	}
 
+	iosf_mbi_punit_acquire();
 	intel_uncore_forcewake_reset(dev_priv, restore_forcewake);
+	iosf_mbi_punit_release();
 }
 
 void intel_uncore_suspend(struct drm_i915_private *dev_priv)
 {
-	iosf_mbi_unregister_pmic_bus_access_notifier(
+	iosf_mbi_punit_acquire();
+	iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(
 		&dev_priv->uncore.pmic_bus_access_nb);
 	intel_uncore_forcewake_reset(dev_priv, false);
+	iosf_mbi_punit_release();
 }
 
 void intel_uncore_resume_early(struct drm_i915_private *dev_priv)
@@ -1315,12 +1322,14 @@  void intel_uncore_init(struct drm_i915_private *dev_priv)
 
 void intel_uncore_fini(struct drm_i915_private *dev_priv)
 {
-	iosf_mbi_unregister_pmic_bus_access_notifier(
-		&dev_priv->uncore.pmic_bus_access_nb);
-
 	/* Paranoia: make sure we have disabled everything before we exit. */
 	intel_uncore_sanitize(dev_priv);
+
+	iosf_mbi_punit_acquire();
+	iosf_mbi_unregister_pmic_bus_access_notifier_unlocked(
+		&dev_priv->uncore.pmic_bus_access_nb);
 	intel_uncore_forcewake_reset(dev_priv, false);
+	iosf_mbi_punit_release();
 }
 
 static const struct reg_whitelist {
diff --git a/drivers/gpu/drm/i915/selftests/intel_uncore.c b/drivers/gpu/drm/i915/selftests/intel_uncore.c
index 3cac22eb47ce..733d87fe7737 100644
--- a/drivers/gpu/drm/i915/selftests/intel_uncore.c
+++ b/drivers/gpu/drm/i915/selftests/intel_uncore.c
@@ -148,7 +148,10 @@  static int intel_uncore_check_forcewake_domains(struct drm_i915_private *dev_pri
 	for_each_set_bit(offset, valid, FW_RANGE) {
 		i915_reg_t reg = { offset };
 
+		iosf_mbi_punit_acquire();
 		intel_uncore_forcewake_reset(dev_priv, false);
+		iosf_mbi_punit_release();
+
 		check_for_unclaimed_mmio(dev_priv);
 
 		(void)I915_READ(reg);