diff mbox series

drm/amdgpu: Cancel delayed work when GFXOFF is disabled

Message ID 20210817082325.2579-1-michel@daenzer.net (mailing list archive)
State New, archived
Headers show
Series drm/amdgpu: Cancel delayed work when GFXOFF is disabled | expand

Commit Message

Michel Dänzer Aug. 17, 2021, 8:23 a.m. UTC
From: Michel Dänzer <mdaenzer@redhat.com>

schedule_delayed_work does not push back the work if it was already
scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
was disabled and re-enabled again during those 100 ms.

This resulted in frame drops / stutter with the upcoming mutter 41
release on Navi 14, due to constantly enabling GFXOFF in the HW and
disabling it again (for getting the GPU clock counter).

To fix this, call cancel_delayed_work_sync when the disable count
transitions from 0 to 1, and only schedule the delayed work on the
reverse transition, not if the disable count was already 0. This makes
sure the delayed work doesn't run at unexpected times, and allows it to
be lock-free.

v2:
* Use cancel_delayed_work_sync & mutex_trylock instead of
  mod_delayed_work.
v3:
* Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
v4:
* Fix race condition between amdgpu_gfx_off_ctrl incrementing
  adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
  checking for it to be 0 (Evan Quan)

Cc: stable@vger.kernel.org
Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> # v3
Acked-by: Christian König <christian.koenig@amd.com> # v3
Signed-off-by: Michel Dänzer <mdaenzer@redhat.com>
---

Alex, probably best to wait a bit longer before picking this up. :)

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 +++++++++++++++-------
 2 files changed, 30 insertions(+), 17 deletions(-)

Comments

Lazar, Lijo Aug. 17, 2021, 9:12 a.m. UTC | #1
On 8/17/2021 1:53 PM, Michel Dänzer wrote:
> From: Michel Dänzer <mdaenzer@redhat.com>
> 
> schedule_delayed_work does not push back the work if it was already
> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
> was disabled and re-enabled again during those 100 ms.
> 
> This resulted in frame drops / stutter with the upcoming mutter 41
> release on Navi 14, due to constantly enabling GFXOFF in the HW and
> disabling it again (for getting the GPU clock counter).
> 
> To fix this, call cancel_delayed_work_sync when the disable count
> transitions from 0 to 1, and only schedule the delayed work on the
> reverse transition, not if the disable count was already 0. This makes
> sure the delayed work doesn't run at unexpected times, and allows it to
> be lock-free.
> 
> v2:
> * Use cancel_delayed_work_sync & mutex_trylock instead of
>    mod_delayed_work.
> v3:
> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
> v4:
> * Fix race condition between amdgpu_gfx_off_ctrl incrementing
>    adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
>    checking for it to be 0 (Evan Quan)
> 
> Cc: stable@vger.kernel.org
> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> # v3
> Acked-by: Christian König <christian.koenig@amd.com> # v3
> Signed-off-by: Michel Dänzer <mdaenzer@redhat.com>
> ---
> 
> Alex, probably best to wait a bit longer before picking this up. :)
> 
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++----
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 +++++++++++++++-------
>   2 files changed, 30 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index f3fd5ec710b6..f944ed858f3e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2777,12 +2777,11 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
>   	struct amdgpu_device *adev =
>   		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
>   
> -	mutex_lock(&adev->gfx.gfx_off_mutex);
> -	if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
> -		if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
> -			adev->gfx.gfx_off_state = true;
> -	}
> -	mutex_unlock(&adev->gfx.gfx_off_mutex);
> +	WARN_ON_ONCE(adev->gfx.gfx_off_state);
> +	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
> +
> +	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
> +		adev->gfx.gfx_off_state = true;
>   }
>   
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index a0be0772c8b3..b4ced45301be 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable)
>   
>   	mutex_lock(&adev->gfx.gfx_off_mutex);
>   
> -	if (!enable)
> -		adev->gfx.gfx_off_req_count++;
> -	else if (adev->gfx.gfx_off_req_count > 0)
> +	if (enable) {
> +		/* If the count is already 0, it means there's an imbalance bug somewhere.
> +		 * Note that the bug may be in a different caller than the one which triggers the
> +		 * WARN_ON_ONCE.
> +		 */
> +		if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
> +			goto unlock;
> +
>   		adev->gfx.gfx_off_req_count--;
>   
> -	if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
> -		schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
> -	} else if (!enable && adev->gfx.gfx_off_state) {
> -		if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) {
> -			adev->gfx.gfx_off_state = false;
> +		if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state)
> +			schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
> +	} else {
> +		if (adev->gfx.gfx_off_req_count == 0) {
> +			cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work);
> +
> +			if (adev->gfx.gfx_off_state &&

More of a question which I didn't check last time - Is this expected to 
be true when the disable call comes in first?

Thanks,
Lijo

> +			    !amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) {
> +				adev->gfx.gfx_off_state = false;
>   
> -			if (adev->gfx.funcs->init_spm_golden) {
> -				dev_dbg(adev->dev, "GFXOFF is disabled, re-init SPM golden settings\n");
> -				amdgpu_gfx_init_spm_golden(adev);
> +				if (adev->gfx.funcs->init_spm_golden) {
> +					dev_dbg(adev->dev,
> +						"GFXOFF is disabled, re-init SPM golden settings\n");
> +					amdgpu_gfx_init_spm_golden(adev);
> +				}
>   			}
>   		}
> +
> +		adev->gfx.gfx_off_req_count++;
>   	}
>   
> +unlock:
>   	mutex_unlock(&adev->gfx.gfx_off_mutex);
>   }
>   
>
Michel Dänzer Aug. 17, 2021, 9:26 a.m. UTC | #2
On 2021-08-17 11:12 a.m., Lazar, Lijo wrote:
> 
> 
> On 8/17/2021 1:53 PM, Michel Dänzer wrote:
>> From: Michel Dänzer <mdaenzer@redhat.com>
>>
>> schedule_delayed_work does not push back the work if it was already
>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
>> was disabled and re-enabled again during those 100 ms.
>>
>> This resulted in frame drops / stutter with the upcoming mutter 41
>> release on Navi 14, due to constantly enabling GFXOFF in the HW and
>> disabling it again (for getting the GPU clock counter).
>>
>> To fix this, call cancel_delayed_work_sync when the disable count
>> transitions from 0 to 1, and only schedule the delayed work on the
>> reverse transition, not if the disable count was already 0. This makes
>> sure the delayed work doesn't run at unexpected times, and allows it to
>> be lock-free.
>>
>> v2:
>> * Use cancel_delayed_work_sync & mutex_trylock instead of
>>    mod_delayed_work.
>> v3:
>> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
>> v4:
>> * Fix race condition between amdgpu_gfx_off_ctrl incrementing
>>    adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
>>    checking for it to be 0 (Evan Quan)
>>
>> Cc: stable@vger.kernel.org
>> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> # v3
>> Acked-by: Christian König <christian.koenig@amd.com> # v3
>> Signed-off-by: Michel Dänzer <mdaenzer@redhat.com>
>> ---
>>
>> Alex, probably best to wait a bit longer before picking this up. :)
>>
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++----
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 +++++++++++++++-------
>>   2 files changed, 30 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index f3fd5ec710b6..f944ed858f3e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2777,12 +2777,11 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
>>       struct amdgpu_device *adev =
>>           container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
>>   -    mutex_lock(&adev->gfx.gfx_off_mutex);
>> -    if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
>> -        if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
>> -            adev->gfx.gfx_off_state = true;
>> -    }
>> -    mutex_unlock(&adev->gfx.gfx_off_mutex);
>> +    WARN_ON_ONCE(adev->gfx.gfx_off_state);
>> +    WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
>> +
>> +    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
>> +        adev->gfx.gfx_off_state = true;
>>   }
>>     /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> index a0be0772c8b3..b4ced45301be 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>> @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable)
>>         mutex_lock(&adev->gfx.gfx_off_mutex);
>>   -    if (!enable)
>> -        adev->gfx.gfx_off_req_count++;
>> -    else if (adev->gfx.gfx_off_req_count > 0)
>> +    if (enable) {
>> +        /* If the count is already 0, it means there's an imbalance bug somewhere.
>> +         * Note that the bug may be in a different caller than the one which triggers the
>> +         * WARN_ON_ONCE.
>> +         */
>> +        if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
>> +            goto unlock;
>> +
>>           adev->gfx.gfx_off_req_count--;
>>   -    if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
>> -        schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
>> -    } else if (!enable && adev->gfx.gfx_off_state) {
>> -        if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) {
>> -            adev->gfx.gfx_off_state = false;
>> +        if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state)
>> +            schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
>> +    } else {
>> +        if (adev->gfx.gfx_off_req_count == 0) {
>> +            cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work);
>> +
>> +            if (adev->gfx.gfx_off_state &&
> 
> More of a question which I didn't check last time - Is this expected to be true when the disable call comes in first?

My assumption is that cancel_delayed_work_sync guarantees amdgpu_device_delay_enable_gfx_off's assignment is visible here.
Evan Quan Aug. 17, 2021, 9:33 a.m. UTC | #3
[AMD Official Use Only]

Thanks! This seems fine to me.
Reviewed-by: Evan Quan <evan.quan@amd.com>

> -----Original Message-----
> From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of
> Michel Dänzer
> Sent: Tuesday, August 17, 2021 4:23 PM
> To: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian
> <Christian.Koenig@amd.com>
> Cc: Liu, Leo <Leo.Liu@amd.com>; Zhu, James <James.Zhu@amd.com>; amd-
> gfx@lists.freedesktop.org; dri-devel@lists.freedesktop.org
> Subject: [PATCH] drm/amdgpu: Cancel delayed work when GFXOFF is
> disabled
> 
> From: Michel Dänzer <mdaenzer@redhat.com>
> 
> schedule_delayed_work does not push back the work if it was already
> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF was
> disabled and re-enabled again during those 100 ms.
> 
> This resulted in frame drops / stutter with the upcoming mutter 41 release
> on Navi 14, due to constantly enabling GFXOFF in the HW and disabling it
> again (for getting the GPU clock counter).
> 
> To fix this, call cancel_delayed_work_sync when the disable count transitions
> from 0 to 1, and only schedule the delayed work on the reverse transition,
> not if the disable count was already 0. This makes sure the delayed work
> doesn't run at unexpected times, and allows it to be lock-free.
> 
> v2:
> * Use cancel_delayed_work_sync & mutex_trylock instead of
>   mod_delayed_work.
> v3:
> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
> v4:
> * Fix race condition between amdgpu_gfx_off_ctrl incrementing
>   adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
>   checking for it to be 0 (Evan Quan)
> 
> Cc: stable@vger.kernel.org
> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> # v3
> Acked-by: Christian König <christian.koenig@amd.com> # v3
> Signed-off-by: Michel Dänzer <mdaenzer@redhat.com>
> ---
> 
> Alex, probably best to wait a bit longer before picking this up. :)
> 
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 +++++++++++++++----
> ---
>  2 files changed, 30 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index f3fd5ec710b6..f944ed858f3e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2777,12 +2777,11 @@ static void
> amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
>  	struct amdgpu_device *adev =
>  		container_of(work, struct amdgpu_device,
> gfx.gfx_off_delay_work.work);
> 
> -	mutex_lock(&adev->gfx.gfx_off_mutex);
> -	if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
> -		if (!amdgpu_dpm_set_powergating_by_smu(adev,
> AMD_IP_BLOCK_TYPE_GFX, true))
> -			adev->gfx.gfx_off_state = true;
> -	}
> -	mutex_unlock(&adev->gfx.gfx_off_mutex);
> +	WARN_ON_ONCE(adev->gfx.gfx_off_state);
> +	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
> +
> +	if (!amdgpu_dpm_set_powergating_by_smu(adev,
> AMD_IP_BLOCK_TYPE_GFX, true))
> +		adev->gfx.gfx_off_state = true;
>  }
> 
>  /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index a0be0772c8b3..b4ced45301be 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device
> *adev, bool enable)
> 
>  	mutex_lock(&adev->gfx.gfx_off_mutex);
> 
> -	if (!enable)
> -		adev->gfx.gfx_off_req_count++;
> -	else if (adev->gfx.gfx_off_req_count > 0)
> +	if (enable) {
> +		/* If the count is already 0, it means there's an imbalance bug
> somewhere.
> +		 * Note that the bug may be in a different caller than the one
> which triggers the
> +		 * WARN_ON_ONCE.
> +		 */
> +		if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
> +			goto unlock;
> +
>  		adev->gfx.gfx_off_req_count--;
> 
> -	if (enable && !adev->gfx.gfx_off_state && !adev-
> >gfx.gfx_off_req_count) {
> -		schedule_delayed_work(&adev->gfx.gfx_off_delay_work,
> GFX_OFF_DELAY_ENABLE);
> -	} else if (!enable && adev->gfx.gfx_off_state) {
> -		if (!amdgpu_dpm_set_powergating_by_smu(adev,
> AMD_IP_BLOCK_TYPE_GFX, false)) {
> -			adev->gfx.gfx_off_state = false;
> +		if (adev->gfx.gfx_off_req_count == 0 && !adev-
> >gfx.gfx_off_state)
> +			schedule_delayed_work(&adev-
> >gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
> +	} else {
> +		if (adev->gfx.gfx_off_req_count == 0) {
> +			cancel_delayed_work_sync(&adev-
> >gfx.gfx_off_delay_work);
> +
> +			if (adev->gfx.gfx_off_state &&
> +			    !amdgpu_dpm_set_powergating_by_smu(adev,
> AMD_IP_BLOCK_TYPE_GFX, false)) {
> +				adev->gfx.gfx_off_state = false;
> 
> -			if (adev->gfx.funcs->init_spm_golden) {
> -				dev_dbg(adev->dev, "GFXOFF is disabled, re-
> init SPM golden settings\n");
> -				amdgpu_gfx_init_spm_golden(adev);
> +				if (adev->gfx.funcs->init_spm_golden) {
> +					dev_dbg(adev->dev,
> +						"GFXOFF is disabled, re-init
> SPM golden settings\n");
> +					amdgpu_gfx_init_spm_golden(adev);
> +				}
>  			}
>  		}
> +
> +		adev->gfx.gfx_off_req_count++;
>  	}
> 
> +unlock:
>  	mutex_unlock(&adev->gfx.gfx_off_mutex);
>  }
> 
> --
> 2.32.0
Lazar, Lijo Aug. 17, 2021, 9:37 a.m. UTC | #4
On 8/17/2021 2:56 PM, Michel Dänzer wrote:
> On 2021-08-17 11:12 a.m., Lazar, Lijo wrote:
>>
>>
>> On 8/17/2021 1:53 PM, Michel Dänzer wrote:
>>> From: Michel Dänzer <mdaenzer@redhat.com>
>>>
>>> schedule_delayed_work does not push back the work if it was already
>>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
>>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
>>> was disabled and re-enabled again during those 100 ms.
>>>
>>> This resulted in frame drops / stutter with the upcoming mutter 41
>>> release on Navi 14, due to constantly enabling GFXOFF in the HW and
>>> disabling it again (for getting the GPU clock counter).
>>>
>>> To fix this, call cancel_delayed_work_sync when the disable count
>>> transitions from 0 to 1, and only schedule the delayed work on the
>>> reverse transition, not if the disable count was already 0. This makes
>>> sure the delayed work doesn't run at unexpected times, and allows it to
>>> be lock-free.
>>>
>>> v2:
>>> * Use cancel_delayed_work_sync & mutex_trylock instead of
>>>     mod_delayed_work.
>>> v3:
>>> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
>>> v4:
>>> * Fix race condition between amdgpu_gfx_off_ctrl incrementing
>>>     adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
>>>     checking for it to be 0 (Evan Quan)
>>>
>>> Cc: stable@vger.kernel.org
>>> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> # v3
>>> Acked-by: Christian König <christian.koenig@amd.com> # v3
>>> Signed-off-by: Michel Dänzer <mdaenzer@redhat.com>
>>> ---
>>>
>>> Alex, probably best to wait a bit longer before picking this up. :)
>>>
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++----
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 +++++++++++++++-------
>>>    2 files changed, 30 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index f3fd5ec710b6..f944ed858f3e 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -2777,12 +2777,11 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
>>>        struct amdgpu_device *adev =
>>>            container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
>>>    -    mutex_lock(&adev->gfx.gfx_off_mutex);
>>> -    if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
>>> -        if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
>>> -            adev->gfx.gfx_off_state = true;
>>> -    }
>>> -    mutex_unlock(&adev->gfx.gfx_off_mutex);
>>> +    WARN_ON_ONCE(adev->gfx.gfx_off_state);
>>> +    WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
>>> +
>>> +    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
>>> +        adev->gfx.gfx_off_state = true;
>>>    }
>>>      /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> index a0be0772c8b3..b4ced45301be 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>> @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable)
>>>          mutex_lock(&adev->gfx.gfx_off_mutex);
>>>    -    if (!enable)
>>> -        adev->gfx.gfx_off_req_count++;
>>> -    else if (adev->gfx.gfx_off_req_count > 0)
>>> +    if (enable) {
>>> +        /* If the count is already 0, it means there's an imbalance bug somewhere.
>>> +         * Note that the bug may be in a different caller than the one which triggers the
>>> +         * WARN_ON_ONCE.
>>> +         */
>>> +        if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
>>> +            goto unlock;
>>> +
>>>            adev->gfx.gfx_off_req_count--;
>>>    -    if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
>>> -        schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
>>> -    } else if (!enable && adev->gfx.gfx_off_state) {
>>> -        if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) {
>>> -            adev->gfx.gfx_off_state = false;
>>> +        if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state)
>>> +            schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
>>> +    } else {
>>> +        if (adev->gfx.gfx_off_req_count == 0) {
>>> +            cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work);
>>> +
>>> +            if (adev->gfx.gfx_off_state &&
>>
>> More of a question which I didn't check last time - Is this expected to be true when the disable call comes in first?
> 
> My assumption is that cancel_delayed_work_sync guarantees amdgpu_device_delay_enable_gfx_off's assignment is visible here.
> 

To clarify - when nothing is scheduled. If enable() is called when the 
count is 0, it goes to unlock. Now the expectation is someone to call 
Disable first.  Let's say  Disable() is called first, then the variable 
will be false, right?

Thanks,
Lijo
Michel Dänzer Aug. 17, 2021, 9:59 a.m. UTC | #5
On 2021-08-17 11:37 a.m., Lazar, Lijo wrote:
> 
> 
> On 8/17/2021 2:56 PM, Michel Dänzer wrote:
>> On 2021-08-17 11:12 a.m., Lazar, Lijo wrote:
>>>
>>>
>>> On 8/17/2021 1:53 PM, Michel Dänzer wrote:
>>>> From: Michel Dänzer <mdaenzer@redhat.com>
>>>>
>>>> schedule_delayed_work does not push back the work if it was already
>>>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
>>>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
>>>> was disabled and re-enabled again during those 100 ms.
>>>>
>>>> This resulted in frame drops / stutter with the upcoming mutter 41
>>>> release on Navi 14, due to constantly enabling GFXOFF in the HW and
>>>> disabling it again (for getting the GPU clock counter).
>>>>
>>>> To fix this, call cancel_delayed_work_sync when the disable count
>>>> transitions from 0 to 1, and only schedule the delayed work on the
>>>> reverse transition, not if the disable count was already 0. This makes
>>>> sure the delayed work doesn't run at unexpected times, and allows it to
>>>> be lock-free.
>>>>
>>>> v2:
>>>> * Use cancel_delayed_work_sync & mutex_trylock instead of
>>>>     mod_delayed_work.
>>>> v3:
>>>> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
>>>> v4:
>>>> * Fix race condition between amdgpu_gfx_off_ctrl incrementing
>>>>     adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
>>>>     checking for it to be 0 (Evan Quan)
>>>>
>>>> Cc: stable@vger.kernel.org
>>>> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> # v3
>>>> Acked-by: Christian König <christian.koenig@amd.com> # v3
>>>> Signed-off-by: Michel Dänzer <mdaenzer@redhat.com>
>>>> ---
>>>>
>>>> Alex, probably best to wait a bit longer before picking this up. :)
>>>>
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++----
>>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 +++++++++++++++-------
>>>>    2 files changed, 30 insertions(+), 17 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index f3fd5ec710b6..f944ed858f3e 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -2777,12 +2777,11 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
>>>>        struct amdgpu_device *adev =
>>>>            container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
>>>>    -    mutex_lock(&adev->gfx.gfx_off_mutex);
>>>> -    if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
>>>> -        if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
>>>> -            adev->gfx.gfx_off_state = true;
>>>> -    }
>>>> -    mutex_unlock(&adev->gfx.gfx_off_mutex);
>>>> +    WARN_ON_ONCE(adev->gfx.gfx_off_state);
>>>> +    WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
>>>> +
>>>> +    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
>>>> +        adev->gfx.gfx_off_state = true;
>>>>    }
>>>>      /**
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> index a0be0772c8b3..b4ced45301be 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>> @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable)
>>>>          mutex_lock(&adev->gfx.gfx_off_mutex);
>>>>    -    if (!enable)
>>>> -        adev->gfx.gfx_off_req_count++;
>>>> -    else if (adev->gfx.gfx_off_req_count > 0)
>>>> +    if (enable) {
>>>> +        /* If the count is already 0, it means there's an imbalance bug somewhere.
>>>> +         * Note that the bug may be in a different caller than the one which triggers the
>>>> +         * WARN_ON_ONCE.
>>>> +         */
>>>> +        if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
>>>> +            goto unlock;
>>>> +
>>>>            adev->gfx.gfx_off_req_count--;
>>>>    -    if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
>>>> -        schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
>>>> -    } else if (!enable && adev->gfx.gfx_off_state) {
>>>> -        if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) {
>>>> -            adev->gfx.gfx_off_state = false;
>>>> +        if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state)
>>>> +            schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
>>>> +    } else {
>>>> +        if (adev->gfx.gfx_off_req_count == 0) {
>>>> +            cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work);
>>>> +
>>>> +            if (adev->gfx.gfx_off_state &&
>>>
>>> More of a question which I didn't check last time - Is this expected to be true when the disable call comes in first?
>>
>> My assumption is that cancel_delayed_work_sync guarantees amdgpu_device_delay_enable_gfx_off's assignment is visible here.
>>
> 
> To clarify - when nothing is scheduled. If enable() is called when the count is 0, it goes to unlock. Now the expectation is someone to call Disable first.

Yes, the very first amdgpu_gfx_off_ctrl call must pass enable=false, or it's a bug, which

        if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))

will catch.


> Let's say  Disable() is called first, then the variable will be false, right?

Ohh, I see what you mean. The first time amdgpu_gfx_off_ctrl is called with enable=false, adev->gfx.gfx_off_state == false (what it was initialized to), so it doesn't actually disable GFXOFF in HW.

Note that this is a separate pre-existing bug, not a regression of my patch.

I wonder what's the best solution for that, move the adev->gfx.gfx_off_state assignments into amdgpu_dpm_set_powergating_by_smu?
Lazar, Lijo Aug. 17, 2021, 10:37 a.m. UTC | #6
On 8/17/2021 3:29 PM, Michel Dänzer wrote:
> On 2021-08-17 11:37 a.m., Lazar, Lijo wrote:
>>
>>
>> On 8/17/2021 2:56 PM, Michel Dänzer wrote:
>>> On 2021-08-17 11:12 a.m., Lazar, Lijo wrote:
>>>>
>>>>
>>>> On 8/17/2021 1:53 PM, Michel Dänzer wrote:
>>>>> From: Michel Dänzer <mdaenzer@redhat.com>
>>>>>
>>>>> schedule_delayed_work does not push back the work if it was already
>>>>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
>>>>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
>>>>> was disabled and re-enabled again during those 100 ms.
>>>>>
>>>>> This resulted in frame drops / stutter with the upcoming mutter 41
>>>>> release on Navi 14, due to constantly enabling GFXOFF in the HW and
>>>>> disabling it again (for getting the GPU clock counter).
>>>>>
>>>>> To fix this, call cancel_delayed_work_sync when the disable count
>>>>> transitions from 0 to 1, and only schedule the delayed work on the
>>>>> reverse transition, not if the disable count was already 0. This makes
>>>>> sure the delayed work doesn't run at unexpected times, and allows it to
>>>>> be lock-free.
>>>>>
>>>>> v2:
>>>>> * Use cancel_delayed_work_sync & mutex_trylock instead of
>>>>>      mod_delayed_work.
>>>>> v3:
>>>>> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
>>>>> v4:
>>>>> * Fix race condition between amdgpu_gfx_off_ctrl incrementing
>>>>>      adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
>>>>>      checking for it to be 0 (Evan Quan)
>>>>>
>>>>> Cc: stable@vger.kernel.org
>>>>> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> # v3
>>>>> Acked-by: Christian König <christian.koenig@amd.com> # v3
>>>>> Signed-off-by: Michel Dänzer <mdaenzer@redhat.com>
>>>>> ---
>>>>>
>>>>> Alex, probably best to wait a bit longer before picking this up. :)
>>>>>
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++----
>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 +++++++++++++++-------
>>>>>     2 files changed, 30 insertions(+), 17 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> index f3fd5ec710b6..f944ed858f3e 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> @@ -2777,12 +2777,11 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
>>>>>         struct amdgpu_device *adev =
>>>>>             container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
>>>>>     -    mutex_lock(&adev->gfx.gfx_off_mutex);
>>>>> -    if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
>>>>> -        if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
>>>>> -            adev->gfx.gfx_off_state = true;
>>>>> -    }
>>>>> -    mutex_unlock(&adev->gfx.gfx_off_mutex);
>>>>> +    WARN_ON_ONCE(adev->gfx.gfx_off_state);
>>>>> +    WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
>>>>> +
>>>>> +    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
>>>>> +        adev->gfx.gfx_off_state = true;
>>>>>     }
>>>>>       /**
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>> index a0be0772c8b3..b4ced45301be 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>> @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable)
>>>>>           mutex_lock(&adev->gfx.gfx_off_mutex);
>>>>>     -    if (!enable)
>>>>> -        adev->gfx.gfx_off_req_count++;
>>>>> -    else if (adev->gfx.gfx_off_req_count > 0)
>>>>> +    if (enable) {
>>>>> +        /* If the count is already 0, it means there's an imbalance bug somewhere.
>>>>> +         * Note that the bug may be in a different caller than the one which triggers the
>>>>> +         * WARN_ON_ONCE.
>>>>> +         */
>>>>> +        if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
>>>>> +            goto unlock;
>>>>> +
>>>>>             adev->gfx.gfx_off_req_count--;
>>>>>     -    if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
>>>>> -        schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
>>>>> -    } else if (!enable && adev->gfx.gfx_off_state) {
>>>>> -        if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) {
>>>>> -            adev->gfx.gfx_off_state = false;
>>>>> +        if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state)
>>>>> +            schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
>>>>> +    } else {
>>>>> +        if (adev->gfx.gfx_off_req_count == 0) {
>>>>> +            cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work);
>>>>> +
>>>>> +            if (adev->gfx.gfx_off_state &&
>>>>
>>>> More of a question which I didn't check last time - Is this expected to be true when the disable call comes in first?
>>>
>>> My assumption is that cancel_delayed_work_sync guarantees amdgpu_device_delay_enable_gfx_off's assignment is visible here.
>>>
>>
>> To clarify - when nothing is scheduled. If enable() is called when the count is 0, it goes to unlock. Now the expectation is someone to call Disable first.
> 
> Yes, the very first amdgpu_gfx_off_ctrl call must pass enable=false, or it's a bug, which
> 
>          if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
> 
> will catch.
> 
> 
>> Let's say  Disable() is called first, then the variable will be false, right?
> 
> Ohh, I see what you mean. The first time amdgpu_gfx_off_ctrl is called with enable=false, adev->gfx.gfx_off_state == false (what it was initialized to), so it doesn't actually disable GFXOFF in HW.

Exactly.
> 
> Note that this is a separate pre-existing bug, not a regression of my patch.
> 
> I wonder what's the best solution for that, move the adev->gfx.gfx_off_state assignments into amdgpu_dpm_set_powergating_by_smu?

Should be an existing one, never bothered about that condition before.

One hack would be

is_pending = cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work);

	if ((adev->gfx.gfx_off_state || !is_pending) &&

If work was never scheduled or pending, is_pending should be false OR if 
it got executed, gfx_off_state should be set.

Thanks,
Lijo

> 
>
Michel Dänzer Aug. 17, 2021, 11:06 a.m. UTC | #7
On 2021-08-17 12:37 p.m., Lazar, Lijo wrote:
> 
> 
> On 8/17/2021 3:29 PM, Michel Dänzer wrote:
>> On 2021-08-17 11:37 a.m., Lazar, Lijo wrote:
>>>
>>>
>>> On 8/17/2021 2:56 PM, Michel Dänzer wrote:
>>>> On 2021-08-17 11:12 a.m., Lazar, Lijo wrote:
>>>>>
>>>>>
>>>>> On 8/17/2021 1:53 PM, Michel Dänzer wrote:
>>>>>> From: Michel Dänzer <mdaenzer@redhat.com>
>>>>>>
>>>>>> schedule_delayed_work does not push back the work if it was already
>>>>>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
>>>>>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
>>>>>> was disabled and re-enabled again during those 100 ms.
>>>>>>
>>>>>> This resulted in frame drops / stutter with the upcoming mutter 41
>>>>>> release on Navi 14, due to constantly enabling GFXOFF in the HW and
>>>>>> disabling it again (for getting the GPU clock counter).
>>>>>>
>>>>>> To fix this, call cancel_delayed_work_sync when the disable count
>>>>>> transitions from 0 to 1, and only schedule the delayed work on the
>>>>>> reverse transition, not if the disable count was already 0. This makes
>>>>>> sure the delayed work doesn't run at unexpected times, and allows it to
>>>>>> be lock-free.
>>>>>>
>>>>>> v2:
>>>>>> * Use cancel_delayed_work_sync & mutex_trylock instead of
>>>>>>      mod_delayed_work.
>>>>>> v3:
>>>>>> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
>>>>>> v4:
>>>>>> * Fix race condition between amdgpu_gfx_off_ctrl incrementing
>>>>>>      adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
>>>>>>      checking for it to be 0 (Evan Quan)
>>>>>>
>>>>>> Cc: stable@vger.kernel.org
>>>>>> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> # v3
>>>>>> Acked-by: Christian König <christian.koenig@amd.com> # v3
>>>>>> Signed-off-by: Michel Dänzer <mdaenzer@redhat.com>
>>>>>> ---
>>>>>>
>>>>>> Alex, probably best to wait a bit longer before picking this up. :)
>>>>>>
>>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++----
>>>>>>     drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 +++++++++++++++-------
>>>>>>     2 files changed, 30 insertions(+), 17 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> index f3fd5ec710b6..f944ed858f3e 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> @@ -2777,12 +2777,11 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
>>>>>>         struct amdgpu_device *adev =
>>>>>>             container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
>>>>>>     -    mutex_lock(&adev->gfx.gfx_off_mutex);
>>>>>> -    if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
>>>>>> -        if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
>>>>>> -            adev->gfx.gfx_off_state = true;
>>>>>> -    }
>>>>>> -    mutex_unlock(&adev->gfx.gfx_off_mutex);
>>>>>> +    WARN_ON_ONCE(adev->gfx.gfx_off_state);
>>>>>> +    WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
>>>>>> +
>>>>>> +    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
>>>>>> +        adev->gfx.gfx_off_state = true;
>>>>>>     }
>>>>>>       /**
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>> index a0be0772c8b3..b4ced45301be 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>> @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable)
>>>>>>           mutex_lock(&adev->gfx.gfx_off_mutex);
>>>>>>     -    if (!enable)
>>>>>> -        adev->gfx.gfx_off_req_count++;
>>>>>> -    else if (adev->gfx.gfx_off_req_count > 0)
>>>>>> +    if (enable) {
>>>>>> +        /* If the count is already 0, it means there's an imbalance bug somewhere.
>>>>>> +         * Note that the bug may be in a different caller than the one which triggers the
>>>>>> +         * WARN_ON_ONCE.
>>>>>> +         */
>>>>>> +        if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
>>>>>> +            goto unlock;
>>>>>> +
>>>>>>             adev->gfx.gfx_off_req_count--;
>>>>>>     -    if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
>>>>>> -        schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
>>>>>> -    } else if (!enable && adev->gfx.gfx_off_state) {
>>>>>> -        if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) {
>>>>>> -            adev->gfx.gfx_off_state = false;
>>>>>> +        if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state)
>>>>>> +            schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
>>>>>> +    } else {
>>>>>> +        if (adev->gfx.gfx_off_req_count == 0) {
>>>>>> +            cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work);
>>>>>> +
>>>>>> +            if (adev->gfx.gfx_off_state &&
>>>>>
>>>>> More of a question which I didn't check last time - Is this expected to be true when the disable call comes in first?
>>>>
>>>> My assumption is that cancel_delayed_work_sync guarantees amdgpu_device_delay_enable_gfx_off's assignment is visible here.
>>>>
>>>
>>> To clarify - when nothing is scheduled. If enable() is called when the count is 0, it goes to unlock. Now the expectation is someone to call Disable first.
>>
>> Yes, the very first amdgpu_gfx_off_ctrl call must pass enable=false, or it's a bug, which
>>
>>          if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
>>
>> will catch.
>>
>>
>>> Let's say  Disable() is called first, then the variable will be false, right?
>>
>> Ohh, I see what you mean. The first time amdgpu_gfx_off_ctrl is called with enable=false, adev->gfx.gfx_off_state == false (what it was initialized to), so it doesn't actually disable GFXOFF in HW.
> 
> Exactly.

Turns out that's not the end of that rabbit (side-)hole yet. :)

amdgpu_device_init initializes adev->gfx.gfx_off_req_count = 1. amdgpu_gfx_off_ctrl is then called with enable=true from amdgpu_device_init → amdgpu_device_ip_late_init → amdgpu_device_set_pg_state. This schedules amdgpu_device_delay_enable_gfx_off, which runs ~100ms later, enables GFXOFF in the HW and sets adev->gfx.gfx_off_state = true.

So it looks fine as is actually, if a bit convoluted. (I wonder if GFXOFF shouldn't rather be enabled synchronously during initialization though)
Lazar, Lijo Aug. 17, 2021, 11:49 a.m. UTC | #8
On 8/17/2021 4:36 PM, Michel Dänzer wrote:
> On 2021-08-17 12:37 p.m., Lazar, Lijo wrote:
>>
>>
>> On 8/17/2021 3:29 PM, Michel Dänzer wrote:
>>> On 2021-08-17 11:37 a.m., Lazar, Lijo wrote:
>>>>
>>>>
>>>> On 8/17/2021 2:56 PM, Michel Dänzer wrote:
>>>>> On 2021-08-17 11:12 a.m., Lazar, Lijo wrote:
>>>>>>
>>>>>>
>>>>>> On 8/17/2021 1:53 PM, Michel Dänzer wrote:
>>>>>>> From: Michel Dänzer <mdaenzer@redhat.com>
>>>>>>>
>>>>>>> schedule_delayed_work does not push back the work if it was already
>>>>>>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
>>>>>>> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
>>>>>>> was disabled and re-enabled again during those 100 ms.
>>>>>>>
>>>>>>> This resulted in frame drops / stutter with the upcoming mutter 41
>>>>>>> release on Navi 14, due to constantly enabling GFXOFF in the HW and
>>>>>>> disabling it again (for getting the GPU clock counter).
>>>>>>>
>>>>>>> To fix this, call cancel_delayed_work_sync when the disable count
>>>>>>> transitions from 0 to 1, and only schedule the delayed work on the
>>>>>>> reverse transition, not if the disable count was already 0. This makes
>>>>>>> sure the delayed work doesn't run at unexpected times, and allows it to
>>>>>>> be lock-free.
>>>>>>>
>>>>>>> v2:
>>>>>>> * Use cancel_delayed_work_sync & mutex_trylock instead of
>>>>>>>       mod_delayed_work.
>>>>>>> v3:
>>>>>>> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
>>>>>>> v4:
>>>>>>> * Fix race condition between amdgpu_gfx_off_ctrl incrementing
>>>>>>>       adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
>>>>>>>       checking for it to be 0 (Evan Quan)
>>>>>>>
>>>>>>> Cc: stable@vger.kernel.org
>>>>>>> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> # v3
>>>>>>> Acked-by: Christian König <christian.koenig@amd.com> # v3
>>>>>>> Signed-off-by: Michel Dänzer <mdaenzer@redhat.com>
>>>>>>> ---
>>>>>>>
>>>>>>> Alex, probably best to wait a bit longer before picking this up. :)
>>>>>>>
>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++----
>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 +++++++++++++++-------
>>>>>>>      2 files changed, 30 insertions(+), 17 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> index f3fd5ec710b6..f944ed858f3e 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> @@ -2777,12 +2777,11 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
>>>>>>>          struct amdgpu_device *adev =
>>>>>>>              container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
>>>>>>>      -    mutex_lock(&adev->gfx.gfx_off_mutex);
>>>>>>> -    if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
>>>>>>> -        if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
>>>>>>> -            adev->gfx.gfx_off_state = true;
>>>>>>> -    }
>>>>>>> -    mutex_unlock(&adev->gfx.gfx_off_mutex);
>>>>>>> +    WARN_ON_ONCE(adev->gfx.gfx_off_state);
>>>>>>> +    WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
>>>>>>> +
>>>>>>> +    if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
>>>>>>> +        adev->gfx.gfx_off_state = true;
>>>>>>>      }
>>>>>>>        /**
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>>> index a0be0772c8b3..b4ced45301be 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>>> @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable)
>>>>>>>            mutex_lock(&adev->gfx.gfx_off_mutex);
>>>>>>>      -    if (!enable)
>>>>>>> -        adev->gfx.gfx_off_req_count++;
>>>>>>> -    else if (adev->gfx.gfx_off_req_count > 0)
>>>>>>> +    if (enable) {
>>>>>>> +        /* If the count is already 0, it means there's an imbalance bug somewhere.
>>>>>>> +         * Note that the bug may be in a different caller than the one which triggers the
>>>>>>> +         * WARN_ON_ONCE.
>>>>>>> +         */
>>>>>>> +        if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
>>>>>>> +            goto unlock;
>>>>>>> +
>>>>>>>              adev->gfx.gfx_off_req_count--;
>>>>>>>      -    if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
>>>>>>> -        schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
>>>>>>> -    } else if (!enable && adev->gfx.gfx_off_state) {
>>>>>>> -        if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) {
>>>>>>> -            adev->gfx.gfx_off_state = false;
>>>>>>> +        if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state)
>>>>>>> +            schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
>>>>>>> +    } else {
>>>>>>> +        if (adev->gfx.gfx_off_req_count == 0) {
>>>>>>> +            cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work);
>>>>>>> +
>>>>>>> +            if (adev->gfx.gfx_off_state &&
>>>>>>
>>>>>> More of a question which I didn't check last time - Is this expected to be true when the disable call comes in first?
>>>>>
>>>>> My assumption is that cancel_delayed_work_sync guarantees amdgpu_device_delay_enable_gfx_off's assignment is visible here.
>>>>>
>>>>
>>>> To clarify - when nothing is scheduled. If enable() is called when the count is 0, it goes to unlock. Now the expectation is someone to call Disable first.
>>>
>>> Yes, the very first amdgpu_gfx_off_ctrl call must pass enable=false, or it's a bug, which
>>>
>>>           if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
>>>
>>> will catch.
>>>
>>>
>>>> Let's say  Disable() is called first, then the variable will be false, right?
>>>
>>> Ohh, I see what you mean. The first time amdgpu_gfx_off_ctrl is called with enable=false, adev->gfx.gfx_off_state == false (what it was initialized to), so it doesn't actually disable GFXOFF in HW.
>>
>> Exactly.
> 
> Turns out that's not the end of that rabbit (side-)hole yet. :)
> 
> amdgpu_device_init initializes adev->gfx.gfx_off_req_count = 1. amdgpu_gfx_off_ctrl is then called with enable=true from amdgpu_device_init → amdgpu_device_ip_late_init → amdgpu_device_set_pg_state. This schedules amdgpu_device_delay_enable_gfx_off, which runs ~100ms later, enables GFXOFF in the HW and sets adev->gfx.gfx_off_state = true.
> 

What if a disable comes at < 100ms? Quite unlikely, neverthless in that 
case pending work will get cancelled and the variable won't be set until 
the work gets a chance to fully run. The assumption that GFXOFF disable 
succeeded in a subsequent amdgpu_gfx_off_ctrl  enable = false won't be 
correct as PMFW will by default enable GFXOFF when there is no activity.

Otherwise, keep an assumption that amdgpu_device_delay_enable_gfx_off 
gets a chance to run before any disable call comes - maybe that's the 
case in most cases.

> So it looks fine as is actually, if a bit convoluted. 

> (I wonder if GFXOFF shouldn't rather be enabled synchronously during initialization though)

Yes, that is logical. But amdgpu_device_ip_late_init is called also 
during amdgpu_device_resume. amdgpu_device_resume is used in pm_ops or 
runtime pm. In those cases it makes sense to delay it as there could be 
an immediate usage of GFX.

Thanks,
Lijo

> 
>
Lazar, Lijo Aug. 17, 2021, 12:55 p.m. UTC | #9
On 8/17/2021 5:19 PM, Lazar, Lijo wrote:
> 
> 
> On 8/17/2021 4:36 PM, Michel Dänzer wrote:
>> On 2021-08-17 12:37 p.m., Lazar, Lijo wrote:
>>>
>>>
>>> On 8/17/2021 3:29 PM, Michel Dänzer wrote:
>>>> On 2021-08-17 11:37 a.m., Lazar, Lijo wrote:
>>>>>
>>>>>
>>>>> On 8/17/2021 2:56 PM, Michel Dänzer wrote:
>>>>>> On 2021-08-17 11:12 a.m., Lazar, Lijo wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 8/17/2021 1:53 PM, Michel Dänzer wrote:
>>>>>>>> From: Michel Dänzer <mdaenzer@redhat.com>
>>>>>>>>
>>>>>>>> schedule_delayed_work does not push back the work if it was already
>>>>>>>> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
>>>>>>>> after the first time GFXOFF was disabled and re-enabled, even if 
>>>>>>>> GFXOFF
>>>>>>>> was disabled and re-enabled again during those 100 ms.
>>>>>>>>
>>>>>>>> This resulted in frame drops / stutter with the upcoming mutter 41
>>>>>>>> release on Navi 14, due to constantly enabling GFXOFF in the HW and
>>>>>>>> disabling it again (for getting the GPU clock counter).
>>>>>>>>
>>>>>>>> To fix this, call cancel_delayed_work_sync when the disable count
>>>>>>>> transitions from 0 to 1, and only schedule the delayed work on the
>>>>>>>> reverse transition, not if the disable count was already 0. This 
>>>>>>>> makes
>>>>>>>> sure the delayed work doesn't run at unexpected times, and 
>>>>>>>> allows it to
>>>>>>>> be lock-free.
>>>>>>>>
>>>>>>>> v2:
>>>>>>>> * Use cancel_delayed_work_sync & mutex_trylock instead of
>>>>>>>>       mod_delayed_work.
>>>>>>>> v3:
>>>>>>>> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian 
>>>>>>>> König)
>>>>>>>> v4:
>>>>>>>> * Fix race condition between amdgpu_gfx_off_ctrl incrementing
>>>>>>>>       adev->gfx.gfx_off_req_count and 
>>>>>>>> amdgpu_device_delay_enable_gfx_off
>>>>>>>>       checking for it to be 0 (Evan Quan)
>>>>>>>>
>>>>>>>> Cc: stable@vger.kernel.org
>>>>>>>> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> # v3
>>>>>>>> Acked-by: Christian König <christian.koenig@amd.com> # v3
>>>>>>>> Signed-off-by: Michel Dänzer <mdaenzer@redhat.com>
>>>>>>>> ---
>>>>>>>>
>>>>>>>> Alex, probably best to wait a bit longer before picking this up. :)
>>>>>>>>
>>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++----
>>>>>>>>      drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 
>>>>>>>> +++++++++++++++-------
>>>>>>>>      2 files changed, 30 insertions(+), 17 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> index f3fd5ec710b6..f944ed858f3e 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> @@ -2777,12 +2777,11 @@ static void 
>>>>>>>> amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
>>>>>>>>          struct amdgpu_device *adev =
>>>>>>>>              container_of(work, struct amdgpu_device, 
>>>>>>>> gfx.gfx_off_delay_work.work);
>>>>>>>>      -    mutex_lock(&adev->gfx.gfx_off_mutex);
>>>>>>>> -    if (!adev->gfx.gfx_off_state && 
>>>>>>>> !adev->gfx.gfx_off_req_count) {
>>>>>>>> -        if (!amdgpu_dpm_set_powergating_by_smu(adev, 
>>>>>>>> AMD_IP_BLOCK_TYPE_GFX, true))
>>>>>>>> -            adev->gfx.gfx_off_state = true;
>>>>>>>> -    }
>>>>>>>> -    mutex_unlock(&adev->gfx.gfx_off_mutex);
>>>>>>>> +    WARN_ON_ONCE(adev->gfx.gfx_off_state);
>>>>>>>> +    WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
>>>>>>>> +
>>>>>>>> +    if (!amdgpu_dpm_set_powergating_by_smu(adev, 
>>>>>>>> AMD_IP_BLOCK_TYPE_GFX, true))
>>>>>>>> +        adev->gfx.gfx_off_state = true;
>>>>>>>>      }
>>>>>>>>        /**
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>>>> index a0be0772c8b3..b4ced45301be 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
>>>>>>>> @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct 
>>>>>>>> amdgpu_device *adev, bool enable)
>>>>>>>>            mutex_lock(&adev->gfx.gfx_off_mutex);
>>>>>>>>      -    if (!enable)
>>>>>>>> -        adev->gfx.gfx_off_req_count++;
>>>>>>>> -    else if (adev->gfx.gfx_off_req_count > 0)
>>>>>>>> +    if (enable) {
>>>>>>>> +        /* If the count is already 0, it means there's an 
>>>>>>>> imbalance bug somewhere.
>>>>>>>> +         * Note that the bug may be in a different caller than 
>>>>>>>> the one which triggers the
>>>>>>>> +         * WARN_ON_ONCE.
>>>>>>>> +         */
>>>>>>>> +        if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
>>>>>>>> +            goto unlock;
>>>>>>>> +
>>>>>>>>              adev->gfx.gfx_off_req_count--;
>>>>>>>>      -    if (enable && !adev->gfx.gfx_off_state && 
>>>>>>>> !adev->gfx.gfx_off_req_count) {
>>>>>>>> -        schedule_delayed_work(&adev->gfx.gfx_off_delay_work, 
>>>>>>>> GFX_OFF_DELAY_ENABLE);
>>>>>>>> -    } else if (!enable && adev->gfx.gfx_off_state) {
>>>>>>>> -        if (!amdgpu_dpm_set_powergating_by_smu(adev, 
>>>>>>>> AMD_IP_BLOCK_TYPE_GFX, false)) {
>>>>>>>> -            adev->gfx.gfx_off_state = false;
>>>>>>>> +        if (adev->gfx.gfx_off_req_count == 0 && 
>>>>>>>> !adev->gfx.gfx_off_state)
>>>>>>>> +            
>>>>>>>> schedule_delayed_work(&adev->gfx.gfx_off_delay_work, 
>>>>>>>> GFX_OFF_DELAY_ENABLE);
>>>>>>>> +    } else {
>>>>>>>> +        if (adev->gfx.gfx_off_req_count == 0) {
>>>>>>>> +            
>>>>>>>> cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work);
>>>>>>>> +
>>>>>>>> +            if (adev->gfx.gfx_off_state &&
>>>>>>>
>>>>>>> More of a question which I didn't check last time - Is this 
>>>>>>> expected to be true when the disable call comes in first?
>>>>>>
>>>>>> My assumption is that cancel_delayed_work_sync guarantees 
>>>>>> amdgpu_device_delay_enable_gfx_off's assignment is visible here.
>>>>>>
>>>>>
>>>>> To clarify - when nothing is scheduled. If enable() is called when 
>>>>> the count is 0, it goes to unlock. Now the expectation is someone 
>>>>> to call Disable first.
>>>>
>>>> Yes, the very first amdgpu_gfx_off_ctrl call must pass enable=false, 
>>>> or it's a bug, which
>>>>
>>>>           if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
>>>>
>>>> will catch.
>>>>
>>>>
>>>>> Let's say  Disable() is called first, then the variable will be 
>>>>> false, right?
>>>>
>>>> Ohh, I see what you mean. The first time amdgpu_gfx_off_ctrl is 
>>>> called with enable=false, adev->gfx.gfx_off_state == false (what it 
>>>> was initialized to), so it doesn't actually disable GFXOFF in HW.
>>>
>>> Exactly.
>>
>> Turns out that's not the end of that rabbit (side-)hole yet. :)
>>
>> amdgpu_device_init initializes adev->gfx.gfx_off_req_count = 1. 
>> amdgpu_gfx_off_ctrl is then called with enable=true from 
>> amdgpu_device_init → amdgpu_device_ip_late_init → 
>> amdgpu_device_set_pg_state. This schedules 
>> amdgpu_device_delay_enable_gfx_off, which runs ~100ms later, enables 
>> GFXOFF in the HW and sets adev->gfx.gfx_off_state = true.
>>
> 
> What if a disable comes at < 100ms? Quite unlikely, neverthless in that 
> case pending work will get cancelled and the variable won't be set until 
> the work gets a chance to fully run. The assumption that GFXOFF disable 
> succeeded in a subsequent amdgpu_gfx_off_ctrl  enable = false won't be 
> correct as PMFW will by default enable GFXOFF when there is no activity.

"PMFW will by default enable GFXOFF when there is no activity."
Checked again and this is false at least for Sienna Cichlid/NV1x.Driver 
must explicitly allow GfxOff first. In that sense, driver doesn't need 
to disable GFXOFF unless it has succeeded in enabling it.

Overall, the existing logic is fine. Sorry for the confusion.

Thanks,
Lijo

> Otherwise, keep an assumption that amdgpu_device_delay_enable_gfx_off 
> gets a chance to run before any disable call comes - maybe that's the 
> case in most cases.
> 
>> So it looks fine as is actually, if a bit convoluted. 
> 
>> (I wonder if GFXOFF shouldn't rather be enabled synchronously during 
>> initialization though)
> 
> Yes, that is logical. But amdgpu_device_ip_late_init is called also 
> during amdgpu_device_resume. amdgpu_device_resume is used in pm_ops or 
> runtime pm. In those cases it makes sense to delay it as there could be 
> an immediate usage of GFX.
> 
> Thanks,
> Lijo
> 
>>
>>
Alex Deucher Aug. 18, 2021, 9:56 p.m. UTC | #10
Applied.  Let's see how long this one lasts :)

Alex

On Tue, Aug 17, 2021 at 4:23 AM Michel Dänzer <michel@daenzer.net> wrote:
>
> From: Michel Dänzer <mdaenzer@redhat.com>
>
> schedule_delayed_work does not push back the work if it was already
> scheduled before, so amdgpu_device_delay_enable_gfx_off ran ~100 ms
> after the first time GFXOFF was disabled and re-enabled, even if GFXOFF
> was disabled and re-enabled again during those 100 ms.
>
> This resulted in frame drops / stutter with the upcoming mutter 41
> release on Navi 14, due to constantly enabling GFXOFF in the HW and
> disabling it again (for getting the GPU clock counter).
>
> To fix this, call cancel_delayed_work_sync when the disable count
> transitions from 0 to 1, and only schedule the delayed work on the
> reverse transition, not if the disable count was already 0. This makes
> sure the delayed work doesn't run at unexpected times, and allows it to
> be lock-free.
>
> v2:
> * Use cancel_delayed_work_sync & mutex_trylock instead of
>   mod_delayed_work.
> v3:
> * Make amdgpu_device_delay_enable_gfx_off lock-free (Christian König)
> v4:
> * Fix race condition between amdgpu_gfx_off_ctrl incrementing
>   adev->gfx.gfx_off_req_count and amdgpu_device_delay_enable_gfx_off
>   checking for it to be 0 (Evan Quan)
>
> Cc: stable@vger.kernel.org
> Reviewed-by: Lijo Lazar <lijo.lazar@amd.com> # v3
> Acked-by: Christian König <christian.koenig@amd.com> # v3
> Signed-off-by: Michel Dänzer <mdaenzer@redhat.com>
> ---
>
> Alex, probably best to wait a bit longer before picking this up. :)
>
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 36 +++++++++++++++-------
>  2 files changed, 30 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index f3fd5ec710b6..f944ed858f3e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2777,12 +2777,11 @@ static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
>         struct amdgpu_device *adev =
>                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
>
> -       mutex_lock(&adev->gfx.gfx_off_mutex);
> -       if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
> -               if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
> -                       adev->gfx.gfx_off_state = true;
> -       }
> -       mutex_unlock(&adev->gfx.gfx_off_mutex);
> +       WARN_ON_ONCE(adev->gfx.gfx_off_state);
> +       WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
> +
> +       if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
> +               adev->gfx.gfx_off_state = true;
>  }
>
>  /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index a0be0772c8b3..b4ced45301be 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -563,24 +563,38 @@ void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable)
>
>         mutex_lock(&adev->gfx.gfx_off_mutex);
>
> -       if (!enable)
> -               adev->gfx.gfx_off_req_count++;
> -       else if (adev->gfx.gfx_off_req_count > 0)
> +       if (enable) {
> +               /* If the count is already 0, it means there's an imbalance bug somewhere.
> +                * Note that the bug may be in a different caller than the one which triggers the
> +                * WARN_ON_ONCE.
> +                */
> +               if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
> +                       goto unlock;
> +
>                 adev->gfx.gfx_off_req_count--;
>
> -       if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
> -               schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
> -       } else if (!enable && adev->gfx.gfx_off_state) {
> -               if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) {
> -                       adev->gfx.gfx_off_state = false;
> +               if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state)
> +                       schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
> +       } else {
> +               if (adev->gfx.gfx_off_req_count == 0) {
> +                       cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work);
> +
> +                       if (adev->gfx.gfx_off_state &&
> +                           !amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) {
> +                               adev->gfx.gfx_off_state = false;
>
> -                       if (adev->gfx.funcs->init_spm_golden) {
> -                               dev_dbg(adev->dev, "GFXOFF is disabled, re-init SPM golden settings\n");
> -                               amdgpu_gfx_init_spm_golden(adev);
> +                               if (adev->gfx.funcs->init_spm_golden) {
> +                                       dev_dbg(adev->dev,
> +                                               "GFXOFF is disabled, re-init SPM golden settings\n");
> +                                       amdgpu_gfx_init_spm_golden(adev);
> +                               }
>                         }
>                 }
> +
> +               adev->gfx.gfx_off_req_count++;
>         }
>
> +unlock:
>         mutex_unlock(&adev->gfx.gfx_off_mutex);
>  }
>
> --
> 2.32.0
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index f3fd5ec710b6..f944ed858f3e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2777,12 +2777,11 @@  static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
 	struct amdgpu_device *adev =
 		container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
 
-	mutex_lock(&adev->gfx.gfx_off_mutex);
-	if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
-		if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
-			adev->gfx.gfx_off_state = true;
-	}
-	mutex_unlock(&adev->gfx.gfx_off_mutex);
+	WARN_ON_ONCE(adev->gfx.gfx_off_state);
+	WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
+
+	if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
+		adev->gfx.gfx_off_state = true;
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index a0be0772c8b3..b4ced45301be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -563,24 +563,38 @@  void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable)
 
 	mutex_lock(&adev->gfx.gfx_off_mutex);
 
-	if (!enable)
-		adev->gfx.gfx_off_req_count++;
-	else if (adev->gfx.gfx_off_req_count > 0)
+	if (enable) {
+		/* If the count is already 0, it means there's an imbalance bug somewhere.
+		 * Note that the bug may be in a different caller than the one which triggers the
+		 * WARN_ON_ONCE.
+		 */
+		if (WARN_ON_ONCE(adev->gfx.gfx_off_req_count == 0))
+			goto unlock;
+
 		adev->gfx.gfx_off_req_count--;
 
-	if (enable && !adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
-		schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
-	} else if (!enable && adev->gfx.gfx_off_state) {
-		if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) {
-			adev->gfx.gfx_off_state = false;
+		if (adev->gfx.gfx_off_req_count == 0 && !adev->gfx.gfx_off_state)
+			schedule_delayed_work(&adev->gfx.gfx_off_delay_work, GFX_OFF_DELAY_ENABLE);
+	} else {
+		if (adev->gfx.gfx_off_req_count == 0) {
+			cancel_delayed_work_sync(&adev->gfx.gfx_off_delay_work);
+
+			if (adev->gfx.gfx_off_state &&
+			    !amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, false)) {
+				adev->gfx.gfx_off_state = false;
 
-			if (adev->gfx.funcs->init_spm_golden) {
-				dev_dbg(adev->dev, "GFXOFF is disabled, re-init SPM golden settings\n");
-				amdgpu_gfx_init_spm_golden(adev);
+				if (adev->gfx.funcs->init_spm_golden) {
+					dev_dbg(adev->dev,
+						"GFXOFF is disabled, re-init SPM golden settings\n");
+					amdgpu_gfx_init_spm_golden(adev);
+				}
 			}
 		}
+
+		adev->gfx.gfx_off_req_count++;
 	}
 
+unlock:
 	mutex_unlock(&adev->gfx.gfx_off_mutex);
 }