diff mbox series

[1/1] amdgpu fix for gfx1103 queue evict/restore crash

Message ID 20241127114638.11216-2-lamikr@gmail.com (mailing list archive)
State New
Headers show
Series amdgpu fix for gfx1103 queue evict/restore crash v2 | expand

Commit Message

Mika Laitio Nov. 27, 2024, 11:46 a.m. UTC
The AMD gfx1103 / M780 iGPU will eventually crash when used for
PyTorch ML/AI operations on the ROCm SDK stack. After the kernel error
the application exits with an error, and the Linux desktop itself can
sometimes either freeze or reset back to the login screen.

The error happens randomly when the kernel calls the evict_process_queues_cpsch
and restore_process_queues_cpsch methods to remove and restore the queues
that have been created earlier.

The fix is to skip the evict and restore calls when the device in use is an
iGPU. The queues that have been added during the user space application's
execution time will still be removed when the application exits.

On every test attempt the crash has always happened at the
same location, while removing the 2nd queue of 3, with doorbell id 0x1002.

Below is the trace captured by adding more printouts to the problem
location, so that a message is also printed when a queue is evicted or
restored successfully.

[  948.324174] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1202, queue: 2, caller: restore_process_queues_cpsch
[  948.334344] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1002, queue: 1, caller: restore_process_queues_cpsch
[  948.344499] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1000, queue: 0, caller: restore_process_queues_cpsch
[  952.380614] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed hardware queue from MES, doorbell=0x1202, queue: 2, caller: evict_process_queues_cpsch
[  952.391330] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed hardware queue from MES, doorbell=0x1002, queue: 1, caller: evict_process_queues_cpsch
[  952.401634] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed hardware queue from MES, doorbell=0x1000, queue: 0, caller: evict_process_queues_cpsch
[  952.414507] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1202, queue: 2, caller: restore_process_queues_cpsch
[  952.424618] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1002, queue: 1, caller: restore_process_queues_cpsch
[  952.434922] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1000, queue: 0, caller: restore_process_queues_cpsch
[  952.446272] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed hardware queue from MES, doorbell=0x1202, queue: 2, caller: evict_process_queues_cpsch
[  954.460341] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to msg=REMOVE_QUEUE
[  954.460356] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes failed to remove hardware queue from MES, doorbell=0x1002, queue: 1, caller: evict_process_queues_cpsch
[  954.460360] amdgpu 0000:c4:00.0: amdgpu: MES might be in unrecoverable state, issue a GPU reset
[  954.460366] amdgpu 0000:c4:00.0: amdgpu: Failed to evict queue 1
[  954.460368] amdgpu 0000:c4:00.0: amdgpu: Failed to evict process queues
[  954.460439] amdgpu 0000:c4:00.0: amdgpu: GPU reset begin!
[  954.460464] amdgpu 0000:c4:00.0: amdgpu: remove_all_queues_mes: Failed to remove queue 0 for dev 5257
[  954.460515] amdgpu 0000:c4:00.0: amdgpu: Dumping IP State
[  954.462637] amdgpu 0000:c4:00.0: amdgpu: Dumping IP State Completed
[  955.865591] amdgpu: process_termination_cpsch started
[  955.866432] amdgpu: process_termination_cpsch started
[  955.866445] amdgpu 0000:c4:00.0: amdgpu: Failed to remove queue 0
[  956.503043] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to msg=REMOVE_QUEUE
[  956.503059] [drm:amdgpu_mes_unmap_legacy_queue [amdgpu]] *ERROR* failed to unmap legacy queue
[  958.507491] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to msg=REMOVE_QUEUE
[  958.507507] [drm:amdgpu_mes_unmap_legacy_queue [amdgpu]] *ERROR* failed to unmap legacy queue
[  960.512077] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to msg=REMOVE_QUEUE
[  960.512093] [drm:amdgpu_mes_unmap_legacy_queue [amdgpu]] *ERROR* failed to unmap legacy queue
[  960.785816] [drm:gfx_v11_0_hw_fini [amdgpu]] *ERROR* failed to halt cp gfx

Signed-off-by: Mika Laitio <lamikr@gmail.com>
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c | 24 ++++++++++++-------
 1 file changed, 16 insertions(+), 8 deletions(-)

Comments

Christian König Nov. 27, 2024, 11:51 a.m. UTC | #1
Am 27.11.24 um 12:46 schrieb Mika Laitio:
> AMD gfx1103 / M780 iGPU will crash eventually when used for
> pytorch ML/AI operations on rocm sdk stack. After kernel error
> the application exits on error and linux desktop can itself
> sometimes either freeze or reset back to login screen.
>
> Error will happen randomly when kernel calls evict_process_queues_cpsch and
> restore_process_queues_cpsch methods to remove and restore the queues
> that has been created earlier.
>
> The fix is to remove the evict and restore calls when device used is
> iGPU. The queues that has been added during the user space application execution
> time will still be removed when the application exits

As far as I can see that is absolutely not a fix but rather an obviously
broken workaround.

Evicting and restoring queues is usually mandatory for correct operation.

So just ignoring that this doesn't work is simply not something you can do.

Regards,
Christian.

>
> On evety test attempts the crash has always happened on the
> same location while removing the 2nd queue of 3 with doorbell id 0x1002.
>
> Below is the trace captured by adding more printouts to problem
> location to print message also when the queue is evicted or resrored
> succesfully.
>
> [  948.324174] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1202, queue: 2, caller: restore_process_queues_cpsch
> [  948.334344] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1002, queue: 1, caller: restore_process_queues_cpsch
> [  948.344499] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1000, queue: 0, caller: restore_process_queues_cpsch
> [  952.380614] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed hardware queue from MES, doorbell=0x1202, queue: 2, caller: evict_process_queues_cpsch
> [  952.391330] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed hardware queue from MES, doorbell=0x1002, queue: 1, caller: evict_process_queues_cpsch
> [  952.401634] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed hardware queue from MES, doorbell=0x1000, queue: 0, caller: evict_process_queues_cpsch
> [  952.414507] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1202, queue: 2, caller: restore_process_queues_cpsch
> [  952.424618] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1002, queue: 1, caller: restore_process_queues_cpsch
> [  952.434922] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added hardware queue to MES, doorbell=0x1000, queue: 0, caller: restore_process_queues_cpsch
> [  952.446272] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed hardware queue from MES, doorbell=0x1202, queue: 2, caller: evict_process_queues_cpsch
> [  954.460341] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to msg=REMOVE_QUEUE
> [  954.460356] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes failed to remove hardware queue from MES, doorbell=0x1002, queue: 1, caller: evict_process_queues_cpsch
> [  954.460360] amdgpu 0000:c4:00.0: amdgpu: MES might be in unrecoverable state, issue a GPU reset
> [  954.460366] amdgpu 0000:c4:00.0: amdgpu: Failed to evict queue 1
> [  954.460368] amdgpu 0000:c4:00.0: amdgpu: Failed to evict process queues
> [  954.460439] amdgpu 0000:c4:00.0: amdgpu: GPU reset begin!
> [  954.460464] amdgpu 0000:c4:00.0: amdgpu: remove_all_queues_mes: Failed to remove queue 0 for dev 5257
> [  954.460515] amdgpu 0000:c4:00.0: amdgpu: Dumping IP State
> [  954.462637] amdgpu 0000:c4:00.0: amdgpu: Dumping IP State Completed
> [  955.865591] amdgpu: process_termination_cpsch started
> [  955.866432] amdgpu: process_termination_cpsch started
> [  955.866445] amdgpu 0000:c4:00.0: amdgpu: Failed to remove queue 0
> [  956.503043] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to msg=REMOVE_QUEUE
> [  956.503059] [drm:amdgpu_mes_unmap_legacy_queue [amdgpu]] *ERROR* failed to unmap legacy queue
> [  958.507491] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to msg=REMOVE_QUEUE
> [  958.507507] [drm:amdgpu_mes_unmap_legacy_queue [amdgpu]] *ERROR* failed to unmap legacy queue
> [  960.512077] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to msg=REMOVE_QUEUE
> [  960.512093] [drm:amdgpu_mes_unmap_legacy_queue [amdgpu]] *ERROR* failed to unmap legacy queue
> [  960.785816] [drm:gfx_v11_0_hw_fini [amdgpu]] *ERROR* failed to halt cp gfx
>
> Signed-off-by: Mika Laitio <lamikr@gmail.com>
> ---
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 24 ++++++++++++-------
>   1 file changed, 16 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index c79fe9069e22..96088d480e09 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1187,9 +1187,12 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
>   	struct kfd_process_device *pdd;
>   	int retval = 0;
>   
> +	// gfx1103 APU can fail to remove queue on evict/restore cycle
> +	if (dqm->dev->adev->flags & AMD_IS_APU)
> +		goto out;
>   	dqm_lock(dqm);
>   	if (qpd->evicted++ > 0) /* already evicted, do nothing */
> -		goto out;
> +		goto out_unlock;
>   
>   	pdd = qpd_to_pdd(qpd);
>   
> @@ -1198,7 +1201,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
>   	 * Skip queue eviction on process eviction.
>   	 */
>   	if (!pdd->drm_priv)
> -		goto out;
> +		goto out_unlock;
>   
>   	pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
>   			    pdd->process->pasid);
> @@ -1219,7 +1222,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
>   			if (retval) {
>   				dev_err(dev, "Failed to evict queue %d\n",
>   					q->properties.queue_id);
> -				goto out;
> +				goto out_unlock;
>   			}
>   		}
>   	}
> @@ -1231,8 +1234,9 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
>   					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
>   					      USE_DEFAULT_GRACE_PERIOD);
>   
> -out:
> +out_unlock:
>   	dqm_unlock(dqm);
> +out:
>   	return retval;
>   }
>   
> @@ -1326,14 +1330,17 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
>   	uint64_t eviction_duration;
>   	int retval = 0;
>   
> +	// gfx1103 APU can fail to remove queue on evict/restore cycle
> +	if (dqm->dev->adev->flags & AMD_IS_APU)
> +		goto out;
>   	pdd = qpd_to_pdd(qpd);
>   
>   	dqm_lock(dqm);
>   	if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
> -		goto out;
> +		goto out_unlock;
>   	if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
>   		qpd->evicted--;
> -		goto out;
> +		goto out_unlock;
>   	}
>   
>   	/* The debugger creates processes that temporarily have not acquired
> @@ -1364,7 +1371,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
>   			if (retval) {
>   				dev_err(dev, "Failed to restore queue %d\n",
>   					q->properties.queue_id);
> -				goto out;
> +				goto out_unlock;
>   			}
>   		}
>   	}
> @@ -1375,8 +1382,9 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
>   	atomic64_add(eviction_duration, &pdd->evict_duration_counter);
>   vm_not_acquired:
>   	qpd->evicted = 0;
> -out:
> +out_unlock:
>   	dqm_unlock(dqm);
> +out:
>   	return retval;
>   }
>
Felix Kuehling Nov. 27, 2024, 11:50 p.m. UTC | #2
On 2024-11-27 06:51, Christian König wrote:
> Am 27.11.24 um 12:46 schrieb Mika Laitio:
>> AMD gfx1103 / M780 iGPU will crash eventually when used for
>> pytorch ML/AI operations on rocm sdk stack. After kernel error
>> the application exits on error and linux desktop can itself
>> sometimes either freeze or reset back to login screen.
>>
>> Error will happen randomly when kernel calls 
>> evict_process_queues_cpsch and
>> restore_process_queues_cpsch methods to remove and restore the queues
>> that has been created earlier.
>>
>> The fix is to remove the evict and restore calls when device used is
>> iGPU. The queues that has been added during the user space 
>> application execution
>> time will still be removed when the application exits
>
> As far as I can see that is absolutely not a fix but rather a 
> obviously broken workaround.
>
> Evicting and restoring queues is usually mandatory for correct operation.
>
> So just ignore that this doesn't work will just is not something you 
> can do.

I agree. Eviction happens for example in MMU notifiers where we need to 
assure the kernel that memory won't be accessed by the GPU once the 
notifier returns, until the memory mappings in the GPU page tables can 
be revalidated.

This looks like a crude workaround for an MES firmware problem or some 
other kind of intermittent hang that needs to be root-caused. It's a 
NACK from me as well.

Regards,
   Felix


>
> Regards,
> Christian.
>
>>
>> On evety test attempts the crash has always happened on the
>> same location while removing the 2nd queue of 3 with doorbell id 0x1002.
>>
>> Below is the trace captured by adding more printouts to problem
>> location to print message also when the queue is evicted or resrored
>> succesfully.
>>
>> [  948.324174] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added 
>> hardware queue to MES, doorbell=0x1202, queue: 2, caller: 
>> restore_process_queues_cpsch
>> [  948.334344] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added 
>> hardware queue to MES, doorbell=0x1002, queue: 1, caller: 
>> restore_process_queues_cpsch
>> [  948.344499] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added 
>> hardware queue to MES, doorbell=0x1000, queue: 0, caller: 
>> restore_process_queues_cpsch
>> [  952.380614] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed 
>> hardware queue from MES, doorbell=0x1202, queue: 2, caller: 
>> evict_process_queues_cpsch
>> [  952.391330] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed 
>> hardware queue from MES, doorbell=0x1002, queue: 1, caller: 
>> evict_process_queues_cpsch
>> [  952.401634] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed 
>> hardware queue from MES, doorbell=0x1000, queue: 0, caller: 
>> evict_process_queues_cpsch
>> [  952.414507] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added 
>> hardware queue to MES, doorbell=0x1202, queue: 2, caller: 
>> restore_process_queues_cpsch
>> [  952.424618] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added 
>> hardware queue to MES, doorbell=0x1002, queue: 1, caller: 
>> restore_process_queues_cpsch
>> [  952.434922] amdgpu 0000:c4:00.0: amdgpu: add_queue_mes added 
>> hardware queue to MES, doorbell=0x1000, queue: 0, caller: 
>> restore_process_queues_cpsch
>> [  952.446272] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes removed 
>> hardware queue from MES, doorbell=0x1202, queue: 2, caller: 
>> evict_process_queues_cpsch
>> [  954.460341] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to 
>> msg=REMOVE_QUEUE
>> [  954.460356] amdgpu 0000:c4:00.0: amdgpu: remove_queue_mes failed 
>> to remove hardware queue from MES, doorbell=0x1002, queue: 1, caller: 
>> evict_process_queues_cpsch
>> [  954.460360] amdgpu 0000:c4:00.0: amdgpu: MES might be in 
>> unrecoverable state, issue a GPU reset
>> [  954.460366] amdgpu 0000:c4:00.0: amdgpu: Failed to evict queue 1
>> [  954.460368] amdgpu 0000:c4:00.0: amdgpu: Failed to evict process 
>> queues
>> [  954.460439] amdgpu 0000:c4:00.0: amdgpu: GPU reset begin!
>> [  954.460464] amdgpu 0000:c4:00.0: amdgpu: remove_all_queues_mes: 
>> Failed to remove queue 0 for dev 5257
>> [  954.460515] amdgpu 0000:c4:00.0: amdgpu: Dumping IP State
>> [  954.462637] amdgpu 0000:c4:00.0: amdgpu: Dumping IP State Completed
>> [  955.865591] amdgpu: process_termination_cpsch started
>> [  955.866432] amdgpu: process_termination_cpsch started
>> [  955.866445] amdgpu 0000:c4:00.0: amdgpu: Failed to remove queue 0
>> [  956.503043] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to 
>> msg=REMOVE_QUEUE
>> [  956.503059] [drm:amdgpu_mes_unmap_legacy_queue [amdgpu]] *ERROR* 
>> failed to unmap legacy queue
>> [  958.507491] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to 
>> msg=REMOVE_QUEUE
>> [  958.507507] [drm:amdgpu_mes_unmap_legacy_queue [amdgpu]] *ERROR* 
>> failed to unmap legacy queue
>> [  960.512077] amdgpu 0000:c4:00.0: amdgpu: MES failed to respond to 
>> msg=REMOVE_QUEUE
>> [  960.512093] [drm:amdgpu_mes_unmap_legacy_queue [amdgpu]] *ERROR* 
>> failed to unmap legacy queue
>> [  960.785816] [drm:gfx_v11_0_hw_fini [amdgpu]] *ERROR* failed to 
>> halt cp gfx
>>
>> Signed-off-by: Mika Laitio <lamikr@gmail.com>
>> ---
>>   .../drm/amd/amdkfd/kfd_device_queue_manager.c | 24 ++++++++++++-------
>>   1 file changed, 16 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index c79fe9069e22..96088d480e09 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -1187,9 +1187,12 @@ static int evict_process_queues_cpsch(struct 
>> device_queue_manager *dqm,
>>       struct kfd_process_device *pdd;
>>       int retval = 0;
>>   +    // gfx1103 APU can fail to remove queue on evict/restore cycle
>> +    if (dqm->dev->adev->flags & AMD_IS_APU)
>> +        goto out;
>>       dqm_lock(dqm);
>>       if (qpd->evicted++ > 0) /* already evicted, do nothing */
>> -        goto out;
>> +        goto out_unlock;
>>         pdd = qpd_to_pdd(qpd);
>>   @@ -1198,7 +1201,7 @@ static int evict_process_queues_cpsch(struct 
>> device_queue_manager *dqm,
>>        * Skip queue eviction on process eviction.
>>        */
>>       if (!pdd->drm_priv)
>> -        goto out;
>> +        goto out_unlock;
>>         pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
>>                   pdd->process->pasid);
>> @@ -1219,7 +1222,7 @@ static int evict_process_queues_cpsch(struct 
>> device_queue_manager *dqm,
>>               if (retval) {
>>                   dev_err(dev, "Failed to evict queue %d\n",
>>                       q->properties.queue_id);
>> -                goto out;
>> +                goto out_unlock;
>>               }
>>           }
>>       }
>> @@ -1231,8 +1234,9 @@ static int evict_process_queues_cpsch(struct 
>> device_queue_manager *dqm,
>> KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
>>                             USE_DEFAULT_GRACE_PERIOD);
>>   -out:
>> +out_unlock:
>>       dqm_unlock(dqm);
>> +out:
>>       return retval;
>>   }
>>   @@ -1326,14 +1330,17 @@ static int 
>> restore_process_queues_cpsch(struct device_queue_manager *dqm,
>>       uint64_t eviction_duration;
>>       int retval = 0;
>>   +    // gfx1103 APU can fail to remove queue on evict/restore cycle
>> +    if (dqm->dev->adev->flags & AMD_IS_APU)
>> +        goto out;
>>       pdd = qpd_to_pdd(qpd);
>>         dqm_lock(dqm);
>>       if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do 
>> nothing */
>> -        goto out;
>> +        goto out_unlock;
>>       if (qpd->evicted > 1) { /* ref count still > 0, decrement & 
>> quit */
>>           qpd->evicted--;
>> -        goto out;
>> +        goto out_unlock;
>>       }
>>         /* The debugger creates processes that temporarily have not 
>> acquired
>> @@ -1364,7 +1371,7 @@ static int restore_process_queues_cpsch(struct 
>> device_queue_manager *dqm,
>>               if (retval) {
>>                   dev_err(dev, "Failed to restore queue %d\n",
>>                       q->properties.queue_id);
>> -                goto out;
>> +                goto out_unlock;
>>               }
>>           }
>>       }
>> @@ -1375,8 +1382,9 @@ static int restore_process_queues_cpsch(struct 
>> device_queue_manager *dqm,
>>       atomic64_add(eviction_duration, &pdd->evict_duration_counter);
>>   vm_not_acquired:
>>       qpd->evicted = 0;
>> -out:
>> +out_unlock:
>>       dqm_unlock(dqm);
>> +out:
>>       return retval;
>>   }
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c79fe9069e22..96088d480e09 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1187,9 +1187,12 @@  static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 	struct kfd_process_device *pdd;
 	int retval = 0;
 
+	// gfx1103 APU can fail to remove queue on evict/restore cycle
+	if (dqm->dev->adev->flags & AMD_IS_APU)
+		goto out;
 	dqm_lock(dqm);
 	if (qpd->evicted++ > 0) /* already evicted, do nothing */
-		goto out;
+		goto out_unlock;
 
 	pdd = qpd_to_pdd(qpd);
 
@@ -1198,7 +1201,7 @@  static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 	 * Skip queue eviction on process eviction.
 	 */
 	if (!pdd->drm_priv)
-		goto out;
+		goto out_unlock;
 
 	pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
 			    pdd->process->pasid);
@@ -1219,7 +1222,7 @@  static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 			if (retval) {
 				dev_err(dev, "Failed to evict queue %d\n",
 					q->properties.queue_id);
-				goto out;
+				goto out_unlock;
 			}
 		}
 	}
@@ -1231,8 +1234,9 @@  static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
 					      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0,
 					      USE_DEFAULT_GRACE_PERIOD);
 
-out:
+out_unlock:
 	dqm_unlock(dqm);
+out:
 	return retval;
 }
 
@@ -1326,14 +1330,17 @@  static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 	uint64_t eviction_duration;
 	int retval = 0;
 
+	// gfx1103 APU can fail to remove queue on evict/restore cycle
+	if (dqm->dev->adev->flags & AMD_IS_APU)
+		goto out;
 	pdd = qpd_to_pdd(qpd);
 
 	dqm_lock(dqm);
 	if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
-		goto out;
+		goto out_unlock;
 	if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
 		qpd->evicted--;
-		goto out;
+		goto out_unlock;
 	}
 
 	/* The debugger creates processes that temporarily have not acquired
@@ -1364,7 +1371,7 @@  static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 			if (retval) {
 				dev_err(dev, "Failed to restore queue %d\n",
 					q->properties.queue_id);
-				goto out;
+				goto out_unlock;
 			}
 		}
 	}
@@ -1375,8 +1382,9 @@  static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
 	atomic64_add(eviction_duration, &pdd->evict_duration_counter);
 vm_not_acquired:
 	qpd->evicted = 0;
-out:
+out_unlock:
 	dqm_unlock(dqm);
+out:
 	return retval;
 }