diff mbox series

[v7,12/16] drm/amdgpu: Fix hang on device removal.

Message ID 20210512142648.666476-13-andrey.grodzovsky@amd.com (mailing list archive)
State Not Applicable
Headers show
Series RFC Support hot device unplug in amdgpu | expand

Commit Message

Andrey Grodzovsky May 12, 2021, 2:26 p.m. UTC
If removing while commands in flight you cannot wait to flush the
HW fences on a ring since the device is gone.

Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

Comments

Andrey Grodzovsky May 14, 2021, 2:42 p.m. UTC | #1
Ping

Andrey

On 2021-05-12 10:26 a.m., Andrey Grodzovsky wrote:
> If removing while commands in flight you cannot wait to flush the
> HW fences on a ring since the device is gone.
> 
> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 16 ++++++++++------
>   1 file changed, 10 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 1ffb36bd0b19..fa03702ecbfb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -36,6 +36,7 @@
>   #include <linux/firmware.h>
>   #include <linux/pm_runtime.h>
>   
> +#include <drm/drm_drv.h>
>   #include "amdgpu.h"
>   #include "amdgpu_trace.h"
>   
> @@ -525,8 +526,7 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev)
>    */
>   void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev)
>   {
> -	unsigned i, j;
> -	int r;
> +	int i, r;
>   
>   	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
>   		struct amdgpu_ring *ring = adev->rings[i];
> @@ -535,11 +535,15 @@ void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev)
>   			continue;
>   		if (!ring->no_scheduler)
>   			drm_sched_fini(&ring->sched);
> -		r = amdgpu_fence_wait_empty(ring);
> -		if (r) {
> -			/* no need to trigger GPU reset as we are unloading */
> +		/* You can't wait for HW to signal if it's gone */
> +		if (!drm_dev_is_unplugged(&adev->ddev))
> +			r = amdgpu_fence_wait_empty(ring);
> +		else
> +			r = -ENODEV;
> +		/* no need to trigger GPU reset as we are unloading */
> +		if (r)
>   			amdgpu_fence_driver_force_completion(ring);
> -		}
> +
>   		if (ring->fence_drv.irq_src)
>   			amdgpu_irq_put(adev, ring->fence_drv.irq_src,
>   				       ring->fence_drv.irq_type);
>
Andrey Grodzovsky May 17, 2021, 2:40 p.m. UTC | #2
Ping

Andrey

On 2021-05-14 10:42 a.m., Andrey Grodzovsky wrote:
> Ping
> 
> Andrey
> 
> On 2021-05-12 10:26 a.m., Andrey Grodzovsky wrote:
>> If removing while commands in flight you cannot wait to flush the
>> HW fences on a ring since the device is gone.
>>
>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 16 ++++++++++------
>>   1 file changed, 10 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> index 1ffb36bd0b19..fa03702ecbfb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>> @@ -36,6 +36,7 @@
>>   #include <linux/firmware.h>
>>   #include <linux/pm_runtime.h>
>> +#include <drm/drm_drv.h>
>>   #include "amdgpu.h"
>>   #include "amdgpu_trace.h"
>> @@ -525,8 +526,7 @@ int amdgpu_fence_driver_init(struct amdgpu_device 
>> *adev)
>>    */
>>   void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev)
>>   {
>> -    unsigned i, j;
>> -    int r;
>> +    int i, r;
>>       for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
>>           struct amdgpu_ring *ring = adev->rings[i];
>> @@ -535,11 +535,15 @@ void amdgpu_fence_driver_fini_hw(struct 
>> amdgpu_device *adev)
>>               continue;
>>           if (!ring->no_scheduler)
>>               drm_sched_fini(&ring->sched);
>> -        r = amdgpu_fence_wait_empty(ring);
>> -        if (r) {
>> -            /* no need to trigger GPU reset as we are unloading */
>> +        /* You can't wait for HW to signal if it's gone */
>> +        if (!drm_dev_is_unplugged(&adev->ddev))
>> +            r = amdgpu_fence_wait_empty(ring);
>> +        else
>> +            r = -ENODEV;
>> +        /* no need to trigger GPU reset as we are unloading */
>> +        if (r)
>>               amdgpu_fence_driver_force_completion(ring);
>> -        }
>> +
>>           if (ring->fence_drv.irq_src)
>>               amdgpu_irq_put(adev, ring->fence_drv.irq_src,
>>                          ring->fence_drv.irq_type);
>>
Alex Deucher May 17, 2021, 5:39 p.m. UTC | #3
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>

On Mon, May 17, 2021 at 10:40 AM Andrey Grodzovsky
<andrey.grodzovsky@amd.com> wrote:
>
> Ping
>
> Andrey
>
> On 2021-05-14 10:42 a.m., Andrey Grodzovsky wrote:
> > Ping
> >
> > Andrey
> >
> > On 2021-05-12 10:26 a.m., Andrey Grodzovsky wrote:
> >> If removing while commands in flight you cannot wait to flush the
> >> HW fences on a ring since the device is gone.
> >>
> >> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
> >> ---
> >>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 16 ++++++++++------
> >>   1 file changed, 10 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> >> index 1ffb36bd0b19..fa03702ecbfb 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> >> @@ -36,6 +36,7 @@
> >>   #include <linux/firmware.h>
> >>   #include <linux/pm_runtime.h>
> >> +#include <drm/drm_drv.h>
> >>   #include "amdgpu.h"
> >>   #include "amdgpu_trace.h"
> >> @@ -525,8 +526,7 @@ int amdgpu_fence_driver_init(struct amdgpu_device
> >> *adev)
> >>    */
> >>   void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev)
> >>   {
> >> -    unsigned i, j;
> >> -    int r;
> >> +    int i, r;
> >>       for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
> >>           struct amdgpu_ring *ring = adev->rings[i];
> >> @@ -535,11 +535,15 @@ void amdgpu_fence_driver_fini_hw(struct
> >> amdgpu_device *adev)
> >>               continue;
> >>           if (!ring->no_scheduler)
> >>               drm_sched_fini(&ring->sched);
> >> -        r = amdgpu_fence_wait_empty(ring);
> >> -        if (r) {
> >> -            /* no need to trigger GPU reset as we are unloading */
> >> +        /* You can't wait for HW to signal if it's gone */
> >> +        if (!drm_dev_is_unplugged(&adev->ddev))
> >> +            r = amdgpu_fence_wait_empty(ring);
> >> +        else
> >> +            r = -ENODEV;
> >> +        /* no need to trigger GPU reset as we are unloading */
> >> +        if (r)
> >>               amdgpu_fence_driver_force_completion(ring);
> >> -        }
> >> +
> >>           if (ring->fence_drv.irq_src)
> >>               amdgpu_irq_put(adev, ring->fence_drv.irq_src,
> >>                          ring->fence_drv.irq_type);
> >>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Christian König May 17, 2021, 7:39 p.m. UTC | #4
You need to note who you are pinging here.

I'm still assuming you wait for feedback from Daniel. Or should I take a 
look?

Christian.

Am 17.05.21 um 16:40 schrieb Andrey Grodzovsky:
> Ping
>
> Andrey
>
> On 2021-05-14 10:42 a.m., Andrey Grodzovsky wrote:
>> Ping
>>
>> Andrey
>>
>> On 2021-05-12 10:26 a.m., Andrey Grodzovsky wrote:
>>> If removing while commands in flight you cannot wait to flush the
>>> HW fences on a ring since the device is gone.
>>>
>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 16 ++++++++++------
>>>   1 file changed, 10 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> index 1ffb36bd0b19..fa03702ecbfb 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> @@ -36,6 +36,7 @@
>>>   #include <linux/firmware.h>
>>>   #include <linux/pm_runtime.h>
>>> +#include <drm/drm_drv.h>
>>>   #include "amdgpu.h"
>>>   #include "amdgpu_trace.h"
>>> @@ -525,8 +526,7 @@ int amdgpu_fence_driver_init(struct 
>>> amdgpu_device *adev)
>>>    */
>>>   void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev)
>>>   {
>>> -    unsigned i, j;
>>> -    int r;
>>> +    int i, r;
>>>       for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
>>>           struct amdgpu_ring *ring = adev->rings[i];
>>> @@ -535,11 +535,15 @@ void amdgpu_fence_driver_fini_hw(struct 
>>> amdgpu_device *adev)
>>>               continue;
>>>           if (!ring->no_scheduler)
>>>               drm_sched_fini(&ring->sched);
>>> -        r = amdgpu_fence_wait_empty(ring);
>>> -        if (r) {
>>> -            /* no need to trigger GPU reset as we are unloading */
>>> +        /* You can't wait for HW to signal if it's gone */
>>> +        if (!drm_dev_is_unplugged(&adev->ddev))
>>> +            r = amdgpu_fence_wait_empty(ring);
>>> +        else
>>> +            r = -ENODEV;
>>> +        /* no need to trigger GPU reset as we are unloading */
>>> +        if (r)
>>>               amdgpu_fence_driver_force_completion(ring);
>>> -        }
>>> +
>>>           if (ring->fence_drv.irq_src)
>>>               amdgpu_irq_put(adev, ring->fence_drv.irq_src,
>>>                          ring->fence_drv.irq_type);
>>>
Andrey Grodzovsky May 17, 2021, 7:46 p.m. UTC | #5
Yep, you can take a look.

Andrey

On 2021-05-17 3:39 p.m., Christian König wrote:
> You need to note who you are pinging here.
> 
> I'm still assuming you wait for feedback from Daniel. Or should I take a 
> look?
> 
> Christian.
> 
> Am 17.05.21 um 16:40 schrieb Andrey Grodzovsky:
>> Ping
>>
>> Andrey
>>
>> On 2021-05-14 10:42 a.m., Andrey Grodzovsky wrote:
>>> Ping
>>>
>>> Andrey
>>>
>>> On 2021-05-12 10:26 a.m., Andrey Grodzovsky wrote:
>>>> If removing while commands in flight you cannot wait to flush the
>>>> HW fences on a ring since the device is gone.
>>>>
>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 16 ++++++++++------
>>>>   1 file changed, 10 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> index 1ffb36bd0b19..fa03702ecbfb 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>> @@ -36,6 +36,7 @@
>>>>   #include <linux/firmware.h>
>>>>   #include <linux/pm_runtime.h>
>>>> +#include <drm/drm_drv.h>
>>>>   #include "amdgpu.h"
>>>>   #include "amdgpu_trace.h"
>>>> @@ -525,8 +526,7 @@ int amdgpu_fence_driver_init(struct 
>>>> amdgpu_device *adev)
>>>>    */
>>>>   void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev)
>>>>   {
>>>> -    unsigned i, j;
>>>> -    int r;
>>>> +    int i, r;
>>>>       for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
>>>>           struct amdgpu_ring *ring = adev->rings[i];
>>>> @@ -535,11 +535,15 @@ void amdgpu_fence_driver_fini_hw(struct 
>>>> amdgpu_device *adev)
>>>>               continue;
>>>>           if (!ring->no_scheduler)
>>>>               drm_sched_fini(&ring->sched);
>>>> -        r = amdgpu_fence_wait_empty(ring);
>>>> -        if (r) {
>>>> -            /* no need to trigger GPU reset as we are unloading */
>>>> +        /* You can't wait for HW to signal if it's gone */
>>>> +        if (!drm_dev_is_unplugged(&adev->ddev))
>>>> +            r = amdgpu_fence_wait_empty(ring);
>>>> +        else
>>>> +            r = -ENODEV;
>>>> +        /* no need to trigger GPU reset as we are unloading */
>>>> +        if (r)
>>>>               amdgpu_fence_driver_force_completion(ring);
>>>> -        }
>>>> +
>>>>           if (ring->fence_drv.irq_src)
>>>>               amdgpu_irq_put(adev, ring->fence_drv.irq_src,
>>>>                          ring->fence_drv.irq_type);
>>>>
>
Christian König May 17, 2021, 7:54 p.m. UTC | #6
Ok, then putting that on my TODO list for tomorrow.

I've already found a problem with how we finish of fences, going to 
write more on this tomorrow.

Christian.

Am 17.05.21 um 21:46 schrieb Andrey Grodzovsky:
> Yep, you can take a look.
>
> Andrey
>
> On 2021-05-17 3:39 p.m., Christian König wrote:
>> You need to note who you are pinging here.
>>
>> I'm still assuming you wait for feedback from Daniel. Or should I 
>> take a look?
>>
>> Christian.
>>
>> Am 17.05.21 um 16:40 schrieb Andrey Grodzovsky:
>>> Ping
>>>
>>> Andrey
>>>
>>> On 2021-05-14 10:42 a.m., Andrey Grodzovsky wrote:
>>>> Ping
>>>>
>>>> Andrey
>>>>
>>>> On 2021-05-12 10:26 a.m., Andrey Grodzovsky wrote:
>>>>> If removing while commands in flight you cannot wait to flush the
>>>>> HW fences on a ring since the device is gone.
>>>>>
>>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 16 ++++++++++------
>>>>>   1 file changed, 10 insertions(+), 6 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>> index 1ffb36bd0b19..fa03702ecbfb 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>>>> @@ -36,6 +36,7 @@
>>>>>   #include <linux/firmware.h>
>>>>>   #include <linux/pm_runtime.h>
>>>>> +#include <drm/drm_drv.h>
>>>>>   #include "amdgpu.h"
>>>>>   #include "amdgpu_trace.h"
>>>>> @@ -525,8 +526,7 @@ int amdgpu_fence_driver_init(struct 
>>>>> amdgpu_device *adev)
>>>>>    */
>>>>>   void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev)
>>>>>   {
>>>>> -    unsigned i, j;
>>>>> -    int r;
>>>>> +    int i, r;
>>>>>       for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
>>>>>           struct amdgpu_ring *ring = adev->rings[i];
>>>>> @@ -535,11 +535,15 @@ void amdgpu_fence_driver_fini_hw(struct 
>>>>> amdgpu_device *adev)
>>>>>               continue;
>>>>>           if (!ring->no_scheduler)
>>>>>               drm_sched_fini(&ring->sched);
>>>>> -        r = amdgpu_fence_wait_empty(ring);
>>>>> -        if (r) {
>>>>> -            /* no need to trigger GPU reset as we are unloading */
>>>>> +        /* You can't wait for HW to signal if it's gone */
>>>>> +        if (!drm_dev_is_unplugged(&adev->ddev))
>>>>> +            r = amdgpu_fence_wait_empty(ring);
>>>>> +        else
>>>>> +            r = -ENODEV;
>>>>> +        /* no need to trigger GPU reset as we are unloading */
>>>>> +        if (r)
>>>>>               amdgpu_fence_driver_force_completion(ring);
>>>>> -        }
>>>>> +
>>>>>           if (ring->fence_drv.irq_src)
>>>>>               amdgpu_irq_put(adev, ring->fence_drv.irq_src,
>>>>>                          ring->fence_drv.irq_type);
>>>>>
>>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 1ffb36bd0b19..fa03702ecbfb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -36,6 +36,7 @@ 
 #include <linux/firmware.h>
 #include <linux/pm_runtime.h>
 
+#include <drm/drm_drv.h>
 #include "amdgpu.h"
 #include "amdgpu_trace.h"
 
@@ -525,8 +526,7 @@  int amdgpu_fence_driver_init(struct amdgpu_device *adev)
  */
 void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev)
 {
-	unsigned i, j;
-	int r;
+	int i, r;
 
 	for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
 		struct amdgpu_ring *ring = adev->rings[i];
@@ -535,11 +535,15 @@  void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev)
 			continue;
 		if (!ring->no_scheduler)
 			drm_sched_fini(&ring->sched);
-		r = amdgpu_fence_wait_empty(ring);
-		if (r) {
-			/* no need to trigger GPU reset as we are unloading */
+		/* You can't wait for HW to signal if it's gone */
+		if (!drm_dev_is_unplugged(&adev->ddev))
+			r = amdgpu_fence_wait_empty(ring);
+		else
+			r = -ENODEV;
+		/* no need to trigger GPU reset as we are unloading */
+		if (r)
 			amdgpu_fence_driver_force_completion(ring);
-		}
+
 		if (ring->fence_drv.irq_src)
 			amdgpu_irq_put(adev, ring->fence_drv.irq_src,
 				       ring->fence_drv.irq_type);