Message ID | 20210512142648.666476-13-andrey.grodzovsky@amd.com (mailing list archive) |
---|---|
State | Not Applicable |
Headers | show |
Series | RFC Support hot device unplug in amdgpu | expand |
Ping Andrey On 2021-05-12 10:26 a.m., Andrey Grodzovsky wrote: > If removing while commands in flight you cannot wait to flush the > HW fences on a ring since the device is gone. > > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 16 ++++++++++------ > 1 file changed, 10 insertions(+), 6 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > index 1ffb36bd0b19..fa03702ecbfb 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > @@ -36,6 +36,7 @@ > #include <linux/firmware.h> > #include <linux/pm_runtime.h> > > +#include <drm/drm_drv.h> > #include "amdgpu.h" > #include "amdgpu_trace.h" > > @@ -525,8 +526,7 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev) > */ > void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev) > { > - unsigned i, j; > - int r; > + int i, r; > > for (i = 0; i < AMDGPU_MAX_RINGS; i++) { > struct amdgpu_ring *ring = adev->rings[i]; > @@ -535,11 +535,15 @@ void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev) > continue; > if (!ring->no_scheduler) > drm_sched_fini(&ring->sched); > - r = amdgpu_fence_wait_empty(ring); > - if (r) { > - /* no need to trigger GPU reset as we are unloading */ > + /* You can't wait for HW to signal if it's gone */ > + if (!drm_dev_is_unplugged(&adev->ddev)) > + r = amdgpu_fence_wait_empty(ring); > + else > + r = -ENODEV; > + /* no need to trigger GPU reset as we are unloading */ > + if (r) > amdgpu_fence_driver_force_completion(ring); > - } > + > if (ring->fence_drv.irq_src) > amdgpu_irq_put(adev, ring->fence_drv.irq_src, > ring->fence_drv.irq_type); >
Ping Andrey On 2021-05-14 10:42 a.m., Andrey Grodzovsky wrote: > Ping > > Andrey > > On 2021-05-12 10:26 a.m., Andrey Grodzovsky wrote: >> If removing while commands in flight you cannot wait to flush the >> HW fences on a ring since the device is gone. >> >> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 16 ++++++++++------ >> 1 file changed, 10 insertions(+), 6 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >> index 1ffb36bd0b19..fa03702ecbfb 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >> @@ -36,6 +36,7 @@ >> #include <linux/firmware.h> >> #include <linux/pm_runtime.h> >> +#include <drm/drm_drv.h> >> #include "amdgpu.h" >> #include "amdgpu_trace.h" >> @@ -525,8 +526,7 @@ int amdgpu_fence_driver_init(struct amdgpu_device >> *adev) >> */ >> void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev) >> { >> - unsigned i, j; >> - int r; >> + int i, r; >> for (i = 0; i < AMDGPU_MAX_RINGS; i++) { >> struct amdgpu_ring *ring = adev->rings[i]; >> @@ -535,11 +535,15 @@ void amdgpu_fence_driver_fini_hw(struct >> amdgpu_device *adev) >> continue; >> if (!ring->no_scheduler) >> drm_sched_fini(&ring->sched); >> - r = amdgpu_fence_wait_empty(ring); >> - if (r) { >> - /* no need to trigger GPU reset as we are unloading */ >> + /* You can't wait for HW to signal if it's gone */ >> + if (!drm_dev_is_unplugged(&adev->ddev)) >> + r = amdgpu_fence_wait_empty(ring); >> + else >> + r = -ENODEV; >> + /* no need to trigger GPU reset as we are unloading */ >> + if (r) >> amdgpu_fence_driver_force_completion(ring); >> - } >> + >> if (ring->fence_drv.irq_src) >> amdgpu_irq_put(adev, ring->fence_drv.irq_src, >> ring->fence_drv.irq_type); >>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com> On Mon, May 17, 2021 at 10:40 AM Andrey Grodzovsky <andrey.grodzovsky@amd.com> wrote: > > Ping > > Andrey > > On 2021-05-14 10:42 a.m., Andrey Grodzovsky wrote: > > Ping > > > > Andrey > > > > On 2021-05-12 10:26 a.m., Andrey Grodzovsky wrote: > >> If removing while commands in flight you cannot wait to flush the > >> HW fences on a ring since the device is gone. > >> > >> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> > >> --- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 16 ++++++++++------ > >> 1 file changed, 10 insertions(+), 6 deletions(-) > >> > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > >> index 1ffb36bd0b19..fa03702ecbfb 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > >> @@ -36,6 +36,7 @@ > >> #include <linux/firmware.h> > >> #include <linux/pm_runtime.h> > >> +#include <drm/drm_drv.h> > >> #include "amdgpu.h" > >> #include "amdgpu_trace.h" > >> @@ -525,8 +526,7 @@ int amdgpu_fence_driver_init(struct amdgpu_device > >> *adev) > >> */ > >> void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev) > >> { > >> - unsigned i, j; > >> - int r; > >> + int i, r; > >> for (i = 0; i < AMDGPU_MAX_RINGS; i++) { > >> struct amdgpu_ring *ring = adev->rings[i]; > >> @@ -535,11 +535,15 @@ void amdgpu_fence_driver_fini_hw(struct > >> amdgpu_device *adev) > >> continue; > >> if (!ring->no_scheduler) > >> drm_sched_fini(&ring->sched); > >> - r = amdgpu_fence_wait_empty(ring); > >> - if (r) { > >> - /* no need to trigger GPU reset as we are unloading */ > >> + /* You can't wait for HW to signal if it's gone */ > >> + if (!drm_dev_is_unplugged(&adev->ddev)) > >> + r = amdgpu_fence_wait_empty(ring); > >> + else > >> + r = -ENODEV; > >> + /* no need to trigger GPU reset as we are unloading */ > >> + if (r) > >> amdgpu_fence_driver_force_completion(ring); > >> - } > >> + > >> if (ring->fence_drv.irq_src) > >> amdgpu_irq_put(adev, ring->fence_drv.irq_src, > >> ring->fence_drv.irq_type); > >> > _______________________________________________ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
You need to note who you are pinging here. I'm still assuming you wait for feedback from Daniel. Or should I take a look? Christian. Am 17.05.21 um 16:40 schrieb Andrey Grodzovsky: > Ping > > Andrey > > On 2021-05-14 10:42 a.m., Andrey Grodzovsky wrote: >> Ping >> >> Andrey >> >> On 2021-05-12 10:26 a.m., Andrey Grodzovsky wrote: >>> If removing while commands in flight you cannot wait to flush the >>> HW fences on a ring since the device is gone. >>> >>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >>> --- >>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 16 ++++++++++------ >>> 1 file changed, 10 insertions(+), 6 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >>> index 1ffb36bd0b19..fa03702ecbfb 100644 >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >>> @@ -36,6 +36,7 @@ >>> #include <linux/firmware.h> >>> #include <linux/pm_runtime.h> >>> +#include <drm/drm_drv.h> >>> #include "amdgpu.h" >>> #include "amdgpu_trace.h" >>> @@ -525,8 +526,7 @@ int amdgpu_fence_driver_init(struct >>> amdgpu_device *adev) >>> */ >>> void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev) >>> { >>> - unsigned i, j; >>> - int r; >>> + int i, r; >>> for (i = 0; i < AMDGPU_MAX_RINGS; i++) { >>> struct amdgpu_ring *ring = adev->rings[i]; >>> @@ -535,11 +535,15 @@ void amdgpu_fence_driver_fini_hw(struct >>> amdgpu_device *adev) >>> continue; >>> if (!ring->no_scheduler) >>> drm_sched_fini(&ring->sched); >>> - r = amdgpu_fence_wait_empty(ring); >>> - if (r) { >>> - /* no need to trigger GPU reset as we are unloading */ >>> + /* You can't wait for HW to signal if it's gone */ >>> + if (!drm_dev_is_unplugged(&adev->ddev)) >>> + r = amdgpu_fence_wait_empty(ring); >>> + else >>> + r = -ENODEV; >>> + /* no need to trigger GPU reset as we are unloading */ >>> + if (r) >>> amdgpu_fence_driver_force_completion(ring); >>> - } >>> + >>> if (ring->fence_drv.irq_src) >>> amdgpu_irq_put(adev, ring->fence_drv.irq_src, >>> ring->fence_drv.irq_type); >>>
Yep, you can take a look. Andrey On 2021-05-17 3:39 p.m., Christian König wrote: > You need to note who you are pinging here. > > I'm still assuming you wait for feedback from Daniel. Or should I take a > look? > > Christian. > > Am 17.05.21 um 16:40 schrieb Andrey Grodzovsky: >> Ping >> >> Andrey >> >> On 2021-05-14 10:42 a.m., Andrey Grodzovsky wrote: >>> Ping >>> >>> Andrey >>> >>> On 2021-05-12 10:26 a.m., Andrey Grodzovsky wrote: >>>> If removing while commands in flight you cannot wait to flush the >>>> HW fences on a ring since the device is gone. >>>> >>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >>>> --- >>>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 16 ++++++++++------ >>>> 1 file changed, 10 insertions(+), 6 deletions(-) >>>> >>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >>>> index 1ffb36bd0b19..fa03702ecbfb 100644 >>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >>>> @@ -36,6 +36,7 @@ >>>> #include <linux/firmware.h> >>>> #include <linux/pm_runtime.h> >>>> +#include <drm/drm_drv.h> >>>> #include "amdgpu.h" >>>> #include "amdgpu_trace.h" >>>> @@ -525,8 +526,7 @@ int amdgpu_fence_driver_init(struct >>>> amdgpu_device *adev) >>>> */ >>>> void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev) >>>> { >>>> - unsigned i, j; >>>> - int r; >>>> + int i, r; >>>> for (i = 0; i < AMDGPU_MAX_RINGS; i++) { >>>> struct amdgpu_ring *ring = adev->rings[i]; >>>> @@ -535,11 +535,15 @@ void amdgpu_fence_driver_fini_hw(struct >>>> amdgpu_device *adev) >>>> continue; >>>> if (!ring->no_scheduler) >>>> drm_sched_fini(&ring->sched); >>>> - r = amdgpu_fence_wait_empty(ring); >>>> - if (r) { >>>> - /* no need to trigger GPU reset as we are unloading */ >>>> + /* You can't wait for HW to signal if it's gone */ >>>> + if (!drm_dev_is_unplugged(&adev->ddev)) >>>> + r = amdgpu_fence_wait_empty(ring); >>>> + else >>>> + r = -ENODEV; >>>> + /* no need to trigger GPU reset as we are unloading */ >>>> + if (r) >>>> amdgpu_fence_driver_force_completion(ring); >>>> - } >>>> + >>>> if (ring->fence_drv.irq_src) >>>> amdgpu_irq_put(adev, ring->fence_drv.irq_src, >>>> ring->fence_drv.irq_type); >>>> >
Ok, then putting that on my TODO list for tomorrow. I've already found a problem with how we finish of fences, going to write more on this tomorrow. Christian. Am 17.05.21 um 21:46 schrieb Andrey Grodzovsky: > Yep, you can take a look. > > Andrey > > On 2021-05-17 3:39 p.m., Christian König wrote: >> You need to note who you are pinging here. >> >> I'm still assuming you wait for feedback from Daniel. Or should I >> take a look? >> >> Christian. >> >> Am 17.05.21 um 16:40 schrieb Andrey Grodzovsky: >>> Ping >>> >>> Andrey >>> >>> On 2021-05-14 10:42 a.m., Andrey Grodzovsky wrote: >>>> Ping >>>> >>>> Andrey >>>> >>>> On 2021-05-12 10:26 a.m., Andrey Grodzovsky wrote: >>>>> If removing while commands in flight you cannot wait to flush the >>>>> HW fences on a ring since the device is gone. >>>>> >>>>> Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> >>>>> --- >>>>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 16 ++++++++++------ >>>>> 1 file changed, 10 insertions(+), 6 deletions(-) >>>>> >>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >>>>> index 1ffb36bd0b19..fa03702ecbfb 100644 >>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c >>>>> @@ -36,6 +36,7 @@ >>>>> #include <linux/firmware.h> >>>>> #include <linux/pm_runtime.h> >>>>> +#include <drm/drm_drv.h> >>>>> #include "amdgpu.h" >>>>> #include "amdgpu_trace.h" >>>>> @@ -525,8 +526,7 @@ int amdgpu_fence_driver_init(struct >>>>> amdgpu_device *adev) >>>>> */ >>>>> void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev) >>>>> { >>>>> - unsigned i, j; >>>>> - int r; >>>>> + int i, r; >>>>> for (i = 0; i < AMDGPU_MAX_RINGS; i++) { >>>>> struct amdgpu_ring *ring = adev->rings[i]; >>>>> @@ -535,11 +535,15 @@ void amdgpu_fence_driver_fini_hw(struct >>>>> amdgpu_device *adev) >>>>> continue; >>>>> if (!ring->no_scheduler) >>>>> drm_sched_fini(&ring->sched); >>>>> - r = amdgpu_fence_wait_empty(ring); >>>>> - if (r) { >>>>> - /* no need to trigger GPU reset as we are unloading */ >>>>> + /* You can't wait for HW to signal if it's gone */ >>>>> + if (!drm_dev_is_unplugged(&adev->ddev)) >>>>> + r = amdgpu_fence_wait_empty(ring); >>>>> + else >>>>> + r = -ENODEV; >>>>> + /* no need to trigger GPU reset as we are unloading */ >>>>> + if (r) >>>>> amdgpu_fence_driver_force_completion(ring); >>>>> - } >>>>> + >>>>> if (ring->fence_drv.irq_src) >>>>> amdgpu_irq_put(adev, ring->fence_drv.irq_src, >>>>> ring->fence_drv.irq_type); >>>>> >>
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 1ffb36bd0b19..fa03702ecbfb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -36,6 +36,7 @@ #include <linux/firmware.h> #include <linux/pm_runtime.h> +#include <drm/drm_drv.h> #include "amdgpu.h" #include "amdgpu_trace.h" @@ -525,8 +526,7 @@ int amdgpu_fence_driver_init(struct amdgpu_device *adev) */ void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev) { - unsigned i, j; - int r; + int i, r; for (i = 0; i < AMDGPU_MAX_RINGS; i++) { struct amdgpu_ring *ring = adev->rings[i]; @@ -535,11 +535,15 @@ void amdgpu_fence_driver_fini_hw(struct amdgpu_device *adev) continue; if (!ring->no_scheduler) drm_sched_fini(&ring->sched); - r = amdgpu_fence_wait_empty(ring); - if (r) { - /* no need to trigger GPU reset as we are unloading */ + /* You can't wait for HW to signal if it's gone */ + if (!drm_dev_is_unplugged(&adev->ddev)) + r = amdgpu_fence_wait_empty(ring); + else + r = -ENODEV; + /* no need to trigger GPU reset as we are unloading */ + if (r) amdgpu_fence_driver_force_completion(ring); - } + if (ring->fence_drv.irq_src) amdgpu_irq_put(adev, ring->fence_drv.irq_src, ring->fence_drv.irq_type);
If removing while commands in flight you cannot wait to flush the HW fences on a ring since the device is gone. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-)