Message ID | 20180427061724.28497-14-daniel.vetter@ffwll.ch (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
NAK, there is a subtitle but major difference: > - if (rdev->needs_reset) { > - t = -EDEADLK; > - break; > - } Without that the whole radeon GPU reset code breaks. Regards, Christian. Am 27.04.2018 um 08:17 schrieb Daniel Vetter: > It's a copy of dma_fence_default_wait, written slightly differently. > > Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch> > Cc: Alex Deucher <alexander.deucher@amd.com> > Cc: "Christian König" <christian.koenig@amd.com> > Cc: "David (ChunMing) Zhou" <David1.Zhou@amd.com> > Cc: amd-gfx@lists.freedesktop.org > --- > drivers/gpu/drm/radeon/radeon_fence.c | 63 --------------------------- > 1 file changed, 63 deletions(-) > > diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c > index e86f2bd38410..32690a525bfc 100644 > --- a/drivers/gpu/drm/radeon/radeon_fence.c > +++ b/drivers/gpu/drm/radeon/radeon_fence.c > @@ -1051,72 +1051,9 @@ static const char *radeon_fence_get_timeline_name(struct dma_fence *f) > } > } > > -static inline bool radeon_test_signaled(struct radeon_fence *fence) > -{ > - return test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->base.flags); > -} > - > -struct radeon_wait_cb { > - struct dma_fence_cb base; > - struct task_struct *task; > -}; > - > -static void > -radeon_fence_wait_cb(struct dma_fence *fence, struct dma_fence_cb *cb) > -{ > - struct radeon_wait_cb *wait = > - container_of(cb, struct radeon_wait_cb, base); > - > - wake_up_process(wait->task); > -} > - > -static signed long radeon_fence_default_wait(struct dma_fence *f, bool intr, > - signed long t) > -{ > - struct radeon_fence *fence = to_radeon_fence(f); > - struct radeon_device *rdev = fence->rdev; > - struct radeon_wait_cb cb; > - > - cb.task = current; > - > - if (dma_fence_add_callback(f, &cb.base, radeon_fence_wait_cb)) > - return t; > - > - while (t > 0) { > - if (intr) > - set_current_state(TASK_INTERRUPTIBLE); > - else > - set_current_state(TASK_UNINTERRUPTIBLE); > - > - /* > - * radeon_test_signaled must be called after > - * set_current_state to prevent a race with wake_up_process > - */ > - if (radeon_test_signaled(fence)) > - break; > - > - if (rdev->needs_reset) { > - t = -EDEADLK; > - break; > - } > - > - t = schedule_timeout(t); > - > - if (t > 0 && intr && signal_pending(current)) > - t = -ERESTARTSYS; > - } > - > - __set_current_state(TASK_RUNNING); > - dma_fence_remove_callback(f, &cb.base); > - > - return t; > -} > - > const struct dma_fence_ops radeon_fence_ops = { > .get_driver_name = radeon_fence_get_driver_name, > .get_timeline_name = radeon_fence_get_timeline_name, > .enable_signaling = radeon_fence_enable_signaling, > .signaled = radeon_fence_is_signaled, > - .wait = radeon_fence_default_wait, > - .release = NULL, > };
On Sun, Apr 29, 2018 at 09:08:31AM +0200, Christian König wrote: > NAK, there is a subtitle but major difference: > > > - if (rdev->needs_reset) { > > - t = -EDEADLK; > > - break; > > - } > > Without that the whole radeon GPU reset code breaks. Oops I've missed that. How does this work when you register a callback using ->enable_signaling and then block on it? Everything just dies? We have lots of users of that for buffer/fence sharing. A really ugly, but probably working fix for this would be a kthread worker that just looks for ->needs_reset and force-completes all fences with dma_fence_set_error(-EIO), which is kinda what's supposed to happen here anyway. -Daniel > > Regards, > Christian. > > > Am 27.04.2018 um 08:17 schrieb Daniel Vetter: > > It's a copy of dma_fence_default_wait, written slightly differently. > > > > Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch> > > Cc: Alex Deucher <alexander.deucher@amd.com> > > Cc: "Christian König" <christian.koenig@amd.com> > > Cc: "David (ChunMing) Zhou" <David1.Zhou@amd.com> > > Cc: amd-gfx@lists.freedesktop.org > > --- > > drivers/gpu/drm/radeon/radeon_fence.c | 63 --------------------------- > > 1 file changed, 63 deletions(-) > > > > diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c > > index e86f2bd38410..32690a525bfc 100644 > > --- a/drivers/gpu/drm/radeon/radeon_fence.c > > +++ b/drivers/gpu/drm/radeon/radeon_fence.c > > @@ -1051,72 +1051,9 @@ static const char *radeon_fence_get_timeline_name(struct dma_fence *f) > > } > > } > > -static inline bool radeon_test_signaled(struct radeon_fence *fence) > > -{ > > - return test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->base.flags); > > -} > > - > > -struct radeon_wait_cb { > > - struct dma_fence_cb base; > > - struct task_struct *task; > > -}; > > - > > -static void > > -radeon_fence_wait_cb(struct dma_fence *fence, struct dma_fence_cb *cb) > > -{ > > - struct radeon_wait_cb *wait = > > - container_of(cb, struct radeon_wait_cb, base); > > - > > - wake_up_process(wait->task); > > -} > > - > > -static signed long radeon_fence_default_wait(struct dma_fence *f, bool intr, > > - signed long t) > > -{ > > - struct radeon_fence *fence = to_radeon_fence(f); > > - struct radeon_device *rdev = fence->rdev; > > - struct radeon_wait_cb cb; > > - > > - cb.task = current; > > - > > - if (dma_fence_add_callback(f, &cb.base, radeon_fence_wait_cb)) > > - return t; > > - > > - while (t > 0) { > > - if (intr) > > - set_current_state(TASK_INTERRUPTIBLE); > > - else > > - set_current_state(TASK_UNINTERRUPTIBLE); > > - > > - /* > > - * radeon_test_signaled must be called after > > - * set_current_state to prevent a race with wake_up_process > > - */ > > - if (radeon_test_signaled(fence)) > > - break; > > - > > - if (rdev->needs_reset) { > > - t = -EDEADLK; > > - break; > > - } > > - > > - t = schedule_timeout(t); > > - > > - if (t > 0 && intr && signal_pending(current)) > > - t = -ERESTARTSYS; > > - } > > - > > - __set_current_state(TASK_RUNNING); > > - dma_fence_remove_callback(f, &cb.base); > > - > > - return t; > > -} > > - > > const struct dma_fence_ops radeon_fence_ops = { > > .get_driver_name = radeon_fence_get_driver_name, > > .get_timeline_name = radeon_fence_get_timeline_name, > > .enable_signaling = radeon_fence_enable_signaling, > > .signaled = radeon_fence_is_signaled, > > - .wait = radeon_fence_default_wait, > > - .release = NULL, > > }; >
Am 30.04.2018 um 17:38 schrieb Daniel Vetter: > On Sun, Apr 29, 2018 at 09:08:31AM +0200, Christian König wrote: >> NAK, there is a subtitle but major difference: >> >>> - if (rdev->needs_reset) { >>> - t = -EDEADLK; >>> - break; >>> - } >> Without that the whole radeon GPU reset code breaks. > Oops I've missed that. How does this work when you register a callback > using ->enable_signaling and then block on it? Everything just dies? The short answer is we simply avoid using enable_signaling() from inside driver IOCTLs. > We have lots of users of that for buffer/fence sharing. A really ugly, but > probably working fix for this would be a kthread worker that just looks > for ->needs_reset and force-completes all fences with > dma_fence_set_error(-EIO), which is kinda what's supposed to happen here > anyway. That actually won't help. Radeon does this dance to return an error from dma_fence_wait() when the GPU needs a reset. This way all IOCTLs should return to userspace with -EAGAIN and when they are restarted we block for the running GPU reset to finish. I was against this approach, but it works as long as radeon only has to deal with it's own fences. Christian. > -Daniel > >> Regards, >> Christian. >> >> >> Am 27.04.2018 um 08:17 schrieb Daniel Vetter: >>> It's a copy of dma_fence_default_wait, written slightly differently. >>> >>> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch> >>> Cc: Alex Deucher <alexander.deucher@amd.com> >>> Cc: "Christian König" <christian.koenig@amd.com> >>> Cc: "David (ChunMing) Zhou" <David1.Zhou@amd.com> >>> Cc: amd-gfx@lists.freedesktop.org >>> --- >>> drivers/gpu/drm/radeon/radeon_fence.c | 63 --------------------------- >>> 1 file changed, 63 deletions(-) >>> >>> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c >>> index e86f2bd38410..32690a525bfc 100644 >>> --- a/drivers/gpu/drm/radeon/radeon_fence.c >>> +++ b/drivers/gpu/drm/radeon/radeon_fence.c >>> @@ -1051,72 +1051,9 @@ static const char *radeon_fence_get_timeline_name(struct dma_fence *f) >>> } >>> } >>> -static inline bool radeon_test_signaled(struct radeon_fence *fence) >>> -{ >>> - return test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->base.flags); >>> -} >>> - >>> -struct radeon_wait_cb { >>> - struct dma_fence_cb base; >>> - struct task_struct *task; >>> -}; >>> - >>> -static void >>> -radeon_fence_wait_cb(struct dma_fence *fence, struct dma_fence_cb *cb) >>> -{ >>> - struct radeon_wait_cb *wait = >>> - container_of(cb, struct radeon_wait_cb, base); >>> - >>> - wake_up_process(wait->task); >>> -} >>> - >>> -static signed long radeon_fence_default_wait(struct dma_fence *f, bool intr, >>> - signed long t) >>> -{ >>> - struct radeon_fence *fence = to_radeon_fence(f); >>> - struct radeon_device *rdev = fence->rdev; >>> - struct radeon_wait_cb cb; >>> - >>> - cb.task = current; >>> - >>> - if (dma_fence_add_callback(f, &cb.base, radeon_fence_wait_cb)) >>> - return t; >>> - >>> - while (t > 0) { >>> - if (intr) >>> - set_current_state(TASK_INTERRUPTIBLE); >>> - else >>> - set_current_state(TASK_UNINTERRUPTIBLE); >>> - >>> - /* >>> - * radeon_test_signaled must be called after >>> - * set_current_state to prevent a race with wake_up_process >>> - */ >>> - if (radeon_test_signaled(fence)) >>> - break; >>> - >>> - if (rdev->needs_reset) { >>> - t = -EDEADLK; >>> - break; >>> - } >>> - >>> - t = schedule_timeout(t); >>> - >>> - if (t > 0 && intr && signal_pending(current)) >>> - t = -ERESTARTSYS; >>> - } >>> - >>> - __set_current_state(TASK_RUNNING); >>> - dma_fence_remove_callback(f, &cb.base); >>> - >>> - return t; >>> -} >>> - >>> const struct dma_fence_ops radeon_fence_ops = { >>> .get_driver_name = radeon_fence_get_driver_name, >>> .get_timeline_name = radeon_fence_get_timeline_name, >>> .enable_signaling = radeon_fence_enable_signaling, >>> .signaled = radeon_fence_is_signaled, >>> - .wait = radeon_fence_default_wait, >>> - .release = NULL, >>> };
On Mon, Apr 30, 2018 at 8:26 PM, Christian König <ckoenig.leichtzumerken@gmail.com> wrote: > Am 30.04.2018 um 17:38 schrieb Daniel Vetter: >> >> On Sun, Apr 29, 2018 at 09:08:31AM +0200, Christian König wrote: >>> >>> NAK, there is a subtitle but major difference: >>> >>>> - if (rdev->needs_reset) { >>>> - t = -EDEADLK; >>>> - break; >>>> - } >>> >>> Without that the whole radeon GPU reset code breaks. >> >> Oops I've missed that. How does this work when you register a callback >> using ->enable_signaling and then block on it? Everything just dies? > > > The short answer is we simply avoid using enable_signaling() from inside > driver IOCTLs. > >> We have lots of users of that for buffer/fence sharing. A really ugly, but >> probably working fix for this would be a kthread worker that just looks >> for ->needs_reset and force-completes all fences with >> dma_fence_set_error(-EIO), which is kinda what's supposed to happen here >> anyway. > > > That actually won't help. Radeon does this dance to return an error from > dma_fence_wait() when the GPU needs a reset. > > This way all IOCTLs should return to userspace with -EAGAIN and when they > are restarted we block for the running GPU reset to finish. > > I was against this approach, but it works as long as radeon only has to deal > with it's own fences. Yeah, as soon as you mix in a 2nd driver it goes boom, since you can easily construct loops (even if they go through ->enable_signaling and maybe just an irq handler to fire off something somewhere else). Currently we're really bad a detecting these loops (aka there's nothing), but I hope that the cross-release lockdep stuff gets enabled soon. Then we could annotate dma_fences and lockdep should complain about a lot of these issues. Alas, problem exists already, but I'm not going to attempt to fix it. Anyway, I'll drop this patch here. -Daniel > > Christian. > > >> -Daniel >> >>> Regards, >>> Christian. >>> >>> >>> Am 27.04.2018 um 08:17 schrieb Daniel Vetter: >>>> >>>> It's a copy of dma_fence_default_wait, written slightly differently. >>>> >>>> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch> >>>> Cc: Alex Deucher <alexander.deucher@amd.com> >>>> Cc: "Christian König" <christian.koenig@amd.com> >>>> Cc: "David (ChunMing) Zhou" <David1.Zhou@amd.com> >>>> Cc: amd-gfx@lists.freedesktop.org >>>> --- >>>> drivers/gpu/drm/radeon/radeon_fence.c | 63 >>>> --------------------------- >>>> 1 file changed, 63 deletions(-) >>>> >>>> diff --git a/drivers/gpu/drm/radeon/radeon_fence.c >>>> b/drivers/gpu/drm/radeon/radeon_fence.c >>>> index e86f2bd38410..32690a525bfc 100644 >>>> --- a/drivers/gpu/drm/radeon/radeon_fence.c >>>> +++ b/drivers/gpu/drm/radeon/radeon_fence.c >>>> @@ -1051,72 +1051,9 @@ static const char >>>> *radeon_fence_get_timeline_name(struct dma_fence *f) >>>> } >>>> } >>>> -static inline bool radeon_test_signaled(struct radeon_fence *fence) >>>> -{ >>>> - return test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, >>>> &fence->base.flags); >>>> -} >>>> - >>>> -struct radeon_wait_cb { >>>> - struct dma_fence_cb base; >>>> - struct task_struct *task; >>>> -}; >>>> - >>>> -static void >>>> -radeon_fence_wait_cb(struct dma_fence *fence, struct dma_fence_cb *cb) >>>> -{ >>>> - struct radeon_wait_cb *wait = >>>> - container_of(cb, struct radeon_wait_cb, base); >>>> - >>>> - wake_up_process(wait->task); >>>> -} >>>> - >>>> -static signed long radeon_fence_default_wait(struct dma_fence *f, bool >>>> intr, >>>> - signed long t) >>>> -{ >>>> - struct radeon_fence *fence = to_radeon_fence(f); >>>> - struct radeon_device *rdev = fence->rdev; >>>> - struct radeon_wait_cb cb; >>>> - >>>> - cb.task = current; >>>> - >>>> - if (dma_fence_add_callback(f, &cb.base, radeon_fence_wait_cb)) >>>> - return t; >>>> - >>>> - while (t > 0) { >>>> - if (intr) >>>> - set_current_state(TASK_INTERRUPTIBLE); >>>> - else >>>> - set_current_state(TASK_UNINTERRUPTIBLE); >>>> - >>>> - /* >>>> - * radeon_test_signaled must be called after >>>> - * set_current_state to prevent a race with >>>> wake_up_process >>>> - */ >>>> - if (radeon_test_signaled(fence)) >>>> - break; >>>> - >>>> - if (rdev->needs_reset) { >>>> - t = -EDEADLK; >>>> - break; >>>> - } >>>> - >>>> - t = schedule_timeout(t); >>>> - >>>> - if (t > 0 && intr && signal_pending(current)) >>>> - t = -ERESTARTSYS; >>>> - } >>>> - >>>> - __set_current_state(TASK_RUNNING); >>>> - dma_fence_remove_callback(f, &cb.base); >>>> - >>>> - return t; >>>> -} >>>> - >>>> const struct dma_fence_ops radeon_fence_ops = { >>>> .get_driver_name = radeon_fence_get_driver_name, >>>> .get_timeline_name = radeon_fence_get_timeline_name, >>>> .enable_signaling = radeon_fence_enable_signaling, >>>> .signaled = radeon_fence_is_signaled, >>>> - .wait = radeon_fence_default_wait, >>>> - .release = NULL, >>>> }; > >
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index e86f2bd38410..32690a525bfc 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -1051,72 +1051,9 @@ static const char *radeon_fence_get_timeline_name(struct dma_fence *f) } } -static inline bool radeon_test_signaled(struct radeon_fence *fence) -{ - return test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->base.flags); -} - -struct radeon_wait_cb { - struct dma_fence_cb base; - struct task_struct *task; -}; - -static void -radeon_fence_wait_cb(struct dma_fence *fence, struct dma_fence_cb *cb) -{ - struct radeon_wait_cb *wait = - container_of(cb, struct radeon_wait_cb, base); - - wake_up_process(wait->task); -} - -static signed long radeon_fence_default_wait(struct dma_fence *f, bool intr, - signed long t) -{ - struct radeon_fence *fence = to_radeon_fence(f); - struct radeon_device *rdev = fence->rdev; - struct radeon_wait_cb cb; - - cb.task = current; - - if (dma_fence_add_callback(f, &cb.base, radeon_fence_wait_cb)) - return t; - - while (t > 0) { - if (intr) - set_current_state(TASK_INTERRUPTIBLE); - else - set_current_state(TASK_UNINTERRUPTIBLE); - - /* - * radeon_test_signaled must be called after - * set_current_state to prevent a race with wake_up_process - */ - if (radeon_test_signaled(fence)) - break; - - if (rdev->needs_reset) { - t = -EDEADLK; - break; - } - - t = schedule_timeout(t); - - if (t > 0 && intr && signal_pending(current)) - t = -ERESTARTSYS; - } - - __set_current_state(TASK_RUNNING); - dma_fence_remove_callback(f, &cb.base); - - return t; -} - const struct dma_fence_ops radeon_fence_ops = { .get_driver_name = radeon_fence_get_driver_name, .get_timeline_name = radeon_fence_get_timeline_name, .enable_signaling = radeon_fence_enable_signaling, .signaled = radeon_fence_is_signaled, - .wait = radeon_fence_default_wait, - .release = NULL, };
It's a copy of dma_fence_default_wait, written slightly differently. Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch> Cc: Alex Deucher <alexander.deucher@amd.com> Cc: "Christian König" <christian.koenig@amd.com> Cc: "David (ChunMing) Zhou" <David1.Zhou@amd.com> Cc: amd-gfx@lists.freedesktop.org --- drivers/gpu/drm/radeon/radeon_fence.c | 63 --------------------------- 1 file changed, 63 deletions(-)