Message ID | 1528124636-17283-1-git-send-email-andrey.grodzovsky@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Am 04.06.2018 um 17:03 schrieb Andrey Grodzovsky: > Dying process might be blocked from receiving any more signals > so avoid using it. > > Also retire enity->fini_status and just check the SW queue, > if it's not empty do the fallback cleanup. > > Also handle entity->last_scheduled == NULL use case which > happens when HW ring is already hangged whem a new entity > tried to enqeue jobs. > > v2: > Return the remaining timeout and use that as parameter for the next call. > This way when we need to cleanup multiple queues we don't wait for the > entire TO period for each queue but rather in total. > Styling comments. > Rebase. > > v3: > Update types from unsigned to long. > Work with jiffies instead of ms. > Return 0 when TO expires. > Rebase. > > Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> > --- > drivers/gpu/drm/scheduler/gpu_scheduler.c | 70 +++++++++++++++++++++++-------- > include/drm/gpu_scheduler.h | 7 ++-- > 2 files changed, 56 insertions(+), 21 deletions(-) > > diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c > index 8c1e80c..1e65221 100644 > --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c > +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c > @@ -181,7 +181,6 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched, > entity->rq = rq; > entity->sched = sched; > entity->guilty = guilty; > - entity->fini_status = 0; > entity->last_scheduled = NULL; > > spin_lock_init(&entity->rq_lock); > @@ -219,7 +218,8 @@ static bool drm_sched_entity_is_initialized(struct drm_gpu_scheduler *sched, > static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity) > { > rmb(); > - if (spsc_queue_peek(&entity->job_queue) == NULL) > + > + if (!entity->rq || spsc_queue_peek(&entity->job_queue) == NULL) > return true; > > return false; > @@ -260,25 +260,42 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f, > * > * @sched: scheduler instance > * @entity: scheduler entity > + * @timeout: time to wait in for Q to become empty in jiffies. > * > * Splitting drm_sched_entity_fini() into two functions, The first one does the waiting, > * removes the entity from the runqueue and returns an error when the process was killed. > + * > + * Returns the remaining time in jiffies left from the input timeout > */ > -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, > - struct drm_sched_entity *entity) > +long drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, > + struct drm_sched_entity *entity, long timeout) > { > + long ret = timeout; > + > if (!drm_sched_entity_is_initialized(sched, entity)) > - return; > + return ret; > /** > * The client will not queue more IBs during this fini, consume existing > * queued IBs or discard them on SIGKILL > */ > - if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL) > - entity->fini_status = -ERESTARTSYS; > - else > - entity->fini_status = wait_event_killable(sched->job_scheduled, > - drm_sched_entity_is_idle(entity)); > - drm_sched_entity_set_rq(entity, NULL); > + if (current->flags & PF_EXITING) { > + if (timeout) { > + ret = wait_event_timeout( > + sched->job_scheduled, > + drm_sched_entity_is_idle(entity), > + timeout); > + > + ret = ret ? timeout - ret : ret; Ok we still seem to have a misunderstanding here what wait_event_timeout() returns. As far as I know that line shouldn't be necessary and is actually quite harmful. Apart from that this patch looks fine to me now, Christian. > + } > + } else > + wait_event_killable(sched->job_scheduled, drm_sched_entity_is_idle(entity)); > + > + > + /* For killed process disable any more IBs enqueue right now */ > + if ((current->flags & PF_EXITING) && (current->exit_code == SIGKILL)) > + drm_sched_entity_set_rq(entity, NULL); > + > + return ret; > } > EXPORT_SYMBOL(drm_sched_entity_do_release); > > @@ -290,11 +307,18 @@ EXPORT_SYMBOL(drm_sched_entity_do_release); > * > * This should be called after @drm_sched_entity_do_release. It goes over the > * entity and signals all jobs with an error code if the process was killed. > + * > */ > void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, > struct drm_sched_entity *entity) > { > - if (entity->fini_status) { > + > + drm_sched_entity_set_rq(entity, NULL); > + > + /* Consumption of existing IBs wasn't completed. Forcefully > + * remove them here. > + */ > + if (spsc_queue_peek(&entity->job_queue)) { > struct drm_sched_job *job; > int r; > > @@ -314,12 +338,22 @@ void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, > struct drm_sched_fence *s_fence = job->s_fence; > drm_sched_fence_scheduled(s_fence); > dma_fence_set_error(&s_fence->finished, -ESRCH); > - r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb, > - drm_sched_entity_kill_jobs_cb); > - if (r == -ENOENT) > + > + /* > + * When pipe is hanged by older entity, new entity might > + * not even have chance to submit it's first job to HW > + * and so entity->last_scheduled will remain NULL > + */ > + if (!entity->last_scheduled) { > drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb); > - else if (r) > - DRM_ERROR("fence add callback failed (%d)\n", r); > + } else { > + r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb, > + drm_sched_entity_kill_jobs_cb); > + if (r == -ENOENT) > + drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb); > + else if (r) > + DRM_ERROR("fence add callback failed (%d)\n", r); > + } > } > } > > @@ -339,7 +373,7 @@ EXPORT_SYMBOL(drm_sched_entity_cleanup); > void drm_sched_entity_fini(struct drm_gpu_scheduler *sched, > struct drm_sched_entity *entity) > { > - drm_sched_entity_do_release(sched, entity); > + drm_sched_entity_do_release(sched, entity, MAX_WAIT_SCHED_ENTITY_Q_EMPTY); > drm_sched_entity_cleanup(sched, entity); > } > EXPORT_SYMBOL(drm_sched_entity_fini); > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h > index 496442f..7c2dfd6 100644 > --- a/include/drm/gpu_scheduler.h > +++ b/include/drm/gpu_scheduler.h > @@ -27,6 +27,8 @@ > #include <drm/spsc_queue.h> > #include <linux/dma-fence.h> > > +#define MAX_WAIT_SCHED_ENTITY_Q_EMPTY msecs_to_jiffies(1000) > + > struct drm_gpu_scheduler; > struct drm_sched_rq; > > @@ -84,7 +86,6 @@ struct drm_sched_entity { > struct dma_fence *dependency; > struct dma_fence_cb cb; > atomic_t *guilty; > - int fini_status; > struct dma_fence *last_scheduled; > }; > > @@ -283,8 +284,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched, > struct drm_sched_entity *entity, > struct drm_sched_rq *rq, > atomic_t *guilty); > -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, > - struct drm_sched_entity *entity); > +long drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, > + struct drm_sched_entity *entity, long timeout); > void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, > struct drm_sched_entity *entity); > void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
I reread the documentation for wait_event_timeout, my bad, all this time i assumed it returns amount of jiffies he spent in waiting while actually it returns "the remaining jiffies (at least 1) if the @condition evaluated * to %true before the @timeout elapsed." Will fix that, please take a look also at the second patch. Andrey On 06/04/2018 01:53 PM, Christian König wrote: >> + ret = wait_event_timeout( >> + sched->job_scheduled, >> + drm_sched_entity_is_idle(entity), >> + timeout); >> + >> + ret = ret ? timeout - ret : ret; > > Ok we still seem to have a misunderstanding here what > wait_event_timeout() returns. > > As far as I know that line shouldn't be necessary and is actually > quite harmful. > > Apart from that this patch looks fine to me now, > Christian.
Ok that explains it. I was already wondering what the heck I was missing :) The second patch already looked fine to me as well. Just send it out once more to get an rb, Christian. Am 04.06.2018 um 20:03 schrieb Andrey Grodzovsky: > > I reread the documentation for wait_event_timeout, my bad, all this > time i assumed it returns > > amount of jiffies he spent in waiting while actually it returns "the > remaining jiffies (at least 1) > > if the @condition evaluated > * to %true before the @timeout elapsed." > > Will fix that, please take a look also at the second patch. > > Andrey > > > On 06/04/2018 01:53 PM, Christian König wrote: >>> + ret = wait_event_timeout( >>> + sched->job_scheduled, >>> + drm_sched_entity_is_idle(entity), >>> + timeout); >>> + >>> + ret = ret ? timeout - ret : ret; >> >> Ok we still seem to have a misunderstanding here what >> wait_event_timeout() returns. >> >> As far as I know that line shouldn't be necessary and is actually >> quite harmful. >> >> Apart from that this patch looks fine to me now, >> Christian. >
diff --git a/drivers/gpu/drm/scheduler/gpu_scheduler.c b/drivers/gpu/drm/scheduler/gpu_scheduler.c index 8c1e80c..1e65221 100644 --- a/drivers/gpu/drm/scheduler/gpu_scheduler.c +++ b/drivers/gpu/drm/scheduler/gpu_scheduler.c @@ -181,7 +181,6 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched, entity->rq = rq; entity->sched = sched; entity->guilty = guilty; - entity->fini_status = 0; entity->last_scheduled = NULL; spin_lock_init(&entity->rq_lock); @@ -219,7 +218,8 @@ static bool drm_sched_entity_is_initialized(struct drm_gpu_scheduler *sched, static bool drm_sched_entity_is_idle(struct drm_sched_entity *entity) { rmb(); - if (spsc_queue_peek(&entity->job_queue) == NULL) + + if (!entity->rq || spsc_queue_peek(&entity->job_queue) == NULL) return true; return false; @@ -260,25 +260,42 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f, * * @sched: scheduler instance * @entity: scheduler entity + * @timeout: time to wait in for Q to become empty in jiffies. * * Splitting drm_sched_entity_fini() into two functions, The first one does the waiting, * removes the entity from the runqueue and returns an error when the process was killed. + * + * Returns the remaining time in jiffies left from the input timeout */ -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, - struct drm_sched_entity *entity) +long drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, + struct drm_sched_entity *entity, long timeout) { + long ret = timeout; + if (!drm_sched_entity_is_initialized(sched, entity)) - return; + return ret; /** * The client will not queue more IBs during this fini, consume existing * queued IBs or discard them on SIGKILL */ - if ((current->flags & PF_SIGNALED) && current->exit_code == SIGKILL) - entity->fini_status = -ERESTARTSYS; - else - entity->fini_status = wait_event_killable(sched->job_scheduled, - drm_sched_entity_is_idle(entity)); - drm_sched_entity_set_rq(entity, NULL); + if (current->flags & PF_EXITING) { + if (timeout) { + ret = wait_event_timeout( + sched->job_scheduled, + drm_sched_entity_is_idle(entity), + timeout); + + ret = ret ? timeout - ret : ret; + } + } else + wait_event_killable(sched->job_scheduled, drm_sched_entity_is_idle(entity)); + + + /* For killed process disable any more IBs enqueue right now */ + if ((current->flags & PF_EXITING) && (current->exit_code == SIGKILL)) + drm_sched_entity_set_rq(entity, NULL); + + return ret; } EXPORT_SYMBOL(drm_sched_entity_do_release); @@ -290,11 +307,18 @@ EXPORT_SYMBOL(drm_sched_entity_do_release); * * This should be called after @drm_sched_entity_do_release. It goes over the * entity and signals all jobs with an error code if the process was killed. + * */ void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity) { - if (entity->fini_status) { + + drm_sched_entity_set_rq(entity, NULL); + + /* Consumption of existing IBs wasn't completed. Forcefully + * remove them here. + */ + if (spsc_queue_peek(&entity->job_queue)) { struct drm_sched_job *job; int r; @@ -314,12 +338,22 @@ void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, struct drm_sched_fence *s_fence = job->s_fence; drm_sched_fence_scheduled(s_fence); dma_fence_set_error(&s_fence->finished, -ESRCH); - r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb, - drm_sched_entity_kill_jobs_cb); - if (r == -ENOENT) + + /* + * When pipe is hanged by older entity, new entity might + * not even have chance to submit it's first job to HW + * and so entity->last_scheduled will remain NULL + */ + if (!entity->last_scheduled) { drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb); - else if (r) - DRM_ERROR("fence add callback failed (%d)\n", r); + } else { + r = dma_fence_add_callback(entity->last_scheduled, &job->finish_cb, + drm_sched_entity_kill_jobs_cb); + if (r == -ENOENT) + drm_sched_entity_kill_jobs_cb(NULL, &job->finish_cb); + else if (r) + DRM_ERROR("fence add callback failed (%d)\n", r); + } } } @@ -339,7 +373,7 @@ EXPORT_SYMBOL(drm_sched_entity_cleanup); void drm_sched_entity_fini(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity) { - drm_sched_entity_do_release(sched, entity); + drm_sched_entity_do_release(sched, entity, MAX_WAIT_SCHED_ENTITY_Q_EMPTY); drm_sched_entity_cleanup(sched, entity); } EXPORT_SYMBOL(drm_sched_entity_fini); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 496442f..7c2dfd6 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -27,6 +27,8 @@ #include <drm/spsc_queue.h> #include <linux/dma-fence.h> +#define MAX_WAIT_SCHED_ENTITY_Q_EMPTY msecs_to_jiffies(1000) + struct drm_gpu_scheduler; struct drm_sched_rq; @@ -84,7 +86,6 @@ struct drm_sched_entity { struct dma_fence *dependency; struct dma_fence_cb cb; atomic_t *guilty; - int fini_status; struct dma_fence *last_scheduled; }; @@ -283,8 +284,8 @@ int drm_sched_entity_init(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity, struct drm_sched_rq *rq, atomic_t *guilty); -void drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, - struct drm_sched_entity *entity); +long drm_sched_entity_do_release(struct drm_gpu_scheduler *sched, + struct drm_sched_entity *entity, long timeout); void drm_sched_entity_cleanup(struct drm_gpu_scheduler *sched, struct drm_sched_entity *entity); void drm_sched_entity_fini(struct drm_gpu_scheduler *sched,
Dying process might be blocked from receiving any more signals so avoid using it. Also retire enity->fini_status and just check the SW queue, if it's not empty do the fallback cleanup. Also handle entity->last_scheduled == NULL use case which happens when HW ring is already hangged whem a new entity tried to enqeue jobs. v2: Return the remaining timeout and use that as parameter for the next call. This way when we need to cleanup multiple queues we don't wait for the entire TO period for each queue but rather in total. Styling comments. Rebase. v3: Update types from unsigned to long. Work with jiffies instead of ms. Return 0 when TO expires. Rebase. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> --- drivers/gpu/drm/scheduler/gpu_scheduler.c | 70 +++++++++++++++++++++++-------- include/drm/gpu_scheduler.h | 7 ++-- 2 files changed, 56 insertions(+), 21 deletions(-)