diff mbox series

[2/2] drm/scheduler: remove timeout work_struct from drm_sched_job

Message ID 20180918161710.2669-2-nayan26deshmukh@gmail.com (mailing list archive)
State New, archived
Headers show
Series [1/2] drm/scheduler: add a current job field to scheduler | expand

Commit Message

Nayan Deshmukh Sept. 18, 2018, 4:17 p.m. UTC
Having a delayed work item per job is redundant, as we only need one
per scheduler to track the timeout of the currently executing job.

Signed-off-by: Nayan Deshmukh <nayan26deshmukh@gmail.com>
Suggested-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/scheduler/sched_main.c | 16 +++++++++-------
 include/drm/gpu_scheduler.h            |  6 +++---
 2 files changed, 12 insertions(+), 10 deletions(-)

Comments

Christian König Sept. 19, 2018, 12:30 p.m. UTC | #1
Am 18.09.2018 um 18:17 schrieb Nayan Deshmukh:
> having a delayed work item per job is redundant as we only need one
> per scheduler to track the time out the currently executing job.

Well that looks simpler than I thought it would be.

But it exposes the next problem: the timeout and the completion could
race.

As far as I can see that can be fixed by moving the 
dma_fence_remove_callback()/dma_fence_add_callback() dance from 
drm_sched_hw_job_reset() to drm_sched_job_timedout().

Anyway, I would say drop patch #1 and fix the one comment below and we 
can use this.

>
> Signed-off-by: Nayan Deshmukh <nayan26deshmukh@gmail.com>
> Suggested-by: Christian König <christian.koenig@amd.com>
> ---
>   drivers/gpu/drm/scheduler/sched_main.c | 16 +++++++++-------
>   include/drm/gpu_scheduler.h            |  6 +++---
>   2 files changed, 12 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
> index 0e6ccc8243db..f213b5c7f718 100644
> --- a/drivers/gpu/drm/scheduler/sched_main.c
> +++ b/drivers/gpu/drm/scheduler/sched_main.c
> @@ -198,7 +198,7 @@ static void drm_sched_job_finish(struct work_struct *work)
>   	 * manages to find this job as the next job in the list, the fence
>   	 * signaled check below will prevent the timeout to be restarted.
>   	 */
> -	cancel_delayed_work_sync(&s_job->work_tdr);
> +	cancel_delayed_work_sync(&sched->work_tdr);
>   
>   	spin_lock(&sched->job_list_lock);
>   	/* queue TDR for next job */
> @@ -207,7 +207,7 @@ static void drm_sched_job_finish(struct work_struct *work)
>   	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>   	    !list_is_last(&s_job->node, &sched->ring_mirror_list)) {
>   		if (!dma_fence_is_signaled(&next->s_fence->finished))

Since we now have only one delayed work item, we can just drop the test
for whether next is already signaled.

Regards,
Christian.

> -			schedule_delayed_work(&next->work_tdr, sched->timeout);
> +			schedule_delayed_work(&sched->work_tdr, sched->timeout);
>   	}
>   	/* remove job from ring_mirror_list */
>   	list_del(&s_job->node);
> @@ -237,7 +237,7 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>   	if (list_first_entry_or_null(&sched->ring_mirror_list,
>   				struct drm_sched_job, node) == s_job) {
>   		if (sched->timeout != MAX_SCHEDULE_TIMEOUT)
> -			schedule_delayed_work(&s_job->work_tdr, sched->timeout);
> +			schedule_delayed_work(&sched->work_tdr, sched->timeout);
>   		sched->curr_job = s_job;
>   	}
>   	spin_unlock(&sched->job_list_lock);
> @@ -245,8 +245,10 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)
>   
>   static void drm_sched_job_timedout(struct work_struct *work)
>   {
> -	struct drm_sched_job *job = container_of(work, struct drm_sched_job,
> -						 work_tdr.work);
> +	struct drm_gpu_scheduler *sched = container_of(work,
> +						struct drm_gpu_scheduler,
> +						work_tdr.work);
> +	struct drm_sched_job *job = sched->curr_job;
>   
>   	job->sched->ops->timedout_job(job);
>   }
> @@ -318,7 +320,7 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
>   	s_job = list_first_entry_or_null(&sched->ring_mirror_list,
>   					 struct drm_sched_job, node);
>   	if (s_job && sched->timeout != MAX_SCHEDULE_TIMEOUT)
> -		schedule_delayed_work(&s_job->work_tdr, sched->timeout);
> +		schedule_delayed_work(&sched->work_tdr, sched->timeout);
>   	if (s_job)
>   		sched->curr_job = s_job;
>   
> @@ -389,7 +391,6 @@ int drm_sched_job_init(struct drm_sched_job *job,
>   
>   	INIT_WORK(&job->finish_work, drm_sched_job_finish);
>   	INIT_LIST_HEAD(&job->node);
> -	INIT_DELAYED_WORK(&job->work_tdr, drm_sched_job_timedout);
>   
>   	return 0;
>   }
> @@ -580,6 +581,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
>   	INIT_LIST_HEAD(&sched->ring_mirror_list);
>   	spin_lock_init(&sched->job_list_lock);
>   	atomic_set(&sched->hw_rq_count, 0);
> +	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>   	atomic_set(&sched->num_jobs, 0);
>   	atomic64_set(&sched->job_id_count, 0);
>   
> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> index 07e776b1ca42..9d50d7f3eaa4 100644
> --- a/include/drm/gpu_scheduler.h
> +++ b/include/drm/gpu_scheduler.h
> @@ -175,8 +175,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>    *               finished to remove the job from the
>    *               @drm_gpu_scheduler.ring_mirror_list.
>    * @node: used to append this struct to the @drm_gpu_scheduler.ring_mirror_list.
> - * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the timeout
> - *            interval is over.
>    * @id: a unique id assigned to each job scheduled on the scheduler.
>    * @karma: increment on every hang caused by this job. If this exceeds the hang
>    *         limit of the scheduler then the job is marked guilty and will not
> @@ -195,7 +193,6 @@ struct drm_sched_job {
>   	struct dma_fence_cb		finish_cb;
>   	struct work_struct		finish_work;
>   	struct list_head		node;
> -	struct delayed_work		work_tdr;
>   	uint64_t			id;
>   	atomic_t			karma;
>   	enum drm_sched_priority		s_priority;
> @@ -260,6 +257,8 @@ struct drm_sched_backend_ops {
>    *                 finished.
>    * @hw_rq_count: the number of jobs currently in the hardware queue.
>    * @job_id_count: used to assign unique id to the each job.
> + * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
> + *            timeout interval is over.
>    * @thread: the kthread on which the scheduler which run.
>    * @ring_mirror_list: the list of jobs which are currently in the job queue.
>    * @job_list_lock: lock to protect the ring_mirror_list.
> @@ -280,6 +279,7 @@ struct drm_gpu_scheduler {
>   	wait_queue_head_t		job_scheduled;
>   	atomic_t			hw_rq_count;
>   	atomic64_t			job_id_count;
> +	struct delayed_work		work_tdr;
>   	struct task_struct		*thread;
>   	struct list_head		ring_mirror_list;
>   	spinlock_t			job_list_lock;
Michel Dänzer Sept. 19, 2018, 3:39 p.m. UTC | #2
On 2018-09-19 2:30 p.m., Christian König wrote:
> Am 18.09.2018 um 18:17 schrieb Nayan Deshmukh:
>> having a delayed work item per job is redundant as we only need one
>> per scheduler to track the time out the currently executing job.
> 
> Well that looks simpler than I thought it would be.
> 
> But it shows the next problem that the timeout and the completion could
> race.
> 
> As far as I can see that can be fixed by moving the
> dma_fence_remove_callback()/dma_fence_add_callback() dance from
> drm_sched_hw_job_reset() to drm_sched_job_timedout().

BTW, while you guys are looking into this code, please keep an eye open
for things that could explain https://bugs.freedesktop.org/107762 .
Christian König Sept. 20, 2018, 6:41 a.m. UTC | #3
Am 19.09.2018 um 17:39 schrieb Michel Dänzer:
> On 2018-09-19 2:30 p.m., Christian König wrote:
>> Am 18.09.2018 um 18:17 schrieb Nayan Deshmukh:
>>> having a delayed work item per job is redundant as we only need one
>>> per scheduler to track the time out the currently executing job.
>> Well that looks simpler than I thought it would be.
>>
>> But it shows the next problem that the timeout and the completion could
>> race.
>>
>> As far as I can see that can be fixed by moving the
>> dma_fence_remove_callback()/dma_fence_add_callback() dance from
>> drm_sched_hw_job_reset() to drm_sched_job_timedout().
> BTW, while you guys are looking into this code, please keep an eye open
> for things that could explain https://bugs.freedesktop.org/107762 .

Yeah, since we now have only one timer that should be fixed by this as well.

Christian.
Nayan Deshmukh Sept. 20, 2018, 11:25 a.m. UTC | #4
On Wed, Sep 19, 2018, 9:31 PM Christian König <christian.koenig@amd.com>
wrote:

> Am 18.09.2018 um 18:17 schrieb Nayan Deshmukh:
> > having a delayed work item per job is redundant as we only need one
> > per scheduler to track the time out the currently executing job.
>
> Well that looks simpler than I thought it would be.
>
> But it shows the next problem that the timeout and the completion could
> race.
>
> As far as I can see that can be fixed by moving the
> dma_fence_remove_callback()/dma_fence_add_callback() dance from
> drm_sched_hw_job_reset() to drm_sched_job_timedout().
>
> Anyway, I would say drop patch #1 and fix the one comment below and we
> can use this.
>
> >
> > Signed-off-by: Nayan Deshmukh <nayan26deshmukh@gmail.com>
> > Suggested-by: Christian König <christian.koenig@amd.com>
> > ---
> >   drivers/gpu/drm/scheduler/sched_main.c | 16 +++++++++-------
> >   include/drm/gpu_scheduler.h            |  6 +++---
> >   2 files changed, 12 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/scheduler/sched_main.c
> b/drivers/gpu/drm/scheduler/sched_main.c
> > index 0e6ccc8243db..f213b5c7f718 100644
> > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > @@ -198,7 +198,7 @@ static void drm_sched_job_finish(struct work_struct
> *work)
> >        * manages to find this job as the next job in the list, the fence
> >        * signaled check below will prevent the timeout to be restarted.
> >        */
> > -     cancel_delayed_work_sync(&s_job->work_tdr);
> > +     cancel_delayed_work_sync(&sched->work_tdr);
> >
> >       spin_lock(&sched->job_list_lock);
> >       /* queue TDR for next job */
> > @@ -207,7 +207,7 @@ static void drm_sched_job_finish(struct work_struct
> *work)
> >       if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
> >           !list_is_last(&s_job->node, &sched->ring_mirror_list)) {
> >               if (!dma_fence_is_signaled(&next->s_fence->finished))
>
> Since we now have only one delayed work item we can just drop the test
> if next is already signaled.
>
Can you elaborate more on this? Which test are you talking about?

Regards,
Nayan

>
>
> Regards,
> Christian.
>
> > -                     schedule_delayed_work(&next->work_tdr,
> sched->timeout);
> > +                     schedule_delayed_work(&sched->work_tdr,
> sched->timeout);
> >       }
> >       /* remove job from ring_mirror_list */
> >       list_del(&s_job->node);
> > @@ -237,7 +237,7 @@ static void drm_sched_job_begin(struct drm_sched_job
> *s_job)
> >       if (list_first_entry_or_null(&sched->ring_mirror_list,
> >                               struct drm_sched_job, node) == s_job) {
> >               if (sched->timeout != MAX_SCHEDULE_TIMEOUT)
> > -                     schedule_delayed_work(&s_job->work_tdr,
> sched->timeout);
> > +                     schedule_delayed_work(&sched->work_tdr,
> sched->timeout);
> >               sched->curr_job = s_job;
> >       }
> >       spin_unlock(&sched->job_list_lock);
> > @@ -245,8 +245,10 @@ static void drm_sched_job_begin(struct
> drm_sched_job *s_job)
> >
> >   static void drm_sched_job_timedout(struct work_struct *work)
> >   {
> > -     struct drm_sched_job *job = container_of(work, struct
> drm_sched_job,
> > -                                              work_tdr.work);
> > +     struct drm_gpu_scheduler *sched = container_of(work,
> > +                                             struct drm_gpu_scheduler,
> > +                                             work_tdr.work);
> > +     struct drm_sched_job *job = sched->curr_job;
> >
> >       job->sched->ops->timedout_job(job);
> >   }
> > @@ -318,7 +320,7 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler
> *sched)
> >       s_job = list_first_entry_or_null(&sched->ring_mirror_list,
> >                                        struct drm_sched_job, node);
> >       if (s_job && sched->timeout != MAX_SCHEDULE_TIMEOUT)
> > -             schedule_delayed_work(&s_job->work_tdr, sched->timeout);
> > +             schedule_delayed_work(&sched->work_tdr, sched->timeout);
> >       if (s_job)
> >               sched->curr_job = s_job;
> >
> > @@ -389,7 +391,6 @@ int drm_sched_job_init(struct drm_sched_job *job,
> >
> >       INIT_WORK(&job->finish_work, drm_sched_job_finish);
> >       INIT_LIST_HEAD(&job->node);
> > -     INIT_DELAYED_WORK(&job->work_tdr, drm_sched_job_timedout);
> >
> >       return 0;
> >   }
> > @@ -580,6 +581,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,
> >       INIT_LIST_HEAD(&sched->ring_mirror_list);
> >       spin_lock_init(&sched->job_list_lock);
> >       atomic_set(&sched->hw_rq_count, 0);
> > +     INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
> >       atomic_set(&sched->num_jobs, 0);
> >       atomic64_set(&sched->job_id_count, 0);
> >
> > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
> > index 07e776b1ca42..9d50d7f3eaa4 100644
> > --- a/include/drm/gpu_scheduler.h
> > +++ b/include/drm/gpu_scheduler.h
> > @@ -175,8 +175,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct
> dma_fence *f);
> >    *               finished to remove the job from the
> >    *               @drm_gpu_scheduler.ring_mirror_list.
> >    * @node: used to append this struct to the
> @drm_gpu_scheduler.ring_mirror_list.
> > - * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after
> the timeout
> > - *            interval is over.
> >    * @id: a unique id assigned to each job scheduled on the scheduler.
> >    * @karma: increment on every hang caused by this job. If this exceeds
> the hang
> >    *         limit of the scheduler then the job is marked guilty and
> will not
> > @@ -195,7 +193,6 @@ struct drm_sched_job {
> >       struct dma_fence_cb             finish_cb;
> >       struct work_struct              finish_work;
> >       struct list_head                node;
> > -     struct delayed_work             work_tdr;
> >       uint64_t                        id;
> >       atomic_t                        karma;
> >       enum drm_sched_priority         s_priority;
> > @@ -260,6 +257,8 @@ struct drm_sched_backend_ops {
> >    *                 finished.
> >    * @hw_rq_count: the number of jobs currently in the hardware queue.
> >    * @job_id_count: used to assign unique id to the each job.
> > + * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after
> the
> > + *            timeout interval is over.
> >    * @thread: the kthread on which the scheduler which run.
> >    * @ring_mirror_list: the list of jobs which are currently in the job
> queue.
> >    * @job_list_lock: lock to protect the ring_mirror_list.
> > @@ -280,6 +279,7 @@ struct drm_gpu_scheduler {
> >       wait_queue_head_t               job_scheduled;
> >       atomic_t                        hw_rq_count;
> >       atomic64_t                      job_id_count;
> > +     struct delayed_work             work_tdr;
> >       struct task_struct              *thread;
> >       struct list_head                ring_mirror_list;
> >       spinlock_t                      job_list_lock;
>
>
<div dir="auto"><div><br><br><div class="gmail_quote"><div dir="ltr">On Wed, Sep 19, 2018, 9:31 PM Christian König &lt;<a href="mailto:christian.koenig@amd.com">christian.koenig@amd.com</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Am 18.09.2018 um 18:17 schrieb Nayan Deshmukh:<br>
&gt; having a delayed work item per job is redundant as we only need one<br>
&gt; per scheduler to track the time out the currently executing job.<br>
<br>
Well that looks simpler than I thought it would be.<br>
<br>
But it shows the next problem that the timeout and the completion could <br>
race.<br>
<br>
As far as I can see that can be fixed by moving the <br>
dma_fence_remove_callback()/dma_fence_add_callback() dance from <br>
drm_sched_hw_job_reset() to drm_sched_job_timedout().<br>
<br>
Anyway, I would say drop patch #1 and fix the one comment below and we <br>
can use this.<br>
<br>
&gt;<br>
&gt; Signed-off-by: Nayan Deshmukh &lt;<a href="mailto:nayan26deshmukh@gmail.com" target="_blank" rel="noreferrer">nayan26deshmukh@gmail.com</a>&gt;<br>
&gt; Suggested-by: Christian König &lt;<a href="mailto:christian.koenig@amd.com" target="_blank" rel="noreferrer">christian.koenig@amd.com</a>&gt;<br>
&gt; ---<br>
&gt;   drivers/gpu/drm/scheduler/sched_main.c | 16 +++++++++-------<br>
&gt;   include/drm/gpu_scheduler.h            |  6 +++---<br>
&gt;   2 files changed, 12 insertions(+), 10 deletions(-)<br>
&gt;<br>
&gt; diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c<br>
&gt; index 0e6ccc8243db..f213b5c7f718 100644<br>
&gt; --- a/drivers/gpu/drm/scheduler/sched_main.c<br>
&gt; +++ b/drivers/gpu/drm/scheduler/sched_main.c<br>
&gt; @@ -198,7 +198,7 @@ static void drm_sched_job_finish(struct work_struct *work)<br>
&gt;        * manages to find this job as the next job in the list, the fence<br>
&gt;        * signaled check below will prevent the timeout to be restarted.<br>
&gt;        */<br>
&gt; -     cancel_delayed_work_sync(&amp;s_job-&gt;work_tdr);<br>
&gt; +     cancel_delayed_work_sync(&amp;sched-&gt;work_tdr);<br>
&gt;   <br>
&gt;       spin_lock(&amp;sched-&gt;job_list_lock);<br>
&gt;       /* queue TDR for next job */<br>
&gt; @@ -207,7 +207,7 @@ static void drm_sched_job_finish(struct work_struct *work)<br>
&gt;       if (sched-&gt;timeout != MAX_SCHEDULE_TIMEOUT &amp;&amp;<br>
&gt;           !list_is_last(&amp;s_job-&gt;node, &amp;sched-&gt;ring_mirror_list)) {<br>
&gt;               if (!dma_fence_is_signaled(&amp;next-&gt;s_fence-&gt;finished))<br>
<br>
Since we now have only one delayed work item we can just drop the test <br>
if next is already signaled.<br></blockquote></div></div><div dir="auto"><span style="font-family:sans-serif;font-size:12.8px">Can you elaborate more on this. Which test are you talking about?</span><br style="font-family:sans-serif;font-size:12.8px"><br style="font-family:sans-serif;font-size:12.8px"><span style="font-family:sans-serif;font-size:12.8px">Regards,</span><br style="font-family:sans-serif;font-size:12.8px"><span style="font-family:sans-serif;font-size:12.8px">Nayan</span><br></div><div dir="auto"><div class="gmail_quote"><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<br><br>
Regards,<br>
Christian.<br>
<br>
&gt; -                     schedule_delayed_work(&amp;next-&gt;work_tdr, sched-&gt;timeout);<br>
&gt; +                     schedule_delayed_work(&amp;sched-&gt;work_tdr, sched-&gt;timeout);<br>
&gt;       }<br>
&gt;       /* remove job from ring_mirror_list */<br>
&gt;       list_del(&amp;s_job-&gt;node);<br>
&gt; @@ -237,7 +237,7 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)<br>
&gt;       if (list_first_entry_or_null(&amp;sched-&gt;ring_mirror_list,<br>
&gt;                               struct drm_sched_job, node) == s_job) {<br>
&gt;               if (sched-&gt;timeout != MAX_SCHEDULE_TIMEOUT)<br>
&gt; -                     schedule_delayed_work(&amp;s_job-&gt;work_tdr, sched-&gt;timeout);<br>
&gt; +                     schedule_delayed_work(&amp;sched-&gt;work_tdr, sched-&gt;timeout);<br>
&gt;               sched-&gt;curr_job = s_job;<br>
&gt;       }<br>
&gt;       spin_unlock(&amp;sched-&gt;job_list_lock);<br>
&gt; @@ -245,8 +245,10 @@ static void drm_sched_job_begin(struct drm_sched_job *s_job)<br>
&gt;   <br>
&gt;   static void drm_sched_job_timedout(struct work_struct *work)<br>
&gt;   {<br>
&gt; -     struct drm_sched_job *job = container_of(work, struct drm_sched_job,<br>
&gt; -                                              <a href="http://work_tdr.work" rel="noreferrer noreferrer" target="_blank">work_tdr.work</a>);<br>
&gt; +     struct drm_gpu_scheduler *sched = container_of(work,<br>
&gt; +                                             struct drm_gpu_scheduler,<br>
&gt; +                                             <a href="http://work_tdr.work" rel="noreferrer noreferrer" target="_blank">work_tdr.work</a>);<br>
&gt; +     struct drm_sched_job *job = sched-&gt;curr_job;<br>
&gt;   <br>
&gt;       job-&gt;sched-&gt;ops-&gt;timedout_job(job);<br>
&gt;   }<br>
&gt; @@ -318,7 +320,7 @@ void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)<br>
&gt;       s_job = list_first_entry_or_null(&amp;sched-&gt;ring_mirror_list,<br>
&gt;                                        struct drm_sched_job, node);<br>
&gt;       if (s_job &amp;&amp; sched-&gt;timeout != MAX_SCHEDULE_TIMEOUT)<br>
&gt; -             schedule_delayed_work(&amp;s_job-&gt;work_tdr, sched-&gt;timeout);<br>
&gt; +             schedule_delayed_work(&amp;sched-&gt;work_tdr, sched-&gt;timeout);<br>
&gt;       if (s_job)<br>
&gt;               sched-&gt;curr_job = s_job;<br>
&gt;   <br>
&gt; @@ -389,7 +391,6 @@ int drm_sched_job_init(struct drm_sched_job *job,<br>
&gt;   <br>
&gt;       INIT_WORK(&amp;job-&gt;finish_work, drm_sched_job_finish);<br>
&gt;       INIT_LIST_HEAD(&amp;job-&gt;node);<br>
&gt; -     INIT_DELAYED_WORK(&amp;job-&gt;work_tdr, drm_sched_job_timedout);<br>
&gt;   <br>
&gt;       return 0;<br>
&gt;   }<br>
&gt; @@ -580,6 +581,7 @@ int drm_sched_init(struct drm_gpu_scheduler *sched,<br>
&gt;       INIT_LIST_HEAD(&amp;sched-&gt;ring_mirror_list);<br>
&gt;       spin_lock_init(&amp;sched-&gt;job_list_lock);<br>
&gt;       atomic_set(&amp;sched-&gt;hw_rq_count, 0);<br>
&gt; +     INIT_DELAYED_WORK(&amp;sched-&gt;work_tdr, drm_sched_job_timedout);<br>
&gt;       atomic_set(&amp;sched-&gt;num_jobs, 0);<br>
&gt;       atomic64_set(&amp;sched-&gt;job_id_count, 0);<br>
&gt;   <br>
&gt; diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h<br>
&gt; index 07e776b1ca42..9d50d7f3eaa4 100644<br>
&gt; --- a/include/drm/gpu_scheduler.h<br>
&gt; +++ b/include/drm/gpu_scheduler.h<br>
&gt; @@ -175,8 +175,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);<br>
&gt;    *               finished to remove the job from the<br>
&gt;    *               @drm_gpu_scheduler.ring_mirror_list.<br>
&gt;    * @node: used to append this struct to the @drm_gpu_scheduler.ring_mirror_list.<br>
&gt; - * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the timeout<br>
&gt; - *            interval is over.<br>
&gt;    * @id: a unique id assigned to each job scheduled on the scheduler.<br>
&gt;    * @karma: increment on every hang caused by this job. If this exceeds the hang<br>
&gt;    *         limit of the scheduler then the job is marked guilty and will not<br>
&gt; @@ -195,7 +193,6 @@ struct drm_sched_job {<br>
&gt;       struct dma_fence_cb             finish_cb;<br>
&gt;       struct work_struct              finish_work;<br>
&gt;       struct list_head                node;<br>
&gt; -     struct delayed_work             work_tdr;<br>
&gt;       uint64_t                        id;<br>
&gt;       atomic_t                        karma;<br>
&gt;       enum drm_sched_priority         s_priority;<br>
&gt; @@ -260,6 +257,8 @@ struct drm_sched_backend_ops {<br>
&gt;    *                 finished.<br>
&gt;    * @hw_rq_count: the number of jobs currently in the hardware queue.<br>
&gt;    * @job_id_count: used to assign unique id to the each job.<br>
&gt; + * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the<br>
&gt; + *            timeout interval is over.<br>
&gt;    * @thread: the kthread on which the scheduler which run.<br>
&gt;    * @ring_mirror_list: the list of jobs which are currently in the job queue.<br>
&gt;    * @job_list_lock: lock to protect the ring_mirror_list.<br>
&gt; @@ -280,6 +279,7 @@ struct drm_gpu_scheduler {<br>
&gt;       wait_queue_head_t               job_scheduled;<br>
&gt;       atomic_t                        hw_rq_count;<br>
&gt;       atomic64_t                      job_id_count;<br>
&gt; +     struct delayed_work             work_tdr;<br>
&gt;       struct task_struct              *thread;<br>
&gt;       struct list_head                ring_mirror_list;<br>
&gt;       spinlock_t                      job_list_lock;<br>
<br>
</blockquote></div></div></div>
Christian König Sept. 20, 2018, 11:30 a.m. UTC | #5
Am 20.09.2018 um 13:25 schrieb Nayan Deshmukh:
>
>
> On Wed, Sep 19, 2018, 9:31 PM Christian König 
> <christian.koenig@amd.com <mailto:christian.koenig@amd.com>> wrote:
>
>     Am 18.09.2018 um 18:17 schrieb Nayan Deshmukh:
>     > having a delayed work item per job is redundant as we only need one
>     > per scheduler to track the time out the currently executing job.
>
>     Well that looks simpler than I thought it would be.
>
>     But it shows the next problem that the timeout and the completion
>     could
>     race.
>
>     As far as I can see that can be fixed by moving the
>     dma_fence_remove_callback()/dma_fence_add_callback() dance from
>     drm_sched_hw_job_reset() to drm_sched_job_timedout().
>
>     Anyway, I would say drop patch #1 and fix the one comment below
>     and we
>     can use this.
>
>     >
>     > Signed-off-by: Nayan Deshmukh <nayan26deshmukh@gmail.com
>     <mailto:nayan26deshmukh@gmail.com>>
>     > Suggested-by: Christian König <christian.koenig@amd.com
>     <mailto:christian.koenig@amd.com>>
>     > ---
>     >   drivers/gpu/drm/scheduler/sched_main.c | 16 +++++++++-------
>     >   include/drm/gpu_scheduler.h            |  6 +++---
>     >   2 files changed, 12 insertions(+), 10 deletions(-)
>     >
>     > diff --git a/drivers/gpu/drm/scheduler/sched_main.c
>     b/drivers/gpu/drm/scheduler/sched_main.c
>     > index 0e6ccc8243db..f213b5c7f718 100644
>     > --- a/drivers/gpu/drm/scheduler/sched_main.c
>     > +++ b/drivers/gpu/drm/scheduler/sched_main.c
>     > @@ -198,7 +198,7 @@ static void drm_sched_job_finish(struct
>     work_struct *work)
>     >        * manages to find this job as the next job in the list,
>     the fence
>     >        * signaled check below will prevent the timeout to be
>     restarted.
>     >        */
>     > -  cancel_delayed_work_sync(&s_job->work_tdr);
>     > +  cancel_delayed_work_sync(&sched->work_tdr);
>     >
>     >       spin_lock(&sched->job_list_lock);
>     >       /* queue TDR for next job */
>     > @@ -207,7 +207,7 @@ static void drm_sched_job_finish(struct
>     work_struct *work)
>     >       if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
>     >           !list_is_last(&s_job->node, &sched->ring_mirror_list)) {
>     >               if (!dma_fence_is_signaled(&next->s_fence->finished))
>
>     Since we now have only one delayed work item we can just drop the
>     test
>     if next is already signaled.
>
> Can you elaborate more on this. Which test are you talking about?

I was talking about the "!dma_fence_is_signaled()" test here.

>
> Regards,
> Nayan
>
>
>
>     Regards,
>     Christian.
>
>     > -  schedule_delayed_work(&next->work_tdr, sched->timeout);
>     > +  schedule_delayed_work(&sched->work_tdr, sched->timeout);
>     >       }
>     >       /* remove job from ring_mirror_list */
>     >       list_del(&s_job->node);
>

Basically, you could do this first, and then you only need to test whether
sched->ring_mirror_list is empty.

Regards,
Christian.

>     > @@ -237,7 +237,7 @@ static void drm_sched_job_begin(struct
>     drm_sched_job *s_job)
>     >       if (list_first_entry_or_null(&sched->ring_mirror_list,
>     >                               struct drm_sched_job, node) ==
>     s_job) {
>     >               if (sched->timeout != MAX_SCHEDULE_TIMEOUT)
>     > -  schedule_delayed_work(&s_job->work_tdr, sched->timeout);
>     > +  schedule_delayed_work(&sched->work_tdr, sched->timeout);
>     >               sched->curr_job = s_job;
>     >       }
>     >       spin_unlock(&sched->job_list_lock);
>     > @@ -245,8 +245,10 @@ static void drm_sched_job_begin(struct
>     drm_sched_job *s_job)
>     >
>     >   static void drm_sched_job_timedout(struct work_struct *work)
>     >   {
>     > -     struct drm_sched_job *job = container_of(work, struct
>     drm_sched_job,
>     > - work_tdr.work <http://work_tdr.work>);
>     > +     struct drm_gpu_scheduler *sched = container_of(work,
>     > +                                             struct
>     drm_gpu_scheduler,
>     > + work_tdr.work <http://work_tdr.work>);
>     > +     struct drm_sched_job *job = sched->curr_job;
>     >
>     >       job->sched->ops->timedout_job(job);
>     >   }
>     > @@ -318,7 +320,7 @@ void drm_sched_job_recovery(struct
>     drm_gpu_scheduler *sched)
>     >       s_job = list_first_entry_or_null(&sched->ring_mirror_list,
>     >                                        struct drm_sched_job, node);
>     >       if (s_job && sched->timeout != MAX_SCHEDULE_TIMEOUT)
>     > -  schedule_delayed_work(&s_job->work_tdr, sched->timeout);
>     > +  schedule_delayed_work(&sched->work_tdr, sched->timeout);
>     >       if (s_job)
>     >               sched->curr_job = s_job;
>     >
>     > @@ -389,7 +391,6 @@ int drm_sched_job_init(struct drm_sched_job
>     *job,
>     >
>     >       INIT_WORK(&job->finish_work, drm_sched_job_finish);
>     >       INIT_LIST_HEAD(&job->node);
>     > -     INIT_DELAYED_WORK(&job->work_tdr, drm_sched_job_timedout);
>     >
>     >       return 0;
>     >   }
>     > @@ -580,6 +581,7 @@ int drm_sched_init(struct drm_gpu_scheduler
>     *sched,
>     >  INIT_LIST_HEAD(&sched->ring_mirror_list);
>     >       spin_lock_init(&sched->job_list_lock);
>     >       atomic_set(&sched->hw_rq_count, 0);
>     > +     INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
>     >       atomic_set(&sched->num_jobs, 0);
>     >       atomic64_set(&sched->job_id_count, 0);
>     >
>     > diff --git a/include/drm/gpu_scheduler.h
>     b/include/drm/gpu_scheduler.h
>     > index 07e776b1ca42..9d50d7f3eaa4 100644
>     > --- a/include/drm/gpu_scheduler.h
>     > +++ b/include/drm/gpu_scheduler.h
>     > @@ -175,8 +175,6 @@ struct drm_sched_fence
>     *to_drm_sched_fence(struct dma_fence *f);
>     >    *               finished to remove the job from the
>     >    *  @drm_gpu_scheduler.ring_mirror_list.
>     >    * @node: used to append this struct to the
>     @drm_gpu_scheduler.ring_mirror_list.
>     > - * @work_tdr: schedules a delayed call to
>     @drm_sched_job_timedout after the timeout
>     > - *            interval is over.
>     >    * @id: a unique id assigned to each job scheduled on the
>     scheduler.
>     >    * @karma: increment on every hang caused by this job. If this
>     exceeds the hang
>     >    *         limit of the scheduler then the job is marked
>     guilty and will not
>     > @@ -195,7 +193,6 @@ struct drm_sched_job {
>     >       struct dma_fence_cb             finish_cb;
>     >       struct work_struct              finish_work;
>     >       struct list_head                node;
>     > -     struct delayed_work             work_tdr;
>     >       uint64_t                        id;
>     >       atomic_t                        karma;
>     >       enum drm_sched_priority         s_priority;
>     > @@ -260,6 +257,8 @@ struct drm_sched_backend_ops {
>     >    *                 finished.
>     >    * @hw_rq_count: the number of jobs currently in the hardware
>     queue.
>     >    * @job_id_count: used to assign unique id to the each job.
>     > + * @work_tdr: schedules a delayed call to
>     @drm_sched_job_timedout after the
>     > + *            timeout interval is over.
>     >    * @thread: the kthread on which the scheduler which run.
>     >    * @ring_mirror_list: the list of jobs which are currently in
>     the job queue.
>     >    * @job_list_lock: lock to protect the ring_mirror_list.
>     > @@ -280,6 +279,7 @@ struct drm_gpu_scheduler {
>     >       wait_queue_head_t               job_scheduled;
>     >       atomic_t                        hw_rq_count;
>     >       atomic64_t                      job_id_count;
>     > +     struct delayed_work             work_tdr;
>     >       struct task_struct              *thread;
>     >       struct list_head ring_mirror_list;
>     >       spinlock_t                      job_list_lock;
>
<html>
  <head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body text="#000000" bgcolor="#FFFFFF">
    <div class="moz-cite-prefix">Am 20.09.2018 um 13:25 schrieb Nayan
      Deshmukh:<br>
    </div>
    <blockquote type="cite"
cite="mid:CAFd4ddzZo2M20igKZuRdxrRg6__FCn4LBRxYvC2xD=sYogs7PQ@mail.gmail.com">
      <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
      <div dir="auto">
        <div><br>
          <br>
          <div class="gmail_quote">
            <div dir="ltr">On Wed, Sep 19, 2018, 9:31 PM Christian König
              &lt;<a href="mailto:christian.koenig@amd.com"
                moz-do-not-send="true">christian.koenig@amd.com</a>&gt;
              wrote:<br>
            </div>
            <blockquote class="gmail_quote" style="margin:0 0 0
              .8ex;border-left:1px #ccc solid;padding-left:1ex">Am
              18.09.2018 um 18:17 schrieb Nayan Deshmukh:<br>
              &gt; having a delayed work item per job is redundant as we
              only need one<br>
              &gt; per scheduler to track the time out the currently
              executing job.<br>
              <br>
              Well that looks simpler than I thought it would be.<br>
              <br>
              But it shows the next problem that the timeout and the
              completion could <br>
              race.<br>
              <br>
              As far as I can see that can be fixed by moving the <br>
              dma_fence_remove_callback()/dma_fence_add_callback() dance
              from <br>
              drm_sched_hw_job_reset() to drm_sched_job_timedout().<br>
              <br>
              Anyway, I would say drop patch #1 and fix the one comment
              below and we <br>
              can use this.<br>
              <br>
              &gt;<br>
              &gt; Signed-off-by: Nayan Deshmukh &lt;<a
                href="mailto:nayan26deshmukh@gmail.com" target="_blank"
                rel="noreferrer" moz-do-not-send="true">nayan26deshmukh@gmail.com</a>&gt;<br>
              &gt; Suggested-by: Christian König &lt;<a
                href="mailto:christian.koenig@amd.com" target="_blank"
                rel="noreferrer" moz-do-not-send="true">christian.koenig@amd.com</a>&gt;<br>
              &gt; ---<br>
              &gt;   drivers/gpu/drm/scheduler/sched_main.c | 16
              +++++++++-------<br>
              &gt;   include/drm/gpu_scheduler.h            |  6 +++---<br>
              &gt;   2 files changed, 12 insertions(+), 10 deletions(-)<br>
              &gt;<br>
              &gt; diff --git a/drivers/gpu/drm/scheduler/sched_main.c
              b/drivers/gpu/drm/scheduler/sched_main.c<br>
              &gt; index 0e6ccc8243db..f213b5c7f718 100644<br>
              &gt; --- a/drivers/gpu/drm/scheduler/sched_main.c<br>
              &gt; +++ b/drivers/gpu/drm/scheduler/sched_main.c<br>
              &gt; @@ -198,7 +198,7 @@ static void
              drm_sched_job_finish(struct work_struct *work)<br>
              &gt;        * manages to find this job as the next job in
              the list, the fence<br>
              &gt;        * signaled check below will prevent the
              timeout to be restarted.<br>
              &gt;        */<br>
              &gt; -   
               cancel_delayed_work_sync(&amp;s_job-&gt;work_tdr);<br>
              &gt; +   
               cancel_delayed_work_sync(&amp;sched-&gt;work_tdr);<br>
              &gt;   <br>
              &gt;       spin_lock(&amp;sched-&gt;job_list_lock);<br>
              &gt;       /* queue TDR for next job */<br>
              &gt; @@ -207,7 +207,7 @@ static void
              drm_sched_job_finish(struct work_struct *work)<br>
              &gt;       if (sched-&gt;timeout != MAX_SCHEDULE_TIMEOUT
              &amp;&amp;<br>
              &gt;           !list_is_last(&amp;s_job-&gt;node,
              &amp;sched-&gt;ring_mirror_list)) {<br>
              &gt;               if
              (!dma_fence_is_signaled(&amp;next-&gt;s_fence-&gt;finished))<br>
              <br>
              Since we now have only one delayed work item we can just
              drop the test <br>
              if next is already signaled.<br>
            </blockquote>
          </div>
        </div>
        <div dir="auto"><span
            style="font-family:sans-serif;font-size:12.8px">Can you
            elaborate more on this? Which test are you talking about?</span><br
            style="font-family:sans-serif;font-size:12.8px">
        </div>
      </div>
    </blockquote>
    <br>
    I was talking about the "!dma_fence_is_signaled()" test here.<br>
    <br>
    <blockquote type="cite"
cite="mid:CAFd4ddzZo2M20igKZuRdxrRg6__FCn4LBRxYvC2xD=sYogs7PQ@mail.gmail.com">
      <div dir="auto">
        <div dir="auto"><br
            style="font-family:sans-serif;font-size:12.8px">
          <span style="font-family:sans-serif;font-size:12.8px">Regards,</span><br
            style="font-family:sans-serif;font-size:12.8px">
          <span style="font-family:sans-serif;font-size:12.8px">Nayan</span><br>
        </div>
        <div dir="auto">
          <div class="gmail_quote">
            <blockquote class="gmail_quote" style="margin:0 0 0
              .8ex;border-left:1px #ccc solid;padding-left:1ex">
              <br>
              <br>
              Regards,<br>
              Christian.<br>
              <br>
              &gt; -                   
               schedule_delayed_work(&amp;next-&gt;work_tdr,
              sched-&gt;timeout);<br>
              &gt; +                   
               schedule_delayed_work(&amp;sched-&gt;work_tdr,
              sched-&gt;timeout);<br>
              &gt;       }<br>
              &gt;       /* remove job from ring_mirror_list */<br>
              &gt;       list_del(&amp;s_job-&gt;node);<br>
            </blockquote>
          </div>
        </div>
      </div>
    </blockquote>
    <br>
    Basically you could do this first, and then you only need to test whether
    sched-&gt;ring_mirror_list is empty.<br>
    <br>
    Regards,<br>
    Christian.<br>
    <br>
    <blockquote type="cite"
cite="mid:CAFd4ddzZo2M20igKZuRdxrRg6__FCn4LBRxYvC2xD=sYogs7PQ@mail.gmail.com">
      <div dir="auto">
        <div dir="auto">
          <div class="gmail_quote">
            <blockquote class="gmail_quote" style="margin:0 0 0
              .8ex;border-left:1px #ccc solid;padding-left:1ex">
              &gt; @@ -237,7 +237,7 @@ static void
              drm_sched_job_begin(struct drm_sched_job *s_job)<br>
              &gt;       if
              (list_first_entry_or_null(&amp;sched-&gt;ring_mirror_list,<br>
              &gt;                               struct drm_sched_job,
              node) == s_job) {<br>
              &gt;               if (sched-&gt;timeout !=
              MAX_SCHEDULE_TIMEOUT)<br>
              &gt; -                   
               schedule_delayed_work(&amp;s_job-&gt;work_tdr,
              sched-&gt;timeout);<br>
              &gt; +                   
               schedule_delayed_work(&amp;sched-&gt;work_tdr,
              sched-&gt;timeout);<br>
              &gt;               sched-&gt;curr_job = s_job;<br>
              &gt;       }<br>
              &gt;       spin_unlock(&amp;sched-&gt;job_list_lock);<br>
              &gt; @@ -245,8 +245,10 @@ static void
              drm_sched_job_begin(struct drm_sched_job *s_job)<br>
              &gt;   <br>
              &gt;   static void drm_sched_job_timedout(struct
              work_struct *work)<br>
              &gt;   {<br>
              &gt; -     struct drm_sched_job *job = container_of(work,
              struct drm_sched_job,<br>
              &gt; -                                              <a
                href="http://work_tdr.work" rel="noreferrer noreferrer"
                target="_blank" moz-do-not-send="true">work_tdr.work</a>);<br>
              &gt; +     struct drm_gpu_scheduler *sched =
              container_of(work,<br>
              &gt; +                                             struct
              drm_gpu_scheduler,<br>
              &gt; +                                             <a
                href="http://work_tdr.work" rel="noreferrer noreferrer"
                target="_blank" moz-do-not-send="true">work_tdr.work</a>);<br>
              &gt; +     struct drm_sched_job *job = sched-&gt;curr_job;<br>
              &gt;   <br>
              &gt;       job-&gt;sched-&gt;ops-&gt;timedout_job(job);<br>
              &gt;   }<br>
              &gt; @@ -318,7 +320,7 @@ void
              drm_sched_job_recovery(struct drm_gpu_scheduler *sched)<br>
              &gt;       s_job =
              list_first_entry_or_null(&amp;sched-&gt;ring_mirror_list,<br>
              &gt;                                        struct
              drm_sched_job, node);<br>
              &gt;       if (s_job &amp;&amp; sched-&gt;timeout !=
              MAX_SCHEDULE_TIMEOUT)<br>
              &gt; -           
               schedule_delayed_work(&amp;s_job-&gt;work_tdr,
              sched-&gt;timeout);<br>
              &gt; +           
               schedule_delayed_work(&amp;sched-&gt;work_tdr,
              sched-&gt;timeout);<br>
              &gt;       if (s_job)<br>
              &gt;               sched-&gt;curr_job = s_job;<br>
              &gt;   <br>
              &gt; @@ -389,7 +391,6 @@ int drm_sched_job_init(struct
              drm_sched_job *job,<br>
              &gt;   <br>
              &gt;       INIT_WORK(&amp;job-&gt;finish_work,
              drm_sched_job_finish);<br>
              &gt;       INIT_LIST_HEAD(&amp;job-&gt;node);<br>
              &gt; -     INIT_DELAYED_WORK(&amp;job-&gt;work_tdr,
              drm_sched_job_timedout);<br>
              &gt;   <br>
              &gt;       return 0;<br>
              &gt;   }<br>
              &gt; @@ -580,6 +581,7 @@ int drm_sched_init(struct
              drm_gpu_scheduler *sched,<br>
              &gt;     
               INIT_LIST_HEAD(&amp;sched-&gt;ring_mirror_list);<br>
              &gt;       spin_lock_init(&amp;sched-&gt;job_list_lock);<br>
              &gt;       atomic_set(&amp;sched-&gt;hw_rq_count, 0);<br>
              &gt; +     INIT_DELAYED_WORK(&amp;sched-&gt;work_tdr,
              drm_sched_job_timedout);<br>
              &gt;       atomic_set(&amp;sched-&gt;num_jobs, 0);<br>
              &gt;       atomic64_set(&amp;sched-&gt;job_id_count, 0);<br>
              &gt;   <br>
              &gt; diff --git a/include/drm/gpu_scheduler.h
              b/include/drm/gpu_scheduler.h<br>
              &gt; index 07e776b1ca42..9d50d7f3eaa4 100644<br>
              &gt; --- a/include/drm/gpu_scheduler.h<br>
              &gt; +++ b/include/drm/gpu_scheduler.h<br>
              &gt; @@ -175,8 +175,6 @@ struct drm_sched_fence
              *to_drm_sched_fence(struct dma_fence *f);<br>
              &gt;    *               finished to remove the job from
              the<br>
              &gt;    *             
               @drm_gpu_scheduler.ring_mirror_list.<br>
              &gt;    * @node: used to append this struct to the
              @drm_gpu_scheduler.ring_mirror_list.<br>
              &gt; - * @work_tdr: schedules a delayed call to
              @drm_sched_job_timedout after the timeout<br>
              &gt; - *            interval is over.<br>
              &gt;    * @id: a unique id assigned to each job scheduled
              on the scheduler.<br>
              &gt;    * @karma: increment on every hang caused by this
              job. If this exceeds the hang<br>
              &gt;    *         limit of the scheduler then the job is
              marked guilty and will not<br>
              &gt; @@ -195,7 +193,6 @@ struct drm_sched_job {<br>
              &gt;       struct dma_fence_cb             finish_cb;<br>
              &gt;       struct work_struct              finish_work;<br>
              &gt;       struct list_head                node;<br>
              &gt; -     struct delayed_work             work_tdr;<br>
              &gt;       uint64_t                        id;<br>
              &gt;       atomic_t                        karma;<br>
              &gt;       enum drm_sched_priority         s_priority;<br>
              &gt; @@ -260,6 +257,8 @@ struct drm_sched_backend_ops {<br>
              &gt;    *                 finished.<br>
              &gt;    * @hw_rq_count: the number of jobs currently in
              the hardware queue.<br>
              &gt;    * @job_id_count: used to assign unique id to the
              each job.<br>
              &gt; + * @work_tdr: schedules a delayed call to
              @drm_sched_job_timedout after the<br>
              &gt; + *            timeout interval is over.<br>
              &gt;    * @thread: the kthread on which the scheduler
              which run.<br>
              &gt;    * @ring_mirror_list: the list of jobs which are
              currently in the job queue.<br>
              &gt;    * @job_list_lock: lock to protect the
              ring_mirror_list.<br>
              &gt; @@ -280,6 +279,7 @@ struct drm_gpu_scheduler {<br>
              &gt;       wait_queue_head_t               job_scheduled;<br>
              &gt;       atomic_t                        hw_rq_count;<br>
              &gt;       atomic64_t                      job_id_count;<br>
              &gt; +     struct delayed_work             work_tdr;<br>
              &gt;       struct task_struct              *thread;<br>
              &gt;       struct list_head               
              ring_mirror_list;<br>
              &gt;       spinlock_t                      job_list_lock;<br>
              <br>
            </blockquote>
          </div>
        </div>
      </div>
    </blockquote>
    <br>
  </body>
</html>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index 0e6ccc8243db..f213b5c7f718 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -198,7 +198,7 @@  static void drm_sched_job_finish(struct work_struct *work)
 	 * manages to find this job as the next job in the list, the fence
 	 * signaled check below will prevent the timeout to be restarted.
 	 */
-	cancel_delayed_work_sync(&s_job->work_tdr);
+	cancel_delayed_work_sync(&sched->work_tdr);
 
 	spin_lock(&sched->job_list_lock);
 	/* queue TDR for next job */
@@ -207,7 +207,7 @@  static void drm_sched_job_finish(struct work_struct *work)
 	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
 	    !list_is_last(&s_job->node, &sched->ring_mirror_list)) {
 		if (!dma_fence_is_signaled(&next->s_fence->finished))
-			schedule_delayed_work(&next->work_tdr, sched->timeout);
+			schedule_delayed_work(&sched->work_tdr, sched->timeout);
 	}
 	/* remove job from ring_mirror_list */
 	list_del(&s_job->node);
@@ -237,7 +237,7 @@  static void drm_sched_job_begin(struct drm_sched_job *s_job)
 	if (list_first_entry_or_null(&sched->ring_mirror_list,
 				struct drm_sched_job, node) == s_job) {
 		if (sched->timeout != MAX_SCHEDULE_TIMEOUT)
-			schedule_delayed_work(&s_job->work_tdr, sched->timeout);
+			schedule_delayed_work(&sched->work_tdr, sched->timeout);
 		sched->curr_job = s_job;
 	}
 	spin_unlock(&sched->job_list_lock);
@@ -245,8 +245,10 @@  static void drm_sched_job_begin(struct drm_sched_job *s_job)
 
 static void drm_sched_job_timedout(struct work_struct *work)
 {
-	struct drm_sched_job *job = container_of(work, struct drm_sched_job,
-						 work_tdr.work);
+	struct drm_gpu_scheduler *sched = container_of(work,
+						struct drm_gpu_scheduler,
+						work_tdr.work);
+	struct drm_sched_job *job = sched->curr_job;
 
 	job->sched->ops->timedout_job(job);
 }
@@ -318,7 +320,7 @@  void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
 	s_job = list_first_entry_or_null(&sched->ring_mirror_list,
 					 struct drm_sched_job, node);
 	if (s_job && sched->timeout != MAX_SCHEDULE_TIMEOUT)
-		schedule_delayed_work(&s_job->work_tdr, sched->timeout);
+		schedule_delayed_work(&sched->work_tdr, sched->timeout);
 	if (s_job)
 		sched->curr_job = s_job;
 
@@ -389,7 +391,6 @@  int drm_sched_job_init(struct drm_sched_job *job,
 
 	INIT_WORK(&job->finish_work, drm_sched_job_finish);
 	INIT_LIST_HEAD(&job->node);
-	INIT_DELAYED_WORK(&job->work_tdr, drm_sched_job_timedout);
 
 	return 0;
 }
@@ -580,6 +581,7 @@  int drm_sched_init(struct drm_gpu_scheduler *sched,
 	INIT_LIST_HEAD(&sched->ring_mirror_list);
 	spin_lock_init(&sched->job_list_lock);
 	atomic_set(&sched->hw_rq_count, 0);
+	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
 	atomic_set(&sched->num_jobs, 0);
 	atomic64_set(&sched->job_id_count, 0);
 
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index 07e776b1ca42..9d50d7f3eaa4 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -175,8 +175,6 @@  struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
  *               finished to remove the job from the
  *               @drm_gpu_scheduler.ring_mirror_list.
  * @node: used to append this struct to the @drm_gpu_scheduler.ring_mirror_list.
- * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the timeout
- *            interval is over.
  * @id: a unique id assigned to each job scheduled on the scheduler.
  * @karma: increment on every hang caused by this job. If this exceeds the hang
  *         limit of the scheduler then the job is marked guilty and will not
@@ -195,7 +193,6 @@  struct drm_sched_job {
 	struct dma_fence_cb		finish_cb;
 	struct work_struct		finish_work;
 	struct list_head		node;
-	struct delayed_work		work_tdr;
 	uint64_t			id;
 	atomic_t			karma;
 	enum drm_sched_priority		s_priority;
@@ -260,6 +257,8 @@  struct drm_sched_backend_ops {
  *                 finished.
  * @hw_rq_count: the number of jobs currently in the hardware queue.
  * @job_id_count: used to assign unique id to the each job.
+ * @work_tdr: schedules a delayed call to @drm_sched_job_timedout after the
+ *            timeout interval is over.
  * @thread: the kthread on which the scheduler which run.
  * @ring_mirror_list: the list of jobs which are currently in the job queue.
  * @job_list_lock: lock to protect the ring_mirror_list.
@@ -280,6 +279,7 @@  struct drm_gpu_scheduler {
 	wait_queue_head_t		job_scheduled;
 	atomic_t			hw_rq_count;
 	atomic64_t			job_id_count;
+	struct delayed_work		work_tdr;
 	struct task_struct		*thread;
 	struct list_head		ring_mirror_list;
 	spinlock_t			job_list_lock;