diff mbox series

[3/3] drm/v3d: Add job timeout module param

Message ID 20200903164821.2879-4-i.can.speak.c.and.basic@gmail.com (mailing list archive)
State New, archived
Headers show
Series drm/v3d: CL/CSD job timeout fixes | expand

Commit Message

Yukimasa Sugizaki Sept. 3, 2020, 4:48 p.m. UTC
From: Yukimasa Sugizaki <ysugi@idein.jp>

The default timeout is 500 ms which is too short for some workloads
including Piglit.  Adding this parameter will help users to run heavier
tasks.

Signed-off-by: Yukimasa Sugizaki <ysugi@idein.jp>
---
 drivers/gpu/drm/v3d/v3d_sched.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

--
2.7.4

Comments

Chema Casanova Feb. 4, 2021, 6:09 p.m. UTC | #1
On 3/9/20 18:48, Yukimasa Sugizaki wrote:
> From: Yukimasa Sugizaki <ysugi@idein.jp>
>
> The default timeout is 500 ms which is too short for some workloads
> including Piglit.  Adding this parameter will help users to run heavier
> tasks.
>
> Signed-off-by: Yukimasa Sugizaki <ysugi@idein.jp>
> ---
>   drivers/gpu/drm/v3d/v3d_sched.c | 24 +++++++++++++-----------
>   1 file changed, 13 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> index feef0c749e68..983efb018560 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -19,11 +19,17 @@
>    */
>
>   #include <linux/kthread.h>
> +#include <linux/moduleparam.h>
>
>   #include "v3d_drv.h"
>   #include "v3d_regs.h"
>   #include "v3d_trace.h"
>
> +static uint timeout = 500;
> +module_param(timeout, uint, 0444);
> +MODULE_PARM_DESC(timeout,
> +	"Timeout for a job in ms (0 means infinity and default is 500 ms)");
> +

I think that  parameter definition could be included at v3d_drv.c as 
other drivers do. Then we would have all future parameters together. In 
that case we would need also to include the extern definition at 
v3d_drv.h for the driver variable.

The param name could be more descriptive like "sched_timeout_ms" in the 
lima driver.

I wonder if it would make sense to have different timeouts parameters 
for different type of jobs we have. At least this series come from the 
need additional time to complete compute jobs. This is what amdgpu does, 
but we probably don't need something such complex.

I think it would also make sense to increase our default value for the 
compute jobs.

What do you think?

In any case I think that having this general scheduling timeout 
parameter as other drivers do is a good idea.

Regards,

Chema Casanova

>   static struct v3d_job *
>   to_v3d_job(struct drm_sched_job *sched_job)
>   {
> @@ -432,13 +438,13 @@ v3d_sched_init(struct v3d_dev *v3d)
>   {
>   	int hw_jobs_limit = 1;
>   	int job_hang_limit = 0;
> -	int hang_limit_ms = 500;
> +	long timeout_jiffies = timeout ?
> +			msecs_to_jiffies(timeout) : MAX_SCHEDULE_TIMEOUT;
>   	int ret;
>
>   	ret = drm_sched_init(&v3d->queue[V3D_BIN].sched,
>   			     &v3d_bin_sched_ops,
> -			     hw_jobs_limit, job_hang_limit,
> -			     msecs_to_jiffies(hang_limit_ms),
> +			     hw_jobs_limit, job_hang_limit, timeout_jiffies,
>   			     "v3d_bin");
>   	if (ret) {
>   		dev_err(v3d->drm.dev, "Failed to create bin scheduler: %d.", ret);
> @@ -447,8 +453,7 @@ v3d_sched_init(struct v3d_dev *v3d)
>
>   	ret = drm_sched_init(&v3d->queue[V3D_RENDER].sched,
>   			     &v3d_render_sched_ops,
> -			     hw_jobs_limit, job_hang_limit,
> -			     msecs_to_jiffies(hang_limit_ms),
> +			     hw_jobs_limit, job_hang_limit, timeout_jiffies,
>   			     "v3d_render");
>   	if (ret) {
>   		dev_err(v3d->drm.dev, "Failed to create render scheduler: %d.",
> @@ -459,8 +464,7 @@ v3d_sched_init(struct v3d_dev *v3d)
>
>   	ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
>   			     &v3d_tfu_sched_ops,
> -			     hw_jobs_limit, job_hang_limit,
> -			     msecs_to_jiffies(hang_limit_ms),
> +			     hw_jobs_limit, job_hang_limit, timeout_jiffies,
>   			     "v3d_tfu");
>   	if (ret) {
>   		dev_err(v3d->drm.dev, "Failed to create TFU scheduler: %d.",
> @@ -472,8 +476,7 @@ v3d_sched_init(struct v3d_dev *v3d)
>   	if (v3d_has_csd(v3d)) {
>   		ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
>   				     &v3d_csd_sched_ops,
> -				     hw_jobs_limit, job_hang_limit,
> -				     msecs_to_jiffies(hang_limit_ms),
> +				     hw_jobs_limit, job_hang_limit, timeout_jiffies,
>   				     "v3d_csd");
>   		if (ret) {
>   			dev_err(v3d->drm.dev, "Failed to create CSD scheduler: %d.",
> @@ -484,8 +487,7 @@ v3d_sched_init(struct v3d_dev *v3d)
>
>   		ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
>   				     &v3d_cache_clean_sched_ops,
> -				     hw_jobs_limit, job_hang_limit,
> -				     msecs_to_jiffies(hang_limit_ms),
> +				     hw_jobs_limit, job_hang_limit, timeout_jiffies,
>   				     "v3d_cache_clean");
>   		if (ret) {
>   			dev_err(v3d->drm.dev, "Failed to create CACHE_CLEAN scheduler: %d.",
> --
> 2.7.4
>
> _______________________________________________
> dri-devel mailing list
> dri-devel@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
>
Eric Anholt Feb. 4, 2021, 7:34 p.m. UTC | #2
On Thu, Feb 4, 2021 at 10:09 AM Chema Casanova <jmcasanova@igalia.com> wrote:
>
> On 3/9/20 18:48, Yukimasa Sugizaki wrote:
> > From: Yukimasa Sugizaki <ysugi@idein.jp>
> >
> > The default timeout is 500 ms which is too short for some workloads
> > including Piglit.  Adding this parameter will help users to run heavier
> > tasks.
> >
> > Signed-off-by: Yukimasa Sugizaki <ysugi@idein.jp>
> > ---
> >   drivers/gpu/drm/v3d/v3d_sched.c | 24 +++++++++++++-----------
> >   1 file changed, 13 insertions(+), 11 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> > index feef0c749e68..983efb018560 100644
> > --- a/drivers/gpu/drm/v3d/v3d_sched.c
> > +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> > @@ -19,11 +19,17 @@
> >    */
> >
> >   #include <linux/kthread.h>
> > +#include <linux/moduleparam.h>
> >
> >   #include "v3d_drv.h"
> >   #include "v3d_regs.h"
> >   #include "v3d_trace.h"
> >
> > +static uint timeout = 500;
> > +module_param(timeout, uint, 0444);
> > +MODULE_PARM_DESC(timeout,
> > +     "Timeout for a job in ms (0 means infinity and default is 500 ms)");
> > +
>
> I think that  parameter definition could be included at v3d_drv.c as
> other drivers do. Then we would have all future parameters together. In
> that case we would need also to include the extern definition at
> v3d_drv.h for the driver variable.
>
> The param name could be more descriptive like "sched_timeout_ms" in the
> lima driver.
>
> I wonder if it would make sense to have different timeouts parameters
> for different type of jobs we have. At least this series come from the
> need additional time to complete compute jobs. This is what amdgpu does,
> but we probably don't need something such complex.
>
> I think it would also make sense to increase our default value for the
> compute jobs.
>
> What do you think?
>
> In any case I think that having this general scheduling timeout
> parameter as other drivers do is a good idea.

Having a param for being able to test if the job completes eventually
is a good idea, but if tests are triggering timeouts, then you
probably need to investigate what's going on in the driver -- it's not
like you want .5second unpreemptible jobs to be easy to produce.

Also, for CS, I wonder if we have another reg besides CSD_CURRENT_CFG4
we could be looking at for "we're making progress on the job".  Half a
second with no discernible progress sounds like a bug.
Yukimasa Sugizaki Feb. 5, 2021, 12:28 p.m. UTC | #3
On 05/02/2021, Eric Anholt <eric@anholt.net> wrote:
> On Thu, Feb 4, 2021 at 10:09 AM Chema Casanova <jmcasanova@igalia.com>
> wrote:
>>
>> On 3/9/20 18:48, Yukimasa Sugizaki wrote:
>> > From: Yukimasa Sugizaki <ysugi@idein.jp>
>> >
>> > The default timeout is 500 ms which is too short for some workloads
>> > including Piglit.  Adding this parameter will help users to run heavier
>> > tasks.
>> >
>> > Signed-off-by: Yukimasa Sugizaki <ysugi@idein.jp>
>> > ---
>> >   drivers/gpu/drm/v3d/v3d_sched.c | 24 +++++++++++++-----------
>> >   1 file changed, 13 insertions(+), 11 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/v3d/v3d_sched.c
>> > b/drivers/gpu/drm/v3d/v3d_sched.c
>> > index feef0c749e68..983efb018560 100644
>> > --- a/drivers/gpu/drm/v3d/v3d_sched.c
>> > +++ b/drivers/gpu/drm/v3d/v3d_sched.c
>> > @@ -19,11 +19,17 @@
>> >    */
>> >
>> >   #include <linux/kthread.h>
>> > +#include <linux/moduleparam.h>
>> >
>> >   #include "v3d_drv.h"
>> >   #include "v3d_regs.h"
>> >   #include "v3d_trace.h"
>> >
>> > +static uint timeout = 500;
>> > +module_param(timeout, uint, 0444);
>> > +MODULE_PARM_DESC(timeout,
>> > +     "Timeout for a job in ms (0 means infinity and default is 500
>> > ms)");
>> > +
>>
>> I think that  parameter definition could be included at v3d_drv.c as
>> other drivers do. Then we would have all future parameters together. In
>> that case we would need also to include the extern definition at
>> v3d_drv.h for the driver variable.
>>
>> The param name could be more descriptive like "sched_timeout_ms" in the
>> lima driver.
>>
>> I wonder if it would make sense to have different timeouts parameters
>> for different type of jobs we have. At least this series come from the
>> need additional time to complete compute jobs. This is what amdgpu does,
>> but we probably don't need something such complex.
>>
>> I think it would also make sense to increase our default value for the
>> compute jobs.
>>
>> What do you think?
>>
>> In any case I think that having this general scheduling timeout
>> parameter as other drivers do is a good idea.

I agree with your observations.
I'm going to add bin_timeout_ms, render_timeout_ms, tfu_timeout_ms,
v3d_timeout_ms, and cache_clean_timeout_ms parameters to v3d_drv.c
instead of the sole timeout parameter.
Though not sophisticated, this should be enough.

> Having a param for being able to test if the job completes eventually
> is a good idea, but if tests are triggering timeouts, then you
> probably need to investigate what's going on in the driver -- it's not
> like you want .5second unpreemptible jobs to be easy to produce.
>
> Also, for CS, I wonder if we have another reg besides CSD_CURRENT_CFG4
> we could be looking at for "we're making progress on the job".  Half a
> second with no discernible progress sounds like a bug.

If binning/render/TFU/cache_clean jobs keep running over 0.5 seconds,
then it should be a bug because they normally complete processing
within the period.
However, a CSD job can do anything.
For example, SaschaWillems's ray tracing example requires about 0.7
seconds to compose a frame with compute shader with Igalia's Vulkan
driver.
Another example is our matrix computation library for QPU:
https://github.com/Idein/qmkl6
It requires about 1.1 seconds to multiply two 2048x2048 matrices.
Because in general we do not know what is the expected behavior of a
QPU program and what is not, we also cannot detect a progress the
program is making.
This is why we want to have the timeout parameters to be able to be modified.

Regards,
Y. Sugizaki.
Chema Casanova Feb. 10, 2021, 5:59 p.m. UTC | #4
On 5/2/21 13:28, Yukimasa Sugizaki wrote:
> On 05/02/2021, Eric Anholt <eric@anholt.net> wrote:
>> On Thu, Feb 4, 2021 at 10:09 AM Chema Casanova <jmcasanova@igalia.com>
>> wrote:
>>> On 3/9/20 18:48, Yukimasa Sugizaki wrote:
>>>> From: Yukimasa Sugizaki <ysugi@idein.jp>
>>>>
>>>> The default timeout is 500 ms which is too short for some workloads
>>>> including Piglit.  Adding this parameter will help users to run heavier
>>>> tasks.
>>>>
>>>> Signed-off-by: Yukimasa Sugizaki <ysugi@idein.jp>
>>>> ---
>>>>    drivers/gpu/drm/v3d/v3d_sched.c | 24 +++++++++++++-----------
>>>>    1 file changed, 13 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c
>>>> b/drivers/gpu/drm/v3d/v3d_sched.c
>>>> index feef0c749e68..983efb018560 100644
>>>> --- a/drivers/gpu/drm/v3d/v3d_sched.c
>>>> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
>>>> @@ -19,11 +19,17 @@
>>>>     */
>>>>
>>>>    #include <linux/kthread.h>
>>>> +#include <linux/moduleparam.h>
>>>>
>>>>    #include "v3d_drv.h"
>>>>    #include "v3d_regs.h"
>>>>    #include "v3d_trace.h"
>>>>
>>>> +static uint timeout = 500;
>>>> +module_param(timeout, uint, 0444);
>>>> +MODULE_PARM_DESC(timeout,
>>>> +     "Timeout for a job in ms (0 means infinity and default is 500
>>>> ms)");
>>>> +
>>> I think that  parameter definition could be included at v3d_drv.c as
>>> other drivers do. Then we would have all future parameters together. In
>>> that case we would need also to include the extern definition at
>>> v3d_drv.h for the driver variable.
>>>
>>> The param name could be more descriptive like "sched_timeout_ms" in the
>>> lima driver.
>>>
>>> I wonder if it would make sense to have different timeouts parameters
>>> for different type of jobs we have. At least this series come from the
>>> need additional time to complete compute jobs. This is what amdgpu does,
>>> but we probably don't need something such complex.
>>>
>>> I think it would also make sense to increase our default value for the
>>> compute jobs.
>>>
>>> What do you think?
>>>
>>> In any case I think that having this general scheduling timeout
>>> parameter as other drivers do is a good idea.
> I agree with your observations.
> I'm going to add bin_timeout_ms, render_timeout_ms, tfu_timeout_ms,
> v3d_timeout_ms, and cache_clean_timeout_ms parameters to v3d_drv.c
> instead of the sole timeout parameter.
> Though not sophisticated, this should be enough.

>> Having a param for being able to test if the job completes eventually
>> is a good idea, but if tests are triggering timeouts, then you
>> probably need to investigate what's going on in the driver -- it's not
>> like you want .5second unpreemptible jobs to be easy to produce.
>>
>> Also, for CS, I wonder if we have another reg besides CSD_CURRENT_CFG4
>> we could be looking at for "we're making progress on the job".  Half a
>> second with no discernible progress sounds like a bug.
> If binning/render/TFU/cache_clean jobs keep running over 0.5 seconds,
> then it should be a bug because they normally complete processing
> within the period.
> However, a CSD job can do anything.
> For example, SaschaWillems's ray tracing example requires about 0.7
> seconds to compose a frame with compute shader with Igalia's Vulkan
> driver.

I've been checking the values of the registers with SaschaWillems 
computeraytracing on job submission, timeout evaluation and job destroy. 
The driver detects progress. CSD_CURRENT_CFG4 values are decreased as 
expected based in the number of pending batches. Next debug info starts 
with 262144 - 1, and we can see it ends with 0xfffffffff. So, I think 
that using CSD_CURRENT_CFG4 is working as expected. We could see also 
that CSD_CURRENT_ID0 and CSD_CURRENT_ID1 could also be used to track 
progress as we see how values are changing for X, Y and Z component of 
current workgroup ID. But comparisons on CSD_CURRENT_CFG4 are simpler.

10661.766782] [drm] CSD launched: timeout_batches (0x00000000) | cfg4 
(0x0003fff9) | id0 (0x00000080) | id1 (0x00000000) | status (0x00000056)
[10662.272345] [drm] CSD timeout check: timeout_batches (0x00000000) -> 
cfg4 (0x0002dfaf) | id0 (0x00050000) | id1 (0x00000024) | status 
(0x00000056)
[10662.792324] [drm] CSD timeout check: timeout_batches (0x0002dfaf) -> 
cfg4 (0x00018c1f) | id0 (0x003e0000) | id1 (0x0000004e) | status 
(0x00000056)
[10663.302350] [drm] CSD timeout check: timeout_batches (0x00018c1f) -> 
cfg4 (0x0000724f) | id0 (0x005b0000) | id1 (0x00000071) | status 
(0x00000056)
[10663.487139] [drm] CSD free: timeout_batches (0x0000724f) -> cfg4 
(0xffffffff) | id0 (0x00000000) | id1 (0x00010000) | status (0x00000060)

> Another example is our matrix computation library for QPU:
> https://github.com/Idein/qmkl6
> It requires about 1.1 seconds to multiply two 2048x2048 matrices.
> Because in general we do not know what is the expected behavior of a
> QPU program and what is not, we also cannot detect a progress the
> program is making.
> This is why we want to have the timeout parameters to be able to be modified.

In case of qmkl6, all jobs are launched with CSD_CURRENT_CFG4 is 
initialized as 7 (Number of batches minus 1) so when the job starts, all 
the batches are on execution. So, the HW cannot report any progress, and 
CFG4 reports that there are no pending batches queued with ~0.

This is the trace of v3d_submit_csd_ioctl calls showing CFG4.

      sscal-qmkl6-26222   [002] .... 11978.399780: v3d_submit_csd_ioctl: 
dev=1, CFG4 0x00000007, CFG5 0x00560000, CFG6 0x00280000
      sscal-qmkl6-26222   [002] .... 11978.518960: v3d_submit_csd_ioctl: 
dev=1, CFG4 0x00000007, CFG5 0x00560000, CFG6 0x00280000

Here is a example of the process finishing before reaching the timeout:

[10339.374376] [drm] CSD launched: timeout_batches (0x00000000) | cfg4 
(0x00000001) | id0 (0x00080800) | id1 (0x00000000) | status (0x00000b34)
[10339.470097] [drm] CSD free: timeout_batches (0x00000000) -> cfg4 
(0xffffffff) | id0 (0x00080800) | id1 (0x00000000) | status (0x00000b40)

Here is the example of a job that reaches the timeout and RESET:

[10339.980849] [drm] CSD launched: timeout_batches (0x00000000) | cfg4 
(0x00000000) | id0 (0x00080800) | id1 (0x00000000) | status (0x00000b44)
[10340.497731] [drm] CSD timeout: timeout_batches (0x00000000) -> cfg4 
(0xffffffff) | id0 (0x00080800) | id1 (0x00000000) | status (0x00000b44)
[10341.007815] [drm] CSD timeout RESET: timeout_batches (0xffffffff) | 
cfg4 (0xffffffff) | id0 (0x00080800) | id1 (0x00000000) | status 
(0x00000b44)
[10341.008143] v3d fec00000.v3d: [drm:v3d_reset [v3d]] *ERROR* Resetting 
GPU for hang.
[10341.008241] v3d fec00000.v3d: [drm:v3d_reset [v3d]] *ERROR* 
V3D_ERR_STAT: 0x00001000
[10341.008311] [drm] Skipping CSD job resubmission due to previous error 
(-125)
[10341.008375] [drm] CSD free: timeout_batches (0xffffffff) -> cfg4 
(0x00000000) | id0 (0x00000000) | id1 (0x00000000) | status (0x00000000)

In the general case we need at least to reach the timeout twice to reset 
the GPU.

Jobs start with 0 on timeout_batches, then the value is updated to 
0xffffffff and at next timeout evaluation there is no progress as all 
batches have already been submitted. Then we reset the GPU.

I don't know if the algorithms in qmkl6 or py-videocore6 cannot be 
launched using more batches. But if this is not the case adding the 
timeout module parameter seems to be our first solution. It would 
probably be enough adding a general timeout "timeout_ms" for debugging 
any kind of job and a specific "csd_timeout_ms" that only applies to CSD 
jobs and could be used by systems that need to execute this kind of 
jobs. When the CSD timeout is reached we could probably notify with a 
DRM_WARN suggesting increasing the csd_timeout.

Kind regards,

Chema Casanova
Yukimasa Sugizaki Feb. 11, 2021, 6:31 a.m. UTC | #5
On 11/02/2021, Chema Casanova <jmcasanova@igalia.com> wrote:
>
> On 5/2/21 13:28, Yukimasa Sugizaki wrote:
>> On 05/02/2021, Eric Anholt <eric@anholt.net> wrote:
>>> On Thu, Feb 4, 2021 at 10:09 AM Chema Casanova <jmcasanova@igalia.com>
>>> wrote:
>>>> On 3/9/20 18:48, Yukimasa Sugizaki wrote:
>>>>> From: Yukimasa Sugizaki <ysugi@idein.jp>
>>>>>
>>>>> The default timeout is 500 ms which is too short for some workloads
>>>>> including Piglit.  Adding this parameter will help users to run
>>>>> heavier
>>>>> tasks.
>>>>>
>>>>> Signed-off-by: Yukimasa Sugizaki <ysugi@idein.jp>
>>>>> ---
>>>>>    drivers/gpu/drm/v3d/v3d_sched.c | 24 +++++++++++++-----------
>>>>>    1 file changed, 13 insertions(+), 11 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c
>>>>> b/drivers/gpu/drm/v3d/v3d_sched.c
>>>>> index feef0c749e68..983efb018560 100644
>>>>> --- a/drivers/gpu/drm/v3d/v3d_sched.c
>>>>> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
>>>>> @@ -19,11 +19,17 @@
>>>>>     */
>>>>>
>>>>>    #include <linux/kthread.h>
>>>>> +#include <linux/moduleparam.h>
>>>>>
>>>>>    #include "v3d_drv.h"
>>>>>    #include "v3d_regs.h"
>>>>>    #include "v3d_trace.h"
>>>>>
>>>>> +static uint timeout = 500;
>>>>> +module_param(timeout, uint, 0444);
>>>>> +MODULE_PARM_DESC(timeout,
>>>>> +     "Timeout for a job in ms (0 means infinity and default is 500
>>>>> ms)");
>>>>> +
>>>> I think that  parameter definition could be included at v3d_drv.c as
>>>> other drivers do. Then we would have all future parameters together. In
>>>> that case we would need also to include the extern definition at
>>>> v3d_drv.h for the driver variable.
>>>>
>>>> The param name could be more descriptive like "sched_timeout_ms" in the
>>>> lima driver.
>>>>
>>>> I wonder if it would make sense to have different timeouts parameters
>>>> for different type of jobs we have. At least this series come from the
>>>> need additional time to complete compute jobs. This is what amdgpu
>>>> does,
>>>> but we probably don't need something such complex.
>>>>
>>>> I think it would also make sense to increase our default value for the
>>>> compute jobs.
>>>>
>>>> What do you think?
>>>>
>>>> In any case I think that having this general scheduling timeout
>>>> parameter as other drivers do is a good idea.
>> I agree with your observations.
>> I'm going to add bin_timeout_ms, render_timeout_ms, tfu_timeout_ms,
>> v3d_timeout_ms, and cache_clean_timeout_ms parameters to v3d_drv.c
>> instead of the sole timeout parameter.
>> Though not sophisticated, this should be enough.
>
>>> Having a param for being able to test if the job completes eventually
>>> is a good idea, but if tests are triggering timeouts, then you
>>> probably need to investigate what's going on in the driver -- it's not
>>> like you want .5second unpreemptible jobs to be easy to produce.
>>>
>>> Also, for CS, I wonder if we have another reg besides CSD_CURRENT_CFG4
>>> we could be looking at for "we're making progress on the job".  Half a
>>> second with no discernible progress sounds like a bug.
>> If binning/render/TFU/cache_clean jobs keep running over 0.5 seconds,
>> then it should be a bug because they normally complete processing
>> within the period.
>> However, a CSD job can do anything.
>> For example, SaschaWillems's ray tracing example requires about 0.7
>> seconds to compose a frame with compute shader with Igalia's Vulkan
>> driver.
>
> I've been checking the values of the registers with SaschaWillems
> computeraytracing on job submission, timeout evaluation and job destroy.
> The driver detects progress. CSD_CURRENT_CFG4 values are decreased as
> expected based in the number of pending batches. Next debug info starts
> with 262144 - 1, and we can see it ends with 0xfffffffff. So, I think
> that using CSD_CURRENT_CFG4 is working as expected. We could see also
> that CSD_CURRENT_ID0 and CSD_CURRENT_ID1 could also be used to track
> progress as we see how values are changing for X, Y and Z component of
> current workgroup ID. But comparisons on CSD_CURRENT_CFG4 are simpler.
>
> 10661.766782] [drm] CSD launched: timeout_batches (0x00000000) | cfg4
> (0x0003fff9) | id0 (0x00000080) | id1 (0x00000000) | status (0x00000056)
> [10662.272345] [drm] CSD timeout check: timeout_batches (0x00000000) ->
> cfg4 (0x0002dfaf) | id0 (0x00050000) | id1 (0x00000024) | status
> (0x00000056)
> [10662.792324] [drm] CSD timeout check: timeout_batches (0x0002dfaf) ->
> cfg4 (0x00018c1f) | id0 (0x003e0000) | id1 (0x0000004e) | status
> (0x00000056)
> [10663.302350] [drm] CSD timeout check: timeout_batches (0x00018c1f) ->
> cfg4 (0x0000724f) | id0 (0x005b0000) | id1 (0x00000071) | status
> (0x00000056)
> [10663.487139] [drm] CSD free: timeout_batches (0x0000724f) -> cfg4
> (0xffffffff) | id0 (0x00000000) | id1 (0x00010000) | status (0x00000060)
>
>> Another example is our matrix computation library for QPU:
>> https://github.com/Idein/qmkl6
>> It requires about 1.1 seconds to multiply two 2048x2048 matrices.
>> Because in general we do not know what is the expected behavior of a
>> QPU program and what is not, we also cannot detect a progress the
>> program is making.
>> This is why we want to have the timeout parameters to be able to be
>> modified.
>
> In case of qmkl6, all jobs are launched with CSD_CURRENT_CFG4 is
> initialized as 7 (Number of batches minus 1) so when the job starts, all
> the batches are on execution. So, the HW cannot report any progress, and
> CFG4 reports that there are no pending batches queued with ~0.
>
> This is the trace of v3d_submit_csd_ioctl calls showing CFG4.
>
>       sscal-qmkl6-26222   [002] .... 11978.399780: v3d_submit_csd_ioctl:
> dev=1, CFG4 0x00000007, CFG5 0x00560000, CFG6 0x00280000
>       sscal-qmkl6-26222   [002] .... 11978.518960: v3d_submit_csd_ioctl:
> dev=1, CFG4 0x00000007, CFG5 0x00560000, CFG6 0x00280000
>
> Here is a example of the process finishing before reaching the timeout:
>
> [10339.374376] [drm] CSD launched: timeout_batches (0x00000000) | cfg4
> (0x00000001) | id0 (0x00080800) | id1 (0x00000000) | status (0x00000b34)
> [10339.470097] [drm] CSD free: timeout_batches (0x00000000) -> cfg4
> (0xffffffff) | id0 (0x00080800) | id1 (0x00000000) | status (0x00000b40)
>
> Here is the example of a job that reaches the timeout and RESET:
>
> [10339.980849] [drm] CSD launched: timeout_batches (0x00000000) | cfg4
> (0x00000000) | id0 (0x00080800) | id1 (0x00000000) | status (0x00000b44)
> [10340.497731] [drm] CSD timeout: timeout_batches (0x00000000) -> cfg4
> (0xffffffff) | id0 (0x00080800) | id1 (0x00000000) | status (0x00000b44)
> [10341.007815] [drm] CSD timeout RESET: timeout_batches (0xffffffff) |
> cfg4 (0xffffffff) | id0 (0x00080800) | id1 (0x00000000) | status
> (0x00000b44)
> [10341.008143] v3d fec00000.v3d: [drm:v3d_reset [v3d]] *ERROR* Resetting
> GPU for hang.
> [10341.008241] v3d fec00000.v3d: [drm:v3d_reset [v3d]] *ERROR*
> V3D_ERR_STAT: 0x00001000
> [10341.008311] [drm] Skipping CSD job resubmission due to previous error
> (-125)
> [10341.008375] [drm] CSD free: timeout_batches (0xffffffff) -> cfg4
> (0x00000000) | id0 (0x00000000) | id1 (0x00000000) | status (0x00000000)
>
> In the general case we need at least to reach the timeout twice to reset
> the GPU.
>
> Jobs start with 0 on timeout_batches, then the value is updated to
> 0xffffffff and at next timeout evaluation there is no progress as all
> batches have already been submitted. Then we reset the GPU.
>
> I don't know if the algorithms in qmkl6 or py-videocore6 cannot be
> launched using more batches. But if this is not the case adding the
> timeout module parameter seems to be our first solution. It would
> probably be enough adding a general timeout "timeout_ms" for debugging
> any kind of job and a specific "csd_timeout_ms" that only applies to CSD
> jobs and could be used by systems that need to execute this kind of
> jobs. When the CSD timeout is reached we could probably notify with a
> DRM_WARN suggesting increasing the csd_timeout.
>
> Kind regards,
>
> Chema Casanova

Yes, our CSD codes perform computation with just eight batches (one
batch per QPU) for two reasons.
One is for efficiency and the other is that we cannot always divide
the problem to run within 500 milliseconds.
So we too think it makes sense to add the timeout parameters.

Thank you for the suggestion on the parameters.
Though I've completed implementing them, we are facing an issue where
very occasionally v3d_irq() is called by a finished CSD job during the
reset by v3d_gpu_reset_for_timeout().
In this situation, v3d_irq() reads V3D_CTL_INT_STS as 0xdeadbeef (no
response), and the remaining operations cause a kernel panic.
I suspect this can be fixed by holding v3d->reset_lock in v3d_irq(),
but we've not completed testing it yet.
I appreciate your patience.

Regards,
Y. Sugizaki.
diff mbox series

Patch

diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
index feef0c749e68..983efb018560 100644
--- a/drivers/gpu/drm/v3d/v3d_sched.c
+++ b/drivers/gpu/drm/v3d/v3d_sched.c
@@ -19,11 +19,17 @@ 
  */

 #include <linux/kthread.h>
+#include <linux/moduleparam.h>

 #include "v3d_drv.h"
 #include "v3d_regs.h"
 #include "v3d_trace.h"

+static uint timeout = 500;
+module_param(timeout, uint, 0444);
+MODULE_PARM_DESC(timeout,
+	"Timeout for a job in ms (0 means infinity and default is 500 ms)");
+
 static struct v3d_job *
 to_v3d_job(struct drm_sched_job *sched_job)
 {
@@ -432,13 +438,13 @@  v3d_sched_init(struct v3d_dev *v3d)
 {
 	int hw_jobs_limit = 1;
 	int job_hang_limit = 0;
-	int hang_limit_ms = 500;
+	long timeout_jiffies = timeout ?
+			msecs_to_jiffies(timeout) : MAX_SCHEDULE_TIMEOUT;
 	int ret;

 	ret = drm_sched_init(&v3d->queue[V3D_BIN].sched,
 			     &v3d_bin_sched_ops,
-			     hw_jobs_limit, job_hang_limit,
-			     msecs_to_jiffies(hang_limit_ms),
+			     hw_jobs_limit, job_hang_limit, timeout_jiffies,
 			     "v3d_bin");
 	if (ret) {
 		dev_err(v3d->drm.dev, "Failed to create bin scheduler: %d.", ret);
@@ -447,8 +453,7 @@  v3d_sched_init(struct v3d_dev *v3d)

 	ret = drm_sched_init(&v3d->queue[V3D_RENDER].sched,
 			     &v3d_render_sched_ops,
-			     hw_jobs_limit, job_hang_limit,
-			     msecs_to_jiffies(hang_limit_ms),
+			     hw_jobs_limit, job_hang_limit, timeout_jiffies,
 			     "v3d_render");
 	if (ret) {
 		dev_err(v3d->drm.dev, "Failed to create render scheduler: %d.",
@@ -459,8 +464,7 @@  v3d_sched_init(struct v3d_dev *v3d)

 	ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
 			     &v3d_tfu_sched_ops,
-			     hw_jobs_limit, job_hang_limit,
-			     msecs_to_jiffies(hang_limit_ms),
+			     hw_jobs_limit, job_hang_limit, timeout_jiffies,
 			     "v3d_tfu");
 	if (ret) {
 		dev_err(v3d->drm.dev, "Failed to create TFU scheduler: %d.",
@@ -472,8 +476,7 @@  v3d_sched_init(struct v3d_dev *v3d)
 	if (v3d_has_csd(v3d)) {
 		ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
 				     &v3d_csd_sched_ops,
-				     hw_jobs_limit, job_hang_limit,
-				     msecs_to_jiffies(hang_limit_ms),
+				     hw_jobs_limit, job_hang_limit, timeout_jiffies,
 				     "v3d_csd");
 		if (ret) {
 			dev_err(v3d->drm.dev, "Failed to create CSD scheduler: %d.",
@@ -484,8 +487,7 @@  v3d_sched_init(struct v3d_dev *v3d)

 		ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
 				     &v3d_cache_clean_sched_ops,
-				     hw_jobs_limit, job_hang_limit,
-				     msecs_to_jiffies(hang_limit_ms),
+				     hw_jobs_limit, job_hang_limit, timeout_jiffies,
 				     "v3d_cache_clean");
 		if (ret) {
 			dev_err(v3d->drm.dev, "Failed to create CACHE_CLEAN scheduler: %d.",