Message ID | 20240826122541.85663-1-christian.koenig@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [1/4] drm/sched: add optional errno to drm_sched_start() | expand |
On Mon, 2024-08-26 at 14:25 +0200, Christian König wrote: > The current implementation of drm_sched_start uses a hardcoded > -ECANCELED to dispose of a job when the parent/hw fence is NULL. > This results in drm_sched_job_done being called with -ECANCELED for > each job with a NULL parent in the pending list, making it difficult > to distinguish between recovery methods, whether a queue reset or a > full GPU reset was used. > > To improve this, we first try a soft recovery for timeout jobs and > use the error code -ENODATA. If soft recovery fails, we proceed with > a queue reset, where the error code remains -ENODATA for the job. > Finally, for a full GPU reset, we use error codes -ECANCELED or > -ETIME. This patch adds an error code parameter to drm_sched_start, > allowing us to differentiate between queue reset and GPU reset > failures. This enables user mode and test applications to validate > the expected correctness of the requested operation. After a > successful queue reset, the only way to continue normal operation is > to call drm_sched_job_done with the specific error code -ENODATA. > > v1: Initial implementation by Jesse utilized > amdgpu_device_lock_reset_domain > and amdgpu_device_unlock_reset_domain to allow user mode to track > the queue reset status and distinguish between queue reset and > GPU reset. > v2: Christian suggested using the error codes -ENODATA for queue > reset > and -ECANCELED or -ETIME for GPU reset, returned to > amdgpu_cs_wait_ioctl. > v3: To meet the requirements, we introduce a new function > drm_sched_start_ex with an additional parameter to set > dma_fence_set_error, allowing us to handle the specific error > codes appropriately and dispose of bad jobs with the selected > error code depending on whether it was a queue reset or GPU > reset. > v4: Alex suggested using a new name, > drm_sched_start_with_recovery_error, > which more accurately describes the function's purpose. 
> Additionally, it was recommended to add documentation details > about the new method. > v5: Fixed declaration of new function > drm_sched_start_with_recovery_error.(Alex) > v6 (chk): rebase on upstream changes, cleanup the commit message, > drop the new function again and update all callers, > apply the errno also to scheduler fences with hw fences > > Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com> > Signed-off-by: Vitaly Prosyak <vitaly.prosyak@amd.com> > Signed-off-by: Christian König <christian.koenig@amd.com> > Cc: Alex Deucher <alexander.deucher@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- > drivers/gpu/drm/etnaviv/etnaviv_sched.c | 2 +- > drivers/gpu/drm/imagination/pvr_queue.c | 4 ++-- > drivers/gpu/drm/lima/lima_sched.c | 2 +- > drivers/gpu/drm/nouveau/nouveau_sched.c | 2 +- > drivers/gpu/drm/panfrost/panfrost_job.c | 2 +- > drivers/gpu/drm/panthor/panthor_mmu.c | 2 +- > drivers/gpu/drm/scheduler/sched_main.c | 7 ++++--- > drivers/gpu/drm/v3d/v3d_sched.c | 2 +- > include/drm/gpu_scheduler.h | 2 +- > 11 files changed, 16 insertions(+), 15 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > index 2320df51c914..18135d8235f9 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > @@ -300,7 +300,7 @@ static int > suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool sus > if (r) > goto out; > } else { > - drm_sched_start(&ring->sched); > + drm_sched_start(&ring->sched, 0); > } > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 1cd7d355689c..5891312e44ea 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -5879,7 +5879,7 @@ int amdgpu_device_gpu_recover(struct > amdgpu_device 
*adev, > if (!amdgpu_ring_sched_ready(ring)) > continue; > > - drm_sched_start(&ring->sched); > + drm_sched_start(&ring->sched, 0); > } > > if > (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && > !job_signaled) > @@ -6374,7 +6374,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev) > if (!amdgpu_ring_sched_ready(ring)) > continue; > > - drm_sched_start(&ring->sched); > + drm_sched_start(&ring->sched, 0); > } > > amdgpu_device_unset_mp1_state(adev); > diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c > b/drivers/gpu/drm/etnaviv/etnaviv_sched.c > index ab9ca4824b62..23ced5896c7c 100644 > --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c > +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c > @@ -72,7 +72,7 @@ static enum drm_gpu_sched_stat > etnaviv_sched_timedout_job(struct drm_sched_job > > drm_sched_resubmit_jobs(&gpu->sched); > > - drm_sched_start(&gpu->sched); > + drm_sched_start(&gpu->sched, 0); > return DRM_GPU_SCHED_STAT_NOMINAL; > > out_no_timeout: > diff --git a/drivers/gpu/drm/imagination/pvr_queue.c > b/drivers/gpu/drm/imagination/pvr_queue.c > index 20cb46012082..c4f08432882b 100644 > --- a/drivers/gpu/drm/imagination/pvr_queue.c > +++ b/drivers/gpu/drm/imagination/pvr_queue.c > @@ -782,7 +782,7 @@ static void pvr_queue_start(struct pvr_queue > *queue) > } > } > > - drm_sched_start(&queue->scheduler); > + drm_sched_start(&queue->scheduler, 0); > } > > /** > @@ -842,7 +842,7 @@ pvr_queue_timedout_job(struct drm_sched_job > *s_job) > } > mutex_unlock(&pvr_dev->queues.lock); > > - drm_sched_start(sched); > + drm_sched_start(sched, 0); > > return DRM_GPU_SCHED_STAT_NOMINAL; > } > diff --git a/drivers/gpu/drm/lima/lima_sched.c > b/drivers/gpu/drm/lima/lima_sched.c > index 1a944edb6ddc..b40c90e97d7e 100644 > --- a/drivers/gpu/drm/lima/lima_sched.c > +++ b/drivers/gpu/drm/lima/lima_sched.c > @@ -463,7 +463,7 @@ static enum drm_gpu_sched_stat > lima_sched_timedout_job(struct drm_sched_job *job > lima_pm_idle(ldev); > > drm_sched_resubmit_jobs(&pipe->base); > - 
drm_sched_start(&pipe->base); > + drm_sched_start(&pipe->base, 0); > > return DRM_GPU_SCHED_STAT_NOMINAL; > } > diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c > b/drivers/gpu/drm/nouveau/nouveau_sched.c > index eb6c3f9a01f5..4412f2711fb5 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_sched.c > +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c > @@ -379,7 +379,7 @@ nouveau_sched_timedout_job(struct drm_sched_job > *sched_job) > else > NV_PRINTK(warn, job->cli, "Generic job timeout.\n"); > > - drm_sched_start(sched); > + drm_sched_start(sched, 0); > > return stat; > } > diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c > b/drivers/gpu/drm/panfrost/panfrost_job.c > index df49d37d0e7e..d140800606bf 100644 > --- a/drivers/gpu/drm/panfrost/panfrost_job.c > +++ b/drivers/gpu/drm/panfrost/panfrost_job.c > @@ -727,7 +727,7 @@ panfrost_reset(struct panfrost_device *pfdev, > > /* Restart the schedulers */ > for (i = 0; i < NUM_JOB_SLOTS; i++) > - drm_sched_start(&pfdev->js->queue[i].sched); > + drm_sched_start(&pfdev->js->queue[i].sched, 0); > > /* Re-enable job interrupts now that everything has been > restarted. 
*/ > job_write(pfdev, JOB_INT_MASK, > diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c > b/drivers/gpu/drm/panthor/panthor_mmu.c > index d47972806d50..e630cdf47f99 100644 > --- a/drivers/gpu/drm/panthor/panthor_mmu.c > +++ b/drivers/gpu/drm/panthor/panthor_mmu.c > @@ -827,7 +827,7 @@ static void panthor_vm_stop(struct panthor_vm > *vm) > > static void panthor_vm_start(struct panthor_vm *vm) > { > - drm_sched_start(&vm->sched); > + drm_sched_start(&vm->sched, 0); > } > > /** > diff --git a/drivers/gpu/drm/scheduler/sched_main.c > b/drivers/gpu/drm/scheduler/sched_main.c > index ab53ab486fe6..f093616fe53c 100644 > --- a/drivers/gpu/drm/scheduler/sched_main.c > +++ b/drivers/gpu/drm/scheduler/sched_main.c > @@ -674,9 +674,10 @@ EXPORT_SYMBOL(drm_sched_stop); > * drm_sched_start - recover jobs after a reset > * > * @sched: scheduler instance > + * @errno: error to set on the pending fences > * > */ > -void drm_sched_start(struct drm_gpu_scheduler *sched) > +void drm_sched_start(struct drm_gpu_scheduler *sched, int errno) > { > struct drm_sched_job *s_job, *tmp; > > @@ -691,13 +692,13 @@ void drm_sched_start(struct drm_gpu_scheduler > *sched) > atomic_add(s_job->credits, &sched->credit_count); > > if (!fence) { > - drm_sched_job_done(s_job, -ECANCELED); > + drm_sched_job_done(s_job, errno ?: - > ECANCELED); > continue; > } > > if (dma_fence_add_callback(fence, &s_job->cb, > drm_sched_job_done_cb)) > - drm_sched_job_done(s_job, fence->error); > + drm_sched_job_done(s_job, fence->error ?: > errno); > } > > drm_sched_start_timeout_unlocked(sched); > diff --git a/drivers/gpu/drm/v3d/v3d_sched.c > b/drivers/gpu/drm/v3d/v3d_sched.c > index fd29a00b233c..b6a89171824b 100644 > --- a/drivers/gpu/drm/v3d/v3d_sched.c > +++ b/drivers/gpu/drm/v3d/v3d_sched.c > @@ -663,7 +663,7 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, > struct drm_sched_job *sched_job) > > /* Unblock schedulers and restart their jobs. 
*/ > for (q = 0; q < V3D_MAX_QUEUES; q++) { > - drm_sched_start(&v3d->queue[q].sched); > + drm_sched_start(&v3d->queue[q].sched, 0); > } > > mutex_unlock(&v3d->reset_lock); > diff --git a/include/drm/gpu_scheduler.h > b/include/drm/gpu_scheduler.h > index fe8edb917360..a8d19b10f9b8 100644 > --- a/include/drm/gpu_scheduler.h > +++ b/include/drm/gpu_scheduler.h > @@ -579,7 +579,7 @@ bool drm_sched_wqueue_ready(struct > drm_gpu_scheduler *sched); > void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched); > void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched); > void drm_sched_stop(struct drm_gpu_scheduler *sched, struct > drm_sched_job *bad); > -void drm_sched_start(struct drm_gpu_scheduler *sched); > +void drm_sched_start(struct drm_gpu_scheduler *sched, int errno); I personally only recently started using the scheduler and many things were quite confusing. In particular, I thought it's not good to call this function drm_sched_start(), because that implies to new users / programmers that this function is supposed to start the scheduler. Accordingly, drm_sched_stop() sounds as if it's intended to be used on scheduler teardown. "Alright, I'll stop the scheduler with drm_sched_stop(), then can safely call drm_sched_entity_destroy() and then tear it down completely through drm_sched_fini()". Now the comments make it obvious that those start and stop functions are more intended for error recovery. So my stance is that start should be called, e.g., drm_sched_restart() or perhaps drm_sched_recover_start(). So since you're already touching all the lines where the function is used, this might be a good opportunity to improve the name, too. If that's deemed desirable. P. > void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched); > void drm_sched_increase_karma(struct drm_sched_job *bad); > void drm_sched_reset_karma(struct drm_sched_job *bad);
On Mon, Aug 26, 2024 at 8:35 AM Christian König <ckoenig.leichtzumerken@gmail.com> wrote: > > The current implementation of drm_sched_start uses a hardcoded > -ECANCELED to dispose of a job when the parent/hw fence is NULL. > This results in drm_sched_job_done being called with -ECANCELED for > each job with a NULL parent in the pending list, making it difficult > to distinguish between recovery methods, whether a queue reset or a > full GPU reset was used. > > To improve this, we first try a soft recovery for timeout jobs and > use the error code -ENODATA. If soft recovery fails, we proceed with > a queue reset, where the error code remains -ENODATA for the job. > Finally, for a full GPU reset, we use error codes -ECANCELED or > -ETIME. This patch adds an error code parameter to drm_sched_start, > allowing us to differentiate between queue reset and GPU reset > failures. This enables user mode and test applications to validate > the expected correctness of the requested operation. After a > successful queue reset, the only way to continue normal operation is > to call drm_sched_job_done with the specific error code -ENODATA. > > v1: Initial implementation by Jesse utilized amdgpu_device_lock_reset_domain > and amdgpu_device_unlock_reset_domain to allow user mode to track > the queue reset status and distinguish between queue reset and > GPU reset. > v2: Christian suggested using the error codes -ENODATA for queue reset > and -ECANCELED or -ETIME for GPU reset, returned to > amdgpu_cs_wait_ioctl. > v3: To meet the requirements, we introduce a new function > drm_sched_start_ex with an additional parameter to set > dma_fence_set_error, allowing us to handle the specific error > codes appropriately and dispose of bad jobs with the selected > error code depending on whether it was a queue reset or GPU reset. > v4: Alex suggested using a new name, drm_sched_start_with_recovery_error, > which more accurately describes the function's purpose. 
> Additionally, it was recommended to add documentation details > about the new method. > v5: Fixed declaration of new function drm_sched_start_with_recovery_error.(Alex) > v6 (chk): rebase on upstream changes, cleanup the commit message, > drop the new function again and update all callers, > apply the errno also to scheduler fences with hw fences > > Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com> > Signed-off-by: Vitaly Prosyak <vitaly.prosyak@amd.com> > Signed-off-by: Christian König <christian.koenig@amd.com> > Cc: Alex Deucher <alexander.deucher@amd.com> Series is: Reviewed-by: Alex Deucher <alexander.deucher@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- > drivers/gpu/drm/etnaviv/etnaviv_sched.c | 2 +- > drivers/gpu/drm/imagination/pvr_queue.c | 4 ++-- > drivers/gpu/drm/lima/lima_sched.c | 2 +- > drivers/gpu/drm/nouveau/nouveau_sched.c | 2 +- > drivers/gpu/drm/panfrost/panfrost_job.c | 2 +- > drivers/gpu/drm/panthor/panthor_mmu.c | 2 +- > drivers/gpu/drm/scheduler/sched_main.c | 7 ++++--- > drivers/gpu/drm/v3d/v3d_sched.c | 2 +- > include/drm/gpu_scheduler.h | 2 +- > 11 files changed, 16 insertions(+), 15 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > index 2320df51c914..18135d8235f9 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > @@ -300,7 +300,7 @@ static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool sus > if (r) > goto out; > } else { > - drm_sched_start(&ring->sched); > + drm_sched_start(&ring->sched, 0); > } > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 1cd7d355689c..5891312e44ea 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -5879,7 +5879,7 
@@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > if (!amdgpu_ring_sched_ready(ring)) > continue; > > - drm_sched_start(&ring->sched); > + drm_sched_start(&ring->sched, 0); > } > > if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) > @@ -6374,7 +6374,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev) > if (!amdgpu_ring_sched_ready(ring)) > continue; > > - drm_sched_start(&ring->sched); > + drm_sched_start(&ring->sched, 0); > } > > amdgpu_device_unset_mp1_state(adev); > diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c > index ab9ca4824b62..23ced5896c7c 100644 > --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c > +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c > @@ -72,7 +72,7 @@ static enum drm_gpu_sched_stat etnaviv_sched_timedout_job(struct drm_sched_job > > drm_sched_resubmit_jobs(&gpu->sched); > > - drm_sched_start(&gpu->sched); > + drm_sched_start(&gpu->sched, 0); > return DRM_GPU_SCHED_STAT_NOMINAL; > > out_no_timeout: > diff --git a/drivers/gpu/drm/imagination/pvr_queue.c b/drivers/gpu/drm/imagination/pvr_queue.c > index 20cb46012082..c4f08432882b 100644 > --- a/drivers/gpu/drm/imagination/pvr_queue.c > +++ b/drivers/gpu/drm/imagination/pvr_queue.c > @@ -782,7 +782,7 @@ static void pvr_queue_start(struct pvr_queue *queue) > } > } > > - drm_sched_start(&queue->scheduler); > + drm_sched_start(&queue->scheduler, 0); > } > > /** > @@ -842,7 +842,7 @@ pvr_queue_timedout_job(struct drm_sched_job *s_job) > } > mutex_unlock(&pvr_dev->queues.lock); > > - drm_sched_start(sched); > + drm_sched_start(sched, 0); > > return DRM_GPU_SCHED_STAT_NOMINAL; > } > diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c > index 1a944edb6ddc..b40c90e97d7e 100644 > --- a/drivers/gpu/drm/lima/lima_sched.c > +++ b/drivers/gpu/drm/lima/lima_sched.c > @@ -463,7 +463,7 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job > lima_pm_idle(ldev); > > 
drm_sched_resubmit_jobs(&pipe->base); > - drm_sched_start(&pipe->base); > + drm_sched_start(&pipe->base, 0); > > return DRM_GPU_SCHED_STAT_NOMINAL; > } > diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c > index eb6c3f9a01f5..4412f2711fb5 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_sched.c > +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c > @@ -379,7 +379,7 @@ nouveau_sched_timedout_job(struct drm_sched_job *sched_job) > else > NV_PRINTK(warn, job->cli, "Generic job timeout.\n"); > > - drm_sched_start(sched); > + drm_sched_start(sched, 0); > > return stat; > } > diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c > index df49d37d0e7e..d140800606bf 100644 > --- a/drivers/gpu/drm/panfrost/panfrost_job.c > +++ b/drivers/gpu/drm/panfrost/panfrost_job.c > @@ -727,7 +727,7 @@ panfrost_reset(struct panfrost_device *pfdev, > > /* Restart the schedulers */ > for (i = 0; i < NUM_JOB_SLOTS; i++) > - drm_sched_start(&pfdev->js->queue[i].sched); > + drm_sched_start(&pfdev->js->queue[i].sched, 0); > > /* Re-enable job interrupts now that everything has been restarted. 
*/ > job_write(pfdev, JOB_INT_MASK, > diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c > index d47972806d50..e630cdf47f99 100644 > --- a/drivers/gpu/drm/panthor/panthor_mmu.c > +++ b/drivers/gpu/drm/panthor/panthor_mmu.c > @@ -827,7 +827,7 @@ static void panthor_vm_stop(struct panthor_vm *vm) > > static void panthor_vm_start(struct panthor_vm *vm) > { > - drm_sched_start(&vm->sched); > + drm_sched_start(&vm->sched, 0); > } > > /** > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c > index ab53ab486fe6..f093616fe53c 100644 > --- a/drivers/gpu/drm/scheduler/sched_main.c > +++ b/drivers/gpu/drm/scheduler/sched_main.c > @@ -674,9 +674,10 @@ EXPORT_SYMBOL(drm_sched_stop); > * drm_sched_start - recover jobs after a reset > * > * @sched: scheduler instance > + * @errno: error to set on the pending fences > * > */ > -void drm_sched_start(struct drm_gpu_scheduler *sched) > +void drm_sched_start(struct drm_gpu_scheduler *sched, int errno) > { > struct drm_sched_job *s_job, *tmp; > > @@ -691,13 +692,13 @@ void drm_sched_start(struct drm_gpu_scheduler *sched) > atomic_add(s_job->credits, &sched->credit_count); > > if (!fence) { > - drm_sched_job_done(s_job, -ECANCELED); > + drm_sched_job_done(s_job, errno ?: -ECANCELED); > continue; > } > > if (dma_fence_add_callback(fence, &s_job->cb, > drm_sched_job_done_cb)) > - drm_sched_job_done(s_job, fence->error); > + drm_sched_job_done(s_job, fence->error ?: errno); > } > > drm_sched_start_timeout_unlocked(sched); > diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c > index fd29a00b233c..b6a89171824b 100644 > --- a/drivers/gpu/drm/v3d/v3d_sched.c > +++ b/drivers/gpu/drm/v3d/v3d_sched.c > @@ -663,7 +663,7 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job) > > /* Unblock schedulers and restart their jobs. 
*/ > for (q = 0; q < V3D_MAX_QUEUES; q++) { > - drm_sched_start(&v3d->queue[q].sched); > + drm_sched_start(&v3d->queue[q].sched, 0); > } > > mutex_unlock(&v3d->reset_lock); > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h > index fe8edb917360..a8d19b10f9b8 100644 > --- a/include/drm/gpu_scheduler.h > +++ b/include/drm/gpu_scheduler.h > @@ -579,7 +579,7 @@ bool drm_sched_wqueue_ready(struct drm_gpu_scheduler *sched); > void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched); > void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched); > void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad); > -void drm_sched_start(struct drm_gpu_scheduler *sched); > +void drm_sched_start(struct drm_gpu_scheduler *sched, int errno); > void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched); > void drm_sched_increase_karma(struct drm_sched_job *bad); > void drm_sched_reset_karma(struct drm_sched_job *bad); > -- > 2.34.1 >
Hi Christian, kernel test robot noticed the following build errors: [auto build test ERROR on next-20240826] [cannot apply to drm-intel/for-linux-next drm-intel/for-linux-next-fixes linus/master v6.11-rc5 v6.11-rc4 v6.11-rc3 v6.11-rc5] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Christian-K-nig/dma-buf-give-examples-of-error-codes-to-use/20240826-202848 base: next-20240826 patch link: https://lore.kernel.org/r/20240826122541.85663-1-christian.koenig%40amd.com patch subject: [PATCH 1/4] drm/sched: add optional errno to drm_sched_start() config: i386-allmodconfig (https://download.01.org/0day-ci/archive/20240830/202408301500.GD1SE900-lkp@intel.com/config) compiler: gcc-12 (Debian 12.2.0-14) 12.2.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240830/202408301500.GD1SE900-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. 
not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202408301500.GD1SE900-lkp@intel.com/ All errors (new ones prefixed by >>): drivers/gpu/drm/amd/amdgpu/amdgpu_job.c: In function 'amdgpu_job_timedout': >> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c:90:33: error: too few arguments to function 'drm_sched_start' 90 | drm_sched_start(&ring->sched); | ^~~~~~~~~~~~~~~ In file included from drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h:28, from drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h:29, from drivers/gpu/drm/amd/amdgpu/amdgpu.h:43, from drivers/gpu/drm/amd/amdgpu/amdgpu_job.c:30: include/drm/gpu_scheduler.h:582:6: note: declared here 582 | void drm_sched_start(struct drm_gpu_scheduler *sched, int errno); | ^~~~~~~~~~~~~~~ -- drivers/gpu/drm/panthor/panthor_sched.c: In function 'queue_start': >> drivers/gpu/drm/panthor/panthor_sched.c:2541:9: error: too few arguments to function 'drm_sched_start' 2541 | drm_sched_start(&queue->scheduler); | ^~~~~~~~~~~~~~~ In file included from drivers/gpu/drm/panthor/panthor_sched.c:8: include/drm/gpu_scheduler.h:582:6: note: declared here 582 | void drm_sched_start(struct drm_gpu_scheduler *sched, int errno); | ^~~~~~~~~~~~~~~ Kconfig warnings: (for reference only) WARNING: unmet direct dependencies detected for OMAP2PLUS_MBOX Depends on [n]: MAILBOX [=y] && (ARCH_OMAP2PLUS || ARCH_K3) Selected by [m]: - TI_K3_M4_REMOTEPROC [=m] && REMOTEPROC [=y] && (ARCH_K3 || COMPILE_TEST [=y]) vim +/drm_sched_start +90 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c c1b69ed0c62f9d8 drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c Chunming Zhou 2015-07-21 33 a6a1f036c74e3d2 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Luben Tuikov 2021-01-20 34 static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job) 0de2479c953ae07 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Monk Liu 2016-03-04 35 { 3320b8d2acd3d48 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
Christian König 2018-07-13 36 struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched); 3320b8d2acd3d48 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Christian König 2018-07-13 37 struct amdgpu_job *job = to_amdgpu_job(s_job); b8f67b9ddf4f8fe drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Shashank Sharma 2024-01-18 38 struct amdgpu_task_info *ti; 95a2f917387a23c drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Yintian Tao 2020-04-07 39 struct amdgpu_device *adev = ring->adev; ca4e17244bd213e drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Andrey Grodzovsky 2021-05-12 40 int idx; 7258fa31eabd882 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Surbhi Kakarya 2022-01-26 41 int r; ca4e17244bd213e drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Andrey Grodzovsky 2021-05-12 42 c58a863b1ccf638 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Guchun Chen 2021-10-08 43 if (!drm_dev_enter(adev_to_drm(adev), &idx)) { 7d570f56f1e1005 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-07-08 44 dev_info(adev->dev, "%s - device unplugged skipping recovery on scheduler:%s", ca4e17244bd213e drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Andrey Grodzovsky 2021-05-12 45 __func__, s_job->sched->name); ca4e17244bd213e drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Andrey Grodzovsky 2021-05-12 46 ca4e17244bd213e drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Andrey Grodzovsky 2021-05-12 47 /* Effectively the job is aborted as the device is gone */ ca4e17244bd213e drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Andrey Grodzovsky 2021-05-12 48 return DRM_GPU_SCHED_STAT_ENODEV; ca4e17244bd213e drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Andrey Grodzovsky 2021-05-12 49 } 0346bfd9fe5ade3 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Trigger Huang 2018-12-18 50 b8f67b9ddf4f8fe drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Shashank Sharma 2024-01-18 51 194eb174cbe4fe2 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Victor Zhao 2022-06-24 52 adev->job_hang = true; 0e51a772e2014db drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Christian König 2016-05-18 53 cc063ea2ec7cc09 
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Marek Olšák 2020-07-06 54 if (amdgpu_gpu_recovery && cc063ea2ec7cc09 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Marek Olšák 2020-07-06 55 amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) { 7d570f56f1e1005 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-07-08 56 dev_err(adev->dev, "ring %s timeout, but soft recovered\n", 7876fa4f55fda4a drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Christian König 2018-08-21 57 s_job->sched->name); ca4e17244bd213e drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Andrey Grodzovsky 2021-05-12 58 goto exit; 7876fa4f55fda4a drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Christian König 2018-08-21 59 } 7876fa4f55fda4a drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Christian König 2018-08-21 60 7d570f56f1e1005 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-07-08 61 dev_err(adev->dev, "ring %s timeout, signaled seq=%u, emitted seq=%u\n", 3320b8d2acd3d48 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Christian König 2018-07-13 62 job->base.sched->name, atomic_read(&ring->fence_drv.last_seq), 3320b8d2acd3d48 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Christian König 2018-07-13 63 ring->fence_drv.sync_seq); b8f67b9ddf4f8fe drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Shashank Sharma 2024-01-18 64 b8f67b9ddf4f8fe drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Shashank Sharma 2024-01-18 65 ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid); b8f67b9ddf4f8fe drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Shashank Sharma 2024-01-18 66 if (ti) { 7d570f56f1e1005 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-07-08 67 dev_err(adev->dev, 7d570f56f1e1005 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-07-08 68 "Process information: process %s pid %d thread %s pid %d\n", b8f67b9ddf4f8fe drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Shashank Sharma 2024-01-18 69 ti->process_name, ti->tgid, ti->task_name, ti->pid); b8f67b9ddf4f8fe drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Shashank Sharma 2024-01-18 70 
amdgpu_vm_put_task_info(ti); b8f67b9ddf4f8fe drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Shashank Sharma 2024-01-18 71 } 4fbf87e2fe47211 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Monk Liu 2017-05-05 72 7a66ad6c087ee38 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c ZhenGuo Yin 2023-05-09 73 dma_fence_set_error(&s_job->s_fence->finished, -ETIME); 7a66ad6c087ee38 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c ZhenGuo Yin 2023-05-09 74 15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 75 /* attempt a per ring reset */ 15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 76 if (amdgpu_gpu_recovery && 15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 77 ring->funcs->reset) { 15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 78 /* stop the scheduler, but don't mess with the 15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 79 * bad job yet because if ring reset fails 15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 80 * we'll fall back to full GPU reset. 
15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 81 */ 15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 82 drm_sched_wqueue_stop(&ring->sched); 15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 83 r = amdgpu_ring_reset(ring, job->vmid); 15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 84 if (!r) { 15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 85 if (amdgpu_ring_sched_ready(ring)) 15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 86 drm_sched_stop(&ring->sched, s_job); fb0a5834a338329 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Prike Liang 2024-06-12 87 atomic_inc(&ring->adev->gpu_reset_counter); 15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 88 amdgpu_fence_driver_force_completion(ring); 15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 89 if (amdgpu_ring_sched_ready(ring)) 8a591034b0b5338 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Stephen Rothwell 2024-08-26 @90 drm_sched_start(&ring->sched); 15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 91 goto exit; 15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 92 } 15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 93 } 15789fa0f0e29cf drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-06-03 94 95a2f917387a23c drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Yintian Tao 2020-04-07 95 if (amdgpu_device_should_recover_gpu(ring->adev)) { f1549c09c520877 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Likun Gao 2022-07-08 96 struct amdgpu_reset_context reset_context; f1549c09c520877 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Likun Gao 2022-07-08 97 memset(&reset_context, 0, sizeof(reset_context)); f1549c09c520877 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Likun Gao 2022-07-08 98 f1549c09c520877 
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Likun Gao 2022-07-08 99 reset_context.method = AMD_RESET_METHOD_NONE; f1549c09c520877 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Likun Gao 2022-07-08 100 reset_context.reset_req_dev = adev; bac640ddb51e806 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Eric Huang 2024-06-04 101 reset_context.src = AMDGPU_RESET_SRC_JOB; f1549c09c520877 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Likun Gao 2022-07-08 102 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); f1549c09c520877 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Likun Gao 2022-07-08 103 f1549c09c520877 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Likun Gao 2022-07-08 104 r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context); 7258fa31eabd882 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Surbhi Kakarya 2022-01-26 105 if (r) 7d570f56f1e1005 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Alex Deucher 2024-07-08 106 dev_err(adev->dev, "GPU Recovery Failed: %d\n", r); 95a2f917387a23c drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Yintian Tao 2020-04-07 107 } else { c3b6c6074166499 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Monk Liu 2019-05-13 108 drm_sched_suspend_timeout(&ring->sched); 95a2f917387a23c drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Yintian Tao 2020-04-07 109 if (amdgpu_sriov_vf(adev)) 95a2f917387a23c drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Yintian Tao 2020-04-07 110 adev->virt.tdr_debug = true; 95a2f917387a23c drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Yintian Tao 2020-04-07 111 } ca4e17244bd213e drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Andrey Grodzovsky 2021-05-12 112 ca4e17244bd213e drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Andrey Grodzovsky 2021-05-12 113 exit: 194eb174cbe4fe2 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Victor Zhao 2022-06-24 114 adev->job_hang = false; ca4e17244bd213e drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Andrey Grodzovsky 2021-05-12 115 drm_dev_exit(idx); ca4e17244bd213e drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Andrey Grodzovsky 2021-05-12 116 return DRM_GPU_SCHED_STAT_NOMINAL; 
0de2479c953ae07 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Monk Liu 2016-03-04 117 } 0de2479c953ae07 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c Monk Liu 2016-03-04 118
Hi Christian,
kernel test robot noticed the following build errors:
[auto build test ERROR on next-20240826]
[cannot apply to drm-intel/for-linux-next drm-intel/for-linux-next-fixes linus/master v6.11-rc5 v6.11-rc4 v6.11-rc3 v6.11-rc5]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Christian-K-nig/dma-buf-give-examples-of-error-codes-to-use/20240826-202848
base: next-20240826
patch link: https://lore.kernel.org/r/20240826122541.85663-1-christian.koenig%40amd.com
patch subject: [PATCH 1/4] drm/sched: add optional errno to drm_sched_start()
config: x86_64-allyesconfig (https://download.01.org/0day-ci/archive/20240830/202408301653.9umdd9cC-lkp@intel.com/config)
compiler: clang version 18.1.5 (https://github.com/llvm/llvm-project 617a15a9eac96088ae5e9134248d8236e34b91b1)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240830/202408301653.9umdd9cC-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202408301653.9umdd9cC-lkp@intel.com/
All errors (new ones prefixed by >>):
>> drivers/gpu/drm/panthor/panthor_sched.c:2541:35: error: too few arguments to function call, expected 2, have 1
2541 | drm_sched_start(&queue->scheduler);
| ~~~~~~~~~~~~~~~ ^
include/drm/gpu_scheduler.h:582:6: note: 'drm_sched_start' declared here
582 | void drm_sched_start(struct drm_gpu_scheduler *sched, int errno);
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1 error generated.
Kconfig warnings: (for reference only)
WARNING: unmet direct dependencies detected for OMAP2PLUS_MBOX
Depends on [n]: MAILBOX [=y] && (ARCH_OMAP2PLUS || ARCH_K3)
Selected by [y]:
- TI_K3_M4_REMOTEPROC [=y] && REMOTEPROC [=y] && (ARCH_K3 || COMPILE_TEST [=y])
vim +2541 drivers/gpu/drm/panthor/panthor_sched.c
de85488138247d0 Boris Brezillon 2024-02-29 2532
de85488138247d0 Boris Brezillon 2024-02-29 2533 static void queue_start(struct panthor_queue *queue)
de85488138247d0 Boris Brezillon 2024-02-29 2534 {
de85488138247d0 Boris Brezillon 2024-02-29 2535 struct panthor_job *job;
de85488138247d0 Boris Brezillon 2024-02-29 2536
de85488138247d0 Boris Brezillon 2024-02-29 2537 /* Re-assign the parent fences. */
de85488138247d0 Boris Brezillon 2024-02-29 2538 list_for_each_entry(job, &queue->scheduler.pending_list, base.list)
de85488138247d0 Boris Brezillon 2024-02-29 2539 job->base.s_fence->parent = dma_fence_get(job->done_fence);
de85488138247d0 Boris Brezillon 2024-02-29 2540
83b501c1799a96a Christian König 2024-07-19 @2541 drm_sched_start(&queue->scheduler);
de85488138247d0 Boris Brezillon 2024-02-29 2542 }
de85488138247d0 Boris Brezillon 2024-02-29 2543
Am 28.08.24 um 11:30 schrieb Philipp Stanner: > On Mon, 2024-08-26 at 14:25 +0200, Christian König wrote: >> The current implementation of drm_sched_start uses a hardcoded >> -ECANCELED to dispose of a job when the parent/hw fence is NULL. >> This results in drm_sched_job_done being called with -ECANCELED for >> each job with a NULL parent in the pending list, making it difficult >> to distinguish between recovery methods, whether a queue reset or a >> full GPU reset was used. >> >> To improve this, we first try a soft recovery for timeout jobs and >> use the error code -ENODATA. If soft recovery fails, we proceed with >> a queue reset, where the error code remains -ENODATA for the job. >> Finally, for a full GPU reset, we use error codes -ECANCELED or >> -ETIME. This patch adds an error code parameter to drm_sched_start, >> allowing us to differentiate between queue reset and GPU reset >> failures. This enables user mode and test applications to validate >> the expected correctness of the requested operation. After a >> successful queue reset, the only way to continue normal operation is >> to call drm_sched_job_done with the specific error code -ENODATA. >> >> v1: Initial implementation by Jesse utilized >> amdgpu_device_lock_reset_domain >> and amdgpu_device_unlock_reset_domain to allow user mode to track >> the queue reset status and distinguish between queue reset and >> GPU reset. >> v2: Christian suggested using the error codes -ENODATA for queue >> reset >> and -ECANCELED or -ETIME for GPU reset, returned to >> amdgpu_cs_wait_ioctl. >> v3: To meet the requirements, we introduce a new function >> drm_sched_start_ex with an additional parameter to set >> dma_fence_set_error, allowing us to handle the specific error >> codes appropriately and dispose of bad jobs with the selected >> error code depending on whether it was a queue reset or GPU >> reset. 
>> v4: Alex suggested using a new name, >> drm_sched_start_with_recovery_error, >> which more accurately describes the function's purpose. >> Additionally, it was recommended to add documentation details >> about the new method. >> v5: Fixed declaration of new function >> drm_sched_start_with_recovery_error.(Alex) >> v6 (chk): rebase on upstream changes, cleanup the commit message, >> drop the new function again and update all callers, >> apply the errno also to scheduler fences with hw fences >> >> Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com> >> Signed-off-by: Vitaly Prosyak <vitaly.prosyak@amd.com> >> Signed-off-by: Christian König <christian.koenig@amd.com> >> Cc: Alex Deucher <alexander.deucher@amd.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 2 +- >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- >> drivers/gpu/drm/etnaviv/etnaviv_sched.c | 2 +- >> drivers/gpu/drm/imagination/pvr_queue.c | 4 ++-- >> drivers/gpu/drm/lima/lima_sched.c | 2 +- >> drivers/gpu/drm/nouveau/nouveau_sched.c | 2 +- >> drivers/gpu/drm/panfrost/panfrost_job.c | 2 +- >> drivers/gpu/drm/panthor/panthor_mmu.c | 2 +- >> drivers/gpu/drm/scheduler/sched_main.c | 7 ++++--- >> drivers/gpu/drm/v3d/v3d_sched.c | 2 +- >> include/drm/gpu_scheduler.h | 2 +- >> 11 files changed, 16 insertions(+), 15 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c >> index 2320df51c914..18135d8235f9 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c >> @@ -300,7 +300,7 @@ static int >> suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool sus >> if (r) >> goto out; >> } else { >> - drm_sched_start(&ring->sched); >> + drm_sched_start(&ring->sched, 0); >> } >> } >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> index 1cd7d355689c..5891312e44ea 
100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c >> @@ -5879,7 +5879,7 @@ int amdgpu_device_gpu_recover(struct >> amdgpu_device *adev, >> if (!amdgpu_ring_sched_ready(ring)) >> continue; >> >> - drm_sched_start(&ring->sched); >> + drm_sched_start(&ring->sched, 0); >> } >> >> if >> (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && >> !job_signaled) >> @@ -6374,7 +6374,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev) >> if (!amdgpu_ring_sched_ready(ring)) >> continue; >> >> - drm_sched_start(&ring->sched); >> + drm_sched_start(&ring->sched, 0); >> } >> >> amdgpu_device_unset_mp1_state(adev); >> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c >> b/drivers/gpu/drm/etnaviv/etnaviv_sched.c >> index ab9ca4824b62..23ced5896c7c 100644 >> --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c >> +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c >> @@ -72,7 +72,7 @@ static enum drm_gpu_sched_stat >> etnaviv_sched_timedout_job(struct drm_sched_job >> >> drm_sched_resubmit_jobs(&gpu->sched); >> >> - drm_sched_start(&gpu->sched); >> + drm_sched_start(&gpu->sched, 0); >> return DRM_GPU_SCHED_STAT_NOMINAL; >> >> out_no_timeout: >> diff --git a/drivers/gpu/drm/imagination/pvr_queue.c >> b/drivers/gpu/drm/imagination/pvr_queue.c >> index 20cb46012082..c4f08432882b 100644 >> --- a/drivers/gpu/drm/imagination/pvr_queue.c >> +++ b/drivers/gpu/drm/imagination/pvr_queue.c >> @@ -782,7 +782,7 @@ static void pvr_queue_start(struct pvr_queue >> *queue) >> } >> } >> >> - drm_sched_start(&queue->scheduler); >> + drm_sched_start(&queue->scheduler, 0); >> } >> >> /** >> @@ -842,7 +842,7 @@ pvr_queue_timedout_job(struct drm_sched_job >> *s_job) >> } >> mutex_unlock(&pvr_dev->queues.lock); >> >> - drm_sched_start(sched); >> + drm_sched_start(sched, 0); >> >> return DRM_GPU_SCHED_STAT_NOMINAL; >> } >> diff --git a/drivers/gpu/drm/lima/lima_sched.c >> b/drivers/gpu/drm/lima/lima_sched.c >> index 1a944edb6ddc..b40c90e97d7e 100644 >> --- 
a/drivers/gpu/drm/lima/lima_sched.c >> +++ b/drivers/gpu/drm/lima/lima_sched.c >> @@ -463,7 +463,7 @@ static enum drm_gpu_sched_stat >> lima_sched_timedout_job(struct drm_sched_job *job >> lima_pm_idle(ldev); >> >> drm_sched_resubmit_jobs(&pipe->base); >> - drm_sched_start(&pipe->base); >> + drm_sched_start(&pipe->base, 0); >> >> return DRM_GPU_SCHED_STAT_NOMINAL; >> } >> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c >> b/drivers/gpu/drm/nouveau/nouveau_sched.c >> index eb6c3f9a01f5..4412f2711fb5 100644 >> --- a/drivers/gpu/drm/nouveau/nouveau_sched.c >> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c >> @@ -379,7 +379,7 @@ nouveau_sched_timedout_job(struct drm_sched_job >> *sched_job) >> else >> NV_PRINTK(warn, job->cli, "Generic job timeout.\n"); >> >> - drm_sched_start(sched); >> + drm_sched_start(sched, 0); >> >> return stat; >> } >> diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c >> b/drivers/gpu/drm/panfrost/panfrost_job.c >> index df49d37d0e7e..d140800606bf 100644 >> --- a/drivers/gpu/drm/panfrost/panfrost_job.c >> +++ b/drivers/gpu/drm/panfrost/panfrost_job.c >> @@ -727,7 +727,7 @@ panfrost_reset(struct panfrost_device *pfdev, >> >> /* Restart the schedulers */ >> for (i = 0; i < NUM_JOB_SLOTS; i++) >> - drm_sched_start(&pfdev->js->queue[i].sched); >> + drm_sched_start(&pfdev->js->queue[i].sched, 0); >> >> /* Re-enable job interrupts now that everything has been >> restarted. 
*/ >> job_write(pfdev, JOB_INT_MASK, >> diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c >> b/drivers/gpu/drm/panthor/panthor_mmu.c >> index d47972806d50..e630cdf47f99 100644 >> --- a/drivers/gpu/drm/panthor/panthor_mmu.c >> +++ b/drivers/gpu/drm/panthor/panthor_mmu.c >> @@ -827,7 +827,7 @@ static void panthor_vm_stop(struct panthor_vm >> *vm) >> >> static void panthor_vm_start(struct panthor_vm *vm) >> { >> - drm_sched_start(&vm->sched); >> + drm_sched_start(&vm->sched, 0); >> } >> >> /** >> diff --git a/drivers/gpu/drm/scheduler/sched_main.c >> b/drivers/gpu/drm/scheduler/sched_main.c >> index ab53ab486fe6..f093616fe53c 100644 >> --- a/drivers/gpu/drm/scheduler/sched_main.c >> +++ b/drivers/gpu/drm/scheduler/sched_main.c >> @@ -674,9 +674,10 @@ EXPORT_SYMBOL(drm_sched_stop); >> * drm_sched_start - recover jobs after a reset >> * >> * @sched: scheduler instance >> + * @errno: error to set on the pending fences >> * >> */ >> -void drm_sched_start(struct drm_gpu_scheduler *sched) >> +void drm_sched_start(struct drm_gpu_scheduler *sched, int errno) >> { >> struct drm_sched_job *s_job, *tmp; >> >> @@ -691,13 +692,13 @@ void drm_sched_start(struct drm_gpu_scheduler >> *sched) >> atomic_add(s_job->credits, &sched->credit_count); >> >> if (!fence) { >> - drm_sched_job_done(s_job, -ECANCELED); >> + drm_sched_job_done(s_job, errno ?: - >> ECANCELED); >> continue; >> } >> >> if (dma_fence_add_callback(fence, &s_job->cb, >> drm_sched_job_done_cb)) >> - drm_sched_job_done(s_job, fence->error); >> + drm_sched_job_done(s_job, fence->error ?: >> errno); >> } >> >> drm_sched_start_timeout_unlocked(sched); >> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c >> b/drivers/gpu/drm/v3d/v3d_sched.c >> index fd29a00b233c..b6a89171824b 100644 >> --- a/drivers/gpu/drm/v3d/v3d_sched.c >> +++ b/drivers/gpu/drm/v3d/v3d_sched.c >> @@ -663,7 +663,7 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, >> struct drm_sched_job *sched_job) >> >> /* Unblock schedulers and restart their jobs. 
*/ >> for (q = 0; q < V3D_MAX_QUEUES; q++) { >> - drm_sched_start(&v3d->queue[q].sched); >> + drm_sched_start(&v3d->queue[q].sched, 0); >> } >> >> mutex_unlock(&v3d->reset_lock); >> diff --git a/include/drm/gpu_scheduler.h >> b/include/drm/gpu_scheduler.h >> index fe8edb917360..a8d19b10f9b8 100644 >> --- a/include/drm/gpu_scheduler.h >> +++ b/include/drm/gpu_scheduler.h >> @@ -579,7 +579,7 @@ bool drm_sched_wqueue_ready(struct >> drm_gpu_scheduler *sched); >> void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched); >> void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched); >> void drm_sched_stop(struct drm_gpu_scheduler *sched, struct >> drm_sched_job *bad); >> -void drm_sched_start(struct drm_gpu_scheduler *sched); >> +void drm_sched_start(struct drm_gpu_scheduler *sched, int errno); > I personally only recently started using the scheduler and many things > were quite confusing. > > In particular, I thought it's not good to call this function > drm_sched_start(), because that implies to new users / programmers that > this function is supposed to start the scheduler. > > Accordingly, drm_sched_stop() sounds as if it's intended to be used on > scheduler teardown. "Alright, I'll stop the scheduler with > drm_sched_stop(), then can safely call drm_sched_entity_destroy() and > then tear it down completely through drm_sched_fini()". > > Now the comments make it obvious that those start and stop functions > are more intended for error recovery. > > So my stance is that start should be called, e.g., drm_sched_restart() > or perhaps drm_sched_recover_start(). > > So since you're already touching all the lines where the function is > used, this might be a good opportunity to improve the name, too. > > If that's deemed desirable. Yeah completely agree. We also had people incorrectly thinking that they should call drm_sched_stop/start on suspend/resume resulting in a system which didn't come up again after resume. 
How about we call it drm_sched_suspend_before_reset() and drm_sched_resume_after_reset()? Thanks, Christian. > > P. > > >> void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched); >> void drm_sched_increase_karma(struct drm_sched_job *bad); >> void drm_sched_reset_karma(struct drm_sched_job *bad);
On Fri, 2024-08-30 at 15:15 +0200, Christian König wrote: > Am 28.08.24 um 11:30 schrieb Philipp Stanner: > > On Mon, 2024-08-26 at 14:25 +0200, Christian König wrote: > > > The current implementation of drm_sched_start uses a hardcoded > > > -ECANCELED to dispose of a job when the parent/hw fence is NULL. > > > This results in drm_sched_job_done being called with -ECANCELED > > > for > > > each job with a NULL parent in the pending list, making it > > > difficult > > > to distinguish between recovery methods, whether a queue reset or > > > a > > > full GPU reset was used. > > > > > > To improve this, we first try a soft recovery for timeout jobs > > > and > > > use the error code -ENODATA. If soft recovery fails, we proceed > > > with > > > a queue reset, where the error code remains -ENODATA for the job. > > > Finally, for a full GPU reset, we use error codes -ECANCELED or > > > -ETIME. This patch adds an error code parameter to > > > drm_sched_start, > > > allowing us to differentiate between queue reset and GPU reset > > > failures. This enables user mode and test applications to > > > validate > > > the expected correctness of the requested operation. After a > > > successful queue reset, the only way to continue normal operation > > > is > > > to call drm_sched_job_done with the specific error code -ENODATA. > > > > > > v1: Initial implementation by Jesse utilized > > > amdgpu_device_lock_reset_domain > > > and amdgpu_device_unlock_reset_domain to allow user mode to > > > track > > > the queue reset status and distinguish between queue reset > > > and > > > GPU reset. > > > v2: Christian suggested using the error codes -ENODATA for queue > > > reset > > > and -ECANCELED or -ETIME for GPU reset, returned to > > > amdgpu_cs_wait_ioctl. 
> > > v3: To meet the requirements, we introduce a new function > > > drm_sched_start_ex with an additional parameter to set > > > dma_fence_set_error, allowing us to handle the specific > > > error > > > codes appropriately and dispose of bad jobs with the > > > selected > > > error code depending on whether it was a queue reset or GPU > > > reset. > > > v4: Alex suggested using a new name, > > > drm_sched_start_with_recovery_error, > > > which more accurately describes the function's purpose. > > > Additionally, it was recommended to add documentation > > > details > > > about the new method. > > > v5: Fixed declaration of new function > > > drm_sched_start_with_recovery_error.(Alex) > > > v6 (chk): rebase on upstream changes, cleanup the commit message, > > > drop the new function again and update all callers, > > > apply the errno also to scheduler fences with hw > > > fences > > > > > > Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com> > > > Signed-off-by: Vitaly Prosyak <vitaly.prosyak@amd.com> > > > Signed-off-by: Christian König <christian.koenig@amd.com> > > > Cc: Alex Deucher <alexander.deucher@amd.com> > > > --- > > > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 2 +- > > > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- > > > drivers/gpu/drm/etnaviv/etnaviv_sched.c | 2 +- > > > drivers/gpu/drm/imagination/pvr_queue.c | 4 ++-- > > > drivers/gpu/drm/lima/lima_sched.c | 2 +- > > > drivers/gpu/drm/nouveau/nouveau_sched.c | 2 +- > > > drivers/gpu/drm/panfrost/panfrost_job.c | 2 +- > > > drivers/gpu/drm/panthor/panthor_mmu.c | 2 +- > > > drivers/gpu/drm/scheduler/sched_main.c | 7 ++++--- > > > drivers/gpu/drm/v3d/v3d_sched.c | 2 +- > > > include/drm/gpu_scheduler.h | 2 +- > > > 11 files changed, 16 insertions(+), 15 deletions(-) > > > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > > > index 2320df51c914..18135d8235f9 100644 > > > --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > > > @@ -300,7 +300,7 @@ static int > > > suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool > > > sus > > > if (r) > > > goto out; > > > } else { > > > - drm_sched_start(&ring->sched); > > > + drm_sched_start(&ring->sched, 0); > > > } > > > } > > > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > > index 1cd7d355689c..5891312e44ea 100644 > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > > @@ -5879,7 +5879,7 @@ int amdgpu_device_gpu_recover(struct > > > amdgpu_device *adev, > > > if (!amdgpu_ring_sched_ready(ring)) > > > continue; > > > > > > - drm_sched_start(&ring->sched); > > > + drm_sched_start(&ring->sched, 0); > > > } > > > > > > if > > > (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && > > > !job_signaled) > > > @@ -6374,7 +6374,7 @@ void amdgpu_pci_resume(struct pci_dev > > > *pdev) > > > if (!amdgpu_ring_sched_ready(ring)) > > > continue; > > > > > > - drm_sched_start(&ring->sched); > > > + drm_sched_start(&ring->sched, 0); > > > } > > > > > > amdgpu_device_unset_mp1_state(adev); > > > diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c > > > b/drivers/gpu/drm/etnaviv/etnaviv_sched.c > > > index ab9ca4824b62..23ced5896c7c 100644 > > > --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c > > > +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c > > > @@ -72,7 +72,7 @@ static enum drm_gpu_sched_stat > > > etnaviv_sched_timedout_job(struct drm_sched_job > > > > > > drm_sched_resubmit_jobs(&gpu->sched); > > > > > > - drm_sched_start(&gpu->sched); > > > + drm_sched_start(&gpu->sched, 0); > > > return DRM_GPU_SCHED_STAT_NOMINAL; > > > > > > out_no_timeout: > > > diff --git a/drivers/gpu/drm/imagination/pvr_queue.c > > > b/drivers/gpu/drm/imagination/pvr_queue.c > > > index 20cb46012082..c4f08432882b 100644 
> > > --- a/drivers/gpu/drm/imagination/pvr_queue.c > > > +++ b/drivers/gpu/drm/imagination/pvr_queue.c > > > @@ -782,7 +782,7 @@ static void pvr_queue_start(struct pvr_queue > > > *queue) > > > } > > > } > > > > > > - drm_sched_start(&queue->scheduler); > > > + drm_sched_start(&queue->scheduler, 0); > > > } > > > > > > /** > > > @@ -842,7 +842,7 @@ pvr_queue_timedout_job(struct drm_sched_job > > > *s_job) > > > } > > > mutex_unlock(&pvr_dev->queues.lock); > > > > > > - drm_sched_start(sched); > > > + drm_sched_start(sched, 0); > > > > > > return DRM_GPU_SCHED_STAT_NOMINAL; > > > } > > > diff --git a/drivers/gpu/drm/lima/lima_sched.c > > > b/drivers/gpu/drm/lima/lima_sched.c > > > index 1a944edb6ddc..b40c90e97d7e 100644 > > > --- a/drivers/gpu/drm/lima/lima_sched.c > > > +++ b/drivers/gpu/drm/lima/lima_sched.c > > > @@ -463,7 +463,7 @@ static enum drm_gpu_sched_stat > > > lima_sched_timedout_job(struct drm_sched_job *job > > > lima_pm_idle(ldev); > > > > > > drm_sched_resubmit_jobs(&pipe->base); > > > - drm_sched_start(&pipe->base); > > > + drm_sched_start(&pipe->base, 0); > > > > > > return DRM_GPU_SCHED_STAT_NOMINAL; > > > } > > > diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c > > > b/drivers/gpu/drm/nouveau/nouveau_sched.c > > > index eb6c3f9a01f5..4412f2711fb5 100644 > > > --- a/drivers/gpu/drm/nouveau/nouveau_sched.c > > > +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c > > > @@ -379,7 +379,7 @@ nouveau_sched_timedout_job(struct > > > drm_sched_job > > > *sched_job) > > > else > > > NV_PRINTK(warn, job->cli, "Generic job > > > timeout.\n"); > > > > > > - drm_sched_start(sched); > > > + drm_sched_start(sched, 0); > > > > > > return stat; > > > } > > > diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c > > > b/drivers/gpu/drm/panfrost/panfrost_job.c > > > index df49d37d0e7e..d140800606bf 100644 > > > --- a/drivers/gpu/drm/panfrost/panfrost_job.c > > > +++ b/drivers/gpu/drm/panfrost/panfrost_job.c > > > @@ -727,7 +727,7 @@ panfrost_reset(struct 
panfrost_device *pfdev, > > > > > > /* Restart the schedulers */ > > > for (i = 0; i < NUM_JOB_SLOTS; i++) > > > - drm_sched_start(&pfdev->js->queue[i].sched); > > > + drm_sched_start(&pfdev->js->queue[i].sched, 0); > > > > > > /* Re-enable job interrupts now that everything has been > > > restarted. */ > > > job_write(pfdev, JOB_INT_MASK, > > > diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c > > > b/drivers/gpu/drm/panthor/panthor_mmu.c > > > index d47972806d50..e630cdf47f99 100644 > > > --- a/drivers/gpu/drm/panthor/panthor_mmu.c > > > +++ b/drivers/gpu/drm/panthor/panthor_mmu.c > > > @@ -827,7 +827,7 @@ static void panthor_vm_stop(struct panthor_vm > > > *vm) > > > > > > static void panthor_vm_start(struct panthor_vm *vm) > > > { > > > - drm_sched_start(&vm->sched); > > > + drm_sched_start(&vm->sched, 0); > > > } > > > > > > /** > > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c > > > b/drivers/gpu/drm/scheduler/sched_main.c > > > index ab53ab486fe6..f093616fe53c 100644 > > > --- a/drivers/gpu/drm/scheduler/sched_main.c > > > +++ b/drivers/gpu/drm/scheduler/sched_main.c > > > @@ -674,9 +674,10 @@ EXPORT_SYMBOL(drm_sched_stop); > > > * drm_sched_start - recover jobs after a reset > > > * > > > * @sched: scheduler instance > > > + * @errno: error to set on the pending fences > > > * > > > */ > > > -void drm_sched_start(struct drm_gpu_scheduler *sched) > > > +void drm_sched_start(struct drm_gpu_scheduler *sched, int errno) > > > { > > > struct drm_sched_job *s_job, *tmp; > > > > > > @@ -691,13 +692,13 @@ void drm_sched_start(struct > > > drm_gpu_scheduler > > > *sched) > > > atomic_add(s_job->credits, &sched- > > > >credit_count); > > > > > > if (!fence) { > > > - drm_sched_job_done(s_job, -ECANCELED); > > > + drm_sched_job_done(s_job, errno ?: - > > > ECANCELED); > > > continue; > > > } > > > > > > if (dma_fence_add_callback(fence, &s_job->cb, > > > > > > drm_sched_job_done_cb)) > > > - drm_sched_job_done(s_job, fence->error); > > > + 
drm_sched_job_done(s_job, fence->error > > > ?: > > > errno); > > > } > > > > > > drm_sched_start_timeout_unlocked(sched); > > > diff --git a/drivers/gpu/drm/v3d/v3d_sched.c > > > b/drivers/gpu/drm/v3d/v3d_sched.c > > > index fd29a00b233c..b6a89171824b 100644 > > > --- a/drivers/gpu/drm/v3d/v3d_sched.c > > > +++ b/drivers/gpu/drm/v3d/v3d_sched.c > > > @@ -663,7 +663,7 @@ v3d_gpu_reset_for_timeout(struct v3d_dev > > > *v3d, > > > struct drm_sched_job *sched_job) > > > > > > /* Unblock schedulers and restart their jobs. */ > > > for (q = 0; q < V3D_MAX_QUEUES; q++) { > > > - drm_sched_start(&v3d->queue[q].sched); > > > + drm_sched_start(&v3d->queue[q].sched, 0); > > > } > > > > > > mutex_unlock(&v3d->reset_lock); > > > diff --git a/include/drm/gpu_scheduler.h > > > b/include/drm/gpu_scheduler.h > > > index fe8edb917360..a8d19b10f9b8 100644 > > > --- a/include/drm/gpu_scheduler.h > > > +++ b/include/drm/gpu_scheduler.h > > > @@ -579,7 +579,7 @@ bool drm_sched_wqueue_ready(struct > > > drm_gpu_scheduler *sched); > > > void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched); > > > void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched); > > > void drm_sched_stop(struct drm_gpu_scheduler *sched, struct > > > drm_sched_job *bad); > > > -void drm_sched_start(struct drm_gpu_scheduler *sched); > > > +void drm_sched_start(struct drm_gpu_scheduler *sched, int > > > errno); > > I personally only recently started using the scheduler and many > > things > > were quite confusing. > > > > In particular, I thought it's not good to call this function > > drm_sched_start(), because that implies to new users / programmers > > that > > this function is supposed to start the scheduler. > > > > Accordingly, drm_sched_stop() sounds as if it's intended to be used > > on > > scheduler teardown. "Alright, I'll stop the scheduler with > > drm_sched_stop(), then can safely call drm_sched_entity_destroy() > > and > > then tear it down completely through drm_sched_fini()". 
> > > > Now the comments make it obvious that those start and stop > > functions > > are more intended for error recovery. > > > > So my stance is that start should be called, e.g., > > drm_sched_restart() > > or perhaps drm_sched_recover_start(). > > > > So since you're already touching all the lines where the function > > is > > used, this might be a good opportunity to improve the name, too. > > > > If that's deemed desirable. > > Yeah completely agree. > > We also had people incorrectly thinking that they should call > drm_sched_stop/start on suspend/resume resulting in a system which > didn't come up again after resume. > > How about we call it drm_sched_suspend_before_reset() and > drm_sched_resume_after_reset()? Yes, that sounds bullet-proof to me :) One might also consider drm_sched_start()'s function summary "recover jobs after a reset". Maybe have a sentence about what "recover" means would help there, too. Regards, P. > > Thanks, > Christian. > > > > > P. > > > > > > > void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched); > > > void drm_sched_increase_karma(struct drm_sched_job *bad); > > > void drm_sched_reset_karma(struct drm_sched_job *bad); >
On Mon, Sep 02, 2024 at 09:01:48AM +0200, Philipp Stanner wrote: > On Fri, 2024-08-30 at 15:15 +0200, Christian König wrote: > > Am 28.08.24 um 11:30 schrieb Philipp Stanner: > > > On Mon, 2024-08-26 at 14:25 +0200, Christian König wrote: > > > > The current implementation of drm_sched_start uses a hardcoded > > > > -ECANCELED to dispose of a job when the parent/hw fence is NULL. > > > > This results in drm_sched_job_done being called with -ECANCELED > > > > for > > > > each job with a NULL parent in the pending list, making it > > > > difficult > > > > to distinguish between recovery methods, whether a queue reset or > > > > a > > > > full GPU reset was used. > > > > > > > > To improve this, we first try a soft recovery for timeout jobs > > > > and > > > > use the error code -ENODATA. If soft recovery fails, we proceed > > > > with > > > > a queue reset, where the error code remains -ENODATA for the job. > > > > Finally, for a full GPU reset, we use error codes -ECANCELED or > > > > -ETIME. This patch adds an error code parameter to > > > > drm_sched_start, > > > > allowing us to differentiate between queue reset and GPU reset > > > > failures. This enables user mode and test applications to > > > > validate > > > > the expected correctness of the requested operation. After a > > > > successful queue reset, the only way to continue normal operation > > > > is > > > > to call drm_sched_job_done with the specific error code -ENODATA. > > > > > > > > v1: Initial implementation by Jesse utilized > > > > amdgpu_device_lock_reset_domain > > > > and amdgpu_device_unlock_reset_domain to allow user mode to > > > > track > > > > the queue reset status and distinguish between queue reset > > > > and > > > > GPU reset. > > > > v2: Christian suggested using the error codes -ENODATA for queue > > > > reset > > > > and -ECANCELED or -ETIME for GPU reset, returned to > > > > amdgpu_cs_wait_ioctl. 
> > > > v3: To meet the requirements, we introduce a new function > > > > drm_sched_start_ex with an additional parameter to set > > > > dma_fence_set_error, allowing us to handle the specific > > > > error > > > > codes appropriately and dispose of bad jobs with the > > > > selected > > > > error code depending on whether it was a queue reset or GPU > > > > reset. > > > > v4: Alex suggested using a new name, > > > > drm_sched_start_with_recovery_error, > > > > which more accurately describes the function's purpose. > > > > Additionally, it was recommended to add documentation > > > > details > > > > about the new method. > > > > v5: Fixed declaration of new function > > > > drm_sched_start_with_recovery_error.(Alex) > > > > v6 (chk): rebase on upstream changes, cleanup the commit message, > > > > drop the new function again and update all callers, > > > > apply the errno also to scheduler fences with hw > > > > fences > > > > > > > > Signed-off-by: Jesse Zhang <Jesse.Zhang@amd.com> > > > > Signed-off-by: Vitaly Prosyak <vitaly.prosyak@amd.com> > > > > Signed-off-by: Christian König <christian.koenig@amd.com> > > > > Cc: Alex Deucher <alexander.deucher@amd.com> > > > > --- > > > > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 2 +- > > > > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- > > > > drivers/gpu/drm/etnaviv/etnaviv_sched.c | 2 +- > > > > drivers/gpu/drm/imagination/pvr_queue.c | 4 ++-- > > > > drivers/gpu/drm/lima/lima_sched.c | 2 +- > > > > drivers/gpu/drm/nouveau/nouveau_sched.c | 2 +- > > > > drivers/gpu/drm/panfrost/panfrost_job.c | 2 +- > > > > drivers/gpu/drm/panthor/panthor_mmu.c | 2 +- > > > > drivers/gpu/drm/scheduler/sched_main.c | 7 ++++--- > > > > drivers/gpu/drm/v3d/v3d_sched.c | 2 +- > > > > include/drm/gpu_scheduler.h | 2 +- > > > > 11 files changed, 16 insertions(+), 15 deletions(-) > > > > > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > > 
> > index 2320df51c914..18135d8235f9 100644 > > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > > > > @@ -300,7 +300,7 @@ static int > > > > suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool > > > > sus > > > > if (r) > > > > goto out; > > > > } else { > > > > - drm_sched_start(&ring->sched); > > > > + drm_sched_start(&ring->sched, 0); > > > > } > > > > } > > > > > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > > > index 1cd7d355689c..5891312e44ea 100644 > > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > > > > @@ -5879,7 +5879,7 @@ int amdgpu_device_gpu_recover(struct > > > > amdgpu_device *adev, > > > > if (!amdgpu_ring_sched_ready(ring)) > > > > continue; > > > > > > > > - drm_sched_start(&ring->sched); > > > > + drm_sched_start(&ring->sched, 0); > > > > } > > > > > > > > if > > > > (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && > > > > !job_signaled) > > > > @@ -6374,7 +6374,7 @@ void amdgpu_pci_resume(struct pci_dev > > > > *pdev) > > > > if (!amdgpu_ring_sched_ready(ring)) > > > > continue; > > > > > > > > - drm_sched_start(&ring->sched); > > > > + drm_sched_start(&ring->sched, 0); > > > > } > > > > > > > > amdgpu_device_unset_mp1_state(adev); > > > > diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c > > > > b/drivers/gpu/drm/etnaviv/etnaviv_sched.c > > > > index ab9ca4824b62..23ced5896c7c 100644 > > > > --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c > > > > +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c > > > > @@ -72,7 +72,7 @@ static enum drm_gpu_sched_stat > > > > etnaviv_sched_timedout_job(struct drm_sched_job > > > > > > > > drm_sched_resubmit_jobs(&gpu->sched); > > > > > > > > - drm_sched_start(&gpu->sched); > > > > + drm_sched_start(&gpu->sched, 0); > > > > return DRM_GPU_SCHED_STAT_NOMINAL; > > > > > > > > 
out_no_timeout: > > > > diff --git a/drivers/gpu/drm/imagination/pvr_queue.c > > > > b/drivers/gpu/drm/imagination/pvr_queue.c > > > > index 20cb46012082..c4f08432882b 100644 > > > > --- a/drivers/gpu/drm/imagination/pvr_queue.c > > > > +++ b/drivers/gpu/drm/imagination/pvr_queue.c > > > > @@ -782,7 +782,7 @@ static void pvr_queue_start(struct pvr_queue > > > > *queue) > > > > } > > > > } > > > > > > > > - drm_sched_start(&queue->scheduler); > > > > + drm_sched_start(&queue->scheduler, 0); > > > > } > > > > > > > > /** > > > > @@ -842,7 +842,7 @@ pvr_queue_timedout_job(struct drm_sched_job > > > > *s_job) > > > > } > > > > mutex_unlock(&pvr_dev->queues.lock); > > > > > > > > - drm_sched_start(sched); > > > > + drm_sched_start(sched, 0); > > > > > > > > return DRM_GPU_SCHED_STAT_NOMINAL; > > > > } > > > > diff --git a/drivers/gpu/drm/lima/lima_sched.c > > > > b/drivers/gpu/drm/lima/lima_sched.c > > > > index 1a944edb6ddc..b40c90e97d7e 100644 > > > > --- a/drivers/gpu/drm/lima/lima_sched.c > > > > +++ b/drivers/gpu/drm/lima/lima_sched.c > > > > @@ -463,7 +463,7 @@ static enum drm_gpu_sched_stat > > > > lima_sched_timedout_job(struct drm_sched_job *job > > > > lima_pm_idle(ldev); > > > > > > > > drm_sched_resubmit_jobs(&pipe->base); > > > > - drm_sched_start(&pipe->base); > > > > + drm_sched_start(&pipe->base, 0); > > > > > > > > return DRM_GPU_SCHED_STAT_NOMINAL; > > > > } > > > > diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c > > > > b/drivers/gpu/drm/nouveau/nouveau_sched.c > > > > index eb6c3f9a01f5..4412f2711fb5 100644 > > > > --- a/drivers/gpu/drm/nouveau/nouveau_sched.c > > > > +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c > > > > @@ -379,7 +379,7 @@ nouveau_sched_timedout_job(struct > > > > drm_sched_job > > > > *sched_job) > > > > else > > > > NV_PRINTK(warn, job->cli, "Generic job > > > > timeout.\n"); > > > > > > > > - drm_sched_start(sched); > > > > + drm_sched_start(sched, 0); > > > > > > > > return stat; > > > > } > > > > diff --git 
a/drivers/gpu/drm/panfrost/panfrost_job.c > > > > b/drivers/gpu/drm/panfrost/panfrost_job.c > > > > index df49d37d0e7e..d140800606bf 100644 > > > > --- a/drivers/gpu/drm/panfrost/panfrost_job.c > > > > +++ b/drivers/gpu/drm/panfrost/panfrost_job.c > > > > @@ -727,7 +727,7 @@ panfrost_reset(struct panfrost_device *pfdev, > > > > > > > > /* Restart the schedulers */ > > > > for (i = 0; i < NUM_JOB_SLOTS; i++) > > > > - drm_sched_start(&pfdev->js->queue[i].sched); > > > > + drm_sched_start(&pfdev->js->queue[i].sched, 0); > > > > > > > > /* Re-enable job interrupts now that everything has been > > > > restarted. */ > > > > job_write(pfdev, JOB_INT_MASK, > > > > diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c > > > > b/drivers/gpu/drm/panthor/panthor_mmu.c > > > > index d47972806d50..e630cdf47f99 100644 > > > > --- a/drivers/gpu/drm/panthor/panthor_mmu.c > > > > +++ b/drivers/gpu/drm/panthor/panthor_mmu.c > > > > @@ -827,7 +827,7 @@ static void panthor_vm_stop(struct panthor_vm > > > > *vm) > > > > > > > > static void panthor_vm_start(struct panthor_vm *vm) > > > > { > > > > - drm_sched_start(&vm->sched); > > > > + drm_sched_start(&vm->sched, 0); > > > > } > > > > > > > > /** > > > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c > > > > b/drivers/gpu/drm/scheduler/sched_main.c > > > > index ab53ab486fe6..f093616fe53c 100644 > > > > --- a/drivers/gpu/drm/scheduler/sched_main.c > > > > +++ b/drivers/gpu/drm/scheduler/sched_main.c > > > > @@ -674,9 +674,10 @@ EXPORT_SYMBOL(drm_sched_stop); > > > > * drm_sched_start - recover jobs after a reset > > > > * > > > > * @sched: scheduler instance > > > > + * @errno: error to set on the pending fences > > > > * > > > > */ > > > > -void drm_sched_start(struct drm_gpu_scheduler *sched) > > > > +void drm_sched_start(struct drm_gpu_scheduler *sched, int errno) > > > > { > > > > struct drm_sched_job *s_job, *tmp; > > > > > > > > @@ -691,13 +692,13 @@ void drm_sched_start(struct > > > > drm_gpu_scheduler > > > > *sched) > > > > 
atomic_add(s_job->credits, &sched- > > > > >credit_count); > > > > > > > > if (!fence) { > > > > - drm_sched_job_done(s_job, -ECANCELED); > > > > + drm_sched_job_done(s_job, errno ?: - > > > > ECANCELED); > > > > continue; > > > > } > > > > > > > > if (dma_fence_add_callback(fence, &s_job->cb, > > > > > > > > drm_sched_job_done_cb)) > > > > - drm_sched_job_done(s_job, fence->error); > > > > + drm_sched_job_done(s_job, fence->error > > > > ?: > > > > errno); > > > > } > > > > > > > > drm_sched_start_timeout_unlocked(sched); > > > > diff --git a/drivers/gpu/drm/v3d/v3d_sched.c > > > > b/drivers/gpu/drm/v3d/v3d_sched.c > > > > index fd29a00b233c..b6a89171824b 100644 > > > > --- a/drivers/gpu/drm/v3d/v3d_sched.c > > > > +++ b/drivers/gpu/drm/v3d/v3d_sched.c > > > > @@ -663,7 +663,7 @@ v3d_gpu_reset_for_timeout(struct v3d_dev > > > > *v3d, > > > > struct drm_sched_job *sched_job) > > > > > > > > /* Unblock schedulers and restart their jobs. */ > > > > for (q = 0; q < V3D_MAX_QUEUES; q++) { > > > > - drm_sched_start(&v3d->queue[q].sched); > > > > + drm_sched_start(&v3d->queue[q].sched, 0); > > > > } > > > > > > > > mutex_unlock(&v3d->reset_lock); > > > > diff --git a/include/drm/gpu_scheduler.h > > > > b/include/drm/gpu_scheduler.h > > > > index fe8edb917360..a8d19b10f9b8 100644 > > > > --- a/include/drm/gpu_scheduler.h > > > > +++ b/include/drm/gpu_scheduler.h > > > > @@ -579,7 +579,7 @@ bool drm_sched_wqueue_ready(struct > > > > drm_gpu_scheduler *sched); > > > > void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched); > > > > void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched); > > > > void drm_sched_stop(struct drm_gpu_scheduler *sched, struct > > > > drm_sched_job *bad); > > > > -void drm_sched_start(struct drm_gpu_scheduler *sched); > > > > +void drm_sched_start(struct drm_gpu_scheduler *sched, int > > > > errno); > > > I personally only recently started using the scheduler and many > > > things > > > were quite confusing. 
> > > > > > In particular, I thought it's not good to call this function > > > drm_sched_start(), because that implies to new users / programmers > > > that > > > this function is supposed to start the scheduler. > > > > > > Accordingly, drm_sched_stop() sounds as if it's intended to be used > > > on > > > scheduler teardown. "Alright, I'll stop the scheduler with > > > drm_sched_stop(), then can safely call drm_sched_entity_destroy() > > > and > > > then tear it down completely through drm_sched_fini()". > > > > > > Now the comments make it obvious that those start and stop > > > functions > > > are more intended for error recovery. > > > > > > So my stance is that start should be called, e.g., > > > drm_sched_restart() > > > or perhaps drm_sched_recover_start(). > > > > > > So since you're already touching all the lines where the function > > > is > > > used, this might be a good opportunity to improve the name, too. > > > > > > If that's deemed desirable. > > > > Yeah completely agree. > > > > We also had people incorrectly thinking that they should call > > drm_sched_stop/start on suspend/resume resulting in a system which > > didn't come up again after resume. > > > > How about we call it drm_sched_suspend_before_reset() and > > drm_sched_resume_after_reset()? > > Yes, that sounds bullet-proof to me :) > > One might also consider drm_sched_start()'s function summary "recover > jobs after a reset". Maybe have a sentence about what "recover" means > would help there, too. Yeah, with that theme drm_sched_reset_prepare/recover might also be a good function pair. suspend/resume could be mixed up with runtime or system suspend/resume, leading with reset feels better to me. -Sima > > > Regards, > P. > > > > > > Thanks, > > Christian. > > > > > > > > P. > > > > > > > > > > void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched); > > > > void drm_sched_increase_karma(struct drm_sched_job *bad); > > > > void drm_sched_reset_karma(struct drm_sched_job *bad); > > >
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c index 2320df51c914..18135d8235f9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c @@ -300,7 +300,7 @@ static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool sus if (r) goto out; } else { - drm_sched_start(&ring->sched); + drm_sched_start(&ring->sched, 0); } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 1cd7d355689c..5891312e44ea 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5879,7 +5879,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, if (!amdgpu_ring_sched_ready(ring)) continue; - drm_sched_start(&ring->sched); + drm_sched_start(&ring->sched, 0); } if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) @@ -6374,7 +6374,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev) if (!amdgpu_ring_sched_ready(ring)) continue; - drm_sched_start(&ring->sched); + drm_sched_start(&ring->sched, 0); } amdgpu_device_unset_mp1_state(adev); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c index ab9ca4824b62..23ced5896c7c 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c @@ -72,7 +72,7 @@ static enum drm_gpu_sched_stat etnaviv_sched_timedout_job(struct drm_sched_job drm_sched_resubmit_jobs(&gpu->sched); - drm_sched_start(&gpu->sched); + drm_sched_start(&gpu->sched, 0); return DRM_GPU_SCHED_STAT_NOMINAL; out_no_timeout: diff --git a/drivers/gpu/drm/imagination/pvr_queue.c b/drivers/gpu/drm/imagination/pvr_queue.c index 20cb46012082..c4f08432882b 100644 --- a/drivers/gpu/drm/imagination/pvr_queue.c +++ b/drivers/gpu/drm/imagination/pvr_queue.c @@ -782,7 +782,7 @@ static void pvr_queue_start(struct pvr_queue *queue) } } - 
drm_sched_start(&queue->scheduler); + drm_sched_start(&queue->scheduler, 0); } /** @@ -842,7 +842,7 @@ pvr_queue_timedout_job(struct drm_sched_job *s_job) } mutex_unlock(&pvr_dev->queues.lock); - drm_sched_start(sched); + drm_sched_start(sched, 0); return DRM_GPU_SCHED_STAT_NOMINAL; } diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c index 1a944edb6ddc..b40c90e97d7e 100644 --- a/drivers/gpu/drm/lima/lima_sched.c +++ b/drivers/gpu/drm/lima/lima_sched.c @@ -463,7 +463,7 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job lima_pm_idle(ldev); drm_sched_resubmit_jobs(&pipe->base); - drm_sched_start(&pipe->base); + drm_sched_start(&pipe->base, 0); return DRM_GPU_SCHED_STAT_NOMINAL; } diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c index eb6c3f9a01f5..4412f2711fb5 100644 --- a/drivers/gpu/drm/nouveau/nouveau_sched.c +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c @@ -379,7 +379,7 @@ nouveau_sched_timedout_job(struct drm_sched_job *sched_job) else NV_PRINTK(warn, job->cli, "Generic job timeout.\n"); - drm_sched_start(sched); + drm_sched_start(sched, 0); return stat; } diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index df49d37d0e7e..d140800606bf 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -727,7 +727,7 @@ panfrost_reset(struct panfrost_device *pfdev, /* Restart the schedulers */ for (i = 0; i < NUM_JOB_SLOTS; i++) - drm_sched_start(&pfdev->js->queue[i].sched); + drm_sched_start(&pfdev->js->queue[i].sched, 0); /* Re-enable job interrupts now that everything has been restarted. 
*/ job_write(pfdev, JOB_INT_MASK, diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c index d47972806d50..e630cdf47f99 100644 --- a/drivers/gpu/drm/panthor/panthor_mmu.c +++ b/drivers/gpu/drm/panthor/panthor_mmu.c @@ -827,7 +827,7 @@ static void panthor_vm_stop(struct panthor_vm *vm) static void panthor_vm_start(struct panthor_vm *vm) { - drm_sched_start(&vm->sched); + drm_sched_start(&vm->sched, 0); } /** diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index ab53ab486fe6..f093616fe53c 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -674,9 +674,10 @@ EXPORT_SYMBOL(drm_sched_stop); * drm_sched_start - recover jobs after a reset * * @sched: scheduler instance + * @errno: error to set on the pending fences * */ -void drm_sched_start(struct drm_gpu_scheduler *sched) +void drm_sched_start(struct drm_gpu_scheduler *sched, int errno) { struct drm_sched_job *s_job, *tmp; @@ -691,13 +692,13 @@ void drm_sched_start(struct drm_gpu_scheduler *sched) atomic_add(s_job->credits, &sched->credit_count); if (!fence) { - drm_sched_job_done(s_job, -ECANCELED); + drm_sched_job_done(s_job, errno ?: -ECANCELED); continue; } if (dma_fence_add_callback(fence, &s_job->cb, drm_sched_job_done_cb)) - drm_sched_job_done(s_job, fence->error); + drm_sched_job_done(s_job, fence->error ?: errno); } drm_sched_start_timeout_unlocked(sched); diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index fd29a00b233c..b6a89171824b 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -663,7 +663,7 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job) /* Unblock schedulers and restart their jobs. 
*/ for (q = 0; q < V3D_MAX_QUEUES; q++) { - drm_sched_start(&v3d->queue[q].sched); + drm_sched_start(&v3d->queue[q].sched, 0); } mutex_unlock(&v3d->reset_lock); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index fe8edb917360..a8d19b10f9b8 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -579,7 +579,7 @@ bool drm_sched_wqueue_ready(struct drm_gpu_scheduler *sched); void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched); void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched); void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad); -void drm_sched_start(struct drm_gpu_scheduler *sched); +void drm_sched_start(struct drm_gpu_scheduler *sched, int errno); void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched); void drm_sched_increase_karma(struct drm_sched_job *bad); void drm_sched_reset_karma(struct drm_sched_job *bad);