Message ID | 20240722083816.99685-1-christian.koenig@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | drm/scheduler: remove full_recover from drm_sched_start | expand |
On Mon, Jul 22, 2024 at 10:38:16AM +0200, Christian König wrote: > This was basically just another one of amdgpus hacks. The parameter > allowed to restart the scheduler without turning fence signaling on > again. > > That this is absolutely not a good idea should be obvious by now since > the fences will then just sit there and never signal. > Yep, this obvious upon looking and it can't be a good idea to never signal fences. > While at it cleanup the code a bit. > Patch LGTM, also made be realize in Xe we don't use the drm_sched_stop and drm_sched_start and probably should as TDR isn't restarted after we do a device reset if scheduler / entity didn't cause the reset. Also I think we have a bug in drm_sched_job_begin too wrt to restarting the TDR even an existing job is running, will follow up on that too. Anyways: Reviewed-by: Matthew Brost <matthew.brost@intel.com> > Signed-off-by: Christian König <christian.koenig@amd.com> > --- > .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 2 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +-- > drivers/gpu/drm/etnaviv/etnaviv_sched.c | 4 +-- > drivers/gpu/drm/imagination/pvr_queue.c | 4 +-- > drivers/gpu/drm/lima/lima_sched.c | 2 +- > drivers/gpu/drm/nouveau/nouveau_sched.c | 2 +- > drivers/gpu/drm/panfrost/panfrost_job.c | 2 +- > drivers/gpu/drm/panthor/panthor_mmu.c | 2 +- > drivers/gpu/drm/panthor/panthor_sched.c | 2 +- > drivers/gpu/drm/scheduler/sched_main.c | 25 ++++++------------- > drivers/gpu/drm/v3d/v3d_sched.c | 2 +- > include/drm/gpu_scheduler.h | 2 +- > 12 files changed, 22 insertions(+), 31 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > index 3a3f3ce09f00..2320df51c914 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > @@ -300,7 +300,7 @@ static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool sus > if (r) > goto out; > } else { > - drm_sched_start(&ring->sched, false); > + drm_sched_start(&ring->sched); > } > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 861ccff78af9..c186fdb198ad 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -5862,7 +5862,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, > if (!amdgpu_ring_sched_ready(ring)) > continue; > > - drm_sched_start(&ring->sched, true); > + drm_sched_start(&ring->sched); > } > > if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) > @@ -6360,7 +6360,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev) > if (!amdgpu_ring_sched_ready(ring)) > continue; > > - drm_sched_start(&ring->sched, true); > + drm_sched_start(&ring->sched); > } > > amdgpu_device_unset_mp1_state(adev); > diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c > index c4b04b0dee16..c53641aa146f 100644 > --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c > +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c > @@ -72,12 +72,12 @@ static enum drm_gpu_sched_stat etnaviv_sched_timedout_job(struct drm_sched_job > > drm_sched_resubmit_jobs(&gpu->sched); > > - drm_sched_start(&gpu->sched, true); > + drm_sched_start(&gpu->sched); > return DRM_GPU_SCHED_STAT_NOMINAL; > > out_no_timeout: > /* restart scheduler after GPU is usable again */ > - drm_sched_start(&gpu->sched, true); > + drm_sched_start(&gpu->sched); > return DRM_GPU_SCHED_STAT_NOMINAL; > } > > diff --git a/drivers/gpu/drm/imagination/pvr_queue.c b/drivers/gpu/drm/imagination/pvr_queue.c > index 5ed9c98fb599..20cb46012082 100644 > --- a/drivers/gpu/drm/imagination/pvr_queue.c > +++ b/drivers/gpu/drm/imagination/pvr_queue.c > @@ -782,7 +782,7 @@ static void pvr_queue_start(struct pvr_queue *queue) > } > } > > - drm_sched_start(&queue->scheduler, true); > + drm_sched_start(&queue->scheduler); > } > > /** > @@ -842,7 +842,7 @@ pvr_queue_timedout_job(struct drm_sched_job *s_job) > } > mutex_unlock(&pvr_dev->queues.lock); > > - drm_sched_start(sched, true); > + drm_sched_start(sched); > > return DRM_GPU_SCHED_STAT_NOMINAL; > } > diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c > index bbf3f8feab94..1a944edb6ddc 100644 > --- a/drivers/gpu/drm/lima/lima_sched.c > +++ b/drivers/gpu/drm/lima/lima_sched.c > @@ -463,7 +463,7 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job > lima_pm_idle(ldev); > > drm_sched_resubmit_jobs(&pipe->base); > - drm_sched_start(&pipe->base, true); > + drm_sched_start(&pipe->base); > > return DRM_GPU_SCHED_STAT_NOMINAL; > } > diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c > index ba4139288a6d..eb6c3f9a01f5 100644 > --- a/drivers/gpu/drm/nouveau/nouveau_sched.c > +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c > @@ -379,7 +379,7 @@ nouveau_sched_timedout_job(struct drm_sched_job *sched_job) > else > NV_PRINTK(warn, job->cli, "Generic job timeout.\n"); > > - drm_sched_start(sched, true); > + drm_sched_start(sched); > > return stat; > } > diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c > index a61ef0af9a4e..df49d37d0e7e 100644 > --- a/drivers/gpu/drm/panfrost/panfrost_job.c > +++ b/drivers/gpu/drm/panfrost/panfrost_job.c > @@ -727,7 +727,7 @@ panfrost_reset(struct panfrost_device *pfdev, > > /* Restart the schedulers */ > for (i = 0; i < NUM_JOB_SLOTS; i++) > - drm_sched_start(&pfdev->js->queue[i].sched, true); > + drm_sched_start(&pfdev->js->queue[i].sched); > > /* Re-enable job interrupts now that everything has been restarted. */ > job_write(pfdev, JOB_INT_MASK, > diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c > index fa0a002b1016..d47972806d50 100644 > --- a/drivers/gpu/drm/panthor/panthor_mmu.c > +++ b/drivers/gpu/drm/panthor/panthor_mmu.c > @@ -827,7 +827,7 @@ static void panthor_vm_stop(struct panthor_vm *vm) > > static void panthor_vm_start(struct panthor_vm *vm) > { > - drm_sched_start(&vm->sched, true); > + drm_sched_start(&vm->sched); > } > > /** > diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c > index 42929e147107..2e1becd87e3a 100644 > --- a/drivers/gpu/drm/panthor/panthor_sched.c > +++ b/drivers/gpu/drm/panthor/panthor_sched.c > @@ -2525,7 +2525,7 @@ static void queue_start(struct panthor_queue *queue) > list_for_each_entry(job, &queue->scheduler.pending_list, base.list) > job->base.s_fence->parent = dma_fence_get(job->done_fence); > > - drm_sched_start(&queue->scheduler, true); > + drm_sched_start(&queue->scheduler); > } > > static void panthor_group_stop(struct panthor_group *group) > diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c > index 7e90c9f95611..ab53ab486fe6 100644 > --- a/drivers/gpu/drm/scheduler/sched_main.c > +++ b/drivers/gpu/drm/scheduler/sched_main.c > @@ -674,13 +674,11 @@ EXPORT_SYMBOL(drm_sched_stop); > * drm_sched_start - recover jobs after a reset > * > * @sched: scheduler instance > - * @full_recovery: proceed with complete sched restart > * > */ > -void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) > +void drm_sched_start(struct drm_gpu_scheduler *sched) > { > struct drm_sched_job *s_job, *tmp; > - int r; > > /* > * Locking the list is not required here as the sched thread is parked > @@ -692,24 +690,17 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) > > atomic_add(s_job->credits, &sched->credit_count); > > - if (!full_recovery) > + if (!fence) { > + drm_sched_job_done(s_job, -ECANCELED); > continue; > + } > > - if (fence) { > - r = dma_fence_add_callback(fence, &s_job->cb, > - drm_sched_job_done_cb); > - if (r == -ENOENT) > - drm_sched_job_done(s_job, fence->error); > - else if (r) > - DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", > - r); > - } else > - drm_sched_job_done(s_job, -ECANCELED); > + if (dma_fence_add_callback(fence, &s_job->cb, > + drm_sched_job_done_cb)) > + drm_sched_job_done(s_job, fence->error); > } > > - if (full_recovery) > - drm_sched_start_timeout_unlocked(sched); > - > + drm_sched_start_timeout_unlocked(sched); > drm_sched_wqueue_start(sched); > } > EXPORT_SYMBOL(drm_sched_start); > diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c > index d193072703f3..42d4f4a2dba2 100644 > --- a/drivers/gpu/drm/v3d/v3d_sched.c > +++ b/drivers/gpu/drm/v3d/v3d_sched.c > @@ -653,7 +653,7 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job) > > /* Unblock schedulers and restart their jobs. */ > for (q = 0; q < V3D_MAX_QUEUES; q++) { > - drm_sched_start(&v3d->queue[q].sched, true); > + drm_sched_start(&v3d->queue[q].sched); > } > > mutex_unlock(&v3d->reset_lock); > diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h > index 5acc64954a88..fe8edb917360 100644 > --- a/include/drm/gpu_scheduler.h > +++ b/include/drm/gpu_scheduler.h > @@ -579,7 +579,7 @@ bool drm_sched_wqueue_ready(struct drm_gpu_scheduler *sched); > void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched); > void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched); > void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad); > -void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery); > +void drm_sched_start(struct drm_gpu_scheduler *sched); > void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched); > void drm_sched_increase_karma(struct drm_sched_job *bad); > void drm_sched_reset_karma(struct drm_sched_job *bad); > -- > 2.34.1 >
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c index 3a3f3ce09f00..2320df51c914 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c @@ -300,7 +300,7 @@ static int suspend_resume_compute_scheduler(struct amdgpu_device *adev, bool sus if (r) goto out; } else { - drm_sched_start(&ring->sched, false); + drm_sched_start(&ring->sched); } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 861ccff78af9..c186fdb198ad 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5862,7 +5862,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, if (!amdgpu_ring_sched_ready(ring)) continue; - drm_sched_start(&ring->sched, true); + drm_sched_start(&ring->sched); } if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) @@ -6360,7 +6360,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev) if (!amdgpu_ring_sched_ready(ring)) continue; - drm_sched_start(&ring->sched, true); + drm_sched_start(&ring->sched); } amdgpu_device_unset_mp1_state(adev); diff --git a/drivers/gpu/drm/etnaviv/etnaviv_sched.c b/drivers/gpu/drm/etnaviv/etnaviv_sched.c index c4b04b0dee16..c53641aa146f 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_sched.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_sched.c @@ -72,12 +72,12 @@ static enum drm_gpu_sched_stat etnaviv_sched_timedout_job(struct drm_sched_job drm_sched_resubmit_jobs(&gpu->sched); - drm_sched_start(&gpu->sched, true); + drm_sched_start(&gpu->sched); return DRM_GPU_SCHED_STAT_NOMINAL; out_no_timeout: /* restart scheduler after GPU is usable again */ - drm_sched_start(&gpu->sched, true); + drm_sched_start(&gpu->sched); return DRM_GPU_SCHED_STAT_NOMINAL; } diff --git a/drivers/gpu/drm/imagination/pvr_queue.c b/drivers/gpu/drm/imagination/pvr_queue.c index 5ed9c98fb599..20cb46012082 100644 --- a/drivers/gpu/drm/imagination/pvr_queue.c +++ b/drivers/gpu/drm/imagination/pvr_queue.c @@ -782,7 +782,7 @@ static void pvr_queue_start(struct pvr_queue *queue) } } - drm_sched_start(&queue->scheduler, true); + drm_sched_start(&queue->scheduler); } /** @@ -842,7 +842,7 @@ pvr_queue_timedout_job(struct drm_sched_job *s_job) } mutex_unlock(&pvr_dev->queues.lock); - drm_sched_start(sched, true); + drm_sched_start(sched); return DRM_GPU_SCHED_STAT_NOMINAL; } diff --git a/drivers/gpu/drm/lima/lima_sched.c b/drivers/gpu/drm/lima/lima_sched.c index bbf3f8feab94..1a944edb6ddc 100644 --- a/drivers/gpu/drm/lima/lima_sched.c +++ b/drivers/gpu/drm/lima/lima_sched.c @@ -463,7 +463,7 @@ static enum drm_gpu_sched_stat lima_sched_timedout_job(struct drm_sched_job *job lima_pm_idle(ldev); drm_sched_resubmit_jobs(&pipe->base); - drm_sched_start(&pipe->base, true); + drm_sched_start(&pipe->base); return DRM_GPU_SCHED_STAT_NOMINAL; } diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c index ba4139288a6d..eb6c3f9a01f5 100644 --- a/drivers/gpu/drm/nouveau/nouveau_sched.c +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c @@ -379,7 +379,7 @@ nouveau_sched_timedout_job(struct drm_sched_job *sched_job) else NV_PRINTK(warn, job->cli, "Generic job timeout.\n"); - drm_sched_start(sched, true); + drm_sched_start(sched); return stat; } diff --git a/drivers/gpu/drm/panfrost/panfrost_job.c b/drivers/gpu/drm/panfrost/panfrost_job.c index a61ef0af9a4e..df49d37d0e7e 100644 --- a/drivers/gpu/drm/panfrost/panfrost_job.c +++ b/drivers/gpu/drm/panfrost/panfrost_job.c @@ -727,7 +727,7 @@ panfrost_reset(struct panfrost_device *pfdev, /* Restart the schedulers */ for (i = 0; i < NUM_JOB_SLOTS; i++) - drm_sched_start(&pfdev->js->queue[i].sched, true); + drm_sched_start(&pfdev->js->queue[i].sched); /* Re-enable job interrupts now that everything has been restarted. */ job_write(pfdev, JOB_INT_MASK, diff --git a/drivers/gpu/drm/panthor/panthor_mmu.c b/drivers/gpu/drm/panthor/panthor_mmu.c index fa0a002b1016..d47972806d50 100644 --- a/drivers/gpu/drm/panthor/panthor_mmu.c +++ b/drivers/gpu/drm/panthor/panthor_mmu.c @@ -827,7 +827,7 @@ static void panthor_vm_stop(struct panthor_vm *vm) static void panthor_vm_start(struct panthor_vm *vm) { - drm_sched_start(&vm->sched, true); + drm_sched_start(&vm->sched); } /** diff --git a/drivers/gpu/drm/panthor/panthor_sched.c b/drivers/gpu/drm/panthor/panthor_sched.c index 42929e147107..2e1becd87e3a 100644 --- a/drivers/gpu/drm/panthor/panthor_sched.c +++ b/drivers/gpu/drm/panthor/panthor_sched.c @@ -2525,7 +2525,7 @@ static void queue_start(struct panthor_queue *queue) list_for_each_entry(job, &queue->scheduler.pending_list, base.list) job->base.s_fence->parent = dma_fence_get(job->done_fence); - drm_sched_start(&queue->scheduler, true); + drm_sched_start(&queue->scheduler); } static void panthor_group_stop(struct panthor_group *group) diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index 7e90c9f95611..ab53ab486fe6 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -674,13 +674,11 @@ EXPORT_SYMBOL(drm_sched_stop); * drm_sched_start - recover jobs after a reset * * @sched: scheduler instance - * @full_recovery: proceed with complete sched restart * */ -void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) +void drm_sched_start(struct drm_gpu_scheduler *sched) { struct drm_sched_job *s_job, *tmp; - int r; /* * Locking the list is not required here as the sched thread is parked @@ -692,24 +690,17 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery) atomic_add(s_job->credits, &sched->credit_count); - if (!full_recovery) + if (!fence) { + drm_sched_job_done(s_job, -ECANCELED); continue; + } - if (fence) { - r = dma_fence_add_callback(fence, &s_job->cb, - drm_sched_job_done_cb); - if (r == -ENOENT) - drm_sched_job_done(s_job, fence->error); - else if (r) - DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n", - r); - } else - drm_sched_job_done(s_job, -ECANCELED); + if (dma_fence_add_callback(fence, &s_job->cb, + drm_sched_job_done_cb)) + drm_sched_job_done(s_job, fence->error); } - if (full_recovery) - drm_sched_start_timeout_unlocked(sched); - + drm_sched_start_timeout_unlocked(sched); drm_sched_wqueue_start(sched); } EXPORT_SYMBOL(drm_sched_start); diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index d193072703f3..42d4f4a2dba2 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -653,7 +653,7 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job) /* Unblock schedulers and restart their jobs. */ for (q = 0; q < V3D_MAX_QUEUES; q++) { - drm_sched_start(&v3d->queue[q].sched, true); + drm_sched_start(&v3d->queue[q].sched); } mutex_unlock(&v3d->reset_lock); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index 5acc64954a88..fe8edb917360 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -579,7 +579,7 @@ bool drm_sched_wqueue_ready(struct drm_gpu_scheduler *sched); void drm_sched_wqueue_stop(struct drm_gpu_scheduler *sched); void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched); void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad); -void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery); +void drm_sched_start(struct drm_gpu_scheduler *sched); void drm_sched_resubmit_jobs(struct drm_gpu_scheduler *sched); void drm_sched_increase_karma(struct drm_sched_job *bad); void drm_sched_reset_karma(struct drm_sched_job *bad);
This was basically just another one of amdgpus hacks. The parameter allowed to restart the scheduler without turning fence signaling on again. That this is absolutely not a good idea should be obvious by now since the fences will then just sit there and never signal. While at it cleanup the code a bit. Signed-off-by: Christian König <christian.koenig@amd.com> --- .../drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 +-- drivers/gpu/drm/etnaviv/etnaviv_sched.c | 4 +-- drivers/gpu/drm/imagination/pvr_queue.c | 4 +-- drivers/gpu/drm/lima/lima_sched.c | 2 +- drivers/gpu/drm/nouveau/nouveau_sched.c | 2 +- drivers/gpu/drm/panfrost/panfrost_job.c | 2 +- drivers/gpu/drm/panthor/panthor_mmu.c | 2 +- drivers/gpu/drm/panthor/panthor_sched.c | 2 +- drivers/gpu/drm/scheduler/sched_main.c | 25 ++++++------------- drivers/gpu/drm/v3d/v3d_sched.c | 2 +- include/drm/gpu_scheduler.h | 2 +- 12 files changed, 22 insertions(+), 31 deletions(-)