Message ID | 20221014084641.128280-11-christian.koenig@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [01/13] drm/scheduler: fix fence ref counting | expand |
Title: "dependencies" Regards, Luben On 2022-10-14 04:46, Christian König wrote: > Entirely remove the sync obj in the job. > > Signed-off-by: Christian König <christian.koenig@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 21 ++++++++++----------- > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h | 2 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 9 +-------- > drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 1 - > 4 files changed, 13 insertions(+), 20 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > index d45b86bcf7fa..0528c2b1db6e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > @@ -426,7 +426,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p, > dma_fence_put(old); > } > > - r = amdgpu_sync_fence(&p->gang_leader->sync, fence); > + r = amdgpu_sync_fence(&p->sync, fence); > dma_fence_put(fence); > if (r) > return r; > @@ -448,7 +448,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p, > return r; > } > > - r = amdgpu_sync_fence(&p->gang_leader->sync, fence); > + r = amdgpu_sync_fence(&p->sync, fence); > if (r) > goto error; > > @@ -1108,7 +1108,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > if (r) > return r; > > - r = amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update); > + r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update); > if (r) > return r; > > @@ -1119,7 +1119,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > if (r) > return r; > > - r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update); > + r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update); > if (r) > return r; > } > @@ -1138,7 +1138,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > if (r) > return r; > > - r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update); > + r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update); > if (r) > return r; > } > @@ -1151,7 +1151,7 @@ 
static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > if (r) > return r; > > - r = amdgpu_sync_fence(&job->sync, vm->last_update); > + r = amdgpu_sync_fence(&p->sync, vm->last_update); > if (r) > return r; > > @@ -1183,7 +1183,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) > { > struct amdgpu_fpriv *fpriv = p->filp->driver_priv; > - struct amdgpu_job *leader = p->gang_leader; > struct amdgpu_bo_list_entry *e; > unsigned int i; > int r; > @@ -1195,14 +1194,14 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) > > sync_mode = amdgpu_bo_explicit_sync(bo) ? > AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER; > - r = amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode, > + r = amdgpu_sync_resv(p->adev, &p->sync, resv, sync_mode, > &fpriv->vm); > if (r) > return r; > } > > - for (i = 0; i < p->gang_size - 1; ++i) { > - r = amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync); > + for (i = 0; i < p->gang_size; ++i) { > + r = amdgpu_sync_push_to_job(&p->sync, p->jobs[i]); > if (r) > return r; > } > @@ -1248,7 +1247,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, > struct dma_fence *fence; > > fence = &p->jobs[i]->base.s_fence->scheduled; > - r = amdgpu_sync_fence(&leader->sync, fence); > + r = drm_sched_job_add_dependency(&leader->base, fence); > if (r) > goto error_cleanup; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h > index cbaa19b2b8a3..207e801c24ed 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h > @@ -75,6 +75,8 @@ struct amdgpu_cs_parser { > > unsigned num_post_deps; > struct amdgpu_cs_post_dep *post_deps; > + > + struct amdgpu_sync sync; > }; > > int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > index ba98d65835b4..b8494c3b3b8a 100644 > --- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > @@ -106,7 +106,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm, > (*job)->base.sched = &adev->rings[0]->sched; > (*job)->vm = vm; > > - amdgpu_sync_create(&(*job)->sync); > amdgpu_sync_create(&(*job)->explicit_sync); > (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter); > (*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET; > @@ -174,9 +173,7 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job) > > drm_sched_job_cleanup(s_job); > > - amdgpu_sync_free(&job->sync); > amdgpu_sync_free(&job->explicit_sync); > - > dma_fence_put(&job->hw_fence); > } > > @@ -202,7 +199,6 @@ void amdgpu_job_free(struct amdgpu_job *job) > drm_sched_job_cleanup(&job->base); > > amdgpu_job_free_resources(job); > - amdgpu_sync_free(&job->sync); > amdgpu_sync_free(&job->explicit_sync); > if (job->gang_submit != &job->base.s_fence->scheduled) > dma_fence_put(job->gang_submit); > @@ -246,10 +242,9 @@ amdgpu_job_dependency(struct drm_sched_job *sched_job, > { > struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched); > struct amdgpu_job *job = to_amdgpu_job(sched_job); > - struct dma_fence *fence; > + struct dma_fence *fence = NULL; > int r; > > - fence = amdgpu_sync_get_fence(&job->sync); > while (fence == NULL && job->vm && !job->vmid) { > r = amdgpu_vmid_grab(job->vm, ring, job, &fence); > if (r) > @@ -273,8 +268,6 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) > job = to_amdgpu_job(sched_job); > finished = &job->base.s_fence->finished; > > - BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); > - > trace_amdgpu_sched_run_job(job); > > /* Skip job if VRAM is lost and never resubmit gangs */ > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > index 9c10b9bd0084..6558839fda03 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > @@ 
-47,7 +47,6 @@ enum amdgpu_ib_pool_type; > struct amdgpu_job { > struct drm_sched_job base; > struct amdgpu_vm *vm; > - struct amdgpu_sync sync; > struct amdgpu_sync explicit_sync; > struct dma_fence hw_fence; > struct dma_fence *gang_submit;
On Fri, 14 Oct 2022 at 09:47, Christian König <ckoenig.leichtzumerken@gmail.com> wrote: > > Entirely remove the sync obj in the job. > > Signed-off-by: Christian König <christian.koenig@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 21 ++++++++++----------- > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h | 2 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 9 +-------- > drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 1 - > 4 files changed, 13 insertions(+), 20 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > index d45b86bcf7fa..0528c2b1db6e 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > @@ -426,7 +426,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p, > dma_fence_put(old); > } > > - r = amdgpu_sync_fence(&p->gang_leader->sync, fence); > + r = amdgpu_sync_fence(&p->sync, fence); > dma_fence_put(fence); > if (r) > return r; > @@ -448,7 +448,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p, > return r; > } > > - r = amdgpu_sync_fence(&p->gang_leader->sync, fence); > + r = amdgpu_sync_fence(&p->sync, fence); > if (r) > goto error; > > @@ -1108,7 +1108,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > if (r) > return r; > > - r = amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update); > + r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update); > if (r) > return r; > > @@ -1119,7 +1119,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > if (r) > return r; > > - r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update); > + r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update); > if (r) > return r; > } > @@ -1138,7 +1138,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > if (r) > return r; > > - r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update); > + r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update); > if (r) > return r; > } > @@ -1151,7 
+1151,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > if (r) > return r; > > - r = amdgpu_sync_fence(&job->sync, vm->last_update); > + r = amdgpu_sync_fence(&p->sync, vm->last_update); > if (r) > return r; > > @@ -1183,7 +1183,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) > { > struct amdgpu_fpriv *fpriv = p->filp->driver_priv; > - struct amdgpu_job *leader = p->gang_leader; > struct amdgpu_bo_list_entry *e; > unsigned int i; > int r; > @@ -1195,14 +1194,14 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) > > sync_mode = amdgpu_bo_explicit_sync(bo) ? > AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER; > - r = amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode, > + r = amdgpu_sync_resv(p->adev, &p->sync, resv, sync_mode, > &fpriv->vm); > if (r) > return r; > } > > - for (i = 0; i < p->gang_size - 1; ++i) { > - r = amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync); > + for (i = 0; i < p->gang_size; ++i) { > + r = amdgpu_sync_push_to_job(&p->sync, p->jobs[i]); > if (r) > return r; > } > @@ -1248,7 +1247,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, > struct dma_fence *fence; > > fence = &p->jobs[i]->base.s_fence->scheduled; > - r = amdgpu_sync_fence(&leader->sync, fence); > + r = drm_sched_job_add_dependency(&leader->base, fence); > if (r) > goto error_cleanup; > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h > index cbaa19b2b8a3..207e801c24ed 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h > @@ -75,6 +75,8 @@ struct amdgpu_cs_parser { > > unsigned num_post_deps; > struct amdgpu_cs_post_dep *post_deps; > + > + struct amdgpu_sync sync; > }; > > int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > index ba98d65835b4..b8494c3b3b8a 
100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > @@ -106,7 +106,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm, > (*job)->base.sched = &adev->rings[0]->sched; > (*job)->vm = vm; > > - amdgpu_sync_create(&(*job)->sync); > amdgpu_sync_create(&(*job)->explicit_sync); > (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter); > (*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET; > @@ -174,9 +173,7 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job) > > drm_sched_job_cleanup(s_job); > > - amdgpu_sync_free(&job->sync); > amdgpu_sync_free(&job->explicit_sync); > - > dma_fence_put(&job->hw_fence); > } > > @@ -202,7 +199,6 @@ void amdgpu_job_free(struct amdgpu_job *job) > drm_sched_job_cleanup(&job->base); > > amdgpu_job_free_resources(job); > - amdgpu_sync_free(&job->sync); > amdgpu_sync_free(&job->explicit_sync); > if (job->gang_submit != &job->base.s_fence->scheduled) > dma_fence_put(job->gang_submit); > @@ -246,10 +242,9 @@ amdgpu_job_dependency(struct drm_sched_job *sched_job, > { > struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched); > struct amdgpu_job *job = to_amdgpu_job(sched_job); > - struct dma_fence *fence; > + struct dma_fence *fence = NULL; > int r; > > - fence = amdgpu_sync_get_fence(&job->sync); > while (fence == NULL && job->vm && !job->vmid) { > r = amdgpu_vmid_grab(job->vm, ring, job, &fence); > if (r) > @@ -273,8 +268,6 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) > job = to_amdgpu_job(sched_job); > finished = &job->base.s_fence->finished; > > - BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); > - > trace_amdgpu_sched_run_job(job); > > /* Skip job if VRAM is lost and never resubmit gangs */ > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > index 9c10b9bd0084..6558839fda03 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > +++ 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > @@ -47,7 +47,6 @@ enum amdgpu_ib_pool_type; > struct amdgpu_job { > struct drm_sched_job base; > struct amdgpu_vm *vm; > - struct amdgpu_sync sync; > struct amdgpu_sync explicit_sync; > struct dma_fence hw_fence; > struct dma_fence *gang_submit; > -- > 2.25.1 > Hi, I've been testing the Mesh shader benchmark in GravityMark and I've bisected my laptop freezing up and rebooting, to this commit 1728baa7e4e60054bf13dd9b1212d133cbd53b3f is the first bad commit commit 1728baa7e4e60054bf13dd9b1212d133cbd53b3f Author: Christian König <christian.koenig@amd.com> Date: Thu Sep 29 14:04:01 2022 +0200 drm/amdgpu: use scheduler dependencies for CS Entirely remove the sync obj in the job. Signed-off-by: Christian König <christian.koenig@amd.com> Reviewed-by: Luben Tuikov <luben.tuikov@amd.com> Link: https://patchwork.freedesktop.org/patch/msgid/20221014084641.128280-11-christian.koenig@amd.com drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 21 ++++++++++----------- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 9 +-------- drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 1 - 4 files changed, 13 insertions(+), 20 deletions(-) This is on a prime system 6800M with the latest mesa I tried reverting this patch however it didn't revert cleanly, and my attempt doesn't work and only partially freezes up the system Would you like me to open a bug for this on https://gitlab.freedesktop.org/drm/amd/-/issues ? Cheers Mike
https://gitlab.freedesktop.org/drm/amd/-/issues/2309 On Wed, 21 Dec 2022 at 15:34, Mike Lothian <mike@fireburn.co.uk> wrote: > > On Fri, 14 Oct 2022 at 09:47, Christian König > <ckoenig.leichtzumerken@gmail.com> wrote: > > > > Entirely remove the sync obj in the job. > > > > Signed-off-by: Christian König <christian.koenig@amd.com> > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 21 ++++++++++----------- > > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h | 2 ++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 9 +-------- > > drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 1 - > > 4 files changed, 13 insertions(+), 20 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > > index d45b86bcf7fa..0528c2b1db6e 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > > @@ -426,7 +426,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p, > > dma_fence_put(old); > > } > > > > - r = amdgpu_sync_fence(&p->gang_leader->sync, fence); > > + r = amdgpu_sync_fence(&p->sync, fence); > > dma_fence_put(fence); > > if (r) > > return r; > > @@ -448,7 +448,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p, > > return r; > > } > > > > - r = amdgpu_sync_fence(&p->gang_leader->sync, fence); > > + r = amdgpu_sync_fence(&p->sync, fence); > > if (r) > > goto error; > > > > @@ -1108,7 +1108,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > > if (r) > > return r; > > > > - r = amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update); > > + r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update); > > if (r) > > return r; > > > > @@ -1119,7 +1119,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > > if (r) > > return r; > > > > - r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update); > > + r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update); > > if (r) > > return r; > > } > > @@ -1138,7 +1138,7 @@ static int 
amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > > if (r) > > return r; > > > > - r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update); > > + r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update); > > if (r) > > return r; > > } > > @@ -1151,7 +1151,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > > if (r) > > return r; > > > > - r = amdgpu_sync_fence(&job->sync, vm->last_update); > > + r = amdgpu_sync_fence(&p->sync, vm->last_update); > > if (r) > > return r; > > > > @@ -1183,7 +1183,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > > static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) > > { > > struct amdgpu_fpriv *fpriv = p->filp->driver_priv; > > - struct amdgpu_job *leader = p->gang_leader; > > struct amdgpu_bo_list_entry *e; > > unsigned int i; > > int r; > > @@ -1195,14 +1194,14 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) > > > > sync_mode = amdgpu_bo_explicit_sync(bo) ? > > AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER; > > - r = amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode, > > + r = amdgpu_sync_resv(p->adev, &p->sync, resv, sync_mode, > > &fpriv->vm); > > if (r) > > return r; > > } > > > > - for (i = 0; i < p->gang_size - 1; ++i) { > > - r = amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync); > > + for (i = 0; i < p->gang_size; ++i) { > > + r = amdgpu_sync_push_to_job(&p->sync, p->jobs[i]); > > if (r) > > return r; > > } > > @@ -1248,7 +1247,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, > > struct dma_fence *fence; > > > > fence = &p->jobs[i]->base.s_fence->scheduled; > > - r = amdgpu_sync_fence(&leader->sync, fence); > > + r = drm_sched_job_add_dependency(&leader->base, fence); > > if (r) > > goto error_cleanup; > > } > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h > > index cbaa19b2b8a3..207e801c24ed 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h > > @@ 
-75,6 +75,8 @@ struct amdgpu_cs_parser { > > > > unsigned num_post_deps; > > struct amdgpu_cs_post_dep *post_deps; > > + > > + struct amdgpu_sync sync; > > }; > > > > int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > index ba98d65835b4..b8494c3b3b8a 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > > @@ -106,7 +106,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm, > > (*job)->base.sched = &adev->rings[0]->sched; > > (*job)->vm = vm; > > > > - amdgpu_sync_create(&(*job)->sync); > > amdgpu_sync_create(&(*job)->explicit_sync); > > (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter); > > (*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET; > > @@ -174,9 +173,7 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job) > > > > drm_sched_job_cleanup(s_job); > > > > - amdgpu_sync_free(&job->sync); > > amdgpu_sync_free(&job->explicit_sync); > > - > > dma_fence_put(&job->hw_fence); > > } > > > > @@ -202,7 +199,6 @@ void amdgpu_job_free(struct amdgpu_job *job) > > drm_sched_job_cleanup(&job->base); > > > > amdgpu_job_free_resources(job); > > - amdgpu_sync_free(&job->sync); > > amdgpu_sync_free(&job->explicit_sync); > > if (job->gang_submit != &job->base.s_fence->scheduled) > > dma_fence_put(job->gang_submit); > > @@ -246,10 +242,9 @@ amdgpu_job_dependency(struct drm_sched_job *sched_job, > > { > > struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched); > > struct amdgpu_job *job = to_amdgpu_job(sched_job); > > - struct dma_fence *fence; > > + struct dma_fence *fence = NULL; > > int r; > > > > - fence = amdgpu_sync_get_fence(&job->sync); > > while (fence == NULL && job->vm && !job->vmid) { > > r = amdgpu_vmid_grab(job->vm, ring, job, &fence); > > if (r) > > @@ -273,8 +268,6 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) > > job = 
to_amdgpu_job(sched_job); > > finished = &job->base.s_fence->finished; > > > > - BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); > > - > > trace_amdgpu_sched_run_job(job); > > > > /* Skip job if VRAM is lost and never resubmit gangs */ > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > > index 9c10b9bd0084..6558839fda03 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > > @@ -47,7 +47,6 @@ enum amdgpu_ib_pool_type; > > struct amdgpu_job { > > struct drm_sched_job base; > > struct amdgpu_vm *vm; > > - struct amdgpu_sync sync; > > struct amdgpu_sync explicit_sync; > > struct dma_fence hw_fence; > > struct dma_fence *gang_submit; > > -- > > 2.25.1 > > > > Hi, I've been testing the Mesh shader benchmark in GravityMark and > I've bisected my laptop freezing up and rebooting, to this commit > > 1728baa7e4e60054bf13dd9b1212d133cbd53b3f is the first bad commit > commit 1728baa7e4e60054bf13dd9b1212d133cbd53b3f > Author: Christian König <christian.koenig@amd.com> > Date: Thu Sep 29 14:04:01 2022 +0200 > > drm/amdgpu: use scheduler dependencies for CS > > Entirely remove the sync obj in the job. > > Signed-off-by: Christian König <christian.koenig@amd.com> > Reviewed-by: Luben Tuikov <luben.tuikov@amd.com> > Link: https://patchwork.freedesktop.org/patch/msgid/20221014084641.128280-11-christian.koenig@amd.com > > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 21 ++++++++++----------- > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h | 2 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 9 +-------- > drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 1 - > 4 files changed, 13 insertions(+), 20 deletions(-) > > This is on a prime system 6800M with the latest mesa > > I tried reverting this patch however it didn't revert cleanly, and my > attempt doesn't work and only partially freezes up the system > > Would you like me to open a bug for this on > https://gitlab.freedesktop.org/drm/amd/-/issues ? 
> > Cheers > > Mike
On 2022-12-21 10:34, Mike Lothian wrote: > On Fri, 14 Oct 2022 at 09:47, Christian König > <ckoenig.leichtzumerken@gmail.com> wrote: >> >> Entirely remove the sync obj in the job. >> >> Signed-off-by: Christian König <christian.koenig@amd.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 21 ++++++++++----------- >> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h | 2 ++ >> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 9 +-------- >> drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 1 - >> 4 files changed, 13 insertions(+), 20 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> index d45b86bcf7fa..0528c2b1db6e 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c >> @@ -426,7 +426,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p, >> dma_fence_put(old); >> } >> >> - r = amdgpu_sync_fence(&p->gang_leader->sync, fence); >> + r = amdgpu_sync_fence(&p->sync, fence); >> dma_fence_put(fence); >> if (r) >> return r; >> @@ -448,7 +448,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p, >> return r; >> } >> >> - r = amdgpu_sync_fence(&p->gang_leader->sync, fence); >> + r = amdgpu_sync_fence(&p->sync, fence); >> if (r) >> goto error; >> >> @@ -1108,7 +1108,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) >> if (r) >> return r; >> >> - r = amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update); >> + r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update); >> if (r) >> return r; >> >> @@ -1119,7 +1119,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) >> if (r) >> return r; >> >> - r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update); >> + r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update); >> if (r) >> return r; >> } >> @@ -1138,7 +1138,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) >> if (r) >> return r; >> >> - r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update); 
>> + r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update); >> if (r) >> return r; >> } >> @@ -1151,7 +1151,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) >> if (r) >> return r; >> >> - r = amdgpu_sync_fence(&job->sync, vm->last_update); >> + r = amdgpu_sync_fence(&p->sync, vm->last_update); >> if (r) >> return r; >> >> @@ -1183,7 +1183,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) >> static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) >> { >> struct amdgpu_fpriv *fpriv = p->filp->driver_priv; >> - struct amdgpu_job *leader = p->gang_leader; >> struct amdgpu_bo_list_entry *e; >> unsigned int i; >> int r; >> @@ -1195,14 +1194,14 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) >> >> sync_mode = amdgpu_bo_explicit_sync(bo) ? >> AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER; >> - r = amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode, >> + r = amdgpu_sync_resv(p->adev, &p->sync, resv, sync_mode, >> &fpriv->vm); >> if (r) >> return r; >> } >> >> - for (i = 0; i < p->gang_size - 1; ++i) { >> - r = amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync); >> + for (i = 0; i < p->gang_size; ++i) { >> + r = amdgpu_sync_push_to_job(&p->sync, p->jobs[i]); >> if (r) >> return r; >> } >> @@ -1248,7 +1247,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, >> struct dma_fence *fence; >> >> fence = &p->jobs[i]->base.s_fence->scheduled; >> - r = amdgpu_sync_fence(&leader->sync, fence); >> + r = drm_sched_job_add_dependency(&leader->base, fence); >> if (r) >> goto error_cleanup; >> } >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h >> index cbaa19b2b8a3..207e801c24ed 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h >> @@ -75,6 +75,8 @@ struct amdgpu_cs_parser { >> >> unsigned num_post_deps; >> struct amdgpu_cs_post_dep *post_deps; >> + >> + struct amdgpu_sync sync; >> }; >> >> int amdgpu_cs_find_mapping(struct 
amdgpu_cs_parser *parser, >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> index ba98d65835b4..b8494c3b3b8a 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c >> @@ -106,7 +106,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm, >> (*job)->base.sched = &adev->rings[0]->sched; >> (*job)->vm = vm; >> >> - amdgpu_sync_create(&(*job)->sync); >> amdgpu_sync_create(&(*job)->explicit_sync); >> (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter); >> (*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET; >> @@ -174,9 +173,7 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job) >> >> drm_sched_job_cleanup(s_job); >> >> - amdgpu_sync_free(&job->sync); >> amdgpu_sync_free(&job->explicit_sync); >> - >> dma_fence_put(&job->hw_fence); >> } >> >> @@ -202,7 +199,6 @@ void amdgpu_job_free(struct amdgpu_job *job) >> drm_sched_job_cleanup(&job->base); >> >> amdgpu_job_free_resources(job); >> - amdgpu_sync_free(&job->sync); >> amdgpu_sync_free(&job->explicit_sync); >> if (job->gang_submit != &job->base.s_fence->scheduled) >> dma_fence_put(job->gang_submit); >> @@ -246,10 +242,9 @@ amdgpu_job_dependency(struct drm_sched_job *sched_job, >> { >> struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched); >> struct amdgpu_job *job = to_amdgpu_job(sched_job); >> - struct dma_fence *fence; >> + struct dma_fence *fence = NULL; >> int r; >> >> - fence = amdgpu_sync_get_fence(&job->sync); >> while (fence == NULL && job->vm && !job->vmid) { >> r = amdgpu_vmid_grab(job->vm, ring, job, &fence); >> if (r) >> @@ -273,8 +268,6 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) >> job = to_amdgpu_job(sched_job); >> finished = &job->base.s_fence->finished; >> >> - BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); >> - >> trace_amdgpu_sched_run_job(job); >> >> /* Skip job if VRAM is lost and never resubmit gangs */ >> diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h >> index 9c10b9bd0084..6558839fda03 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h >> @@ -47,7 +47,6 @@ enum amdgpu_ib_pool_type; >> struct amdgpu_job { >> struct drm_sched_job base; >> struct amdgpu_vm *vm; >> - struct amdgpu_sync sync; >> struct amdgpu_sync explicit_sync; >> struct dma_fence hw_fence; >> struct dma_fence *gang_submit; >> -- >> 2.25.1 >> > > Hi, I've been testing the Mesh shader benchmark in GravityMark and > I've bisected my laptop freezing up and rebooting, to this commit > > 1728baa7e4e60054bf13dd9b1212d133cbd53b3f is the first bad commit > commit 1728baa7e4e60054bf13dd9b1212d133cbd53b3f > Author: Christian König <christian.koenig@amd.com> > Date: Thu Sep 29 14:04:01 2022 +0200 > > drm/amdgpu: use scheduler dependencies for CS > > Entirely remove the sync obj in the job. > > Signed-off-by: Christian König <christian.koenig@amd.com> > Reviewed-by: Luben Tuikov <luben.tuikov@amd.com> > Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fpatchwork.freedesktop.org%2Fpatch%2Fmsgid%2F20221014084641.128280-11-christian.koenig%40amd.com&data=05%7C01%7Cluben.tuikov%40amd.com%7C89490e3fad4843fd789308dae368e10a%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C638072336848708258%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=yinQfgx3pcqZjCzafxTysYlhb4RUwJN8t8cb2VjOOes%3D&reserved=0 > > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 21 ++++++++++----------- > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h | 2 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 9 +-------- > drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 1 - > 4 files changed, 13 insertions(+), 20 deletions(-) > > This is on a prime system 6800M with the latest mesa > > I tried reverting this patch however it didn't revert cleanly, and my > attempt doesn't work and only partially freezes up the 
system > > Would you like me to open a bug for this on > https://gitlab.freedesktop.org/drm/amd/-/issues ? > Hi Mike, Could you try this patch: https://lore.kernel.org/all/20221219104718.21677-1-christian.koenig@amd.com/ Regards, Luben
On Wed, 21 Dec 2022 at 15:52, Luben Tuikov <luben.tuikov@amd.com> wrote: > > On 2022-12-21 10:34, Mike Lothian wrote: > > On Fri, 14 Oct 2022 at 09:47, Christian König > > <ckoenig.leichtzumerken@gmail.com> wrote: > >> > >> Entirely remove the sync obj in the job. > >> > >> Signed-off-by: Christian König <christian.koenig@amd.com> > >> --- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 21 ++++++++++----------- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h | 2 ++ > >> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 9 +-------- > >> drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 1 - > >> 4 files changed, 13 insertions(+), 20 deletions(-) > >> > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > >> index d45b86bcf7fa..0528c2b1db6e 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c > >> @@ -426,7 +426,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p, > >> dma_fence_put(old); > >> } > >> > >> - r = amdgpu_sync_fence(&p->gang_leader->sync, fence); > >> + r = amdgpu_sync_fence(&p->sync, fence); > >> dma_fence_put(fence); > >> if (r) > >> return r; > >> @@ -448,7 +448,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p, > >> return r; > >> } > >> > >> - r = amdgpu_sync_fence(&p->gang_leader->sync, fence); > >> + r = amdgpu_sync_fence(&p->sync, fence); > >> if (r) > >> goto error; > >> > >> @@ -1108,7 +1108,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > >> if (r) > >> return r; > >> > >> - r = amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update); > >> + r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update); > >> if (r) > >> return r; > >> > >> @@ -1119,7 +1119,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > >> if (r) > >> return r; > >> > >> - r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update); > >> + r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update); > >> if (r) > >> return r; 
> >> } > >> @@ -1138,7 +1138,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > >> if (r) > >> return r; > >> > >> - r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update); > >> + r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update); > >> if (r) > >> return r; > >> } > >> @@ -1151,7 +1151,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > >> if (r) > >> return r; > >> > >> - r = amdgpu_sync_fence(&job->sync, vm->last_update); > >> + r = amdgpu_sync_fence(&p->sync, vm->last_update); > >> if (r) > >> return r; > >> > >> @@ -1183,7 +1183,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) > >> static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) > >> { > >> struct amdgpu_fpriv *fpriv = p->filp->driver_priv; > >> - struct amdgpu_job *leader = p->gang_leader; > >> struct amdgpu_bo_list_entry *e; > >> unsigned int i; > >> int r; > >> @@ -1195,14 +1194,14 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) > >> > >> sync_mode = amdgpu_bo_explicit_sync(bo) ? 
> >> AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER; > >> - r = amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode, > >> + r = amdgpu_sync_resv(p->adev, &p->sync, resv, sync_mode, > >> &fpriv->vm); > >> if (r) > >> return r; > >> } > >> > >> - for (i = 0; i < p->gang_size - 1; ++i) { > >> - r = amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync); > >> + for (i = 0; i < p->gang_size; ++i) { > >> + r = amdgpu_sync_push_to_job(&p->sync, p->jobs[i]); > >> if (r) > >> return r; > >> } > >> @@ -1248,7 +1247,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, > >> struct dma_fence *fence; > >> > >> fence = &p->jobs[i]->base.s_fence->scheduled; > >> - r = amdgpu_sync_fence(&leader->sync, fence); > >> + r = drm_sched_job_add_dependency(&leader->base, fence); > >> if (r) > >> goto error_cleanup; > >> } > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h > >> index cbaa19b2b8a3..207e801c24ed 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h > >> @@ -75,6 +75,8 @@ struct amdgpu_cs_parser { > >> > >> unsigned num_post_deps; > >> struct amdgpu_cs_post_dep *post_deps; > >> + > >> + struct amdgpu_sync sync; > >> }; > >> > >> int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > >> index ba98d65835b4..b8494c3b3b8a 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c > >> @@ -106,7 +106,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm, > >> (*job)->base.sched = &adev->rings[0]->sched; > >> (*job)->vm = vm; > >> > >> - amdgpu_sync_create(&(*job)->sync); > >> amdgpu_sync_create(&(*job)->explicit_sync); > >> (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter); > >> (*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET; > >> @@ -174,9 +173,7 @@ static void amdgpu_job_free_cb(struct 
drm_sched_job *s_job) > >> > >> drm_sched_job_cleanup(s_job); > >> > >> - amdgpu_sync_free(&job->sync); > >> amdgpu_sync_free(&job->explicit_sync); > >> - > >> dma_fence_put(&job->hw_fence); > >> } > >> > >> @@ -202,7 +199,6 @@ void amdgpu_job_free(struct amdgpu_job *job) > >> drm_sched_job_cleanup(&job->base); > >> > >> amdgpu_job_free_resources(job); > >> - amdgpu_sync_free(&job->sync); > >> amdgpu_sync_free(&job->explicit_sync); > >> if (job->gang_submit != &job->base.s_fence->scheduled) > >> dma_fence_put(job->gang_submit); > >> @@ -246,10 +242,9 @@ amdgpu_job_dependency(struct drm_sched_job *sched_job, > >> { > >> struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched); > >> struct amdgpu_job *job = to_amdgpu_job(sched_job); > >> - struct dma_fence *fence; > >> + struct dma_fence *fence = NULL; > >> int r; > >> > >> - fence = amdgpu_sync_get_fence(&job->sync); > >> while (fence == NULL && job->vm && !job->vmid) { > >> r = amdgpu_vmid_grab(job->vm, ring, job, &fence); > >> if (r) > >> @@ -273,8 +268,6 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) > >> job = to_amdgpu_job(sched_job); > >> finished = &job->base.s_fence->finished; > >> > >> - BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); > >> - > >> trace_amdgpu_sched_run_job(job); > >> > >> /* Skip job if VRAM is lost and never resubmit gangs */ > >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > >> index 9c10b9bd0084..6558839fda03 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h > >> @@ -47,7 +47,6 @@ enum amdgpu_ib_pool_type; > >> struct amdgpu_job { > >> struct drm_sched_job base; > >> struct amdgpu_vm *vm; > >> - struct amdgpu_sync sync; > >> struct amdgpu_sync explicit_sync; > >> struct dma_fence hw_fence; > >> struct dma_fence *gang_submit; > >> -- > >> 2.25.1 > >> > > > > Hi, I've been testing the Mesh shader benchmark in GravityMark and > > I've bisected my 
laptop freezing up and rebooting, to this commit > > > > 1728baa7e4e60054bf13dd9b1212d133cbd53b3f is the first bad commit > > commit 1728baa7e4e60054bf13dd9b1212d133cbd53b3f > > Author: Christian König <christian.koenig@amd.com> > > Date: Thu Sep 29 14:04:01 2022 +0200 > > > > drm/amdgpu: use scheduler dependencies for CS > > > > Entirely remove the sync obj in the job. > > > > Signed-off-by: Christian König <christian.koenig@amd.com> > > Reviewed-by: Luben Tuikov <luben.tuikov@amd.com> > > Link: https://patchwork.freedesktop.org/patch/msgid/20221014084641.128280-11-christian.koenig@amd.com > > > > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 21 ++++++++++----------- > > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h | 2 ++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 9 +-------- > > drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 1 - > > 4 files changed, 13 insertions(+), 20 deletions(-) > > > > This is on a prime system 6800M with the latest mesa > > > > I tried reverting this patch however it didn't revert cleanly, and my > > attempt doesn't work and only partially freezes up the system > > > > Would you like me to open a bug for this on > > https://gitlab.freedesktop.org/drm/amd/-/issues ?
> > > > Hi Mike, > > Could you try this patch: > > https://lore.kernel.org/all/20221219104718.21677-1-christian.koenig@amd.com/ > > Regards, > Luben > > I still see the same issue with this patch
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index d45b86bcf7fa..0528c2b1db6e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -426,7 +426,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p, dma_fence_put(old); } - r = amdgpu_sync_fence(&p->gang_leader->sync, fence); + r = amdgpu_sync_fence(&p->sync, fence); dma_fence_put(fence); if (r) return r; @@ -448,7 +448,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p, return r; } - r = amdgpu_sync_fence(&p->gang_leader->sync, fence); + r = amdgpu_sync_fence(&p->sync, fence); if (r) goto error; @@ -1108,7 +1108,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) if (r) return r; - r = amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update); + r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update); if (r) return r; @@ -1119,7 +1119,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) if (r) return r; - r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update); + r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update); if (r) return r; } @@ -1138,7 +1138,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) if (r) return r; - r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update); + r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update); if (r) return r; } @@ -1151,7 +1151,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) if (r) return r; - r = amdgpu_sync_fence(&job->sync, vm->last_update); + r = amdgpu_sync_fence(&p->sync, vm->last_update); if (r) return r; @@ -1183,7 +1183,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p) static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p) { struct amdgpu_fpriv *fpriv = p->filp->driver_priv; - struct amdgpu_job *leader = p->gang_leader; struct amdgpu_bo_list_entry *e; unsigned int i; int r; @@ -1195,14 +1194,14 @@ static int amdgpu_cs_sync_rings(struct 
amdgpu_cs_parser *p) sync_mode = amdgpu_bo_explicit_sync(bo) ? AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER; - r = amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode, + r = amdgpu_sync_resv(p->adev, &p->sync, resv, sync_mode, &fpriv->vm); if (r) return r; } - for (i = 0; i < p->gang_size - 1; ++i) { - r = amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync); + for (i = 0; i < p->gang_size; ++i) { + r = amdgpu_sync_push_to_job(&p->sync, p->jobs[i]); if (r) return r; } @@ -1248,7 +1247,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, struct dma_fence *fence; fence = &p->jobs[i]->base.s_fence->scheduled; - r = amdgpu_sync_fence(&leader->sync, fence); + r = drm_sched_job_add_dependency(&leader->base, fence); if (r) goto error_cleanup; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h index cbaa19b2b8a3..207e801c24ed 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h @@ -75,6 +75,8 @@ struct amdgpu_cs_parser { unsigned num_post_deps; struct amdgpu_cs_post_dep *post_deps; + + struct amdgpu_sync sync; }; int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c index ba98d65835b4..b8494c3b3b8a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c @@ -106,7 +106,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm, (*job)->base.sched = &adev->rings[0]->sched; (*job)->vm = vm; - amdgpu_sync_create(&(*job)->sync); amdgpu_sync_create(&(*job)->explicit_sync); (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter); (*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET; @@ -174,9 +173,7 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job) drm_sched_job_cleanup(s_job); - amdgpu_sync_free(&job->sync); amdgpu_sync_free(&job->explicit_sync); - dma_fence_put(&job->hw_fence); } @@ -202,7 +199,6 @@ void 
amdgpu_job_free(struct amdgpu_job *job) drm_sched_job_cleanup(&job->base); amdgpu_job_free_resources(job); - amdgpu_sync_free(&job->sync); amdgpu_sync_free(&job->explicit_sync); if (job->gang_submit != &job->base.s_fence->scheduled) dma_fence_put(job->gang_submit); @@ -246,10 +242,9 @@ amdgpu_job_dependency(struct drm_sched_job *sched_job, { struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched); struct amdgpu_job *job = to_amdgpu_job(sched_job); - struct dma_fence *fence; + struct dma_fence *fence = NULL; int r; - fence = amdgpu_sync_get_fence(&job->sync); while (fence == NULL && job->vm && !job->vmid) { r = amdgpu_vmid_grab(job->vm, ring, job, &fence); if (r) @@ -273,8 +268,6 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job) job = to_amdgpu_job(sched_job); finished = &job->base.s_fence->finished; - BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL)); - trace_amdgpu_sched_run_job(job); /* Skip job if VRAM is lost and never resubmit gangs */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h index 9c10b9bd0084..6558839fda03 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h @@ -47,7 +47,6 @@ enum amdgpu_ib_pool_type; struct amdgpu_job { struct drm_sched_job base; struct amdgpu_vm *vm; - struct amdgpu_sync sync; struct amdgpu_sync explicit_sync; struct dma_fence hw_fence; struct dma_fence *gang_submit;
Entirely remove the sync obj in the job. Signed-off-by: Christian König <christian.koenig@amd.com> --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 21 ++++++++++----------- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 9 +-------- drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 1 - 4 files changed, 13 insertions(+), 20 deletions(-)