diff mbox series

[10/13] drm/amdgpu: use scheduler dependencies for CS

Message ID 20221014084641.128280-11-christian.koenig@amd.com (mailing list archive)
State New, archived
Headers show
Series [01/13] drm/scheduler: fix fence ref counting | expand

Commit Message

Christian König Oct. 14, 2022, 8:46 a.m. UTC
Entirely remove the sync obj in the job.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 21 ++++++++++-----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h  |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  9 +--------
 drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
 4 files changed, 13 insertions(+), 20 deletions(-)

Comments

Luben Tuikov Oct. 24, 2022, 5:55 a.m. UTC | #1
Title: "dependencies"

Regards,
Luben

On 2022-10-14 04:46, Christian König wrote:
> Entirely remove the sync obj in the job.
> 
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 21 ++++++++++-----------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h  |  2 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  9 +--------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
>  4 files changed, 13 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index d45b86bcf7fa..0528c2b1db6e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -426,7 +426,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p,
>  			dma_fence_put(old);
>  		}
>  
> -		r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
> +		r = amdgpu_sync_fence(&p->sync, fence);
>  		dma_fence_put(fence);
>  		if (r)
>  			return r;
> @@ -448,7 +448,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p,
>  		return r;
>  	}
>  
> -	r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
> +	r = amdgpu_sync_fence(&p->sync, fence);
>  	if (r)
>  		goto error;
>  
> @@ -1108,7 +1108,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>  	if (r)
>  		return r;
>  
> -	r = amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update);
> +	r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update);
>  	if (r)
>  		return r;
>  
> @@ -1119,7 +1119,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>  		if (r)
>  			return r;
>  
> -		r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
> +		r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
>  		if (r)
>  			return r;
>  	}
> @@ -1138,7 +1138,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>  		if (r)
>  			return r;
>  
> -		r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
> +		r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
>  		if (r)
>  			return r;
>  	}
> @@ -1151,7 +1151,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>  	if (r)
>  		return r;
>  
> -	r = amdgpu_sync_fence(&job->sync, vm->last_update);
> +	r = amdgpu_sync_fence(&p->sync, vm->last_update);
>  	if (r)
>  		return r;
>  
> @@ -1183,7 +1183,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>  static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
>  {
>  	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> -	struct amdgpu_job *leader = p->gang_leader;
>  	struct amdgpu_bo_list_entry *e;
>  	unsigned int i;
>  	int r;
> @@ -1195,14 +1194,14 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
>  
>  		sync_mode = amdgpu_bo_explicit_sync(bo) ?
>  			AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER;
> -		r = amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode,
> +		r = amdgpu_sync_resv(p->adev, &p->sync, resv, sync_mode,
>  				     &fpriv->vm);
>  		if (r)
>  			return r;
>  	}
>  
> -	for (i = 0; i < p->gang_size - 1; ++i) {
> -		r = amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync);
> +	for (i = 0; i < p->gang_size; ++i) {
> +		r = amdgpu_sync_push_to_job(&p->sync, p->jobs[i]);
>  		if (r)
>  			return r;
>  	}
> @@ -1248,7 +1247,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>  		struct dma_fence *fence;
>  
>  		fence = &p->jobs[i]->base.s_fence->scheduled;
> -		r = amdgpu_sync_fence(&leader->sync, fence);
> +		r = drm_sched_job_add_dependency(&leader->base, fence);
>  		if (r)
>  			goto error_cleanup;
>  	}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> index cbaa19b2b8a3..207e801c24ed 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> @@ -75,6 +75,8 @@ struct amdgpu_cs_parser {
>  
>  	unsigned			num_post_deps;
>  	struct amdgpu_cs_post_dep	*post_deps;
> +
> +	struct amdgpu_sync		sync;
>  };
>  
>  int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index ba98d65835b4..b8494c3b3b8a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -106,7 +106,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>  	(*job)->base.sched = &adev->rings[0]->sched;
>  	(*job)->vm = vm;
>  
> -	amdgpu_sync_create(&(*job)->sync);
>  	amdgpu_sync_create(&(*job)->explicit_sync);
>  	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>  	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
> @@ -174,9 +173,7 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>  
>  	drm_sched_job_cleanup(s_job);
>  
> -	amdgpu_sync_free(&job->sync);
>  	amdgpu_sync_free(&job->explicit_sync);
> -
>  	dma_fence_put(&job->hw_fence);
>  }
>  
> @@ -202,7 +199,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
>  		drm_sched_job_cleanup(&job->base);
>  
>  	amdgpu_job_free_resources(job);
> -	amdgpu_sync_free(&job->sync);
>  	amdgpu_sync_free(&job->explicit_sync);
>  	if (job->gang_submit != &job->base.s_fence->scheduled)
>  		dma_fence_put(job->gang_submit);
> @@ -246,10 +242,9 @@ amdgpu_job_dependency(struct drm_sched_job *sched_job,
>  {
>  	struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
>  	struct amdgpu_job *job = to_amdgpu_job(sched_job);
> -	struct dma_fence *fence;
> +	struct dma_fence *fence = NULL;
>  	int r;
>  
> -	fence = amdgpu_sync_get_fence(&job->sync);
>  	while (fence == NULL && job->vm && !job->vmid) {
>  		r = amdgpu_vmid_grab(job->vm, ring, job, &fence);
>  		if (r)
> @@ -273,8 +268,6 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
>  	job = to_amdgpu_job(sched_job);
>  	finished = &job->base.s_fence->finished;
>  
> -	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
> -
>  	trace_amdgpu_sched_run_job(job);
>  
>  	/* Skip job if VRAM is lost and never resubmit gangs */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index 9c10b9bd0084..6558839fda03 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -47,7 +47,6 @@ enum amdgpu_ib_pool_type;
>  struct amdgpu_job {
>  	struct drm_sched_job    base;
>  	struct amdgpu_vm	*vm;
> -	struct amdgpu_sync	sync;
>  	struct amdgpu_sync	explicit_sync;
>  	struct dma_fence	hw_fence;
>  	struct dma_fence	*gang_submit;
Mike Lothian Dec. 21, 2022, 3:34 p.m. UTC | #2
On Fri, 14 Oct 2022 at 09:47, Christian König
<ckoenig.leichtzumerken@gmail.com> wrote:
>
> Entirely remove the sync obj in the job.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 21 ++++++++++-----------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h  |  2 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  9 +--------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
>  4 files changed, 13 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index d45b86bcf7fa..0528c2b1db6e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -426,7 +426,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p,
>                         dma_fence_put(old);
>                 }
>
> -               r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
> +               r = amdgpu_sync_fence(&p->sync, fence);
>                 dma_fence_put(fence);
>                 if (r)
>                         return r;
> @@ -448,7 +448,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p,
>                 return r;
>         }
>
> -       r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
> +       r = amdgpu_sync_fence(&p->sync, fence);
>         if (r)
>                 goto error;
>
> @@ -1108,7 +1108,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>         if (r)
>                 return r;
>
> -       r = amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update);
> +       r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update);
>         if (r)
>                 return r;
>
> @@ -1119,7 +1119,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>                 if (r)
>                         return r;
>
> -               r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
> +               r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
>                 if (r)
>                         return r;
>         }
> @@ -1138,7 +1138,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>                 if (r)
>                         return r;
>
> -               r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
> +               r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
>                 if (r)
>                         return r;
>         }
> @@ -1151,7 +1151,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>         if (r)
>                 return r;
>
> -       r = amdgpu_sync_fence(&job->sync, vm->last_update);
> +       r = amdgpu_sync_fence(&p->sync, vm->last_update);
>         if (r)
>                 return r;
>
> @@ -1183,7 +1183,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>  static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
>  {
>         struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> -       struct amdgpu_job *leader = p->gang_leader;
>         struct amdgpu_bo_list_entry *e;
>         unsigned int i;
>         int r;
> @@ -1195,14 +1194,14 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
>
>                 sync_mode = amdgpu_bo_explicit_sync(bo) ?
>                         AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER;
> -               r = amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode,
> +               r = amdgpu_sync_resv(p->adev, &p->sync, resv, sync_mode,
>                                      &fpriv->vm);
>                 if (r)
>                         return r;
>         }
>
> -       for (i = 0; i < p->gang_size - 1; ++i) {
> -               r = amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync);
> +       for (i = 0; i < p->gang_size; ++i) {
> +               r = amdgpu_sync_push_to_job(&p->sync, p->jobs[i]);
>                 if (r)
>                         return r;
>         }
> @@ -1248,7 +1247,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>                 struct dma_fence *fence;
>
>                 fence = &p->jobs[i]->base.s_fence->scheduled;
> -               r = amdgpu_sync_fence(&leader->sync, fence);
> +               r = drm_sched_job_add_dependency(&leader->base, fence);
>                 if (r)
>                         goto error_cleanup;
>         }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> index cbaa19b2b8a3..207e801c24ed 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> @@ -75,6 +75,8 @@ struct amdgpu_cs_parser {
>
>         unsigned                        num_post_deps;
>         struct amdgpu_cs_post_dep       *post_deps;
> +
> +       struct amdgpu_sync              sync;
>  };
>
>  int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index ba98d65835b4..b8494c3b3b8a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -106,7 +106,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>         (*job)->base.sched = &adev->rings[0]->sched;
>         (*job)->vm = vm;
>
> -       amdgpu_sync_create(&(*job)->sync);
>         amdgpu_sync_create(&(*job)->explicit_sync);
>         (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>         (*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
> @@ -174,9 +173,7 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>
>         drm_sched_job_cleanup(s_job);
>
> -       amdgpu_sync_free(&job->sync);
>         amdgpu_sync_free(&job->explicit_sync);
> -
>         dma_fence_put(&job->hw_fence);
>  }
>
> @@ -202,7 +199,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
>                 drm_sched_job_cleanup(&job->base);
>
>         amdgpu_job_free_resources(job);
> -       amdgpu_sync_free(&job->sync);
>         amdgpu_sync_free(&job->explicit_sync);
>         if (job->gang_submit != &job->base.s_fence->scheduled)
>                 dma_fence_put(job->gang_submit);
> @@ -246,10 +242,9 @@ amdgpu_job_dependency(struct drm_sched_job *sched_job,
>  {
>         struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
>         struct amdgpu_job *job = to_amdgpu_job(sched_job);
> -       struct dma_fence *fence;
> +       struct dma_fence *fence = NULL;
>         int r;
>
> -       fence = amdgpu_sync_get_fence(&job->sync);
>         while (fence == NULL && job->vm && !job->vmid) {
>                 r = amdgpu_vmid_grab(job->vm, ring, job, &fence);
>                 if (r)
> @@ -273,8 +268,6 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
>         job = to_amdgpu_job(sched_job);
>         finished = &job->base.s_fence->finished;
>
> -       BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
> -
>         trace_amdgpu_sched_run_job(job);
>
>         /* Skip job if VRAM is lost and never resubmit gangs */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index 9c10b9bd0084..6558839fda03 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -47,7 +47,6 @@ enum amdgpu_ib_pool_type;
>  struct amdgpu_job {
>         struct drm_sched_job    base;
>         struct amdgpu_vm        *vm;
> -       struct amdgpu_sync      sync;
>         struct amdgpu_sync      explicit_sync;
>         struct dma_fence        hw_fence;
>         struct dma_fence        *gang_submit;
> --
> 2.25.1
>

Hi, I've been testing the Mesh shader benchmark in GravityMark and
I've bisected my laptop freezing up and rebooting, to this commit

1728baa7e4e60054bf13dd9b1212d133cbd53b3f is the first bad commit
commit 1728baa7e4e60054bf13dd9b1212d133cbd53b3f
Author: Christian König <christian.koenig@amd.com>
Date:   Thu Sep 29 14:04:01 2022 +0200

   drm/amdgpu: use scheduler dependencies for CS

   Entirely remove the sync obj in the job.

   Signed-off-by: Christian König <christian.koenig@amd.com>
   Reviewed-by: Luben Tuikov <luben.tuikov@amd.com>
   Link: https://patchwork.freedesktop.org/patch/msgid/20221014084641.128280-11-christian.koenig@amd.com

drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 21 ++++++++++-----------
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h  |  2 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  9 +--------
drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
4 files changed, 13 insertions(+), 20 deletions(-)

This is on a PRIME system (6800M) with the latest Mesa

I tried reverting this patch; however, it didn't revert cleanly, and with my
manual attempt at a revert the system still freezes up, though only partially

Would you like me to open a bug for this on
https://gitlab.freedesktop.org/drm/amd/-/issues ?

Cheers

Mike
Mike Lothian Dec. 21, 2022, 3:47 p.m. UTC | #3
https://gitlab.freedesktop.org/drm/amd/-/issues/2309

On Wed, 21 Dec 2022 at 15:34, Mike Lothian <mike@fireburn.co.uk> wrote:
>
> On Fri, 14 Oct 2022 at 09:47, Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
> >
> > Entirely remove the sync obj in the job.
> >
> > Signed-off-by: Christian König <christian.koenig@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 21 ++++++++++-----------
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h  |  2 ++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  9 +--------
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
> >  4 files changed, 13 insertions(+), 20 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> > index d45b86bcf7fa..0528c2b1db6e 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> > @@ -426,7 +426,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p,
> >                         dma_fence_put(old);
> >                 }
> >
> > -               r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
> > +               r = amdgpu_sync_fence(&p->sync, fence);
> >                 dma_fence_put(fence);
> >                 if (r)
> >                         return r;
> > @@ -448,7 +448,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p,
> >                 return r;
> >         }
> >
> > -       r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
> > +       r = amdgpu_sync_fence(&p->sync, fence);
> >         if (r)
> >                 goto error;
> >
> > @@ -1108,7 +1108,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
> >         if (r)
> >                 return r;
> >
> > -       r = amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update);
> > +       r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update);
> >         if (r)
> >                 return r;
> >
> > @@ -1119,7 +1119,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
> >                 if (r)
> >                         return r;
> >
> > -               r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
> > +               r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
> >                 if (r)
> >                         return r;
> >         }
> > @@ -1138,7 +1138,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
> >                 if (r)
> >                         return r;
> >
> > -               r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
> > +               r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
> >                 if (r)
> >                         return r;
> >         }
> > @@ -1151,7 +1151,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
> >         if (r)
> >                 return r;
> >
> > -       r = amdgpu_sync_fence(&job->sync, vm->last_update);
> > +       r = amdgpu_sync_fence(&p->sync, vm->last_update);
> >         if (r)
> >                 return r;
> >
> > @@ -1183,7 +1183,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
> >  static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
> >  {
> >         struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> > -       struct amdgpu_job *leader = p->gang_leader;
> >         struct amdgpu_bo_list_entry *e;
> >         unsigned int i;
> >         int r;
> > @@ -1195,14 +1194,14 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
> >
> >                 sync_mode = amdgpu_bo_explicit_sync(bo) ?
> >                         AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER;
> > -               r = amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode,
> > +               r = amdgpu_sync_resv(p->adev, &p->sync, resv, sync_mode,
> >                                      &fpriv->vm);
> >                 if (r)
> >                         return r;
> >         }
> >
> > -       for (i = 0; i < p->gang_size - 1; ++i) {
> > -               r = amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync);
> > +       for (i = 0; i < p->gang_size; ++i) {
> > +               r = amdgpu_sync_push_to_job(&p->sync, p->jobs[i]);
> >                 if (r)
> >                         return r;
> >         }
> > @@ -1248,7 +1247,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
> >                 struct dma_fence *fence;
> >
> >                 fence = &p->jobs[i]->base.s_fence->scheduled;
> > -               r = amdgpu_sync_fence(&leader->sync, fence);
> > +               r = drm_sched_job_add_dependency(&leader->base, fence);
> >                 if (r)
> >                         goto error_cleanup;
> >         }
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> > index cbaa19b2b8a3..207e801c24ed 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> > @@ -75,6 +75,8 @@ struct amdgpu_cs_parser {
> >
> >         unsigned                        num_post_deps;
> >         struct amdgpu_cs_post_dep       *post_deps;
> > +
> > +       struct amdgpu_sync              sync;
> >  };
> >
> >  int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser,
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > index ba98d65835b4..b8494c3b3b8a 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> > @@ -106,7 +106,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> >         (*job)->base.sched = &adev->rings[0]->sched;
> >         (*job)->vm = vm;
> >
> > -       amdgpu_sync_create(&(*job)->sync);
> >         amdgpu_sync_create(&(*job)->explicit_sync);
> >         (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
> >         (*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
> > @@ -174,9 +173,7 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
> >
> >         drm_sched_job_cleanup(s_job);
> >
> > -       amdgpu_sync_free(&job->sync);
> >         amdgpu_sync_free(&job->explicit_sync);
> > -
> >         dma_fence_put(&job->hw_fence);
> >  }
> >
> > @@ -202,7 +199,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
> >                 drm_sched_job_cleanup(&job->base);
> >
> >         amdgpu_job_free_resources(job);
> > -       amdgpu_sync_free(&job->sync);
> >         amdgpu_sync_free(&job->explicit_sync);
> >         if (job->gang_submit != &job->base.s_fence->scheduled)
> >                 dma_fence_put(job->gang_submit);
> > @@ -246,10 +242,9 @@ amdgpu_job_dependency(struct drm_sched_job *sched_job,
> >  {
> >         struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
> >         struct amdgpu_job *job = to_amdgpu_job(sched_job);
> > -       struct dma_fence *fence;
> > +       struct dma_fence *fence = NULL;
> >         int r;
> >
> > -       fence = amdgpu_sync_get_fence(&job->sync);
> >         while (fence == NULL && job->vm && !job->vmid) {
> >                 r = amdgpu_vmid_grab(job->vm, ring, job, &fence);
> >                 if (r)
> > @@ -273,8 +268,6 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
> >         job = to_amdgpu_job(sched_job);
> >         finished = &job->base.s_fence->finished;
> >
> > -       BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
> > -
> >         trace_amdgpu_sched_run_job(job);
> >
> >         /* Skip job if VRAM is lost and never resubmit gangs */
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > index 9c10b9bd0084..6558839fda03 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> > @@ -47,7 +47,6 @@ enum amdgpu_ib_pool_type;
> >  struct amdgpu_job {
> >         struct drm_sched_job    base;
> >         struct amdgpu_vm        *vm;
> > -       struct amdgpu_sync      sync;
> >         struct amdgpu_sync      explicit_sync;
> >         struct dma_fence        hw_fence;
> >         struct dma_fence        *gang_submit;
> > --
> > 2.25.1
> >
>
> Hi, I've been testing the Mesh shader benchmark in GravityMark and
> I've bisected my laptop freezing up and rebooting, to this commit
>
> 1728baa7e4e60054bf13dd9b1212d133cbd53b3f is the first bad commit
> commit 1728baa7e4e60054bf13dd9b1212d133cbd53b3f
> Author: Christian König <christian.koenig@amd.com>
> Date:   Thu Sep 29 14:04:01 2022 +0200
>
>    drm/amdgpu: use scheduler dependencies for CS
>
>    Entirely remove the sync obj in the job.
>
>    Signed-off-by: Christian König <christian.koenig@amd.com>
>    Reviewed-by: Luben Tuikov <luben.tuikov@amd.com>
>    Link: https://patchwork.freedesktop.org/patch/msgid/20221014084641.128280-11-christian.koenig@amd.com
>
> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 21 ++++++++++-----------
> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h  |  2 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  9 +--------
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
> 4 files changed, 13 insertions(+), 20 deletions(-)
>
> This is on a prime system 6800M with the latest mesa
>
> I tried reverting this patch however it didn't revert cleanly, and my
> attempt doesn't work and only partially freezes up the system
>
> Would you like me to open a bug for this on
> https://gitlab.freedesktop.org/drm/amd/-/issues ?
>
> Cheers
>
> Mike
Luben Tuikov Dec. 21, 2022, 3:52 p.m. UTC | #4
On 2022-12-21 10:34, Mike Lothian wrote:
> On Fri, 14 Oct 2022 at 09:47, Christian König
> <ckoenig.leichtzumerken@gmail.com> wrote:
>>
>> Entirely remove the sync obj in the job.
>>
>> Signed-off-by: Christian König <christian.koenig@amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 21 ++++++++++-----------
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h  |  2 ++
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  9 +--------
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
>>  4 files changed, 13 insertions(+), 20 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index d45b86bcf7fa..0528c2b1db6e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -426,7 +426,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p,
>>                         dma_fence_put(old);
>>                 }
>>
>> -               r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
>> +               r = amdgpu_sync_fence(&p->sync, fence);
>>                 dma_fence_put(fence);
>>                 if (r)
>>                         return r;
>> @@ -448,7 +448,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p,
>>                 return r;
>>         }
>>
>> -       r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
>> +       r = amdgpu_sync_fence(&p->sync, fence);
>>         if (r)
>>                 goto error;
>>
>> @@ -1108,7 +1108,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>>         if (r)
>>                 return r;
>>
>> -       r = amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update);
>> +       r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update);
>>         if (r)
>>                 return r;
>>
>> @@ -1119,7 +1119,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>>                 if (r)
>>                         return r;
>>
>> -               r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
>> +               r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
>>                 if (r)
>>                         return r;
>>         }
>> @@ -1138,7 +1138,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>>                 if (r)
>>                         return r;
>>
>> -               r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
>> +               r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
>>                 if (r)
>>                         return r;
>>         }
>> @@ -1151,7 +1151,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>>         if (r)
>>                 return r;
>>
>> -       r = amdgpu_sync_fence(&job->sync, vm->last_update);
>> +       r = amdgpu_sync_fence(&p->sync, vm->last_update);
>>         if (r)
>>                 return r;
>>
>> @@ -1183,7 +1183,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
>>  static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
>>  {
>>         struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
>> -       struct amdgpu_job *leader = p->gang_leader;
>>         struct amdgpu_bo_list_entry *e;
>>         unsigned int i;
>>         int r;
>> @@ -1195,14 +1194,14 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
>>
>>                 sync_mode = amdgpu_bo_explicit_sync(bo) ?
>>                         AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER;
>> -               r = amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode,
>> +               r = amdgpu_sync_resv(p->adev, &p->sync, resv, sync_mode,
>>                                      &fpriv->vm);
>>                 if (r)
>>                         return r;
>>         }
>>
>> -       for (i = 0; i < p->gang_size - 1; ++i) {
>> -               r = amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync);
>> +       for (i = 0; i < p->gang_size; ++i) {
>> +               r = amdgpu_sync_push_to_job(&p->sync, p->jobs[i]);
>>                 if (r)
>>                         return r;
>>         }
>> @@ -1248,7 +1247,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>>                 struct dma_fence *fence;
>>
>>                 fence = &p->jobs[i]->base.s_fence->scheduled;
>> -               r = amdgpu_sync_fence(&leader->sync, fence);
>> +               r = drm_sched_job_add_dependency(&leader->base, fence);
>>                 if (r)
>>                         goto error_cleanup;
>>         }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
>> index cbaa19b2b8a3..207e801c24ed 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
>> @@ -75,6 +75,8 @@ struct amdgpu_cs_parser {
>>
>>         unsigned                        num_post_deps;
>>         struct amdgpu_cs_post_dep       *post_deps;
>> +
>> +       struct amdgpu_sync              sync;
>>  };
>>
>>  int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index ba98d65835b4..b8494c3b3b8a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -106,7 +106,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>>         (*job)->base.sched = &adev->rings[0]->sched;
>>         (*job)->vm = vm;
>>
>> -       amdgpu_sync_create(&(*job)->sync);
>>         amdgpu_sync_create(&(*job)->explicit_sync);
>>         (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>>         (*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
>> @@ -174,9 +173,7 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>>
>>         drm_sched_job_cleanup(s_job);
>>
>> -       amdgpu_sync_free(&job->sync);
>>         amdgpu_sync_free(&job->explicit_sync);
>> -
>>         dma_fence_put(&job->hw_fence);
>>  }
>>
>> @@ -202,7 +199,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
>>                 drm_sched_job_cleanup(&job->base);
>>
>>         amdgpu_job_free_resources(job);
>> -       amdgpu_sync_free(&job->sync);
>>         amdgpu_sync_free(&job->explicit_sync);
>>         if (job->gang_submit != &job->base.s_fence->scheduled)
>>                 dma_fence_put(job->gang_submit);
>> @@ -246,10 +242,9 @@ amdgpu_job_dependency(struct drm_sched_job *sched_job,
>>  {
>>         struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
>>         struct amdgpu_job *job = to_amdgpu_job(sched_job);
>> -       struct dma_fence *fence;
>> +       struct dma_fence *fence = NULL;
>>         int r;
>>
>> -       fence = amdgpu_sync_get_fence(&job->sync);
>>         while (fence == NULL && job->vm && !job->vmid) {
>>                 r = amdgpu_vmid_grab(job->vm, ring, job, &fence);
>>                 if (r)
>> @@ -273,8 +268,6 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
>>         job = to_amdgpu_job(sched_job);
>>         finished = &job->base.s_fence->finished;
>>
>> -       BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
>> -
>>         trace_amdgpu_sched_run_job(job);
>>
>>         /* Skip job if VRAM is lost and never resubmit gangs */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> index 9c10b9bd0084..6558839fda03 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> @@ -47,7 +47,6 @@ enum amdgpu_ib_pool_type;
>>  struct amdgpu_job {
>>         struct drm_sched_job    base;
>>         struct amdgpu_vm        *vm;
>> -       struct amdgpu_sync      sync;
>>         struct amdgpu_sync      explicit_sync;
>>         struct dma_fence        hw_fence;
>>         struct dma_fence        *gang_submit;
>> --
>> 2.25.1
>>
> 
> Hi, I've been testing the Mesh shader benchmark in GravityMark and
> I've bisected my laptop freezing up and rebooting, to this commit
> 
> 1728baa7e4e60054bf13dd9b1212d133cbd53b3f is the first bad commit
> commit 1728baa7e4e60054bf13dd9b1212d133cbd53b3f
> Author: Christian König <christian.koenig@amd.com>
> Date:   Thu Sep 29 14:04:01 2022 +0200
> 
>    drm/amdgpu: use scheduler dependencies for CS
> 
>    Entirely remove the sync obj in the job.
> 
>    Signed-off-by: Christian König <christian.koenig@amd.com>
>    Reviewed-by: Luben Tuikov <luben.tuikov@amd.com>
>    Link: https://patchwork.freedesktop.org/patch/msgid/20221014084641.128280-11-christian.koenig@amd.com
> 
> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 21 ++++++++++-----------
> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h  |  2 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  9 +--------
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
> 4 files changed, 13 insertions(+), 20 deletions(-)
> 
> This is on a prime system 6800M with the latest mesa
> 
> I tried reverting this patch however it didn't revert cleanly, and my
> attempt doesn't work and only partially freezes up the system
> 
> Would you like me to open a bug for this on
> https://gitlab.freedesktop.org/drm/amd/-/issues ?
> 

Hi Mike,

Could you try this patch:

https://lore.kernel.org/all/20221219104718.21677-1-christian.koenig@amd.com/

Regards,
Luben
Mike Lothian Dec. 21, 2022, 3:55 p.m. UTC | #5
On Wed, 21 Dec 2022 at 15:52, Luben Tuikov <luben.tuikov@amd.com> wrote:
>
> On 2022-12-21 10:34, Mike Lothian wrote:
> > On Fri, 14 Oct 2022 at 09:47, Christian König
> > <ckoenig.leichtzumerken@gmail.com> wrote:
> >>
> >> Entirely remove the sync obj in the job.
> >>
> >> Signed-off-by: Christian König <christian.koenig@amd.com>
> >> ---
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 21 ++++++++++-----------
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h  |  2 ++
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  9 +--------
> >>  drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
> >>  4 files changed, 13 insertions(+), 20 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> >> index d45b86bcf7fa..0528c2b1db6e 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> >> @@ -426,7 +426,7 @@ static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p,
> >>                         dma_fence_put(old);
> >>                 }
> >>
> >> -               r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
> >> +               r = amdgpu_sync_fence(&p->sync, fence);
> >>                 dma_fence_put(fence);
> >>                 if (r)
> >>                         return r;
> >> @@ -448,7 +448,7 @@ static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p,
> >>                 return r;
> >>         }
> >>
> >> -       r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
> >> +       r = amdgpu_sync_fence(&p->sync, fence);
> >>         if (r)
> >>                 goto error;
> >>
> >> @@ -1108,7 +1108,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
> >>         if (r)
> >>                 return r;
> >>
> >> -       r = amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update);
> >> +       r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update);
> >>         if (r)
> >>                 return r;
> >>
> >> @@ -1119,7 +1119,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
> >>                 if (r)
> >>                         return r;
> >>
> >> -               r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
> >> +               r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
> >>                 if (r)
> >>                         return r;
> >>         }
> >> @@ -1138,7 +1138,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
> >>                 if (r)
> >>                         return r;
> >>
> >> -               r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
> >> +               r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
> >>                 if (r)
> >>                         return r;
> >>         }
> >> @@ -1151,7 +1151,7 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
> >>         if (r)
> >>                 return r;
> >>
> >> -       r = amdgpu_sync_fence(&job->sync, vm->last_update);
> >> +       r = amdgpu_sync_fence(&p->sync, vm->last_update);
> >>         if (r)
> >>                 return r;
> >>
> >> @@ -1183,7 +1183,6 @@ static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
> >>  static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
> >>  {
> >>         struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
> >> -       struct amdgpu_job *leader = p->gang_leader;
> >>         struct amdgpu_bo_list_entry *e;
> >>         unsigned int i;
> >>         int r;
> >> @@ -1195,14 +1194,14 @@ static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
> >>
> >>                 sync_mode = amdgpu_bo_explicit_sync(bo) ?
> >>                         AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER;
> >> -               r = amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode,
> >> +               r = amdgpu_sync_resv(p->adev, &p->sync, resv, sync_mode,
> >>                                      &fpriv->vm);
> >>                 if (r)
> >>                         return r;
> >>         }
> >>
> >> -       for (i = 0; i < p->gang_size - 1; ++i) {
> >> -               r = amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync);
> >> +       for (i = 0; i < p->gang_size; ++i) {
> >> +               r = amdgpu_sync_push_to_job(&p->sync, p->jobs[i]);
> >>                 if (r)
> >>                         return r;
> >>         }
> >> @@ -1248,7 +1247,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
> >>                 struct dma_fence *fence;
> >>
> >>                 fence = &p->jobs[i]->base.s_fence->scheduled;
> >> -               r = amdgpu_sync_fence(&leader->sync, fence);
> >> +               r = drm_sched_job_add_dependency(&leader->base, fence);
> >>                 if (r)
> >>                         goto error_cleanup;
> >>         }
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> >> index cbaa19b2b8a3..207e801c24ed 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
> >> @@ -75,6 +75,8 @@ struct amdgpu_cs_parser {
> >>
> >>         unsigned                        num_post_deps;
> >>         struct amdgpu_cs_post_dep       *post_deps;
> >> +
> >> +       struct amdgpu_sync              sync;
> >>  };
> >>
> >>  int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser,
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >> index ba98d65835b4..b8494c3b3b8a 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> >> @@ -106,7 +106,6 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> >>         (*job)->base.sched = &adev->rings[0]->sched;
> >>         (*job)->vm = vm;
> >>
> >> -       amdgpu_sync_create(&(*job)->sync);
> >>         amdgpu_sync_create(&(*job)->explicit_sync);
> >>         (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
> >>         (*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
> >> @@ -174,9 +173,7 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
> >>
> >>         drm_sched_job_cleanup(s_job);
> >>
> >> -       amdgpu_sync_free(&job->sync);
> >>         amdgpu_sync_free(&job->explicit_sync);
> >> -
> >>         dma_fence_put(&job->hw_fence);
> >>  }
> >>
> >> @@ -202,7 +199,6 @@ void amdgpu_job_free(struct amdgpu_job *job)
> >>                 drm_sched_job_cleanup(&job->base);
> >>
> >>         amdgpu_job_free_resources(job);
> >> -       amdgpu_sync_free(&job->sync);
> >>         amdgpu_sync_free(&job->explicit_sync);
> >>         if (job->gang_submit != &job->base.s_fence->scheduled)
> >>                 dma_fence_put(job->gang_submit);
> >> @@ -246,10 +242,9 @@ amdgpu_job_dependency(struct drm_sched_job *sched_job,
> >>  {
> >>         struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
> >>         struct amdgpu_job *job = to_amdgpu_job(sched_job);
> >> -       struct dma_fence *fence;
> >> +       struct dma_fence *fence = NULL;
> >>         int r;
> >>
> >> -       fence = amdgpu_sync_get_fence(&job->sync);
> >>         while (fence == NULL && job->vm && !job->vmid) {
> >>                 r = amdgpu_vmid_grab(job->vm, ring, job, &fence);
> >>                 if (r)
> >> @@ -273,8 +268,6 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
> >>         job = to_amdgpu_job(sched_job);
> >>         finished = &job->base.s_fence->finished;
> >>
> >> -       BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
> >> -
> >>         trace_amdgpu_sched_run_job(job);
> >>
> >>         /* Skip job if VRAM is lost and never resubmit gangs */
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> >> index 9c10b9bd0084..6558839fda03 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> >> @@ -47,7 +47,6 @@ enum amdgpu_ib_pool_type;
> >>  struct amdgpu_job {
> >>         struct drm_sched_job    base;
> >>         struct amdgpu_vm        *vm;
> >> -       struct amdgpu_sync      sync;
> >>         struct amdgpu_sync      explicit_sync;
> >>         struct dma_fence        hw_fence;
> >>         struct dma_fence        *gang_submit;
> >> --
> >> 2.25.1
> >>
> >
> > Hi, I've been testing the Mesh shader benchmark in GravityMark and
> > I've bisected my laptop freezing up and rebooting, to this commit
> >
> > 1728baa7e4e60054bf13dd9b1212d133cbd53b3f is the first bad commit
> > commit 1728baa7e4e60054bf13dd9b1212d133cbd53b3f
> > Author: Christian König <christian.koenig@amd.com>
> > Date:   Thu Sep 29 14:04:01 2022 +0200
> >
> >    drm/amdgpu: use scheduler dependencies for CS
> >
> >    Entirely remove the sync obj in the job.
> >
> >    Signed-off-by: Christian König <christian.koenig@amd.com>
> >    Reviewed-by: Luben Tuikov <luben.tuikov@amd.com>
> >    Link: https://patchwork.freedesktop.org/patch/msgid/20221014084641.128280-11-christian.koenig@amd.com
> >
> > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 21 ++++++++++-----------
> > drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h  |  2 ++
> > drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  9 +--------
> > drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
> > 4 files changed, 13 insertions(+), 20 deletions(-)
> >
> > This is on a prime system 6800M with the latest mesa
> >
> > I tried reverting this patch however it didn't revert cleanly, and my
> > attempt doesn't work and only partially freezes up the system
> >
> > Would you like me to open a bug for this on
> > https://gitlab.freedesktop.org/drm/amd/-/issues ?
> >
>
> Hi Mike,
>
> Could you try this patch:
>
> https://lore.kernel.org/all/20221219104718.21677-1-christian.koenig@amd.com/
>
> Regards,
> Luben
>
>

I still see the same issue with this patch
diff mbox series

Patch

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index d45b86bcf7fa..0528c2b1db6e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -426,7 +426,7 @@  static int amdgpu_cs_p2_dependencies(struct amdgpu_cs_parser *p,
 			dma_fence_put(old);
 		}
 
-		r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
+		r = amdgpu_sync_fence(&p->sync, fence);
 		dma_fence_put(fence);
 		if (r)
 			return r;
@@ -448,7 +448,7 @@  static int amdgpu_syncobj_lookup_and_add(struct amdgpu_cs_parser *p,
 		return r;
 	}
 
-	r = amdgpu_sync_fence(&p->gang_leader->sync, fence);
+	r = amdgpu_sync_fence(&p->sync, fence);
 	if (r)
 		goto error;
 
@@ -1108,7 +1108,7 @@  static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 	if (r)
 		return r;
 
-	r = amdgpu_sync_fence(&job->sync, fpriv->prt_va->last_pt_update);
+	r = amdgpu_sync_fence(&p->sync, fpriv->prt_va->last_pt_update);
 	if (r)
 		return r;
 
@@ -1119,7 +1119,7 @@  static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 		if (r)
 			return r;
 
-		r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
+		r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
 		if (r)
 			return r;
 	}
@@ -1138,7 +1138,7 @@  static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 		if (r)
 			return r;
 
-		r = amdgpu_sync_fence(&job->sync, bo_va->last_pt_update);
+		r = amdgpu_sync_fence(&p->sync, bo_va->last_pt_update);
 		if (r)
 			return r;
 	}
@@ -1151,7 +1151,7 @@  static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 	if (r)
 		return r;
 
-	r = amdgpu_sync_fence(&job->sync, vm->last_update);
+	r = amdgpu_sync_fence(&p->sync, vm->last_update);
 	if (r)
 		return r;
 
@@ -1183,7 +1183,6 @@  static int amdgpu_cs_vm_handling(struct amdgpu_cs_parser *p)
 static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
 {
 	struct amdgpu_fpriv *fpriv = p->filp->driver_priv;
-	struct amdgpu_job *leader = p->gang_leader;
 	struct amdgpu_bo_list_entry *e;
 	unsigned int i;
 	int r;
@@ -1195,14 +1194,14 @@  static int amdgpu_cs_sync_rings(struct amdgpu_cs_parser *p)
 
 		sync_mode = amdgpu_bo_explicit_sync(bo) ?
 			AMDGPU_SYNC_EXPLICIT : AMDGPU_SYNC_NE_OWNER;
-		r = amdgpu_sync_resv(p->adev, &leader->sync, resv, sync_mode,
+		r = amdgpu_sync_resv(p->adev, &p->sync, resv, sync_mode,
 				     &fpriv->vm);
 		if (r)
 			return r;
 	}
 
-	for (i = 0; i < p->gang_size - 1; ++i) {
-		r = amdgpu_sync_clone(&leader->sync, &p->jobs[i]->sync);
+	for (i = 0; i < p->gang_size; ++i) {
+		r = amdgpu_sync_push_to_job(&p->sync, p->jobs[i]);
 		if (r)
 			return r;
 	}
@@ -1248,7 +1247,7 @@  static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 		struct dma_fence *fence;
 
 		fence = &p->jobs[i]->base.s_fence->scheduled;
-		r = amdgpu_sync_fence(&leader->sync, fence);
+		r = drm_sched_job_add_dependency(&leader->base, fence);
 		if (r)
 			goto error_cleanup;
 	}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
index cbaa19b2b8a3..207e801c24ed 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.h
@@ -75,6 +75,8 @@  struct amdgpu_cs_parser {
 
 	unsigned			num_post_deps;
 	struct amdgpu_cs_post_dep	*post_deps;
+
+	struct amdgpu_sync		sync;
 };
 
 int amdgpu_cs_find_mapping(struct amdgpu_cs_parser *parser,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index ba98d65835b4..b8494c3b3b8a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -106,7 +106,6 @@  int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	(*job)->base.sched = &adev->rings[0]->sched;
 	(*job)->vm = vm;
 
-	amdgpu_sync_create(&(*job)->sync);
 	amdgpu_sync_create(&(*job)->explicit_sync);
 	(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
 	(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
@@ -174,9 +173,7 @@  static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
 
 	drm_sched_job_cleanup(s_job);
 
-	amdgpu_sync_free(&job->sync);
 	amdgpu_sync_free(&job->explicit_sync);
-
 	dma_fence_put(&job->hw_fence);
 }
 
@@ -202,7 +199,6 @@  void amdgpu_job_free(struct amdgpu_job *job)
 		drm_sched_job_cleanup(&job->base);
 
 	amdgpu_job_free_resources(job);
-	amdgpu_sync_free(&job->sync);
 	amdgpu_sync_free(&job->explicit_sync);
 	if (job->gang_submit != &job->base.s_fence->scheduled)
 		dma_fence_put(job->gang_submit);
@@ -246,10 +242,9 @@  amdgpu_job_dependency(struct drm_sched_job *sched_job,
 {
 	struct amdgpu_ring *ring = to_amdgpu_ring(s_entity->rq->sched);
 	struct amdgpu_job *job = to_amdgpu_job(sched_job);
-	struct dma_fence *fence;
+	struct dma_fence *fence = NULL;
 	int r;
 
-	fence = amdgpu_sync_get_fence(&job->sync);
 	while (fence == NULL && job->vm && !job->vmid) {
 		r = amdgpu_vmid_grab(job->vm, ring, job, &fence);
 		if (r)
@@ -273,8 +268,6 @@  static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
 	job = to_amdgpu_job(sched_job);
 	finished = &job->base.s_fence->finished;
 
-	BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
-
 	trace_amdgpu_sched_run_job(job);
 
 	/* Skip job if VRAM is lost and never resubmit gangs */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 9c10b9bd0084..6558839fda03 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -47,7 +47,6 @@  enum amdgpu_ib_pool_type;
 struct amdgpu_job {
 	struct drm_sched_job    base;
 	struct amdgpu_vm	*vm;
-	struct amdgpu_sync	sync;
 	struct amdgpu_sync	explicit_sync;
 	struct dma_fence	hw_fence;
 	struct dma_fence	*gang_submit;