diff mbox series

[drm-next,13/14] drm/nouveau: implement new VM_BIND UAPI

Message ID 20230118061256.2689-14-dakr@redhat.com (mailing list archive)
State New, archived
Headers show
Series DRM GPUVA Manager & Nouveau VM_BIND UAPI | expand

Commit Message

Danilo Krummrich Jan. 18, 2023, 6:12 a.m. UTC
This commit provides the implementation for the new uapi motivated by the
Vulkan API. It allows user mode drivers (UMDs) to:

1) Initialize a GPU virtual address (VA) space via the new
   DRM_IOCTL_NOUVEAU_VM_INIT ioctl for UMDs to specify the portion of VA
   space managed by the kernel and userspace, respectively.

2) Allocate and free a VA space region as well as bind and unbind memory
   to the GPU's VA space via the new DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
   UMDs can request the named operations to be processed either
   synchronously or asynchronously. It supports DRM syncobjs
   (incl. timelines) as synchronization mechanism. The management of the
   GPU VA mappings is implemented with the DRM GPU VA manager.

3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl. The
   execution happens asynchronously. It supports DRM syncobj (incl.
   timelines) as synchronization mechanism. DRM GEM object locking is
   handled with drm_exec.

Both DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC use the DRM
GPU scheduler for the asynchronous paths.

Signed-off-by: Danilo Krummrich <dakr@redhat.com>
---
 Documentation/gpu/driver-uapi.rst       |   3 +
 drivers/gpu/drm/nouveau/Kbuild          |   2 +
 drivers/gpu/drm/nouveau/Kconfig         |   2 +
 drivers/gpu/drm/nouveau/nouveau_abi16.c |  16 +
 drivers/gpu/drm/nouveau/nouveau_abi16.h |   1 +
 drivers/gpu/drm/nouveau/nouveau_drm.c   |  23 +-
 drivers/gpu/drm/nouveau/nouveau_drv.h   |   9 +-
 drivers/gpu/drm/nouveau/nouveau_exec.c  | 310 ++++++++++
 drivers/gpu/drm/nouveau/nouveau_exec.h  |  55 ++
 drivers/gpu/drm/nouveau/nouveau_sched.c | 780 ++++++++++++++++++++++++
 drivers/gpu/drm/nouveau/nouveau_sched.h |  98 +++
 11 files changed, 1295 insertions(+), 4 deletions(-)
 create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.c
 create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.h
 create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.c
 create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.h

Comments

kernel test robot Jan. 18, 2023, 8:37 a.m. UTC | #1
Hi Danilo,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on 0b45ac1170ea6416bc1d36798414c04870cd356d]

url:    https://github.com/intel-lab-lkp/linux/commits/Danilo-Krummrich/drm-execution-context-for-GEM-buffers/20230118-141552
base:   0b45ac1170ea6416bc1d36798414c04870cd356d
patch link:    https://lore.kernel.org/r/20230118061256.2689-14-dakr%40redhat.com
patch subject: [PATCH drm-next 13/14] drm/nouveau: implement new VM_BIND UAPI
config: alpha-allyesconfig
compiler: alpha-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/85eb5d5193ee710b6ef76ec2a1b76e2254017a2d
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Danilo-Krummrich/drm-execution-context-for-GEM-buffers/20230118-141552
        git checkout 85eb5d5193ee710b6ef76ec2a1b76e2254017a2d
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=alpha olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross W=1 O=build_dir ARCH=alpha SHELL=/bin/bash drivers/gpu/drm/

If you fix the issue, kindly add the following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>

All warnings (new ones prefixed by >>):

>> drivers/gpu/drm/nouveau/nouveau_exec.c:153:5: warning: no previous prototype for 'nouveau_vm_bind' [-Wmissing-prototypes]
     153 | int nouveau_vm_bind(struct nouveau_exec_bind *bind)
         |     ^~~~~~~~~~~~~~~
--
>> drivers/gpu/drm/nouveau/nouveau_sched.c:173:1: warning: no previous prototype for 'nouveau_bind_job_submit' [-Wmissing-prototypes]
     173 | nouveau_bind_job_submit(struct nouveau_job *job)
         | ^~~~~~~~~~~~~~~~~~~~~~~
>> drivers/gpu/drm/nouveau/nouveau_sched.c:428:1: warning: no previous prototype for 'nouveau_exec_job_submit' [-Wmissing-prototypes]
     428 | nouveau_exec_job_submit(struct nouveau_job *job)
         | ^~~~~~~~~~~~~~~~~~~~~~~


vim +/nouveau_vm_bind +153 drivers/gpu/drm/nouveau/nouveau_exec.c

   152	
 > 153	int nouveau_vm_bind(struct nouveau_exec_bind *bind)
   154	{
   155		struct nouveau_bind_job *job;
   156		int ret;
   157	
   158		ret = nouveau_bind_job_init(&job, bind);
   159		if (ret)
   160			return ret;
   161	
   162		ret = nouveau_job_submit(&job->base);
   163		if (ret)
   164			goto err_job_fini;
   165	
   166		return 0;
   167	
   168	err_job_fini:
   169		nouveau_job_fini(&job->base);
   170		return ret;
   171	}
   172
Thomas Hellström (Intel) Jan. 18, 2023, 8:37 p.m. UTC | #2
On 1/18/23 07:12, Danilo Krummrich wrote:
> This commit provides the implementation for the new uapi motivated by the
> Vulkan API. It allows user mode drivers (UMDs) to:
>
> 1) Initialize a GPU virtual address (VA) space via the new
>     DRM_IOCTL_NOUVEAU_VM_INIT ioctl for UMDs to specify the portion of VA
>     space managed by the kernel and userspace, respectively.
>
> 2) Allocate and free a VA space region as well as bind and unbind memory
>     to the GPUs VA space via the new DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
>     UMDs can request the named operations to be processed either
>     synchronously or asynchronously. It supports DRM syncobjs
>     (incl. timelines) as synchronization mechanism. The management of the
>     GPU VA mappings is implemented with the DRM GPU VA manager.
>
> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl. The
>     execution happens asynchronously. It supports DRM syncobj (incl.
>     timelines) as synchronization mechanism. DRM GEM object locking is
>     handled with drm_exec.
>
> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC, use the DRM
> GPU scheduler for the asynchronous paths.
>
> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> ---
>   Documentation/gpu/driver-uapi.rst       |   3 +
>   drivers/gpu/drm/nouveau/Kbuild          |   2 +
>   drivers/gpu/drm/nouveau/Kconfig         |   2 +
>   drivers/gpu/drm/nouveau/nouveau_abi16.c |  16 +
>   drivers/gpu/drm/nouveau/nouveau_abi16.h |   1 +
>   drivers/gpu/drm/nouveau/nouveau_drm.c   |  23 +-
>   drivers/gpu/drm/nouveau/nouveau_drv.h   |   9 +-
>   drivers/gpu/drm/nouveau/nouveau_exec.c  | 310 ++++++++++
>   drivers/gpu/drm/nouveau/nouveau_exec.h  |  55 ++
>   drivers/gpu/drm/nouveau/nouveau_sched.c | 780 ++++++++++++++++++++++++
>   drivers/gpu/drm/nouveau/nouveau_sched.h |  98 +++
>   11 files changed, 1295 insertions(+), 4 deletions(-)
>   create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.c
>   create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.h
>   create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.c
>   create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.h
...
>
> +static struct dma_fence *
> +nouveau_bind_job_run(struct nouveau_job *job)
> +{
> +	struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
> +	struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(job->cli);
> +	struct bind_job_op *op;
> +	int ret = 0;
> +

I was looking at how nouveau does the async binding compared to how xe 
does it.
It looks to me like this function, being a scheduler run_job callback, is 
the main part of the VM_BIND dma-fence signalling critical section for 
the job's done_fence. If so, doesn't it need to be annotated as such?

For example, nouveau_uvma_region_new allocates memory, which is not 
allowed inside a dma_fence signalling critical section, and the locking 
also looks suspicious?

Thanks,

Thomas


> +	nouveau_uvmm_lock(uvmm);
> +	list_for_each_op(op, &bind_job->ops) {
> +		switch (op->op) {
> +		case OP_ALLOC: {
> +			bool sparse = op->flags & DRM_NOUVEAU_VM_BIND_SPARSE;
> +
> +			ret = nouveau_uvma_region_new(uvmm,
> +						      op->va.addr,
> +						      op->va.range,
> +						      sparse);
> +			if (ret)
> +				goto out_unlock;
> +			break;
> +		}
> +		case OP_FREE:
> +			ret = nouveau_uvma_region_destroy(uvmm,
> +							  op->va.addr,
> +							  op->va.range);
> +			if (ret)
> +				goto out_unlock;
> +			break;
> +		case OP_MAP:
> +			ret = nouveau_uvmm_sm_map(uvmm,
> +						  op->va.addr, op->va.range,
> +						  op->gem.obj, op->gem.offset,
> +						  op->flags && 0xff);
> +			if (ret)
> +				goto out_unlock;
> +			break;
> +		case OP_UNMAP:
> +			ret = nouveau_uvmm_sm_unmap(uvmm,
> +						    op->va.addr,
> +						    op->va.range);
> +			if (ret)
> +				goto out_unlock;
> +			break;
> +		}
> +	}
> +
> +out_unlock:
> +	nouveau_uvmm_unlock(uvmm);
> +	if (ret)
> +		NV_PRINTK(err, job->cli, "bind job failed: %d\n", ret);
> +	return ERR_PTR(ret);
> +}
> +
> +static void
> +nouveau_bind_job_free(struct nouveau_job *job)
> +{
> +	struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
> +	struct bind_job_op *op, *next;
> +
> +	list_for_each_op_safe(op, next, &bind_job->ops) {
> +		struct drm_gem_object *obj = op->gem.obj;
> +
> +		if (obj)
> +			drm_gem_object_put(obj);
> +
> +		list_del(&op->entry);
> +		kfree(op);
> +	}
> +
> +	nouveau_base_job_free(job);
> +	kfree(bind_job);
> +}
> +
> +static struct nouveau_job_ops nouveau_bind_job_ops = {
> +	.submit = nouveau_bind_job_submit,
> +	.run = nouveau_bind_job_run,
> +	.free = nouveau_bind_job_free,
> +};
> +
> +static int
> +bind_job_op_from_uop(struct bind_job_op **pop,
> +		     struct drm_nouveau_vm_bind_op *uop)
> +{
> +	struct bind_job_op *op;
> +
> +	op = *pop = kzalloc(sizeof(*op), GFP_KERNEL);
> +	if (!op)
> +		return -ENOMEM;
> +
> +	op->op = uop->op;
> +	op->flags = uop->flags;
> +	op->va.addr = uop->addr;
> +	op->va.range = uop->range;
> +
> +	if (op->op == DRM_NOUVEAU_VM_BIND_OP_MAP) {
> +		op->gem.handle = uop->handle;
> +		op->gem.offset = uop->bo_offset;
> +	}
> +
> +	return 0;
> +}
> +
> +static void
> +bind_job_ops_free(struct list_head *ops)
> +{
> +	struct bind_job_op *op, *next;
> +
> +	list_for_each_op_safe(op, next, ops) {
> +		list_del(&op->entry);
> +		kfree(op);
> +	}
> +}
> +
> +int
> +nouveau_bind_job_init(struct nouveau_bind_job **pjob,
> +		      struct nouveau_exec_bind *bind)
> +{
> +	struct nouveau_bind_job *job;
> +	struct bind_job_op *op;
> +	int i, ret;
> +
> +	job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
> +	if (!job)
> +		return -ENOMEM;
> +
> +	INIT_LIST_HEAD(&job->ops);
> +
> +	for (i = 0; i < bind->op.count; i++) {
> +		ret = bind_job_op_from_uop(&op, &bind->op.s[i]);
> +		if (ret)
> +			goto err_free;
> +
> +		list_add_tail(&op->entry, &job->ops);
> +	}
> +
> +	job->base.sync = !(bind->flags & DRM_NOUVEAU_VM_BIND_RUN_ASYNC);
> +	job->base.ops = &nouveau_bind_job_ops;
> +
> +	ret = nouveau_base_job_init(&job->base, &bind->base);
> +	if (ret)
> +		goto err_free;
> +
> +	return 0;
> +
> +err_free:
> +	bind_job_ops_free(&job->ops);
> +	kfree(job);
> +	*pjob = NULL;
> +
> +	return ret;
> +}
> +
> +static int
> +sync_find_fence(struct nouveau_job *job,
> +		struct drm_nouveau_sync *sync,
> +		struct dma_fence **fence)
> +{
> +	u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
> +	u64 point = 0;
> +	int ret;
> +
> +	if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
> +	    stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
> +		return -EOPNOTSUPP;
> +
> +	if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
> +		point = sync->timeline_value;
> +
> +	ret = drm_syncobj_find_fence(job->file_priv,
> +				     sync->handle, point,
> +				     sync->flags, fence);
> +	if (ret)
> +		return ret;
> +
> +	return 0;
> +}
> +
> +static int
> +exec_job_binds_wait(struct nouveau_job *job)
> +{
> +	struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> +	struct nouveau_cli *cli = exec_job->base.cli;
> +	struct nouveau_sched_entity *bind_entity = &cli->sched_entity;
> +	signed long ret;
> +	int i;
> +
> +	for (i = 0; i < job->in_sync.count; i++) {
> +		struct nouveau_job *it;
> +		struct drm_nouveau_sync *sync = &job->in_sync.s[i];
> +		struct dma_fence *fence;
> +		bool found;
> +
> +		ret = sync_find_fence(job, sync, &fence);
> +		if (ret)
> +			return ret;
> +
> +		mutex_lock(&bind_entity->job.mutex);
> +		found = false;
> +		list_for_each_entry(it, &bind_entity->job.list, head) {
> +			if (fence == it->done_fence) {
> +				found = true;
> +				break;
> +			}
> +		}
> +		mutex_unlock(&bind_entity->job.mutex);
> +
> +		/* If the fence is not from a VM_BIND job, don't wait for it. */
> +		if (!found)
> +			continue;
> +
> +		ret = dma_fence_wait_timeout(fence, true,
> +					     msecs_to_jiffies(500));
> +		if (ret < 0)
> +			return ret;
> +		else if (ret == 0)
> +			return -ETIMEDOUT;
> +	}
> +
> +	return 0;
> +}
> +
> +int
> +nouveau_exec_job_submit(struct nouveau_job *job)
> +{
> +	struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> +	struct nouveau_cli *cli = exec_job->base.cli;
> +	struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(cli);
> +	struct drm_exec *exec = &job->exec;
> +	struct drm_gem_object *obj;
> +	unsigned long index;
> +	int ret;
> +
> +	ret = exec_job_binds_wait(job);
> +	if (ret)
> +		return ret;
> +
> +	nouveau_uvmm_lock(uvmm);
> +	drm_exec_while_not_all_locked(exec) {
> +		struct drm_gpuva *va;
> +
> +		drm_gpuva_for_each_va(va, &uvmm->umgr) {
> +			ret = drm_exec_prepare_obj(exec, va->gem.obj, 1);
> +			drm_exec_break_on_contention(exec);
> +			if (ret)
> +				return ret;
> +		}
> +	}
> +	nouveau_uvmm_unlock(uvmm);
> +
> +	drm_exec_for_each_locked_object(exec, index, obj) {
> +		struct dma_resv *resv = obj->resv;
> +		struct nouveau_bo *nvbo = nouveau_gem_object(obj);
> +
> +		ret = nouveau_bo_validate(nvbo, true, false);
> +		if (ret)
> +			return ret;
> +
> +		dma_resv_add_fence(resv, job->done_fence, DMA_RESV_USAGE_WRITE);
> +	}
> +
> +	return 0;
> +}
> +
> +static struct dma_fence *
> +nouveau_exec_job_run(struct nouveau_job *job)
> +{
> +	struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> +	struct nouveau_fence *fence;
> +	int i, ret;
> +
> +	ret = nouveau_dma_wait(job->chan, exec_job->push.count + 1, 16);
> +	if (ret) {
> +		NV_PRINTK(err, job->cli, "nv50cal_space: %d\n", ret);
> +		return ERR_PTR(ret);
> +	}
> +
> +	for (i = 0; i < exec_job->push.count; i++) {
> +		nv50_dma_push(job->chan, exec_job->push.s[i].va,
> +			      exec_job->push.s[i].va_len);
> +	}
> +
> +	ret = nouveau_fence_new(job->chan, false, &fence);
> +	if (ret) {
> +		NV_PRINTK(err, job->cli, "error fencing pushbuf: %d\n", ret);
> +		WIND_RING(job->chan);
> +		return ERR_PTR(ret);
> +	}
> +
> +	return &fence->base;
> +}
> +static void
> +nouveau_exec_job_free(struct nouveau_job *job)
> +{
> +	struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> +
> +	nouveau_base_job_free(job);
> +
> +	kfree(exec_job->push.s);
> +	kfree(exec_job);
> +}
> +
> +static struct nouveau_job_ops nouveau_exec_job_ops = {
> +	.submit = nouveau_exec_job_submit,
> +	.run = nouveau_exec_job_run,
> +	.free = nouveau_exec_job_free,
> +};
> +
> +int
> +nouveau_exec_job_init(struct nouveau_exec_job **pjob,
> +		      struct nouveau_exec *exec)
> +{
> +	struct nouveau_exec_job *job;
> +	int ret;
> +
> +	job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
> +	if (!job)
> +		return -ENOMEM;
> +
> +	job->push.count = exec->push.count;
> +	job->push.s = kmemdup(exec->push.s,
> +			      sizeof(*exec->push.s) *
> +			      exec->push.count,
> +			      GFP_KERNEL);
> +	if (!job->push.s) {
> +		ret = -ENOMEM;
> +		goto err_free_job;
> +	}
> +
> +	job->base.ops = &nouveau_exec_job_ops;
> +	ret = nouveau_base_job_init(&job->base, &exec->base);
> +	if (ret)
> +		goto err_free_pushs;
> +
> +	return 0;
> +
> +err_free_pushs:
> +	kfree(job->push.s);
> +err_free_job:
> +	kfree(job);
> +	*pjob = NULL;
> +
> +	return ret;
> +}
> +
> +void nouveau_job_fini(struct nouveau_job *job)
> +{
> +	dma_fence_put(job->done_fence);
> +	drm_sched_job_cleanup(&job->base);
> +	job->ops->free(job);
> +}
> +
> +static int
> +nouveau_job_add_deps(struct nouveau_job *job)
> +{
> +	struct dma_fence *in_fence = NULL;
> +	int ret, i;
> +
> +	for (i = 0; i < job->in_sync.count; i++) {
> +		struct drm_nouveau_sync *sync = &job->in_sync.s[i];
> +
> +		ret = sync_find_fence(job, sync, &in_fence);
> +		if (ret) {
> +			NV_PRINTK(warn, job->cli,
> +				  "Failed to find syncobj (-> in): handle=%d\n",
> +				  sync->handle);
> +			return ret;
> +		}
> +
> +		ret = drm_sched_job_add_dependency(&job->base, in_fence);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static int
> +nouveau_job_fence_attach(struct nouveau_job *job, struct dma_fence *fence)
> +{
> +	struct drm_syncobj *out_sync;
> +	int i;
> +
> +	for (i = 0; i < job->out_sync.count; i++) {
> +		struct drm_nouveau_sync *sync = &job->out_sync.s[i];
> +		u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
> +
> +		if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
> +		    stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
> +			return -EOPNOTSUPP;
> +
> +		out_sync = drm_syncobj_find(job->file_priv, sync->handle);
> +		if (!out_sync) {
> +			NV_PRINTK(warn, job->cli,
> +				  "Failed to find syncobj (-> out): handle=%d\n",
> +				  sync->handle);
> +			return -ENOENT;
> +		}
> +
> +		if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ) {
> +			struct dma_fence_chain *chain;
> +
> +			chain = dma_fence_chain_alloc();
> +			if (!chain) {
> +				drm_syncobj_put(out_sync);
> +				return -ENOMEM;
> +			}
> +
> +			drm_syncobj_add_point(out_sync, chain, fence,
> +					      sync->timeline_value);
> +		} else {
> +			drm_syncobj_replace_fence(out_sync, fence);
> +		}
> +
> +		drm_syncobj_put(out_sync);
> +	}
> +
> +	return 0;
> +}
> +
> +static struct dma_fence *
> +nouveau_job_run(struct nouveau_job *job)
> +{
> +	return job->ops->run(job);
> +}
> +
> +static int
> +nouveau_job_run_sync(struct nouveau_job *job)
> +{
> +	struct dma_fence *fence;
> +	int ret;
> +
> +	fence = nouveau_job_run(job);
> +	if (IS_ERR(fence)) {
> +		return PTR_ERR(fence);
> +	} else if (fence) {
> +		ret = dma_fence_wait(fence, true);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	dma_fence_signal(job->done_fence);
> +
> +	return 0;
> +}
> +
> +int
> +nouveau_job_submit(struct nouveau_job *job)
> +{
> +	struct nouveau_sched_entity *entity = to_nouveau_sched_entity(job->base.entity);
> +	int ret;
> +
> +	drm_exec_init(&job->exec, true);
> +
> +	ret = nouveau_job_add_deps(job);
> +	if (ret)
> +		goto out;
> +
> +	drm_sched_job_arm(&job->base);
> +	job->done_fence = dma_fence_get(&job->base.s_fence->finished);
> +
> +	ret = nouveau_job_fence_attach(job, job->done_fence);
> +	if (ret)
> +		goto out;
> +
> +	if (job->ops->submit) {
> +		ret = job->ops->submit(job);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	if (job->sync) {
> +		drm_exec_fini(&job->exec);
> +
> +		/* We're requested to run a synchronous job, hence don't push
> +		 * the job, bypassing the job scheduler, and execute the jobs
> +		 * run() function right away.
> +		 *
> +		 * As a consequence of bypassing the job scheduler we need to
> +		 * handle fencing and job cleanup ourselfes.
> +		 */
> +		ret = nouveau_job_run_sync(job);
> +
> +		/* If the job fails, the caller will do the cleanup for us. */
> +		if (!ret)
> +			nouveau_job_fini(job);
> +
> +		return ret;
> +	} else {
> +		mutex_lock(&entity->job.mutex);
> +		drm_sched_entity_push_job(&job->base);
> +		list_add_tail(&job->head, &entity->job.list);
> +		mutex_unlock(&entity->job.mutex);
> +	}
> +
> +out:
> +	drm_exec_fini(&job->exec);
> +	return ret;
> +}
> +
> +static struct dma_fence *
> +nouveau_sched_run_job(struct drm_sched_job *sched_job)
> +{
> +	struct nouveau_job *job = to_nouveau_job(sched_job);
> +
> +	return nouveau_job_run(job);
> +}
> +
> +static enum drm_gpu_sched_stat
> +nouveau_sched_timedout_job(struct drm_sched_job *sched_job)
> +{
> +	struct nouveau_job *job = to_nouveau_job(sched_job);
> +	struct nouveau_channel *chan = job->chan;
> +
> +	if (unlikely(!atomic_read(&chan->killed)))
> +		nouveau_channel_kill(chan);
> +
> +	NV_PRINTK(warn, job->cli, "job timeout, channel %d killed!\n",
> +		  chan->chid);
> +
> +	nouveau_sched_entity_fini(job->entity);
> +
> +	return DRM_GPU_SCHED_STAT_ENODEV;
> +}
> +
> +static void
> +nouveau_sched_free_job(struct drm_sched_job *sched_job)
> +{
> +	struct nouveau_job *job = to_nouveau_job(sched_job);
> +	struct nouveau_sched_entity *entity = job->entity;
> +
> +	mutex_lock(&entity->job.mutex);
> +	list_del(&job->head);
> +	mutex_unlock(&entity->job.mutex);
> +
> +	nouveau_job_fini(job);
> +}
> +
> +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
> +			      struct drm_gpu_scheduler *sched)
> +{
> +
> +	INIT_LIST_HEAD(&entity->job.list);
> +	mutex_init(&entity->job.mutex);
> +
> +	return drm_sched_entity_init(&entity->base,
> +				     DRM_SCHED_PRIORITY_NORMAL,
> +				     &sched, 1, NULL);
> +}
> +
> +void
> +nouveau_sched_entity_fini(struct nouveau_sched_entity *entity)
> +{
> +	drm_sched_entity_destroy(&entity->base);
> +}
> +
> +static const struct drm_sched_backend_ops nouveau_sched_ops = {
> +	.run_job = nouveau_sched_run_job,
> +	.timedout_job = nouveau_sched_timedout_job,
> +	.free_job = nouveau_sched_free_job,
> +};
> +
> +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
> +		       struct nouveau_drm *drm)
> +{
> +	long job_hang_limit = msecs_to_jiffies(NOUVEAU_SCHED_JOB_TIMEOUT_MS);
> +
> +	return drm_sched_init(sched, &nouveau_sched_ops,
> +			      NOUVEAU_SCHED_HW_SUBMISSIONS, 0, job_hang_limit,
> +			      NULL, NULL, "nouveau", drm->dev->dev);
> +}
> +
> +void nouveau_sched_fini(struct drm_gpu_scheduler *sched)
> +{
> +	drm_sched_fini(sched);
> +}
> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h b/drivers/gpu/drm/nouveau/nouveau_sched.h
> new file mode 100644
> index 000000000000..7fc5b7eea810
> --- /dev/null
> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.h
> @@ -0,0 +1,98 @@
> +// SPDX-License-Identifier: MIT
> +
> +#ifndef NOUVEAU_SCHED_H
> +#define NOUVEAU_SCHED_H
> +
> +#include <linux/types.h>
> +
> +#include <drm/drm_exec.h>
> +#include <drm/gpu_scheduler.h>
> +
> +#include "nouveau_drv.h"
> +#include "nouveau_exec.h"
> +
> +#define to_nouveau_job(sched_job)		\
> +		container_of((sched_job), struct nouveau_job, base)
> +
> +#define to_nouveau_exec_job(job)		\
> +		container_of((job), struct nouveau_exec_job, base)
> +
> +#define to_nouveau_bind_job(job)		\
> +		container_of((job), struct nouveau_bind_job, base)
> +
> +struct nouveau_job {
> +	struct drm_sched_job base;
> +	struct list_head head;
> +
> +	struct nouveau_sched_entity *entity;
> +
> +	struct drm_file *file_priv;
> +	struct nouveau_cli *cli;
> +	struct nouveau_channel *chan;
> +
> +	struct drm_exec exec;
> +	struct dma_fence *done_fence;
> +
> +	bool sync;
> +
> +	struct {
> +		struct drm_nouveau_sync *s;
> +		u32 count;
> +	} in_sync;
> +
> +	struct {
> +		struct drm_nouveau_sync *s;
> +		u32 count;
> +	} out_sync;
> +
> +	struct nouveau_job_ops {
> +		int (*submit)(struct nouveau_job *);
> +		struct dma_fence *(*run)(struct nouveau_job *);
> +		void (*free)(struct nouveau_job *);
> +	} *ops;
> +};
> +
> +struct nouveau_exec_job {
> +	struct nouveau_job base;
> +
> +	struct {
> +		struct drm_nouveau_exec_push *s;
> +		u32 count;
> +	} push;
> +};
> +
> +struct nouveau_bind_job {
> +	struct nouveau_job base;
> +
> +	/* struct bind_job_op */
> +	struct list_head ops;
> +};
> +
> +int nouveau_bind_job_init(struct nouveau_bind_job **job,
> +			  struct nouveau_exec_bind *bind);
> +int nouveau_exec_job_init(struct nouveau_exec_job **job,
> +			  struct nouveau_exec *exec);
> +
> +int nouveau_job_submit(struct nouveau_job *job);
> +void nouveau_job_fini(struct nouveau_job *job);
> +
> +#define to_nouveau_sched_entity(entity)		\
> +		container_of((entity), struct nouveau_sched_entity, base)
> +
> +struct nouveau_sched_entity {
> +	struct drm_sched_entity base;
> +	struct {
> +		struct list_head list;
> +		struct mutex mutex;
> +	} job;
> +};
> +
> +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
> +			      struct drm_gpu_scheduler *sched);
> +void nouveau_sched_entity_fini(struct nouveau_sched_entity *entity);
> +
> +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
> +		       struct nouveau_drm *drm);
> +void nouveau_sched_fini(struct drm_gpu_scheduler *sched);
> +
> +#endif
Danilo Krummrich Jan. 19, 2023, 3:44 a.m. UTC | #3
On 1/18/23 21:37, Thomas Hellström (Intel) wrote:
> 
> On 1/18/23 07:12, Danilo Krummrich wrote:
>> This commit provides the implementation for the new uapi motivated by the
>> Vulkan API. It allows user mode drivers (UMDs) to:
>>
>> 1) Initialize a GPU virtual address (VA) space via the new
>>     DRM_IOCTL_NOUVEAU_VM_INIT ioctl for UMDs to specify the portion of VA
>>     space managed by the kernel and userspace, respectively.
>>
>> 2) Allocate and free a VA space region as well as bind and unbind memory
>>     to the GPUs VA space via the new DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
>>     UMDs can request the named operations to be processed either
>>     synchronously or asynchronously. It supports DRM syncobjs
>>     (incl. timelines) as synchronization mechanism. The management of the
>>     GPU VA mappings is implemented with the DRM GPU VA manager.
>>
>> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl. The
>>     execution happens asynchronously. It supports DRM syncobj (incl.
>>     timelines) as synchronization mechanism. DRM GEM object locking is
>>     handled with drm_exec.
>>
>> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC, use the DRM
>> GPU scheduler for the asynchronous paths.
>>
>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>> ---
>>   Documentation/gpu/driver-uapi.rst       |   3 +
>>   drivers/gpu/drm/nouveau/Kbuild          |   2 +
>>   drivers/gpu/drm/nouveau/Kconfig         |   2 +
>>   drivers/gpu/drm/nouveau/nouveau_abi16.c |  16 +
>>   drivers/gpu/drm/nouveau/nouveau_abi16.h |   1 +
>>   drivers/gpu/drm/nouveau/nouveau_drm.c   |  23 +-
>>   drivers/gpu/drm/nouveau/nouveau_drv.h   |   9 +-
>>   drivers/gpu/drm/nouveau/nouveau_exec.c  | 310 ++++++++++
>>   drivers/gpu/drm/nouveau/nouveau_exec.h  |  55 ++
>>   drivers/gpu/drm/nouveau/nouveau_sched.c | 780 ++++++++++++++++++++++++
>>   drivers/gpu/drm/nouveau/nouveau_sched.h |  98 +++
>>   11 files changed, 1295 insertions(+), 4 deletions(-)
>>   create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.c
>>   create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.h
>>   create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.c
>>   create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.h
> ...
>>
>> +static struct dma_fence *
>> +nouveau_bind_job_run(struct nouveau_job *job)
>> +{
>> +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
>> +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(job->cli);
>> +    struct bind_job_op *op;
>> +    int ret = 0;
>> +
> 
> I was looking at how nouveau does the async binding compared to how xe 
> does it.
> It looks to me that this function being a scheduler run_job callback is 
> the main part of the VM_BIND dma-fence signalling critical section for 
> the job's done_fence and if so, needs to be annotated as such?

Yes, that's the case.

> 
> For example nouveau_uvma_region_new allocates memory, which is not 
> allowed if in a dma_fence signalling critical section and the locking 
> also looks suspicious?

Thanks for pointing this out, I missed that somehow.

I will change it to pre-allocate new regions, mappings and page tables 
within the job's submit() function.

For the ops structures that the drm_gpuva_manager allocates to report the 
split/merge steps back to the driver, I have ideas for avoiding 
allocations entirely. That would also be a good thing with respect to 
Christian's feedback regarding the huge number of mapping requests some 
applications seem to generate.

Regarding the locking, is there anything specific that makes it look 
suspicious to you?

> 
> Thanks,
> 
> Thomas
> 
> 
>> +    nouveau_uvmm_lock(uvmm);
>> +    list_for_each_op(op, &bind_job->ops) {
>> +        switch (op->op) {
>> +        case OP_ALLOC: {
>> +            bool sparse = op->flags & DRM_NOUVEAU_VM_BIND_SPARSE;
>> +
>> +            ret = nouveau_uvma_region_new(uvmm,
>> +                              op->va.addr,
>> +                              op->va.range,
>> +                              sparse);
>> +            if (ret)
>> +                goto out_unlock;
>> +            break;
>> +        }
>> +        case OP_FREE:
>> +            ret = nouveau_uvma_region_destroy(uvmm,
>> +                              op->va.addr,
>> +                              op->va.range);
>> +            if (ret)
>> +                goto out_unlock;
>> +            break;
>> +        case OP_MAP:
>> +            ret = nouveau_uvmm_sm_map(uvmm,
>> +                          op->va.addr, op->va.range,
>> +                          op->gem.obj, op->gem.offset,
>> +                          op->flags && 0xff);
>> +            if (ret)
>> +                goto out_unlock;
>> +            break;
>> +        case OP_UNMAP:
>> +            ret = nouveau_uvmm_sm_unmap(uvmm,
>> +                            op->va.addr,
>> +                            op->va.range);
>> +            if (ret)
>> +                goto out_unlock;
>> +            break;
>> +        }
>> +    }
>> +
>> +out_unlock:
>> +    nouveau_uvmm_unlock(uvmm);
>> +    if (ret)
>> +        NV_PRINTK(err, job->cli, "bind job failed: %d\n", ret);
>> +    return ERR_PTR(ret);
>> +}
>> +
>> +static void
>> +nouveau_bind_job_free(struct nouveau_job *job)
>> +{
>> +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
>> +    struct bind_job_op *op, *next;
>> +
>> +    list_for_each_op_safe(op, next, &bind_job->ops) {
>> +        struct drm_gem_object *obj = op->gem.obj;
>> +
>> +        if (obj)
>> +            drm_gem_object_put(obj);
>> +
>> +        list_del(&op->entry);
>> +        kfree(op);
>> +    }
>> +
>> +    nouveau_base_job_free(job);
>> +    kfree(bind_job);
>> +}
>> +
>> +static struct nouveau_job_ops nouveau_bind_job_ops = {
>> +    .submit = nouveau_bind_job_submit,
>> +    .run = nouveau_bind_job_run,
>> +    .free = nouveau_bind_job_free,
>> +};
>> +
>> +static int
>> +bind_job_op_from_uop(struct bind_job_op **pop,
>> +             struct drm_nouveau_vm_bind_op *uop)
>> +{
>> +    struct bind_job_op *op;
>> +
>> +    op = *pop = kzalloc(sizeof(*op), GFP_KERNEL);
>> +    if (!op)
>> +        return -ENOMEM;
>> +
>> +    op->op = uop->op;
>> +    op->flags = uop->flags;
>> +    op->va.addr = uop->addr;
>> +    op->va.range = uop->range;
>> +
>> +    if (op->op == DRM_NOUVEAU_VM_BIND_OP_MAP) {
>> +        op->gem.handle = uop->handle;
>> +        op->gem.offset = uop->bo_offset;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static void
>> +bind_job_ops_free(struct list_head *ops)
>> +{
>> +    struct bind_job_op *op, *next;
>> +
>> +    list_for_each_op_safe(op, next, ops) {
>> +        list_del(&op->entry);
>> +        kfree(op);
>> +    }
>> +}
>> +
>> +int
>> +nouveau_bind_job_init(struct nouveau_bind_job **pjob,
>> +              struct nouveau_exec_bind *bind)
>> +{
>> +    struct nouveau_bind_job *job;
>> +    struct bind_job_op *op;
>> +    int i, ret;
>> +
>> +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
>> +    if (!job)
>> +        return -ENOMEM;
>> +
>> +    INIT_LIST_HEAD(&job->ops);
>> +
>> +    for (i = 0; i < bind->op.count; i++) {
>> +        ret = bind_job_op_from_uop(&op, &bind->op.s[i]);
>> +        if (ret)
>> +            goto err_free;
>> +
>> +        list_add_tail(&op->entry, &job->ops);
>> +    }
>> +
>> +    job->base.sync = !(bind->flags & DRM_NOUVEAU_VM_BIND_RUN_ASYNC);
>> +    job->base.ops = &nouveau_bind_job_ops;
>> +
>> +    ret = nouveau_base_job_init(&job->base, &bind->base);
>> +    if (ret)
>> +        goto err_free;
>> +
>> +    return 0;
>> +
>> +err_free:
>> +    bind_job_ops_free(&job->ops);
>> +    kfree(job);
>> +    *pjob = NULL;
>> +
>> +    return ret;
>> +}
>> +
>> +static int
>> +sync_find_fence(struct nouveau_job *job,
>> +        struct drm_nouveau_sync *sync,
>> +        struct dma_fence **fence)
>> +{
>> +    u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
>> +    u64 point = 0;
>> +    int ret;
>> +
>> +    if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
>> +        stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
>> +        return -EOPNOTSUPP;
>> +
>> +    if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
>> +        point = sync->timeline_value;
>> +
>> +    ret = drm_syncobj_find_fence(job->file_priv,
>> +                     sync->handle, point,
>> +                     sync->flags, fence);
>> +    if (ret)
>> +        return ret;
>> +
>> +    return 0;
>> +}
>> +
>> +static int
>> +exec_job_binds_wait(struct nouveau_job *job)
>> +{
>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>> +    struct nouveau_cli *cli = exec_job->base.cli;
>> +    struct nouveau_sched_entity *bind_entity = &cli->sched_entity;
>> +    signed long ret;
>> +    int i;
>> +
>> +    for (i = 0; i < job->in_sync.count; i++) {
>> +        struct nouveau_job *it;
>> +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
>> +        struct dma_fence *fence;
>> +        bool found;
>> +
>> +        ret = sync_find_fence(job, sync, &fence);
>> +        if (ret)
>> +            return ret;
>> +
>> +        mutex_lock(&bind_entity->job.mutex);
>> +        found = false;
>> +        list_for_each_entry(it, &bind_entity->job.list, head) {
>> +            if (fence == it->done_fence) {
>> +                found = true;
>> +                break;
>> +            }
>> +        }
>> +        mutex_unlock(&bind_entity->job.mutex);
>> +
>> +        /* If the fence is not from a VM_BIND job, don't wait for it. */
>> +        if (!found)
>> +            continue;
>> +
>> +        ret = dma_fence_wait_timeout(fence, true,
>> +                         msecs_to_jiffies(500));
>> +        if (ret < 0)
>> +            return ret;
>> +        else if (ret == 0)
>> +            return -ETIMEDOUT;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +int
>> +nouveau_exec_job_submit(struct nouveau_job *job)
>> +{
>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>> +    struct nouveau_cli *cli = exec_job->base.cli;
>> +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(cli);
>> +    struct drm_exec *exec = &job->exec;
>> +    struct drm_gem_object *obj;
>> +    unsigned long index;
>> +    int ret;
>> +
>> +    ret = exec_job_binds_wait(job);
>> +    if (ret)
>> +        return ret;
>> +
>> +    nouveau_uvmm_lock(uvmm);
>> +    drm_exec_while_not_all_locked(exec) {
>> +        struct drm_gpuva *va;
>> +
>> +        drm_gpuva_for_each_va(va, &uvmm->umgr) {
>> +            ret = drm_exec_prepare_obj(exec, va->gem.obj, 1);
>> +            drm_exec_break_on_contention(exec);
>> +            if (ret)
>> +                return ret;
>> +        }
>> +    }
>> +    nouveau_uvmm_unlock(uvmm);
>> +
>> +    drm_exec_for_each_locked_object(exec, index, obj) {
>> +        struct dma_resv *resv = obj->resv;
>> +        struct nouveau_bo *nvbo = nouveau_gem_object(obj);
>> +
>> +        ret = nouveau_bo_validate(nvbo, true, false);
>> +        if (ret)
>> +            return ret;
>> +
>> +        dma_resv_add_fence(resv, job->done_fence, DMA_RESV_USAGE_WRITE);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static struct dma_fence *
>> +nouveau_exec_job_run(struct nouveau_job *job)
>> +{
>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>> +    struct nouveau_fence *fence;
>> +    int i, ret;
>> +
>> +    ret = nouveau_dma_wait(job->chan, exec_job->push.count + 1, 16);
>> +    if (ret) {
>> +        NV_PRINTK(err, job->cli, "nv50cal_space: %d\n", ret);
>> +        return ERR_PTR(ret);
>> +    }
>> +
>> +    for (i = 0; i < exec_job->push.count; i++) {
>> +        nv50_dma_push(job->chan, exec_job->push.s[i].va,
>> +                  exec_job->push.s[i].va_len);
>> +    }
>> +
>> +    ret = nouveau_fence_new(job->chan, false, &fence);
>> +    if (ret) {
>> +        NV_PRINTK(err, job->cli, "error fencing pushbuf: %d\n", ret);
>> +        WIND_RING(job->chan);
>> +        return ERR_PTR(ret);
>> +    }
>> +
>> +    return &fence->base;
>> +}
>> +static void
>> +nouveau_exec_job_free(struct nouveau_job *job)
>> +{
>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>> +
>> +    nouveau_base_job_free(job);
>> +
>> +    kfree(exec_job->push.s);
>> +    kfree(exec_job);
>> +}
>> +
>> +static struct nouveau_job_ops nouveau_exec_job_ops = {
>> +    .submit = nouveau_exec_job_submit,
>> +    .run = nouveau_exec_job_run,
>> +    .free = nouveau_exec_job_free,
>> +};
>> +
>> +int
>> +nouveau_exec_job_init(struct nouveau_exec_job **pjob,
>> +              struct nouveau_exec *exec)
>> +{
>> +    struct nouveau_exec_job *job;
>> +    int ret;
>> +
>> +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
>> +    if (!job)
>> +        return -ENOMEM;
>> +
>> +    job->push.count = exec->push.count;
>> +    job->push.s = kmemdup(exec->push.s,
>> +                  sizeof(*exec->push.s) *
>> +                  exec->push.count,
>> +                  GFP_KERNEL);
>> +    if (!job->push.s) {
>> +        ret = -ENOMEM;
>> +        goto err_free_job;
>> +    }
>> +
>> +    job->base.ops = &nouveau_exec_job_ops;
>> +    ret = nouveau_base_job_init(&job->base, &exec->base);
>> +    if (ret)
>> +        goto err_free_pushs;
>> +
>> +    return 0;
>> +
>> +err_free_pushs:
>> +    kfree(job->push.s);
>> +err_free_job:
>> +    kfree(job);
>> +    *pjob = NULL;
>> +
>> +    return ret;
>> +}
>> +
>> +void nouveau_job_fini(struct nouveau_job *job)
>> +{
>> +    dma_fence_put(job->done_fence);
>> +    drm_sched_job_cleanup(&job->base);
>> +    job->ops->free(job);
>> +}
>> +
>> +static int
>> +nouveau_job_add_deps(struct nouveau_job *job)
>> +{
>> +    struct dma_fence *in_fence = NULL;
>> +    int ret, i;
>> +
>> +    for (i = 0; i < job->in_sync.count; i++) {
>> +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
>> +
>> +        ret = sync_find_fence(job, sync, &in_fence);
>> +        if (ret) {
>> +            NV_PRINTK(warn, job->cli,
>> +                  "Failed to find syncobj (-> in): handle=%d\n",
>> +                  sync->handle);
>> +            return ret;
>> +        }
>> +
>> +        ret = drm_sched_job_add_dependency(&job->base, in_fence);
>> +        if (ret)
>> +            return ret;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int
>> +nouveau_job_fence_attach(struct nouveau_job *job, struct dma_fence 
>> *fence)
>> +{
>> +    struct drm_syncobj *out_sync;
>> +    int i;
>> +
>> +    for (i = 0; i < job->out_sync.count; i++) {
>> +        struct drm_nouveau_sync *sync = &job->out_sync.s[i];
>> +        u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
>> +
>> +        if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
>> +            stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
>> +            return -EOPNOTSUPP;
>> +
>> +        out_sync = drm_syncobj_find(job->file_priv, sync->handle);
>> +        if (!out_sync) {
>> +            NV_PRINTK(warn, job->cli,
>> +                  "Failed to find syncobj (-> out): handle=%d\n",
>> +                  sync->handle);
>> +            return -ENOENT;
>> +        }
>> +
>> +        if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ) {
>> +            struct dma_fence_chain *chain;
>> +
>> +            chain = dma_fence_chain_alloc();
>> +            if (!chain) {
>> +                drm_syncobj_put(out_sync);
>> +                return -ENOMEM;
>> +            }
>> +
>> +            drm_syncobj_add_point(out_sync, chain, fence,
>> +                          sync->timeline_value);
>> +        } else {
>> +            drm_syncobj_replace_fence(out_sync, fence);
>> +        }
>> +
>> +        drm_syncobj_put(out_sync);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static struct dma_fence *
>> +nouveau_job_run(struct nouveau_job *job)
>> +{
>> +    return job->ops->run(job);
>> +}
>> +
>> +static int
>> +nouveau_job_run_sync(struct nouveau_job *job)
>> +{
>> +    struct dma_fence *fence;
>> +    int ret;
>> +
>> +    fence = nouveau_job_run(job);
>> +    if (IS_ERR(fence)) {
>> +        return PTR_ERR(fence);
>> +    } else if (fence) {
>> +        ret = dma_fence_wait(fence, true);
>> +        if (ret)
>> +            return ret;
>> +    }
>> +
>> +    dma_fence_signal(job->done_fence);
>> +
>> +    return 0;
>> +}
>> +
>> +int
>> +nouveau_job_submit(struct nouveau_job *job)
>> +{
>> +    struct nouveau_sched_entity *entity = 
>> to_nouveau_sched_entity(job->base.entity);
>> +    int ret;
>> +
>> +    drm_exec_init(&job->exec, true);
>> +
>> +    ret = nouveau_job_add_deps(job);
>> +    if (ret)
>> +        goto out;
>> +
>> +    drm_sched_job_arm(&job->base);
>> +    job->done_fence = dma_fence_get(&job->base.s_fence->finished);
>> +
>> +    ret = nouveau_job_fence_attach(job, job->done_fence);
>> +    if (ret)
>> +        goto out;
>> +
>> +    if (job->ops->submit) {
>> +        ret = job->ops->submit(job);
>> +        if (ret)
>> +            goto out;
>> +    }
>> +
>> +    if (job->sync) {
>> +        drm_exec_fini(&job->exec);
>> +
>> +        /* We're requested to run a synchronous job, hence don't push
>> +         * the job, bypassing the job scheduler, and execute the jobs
>> +         * run() function right away.
>> +         *
>> +         * As a consequence of bypassing the job scheduler we need to
>> +         * handle fencing and job cleanup ourselfes.
>> +         */
>> +        ret = nouveau_job_run_sync(job);
>> +
>> +        /* If the job fails, the caller will do the cleanup for us. */
>> +        if (!ret)
>> +            nouveau_job_fini(job);
>> +
>> +        return ret;
>> +    } else {
>> +        mutex_lock(&entity->job.mutex);
>> +        drm_sched_entity_push_job(&job->base);
>> +        list_add_tail(&job->head, &entity->job.list);
>> +        mutex_unlock(&entity->job.mutex);
>> +    }
>> +
>> +out:
>> +    drm_exec_fini(&job->exec);
>> +    return ret;
>> +}
>> +
>> +static struct dma_fence *
>> +nouveau_sched_run_job(struct drm_sched_job *sched_job)
>> +{
>> +    struct nouveau_job *job = to_nouveau_job(sched_job);
>> +
>> +    return nouveau_job_run(job);
>> +}
>> +
>> +static enum drm_gpu_sched_stat
>> +nouveau_sched_timedout_job(struct drm_sched_job *sched_job)
>> +{
>> +    struct nouveau_job *job = to_nouveau_job(sched_job);
>> +    struct nouveau_channel *chan = job->chan;
>> +
>> +    if (unlikely(!atomic_read(&chan->killed)))
>> +        nouveau_channel_kill(chan);
>> +
>> +    NV_PRINTK(warn, job->cli, "job timeout, channel %d killed!\n",
>> +          chan->chid);
>> +
>> +    nouveau_sched_entity_fini(job->entity);
>> +
>> +    return DRM_GPU_SCHED_STAT_ENODEV;
>> +}
>> +
>> +static void
>> +nouveau_sched_free_job(struct drm_sched_job *sched_job)
>> +{
>> +    struct nouveau_job *job = to_nouveau_job(sched_job);
>> +    struct nouveau_sched_entity *entity = job->entity;
>> +
>> +    mutex_lock(&entity->job.mutex);
>> +    list_del(&job->head);
>> +    mutex_unlock(&entity->job.mutex);
>> +
>> +    nouveau_job_fini(job);
>> +}
>> +
>> +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
>> +                  struct drm_gpu_scheduler *sched)
>> +{
>> +
>> +    INIT_LIST_HEAD(&entity->job.list);
>> +    mutex_init(&entity->job.mutex);
>> +
>> +    return drm_sched_entity_init(&entity->base,
>> +                     DRM_SCHED_PRIORITY_NORMAL,
>> +                     &sched, 1, NULL);
>> +}
>> +
>> +void
>> +nouveau_sched_entity_fini(struct nouveau_sched_entity *entity)
>> +{
>> +    drm_sched_entity_destroy(&entity->base);
>> +}
>> +
>> +static const struct drm_sched_backend_ops nouveau_sched_ops = {
>> +    .run_job = nouveau_sched_run_job,
>> +    .timedout_job = nouveau_sched_timedout_job,
>> +    .free_job = nouveau_sched_free_job,
>> +};
>> +
>> +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
>> +               struct nouveau_drm *drm)
>> +{
>> +    long job_hang_limit = 
>> msecs_to_jiffies(NOUVEAU_SCHED_JOB_TIMEOUT_MS);
>> +
>> +    return drm_sched_init(sched, &nouveau_sched_ops,
>> +                  NOUVEAU_SCHED_HW_SUBMISSIONS, 0, job_hang_limit,
>> +                  NULL, NULL, "nouveau", drm->dev->dev);
>> +}
>> +
>> +void nouveau_sched_fini(struct drm_gpu_scheduler *sched)
>> +{
>> +    drm_sched_fini(sched);
>> +}
>> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h 
>> b/drivers/gpu/drm/nouveau/nouveau_sched.h
>> new file mode 100644
>> index 000000000000..7fc5b7eea810
>> --- /dev/null
>> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.h
>> @@ -0,0 +1,98 @@
>> +// SPDX-License-Identifier: MIT
>> +
>> +#ifndef NOUVEAU_SCHED_H
>> +#define NOUVEAU_SCHED_H
>> +
>> +#include <linux/types.h>
>> +
>> +#include <drm/drm_exec.h>
>> +#include <drm/gpu_scheduler.h>
>> +
>> +#include "nouveau_drv.h"
>> +#include "nouveau_exec.h"
>> +
>> +#define to_nouveau_job(sched_job)        \
>> +        container_of((sched_job), struct nouveau_job, base)
>> +
>> +#define to_nouveau_exec_job(job)        \
>> +        container_of((job), struct nouveau_exec_job, base)
>> +
>> +#define to_nouveau_bind_job(job)        \
>> +        container_of((job), struct nouveau_bind_job, base)
>> +
>> +struct nouveau_job {
>> +    struct drm_sched_job base;
>> +    struct list_head head;
>> +
>> +    struct nouveau_sched_entity *entity;
>> +
>> +    struct drm_file *file_priv;
>> +    struct nouveau_cli *cli;
>> +    struct nouveau_channel *chan;
>> +
>> +    struct drm_exec exec;
>> +    struct dma_fence *done_fence;
>> +
>> +    bool sync;
>> +
>> +    struct {
>> +        struct drm_nouveau_sync *s;
>> +        u32 count;
>> +    } in_sync;
>> +
>> +    struct {
>> +        struct drm_nouveau_sync *s;
>> +        u32 count;
>> +    } out_sync;
>> +
>> +    struct nouveau_job_ops {
>> +        int (*submit)(struct nouveau_job *);
>> +        struct dma_fence *(*run)(struct nouveau_job *);
>> +        void (*free)(struct nouveau_job *);
>> +    } *ops;
>> +};
>> +
>> +struct nouveau_exec_job {
>> +    struct nouveau_job base;
>> +
>> +    struct {
>> +        struct drm_nouveau_exec_push *s;
>> +        u32 count;
>> +    } push;
>> +};
>> +
>> +struct nouveau_bind_job {
>> +    struct nouveau_job base;
>> +
>> +    /* struct bind_job_op */
>> +    struct list_head ops;
>> +};
>> +
>> +int nouveau_bind_job_init(struct nouveau_bind_job **job,
>> +              struct nouveau_exec_bind *bind);
>> +int nouveau_exec_job_init(struct nouveau_exec_job **job,
>> +              struct nouveau_exec *exec);
>> +
>> +int nouveau_job_submit(struct nouveau_job *job);
>> +void nouveau_job_fini(struct nouveau_job *job);
>> +
>> +#define to_nouveau_sched_entity(entity)        \
>> +        container_of((entity), struct nouveau_sched_entity, base)
>> +
>> +struct nouveau_sched_entity {
>> +    struct drm_sched_entity base;
>> +    struct {
>> +        struct list_head list;
>> +        struct mutex mutex;
>> +    } job;
>> +};
>> +
>> +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
>> +                  struct drm_gpu_scheduler *sched);
>> +void nouveau_sched_entity_fini(struct nouveau_sched_entity *entity);
>> +
>> +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
>> +               struct nouveau_drm *drm);
>> +void nouveau_sched_fini(struct drm_gpu_scheduler *sched);
>> +
>> +#endif
>
Matthew Brost Jan. 19, 2023, 4:58 a.m. UTC | #4
On Thu, Jan 19, 2023 at 04:44:23AM +0100, Danilo Krummrich wrote:
> On 1/18/23 21:37, Thomas Hellström (Intel) wrote:
> > 
> > On 1/18/23 07:12, Danilo Krummrich wrote:
> > > This commit provides the implementation for the new uapi motivated by the
> > > Vulkan API. It allows user mode drivers (UMDs) to:
> > > 
> > > 1) Initialize a GPU virtual address (VA) space via the new
> > >     DRM_IOCTL_NOUVEAU_VM_INIT ioctl for UMDs to specify the portion of VA
> > >     space managed by the kernel and userspace, respectively.
> > > 
> > > 2) Allocate and free a VA space region as well as bind and unbind memory
> > >     to the GPUs VA space via the new DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
> > >     UMDs can request the named operations to be processed either
> > >     synchronously or asynchronously. It supports DRM syncobjs
> > >     (incl. timelines) as synchronization mechanism. The management of the
> > >     GPU VA mappings is implemented with the DRM GPU VA manager.
> > > 
> > > 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl. The
> > >     execution happens asynchronously. It supports DRM syncobj (incl.
> > >     timelines) as synchronization mechanism. DRM GEM object locking is
> > >     handled with drm_exec.
> > > 
> > > Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC, use the DRM
> > > GPU scheduler for the asynchronous paths.
> > > 
> > > Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> > > ---
> > >   Documentation/gpu/driver-uapi.rst       |   3 +
> > >   drivers/gpu/drm/nouveau/Kbuild          |   2 +
> > >   drivers/gpu/drm/nouveau/Kconfig         |   2 +
> > >   drivers/gpu/drm/nouveau/nouveau_abi16.c |  16 +
> > >   drivers/gpu/drm/nouveau/nouveau_abi16.h |   1 +
> > >   drivers/gpu/drm/nouveau/nouveau_drm.c   |  23 +-
> > >   drivers/gpu/drm/nouveau/nouveau_drv.h   |   9 +-
> > >   drivers/gpu/drm/nouveau/nouveau_exec.c  | 310 ++++++++++
> > >   drivers/gpu/drm/nouveau/nouveau_exec.h  |  55 ++
> > >   drivers/gpu/drm/nouveau/nouveau_sched.c | 780 ++++++++++++++++++++++++
> > >   drivers/gpu/drm/nouveau/nouveau_sched.h |  98 +++
> > >   11 files changed, 1295 insertions(+), 4 deletions(-)
> > >   create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.c
> > >   create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.h
> > >   create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.c
> > >   create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.h
> > ...
> > > 
> > > +static struct dma_fence *
> > > +nouveau_bind_job_run(struct nouveau_job *job)
> > > +{
> > > +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
> > > +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(job->cli);
> > > +    struct bind_job_op *op;
> > > +    int ret = 0;
> > > +
> > 
> > I was looking at how nouveau does the async binding compared to how xe
> > does it.
> > It looks to me that this function being a scheduler run_job callback is
> > the main part of the VM_BIND dma-fence signalling critical section for
> > the job's done_fence and if so, needs to be annotated as such?
> 
> Yes, that's the case.
> 
> > 
> > For example nouveau_uvma_region_new allocates memory, which is not
> > allowed if in a dma_fence signalling critical section and the locking
> > also looks suspicious?
> 
> Thanks for pointing this out, I missed that somehow.
> 
> I will change it to pre-allocate new regions, mappings and page tables
> within the job's submit() function.
>

Yea, that's basically what we do in Xe: in the IOCTL step we allocate all
the backing store for new page tables and populate the new page tables
(these are not yet visible in the page table structure), and in the last
step, which is executed after all the dependencies are satisfied, we
program all the leaf entries, making the new binding visible.

We have screwed this up by deferring most of the IOCTL to a worker, but
will fix this one way or another soon - get rid of the worker or
introduce a type of sync that is signaled after the worker + publish the
dma-fence in the worker. I'd like to close on this one soon.
 
> For the ops structures the drm_gpuva_manager allocates for reporting the
> split/merge steps back to the driver I have ideas to entirely avoid
> allocations, which also is a good thing in respect of Christian's feedback
> regarding the huge amount of mapping requests some applications seem to
> generate.
>

It should be fine to have allocations to report the split/merge step, as
this step should be before a dma-fence is published, but yea, if it is
possible to avoid extra allocs, that is always better.

Also BTW, great work on drm_gpuva_manager too. We will most likely
pick this up in Xe rather than open coding all of this as we currently
do. We should probably start the port to this soon so we can contribute
to the implementation and get both of our drivers upstream sooner.
 
> Regarding the locking, anything specific that makes it look suspicious to
> you?
> 

I haven't looked into this either, but almost certainly Thomas is
suggesting that if you allocate memory anywhere under the
nouveau_uvmm_lock then you can't use this lock in the run_job() callback,
as this is in the dma-fencing path.

Matt 

> > 
> > Thanks,
> > 
> > Thomas
> > 
> > 
> > > +    nouveau_uvmm_lock(uvmm);
> > > +    list_for_each_op(op, &bind_job->ops) {
> > > +        switch (op->op) {
> > > +        case OP_ALLOC: {
> > > +            bool sparse = op->flags & DRM_NOUVEAU_VM_BIND_SPARSE;
> > > +
> > > +            ret = nouveau_uvma_region_new(uvmm,
> > > +                              op->va.addr,
> > > +                              op->va.range,
> > > +                              sparse);
> > > +            if (ret)
> > > +                goto out_unlock;
> > > +            break;
> > > +        }
> > > +        case OP_FREE:
> > > +            ret = nouveau_uvma_region_destroy(uvmm,
> > > +                              op->va.addr,
> > > +                              op->va.range);
> > > +            if (ret)
> > > +                goto out_unlock;
> > > +            break;
> > > +        case OP_MAP:
> > > +            ret = nouveau_uvmm_sm_map(uvmm,
> > > +                          op->va.addr, op->va.range,
> > > +                          op->gem.obj, op->gem.offset,
> > > +                          op->flags && 0xff);
> > > +            if (ret)
> > > +                goto out_unlock;
> > > +            break;
> > > +        case OP_UNMAP:
> > > +            ret = nouveau_uvmm_sm_unmap(uvmm,
> > > +                            op->va.addr,
> > > +                            op->va.range);
> > > +            if (ret)
> > > +                goto out_unlock;
> > > +            break;
> > > +        }
> > > +    }
> > > +
> > > +out_unlock:
> > > +    nouveau_uvmm_unlock(uvmm);
> > > +    if (ret)
> > > +        NV_PRINTK(err, job->cli, "bind job failed: %d\n", ret);
> > > +    return ERR_PTR(ret);
> > > +}
> > > +
> > > +static void
> > > +nouveau_bind_job_free(struct nouveau_job *job)
> > > +{
> > > +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
> > > +    struct bind_job_op *op, *next;
> > > +
> > > +    list_for_each_op_safe(op, next, &bind_job->ops) {
> > > +        struct drm_gem_object *obj = op->gem.obj;
> > > +
> > > +        if (obj)
> > > +            drm_gem_object_put(obj);
> > > +
> > > +        list_del(&op->entry);
> > > +        kfree(op);
> > > +    }
> > > +
> > > +    nouveau_base_job_free(job);
> > > +    kfree(bind_job);
> > > +}
> > > +
> > > +static struct nouveau_job_ops nouveau_bind_job_ops = {
> > > +    .submit = nouveau_bind_job_submit,
> > > +    .run = nouveau_bind_job_run,
> > > +    .free = nouveau_bind_job_free,
> > > +};
> > > +
> > > +static int
> > > +bind_job_op_from_uop(struct bind_job_op **pop,
> > > +             struct drm_nouveau_vm_bind_op *uop)
> > > +{
> > > +    struct bind_job_op *op;
> > > +
> > > +    op = *pop = kzalloc(sizeof(*op), GFP_KERNEL);
> > > +    if (!op)
> > > +        return -ENOMEM;
> > > +
> > > +    op->op = uop->op;
> > > +    op->flags = uop->flags;
> > > +    op->va.addr = uop->addr;
> > > +    op->va.range = uop->range;
> > > +
> > > +    if (op->op == DRM_NOUVEAU_VM_BIND_OP_MAP) {
> > > +        op->gem.handle = uop->handle;
> > > +        op->gem.offset = uop->bo_offset;
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static void
> > > +bind_job_ops_free(struct list_head *ops)
> > > +{
> > > +    struct bind_job_op *op, *next;
> > > +
> > > +    list_for_each_op_safe(op, next, ops) {
> > > +        list_del(&op->entry);
> > > +        kfree(op);
> > > +    }
> > > +}
> > > +
> > > +int
> > > +nouveau_bind_job_init(struct nouveau_bind_job **pjob,
> > > +              struct nouveau_exec_bind *bind)
> > > +{
> > > +    struct nouveau_bind_job *job;
> > > +    struct bind_job_op *op;
> > > +    int i, ret;
> > > +
> > > +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
> > > +    if (!job)
> > > +        return -ENOMEM;
> > > +
> > > +    INIT_LIST_HEAD(&job->ops);
> > > +
> > > +    for (i = 0; i < bind->op.count; i++) {
> > > +        ret = bind_job_op_from_uop(&op, &bind->op.s[i]);
> > > +        if (ret)
> > > +            goto err_free;
> > > +
> > > +        list_add_tail(&op->entry, &job->ops);
> > > +    }
> > > +
> > > +    job->base.sync = !(bind->flags & DRM_NOUVEAU_VM_BIND_RUN_ASYNC);
> > > +    job->base.ops = &nouveau_bind_job_ops;
> > > +
> > > +    ret = nouveau_base_job_init(&job->base, &bind->base);
> > > +    if (ret)
> > > +        goto err_free;
> > > +
> > > +    return 0;
> > > +
> > > +err_free:
> > > +    bind_job_ops_free(&job->ops);
> > > +    kfree(job);
> > > +    *pjob = NULL;
> > > +
> > > +    return ret;
> > > +}
> > > +
> > > +static int
> > > +sync_find_fence(struct nouveau_job *job,
> > > +        struct drm_nouveau_sync *sync,
> > > +        struct dma_fence **fence)
> > > +{
> > > +    u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
> > > +    u64 point = 0;
> > > +    int ret;
> > > +
> > > +    if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
> > > +        stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
> > > +        return -EOPNOTSUPP;
> > > +
> > > +    if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
> > > +        point = sync->timeline_value;
> > > +
> > > +    ret = drm_syncobj_find_fence(job->file_priv,
> > > +                     sync->handle, point,
> > > +                     sync->flags, fence);
> > > +    if (ret)
> > > +        return ret;
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static int
> > > +exec_job_binds_wait(struct nouveau_job *job)
> > > +{
> > > +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> > > +    struct nouveau_cli *cli = exec_job->base.cli;
> > > +    struct nouveau_sched_entity *bind_entity = &cli->sched_entity;
> > > +    signed long ret;
> > > +    int i;
> > > +
> > > +    for (i = 0; i < job->in_sync.count; i++) {
> > > +        struct nouveau_job *it;
> > > +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
> > > +        struct dma_fence *fence;
> > > +        bool found;
> > > +
> > > +        ret = sync_find_fence(job, sync, &fence);
> > > +        if (ret)
> > > +            return ret;
> > > +
> > > +        mutex_lock(&bind_entity->job.mutex);
> > > +        found = false;
> > > +        list_for_each_entry(it, &bind_entity->job.list, head) {
> > > +            if (fence == it->done_fence) {
> > > +                found = true;
> > > +                break;
> > > +            }
> > > +        }
> > > +        mutex_unlock(&bind_entity->job.mutex);
> > > +
> > > +        /* If the fence is not from a VM_BIND job, don't wait for it. */
> > > +        if (!found)
> > > +            continue;
> > > +
> > > +        ret = dma_fence_wait_timeout(fence, true,
> > > +                         msecs_to_jiffies(500));
> > > +        if (ret < 0)
> > > +            return ret;
> > > +        else if (ret == 0)
> > > +            return -ETIMEDOUT;
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +int
> > > +nouveau_exec_job_submit(struct nouveau_job *job)
> > > +{
> > > +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> > > +    struct nouveau_cli *cli = exec_job->base.cli;
> > > +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(cli);
> > > +    struct drm_exec *exec = &job->exec;
> > > +    struct drm_gem_object *obj;
> > > +    unsigned long index;
> > > +    int ret;
> > > +
> > > +    ret = exec_job_binds_wait(job);
> > > +    if (ret)
> > > +        return ret;
> > > +
> > > +    nouveau_uvmm_lock(uvmm);
> > > +    drm_exec_while_not_all_locked(exec) {
> > > +        struct drm_gpuva *va;
> > > +
> > > +        drm_gpuva_for_each_va(va, &uvmm->umgr) {
> > > +            ret = drm_exec_prepare_obj(exec, va->gem.obj, 1);
> > > +            drm_exec_break_on_contention(exec);
> > > +            if (ret)
> > > +                return ret;
> > > +        }
> > > +    }
> > > +    nouveau_uvmm_unlock(uvmm);
> > > +
> > > +    drm_exec_for_each_locked_object(exec, index, obj) {
> > > +        struct dma_resv *resv = obj->resv;
> > > +        struct nouveau_bo *nvbo = nouveau_gem_object(obj);
> > > +
> > > +        ret = nouveau_bo_validate(nvbo, true, false);
> > > +        if (ret)
> > > +            return ret;
> > > +
> > > +        dma_resv_add_fence(resv, job->done_fence, DMA_RESV_USAGE_WRITE);
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static struct dma_fence *
> > > +nouveau_exec_job_run(struct nouveau_job *job)
> > > +{
> > > +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> > > +    struct nouveau_fence *fence;
> > > +    int i, ret;
> > > +
> > > +    ret = nouveau_dma_wait(job->chan, exec_job->push.count + 1, 16);
> > > +    if (ret) {
> > > +        NV_PRINTK(err, job->cli, "nv50cal_space: %d\n", ret);
> > > +        return ERR_PTR(ret);
> > > +    }
> > > +
> > > +    for (i = 0; i < exec_job->push.count; i++) {
> > > +        nv50_dma_push(job->chan, exec_job->push.s[i].va,
> > > +                  exec_job->push.s[i].va_len);
> > > +    }
> > > +
> > > +    ret = nouveau_fence_new(job->chan, false, &fence);
> > > +    if (ret) {
> > > +        NV_PRINTK(err, job->cli, "error fencing pushbuf: %d\n", ret);
> > > +        WIND_RING(job->chan);
> > > +        return ERR_PTR(ret);
> > > +    }
> > > +
> > > +    return &fence->base;
> > > +}
> > > +static void
> > > +nouveau_exec_job_free(struct nouveau_job *job)
> > > +{
> > > +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> > > +
> > > +    nouveau_base_job_free(job);
> > > +
> > > +    kfree(exec_job->push.s);
> > > +    kfree(exec_job);
> > > +}
> > > +
> > > +static struct nouveau_job_ops nouveau_exec_job_ops = {
> > > +    .submit = nouveau_exec_job_submit,
> > > +    .run = nouveau_exec_job_run,
> > > +    .free = nouveau_exec_job_free,
> > > +};
> > > +
> > > +int
> > > +nouveau_exec_job_init(struct nouveau_exec_job **pjob,
> > > +              struct nouveau_exec *exec)
> > > +{
> > > +    struct nouveau_exec_job *job;
> > > +    int ret;
> > > +
> > > +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
> > > +    if (!job)
> > > +        return -ENOMEM;
> > > +
> > > +    job->push.count = exec->push.count;
> > > +    job->push.s = kmemdup(exec->push.s,
> > > +                  sizeof(*exec->push.s) *
> > > +                  exec->push.count,
> > > +                  GFP_KERNEL);
> > > +    if (!job->push.s) {
> > > +        ret = -ENOMEM;
> > > +        goto err_free_job;
> > > +    }
> > > +
> > > +    job->base.ops = &nouveau_exec_job_ops;
> > > +    ret = nouveau_base_job_init(&job->base, &exec->base);
> > > +    if (ret)
> > > +        goto err_free_pushs;
> > > +
> > > +    return 0;
> > > +
> > > +err_free_pushs:
> > > +    kfree(job->push.s);
> > > +err_free_job:
> > > +    kfree(job);
> > > +    *pjob = NULL;
> > > +
> > > +    return ret;
> > > +}
> > > +
> > > +void nouveau_job_fini(struct nouveau_job *job)
> > > +{
> > > +    dma_fence_put(job->done_fence);
> > > +    drm_sched_job_cleanup(&job->base);
> > > +    job->ops->free(job);
> > > +}
> > > +
> > > +static int
> > > +nouveau_job_add_deps(struct nouveau_job *job)
> > > +{
> > > +    struct dma_fence *in_fence = NULL;
> > > +    int ret, i;
> > > +
> > > +    for (i = 0; i < job->in_sync.count; i++) {
> > > +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
> > > +
> > > +        ret = sync_find_fence(job, sync, &in_fence);
> > > +        if (ret) {
> > > +            NV_PRINTK(warn, job->cli,
> > > +                  "Failed to find syncobj (-> in): handle=%d\n",
> > > +                  sync->handle);
> > > +            return ret;
> > > +        }
> > > +
> > > +        ret = drm_sched_job_add_dependency(&job->base, in_fence);
> > > +        if (ret)
> > > +            return ret;
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static int
> > > +nouveau_job_fence_attach(struct nouveau_job *job, struct dma_fence
> > > *fence)
> > > +{
> > > +    struct drm_syncobj *out_sync;
> > > +    int i;
> > > +
> > > +    for (i = 0; i < job->out_sync.count; i++) {
> > > +        struct drm_nouveau_sync *sync = &job->out_sync.s[i];
> > > +        u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
> > > +
> > > +        if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
> > > +            stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
> > > +            return -EOPNOTSUPP;
> > > +
> > > +        out_sync = drm_syncobj_find(job->file_priv, sync->handle);
> > > +        if (!out_sync) {
> > > +            NV_PRINTK(warn, job->cli,
> > > +                  "Failed to find syncobj (-> out): handle=%d\n",
> > > +                  sync->handle);
> > > +            return -ENOENT;
> > > +        }
> > > +
> > > +        if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ) {
> > > +            struct dma_fence_chain *chain;
> > > +
> > > +            chain = dma_fence_chain_alloc();
> > > +            if (!chain) {
> > > +                drm_syncobj_put(out_sync);
> > > +                return -ENOMEM;
> > > +            }
> > > +
> > > +            drm_syncobj_add_point(out_sync, chain, fence,
> > > +                          sync->timeline_value);
> > > +        } else {
> > > +            drm_syncobj_replace_fence(out_sync, fence);
> > > +        }
> > > +
> > > +        drm_syncobj_put(out_sync);
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static struct dma_fence *
> > > +nouveau_job_run(struct nouveau_job *job)
> > > +{
> > > +    return job->ops->run(job);
> > > +}
> > > +
> > > +static int
> > > +nouveau_job_run_sync(struct nouveau_job *job)
> > > +{
> > > +    struct dma_fence *fence;
> > > +    int ret;
> > > +
> > > +    fence = nouveau_job_run(job);
> > > +    if (IS_ERR(fence)) {
> > > +        return PTR_ERR(fence);
> > > +    } else if (fence) {
> > > +        ret = dma_fence_wait(fence, true);
> > > +        if (ret)
> > > +            return ret;
> > > +    }
> > > +
> > > +    dma_fence_signal(job->done_fence);
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +int
> > > +nouveau_job_submit(struct nouveau_job *job)
> > > +{
> > > +    struct nouveau_sched_entity *entity =
> > > to_nouveau_sched_entity(job->base.entity);
> > > +    int ret;
> > > +
> > > +    drm_exec_init(&job->exec, true);
> > > +
> > > +    ret = nouveau_job_add_deps(job);
> > > +    if (ret)
> > > +        goto out;
> > > +
> > > +    drm_sched_job_arm(&job->base);
> > > +    job->done_fence = dma_fence_get(&job->base.s_fence->finished);
> > > +
> > > +    ret = nouveau_job_fence_attach(job, job->done_fence);
> > > +    if (ret)
> > > +        goto out;
> > > +
> > > +    if (job->ops->submit) {
> > > +        ret = job->ops->submit(job);
> > > +        if (ret)
> > > +            goto out;
> > > +    }
> > > +
> > > +    if (job->sync) {
> > > +        drm_exec_fini(&job->exec);
> > > +
> > > +        /* We're requested to run a synchronous job, hence don't push
> > > +         * the job, bypassing the job scheduler, and execute the job's
> > > +         * run() function right away.
> > > +         *
> > > +         * As a consequence of bypassing the job scheduler we need to
> > > +         * handle fencing and job cleanup ourselves.
> > > +         */
> > > +        ret = nouveau_job_run_sync(job);
> > > +
> > > +        /* If the job fails, the caller will do the cleanup for us. */
> > > +        if (!ret)
> > > +            nouveau_job_fini(job);
> > > +
> > > +        return ret;
> > > +    } else {
> > > +        mutex_lock(&entity->job.mutex);
> > > +        drm_sched_entity_push_job(&job->base);
> > > +        list_add_tail(&job->head, &entity->job.list);
> > > +        mutex_unlock(&entity->job.mutex);
> > > +    }
> > > +
> > > +out:
> > > +    drm_exec_fini(&job->exec);
> > > +    return ret;
> > > +}
> > > +
> > > +static struct dma_fence *
> > > +nouveau_sched_run_job(struct drm_sched_job *sched_job)
> > > +{
> > > +    struct nouveau_job *job = to_nouveau_job(sched_job);
> > > +
> > > +    return nouveau_job_run(job);
> > > +}
> > > +
> > > +static enum drm_gpu_sched_stat
> > > +nouveau_sched_timedout_job(struct drm_sched_job *sched_job)
> > > +{
> > > +    struct nouveau_job *job = to_nouveau_job(sched_job);
> > > +    struct nouveau_channel *chan = job->chan;
> > > +
> > > +    if (unlikely(!atomic_read(&chan->killed)))
> > > +        nouveau_channel_kill(chan);
> > > +
> > > +    NV_PRINTK(warn, job->cli, "job timeout, channel %d killed!\n",
> > > +          chan->chid);
> > > +
> > > +    nouveau_sched_entity_fini(job->entity);
> > > +
> > > +    return DRM_GPU_SCHED_STAT_ENODEV;
> > > +}
> > > +
> > > +static void
> > > +nouveau_sched_free_job(struct drm_sched_job *sched_job)
> > > +{
> > > +    struct nouveau_job *job = to_nouveau_job(sched_job);
> > > +    struct nouveau_sched_entity *entity = job->entity;
> > > +
> > > +    mutex_lock(&entity->job.mutex);
> > > +    list_del(&job->head);
> > > +    mutex_unlock(&entity->job.mutex);
> > > +
> > > +    nouveau_job_fini(job);
> > > +}
> > > +
> > > +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
> > > +                  struct drm_gpu_scheduler *sched)
> > > +{
> > > +
> > > +    INIT_LIST_HEAD(&entity->job.list);
> > > +    mutex_init(&entity->job.mutex);
> > > +
> > > +    return drm_sched_entity_init(&entity->base,
> > > +                     DRM_SCHED_PRIORITY_NORMAL,
> > > +                     &sched, 1, NULL);
> > > +}
> > > +
> > > +void
> > > +nouveau_sched_entity_fini(struct nouveau_sched_entity *entity)
> > > +{
> > > +    drm_sched_entity_destroy(&entity->base);
> > > +}
> > > +
> > > +static const struct drm_sched_backend_ops nouveau_sched_ops = {
> > > +    .run_job = nouveau_sched_run_job,
> > > +    .timedout_job = nouveau_sched_timedout_job,
> > > +    .free_job = nouveau_sched_free_job,
> > > +};
> > > +
> > > +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
> > > +               struct nouveau_drm *drm)
> > > +{
> > > +    long job_hang_limit =
> > > msecs_to_jiffies(NOUVEAU_SCHED_JOB_TIMEOUT_MS);
> > > +
> > > +    return drm_sched_init(sched, &nouveau_sched_ops,
> > > +                  NOUVEAU_SCHED_HW_SUBMISSIONS, 0, job_hang_limit,
> > > +                  NULL, NULL, "nouveau", drm->dev->dev);
> > > +}
> > > +
> > > +void nouveau_sched_fini(struct drm_gpu_scheduler *sched)
> > > +{
> > > +    drm_sched_fini(sched);
> > > +}
> > > diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h
> > > b/drivers/gpu/drm/nouveau/nouveau_sched.h
> > > new file mode 100644
> > > index 000000000000..7fc5b7eea810
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/nouveau/nouveau_sched.h
> > > @@ -0,0 +1,98 @@
> > > +// SPDX-License-Identifier: MIT
> > > +
> > > +#ifndef NOUVEAU_SCHED_H
> > > +#define NOUVEAU_SCHED_H
> > > +
> > > +#include <linux/types.h>
> > > +
> > > +#include <drm/drm_exec.h>
> > > +#include <drm/gpu_scheduler.h>
> > > +
> > > +#include "nouveau_drv.h"
> > > +#include "nouveau_exec.h"
> > > +
> > > +#define to_nouveau_job(sched_job)        \
> > > +        container_of((sched_job), struct nouveau_job, base)
> > > +
> > > +#define to_nouveau_exec_job(job)        \
> > > +        container_of((job), struct nouveau_exec_job, base)
> > > +
> > > +#define to_nouveau_bind_job(job)        \
> > > +        container_of((job), struct nouveau_bind_job, base)
> > > +
> > > +struct nouveau_job {
> > > +    struct drm_sched_job base;
> > > +    struct list_head head;
> > > +
> > > +    struct nouveau_sched_entity *entity;
> > > +
> > > +    struct drm_file *file_priv;
> > > +    struct nouveau_cli *cli;
> > > +    struct nouveau_channel *chan;
> > > +
> > > +    struct drm_exec exec;
> > > +    struct dma_fence *done_fence;
> > > +
> > > +    bool sync;
> > > +
> > > +    struct {
> > > +        struct drm_nouveau_sync *s;
> > > +        u32 count;
> > > +    } in_sync;
> > > +
> > > +    struct {
> > > +        struct drm_nouveau_sync *s;
> > > +        u32 count;
> > > +    } out_sync;
> > > +
> > > +    struct nouveau_job_ops {
> > > +        int (*submit)(struct nouveau_job *);
> > > +        struct dma_fence *(*run)(struct nouveau_job *);
> > > +        void (*free)(struct nouveau_job *);
> > > +    } *ops;
> > > +};
> > > +
> > > +struct nouveau_exec_job {
> > > +    struct nouveau_job base;
> > > +
> > > +    struct {
> > > +        struct drm_nouveau_exec_push *s;
> > > +        u32 count;
> > > +    } push;
> > > +};
> > > +
> > > +struct nouveau_bind_job {
> > > +    struct nouveau_job base;
> > > +
> > > +    /* struct bind_job_op */
> > > +    struct list_head ops;
> > > +};
> > > +
> > > +int nouveau_bind_job_init(struct nouveau_bind_job **job,
> > > +              struct nouveau_exec_bind *bind);
> > > +int nouveau_exec_job_init(struct nouveau_exec_job **job,
> > > +              struct nouveau_exec *exec);
> > > +
> > > +int nouveau_job_submit(struct nouveau_job *job);
> > > +void nouveau_job_fini(struct nouveau_job *job);
> > > +
> > > +#define to_nouveau_sched_entity(entity)        \
> > > +        container_of((entity), struct nouveau_sched_entity, base)
> > > +
> > > +struct nouveau_sched_entity {
> > > +    struct drm_sched_entity base;
> > > +    struct {
> > > +        struct list_head list;
> > > +        struct mutex mutex;
> > > +    } job;
> > > +};
> > > +
> > > +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
> > > +                  struct drm_gpu_scheduler *sched);
> > > +void nouveau_sched_entity_fini(struct nouveau_sched_entity *entity);
> > > +
> > > +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
> > > +               struct nouveau_drm *drm);
> > > +void nouveau_sched_fini(struct drm_gpu_scheduler *sched);
> > > +
> > > +#endif
> > 
>
Thomas Hellström (Intel) Jan. 19, 2023, 7:32 a.m. UTC | #5
On 1/19/23 05:58, Matthew Brost wrote:
> On Thu, Jan 19, 2023 at 04:44:23AM +0100, Danilo Krummrich wrote:
>> On 1/18/23 21:37, Thomas Hellström (Intel) wrote:
>>> On 1/18/23 07:12, Danilo Krummrich wrote:
>>>> This commit provides the implementation for the new uapi motivated by the
>>>> Vulkan API. It allows user mode drivers (UMDs) to:
>>>>
>>>> 1) Initialize a GPU virtual address (VA) space via the new
>>>>      DRM_IOCTL_NOUVEAU_VM_INIT ioctl for UMDs to specify the portion of VA
>>>>      space managed by the kernel and userspace, respectively.
>>>>
>>>> 2) Allocate and free a VA space region as well as bind and unbind memory
>>>>      to the GPUs VA space via the new DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
>>>>      UMDs can request the named operations to be processed either
>>>>      synchronously or asynchronously. It supports DRM syncobjs
>>>>      (incl. timelines) as synchronization mechanism. The management of the
>>>>      GPU VA mappings is implemented with the DRM GPU VA manager.
>>>>
>>>> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl. The
>>>>      execution happens asynchronously. It supports DRM syncobj (incl.
>>>>      timelines) as synchronization mechanism. DRM GEM object locking is
>>>>      handled with drm_exec.
>>>>
>>>> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC, use the DRM
>>>> GPU scheduler for the asynchronous paths.
>>>>
>>>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>>>> ---
>>>>    Documentation/gpu/driver-uapi.rst       |   3 +
>>>>    drivers/gpu/drm/nouveau/Kbuild          |   2 +
>>>>    drivers/gpu/drm/nouveau/Kconfig         |   2 +
>>>>    drivers/gpu/drm/nouveau/nouveau_abi16.c |  16 +
>>>>    drivers/gpu/drm/nouveau/nouveau_abi16.h |   1 +
>>>>    drivers/gpu/drm/nouveau/nouveau_drm.c   |  23 +-
>>>>    drivers/gpu/drm/nouveau/nouveau_drv.h   |   9 +-
>>>>    drivers/gpu/drm/nouveau/nouveau_exec.c  | 310 ++++++++++
>>>>    drivers/gpu/drm/nouveau/nouveau_exec.h  |  55 ++
>>>>    drivers/gpu/drm/nouveau/nouveau_sched.c | 780 ++++++++++++++++++++++++
>>>>    drivers/gpu/drm/nouveau/nouveau_sched.h |  98 +++
>>>>    11 files changed, 1295 insertions(+), 4 deletions(-)
>>>>    create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.c
>>>>    create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.h
>>>>    create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.c
>>>>    create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.h
>>> ...
>>>> +static struct dma_fence *
>>>> +nouveau_bind_job_run(struct nouveau_job *job)
>>>> +{
>>>> +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
>>>> +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(job->cli);
>>>> +    struct bind_job_op *op;
>>>> +    int ret = 0;
>>>> +
>>> I was looking at how nouveau does the async binding compared to how xe
>>> does it.
>>> It looks to me that this function being a scheduler run_job callback is
>>> the main part of the VM_BIND dma-fence signalling critical section for
>>> the job's done_fence and if so, needs to be annotated as such?
>> Yes, that's the case.
>>
>>> For example nouveau_uvma_region_new allocates memory, which is not
>>> allowed if in a dma_fence signalling critical section and the locking
>>> also looks suspicious?
>> Thanks for pointing this out, I missed that somehow.
>>
>> I will change it to pre-allocate new regions, mappings and page tables
>> within the job's submit() function.
>>
> Yea, that's basically what we do in Xe: in the IOCTL step we allocate all
> the backing store for new page tables and populate the new page tables
> (these are not yet visible in the page table structure), and in the last
> step, which is executed after all the dependencies are satisfied, we
> program all the leaf entries, making the new binding visible.
>
> We have screwed this up by deferring most of the IOCTL to a worker but
> will fix this one way or another soon - get rid of the worker or
> introduce a type of sync that is signaled after the worker + publish the
> dma-fence in the worker. I'd like to close on this one soon.
>   
>> For the ops structures the drm_gpuva_manager allocates for reporting the
>> split/merge steps back to the driver I have ideas to entirely avoid
>> allocations, which also is a good thing with respect to Christian's
>> feedback regarding the huge amount of mapping requests some applications
>> seem to generate.
>>
> It should be fine to have allocations to report the split/merge step as
> this step should be before a dma-fence is published, but yea, if it is
> possible to avoid extra allocs that is always better.
>
> Also BTW, great work on drm_gpuva_manager too. We will most likely
> pick this up in Xe rather than open coding all of this as we currently
> do. We should probably start the port to this soon so we can contribute
> to the implementation and get both of our drivers upstream sooner.
>   
>> Regarding the locking, anything specific that makes it look suspicious to
>> you?
>>
> I haven't looked into this either, but almost certainly Thomas is
> suggesting that if you allocate memory anywhere under the
> nouveau_uvmm_lock then you can't use this lock in the run_job() callback
> as this is in the dma-fencing path.

Yes, that was what looked suspicious to me, although I haven't looked at
the code in enough detail to say for sure either.

But starting by annotating this with dma_fence_begin_signalling() /
dma_fence_end_signalling() would help find all issues with this.

FWIW, by coincidence I discussed drm-scheduler dma-fence annotation
with Daniel Vetter yesterday and it appears he has a patch-set to enable
that, at least for drivers that want to opt in. We probably should try
to get that merged and then we'd be able to catch this type of thing
earlier.

Thanks,

Thomas



>
> Matt
>
>>> Thanks,
>>>
>>> Thomas
>>>
>>>
>>>> +    nouveau_uvmm_lock(uvmm);
>>>> +    list_for_each_op(op, &bind_job->ops) {
>>>> +        switch (op->op) {
>>>> +        case OP_ALLOC: {
>>>> +            bool sparse = op->flags & DRM_NOUVEAU_VM_BIND_SPARSE;
>>>> +
>>>> +            ret = nouveau_uvma_region_new(uvmm,
>>>> +                              op->va.addr,
>>>> +                              op->va.range,
>>>> +                              sparse);
>>>> +            if (ret)
>>>> +                goto out_unlock;
>>>> +            break;
>>>> +        }
>>>> +        case OP_FREE:
>>>> +            ret = nouveau_uvma_region_destroy(uvmm,
>>>> +                              op->va.addr,
>>>> +                              op->va.range);
>>>> +            if (ret)
>>>> +                goto out_unlock;
>>>> +            break;
>>>> +        case OP_MAP:
>>>> +            ret = nouveau_uvmm_sm_map(uvmm,
>>>> +                          op->va.addr, op->va.range,
>>>> +                          op->gem.obj, op->gem.offset,
>>>> +                          op->flags && 0xff);
>>>> +            if (ret)
>>>> +                goto out_unlock;
>>>> +            break;
>>>> +        case OP_UNMAP:
>>>> +            ret = nouveau_uvmm_sm_unmap(uvmm,
>>>> +                            op->va.addr,
>>>> +                            op->va.range);
>>>> +            if (ret)
>>>> +                goto out_unlock;
>>>> +            break;
>>>> +        }
>>>> +    }
>>>> +
>>>> +out_unlock:
>>>> +    nouveau_uvmm_unlock(uvmm);
>>>> +    if (ret)
>>>> +        NV_PRINTK(err, job->cli, "bind job failed: %d\n", ret);
>>>> +    return ERR_PTR(ret);
>>>> +}
>>>> +
>>>> +static void
>>>> +nouveau_bind_job_free(struct nouveau_job *job)
>>>> +{
>>>> +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
>>>> +    struct bind_job_op *op, *next;
>>>> +
>>>> +    list_for_each_op_safe(op, next, &bind_job->ops) {
>>>> +        struct drm_gem_object *obj = op->gem.obj;
>>>> +
>>>> +        if (obj)
>>>> +            drm_gem_object_put(obj);
>>>> +
>>>> +        list_del(&op->entry);
>>>> +        kfree(op);
>>>> +    }
>>>> +
>>>> +    nouveau_base_job_free(job);
>>>> +    kfree(bind_job);
>>>> +}
>>>> +
>>>> +static struct nouveau_job_ops nouveau_bind_job_ops = {
>>>> +    .submit = nouveau_bind_job_submit,
>>>> +    .run = nouveau_bind_job_run,
>>>> +    .free = nouveau_bind_job_free,
>>>> +};
>>>> +
>>>> +static int
>>>> +bind_job_op_from_uop(struct bind_job_op **pop,
>>>> +             struct drm_nouveau_vm_bind_op *uop)
>>>> +{
>>>> +    struct bind_job_op *op;
>>>> +
>>>> +    op = *pop = kzalloc(sizeof(*op), GFP_KERNEL);
>>>> +    if (!op)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    op->op = uop->op;
>>>> +    op->flags = uop->flags;
>>>> +    op->va.addr = uop->addr;
>>>> +    op->va.range = uop->range;
>>>> +
>>>> +    if (op->op == DRM_NOUVEAU_VM_BIND_OP_MAP) {
>>>> +        op->gem.handle = uop->handle;
>>>> +        op->gem.offset = uop->bo_offset;
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static void
>>>> +bind_job_ops_free(struct list_head *ops)
>>>> +{
>>>> +    struct bind_job_op *op, *next;
>>>> +
>>>> +    list_for_each_op_safe(op, next, ops) {
>>>> +        list_del(&op->entry);
>>>> +        kfree(op);
>>>> +    }
>>>> +}
>>>> +
>>>> +int
>>>> +nouveau_bind_job_init(struct nouveau_bind_job **pjob,
>>>> +              struct nouveau_exec_bind *bind)
>>>> +{
>>>> +    struct nouveau_bind_job *job;
>>>> +    struct bind_job_op *op;
>>>> +    int i, ret;
>>>> +
>>>> +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
>>>> +    if (!job)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    INIT_LIST_HEAD(&job->ops);
>>>> +
>>>> +    for (i = 0; i < bind->op.count; i++) {
>>>> +        ret = bind_job_op_from_uop(&op, &bind->op.s[i]);
>>>> +        if (ret)
>>>> +            goto err_free;
>>>> +
>>>> +        list_add_tail(&op->entry, &job->ops);
>>>> +    }
>>>> +
>>>> +    job->base.sync = !(bind->flags & DRM_NOUVEAU_VM_BIND_RUN_ASYNC);
>>>> +    job->base.ops = &nouveau_bind_job_ops;
>>>> +
>>>> +    ret = nouveau_base_job_init(&job->base, &bind->base);
>>>> +    if (ret)
>>>> +        goto err_free;
>>>> +
>>>> +    return 0;
>>>> +
>>>> +err_free:
>>>> +    bind_job_ops_free(&job->ops);
>>>> +    kfree(job);
>>>> +    *pjob = NULL;
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static int
>>>> +sync_find_fence(struct nouveau_job *job,
>>>> +        struct drm_nouveau_sync *sync,
>>>> +        struct dma_fence **fence)
>>>> +{
>>>> +    u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
>>>> +    u64 point = 0;
>>>> +    int ret;
>>>> +
>>>> +    if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
>>>> +        stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
>>>> +        return -EOPNOTSUPP;
>>>> +
>>>> +    if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
>>>> +        point = sync->timeline_value;
>>>> +
>>>> +    ret = drm_syncobj_find_fence(job->file_priv,
>>>> +                     sync->handle, point,
>>>> +                     sync->flags, fence);
>>>> +    if (ret)
>>>> +        return ret;
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int
>>>> +exec_job_binds_wait(struct nouveau_job *job)
>>>> +{
>>>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>>>> +    struct nouveau_cli *cli = exec_job->base.cli;
>>>> +    struct nouveau_sched_entity *bind_entity = &cli->sched_entity;
>>>> +    signed long ret;
>>>> +    int i;
>>>> +
>>>> +    for (i = 0; i < job->in_sync.count; i++) {
>>>> +        struct nouveau_job *it;
>>>> +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
>>>> +        struct dma_fence *fence;
>>>> +        bool found;
>>>> +
>>>> +        ret = sync_find_fence(job, sync, &fence);
>>>> +        if (ret)
>>>> +            return ret;
>>>> +
>>>> +        mutex_lock(&bind_entity->job.mutex);
>>>> +        found = false;
>>>> +        list_for_each_entry(it, &bind_entity->job.list, head) {
>>>> +            if (fence == it->done_fence) {
>>>> +                found = true;
>>>> +                break;
>>>> +            }
>>>> +        }
>>>> +        mutex_unlock(&bind_entity->job.mutex);
>>>> +
>>>> +        /* If the fence is not from a VM_BIND job, don't wait for it. */
>>>> +        if (!found)
>>>> +            continue;
>>>> +
>>>> +        ret = dma_fence_wait_timeout(fence, true,
>>>> +                         msecs_to_jiffies(500));
>>>> +        if (ret < 0)
>>>> +            return ret;
>>>> +        else if (ret == 0)
>>>> +            return -ETIMEDOUT;
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +int
>>>> +nouveau_exec_job_submit(struct nouveau_job *job)
>>>> +{
>>>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>>>> +    struct nouveau_cli *cli = exec_job->base.cli;
>>>> +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(cli);
>>>> +    struct drm_exec *exec = &job->exec;
>>>> +    struct drm_gem_object *obj;
>>>> +    unsigned long index;
>>>> +    int ret;
>>>> +
>>>> +    ret = exec_job_binds_wait(job);
>>>> +    if (ret)
>>>> +        return ret;
>>>> +
>>>> +    nouveau_uvmm_lock(uvmm);
>>>> +    drm_exec_while_not_all_locked(exec) {
>>>> +        struct drm_gpuva *va;
>>>> +
>>>> +        drm_gpuva_for_each_va(va, &uvmm->umgr) {
>>>> +            ret = drm_exec_prepare_obj(exec, va->gem.obj, 1);
>>>> +            drm_exec_break_on_contention(exec);
>>>> +            if (ret)
>>>> +                return ret;
>>>> +        }
>>>> +    }
>>>> +    nouveau_uvmm_unlock(uvmm);
>>>> +
>>>> +    drm_exec_for_each_locked_object(exec, index, obj) {
>>>> +        struct dma_resv *resv = obj->resv;
>>>> +        struct nouveau_bo *nvbo = nouveau_gem_object(obj);
>>>> +
>>>> +        ret = nouveau_bo_validate(nvbo, true, false);
>>>> +        if (ret)
>>>> +            return ret;
>>>> +
>>>> +        dma_resv_add_fence(resv, job->done_fence, DMA_RESV_USAGE_WRITE);
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static struct dma_fence *
>>>> +nouveau_exec_job_run(struct nouveau_job *job)
>>>> +{
>>>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>>>> +    struct nouveau_fence *fence;
>>>> +    int i, ret;
>>>> +
>>>> +    ret = nouveau_dma_wait(job->chan, exec_job->push.count + 1, 16);
>>>> +    if (ret) {
>>>> +        NV_PRINTK(err, job->cli, "nv50cal_space: %d\n", ret);
>>>> +        return ERR_PTR(ret);
>>>> +    }
>>>> +
>>>> +    for (i = 0; i < exec_job->push.count; i++) {
>>>> +        nv50_dma_push(job->chan, exec_job->push.s[i].va,
>>>> +                  exec_job->push.s[i].va_len);
>>>> +    }
>>>> +
>>>> +    ret = nouveau_fence_new(job->chan, false, &fence);
>>>> +    if (ret) {
>>>> +        NV_PRINTK(err, job->cli, "error fencing pushbuf: %d\n", ret);
>>>> +        WIND_RING(job->chan);
>>>> +        return ERR_PTR(ret);
>>>> +    }
>>>> +
>>>> +    return &fence->base;
>>>> +}
>>>> +static void
>>>> +nouveau_exec_job_free(struct nouveau_job *job)
>>>> +{
>>>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>>>> +
>>>> +    nouveau_base_job_free(job);
>>>> +
>>>> +    kfree(exec_job->push.s);
>>>> +    kfree(exec_job);
>>>> +}
>>>> +
>>>> +static struct nouveau_job_ops nouveau_exec_job_ops = {
>>>> +    .submit = nouveau_exec_job_submit,
>>>> +    .run = nouveau_exec_job_run,
>>>> +    .free = nouveau_exec_job_free,
>>>> +};
>>>> +
>>>> +int
>>>> +nouveau_exec_job_init(struct nouveau_exec_job **pjob,
>>>> +              struct nouveau_exec *exec)
>>>> +{
>>>> +    struct nouveau_exec_job *job;
>>>> +    int ret;
>>>> +
>>>> +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
>>>> +    if (!job)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    job->push.count = exec->push.count;
>>>> +    job->push.s = kmemdup(exec->push.s,
>>>> +                  sizeof(*exec->push.s) *
>>>> +                  exec->push.count,
>>>> +                  GFP_KERNEL);
>>>> +    if (!job->push.s) {
>>>> +        ret = -ENOMEM;
>>>> +        goto err_free_job;
>>>> +    }
>>>> +
>>>> +    job->base.ops = &nouveau_exec_job_ops;
>>>> +    ret = nouveau_base_job_init(&job->base, &exec->base);
>>>> +    if (ret)
>>>> +        goto err_free_pushs;
>>>> +
>>>> +    return 0;
>>>> +
>>>> +err_free_pushs:
>>>> +    kfree(job->push.s);
>>>> +err_free_job:
>>>> +    kfree(job);
>>>> +    *pjob = NULL;
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +void nouveau_job_fini(struct nouveau_job *job)
>>>> +{
>>>> +    dma_fence_put(job->done_fence);
>>>> +    drm_sched_job_cleanup(&job->base);
>>>> +    job->ops->free(job);
>>>> +}
>>>> +
>>>> +static int
>>>> +nouveau_job_add_deps(struct nouveau_job *job)
>>>> +{
>>>> +    struct dma_fence *in_fence = NULL;
>>>> +    int ret, i;
>>>> +
>>>> +    for (i = 0; i < job->in_sync.count; i++) {
>>>> +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
>>>> +
>>>> +        ret = sync_find_fence(job, sync, &in_fence);
>>>> +        if (ret) {
>>>> +            NV_PRINTK(warn, job->cli,
>>>> +                  "Failed to find syncobj (-> in): handle=%d\n",
>>>> +                  sync->handle);
>>>> +            return ret;
>>>> +        }
>>>> +
>>>> +        ret = drm_sched_job_add_dependency(&job->base, in_fence);
>>>> +        if (ret)
>>>> +            return ret;
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int
>>>> +nouveau_job_fence_attach(struct nouveau_job *job, struct dma_fence
>>>> *fence)
>>>> +{
>>>> +    struct drm_syncobj *out_sync;
>>>> +    int i;
>>>> +
>>>> +    for (i = 0; i < job->out_sync.count; i++) {
>>>> +        struct drm_nouveau_sync *sync = &job->out_sync.s[i];
>>>> +        u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
>>>> +
>>>> +        if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
>>>> +            stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
>>>> +            return -EOPNOTSUPP;
>>>> +
>>>> +        out_sync = drm_syncobj_find(job->file_priv, sync->handle);
>>>> +        if (!out_sync) {
>>>> +            NV_PRINTK(warn, job->cli,
>>>> +                  "Failed to find syncobj (-> out): handle=%d\n",
>>>> +                  sync->handle);
>>>> +            return -ENOENT;
>>>> +        }
>>>> +
>>>> +        if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ) {
>>>> +            struct dma_fence_chain *chain;
>>>> +
>>>> +            chain = dma_fence_chain_alloc();
>>>> +            if (!chain) {
>>>> +                drm_syncobj_put(out_sync);
>>>> +                return -ENOMEM;
>>>> +            }
>>>> +
>>>> +            drm_syncobj_add_point(out_sync, chain, fence,
>>>> +                          sync->timeline_value);
>>>> +        } else {
>>>> +            drm_syncobj_replace_fence(out_sync, fence);
>>>> +        }
>>>> +
>>>> +        drm_syncobj_put(out_sync);
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static struct dma_fence *
>>>> +nouveau_job_run(struct nouveau_job *job)
>>>> +{
>>>> +    return job->ops->run(job);
>>>> +}
>>>> +
>>>> +static int
>>>> +nouveau_job_run_sync(struct nouveau_job *job)
>>>> +{
>>>> +    struct dma_fence *fence;
>>>> +    int ret;
>>>> +
>>>> +    fence = nouveau_job_run(job);
>>>> +    if (IS_ERR(fence)) {
>>>> +        return PTR_ERR(fence);
>>>> +    } else if (fence) {
>>>> +        ret = dma_fence_wait(fence, true);
>>>> +        if (ret)
>>>> +            return ret;
>>>> +    }
>>>> +
>>>> +    dma_fence_signal(job->done_fence);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +int
>>>> +nouveau_job_submit(struct nouveau_job *job)
>>>> +{
>>>> +    struct nouveau_sched_entity *entity =
>>>> to_nouveau_sched_entity(job->base.entity);
>>>> +    int ret;
>>>> +
>>>> +    drm_exec_init(&job->exec, true);
>>>> +
>>>> +    ret = nouveau_job_add_deps(job);
>>>> +    if (ret)
>>>> +        goto out;
>>>> +
>>>> +    drm_sched_job_arm(&job->base);
>>>> +    job->done_fence = dma_fence_get(&job->base.s_fence->finished);
>>>> +
>>>> +    ret = nouveau_job_fence_attach(job, job->done_fence);
>>>> +    if (ret)
>>>> +        goto out;
>>>> +
>>>> +    if (job->ops->submit) {
>>>> +        ret = job->ops->submit(job);
>>>> +        if (ret)
>>>> +            goto out;
>>>> +    }
>>>> +
>>>> +    if (job->sync) {
>>>> +        drm_exec_fini(&job->exec);
>>>> +
>>>> +        /* We're requested to run a synchronous job, hence don't push
>>>> +         * the job, bypassing the job scheduler, and execute the job's
>>>> +         * run() function right away.
>>>> +         *
>>>> +         * As a consequence of bypassing the job scheduler we need to
>>>> +         * handle fencing and job cleanup ourselves.
>>>> +         */
>>>> +        ret = nouveau_job_run_sync(job);
>>>> +
>>>> +        /* If the job fails, the caller will do the cleanup for us. */
>>>> +        if (!ret)
>>>> +            nouveau_job_fini(job);
>>>> +
>>>> +        return ret;
>>>> +    } else {
>>>> +        mutex_lock(&entity->job.mutex);
>>>> +        drm_sched_entity_push_job(&job->base);
>>>> +        list_add_tail(&job->head, &entity->job.list);
>>>> +        mutex_unlock(&entity->job.mutex);
>>>> +    }
>>>> +
>>>> +out:
>>>> +    drm_exec_fini(&job->exec);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static struct dma_fence *
>>>> +nouveau_sched_run_job(struct drm_sched_job *sched_job)
>>>> +{
>>>> +    struct nouveau_job *job = to_nouveau_job(sched_job);
>>>> +
>>>> +    return nouveau_job_run(job);
>>>> +}
>>>> +
>>>> +static enum drm_gpu_sched_stat
>>>> +nouveau_sched_timedout_job(struct drm_sched_job *sched_job)
>>>> +{
>>>> +    struct nouveau_job *job = to_nouveau_job(sched_job);
>>>> +    struct nouveau_channel *chan = job->chan;
>>>> +
>>>> +    if (unlikely(!atomic_read(&chan->killed)))
>>>> +        nouveau_channel_kill(chan);
>>>> +
>>>> +    NV_PRINTK(warn, job->cli, "job timeout, channel %d killed!\n",
>>>> +          chan->chid);
>>>> +
>>>> +    nouveau_sched_entity_fini(job->entity);
>>>> +
>>>> +    return DRM_GPU_SCHED_STAT_ENODEV;
>>>> +}
>>>> +
>>>> +static void
>>>> +nouveau_sched_free_job(struct drm_sched_job *sched_job)
>>>> +{
>>>> +    struct nouveau_job *job = to_nouveau_job(sched_job);
>>>> +    struct nouveau_sched_entity *entity = job->entity;
>>>> +
>>>> +    mutex_lock(&entity->job.mutex);
>>>> +    list_del(&job->head);
>>>> +    mutex_unlock(&entity->job.mutex);
>>>> +
>>>> +    nouveau_job_fini(job);
>>>> +}
>>>> +
>>>> +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
>>>> +                  struct drm_gpu_scheduler *sched)
>>>> +{
>>>> +
>>>> +    INIT_LIST_HEAD(&entity->job.list);
>>>> +    mutex_init(&entity->job.mutex);
>>>> +
>>>> +    return drm_sched_entity_init(&entity->base,
>>>> +                     DRM_SCHED_PRIORITY_NORMAL,
>>>> +                     &sched, 1, NULL);
>>>> +}
>>>> +
>>>> +void
>>>> +nouveau_sched_entity_fini(struct nouveau_sched_entity *entity)
>>>> +{
>>>> +    drm_sched_entity_destroy(&entity->base);
>>>> +}
>>>> +
>>>> +static const struct drm_sched_backend_ops nouveau_sched_ops = {
>>>> +    .run_job = nouveau_sched_run_job,
>>>> +    .timedout_job = nouveau_sched_timedout_job,
>>>> +    .free_job = nouveau_sched_free_job,
>>>> +};
>>>> +
>>>> +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
>>>> +               struct nouveau_drm *drm)
>>>> +{
>>>> +    long job_hang_limit =
>>>> msecs_to_jiffies(NOUVEAU_SCHED_JOB_TIMEOUT_MS);
>>>> +
>>>> +    return drm_sched_init(sched, &nouveau_sched_ops,
>>>> +                  NOUVEAU_SCHED_HW_SUBMISSIONS, 0, job_hang_limit,
>>>> +                  NULL, NULL, "nouveau", drm->dev->dev);
>>>> +}
>>>> +
>>>> +void nouveau_sched_fini(struct drm_gpu_scheduler *sched)
>>>> +{
>>>> +    drm_sched_fini(sched);
>>>> +}
>>>> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h
>>>> b/drivers/gpu/drm/nouveau/nouveau_sched.h
>>>> new file mode 100644
>>>> index 000000000000..7fc5b7eea810
>>>> --- /dev/null
>>>> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.h
>>>> @@ -0,0 +1,98 @@
>>>> +// SPDX-License-Identifier: MIT
>>>> +
>>>> +#ifndef NOUVEAU_SCHED_H
>>>> +#define NOUVEAU_SCHED_H
>>>> +
>>>> +#include <linux/types.h>
>>>> +
>>>> +#include <drm/drm_exec.h>
>>>> +#include <drm/gpu_scheduler.h>
>>>> +
>>>> +#include "nouveau_drv.h"
>>>> +#include "nouveau_exec.h"
>>>> +
>>>> +#define to_nouveau_job(sched_job)        \
>>>> +        container_of((sched_job), struct nouveau_job, base)
>>>> +
>>>> +#define to_nouveau_exec_job(job)        \
>>>> +        container_of((job), struct nouveau_exec_job, base)
>>>> +
>>>> +#define to_nouveau_bind_job(job)        \
>>>> +        container_of((job), struct nouveau_bind_job, base)
>>>> +
>>>> +struct nouveau_job {
>>>> +    struct drm_sched_job base;
>>>> +    struct list_head head;
>>>> +
>>>> +    struct nouveau_sched_entity *entity;
>>>> +
>>>> +    struct drm_file *file_priv;
>>>> +    struct nouveau_cli *cli;
>>>> +    struct nouveau_channel *chan;
>>>> +
>>>> +    struct drm_exec exec;
>>>> +    struct dma_fence *done_fence;
>>>> +
>>>> +    bool sync;
>>>> +
>>>> +    struct {
>>>> +        struct drm_nouveau_sync *s;
>>>> +        u32 count;
>>>> +    } in_sync;
>>>> +
>>>> +    struct {
>>>> +        struct drm_nouveau_sync *s;
>>>> +        u32 count;
>>>> +    } out_sync;
>>>> +
>>>> +    struct nouveau_job_ops {
>>>> +        int (*submit)(struct nouveau_job *);
>>>> +        struct dma_fence *(*run)(struct nouveau_job *);
>>>> +        void (*free)(struct nouveau_job *);
>>>> +    } *ops;
>>>> +};
>>>> +
>>>> +struct nouveau_exec_job {
>>>> +    struct nouveau_job base;
>>>> +
>>>> +    struct {
>>>> +        struct drm_nouveau_exec_push *s;
>>>> +        u32 count;
>>>> +    } push;
>>>> +};
>>>> +
>>>> +struct nouveau_bind_job {
>>>> +    struct nouveau_job base;
>>>> +
>>>> +    /* struct bind_job_op */
>>>> +    struct list_head ops;
>>>> +};
>>>> +
>>>> +int nouveau_bind_job_init(struct nouveau_bind_job **job,
>>>> +              struct nouveau_exec_bind *bind);
>>>> +int nouveau_exec_job_init(struct nouveau_exec_job **job,
>>>> +              struct nouveau_exec *exec);
>>>> +
>>>> +int nouveau_job_submit(struct nouveau_job *job);
>>>> +void nouveau_job_fini(struct nouveau_job *job);
>>>> +
>>>> +#define to_nouveau_sched_entity(entity)        \
>>>> +        container_of((entity), struct nouveau_sched_entity, base)
>>>> +
>>>> +struct nouveau_sched_entity {
>>>> +    struct drm_sched_entity base;
>>>> +    struct {
>>>> +        struct list_head list;
>>>> +        struct mutex mutex;
>>>> +    } job;
>>>> +};
>>>> +
>>>> +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
>>>> +                  struct drm_gpu_scheduler *sched);
>>>> +void nouveau_sched_entity_fini(struct nouveau_sched_entity *entity);
>>>> +
>>>> +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
>>>> +               struct nouveau_drm *drm);
>>>> +void nouveau_sched_fini(struct drm_gpu_scheduler *sched);
>>>> +
>>>> +#endif
Danilo Krummrich Jan. 19, 2023, 3:36 p.m. UTC | #6
On 1/19/23 05:58, Matthew Brost wrote:
> On Thu, Jan 19, 2023 at 04:44:23AM +0100, Danilo Krummrich wrote:
>> On 1/18/23 21:37, Thomas Hellström (Intel) wrote:
>>>
>>> On 1/18/23 07:12, Danilo Krummrich wrote:
>>>> This commit provides the implementation for the new uapi motivated by the
>>>> Vulkan API. It allows user mode drivers (UMDs) to:
>>>>
>>>> 1) Initialize a GPU virtual address (VA) space via the new
>>>>      DRM_IOCTL_NOUVEAU_VM_INIT ioctl for UMDs to specify the portion of VA
>>>>      space managed by the kernel and userspace, respectively.
>>>>
>>>> 2) Allocate and free a VA space region as well as bind and unbind memory
>>>>      to the GPUs VA space via the new DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
>>>>      UMDs can request the named operations to be processed either
>>>>      synchronously or asynchronously. It supports DRM syncobjs
>>>>      (incl. timelines) as synchronization mechanism. The management of the
>>>>      GPU VA mappings is implemented with the DRM GPU VA manager.
>>>>
>>>> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl. The
>>>>      execution happens asynchronously. It supports DRM syncobj (incl.
>>>>      timelines) as synchronization mechanism. DRM GEM object locking is
>>>>      handled with drm_exec.
>>>>
>>>> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC, use the DRM
>>>> GPU scheduler for the asynchronous paths.
>>>>
>>>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>>>> ---
>>>>    Documentation/gpu/driver-uapi.rst       |   3 +
>>>>    drivers/gpu/drm/nouveau/Kbuild          |   2 +
>>>>    drivers/gpu/drm/nouveau/Kconfig         |   2 +
>>>>    drivers/gpu/drm/nouveau/nouveau_abi16.c |  16 +
>>>>    drivers/gpu/drm/nouveau/nouveau_abi16.h |   1 +
>>>>    drivers/gpu/drm/nouveau/nouveau_drm.c   |  23 +-
>>>>    drivers/gpu/drm/nouveau/nouveau_drv.h   |   9 +-
>>>>    drivers/gpu/drm/nouveau/nouveau_exec.c  | 310 ++++++++++
>>>>    drivers/gpu/drm/nouveau/nouveau_exec.h  |  55 ++
>>>>    drivers/gpu/drm/nouveau/nouveau_sched.c | 780 ++++++++++++++++++++++++
>>>>    drivers/gpu/drm/nouveau/nouveau_sched.h |  98 +++
>>>>    11 files changed, 1295 insertions(+), 4 deletions(-)
>>>>    create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.c
>>>>    create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.h
>>>>    create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.c
>>>>    create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.h
>>> ...
>>>>
>>>> +static struct dma_fence *
>>>> +nouveau_bind_job_run(struct nouveau_job *job)
>>>> +{
>>>> +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
>>>> +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(job->cli);
>>>> +    struct bind_job_op *op;
>>>> +    int ret = 0;
>>>> +
>>>
>>> I was looking at how nouveau does the async binding compared to how xe
>>> does it.
>>> It looks to me that this function being a scheduler run_job callback is
>>> the main part of the VM_BIND dma-fence signalling critical section for
>>> the job's done_fence and if so, needs to be annotated as such?
>>
>> Yes, that's the case.
>>
>>>
>>> For example nouveau_uvma_region_new allocates memory, which is not
>>> allowed if in a dma_fence signalling critical section and the locking
>>> also looks suspicious?
>>
>> Thanks for pointing this out, I missed that somehow.
>>
>> I will change it to pre-allocate new regions, mappings and page tables
>> within the job's submit() function.
>>
> 
> Yea, that's what we basically do in Xe: in the IOCTL step we allocate all
> the backing store for new page tables and populate the new page tables
> (these are not yet visible in the page table structure), and in the last
> step, which is executed after all the dependencies are satisfied, we
> program all the leaf entries, making the new binding visible.
> 
> We have screwed this up by deferring most of the IOCTL to a worker, but
> will fix this one way or another soon - get rid of the worker or
> introduce a type of sync that is signaled after the worker + publish the
> dma-fence in the worker. I'd like to close on this one soon.
>   
>> For the ops structures the drm_gpuva_manager allocates for reporting the
>> split/merge steps back to the driver I have ideas to entirely avoid
>> allocations, which also is a good thing with respect to Christian's
>> feedback regarding the huge amount of mapping requests some applications
>> seem to generate.
>>
> 
> It should be fine to have allocations to report the split/merge step, as
> this step should happen before a dma-fence is published, but yea, avoid
> extra allocs if possible, as that is always better.

I think we can't really ask for the split/merge steps before actually 
running the job, since it requires the particular VA space not to change 
while performing those operations.

E.g. if we'd run the split/merge steps at job submit() time the 
underlying VA space could be changed by other bind jobs executing before 
this one, which would make the calculated split/merge steps obsolete and 
wrong.

Anyway, I should be able to get rid of all the allocations to make this 
safe.

> 
> Also BTW, great work on drm_gpuva_manager too. We will most likely
> pick this up in Xe rather than open coding all of this as we currently
> do. We should probably start the port to this soon so we can contribute
> to the implementation and get both of our drivers upstream sooner.

Sounds great!

>   
>> Regarding the locking, anything specific that makes it look suspicious to
>> you?
>>
> 
> I haven't looked into this too much, but almost certainly Thomas is
> suggesting that if you allocate memory anywhere under the nouveau_uvmm_lock
> then you can't use this lock in the run_job() callback, as this is in the
> dma-fencing path.

Oh, sure. I already checked that, luckily there aren't any further 
allocations under this lock, so this should be safe once I've changed the 
run_job() parts to pre-allocation in submit().

> 
> Matt
> 
>>>
>>> Thanks,
>>>
>>> Thomas
>>>
>>>
>>>> +    nouveau_uvmm_lock(uvmm);
>>>> +    list_for_each_op(op, &bind_job->ops) {
>>>> +        switch (op->op) {
>>>> +        case OP_ALLOC: {
>>>> +            bool sparse = op->flags & DRM_NOUVEAU_VM_BIND_SPARSE;
>>>> +
>>>> +            ret = nouveau_uvma_region_new(uvmm,
>>>> +                              op->va.addr,
>>>> +                              op->va.range,
>>>> +                              sparse);
>>>> +            if (ret)
>>>> +                goto out_unlock;
>>>> +            break;
>>>> +        }
>>>> +        case OP_FREE:
>>>> +            ret = nouveau_uvma_region_destroy(uvmm,
>>>> +                              op->va.addr,
>>>> +                              op->va.range);
>>>> +            if (ret)
>>>> +                goto out_unlock;
>>>> +            break;
>>>> +        case OP_MAP:
>>>> +            ret = nouveau_uvmm_sm_map(uvmm,
>>>> +                          op->va.addr, op->va.range,
>>>> +                          op->gem.obj, op->gem.offset,
>>>> +                          op->flags && 0xff);
>>>> +            if (ret)
>>>> +                goto out_unlock;
>>>> +            break;
>>>> +        case OP_UNMAP:
>>>> +            ret = nouveau_uvmm_sm_unmap(uvmm,
>>>> +                            op->va.addr,
>>>> +                            op->va.range);
>>>> +            if (ret)
>>>> +                goto out_unlock;
>>>> +            break;
>>>> +        }
>>>> +    }
>>>> +
>>>> +out_unlock:
>>>> +    nouveau_uvmm_unlock(uvmm);
>>>> +    if (ret)
>>>> +        NV_PRINTK(err, job->cli, "bind job failed: %d\n", ret);
>>>> +    return ERR_PTR(ret);
>>>> +}
>>>> +
>>>> +static void
>>>> +nouveau_bind_job_free(struct nouveau_job *job)
>>>> +{
>>>> +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
>>>> +    struct bind_job_op *op, *next;
>>>> +
>>>> +    list_for_each_op_safe(op, next, &bind_job->ops) {
>>>> +        struct drm_gem_object *obj = op->gem.obj;
>>>> +
>>>> +        if (obj)
>>>> +            drm_gem_object_put(obj);
>>>> +
>>>> +        list_del(&op->entry);
>>>> +        kfree(op);
>>>> +    }
>>>> +
>>>> +    nouveau_base_job_free(job);
>>>> +    kfree(bind_job);
>>>> +}
>>>> +
>>>> +static struct nouveau_job_ops nouveau_bind_job_ops = {
>>>> +    .submit = nouveau_bind_job_submit,
>>>> +    .run = nouveau_bind_job_run,
>>>> +    .free = nouveau_bind_job_free,
>>>> +};
>>>> +
>>>> +static int
>>>> +bind_job_op_from_uop(struct bind_job_op **pop,
>>>> +             struct drm_nouveau_vm_bind_op *uop)
>>>> +{
>>>> +    struct bind_job_op *op;
>>>> +
>>>> +    op = *pop = kzalloc(sizeof(*op), GFP_KERNEL);
>>>> +    if (!op)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    op->op = uop->op;
>>>> +    op->flags = uop->flags;
>>>> +    op->va.addr = uop->addr;
>>>> +    op->va.range = uop->range;
>>>> +
>>>> +    if (op->op == DRM_NOUVEAU_VM_BIND_OP_MAP) {
>>>> +        op->gem.handle = uop->handle;
>>>> +        op->gem.offset = uop->bo_offset;
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static void
>>>> +bind_job_ops_free(struct list_head *ops)
>>>> +{
>>>> +    struct bind_job_op *op, *next;
>>>> +
>>>> +    list_for_each_op_safe(op, next, ops) {
>>>> +        list_del(&op->entry);
>>>> +        kfree(op);
>>>> +    }
>>>> +}
>>>> +
>>>> +int
>>>> +nouveau_bind_job_init(struct nouveau_bind_job **pjob,
>>>> +              struct nouveau_exec_bind *bind)
>>>> +{
>>>> +    struct nouveau_bind_job *job;
>>>> +    struct bind_job_op *op;
>>>> +    int i, ret;
>>>> +
>>>> +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
>>>> +    if (!job)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    INIT_LIST_HEAD(&job->ops);
>>>> +
>>>> +    for (i = 0; i < bind->op.count; i++) {
>>>> +        ret = bind_job_op_from_uop(&op, &bind->op.s[i]);
>>>> +        if (ret)
>>>> +            goto err_free;
>>>> +
>>>> +        list_add_tail(&op->entry, &job->ops);
>>>> +    }
>>>> +
>>>> +    job->base.sync = !(bind->flags & DRM_NOUVEAU_VM_BIND_RUN_ASYNC);
>>>> +    job->base.ops = &nouveau_bind_job_ops;
>>>> +
>>>> +    ret = nouveau_base_job_init(&job->base, &bind->base);
>>>> +    if (ret)
>>>> +        goto err_free;
>>>> +
>>>> +    return 0;
>>>> +
>>>> +err_free:
>>>> +    bind_job_ops_free(&job->ops);
>>>> +    kfree(job);
>>>> +    *pjob = NULL;
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static int
>>>> +sync_find_fence(struct nouveau_job *job,
>>>> +        struct drm_nouveau_sync *sync,
>>>> +        struct dma_fence **fence)
>>>> +{
>>>> +    u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
>>>> +    u64 point = 0;
>>>> +    int ret;
>>>> +
>>>> +    if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
>>>> +        stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
>>>> +        return -EOPNOTSUPP;
>>>> +
>>>> +    if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
>>>> +        point = sync->timeline_value;
>>>> +
>>>> +    ret = drm_syncobj_find_fence(job->file_priv,
>>>> +                     sync->handle, point,
>>>> +                     sync->flags, fence);
>>>> +    if (ret)
>>>> +        return ret;
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int
>>>> +exec_job_binds_wait(struct nouveau_job *job)
>>>> +{
>>>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>>>> +    struct nouveau_cli *cli = exec_job->base.cli;
>>>> +    struct nouveau_sched_entity *bind_entity = &cli->sched_entity;
>>>> +    signed long ret;
>>>> +    int i;
>>>> +
>>>> +    for (i = 0; i < job->in_sync.count; i++) {
>>>> +        struct nouveau_job *it;
>>>> +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
>>>> +        struct dma_fence *fence;
>>>> +        bool found;
>>>> +
>>>> +        ret = sync_find_fence(job, sync, &fence);
>>>> +        if (ret)
>>>> +            return ret;
>>>> +
>>>> +        mutex_lock(&bind_entity->job.mutex);
>>>> +        found = false;
>>>> +        list_for_each_entry(it, &bind_entity->job.list, head) {
>>>> +            if (fence == it->done_fence) {
>>>> +                found = true;
>>>> +                break;
>>>> +            }
>>>> +        }
>>>> +        mutex_unlock(&bind_entity->job.mutex);
>>>> +
>>>> +        /* If the fence is not from a VM_BIND job, don't wait for it. */
>>>> +        if (!found)
>>>> +            continue;
>>>> +
>>>> +        ret = dma_fence_wait_timeout(fence, true,
>>>> +                         msecs_to_jiffies(500));
>>>> +        if (ret < 0)
>>>> +            return ret;
>>>> +        else if (ret == 0)
>>>> +            return -ETIMEDOUT;
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +int
>>>> +nouveau_exec_job_submit(struct nouveau_job *job)
>>>> +{
>>>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>>>> +    struct nouveau_cli *cli = exec_job->base.cli;
>>>> +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(cli);
>>>> +    struct drm_exec *exec = &job->exec;
>>>> +    struct drm_gem_object *obj;
>>>> +    unsigned long index;
>>>> +    int ret;
>>>> +
>>>> +    ret = exec_job_binds_wait(job);
>>>> +    if (ret)
>>>> +        return ret;
>>>> +
>>>> +    nouveau_uvmm_lock(uvmm);
>>>> +    drm_exec_while_not_all_locked(exec) {
>>>> +        struct drm_gpuva *va;
>>>> +
>>>> +        drm_gpuva_for_each_va(va, &uvmm->umgr) {
>>>> +            ret = drm_exec_prepare_obj(exec, va->gem.obj, 1);
>>>> +            drm_exec_break_on_contention(exec);
>>>> +            if (ret)
>>>> +                return ret;
>>>> +        }
>>>> +    }
>>>> +    nouveau_uvmm_unlock(uvmm);
>>>> +
>>>> +    drm_exec_for_each_locked_object(exec, index, obj) {
>>>> +        struct dma_resv *resv = obj->resv;
>>>> +        struct nouveau_bo *nvbo = nouveau_gem_object(obj);
>>>> +
>>>> +        ret = nouveau_bo_validate(nvbo, true, false);
>>>> +        if (ret)
>>>> +            return ret;
>>>> +
>>>> +        dma_resv_add_fence(resv, job->done_fence, DMA_RESV_USAGE_WRITE);
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static struct dma_fence *
>>>> +nouveau_exec_job_run(struct nouveau_job *job)
>>>> +{
>>>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>>>> +    struct nouveau_fence *fence;
>>>> +    int i, ret;
>>>> +
>>>> +    ret = nouveau_dma_wait(job->chan, exec_job->push.count + 1, 16);
>>>> +    if (ret) {
>>>> +        NV_PRINTK(err, job->cli, "nv50cal_space: %d\n", ret);
>>>> +        return ERR_PTR(ret);
>>>> +    }
>>>> +
>>>> +    for (i = 0; i < exec_job->push.count; i++) {
>>>> +        nv50_dma_push(job->chan, exec_job->push.s[i].va,
>>>> +                  exec_job->push.s[i].va_len);
>>>> +    }
>>>> +
>>>> +    ret = nouveau_fence_new(job->chan, false, &fence);
>>>> +    if (ret) {
>>>> +        NV_PRINTK(err, job->cli, "error fencing pushbuf: %d\n", ret);
>>>> +        WIND_RING(job->chan);
>>>> +        return ERR_PTR(ret);
>>>> +    }
>>>> +
>>>> +    return &fence->base;
>>>> +}
>>>> +static void
>>>> +nouveau_exec_job_free(struct nouveau_job *job)
>>>> +{
>>>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>>>> +
>>>> +    nouveau_base_job_free(job);
>>>> +
>>>> +    kfree(exec_job->push.s);
>>>> +    kfree(exec_job);
>>>> +}
>>>> +
>>>> +static struct nouveau_job_ops nouveau_exec_job_ops = {
>>>> +    .submit = nouveau_exec_job_submit,
>>>> +    .run = nouveau_exec_job_run,
>>>> +    .free = nouveau_exec_job_free,
>>>> +};
>>>> +
>>>> +int
>>>> +nouveau_exec_job_init(struct nouveau_exec_job **pjob,
>>>> +              struct nouveau_exec *exec)
>>>> +{
>>>> +    struct nouveau_exec_job *job;
>>>> +    int ret;
>>>> +
>>>> +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
>>>> +    if (!job)
>>>> +        return -ENOMEM;
>>>> +
>>>> +    job->push.count = exec->push.count;
>>>> +    job->push.s = kmemdup(exec->push.s,
>>>> +                  sizeof(*exec->push.s) *
>>>> +                  exec->push.count,
>>>> +                  GFP_KERNEL);
>>>> +    if (!job->push.s) {
>>>> +        ret = -ENOMEM;
>>>> +        goto err_free_job;
>>>> +    }
>>>> +
>>>> +    job->base.ops = &nouveau_exec_job_ops;
>>>> +    ret = nouveau_base_job_init(&job->base, &exec->base);
>>>> +    if (ret)
>>>> +        goto err_free_pushs;
>>>> +
>>>> +    return 0;
>>>> +
>>>> +err_free_pushs:
>>>> +    kfree(job->push.s);
>>>> +err_free_job:
>>>> +    kfree(job);
>>>> +    *pjob = NULL;
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +void nouveau_job_fini(struct nouveau_job *job)
>>>> +{
>>>> +    dma_fence_put(job->done_fence);
>>>> +    drm_sched_job_cleanup(&job->base);
>>>> +    job->ops->free(job);
>>>> +}
>>>> +
>>>> +static int
>>>> +nouveau_job_add_deps(struct nouveau_job *job)
>>>> +{
>>>> +    struct dma_fence *in_fence = NULL;
>>>> +    int ret, i;
>>>> +
>>>> +    for (i = 0; i < job->in_sync.count; i++) {
>>>> +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
>>>> +
>>>> +        ret = sync_find_fence(job, sync, &in_fence);
>>>> +        if (ret) {
>>>> +            NV_PRINTK(warn, job->cli,
>>>> +                  "Failed to find syncobj (-> in): handle=%d\n",
>>>> +                  sync->handle);
>>>> +            return ret;
>>>> +        }
>>>> +
>>>> +        ret = drm_sched_job_add_dependency(&job->base, in_fence);
>>>> +        if (ret)
>>>> +            return ret;
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int
>>>> +nouveau_job_fence_attach(struct nouveau_job *job, struct dma_fence
>>>> *fence)
>>>> +{
>>>> +    struct drm_syncobj *out_sync;
>>>> +    int i;
>>>> +
>>>> +    for (i = 0; i < job->out_sync.count; i++) {
>>>> +        struct drm_nouveau_sync *sync = &job->out_sync.s[i];
>>>> +        u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
>>>> +
>>>> +        if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
>>>> +            stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
>>>> +            return -EOPNOTSUPP;
>>>> +
>>>> +        out_sync = drm_syncobj_find(job->file_priv, sync->handle);
>>>> +        if (!out_sync) {
>>>> +            NV_PRINTK(warn, job->cli,
>>>> +                  "Failed to find syncobj (-> out): handle=%d\n",
>>>> +                  sync->handle);
>>>> +            return -ENOENT;
>>>> +        }
>>>> +
>>>> +        if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ) {
>>>> +            struct dma_fence_chain *chain;
>>>> +
>>>> +            chain = dma_fence_chain_alloc();
>>>> +            if (!chain) {
>>>> +                drm_syncobj_put(out_sync);
>>>> +                return -ENOMEM;
>>>> +            }
>>>> +
>>>> +            drm_syncobj_add_point(out_sync, chain, fence,
>>>> +                          sync->timeline_value);
>>>> +        } else {
>>>> +            drm_syncobj_replace_fence(out_sync, fence);
>>>> +        }
>>>> +
>>>> +        drm_syncobj_put(out_sync);
>>>> +    }
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static struct dma_fence *
>>>> +nouveau_job_run(struct nouveau_job *job)
>>>> +{
>>>> +    return job->ops->run(job);
>>>> +}
>>>> +
>>>> +static int
>>>> +nouveau_job_run_sync(struct nouveau_job *job)
>>>> +{
>>>> +    struct dma_fence *fence;
>>>> +    int ret;
>>>> +
>>>> +    fence = nouveau_job_run(job);
>>>> +    if (IS_ERR(fence)) {
>>>> +        return PTR_ERR(fence);
>>>> +    } else if (fence) {
>>>> +        ret = dma_fence_wait(fence, true);
>>>> +        if (ret)
>>>> +            return ret;
>>>> +    }
>>>> +
>>>> +    dma_fence_signal(job->done_fence);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +int
>>>> +nouveau_job_submit(struct nouveau_job *job)
>>>> +{
>>>> +    struct nouveau_sched_entity *entity =
>>>> to_nouveau_sched_entity(job->base.entity);
>>>> +    int ret;
>>>> +
>>>> +    drm_exec_init(&job->exec, true);
>>>> +
>>>> +    ret = nouveau_job_add_deps(job);
>>>> +    if (ret)
>>>> +        goto out;
>>>> +
>>>> +    drm_sched_job_arm(&job->base);
>>>> +    job->done_fence = dma_fence_get(&job->base.s_fence->finished);
>>>> +
>>>> +    ret = nouveau_job_fence_attach(job, job->done_fence);
>>>> +    if (ret)
>>>> +        goto out;
>>>> +
>>>> +    if (job->ops->submit) {
>>>> +        ret = job->ops->submit(job);
>>>> +        if (ret)
>>>> +            goto out;
>>>> +    }
>>>> +
>>>> +    if (job->sync) {
>>>> +        drm_exec_fini(&job->exec);
>>>> +
>>>> +        /* We're requested to run a synchronous job, hence don't push
>>>> +         * the job, bypassing the job scheduler, and execute the job's
>>>> +         * run() function right away.
>>>> +         *
>>>> +         * As a consequence of bypassing the job scheduler we need to
>>>> +         * handle fencing and job cleanup ourselves.
>>>> +         */
>>>> +        ret = nouveau_job_run_sync(job);
>>>> +
>>>> +        /* If the job fails, the caller will do the cleanup for us. */
>>>> +        if (!ret)
>>>> +            nouveau_job_fini(job);
>>>> +
>>>> +        return ret;
>>>> +    } else {
>>>> +        mutex_lock(&entity->job.mutex);
>>>> +        drm_sched_entity_push_job(&job->base);
>>>> +        list_add_tail(&job->head, &entity->job.list);
>>>> +        mutex_unlock(&entity->job.mutex);
>>>> +    }
>>>> +
>>>> +out:
>>>> +    drm_exec_fini(&job->exec);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static struct dma_fence *
>>>> +nouveau_sched_run_job(struct drm_sched_job *sched_job)
>>>> +{
>>>> +    struct nouveau_job *job = to_nouveau_job(sched_job);
>>>> +
>>>> +    return nouveau_job_run(job);
>>>> +}
>>>> +
>>>> +static enum drm_gpu_sched_stat
>>>> +nouveau_sched_timedout_job(struct drm_sched_job *sched_job)
>>>> +{
>>>> +    struct nouveau_job *job = to_nouveau_job(sched_job);
>>>> +    struct nouveau_channel *chan = job->chan;
>>>> +
>>>> +    if (unlikely(!atomic_read(&chan->killed)))
>>>> +        nouveau_channel_kill(chan);
>>>> +
>>>> +    NV_PRINTK(warn, job->cli, "job timeout, channel %d killed!\n",
>>>> +          chan->chid);
>>>> +
>>>> +    nouveau_sched_entity_fini(job->entity);
>>>> +
>>>> +    return DRM_GPU_SCHED_STAT_ENODEV;
>>>> +}
>>>> +
>>>> +static void
>>>> +nouveau_sched_free_job(struct drm_sched_job *sched_job)
>>>> +{
>>>> +    struct nouveau_job *job = to_nouveau_job(sched_job);
>>>> +    struct nouveau_sched_entity *entity = job->entity;
>>>> +
>>>> +    mutex_lock(&entity->job.mutex);
>>>> +    list_del(&job->head);
>>>> +    mutex_unlock(&entity->job.mutex);
>>>> +
>>>> +    nouveau_job_fini(job);
>>>> +}
>>>> +
>>>> +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
>>>> +                  struct drm_gpu_scheduler *sched)
>>>> +{
>>>> +
>>>> +    INIT_LIST_HEAD(&entity->job.list);
>>>> +    mutex_init(&entity->job.mutex);
>>>> +
>>>> +    return drm_sched_entity_init(&entity->base,
>>>> +                     DRM_SCHED_PRIORITY_NORMAL,
>>>> +                     &sched, 1, NULL);
>>>> +}
>>>> +
>>>> +void
>>>> +nouveau_sched_entity_fini(struct nouveau_sched_entity *entity)
>>>> +{
>>>> +    drm_sched_entity_destroy(&entity->base);
>>>> +}
>>>> +
>>>> +static const struct drm_sched_backend_ops nouveau_sched_ops = {
>>>> +    .run_job = nouveau_sched_run_job,
>>>> +    .timedout_job = nouveau_sched_timedout_job,
>>>> +    .free_job = nouveau_sched_free_job,
>>>> +};
>>>> +
>>>> +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
>>>> +               struct nouveau_drm *drm)
>>>> +{
>>>> +    long job_hang_limit =
>>>> msecs_to_jiffies(NOUVEAU_SCHED_JOB_TIMEOUT_MS);
>>>> +
>>>> +    return drm_sched_init(sched, &nouveau_sched_ops,
>>>> +                  NOUVEAU_SCHED_HW_SUBMISSIONS, 0, job_hang_limit,
>>>> +                  NULL, NULL, "nouveau", drm->dev->dev);
>>>> +}
>>>> +
>>>> +void nouveau_sched_fini(struct drm_gpu_scheduler *sched)
>>>> +{
>>>> +    drm_sched_fini(sched);
>>>> +}
>>>> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h
>>>> b/drivers/gpu/drm/nouveau/nouveau_sched.h
>>>> new file mode 100644
>>>> index 000000000000..7fc5b7eea810
>>>> --- /dev/null
>>>> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.h
>>>> @@ -0,0 +1,98 @@
>>>> +// SPDX-License-Identifier: MIT
>>>> +
>>>> +#ifndef NOUVEAU_SCHED_H
>>>> +#define NOUVEAU_SCHED_H
>>>> +
>>>> +#include <linux/types.h>
>>>> +
>>>> +#include <drm/drm_exec.h>
>>>> +#include <drm/gpu_scheduler.h>
>>>> +
>>>> +#include "nouveau_drv.h"
>>>> +#include "nouveau_exec.h"
>>>> +
>>>> +#define to_nouveau_job(sched_job)        \
>>>> +        container_of((sched_job), struct nouveau_job, base)
>>>> +
>>>> +#define to_nouveau_exec_job(job)        \
>>>> +        container_of((job), struct nouveau_exec_job, base)
>>>> +
>>>> +#define to_nouveau_bind_job(job)        \
>>>> +        container_of((job), struct nouveau_bind_job, base)
>>>> +
>>>> +struct nouveau_job {
>>>> +    struct drm_sched_job base;
>>>> +    struct list_head head;
>>>> +
>>>> +    struct nouveau_sched_entity *entity;
>>>> +
>>>> +    struct drm_file *file_priv;
>>>> +    struct nouveau_cli *cli;
>>>> +    struct nouveau_channel *chan;
>>>> +
>>>> +    struct drm_exec exec;
>>>> +    struct dma_fence *done_fence;
>>>> +
>>>> +    bool sync;
>>>> +
>>>> +    struct {
>>>> +        struct drm_nouveau_sync *s;
>>>> +        u32 count;
>>>> +    } in_sync;
>>>> +
>>>> +    struct {
>>>> +        struct drm_nouveau_sync *s;
>>>> +        u32 count;
>>>> +    } out_sync;
>>>> +
>>>> +    struct nouveau_job_ops {
>>>> +        int (*submit)(struct nouveau_job *);
>>>> +        struct dma_fence *(*run)(struct nouveau_job *);
>>>> +        void (*free)(struct nouveau_job *);
>>>> +    } *ops;
>>>> +};
>>>> +
>>>> +struct nouveau_exec_job {
>>>> +    struct nouveau_job base;
>>>> +
>>>> +    struct {
>>>> +        struct drm_nouveau_exec_push *s;
>>>> +        u32 count;
>>>> +    } push;
>>>> +};
>>>> +
>>>> +struct nouveau_bind_job {
>>>> +    struct nouveau_job base;
>>>> +
>>>> +    /* struct bind_job_op */
>>>> +    struct list_head ops;
>>>> +};
>>>> +
>>>> +int nouveau_bind_job_init(struct nouveau_bind_job **job,
>>>> +              struct nouveau_exec_bind *bind);
>>>> +int nouveau_exec_job_init(struct nouveau_exec_job **job,
>>>> +              struct nouveau_exec *exec);
>>>> +
>>>> +int nouveau_job_submit(struct nouveau_job *job);
>>>> +void nouveau_job_fini(struct nouveau_job *job);
>>>> +
>>>> +#define to_nouveau_sched_entity(entity)        \
>>>> +        container_of((entity), struct nouveau_sched_entity, base)
>>>> +
>>>> +struct nouveau_sched_entity {
>>>> +    struct drm_sched_entity base;
>>>> +    struct {
>>>> +        struct list_head list;
>>>> +        struct mutex mutex;
>>>> +    } job;
>>>> +};
>>>> +
>>>> +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
>>>> +                  struct drm_gpu_scheduler *sched);
>>>> +void nouveau_sched_entity_fini(struct nouveau_sched_entity *entity);
>>>> +
>>>> +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
>>>> +               struct nouveau_drm *drm);
>>>> +void nouveau_sched_fini(struct drm_gpu_scheduler *sched);
>>>> +
>>>> +#endif
>>>
>>
>
Matthew Brost Jan. 19, 2023, 4:38 p.m. UTC | #7
On Thu, Jan 19, 2023 at 04:36:43PM +0100, Danilo Krummrich wrote:
> On 1/19/23 05:58, Matthew Brost wrote:
> > On Thu, Jan 19, 2023 at 04:44:23AM +0100, Danilo Krummrich wrote:
> > > On 1/18/23 21:37, Thomas Hellström (Intel) wrote:
> > > > 
> > > > On 1/18/23 07:12, Danilo Krummrich wrote:
> > > > > This commit provides the implementation for the new uapi motivated by the
> > > > > Vulkan API. It allows user mode drivers (UMDs) to:
> > > > > 
> > > > > 1) Initialize a GPU virtual address (VA) space via the new
> > > > >      DRM_IOCTL_NOUVEAU_VM_INIT ioctl for UMDs to specify the portion of VA
> > > > >      space managed by the kernel and userspace, respectively.
> > > > > 
> > > > > 2) Allocate and free a VA space region as well as bind and unbind memory
> > > > >      to the GPUs VA space via the new DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
> > > > >      UMDs can request the named operations to be processed either
> > > > >      synchronously or asynchronously. It supports DRM syncobjs
> > > > >      (incl. timelines) as synchronization mechanism. The management of the
> > > > >      GPU VA mappings is implemented with the DRM GPU VA manager.
> > > > > 
> > > > > 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl. The
> > > > >      execution happens asynchronously. It supports DRM syncobj (incl.
> > > > >      timelines) as synchronization mechanism. DRM GEM object locking is
> > > > >      handled with drm_exec.
> > > > > 
> > > > > Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC, use the DRM
> > > > > GPU scheduler for the asynchronous paths.
> > > > > 
> > > > > Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> > > > > ---
> > > > >    Documentation/gpu/driver-uapi.rst       |   3 +
> > > > >    drivers/gpu/drm/nouveau/Kbuild          |   2 +
> > > > >    drivers/gpu/drm/nouveau/Kconfig         |   2 +
> > > > >    drivers/gpu/drm/nouveau/nouveau_abi16.c |  16 +
> > > > >    drivers/gpu/drm/nouveau/nouveau_abi16.h |   1 +
> > > > >    drivers/gpu/drm/nouveau/nouveau_drm.c   |  23 +-
> > > > >    drivers/gpu/drm/nouveau/nouveau_drv.h   |   9 +-
> > > > >    drivers/gpu/drm/nouveau/nouveau_exec.c  | 310 ++++++++++
> > > > >    drivers/gpu/drm/nouveau/nouveau_exec.h  |  55 ++
> > > > >    drivers/gpu/drm/nouveau/nouveau_sched.c | 780 ++++++++++++++++++++++++
> > > > >    drivers/gpu/drm/nouveau/nouveau_sched.h |  98 +++
> > > > >    11 files changed, 1295 insertions(+), 4 deletions(-)
> > > > >    create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.c
> > > > >    create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.h
> > > > >    create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.c
> > > > >    create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.h
> > > > ...
> > > > > 
> > > > > +static struct dma_fence *
> > > > > +nouveau_bind_job_run(struct nouveau_job *job)
> > > > > +{
> > > > > +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
> > > > > +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(job->cli);
> > > > > +    struct bind_job_op *op;
> > > > > +    int ret = 0;
> > > > > +
> > > > 
> > > > I was looking at how nouveau does the async binding compared to how xe
> > > > does it.
> > > > It looks to me that this function being a scheduler run_job callback is
> > > > the main part of the VM_BIND dma-fence signalling critical section for
> > > > the job's done_fence and if so, needs to be annotated as such?
> > > 
> > > Yes, that's the case.
> > > 
> > > > 
> > > > For example nouveau_uvma_region_new allocates memory, which is not
> > > > allowed if in a dma_fence signalling critical section and the locking
> > > > also looks suspicious?
> > > 
> > > Thanks for pointing this out, I missed that somehow.
> > > 
> > > I will change it to pre-allocate new regions, mappings and page tables
> > > within the job's submit() function.
> > > 
> > 
> > Yeah, that's basically what we do in Xe: in the IOCTL step we allocate all
> > the backing store for new page tables and populate the new page tables
> > (these are not yet visible in the page table structure), and in the last
> > step, which is executed after all the dependencies are satisfied, we
> > program all the leaf entries, making the new binding visible.
> > 
> > We have screwed this up by deferring most of the IOCTL to a worker but
> > will fix this one way or another soon - get rid of the worker or
> > introduce a type of sync that is signaled after the worker + publish the
> > dma-fence in the worker. I'd like to close on this one soon.
> > > For the ops structures the drm_gpuva_manager allocates for reporting the
> > > split/merge steps back to the driver I have ideas to entirely avoid
> > > allocations, which also is a good thing in respect of Christian's feedback
> > > regarding the huge amount of mapping requests some applications seem to
> > > generate.
> > > 
> > 
> > It should be fine to have allocations to report the split/merge step as
> > this step should be before a dma-fence is published, but yea if possible
> > to avoid extra allocs as that is always better.
> 
> I think we can't really ask for the split/merge steps before actually
> running the job, since it requires the particular VA space not to change
> while performing those operations.
> 
> E.g. if we'd run the split/merge steps at job submit() time the underlying
> VA space could be changed by other bind jobs executing before this one,
> which would make the calculated split/merge steps obsolete and wrong.
> 

Hmm, maybe I'm not understanding this implementation, admittedly I
haven't studied the gpuva manager code in detail.

Let me explain what we are doing in Xe.

Map 0x0000 - 0x3000 -> this resolves into 1 bind operation and 1 VMA
Unmap 0x1000-0x2000 -> this resolves into 1 unbind and 2 rebind operations

1. unbind 0x0000-0x3000 -> destroy old VMA
2. rebind 0x0000-0x1000 -> new VMA
3. rebind 0x2000-0x3000 -> new VMA

All of the above steps for resolving the operations can be done in the
IOCTL phase, and the VM's VMA structure is updated there as well. When the
dependencies are resolved the actual bindings are done on the GPU. We use
the BO's dma-resv slots to ensure there is never a window in which
0x0000-0x1000 and 0x2000-0x3000 are unmapped with respect to execs (I
forget the exact details of how we do this, but if you want to know I'll
explain further).

Can we not use drm_gpuva_manager in a similar manner to generate the
ops + update the VM's VMA structure early? Again, maybe I'm missing
something here as I haven't fully studied the drm_gpuva_manager.

Matt

> Anyway, I should be able to get rid of all the allocations to make this
> safe.
> 
> > 
> > Also BTW, great work on drm_gpuva_manager too. We will almost likely
> > pick this up in Xe rather than open coding all of this as we currently
> > do. We should probably start the port to this soon so we can contribute
> > to the implementation and get both of our drivers upstream sooner.
> 
> Sounds great!
> 
> > > Regarding the locking, anything specific that makes it look suspicious to
> > > you?
> > > 
> > 
> > I haven't looked into this too but almost certainly Thomas is suggesting
> > that if you allocate memory anywhere under the nouveau_uvmm_lock then
> > you can't use this lock in the run_job() callback, as this is in the
> > dma-fencing path.
> 
> Oh, sure. I already checked that, luckily there aren't any further
> allocations under this lock, so this should be safe once I have changed
> the run_job() parts to pre-allocate in submit().
> 
> > 
> > Matt
> > 
> > > > 
> > > > Thanks,
> > > > 
> > > > Thomas
> > > > 
> > > > 
> > > > > +    nouveau_uvmm_lock(uvmm);
> > > > > +    list_for_each_op(op, &bind_job->ops) {
> > > > > +        switch (op->op) {
> > > > > +        case OP_ALLOC: {
> > > > > +            bool sparse = op->flags & DRM_NOUVEAU_VM_BIND_SPARSE;
> > > > > +
> > > > > +            ret = nouveau_uvma_region_new(uvmm,
> > > > > +                              op->va.addr,
> > > > > +                              op->va.range,
> > > > > +                              sparse);
> > > > > +            if (ret)
> > > > > +                goto out_unlock;
> > > > > +            break;
> > > > > +        }
> > > > > +        case OP_FREE:
> > > > > +            ret = nouveau_uvma_region_destroy(uvmm,
> > > > > +                              op->va.addr,
> > > > > +                              op->va.range);
> > > > > +            if (ret)
> > > > > +                goto out_unlock;
> > > > > +            break;
> > > > > +        case OP_MAP:
> > > > > +            ret = nouveau_uvmm_sm_map(uvmm,
> > > > > +                          op->va.addr, op->va.range,
> > > > > +                          op->gem.obj, op->gem.offset,
> > > > > +                          op->flags && 0xff);
> > > > > +            if (ret)
> > > > > +                goto out_unlock;
> > > > > +            break;
> > > > > +        case OP_UNMAP:
> > > > > +            ret = nouveau_uvmm_sm_unmap(uvmm,
> > > > > +                            op->va.addr,
> > > > > +                            op->va.range);
> > > > > +            if (ret)
> > > > > +                goto out_unlock;
> > > > > +            break;
> > > > > +        }
> > > > > +    }
> > > > > +
> > > > > +out_unlock:
> > > > > +    nouveau_uvmm_unlock(uvmm);
> > > > > +    if (ret)
> > > > > +        NV_PRINTK(err, job->cli, "bind job failed: %d\n", ret);
> > > > > +    return ERR_PTR(ret);
> > > > > +}
> > > > > +
> > > > > +static void
> > > > > +nouveau_bind_job_free(struct nouveau_job *job)
> > > > > +{
> > > > > +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
> > > > > +    struct bind_job_op *op, *next;
> > > > > +
> > > > > +    list_for_each_op_safe(op, next, &bind_job->ops) {
> > > > > +        struct drm_gem_object *obj = op->gem.obj;
> > > > > +
> > > > > +        if (obj)
> > > > > +            drm_gem_object_put(obj);
> > > > > +
> > > > > +        list_del(&op->entry);
> > > > > +        kfree(op);
> > > > > +    }
> > > > > +
> > > > > +    nouveau_base_job_free(job);
> > > > > +    kfree(bind_job);
> > > > > +}
> > > > > +
> > > > > +static struct nouveau_job_ops nouveau_bind_job_ops = {
> > > > > +    .submit = nouveau_bind_job_submit,
> > > > > +    .run = nouveau_bind_job_run,
> > > > > +    .free = nouveau_bind_job_free,
> > > > > +};
> > > > > +
> > > > > +static int
> > > > > +bind_job_op_from_uop(struct bind_job_op **pop,
> > > > > +             struct drm_nouveau_vm_bind_op *uop)
> > > > > +{
> > > > > +    struct bind_job_op *op;
> > > > > +
> > > > > +    op = *pop = kzalloc(sizeof(*op), GFP_KERNEL);
> > > > > +    if (!op)
> > > > > +        return -ENOMEM;
> > > > > +
> > > > > +    op->op = uop->op;
> > > > > +    op->flags = uop->flags;
> > > > > +    op->va.addr = uop->addr;
> > > > > +    op->va.range = uop->range;
> > > > > +
> > > > > +    if (op->op == DRM_NOUVEAU_VM_BIND_OP_MAP) {
> > > > > +        op->gem.handle = uop->handle;
> > > > > +        op->gem.offset = uop->bo_offset;
> > > > > +    }
> > > > > +
> > > > > +    return 0;
> > > > > +}
> > > > > +
> > > > > +static void
> > > > > +bind_job_ops_free(struct list_head *ops)
> > > > > +{
> > > > > +    struct bind_job_op *op, *next;
> > > > > +
> > > > > +    list_for_each_op_safe(op, next, ops) {
> > > > > +        list_del(&op->entry);
> > > > > +        kfree(op);
> > > > > +    }
> > > > > +}
> > > > > +
> > > > > +int
> > > > > +nouveau_bind_job_init(struct nouveau_bind_job **pjob,
> > > > > +              struct nouveau_exec_bind *bind)
> > > > > +{
> > > > > +    struct nouveau_bind_job *job;
> > > > > +    struct bind_job_op *op;
> > > > > +    int i, ret;
> > > > > +
> > > > > +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
> > > > > +    if (!job)
> > > > > +        return -ENOMEM;
> > > > > +
> > > > > +    INIT_LIST_HEAD(&job->ops);
> > > > > +
> > > > > +    for (i = 0; i < bind->op.count; i++) {
> > > > > +        ret = bind_job_op_from_uop(&op, &bind->op.s[i]);
> > > > > +        if (ret)
> > > > > +            goto err_free;
> > > > > +
> > > > > +        list_add_tail(&op->entry, &job->ops);
> > > > > +    }
> > > > > +
> > > > > +    job->base.sync = !(bind->flags & DRM_NOUVEAU_VM_BIND_RUN_ASYNC);
> > > > > +    job->base.ops = &nouveau_bind_job_ops;
> > > > > +
> > > > > +    ret = nouveau_base_job_init(&job->base, &bind->base);
> > > > > +    if (ret)
> > > > > +        goto err_free;
> > > > > +
> > > > > +    return 0;
> > > > > +
> > > > > +err_free:
> > > > > +    bind_job_ops_free(&job->ops);
> > > > > +    kfree(job);
> > > > > +    *pjob = NULL;
> > > > > +
> > > > > +    return ret;
> > > > > +}
> > > > > +
> > > > > +static int
> > > > > +sync_find_fence(struct nouveau_job *job,
> > > > > +        struct drm_nouveau_sync *sync,
> > > > > +        struct dma_fence **fence)
> > > > > +{
> > > > > +    u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
> > > > > +    u64 point = 0;
> > > > > +    int ret;
> > > > > +
> > > > > +    if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
> > > > > +        stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
> > > > > +        return -EOPNOTSUPP;
> > > > > +
> > > > > +    if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
> > > > > +        point = sync->timeline_value;
> > > > > +
> > > > > +    ret = drm_syncobj_find_fence(job->file_priv,
> > > > > +                     sync->handle, point,
> > > > > +                     sync->flags, fence);
> > > > > +    if (ret)
> > > > > +        return ret;
> > > > > +
> > > > > +    return 0;
> > > > > +}
> > > > > +
> > > > > +static int
> > > > > +exec_job_binds_wait(struct nouveau_job *job)
> > > > > +{
> > > > > +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> > > > > +    struct nouveau_cli *cli = exec_job->base.cli;
> > > > > +    struct nouveau_sched_entity *bind_entity = &cli->sched_entity;
> > > > > +    signed long ret;
> > > > > +    int i;
> > > > > +
> > > > > +    for (i = 0; i < job->in_sync.count; i++) {
> > > > > +        struct nouveau_job *it;
> > > > > +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
> > > > > +        struct dma_fence *fence;
> > > > > +        bool found;
> > > > > +
> > > > > +        ret = sync_find_fence(job, sync, &fence);
> > > > > +        if (ret)
> > > > > +            return ret;
> > > > > +
> > > > > +        mutex_lock(&bind_entity->job.mutex);
> > > > > +        found = false;
> > > > > +        list_for_each_entry(it, &bind_entity->job.list, head) {
> > > > > +            if (fence == it->done_fence) {
> > > > > +                found = true;
> > > > > +                break;
> > > > > +            }
> > > > > +        }
> > > > > +        mutex_unlock(&bind_entity->job.mutex);
> > > > > +
> > > > > +        /* If the fence is not from a VM_BIND job, don't wait for it. */
> > > > > +        if (!found)
> > > > > +            continue;
> > > > > +
> > > > > +        ret = dma_fence_wait_timeout(fence, true,
> > > > > +                         msecs_to_jiffies(500));
> > > > > +        if (ret < 0)
> > > > > +            return ret;
> > > > > +        else if (ret == 0)
> > > > > +            return -ETIMEDOUT;
> > > > > +    }
> > > > > +
> > > > > +    return 0;
> > > > > +}
> > > > > +
> > > > > +int
> > > > > +nouveau_exec_job_submit(struct nouveau_job *job)
> > > > > +{
> > > > > +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> > > > > +    struct nouveau_cli *cli = exec_job->base.cli;
> > > > > +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(cli);
> > > > > +    struct drm_exec *exec = &job->exec;
> > > > > +    struct drm_gem_object *obj;
> > > > > +    unsigned long index;
> > > > > +    int ret;
> > > > > +
> > > > > +    ret = exec_job_binds_wait(job);
> > > > > +    if (ret)
> > > > > +        return ret;
> > > > > +
> > > > > +    nouveau_uvmm_lock(uvmm);
> > > > > +    drm_exec_while_not_all_locked(exec) {
> > > > > +        struct drm_gpuva *va;
> > > > > +
> > > > > +        drm_gpuva_for_each_va(va, &uvmm->umgr) {
> > > > > +            ret = drm_exec_prepare_obj(exec, va->gem.obj, 1);
> > > > > +            drm_exec_break_on_contention(exec);
> > > > > +            if (ret)
> > > > > +                return ret;
> > > > > +        }
> > > > > +    }
> > > > > +    nouveau_uvmm_unlock(uvmm);
> > > > > +
> > > > > +    drm_exec_for_each_locked_object(exec, index, obj) {
> > > > > +        struct dma_resv *resv = obj->resv;
> > > > > +        struct nouveau_bo *nvbo = nouveau_gem_object(obj);
> > > > > +
> > > > > +        ret = nouveau_bo_validate(nvbo, true, false);
> > > > > +        if (ret)
> > > > > +            return ret;
> > > > > +
> > > > > +        dma_resv_add_fence(resv, job->done_fence, DMA_RESV_USAGE_WRITE);
> > > > > +    }
> > > > > +
> > > > > +    return 0;
> > > > > +}
> > > > > +
> > > > > +static struct dma_fence *
> > > > > +nouveau_exec_job_run(struct nouveau_job *job)
> > > > > +{
> > > > > +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> > > > > +    struct nouveau_fence *fence;
> > > > > +    int i, ret;
> > > > > +
> > > > > +    ret = nouveau_dma_wait(job->chan, exec_job->push.count + 1, 16);
> > > > > +    if (ret) {
> > > > > +        NV_PRINTK(err, job->cli, "nv50cal_space: %d\n", ret);
> > > > > +        return ERR_PTR(ret);
> > > > > +    }
> > > > > +
> > > > > +    for (i = 0; i < exec_job->push.count; i++) {
> > > > > +        nv50_dma_push(job->chan, exec_job->push.s[i].va,
> > > > > +                  exec_job->push.s[i].va_len);
> > > > > +    }
> > > > > +
> > > > > +    ret = nouveau_fence_new(job->chan, false, &fence);
> > > > > +    if (ret) {
> > > > > +        NV_PRINTK(err, job->cli, "error fencing pushbuf: %d\n", ret);
> > > > > +        WIND_RING(job->chan);
> > > > > +        return ERR_PTR(ret);
> > > > > +    }
> > > > > +
> > > > > +    return &fence->base;
> > > > > +}
> > > > > +static void
> > > > > +nouveau_exec_job_free(struct nouveau_job *job)
> > > > > +{
> > > > > +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> > > > > +
> > > > > +    nouveau_base_job_free(job);
> > > > > +
> > > > > +    kfree(exec_job->push.s);
> > > > > +    kfree(exec_job);
> > > > > +}
> > > > > +
> > > > > +static struct nouveau_job_ops nouveau_exec_job_ops = {
> > > > > +    .submit = nouveau_exec_job_submit,
> > > > > +    .run = nouveau_exec_job_run,
> > > > > +    .free = nouveau_exec_job_free,
> > > > > +};
> > > > > +
> > > > > +int
> > > > > +nouveau_exec_job_init(struct nouveau_exec_job **pjob,
> > > > > +              struct nouveau_exec *exec)
> > > > > +{
> > > > > +    struct nouveau_exec_job *job;
> > > > > +    int ret;
> > > > > +
> > > > > +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
> > > > > +    if (!job)
> > > > > +        return -ENOMEM;
> > > > > +
> > > > > +    job->push.count = exec->push.count;
> > > > > +    job->push.s = kmemdup(exec->push.s,
> > > > > +                  sizeof(*exec->push.s) *
> > > > > +                  exec->push.count,
> > > > > +                  GFP_KERNEL);
> > > > > +    if (!job->push.s) {
> > > > > +        ret = -ENOMEM;
> > > > > +        goto err_free_job;
> > > > > +    }
> > > > > +
> > > > > +    job->base.ops = &nouveau_exec_job_ops;
> > > > > +    ret = nouveau_base_job_init(&job->base, &exec->base);
> > > > > +    if (ret)
> > > > > +        goto err_free_pushs;
> > > > > +
> > > > > +    return 0;
> > > > > +
> > > > > +err_free_pushs:
> > > > > +    kfree(job->push.s);
> > > > > +err_free_job:
> > > > > +    kfree(job);
> > > > > +    *pjob = NULL;
> > > > > +
> > > > > +    return ret;
> > > > > +}
> > > > > +
> > > > > +void nouveau_job_fini(struct nouveau_job *job)
> > > > > +{
> > > > > +    dma_fence_put(job->done_fence);
> > > > > +    drm_sched_job_cleanup(&job->base);
> > > > > +    job->ops->free(job);
> > > > > +}
> > > > > +
> > > > > +static int
> > > > > +nouveau_job_add_deps(struct nouveau_job *job)
> > > > > +{
> > > > > +    struct dma_fence *in_fence = NULL;
> > > > > +    int ret, i;
> > > > > +
> > > > > +    for (i = 0; i < job->in_sync.count; i++) {
> > > > > +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
> > > > > +
> > > > > +        ret = sync_find_fence(job, sync, &in_fence);
> > > > > +        if (ret) {
> > > > > +            NV_PRINTK(warn, job->cli,
> > > > > +                  "Failed to find syncobj (-> in): handle=%d\n",
> > > > > +                  sync->handle);
> > > > > +            return ret;
> > > > > +        }
> > > > > +
> > > > > +        ret = drm_sched_job_add_dependency(&job->base, in_fence);
> > > > > +        if (ret)
> > > > > +            return ret;
> > > > > +    }
> > > > > +
> > > > > +    return 0;
> > > > > +}
> > > > > +
> > > > > +static int
> > > > > +nouveau_job_fence_attach(struct nouveau_job *job, struct dma_fence
> > > > > *fence)
> > > > > +{
> > > > > +    struct drm_syncobj *out_sync;
> > > > > +    int i;
> > > > > +
> > > > > +    for (i = 0; i < job->out_sync.count; i++) {
> > > > > +        struct drm_nouveau_sync *sync = &job->out_sync.s[i];
> > > > > +        u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
> > > > > +
> > > > > +        if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
> > > > > +            stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
> > > > > +            return -EOPNOTSUPP;
> > > > > +
> > > > > +        out_sync = drm_syncobj_find(job->file_priv, sync->handle);
> > > > > +        if (!out_sync) {
> > > > > +            NV_PRINTK(warn, job->cli,
> > > > > +                  "Failed to find syncobj (-> out): handle=%d\n",
> > > > > +                  sync->handle);
> > > > > +            return -ENOENT;
> > > > > +        }
> > > > > +
> > > > > +        if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ) {
> > > > > +            struct dma_fence_chain *chain;
> > > > > +
> > > > > +            chain = dma_fence_chain_alloc();
> > > > > +            if (!chain) {
> > > > > +                drm_syncobj_put(out_sync);
> > > > > +                return -ENOMEM;
> > > > > +            }
> > > > > +
> > > > > +            drm_syncobj_add_point(out_sync, chain, fence,
> > > > > +                          sync->timeline_value);
> > > > > +        } else {
> > > > > +            drm_syncobj_replace_fence(out_sync, fence);
> > > > > +        }
> > > > > +
> > > > > +        drm_syncobj_put(out_sync);
> > > > > +    }
> > > > > +
> > > > > +    return 0;
> > > > > +}
> > > > > +
> > > > > +static struct dma_fence *
> > > > > +nouveau_job_run(struct nouveau_job *job)
> > > > > +{
> > > > > +    return job->ops->run(job);
> > > > > +}
> > > > > +
> > > > > +static int
> > > > > +nouveau_job_run_sync(struct nouveau_job *job)
> > > > > +{
> > > > > +    struct dma_fence *fence;
> > > > > +    int ret;
> > > > > +
> > > > > +    fence = nouveau_job_run(job);
> > > > > +    if (IS_ERR(fence)) {
> > > > > +        return PTR_ERR(fence);
> > > > > +    } else if (fence) {
> > > > > +        ret = dma_fence_wait(fence, true);
> > > > > +        if (ret)
> > > > > +            return ret;
> > > > > +    }
> > > > > +
> > > > > +    dma_fence_signal(job->done_fence);
> > > > > +
> > > > > +    return 0;
> > > > > +}
> > > > > +
> > > > > +int
> > > > > +nouveau_job_submit(struct nouveau_job *job)
> > > > > +{
> > > > > +    struct nouveau_sched_entity *entity =
> > > > > to_nouveau_sched_entity(job->base.entity);
> > > > > +    int ret;
> > > > > +
> > > > > +    drm_exec_init(&job->exec, true);
> > > > > +
> > > > > +    ret = nouveau_job_add_deps(job);
> > > > > +    if (ret)
> > > > > +        goto out;
> > > > > +
> > > > > +    drm_sched_job_arm(&job->base);
> > > > > +    job->done_fence = dma_fence_get(&job->base.s_fence->finished);
> > > > > +
> > > > > +    ret = nouveau_job_fence_attach(job, job->done_fence);
> > > > > +    if (ret)
> > > > > +        goto out;
> > > > > +
> > > > > +    if (job->ops->submit) {
> > > > > +        ret = job->ops->submit(job);
> > > > > +        if (ret)
> > > > > +            goto out;
> > > > > +    }
> > > > > +
> > > > > +    if (job->sync) {
> > > > > +        drm_exec_fini(&job->exec);
> > > > > +
> > > > > +        /* We're requested to run a synchronous job, hence don't push
> > > > > +         * the job, bypassing the job scheduler, and execute the jobs
> > > > > +         * run() function right away.
> > > > > +         *
> > > > > +         * As a consequence of bypassing the job scheduler we need to
> > > > > +         * handle fencing and job cleanup ourselfes.
> > > > > +         */
> > > > > +        ret = nouveau_job_run_sync(job);
> > > > > +
> > > > > +        /* If the job fails, the caller will do the cleanup for us. */
> > > > > +        if (!ret)
> > > > > +            nouveau_job_fini(job);
> > > > > +
> > > > > +        return ret;
> > > > > +    } else {
> > > > > +        mutex_lock(&entity->job.mutex);
> > > > > +        drm_sched_entity_push_job(&job->base);
> > > > > +        list_add_tail(&job->head, &entity->job.list);
> > > > > +        mutex_unlock(&entity->job.mutex);
> > > > > +    }
> > > > > +
> > > > > +out:
> > > > > +    drm_exec_fini(&job->exec);
> > > > > +    return ret;
> > > > > +}
> > > > > +
> > > > > +static struct dma_fence *
> > > > > +nouveau_sched_run_job(struct drm_sched_job *sched_job)
> > > > > +{
> > > > > +    struct nouveau_job *job = to_nouveau_job(sched_job);
> > > > > +
> > > > > +    return nouveau_job_run(job);
> > > > > +}
> > > > > +
> > > > > +static enum drm_gpu_sched_stat
> > > > > +nouveau_sched_timedout_job(struct drm_sched_job *sched_job)
> > > > > +{
> > > > > +    struct nouveau_job *job = to_nouveau_job(sched_job);
> > > > > +    struct nouveau_channel *chan = job->chan;
> > > > > +
> > > > > +    if (unlikely(!atomic_read(&chan->killed)))
> > > > > +        nouveau_channel_kill(chan);
> > > > > +
> > > > > +    NV_PRINTK(warn, job->cli, "job timeout, channel %d killed!\n",
> > > > > +          chan->chid);
> > > > > +
> > > > > +    nouveau_sched_entity_fini(job->entity);
> > > > > +
> > > > > +    return DRM_GPU_SCHED_STAT_ENODEV;
> > > > > +}
> > > > > +
> > > > > +static void
> > > > > +nouveau_sched_free_job(struct drm_sched_job *sched_job)
> > > > > +{
> > > > > +    struct nouveau_job *job = to_nouveau_job(sched_job);
> > > > > +    struct nouveau_sched_entity *entity = job->entity;
> > > > > +
> > > > > +    mutex_lock(&entity->job.mutex);
> > > > > +    list_del(&job->head);
> > > > > +    mutex_unlock(&entity->job.mutex);
> > > > > +
> > > > > +    nouveau_job_fini(job);
> > > > > +}
> > > > > +
> > > > > +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
> > > > > +                  struct drm_gpu_scheduler *sched)
> > > > > +{
> > > > > +
> > > > > +    INIT_LIST_HEAD(&entity->job.list);
> > > > > +    mutex_init(&entity->job.mutex);
> > > > > +
> > > > > +    return drm_sched_entity_init(&entity->base,
> > > > > +                     DRM_SCHED_PRIORITY_NORMAL,
> > > > > +                     &sched, 1, NULL);
> > > > > +}
> > > > > +
> > > > > +void
> > > > > +nouveau_sched_entity_fini(struct nouveau_sched_entity *entity)
> > > > > +{
> > > > > +    drm_sched_entity_destroy(&entity->base);
> > > > > +}
> > > > > +
> > > > > +static const struct drm_sched_backend_ops nouveau_sched_ops = {
> > > > > +    .run_job = nouveau_sched_run_job,
> > > > > +    .timedout_job = nouveau_sched_timedout_job,
> > > > > +    .free_job = nouveau_sched_free_job,
> > > > > +};
> > > > > +
> > > > > +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
> > > > > +               struct nouveau_drm *drm)
> > > > > +{
> > > > > +    long job_hang_limit =
> > > > > msecs_to_jiffies(NOUVEAU_SCHED_JOB_TIMEOUT_MS);
> > > > > +
> > > > > +    return drm_sched_init(sched, &nouveau_sched_ops,
> > > > > +                  NOUVEAU_SCHED_HW_SUBMISSIONS, 0, job_hang_limit,
> > > > > +                  NULL, NULL, "nouveau", drm->dev->dev);
> > > > > +}
> > > > > +
> > > > > +void nouveau_sched_fini(struct drm_gpu_scheduler *sched)
> > > > > +{
> > > > > +    drm_sched_fini(sched);
> > > > > +}
> > > > > diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h
> > > > > b/drivers/gpu/drm/nouveau/nouveau_sched.h
> > > > > new file mode 100644
> > > > > index 000000000000..7fc5b7eea810
> > > > > --- /dev/null
> > > > > +++ b/drivers/gpu/drm/nouveau/nouveau_sched.h
> > > > > @@ -0,0 +1,98 @@
> > > > > +// SPDX-License-Identifier: MIT
> > > > > +
> > > > > +#ifndef NOUVEAU_SCHED_H
> > > > > +#define NOUVEAU_SCHED_H
> > > > > +
> > > > > +#include <linux/types.h>
> > > > > +
> > > > > +#include <drm/drm_exec.h>
> > > > > +#include <drm/gpu_scheduler.h>
> > > > > +
> > > > > +#include "nouveau_drv.h"
> > > > > +#include "nouveau_exec.h"
> > > > > +
> > > > > +#define to_nouveau_job(sched_job)        \
> > > > > +        container_of((sched_job), struct nouveau_job, base)
> > > > > +
> > > > > +#define to_nouveau_exec_job(job)        \
> > > > > +        container_of((job), struct nouveau_exec_job, base)
> > > > > +
> > > > > +#define to_nouveau_bind_job(job)        \
> > > > > +        container_of((job), struct nouveau_bind_job, base)
> > > > > +
> > > > > +struct nouveau_job {
> > > > > +    struct drm_sched_job base;
> > > > > +    struct list_head head;
> > > > > +
> > > > > +    struct nouveau_sched_entity *entity;
> > > > > +
> > > > > +    struct drm_file *file_priv;
> > > > > +    struct nouveau_cli *cli;
> > > > > +    struct nouveau_channel *chan;
> > > > > +
> > > > > +    struct drm_exec exec;
> > > > > +    struct dma_fence *done_fence;
> > > > > +
> > > > > +    bool sync;
> > > > > +
> > > > > +    struct {
> > > > > +        struct drm_nouveau_sync *s;
> > > > > +        u32 count;
> > > > > +    } in_sync;
> > > > > +
> > > > > +    struct {
> > > > > +        struct drm_nouveau_sync *s;
> > > > > +        u32 count;
> > > > > +    } out_sync;
> > > > > +
> > > > > +    struct nouveau_job_ops {
> > > > > +        int (*submit)(struct nouveau_job *);
> > > > > +        struct dma_fence *(*run)(struct nouveau_job *);
> > > > > +        void (*free)(struct nouveau_job *);
> > > > > +    } *ops;
> > > > > +};
> > > > > +
> > > > > +struct nouveau_exec_job {
> > > > > +    struct nouveau_job base;
> > > > > +
> > > > > +    struct {
> > > > > +        struct drm_nouveau_exec_push *s;
> > > > > +        u32 count;
> > > > > +    } push;
> > > > > +};
> > > > > +
> > > > > +struct nouveau_bind_job {
> > > > > +    struct nouveau_job base;
> > > > > +
> > > > > +    /* struct bind_job_op */
> > > > > +    struct list_head ops;
> > > > > +};
> > > > > +
> > > > > +int nouveau_bind_job_init(struct nouveau_bind_job **job,
> > > > > +              struct nouveau_exec_bind *bind);
> > > > > +int nouveau_exec_job_init(struct nouveau_exec_job **job,
> > > > > +              struct nouveau_exec *exec);
> > > > > +
> > > > > +int nouveau_job_submit(struct nouveau_job *job);
> > > > > +void nouveau_job_fini(struct nouveau_job *job);
> > > > > +
> > > > > +#define to_nouveau_sched_entity(entity)        \
> > > > > +        container_of((entity), struct nouveau_sched_entity, base)
> > > > > +
> > > > > +struct nouveau_sched_entity {
> > > > > +    struct drm_sched_entity base;
> > > > > +    struct {
> > > > > +        struct list_head list;
> > > > > +        struct mutex mutex;
> > > > > +    } job;
> > > > > +};
> > > > > +
> > > > > +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
> > > > > +                  struct drm_gpu_scheduler *sched);
> > > > > +void nouveau_sched_entity_fini(struct nouveau_sched_entity *entity);
> > > > > +
> > > > > +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
> > > > > +               struct nouveau_drm *drm);
> > > > > +void nouveau_sched_fini(struct drm_gpu_scheduler *sched);
> > > > > +
> > > > > +#endif
> > > > 
> > > 
> > 
>
Danilo Krummrich Jan. 19, 2023, 5:46 p.m. UTC | #8
On 1/19/23 17:38, Matthew Brost wrote:
> On Thu, Jan 19, 2023 at 04:36:43PM +0100, Danilo Krummrich wrote:
>> On 1/19/23 05:58, Matthew Brost wrote:
>>> On Thu, Jan 19, 2023 at 04:44:23AM +0100, Danilo Krummrich wrote:
>>>> On 1/18/23 21:37, Thomas Hellström (Intel) wrote:
>>>>>
>>>>> On 1/18/23 07:12, Danilo Krummrich wrote:
>>>>>> This commit provides the implementation for the new uapi motivated by the
>>>>>> Vulkan API. It allows user mode drivers (UMDs) to:
>>>>>>
>>>>>> 1) Initialize a GPU virtual address (VA) space via the new
>>>>>>       DRM_IOCTL_NOUVEAU_VM_INIT ioctl for UMDs to specify the portion of VA
>>>>>>       space managed by the kernel and userspace, respectively.
>>>>>>
>>>>>> 2) Allocate and free a VA space region as well as bind and unbind memory
>>>>>>       to the GPUs VA space via the new DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
>>>>>>       UMDs can request the named operations to be processed either
>>>>>>       synchronously or asynchronously. It supports DRM syncobjs
>>>>>>       (incl. timelines) as synchronization mechanism. The management of the
>>>>>>       GPU VA mappings is implemented with the DRM GPU VA manager.
>>>>>>
>>>>>> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl. The
>>>>>>       execution happens asynchronously. It supports DRM syncobj (incl.
>>>>>>       timelines) as synchronization mechanism. DRM GEM object locking is
>>>>>>       handled with drm_exec.
>>>>>>
>>>>>> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC, use the DRM
>>>>>> GPU scheduler for the asynchronous paths.
>>>>>>
>>>>>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>>>>>> ---
>>>>>>     Documentation/gpu/driver-uapi.rst       |   3 +
>>>>>>     drivers/gpu/drm/nouveau/Kbuild          |   2 +
>>>>>>     drivers/gpu/drm/nouveau/Kconfig         |   2 +
>>>>>>     drivers/gpu/drm/nouveau/nouveau_abi16.c |  16 +
>>>>>>     drivers/gpu/drm/nouveau/nouveau_abi16.h |   1 +
>>>>>>     drivers/gpu/drm/nouveau/nouveau_drm.c   |  23 +-
>>>>>>     drivers/gpu/drm/nouveau/nouveau_drv.h   |   9 +-
>>>>>>     drivers/gpu/drm/nouveau/nouveau_exec.c  | 310 ++++++++++
>>>>>>     drivers/gpu/drm/nouveau/nouveau_exec.h  |  55 ++
>>>>>>     drivers/gpu/drm/nouveau/nouveau_sched.c | 780 ++++++++++++++++++++++++
>>>>>>     drivers/gpu/drm/nouveau/nouveau_sched.h |  98 +++
>>>>>>     11 files changed, 1295 insertions(+), 4 deletions(-)
>>>>>>     create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.c
>>>>>>     create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.h
>>>>>>     create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.c
>>>>>>     create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.h
>>>>> ...
>>>>>>
>>>>>> +static struct dma_fence *
>>>>>> +nouveau_bind_job_run(struct nouveau_job *job)
>>>>>> +{
>>>>>> +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
>>>>>> +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(job->cli);
>>>>>> +    struct bind_job_op *op;
>>>>>> +    int ret = 0;
>>>>>> +
>>>>>
>>>>> I was looking at how nouveau does the async binding compared to how xe
>>>>> does it.
>>>>> It looks to me that this function being a scheduler run_job callback is
>>>>> the main part of the VM_BIND dma-fence signalling critical section for
>>>>> the job's done_fence and if so, needs to be annotated as such?
>>>>
>>>> Yes, that's the case.
>>>>
>>>>>
>>>>> For example nouveau_uvma_region_new allocates memory, which is not
>>>>> allowed if in a dma_fence signalling critical section and the locking
>>>>> also looks suspicious?
>>>>
>>>> Thanks for pointing this out, I missed that somehow.
>>>>
>>>> I will change it to pre-allocate new regions, mappings and page tables
>>>> within the job's submit() function.
>>>>
>>>
>>> Yea, that's basically what we do in Xe: in the IOCTL step allocate all the
>>> backing store for new page tables, populate new page tables (these are
>>> not yet visible in the page table structure), and in the last step, which
>>> is executed after all the dependencies are satisfied, program all the leaf
>>> entries, making the new binding visible.
>>>
>>> We have screwed this up by deferring most of the IOCTL to a worker but
>>> will fix this one way or another soon - get rid of the worker or
>>> introduce a type of sync that is signaled after the worker + publish the
>>> dma-fence in the worker. I'd like to close on this one soon.
>>>> For the ops structures the drm_gpuva_manager allocates for reporting the
>>>> split/merge steps back to the driver I have ideas to entirely avoid
>>>> allocations, which also is a good thing with respect to Christian's
>>>> feedback regarding the huge amount of mapping requests some applications
>>>> seem to generate.
>>>>
>>>
>>> It should be fine to have allocations to report the split/merge step as
>>> this step should be before a dma-fence is published, but yea if possible
>>> to avoid extra allocs as that is always better.
>>
>> I think we can't really ask for the split/merge steps before actually
>> running the job, since it requires the particular VA space not to change
>> while performing those operations.
>>
>> E.g. if we'd run the split/merge steps at job submit() time the underlying
>> VA space could be changed by other bind jobs executing before this one,
>> which would make the calculated split/merge steps obsolete and wrong.
>>
> 
> Hmm, maybe I'm not understanding this implementation, admittedly I
> haven't studied the gpuva manager code in detail.

The limitation I mentioned above doesn't really come from the 
drm_gpuva_manager, but from how the driver executes the jobs.

> 
> Let me explain what we are doing in Xe.
> 
> Map 0x0000 - 0x3000 -> this resolves into 1 bind operation and 1 VMA
> Unmap 0x1000-0x2000 -> this resolves into 1 unbind and 2 rebind operations
> 
> 1. unbind 0x0000-0x3000 -> destroy old VMA
> 2. rebind 0x0000-0x1000 -> new VMA
> 3. rebind 0x2000-0x3000 -> new VMA
> 
> All of the above steps resolving the operations can be done in the IOCTL
> phase and the VM's VMA structure is also updated. When the dependencies
> are resolved the actual bindings are done on the GPU. We use the BO's
> dma-resv slots to ensure there is never a window in which 0x0000-0x1000
> and 0x2000-0x3000 are not mapped with respect to execs (I forget the exact
> details of how we do this but if you want to know I'll explain further).

Ok, so you're not only generating the split/merge steps without updating
the view of the VA space (which would cause the issue I described) but
are also already changing the view of the VA space in the IOCTL, before the
actual page table update happens later on, right?

Currently, in nouveau I do both, the actual page table update and the 
range allocator update, in run_job(), such that walking the allocator 
always represents the actual page table layout.

How do you handle map/unmap on BO eviction?

> 
> Can we not use drm_gpuva_manager in a similar manner to generate the
> ops + update the VM's VMA structure early? Again, maybe I'm missing
> something here as I haven't fully studied the drm_gpuva_manager.

You can use the drm_gpuva_manager in exactly the way you just described.
Though, in your concrete example it would generate just 1 unbind and 1
bind, which it would combine into a re-bind operation. A re-bind operation
always has 1 unbind and up to 2 (but a minimum of 1) bind (sub-)operations.

Rebind:
     1. unbind 0x0000-0x3000
     2. NULL
     3.   bind 0x1000-0x3000

It's then up to the driver to remove the old gpuva entry and add a new 
one. With the given re-bind operation the driver can conclude to just do 
a partial page table update from 0x0000-0x1000.

- Danilo

> 
> Matt
> 
>> Anyway, I should be able to get rid of all the allocations to make this
>> safe.
>>
>>>
>>> Also BTW, great work on drm_gpuva_manager too. We will most likely
>>> pick this up in Xe rather than open coding all of this as we currently
>>> do. We should probably start the port to this soon so we can contribute
>>> to the implementation and get both of our drivers upstream sooner.
>>
>> Sounds great!
>>
>>>> Regarding the locking, anything specific that makes it look suspicious to
>>>> you?
>>>>
>>>
>>> I haven't looked into this either but almost certainly Thomas is
>>> suggesting that if you allocate memory anywhere under the
>>> nouveau_uvmm_lock then you can't use this lock in the run_job() callback
>>> as this is in the dma-fencing path.
>>
>> Oh, sure. I already checked that, luckily there aren't any further
>> allocations under this lock, so this should be safe once I've changed the
>> run_job() parts to use pre-allocation in submit().
>>
>>>
>>> Matt
>>>
>>>>>
>>>>> Thanks,
>>>>>
>>>>> Thomas
>>>>>
>>>>>
>>>>>> +    nouveau_uvmm_lock(uvmm);
>>>>>> +    list_for_each_op(op, &bind_job->ops) {
>>>>>> +        switch (op->op) {
>>>>>> +        case OP_ALLOC: {
>>>>>> +            bool sparse = op->flags & DRM_NOUVEAU_VM_BIND_SPARSE;
>>>>>> +
>>>>>> +            ret = nouveau_uvma_region_new(uvmm,
>>>>>> +                              op->va.addr,
>>>>>> +                              op->va.range,
>>>>>> +                              sparse);
>>>>>> +            if (ret)
>>>>>> +                goto out_unlock;
>>>>>> +            break;
>>>>>> +        }
>>>>>> +        case OP_FREE:
>>>>>> +            ret = nouveau_uvma_region_destroy(uvmm,
>>>>>> +                              op->va.addr,
>>>>>> +                              op->va.range);
>>>>>> +            if (ret)
>>>>>> +                goto out_unlock;
>>>>>> +            break;
>>>>>> +        case OP_MAP:
>>>>>> +            ret = nouveau_uvmm_sm_map(uvmm,
>>>>>> +                          op->va.addr, op->va.range,
>>>>>> +                          op->gem.obj, op->gem.offset,
>>>>>> +                          op->flags && 0xff);
>>>>>> +            if (ret)
>>>>>> +                goto out_unlock;
>>>>>> +            break;
>>>>>> +        case OP_UNMAP:
>>>>>> +            ret = nouveau_uvmm_sm_unmap(uvmm,
>>>>>> +                            op->va.addr,
>>>>>> +                            op->va.range);
>>>>>> +            if (ret)
>>>>>> +                goto out_unlock;
>>>>>> +            break;
>>>>>> +        }
>>>>>> +    }
>>>>>> +
>>>>>> +out_unlock:
>>>>>> +    nouveau_uvmm_unlock(uvmm);
>>>>>> +    if (ret)
>>>>>> +        NV_PRINTK(err, job->cli, "bind job failed: %d\n", ret);
>>>>>> +    return ERR_PTR(ret);
>>>>>> +}
>>>>>> +
>>>>>> +static void
>>>>>> +nouveau_bind_job_free(struct nouveau_job *job)
>>>>>> +{
>>>>>> +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
>>>>>> +    struct bind_job_op *op, *next;
>>>>>> +
>>>>>> +    list_for_each_op_safe(op, next, &bind_job->ops) {
>>>>>> +        struct drm_gem_object *obj = op->gem.obj;
>>>>>> +
>>>>>> +        if (obj)
>>>>>> +            drm_gem_object_put(obj);
>>>>>> +
>>>>>> +        list_del(&op->entry);
>>>>>> +        kfree(op);
>>>>>> +    }
>>>>>> +
>>>>>> +    nouveau_base_job_free(job);
>>>>>> +    kfree(bind_job);
>>>>>> +}
>>>>>> +
>>>>>> +static struct nouveau_job_ops nouveau_bind_job_ops = {
>>>>>> +    .submit = nouveau_bind_job_submit,
>>>>>> +    .run = nouveau_bind_job_run,
>>>>>> +    .free = nouveau_bind_job_free,
>>>>>> +};
>>>>>> +
>>>>>> +static int
>>>>>> +bind_job_op_from_uop(struct bind_job_op **pop,
>>>>>> +             struct drm_nouveau_vm_bind_op *uop)
>>>>>> +{
>>>>>> +    struct bind_job_op *op;
>>>>>> +
>>>>>> +    op = *pop = kzalloc(sizeof(*op), GFP_KERNEL);
>>>>>> +    if (!op)
>>>>>> +        return -ENOMEM;
>>>>>> +
>>>>>> +    op->op = uop->op;
>>>>>> +    op->flags = uop->flags;
>>>>>> +    op->va.addr = uop->addr;
>>>>>> +    op->va.range = uop->range;
>>>>>> +
>>>>>> +    if (op->op == DRM_NOUVEAU_VM_BIND_OP_MAP) {
>>>>>> +        op->gem.handle = uop->handle;
>>>>>> +        op->gem.offset = uop->bo_offset;
>>>>>> +    }
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static void
>>>>>> +bind_job_ops_free(struct list_head *ops)
>>>>>> +{
>>>>>> +    struct bind_job_op *op, *next;
>>>>>> +
>>>>>> +    list_for_each_op_safe(op, next, ops) {
>>>>>> +        list_del(&op->entry);
>>>>>> +        kfree(op);
>>>>>> +    }
>>>>>> +}
>>>>>> +
>>>>>> +int
>>>>>> +nouveau_bind_job_init(struct nouveau_bind_job **pjob,
>>>>>> +              struct nouveau_exec_bind *bind)
>>>>>> +{
>>>>>> +    struct nouveau_bind_job *job;
>>>>>> +    struct bind_job_op *op;
>>>>>> +    int i, ret;
>>>>>> +
>>>>>> +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
>>>>>> +    if (!job)
>>>>>> +        return -ENOMEM;
>>>>>> +
>>>>>> +    INIT_LIST_HEAD(&job->ops);
>>>>>> +
>>>>>> +    for (i = 0; i < bind->op.count; i++) {
>>>>>> +        ret = bind_job_op_from_uop(&op, &bind->op.s[i]);
>>>>>> +        if (ret)
>>>>>> +            goto err_free;
>>>>>> +
>>>>>> +        list_add_tail(&op->entry, &job->ops);
>>>>>> +    }
>>>>>> +
>>>>>> +    job->base.sync = !(bind->flags & DRM_NOUVEAU_VM_BIND_RUN_ASYNC);
>>>>>> +    job->base.ops = &nouveau_bind_job_ops;
>>>>>> +
>>>>>> +    ret = nouveau_base_job_init(&job->base, &bind->base);
>>>>>> +    if (ret)
>>>>>> +        goto err_free;
>>>>>> +
>>>>>> +    return 0;
>>>>>> +
>>>>>> +err_free:
>>>>>> +    bind_job_ops_free(&job->ops);
>>>>>> +    kfree(job);
>>>>>> +    *pjob = NULL;
>>>>>> +
>>>>>> +    return ret;
>>>>>> +}
>>>>>> +
>>>>>> +static int
>>>>>> +sync_find_fence(struct nouveau_job *job,
>>>>>> +        struct drm_nouveau_sync *sync,
>>>>>> +        struct dma_fence **fence)
>>>>>> +{
>>>>>> +    u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
>>>>>> +    u64 point = 0;
>>>>>> +    int ret;
>>>>>> +
>>>>>> +    if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
>>>>>> +        stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
>>>>>> +        return -EOPNOTSUPP;
>>>>>> +
>>>>>> +    if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
>>>>>> +        point = sync->timeline_value;
>>>>>> +
>>>>>> +    ret = drm_syncobj_find_fence(job->file_priv,
>>>>>> +                     sync->handle, point,
>>>>>> +                     sync->flags, fence);
>>>>>> +    if (ret)
>>>>>> +        return ret;
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static int
>>>>>> +exec_job_binds_wait(struct nouveau_job *job)
>>>>>> +{
>>>>>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>>>>>> +    struct nouveau_cli *cli = exec_job->base.cli;
>>>>>> +    struct nouveau_sched_entity *bind_entity = &cli->sched_entity;
>>>>>> +    signed long ret;
>>>>>> +    int i;
>>>>>> +
>>>>>> +    for (i = 0; i < job->in_sync.count; i++) {
>>>>>> +        struct nouveau_job *it;
>>>>>> +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
>>>>>> +        struct dma_fence *fence;
>>>>>> +        bool found;
>>>>>> +
>>>>>> +        ret = sync_find_fence(job, sync, &fence);
>>>>>> +        if (ret)
>>>>>> +            return ret;
>>>>>> +
>>>>>> +        mutex_lock(&bind_entity->job.mutex);
>>>>>> +        found = false;
>>>>>> +        list_for_each_entry(it, &bind_entity->job.list, head) {
>>>>>> +            if (fence == it->done_fence) {
>>>>>> +                found = true;
>>>>>> +                break;
>>>>>> +            }
>>>>>> +        }
>>>>>> +        mutex_unlock(&bind_entity->job.mutex);
>>>>>> +
>>>>>> +        /* If the fence is not from a VM_BIND job, don't wait for it. */
>>>>>> +        if (!found)
>>>>>> +            continue;
>>>>>> +
>>>>>> +        ret = dma_fence_wait_timeout(fence, true,
>>>>>> +                         msecs_to_jiffies(500));
>>>>>> +        if (ret < 0)
>>>>>> +            return ret;
>>>>>> +        else if (ret == 0)
>>>>>> +            return -ETIMEDOUT;
>>>>>> +    }
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +int
>>>>>> +nouveau_exec_job_submit(struct nouveau_job *job)
>>>>>> +{
>>>>>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>>>>>> +    struct nouveau_cli *cli = exec_job->base.cli;
>>>>>> +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(cli);
>>>>>> +    struct drm_exec *exec = &job->exec;
>>>>>> +    struct drm_gem_object *obj;
>>>>>> +    unsigned long index;
>>>>>> +    int ret;
>>>>>> +
>>>>>> +    ret = exec_job_binds_wait(job);
>>>>>> +    if (ret)
>>>>>> +        return ret;
>>>>>> +
>>>>>> +    nouveau_uvmm_lock(uvmm);
>>>>>> +    drm_exec_while_not_all_locked(exec) {
>>>>>> +        struct drm_gpuva *va;
>>>>>> +
>>>>>> +        drm_gpuva_for_each_va(va, &uvmm->umgr) {
>>>>>> +            ret = drm_exec_prepare_obj(exec, va->gem.obj, 1);
>>>>>> +            drm_exec_break_on_contention(exec);
>>>>>> +            if (ret)
>>>>>> +                return ret;
>>>>>> +        }
>>>>>> +    }
>>>>>> +    nouveau_uvmm_unlock(uvmm);
>>>>>> +
>>>>>> +    drm_exec_for_each_locked_object(exec, index, obj) {
>>>>>> +        struct dma_resv *resv = obj->resv;
>>>>>> +        struct nouveau_bo *nvbo = nouveau_gem_object(obj);
>>>>>> +
>>>>>> +        ret = nouveau_bo_validate(nvbo, true, false);
>>>>>> +        if (ret)
>>>>>> +            return ret;
>>>>>> +
>>>>>> +        dma_resv_add_fence(resv, job->done_fence, DMA_RESV_USAGE_WRITE);
>>>>>> +    }
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static struct dma_fence *
>>>>>> +nouveau_exec_job_run(struct nouveau_job *job)
>>>>>> +{
>>>>>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>>>>>> +    struct nouveau_fence *fence;
>>>>>> +    int i, ret;
>>>>>> +
>>>>>> +    ret = nouveau_dma_wait(job->chan, exec_job->push.count + 1, 16);
>>>>>> +    if (ret) {
>>>>>> +        NV_PRINTK(err, job->cli, "nv50cal_space: %d\n", ret);
>>>>>> +        return ERR_PTR(ret);
>>>>>> +    }
>>>>>> +
>>>>>> +    for (i = 0; i < exec_job->push.count; i++) {
>>>>>> +        nv50_dma_push(job->chan, exec_job->push.s[i].va,
>>>>>> +                  exec_job->push.s[i].va_len);
>>>>>> +    }
>>>>>> +
>>>>>> +    ret = nouveau_fence_new(job->chan, false, &fence);
>>>>>> +    if (ret) {
>>>>>> +        NV_PRINTK(err, job->cli, "error fencing pushbuf: %d\n", ret);
>>>>>> +        WIND_RING(job->chan);
>>>>>> +        return ERR_PTR(ret);
>>>>>> +    }
>>>>>> +
>>>>>> +    return &fence->base;
>>>>>> +}
>>>>>> +static void
>>>>>> +nouveau_exec_job_free(struct nouveau_job *job)
>>>>>> +{
>>>>>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>>>>>> +
>>>>>> +    nouveau_base_job_free(job);
>>>>>> +
>>>>>> +    kfree(exec_job->push.s);
>>>>>> +    kfree(exec_job);
>>>>>> +}
>>>>>> +
>>>>>> +static struct nouveau_job_ops nouveau_exec_job_ops = {
>>>>>> +    .submit = nouveau_exec_job_submit,
>>>>>> +    .run = nouveau_exec_job_run,
>>>>>> +    .free = nouveau_exec_job_free,
>>>>>> +};
>>>>>> +
>>>>>> +int
>>>>>> +nouveau_exec_job_init(struct nouveau_exec_job **pjob,
>>>>>> +              struct nouveau_exec *exec)
>>>>>> +{
>>>>>> +    struct nouveau_exec_job *job;
>>>>>> +    int ret;
>>>>>> +
>>>>>> +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
>>>>>> +    if (!job)
>>>>>> +        return -ENOMEM;
>>>>>> +
>>>>>> +    job->push.count = exec->push.count;
>>>>>> +    job->push.s = kmemdup(exec->push.s,
>>>>>> +                  sizeof(*exec->push.s) *
>>>>>> +                  exec->push.count,
>>>>>> +                  GFP_KERNEL);
>>>>>> +    if (!job->push.s) {
>>>>>> +        ret = -ENOMEM;
>>>>>> +        goto err_free_job;
>>>>>> +    }
>>>>>> +
>>>>>> +    job->base.ops = &nouveau_exec_job_ops;
>>>>>> +    ret = nouveau_base_job_init(&job->base, &exec->base);
>>>>>> +    if (ret)
>>>>>> +        goto err_free_pushs;
>>>>>> +
>>>>>> +    return 0;
>>>>>> +
>>>>>> +err_free_pushs:
>>>>>> +    kfree(job->push.s);
>>>>>> +err_free_job:
>>>>>> +    kfree(job);
>>>>>> +    *pjob = NULL;
>>>>>> +
>>>>>> +    return ret;
>>>>>> +}
>>>>>> +
>>>>>> +void nouveau_job_fini(struct nouveau_job *job)
>>>>>> +{
>>>>>> +    dma_fence_put(job->done_fence);
>>>>>> +    drm_sched_job_cleanup(&job->base);
>>>>>> +    job->ops->free(job);
>>>>>> +}
>>>>>> +
>>>>>> +static int
>>>>>> +nouveau_job_add_deps(struct nouveau_job *job)
>>>>>> +{
>>>>>> +    struct dma_fence *in_fence = NULL;
>>>>>> +    int ret, i;
>>>>>> +
>>>>>> +    for (i = 0; i < job->in_sync.count; i++) {
>>>>>> +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
>>>>>> +
>>>>>> +        ret = sync_find_fence(job, sync, &in_fence);
>>>>>> +        if (ret) {
>>>>>> +            NV_PRINTK(warn, job->cli,
>>>>>> +                  "Failed to find syncobj (-> in): handle=%d\n",
>>>>>> +                  sync->handle);
>>>>>> +            return ret;
>>>>>> +        }
>>>>>> +
>>>>>> +        ret = drm_sched_job_add_dependency(&job->base, in_fence);
>>>>>> +        if (ret)
>>>>>> +            return ret;
>>>>>> +    }
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static int
>>>>>> +nouveau_job_fence_attach(struct nouveau_job *job, struct dma_fence
>>>>>> *fence)
>>>>>> +{
>>>>>> +    struct drm_syncobj *out_sync;
>>>>>> +    int i;
>>>>>> +
>>>>>> +    for (i = 0; i < job->out_sync.count; i++) {
>>>>>> +        struct drm_nouveau_sync *sync = &job->out_sync.s[i];
>>>>>> +        u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
>>>>>> +
>>>>>> +        if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
>>>>>> +            stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
>>>>>> +            return -EOPNOTSUPP;
>>>>>> +
>>>>>> +        out_sync = drm_syncobj_find(job->file_priv, sync->handle);
>>>>>> +        if (!out_sync) {
>>>>>> +            NV_PRINTK(warn, job->cli,
>>>>>> +                  "Failed to find syncobj (-> out): handle=%d\n",
>>>>>> +                  sync->handle);
>>>>>> +            return -ENOENT;
>>>>>> +        }
>>>>>> +
>>>>>> +        if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ) {
>>>>>> +            struct dma_fence_chain *chain;
>>>>>> +
>>>>>> +            chain = dma_fence_chain_alloc();
>>>>>> +            if (!chain) {
>>>>>> +                drm_syncobj_put(out_sync);
>>>>>> +                return -ENOMEM;
>>>>>> +            }
>>>>>> +
>>>>>> +            drm_syncobj_add_point(out_sync, chain, fence,
>>>>>> +                          sync->timeline_value);
>>>>>> +        } else {
>>>>>> +            drm_syncobj_replace_fence(out_sync, fence);
>>>>>> +        }
>>>>>> +
>>>>>> +        drm_syncobj_put(out_sync);
>>>>>> +    }
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static struct dma_fence *
>>>>>> +nouveau_job_run(struct nouveau_job *job)
>>>>>> +{
>>>>>> +    return job->ops->run(job);
>>>>>> +}
>>>>>> +
>>>>>> +static int
>>>>>> +nouveau_job_run_sync(struct nouveau_job *job)
>>>>>> +{
>>>>>> +    struct dma_fence *fence;
>>>>>> +    int ret;
>>>>>> +
>>>>>> +    fence = nouveau_job_run(job);
>>>>>> +    if (IS_ERR(fence)) {
>>>>>> +        return PTR_ERR(fence);
>>>>>> +    } else if (fence) {
>>>>>> +        ret = dma_fence_wait(fence, true);
>>>>>> +        if (ret)
>>>>>> +            return ret;
>>>>>> +    }
>>>>>> +
>>>>>> +    dma_fence_signal(job->done_fence);
>>>>>> +
>>>>>> +    return 0;
>>>>>> +}
>>>>>> +
>>>>>> +int
>>>>>> +nouveau_job_submit(struct nouveau_job *job)
>>>>>> +{
>>>>>> +    struct nouveau_sched_entity *entity =
>>>>>> to_nouveau_sched_entity(job->base.entity);
>>>>>> +    int ret;
>>>>>> +
>>>>>> +    drm_exec_init(&job->exec, true);
>>>>>> +
>>>>>> +    ret = nouveau_job_add_deps(job);
>>>>>> +    if (ret)
>>>>>> +        goto out;
>>>>>> +
>>>>>> +    drm_sched_job_arm(&job->base);
>>>>>> +    job->done_fence = dma_fence_get(&job->base.s_fence->finished);
>>>>>> +
>>>>>> +    ret = nouveau_job_fence_attach(job, job->done_fence);
>>>>>> +    if (ret)
>>>>>> +        goto out;
>>>>>> +
>>>>>> +    if (job->ops->submit) {
>>>>>> +        ret = job->ops->submit(job);
>>>>>> +        if (ret)
>>>>>> +            goto out;
>>>>>> +    }
>>>>>> +
>>>>>> +    if (job->sync) {
>>>>>> +        drm_exec_fini(&job->exec);
>>>>>> +
>>>>>> +        /* We're requested to run a synchronous job, hence don't push
>>>>>> +         * the job, bypassing the job scheduler, and execute the jobs
>>>>>> +         * run() function right away.
>>>>>> +         *
>>>>>> +         * As a consequence of bypassing the job scheduler we need to
>>>>>> +         * handle fencing and job cleanup ourselfes.
>>>>>> +         */
>>>>>> +        ret = nouveau_job_run_sync(job);
>>>>>> +
>>>>>> +        /* If the job fails, the caller will do the cleanup for us. */
>>>>>> +        if (!ret)
>>>>>> +            nouveau_job_fini(job);
>>>>>> +
>>>>>> +        return ret;
>>>>>> +    } else {
>>>>>> +        mutex_lock(&entity->job.mutex);
>>>>>> +        drm_sched_entity_push_job(&job->base);
>>>>>> +        list_add_tail(&job->head, &entity->job.list);
>>>>>> +        mutex_unlock(&entity->job.mutex);
>>>>>> +    }
>>>>>> +
>>>>>> +out:
>>>>>> +    drm_exec_fini(&job->exec);
>>>>>> +    return ret;
>>>>>> +}
>>>>>> +
>>>>>> +static struct dma_fence *
>>>>>> +nouveau_sched_run_job(struct drm_sched_job *sched_job)
>>>>>> +{
>>>>>> +    struct nouveau_job *job = to_nouveau_job(sched_job);
>>>>>> +
>>>>>> +    return nouveau_job_run(job);
>>>>>> +}
>>>>>> +
>>>>>> +static enum drm_gpu_sched_stat
>>>>>> +nouveau_sched_timedout_job(struct drm_sched_job *sched_job)
>>>>>> +{
>>>>>> +    struct nouveau_job *job = to_nouveau_job(sched_job);
>>>>>> +    struct nouveau_channel *chan = job->chan;
>>>>>> +
>>>>>> +    if (unlikely(!atomic_read(&chan->killed)))
>>>>>> +        nouveau_channel_kill(chan);
>>>>>> +
>>>>>> +    NV_PRINTK(warn, job->cli, "job timeout, channel %d killed!\n",
>>>>>> +          chan->chid);
>>>>>> +
>>>>>> +    nouveau_sched_entity_fini(job->entity);
>>>>>> +
>>>>>> +    return DRM_GPU_SCHED_STAT_ENODEV;
>>>>>> +}
>>>>>> +
>>>>>> +static void
>>>>>> +nouveau_sched_free_job(struct drm_sched_job *sched_job)
>>>>>> +{
>>>>>> +    struct nouveau_job *job = to_nouveau_job(sched_job);
>>>>>> +    struct nouveau_sched_entity *entity = job->entity;
>>>>>> +
>>>>>> +    mutex_lock(&entity->job.mutex);
>>>>>> +    list_del(&job->head);
>>>>>> +    mutex_unlock(&entity->job.mutex);
>>>>>> +
>>>>>> +    nouveau_job_fini(job);
>>>>>> +}
>>>>>> +
>>>>>> +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
>>>>>> +                  struct drm_gpu_scheduler *sched)
>>>>>> +{
>>>>>> +
>>>>>> +    INIT_LIST_HEAD(&entity->job.list);
>>>>>> +    mutex_init(&entity->job.mutex);
>>>>>> +
>>>>>> +    return drm_sched_entity_init(&entity->base,
>>>>>> +                     DRM_SCHED_PRIORITY_NORMAL,
>>>>>> +                     &sched, 1, NULL);
>>>>>> +}
>>>>>> +
>>>>>> +void
>>>>>> +nouveau_sched_entity_fini(struct nouveau_sched_entity *entity)
>>>>>> +{
>>>>>> +    drm_sched_entity_destroy(&entity->base);
>>>>>> +}
>>>>>> +
>>>>>> +static const struct drm_sched_backend_ops nouveau_sched_ops = {
>>>>>> +    .run_job = nouveau_sched_run_job,
>>>>>> +    .timedout_job = nouveau_sched_timedout_job,
>>>>>> +    .free_job = nouveau_sched_free_job,
>>>>>> +};
>>>>>> +
>>>>>> +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
>>>>>> +               struct nouveau_drm *drm)
>>>>>> +{
>>>>>> +    long job_hang_limit =
>>>>>> msecs_to_jiffies(NOUVEAU_SCHED_JOB_TIMEOUT_MS);
>>>>>> +
>>>>>> +    return drm_sched_init(sched, &nouveau_sched_ops,
>>>>>> +                  NOUVEAU_SCHED_HW_SUBMISSIONS, 0, job_hang_limit,
>>>>>> +                  NULL, NULL, "nouveau", drm->dev->dev);
>>>>>> +}
>>>>>> +
>>>>>> +void nouveau_sched_fini(struct drm_gpu_scheduler *sched)
>>>>>> +{
>>>>>> +    drm_sched_fini(sched);
>>>>>> +}
>>>>>> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h
>>>>>> b/drivers/gpu/drm/nouveau/nouveau_sched.h
>>>>>> new file mode 100644
>>>>>> index 000000000000..7fc5b7eea810
>>>>>> --- /dev/null
>>>>>> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.h
>>>>>> @@ -0,0 +1,98 @@
>>>>>> +// SPDX-License-Identifier: MIT
>>>>>> +
>>>>>> +#ifndef NOUVEAU_SCHED_H
>>>>>> +#define NOUVEAU_SCHED_H
>>>>>> +
>>>>>> +#include <linux/types.h>
>>>>>> +
>>>>>> +#include <drm/drm_exec.h>
>>>>>> +#include <drm/gpu_scheduler.h>
>>>>>> +
>>>>>> +#include "nouveau_drv.h"
>>>>>> +#include "nouveau_exec.h"
>>>>>> +
>>>>>> +#define to_nouveau_job(sched_job)        \
>>>>>> +        container_of((sched_job), struct nouveau_job, base)
>>>>>> +
>>>>>> +#define to_nouveau_exec_job(job)        \
>>>>>> +        container_of((job), struct nouveau_exec_job, base)
>>>>>> +
>>>>>> +#define to_nouveau_bind_job(job)        \
>>>>>> +        container_of((job), struct nouveau_bind_job, base)
>>>>>> +
>>>>>> +struct nouveau_job {
>>>>>> +    struct drm_sched_job base;
>>>>>> +    struct list_head head;
>>>>>> +
>>>>>> +    struct nouveau_sched_entity *entity;
>>>>>> +
>>>>>> +    struct drm_file *file_priv;
>>>>>> +    struct nouveau_cli *cli;
>>>>>> +    struct nouveau_channel *chan;
>>>>>> +
>>>>>> +    struct drm_exec exec;
>>>>>> +    struct dma_fence *done_fence;
>>>>>> +
>>>>>> +    bool sync;
>>>>>> +
>>>>>> +    struct {
>>>>>> +        struct drm_nouveau_sync *s;
>>>>>> +        u32 count;
>>>>>> +    } in_sync;
>>>>>> +
>>>>>> +    struct {
>>>>>> +        struct drm_nouveau_sync *s;
>>>>>> +        u32 count;
>>>>>> +    } out_sync;
>>>>>> +
>>>>>> +    struct nouveau_job_ops {
>>>>>> +        int (*submit)(struct nouveau_job *);
>>>>>> +        struct dma_fence *(*run)(struct nouveau_job *);
>>>>>> +        void (*free)(struct nouveau_job *);
>>>>>> +    } *ops;
>>>>>> +};
>>>>>> +
>>>>>> +struct nouveau_exec_job {
>>>>>> +    struct nouveau_job base;
>>>>>> +
>>>>>> +    struct {
>>>>>> +        struct drm_nouveau_exec_push *s;
>>>>>> +        u32 count;
>>>>>> +    } push;
>>>>>> +};
>>>>>> +
>>>>>> +struct nouveau_bind_job {
>>>>>> +    struct nouveau_job base;
>>>>>> +
>>>>>> +    /* struct bind_job_op */
>>>>>> +    struct list_head ops;
>>>>>> +};
>>>>>> +
>>>>>> +int nouveau_bind_job_init(struct nouveau_bind_job **job,
>>>>>> +              struct nouveau_exec_bind *bind);
>>>>>> +int nouveau_exec_job_init(struct nouveau_exec_job **job,
>>>>>> +              struct nouveau_exec *exec);
>>>>>> +
>>>>>> +int nouveau_job_submit(struct nouveau_job *job);
>>>>>> +void nouveau_job_fini(struct nouveau_job *job);
>>>>>> +
>>>>>> +#define to_nouveau_sched_entity(entity)        \
>>>>>> +        container_of((entity), struct nouveau_sched_entity, base)
>>>>>> +
>>>>>> +struct nouveau_sched_entity {
>>>>>> +    struct drm_sched_entity base;
>>>>>> +    struct {
>>>>>> +        struct list_head list;
>>>>>> +        struct mutex mutex;
>>>>>> +    } job;
>>>>>> +};
>>>>>> +
>>>>>> +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
>>>>>> +                  struct drm_gpu_scheduler *sched);
>>>>>> +void nouveau_sched_entity_fini(struct nouveau_sched_entity *entity);
>>>>>> +
>>>>>> +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
>>>>>> +               struct nouveau_drm *drm);
>>>>>> +void nouveau_sched_fini(struct drm_gpu_scheduler *sched);
>>>>>> +
>>>>>> +#endif
>>>>>
>>>>
>>>
>>
>
Matthew Brost Jan. 19, 2023, 9:47 p.m. UTC | #9
On Thu, Jan 19, 2023 at 06:46:30PM +0100, Danilo Krummrich wrote:
> 
> 
> On 1/19/23 17:38, Matthew Brost wrote:
> > On Thu, Jan 19, 2023 at 04:36:43PM +0100, Danilo Krummrich wrote:
> > > On 1/19/23 05:58, Matthew Brost wrote:
> > > > On Thu, Jan 19, 2023 at 04:44:23AM +0100, Danilo Krummrich wrote:
> > > > > On 1/18/23 21:37, Thomas Hellström (Intel) wrote:
> > > > > > 
> > > > > > On 1/18/23 07:12, Danilo Krummrich wrote:
> > > > > > > This commit provides the implementation for the new uapi motivated by the
> > > > > > > Vulkan API. It allows user mode drivers (UMDs) to:
> > > > > > > 
> > > > > > > 1) Initialize a GPU virtual address (VA) space via the new
> > > > > > >       DRM_IOCTL_NOUVEAU_VM_INIT ioctl for UMDs to specify the portion of VA
> > > > > > >       space managed by the kernel and userspace, respectively.
> > > > > > > 
> > > > > > > 2) Allocate and free a VA space region as well as bind and unbind memory
> > > > > > >       to the GPUs VA space via the new DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
> > > > > > >       UMDs can request the named operations to be processed either
> > > > > > >       synchronously or asynchronously. It supports DRM syncobjs
> > > > > > >       (incl. timelines) as synchronization mechanism. The management of the
> > > > > > >       GPU VA mappings is implemented with the DRM GPU VA manager.
> > > > > > > 
> > > > > > > 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl. The
> > > > > > >       execution happens asynchronously. It supports DRM syncobj (incl.
> > > > > > >       timelines) as synchronization mechanism. DRM GEM object locking is
> > > > > > >       handled with drm_exec.
> > > > > > > 
> > > > > > > Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC, use the DRM
> > > > > > > GPU scheduler for the asynchronous paths.
> > > > > > > 
> > > > > > > Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> > > > > > > ---
> > > > > > >     Documentation/gpu/driver-uapi.rst       |   3 +
> > > > > > >     drivers/gpu/drm/nouveau/Kbuild          |   2 +
> > > > > > >     drivers/gpu/drm/nouveau/Kconfig         |   2 +
> > > > > > >     drivers/gpu/drm/nouveau/nouveau_abi16.c |  16 +
> > > > > > >     drivers/gpu/drm/nouveau/nouveau_abi16.h |   1 +
> > > > > > >     drivers/gpu/drm/nouveau/nouveau_drm.c   |  23 +-
> > > > > > >     drivers/gpu/drm/nouveau/nouveau_drv.h   |   9 +-
> > > > > > >     drivers/gpu/drm/nouveau/nouveau_exec.c  | 310 ++++++++++
> > > > > > >     drivers/gpu/drm/nouveau/nouveau_exec.h  |  55 ++
> > > > > > >     drivers/gpu/drm/nouveau/nouveau_sched.c | 780 ++++++++++++++++++++++++
> > > > > > >     drivers/gpu/drm/nouveau/nouveau_sched.h |  98 +++
> > > > > > >     11 files changed, 1295 insertions(+), 4 deletions(-)
> > > > > > >     create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.c
> > > > > > >     create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.h
> > > > > > >     create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.c
> > > > > > >     create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.h
> > > > > > ...
> > > > > > > 
> > > > > > > +static struct dma_fence *
> > > > > > > +nouveau_bind_job_run(struct nouveau_job *job)
> > > > > > > +{
> > > > > > > +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
> > > > > > > +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(job->cli);
> > > > > > > +    struct bind_job_op *op;
> > > > > > > +    int ret = 0;
> > > > > > > +
> > > > > > 
> > > > > > I was looking at how nouveau does the async binding compared to how xe
> > > > > > does it.
> > > > > > It looks to me that this function being a scheduler run_job callback is
> > > > > > the main part of the VM_BIND dma-fence signalling critical section for
> > > > > > the job's done_fence and if so, needs to be annotated as such?
> > > > > 
> > > > > Yes, that's the case.
> > > > > 
> > > > > > 
> > > > > > For example nouveau_uvma_region_new allocates memory, which is not
> > > > > > allowed if in a dma_fence signalling critical section and the locking
> > > > > > also looks suspicious?
> > > > > 
> > > > > Thanks for pointing this out, I missed that somehow.
> > > > > 
> > > > > I will change it to pre-allocate new regions, mappings and page tables
> > > > > within the job's submit() function.
> > > > > 
> > > > 
> > > > Yea, that's what we basically do in Xe: in the IOCTL step allocate all the
> > > > backing store for new page tables, populate new page tables (these are
> > > > not yet visible in the page table structure), and in the last step, which is
> > > > executed after all the dependencies are satisfied, program all the leaf
> > > > entries making the new binding visible.
> > > > 
> > > > We have screwed this up by deferring most of the IOCTL to a worker but
> > > > will fix this one way or another soon - get rid of the worker or
> > > > introduce a type of sync that is signaled after the worker + publish the
> > > > dma-fence in the worker. I'd like to close on this one soon.
> > > > > For the ops structures the drm_gpuva_manager allocates for reporting the
> > > > > split/merge steps back to the driver I have ideas to entirely avoid
> > > > > allocations, which is also a good thing with respect to Christian's feedback
> > > > > regarding the huge amount of mapping requests some applications seem to
> > > > > generate.
> > > > > 
> > > > 
> > > > It should be fine to have allocations to report the split/merge step as
> > > > this step should be before a dma-fence is published, but yea if possible
> > > > to avoid extra allocs as that is always better.
> > > 
> > > I think we can't really ask for the split/merge steps before actually
> > > running the job, since it requires the particular VA space not to change
> > > while performing those operations.
> > > 
> > > E.g. if we'd run the split/merge steps at job submit() time the underlying
> > > VA space could be changed by other bind jobs executing before this one,
> > > which would make the calculated split/merge steps obsolete and wrong.
> > > 
> > 
> > Hmm, maybe I'm not understanding this implementation, admittedly I
> > haven't studied the gpuva manager code in detail.
> 
> The limitation I mentioned above doesn't really come from the
> drm_gpuva_manager, but from how the driver executes the jobs.
> 
> > 
> > Let me explain what we are doing in Xe.
> > 
> > Map 0x0000 - 0x3000 -> this resolves into 1 bind operation and 1 VMA
> > Unmap 0x1000-0x2000 -> this resolves into 1 unbind and 2 rebind operations
> > 
> > 1. unbind 0x0000-0x3000 -> destroy old VMA
> > 2. rebind 0x0000-0x1000 -> new VMA
> > 3. rebind 0x2000-0x3000 -> new VMA
> > 
> > All of the above steps resolving the operations can be done in the IOCTL
> > phase and VM's VMA structure is also updated. When the dependencies
> > are resolved the actual bindings are done on the GPU. We use the BO's
> > dma-resv slots to ensure there is never a window in which 0x0000-0x1000 and
> > 0x2000-0x3000 are not mapped with respect to execs (I forget the exact
> > details of how we do this but if you want to know I'll explain further).
> 
> Ok, so you're not only generating the split/merge steps without updating the
> view of the VA space (which would cause the issue I described) but also
> already change the view of the VA space in the IOCTL, before the actual page
> table update happens later on, right?
> 

Yes, we generate the operations + update the view VA space in the IOCTL
while the actual page table update on the GPU occurs later if there are
dependencies.

> Currently, in nouveau I do both, the actual page table update and the range
> allocator update, in run_job(), such that walking the allocator always
> represents the actual page table layout.
> 

In Xe the VA view is always as if all submitted bind / unbind ops have
completed, even if some are pending.

Also you may want to take a look at the generic page walker Thomas Hellstrom
wrote for Xe, which we use to program the page tables. It is pretty slick
and could probably be used in Nouveau:
https://patchwork.freedesktop.org/patch/515856/?series=112188&rev=1

In Xe xe_pt.c is the wrapper for this, it can map, unmap, and invalidate
VA:
https://cgit.freedesktop.org/drm/drm-xe/tree/drivers/gpu/drm/xe/xe_pt.c?h=drm-xe-next

> How do you handle map/unmap on BO eviction?
>

We use the dma-resv slots to order all of this.

Basically exec/map/unmap wait on pending BO moves (I believe moves are in the
KERNEL slot) and BO moves wait on pending exec/map/unmaps (I believe
these are in the BOOKKEEP slot).

I think more details are in the Xe kernel doc, see 'dma-resv usage':
https://cgit.freedesktop.org/drm/drm-xe/tree/drivers/gpu/drm/xe/xe_vm_doc.h?h=drm-xe-next 

This might be a little stale; we dropped the idea of
DMA_RESV_USAGE_PREEMPT_FENCE and it is now just in the BOOKKEEP slot, but I think
everything else is correct.

> > 
> > Can we not use the drm_gpuva_manager in a similar manner to generate the
> > ops + update the VM's VMA structure early? Again, maybe I'm missing
> > something here as I haven't fully studied the drm_gpuva_manager.
> 
> You can use the drm_gpuva_manager in exactly the way you just described.
> Though, in your concrete example it would generate just 1 unbind and 1 bind,
> which it would combine in a re-bind operation. A re-bind operation always
> has 1 unbind and up to 2 (but a minimum of 1) bind (sub-)operations.
>

Cool, this is what I wanted to hear. Hopefully we can get around to
building our VM / VMA management on top of the drm_gpuva_manager soon.

Matt
 
> Rebind:
>     1. unbind 0x0000-0x3000
>     2. NULL
>     3.   bind 0x1000-0x3000
> 
> It's then up to the driver to remove the old gpuva entry and add a new one.
> With the given re-bind operation the driver can conclude to just do a
> partial page table update from 0x0000-0x1000.
> 
> - Danilo
> 
> > 
> > Matt
> > 
> > > Anyway, I should be able to get rid of all the allocations to make this
> > > safe.
> > > 
> > > > 
> > > > Also BTW, great work on drm_gpuva_manager too. We will most likely
> > > > pick this up in Xe rather than open coding all of this as we currently
> > > > do. We should probably start the port to this soon so we can contribute
> > > > to the implementation and get both of our drivers upstream sooner.
> > > 
> > > Sounds great!
> > > 
> > > > > Regarding the locking, anything specific that makes it look suspicious to
> > > > > you?
> > > > > 
> > > > 
> > > > I haven't looked into this either, but almost certainly Thomas is suggesting
> > > > that if you allocate memory anywhere under the nouveau_uvmm_lock then
> > > > you can't use this lock in the run_job() callback as this is in the
> > > > dma-fencing path.
> > > 
> > > Oh, sure. I already checked that, luckily there aren't any further
> > > allocations under this lock, so this should be safe once I have changed the
> > > run_job() parts to pre-allocation in submit().
> > > 
> > > > 
> > > > Matt
> > > > 
> > > > > > 
> > > > > > Thanks,
> > > > > > 
> > > > > > Thomas
> > > > > > 
> > > > > > 
> > > > > > > +    nouveau_uvmm_lock(uvmm);
> > > > > > > +    list_for_each_op(op, &bind_job->ops) {
> > > > > > > +        switch (op->op) {
> > > > > > > +        case OP_ALLOC: {
> > > > > > > +            bool sparse = op->flags & DRM_NOUVEAU_VM_BIND_SPARSE;
> > > > > > > +
> > > > > > > +            ret = nouveau_uvma_region_new(uvmm,
> > > > > > > +                              op->va.addr,
> > > > > > > +                              op->va.range,
> > > > > > > +                              sparse);
> > > > > > > +            if (ret)
> > > > > > > +                goto out_unlock;
> > > > > > > +            break;
> > > > > > > +        }
> > > > > > > +        case OP_FREE:
> > > > > > > +            ret = nouveau_uvma_region_destroy(uvmm,
> > > > > > > +                              op->va.addr,
> > > > > > > +                              op->va.range);
> > > > > > > +            if (ret)
> > > > > > > +                goto out_unlock;
> > > > > > > +            break;
> > > > > > > +        case OP_MAP:
> > > > > > > +            ret = nouveau_uvmm_sm_map(uvmm,
> > > > > > > +                          op->va.addr, op->va.range,
> > > > > > > +                          op->gem.obj, op->gem.offset,
> > > > > > > +                          op->flags && 0xff);
> > > > > > > +            if (ret)
> > > > > > > +                goto out_unlock;
> > > > > > > +            break;
> > > > > > > +        case OP_UNMAP:
> > > > > > > +            ret = nouveau_uvmm_sm_unmap(uvmm,
> > > > > > > +                            op->va.addr,
> > > > > > > +                            op->va.range);
> > > > > > > +            if (ret)
> > > > > > > +                goto out_unlock;
> > > > > > > +            break;
> > > > > > > +        }
> > > > > > > +    }
> > > > > > > +
> > > > > > > +out_unlock:
> > > > > > > +    nouveau_uvmm_unlock(uvmm);
> > > > > > > +    if (ret)
> > > > > > > +        NV_PRINTK(err, job->cli, "bind job failed: %d\n", ret);
> > > > > > > +    return ERR_PTR(ret);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void
> > > > > > > +nouveau_bind_job_free(struct nouveau_job *job)
> > > > > > > +{
> > > > > > > +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
> > > > > > > +    struct bind_job_op *op, *next;
> > > > > > > +
> > > > > > > +    list_for_each_op_safe(op, next, &bind_job->ops) {
> > > > > > > +        struct drm_gem_object *obj = op->gem.obj;
> > > > > > > +
> > > > > > > +        if (obj)
> > > > > > > +            drm_gem_object_put(obj);
> > > > > > > +
> > > > > > > +        list_del(&op->entry);
> > > > > > > +        kfree(op);
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    nouveau_base_job_free(job);
> > > > > > > +    kfree(bind_job);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static struct nouveau_job_ops nouveau_bind_job_ops = {
> > > > > > > +    .submit = nouveau_bind_job_submit,
> > > > > > > +    .run = nouveau_bind_job_run,
> > > > > > > +    .free = nouveau_bind_job_free,
> > > > > > > +};
> > > > > > > +
> > > > > > > +static int
> > > > > > > +bind_job_op_from_uop(struct bind_job_op **pop,
> > > > > > > +             struct drm_nouveau_vm_bind_op *uop)
> > > > > > > +{
> > > > > > > +    struct bind_job_op *op;
> > > > > > > +
> > > > > > > +    op = *pop = kzalloc(sizeof(*op), GFP_KERNEL);
> > > > > > > +    if (!op)
> > > > > > > +        return -ENOMEM;
> > > > > > > +
> > > > > > > +    op->op = uop->op;
> > > > > > > +    op->flags = uop->flags;
> > > > > > > +    op->va.addr = uop->addr;
> > > > > > > +    op->va.range = uop->range;
> > > > > > > +
> > > > > > > +    if (op->op == DRM_NOUVEAU_VM_BIND_OP_MAP) {
> > > > > > > +        op->gem.handle = uop->handle;
> > > > > > > +        op->gem.offset = uop->bo_offset;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void
> > > > > > > +bind_job_ops_free(struct list_head *ops)
> > > > > > > +{
> > > > > > > +    struct bind_job_op *op, *next;
> > > > > > > +
> > > > > > > +    list_for_each_op_safe(op, next, ops) {
> > > > > > > +        list_del(&op->entry);
> > > > > > > +        kfree(op);
> > > > > > > +    }
> > > > > > > +}
> > > > > > > +
> > > > > > > +int
> > > > > > > +nouveau_bind_job_init(struct nouveau_bind_job **pjob,
> > > > > > > +              struct nouveau_exec_bind *bind)
> > > > > > > +{
> > > > > > > +    struct nouveau_bind_job *job;
> > > > > > > +    struct bind_job_op *op;
> > > > > > > +    int i, ret;
> > > > > > > +
> > > > > > > +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
> > > > > > > +    if (!job)
> > > > > > > +        return -ENOMEM;
> > > > > > > +
> > > > > > > +    INIT_LIST_HEAD(&job->ops);
> > > > > > > +
> > > > > > > +    for (i = 0; i < bind->op.count; i++) {
> > > > > > > +        ret = bind_job_op_from_uop(&op, &bind->op.s[i]);
> > > > > > > +        if (ret)
> > > > > > > +            goto err_free;
> > > > > > > +
> > > > > > > +        list_add_tail(&op->entry, &job->ops);
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    job->base.sync = !(bind->flags & DRM_NOUVEAU_VM_BIND_RUN_ASYNC);
> > > > > > > +    job->base.ops = &nouveau_bind_job_ops;
> > > > > > > +
> > > > > > > +    ret = nouveau_base_job_init(&job->base, &bind->base);
> > > > > > > +    if (ret)
> > > > > > > +        goto err_free;
> > > > > > > +
> > > > > > > +    return 0;
> > > > > > > +
> > > > > > > +err_free:
> > > > > > > +    bind_job_ops_free(&job->ops);
> > > > > > > +    kfree(job);
> > > > > > > +    *pjob = NULL;
> > > > > > > +
> > > > > > > +    return ret;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int
> > > > > > > +sync_find_fence(struct nouveau_job *job,
> > > > > > > +        struct drm_nouveau_sync *sync,
> > > > > > > +        struct dma_fence **fence)
> > > > > > > +{
> > > > > > > +    u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
> > > > > > > +    u64 point = 0;
> > > > > > > +    int ret;
> > > > > > > +
> > > > > > > +    if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
> > > > > > > +        stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
> > > > > > > +        return -EOPNOTSUPP;
> > > > > > > +
> > > > > > > +    if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
> > > > > > > +        point = sync->timeline_value;
> > > > > > > +
> > > > > > > +    ret = drm_syncobj_find_fence(job->file_priv,
> > > > > > > +                     sync->handle, point,
> > > > > > > +                     sync->flags, fence);
> > > > > > > +    if (ret)
> > > > > > > +        return ret;
> > > > > > > +
> > > > > > > +    return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int
> > > > > > > +exec_job_binds_wait(struct nouveau_job *job)
> > > > > > > +{
> > > > > > > +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> > > > > > > +    struct nouveau_cli *cli = exec_job->base.cli;
> > > > > > > +    struct nouveau_sched_entity *bind_entity = &cli->sched_entity;
> > > > > > > +    signed long ret;
> > > > > > > +    int i;
> > > > > > > +
> > > > > > > +    for (i = 0; i < job->in_sync.count; i++) {
> > > > > > > +        struct nouveau_job *it;
> > > > > > > +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
> > > > > > > +        struct dma_fence *fence;
> > > > > > > +        bool found;
> > > > > > > +
> > > > > > > +        ret = sync_find_fence(job, sync, &fence);
> > > > > > > +        if (ret)
> > > > > > > +            return ret;
> > > > > > > +
> > > > > > > +        mutex_lock(&bind_entity->job.mutex);
> > > > > > > +        found = false;
> > > > > > > +        list_for_each_entry(it, &bind_entity->job.list, head) {
> > > > > > > +            if (fence == it->done_fence) {
> > > > > > > +                found = true;
> > > > > > > +                break;
> > > > > > > +            }
> > > > > > > +        }
> > > > > > > +        mutex_unlock(&bind_entity->job.mutex);
> > > > > > > +
> > > > > > > +        /* If the fence is not from a VM_BIND job, don't wait for it. */
> > > > > > > +        if (!found)
> > > > > > > +            continue;
> > > > > > > +
> > > > > > > +        ret = dma_fence_wait_timeout(fence, true,
> > > > > > > +                         msecs_to_jiffies(500));
> > > > > > > +        if (ret < 0)
> > > > > > > +            return ret;
> > > > > > > +        else if (ret == 0)
> > > > > > > +            return -ETIMEDOUT;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +int
> > > > > > > +nouveau_exec_job_submit(struct nouveau_job *job)
> > > > > > > +{
> > > > > > > +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> > > > > > > +    struct nouveau_cli *cli = exec_job->base.cli;
> > > > > > > +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(cli);
> > > > > > > +    struct drm_exec *exec = &job->exec;
> > > > > > > +    struct drm_gem_object *obj;
> > > > > > > +    unsigned long index;
> > > > > > > +    int ret;
> > > > > > > +
> > > > > > > +    ret = exec_job_binds_wait(job);
> > > > > > > +    if (ret)
> > > > > > > +        return ret;
> > > > > > > +
> > > > > > > +    nouveau_uvmm_lock(uvmm);
> > > > > > > +    drm_exec_while_not_all_locked(exec) {
> > > > > > > +        struct drm_gpuva *va;
> > > > > > > +
> > > > > > > +        drm_gpuva_for_each_va(va, &uvmm->umgr) {
> > > > > > > +            ret = drm_exec_prepare_obj(exec, va->gem.obj, 1);
> > > > > > > +            drm_exec_break_on_contention(exec);
> > > > > > > +            if (ret)
> > > > > > > +                return ret;
> > > > > > > +        }
> > > > > > > +    }
> > > > > > > +    nouveau_uvmm_unlock(uvmm);
> > > > > > > +
> > > > > > > +    drm_exec_for_each_locked_object(exec, index, obj) {
> > > > > > > +        struct dma_resv *resv = obj->resv;
> > > > > > > +        struct nouveau_bo *nvbo = nouveau_gem_object(obj);
> > > > > > > +
> > > > > > > +        ret = nouveau_bo_validate(nvbo, true, false);
> > > > > > > +        if (ret)
> > > > > > > +            return ret;
> > > > > > > +
> > > > > > > +        dma_resv_add_fence(resv, job->done_fence, DMA_RESV_USAGE_WRITE);
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static struct dma_fence *
> > > > > > > +nouveau_exec_job_run(struct nouveau_job *job)
> > > > > > > +{
> > > > > > > +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> > > > > > > +    struct nouveau_fence *fence;
> > > > > > > +    int i, ret;
> > > > > > > +
> > > > > > > +    ret = nouveau_dma_wait(job->chan, exec_job->push.count + 1, 16);
> > > > > > > +    if (ret) {
> > > > > > > +        NV_PRINTK(err, job->cli, "nv50cal_space: %d\n", ret);
> > > > > > > +        return ERR_PTR(ret);
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    for (i = 0; i < exec_job->push.count; i++) {
> > > > > > > +        nv50_dma_push(job->chan, exec_job->push.s[i].va,
> > > > > > > +                  exec_job->push.s[i].va_len);
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    ret = nouveau_fence_new(job->chan, false, &fence);
> > > > > > > +    if (ret) {
> > > > > > > +        NV_PRINTK(err, job->cli, "error fencing pushbuf: %d\n", ret);
> > > > > > > +        WIND_RING(job->chan);
> > > > > > > +        return ERR_PTR(ret);
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    return &fence->base;
> > > > > > > +}
> > > > > > > +static void
> > > > > > > +nouveau_exec_job_free(struct nouveau_job *job)
> > > > > > > +{
> > > > > > > +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> > > > > > > +
> > > > > > > +    nouveau_base_job_free(job);
> > > > > > > +
> > > > > > > +    kfree(exec_job->push.s);
> > > > > > > +    kfree(exec_job);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static struct nouveau_job_ops nouveau_exec_job_ops = {
> > > > > > > +    .submit = nouveau_exec_job_submit,
> > > > > > > +    .run = nouveau_exec_job_run,
> > > > > > > +    .free = nouveau_exec_job_free,
> > > > > > > +};
> > > > > > > +
> > > > > > > +int
> > > > > > > +nouveau_exec_job_init(struct nouveau_exec_job **pjob,
> > > > > > > +              struct nouveau_exec *exec)
> > > > > > > +{
> > > > > > > +    struct nouveau_exec_job *job;
> > > > > > > +    int ret;
> > > > > > > +
> > > > > > > +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
> > > > > > > +    if (!job)
> > > > > > > +        return -ENOMEM;
> > > > > > > +
> > > > > > > +    job->push.count = exec->push.count;
> > > > > > > +    job->push.s = kmemdup(exec->push.s,
> > > > > > > +                  sizeof(*exec->push.s) *
> > > > > > > +                  exec->push.count,
> > > > > > > +                  GFP_KERNEL);
> > > > > > > +    if (!job->push.s) {
> > > > > > > +        ret = -ENOMEM;
> > > > > > > +        goto err_free_job;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    job->base.ops = &nouveau_exec_job_ops;
> > > > > > > +    ret = nouveau_base_job_init(&job->base, &exec->base);
> > > > > > > +    if (ret)
> > > > > > > +        goto err_free_pushs;
> > > > > > > +
> > > > > > > +    return 0;
> > > > > > > +
> > > > > > > +err_free_pushs:
> > > > > > > +    kfree(job->push.s);
> > > > > > > +err_free_job:
> > > > > > > +    kfree(job);
> > > > > > > +    *pjob = NULL;
> > > > > > > +
> > > > > > > +    return ret;
> > > > > > > +}
> > > > > > > +
> > > > > > > +void nouveau_job_fini(struct nouveau_job *job)
> > > > > > > +{
> > > > > > > +    dma_fence_put(job->done_fence);
> > > > > > > +    drm_sched_job_cleanup(&job->base);
> > > > > > > +    job->ops->free(job);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int
> > > > > > > +nouveau_job_add_deps(struct nouveau_job *job)
> > > > > > > +{
> > > > > > > +    struct dma_fence *in_fence = NULL;
> > > > > > > +    int ret, i;
> > > > > > > +
> > > > > > > +    for (i = 0; i < job->in_sync.count; i++) {
> > > > > > > +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
> > > > > > > +
> > > > > > > +        ret = sync_find_fence(job, sync, &in_fence);
> > > > > > > +        if (ret) {
> > > > > > > +            NV_PRINTK(warn, job->cli,
> > > > > > > +                  "Failed to find syncobj (-> in): handle=%d\n",
> > > > > > > +                  sync->handle);
> > > > > > > +            return ret;
> > > > > > > +        }
> > > > > > > +
> > > > > > > +        ret = drm_sched_job_add_dependency(&job->base, in_fence);
> > > > > > > +        if (ret)
> > > > > > > +            return ret;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int
> > > > > > > +nouveau_job_fence_attach(struct nouveau_job *job, struct dma_fence
> > > > > > > *fence)
> > > > > > > +{
> > > > > > > +    struct drm_syncobj *out_sync;
> > > > > > > +    int i;
> > > > > > > +
> > > > > > > +    for (i = 0; i < job->out_sync.count; i++) {
> > > > > > > +        struct drm_nouveau_sync *sync = &job->out_sync.s[i];
> > > > > > > +        u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
> > > > > > > +
> > > > > > > +        if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
> > > > > > > +            stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
> > > > > > > +            return -EOPNOTSUPP;
> > > > > > > +
> > > > > > > +        out_sync = drm_syncobj_find(job->file_priv, sync->handle);
> > > > > > > +        if (!out_sync) {
> > > > > > > +            NV_PRINTK(warn, job->cli,
> > > > > > > +                  "Failed to find syncobj (-> out): handle=%d\n",
> > > > > > > +                  sync->handle);
> > > > > > > +            return -ENOENT;
> > > > > > > +        }
> > > > > > > +
> > > > > > > +        if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ) {
> > > > > > > +            struct dma_fence_chain *chain;
> > > > > > > +
> > > > > > > +            chain = dma_fence_chain_alloc();
> > > > > > > +            if (!chain) {
> > > > > > > +                drm_syncobj_put(out_sync);
> > > > > > > +                return -ENOMEM;
> > > > > > > +            }
> > > > > > > +
> > > > > > > +            drm_syncobj_add_point(out_sync, chain, fence,
> > > > > > > +                          sync->timeline_value);
> > > > > > > +        } else {
> > > > > > > +            drm_syncobj_replace_fence(out_sync, fence);
> > > > > > > +        }
> > > > > > > +
> > > > > > > +        drm_syncobj_put(out_sync);
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static struct dma_fence *
> > > > > > > +nouveau_job_run(struct nouveau_job *job)
> > > > > > > +{
> > > > > > > +    return job->ops->run(job);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int
> > > > > > > +nouveau_job_run_sync(struct nouveau_job *job)
> > > > > > > +{
> > > > > > > +    struct dma_fence *fence;
> > > > > > > +    int ret;
> > > > > > > +
> > > > > > > +    fence = nouveau_job_run(job);
> > > > > > > +    if (IS_ERR(fence)) {
> > > > > > > +        return PTR_ERR(fence);
> > > > > > > +    } else if (fence) {
> > > > > > > +        ret = dma_fence_wait(fence, true);
> > > > > > > +        if (ret)
> > > > > > > +            return ret;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    dma_fence_signal(job->done_fence);
> > > > > > > +
> > > > > > > +    return 0;
> > > > > > > +}
> > > > > > > +
> > > > > > > +int
> > > > > > > +nouveau_job_submit(struct nouveau_job *job)
> > > > > > > +{
> > > > > > > +    struct nouveau_sched_entity *entity =
> > > > > > > to_nouveau_sched_entity(job->base.entity);
> > > > > > > +    int ret;
> > > > > > > +
> > > > > > > +    drm_exec_init(&job->exec, true);
> > > > > > > +
> > > > > > > +    ret = nouveau_job_add_deps(job);
> > > > > > > +    if (ret)
> > > > > > > +        goto out;
> > > > > > > +
> > > > > > > +    drm_sched_job_arm(&job->base);
> > > > > > > +    job->done_fence = dma_fence_get(&job->base.s_fence->finished);
> > > > > > > +
> > > > > > > +    ret = nouveau_job_fence_attach(job, job->done_fence);
> > > > > > > +    if (ret)
> > > > > > > +        goto out;
> > > > > > > +
> > > > > > > +    if (job->ops->submit) {
> > > > > > > +        ret = job->ops->submit(job);
> > > > > > > +        if (ret)
> > > > > > > +            goto out;
> > > > > > > +    }
> > > > > > > +
> > > > > > > +    if (job->sync) {
> > > > > > > +        drm_exec_fini(&job->exec);
> > > > > > > +
> > > > > > > +        /* We're requested to run a synchronous job, hence don't push
> > > > > > > +         * the job, bypassing the job scheduler, and execute the jobs
> > > > > > > +         * run() function right away.
> > > > > > > +         *
> > > > > > > +         * As a consequence of bypassing the job scheduler we need to
> > > > > > > +         * handle fencing and job cleanup ourselfes.
> > > > > > > +         */
> > > > > > > +        ret = nouveau_job_run_sync(job);
> > > > > > > +
> > > > > > > +        /* If the job fails, the caller will do the cleanup for us. */
> > > > > > > +        if (!ret)
> > > > > > > +            nouveau_job_fini(job);
> > > > > > > +
> > > > > > > +        return ret;
> > > > > > > +    } else {
> > > > > > > +        mutex_lock(&entity->job.mutex);
> > > > > > > +        drm_sched_entity_push_job(&job->base);
> > > > > > > +        list_add_tail(&job->head, &entity->job.list);
> > > > > > > +        mutex_unlock(&entity->job.mutex);
> > > > > > > +    }
> > > > > > > +
> > > > > > > +out:
> > > > > > > +    drm_exec_fini(&job->exec);
> > > > > > > +    return ret;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static struct dma_fence *
> > > > > > > +nouveau_sched_run_job(struct drm_sched_job *sched_job)
> > > > > > > +{
> > > > > > > +    struct nouveau_job *job = to_nouveau_job(sched_job);
> > > > > > > +
> > > > > > > +    return nouveau_job_run(job);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static enum drm_gpu_sched_stat
> > > > > > > +nouveau_sched_timedout_job(struct drm_sched_job *sched_job)
> > > > > > > +{
> > > > > > > +    struct nouveau_job *job = to_nouveau_job(sched_job);
> > > > > > > +    struct nouveau_channel *chan = job->chan;
> > > > > > > +
> > > > > > > +    if (unlikely(!atomic_read(&chan->killed)))
> > > > > > > +        nouveau_channel_kill(chan);
> > > > > > > +
> > > > > > > +    NV_PRINTK(warn, job->cli, "job timeout, channel %d killed!\n",
> > > > > > > +          chan->chid);
> > > > > > > +
> > > > > > > +    nouveau_sched_entity_fini(job->entity);
> > > > > > > +
> > > > > > > +    return DRM_GPU_SCHED_STAT_ENODEV;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static void
> > > > > > > +nouveau_sched_free_job(struct drm_sched_job *sched_job)
> > > > > > > +{
> > > > > > > +    struct nouveau_job *job = to_nouveau_job(sched_job);
> > > > > > > +    struct nouveau_sched_entity *entity = job->entity;
> > > > > > > +
> > > > > > > +    mutex_lock(&entity->job.mutex);
> > > > > > > +    list_del(&job->head);
> > > > > > > +    mutex_unlock(&entity->job.mutex);
> > > > > > > +
> > > > > > > +    nouveau_job_fini(job);
> > > > > > > +}
> > > > > > > +
> > > > > > > +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
> > > > > > > +                  struct drm_gpu_scheduler *sched)
> > > > > > > +{
> > > > > > > +
> > > > > > > +    INIT_LIST_HEAD(&entity->job.list);
> > > > > > > +    mutex_init(&entity->job.mutex);
> > > > > > > +
> > > > > > > +    return drm_sched_entity_init(&entity->base,
> > > > > > > +                     DRM_SCHED_PRIORITY_NORMAL,
> > > > > > > +                     &sched, 1, NULL);
> > > > > > > +}
> > > > > > > +
> > > > > > > +void
> > > > > > > +nouveau_sched_entity_fini(struct nouveau_sched_entity *entity)
> > > > > > > +{
> > > > > > > +    drm_sched_entity_destroy(&entity->base);
> > > > > > > +}
> > > > > > > +
> > > > > > > +static const struct drm_sched_backend_ops nouveau_sched_ops = {
> > > > > > > +    .run_job = nouveau_sched_run_job,
> > > > > > > +    .timedout_job = nouveau_sched_timedout_job,
> > > > > > > +    .free_job = nouveau_sched_free_job,
> > > > > > > +};
> > > > > > > +
> > > > > > > +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
> > > > > > > +               struct nouveau_drm *drm)
> > > > > > > +{
> > > > > > > +    long job_hang_limit =
> > > > > > > msecs_to_jiffies(NOUVEAU_SCHED_JOB_TIMEOUT_MS);
> > > > > > > +
> > > > > > > +    return drm_sched_init(sched, &nouveau_sched_ops,
> > > > > > > +                  NOUVEAU_SCHED_HW_SUBMISSIONS, 0, job_hang_limit,
> > > > > > > +                  NULL, NULL, "nouveau", drm->dev->dev);
> > > > > > > +}
> > > > > > > +
> > > > > > > +void nouveau_sched_fini(struct drm_gpu_scheduler *sched)
> > > > > > > +{
> > > > > > > +    drm_sched_fini(sched);
> > > > > > > +}
> > > > > > > diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h
> > > > > > > b/drivers/gpu/drm/nouveau/nouveau_sched.h
> > > > > > > new file mode 100644
> > > > > > > index 000000000000..7fc5b7eea810
> > > > > > > --- /dev/null
> > > > > > > +++ b/drivers/gpu/drm/nouveau/nouveau_sched.h
> > > > > > > @@ -0,0 +1,98 @@
> > > > > > > +// SPDX-License-Identifier: MIT
> > > > > > > +
> > > > > > > +#ifndef NOUVEAU_SCHED_H
> > > > > > > +#define NOUVEAU_SCHED_H
> > > > > > > +
> > > > > > > +#include <linux/types.h>
> > > > > > > +
> > > > > > > +#include <drm/drm_exec.h>
> > > > > > > +#include <drm/gpu_scheduler.h>
> > > > > > > +
> > > > > > > +#include "nouveau_drv.h"
> > > > > > > +#include "nouveau_exec.h"
> > > > > > > +
> > > > > > > +#define to_nouveau_job(sched_job)        \
> > > > > > > +        container_of((sched_job), struct nouveau_job, base)
> > > > > > > +
> > > > > > > +#define to_nouveau_exec_job(job)        \
> > > > > > > +        container_of((job), struct nouveau_exec_job, base)
> > > > > > > +
> > > > > > > +#define to_nouveau_bind_job(job)        \
> > > > > > > +        container_of((job), struct nouveau_bind_job, base)
> > > > > > > +
> > > > > > > +struct nouveau_job {
> > > > > > > +    struct drm_sched_job base;
> > > > > > > +    struct list_head head;
> > > > > > > +
> > > > > > > +    struct nouveau_sched_entity *entity;
> > > > > > > +
> > > > > > > +    struct drm_file *file_priv;
> > > > > > > +    struct nouveau_cli *cli;
> > > > > > > +    struct nouveau_channel *chan;
> > > > > > > +
> > > > > > > +    struct drm_exec exec;
> > > > > > > +    struct dma_fence *done_fence;
> > > > > > > +
> > > > > > > +    bool sync;
> > > > > > > +
> > > > > > > +    struct {
> > > > > > > +        struct drm_nouveau_sync *s;
> > > > > > > +        u32 count;
> > > > > > > +    } in_sync;
> > > > > > > +
> > > > > > > +    struct {
> > > > > > > +        struct drm_nouveau_sync *s;
> > > > > > > +        u32 count;
> > > > > > > +    } out_sync;
> > > > > > > +
> > > > > > > +    struct nouveau_job_ops {
> > > > > > > +        int (*submit)(struct nouveau_job *);
> > > > > > > +        struct dma_fence *(*run)(struct nouveau_job *);
> > > > > > > +        void (*free)(struct nouveau_job *);
> > > > > > > +    } *ops;
> > > > > > > +};
> > > > > > > +
> > > > > > > +struct nouveau_exec_job {
> > > > > > > +    struct nouveau_job base;
> > > > > > > +
> > > > > > > +    struct {
> > > > > > > +        struct drm_nouveau_exec_push *s;
> > > > > > > +        u32 count;
> > > > > > > +    } push;
> > > > > > > +};
> > > > > > > +
> > > > > > > +struct nouveau_bind_job {
> > > > > > > +    struct nouveau_job base;
> > > > > > > +
> > > > > > > +    /* struct bind_job_op */
> > > > > > > +    struct list_head ops;
> > > > > > > +};
> > > > > > > +
> > > > > > > +int nouveau_bind_job_init(struct nouveau_bind_job **job,
> > > > > > > +              struct nouveau_exec_bind *bind);
> > > > > > > +int nouveau_exec_job_init(struct nouveau_exec_job **job,
> > > > > > > +              struct nouveau_exec *exec);
> > > > > > > +
> > > > > > > +int nouveau_job_submit(struct nouveau_job *job);
> > > > > > > +void nouveau_job_fini(struct nouveau_job *job);
> > > > > > > +
> > > > > > > +#define to_nouveau_sched_entity(entity)        \
> > > > > > > +        container_of((entity), struct nouveau_sched_entity, base)
> > > > > > > +
> > > > > > > +struct nouveau_sched_entity {
> > > > > > > +    struct drm_sched_entity base;
> > > > > > > +    struct {
> > > > > > > +        struct list_head list;
> > > > > > > +        struct mutex mutex;
> > > > > > > +    } job;
> > > > > > > +};
> > > > > > > +
> > > > > > > +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
> > > > > > > +                  struct drm_gpu_scheduler *sched);
> > > > > > > +void nouveau_sched_entity_fini(struct nouveau_sched_entity *entity);
> > > > > > > +
> > > > > > > +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
> > > > > > > +               struct nouveau_drm *drm);
> > > > > > > +void nouveau_sched_fini(struct drm_gpu_scheduler *sched);
> > > > > > > +
> > > > > > > +#endif
> > > > > > 
> > > > > 
> > > > 
> > > 
> > 
>
Danilo Krummrich Jan. 19, 2023, 10:25 p.m. UTC | #10
On 1/19/23 22:47, Matthew Brost wrote:
> On Thu, Jan 19, 2023 at 06:46:30PM +0100, Danilo Krummrich wrote:
>>
>>
>> On 1/19/23 17:38, Matthew Brost wrote:
>>> On Thu, Jan 19, 2023 at 04:36:43PM +0100, Danilo Krummrich wrote:
>>>> On 1/19/23 05:58, Matthew Brost wrote:
>>>>> On Thu, Jan 19, 2023 at 04:44:23AM +0100, Danilo Krummrich wrote:
>>>>>> On 1/18/23 21:37, Thomas Hellström (Intel) wrote:
>>>>>>>
>>>>>>> On 1/18/23 07:12, Danilo Krummrich wrote:
>>>>>>>> This commit provides the implementation for the new uapi motivated by the
>>>>>>>> Vulkan API. It allows user mode drivers (UMDs) to:
>>>>>>>>
>>>>>>>> 1) Initialize a GPU virtual address (VA) space via the new
>>>>>>>>        DRM_IOCTL_NOUVEAU_VM_INIT ioctl for UMDs to specify the portion of VA
>>>>>>>>        space managed by the kernel and userspace, respectively.
>>>>>>>>
>>>>>>>> 2) Allocate and free a VA space region as well as bind and unbind memory
>>>>>>>>        to the GPUs VA space via the new DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
>>>>>>>>        UMDs can request the named operations to be processed either
>>>>>>>>        synchronously or asynchronously. It supports DRM syncobjs
>>>>>>>>        (incl. timelines) as synchronization mechanism. The management of the
>>>>>>>>        GPU VA mappings is implemented with the DRM GPU VA manager.
>>>>>>>>
>>>>>>>> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl. The
>>>>>>>>        execution happens asynchronously. It supports DRM syncobj (incl.
>>>>>>>>        timelines) as synchronization mechanism. DRM GEM object locking is
>>>>>>>>        handled with drm_exec.
>>>>>>>>
>>>>>>>> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC, use the DRM
>>>>>>>> GPU scheduler for the asynchronous paths.
>>>>>>>>
>>>>>>>> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
>>>>>>>> ---
>>>>>>>>      Documentation/gpu/driver-uapi.rst       |   3 +
>>>>>>>>      drivers/gpu/drm/nouveau/Kbuild          |   2 +
>>>>>>>>      drivers/gpu/drm/nouveau/Kconfig         |   2 +
>>>>>>>>      drivers/gpu/drm/nouveau/nouveau_abi16.c |  16 +
>>>>>>>>      drivers/gpu/drm/nouveau/nouveau_abi16.h |   1 +
>>>>>>>>      drivers/gpu/drm/nouveau/nouveau_drm.c   |  23 +-
>>>>>>>>      drivers/gpu/drm/nouveau/nouveau_drv.h   |   9 +-
>>>>>>>>      drivers/gpu/drm/nouveau/nouveau_exec.c  | 310 ++++++++++
>>>>>>>>      drivers/gpu/drm/nouveau/nouveau_exec.h  |  55 ++
>>>>>>>>      drivers/gpu/drm/nouveau/nouveau_sched.c | 780 ++++++++++++++++++++++++
>>>>>>>>      drivers/gpu/drm/nouveau/nouveau_sched.h |  98 +++
>>>>>>>>      11 files changed, 1295 insertions(+), 4 deletions(-)
>>>>>>>>      create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.c
>>>>>>>>      create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.h
>>>>>>>>      create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.c
>>>>>>>>      create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.h
>>>>>>> ...
>>>>>>>>
>>>>>>>> +static struct dma_fence *
>>>>>>>> +nouveau_bind_job_run(struct nouveau_job *job)
>>>>>>>> +{
>>>>>>>> +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
>>>>>>>> +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(job->cli);
>>>>>>>> +    struct bind_job_op *op;
>>>>>>>> +    int ret = 0;
>>>>>>>> +
>>>>>>>
>>>>>>> I was looking at how nouveau does the async binding compared to how xe
>>>>>>> does it.
>>>>>>> It looks to me that this function being a scheduler run_job callback is
>>>>>>> the main part of the VM_BIND dma-fence signalling critical section for
>>>>>>> the job's done_fence and if so, needs to be annotated as such?
>>>>>>
>>>>>> Yes, that's the case.
>>>>>>
>>>>>>>
>>>>>>> For example nouveau_uvma_region_new allocates memory, which is not
>>>>>>> allowed if in a dma_fence signalling critical section and the locking
>>>>>>> also looks suspicious?
>>>>>>
>>>>>> Thanks for pointing this out, I missed that somehow.
>>>>>>
>>>>>> I will change it to pre-allocate new regions, mappings and page tables
>>>>>> within the job's submit() function.
>>>>>>
>>>>>
>>>>> Yeah, that's basically what we do in Xe: in the IOCTL step, allocate all the
>>>>> backing store for new page tables, populate new page tables (these are
>>>>> not yet visible in the page table structure), and in the last step, which is
>>>>> executed after all the dependencies are satisfied, program all the leaf
>>>>> entries, making the new binding visible.
>>>>>
>>>>> We have screwed this up by deferring most of the IOCTL to a worker but
>>>>> will fix this one way or another soon - get rid of the worker or
>>>>> introduce a type of sync that is signaled after the worker + publish the
>>>>> dma-fence in the worker. I'd like to close on this one soon.
>>>>>> For the ops structures the drm_gpuva_manager allocates for reporting the
>>>>>> split/merge steps back to the driver I have ideas to entirely avoid
>>>>>> allocations, which also is a good thing with respect to Christian's feedback
>>>>>> regarding the huge amount of mapping requests some applications seem to
>>>>>> generate.
>>>>>>
>>>>>
>>>>> It should be fine to have allocations to report the split/merge step as
>>>>> this step should be before a dma-fence is published, but yeah, if it is
>>>>> possible to avoid extra allocs, that is always better.
>>>>
>>>> I think we can't really ask for the split/merge steps before actually
>>>> running the job, since it requires the particular VA space not to change
>>>> while performing those operations.
>>>>
>>>> E.g. if we'd run the split/merge steps at job submit() time the underlying
>>>> VA space could be changed by other bind jobs executing before this one,
>>>> which would make the calculated split/merge steps obsolete and wrong.
>>>>
>>>
>>> Hmm, maybe I'm not understanding this implementation, admittedly I
>>> haven't studied the gpuva manager code in detail.
>>
>> The limitation I mentioned above doesn't really come from the
>> drm_gpuva_manager, but from how the driver executes the jobs.
>>
>>>
>>> Let me explain what we are doing in Xe.
>>>
>>> Map 0x0000 - 0x3000 -> this resolves into 1 bind operation and 1 VMA
>>> Unmap 0x1000-0x2000 -> this resolves into 1 unbind and 2 rebind operations
>>>
>>> 1. unbind 0x0000-0x3000 -> destroy old VMA
>>> 2. rebind 0x0000-0x1000 -> new VMA
>>> 3. rebind 0x2000-0x3000 -> new VMA
>>>
>>> All of the above steps resolving the operations can be done in the IOCTL
>>> phase and VM's VMA structure is also updated. When the dependencies
>>> are resolved the actual bindings are done on the GPU. We use the BO's
>>> dma-resv slots to ensure there is never a window in which 0x0000-0x1000 and
>>> 0x2000-0x3000 are not mapped with respect to execs (I forget the exact
>>> details of how we do this but if you want to know I'll explain further).
>>
>> Ok, so you're not only generating the split/merge steps without updating the
>> view of the VA space (which would cause the issue I described) but also
>> already change the view of the VA space in the IOCTL, before the actual page
>> table update happens later on, right?
>>
> 
> Yes, we generate the operations + update the view VA space in the IOCTL
> while the actual page table update on the GPU occurs later if there are
> dependencies.
> 
>> Currently, in nouveau I do both, the actual page table update and the range
>> allocator update, in run_job(), such that walking the allocator always
>> represents the actual page table layout.
>>
> 
> In Xe the VA view is always as if all submitted bind / unbind ops have
> completed, even if some are pending.

If they're completed, yes. But in the time frame from the VA space 
update until the GPU actually updated the page tables it isn't? Not that 
I think it's a problem, just curious.

> 
> Also you may want to take a look at the generic page walker Thomas Hellstrom
> wrote for Xe, which we use to program the page tables. It is pretty slick
> and could probably be used in Nouveau:
> https://patchwork.freedesktop.org/patch/515856/?series=112188&rev=1
> 
> In Xe xe_pt.c is the wrapper for this, it can map, unmap, and invalidate
> VA:
> https://cgit.freedesktop.org/drm/drm-xe/tree/drivers/gpu/drm/xe/xe_pt.c?h=drm-xe-next
> 
>> How do you handle map/unmap on BO eviction?
>>
> 
> We use the dma-resv slots to order all of this.
> 
> Basically exec/map/unmap wait on pending BOs (I believe moves are in the
> KERNEL slot) and BOs moves wait on pending exec/map/unmaps (I believe
> these are BOOKKEEP slot).
> 
> I think more details are in the Xe kernel doc, see 'dma-resv usage':
> https://cgit.freedesktop.org/drm/drm-xe/tree/drivers/gpu/drm/xe/xe_vm_doc.h?h=drm-xe-next
> 
> This might be a little stale: we dropped the idea of
> DMA_RESV_USAGE_PREEMPT_FENCE and it is now just in the BOOKKEEP slot, but I
> think everything else is correct.

If a BO is currently evicted, did you consider to just add it to your 
rebind list rather than calling ttm_bo_validate() and creating the 
actual mapping right away?

I was thinking about this kind of "lazy mapping" approach for cases like 
partial unbind requests, where I wouldn't expect the application to need 
the mapping right away, but also for normal bind requests. I mean, when 
there is already memory pressure, why make the situation worse until 
an actual EXEC is requested?

Also, if I see it correctly unmaps caused by xe_bo_move_notify() happen 
on the CPU, right?

> 
>>>
>>> Can we not use drm_gpuva_manager in a similar manner to generate the
>>> ops + update the VM's VMA structure early? Again, maybe I'm missing
>>> something here as I haven't fully studied the drm_gpuva_manager.
>>
>> You can use the drm_gpuva_manager in exactly the way you just described.
>> Though, in your concrete example it would generate just 1 unbind and 1 bind,
>> which it would combine in a re-bind operation. A re-bind operation always
>> has 1 unbind and up to 2 (but a minimum of 1) bind (sub-)operations.
>>
> 
> Cool, this is what I wanted to hear. Hopefully we can get around to
> building our VM / VMA management on top of the drm_gpuva_manager soon.

I will re-work the drm_gpuva_manager memory allocation parts first, 
since those changes will influence its API. How the internal allocator 
works, if it's drm_mm or something else, shouldn't matter to users of 
the drm_gpuva_manager from an API PoV.

- Danilo

> 
> Matt
>   
>> Rebind:
>>      1. unbind 0x0000-0x3000
>>      2. NULL
>>      3.   bind 0x1000-0x3000
>>
>> It's then up to the driver to remove the old gpuva entry and add a new one.
>> With the given re-bind operation the driver can conclude to just do a
>> partial page table update from 0x0000-0x1000.
>>
>> - Danilo
>>
>>>
>>> Matt
>>>
>>>> Anyway, I should be able to get rid of all the allocations to make this
>>>> safe.
>>>>
>>>>>
>>>>> Also BTW, great work on drm_gpuva_manager too. We will most likely
>>>>> pick this up in Xe rather than open coding all of this as we currently
>>>>> do. We should probably start the port to this soon so we can contribute
>>>>> to the implementation and get both of our drivers upstream sooner.
>>>>
>>>> Sounds great!
>>>>
>>>>>> Regarding the locking, anything specific that makes it look suspicious to
>>>>>> you?
>>>>>>
>>>>>
>>>>> I haven't looked into this either, but almost certainly Thomas is suggesting
>>>>> that if you allocate memory anywhere under the nouveau_uvmm_lock then
>>>>> you can't use this lock in the run_job() callback as this is in the
>>>>> dma-fencing path.
>>>>
>>>> Oh, sure. I already checked that, luckily there aren't any further
>>>> allocations under this lock, so this should be safe once I changed the
>>>> run_job() parts to pre-allocation in submit().
>>>>
>>>>>
>>>>> Matt
>>>>>
>>>>>>>
>>>>>>> Thanks,
>>>>>>>
>>>>>>> Thomas
>>>>>>>
>>>>>>>
>>>>>>>> +    nouveau_uvmm_lock(uvmm);
>>>>>>>> +    list_for_each_op(op, &bind_job->ops) {
>>>>>>>> +        switch (op->op) {
>>>>>>>> +        case OP_ALLOC: {
>>>>>>>> +            bool sparse = op->flags & DRM_NOUVEAU_VM_BIND_SPARSE;
>>>>>>>> +
>>>>>>>> +            ret = nouveau_uvma_region_new(uvmm,
>>>>>>>> +                              op->va.addr,
>>>>>>>> +                              op->va.range,
>>>>>>>> +                              sparse);
>>>>>>>> +            if (ret)
>>>>>>>> +                goto out_unlock;
>>>>>>>> +            break;
>>>>>>>> +        }
>>>>>>>> +        case OP_FREE:
>>>>>>>> +            ret = nouveau_uvma_region_destroy(uvmm,
>>>>>>>> +                              op->va.addr,
>>>>>>>> +                              op->va.range);
>>>>>>>> +            if (ret)
>>>>>>>> +                goto out_unlock;
>>>>>>>> +            break;
>>>>>>>> +        case OP_MAP:
>>>>>>>> +            ret = nouveau_uvmm_sm_map(uvmm,
>>>>>>>> +                          op->va.addr, op->va.range,
>>>>>>>> +                          op->gem.obj, op->gem.offset,
>>>>>>>> +                          op->flags && 0xff);
>>>>>>>> +            if (ret)
>>>>>>>> +                goto out_unlock;
>>>>>>>> +            break;
>>>>>>>> +        case OP_UNMAP:
>>>>>>>> +            ret = nouveau_uvmm_sm_unmap(uvmm,
>>>>>>>> +                            op->va.addr,
>>>>>>>> +                            op->va.range);
>>>>>>>> +            if (ret)
>>>>>>>> +                goto out_unlock;
>>>>>>>> +            break;
>>>>>>>> +        }
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +out_unlock:
>>>>>>>> +    nouveau_uvmm_unlock(uvmm);
>>>>>>>> +    if (ret)
>>>>>>>> +        NV_PRINTK(err, job->cli, "bind job failed: %d\n", ret);
>>>>>>>> +    return ERR_PTR(ret);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void
>>>>>>>> +nouveau_bind_job_free(struct nouveau_job *job)
>>>>>>>> +{
>>>>>>>> +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
>>>>>>>> +    struct bind_job_op *op, *next;
>>>>>>>> +
>>>>>>>> +    list_for_each_op_safe(op, next, &bind_job->ops) {
>>>>>>>> +        struct drm_gem_object *obj = op->gem.obj;
>>>>>>>> +
>>>>>>>> +        if (obj)
>>>>>>>> +            drm_gem_object_put(obj);
>>>>>>>> +
>>>>>>>> +        list_del(&op->entry);
>>>>>>>> +        kfree(op);
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    nouveau_base_job_free(job);
>>>>>>>> +    kfree(bind_job);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static struct nouveau_job_ops nouveau_bind_job_ops = {
>>>>>>>> +    .submit = nouveau_bind_job_submit,
>>>>>>>> +    .run = nouveau_bind_job_run,
>>>>>>>> +    .free = nouveau_bind_job_free,
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +static int
>>>>>>>> +bind_job_op_from_uop(struct bind_job_op **pop,
>>>>>>>> +             struct drm_nouveau_vm_bind_op *uop)
>>>>>>>> +{
>>>>>>>> +    struct bind_job_op *op;
>>>>>>>> +
>>>>>>>> +    op = *pop = kzalloc(sizeof(*op), GFP_KERNEL);
>>>>>>>> +    if (!op)
>>>>>>>> +        return -ENOMEM;
>>>>>>>> +
>>>>>>>> +    op->op = uop->op;
>>>>>>>> +    op->flags = uop->flags;
>>>>>>>> +    op->va.addr = uop->addr;
>>>>>>>> +    op->va.range = uop->range;
>>>>>>>> +
>>>>>>>> +    if (op->op == DRM_NOUVEAU_VM_BIND_OP_MAP) {
>>>>>>>> +        op->gem.handle = uop->handle;
>>>>>>>> +        op->gem.offset = uop->bo_offset;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void
>>>>>>>> +bind_job_ops_free(struct list_head *ops)
>>>>>>>> +{
>>>>>>>> +    struct bind_job_op *op, *next;
>>>>>>>> +
>>>>>>>> +    list_for_each_op_safe(op, next, ops) {
>>>>>>>> +        list_del(&op->entry);
>>>>>>>> +        kfree(op);
>>>>>>>> +    }
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +int
>>>>>>>> +nouveau_bind_job_init(struct nouveau_bind_job **pjob,
>>>>>>>> +              struct nouveau_exec_bind *bind)
>>>>>>>> +{
>>>>>>>> +    struct nouveau_bind_job *job;
>>>>>>>> +    struct bind_job_op *op;
>>>>>>>> +    int i, ret;
>>>>>>>> +
>>>>>>>> +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
>>>>>>>> +    if (!job)
>>>>>>>> +        return -ENOMEM;
>>>>>>>> +
>>>>>>>> +    INIT_LIST_HEAD(&job->ops);
>>>>>>>> +
>>>>>>>> +    for (i = 0; i < bind->op.count; i++) {
>>>>>>>> +        ret = bind_job_op_from_uop(&op, &bind->op.s[i]);
>>>>>>>> +        if (ret)
>>>>>>>> +            goto err_free;
>>>>>>>> +
>>>>>>>> +        list_add_tail(&op->entry, &job->ops);
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    job->base.sync = !(bind->flags & DRM_NOUVEAU_VM_BIND_RUN_ASYNC);
>>>>>>>> +    job->base.ops = &nouveau_bind_job_ops;
>>>>>>>> +
>>>>>>>> +    ret = nouveau_base_job_init(&job->base, &bind->base);
>>>>>>>> +    if (ret)
>>>>>>>> +        goto err_free;
>>>>>>>> +
>>>>>>>> +    return 0;
>>>>>>>> +
>>>>>>>> +err_free:
>>>>>>>> +    bind_job_ops_free(&job->ops);
>>>>>>>> +    kfree(job);
>>>>>>>> +    *pjob = NULL;
>>>>>>>> +
>>>>>>>> +    return ret;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int
>>>>>>>> +sync_find_fence(struct nouveau_job *job,
>>>>>>>> +        struct drm_nouveau_sync *sync,
>>>>>>>> +        struct dma_fence **fence)
>>>>>>>> +{
>>>>>>>> +    u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
>>>>>>>> +    u64 point = 0;
>>>>>>>> +    int ret;
>>>>>>>> +
>>>>>>>> +    if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
>>>>>>>> +        stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
>>>>>>>> +        return -EOPNOTSUPP;
>>>>>>>> +
>>>>>>>> +    if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
>>>>>>>> +        point = sync->timeline_value;
>>>>>>>> +
>>>>>>>> +    ret = drm_syncobj_find_fence(job->file_priv,
>>>>>>>> +                     sync->handle, point,
>>>>>>>> +                     sync->flags, fence);
>>>>>>>> +    if (ret)
>>>>>>>> +        return ret;
>>>>>>>> +
>>>>>>>> +    return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int
>>>>>>>> +exec_job_binds_wait(struct nouveau_job *job)
>>>>>>>> +{
>>>>>>>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>>>>>>>> +    struct nouveau_cli *cli = exec_job->base.cli;
>>>>>>>> +    struct nouveau_sched_entity *bind_entity = &cli->sched_entity;
>>>>>>>> +    signed long ret;
>>>>>>>> +    int i;
>>>>>>>> +
>>>>>>>> +    for (i = 0; i < job->in_sync.count; i++) {
>>>>>>>> +        struct nouveau_job *it;
>>>>>>>> +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
>>>>>>>> +        struct dma_fence *fence;
>>>>>>>> +        bool found;
>>>>>>>> +
>>>>>>>> +        ret = sync_find_fence(job, sync, &fence);
>>>>>>>> +        if (ret)
>>>>>>>> +            return ret;
>>>>>>>> +
>>>>>>>> +        mutex_lock(&bind_entity->job.mutex);
>>>>>>>> +        found = false;
>>>>>>>> +        list_for_each_entry(it, &bind_entity->job.list, head) {
>>>>>>>> +            if (fence == it->done_fence) {
>>>>>>>> +                found = true;
>>>>>>>> +                break;
>>>>>>>> +            }
>>>>>>>> +        }
>>>>>>>> +        mutex_unlock(&bind_entity->job.mutex);
>>>>>>>> +
>>>>>>>> +        /* If the fence is not from a VM_BIND job, don't wait for it. */
>>>>>>>> +        if (!found)
>>>>>>>> +            continue;
>>>>>>>> +
>>>>>>>> +        ret = dma_fence_wait_timeout(fence, true,
>>>>>>>> +                         msecs_to_jiffies(500));
>>>>>>>> +        if (ret < 0)
>>>>>>>> +            return ret;
>>>>>>>> +        else if (ret == 0)
>>>>>>>> +            return -ETIMEDOUT;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +int
>>>>>>>> +nouveau_exec_job_submit(struct nouveau_job *job)
>>>>>>>> +{
>>>>>>>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>>>>>>>> +    struct nouveau_cli *cli = exec_job->base.cli;
>>>>>>>> +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(cli);
>>>>>>>> +    struct drm_exec *exec = &job->exec;
>>>>>>>> +    struct drm_gem_object *obj;
>>>>>>>> +    unsigned long index;
>>>>>>>> +    int ret;
>>>>>>>> +
>>>>>>>> +    ret = exec_job_binds_wait(job);
>>>>>>>> +    if (ret)
>>>>>>>> +        return ret;
>>>>>>>> +
>>>>>>>> +    nouveau_uvmm_lock(uvmm);
>>>>>>>> +    drm_exec_while_not_all_locked(exec) {
>>>>>>>> +        struct drm_gpuva *va;
>>>>>>>> +
>>>>>>>> +        drm_gpuva_for_each_va(va, &uvmm->umgr) {
>>>>>>>> +            ret = drm_exec_prepare_obj(exec, va->gem.obj, 1);
>>>>>>>> +            drm_exec_break_on_contention(exec);
>>>>>>>> +            if (ret)
>>>>>>>> +                return ret;
>>>>>>>> +        }
>>>>>>>> +    }
>>>>>>>> +    nouveau_uvmm_unlock(uvmm);
>>>>>>>> +
>>>>>>>> +    drm_exec_for_each_locked_object(exec, index, obj) {
>>>>>>>> +        struct dma_resv *resv = obj->resv;
>>>>>>>> +        struct nouveau_bo *nvbo = nouveau_gem_object(obj);
>>>>>>>> +
>>>>>>>> +        ret = nouveau_bo_validate(nvbo, true, false);
>>>>>>>> +        if (ret)
>>>>>>>> +            return ret;
>>>>>>>> +
>>>>>>>> +        dma_resv_add_fence(resv, job->done_fence, DMA_RESV_USAGE_WRITE);
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static struct dma_fence *
>>>>>>>> +nouveau_exec_job_run(struct nouveau_job *job)
>>>>>>>> +{
>>>>>>>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>>>>>>>> +    struct nouveau_fence *fence;
>>>>>>>> +    int i, ret;
>>>>>>>> +
>>>>>>>> +    ret = nouveau_dma_wait(job->chan, exec_job->push.count + 1, 16);
>>>>>>>> +    if (ret) {
>>>>>>>> +        NV_PRINTK(err, job->cli, "nv50cal_space: %d\n", ret);
>>>>>>>> +        return ERR_PTR(ret);
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    for (i = 0; i < exec_job->push.count; i++) {
>>>>>>>> +        nv50_dma_push(job->chan, exec_job->push.s[i].va,
>>>>>>>> +                  exec_job->push.s[i].va_len);
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    ret = nouveau_fence_new(job->chan, false, &fence);
>>>>>>>> +    if (ret) {
>>>>>>>> +        NV_PRINTK(err, job->cli, "error fencing pushbuf: %d\n", ret);
>>>>>>>> +        WIND_RING(job->chan);
>>>>>>>> +        return ERR_PTR(ret);
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    return &fence->base;
>>>>>>>> +}
>>>>>>>> +static void
>>>>>>>> +nouveau_exec_job_free(struct nouveau_job *job)
>>>>>>>> +{
>>>>>>>> +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
>>>>>>>> +
>>>>>>>> +    nouveau_base_job_free(job);
>>>>>>>> +
>>>>>>>> +    kfree(exec_job->push.s);
>>>>>>>> +    kfree(exec_job);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static struct nouveau_job_ops nouveau_exec_job_ops = {
>>>>>>>> +    .submit = nouveau_exec_job_submit,
>>>>>>>> +    .run = nouveau_exec_job_run,
>>>>>>>> +    .free = nouveau_exec_job_free,
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +int
>>>>>>>> +nouveau_exec_job_init(struct nouveau_exec_job **pjob,
>>>>>>>> +              struct nouveau_exec *exec)
>>>>>>>> +{
>>>>>>>> +    struct nouveau_exec_job *job;
>>>>>>>> +    int ret;
>>>>>>>> +
>>>>>>>> +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
>>>>>>>> +    if (!job)
>>>>>>>> +        return -ENOMEM;
>>>>>>>> +
>>>>>>>> +    job->push.count = exec->push.count;
>>>>>>>> +    job->push.s = kmemdup(exec->push.s,
>>>>>>>> +                  sizeof(*exec->push.s) *
>>>>>>>> +                  exec->push.count,
>>>>>>>> +                  GFP_KERNEL);
>>>>>>>> +    if (!job->push.s) {
>>>>>>>> +        ret = -ENOMEM;
>>>>>>>> +        goto err_free_job;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    job->base.ops = &nouveau_exec_job_ops;
>>>>>>>> +    ret = nouveau_base_job_init(&job->base, &exec->base);
>>>>>>>> +    if (ret)
>>>>>>>> +        goto err_free_pushs;
>>>>>>>> +
>>>>>>>> +    return 0;
>>>>>>>> +
>>>>>>>> +err_free_pushs:
>>>>>>>> +    kfree(job->push.s);
>>>>>>>> +err_free_job:
>>>>>>>> +    kfree(job);
>>>>>>>> +    *pjob = NULL;
>>>>>>>> +
>>>>>>>> +    return ret;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +void nouveau_job_fini(struct nouveau_job *job)
>>>>>>>> +{
>>>>>>>> +    dma_fence_put(job->done_fence);
>>>>>>>> +    drm_sched_job_cleanup(&job->base);
>>>>>>>> +    job->ops->free(job);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int
>>>>>>>> +nouveau_job_add_deps(struct nouveau_job *job)
>>>>>>>> +{
>>>>>>>> +    struct dma_fence *in_fence = NULL;
>>>>>>>> +    int ret, i;
>>>>>>>> +
>>>>>>>> +    for (i = 0; i < job->in_sync.count; i++) {
>>>>>>>> +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
>>>>>>>> +
>>>>>>>> +        ret = sync_find_fence(job, sync, &in_fence);
>>>>>>>> +        if (ret) {
>>>>>>>> +            NV_PRINTK(warn, job->cli,
>>>>>>>> +                  "Failed to find syncobj (-> in): handle=%d\n",
>>>>>>>> +                  sync->handle);
>>>>>>>> +            return ret;
>>>>>>>> +        }
>>>>>>>> +
>>>>>>>> +        ret = drm_sched_job_add_dependency(&job->base, in_fence);
>>>>>>>> +        if (ret)
>>>>>>>> +            return ret;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int
>>>>>>>> +nouveau_job_fence_attach(struct nouveau_job *job, struct dma_fence
>>>>>>>> *fence)
>>>>>>>> +{
>>>>>>>> +    struct drm_syncobj *out_sync;
>>>>>>>> +    int i;
>>>>>>>> +
>>>>>>>> +    for (i = 0; i < job->out_sync.count; i++) {
>>>>>>>> +        struct drm_nouveau_sync *sync = &job->out_sync.s[i];
>>>>>>>> +        u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
>>>>>>>> +
>>>>>>>> +        if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
>>>>>>>> +            stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
>>>>>>>> +            return -EOPNOTSUPP;
>>>>>>>> +
>>>>>>>> +        out_sync = drm_syncobj_find(job->file_priv, sync->handle);
>>>>>>>> +        if (!out_sync) {
>>>>>>>> +            NV_PRINTK(warn, job->cli,
>>>>>>>> +                  "Failed to find syncobj (-> out): handle=%d\n",
>>>>>>>> +                  sync->handle);
>>>>>>>> +            return -ENOENT;
>>>>>>>> +        }
>>>>>>>> +
>>>>>>>> +        if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ) {
>>>>>>>> +            struct dma_fence_chain *chain;
>>>>>>>> +
>>>>>>>> +            chain = dma_fence_chain_alloc();
>>>>>>>> +            if (!chain) {
>>>>>>>> +                drm_syncobj_put(out_sync);
>>>>>>>> +                return -ENOMEM;
>>>>>>>> +            }
>>>>>>>> +
>>>>>>>> +            drm_syncobj_add_point(out_sync, chain, fence,
>>>>>>>> +                          sync->timeline_value);
>>>>>>>> +        } else {
>>>>>>>> +            drm_syncobj_replace_fence(out_sync, fence);
>>>>>>>> +        }
>>>>>>>> +
>>>>>>>> +        drm_syncobj_put(out_sync);
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static struct dma_fence *
>>>>>>>> +nouveau_job_run(struct nouveau_job *job)
>>>>>>>> +{
>>>>>>>> +    return job->ops->run(job);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static int
>>>>>>>> +nouveau_job_run_sync(struct nouveau_job *job)
>>>>>>>> +{
>>>>>>>> +    struct dma_fence *fence;
>>>>>>>> +    int ret;
>>>>>>>> +
>>>>>>>> +    fence = nouveau_job_run(job);
>>>>>>>> +    if (IS_ERR(fence)) {
>>>>>>>> +        return PTR_ERR(fence);
>>>>>>>> +    } else if (fence) {
>>>>>>>> +        ret = dma_fence_wait(fence, true);
>>>>>>>> +        if (ret)
>>>>>>>> +            return ret;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    dma_fence_signal(job->done_fence);
>>>>>>>> +
>>>>>>>> +    return 0;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +int
>>>>>>>> +nouveau_job_submit(struct nouveau_job *job)
>>>>>>>> +{
>>>>>>>> +    struct nouveau_sched_entity *entity =
>>>>>>>> to_nouveau_sched_entity(job->base.entity);
>>>>>>>> +    int ret;
>>>>>>>> +
>>>>>>>> +    drm_exec_init(&job->exec, true);
>>>>>>>> +
>>>>>>>> +    ret = nouveau_job_add_deps(job);
>>>>>>>> +    if (ret)
>>>>>>>> +        goto out;
>>>>>>>> +
>>>>>>>> +    drm_sched_job_arm(&job->base);
>>>>>>>> +    job->done_fence = dma_fence_get(&job->base.s_fence->finished);
>>>>>>>> +
>>>>>>>> +    ret = nouveau_job_fence_attach(job, job->done_fence);
>>>>>>>> +    if (ret)
>>>>>>>> +        goto out;
>>>>>>>> +
>>>>>>>> +    if (job->ops->submit) {
>>>>>>>> +        ret = job->ops->submit(job);
>>>>>>>> +        if (ret)
>>>>>>>> +            goto out;
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    if (job->sync) {
>>>>>>>> +        drm_exec_fini(&job->exec);
>>>>>>>> +
>>>>>>>> +        /* We're requested to run a synchronous job, hence don't push
>>>>>>>> +         * the job, bypassing the job scheduler, and execute the jobs
>>>>>>>> +         * run() function right away.
>>>>>>>> +         *
>>>>>>>> +         * As a consequence of bypassing the job scheduler we need to
>>>>>>>> +         * handle fencing and job cleanup ourselfes.
>>>>>>>> +         */
>>>>>>>> +        ret = nouveau_job_run_sync(job);
>>>>>>>> +
>>>>>>>> +        /* If the job fails, the caller will do the cleanup for us. */
>>>>>>>> +        if (!ret)
>>>>>>>> +            nouveau_job_fini(job);
>>>>>>>> +
>>>>>>>> +        return ret;
>>>>>>>> +    } else {
>>>>>>>> +        mutex_lock(&entity->job.mutex);
>>>>>>>> +        drm_sched_entity_push_job(&job->base);
>>>>>>>> +        list_add_tail(&job->head, &entity->job.list);
>>>>>>>> +        mutex_unlock(&entity->job.mutex);
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +out:
>>>>>>>> +    drm_exec_fini(&job->exec);
>>>>>>>> +    return ret;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static struct dma_fence *
>>>>>>>> +nouveau_sched_run_job(struct drm_sched_job *sched_job)
>>>>>>>> +{
>>>>>>>> +    struct nouveau_job *job = to_nouveau_job(sched_job);
>>>>>>>> +
>>>>>>>> +    return nouveau_job_run(job);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static enum drm_gpu_sched_stat
>>>>>>>> +nouveau_sched_timedout_job(struct drm_sched_job *sched_job)
>>>>>>>> +{
>>>>>>>> +    struct nouveau_job *job = to_nouveau_job(sched_job);
>>>>>>>> +    struct nouveau_channel *chan = job->chan;
>>>>>>>> +
>>>>>>>> +    if (unlikely(!atomic_read(&chan->killed)))
>>>>>>>> +        nouveau_channel_kill(chan);
>>>>>>>> +
>>>>>>>> +    NV_PRINTK(warn, job->cli, "job timeout, channel %d killed!\n",
>>>>>>>> +          chan->chid);
>>>>>>>> +
>>>>>>>> +    nouveau_sched_entity_fini(job->entity);
>>>>>>>> +
>>>>>>>> +    return DRM_GPU_SCHED_STAT_ENODEV;
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static void
>>>>>>>> +nouveau_sched_free_job(struct drm_sched_job *sched_job)
>>>>>>>> +{
>>>>>>>> +    struct nouveau_job *job = to_nouveau_job(sched_job);
>>>>>>>> +    struct nouveau_sched_entity *entity = job->entity;
>>>>>>>> +
>>>>>>>> +    mutex_lock(&entity->job.mutex);
>>>>>>>> +    list_del(&job->head);
>>>>>>>> +    mutex_unlock(&entity->job.mutex);
>>>>>>>> +
>>>>>>>> +    nouveau_job_fini(job);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
>>>>>>>> +                  struct drm_gpu_scheduler *sched)
>>>>>>>> +{
>>>>>>>> +
>>>>>>>> +    INIT_LIST_HEAD(&entity->job.list);
>>>>>>>> +    mutex_init(&entity->job.mutex);
>>>>>>>> +
>>>>>>>> +    return drm_sched_entity_init(&entity->base,
>>>>>>>> +                     DRM_SCHED_PRIORITY_NORMAL,
>>>>>>>> +                     &sched, 1, NULL);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +void
>>>>>>>> +nouveau_sched_entity_fini(struct nouveau_sched_entity *entity)
>>>>>>>> +{
>>>>>>>> +    drm_sched_entity_destroy(&entity->base);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +static const struct drm_sched_backend_ops nouveau_sched_ops = {
>>>>>>>> +    .run_job = nouveau_sched_run_job,
>>>>>>>> +    .timedout_job = nouveau_sched_timedout_job,
>>>>>>>> +    .free_job = nouveau_sched_free_job,
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
>>>>>>>> +               struct nouveau_drm *drm)
>>>>>>>> +{
>>>>>>>> +    long job_hang_limit =
>>>>>>>> msecs_to_jiffies(NOUVEAU_SCHED_JOB_TIMEOUT_MS);
>>>>>>>> +
>>>>>>>> +    return drm_sched_init(sched, &nouveau_sched_ops,
>>>>>>>> +                  NOUVEAU_SCHED_HW_SUBMISSIONS, 0, job_hang_limit,
>>>>>>>> +                  NULL, NULL, "nouveau", drm->dev->dev);
>>>>>>>> +}
>>>>>>>> +
>>>>>>>> +void nouveau_sched_fini(struct drm_gpu_scheduler *sched)
>>>>>>>> +{
>>>>>>>> +    drm_sched_fini(sched);
>>>>>>>> +}
>>>>>>>> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h
>>>>>>>> b/drivers/gpu/drm/nouveau/nouveau_sched.h
>>>>>>>> new file mode 100644
>>>>>>>> index 000000000000..7fc5b7eea810
>>>>>>>> --- /dev/null
>>>>>>>> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.h
>>>>>>>> @@ -0,0 +1,98 @@
>>>>>>>> +// SPDX-License-Identifier: MIT
>>>>>>>> +
>>>>>>>> +#ifndef NOUVEAU_SCHED_H
>>>>>>>> +#define NOUVEAU_SCHED_H
>>>>>>>> +
>>>>>>>> +#include <linux/types.h>
>>>>>>>> +
>>>>>>>> +#include <drm/drm_exec.h>
>>>>>>>> +#include <drm/gpu_scheduler.h>
>>>>>>>> +
>>>>>>>> +#include "nouveau_drv.h"
>>>>>>>> +#include "nouveau_exec.h"
>>>>>>>> +
>>>>>>>> +#define to_nouveau_job(sched_job)        \
>>>>>>>> +        container_of((sched_job), struct nouveau_job, base)
>>>>>>>> +
>>>>>>>> +#define to_nouveau_exec_job(job)        \
>>>>>>>> +        container_of((job), struct nouveau_exec_job, base)
>>>>>>>> +
>>>>>>>> +#define to_nouveau_bind_job(job)        \
>>>>>>>> +        container_of((job), struct nouveau_bind_job, base)
>>>>>>>> +
>>>>>>>> +struct nouveau_job {
>>>>>>>> +    struct drm_sched_job base;
>>>>>>>> +    struct list_head head;
>>>>>>>> +
>>>>>>>> +    struct nouveau_sched_entity *entity;
>>>>>>>> +
>>>>>>>> +    struct drm_file *file_priv;
>>>>>>>> +    struct nouveau_cli *cli;
>>>>>>>> +    struct nouveau_channel *chan;
>>>>>>>> +
>>>>>>>> +    struct drm_exec exec;
>>>>>>>> +    struct dma_fence *done_fence;
>>>>>>>> +
>>>>>>>> +    bool sync;
>>>>>>>> +
>>>>>>>> +    struct {
>>>>>>>> +        struct drm_nouveau_sync *s;
>>>>>>>> +        u32 count;
>>>>>>>> +    } in_sync;
>>>>>>>> +
>>>>>>>> +    struct {
>>>>>>>> +        struct drm_nouveau_sync *s;
>>>>>>>> +        u32 count;
>>>>>>>> +    } out_sync;
>>>>>>>> +
>>>>>>>> +    struct nouveau_job_ops {
>>>>>>>> +        int (*submit)(struct nouveau_job *);
>>>>>>>> +        struct dma_fence *(*run)(struct nouveau_job *);
>>>>>>>> +        void (*free)(struct nouveau_job *);
>>>>>>>> +    } *ops;
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +struct nouveau_exec_job {
>>>>>>>> +    struct nouveau_job base;
>>>>>>>> +
>>>>>>>> +    struct {
>>>>>>>> +        struct drm_nouveau_exec_push *s;
>>>>>>>> +        u32 count;
>>>>>>>> +    } push;
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +struct nouveau_bind_job {
>>>>>>>> +    struct nouveau_job base;
>>>>>>>> +
>>>>>>>> +    /* struct bind_job_op */
>>>>>>>> +    struct list_head ops;
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +int nouveau_bind_job_init(struct nouveau_bind_job **job,
>>>>>>>> +              struct nouveau_exec_bind *bind);
>>>>>>>> +int nouveau_exec_job_init(struct nouveau_exec_job **job,
>>>>>>>> +              struct nouveau_exec *exec);
>>>>>>>> +
>>>>>>>> +int nouveau_job_submit(struct nouveau_job *job);
>>>>>>>> +void nouveau_job_fini(struct nouveau_job *job);
>>>>>>>> +
>>>>>>>> +#define to_nouveau_sched_entity(entity)        \
>>>>>>>> +        container_of((entity), struct nouveau_sched_entity, base)
>>>>>>>> +
>>>>>>>> +struct nouveau_sched_entity {
>>>>>>>> +    struct drm_sched_entity base;
>>>>>>>> +    struct {
>>>>>>>> +        struct list_head list;
>>>>>>>> +        struct mutex mutex;
>>>>>>>> +    } job;
>>>>>>>> +};
>>>>>>>> +
>>>>>>>> +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
>>>>>>>> +                  struct drm_gpu_scheduler *sched);
>>>>>>>> +void nouveau_sched_entity_fini(struct nouveau_sched_entity *entity);
>>>>>>>> +
>>>>>>>> +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
>>>>>>>> +               struct nouveau_drm *drm);
>>>>>>>> +void nouveau_sched_fini(struct drm_gpu_scheduler *sched);
>>>>>>>> +
>>>>>>>> +#endif
>>>>>>>
>>>>>>
>>>>>
>>>>
>>>
>>
>
Matthew Brost Jan. 20, 2023, 4:30 a.m. UTC | #11
On Thu, Jan 19, 2023 at 11:25:51PM +0100, Danilo Krummrich wrote:
> On 1/19/23 22:47, Matthew Brost wrote:
> > On Thu, Jan 19, 2023 at 06:46:30PM +0100, Danilo Krummrich wrote:
> > > 
> > > 
> > > On 1/19/23 17:38, Matthew Brost wrote:
> > > > On Thu, Jan 19, 2023 at 04:36:43PM +0100, Danilo Krummrich wrote:
> > > > > On 1/19/23 05:58, Matthew Brost wrote:
> > > > > > On Thu, Jan 19, 2023 at 04:44:23AM +0100, Danilo Krummrich wrote:
> > > > > > > On 1/18/23 21:37, Thomas Hellström (Intel) wrote:
> > > > > > > > 
> > > > > > > > On 1/18/23 07:12, Danilo Krummrich wrote:
> > > > > > > > > This commit provides the implementation for the new uapi motivated by the
> > > > > > > > > Vulkan API. It allows user mode drivers (UMDs) to:
> > > > > > > > > 
> > > > > > > > > 1) Initialize a GPU virtual address (VA) space via the new
> > > > > > > > >        DRM_IOCTL_NOUVEAU_VM_INIT ioctl for UMDs to specify the portion of VA
> > > > > > > > >        space managed by the kernel and userspace, respectively.
> > > > > > > > > 
> > > > > > > > > 2) Allocate and free a VA space region as well as bind and unbind memory
> > > > > > > > >        to the GPUs VA space via the new DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
> > > > > > > > >        UMDs can request the named operations to be processed either
> > > > > > > > >        synchronously or asynchronously. It supports DRM syncobjs
> > > > > > > > >        (incl. timelines) as synchronization mechanism. The management of the
> > > > > > > > >        GPU VA mappings is implemented with the DRM GPU VA manager.
> > > > > > > > > 
> > > > > > > > > 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl. The
> > > > > > > > >        execution happens asynchronously. It supports DRM syncobj (incl.
> > > > > > > > >        timelines) as synchronization mechanism. DRM GEM object locking is
> > > > > > > > >        handled with drm_exec.
> > > > > > > > > 
> > > > > > > > > Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC, use the DRM
> > > > > > > > > GPU scheduler for the asynchronous paths.
> > > > > > > > > 
> > > > > > > > > Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> > > > > > > > > ---
> > > > > > > > >      Documentation/gpu/driver-uapi.rst       |   3 +
> > > > > > > > >      drivers/gpu/drm/nouveau/Kbuild          |   2 +
> > > > > > > > >      drivers/gpu/drm/nouveau/Kconfig         |   2 +
> > > > > > > > >      drivers/gpu/drm/nouveau/nouveau_abi16.c |  16 +
> > > > > > > > >      drivers/gpu/drm/nouveau/nouveau_abi16.h |   1 +
> > > > > > > > >      drivers/gpu/drm/nouveau/nouveau_drm.c   |  23 +-
> > > > > > > > >      drivers/gpu/drm/nouveau/nouveau_drv.h   |   9 +-
> > > > > > > > >      drivers/gpu/drm/nouveau/nouveau_exec.c  | 310 ++++++++++
> > > > > > > > >      drivers/gpu/drm/nouveau/nouveau_exec.h  |  55 ++
> > > > > > > > >      drivers/gpu/drm/nouveau/nouveau_sched.c | 780 ++++++++++++++++++++++++
> > > > > > > > >      drivers/gpu/drm/nouveau/nouveau_sched.h |  98 +++
> > > > > > > > >      11 files changed, 1295 insertions(+), 4 deletions(-)
> > > > > > > > >      create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.c
> > > > > > > > >      create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.h
> > > > > > > > >      create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.c
> > > > > > > > >      create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.h
> > > > > > > > ...
> > > > > > > > > 
> > > > > > > > > +static struct dma_fence *
> > > > > > > > > +nouveau_bind_job_run(struct nouveau_job *job)
> > > > > > > > > +{
> > > > > > > > > +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
> > > > > > > > > +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(job->cli);
> > > > > > > > > +    struct bind_job_op *op;
> > > > > > > > > +    int ret = 0;
> > > > > > > > > +
> > > > > > > > 
> > > > > > > > I was looking at how nouveau does the async binding compared to how xe
> > > > > > > > does it.
> > > > > > > > It looks to me that this function being a scheduler run_job callback is
> > > > > > > > the main part of the VM_BIND dma-fence signalling critical section for
> > > > > > > > the job's done_fence and if so, needs to be annotated as such?
> > > > > > > 
> > > > > > > Yes, that's the case.
> > > > > > > 
> > > > > > > > 
> > > > > > > > For example nouveau_uvma_region_new allocates memory, which is not
> > > > > > > > allowed if in a dma_fence signalling critical section and the locking
> > > > > > > > also looks suspicious?
> > > > > > > 
> > > > > > > Thanks for pointing this out, I missed that somehow.
> > > > > > > 
> > > > > > > I will change it to pre-allocate new regions, mappings and page tables
> > > > > > > within the job's submit() function.
> > > > > > > 
> > > > > > 
> > > > > > > Yeah, that's basically what we do in Xe: in the IOCTL step allocate all the
> > > > > > > backing store for new page tables, populate new page tables (these are
> > > > > > > not yet visible in the page table structure), and in the last step, which is
> > > > > > > executed after all the dependencies are satisfied, program all the leaf
> > > > > > > entries, making the new binding visible.
> > > > > > 
> > > > > > > We have screwed this up by deferring most of the IOCTL to a worker but
> > > > > > > will fix this one way or another soon - get rid of the worker or
> > > > > > > introduce a type of sync that is signaled after the worker + publish the
> > > > > > > dma-fence in the worker. I'd like to close on this one soon.
> > > > > > > For the ops structures the drm_gpuva_manager allocates for reporting the
> > > > > > > split/merge steps back to the driver I have ideas to entirely avoid
> > > > > > > allocations, which also is a good thing in respect of Christians feedback
> > > > > > > regarding the huge amount of mapping requests some applications seem to
> > > > > > > generate.
> > > > > > > 
> > > > > > 
> > > > > > It should be fine to have allocations to report the split/merge step as
> > > > > > this step should be before a dma-fence is published, but yea if possible
> > > > > > to avoid extra allocs as that is always better.
> > > > > 
> > > > > I think we can't really ask for the split/merge steps before actually
> > > > > running the job, since it requires the particular VA space not to change
> > > > > while performing those operations.
> > > > > 
> > > > > E.g. if we'd run the split/merge steps at job submit() time the underlying
> > > > > VA space could be changed by other bind jobs executing before this one,
> > > > > which would make the calculated split/merge steps obsolete and wrong.
> > > > > 
> > > > 
> > > > Hmm, maybe I'm not understanding this implementation, admittedly I
> > > > haven't studied the gpuva manager code in detail.
> > > 
> > > The limitation I mentioned above doesn't really come from the
> > > drm_gpuva_manager, but from how the driver executes the jobs.
> > > 
> > > > 
> > > > Let me explain what we are doing in Xe.
> > > > 
> > > > Map 0x0000 - 0x3000 -> this resolves into 1 bind operation and 1 VMA
> > > > Unmap 0x1000-0x2000 -> this resolves into 1 unbind and 2 rebind operations
> > > > 
> > > > 1. unbind 0x0000-0x3000 -> destroy old VMA
> > > > 2. rebind 0x0000-0x1000 -> new VMA
> > > > 3. rebind 0x2000-0x3000 -> new VMA
> > > > 
> > > > All of the above steps resolving the operations can be done in the IOCTL
> > > > phase and VM's VMA structure is also updated. When the dependencies
> > > > are resolved the actual bindings are done on the GPU. We use the BO's
> > > > dma-resv slots to ensure there is never a window 0x0000-0x1000 and
> > > > 0x2000-0x3000 are never mapped with respect to execs (I forget the exact
> > > > details of how we do this but if you want to know I'll explain further).
> > > 
> > > Ok, so you're not only generating the split/merge steps without updating the
> > > view of the VA space (which would cause the issue I described) but also
> > > already change the view of the VA space in the IOCTL, before the actual page
> > > table update happens later on, right?
> > > 
> > 
> > Yes, we generate the operations + update the view VA space in the IOCTL
> > while the actual page table update on the GPU occurs later if there are
> > dependencies.
> > 
> > > Currently, in nouveau I do both, the actual page table update and the range
> > > allocator update, in run_job(), such that walking the allocator always
> > > represents the actual page table layout.
> > > 
> > 
> > In Xe the VA view is always as if all submitted bind / unbind ops have
> > completed, even if some are pending.
> 
> If they're completed, yes. But in the time frame from the VA space update
> until the GPU actually updated the page tables it isn't? Not that I think
> it's a problem, just curious.
> 

There is a window where the VA space and actual page tables are not in
sync. This is definitely fine, as we've tested Xe quite thoroughly.
Essentially we are just pipelining: the VA space update happens in one step
and the actual GPU page table update in another. The GPU page table update
step doesn't look at the VA space; rather, the VA space step provides all the
information (statically) so the GPU page table update can occur.

> > 
> > Also you may want to take a look at the generic page walker Thomas Hellstrom
> > wrote for Xe which we use to program the page tables. It is pretty slick
> > and you probably could use it in Nouveau:
> > https://patchwork.freedesktop.org/patch/515856/?series=112188&rev=1
> > 
> > In Xe xe_pt.c is the wrapper for this, it can map, unmap, and invalidate
> > VA:
> > https://cgit.freedesktop.org/drm/drm-xe/tree/drivers/gpu/drm/xe/xe_pt.c?h=drm-xe-next
> > 
> > > How do you handle map/unmap on BO eviction?
> > > 
> > 
> > We use the dma-resv slots to order all of this.
> > 
> > Basically exec/map/unmap wait on pending BOs (I believe moves are in the
> > KERNEL slot) and BOs moves wait on pending exec/map/unmaps (I believe
> > these are BOOKKEEP slot).
> > 
> > I think more details are in the Xe kernel doc, see 'dma-resv usage':
> > https://cgit.freedesktop.org/drm/drm-xe/tree/drivers/gpu/drm/xe/xe_vm_doc.h?h=drm-xe-next
> > 
> > This might be a little stale, we dropped the idea of
> > DMA_RESV_USAGE_PREEMPT_FENCE and is now just in the BOOKKEEP but I think
> > everything else is correct.
> 
> If a BO is currently evicted, did you consider just adding it to your
> rebind list rather than calling ttm_bo_validate() and creating the actual
> mapping right away?
>

No, but that is a good idea. We probably should look at this optimization.
 
> I was thinking about this kind of "lazy mapping" approach for cases like
> partial unbind requests, where I wouldn't expect the application to need the
> mapping right away, but also for normal bind requests. I mean, when there is
> already memory pressure, why making the situation worse until an actual EXEC
> is requested?
> 
> Also, if I see it correctly unmaps caused by xe_bo_move_notify() happen on
> the CPU, right?
>

I'm not exactly sure what you're asking here, but we have 3 cases here.

1. Fault mode, we just blow away the page tables, will get a fault on
next use, and then trigger a rebind.

2. Compute mode, trigger the preempt fences (kicks everything using the
BO off the hardware before the move), the rebind worker will issue a
rebind and resume execution.

3. dma-fence mode, the BO move is scheduled behind any pending execs,
the next exec IOCTL will issue the rebind.

In all cases, before the rebind the BO is validated again, perhaps triggering
another move; all rebinds are scheduled behind the move.

> > 
> > > > 
> > > > Can we not use drm_gpuva_manager in a similar manner to generate the
> > > > ops + update the VM's VMA structure early? Again maybe I'm missing
> > > > something here as I haven't fully studied the drm_gpuva_manager.
> > > 
> > > You can use the drm_gpuva_manager in exactly the way you just described.
> > > Though, in your concrete example it would generate just 1 unbind and 1 bind,
> > > which it would combine in a re-bind operation. A re-bind operation always
> > > has 1 unbind and up to 2 (but a minimum of 1) bind (sub-)operations.
> > > 
> > 
> > Cool, this is what I wanted to hear. Hopefully we can get around to
> > building our VM / VMA management on top of the drm_gpuva_manager soon.
> 
> I will re-work the drm_gpuva_manager memory allocation parts first, since
> those changes will influence its API. How the internal allocator works, if
> it's drm_mm or something else, shouldn't matter to users of the
> drm_gpuva_manager from an API PoV.
>

Definitely work out the API changes first as we can work on / optimize
the internals later. If Christian can provide the benchmarks / workloads that
do tons of binds we may be able to run these on Xe to help find the best
solution for the internals. VK is mostly working for Xe.

Matt
 
> - Danilo
> 
> > 
> > Matt
> > > Rebind:
> > >      1. unbind 0x0000-0x3000
> > >      2. NULL
> > >      3.   bind 0x1000-0x3000
> > > 
> > > It's then up to the driver to remove the old gpuva entry and add a new one.
> > > With the given re-bind operation the driver can conclude to just do a
> > > partial page table update from 0x0000-0x1000.
> > > 
> > > - Danilo
> > > 
> > > > 
> > > > Matt
> > > > 
> > > > > Anyway, I should be able to get rid of all the allocations to make this
> > > > > safe.
> > > > > 
> > > > > > 
> > > > > > > Also BTW, great work on drm_gpuva_manager too. We will most likely
> > > > > > pick this up in Xe rather than open coding all of this as we currently
> > > > > > do. We should probably start the port to this soon so we can contribute
> > > > > > to the implementation and get both of our drivers upstream sooner.
> > > > > 
> > > > > Sounds great!
> > > > > 
> > > > > > > Regarding the locking, anything specific that makes it look suspicious to
> > > > > > > you?
> > > > > > > 
> > > > > > 
> > > > > > I haven't looked into this too but almost certainly Thomas is suggesting
> > > > > > that if you allocate memory anywhere under the nouveau_uvmm_lock then
> > > > > > > you can't use this lock in the run_job() callback as this is in the
> > > > > > dma-fencing path.
> > > > > 
> > > > > Oh, sure. I already checked that, luckily there aren't any further
> > > > > > allocations under this lock, so this should be safe once I have changed the
> > > > > > run_job() parts to pre-allocation in submit().
> > > > > 
> > > > > > 
> > > > > > Matt
> > > > > > 
> > > > > > > > 
> > > > > > > > Thanks,
> > > > > > > > 
> > > > > > > > Thomas
> > > > > > > > 
> > > > > > > > 
> > > > > > > > > +    nouveau_uvmm_lock(uvmm);
> > > > > > > > > +    list_for_each_op(op, &bind_job->ops) {
> > > > > > > > > +        switch (op->op) {
> > > > > > > > > +        case OP_ALLOC: {
> > > > > > > > > +            bool sparse = op->flags & DRM_NOUVEAU_VM_BIND_SPARSE;
> > > > > > > > > +
> > > > > > > > > +            ret = nouveau_uvma_region_new(uvmm,
> > > > > > > > > +                              op->va.addr,
> > > > > > > > > +                              op->va.range,
> > > > > > > > > +                              sparse);
> > > > > > > > > +            if (ret)
> > > > > > > > > +                goto out_unlock;
> > > > > > > > > +            break;
> > > > > > > > > +        }
> > > > > > > > > +        case OP_FREE:
> > > > > > > > > +            ret = nouveau_uvma_region_destroy(uvmm,
> > > > > > > > > +                              op->va.addr,
> > > > > > > > > +                              op->va.range);
> > > > > > > > > +            if (ret)
> > > > > > > > > +                goto out_unlock;
> > > > > > > > > +            break;
> > > > > > > > > +        case OP_MAP:
> > > > > > > > > +            ret = nouveau_uvmm_sm_map(uvmm,
> > > > > > > > > +                          op->va.addr, op->va.range,
> > > > > > > > > +                          op->gem.obj, op->gem.offset,
> > > > > > > > > +                          op->flags && 0xff);
> > > > > > > > > +            if (ret)
> > > > > > > > > +                goto out_unlock;
> > > > > > > > > +            break;
> > > > > > > > > +        case OP_UNMAP:
> > > > > > > > > +            ret = nouveau_uvmm_sm_unmap(uvmm,
> > > > > > > > > +                            op->va.addr,
> > > > > > > > > +                            op->va.range);
> > > > > > > > > +            if (ret)
> > > > > > > > > +                goto out_unlock;
> > > > > > > > > +            break;
> > > > > > > > > +        }
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +out_unlock:
> > > > > > > > > +    nouveau_uvmm_unlock(uvmm);
> > > > > > > > > +    if (ret)
> > > > > > > > > +        NV_PRINTK(err, job->cli, "bind job failed: %d\n", ret);
> > > > > > > > > +    return ERR_PTR(ret);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static void
> > > > > > > > > +nouveau_bind_job_free(struct nouveau_job *job)
> > > > > > > > > +{
> > > > > > > > > +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
> > > > > > > > > +    struct bind_job_op *op, *next;
> > > > > > > > > +
> > > > > > > > > +    list_for_each_op_safe(op, next, &bind_job->ops) {
> > > > > > > > > +        struct drm_gem_object *obj = op->gem.obj;
> > > > > > > > > +
> > > > > > > > > +        if (obj)
> > > > > > > > > +            drm_gem_object_put(obj);
> > > > > > > > > +
> > > > > > > > > +        list_del(&op->entry);
> > > > > > > > > +        kfree(op);
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    nouveau_base_job_free(job);
> > > > > > > > > +    kfree(bind_job);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static struct nouveau_job_ops nouveau_bind_job_ops = {
> > > > > > > > > +    .submit = nouveau_bind_job_submit,
> > > > > > > > > +    .run = nouveau_bind_job_run,
> > > > > > > > > +    .free = nouveau_bind_job_free,
> > > > > > > > > +};
> > > > > > > > > +
> > > > > > > > > +static int
> > > > > > > > > +bind_job_op_from_uop(struct bind_job_op **pop,
> > > > > > > > > +             struct drm_nouveau_vm_bind_op *uop)
> > > > > > > > > +{
> > > > > > > > > +    struct bind_job_op *op;
> > > > > > > > > +
> > > > > > > > > +    op = *pop = kzalloc(sizeof(*op), GFP_KERNEL);
> > > > > > > > > +    if (!op)
> > > > > > > > > +        return -ENOMEM;
> > > > > > > > > +
> > > > > > > > > +    op->op = uop->op;
> > > > > > > > > +    op->flags = uop->flags;
> > > > > > > > > +    op->va.addr = uop->addr;
> > > > > > > > > +    op->va.range = uop->range;
> > > > > > > > > +
> > > > > > > > > +    if (op->op == DRM_NOUVEAU_VM_BIND_OP_MAP) {
> > > > > > > > > +        op->gem.handle = uop->handle;
> > > > > > > > > +        op->gem.offset = uop->bo_offset;
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    return 0;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static void
> > > > > > > > > +bind_job_ops_free(struct list_head *ops)
> > > > > > > > > +{
> > > > > > > > > +    struct bind_job_op *op, *next;
> > > > > > > > > +
> > > > > > > > > +    list_for_each_op_safe(op, next, ops) {
> > > > > > > > > +        list_del(&op->entry);
> > > > > > > > > +        kfree(op);
> > > > > > > > > +    }
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +int
> > > > > > > > > +nouveau_bind_job_init(struct nouveau_bind_job **pjob,
> > > > > > > > > +              struct nouveau_exec_bind *bind)
> > > > > > > > > +{
> > > > > > > > > +    struct nouveau_bind_job *job;
> > > > > > > > > +    struct bind_job_op *op;
> > > > > > > > > +    int i, ret;
> > > > > > > > > +
> > > > > > > > > +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
> > > > > > > > > +    if (!job)
> > > > > > > > > +        return -ENOMEM;
> > > > > > > > > +
> > > > > > > > > +    INIT_LIST_HEAD(&job->ops);
> > > > > > > > > +
> > > > > > > > > +    for (i = 0; i < bind->op.count; i++) {
> > > > > > > > > +        ret = bind_job_op_from_uop(&op, &bind->op.s[i]);
> > > > > > > > > +        if (ret)
> > > > > > > > > +            goto err_free;
> > > > > > > > > +
> > > > > > > > > +        list_add_tail(&op->entry, &job->ops);
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    job->base.sync = !(bind->flags & DRM_NOUVEAU_VM_BIND_RUN_ASYNC);
> > > > > > > > > +    job->base.ops = &nouveau_bind_job_ops;
> > > > > > > > > +
> > > > > > > > > +    ret = nouveau_base_job_init(&job->base, &bind->base);
> > > > > > > > > +    if (ret)
> > > > > > > > > +        goto err_free;
> > > > > > > > > +
> > > > > > > > > +    return 0;
> > > > > > > > > +
> > > > > > > > > +err_free:
> > > > > > > > > +    bind_job_ops_free(&job->ops);
> > > > > > > > > +    kfree(job);
> > > > > > > > > +    *pjob = NULL;
> > > > > > > > > +
> > > > > > > > > +    return ret;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static int
> > > > > > > > > +sync_find_fence(struct nouveau_job *job,
> > > > > > > > > +        struct drm_nouveau_sync *sync,
> > > > > > > > > +        struct dma_fence **fence)
> > > > > > > > > +{
> > > > > > > > > +    u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
> > > > > > > > > +    u64 point = 0;
> > > > > > > > > +    int ret;
> > > > > > > > > +
> > > > > > > > > +    if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
> > > > > > > > > +        stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
> > > > > > > > > +        return -EOPNOTSUPP;
> > > > > > > > > +
> > > > > > > > > +    if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
> > > > > > > > > +        point = sync->timeline_value;
> > > > > > > > > +
> > > > > > > > > +    ret = drm_syncobj_find_fence(job->file_priv,
> > > > > > > > > +                     sync->handle, point,
> > > > > > > > > +                     sync->flags, fence);
> > > > > > > > > +    if (ret)
> > > > > > > > > +        return ret;
> > > > > > > > > +
> > > > > > > > > +    return 0;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static int
> > > > > > > > > +exec_job_binds_wait(struct nouveau_job *job)
> > > > > > > > > +{
> > > > > > > > > +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> > > > > > > > > +    struct nouveau_cli *cli = exec_job->base.cli;
> > > > > > > > > +    struct nouveau_sched_entity *bind_entity = &cli->sched_entity;
> > > > > > > > > +    signed long ret;
> > > > > > > > > +    int i;
> > > > > > > > > +
> > > > > > > > > +    for (i = 0; i < job->in_sync.count; i++) {
> > > > > > > > > +        struct nouveau_job *it;
> > > > > > > > > +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
> > > > > > > > > +        struct dma_fence *fence;
> > > > > > > > > +        bool found;
> > > > > > > > > +
> > > > > > > > > +        ret = sync_find_fence(job, sync, &fence);
> > > > > > > > > +        if (ret)
> > > > > > > > > +            return ret;
> > > > > > > > > +
> > > > > > > > > +        mutex_lock(&bind_entity->job.mutex);
> > > > > > > > > +        found = false;
> > > > > > > > > +        list_for_each_entry(it, &bind_entity->job.list, head) {
> > > > > > > > > +            if (fence == it->done_fence) {
> > > > > > > > > +                found = true;
> > > > > > > > > +                break;
> > > > > > > > > +            }
> > > > > > > > > +        }
> > > > > > > > > +        mutex_unlock(&bind_entity->job.mutex);
> > > > > > > > > +
> > > > > > > > > +        /* If the fence is not from a VM_BIND job, don't wait for it. */
> > > > > > > > > +        if (!found)
> > > > > > > > > +            continue;
> > > > > > > > > +
> > > > > > > > > +        ret = dma_fence_wait_timeout(fence, true,
> > > > > > > > > +                         msecs_to_jiffies(500));
> > > > > > > > > +        if (ret < 0)
> > > > > > > > > +            return ret;
> > > > > > > > > +        else if (ret == 0)
> > > > > > > > > +            return -ETIMEDOUT;
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    return 0;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +int
> > > > > > > > > +nouveau_exec_job_submit(struct nouveau_job *job)
> > > > > > > > > +{
> > > > > > > > > +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> > > > > > > > > +    struct nouveau_cli *cli = exec_job->base.cli;
> > > > > > > > > +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(cli);
> > > > > > > > > +    struct drm_exec *exec = &job->exec;
> > > > > > > > > +    struct drm_gem_object *obj;
> > > > > > > > > +    unsigned long index;
> > > > > > > > > +    int ret;
> > > > > > > > > +
> > > > > > > > > +    ret = exec_job_binds_wait(job);
> > > > > > > > > +    if (ret)
> > > > > > > > > +        return ret;
> > > > > > > > > +
> > > > > > > > > +    nouveau_uvmm_lock(uvmm);
> > > > > > > > > +    drm_exec_while_not_all_locked(exec) {
> > > > > > > > > +        struct drm_gpuva *va;
> > > > > > > > > +
> > > > > > > > > +        drm_gpuva_for_each_va(va, &uvmm->umgr) {
> > > > > > > > > +            ret = drm_exec_prepare_obj(exec, va->gem.obj, 1);
> > > > > > > > > +            drm_exec_break_on_contention(exec);
> > > > > > > > > +            if (ret)
> > > > > > > > > +                return ret;
> > > > > > > > > +        }
> > > > > > > > > +    }
> > > > > > > > > +    nouveau_uvmm_unlock(uvmm);
> > > > > > > > > +
> > > > > > > > > +    drm_exec_for_each_locked_object(exec, index, obj) {
> > > > > > > > > +        struct dma_resv *resv = obj->resv;
> > > > > > > > > +        struct nouveau_bo *nvbo = nouveau_gem_object(obj);
> > > > > > > > > +
> > > > > > > > > +        ret = nouveau_bo_validate(nvbo, true, false);
> > > > > > > > > +        if (ret)
> > > > > > > > > +            return ret;
> > > > > > > > > +
> > > > > > > > > +        dma_resv_add_fence(resv, job->done_fence, DMA_RESV_USAGE_WRITE);
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    return 0;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static struct dma_fence *
> > > > > > > > > +nouveau_exec_job_run(struct nouveau_job *job)
> > > > > > > > > +{
> > > > > > > > > +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> > > > > > > > > +    struct nouveau_fence *fence;
> > > > > > > > > +    int i, ret;
> > > > > > > > > +
> > > > > > > > > +    ret = nouveau_dma_wait(job->chan, exec_job->push.count + 1, 16);
> > > > > > > > > +    if (ret) {
> > > > > > > > > +        NV_PRINTK(err, job->cli, "nv50cal_space: %d\n", ret);
> > > > > > > > > +        return ERR_PTR(ret);
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    for (i = 0; i < exec_job->push.count; i++) {
> > > > > > > > > +        nv50_dma_push(job->chan, exec_job->push.s[i].va,
> > > > > > > > > +                  exec_job->push.s[i].va_len);
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    ret = nouveau_fence_new(job->chan, false, &fence);
> > > > > > > > > +    if (ret) {
> > > > > > > > > +        NV_PRINTK(err, job->cli, "error fencing pushbuf: %d\n", ret);
> > > > > > > > > +        WIND_RING(job->chan);
> > > > > > > > > +        return ERR_PTR(ret);
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    return &fence->base;
> > > > > > > > > +}
> > > > > > > > > +static void
> > > > > > > > > +nouveau_exec_job_free(struct nouveau_job *job)
> > > > > > > > > +{
> > > > > > > > > +    struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
> > > > > > > > > +
> > > > > > > > > +    nouveau_base_job_free(job);
> > > > > > > > > +
> > > > > > > > > +    kfree(exec_job->push.s);
> > > > > > > > > +    kfree(exec_job);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static struct nouveau_job_ops nouveau_exec_job_ops = {
> > > > > > > > > +    .submit = nouveau_exec_job_submit,
> > > > > > > > > +    .run = nouveau_exec_job_run,
> > > > > > > > > +    .free = nouveau_exec_job_free,
> > > > > > > > > +};
> > > > > > > > > +
> > > > > > > > > +int
> > > > > > > > > +nouveau_exec_job_init(struct nouveau_exec_job **pjob,
> > > > > > > > > +              struct nouveau_exec *exec)
> > > > > > > > > +{
> > > > > > > > > +    struct nouveau_exec_job *job;
> > > > > > > > > +    int ret;
> > > > > > > > > +
> > > > > > > > > +    job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
> > > > > > > > > +    if (!job)
> > > > > > > > > +        return -ENOMEM;
> > > > > > > > > +
> > > > > > > > > +    job->push.count = exec->push.count;
> > > > > > > > > +    job->push.s = kmemdup(exec->push.s,
> > > > > > > > > +                  sizeof(*exec->push.s) *
> > > > > > > > > +                  exec->push.count,
> > > > > > > > > +                  GFP_KERNEL);
> > > > > > > > > +    if (!job->push.s) {
> > > > > > > > > +        ret = -ENOMEM;
> > > > > > > > > +        goto err_free_job;
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    job->base.ops = &nouveau_exec_job_ops;
> > > > > > > > > +    ret = nouveau_base_job_init(&job->base, &exec->base);
> > > > > > > > > +    if (ret)
> > > > > > > > > +        goto err_free_pushs;
> > > > > > > > > +
> > > > > > > > > +    return 0;
> > > > > > > > > +
> > > > > > > > > +err_free_pushs:
> > > > > > > > > +    kfree(job->push.s);
> > > > > > > > > +err_free_job:
> > > > > > > > > +    kfree(job);
> > > > > > > > > +    *pjob = NULL;
> > > > > > > > > +
> > > > > > > > > +    return ret;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void nouveau_job_fini(struct nouveau_job *job)
> > > > > > > > > +{
> > > > > > > > > +    dma_fence_put(job->done_fence);
> > > > > > > > > +    drm_sched_job_cleanup(&job->base);
> > > > > > > > > +    job->ops->free(job);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static int
> > > > > > > > > +nouveau_job_add_deps(struct nouveau_job *job)
> > > > > > > > > +{
> > > > > > > > > +    struct dma_fence *in_fence = NULL;
> > > > > > > > > +    int ret, i;
> > > > > > > > > +
> > > > > > > > > +    for (i = 0; i < job->in_sync.count; i++) {
> > > > > > > > > +        struct drm_nouveau_sync *sync = &job->in_sync.s[i];
> > > > > > > > > +
> > > > > > > > > +        ret = sync_find_fence(job, sync, &in_fence);
> > > > > > > > > +        if (ret) {
> > > > > > > > > +            NV_PRINTK(warn, job->cli,
> > > > > > > > > +                  "Failed to find syncobj (-> in): handle=%d\n",
> > > > > > > > > +                  sync->handle);
> > > > > > > > > +            return ret;
> > > > > > > > > +        }
> > > > > > > > > +
> > > > > > > > > +        ret = drm_sched_job_add_dependency(&job->base, in_fence);
> > > > > > > > > +        if (ret)
> > > > > > > > > +            return ret;
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    return 0;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static int
> > > > > > > > > +nouveau_job_fence_attach(struct nouveau_job *job, struct dma_fence
> > > > > > > > > *fence)
> > > > > > > > > +{
> > > > > > > > > +    struct drm_syncobj *out_sync;
> > > > > > > > > +    int i;
> > > > > > > > > +
> > > > > > > > > +    for (i = 0; i < job->out_sync.count; i++) {
> > > > > > > > > +        struct drm_nouveau_sync *sync = &job->out_sync.s[i];
> > > > > > > > > +        u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
> > > > > > > > > +
> > > > > > > > > +        if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
> > > > > > > > > +            stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
> > > > > > > > > +            return -EOPNOTSUPP;
> > > > > > > > > +
> > > > > > > > > +        out_sync = drm_syncobj_find(job->file_priv, sync->handle);
> > > > > > > > > +        if (!out_sync) {
> > > > > > > > > +            NV_PRINTK(warn, job->cli,
> > > > > > > > > +                  "Failed to find syncobj (-> out): handle=%d\n",
> > > > > > > > > +                  sync->handle);
> > > > > > > > > +            return -ENOENT;
> > > > > > > > > +        }
> > > > > > > > > +
> > > > > > > > > +        if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ) {
> > > > > > > > > +            struct dma_fence_chain *chain;
> > > > > > > > > +
> > > > > > > > > +            chain = dma_fence_chain_alloc();
> > > > > > > > > +            if (!chain) {
> > > > > > > > > +                drm_syncobj_put(out_sync);
> > > > > > > > > +                return -ENOMEM;
> > > > > > > > > +            }
> > > > > > > > > +
> > > > > > > > > +            drm_syncobj_add_point(out_sync, chain, fence,
> > > > > > > > > +                          sync->timeline_value);
> > > > > > > > > +        } else {
> > > > > > > > > +            drm_syncobj_replace_fence(out_sync, fence);
> > > > > > > > > +        }
> > > > > > > > > +
> > > > > > > > > +        drm_syncobj_put(out_sync);
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    return 0;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static struct dma_fence *
> > > > > > > > > +nouveau_job_run(struct nouveau_job *job)
> > > > > > > > > +{
> > > > > > > > > +    return job->ops->run(job);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static int
> > > > > > > > > +nouveau_job_run_sync(struct nouveau_job *job)
> > > > > > > > > +{
> > > > > > > > > +    struct dma_fence *fence;
> > > > > > > > > +    int ret;
> > > > > > > > > +
> > > > > > > > > +    fence = nouveau_job_run(job);
> > > > > > > > > +    if (IS_ERR(fence)) {
> > > > > > > > > +        return PTR_ERR(fence);
> > > > > > > > > +    } else if (fence) {
> > > > > > > > > +        ret = dma_fence_wait(fence, true);
> > > > > > > > > +        if (ret)
> > > > > > > > > +            return ret;
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    dma_fence_signal(job->done_fence);
> > > > > > > > > +
> > > > > > > > > +    return 0;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +int
> > > > > > > > > +nouveau_job_submit(struct nouveau_job *job)
> > > > > > > > > +{
> > > > > > > > > +    struct nouveau_sched_entity *entity =
> > > > > > > > > to_nouveau_sched_entity(job->base.entity);
> > > > > > > > > +    int ret;
> > > > > > > > > +
> > > > > > > > > +    drm_exec_init(&job->exec, true);
> > > > > > > > > +
> > > > > > > > > +    ret = nouveau_job_add_deps(job);
> > > > > > > > > +    if (ret)
> > > > > > > > > +        goto out;
> > > > > > > > > +
> > > > > > > > > +    drm_sched_job_arm(&job->base);
> > > > > > > > > +    job->done_fence = dma_fence_get(&job->base.s_fence->finished);
> > > > > > > > > +
> > > > > > > > > +    ret = nouveau_job_fence_attach(job, job->done_fence);
> > > > > > > > > +    if (ret)
> > > > > > > > > +        goto out;
> > > > > > > > > +
> > > > > > > > > +    if (job->ops->submit) {
> > > > > > > > > +        ret = job->ops->submit(job);
> > > > > > > > > +        if (ret)
> > > > > > > > > +            goto out;
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +    if (job->sync) {
> > > > > > > > > +        drm_exec_fini(&job->exec);
> > > > > > > > > +
> > > > > > > > > +        /* We're requested to run a synchronous job, hence don't push
> > > > > > > > > +         * the job, bypassing the job scheduler, and execute the jobs
> > > > > > > > > +         * run() function right away.
> > > > > > > > > +         *
> > > > > > > > > +         * As a consequence of bypassing the job scheduler we need to
> > > > > > > > > > +         * handle fencing and job cleanup ourselves.
> > > > > > > > > +         */
> > > > > > > > > +        ret = nouveau_job_run_sync(job);
> > > > > > > > > +
> > > > > > > > > +        /* If the job fails, the caller will do the cleanup for us. */
> > > > > > > > > +        if (!ret)
> > > > > > > > > +            nouveau_job_fini(job);
> > > > > > > > > +
> > > > > > > > > +        return ret;
> > > > > > > > > +    } else {
> > > > > > > > > +        mutex_lock(&entity->job.mutex);
> > > > > > > > > +        drm_sched_entity_push_job(&job->base);
> > > > > > > > > +        list_add_tail(&job->head, &entity->job.list);
> > > > > > > > > +        mutex_unlock(&entity->job.mutex);
> > > > > > > > > +    }
> > > > > > > > > +
> > > > > > > > > +out:
> > > > > > > > > +    drm_exec_fini(&job->exec);
> > > > > > > > > +    return ret;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static struct dma_fence *
> > > > > > > > > +nouveau_sched_run_job(struct drm_sched_job *sched_job)
> > > > > > > > > +{
> > > > > > > > > +    struct nouveau_job *job = to_nouveau_job(sched_job);
> > > > > > > > > +
> > > > > > > > > +    return nouveau_job_run(job);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static enum drm_gpu_sched_stat
> > > > > > > > > +nouveau_sched_timedout_job(struct drm_sched_job *sched_job)
> > > > > > > > > +{
> > > > > > > > > +    struct nouveau_job *job = to_nouveau_job(sched_job);
> > > > > > > > > +    struct nouveau_channel *chan = job->chan;
> > > > > > > > > +
> > > > > > > > > +    if (unlikely(!atomic_read(&chan->killed)))
> > > > > > > > > +        nouveau_channel_kill(chan);
> > > > > > > > > +
> > > > > > > > > +    NV_PRINTK(warn, job->cli, "job timeout, channel %d killed!\n",
> > > > > > > > > +          chan->chid);
> > > > > > > > > +
> > > > > > > > > +    nouveau_sched_entity_fini(job->entity);
> > > > > > > > > +
> > > > > > > > > +    return DRM_GPU_SCHED_STAT_ENODEV;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static void
> > > > > > > > > +nouveau_sched_free_job(struct drm_sched_job *sched_job)
> > > > > > > > > +{
> > > > > > > > > +    struct nouveau_job *job = to_nouveau_job(sched_job);
> > > > > > > > > +    struct nouveau_sched_entity *entity = job->entity;
> > > > > > > > > +
> > > > > > > > > +    mutex_lock(&entity->job.mutex);
> > > > > > > > > +    list_del(&job->head);
> > > > > > > > > +    mutex_unlock(&entity->job.mutex);
> > > > > > > > > +
> > > > > > > > > +    nouveau_job_fini(job);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
> > > > > > > > > +                  struct drm_gpu_scheduler *sched)
> > > > > > > > > +{
> > > > > > > > > +
> > > > > > > > > +    INIT_LIST_HEAD(&entity->job.list);
> > > > > > > > > +    mutex_init(&entity->job.mutex);
> > > > > > > > > +
> > > > > > > > > +    return drm_sched_entity_init(&entity->base,
> > > > > > > > > +                     DRM_SCHED_PRIORITY_NORMAL,
> > > > > > > > > +                     &sched, 1, NULL);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void
> > > > > > > > > +nouveau_sched_entity_fini(struct nouveau_sched_entity *entity)
> > > > > > > > > +{
> > > > > > > > > +    drm_sched_entity_destroy(&entity->base);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static const struct drm_sched_backend_ops nouveau_sched_ops = {
> > > > > > > > > +    .run_job = nouveau_sched_run_job,
> > > > > > > > > +    .timedout_job = nouveau_sched_timedout_job,
> > > > > > > > > +    .free_job = nouveau_sched_free_job,
> > > > > > > > > +};
> > > > > > > > > +
> > > > > > > > > +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
> > > > > > > > > +               struct nouveau_drm *drm)
> > > > > > > > > +{
> > > > > > > > > +    long job_hang_limit =
> > > > > > > > > msecs_to_jiffies(NOUVEAU_SCHED_JOB_TIMEOUT_MS);
> > > > > > > > > +
> > > > > > > > > +    return drm_sched_init(sched, &nouveau_sched_ops,
> > > > > > > > > +                  NOUVEAU_SCHED_HW_SUBMISSIONS, 0, job_hang_limit,
> > > > > > > > > +                  NULL, NULL, "nouveau", drm->dev->dev);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +void nouveau_sched_fini(struct drm_gpu_scheduler *sched)
> > > > > > > > > +{
> > > > > > > > > +    drm_sched_fini(sched);
> > > > > > > > > +}
> > > > > > > > > diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h
> > > > > > > > > b/drivers/gpu/drm/nouveau/nouveau_sched.h
> > > > > > > > > new file mode 100644
> > > > > > > > > index 000000000000..7fc5b7eea810
> > > > > > > > > --- /dev/null
> > > > > > > > > +++ b/drivers/gpu/drm/nouveau/nouveau_sched.h
> > > > > > > > > @@ -0,0 +1,98 @@
> > > > > > > > > +// SPDX-License-Identifier: MIT
> > > > > > > > > +
> > > > > > > > > +#ifndef NOUVEAU_SCHED_H
> > > > > > > > > +#define NOUVEAU_SCHED_H
> > > > > > > > > +
> > > > > > > > > +#include <linux/types.h>
> > > > > > > > > +
> > > > > > > > > +#include <drm/drm_exec.h>
> > > > > > > > > +#include <drm/gpu_scheduler.h>
> > > > > > > > > +
> > > > > > > > > +#include "nouveau_drv.h"
> > > > > > > > > +#include "nouveau_exec.h"
> > > > > > > > > +
> > > > > > > > > +#define to_nouveau_job(sched_job)        \
> > > > > > > > > +        container_of((sched_job), struct nouveau_job, base)
> > > > > > > > > +
> > > > > > > > > +#define to_nouveau_exec_job(job)        \
> > > > > > > > > +        container_of((job), struct nouveau_exec_job, base)
> > > > > > > > > +
> > > > > > > > > +#define to_nouveau_bind_job(job)        \
> > > > > > > > > +        container_of((job), struct nouveau_bind_job, base)
> > > > > > > > > +
> > > > > > > > > +struct nouveau_job {
> > > > > > > > > +    struct drm_sched_job base;
> > > > > > > > > +    struct list_head head;
> > > > > > > > > +
> > > > > > > > > +    struct nouveau_sched_entity *entity;
> > > > > > > > > +
> > > > > > > > > +    struct drm_file *file_priv;
> > > > > > > > > +    struct nouveau_cli *cli;
> > > > > > > > > +    struct nouveau_channel *chan;
> > > > > > > > > +
> > > > > > > > > +    struct drm_exec exec;
> > > > > > > > > +    struct dma_fence *done_fence;
> > > > > > > > > +
> > > > > > > > > +    bool sync;
> > > > > > > > > +
> > > > > > > > > +    struct {
> > > > > > > > > +        struct drm_nouveau_sync *s;
> > > > > > > > > +        u32 count;
> > > > > > > > > +    } in_sync;
> > > > > > > > > +
> > > > > > > > > +    struct {
> > > > > > > > > +        struct drm_nouveau_sync *s;
> > > > > > > > > +        u32 count;
> > > > > > > > > +    } out_sync;
> > > > > > > > > +
> > > > > > > > > +    struct nouveau_job_ops {
> > > > > > > > > +        int (*submit)(struct nouveau_job *);
> > > > > > > > > +        struct dma_fence *(*run)(struct nouveau_job *);
> > > > > > > > > +        void (*free)(struct nouveau_job *);
> > > > > > > > > +    } *ops;
> > > > > > > > > +};
> > > > > > > > > +
> > > > > > > > > +struct nouveau_exec_job {
> > > > > > > > > +    struct nouveau_job base;
> > > > > > > > > +
> > > > > > > > > +    struct {
> > > > > > > > > +        struct drm_nouveau_exec_push *s;
> > > > > > > > > +        u32 count;
> > > > > > > > > +    } push;
> > > > > > > > > +};
> > > > > > > > > +
> > > > > > > > > +struct nouveau_bind_job {
> > > > > > > > > +    struct nouveau_job base;
> > > > > > > > > +
> > > > > > > > > +    /* struct bind_job_op */
> > > > > > > > > +    struct list_head ops;
> > > > > > > > > +};
> > > > > > > > > +
> > > > > > > > > +int nouveau_bind_job_init(struct nouveau_bind_job **job,
> > > > > > > > > +              struct nouveau_exec_bind *bind);
> > > > > > > > > +int nouveau_exec_job_init(struct nouveau_exec_job **job,
> > > > > > > > > +              struct nouveau_exec *exec);
> > > > > > > > > +
> > > > > > > > > +int nouveau_job_submit(struct nouveau_job *job);
> > > > > > > > > +void nouveau_job_fini(struct nouveau_job *job);
> > > > > > > > > +
> > > > > > > > > +#define to_nouveau_sched_entity(entity)        \
> > > > > > > > > +        container_of((entity), struct nouveau_sched_entity, base)
> > > > > > > > > +
> > > > > > > > > +struct nouveau_sched_entity {
> > > > > > > > > +    struct drm_sched_entity base;
> > > > > > > > > +    struct {
> > > > > > > > > +        struct list_head list;
> > > > > > > > > +        struct mutex mutex;
> > > > > > > > > +    } job;
> > > > > > > > > +};
> > > > > > > > > +
> > > > > > > > > +int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
> > > > > > > > > +                  struct drm_gpu_scheduler *sched);
> > > > > > > > > +void nouveau_sched_entity_fini(struct nouveau_sched_entity *entity);
> > > > > > > > > +
> > > > > > > > > +int nouveau_sched_init(struct drm_gpu_scheduler *sched,
> > > > > > > > > +               struct nouveau_drm *drm);
> > > > > > > > > +void nouveau_sched_fini(struct drm_gpu_scheduler *sched);
> > > > > > > > > +
> > > > > > > > > +#endif
> > > > > > > > 
> > > > > > > 
> > > > > > 
> > > > > 
> > > > 
> > > 
> > 
>
Boris Brezillon Jan. 20, 2023, 10:08 a.m. UTC | #12
On Thu, 19 Jan 2023 04:58:48 +0000
Matthew Brost <matthew.brost@intel.com> wrote:

> > For the ops structures the drm_gpuva_manager allocates for reporting the
> > split/merge steps back to the driver I have ideas to entirely avoid
> > allocations, which also is a good thing in respect of Christians feedback
> > regarding the huge amount of mapping requests some applications seem to
> > generate.
> >  
> 
> It should be fine to have allocations to report the split/merge step as
> this step should be before a dma-fence is published, but yea if possible
> to avoid extra allocs as that is always better.
> 
> Also BTW, great work on drm_gpuva_manager too. We will most likely
> pick this up in Xe rather than open coding all of this as we currently
> do. We should probably start the port to this soon so we can contribute
> to the implementation and get both of our drivers upstream sooner.

Also quite interested in using this drm_gpuva_manager for pancsf, since
I've been open-coding something similar. Didn't have the
gpuva_region concept to make sure VA mapping/unmapping requests don't
go outside a pre-reserved region, but it seems to automate some
of the stuff I've been doing quite nicely.
Boris Brezillon Jan. 20, 2023, 10:22 a.m. UTC | #13
On Thu, 19 Jan 2023 16:38:06 +0000
Matthew Brost <matthew.brost@intel.com> wrote:

> On Thu, Jan 19, 2023 at 04:36:43PM +0100, Danilo Krummrich wrote:
> > On 1/19/23 05:58, Matthew Brost wrote:  
> > > On Thu, Jan 19, 2023 at 04:44:23AM +0100, Danilo Krummrich wrote:  
> > > > On 1/18/23 21:37, Thomas Hellström (Intel) wrote:  
> > > > > 
> > > > > On 1/18/23 07:12, Danilo Krummrich wrote:  
> > > > > > This commit provides the implementation for the new uapi motivated by the
> > > > > > Vulkan API. It allows user mode drivers (UMDs) to:
> > > > > > 
> > > > > > 1) Initialize a GPU virtual address (VA) space via the new
> > > > > >      DRM_IOCTL_NOUVEAU_VM_INIT ioctl for UMDs to specify the portion of VA
> > > > > >      space managed by the kernel and userspace, respectively.
> > > > > > 
> > > > > > 2) Allocate and free a VA space region as well as bind and unbind memory
> > > > > >      to the GPUs VA space via the new DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
> > > > > >      UMDs can request the named operations to be processed either
> > > > > >      synchronously or asynchronously. It supports DRM syncobjs
> > > > > >      (incl. timelines) as synchronization mechanism. The management of the
> > > > > >      GPU VA mappings is implemented with the DRM GPU VA manager.
> > > > > > 
> > > > > > 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl. The
> > > > > >      execution happens asynchronously. It supports DRM syncobj (incl.
> > > > > >      timelines) as synchronization mechanism. DRM GEM object locking is
> > > > > >      handled with drm_exec.
> > > > > > 
> > > > > > Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC, use the DRM
> > > > > > GPU scheduler for the asynchronous paths.
> > > > > > 
> > > > > > Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> > > > > > ---
> > > > > >    Documentation/gpu/driver-uapi.rst       |   3 +
> > > > > >    drivers/gpu/drm/nouveau/Kbuild          |   2 +
> > > > > >    drivers/gpu/drm/nouveau/Kconfig         |   2 +
> > > > > >    drivers/gpu/drm/nouveau/nouveau_abi16.c |  16 +
> > > > > >    drivers/gpu/drm/nouveau/nouveau_abi16.h |   1 +
> > > > > >    drivers/gpu/drm/nouveau/nouveau_drm.c   |  23 +-
> > > > > >    drivers/gpu/drm/nouveau/nouveau_drv.h   |   9 +-
> > > > > >    drivers/gpu/drm/nouveau/nouveau_exec.c  | 310 ++++++++++
> > > > > >    drivers/gpu/drm/nouveau/nouveau_exec.h  |  55 ++
> > > > > >    drivers/gpu/drm/nouveau/nouveau_sched.c | 780 ++++++++++++++++++++++++
> > > > > >    drivers/gpu/drm/nouveau/nouveau_sched.h |  98 +++
> > > > > >    11 files changed, 1295 insertions(+), 4 deletions(-)
> > > > > >    create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.c
> > > > > >    create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.h
> > > > > >    create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.c
> > > > > >    create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.h  
> > > > > ...  
> > > > > > 
> > > > > > +static struct dma_fence *
> > > > > > +nouveau_bind_job_run(struct nouveau_job *job)
> > > > > > +{
> > > > > > +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
> > > > > > +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(job->cli);
> > > > > > +    struct bind_job_op *op;
> > > > > > +    int ret = 0;
> > > > > > +  
> > > > > 
> > > > > I was looking at how nouveau does the async binding compared to how xe
> > > > > does it.
> > > > > It looks to me that this function being a scheduler run_job callback is
> > > > > the main part of the VM_BIND dma-fence signalling critical section for
> > > > > the job's done_fence and if so, needs to be annotated as such?  
> > > > 
> > > > Yes, that's the case.
> > > >   
> > > > > 
> > > > > For example nouveau_uvma_region_new allocates memory, which is not
> > > > > allowed if in a dma_fence signalling critical section and the locking
> > > > > also looks suspicious?  
> > > > 
> > > > Thanks for pointing this out, I missed that somehow.
> > > > 
> > > > I will change it to pre-allocate new regions, mappings and page tables
> > > > within the job's submit() function.
> > > >   
> > > 
> > > > Yea that's what we basically do in Xe: in the IOCTL step allocate all the
> > > > backing store for new page tables, populate new page tables (these are
> > > > not yet visible in the page table structure), and in the last step, which is
> > > > executed after all the dependencies are satisfied, program all the leaf
> > > > entries making the new binding visible.
> > > 
> > > > We have screwed this up by deferring most of the IOCTL to a worker but
> > > > will fix this one way or another soon - get rid of worker or
> > > introduce a type of sync that is signaled after the worker + publish the
> > > dma-fence in the worker. I'd like to close on this one soon.  
> > > > For the ops structures the drm_gpuva_manager allocates for reporting the
> > > > split/merge steps back to the driver I have ideas to entirely avoid
> > > > allocations, which also is a good thing in respect of Christians feedback
> > > > regarding the huge amount of mapping requests some applications seem to
> > > > generate.
> > > >   
> > > 
> > > It should be fine to have allocations to report the split/merge step as
> > > this step should be before a dma-fence is published, but yea if possible
> > > to avoid extra allocs as that is always better.  
> > 
> > I think we can't really ask for the split/merge steps before actually
> > running the job, since it requires the particular VA space not to change
> > while performing those operations.
> > 
> > E.g. if we'd run the split/merge steps at job submit() time the underlying
> > VA space could be changed by other bind jobs executing before this one,
> > which would make the calculated split/merge steps obsolete and wrong.
> >   
> 
> Hmm, maybe I'm not understanding this implementation, admittedly I
> haven't studied the gpuva manager code in detail.
> 
> Let me explain what we are doing in Xe.
> 
> Map 0x0000 - 0x3000 -> this resolves into 1 bind operation and 1 VMA
> Unmap 0x1000-0x2000 -> this resolves into 1 unbind and 2 rebind operations
> 
> 1. unbind 0x0000-0x3000 -> destroy old VMA
> 2. rebind 0x0000-0x1000 -> new VMA
> 3. rebind 0x2000-0x3000 -> new VMA
> 
> All of the above steps resolving the operations can be done in the IOCTL
> phase and VM's VMA structure is also updated. When the dependencies
> are resolved the actual bindings are done on the GPU. We use the BO's
> dma-resv slots to ensure there is never a window where 0x0000-0x1000 and
> 0x2000-0x3000 are not mapped with respect to execs (I forget the exact
> details of how we do this but if you want to know I'll explain further).
> 

Ok, so I've been contemplating the idea of pre-reserving memory for any
future page-table updates, so I can guarantee the bind/unbind op in
->run_job() never fails (that's actually made more complicated in my
case, because we don't directly control the page table updates, but
defer that to the iommu/iopagetbl framework which does the allocation,
so I didn't really go as far as you did). But with bind ops happening in
a queue with dependencies to wait on, guessing what the page-table will
look like is a bit challenging. Sure, we can pre-allocate pages for all
levels needed to reach the leaf node(s) we're trying to insert or plan
for the worst case scenario in case of 2MB -> 4K block splits for
partial unmaps, but it sounds like a lot of memory reservation,
especially if we get a lot of bind requests queued. Just curious to hear
how you solved that.
Matthew Brost Jan. 22, 2023, 5:48 p.m. UTC | #14
On Fri, Jan 20, 2023 at 11:22:45AM +0100, Boris Brezillon wrote:
> On Thu, 19 Jan 2023 16:38:06 +0000
> Matthew Brost <matthew.brost@intel.com> wrote:
> 
> > On Thu, Jan 19, 2023 at 04:36:43PM +0100, Danilo Krummrich wrote:
> > > On 1/19/23 05:58, Matthew Brost wrote:  
> > > > On Thu, Jan 19, 2023 at 04:44:23AM +0100, Danilo Krummrich wrote:  
> > > > > On 1/18/23 21:37, Thomas Hellström (Intel) wrote:  
> > > > > > 
> > > > > > On 1/18/23 07:12, Danilo Krummrich wrote:  
> > > > > > > This commit provides the implementation for the new uapi motivated by the
> > > > > > > Vulkan API. It allows user mode drivers (UMDs) to:
> > > > > > > 
> > > > > > > 1) Initialize a GPU virtual address (VA) space via the new
> > > > > > >      DRM_IOCTL_NOUVEAU_VM_INIT ioctl for UMDs to specify the portion of VA
> > > > > > >      space managed by the kernel and userspace, respectively.
> > > > > > > 
> > > > > > > 2) Allocate and free a VA space region as well as bind and unbind memory
> > > > > > >      to the GPUs VA space via the new DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
> > > > > > >      UMDs can request the named operations to be processed either
> > > > > > >      synchronously or asynchronously. It supports DRM syncobjs
> > > > > > >      (incl. timelines) as synchronization mechanism. The management of the
> > > > > > >      GPU VA mappings is implemented with the DRM GPU VA manager.
> > > > > > > 
> > > > > > > 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl. The
> > > > > > >      execution happens asynchronously. It supports DRM syncobj (incl.
> > > > > > >      timelines) as synchronization mechanism. DRM GEM object locking is
> > > > > > >      handled with drm_exec.
> > > > > > > 
> > > > > > > Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC, use the DRM
> > > > > > > GPU scheduler for the asynchronous paths.
> > > > > > > 
> > > > > > > Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> > > > > > > ---
> > > > > > >    Documentation/gpu/driver-uapi.rst       |   3 +
> > > > > > >    drivers/gpu/drm/nouveau/Kbuild          |   2 +
> > > > > > >    drivers/gpu/drm/nouveau/Kconfig         |   2 +
> > > > > > >    drivers/gpu/drm/nouveau/nouveau_abi16.c |  16 +
> > > > > > >    drivers/gpu/drm/nouveau/nouveau_abi16.h |   1 +
> > > > > > >    drivers/gpu/drm/nouveau/nouveau_drm.c   |  23 +-
> > > > > > >    drivers/gpu/drm/nouveau/nouveau_drv.h   |   9 +-
> > > > > > >    drivers/gpu/drm/nouveau/nouveau_exec.c  | 310 ++++++++++
> > > > > > >    drivers/gpu/drm/nouveau/nouveau_exec.h  |  55 ++
> > > > > > >    drivers/gpu/drm/nouveau/nouveau_sched.c | 780 ++++++++++++++++++++++++
> > > > > > >    drivers/gpu/drm/nouveau/nouveau_sched.h |  98 +++
> > > > > > >    11 files changed, 1295 insertions(+), 4 deletions(-)
> > > > > > >    create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.c
> > > > > > >    create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.h
> > > > > > >    create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.c
> > > > > > >    create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.h  
> > > > > > ...  
> > > > > > > 
> > > > > > > +static struct dma_fence *
> > > > > > > +nouveau_bind_job_run(struct nouveau_job *job)
> > > > > > > +{
> > > > > > > +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
> > > > > > > +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(job->cli);
> > > > > > > +    struct bind_job_op *op;
> > > > > > > +    int ret = 0;
> > > > > > > +  
> > > > > > 
> > > > > > I was looking at how nouveau does the async binding compared to how xe
> > > > > > does it.
> > > > > > It looks to me that this function being a scheduler run_job callback is
> > > > > > the main part of the VM_BIND dma-fence signalling critical section for
> > > > > > the job's done_fence and if so, needs to be annotated as such?  
> > > > > 
> > > > > Yes, that's the case.
> > > > >   
> > > > > > 
> > > > > > For example nouveau_uvma_region_new allocates memory, which is not
> > > > > > allowed if in a dma_fence signalling critical section and the locking
> > > > > > also looks suspicious?  
> > > > > 
> > > > > Thanks for pointing this out, I missed that somehow.
> > > > > 
> > > > > I will change it to pre-allocate new regions, mappings and page tables
> > > > > within the job's submit() function.
> > > > >   
> > > > 
> > > > Yea that what we basically do in Xe, in the IOCTL step allocate all the
> > > > backing store for new page tables, populate new page tables (these are
> > > > not yet visible in the page table structure), and in last step which is
> > > > executed after all the dependencies are satified program all the leaf
> > > > entires making the new binding visible.
> > > > 
> > > > We screwed have this up by defering most of the IOCTL to a worker but
> > > > will fix this fix this one way or another soon - get rid of worker or
> > > > introduce a type of sync that is signaled after the worker + publish the
> > > > dma-fence in the worker. I'd like to close on this one soon.  
> > > > > For the ops structures the drm_gpuva_manager allocates for reporting the
> > > > > split/merge steps back to the driver I have ideas to entirely avoid
> > > > > allocations, which also is a good thing in respect of Christians feedback
> > > > > regarding the huge amount of mapping requests some applications seem to
> > > > > generate.
> > > > >   
> > > > 
> > > > It should be fine to have allocations to report the split/merge step as
> > > > this step should be before a dma-fence is published, but yea if possible
> > > > to avoid extra allocs as that is always better.  
> > > 
> > > I think we can't really ask for the split/merge steps before actually
> > > running the job, since it requires the particular VA space not to change
> > > while performing those operations.
> > > 
> > > E.g. if we'd run the split/merge steps at job submit() time the underlying
> > > VA space could be changed by other bind jobs executing before this one,
> > > which would make the calculated split/merge steps obsolete and wrong.
> > >   
> > 
> > Hmm, maybe I'm not understanding this implementation, admittedly I
> > haven't studied the gpuva manager code in detail.
> > 
> > Let me explain what we are doing in Xe.
> > 
> > Map 0x0000 - 0x3000 -> this resolves into 1 bind operation and 1 VMA
> > Unmap 0x1000-0x2000 -> this resolves into 1 unbind and 2 rebind operations
> > 
> > 1. unbind 0x0000-0x3000 -> destroy old VMA
> > 2. rebind 0x0000-0x1000 -> new VMA
> > 3. rebind 0x2000-0x3000 -> new VMA
> > 
> > All of the above steps resolving the operations can be done in the IOCTL
> > phase and VM's VMA structure is also updated. When the dependencies
> > are resolved the actual bindings are done on the GPU. We use the BO's
> > dma-resv slots to ensure there is never a window 0x0000-0x1000 and
> > 0x2000-0x3000 are never mapped with respect to execs (I forget the exact
> > details of how we do this but if you want to know I'll explain further).
> > 
> 
> Ok, so I've been contemplating the idea of pre-reserving memory for any
> future page-table updates, so I can guarantee the bind/unbind op in
> ->run_job() never fails (that's actually made more complicated in my
> case, because we don't directly control the page table updates, but
> defer that to the iommu/iopagetbl framework which does the allocation,
> so I didn't really go as far as you did). But with bind ops happening in
> a queue with dependencies to wait on, guessing what the page-table will
> look like is a bit challenging. Sure, we can pre-allocate pages for all
> levels needed to reach the leaf node(s) we're trying to insert or plan
> for the worst case scenario in case of 2MB -> 4K block splits for
> partial unmaps, but it sounds like a lot of memory reservation,
> especially if we get lot of bind requests queued. Just curious to hear
> how you solved that.

A few things here.

First the reason we always do an unbind and possibly 2 rebinds is
because we might be crossing a 1GB / 2MB page size boundary. In the
example I gave we could just unbind 0x1000-0x2000 and call it a day but
we'd have to build smarts into our code to know that is ok, which we
don't have at this time.

e.g. It wouldn't be ok to just do an unbind here, as the page size goes
from 2MB to 4k:

Map   0x000000-0x200000 -> 2MB page
Unmap 0x100000-0x200000 -> need 256 4k pages now

For simplicity we always do an unbind + 2 possible rebinds. Maybe we will
be a little smarter in the future if needed.

WRT the memory allocations, we haven't really thought about the
consequences of reserving memory for pending binds. My initial reaction
is I don't think this is an issue. Page table memory should be relatively
small compared to user memory so having a bunch queued up shouldn't
really be an issue, right? Going from 2MB -> 4k splits on either side is
just another 2 4k allocs on either side. Maybe I'm wrong, I just don't
see this being a major issue. If the user is abusing the VM bind interface,
e.g. allocating misaligned huge chunks of data, that is kinda on them.

Matt
Boris Brezillon Jan. 23, 2023, 10:01 a.m. UTC | #15
On Sun, 22 Jan 2023 17:48:37 +0000
Matthew Brost <matthew.brost@intel.com> wrote:

> On Fri, Jan 20, 2023 at 11:22:45AM +0100, Boris Brezillon wrote:
> > On Thu, 19 Jan 2023 16:38:06 +0000
> > Matthew Brost <matthew.brost@intel.com> wrote:
> >   
> > > On Thu, Jan 19, 2023 at 04:36:43PM +0100, Danilo Krummrich wrote:  
> > > > On 1/19/23 05:58, Matthew Brost wrote:    
> > > > > On Thu, Jan 19, 2023 at 04:44:23AM +0100, Danilo Krummrich wrote:    
> > > > > > On 1/18/23 21:37, Thomas Hellström (Intel) wrote:    
> > > > > > > 
> > > > > > > On 1/18/23 07:12, Danilo Krummrich wrote:    
> > > > > > > > This commit provides the implementation for the new uapi motivated by the
> > > > > > > > Vulkan API. It allows user mode drivers (UMDs) to:
> > > > > > > > 
> > > > > > > > 1) Initialize a GPU virtual address (VA) space via the new
> > > > > > > >      DRM_IOCTL_NOUVEAU_VM_INIT ioctl for UMDs to specify the portion of VA
> > > > > > > >      space managed by the kernel and userspace, respectively.
> > > > > > > > 
> > > > > > > > 2) Allocate and free a VA space region as well as bind and unbind memory
> > > > > > > >      to the GPUs VA space via the new DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
> > > > > > > >      UMDs can request the named operations to be processed either
> > > > > > > >      synchronously or asynchronously. It supports DRM syncobjs
> > > > > > > >      (incl. timelines) as synchronization mechanism. The management of the
> > > > > > > >      GPU VA mappings is implemented with the DRM GPU VA manager.
> > > > > > > > 
> > > > > > > > 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl. The
> > > > > > > >      execution happens asynchronously. It supports DRM syncobj (incl.
> > > > > > > >      timelines) as synchronization mechanism. DRM GEM object locking is
> > > > > > > >      handled with drm_exec.
> > > > > > > > 
> > > > > > > > Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC, use the DRM
> > > > > > > > GPU scheduler for the asynchronous paths.
> > > > > > > > 
> > > > > > > > Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> > > > > > > > ---
> > > > > > > >    Documentation/gpu/driver-uapi.rst       |   3 +
> > > > > > > >    drivers/gpu/drm/nouveau/Kbuild          |   2 +
> > > > > > > >    drivers/gpu/drm/nouveau/Kconfig         |   2 +
> > > > > > > >    drivers/gpu/drm/nouveau/nouveau_abi16.c |  16 +
> > > > > > > >    drivers/gpu/drm/nouveau/nouveau_abi16.h |   1 +
> > > > > > > >    drivers/gpu/drm/nouveau/nouveau_drm.c   |  23 +-
> > > > > > > >    drivers/gpu/drm/nouveau/nouveau_drv.h   |   9 +-
> > > > > > > >    drivers/gpu/drm/nouveau/nouveau_exec.c  | 310 ++++++++++
> > > > > > > >    drivers/gpu/drm/nouveau/nouveau_exec.h  |  55 ++
> > > > > > > >    drivers/gpu/drm/nouveau/nouveau_sched.c | 780 ++++++++++++++++++++++++
> > > > > > > >    drivers/gpu/drm/nouveau/nouveau_sched.h |  98 +++
> > > > > > > >    11 files changed, 1295 insertions(+), 4 deletions(-)
> > > > > > > >    create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.c
> > > > > > > >    create mode 100644 drivers/gpu/drm/nouveau/nouveau_exec.h
> > > > > > > >    create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.c
> > > > > > > >    create mode 100644 drivers/gpu/drm/nouveau/nouveau_sched.h    
> > > > > > > ...    
> > > > > > > > 
> > > > > > > > +static struct dma_fence *
> > > > > > > > +nouveau_bind_job_run(struct nouveau_job *job)
> > > > > > > > +{
> > > > > > > > +    struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
> > > > > > > > +    struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(job->cli);
> > > > > > > > +    struct bind_job_op *op;
> > > > > > > > +    int ret = 0;
> > > > > > > > +    
> > > > > > > 
> > > > > > > I was looking at how nouveau does the async binding compared to how xe
> > > > > > > does it.
> > > > > > > It looks to me that this function being a scheduler run_job callback is
> > > > > > > the main part of the VM_BIND dma-fence signalling critical section for
> > > > > > > the job's done_fence and if so, needs to be annotated as such?    
> > > > > > 
> > > > > > Yes, that's the case.
> > > > > >     
> > > > > > > 
> > > > > > > For example nouveau_uvma_region_new allocates memory, which is not
> > > > > > > allowed if in a dma_fence signalling critical section and the locking
> > > > > > > also looks suspicious?    
> > > > > > 
> > > > > > Thanks for pointing this out, I missed that somehow.
> > > > > > 
> > > > > > I will change it to pre-allocate new regions, mappings and page tables
> > > > > > within the job's submit() function.
> > > > > >     
> > > > > 
> > > > > Yea that what we basically do in Xe, in the IOCTL step allocate all the
> > > > > backing store for new page tables, populate new page tables (these are
> > > > > not yet visible in the page table structure), and in last step which is
> > > > > executed after all the dependencies are satified program all the leaf
> > > > > entires making the new binding visible.
> > > > > 
> > > > > We screwed have this up by defering most of the IOCTL to a worker but
> > > > > will fix this fix this one way or another soon - get rid of worker or
> > > > > introduce a type of sync that is signaled after the worker + publish the
> > > > > dma-fence in the worker. I'd like to close on this one soon.    
> > > > > > For the ops structures the drm_gpuva_manager allocates for reporting the
> > > > > > split/merge steps back to the driver I have ideas to entirely avoid
> > > > > > allocations, which also is a good thing in respect of Christians feedback
> > > > > > regarding the huge amount of mapping requests some applications seem to
> > > > > > generate.
> > > > > >     
> > > > > 
> > > > > It should be fine to have allocations to report the split/merge step as
> > > > > this step should be before a dma-fence is published, but yea if possible
> > > > > to avoid extra allocs as that is always better.    
> > > > 
> > > > I think we can't really ask for the split/merge steps before actually
> > > > running the job, since it requires the particular VA space not to change
> > > > while performing those operations.
> > > > 
> > > > E.g. if we'd run the split/merge steps at job submit() time the underlying
> > > > VA space could be changed by other bind jobs executing before this one,
> > > > which would make the calculated split/merge steps obsolete and wrong.
> > > >     
> > > 
> > > Hmm, maybe I'm not understanding this implementation, admittedly I
> > > haven't studied the gpuva manager code in detail.
> > > 
> > > Let me explain what we are doing in Xe.
> > > 
> > > Map 0x0000 - 0x3000 -> this resolves into 1 bind operation and 1 VMA
> > > Unmap 0x1000-0x2000 -> this resolves into 1 unbind and 2 rebind operations
> > > 
> > > 1. unbind 0x0000-0x3000 -> destroy old VMA
> > > 2. rebind 0x0000-0x1000 -> new VMA
> > > 3. rebind 0x2000-0x3000 -> new VMA
> > > 
> > > All of the above steps resolving the operations can be done in the IOCTL
> > > phase and VM's VMA structure is also updated. When the dependencies
> > > are resolved the actual bindings are done on the GPU. We use the BO's
> > > dma-resv slots to ensure there is never a window 0x0000-0x1000 and
> > > 0x2000-0x3000 are never mapped with respect to execs (I forget the exact
> > > details of how we do this but if you want to know I'll explain further).
> > >   
> > 
> > Ok, so I've been contemplating the idea of pre-reserving memory for any
> > future page-table updates, so I can guarantee the bind/unbind op in  
> > ->run_job() never fails (that's actually made more complicated in my  
> > case, because we don't directly control the page table updates, but
> > defer that to the iommu/iopagetbl framework which does the allocation,
> > so I didn't really go as far as you did). But with bind ops happening in
> > a queue with dependencies to wait on, guessing what the page-table will
> > look like is a bit challenging. Sure, we can pre-allocate pages for all
> > levels needed to reach the leaf node(s) we're trying to insert or plan
> > for the worst case scenario in case of 2MB -> 4K block splits for
> > partial unmaps, but it sounds like a lot of memory reservation,
> > especially if we get lot of bind requests queued. Just curious to hear
> > how you solved that.  
> 
> A few thing here.
> 
> First the reason we always do an unbind and possibly 2 rebinds is
> because we might crossing the 1GB / 2MB page boundary size. In the
> example I gave we could just unbind 0x1000-0x2000 and call it day but
> we'd have to build smarts into to our code to know that is ok which we
> don't have at this time.
> 
> e.g. This wouldn't ok just to do an unbind as the page size goes from
> 2MB to 4k:
> 
> Map   0x000000-0x200000 -> 2MB page
> Unmap 0x100000-0x200000 -> need 256 4k pages now
> 
> For simplity we always do a unbind + 2 possible rebinds. Maybe we will
> be a little smarter in the future if needed.

Yeah, I see why you're doing that, but it's not really an option in
pancsf, because this is hidden behind the io_pgtable abstraction, which
deals with splits/merges of PTEs on its own. So, when we call
io_pgtable_ops->unmap(0x1000-0x2000), what happens in practice is:

1/ allocate an L3 page table
2/ populate the 0x000000-0x001000 and 0x002000-0x200000 ranges in this
   new L3 page table
3/ update the L2 page table entry to point to the L3 one

In that regard, the io_pgtable abstraction makes it simpler for the
driver, but that also means we don't control memory allocation, and
both map and unmap operations can fail.

> 
> WRT to the memory allocations, we haven't really thought about
> consequences of reserving memory for pending binds. My initial reaction
> is I don't think this an issue. Page table memory should be relatively
> small compared to user memory so having a bunch queue'd up shouldn't
> really be an issue, right? Going from 2MB -> 4k splits on either side is
> just another 2 4k allocs on either side.

Well, yes, for unmap ops, you'd probably need a maximum of 2 pages, but
for a map op, if you need to allocate L1 and L2 tables (they might not
exist when the map request reaches time of execution, and it's pretty
hard to predict the state of the page-table), that's actually more than
just 2 pages. It depends on the size of the mapping actually. And yes,
I agree it's not much compared to the amount of user memory, but if you
have plenty of map/unmap requests queued, pre-reserving those pages can
still consume a non-negligible amount of memory. But maybe we'll have
bigger problems if the amount of queued VM_BIND requests grows to the
point where the amount of memory reserved for future page-table updates
is a problem, dunno.
diff mbox series

Patch

diff --git a/Documentation/gpu/driver-uapi.rst b/Documentation/gpu/driver-uapi.rst
index 9c7ca6e33a68..c08bcbb95fb3 100644
--- a/Documentation/gpu/driver-uapi.rst
+++ b/Documentation/gpu/driver-uapi.rst
@@ -13,4 +13,7 @@  drm/nouveau uAPI
 VM_BIND / EXEC uAPI
 -------------------
 
+.. kernel-doc:: drivers/gpu/drm/nouveau/nouveau_exec.c
+    :doc: Overview
+
 .. kernel-doc:: include/uapi/drm/nouveau_drm.h
diff --git a/drivers/gpu/drm/nouveau/Kbuild b/drivers/gpu/drm/nouveau/Kbuild
index ee281bb76463..cf6b3a80c0c8 100644
--- a/drivers/gpu/drm/nouveau/Kbuild
+++ b/drivers/gpu/drm/nouveau/Kbuild
@@ -47,6 +47,8 @@  nouveau-y += nouveau_prime.o
 nouveau-y += nouveau_sgdma.o
 nouveau-y += nouveau_ttm.o
 nouveau-y += nouveau_vmm.o
+nouveau-y += nouveau_exec.o
+nouveau-y += nouveau_sched.o
 nouveau-y += nouveau_uvmm.o
 
 # DRM - modesetting
diff --git a/drivers/gpu/drm/nouveau/Kconfig b/drivers/gpu/drm/nouveau/Kconfig
index a0bb3987bf63..59e5c13be9b6 100644
--- a/drivers/gpu/drm/nouveau/Kconfig
+++ b/drivers/gpu/drm/nouveau/Kconfig
@@ -10,6 +10,8 @@  config DRM_NOUVEAU
 	select DRM_KMS_HELPER
 	select DRM_TTM
 	select DRM_TTM_HELPER
+	select DRM_EXEC
+	select DRM_SCHED
 	select I2C
 	select I2C_ALGOBIT
 	select BACKLIGHT_CLASS_DEVICE if DRM_NOUVEAU_BACKLIGHT
diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c b/drivers/gpu/drm/nouveau/nouveau_abi16.c
index 36cc80eb0e20..694777a58bca 100644
--- a/drivers/gpu/drm/nouveau/nouveau_abi16.c
+++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c
@@ -35,6 +35,7 @@ 
 #include "nouveau_chan.h"
 #include "nouveau_abi16.h"
 #include "nouveau_vmm.h"
+#include "nouveau_sched.h"
 
 static struct nouveau_abi16 *
 nouveau_abi16(struct drm_file *file_priv)
@@ -125,6 +126,17 @@  nouveau_abi16_chan_fini(struct nouveau_abi16 *abi16,
 {
 	struct nouveau_abi16_ntfy *ntfy, *temp;
 
+	/* When a client exits without waiting for its queued up jobs to
+	 * finish it might happen that we fault the channel. This is due to
+	 * drm_file_free() calling drm_gem_release() before the postclose()
+	 * callback. Hence, we can't tear down this scheduler entity before
+	 * uvmm mappings are unmapped. Currently, we can't detect this case.
+	 *
+	 * However, this should be rare and harmless, since the channel isn't
+	 * needed anymore.
+	 */
+	nouveau_sched_entity_fini(&chan->sched_entity);
+
 	/* wait for all activity to stop before cleaning up */
 	if (chan->chan)
 		nouveau_channel_idle(chan->chan);
@@ -311,6 +323,10 @@  nouveau_abi16_ioctl_channel_alloc(ABI16_IOCTL_ARGS)
 	if (ret)
 		goto done;
 
+	ret = nouveau_sched_entity_init(&chan->sched_entity, &drm->sched);
+	if (ret)
+		goto done;
+
 	init->channel = chan->chan->chid;
 
 	if (device->info.family >= NV_DEVICE_INFO_V0_TESLA)
diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.h b/drivers/gpu/drm/nouveau/nouveau_abi16.h
index 27eae85f33e6..8209eb28feaf 100644
--- a/drivers/gpu/drm/nouveau/nouveau_abi16.h
+++ b/drivers/gpu/drm/nouveau/nouveau_abi16.h
@@ -26,6 +26,7 @@  struct nouveau_abi16_chan {
 	struct nouveau_bo *ntfy;
 	struct nouveau_vma *ntfy_vma;
 	struct nvkm_mm  heap;
+	struct nouveau_sched_entity sched_entity;
 };
 
 struct nouveau_abi16 {
diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c
index 989f30a31ba9..5d018207ff92 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_drm.c
@@ -71,6 +71,7 @@ 
 #include "nouveau_svm.h"
 #include "nouveau_dmem.h"
 #include "nouveau_uvmm.h"
+#include "nouveau_sched.h"
 
 DECLARE_DYNDBG_CLASSMAP(drm_debug_classes, DD_CLASS_TYPE_DISJOINT_BITS, 0,
 			"DRM_UT_CORE",
@@ -192,6 +193,7 @@  nouveau_cli_fini(struct nouveau_cli *cli)
 	flush_work(&cli->work);
 	WARN_ON(!list_empty(&cli->worker));
 
+	nouveau_sched_entity_fini(&cli->sched_entity);
 	usif_client_fini(cli);
 	nouveau_uvmm_fini(&cli->uvmm);
 	nouveau_vmm_fini(&cli->svm);
@@ -299,6 +301,11 @@  nouveau_cli_init(struct nouveau_drm *drm, const char *sname,
 	}
 
 	cli->mem = &mems[ret];
+
+	ret = nouveau_sched_entity_init(&cli->sched_entity, &drm->sched);
+	if (ret)
+		goto done;
+
 	return 0;
 done:
 	if (ret)
@@ -611,8 +618,13 @@  nouveau_drm_device_init(struct drm_device *dev)
 		pm_runtime_put(dev->dev);
 	}
 
-	return 0;
+	ret = nouveau_sched_init(&drm->sched, drm);
+	if (ret)
+		goto fail_sched_init;
 
+	return 0;
+fail_sched_init:
+	nouveau_display_fini(dev, false, false);
 fail_dispinit:
 	nouveau_display_destroy(dev);
 fail_dispctor:
@@ -637,6 +649,8 @@  nouveau_drm_device_fini(struct drm_device *dev)
 	struct nouveau_cli *cli, *temp_cli;
 	struct nouveau_drm *drm = nouveau_drm(dev);
 
+	nouveau_sched_fini(&drm->sched);
+
 	if (nouveau_pmops_runtime()) {
 		pm_runtime_get_sync(dev->dev);
 		pm_runtime_forbid(dev->dev);
@@ -1177,6 +1191,9 @@  nouveau_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_PREP, nouveau_gem_ioctl_cpu_prep, DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_CPU_FINI, nouveau_gem_ioctl_cpu_fini, DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(NOUVEAU_GEM_INFO, nouveau_gem_ioctl_info, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(NOUVEAU_VM_INIT, nouveau_ioctl_vm_init, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(NOUVEAU_VM_BIND, nouveau_ioctl_vm_bind, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(NOUVEAU_EXEC, nouveau_ioctl_exec, DRM_RENDER_ALLOW),
 };
 
 long
@@ -1224,7 +1241,9 @@  nouveau_driver_fops = {
 static struct drm_driver
 driver_stub = {
 	.driver_features =
-		DRIVER_GEM | DRIVER_MODESET | DRIVER_RENDER
+		DRIVER_GEM | DRIVER_MODESET | DRIVER_RENDER |
+		DRIVER_SYNCOBJ | DRIVER_SYNCOBJ_TIMELINE |
+		DRIVER_GEM_GPUVA
 #if defined(CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT)
 		| DRIVER_KMS_LEGACY_CONTEXT
 #endif
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index d634f1054d65..94de792ef3ca 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -10,8 +10,8 @@ 
 #define DRIVER_DATE		"20120801"
 
 #define DRIVER_MAJOR		1
-#define DRIVER_MINOR		3
-#define DRIVER_PATCHLEVEL	1
+#define DRIVER_MINOR		4
+#define DRIVER_PATCHLEVEL	0
 
 /*
  * 1.1.1:
@@ -63,6 +63,7 @@  struct platform_device;
 
 #include "nouveau_fence.h"
 #include "nouveau_bios.h"
+#include "nouveau_sched.h"
 #include "nouveau_vmm.h"
 #include "nouveau_uvmm.h"
 
@@ -94,6 +95,8 @@  struct nouveau_cli {
 	struct nouveau_vmm svm;
 	struct nouveau_uvmm uvmm;
 
+	struct nouveau_sched_entity sched_entity;
+
 	const struct nvif_mclass *mem;
 
 	struct list_head head;
@@ -305,6 +308,8 @@  struct nouveau_drm {
 		struct mutex lock;
 		bool component_registered;
 	} audio;
+
+	struct drm_gpu_scheduler sched;
 };
 
 static inline struct nouveau_drm *
diff --git a/drivers/gpu/drm/nouveau/nouveau_exec.c b/drivers/gpu/drm/nouveau/nouveau_exec.c
new file mode 100644
index 000000000000..512120bdb8a8
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nouveau_exec.c
@@ -0,0 +1,310 @@ 
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright (c) 2022 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *     Danilo Krummrich <dakr@redhat.com>
+ *
+ */
+
+#include <drm/drm_exec.h>
+
+#include "nouveau_drv.h"
+#include "nouveau_gem.h"
+#include "nouveau_mem.h"
+#include "nouveau_dma.h"
+#include "nouveau_exec.h"
+#include "nouveau_abi16.h"
+#include "nouveau_chan.h"
+#include "nouveau_sched.h"
+#include "nouveau_uvmm.h"
+
+
+/**
+ * DOC: Overview
+ *
+ * Nouveau's VM_BIND / EXEC UAPI consists of three ioctls: DRM_NOUVEAU_VM_INIT,
+ * DRM_NOUVEAU_VM_BIND and DRM_NOUVEAU_EXEC.
+ *
+ * In order to use the UAPI firstly a user client must initialize the VA space
+ * using the DRM_NOUVEAU_VM_INIT ioctl specifying which region of the VA space
+ * should be managed by the kernel and which by the UMD.
+ *
+ * The DRM_NOUVEAU_VM_BIND ioctl provides clients an interface to manage the
+ * userspace-manageable portion of the VA space. It provides operations to
+ * allocate and free VA space regions and operations to map and unmap memory
+ * within such a region. Bind operations crossing region boundaries are not
+ * permitted.
+ *
+ * When allocating a VA space region userspace may flag this region as sparse.
+ * If a region is flagged as sparse the kernel will take care that for the whole
+ * region sparse mappings are created. Subsequently requested actual memory
+ * backed mappings for a sparse region will take precedence over the sparse
+ * mappings. If the memory backed mappings are unmapped the kernel will make
+ * sure that sparse mappings will take their place again.
+ *
+ * When using the VM_BIND ioctl to request the kernel to map memory to a given
+ * virtual address in the GPU's VA space there is no guarantee that the actual
+ * mappings are created in the GPU's MMU. If the given memory is swapped out
+ * at the time the bind operation is executed the kernel will stash the mapping
+ * details into its internal allocator and create the actual MMU mappings once
+ * the memory is swapped back in. While this is transparent for userspace, it is
+ * guaranteed that all the backing memory is swapped back in and all the memory
+ * mappings, as requested by userspace previously, are actually mapped once the
+ * DRM_NOUVEAU_EXEC ioctl is called to submit an exec job.
+ *
+ * Contrary to VM_BIND map requests, unmap requests are allowed to span over VA
+ * space regions and completely untouched areas of the VA space.
+ *
+ * Generally, all rules for constellations like mapping and unmapping over
+ * boundaries of existing mappings are documented in the &drm_gpuva_manager.
+ *
+ * When a VA space region is freed, all existing mappings within this region are
+ * unmapped automatically.
+ *
+ * A VM_BIND job can be executed either synchronously or asynchronously. If
+ * executed asynchronously, userspace may provide a list of syncobjs this job
+ * will wait for and/or a list of syncobjs the kernel will trigger once the
+ * VM_BIND finished execution. If executed synchronously the ioctl will block
+ * until the bind job is finished and no syncobjs are permitted by the kernel.
+ *
+ * To execute a push buffer the UAPI provides the DRM_NOUVEAU_EXEC ioctl. EXEC
+ * jobs are always executed asynchronously, and, equal to VM_BIND jobs, provide
+ * the option to synchronize them with syncobjs.
+ *
+ * Besides that, EXEC jobs can be scheduled to execute on a specified channel.
+ *
+ * EXEC jobs wait for VM_BIND jobs they depend on when userspace submits the
+ * EXEC job rather than when this EXEC job actually executes. This is due to the
+ * fact that at submission time of the EXEC job we'd otherwise not have the
+ * correct view of the VA space for this EXEC job, since VM_BIND jobs this EXEC
+ * job depends on might still be in the queue. Without a recent (and hence
+ * for this particular job correct) view of the VA space, we'd potentially fail
+ * to lock, swap in and re-bind BOs that have been evicted previously.
+ */
+
+/**
+ * nouveau_exec_ucopy_syncs() - copy the in/out sync arrays from userspace
+ * @base: job base receiving the copied arrays and their element counts
+ * @inc: number of entries in the userspace array at @ins
+ * @ins: userspace address of the &struct drm_nouveau_sync array stored as
+ *       in_sync
+ * @outc: number of entries in the userspace array at @outs
+ * @outs: userspace address of the &struct drm_nouveau_sync array stored as
+ *        out_sync
+ *
+ * An array is only copied if its count is non-zero; otherwise the
+ * corresponding fields of @base are left untouched. @base is written only
+ * after every requested copy succeeded, so on failure it never holds an
+ * ERR_PTR() value or a pointer to memory that has already been freed.
+ *
+ * On success the caller owns the copied arrays and releases them with
+ * u_free().
+ *
+ * Return: 0 on success, the error encoded in the u_memcpya() result otherwise.
+ */
+static int
+nouveau_exec_ucopy_syncs(struct nouveau_exec_base *base,
+			u32 inc, u64 ins,
+			u32 outc, u64 outs)
+{
+	struct drm_nouveau_sync *in = NULL;
+	struct drm_nouveau_sync *out = NULL;
+
+	if (inc) {
+		in = u_memcpya(ins, inc, sizeof(*in));
+		if (IS_ERR(in))
+			return PTR_ERR(in);
+	}
+
+	if (outc) {
+		out = u_memcpya(outs, outc, sizeof(*out));
+		if (IS_ERR(out)) {
+			/* Drop the in-sync copy made above, if there is one. */
+			if (inc)
+				u_free(in);
+			return PTR_ERR(out);
+		}
+	}
+
+	/* Nothing can fail anymore, publish the results. */
+	if (inc) {
+		base->in_sync.s = in;
+		base->in_sync.count = inc;
+	}
+
+	if (outc) {
+		base->out_sync.s = out;
+		base->out_sync.count = outc;
+	}
+
+	return 0;
+}
+
+/*
+ * DRM_NOUVEAU_VM_INIT handler: passes the ioctl payload on to
+ * nouveau_uvmm_init() for the calling client and returns its result.
+ */
+int
+nouveau_ioctl_vm_init(struct drm_device *dev,
+		      void *data,
+		      struct drm_file *file_priv)
+{
+	struct drm_nouveau_vm_init *args = data;
+	struct nouveau_cli *client = nouveau_cli(file_priv);
+
+	return nouveau_uvmm_init(&client->uvmm, client, args);
+}
+
+/*
+ * Create a bind job from @bind and submit it. If submission fails,
+ * nouveau_job_fini() is called on the job before the error is returned.
+ */
+int nouveau_vm_bind(struct nouveau_exec_bind *bind)
+{
+	struct nouveau_bind_job *bind_job;
+	int err;
+
+	err = nouveau_bind_job_init(&bind_job, bind);
+	if (err)
+		return err;
+
+	err = nouveau_job_submit(&bind_job->base);
+	if (err)
+		nouveau_job_fini(&bind_job->base);
+
+	return err;
+}
+
+/*
+ * DRM_NOUVEAU_VM_BIND handler.
+ *
+ * Copies the bind operations and the wait/signal sync arrays from userspace,
+ * hands them to nouveau_vm_bind() together with the client's scheduler entity
+ * and releases the copies again before returning, on success and on failure.
+ *
+ * Returns -ENOSYS if nouveau_cli_uvmm_locked() yields nothing for the client,
+ * otherwise the result of the first failing step or of nouveau_vm_bind().
+ */
+int
+nouveau_ioctl_vm_bind(struct drm_device *dev,
+		      void *data,
+		      struct drm_file *file_priv)
+{
+	struct nouveau_cli *client = nouveau_cli(file_priv);
+	struct drm_nouveau_vm_bind *args = data;
+	struct nouveau_exec_bind bind = {};
+	int err;
+
+	if (unlikely(!nouveau_cli_uvmm_locked(client)))
+		return -ENOSYS;
+
+	bind.flags = args->flags;
+	bind.op.count = args->op_count;
+	bind.op.s = u_memcpya(args->op_ptr, args->op_count,
+			      sizeof(*bind.op.s));
+	if (IS_ERR(bind.op.s))
+		return PTR_ERR(bind.op.s);
+
+	err = nouveau_exec_ucopy_syncs(&bind.base,
+				       args->wait_count, args->wait_ptr,
+				       args->sig_count, args->sig_ptr);
+	if (err)
+		goto free_ops;
+
+	bind.base.file_priv = file_priv;
+	bind.base.sched_entity = &client->sched_entity;
+
+	err = nouveau_vm_bind(&bind);
+
+	/* The userspace copies are dropped regardless of the outcome. */
+	u_free(bind.base.out_sync.s);
+	u_free(bind.base.in_sync.s);
+free_ops:
+	u_free(bind.op.s);
+	return err;
+}
+
+/*
+ * Create an exec job from @exec and submit it. If submission fails,
+ * nouveau_job_fini() is called on the job before the error is returned.
+ */
+static int
+nouveau_exec(struct nouveau_exec *exec)
+{
+	struct nouveau_exec_job *exec_job;
+	int err;
+
+	err = nouveau_exec_job_init(&exec_job, exec);
+	if (err)
+		return err;
+
+	err = nouveau_job_submit(&exec_job->base);
+	if (err)
+		nouveau_job_fini(&exec_job->base);
+
+	return err;
+}
+
+/**
+ * nouveau_ioctl_exec() - DRM_NOUVEAU_EXEC ioctl handler
+ * @dev: DRM device the ioctl was issued on (unused here)
+ * @data: ioctl payload, a &struct drm_nouveau_exec
+ * @file_priv: DRM file of the calling client
+ *
+ * Looks up the channel named by the request among the client's abi16
+ * channels, copies the push and sync arrays from userspace and submits them
+ * through nouveau_exec() on that channel's scheduler entity. The userspace
+ * copies are released before returning, on success and on failure.
+ *
+ * Every exit after nouveau_abi16_get() succeeded goes through
+ * nouveau_abi16_put().
+ *
+ * Return: 0 on success or if the request carries no pushes, -ENOMEM if no
+ * abi16 object could be obtained, -ENOSYS if the client has no uvmm or the
+ * channel has no IB ring (dma.ib_max == 0), -ENOENT if the channel id is
+ * unknown, -ENODEV if the channel was killed, -EINVAL if push_count exceeds
+ * NOUVEAU_GEM_MAX_PUSH, or the error of the first failing copy/submit step.
+ */
+int
+nouveau_ioctl_exec(struct drm_device *dev,
+		   void *data,
+		   struct drm_file *file_priv)
+{
+	struct nouveau_abi16 *abi16 = nouveau_abi16_get(file_priv);
+	struct nouveau_cli *cli = nouveau_cli(file_priv);
+	struct nouveau_abi16_chan *chan16;
+	struct nouveau_channel *chan = NULL;
+	struct nouveau_exec exec = {};
+	struct drm_nouveau_exec *req = data;
+	int ret = 0;
+
+	if (unlikely(!abi16))
+		return -ENOMEM;
+
+	/* abi16 locks already */
+	if (unlikely(!nouveau_cli_uvmm(cli)))
+		return nouveau_abi16_put(abi16, -ENOSYS);
+
+	/* On a match chan16 keeps pointing at the owning abi16 channel. */
+	list_for_each_entry(chan16, &abi16->channels, head) {
+		if (chan16->chan->chid == req->channel) {
+			chan = chan16->chan;
+			break;
+		}
+	}
+
+	if (!chan)
+		return nouveau_abi16_put(abi16, -ENOENT);
+
+	if (unlikely(atomic_read(&chan->killed)))
+		return nouveau_abi16_put(abi16, -ENODEV);
+
+	if (!chan->dma.ib_max)
+		return nouveau_abi16_put(abi16, -ENOSYS);
+
+	/* Nothing to execute; ret is still 0 here. */
+	if (unlikely(req->push_count == 0))
+		goto out;
+
+	if (unlikely(req->push_count > NOUVEAU_GEM_MAX_PUSH)) {
+		/* push_count is unsigned; %d would print huge counts as negative. */
+		NV_PRINTK(err, cli, "pushbuf push count exceeds limit: %u max %d\n",
+			 req->push_count, NOUVEAU_GEM_MAX_PUSH);
+		return nouveau_abi16_put(abi16, -EINVAL);
+	}
+
+	exec.push.count = req->push_count;
+	exec.push.s = u_memcpya(req->push_ptr, req->push_count,
+				sizeof(*exec.push.s));
+	if (IS_ERR(exec.push.s)) {
+		ret = PTR_ERR(exec.push.s);
+		goto out;
+	}
+
+	ret = nouveau_exec_ucopy_syncs(&exec.base,
+				       req->wait_count, req->wait_ptr,
+				       req->sig_count, req->sig_ptr);
+	if (ret)
+		goto out_free_pushs;
+
+	exec.base.sched_entity = &chan16->sched_entity;
+	exec.base.chan = chan;
+	exec.base.file_priv = file_priv;
+
+	ret = nouveau_exec(&exec);
+
+	/* The userspace copies are dropped regardless of the outcome. */
+	u_free(exec.base.out_sync.s);
+	u_free(exec.base.in_sync.s);
+out_free_pushs:
+	u_free(exec.push.s);
+out:
+	return nouveau_abi16_put(abi16, ret);
+}
diff --git a/drivers/gpu/drm/nouveau/nouveau_exec.h b/drivers/gpu/drm/nouveau/nouveau_exec.h
new file mode 100644
index 000000000000..3774fc338f5d
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nouveau_exec.h
@@ -0,0 +1,55 @@ 
+/* SPDX-License-Identifier: MIT */
+
+#ifndef __NOUVEAU_EXEC_H__
+#define __NOUVEAU_EXEC_H__
+
+#include <drm/drm_exec.h>
+
+#include "nouveau_drv.h"
+
+/*
+ * Parameters common to every submission, embedded in both
+ * struct nouveau_exec_bind and struct nouveau_exec.
+ */
+struct nouveau_exec_base {
+	/* Channel the submission targets. */
+	struct nouveau_channel *chan;
+	/* DRM file of the submitting client; used for handle lookups. */
+	struct drm_file *file_priv;
+	/* Scheduler entity the resulting job is initialized against. */
+	struct nouveau_sched_entity *sched_entity;
+
+	/* Syncs the job waits on before it runs. */
+	struct {
+		struct drm_nouveau_sync *s;
+		u32 count;
+	} in_sync;
+
+	/* Syncs the job's done fence gets attached to. */
+	struct {
+		struct drm_nouveau_sync *s;
+		u32 count;
+	} out_sync;
+};
+
+/* Parameters of a VM_BIND submission. */
+struct nouveau_exec_bind {
+	struct nouveau_exec_base base;
+	/* Checked for DRM_NOUVEAU_VM_BIND_RUN_ASYNC when the job is set up. */
+	unsigned int flags;
+
+	/* Array of bind operations to turn into job ops. */
+	struct {
+		struct drm_nouveau_vm_bind_op *s;
+		u32 count;
+	} op;
+};
+
+/* Parameters of an EXEC (push buffer) submission. */
+struct nouveau_exec {
+	struct nouveau_exec_base base;
+	struct drm_exec exec;
+
+	/* Array of push buffer descriptors copied into the job. */
+	struct {
+		struct drm_nouveau_exec_push *s;
+		u32 count;
+	} push;
+};
+
+/*
+ * ioctl entry points of the new uapi. @data points to the ioctl argument
+ * (nouveau_ioctl_exec() reads it as a struct drm_nouveau_exec), @file_priv
+ * identifies the calling client.
+ */
+int nouveau_ioctl_vm_init(struct drm_device *dev, void *data,
+			  struct drm_file *file_priv);
+
+int nouveau_ioctl_vm_bind(struct drm_device *dev, void *data,
+			  struct drm_file *file_priv);
+
+int nouveau_ioctl_exec(struct drm_device *dev, void *data,
+		       struct drm_file *file_priv);
+
+#endif
diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c b/drivers/gpu/drm/nouveau/nouveau_sched.c
new file mode 100644
index 000000000000..2749aa1908ad
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
@@ -0,0 +1,780 @@ 
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright (c) 2022 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *     Danilo Krummrich <dakr@redhat.com>
+ *
+ */
+
+#include <linux/slab.h>
+#include <drm/gpu_scheduler.h>
+#include <drm/drm_syncobj.h>
+
+#include "nouveau_drv.h"
+#include "nouveau_gem.h"
+#include "nouveau_mem.h"
+#include "nouveau_dma.h"
+#include "nouveau_exec.h"
+#include "nouveau_abi16.h"
+#include "nouveau_chan.h"
+#include "nouveau_sched.h"
+
+/* FIXME
+ *
+ * We want to make sure that jobs currently executing can't be deferred by
+ * other jobs competing for the hardware. Otherwise we might end up with job
+ * timeouts just because of too many clients submitting too many jobs. We don't
+ * want jobs to time out because of system load, but because of the job being
+ * too bulky.
+ *
+ * For now allow for up to 16 concurrent jobs in flight until we know how many
+ * rings the hardware can process in parallel.
+ */
+#define NOUVEAU_SCHED_HW_SUBMISSIONS		16
+#define NOUVEAU_SCHED_JOB_TIMEOUT_MS		10000
+
+/* Iterate over a list of struct bind_job_op linked through their 'entry'. */
+#define list_for_each_op(_op, _ops) list_for_each_entry(_op, _ops, entry)
+/* Same as above, but safe against removal of the current element. */
+#define list_for_each_op_safe(_op, _n, _ops) list_for_each_entry_safe(_op, _n, _ops, entry)
+
+/* Driver-internal aliases for the uapi VM_BIND operation codes. */
+enum bind_op {
+	OP_ALLOC = DRM_NOUVEAU_VM_BIND_OP_ALLOC,
+	OP_FREE = DRM_NOUVEAU_VM_BIND_OP_FREE,
+	OP_MAP = DRM_NOUVEAU_VM_BIND_OP_MAP,
+	OP_UNMAP = DRM_NOUVEAU_VM_BIND_OP_UNMAP,
+};
+
+/* One operation of a bind job, built from a struct drm_nouveau_vm_bind_op. */
+struct bind_job_op {
+	/* Link in nouveau_bind_job::ops. */
+	struct list_head entry;
+
+	enum bind_op op;
+	/* Flags as passed by userspace for this operation. */
+	u32 flags;
+
+	/* Virtual address range the operation applies to. */
+	struct {
+		u64 addr;
+		u64 range;
+	} va;
+
+	/*
+	 * Backing object; handle and offset are only filled in for OP_MAP,
+	 * obj is looked up from the handle when the job is submitted.
+	 */
+	struct {
+		u32 handle;
+		u64 offset;
+		struct drm_gem_object *obj;
+	} gem;
+};
+
+/**
+ * nouveau_base_job_init() - initialize the part common to all job types
+ * @job: the job to initialize; @job->sync and @job->ops are expected to be
+ *       set by the caller already
+ * @base: the submission parameters to take over
+ *
+ * Takes over channel, file and scheduler entity from @base, duplicates the
+ * in- and out-sync arrays so the job owns private copies and finally
+ * initializes the embedded scheduler job.
+ *
+ * Synchronous jobs must not carry any syncs.
+ *
+ * Return: 0 on success, -EINVAL if a synchronous job has syncs, -ENOMEM if
+ * duplicating the sync arrays fails, or the error of drm_sched_job_init().
+ */
+static int
+nouveau_base_job_init(struct nouveau_job *job,
+		      struct nouveau_exec_base *base)
+{
+	struct nouveau_sched_entity *entity = base->sched_entity;
+	int ret;
+
+	INIT_LIST_HEAD(&job->head);
+	job->file_priv = base->file_priv;
+	job->cli = nouveau_cli(base->file_priv);
+	job->chan = base->chan;
+	job->entity = entity;
+
+	job->in_sync.count = base->in_sync.count;
+	if (job->in_sync.count) {
+		if (job->sync)
+			return -EINVAL;
+
+		job->in_sync.s = kmemdup(base->in_sync.s,
+					 sizeof(*base->in_sync.s) *
+					 base->in_sync.count,
+					 GFP_KERNEL);
+		if (!job->in_sync.s)
+			return -ENOMEM;
+	}
+
+	job->out_sync.count = base->out_sync.count;
+	if (job->out_sync.count) {
+		if (job->sync) {
+			ret = -EINVAL;
+			goto err_free_in_sync;
+		}
+
+		job->out_sync.s = kmemdup(base->out_sync.s,
+					  sizeof(*base->out_sync.s) *
+					  base->out_sync.count,
+					  GFP_KERNEL);
+		if (!job->out_sync.s) {
+			ret = -ENOMEM;
+			goto err_free_in_sync;
+		}
+	}
+
+	ret = drm_sched_job_init(&job->base, &entity->base, NULL);
+	if (ret)
+		goto err_free_out_sync;
+
+	return 0;
+
+	/* kfree() accepts NULL, so no need to test the pointers first. */
+err_free_out_sync:
+	kfree(job->out_sync.s);
+err_free_in_sync:
+	kfree(job->in_sync.s);
+	return ret;
+}
+
+/**
+ * nouveau_base_job_free() - release the resources of the common job part
+ * @job: the job whose sync arrays should be freed
+ *
+ * Frees the private copies of the in- and out-sync arrays. Either of them may
+ * be NULL, which kfree() handles.
+ */
+static void
+nouveau_base_job_free(struct nouveau_job *job)
+{
+	kfree(job->in_sync.s);
+	kfree(job->out_sync.s);
+}
+
+/*
+ * Validate a single bind operation before it is queued.
+ *
+ * For OP_MAP the GEM offset must be page aligned and [offset, offset + range)
+ * must lie within the object. For every operation the VA range is checked by
+ * nouveau_uvmm_validate_range(), whose result is returned.
+ */
+static int
+bind_submit_validate_op(struct nouveau_job *job,
+			struct bind_job_op *op)
+{
+	struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(job->cli);
+
+	if (op->op == OP_MAP) {
+		struct drm_gem_object *obj = op->gem.obj;
+		u64 offset = op->gem.offset;
+
+		if ((offset & ~PAGE_MASK) ||
+		    obj->size <= offset ||
+		    op->va.range > (obj->size - offset))
+			return -EINVAL;
+	}
+
+	return nouveau_uvmm_validate_range(uvmm, op->va.addr, op->va.range);
+}
+
+/*
+ * Submit callback of bind jobs: reject unknown operations, look up the GEM
+ * object of every OP_MAP and validate each operation.
+ *
+ * GEM references taken here stay attached to the ops; they are dropped when
+ * the job's ops are released in nouveau_bind_job_free().
+ *
+ * Returns 0 on success, -EINVAL for an unknown operation, -ENOENT if a GEM
+ * handle can't be resolved, or the error of bind_submit_validate_op().
+ */
+int
+nouveau_bind_job_submit(struct nouveau_job *job)
+{
+	struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
+	struct bind_job_op *op;
+	int ret;
+
+	list_for_each_op(op, &bind_job->ops) {
+		switch (op->op) {
+		case OP_MAP:
+			op->gem.obj = drm_gem_object_lookup(job->file_priv,
+							    op->gem.handle);
+			if (!op->gem.obj)
+				return -ENOENT;
+			break;
+		case OP_ALLOC:
+		case OP_FREE:
+		case OP_UNMAP:
+			break;
+		default:
+			return -EINVAL;
+		}
+
+		ret = bind_submit_validate_op(job, op);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * nouveau_bind_job_run() - execute all operations of a bind job
+ * @job: the bind job to run
+ *
+ * Processes the job's operations in list order while holding the uvmm lock
+ * and stops at the first operation that fails.
+ *
+ * Return: NULL on success (ERR_PTR(0)), an ERR_PTR() encoded error otherwise.
+ */
+static struct dma_fence *
+nouveau_bind_job_run(struct nouveau_job *job)
+{
+	struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
+	struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(job->cli);
+	struct bind_job_op *op;
+	int ret = 0;
+
+	nouveau_uvmm_lock(uvmm);
+	list_for_each_op(op, &bind_job->ops) {
+		switch (op->op) {
+		case OP_ALLOC: {
+			bool sparse = op->flags & DRM_NOUVEAU_VM_BIND_SPARSE;
+
+			ret = nouveau_uvma_region_new(uvmm,
+						      op->va.addr,
+						      op->va.range,
+						      sparse);
+			break;
+		}
+		case OP_FREE:
+			ret = nouveau_uvma_region_destroy(uvmm,
+							  op->va.addr,
+							  op->va.range);
+			break;
+		case OP_MAP:
+			/*
+			 * Mask the flags bitwise; a logical AND would collapse
+			 * them to 0 or 1.
+			 */
+			ret = nouveau_uvmm_sm_map(uvmm,
+						  op->va.addr, op->va.range,
+						  op->gem.obj, op->gem.offset,
+						  op->flags & 0xff);
+			break;
+		case OP_UNMAP:
+			ret = nouveau_uvmm_sm_unmap(uvmm,
+						    op->va.addr,
+						    op->va.range);
+			break;
+		}
+
+		/* Stop at the first failing operation. */
+		if (ret)
+			break;
+	}
+	nouveau_uvmm_unlock(uvmm);
+
+	if (ret)
+		NV_PRINTK(err, job->cli, "bind job failed: %d\n", ret);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Free callback of bind jobs: drop the GEM reference of every op that holds
+ * one, free the ops, the common job part and finally the job itself.
+ */
+static void
+nouveau_bind_job_free(struct nouveau_job *job)
+{
+	struct nouveau_bind_job *bind_job = to_nouveau_bind_job(job);
+	struct bind_job_op *cur, *tmp;
+
+	list_for_each_op_safe(cur, tmp, &bind_job->ops) {
+		if (cur->gem.obj)
+			drm_gem_object_put(cur->gem.obj);
+
+		list_del(&cur->entry);
+		kfree(cur);
+	}
+
+	nouveau_base_job_free(job);
+	kfree(bind_job);
+}
+
+/* Callbacks installed on every job created by nouveau_bind_job_init(). */
+static struct nouveau_job_ops nouveau_bind_job_ops = {
+	.submit = nouveau_bind_job_submit,
+	.run = nouveau_bind_job_run,
+	.free = nouveau_bind_job_free,
+};
+
+/*
+ * Allocate a bind_job_op and fill it from the userspace operation @uop.
+ *
+ * *@pop receives the new op (NULL if the allocation failed). The GEM handle
+ * and offset are only taken over for map operations.
+ *
+ * Returns 0 on success, -ENOMEM otherwise.
+ */
+static int
+bind_job_op_from_uop(struct bind_job_op **pop,
+		     struct drm_nouveau_vm_bind_op *uop)
+{
+	struct bind_job_op *new_op = kzalloc(sizeof(*new_op), GFP_KERNEL);
+
+	*pop = new_op;
+	if (!new_op)
+		return -ENOMEM;
+
+	new_op->op = uop->op;
+	new_op->flags = uop->flags;
+	new_op->va.addr = uop->addr;
+	new_op->va.range = uop->range;
+
+	if (new_op->op != DRM_NOUVEAU_VM_BIND_OP_MAP)
+		return 0;
+
+	new_op->gem.handle = uop->handle;
+	new_op->gem.offset = uop->bo_offset;
+
+	return 0;
+}
+
+/* Unlink and free every bind_job_op on @ops; GEM references are untouched. */
+static void
+bind_job_ops_free(struct list_head *ops)
+{
+	while (!list_empty(ops)) {
+		struct bind_job_op *first =
+			list_first_entry(ops, struct bind_job_op, entry);
+
+		list_del(&first->entry);
+		kfree(first);
+	}
+}
+
+/**
+ * nouveau_bind_job_init() - allocate and initialize a bind job
+ * @pjob: where to store the new job; set to NULL on failure
+ * @bind: the bind submission parameters
+ *
+ * Converts every userspace operation of @bind into a job op, decides whether
+ * the job runs synchronously (DRM_NOUVEAU_VM_BIND_RUN_ASYNC not set) and
+ * initializes the common job part.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int
+nouveau_bind_job_init(struct nouveau_bind_job **pjob,
+		      struct nouveau_exec_bind *bind)
+{
+	struct nouveau_bind_job *job;
+	struct bind_job_op *op;
+	u32 i; /* same type as bind->op.count, avoids a signed/unsigned mix */
+	int ret;
+
+	job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
+	if (!job)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&job->ops);
+
+	for (i = 0; i < bind->op.count; i++) {
+		ret = bind_job_op_from_uop(&op, &bind->op.s[i]);
+		if (ret)
+			goto err_free;
+
+		list_add_tail(&op->entry, &job->ops);
+	}
+
+	/* Must be set before nouveau_base_job_init(), which checks it. */
+	job->base.sync = !(bind->flags & DRM_NOUVEAU_VM_BIND_RUN_ASYNC);
+	job->base.ops = &nouveau_bind_job_ops;
+
+	ret = nouveau_base_job_init(&job->base, &bind->base);
+	if (ret)
+		goto err_free;
+
+	return 0;
+
+err_free:
+	bind_job_ops_free(&job->ops);
+	kfree(job);
+	*pjob = NULL;
+
+	return ret;
+}
+
+/**
+ * sync_find_fence() - look up the fence a userspace sync refers to
+ * @job: the job the sync belongs to; provides the DRM file for the lookup
+ * @sync: the sync to resolve
+ * @fence: where to store the fence found
+ *
+ * Only (timeline) syncobjs are supported. For timeline syncobjs the fence of
+ * the requested timeline point is looked up, point 0 otherwise.
+ *
+ * The fence is obtained through drm_syncobj_find_fence(); the caller is
+ * responsible for the fence it gets back.
+ *
+ * Return: 0 on success, -EOPNOTSUPP for an unsupported sync type, or the
+ * error of drm_syncobj_find_fence().
+ */
+static int
+sync_find_fence(struct nouveau_job *job,
+		struct drm_nouveau_sync *sync,
+		struct dma_fence **fence)
+{
+	u32 stype = sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK;
+	u64 point = 0;
+
+	if (stype != DRM_NOUVEAU_SYNC_SYNCOBJ &&
+	    stype != DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
+		return -EOPNOTSUPP;
+
+	if (stype == DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ)
+		point = sync->timeline_value;
+
+	/*
+	 * The flags argument of drm_syncobj_find_fence() takes
+	 * DRM_SYNCOBJ_WAIT_FLAGS_*; the nouveau sync flags live in a different
+	 * namespace and must not be forwarded.
+	 */
+	return drm_syncobj_find_fence(job->file_priv,
+				      sync->handle, point,
+				      0 /* flags */, fence);
+}
+
+/**
+ * exec_job_binds_wait() - wait for pending VM_BIND jobs an exec job depends on
+ * @job: the exec job about to be submitted
+ *
+ * Resolves every in-sync of @job to its fence and checks whether that fence
+ * is the done fence of a job on the client's bind entity job list. If so, the
+ * fence is waited for (interruptible, at most 500ms); fences that don't
+ * belong to a bind job are skipped.
+ *
+ * Return: 0 on success, -ETIMEDOUT if a bind job didn't finish in time, or a
+ * negative error code from the fence lookup or an interrupted wait.
+ */
+static int
+exec_job_binds_wait(struct nouveau_job *job)
+{
+	struct nouveau_sched_entity *bind_entity = &job->cli->sched_entity;
+	signed long ret;
+	u32 i;
+
+	for (i = 0; i < job->in_sync.count; i++) {
+		struct nouveau_job *it;
+		struct drm_nouveau_sync *sync = &job->in_sync.s[i];
+		struct dma_fence *fence;
+		bool found = false;
+
+		ret = sync_find_fence(job, sync, &fence);
+		if (ret)
+			return ret;
+
+		mutex_lock(&bind_entity->job.mutex);
+		list_for_each_entry(it, &bind_entity->job.list, head) {
+			if (fence == it->done_fence) {
+				found = true;
+				break;
+			}
+		}
+		mutex_unlock(&bind_entity->job.mutex);
+
+		/* If the fence is not from a VM_BIND job, don't wait for it. */
+		if (!found) {
+			dma_fence_put(fence);
+			continue;
+		}
+
+		ret = dma_fence_wait_timeout(fence, true,
+					     msecs_to_jiffies(500));
+
+		/* Drop the reference obtained by the lookup on every path. */
+		dma_fence_put(fence);
+
+		if (ret < 0)
+			return ret;
+		if (ret == 0)
+			return -ETIMEDOUT;
+	}
+
+	return 0;
+}
+
+/**
+ * nouveau_exec_job_submit() - submit callback of exec jobs
+ * @job: the exec job being submitted
+ *
+ * Waits for pending bind jobs this job depends on, then prepares the GEM
+ * object of every mapping known to the client's GPU VA manager through the
+ * job's drm_exec context. Each prepared object is validated and gets the
+ * job's done fence added to its reservation object.
+ *
+ * The drm_exec context itself is not finalized here; nouveau_job_submit()
+ * does that.
+ *
+ * Return: 0 on success, a negative error code otherwise.
+ */
+int
+nouveau_exec_job_submit(struct nouveau_job *job)
+{
+	struct nouveau_uvmm *uvmm = nouveau_cli_uvmm(job->cli);
+	struct drm_exec *exec = &job->exec;
+	struct drm_gem_object *obj;
+	unsigned long index;
+	int ret;
+
+	ret = exec_job_binds_wait(job);
+	if (ret)
+		return ret;
+
+	nouveau_uvmm_lock(uvmm);
+	drm_exec_while_not_all_locked(exec) {
+		struct drm_gpuva *va;
+
+		drm_gpuva_for_each_va(va, &uvmm->umgr) {
+			ret = drm_exec_prepare_obj(exec, va->gem.obj, 1);
+			drm_exec_break_on_contention(exec);
+			if (ret) {
+				/* Don't leave with the uvmm lock held. */
+				nouveau_uvmm_unlock(uvmm);
+				return ret;
+			}
+		}
+	}
+	nouveau_uvmm_unlock(uvmm);
+
+	drm_exec_for_each_locked_object(exec, index, obj) {
+		struct dma_resv *resv = obj->resv;
+		struct nouveau_bo *nvbo = nouveau_gem_object(obj);
+
+		ret = nouveau_bo_validate(nvbo, true, false);
+		if (ret)
+			return ret;
+
+		dma_resv_add_fence(resv, job->done_fence, DMA_RESV_USAGE_WRITE);
+	}
+
+	return 0;
+}
+
+/*
+ * Run callback of exec jobs: wait for ring space, queue every push buffer of
+ * the job on its channel and create a fence for the submission.
+ *
+ * Returns the new fence, or an ERR_PTR() encoded error.
+ */
+static struct dma_fence *
+nouveau_exec_job_run(struct nouveau_job *job)
+{
+	struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
+	struct nouveau_channel *chan = job->chan;
+	struct nouveau_fence *fence;
+	int err, n;
+
+	err = nouveau_dma_wait(chan, exec_job->push.count + 1, 16);
+	if (err) {
+		NV_PRINTK(err, job->cli, "nv50cal_space: %d\n", err);
+		return ERR_PTR(err);
+	}
+
+	for (n = 0; n < exec_job->push.count; n++)
+		nv50_dma_push(chan, exec_job->push.s[n].va,
+			      exec_job->push.s[n].va_len);
+
+	err = nouveau_fence_new(chan, false, &fence);
+	if (!err)
+		return &fence->base;
+
+	NV_PRINTK(err, job->cli, "error fencing pushbuf: %d\n", err);
+	WIND_RING(chan);
+	return ERR_PTR(err);
+}
+
+/*
+ * Free callback of exec jobs: release the job's copy of the push buffer
+ * array, the common job part and the job itself.
+ */
+static void
+nouveau_exec_job_free(struct nouveau_job *job)
+{
+	struct nouveau_exec_job *exec_job = to_nouveau_exec_job(job);
+
+	kfree(exec_job->push.s);
+	nouveau_base_job_free(job);
+	kfree(exec_job);
+}
+
+/* Callbacks installed on every job created by nouveau_exec_job_init(). */
+static struct nouveau_job_ops nouveau_exec_job_ops = {
+	.submit = nouveau_exec_job_submit,
+	.run = nouveau_exec_job_run,
+	.free = nouveau_exec_job_free,
+};
+
+/*
+ * Allocate an exec job, give it a private copy of the push buffer array of
+ * @exec and initialize the common job part.
+ *
+ * On success *@pjob points to the new job and 0 is returned. On failure
+ * *@pjob is NULL and a negative error code is returned.
+ */
+int
+nouveau_exec_job_init(struct nouveau_exec_job **pjob,
+		      struct nouveau_exec *exec)
+{
+	struct nouveau_exec_job *job = kzalloc(sizeof(*job), GFP_KERNEL);
+	int ret;
+
+	*pjob = job;
+	if (!job)
+		return -ENOMEM;
+
+	job->push.count = exec->push.count;
+	job->push.s = kmemdup(exec->push.s,
+			      sizeof(*exec->push.s) *
+			      exec->push.count,
+			      GFP_KERNEL);
+	if (!job->push.s) {
+		ret = -ENOMEM;
+		goto err_free_job;
+	}
+
+	job->base.ops = &nouveau_exec_job_ops;
+	ret = nouveau_base_job_init(&job->base, &exec->base);
+	if (!ret)
+		return 0;
+
+	kfree(job->push.s);
+err_free_job:
+	kfree(job);
+	*pjob = NULL;
+
+	return ret;
+}
+
+/*
+ * Tear down a job: drop the reference on its done fence, clean up the
+ * embedded scheduler job and hand the job to its type specific free callback.
+ */
+void
+nouveau_job_fini(struct nouveau_job *job)
+{
+	struct drm_sched_job *sched_job = &job->base;
+
+	dma_fence_put(job->done_fence);
+	drm_sched_job_cleanup(sched_job);
+
+	job->ops->free(job);
+}
+
+/*
+ * Resolve every in-sync of @job to a fence and register it as a dependency
+ * of the scheduler job.
+ *
+ * Returns 0 on success, or the first error of the fence lookup or of
+ * drm_sched_job_add_dependency().
+ */
+static int
+nouveau_job_add_deps(struct nouveau_job *job)
+{
+	int err, n;
+
+	for (n = 0; n < job->in_sync.count; n++) {
+		struct drm_nouveau_sync *sync = &job->in_sync.s[n];
+		struct dma_fence *dep = NULL;
+
+		err = sync_find_fence(job, sync, &dep);
+		if (err) {
+			NV_PRINTK(warn, job->cli,
+				  "Failed to find syncobj (-> in): handle=%d\n",
+				  sync->handle);
+			return err;
+		}
+
+		err = drm_sched_job_add_dependency(&job->base, dep);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Attach @fence to every out-sync of @job.
+ *
+ * For timeline syncobjs the fence is added as a new point at the requested
+ * timeline value, for binary syncobjs it replaces the current fence.
+ *
+ * Returns 0 on success, -EOPNOTSUPP for an unsupported sync type, -ENOENT if
+ * a syncobj handle can't be resolved, or -ENOMEM if allocating a fence chain
+ * node fails. Syncobjs processed before a failure keep the fence.
+ */
+static int
+nouveau_job_fence_attach(struct nouveau_job *job, struct dma_fence *fence)
+{
+	int n;
+
+	for (n = 0; n < job->out_sync.count; n++) {
+		struct drm_nouveau_sync *sync = &job->out_sync.s[n];
+		struct drm_syncobj *syncobj;
+		bool timeline;
+
+		switch (sync->flags & DRM_NOUVEAU_SYNC_TYPE_MASK) {
+		case DRM_NOUVEAU_SYNC_SYNCOBJ:
+			timeline = false;
+			break;
+		case DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ:
+			timeline = true;
+			break;
+		default:
+			return -EOPNOTSUPP;
+		}
+
+		syncobj = drm_syncobj_find(job->file_priv, sync->handle);
+		if (!syncobj) {
+			NV_PRINTK(warn, job->cli,
+				  "Failed to find syncobj (-> out): handle=%d\n",
+				  sync->handle);
+			return -ENOENT;
+		}
+
+		if (!timeline) {
+			drm_syncobj_replace_fence(syncobj, fence);
+		} else {
+			struct dma_fence_chain *chain = dma_fence_chain_alloc();
+
+			if (!chain) {
+				drm_syncobj_put(syncobj);
+				return -ENOMEM;
+			}
+
+			drm_syncobj_add_point(syncobj, chain, fence,
+					      sync->timeline_value);
+		}
+
+		drm_syncobj_put(syncobj);
+	}
+
+	return 0;
+}
+
+/* Dispatch to the job type's run callback and pass its result through. */
+static struct dma_fence *
+nouveau_job_run(struct nouveau_job *job)
+{
+	struct nouveau_job_ops *ops = job->ops;
+
+	return ops->run(job);
+}
+
+/**
+ * nouveau_job_run_sync() - run a job synchronously, bypassing the scheduler
+ * @job: the job to run
+ *
+ * Calls the job's run callback directly. If it returns a fence, the fence is
+ * waited for (interruptible) and its reference is dropped afterwards, which
+ * is what the scheduler would do with the fence returned by run_job().
+ * On success the job's done fence is signaled.
+ *
+ * Return: 0 on success, the error of the run callback or of the wait
+ * otherwise. The done fence is not signaled on failure.
+ */
+static int
+nouveau_job_run_sync(struct nouveau_job *job)
+{
+	struct dma_fence *fence;
+	int ret;
+
+	fence = nouveau_job_run(job);
+	if (IS_ERR(fence))
+		return PTR_ERR(fence);
+
+	if (fence) {
+		ret = dma_fence_wait(fence, true);
+		dma_fence_put(fence);
+		if (ret)
+			return ret;
+	}
+
+	dma_fence_signal(job->done_fence);
+
+	return 0;
+}
+
+/**
+ * nouveau_job_submit() - submit an initialized job
+ * @job: the job to submit
+ *
+ * Adds the job's in-syncs as scheduler dependencies, arms the scheduler job,
+ * attaches the done fence to the out-syncs and calls the job type's submit
+ * callback, if any.
+ *
+ * Synchronous jobs are then run right away and finalized on success.
+ * Asynchronous jobs are put on the entity's job list and pushed to the
+ * scheduler.
+ *
+ * Return: 0 on success, a negative error code otherwise. On failure the job
+ * is not freed by this function.
+ */
+int
+nouveau_job_submit(struct nouveau_job *job)
+{
+	struct nouveau_sched_entity *entity = to_nouveau_sched_entity(job->base.entity);
+	int ret;
+
+	drm_exec_init(&job->exec, true);
+
+	ret = nouveau_job_add_deps(job);
+	if (ret)
+		goto out;
+
+	drm_sched_job_arm(&job->base);
+	job->done_fence = dma_fence_get(&job->base.s_fence->finished);
+
+	ret = nouveau_job_fence_attach(job, job->done_fence);
+	if (ret)
+		goto out;
+
+	if (job->ops->submit) {
+		ret = job->ops->submit(job);
+		if (ret)
+			goto out;
+	}
+
+	if (job->sync) {
+		drm_exec_fini(&job->exec);
+
+		/* We're requested to run a synchronous job, hence don't push
+		 * the job, bypassing the job scheduler, and execute the jobs
+		 * run() function right away.
+		 *
+		 * As a consequence of bypassing the job scheduler we need to
+		 * handle fencing and job cleanup ourselves.
+		 */
+		ret = nouveau_job_run_sync(job);
+
+		/* If the job fails, the caller will do the cleanup for us. */
+		if (!ret)
+			nouveau_job_fini(job);
+
+		return ret;
+	}
+
+	/* Once the job is pushed the scheduler may run and free it at any
+	 * time. Hence finish everything that touches the job before pushing:
+	 * finalize the drm_exec context and link the job into the entity's
+	 * job list first. The free callback takes the same mutex, so it can't
+	 * unlink the job before we're done here.
+	 */
+	drm_exec_fini(&job->exec);
+
+	mutex_lock(&entity->job.mutex);
+	list_add_tail(&job->head, &entity->job.list);
+	drm_sched_entity_push_job(&job->base);
+	mutex_unlock(&entity->job.mutex);
+
+	return 0;
+
+out:
+	drm_exec_fini(&job->exec);
+	return ret;
+}
+
+/* Scheduler run_job callback: run the nouveau job wrapping @sched_job. */
+static struct dma_fence *
+nouveau_sched_run_job(struct drm_sched_job *sched_job)
+{
+	return nouveau_job_run(to_nouveau_job(sched_job));
+}
+
+/*
+ * Scheduler timedout_job callback: kill the job's channel unless it is
+ * marked killed already, log the timeout and finalize the job's scheduler
+ * entity. Always reports DRM_GPU_SCHED_STAT_ENODEV.
+ */
+static enum drm_gpu_sched_stat
+nouveau_sched_timedout_job(struct drm_sched_job *sched_job)
+{
+	struct nouveau_job *job = to_nouveau_job(sched_job);
+	struct nouveau_channel *chan = job->chan;
+	bool killed = atomic_read(&chan->killed);
+
+	if (unlikely(!killed))
+		nouveau_channel_kill(chan);
+
+	NV_PRINTK(warn, job->cli, "job timeout, channel %d killed!\n",
+		  chan->chid);
+
+	nouveau_sched_entity_fini(job->entity);
+
+	return DRM_GPU_SCHED_STAT_ENODEV;
+}
+
+/*
+ * Scheduler free_job callback: unlink the job from its entity's job list
+ * under the list mutex, then finalize it.
+ */
+static void
+nouveau_sched_free_job(struct drm_sched_job *sched_job)
+{
+	struct nouveau_job *job = to_nouveau_job(sched_job);
+	struct mutex *lock = &job->entity->job.mutex;
+
+	mutex_lock(lock);
+	list_del(&job->head);
+	mutex_unlock(lock);
+
+	nouveau_job_fini(job);
+}
+
+/*
+ * Initialize @entity: set up its job list and mutex and register it with
+ * @sched as the only scheduler, at normal priority.
+ *
+ * Returns the result of drm_sched_entity_init().
+ */
+int
+nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
+			  struct drm_gpu_scheduler *sched)
+{
+	int ret;
+
+	mutex_init(&entity->job.mutex);
+	INIT_LIST_HEAD(&entity->job.list);
+
+	ret = drm_sched_entity_init(&entity->base,
+				    DRM_SCHED_PRIORITY_NORMAL,
+				    &sched, 1, NULL);
+	return ret;
+}
+
+/* Destroy the scheduler entity embedded in @entity. */
+void
+nouveau_sched_entity_fini(struct nouveau_sched_entity *entity)
+{
+	struct drm_sched_entity *base = &entity->base;
+
+	drm_sched_entity_destroy(base);
+}
+
+/* Backend callbacks handed to the DRM GPU scheduler in nouveau_sched_init(). */
+static const struct drm_sched_backend_ops nouveau_sched_ops = {
+	.run_job = nouveau_sched_run_job,
+	.timedout_job = nouveau_sched_timedout_job,
+	.free_job = nouveau_sched_free_job,
+};
+
+/*
+ * Initialize the GPU scheduler @sched for @drm with the nouveau backend ops,
+ * NOUVEAU_SCHED_HW_SUBMISSIONS hardware submissions and a job timeout of
+ * NOUVEAU_SCHED_JOB_TIMEOUT_MS.
+ *
+ * Returns the result of drm_sched_init().
+ */
+int
+nouveau_sched_init(struct drm_gpu_scheduler *sched,
+		   struct nouveau_drm *drm)
+{
+	long timeout = msecs_to_jiffies(NOUVEAU_SCHED_JOB_TIMEOUT_MS);
+
+	return drm_sched_init(sched, &nouveau_sched_ops,
+			      NOUVEAU_SCHED_HW_SUBMISSIONS, 0, timeout,
+			      NULL, NULL, "nouveau", drm->dev->dev);
+}
+
+/* Tear down a scheduler set up by nouveau_sched_init(). */
+void
+nouveau_sched_fini(struct drm_gpu_scheduler *sched)
+{
+	drm_sched_fini(sched);
+}
diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.h b/drivers/gpu/drm/nouveau/nouveau_sched.h
new file mode 100644
index 000000000000..7fc5b7eea810
--- /dev/null
+++ b/drivers/gpu/drm/nouveau/nouveau_sched.h
@@ -0,0 +1,98 @@ 
+/* SPDX-License-Identifier: MIT */
+
+#ifndef NOUVEAU_SCHED_H
+#define NOUVEAU_SCHED_H
+
+#include <linux/types.h>
+
+#include <drm/drm_exec.h>
+#include <drm/gpu_scheduler.h>
+
+#include "nouveau_drv.h"
+#include "nouveau_exec.h"
+
+/* Get the nouveau_job embedding the given struct drm_sched_job. */
+#define to_nouveau_job(sched_job)		\
+		container_of((sched_job), struct nouveau_job, base)
+
+/* Get the nouveau_exec_job embedding the given struct nouveau_job. */
+#define to_nouveau_exec_job(job)		\
+		container_of((job), struct nouveau_exec_job, base)
+
+/* Get the nouveau_bind_job embedding the given struct nouveau_job. */
+#define to_nouveau_bind_job(job)		\
+		container_of((job), struct nouveau_bind_job, base)
+
+/* Common part of every job; embedded in the exec and bind job types. */
+struct nouveau_job {
+	/* The DRM GPU scheduler job this job wraps. */
+	struct drm_sched_job base;
+	/* Link in nouveau_sched_entity::job.list while the job is queued. */
+	struct list_head head;
+
+	/* Entity the job was initialized against. */
+	struct nouveau_sched_entity *entity;
+
+	struct drm_file *file_priv;
+	struct nouveau_cli *cli;
+	struct nouveau_channel *chan;
+
+	/* drm_exec context, set up and finalized by nouveau_job_submit(). */
+	struct drm_exec exec;
+	/* Reference on the scheduler's finished fence, taken on submit. */
+	struct dma_fence *done_fence;
+
+	/* Run the job synchronously on submit, bypassing the scheduler. */
+	bool sync;
+
+	/* Private copy of the syncs to wait for. */
+	struct {
+		struct drm_nouveau_sync *s;
+		u32 count;
+	} in_sync;
+
+	/* Private copy of the syncs the done fence is attached to. */
+	struct {
+		struct drm_nouveau_sync *s;
+		u32 count;
+	} out_sync;
+
+	/* Job type specific callbacks. */
+	struct nouveau_job_ops {
+		/* Optional; called by nouveau_job_submit() before running or pushing the job. */
+		int (*submit)(struct nouveau_job *);
+		/* Executes the job; may return a fence, NULL or an ERR_PTR(). */
+		struct dma_fence *(*run)(struct nouveau_job *);
+		/* Frees the job; called from nouveau_job_fini(). */
+		void (*free)(struct nouveau_job *);
+	} *ops;
+};
+
+/* Job executing push buffers on a channel. */
+struct nouveau_exec_job {
+	struct nouveau_job base;
+
+	/* Private copy of the push buffers to queue on the channel. */
+	struct {
+		struct drm_nouveau_exec_push *s;
+		u32 count;
+	} push;
+};
+
+/* Job carrying a list of VM_BIND operations. */
+struct nouveau_bind_job {
+	struct nouveau_job base;
+
+	/* struct bind_job_op */
+	struct list_head ops;
+};
+
+/*
+ * Job constructors: allocate a job of the respective type from the given
+ * submission parameters; *job is NULL on failure.
+ */
+int nouveau_bind_job_init(struct nouveau_bind_job **job,
+			  struct nouveau_exec_bind *bind);
+int nouveau_exec_job_init(struct nouveau_exec_job **job,
+			  struct nouveau_exec *exec);
+
+/* Submit an initialized job / tear down a job and free it. */
+int nouveau_job_submit(struct nouveau_job *job);
+void nouveau_job_fini(struct nouveau_job *job);
+
+/* Get the nouveau_sched_entity embedding the given struct drm_sched_entity. */
+#define to_nouveau_sched_entity(entity)		\
+		container_of((entity), struct nouveau_sched_entity, base)
+
+/* A DRM scheduler entity plus the list of jobs pushed through it. */
+struct nouveau_sched_entity {
+	struct drm_sched_entity base;
+	struct {
+		/* Jobs pushed to the scheduler and not yet freed. */
+		struct list_head list;
+		/* Protects the list above. */
+		struct mutex mutex;
+	} job;
+};
+
+/* Set up / destroy a scheduler entity. */
+int nouveau_sched_entity_init(struct nouveau_sched_entity *entity,
+			      struct drm_gpu_scheduler *sched);
+void nouveau_sched_entity_fini(struct nouveau_sched_entity *entity);
+
+/* Set up / tear down the DRM GPU scheduler instance used by the driver. */
+int nouveau_sched_init(struct drm_gpu_scheduler *sched,
+		       struct nouveau_drm *drm);
+void nouveau_sched_fini(struct drm_gpu_scheduler *sched);
+
+#endif