
[v5,20/21] drm/tegra: Implement job submission part of new UAPI

Message ID: 20210111130019.3515669-21-mperttunen@nvidia.com (mailing list archive)
State: New, archived
Series: Host1x/TegraDRM UAPI

Commit Message

Mikko Perttunen Jan. 11, 2021, 1 p.m. UTC
Implement the job submission IOCTL with a minimum feature set.
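
A rough sketch of how userspace is expected to drive the new IOCTL is
shown below. This is illustrative only: the structure layouts and the
DRM_IOCTL_TEGRA_CHANNEL_SUBMIT macro are assumed to come from the UAPI
header introduced earlier in this series, and error handling is
omitted.

	#include <stdint.h>
	#include <sys/ioctl.h>

	#include "tegra_drm.h" /* new UAPI header from this series */

	/* Submit a single gather that references one mapped buffer. */
	int submit_one_gather(int drm_fd, __u32 channel_ctx, int syncpt_fd,
			      __u32 mapping_id, __u32 *gather_words,
			      __u32 num_words, __u32 reloc_word,
			      __u64 target_offset)
	{
		/*
		 * One buffer reference: the kernel patches gather word
		 * 'reloc_word' with the IOVA of 'mapping_id' plus
		 * 'target_offset'.
		 */
		struct drm_tegra_submit_buf buf = {
			.mapping_id = mapping_id,
			.reloc.gather_offset_words = reloc_word,
			.reloc.target_offset = target_offset,
		};
		/* One command: execute the copied gather. */
		struct drm_tegra_submit_cmd cmd = {
			.type = DRM_TEGRA_SUBMIT_CMD_GATHER_UPTR,
			.gather_uptr.words = num_words,
		};
		struct drm_tegra_channel_submit args = {
			.channel_ctx = channel_ctx,
			.num_bufs = 1,
			.bufs_ptr = (uintptr_t)&buf,
			.num_cmds = 1,
			.cmds_ptr = (uintptr_t)&cmd,
			.gather_data_words = num_words,
			.gather_data_ptr = (uintptr_t)gather_words,
			.syncpt_incr.syncpt_fd = syncpt_fd,
			.syncpt_incr.num_incrs = 1,
		};

		if (ioctl(drm_fd, DRM_IOCTL_TEGRA_CHANNEL_SUBMIT, &args))
			return -1;

		/*
		 * args.syncpt_incr.fence_value now holds the syncpoint
		 * threshold to wait on for job completion.
		 */
		return 0;
	}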

Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
---
v5:
* Add 16K size limit to copies from userspace.
* Guard RELOC_BLOCKLINEAR flag handling to only exist in ARM64
  to prevent oversized shift on 32-bit platforms.
v4:
* Remove all features that are not strictly necessary.
* Split into two patches.
v3:
* Remove WRITE_RELOC. Relocations are now patched implicitly
  when patching is needed.
* Directly call PM runtime APIs on devices instead of using
  power_on/power_off callbacks.
* Remove incorrect mutex unlock in tegra_drm_ioctl_channel_open
* Use XA_FLAGS_ALLOC1 instead of XA_FLAGS_ALLOC
* Accommodate for removal of timeout field and inlining of
  syncpt_incrs array.
* Copy entire user arrays at a time instead of going through
  elements one-by-one.
* Implement waiting of DMA reservations.
* Split out gather_bo implementation into a separate file.
* Fix length parameter passed to sg_init_one in gather_bo
* Cosmetic cleanup.
---
 drivers/gpu/drm/tegra/Makefile         |   2 +
 drivers/gpu/drm/tegra/drm.c            |   2 +
 drivers/gpu/drm/tegra/uapi/gather_bo.c |  86 +++++
 drivers/gpu/drm/tegra/uapi/gather_bo.h |  22 ++
 drivers/gpu/drm/tegra/uapi/submit.c    | 428 +++++++++++++++++++++++++
 drivers/gpu/drm/tegra/uapi/submit.h    |  17 +
 6 files changed, 557 insertions(+)
 create mode 100644 drivers/gpu/drm/tegra/uapi/gather_bo.c
 create mode 100644 drivers/gpu/drm/tegra/uapi/gather_bo.h
 create mode 100644 drivers/gpu/drm/tegra/uapi/submit.c
 create mode 100644 drivers/gpu/drm/tegra/uapi/submit.h

Comments

Thierry Reding March 23, 2021, 1:38 p.m. UTC | #1
On Mon, Jan 11, 2021 at 03:00:18PM +0200, Mikko Perttunen wrote:
> Implement the job submission IOCTL with a minimum feature set.
> 
> Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
> ---
> v5:
> * Add 16K size limit to copies from userspace.
> * Guard RELOC_BLOCKLINEAR flag handling to only exist in ARM64
>   to prevent oversized shift on 32-bit platforms.
> v4:
> * Remove all features that are not strictly necessary.
> * Split into two patches.
> v3:
> * Remove WRITE_RELOC. Relocations are now patched implicitly
>   when patching is needed.
> * Directly call PM runtime APIs on devices instead of using
>   power_on/power_off callbacks.
> * Remove incorrect mutex unlock in tegra_drm_ioctl_channel_open
> * Use XA_FLAGS_ALLOC1 instead of XA_FLAGS_ALLOC
> * Accommodate for removal of timeout field and inlining of
>   syncpt_incrs array.
> * Copy entire user arrays at a time instead of going through
>   elements one-by-one.
> * Implement waiting of DMA reservations.
> * Split out gather_bo implementation into a separate file.
> * Fix length parameter passed to sg_init_one in gather_bo
> * Cosmetic cleanup.
> ---
>  drivers/gpu/drm/tegra/Makefile         |   2 +
>  drivers/gpu/drm/tegra/drm.c            |   2 +
>  drivers/gpu/drm/tegra/uapi/gather_bo.c |  86 +++++
>  drivers/gpu/drm/tegra/uapi/gather_bo.h |  22 ++
>  drivers/gpu/drm/tegra/uapi/submit.c    | 428 +++++++++++++++++++++++++
>  drivers/gpu/drm/tegra/uapi/submit.h    |  17 +
>  6 files changed, 557 insertions(+)
>  create mode 100644 drivers/gpu/drm/tegra/uapi/gather_bo.c
>  create mode 100644 drivers/gpu/drm/tegra/uapi/gather_bo.h
>  create mode 100644 drivers/gpu/drm/tegra/uapi/submit.c
>  create mode 100644 drivers/gpu/drm/tegra/uapi/submit.h
> 
> diff --git a/drivers/gpu/drm/tegra/Makefile b/drivers/gpu/drm/tegra/Makefile
> index 0abdb21b38b9..059322e88943 100644
> --- a/drivers/gpu/drm/tegra/Makefile
> +++ b/drivers/gpu/drm/tegra/Makefile
> @@ -4,6 +4,8 @@ ccflags-$(CONFIG_DRM_TEGRA_DEBUG) += -DDEBUG
>  tegra-drm-y := \
>  	drm.o \
>  	uapi/uapi.o \
> +	uapi/submit.o \
> +	uapi/gather_bo.o \
>  	gem.o \
>  	fb.o \
>  	dp.o \
> diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
> index 6a51035ce33f..60eab403ae9b 100644
> --- a/drivers/gpu/drm/tegra/drm.c
> +++ b/drivers/gpu/drm/tegra/drm.c
> @@ -737,6 +737,8 @@ static const struct drm_ioctl_desc tegra_drm_ioctls[] = {
>  			  DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(TEGRA_CHANNEL_UNMAP, tegra_drm_ioctl_channel_unmap,
>  			  DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(TEGRA_CHANNEL_SUBMIT, tegra_drm_ioctl_channel_submit,
> +			  DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(TEGRA_GEM_CREATE, tegra_drm_ioctl_gem_create,
>  			  DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(TEGRA_GEM_MMAP, tegra_drm_ioctl_gem_mmap,
> diff --git a/drivers/gpu/drm/tegra/uapi/gather_bo.c b/drivers/gpu/drm/tegra/uapi/gather_bo.c
> new file mode 100644
> index 000000000000..b487a0d44648
> --- /dev/null
> +++ b/drivers/gpu/drm/tegra/uapi/gather_bo.c
> @@ -0,0 +1,86 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2020 NVIDIA Corporation */
> +
> +#include <linux/scatterlist.h>
> +#include <linux/slab.h>
> +
> +#include "gather_bo.h"
> +
> +static struct host1x_bo *gather_bo_get(struct host1x_bo *host_bo)
> +{
> +	struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
> +
> +	kref_get(&bo->ref);
> +
> +	return host_bo;
> +}
> +
> +static void gather_bo_release(struct kref *ref)
> +{
> +	struct gather_bo *bo = container_of(ref, struct gather_bo, ref);
> +
> +	kfree(bo->gather_data);
> +	kfree(bo);
> +}
> +
> +void gather_bo_put(struct host1x_bo *host_bo)
> +{
> +	struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
> +
> +	kref_put(&bo->ref, gather_bo_release);
> +}
> +
> +static struct sg_table *
> +gather_bo_pin(struct device *dev, struct host1x_bo *host_bo, dma_addr_t *phys)
> +{
> +	struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
> +	struct sg_table *sgt;
> +	int err;
> +
> +	if (phys) {
> +		*phys = virt_to_phys(bo->gather_data);
> +		return NULL;
> +	}
> +
> +	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
> +	if (!sgt)
> +		return ERR_PTR(-ENOMEM);
> +
> +	err = sg_alloc_table(sgt, 1, GFP_KERNEL);
> +	if (err) {
> +		kfree(sgt);
> +		return ERR_PTR(err);
> +	}
> +
> +	sg_init_one(sgt->sgl, bo->gather_data, bo->gather_data_words*4);
> +
> +	return sgt;
> +}
> +
> +static void gather_bo_unpin(struct device *dev, struct sg_table *sgt)
> +{
> +	if (sgt) {
> +		sg_free_table(sgt);
> +		kfree(sgt);
> +	}
> +}
> +
> +static void *gather_bo_mmap(struct host1x_bo *host_bo)
> +{
> +	struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
> +
> +	return bo->gather_data;
> +}
> +
> +static void gather_bo_munmap(struct host1x_bo *host_bo, void *addr)
> +{
> +}
> +
> +const struct host1x_bo_ops gather_bo_ops = {
> +	.get = gather_bo_get,
> +	.put = gather_bo_put,
> +	.pin = gather_bo_pin,
> +	.unpin = gather_bo_unpin,
> +	.mmap = gather_bo_mmap,
> +	.munmap = gather_bo_munmap,
> +};
> diff --git a/drivers/gpu/drm/tegra/uapi/gather_bo.h b/drivers/gpu/drm/tegra/uapi/gather_bo.h
> new file mode 100644
> index 000000000000..6b4c9d83ac91
> --- /dev/null
> +++ b/drivers/gpu/drm/tegra/uapi/gather_bo.h
> @@ -0,0 +1,22 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/* Copyright (c) 2020 NVIDIA Corporation */
> +
> +#ifndef _TEGRA_DRM_SUBMIT_GATHER_BO_H
> +#define _TEGRA_DRM_SUBMIT_GATHER_BO_H
> +
> +#include <linux/host1x.h>
> +#include <linux/kref.h>
> +
> +struct gather_bo {
> +	struct host1x_bo base;
> +
> +	struct kref ref;
> +
> +	u32 *gather_data;
> +	size_t gather_data_words;
> +};
> +
> +extern const struct host1x_bo_ops gather_bo_ops;
> +void gather_bo_put(struct host1x_bo *host_bo);
> +
> +#endif
> diff --git a/drivers/gpu/drm/tegra/uapi/submit.c b/drivers/gpu/drm/tegra/uapi/submit.c
> new file mode 100644
> index 000000000000..398be3065e21
> --- /dev/null
> +++ b/drivers/gpu/drm/tegra/uapi/submit.c
> @@ -0,0 +1,428 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2020 NVIDIA Corporation */
> +
> +#include <linux/dma-fence-array.h>
> +#include <linux/file.h>
> +#include <linux/host1x.h>
> +#include <linux/iommu.h>
> +#include <linux/kref.h>
> +#include <linux/list.h>
> +#include <linux/nospec.h>
> +#include <linux/pm_runtime.h>
> +#include <linux/sync_file.h>
> +
> +#include <drm/drm_drv.h>
> +#include <drm/drm_file.h>
> +
> +#include "../uapi.h"
> +#include "../drm.h"
> +#include "../gem.h"
> +
> +#include "gather_bo.h"
> +#include "submit.h"
> +
> +static struct tegra_drm_mapping *
> +tegra_drm_mapping_get(struct tegra_drm_channel_ctx *ctx, u32 id)
> +{
> +	struct tegra_drm_mapping *mapping;
> +
> +	xa_lock(&ctx->mappings);
> +	mapping = xa_load(&ctx->mappings, id);
> +	if (mapping)
> +		kref_get(&mapping->ref);
> +	xa_unlock(&ctx->mappings);
> +
> +	return mapping;
> +}
> +
> +static void *alloc_copy_user_array(void __user *from, size_t count, size_t size)
> +{
> +	unsigned long copy_err;
> +	size_t copy_len;
> +	void *data;
> +
> +	if (check_mul_overflow(count, size, &copy_len))
> +		return ERR_PTR(-EINVAL);
> +
> +	if (copy_len > 0x4000)
> +		return ERR_PTR(-E2BIG);
> +
> +	data = kvmalloc(copy_len, GFP_KERNEL);
> +	if (!data)
> +		return ERR_PTR(-ENOMEM);
> +
> +	copy_err = copy_from_user(data, from, copy_len);
> +	if (copy_err) {
> +		kvfree(data);
> +		return ERR_PTR(-EFAULT);
> +	}
> +
> +	return data;
> +}
> +
> +static int submit_copy_gather_data(struct drm_device *drm,
> +				   struct gather_bo **pbo,
> +				   struct drm_tegra_channel_submit *args)
> +{
> +	unsigned long copy_err;
> +	struct gather_bo *bo;
> +	size_t copy_len;
> +
> +	if (args->gather_data_words == 0) {
> +		drm_info(drm, "gather_data_words can't be 0");
> +		return -EINVAL;
> +	}
> +
> +	if (check_mul_overflow((size_t)args->gather_data_words, (size_t)4, &copy_len))
> +		return -EINVAL;
> +
> +	bo = kzalloc(sizeof(*bo), GFP_KERNEL);
> +	if (!bo)
> +		return -ENOMEM;
> +
> +	kref_init(&bo->ref);
> +	host1x_bo_init(&bo->base, &gather_bo_ops);
> +
> +	bo->gather_data = kmalloc(copy_len, GFP_KERNEL | __GFP_NOWARN);
> +	if (!bo->gather_data) {
> +		kfree(bo);
> +		return -ENOMEM;
> +	}
> +
> +	copy_err = copy_from_user(bo->gather_data,
> +				  u64_to_user_ptr(args->gather_data_ptr),
> +				  copy_len);
> +	if (copy_err) {
> +		kfree(bo->gather_data);
> +		kfree(bo);
> +		return -EFAULT;
> +	}
> +
> +	bo->gather_data_words = args->gather_data_words;
> +
> +	*pbo = bo;
> +
> +	return 0;
> +}
> +
> +static int submit_write_reloc(struct gather_bo *bo,
> +			      struct drm_tegra_submit_buf *buf,
> +			      struct tegra_drm_mapping *mapping)
> +{
> +	/* TODO check that target_offset is within bounds */
> +	dma_addr_t iova = mapping->iova + buf->reloc.target_offset;
> +	u32 written_ptr = (u32)(iova >> buf->reloc.shift);
> +
> +#ifdef CONFIG_ARM64
> +	if (buf->flags & DRM_TEGRA_SUBMIT_BUF_RELOC_BLOCKLINEAR)
> +		written_ptr |= BIT(39);
> +#endif

Sorry, but this still isn't correct. written_ptr is still only 32-bits
wide, so your BIT(39) is going to get discarded even on 64-bit ARM. The
idiomatic way to do this is to make written_ptr dma_addr_t and use a
CONFIG_ARCH_DMA_ADDR_T_64BIT guard.

But even with that this looks wrong because you're OR'ing this in after
shifting by buf->reloc.shift. Doesn't that OR it in at the wrong offset?
Should you perhaps be doing this instead:

	#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
		if (buf->flags & DRM_TEGRA_SUBMIT_BUF_RELOC_BLOCKLINEAR)
			iova |= BIT(39);
	#endif

	written_ptr = (u32)(iova >> buf->reloc.shift);

?
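
Put together (completely untested), the fixed helper might then look
roughly like this; note BIT_ULL() so that the constant is wide enough
even when unsigned long is only 32 bits wide:

	static int submit_write_reloc(struct gather_bo *bo,
				      struct drm_tegra_submit_buf *buf,
				      struct tegra_drm_mapping *mapping)
	{
		/* TODO check that target_offset is within bounds */
		dma_addr_t iova = mapping->iova + buf->reloc.target_offset;
		u32 written_ptr;

	#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
		/* select the alternate sector layout */
		if (buf->flags & DRM_TEGRA_SUBMIT_BUF_RELOC_BLOCKLINEAR)
			iova |= BIT_ULL(39);
	#endif

		written_ptr = (u32)(iova >> buf->reloc.shift);

		if (buf->reloc.gather_offset_words >= bo->gather_data_words)
			return -EINVAL;

		buf->reloc.gather_offset_words = array_index_nospec(
			buf->reloc.gather_offset_words, bo->gather_data_words);

		bo->gather_data[buf->reloc.gather_offset_words] = written_ptr;

		return 0;
	}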

Also, on a side-note: BLOCKLINEAR really isn't the right term here. I
recently dealt with this for display (though I haven't sent out that
patch yet) and this is actually a bit that selects which sector layout
swizzling is being applied. That's independent of block linear format
and I think you can have different sector layouts irrespective of the
block linear format (though I don't think that's usually done).

That said, I wonder if a better interface here would be to reuse format
modifiers here. That would allow us to more fully describe the format of
a surface in case we ever need it, and it already includes the sector
layout information as well.

Thierry
Mikko Perttunen March 23, 2021, 2:16 p.m. UTC | #2
On 3/23/21 3:38 PM, Thierry Reding wrote:
> On Mon, Jan 11, 2021 at 03:00:18PM +0200, Mikko Perttunen wrote:
>> Implement the job submission IOCTL with a minimum feature set.
>>
>> Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
>> ---
>> v5:
>> * Add 16K size limit to copies from userspace.
>> * Guard RELOC_BLOCKLINEAR flag handling to only exist in ARM64
>>    to prevent oversized shift on 32-bit platforms.
>> v4:
>> * Remove all features that are not strictly necessary.
>> * Split into two patches.
>> v3:
>> * Remove WRITE_RELOC. Relocations are now patched implicitly
>>    when patching is needed.
>> * Directly call PM runtime APIs on devices instead of using
>>    power_on/power_off callbacks.
>> * Remove incorrect mutex unlock in tegra_drm_ioctl_channel_open
>> * Use XA_FLAGS_ALLOC1 instead of XA_FLAGS_ALLOC
>> * Accommodate for removal of timeout field and inlining of
>>    syncpt_incrs array.
>> * Copy entire user arrays at a time instead of going through
>>    elements one-by-one.
>> * Implement waiting of DMA reservations.
>> * Split out gather_bo implementation into a separate file.
>> * Fix length parameter passed to sg_init_one in gather_bo
>> * Cosmetic cleanup.
>> ---
>>   drivers/gpu/drm/tegra/Makefile         |   2 +
>>   drivers/gpu/drm/tegra/drm.c            |   2 +
>>   drivers/gpu/drm/tegra/uapi/gather_bo.c |  86 +++++
>>   drivers/gpu/drm/tegra/uapi/gather_bo.h |  22 ++
>>   drivers/gpu/drm/tegra/uapi/submit.c    | 428 +++++++++++++++++++++++++
>>   drivers/gpu/drm/tegra/uapi/submit.h    |  17 +
>>   6 files changed, 557 insertions(+)
>>   create mode 100644 drivers/gpu/drm/tegra/uapi/gather_bo.c
>>   create mode 100644 drivers/gpu/drm/tegra/uapi/gather_bo.h
>>   create mode 100644 drivers/gpu/drm/tegra/uapi/submit.c
>>   create mode 100644 drivers/gpu/drm/tegra/uapi/submit.h
>>
>> diff --git a/drivers/gpu/drm/tegra/Makefile b/drivers/gpu/drm/tegra/Makefile
>> index 0abdb21b38b9..059322e88943 100644
>> --- a/drivers/gpu/drm/tegra/Makefile
>> +++ b/drivers/gpu/drm/tegra/Makefile
>> @@ -4,6 +4,8 @@ ccflags-$(CONFIG_DRM_TEGRA_DEBUG) += -DDEBUG
>>   tegra-drm-y := \
>>   	drm.o \
>>   	uapi/uapi.o \
>> +	uapi/submit.o \
>> +	uapi/gather_bo.o \
>>   	gem.o \
>>   	fb.o \
>>   	dp.o \
>> diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
>> index 6a51035ce33f..60eab403ae9b 100644
>> --- a/drivers/gpu/drm/tegra/drm.c
>> +++ b/drivers/gpu/drm/tegra/drm.c
>> @@ -737,6 +737,8 @@ static const struct drm_ioctl_desc tegra_drm_ioctls[] = {
>>   			  DRM_RENDER_ALLOW),
>>   	DRM_IOCTL_DEF_DRV(TEGRA_CHANNEL_UNMAP, tegra_drm_ioctl_channel_unmap,
>>   			  DRM_RENDER_ALLOW),
>> +	DRM_IOCTL_DEF_DRV(TEGRA_CHANNEL_SUBMIT, tegra_drm_ioctl_channel_submit,
>> +			  DRM_RENDER_ALLOW),
>>   	DRM_IOCTL_DEF_DRV(TEGRA_GEM_CREATE, tegra_drm_ioctl_gem_create,
>>   			  DRM_RENDER_ALLOW),
>>   	DRM_IOCTL_DEF_DRV(TEGRA_GEM_MMAP, tegra_drm_ioctl_gem_mmap,
>> diff --git a/drivers/gpu/drm/tegra/uapi/gather_bo.c b/drivers/gpu/drm/tegra/uapi/gather_bo.c
>> new file mode 100644
>> index 000000000000..b487a0d44648
>> --- /dev/null
>> +++ b/drivers/gpu/drm/tegra/uapi/gather_bo.c
>> @@ -0,0 +1,86 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/* Copyright (c) 2020 NVIDIA Corporation */
>> +
>> +#include <linux/scatterlist.h>
>> +#include <linux/slab.h>
>> +
>> +#include "gather_bo.h"
>> +
>> +static struct host1x_bo *gather_bo_get(struct host1x_bo *host_bo)
>> +{
>> +	struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
>> +
>> +	kref_get(&bo->ref);
>> +
>> +	return host_bo;
>> +}
>> +
>> +static void gather_bo_release(struct kref *ref)
>> +{
>> +	struct gather_bo *bo = container_of(ref, struct gather_bo, ref);
>> +
>> +	kfree(bo->gather_data);
>> +	kfree(bo);
>> +}
>> +
>> +void gather_bo_put(struct host1x_bo *host_bo)
>> +{
>> +	struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
>> +
>> +	kref_put(&bo->ref, gather_bo_release);
>> +}
>> +
>> +static struct sg_table *
>> +gather_bo_pin(struct device *dev, struct host1x_bo *host_bo, dma_addr_t *phys)
>> +{
>> +	struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
>> +	struct sg_table *sgt;
>> +	int err;
>> +
>> +	if (phys) {
>> +		*phys = virt_to_phys(bo->gather_data);
>> +		return NULL;
>> +	}
>> +
>> +	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
>> +	if (!sgt)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	err = sg_alloc_table(sgt, 1, GFP_KERNEL);
>> +	if (err) {
>> +		kfree(sgt);
>> +		return ERR_PTR(err);
>> +	}
>> +
>> +	sg_init_one(sgt->sgl, bo->gather_data, bo->gather_data_words*4);
>> +
>> +	return sgt;
>> +}
>> +
>> +static void gather_bo_unpin(struct device *dev, struct sg_table *sgt)
>> +{
>> +	if (sgt) {
>> +		sg_free_table(sgt);
>> +		kfree(sgt);
>> +	}
>> +}
>> +
>> +static void *gather_bo_mmap(struct host1x_bo *host_bo)
>> +{
>> +	struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
>> +
>> +	return bo->gather_data;
>> +}
>> +
>> +static void gather_bo_munmap(struct host1x_bo *host_bo, void *addr)
>> +{
>> +}
>> +
>> +const struct host1x_bo_ops gather_bo_ops = {
>> +	.get = gather_bo_get,
>> +	.put = gather_bo_put,
>> +	.pin = gather_bo_pin,
>> +	.unpin = gather_bo_unpin,
>> +	.mmap = gather_bo_mmap,
>> +	.munmap = gather_bo_munmap,
>> +};
>> diff --git a/drivers/gpu/drm/tegra/uapi/gather_bo.h b/drivers/gpu/drm/tegra/uapi/gather_bo.h
>> new file mode 100644
>> index 000000000000..6b4c9d83ac91
>> --- /dev/null
>> +++ b/drivers/gpu/drm/tegra/uapi/gather_bo.h
>> @@ -0,0 +1,22 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/* Copyright (c) 2020 NVIDIA Corporation */
>> +
>> +#ifndef _TEGRA_DRM_SUBMIT_GATHER_BO_H
>> +#define _TEGRA_DRM_SUBMIT_GATHER_BO_H
>> +
>> +#include <linux/host1x.h>
>> +#include <linux/kref.h>
>> +
>> +struct gather_bo {
>> +	struct host1x_bo base;
>> +
>> +	struct kref ref;
>> +
>> +	u32 *gather_data;
>> +	size_t gather_data_words;
>> +};
>> +
>> +extern const struct host1x_bo_ops gather_bo_ops;
>> +void gather_bo_put(struct host1x_bo *host_bo);
>> +
>> +#endif
>> diff --git a/drivers/gpu/drm/tegra/uapi/submit.c b/drivers/gpu/drm/tegra/uapi/submit.c
>> new file mode 100644
>> index 000000000000..398be3065e21
>> --- /dev/null
>> +++ b/drivers/gpu/drm/tegra/uapi/submit.c
>> @@ -0,0 +1,428 @@
>> +// SPDX-License-Identifier: GPL-2.0-only
>> +/* Copyright (c) 2020 NVIDIA Corporation */
>> +
>> +#include <linux/dma-fence-array.h>
>> +#include <linux/file.h>
>> +#include <linux/host1x.h>
>> +#include <linux/iommu.h>
>> +#include <linux/kref.h>
>> +#include <linux/list.h>
>> +#include <linux/nospec.h>
>> +#include <linux/pm_runtime.h>
>> +#include <linux/sync_file.h>
>> +
>> +#include <drm/drm_drv.h>
>> +#include <drm/drm_file.h>
>> +
>> +#include "../uapi.h"
>> +#include "../drm.h"
>> +#include "../gem.h"
>> +
>> +#include "gather_bo.h"
>> +#include "submit.h"
>> +
>> +static struct tegra_drm_mapping *
>> +tegra_drm_mapping_get(struct tegra_drm_channel_ctx *ctx, u32 id)
>> +{
>> +	struct tegra_drm_mapping *mapping;
>> +
>> +	xa_lock(&ctx->mappings);
>> +	mapping = xa_load(&ctx->mappings, id);
>> +	if (mapping)
>> +		kref_get(&mapping->ref);
>> +	xa_unlock(&ctx->mappings);
>> +
>> +	return mapping;
>> +}
>> +
>> +static void *alloc_copy_user_array(void __user *from, size_t count, size_t size)
>> +{
>> +	unsigned long copy_err;
>> +	size_t copy_len;
>> +	void *data;
>> +
>> +	if (check_mul_overflow(count, size, &copy_len))
>> +		return ERR_PTR(-EINVAL);
>> +
>> +	if (copy_len > 0x4000)
>> +		return ERR_PTR(-E2BIG);
>> +
>> +	data = kvmalloc(copy_len, GFP_KERNEL);
>> +	if (!data)
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +	copy_err = copy_from_user(data, from, copy_len);
>> +	if (copy_err) {
>> +		kvfree(data);
>> +		return ERR_PTR(-EFAULT);
>> +	}
>> +
>> +	return data;
>> +}
>> +
>> +static int submit_copy_gather_data(struct drm_device *drm,
>> +				   struct gather_bo **pbo,
>> +				   struct drm_tegra_channel_submit *args)
>> +{
>> +	unsigned long copy_err;
>> +	struct gather_bo *bo;
>> +	size_t copy_len;
>> +
>> +	if (args->gather_data_words == 0) {
>> +		drm_info(drm, "gather_data_words can't be 0");
>> +		return -EINVAL;
>> +	}
>> +
>> +	if (check_mul_overflow((size_t)args->gather_data_words, (size_t)4, &copy_len))
>> +		return -EINVAL;
>> +
>> +	bo = kzalloc(sizeof(*bo), GFP_KERNEL);
>> +	if (!bo)
>> +		return -ENOMEM;
>> +
>> +	kref_init(&bo->ref);
>> +	host1x_bo_init(&bo->base, &gather_bo_ops);
>> +
>> +	bo->gather_data = kmalloc(copy_len, GFP_KERNEL | __GFP_NOWARN);
>> +	if (!bo->gather_data) {
>> +		kfree(bo);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	copy_err = copy_from_user(bo->gather_data,
>> +				  u64_to_user_ptr(args->gather_data_ptr),
>> +				  copy_len);
>> +	if (copy_err) {
>> +		kfree(bo->gather_data);
>> +		kfree(bo);
>> +		return -EFAULT;
>> +	}
>> +
>> +	bo->gather_data_words = args->gather_data_words;
>> +
>> +	*pbo = bo;
>> +
>> +	return 0;
>> +}
>> +
>> +static int submit_write_reloc(struct gather_bo *bo,
>> +			      struct drm_tegra_submit_buf *buf,
>> +			      struct tegra_drm_mapping *mapping)
>> +{
>> +	/* TODO check that target_offset is within bounds */
>> +	dma_addr_t iova = mapping->iova + buf->reloc.target_offset;
>> +	u32 written_ptr = (u32)(iova >> buf->reloc.shift);
>> +
>> +#ifdef CONFIG_ARM64
>> +	if (buf->flags & DRM_TEGRA_SUBMIT_BUF_RELOC_BLOCKLINEAR)
>> +		written_ptr |= BIT(39);
>> +#endif
> 
> Sorry, but this still isn't correct. written_ptr is still only 32-bits
> wide, so your BIT(39) is going to get discarded even on 64-bit ARM. The
> idiomatic way to do this is to make written_ptr dma_addr_t and use a
> > CONFIG_ARCH_DMA_ADDR_T_64BIT guard.
> >
> But even with that this looks wrong because you're OR'ing this in after
> shifting by buf->reloc.shift. Doesn't that OR it in at the wrong offset?
> Should you perhaps be doing this instead:
> 
> 	#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
> 		if (buf->flags & DRM_TEGRA_SUBMIT_BUF_RELOC_BLOCKLINEAR)
> 			iova |= BIT(39);
> 	#endif
> 
> 	written_ptr = (u32)(iova >> buf->reloc.shift);
> 
> ?

Yes, you are of course right.. will fix this. That might explain some of 
the VIC test failures I've seen.

> 
> Also, on a side-note: BLOCKLINEAR really isn't the right term here. I
> recently dealt with this for display (though I haven't sent out that
> patch yet) and this is actually a bit that selects which sector layout
> swizzling is being applied. That's independent of block linear format
> and I think you can have different sector layouts irrespective of the
> block linear format (though I don't think that's usually done).
> 
> That said, I wonder if a better interface here would be to reuse format
> modifiers here. That would allow us to more fully describe the format of
> a surface in case we ever need it, and it already includes the sector
> layout information as well.

I think having just a flag that enables or disables the swizzling is 
better -- that way it is the responsibility of the userspace, which is 
where all the engine knowledge is as well, to know for each buffer 
whether it wants swizzling or not. Now, in practice at the moment the 
kernel can just lookup the format and set the bit based on that, but 
e.g. if there was an engine that could do the swizzling natively, and we 
had the format modifier here, we'd need to have the knowledge in the 
kernel to decide for each chip/engine whether to apply the bit.

For display it is a bit different since the knowledge is already in the 
kernel.

Mikko

> 
> Thierry
>
Thierry Reding March 23, 2021, 5:04 p.m. UTC | #3
On Tue, Mar 23, 2021 at 04:16:00PM +0200, Mikko Perttunen wrote:
> On 3/23/21 3:38 PM, Thierry Reding wrote:
> > On Mon, Jan 11, 2021 at 03:00:18PM +0200, Mikko Perttunen wrote:
> > > Implement the job submission IOCTL with a minimum feature set.
> > > 
> > > Signed-off-by: Mikko Perttunen <mperttunen@nvidia.com>
> > > ---
> > > v5:
> > > * Add 16K size limit to copies from userspace.
> > > * Guard RELOC_BLOCKLINEAR flag handling to only exist in ARM64
> > >    to prevent oversized shift on 32-bit platforms.
> > > v4:
> > > * Remove all features that are not strictly necessary.
> > > * Split into two patches.
> > > v3:
> > > * Remove WRITE_RELOC. Relocations are now patched implicitly
> > >    when patching is needed.
> > > * Directly call PM runtime APIs on devices instead of using
> > >    power_on/power_off callbacks.
> > > * Remove incorrect mutex unlock in tegra_drm_ioctl_channel_open
> > > * Use XA_FLAGS_ALLOC1 instead of XA_FLAGS_ALLOC
> > > * Accommodate for removal of timeout field and inlining of
> > >    syncpt_incrs array.
> > > * Copy entire user arrays at a time instead of going through
> > >    elements one-by-one.
> > > * Implement waiting of DMA reservations.
> > > * Split out gather_bo implementation into a separate file.
> > > * Fix length parameter passed to sg_init_one in gather_bo
> > > * Cosmetic cleanup.
> > > ---
> > >   drivers/gpu/drm/tegra/Makefile         |   2 +
> > >   drivers/gpu/drm/tegra/drm.c            |   2 +
> > >   drivers/gpu/drm/tegra/uapi/gather_bo.c |  86 +++++
> > >   drivers/gpu/drm/tegra/uapi/gather_bo.h |  22 ++
> > >   drivers/gpu/drm/tegra/uapi/submit.c    | 428 +++++++++++++++++++++++++
> > >   drivers/gpu/drm/tegra/uapi/submit.h    |  17 +
> > >   6 files changed, 557 insertions(+)
> > >   create mode 100644 drivers/gpu/drm/tegra/uapi/gather_bo.c
> > >   create mode 100644 drivers/gpu/drm/tegra/uapi/gather_bo.h
> > >   create mode 100644 drivers/gpu/drm/tegra/uapi/submit.c
> > >   create mode 100644 drivers/gpu/drm/tegra/uapi/submit.h
> > > 
> > > diff --git a/drivers/gpu/drm/tegra/Makefile b/drivers/gpu/drm/tegra/Makefile
> > > index 0abdb21b38b9..059322e88943 100644
> > > --- a/drivers/gpu/drm/tegra/Makefile
> > > +++ b/drivers/gpu/drm/tegra/Makefile
> > > @@ -4,6 +4,8 @@ ccflags-$(CONFIG_DRM_TEGRA_DEBUG) += -DDEBUG
> > >   tegra-drm-y := \
> > >   	drm.o \
> > >   	uapi/uapi.o \
> > > +	uapi/submit.o \
> > > +	uapi/gather_bo.o \
> > >   	gem.o \
> > >   	fb.o \
> > >   	dp.o \
> > > diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
> > > index 6a51035ce33f..60eab403ae9b 100644
> > > --- a/drivers/gpu/drm/tegra/drm.c
> > > +++ b/drivers/gpu/drm/tegra/drm.c
> > > @@ -737,6 +737,8 @@ static const struct drm_ioctl_desc tegra_drm_ioctls[] = {
> > >   			  DRM_RENDER_ALLOW),
> > >   	DRM_IOCTL_DEF_DRV(TEGRA_CHANNEL_UNMAP, tegra_drm_ioctl_channel_unmap,
> > >   			  DRM_RENDER_ALLOW),
> > > +	DRM_IOCTL_DEF_DRV(TEGRA_CHANNEL_SUBMIT, tegra_drm_ioctl_channel_submit,
> > > +			  DRM_RENDER_ALLOW),
> > >   	DRM_IOCTL_DEF_DRV(TEGRA_GEM_CREATE, tegra_drm_ioctl_gem_create,
> > >   			  DRM_RENDER_ALLOW),
> > >   	DRM_IOCTL_DEF_DRV(TEGRA_GEM_MMAP, tegra_drm_ioctl_gem_mmap,
> > > diff --git a/drivers/gpu/drm/tegra/uapi/gather_bo.c b/drivers/gpu/drm/tegra/uapi/gather_bo.c
> > > new file mode 100644
> > > index 000000000000..b487a0d44648
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/tegra/uapi/gather_bo.c
> > > @@ -0,0 +1,86 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/* Copyright (c) 2020 NVIDIA Corporation */
> > > +
> > > +#include <linux/scatterlist.h>
> > > +#include <linux/slab.h>
> > > +
> > > +#include "gather_bo.h"
> > > +
> > > +static struct host1x_bo *gather_bo_get(struct host1x_bo *host_bo)
> > > +{
> > > +	struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
> > > +
> > > +	kref_get(&bo->ref);
> > > +
> > > +	return host_bo;
> > > +}
> > > +
> > > +static void gather_bo_release(struct kref *ref)
> > > +{
> > > +	struct gather_bo *bo = container_of(ref, struct gather_bo, ref);
> > > +
> > > +	kfree(bo->gather_data);
> > > +	kfree(bo);
> > > +}
> > > +
> > > +void gather_bo_put(struct host1x_bo *host_bo)
> > > +{
> > > +	struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
> > > +
> > > +	kref_put(&bo->ref, gather_bo_release);
> > > +}
> > > +
> > > +static struct sg_table *
> > > +gather_bo_pin(struct device *dev, struct host1x_bo *host_bo, dma_addr_t *phys)
> > > +{
> > > +	struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
> > > +	struct sg_table *sgt;
> > > +	int err;
> > > +
> > > +	if (phys) {
> > > +		*phys = virt_to_phys(bo->gather_data);
> > > +		return NULL;
> > > +	}
> > > +
> > > +	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
> > > +	if (!sgt)
> > > +		return ERR_PTR(-ENOMEM);
> > > +
> > > +	err = sg_alloc_table(sgt, 1, GFP_KERNEL);
> > > +	if (err) {
> > > +		kfree(sgt);
> > > +		return ERR_PTR(err);
> > > +	}
> > > +
> > > +	sg_init_one(sgt->sgl, bo->gather_data, bo->gather_data_words*4);
> > > +
> > > +	return sgt;
> > > +}
> > > +
> > > +static void gather_bo_unpin(struct device *dev, struct sg_table *sgt)
> > > +{
> > > +	if (sgt) {
> > > +		sg_free_table(sgt);
> > > +		kfree(sgt);
> > > +	}
> > > +}
> > > +
> > > +static void *gather_bo_mmap(struct host1x_bo *host_bo)
> > > +{
> > > +	struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
> > > +
> > > +	return bo->gather_data;
> > > +}
> > > +
> > > +static void gather_bo_munmap(struct host1x_bo *host_bo, void *addr)
> > > +{
> > > +}
> > > +
> > > +const struct host1x_bo_ops gather_bo_ops = {
> > > +	.get = gather_bo_get,
> > > +	.put = gather_bo_put,
> > > +	.pin = gather_bo_pin,
> > > +	.unpin = gather_bo_unpin,
> > > +	.mmap = gather_bo_mmap,
> > > +	.munmap = gather_bo_munmap,
> > > +};
> > > diff --git a/drivers/gpu/drm/tegra/uapi/gather_bo.h b/drivers/gpu/drm/tegra/uapi/gather_bo.h
> > > new file mode 100644
> > > index 000000000000..6b4c9d83ac91
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/tegra/uapi/gather_bo.h
> > > @@ -0,0 +1,22 @@
> > > +/* SPDX-License-Identifier: GPL-2.0-only */
> > > +/* Copyright (c) 2020 NVIDIA Corporation */
> > > +
> > > +#ifndef _TEGRA_DRM_SUBMIT_GATHER_BO_H
> > > +#define _TEGRA_DRM_SUBMIT_GATHER_BO_H
> > > +
> > > +#include <linux/host1x.h>
> > > +#include <linux/kref.h>
> > > +
> > > +struct gather_bo {
> > > +	struct host1x_bo base;
> > > +
> > > +	struct kref ref;
> > > +
> > > +	u32 *gather_data;
> > > +	size_t gather_data_words;
> > > +};
> > > +
> > > +extern const struct host1x_bo_ops gather_bo_ops;
> > > +void gather_bo_put(struct host1x_bo *host_bo);
> > > +
> > > +#endif
> > > diff --git a/drivers/gpu/drm/tegra/uapi/submit.c b/drivers/gpu/drm/tegra/uapi/submit.c
> > > new file mode 100644
> > > index 000000000000..398be3065e21
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/tegra/uapi/submit.c
> > > @@ -0,0 +1,428 @@
> > > +// SPDX-License-Identifier: GPL-2.0-only
> > > +/* Copyright (c) 2020 NVIDIA Corporation */
> > > +
> > > +#include <linux/dma-fence-array.h>
> > > +#include <linux/file.h>
> > > +#include <linux/host1x.h>
> > > +#include <linux/iommu.h>
> > > +#include <linux/kref.h>
> > > +#include <linux/list.h>
> > > +#include <linux/nospec.h>
> > > +#include <linux/pm_runtime.h>
> > > +#include <linux/sync_file.h>
> > > +
> > > +#include <drm/drm_drv.h>
> > > +#include <drm/drm_file.h>
> > > +
> > > +#include "../uapi.h"
> > > +#include "../drm.h"
> > > +#include "../gem.h"
> > > +
> > > +#include "gather_bo.h"
> > > +#include "submit.h"
> > > +
> > > +static struct tegra_drm_mapping *
> > > +tegra_drm_mapping_get(struct tegra_drm_channel_ctx *ctx, u32 id)
> > > +{
> > > +	struct tegra_drm_mapping *mapping;
> > > +
> > > +	xa_lock(&ctx->mappings);
> > > +	mapping = xa_load(&ctx->mappings, id);
> > > +	if (mapping)
> > > +		kref_get(&mapping->ref);
> > > +	xa_unlock(&ctx->mappings);
> > > +
> > > +	return mapping;
> > > +}
> > > +
> > > +static void *alloc_copy_user_array(void __user *from, size_t count, size_t size)
> > > +{
> > > +	unsigned long copy_err;
> > > +	size_t copy_len;
> > > +	void *data;
> > > +
> > > +	if (check_mul_overflow(count, size, &copy_len))
> > > +		return ERR_PTR(-EINVAL);
> > > +
> > > +	if (copy_len > 0x4000)
> > > +		return ERR_PTR(-E2BIG);
> > > +
> > > +	data = kvmalloc(copy_len, GFP_KERNEL);
> > > +	if (!data)
> > > +		return ERR_PTR(-ENOMEM);
> > > +
> > > +	copy_err = copy_from_user(data, from, copy_len);
> > > +	if (copy_err) {
> > > +		kvfree(data);
> > > +		return ERR_PTR(-EFAULT);
> > > +	}
> > > +
> > > +	return data;
> > > +}
> > > +
> > > +static int submit_copy_gather_data(struct drm_device *drm,
> > > +				   struct gather_bo **pbo,
> > > +				   struct drm_tegra_channel_submit *args)
> > > +{
> > > +	unsigned long copy_err;
> > > +	struct gather_bo *bo;
> > > +	size_t copy_len;
> > > +
> > > +	if (args->gather_data_words == 0) {
> > > +		drm_info(drm, "gather_data_words can't be 0");
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	if (check_mul_overflow((size_t)args->gather_data_words, (size_t)4, &copy_len))
> > > +		return -EINVAL;
> > > +
> > > +	bo = kzalloc(sizeof(*bo), GFP_KERNEL);
> > > +	if (!bo)
> > > +		return -ENOMEM;
> > > +
> > > +	kref_init(&bo->ref);
> > > +	host1x_bo_init(&bo->base, &gather_bo_ops);
> > > +
> > > +	bo->gather_data = kmalloc(copy_len, GFP_KERNEL | __GFP_NOWARN);
> > > +	if (!bo->gather_data) {
> > > +		kfree(bo);
> > > +		return -ENOMEM;
> > > +	}
> > > +
> > > +	copy_err = copy_from_user(bo->gather_data,
> > > +				  u64_to_user_ptr(args->gather_data_ptr),
> > > +				  copy_len);
> > > +	if (copy_err) {
> > > +		kfree(bo->gather_data);
> > > +		kfree(bo);
> > > +		return -EFAULT;
> > > +	}
> > > +
> > > +	bo->gather_data_words = args->gather_data_words;
> > > +
> > > +	*pbo = bo;
> > > +
> > > +	return 0;
> > > +}
> > > +
> > > +static int submit_write_reloc(struct gather_bo *bo,
> > > +			      struct drm_tegra_submit_buf *buf,
> > > +			      struct tegra_drm_mapping *mapping)
> > > +{
> > > +	/* TODO check that target_offset is within bounds */
> > > +	dma_addr_t iova = mapping->iova + buf->reloc.target_offset;
> > > +	u32 written_ptr = (u32)(iova >> buf->reloc.shift);
> > > +
> > > +#ifdef CONFIG_ARM64
> > > +	if (buf->flags & DRM_TEGRA_SUBMIT_BUF_RELOC_BLOCKLINEAR)
> > > +		written_ptr |= BIT(39);
> > > +#endif
> > 
> > Sorry, but this still isn't correct. written_ptr is still only 32-bits
> > wide, so your BIT(39) is going to get discarded even on 64-bit ARM. The
> > idiomatic way to do this is to make written_ptr dma_addr_t and use a
> > CONFIG_ARCH_DMA_ADDR_T_64BIT guard.
> >
> > But even with that this looks wrong because you're OR'ing this in after
> > shifting by buf->reloc.shift. Doesn't that OR it in at the wrong offset?
> > Should you perhaps be doing this instead:
> > 
> > 	#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
> > 		if (buf->flags & DRM_TEGRA_SUBMIT_BUF_RELOC_BLOCKLINEAR)
> > 			iova |= BIT(39);
> > 	#endif
> > 
> > 	written_ptr = (u32)(iova >> buf->reloc.shift);
> > 
> > ?
> 
> Yes, you are of course right.. will fix this. That might explain some of the
> VIC test failures I've seen.
> 
> > 
> > Also, on a side-note: BLOCKLINEAR really isn't the right term here. I
> > recently dealt with this for display (though I haven't sent out that
> > patch yet) and this is actually a bit that selects which sector layout
> > swizzling is being applied. That's independent of block linear format
> > and I think you can have different sector layouts irrespective of the
> > block linear format (though I don't think that's usually done).
> > 
> > That said, I wonder if a better interface here would be to reuse format
> > modifiers here. That would allow us to more fully describe the format of
> > a surface in case we ever need it, and it already includes the sector
> > layout information as well.
> 
> I think having just a flag that enables or disables the swizzling is better
> -- that way it is the responsibility of the userspace, which is where all
> the engine knowledge is as well, to know for each buffer whether it wants
> swizzling or not. Now, in practice at the moment the kernel can just lookup
> the format and set the bit based on that, but e.g. if there was an engine
> that could do the swizzling natively, and we had the format modifier here,
> we'd need to have the knowledge in the kernel to decide for each chip/engine
> whether to apply the bit.

Fine, let's try it this way. I'm just slightly worried that we'll end up
duplicating a lot of the same information that we already have in the
framebuffer modifiers. We made the same mistake a long time ago with
those odd flags in the CREATE IOCTL and that turned out not to be usable
at all, and also completely insufficient.

Thierry

Patch

diff --git a/drivers/gpu/drm/tegra/Makefile b/drivers/gpu/drm/tegra/Makefile
index 0abdb21b38b9..059322e88943 100644
--- a/drivers/gpu/drm/tegra/Makefile
+++ b/drivers/gpu/drm/tegra/Makefile
@@ -4,6 +4,8 @@ ccflags-$(CONFIG_DRM_TEGRA_DEBUG) += -DDEBUG
 tegra-drm-y := \
 	drm.o \
 	uapi/uapi.o \
+	uapi/submit.o \
+	uapi/gather_bo.o \
 	gem.o \
 	fb.o \
 	dp.o \
diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index 6a51035ce33f..60eab403ae9b 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -737,6 +737,8 @@ static const struct drm_ioctl_desc tegra_drm_ioctls[] = {
 			  DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(TEGRA_CHANNEL_UNMAP, tegra_drm_ioctl_channel_unmap,
 			  DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(TEGRA_CHANNEL_SUBMIT, tegra_drm_ioctl_channel_submit,
+			  DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(TEGRA_GEM_CREATE, tegra_drm_ioctl_gem_create,
 			  DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(TEGRA_GEM_MMAP, tegra_drm_ioctl_gem_mmap,
diff --git a/drivers/gpu/drm/tegra/uapi/gather_bo.c b/drivers/gpu/drm/tegra/uapi/gather_bo.c
new file mode 100644
index 000000000000..b487a0d44648
--- /dev/null
+++ b/drivers/gpu/drm/tegra/uapi/gather_bo.c
@@ -0,0 +1,86 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 NVIDIA Corporation */
+
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+
+#include "gather_bo.h"
+
+static struct host1x_bo *gather_bo_get(struct host1x_bo *host_bo)
+{
+	struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
+
+	kref_get(&bo->ref);
+
+	return host_bo;
+}
+
+static void gather_bo_release(struct kref *ref)
+{
+	struct gather_bo *bo = container_of(ref, struct gather_bo, ref);
+
+	kfree(bo->gather_data);
+	kfree(bo);
+}
+
+void gather_bo_put(struct host1x_bo *host_bo)
+{
+	struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
+
+	kref_put(&bo->ref, gather_bo_release);
+}
+
+static struct sg_table *
+gather_bo_pin(struct device *dev, struct host1x_bo *host_bo, dma_addr_t *phys)
+{
+	struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
+	struct sg_table *sgt;
+	int err;
+
+	if (phys) {
+		*phys = virt_to_phys(bo->gather_data);
+		return NULL;
+	}
+
+	sgt = kzalloc(sizeof(*sgt), GFP_KERNEL);
+	if (!sgt)
+		return ERR_PTR(-ENOMEM);
+
+	err = sg_alloc_table(sgt, 1, GFP_KERNEL);
+	if (err) {
+		kfree(sgt);
+		return ERR_PTR(err);
+	}
+
+	sg_init_one(sgt->sgl, bo->gather_data, bo->gather_data_words*4);
+
+	return sgt;
+}
+
+static void gather_bo_unpin(struct device *dev, struct sg_table *sgt)
+{
+	if (sgt) {
+		sg_free_table(sgt);
+		kfree(sgt);
+	}
+}
+
+static void *gather_bo_mmap(struct host1x_bo *host_bo)
+{
+	struct gather_bo *bo = container_of(host_bo, struct gather_bo, base);
+
+	return bo->gather_data;
+}
+
+static void gather_bo_munmap(struct host1x_bo *host_bo, void *addr)
+{
+}
+
+const struct host1x_bo_ops gather_bo_ops = {
+	.get = gather_bo_get,
+	.put = gather_bo_put,
+	.pin = gather_bo_pin,
+	.unpin = gather_bo_unpin,
+	.mmap = gather_bo_mmap,
+	.munmap = gather_bo_munmap,
+};
diff --git a/drivers/gpu/drm/tegra/uapi/gather_bo.h b/drivers/gpu/drm/tegra/uapi/gather_bo.h
new file mode 100644
index 000000000000..6b4c9d83ac91
--- /dev/null
+++ b/drivers/gpu/drm/tegra/uapi/gather_bo.h
@@ -0,0 +1,22 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2020 NVIDIA Corporation */
+
+#ifndef _TEGRA_DRM_SUBMIT_GATHER_BO_H
+#define _TEGRA_DRM_SUBMIT_GATHER_BO_H
+
+#include <linux/host1x.h>
+#include <linux/kref.h>
+
+struct gather_bo {
+	struct host1x_bo base;
+
+	struct kref ref;
+
+	u32 *gather_data;
+	size_t gather_data_words;
+};
+
+extern const struct host1x_bo_ops gather_bo_ops;
+void gather_bo_put(struct host1x_bo *host_bo);
+
+#endif
diff --git a/drivers/gpu/drm/tegra/uapi/submit.c b/drivers/gpu/drm/tegra/uapi/submit.c
new file mode 100644
index 000000000000..398be3065e21
--- /dev/null
+++ b/drivers/gpu/drm/tegra/uapi/submit.c
@@ -0,0 +1,428 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 NVIDIA Corporation */
+
+#include <linux/dma-fence-array.h>
+#include <linux/file.h>
+#include <linux/host1x.h>
+#include <linux/iommu.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/nospec.h>
+#include <linux/pm_runtime.h>
+#include <linux/sync_file.h>
+
+#include <drm/drm_drv.h>
+#include <drm/drm_file.h>
+
+#include "../uapi.h"
+#include "../drm.h"
+#include "../gem.h"
+
+#include "gather_bo.h"
+#include "submit.h"
+
+static struct tegra_drm_mapping *
+tegra_drm_mapping_get(struct tegra_drm_channel_ctx *ctx, u32 id)
+{
+	struct tegra_drm_mapping *mapping;
+
+	xa_lock(&ctx->mappings);
+	mapping = xa_load(&ctx->mappings, id);
+	if (mapping)
+		kref_get(&mapping->ref);
+	xa_unlock(&ctx->mappings);
+
+	return mapping;
+}
+
+static void *alloc_copy_user_array(void __user *from, size_t count, size_t size)
+{
+	unsigned long copy_err;
+	size_t copy_len;
+	void *data;
+
+	if (check_mul_overflow(count, size, &copy_len))
+		return ERR_PTR(-EINVAL);
+
+	if (copy_len > 0x4000)
+		return ERR_PTR(-E2BIG);
+
+	data = kvmalloc(copy_len, GFP_KERNEL);
+	if (!data)
+		return ERR_PTR(-ENOMEM);
+
+	copy_err = copy_from_user(data, from, copy_len);
+	if (copy_err) {
+		kvfree(data);
+		return ERR_PTR(-EFAULT);
+	}
+
+	return data;
+}
+
+static int submit_copy_gather_data(struct drm_device *drm,
+				   struct gather_bo **pbo,
+				   struct drm_tegra_channel_submit *args)
+{
+	unsigned long copy_err;
+	struct gather_bo *bo;
+	size_t copy_len;
+
+	if (args->gather_data_words == 0) {
+		drm_info(drm, "gather_data_words can't be 0");
+		return -EINVAL;
+	}
+
+	if (check_mul_overflow((size_t)args->gather_data_words, (size_t)4, &copy_len))
+		return -EINVAL;
+
+	bo = kzalloc(sizeof(*bo), GFP_KERNEL);
+	if (!bo)
+		return -ENOMEM;
+
+	kref_init(&bo->ref);
+	host1x_bo_init(&bo->base, &gather_bo_ops);
+
+	bo->gather_data = kmalloc(copy_len, GFP_KERNEL | __GFP_NOWARN);
+	if (!bo->gather_data) {
+		kfree(bo);
+		return -ENOMEM;
+	}
+
+	copy_err = copy_from_user(bo->gather_data,
+				  u64_to_user_ptr(args->gather_data_ptr),
+				  copy_len);
+	if (copy_err) {
+		kfree(bo->gather_data);
+		kfree(bo);
+		return -EFAULT;
+	}
+
+	bo->gather_data_words = args->gather_data_words;
+
+	*pbo = bo;
+
+	return 0;
+}
+
+static int submit_write_reloc(struct gather_bo *bo,
+			      struct drm_tegra_submit_buf *buf,
+			      struct tegra_drm_mapping *mapping)
+{
+	/* TODO check that target_offset is within bounds */
+	dma_addr_t iova = mapping->iova + buf->reloc.target_offset;
+	u32 written_ptr = (u32)(iova >> buf->reloc.shift);
+
+#ifdef CONFIG_ARM64
+	if (buf->flags & DRM_TEGRA_SUBMIT_BUF_RELOC_BLOCKLINEAR)
+		written_ptr |= BIT(39);
+#endif
+
+	if (buf->reloc.gather_offset_words >= bo->gather_data_words)
+		return -EINVAL;
+
+	buf->reloc.gather_offset_words = array_index_nospec(
+		buf->reloc.gather_offset_words, bo->gather_data_words);
+
+	bo->gather_data[buf->reloc.gather_offset_words] = written_ptr;
+
+	return 0;
+}
+
+static int submit_process_bufs(struct drm_device *drm, struct gather_bo *bo,
+			       struct tegra_drm_submit_data *job_data,
+			       struct tegra_drm_channel_ctx *ctx,
+			       struct drm_tegra_channel_submit *args)
+{
+	struct tegra_drm_used_mapping *mappings;
+	struct drm_tegra_submit_buf *bufs;
+	int err;
+	u32 i;
+
+	bufs = alloc_copy_user_array(u64_to_user_ptr(args->bufs_ptr),
+				     args->num_bufs, sizeof(*bufs));
+	if (IS_ERR(bufs))
+		return PTR_ERR(bufs);
+
+	mappings = kcalloc(args->num_bufs, sizeof(*mappings), GFP_KERNEL);
+	if (!mappings) {
+		err = -ENOMEM;
+		goto done;
+	}
+
+	for (i = 0; i < args->num_bufs; i++) {
+		struct drm_tegra_submit_buf *buf = &bufs[i];
+		struct tegra_drm_mapping *mapping;
+
+		if (buf->flags & ~DRM_TEGRA_SUBMIT_BUF_RELOC_BLOCKLINEAR) {
+			err = -EINVAL;
+			goto drop_refs;
+		}
+
+		mapping = tegra_drm_mapping_get(ctx, buf->mapping_id);
+		if (!mapping) {
+			drm_info(drm, "invalid mapping_id for buf: %u",
+				 buf->mapping_id);
+			err = -EINVAL;
+			goto drop_refs;
+		}
+
+		err = submit_write_reloc(bo, buf, mapping);
+		if (err) {
+			tegra_drm_mapping_put(mapping);
+			goto drop_refs;
+		}
+
+		mappings[i].mapping = mapping;
+		mappings[i].flags = buf->flags;
+	}
+
+	job_data->used_mappings = mappings;
+	job_data->num_used_mappings = i;
+
+	err = 0;
+
+	goto done;
+
+drop_refs:
+	for (;;) {
+		if (i-- == 0)
+			break;
+
+		tegra_drm_mapping_put(mappings[i].mapping);
+	}
+
+	kfree(mappings);
+	job_data->used_mappings = NULL;
+
+done:
+	kvfree(bufs);
+
+	return err;
+}
+
+static int submit_get_syncpt(struct drm_device *drm, struct host1x_job *job,
+			     struct drm_tegra_channel_submit *args)
+{
+	struct host1x_syncpt *sp;
+
+	if (args->syncpt_incr.flags)
+		return -EINVAL;
+
+	/* Syncpt ref will be dropped on job release */
+	sp = host1x_syncpt_fd_get(args->syncpt_incr.syncpt_fd);
+	if (IS_ERR(sp))
+		return PTR_ERR(sp);
+
+	job->syncpt = sp;
+	job->syncpt_incrs = args->syncpt_incr.num_incrs;
+
+	return 0;
+}
+
+static int submit_job_add_gather(struct host1x_job *job,
+				 struct tegra_drm_channel_ctx *ctx,
+				 struct drm_tegra_submit_cmd_gather_uptr *cmd,
+				 struct gather_bo *bo, u32 *offset,
+				 struct tegra_drm_submit_data *job_data)
+{
+	u32 next_offset;
+
+	if (cmd->reserved[0] || cmd->reserved[1] || cmd->reserved[2])
+		return -EINVAL;
+
+	/* Check for maximum gather size */
+	if (cmd->words > 16383)
+		return -EINVAL;
+
+	if (check_add_overflow(*offset, cmd->words, &next_offset))
+		return -EINVAL;
+
+	if (next_offset > bo->gather_data_words)
+		return -EINVAL;
+
+	host1x_job_add_gather(job, &bo->base, cmd->words, *offset * 4);
+
+	*offset = next_offset;
+
+	return 0;
+}
+
+static int submit_create_job(struct drm_device *drm, struct host1x_job **pjob,
+			     struct gather_bo *bo,
+			     struct tegra_drm_channel_ctx *ctx,
+			     struct drm_tegra_channel_submit *args,
+			     struct tegra_drm_submit_data *job_data)
+{
+	struct drm_tegra_submit_cmd *cmds;
+	u32 i, gather_offset = 0;
+	struct host1x_job *job;
+	int err;
+
+	cmds = alloc_copy_user_array(u64_to_user_ptr(args->cmds_ptr),
+				     args->num_cmds, sizeof(*cmds));
+	if (IS_ERR(cmds))
+		return PTR_ERR(cmds);
+
+	job = host1x_job_alloc(ctx->channel, args->num_cmds, 0);
+	if (!job) {
+		err = -ENOMEM;
+		goto done;
+	}
+
+	err = submit_get_syncpt(drm, job, args);
+	if (err < 0)
+		goto free_job;
+
+	job->client = &ctx->client->base;
+	job->class = ctx->client->base.class;
+	job->serialize = true;
+
+	for (i = 0; i < args->num_cmds; i++) {
+		struct drm_tegra_submit_cmd *cmd = &cmds[i];
+
+		if (cmd->type == DRM_TEGRA_SUBMIT_CMD_GATHER_UPTR) {
+			err = submit_job_add_gather(job, ctx, &cmd->gather_uptr,
+						    bo, &gather_offset,
+						    job_data);
+			if (err)
+				goto free_job;
+		} else if (cmd->type == DRM_TEGRA_SUBMIT_CMD_WAIT_SYNCPT) {
+			if (cmd->wait_syncpt.reserved[0] ||
+			    cmd->wait_syncpt.reserved[1]) {
+				err = -EINVAL;
+				goto free_job;
+			}
+
+			host1x_job_add_wait(job, cmd->wait_syncpt.id,
+					    cmd->wait_syncpt.threshold);
+		} else {
+			err = -EINVAL;
+			goto free_job;
+		}
+	}
+
+	if (gather_offset == 0) {
+		drm_info(drm, "Job must have at least one gather");
+		err = -EINVAL;
+		goto free_job;
+	}
+
+	*pjob = job;
+
+	err = 0;
+	goto done;
+
+free_job:
+	host1x_job_put(job);
+
+done:
+	kvfree(cmds);
+
+	return err;
+}
+
+static void release_job(struct host1x_job *job)
+{
+	struct tegra_drm_client *client =
+		container_of(job->client, struct tegra_drm_client, base);
+	struct tegra_drm_submit_data *job_data = job->user_data;
+	u32 i;
+
+	for (i = 0; i < job_data->num_used_mappings; i++)
+		tegra_drm_mapping_put(job_data->used_mappings[i].mapping);
+
+	kfree(job_data->used_mappings);
+	kfree(job_data);
+
+	pm_runtime_put_autosuspend(client->base.dev);
+}
+
+int tegra_drm_ioctl_channel_submit(struct drm_device *drm, void *data,
+				   struct drm_file *file)
+{
+	struct tegra_drm_file *fpriv = file->driver_priv;
+	struct drm_tegra_channel_submit *args = data;
+	struct tegra_drm_submit_data *job_data;
+	struct tegra_drm_channel_ctx *ctx;
+	struct host1x_job *job;
+	struct gather_bo *bo;
+	u32 i;
+	int err;
+
+	ctx = tegra_drm_channel_ctx_lock(fpriv, args->channel_ctx);
+	if (!ctx)
+		return -EINVAL;
+
+	/* Allocate gather BO and copy gather words in. */
+	err = submit_copy_gather_data(drm, &bo, args);
+	if (err)
+		goto unlock;
+
+	job_data = kzalloc(sizeof(*job_data), GFP_KERNEL);
+	if (!job_data) {
+		err = -ENOMEM;
+		goto put_bo;
+	}
+
+	/* Get data buffer mappings and do relocation patching. */
+	err = submit_process_bufs(drm, bo, job_data, ctx, args);
+	if (err)
+		goto free_job_data;
+
+	/* Allocate host1x_job and add gathers and waits to it. */
+	err = submit_create_job(drm, &job, bo, ctx, args,
+				job_data);
+	if (err)
+		goto free_job_data;
+
+	/* Map gather data for Host1x. */
+	err = host1x_job_pin(job, ctx->client->base.dev);
+	if (err)
+		goto put_job;
+
+	/* Boot engine. */
+	err = pm_runtime_get_sync(ctx->client->base.dev);
+	if (err < 0)
+		goto put_pm_runtime;
+
+	job->user_data = job_data;
+	job->release = release_job;
+	job->timeout = 10000;
+
+	/*
+	 * job_data is now part of job reference counting, so don't release
+	 * it from here.
+	 */
+	job_data = NULL;
+
+	/* Submit job to hardware. */
+	err = host1x_job_submit(job);
+	if (err)
+		goto put_job;
+
+	/* Return postfences to userspace and add fences to DMA reservations. */
+	args->syncpt_incr.fence_value = job->syncpt_end;
+
+	goto put_job;
+
+put_pm_runtime:
+	if (!job->release)
+		pm_runtime_put(ctx->client->base.dev);
+	host1x_job_unpin(job);
+put_job:
+	host1x_job_put(job);
+free_job_data:
+	if (job_data && job_data->used_mappings) {
+		for (i = 0; i < job_data->num_used_mappings; i++)
+			tegra_drm_mapping_put(job_data->used_mappings[i].mapping);
+		kfree(job_data->used_mappings);
+	}
+	if (job_data)
+		kfree(job_data);
+put_bo:
+	gather_bo_put(&bo->base);
+unlock:
+	mutex_unlock(&fpriv->lock);
+	return err;
+}
diff --git a/drivers/gpu/drm/tegra/uapi/submit.h b/drivers/gpu/drm/tegra/uapi/submit.h
new file mode 100644
index 000000000000..0a165e9e4bda
--- /dev/null
+++ b/drivers/gpu/drm/tegra/uapi/submit.h
@@ -0,0 +1,17 @@ 
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2020 NVIDIA Corporation */
+
+#ifndef _TEGRA_DRM_UAPI_SUBMIT_H
+#define _TEGRA_DRM_UAPI_SUBMIT_H
+
+struct tegra_drm_used_mapping {
+	struct tegra_drm_mapping *mapping;
+	u32 flags;
+};
+
+struct tegra_drm_submit_data {
+	struct tegra_drm_used_mapping *used_mappings;
+	u32 num_used_mappings;
+};
+
+#endif