diff mbox series

[drm-misc-next] drm/nouveau: uapi: don't pass NO_PREFETCH flag implicitly

Message ID 20230822234139.11185-1-dakr@redhat.com (mailing list archive)
State New, archived
Headers show
Series [drm-misc-next] drm/nouveau: uapi: don't pass NO_PREFETCH flag implicitly | expand

Commit Message

Danilo Krummrich Aug. 22, 2023, 11:41 p.m. UTC
Currently, NO_PREFETCH is passed implicitly through
drm_nouveau_gem_pushbuf_push::length and drm_nouveau_exec_push::va_len.

Since this is a direct representation of how the HW is programmed, it
isn't really future-proof for a uAPI. Hence, fix this up for the new
uAPI and split up the va_len field of struct drm_nouveau_exec_push,
such that we keep 32 bits for va_len and 32 bits for flags.

For drm_nouveau_gem_pushbuf_push::length at least provide
NOUVEAU_GEM_PUSHBUF_NO_PREFETCH to indicate the bit shift.

While at it, fix up nv50_dma_push() as well, such that the caller
doesn't need to encode the NO_PREFETCH flag into the length parameter.

Signed-off-by: Danilo Krummrich <dakr@redhat.com>
---
 drivers/gpu/drm/nouveau/nouveau_dma.c  |  7 +++++--
 drivers/gpu/drm/nouveau/nouveau_dma.h  |  8 ++++++--
 drivers/gpu/drm/nouveau/nouveau_exec.c | 15 ++++++++++++---
 drivers/gpu/drm/nouveau/nouveau_gem.c  |  6 ++++--
 include/uapi/drm/nouveau_drm.h         |  8 +++++++-
 5 files changed, 34 insertions(+), 10 deletions(-)


base-commit: ad1367f831f8743746a1f49705c28e36a7c95525

Comments

Faith Ekstrand Aug. 23, 2023, 2:53 a.m. UTC | #1
On Tue, Aug 22, 2023 at 6:41 PM Danilo Krummrich <dakr@redhat.com> wrote:

> Currently, NO_PREFETCH is passed implicitly through
> drm_nouveau_gem_pushbuf_push::length and drm_nouveau_exec_push::va_len.
>
> Since this is a direct representation of how the HW is programmed it
> isn't really future proof for a uAPI. Hence, fix this up for the new
> uAPI and split up the va_len field of struct drm_nouveau_exec_push,
> such that we keep 32bit for va_len and 32bit for flags.
>
> For drm_nouveau_gem_pushbuf_push::length at least provide
> NOUVEAU_GEM_PUSHBUF_NO_PREFETCH to indicate the bit shift.
>
> While at it, fix up nv50_dma_push() as well, such that the caller
> doesn't need to encode the NO_PREFETCH flag into the length parameter.
>
> Signed-off-by: Danilo Krummrich <dakr@redhat.com>
> ---
>  drivers/gpu/drm/nouveau/nouveau_dma.c  |  7 +++++--
>  drivers/gpu/drm/nouveau/nouveau_dma.h  |  8 ++++++--
>  drivers/gpu/drm/nouveau/nouveau_exec.c | 15 ++++++++++++---
>  drivers/gpu/drm/nouveau/nouveau_gem.c  |  6 ++++--
>  include/uapi/drm/nouveau_drm.h         |  8 +++++++-
>  5 files changed, 34 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.c
> b/drivers/gpu/drm/nouveau/nouveau_dma.c
> index b90cac6d5772..059925e5db6a 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dma.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_dma.c
> @@ -69,16 +69,19 @@ READ_GET(struct nouveau_channel *chan, uint64_t
> *prev_get, int *timeout)
>  }
>
>  void
> -nv50_dma_push(struct nouveau_channel *chan, u64 offset, int length)
> +nv50_dma_push(struct nouveau_channel *chan, u64 offset, u32 length,
> +             bool prefetch)
>  {
>         struct nvif_user *user = &chan->drm->client.device.user;
>         struct nouveau_bo *pb = chan->push.buffer;
>         int ip = (chan->dma.ib_put * 2) + chan->dma.ib_base;
>
>         BUG_ON(chan->dma.ib_free < 1);
> +       WARN_ON(length > NV50_DMA_PUSH_MAX_LENGTH);
>
>         nouveau_bo_wr32(pb, ip++, lower_32_bits(offset));
> -       nouveau_bo_wr32(pb, ip++, upper_32_bits(offset) | length << 8);
> +       nouveau_bo_wr32(pb, ip++, upper_32_bits(offset) | length << 8 |
> +                       (prefetch ? 0 : (1 << 31)));
>

It feels a bit weird to be inverting this bit twice. IDK that it matters,
though.


>
>         chan->dma.ib_put = (chan->dma.ib_put + 1) & chan->dma.ib_max;
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.h
> b/drivers/gpu/drm/nouveau/nouveau_dma.h
> index 035a709c7be1..fb471c357336 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dma.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_dma.h
> @@ -31,7 +31,8 @@
>  #include "nouveau_chan.h"
>
>  int nouveau_dma_wait(struct nouveau_channel *, int slots, int size);
> -void nv50_dma_push(struct nouveau_channel *, u64 addr, int length);
> +void nv50_dma_push(struct nouveau_channel *, u64 addr, u32 length,
> +                  bool prefetch);
>
>  /*
>   * There's a hw race condition where you can't jump to your PUT offset,
> @@ -45,6 +46,9 @@ void nv50_dma_push(struct nouveau_channel *, u64 addr,
> int length);
>   */
>  #define NOUVEAU_DMA_SKIPS (128 / 4)
>
> +/* Maximum push buffer size. */
> +#define NV50_DMA_PUSH_MAX_LENGTH 0x7fffff
> +
>  /* Object handles - for stuff that's doesn't use handle == oclass. */
>  enum {
>         NvDmaFB         = 0x80000002,
> @@ -89,7 +93,7 @@ FIRE_RING(struct nouveau_channel *chan)
>
>         if (chan->dma.ib_max) {
>                 nv50_dma_push(chan, chan->push.addr + (chan->dma.put << 2),
> -                             (chan->dma.cur - chan->dma.put) << 2);
> +                             (chan->dma.cur - chan->dma.put) << 2, true);
>         } else {
>                 WRITE_PUT(chan->dma.cur);
>         }
> diff --git a/drivers/gpu/drm/nouveau/nouveau_exec.c
> b/drivers/gpu/drm/nouveau/nouveau_exec.c
> index 0f927adda4ed..a123b07b2adf 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_exec.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_exec.c
> @@ -164,8 +164,10 @@ nouveau_exec_job_run(struct nouveau_job *job)
>         }
>
>         for (i = 0; i < exec_job->push.count; i++) {
> -               nv50_dma_push(chan, exec_job->push.s[i].va,
> -                             exec_job->push.s[i].va_len);
> +               struct drm_nouveau_exec_push *p = &exec_job->push.s[i];
> +               bool prefetch = !(p->flags &
> DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH);
> +
> +               nv50_dma_push(chan, p->va, p->va_len, prefetch);
>         }
>
>         ret = nouveau_fence_emit(fence, chan);
> @@ -223,7 +225,14 @@ nouveau_exec_job_init(struct nouveau_exec_job **pjob,
>  {
>         struct nouveau_exec_job *job;
>         struct nouveau_job_args args = {};
> -       int ret;
> +       int i, ret;
> +
> +       for (i = 0; i < __args->push.count; i++) {
> +               struct drm_nouveau_exec_push *p = &__args->push.s[i];
> +
> +               if (p->va_len > NV50_DMA_PUSH_MAX_LENGTH)
> +                       return -EINVAL;
>

This can probably be wrapped in unlikely().  Also, it'd be nice if we
printed an error message like we do if you try to push too many things.

Looks good. Thanks!

Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com>


> +       }
>
>         job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
>         if (!job)
> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c
> b/drivers/gpu/drm/nouveau/nouveau_gem.c
> index f39360870c70..2f3dc4d71657 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
> @@ -856,9 +856,11 @@ nouveau_gem_ioctl_pushbuf(struct drm_device *dev,
> void *data,
>                 for (i = 0; i < req->nr_push; i++) {
>                         struct nouveau_vma *vma = (void *)(unsigned long)
>                                 bo[push[i].bo_index].user_priv;
> +                       u64 addr = vma->addr + push[i].offset;
> +                       u32 length = push[i].length &
> ~NOUVEAU_GEM_PUSHBUF_NO_PREFETCH;
> +                       bool prefetch = !(push[i].length &
> NOUVEAU_GEM_PUSHBUF_NO_PREFETCH);
>
> -                       nv50_dma_push(chan, vma->addr + push[i].offset,
> -                                     push[i].length);
> +                       nv50_dma_push(chan, addr, length, prefetch);
>                 }
>         } else
>         if (drm->client.device.info.chipset >= 0x25) {
> diff --git a/include/uapi/drm/nouveau_drm.h
> b/include/uapi/drm/nouveau_drm.h
> index b1ad9d5ffce8..8f16724b5d05 100644
> --- a/include/uapi/drm/nouveau_drm.h
> +++ b/include/uapi/drm/nouveau_drm.h
> @@ -138,6 +138,7 @@ struct drm_nouveau_gem_pushbuf_push {
>         __u32 pad;
>         __u64 offset;
>         __u64 length;
> +#define NOUVEAU_GEM_PUSHBUF_NO_PREFETCH (1 << 23)
>  };
>
>  struct drm_nouveau_gem_pushbuf {
> @@ -338,7 +339,12 @@ struct drm_nouveau_exec_push {
>         /**
>          * @va_len: the length of the push buffer mapping
>          */
> -       __u64 va_len;
> +       __u32 va_len;
> +       /**
> +        * flags: the flags for this push buffer mapping
> +        */
> +       __u32 flags;
> +#define DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH 0x1
>  };
>
>  /**
>
> base-commit: ad1367f831f8743746a1f49705c28e36a7c95525
> --
> 2.41.0
>
>
Danilo Krummrich Aug. 23, 2023, 12:36 p.m. UTC | #2
On 8/23/23 04:53, Faith Ekstrand wrote:
> On Tue, Aug 22, 2023 at 6:41 PM Danilo Krummrich <dakr@redhat.com <mailto:dakr@redhat.com>> wrote:
> 
>     Currently, NO_PREFETCH is passed implicitly through
>     drm_nouveau_gem_pushbuf_push::length and drm_nouveau_exec_push::va_len.
> 
>     Since this is a direct representation of how the HW is programmed it
>     isn't really future proof for a uAPI. Hence, fix this up for the new
>     uAPI and split up the va_len field of struct drm_nouveau_exec_push,
>     such that we keep 32bit for va_len and 32bit for flags.
> 
>     For drm_nouveau_gem_pushbuf_push::length at least provide
>     NOUVEAU_GEM_PUSHBUF_NO_PREFETCH to indicate the bit shift.
> 
>     While at it, fix up nv50_dma_push() as well, such that the caller
>     doesn't need to encode the NO_PREFETCH flag into the length parameter.
> 
>     Signed-off-by: Danilo Krummrich <dakr@redhat.com <mailto:dakr@redhat.com>>
>     ---
>       drivers/gpu/drm/nouveau/nouveau_dma.c  |  7 +++++--
>       drivers/gpu/drm/nouveau/nouveau_dma.h  |  8 ++++++--
>       drivers/gpu/drm/nouveau/nouveau_exec.c | 15 ++++++++++++---
>       drivers/gpu/drm/nouveau/nouveau_gem.c  |  6 ++++--
>       include/uapi/drm/nouveau_drm.h         |  8 +++++++-
>       5 files changed, 34 insertions(+), 10 deletions(-)
> 
>     diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.c b/drivers/gpu/drm/nouveau/nouveau_dma.c
>     index b90cac6d5772..059925e5db6a 100644
>     --- a/drivers/gpu/drm/nouveau/nouveau_dma.c
>     +++ b/drivers/gpu/drm/nouveau/nouveau_dma.c
>     @@ -69,16 +69,19 @@ READ_GET(struct nouveau_channel *chan, uint64_t *prev_get, int *timeout)
>       }
> 
>       void
>     -nv50_dma_push(struct nouveau_channel *chan, u64 offset, int length)
>     +nv50_dma_push(struct nouveau_channel *chan, u64 offset, u32 length,
>     +             bool prefetch)
>       {
>              struct nvif_user *user = &chan->drm->client.device.user;
>              struct nouveau_bo *pb = chan->push.buffer;
>              int ip = (chan->dma.ib_put * 2) + chan->dma.ib_base;
> 
>              BUG_ON(chan->dma.ib_free < 1);
>     +       WARN_ON(length > NV50_DMA_PUSH_MAX_LENGTH);
> 
>              nouveau_bo_wr32(pb, ip++, lower_32_bits(offset));
>     -       nouveau_bo_wr32(pb, ip++, upper_32_bits(offset) | length << 8);
>     +       nouveau_bo_wr32(pb, ip++, upper_32_bits(offset) | length << 8 |
>     +                       (prefetch ? 0 : (1 << 31)));
> 
> 
> It feels a bit weird to be inverting this bit twice. IDK that it matters, though.

I usually avoid negated argument names; in this case it kinda makes sense, though.

> 
> 
>              chan->dma.ib_put = (chan->dma.ib_put + 1) & chan->dma.ib_max;
> 
>     diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.h b/drivers/gpu/drm/nouveau/nouveau_dma.h
>     index 035a709c7be1..fb471c357336 100644
>     --- a/drivers/gpu/drm/nouveau/nouveau_dma.h
>     +++ b/drivers/gpu/drm/nouveau/nouveau_dma.h
>     @@ -31,7 +31,8 @@
>       #include "nouveau_chan.h"
> 
>       int nouveau_dma_wait(struct nouveau_channel *, int slots, int size);
>     -void nv50_dma_push(struct nouveau_channel *, u64 addr, int length);
>     +void nv50_dma_push(struct nouveau_channel *, u64 addr, u32 length,
>     +                  bool prefetch);
> 
>       /*
>        * There's a hw race condition where you can't jump to your PUT offset,
>     @@ -45,6 +46,9 @@ void nv50_dma_push(struct nouveau_channel *, u64 addr, int length);
>        */
>       #define NOUVEAU_DMA_SKIPS (128 / 4)
> 
>     +/* Maximum push buffer size. */
>     +#define NV50_DMA_PUSH_MAX_LENGTH 0x7fffff
>     +
>       /* Object handles - for stuff that's doesn't use handle == oclass. */
>       enum {
>              NvDmaFB         = 0x80000002,
>     @@ -89,7 +93,7 @@ FIRE_RING(struct nouveau_channel *chan)
> 
>              if (chan->dma.ib_max) {
>                      nv50_dma_push(chan, chan->push.addr + (chan->dma.put << 2),
>     -                             (chan->dma.cur - chan->dma.put) << 2);
>     +                             (chan->dma.cur - chan->dma.put) << 2, true);
>              } else {
>                      WRITE_PUT(chan->dma.cur);
>              }
>     diff --git a/drivers/gpu/drm/nouveau/nouveau_exec.c b/drivers/gpu/drm/nouveau/nouveau_exec.c
>     index 0f927adda4ed..a123b07b2adf 100644
>     --- a/drivers/gpu/drm/nouveau/nouveau_exec.c
>     +++ b/drivers/gpu/drm/nouveau/nouveau_exec.c
>     @@ -164,8 +164,10 @@ nouveau_exec_job_run(struct nouveau_job *job)
>              }
> 
>              for (i = 0; i < exec_job->push.count; i++) {
>     -               nv50_dma_push(chan, exec_job->push.s[i].va,
>     -                             exec_job->push.s[i].va_len);
>     +               struct drm_nouveau_exec_push *p = &exec_job->push.s[i];
>     +               bool prefetch = !(p->flags & DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH);
>     +
>     +               nv50_dma_push(chan, p->va, p->va_len, prefetch);
>              }
> 
>              ret = nouveau_fence_emit(fence, chan);
>     @@ -223,7 +225,14 @@ nouveau_exec_job_init(struct nouveau_exec_job **pjob,
>       {
>              struct nouveau_exec_job *job;
>              struct nouveau_job_args args = {};
>     -       int ret;
>     +       int i, ret;
>     +
>     +       for (i = 0; i < __args->push.count; i++) {
>     +               struct drm_nouveau_exec_push *p = &__args->push.s[i];
>     +
>     +               if (p->va_len > NV50_DMA_PUSH_MAX_LENGTH)
>     +                       return -EINVAL;
> 
> 
> This can probably be wrapped in unlikely().  Also, it'd be nice if we printed an error message like we do if you try to push too many things.

Yep, will do.

> 
> Looks good. Thanks!
> 
> Reviewed-by: Faith Ekstrand <faith.ekstrand@collabora.com <mailto:faith.ekstrand@collabora.com>>
> 
>     +       }
> 
>              job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
>              if (!job)
>     diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
>     index f39360870c70..2f3dc4d71657 100644
>     --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
>     +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
>     @@ -856,9 +856,11 @@ nouveau_gem_ioctl_pushbuf(struct drm_device *dev, void *data,
>                      for (i = 0; i < req->nr_push; i++) {
>                              struct nouveau_vma *vma = (void *)(unsigned long)
>                                      bo[push[i].bo_index].user_priv;
>     +                       u64 addr = vma->addr + push[i].offset;
>     +                       u32 length = push[i].length & ~NOUVEAU_GEM_PUSHBUF_NO_PREFETCH;
>     +                       bool prefetch = !(push[i].length & NOUVEAU_GEM_PUSHBUF_NO_PREFETCH);
> 
>     -                       nv50_dma_push(chan, vma->addr + push[i].offset,
>     -                                     push[i].length);
>     +                       nv50_dma_push(chan, addr, length, prefetch);
>                      }
>              } else
>              if (drm->client.device.info <http://client.device.info>.chipset >= 0x25) {
>     diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h
>     index b1ad9d5ffce8..8f16724b5d05 100644
>     --- a/include/uapi/drm/nouveau_drm.h
>     +++ b/include/uapi/drm/nouveau_drm.h
>     @@ -138,6 +138,7 @@ struct drm_nouveau_gem_pushbuf_push {
>              __u32 pad;
>              __u64 offset;
>              __u64 length;
>     +#define NOUVEAU_GEM_PUSHBUF_NO_PREFETCH (1 << 23)
>       };
> 
>       struct drm_nouveau_gem_pushbuf {
>     @@ -338,7 +339,12 @@ struct drm_nouveau_exec_push {
>              /**
>               * @va_len: the length of the push buffer mapping
>               */
>     -       __u64 va_len;
>     +       __u32 va_len;
>     +       /**
>     +        * flags: the flags for this push buffer mapping
>     +        */
>     +       __u32 flags;
>     +#define DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH 0x1
>       };
> 
>       /**
> 
>     base-commit: ad1367f831f8743746a1f49705c28e36a7c95525
>     -- 
>     2.41.0
>
kernel test robot Aug. 23, 2023, 12:46 p.m. UTC | #3
Hi Danilo,

kernel test robot noticed the following build warnings:

[auto build test WARNING on ad1367f831f8743746a1f49705c28e36a7c95525]

url:    https://github.com/intel-lab-lkp/linux/commits/Danilo-Krummrich/drm-nouveau-uapi-don-t-pass-NO_PREFETCH-flag-implicitly/20230823-074237
base:   ad1367f831f8743746a1f49705c28e36a7c95525
patch link:    https://lore.kernel.org/r/20230822234139.11185-1-dakr%40redhat.com
patch subject: [PATCH drm-misc-next] drm/nouveau: uapi: don't pass NO_PREFETCH flag implicitly
reproduce: (https://download.01.org/0day-ci/archive/20230823/202308232030.0r1irPMu-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202308232030.0r1irPMu-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> ./include/uapi/drm/nouveau_drm.h:344: warning: Incorrect use of kernel-doc format:          * flags: the flags for this push buffer mapping
>> ./include/uapi/drm/nouveau_drm.h:348: warning: Function parameter or member 'flags' not described in 'drm_nouveau_exec_push'

vim +344 ./include/uapi/drm/nouveau_drm.h

   327	
   328	/**
   329	 * struct drm_nouveau_exec_push - EXEC push operation
   330	 *
   331	 * This structure represents a single EXEC push operation. UMDs should pass an
   332	 * array of this structure via struct drm_nouveau_exec's &push_ptr field.
   333	 */
   334	struct drm_nouveau_exec_push {
   335		/**
   336		 * @va: the virtual address of the push buffer mapping
   337		 */
   338		__u64 va;
   339		/**
   340		 * @va_len: the length of the push buffer mapping
   341		 */
   342		__u32 va_len;
   343		/**
 > 344		 * flags: the flags for this push buffer mapping
   345		 */
   346		__u32 flags;
   347	#define DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH 0x1
 > 348	};
   349
diff mbox series

Patch

diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.c b/drivers/gpu/drm/nouveau/nouveau_dma.c
index b90cac6d5772..059925e5db6a 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dma.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dma.c
@@ -69,16 +69,19 @@  READ_GET(struct nouveau_channel *chan, uint64_t *prev_get, int *timeout)
 }
 
 void
-nv50_dma_push(struct nouveau_channel *chan, u64 offset, int length)
+nv50_dma_push(struct nouveau_channel *chan, u64 offset, u32 length,
+	      bool prefetch)
 {
 	struct nvif_user *user = &chan->drm->client.device.user;
 	struct nouveau_bo *pb = chan->push.buffer;
 	int ip = (chan->dma.ib_put * 2) + chan->dma.ib_base;
 
 	BUG_ON(chan->dma.ib_free < 1);
+	WARN_ON(length > NV50_DMA_PUSH_MAX_LENGTH);
 
 	nouveau_bo_wr32(pb, ip++, lower_32_bits(offset));
-	nouveau_bo_wr32(pb, ip++, upper_32_bits(offset) | length << 8);
+	nouveau_bo_wr32(pb, ip++, upper_32_bits(offset) | length << 8 |
+			(prefetch ? 0 : (1 << 31)));
 
 	chan->dma.ib_put = (chan->dma.ib_put + 1) & chan->dma.ib_max;
 
diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.h b/drivers/gpu/drm/nouveau/nouveau_dma.h
index 035a709c7be1..fb471c357336 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dma.h
+++ b/drivers/gpu/drm/nouveau/nouveau_dma.h
@@ -31,7 +31,8 @@ 
 #include "nouveau_chan.h"
 
 int nouveau_dma_wait(struct nouveau_channel *, int slots, int size);
-void nv50_dma_push(struct nouveau_channel *, u64 addr, int length);
+void nv50_dma_push(struct nouveau_channel *, u64 addr, u32 length,
+		   bool prefetch);
 
 /*
  * There's a hw race condition where you can't jump to your PUT offset,
@@ -45,6 +46,9 @@  void nv50_dma_push(struct nouveau_channel *, u64 addr, int length);
  */
 #define NOUVEAU_DMA_SKIPS (128 / 4)
 
+/* Maximum push buffer size. */
+#define NV50_DMA_PUSH_MAX_LENGTH 0x7fffff
+
 /* Object handles - for stuff that's doesn't use handle == oclass. */
 enum {
 	NvDmaFB		= 0x80000002,
@@ -89,7 +93,7 @@  FIRE_RING(struct nouveau_channel *chan)
 
 	if (chan->dma.ib_max) {
 		nv50_dma_push(chan, chan->push.addr + (chan->dma.put << 2),
-			      (chan->dma.cur - chan->dma.put) << 2);
+			      (chan->dma.cur - chan->dma.put) << 2, true);
 	} else {
 		WRITE_PUT(chan->dma.cur);
 	}
diff --git a/drivers/gpu/drm/nouveau/nouveau_exec.c b/drivers/gpu/drm/nouveau/nouveau_exec.c
index 0f927adda4ed..a123b07b2adf 100644
--- a/drivers/gpu/drm/nouveau/nouveau_exec.c
+++ b/drivers/gpu/drm/nouveau/nouveau_exec.c
@@ -164,8 +164,10 @@  nouveau_exec_job_run(struct nouveau_job *job)
 	}
 
 	for (i = 0; i < exec_job->push.count; i++) {
-		nv50_dma_push(chan, exec_job->push.s[i].va,
-			      exec_job->push.s[i].va_len);
+		struct drm_nouveau_exec_push *p = &exec_job->push.s[i];
+		bool prefetch = !(p->flags & DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH);
+
+		nv50_dma_push(chan, p->va, p->va_len, prefetch);
 	}
 
 	ret = nouveau_fence_emit(fence, chan);
@@ -223,7 +225,14 @@  nouveau_exec_job_init(struct nouveau_exec_job **pjob,
 {
 	struct nouveau_exec_job *job;
 	struct nouveau_job_args args = {};
-	int ret;
+	int i, ret;
+
+	for (i = 0; i < __args->push.count; i++) {
+		struct drm_nouveau_exec_push *p = &__args->push.s[i];
+
+		if (p->va_len > NV50_DMA_PUSH_MAX_LENGTH)
+			return -EINVAL;
+	}
 
 	job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
 	if (!job)
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index f39360870c70..2f3dc4d71657 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -856,9 +856,11 @@  nouveau_gem_ioctl_pushbuf(struct drm_device *dev, void *data,
 		for (i = 0; i < req->nr_push; i++) {
 			struct nouveau_vma *vma = (void *)(unsigned long)
 				bo[push[i].bo_index].user_priv;
+			u64 addr = vma->addr + push[i].offset;
+			u32 length = push[i].length & ~NOUVEAU_GEM_PUSHBUF_NO_PREFETCH;
+			bool prefetch = !(push[i].length & NOUVEAU_GEM_PUSHBUF_NO_PREFETCH);
 
-			nv50_dma_push(chan, vma->addr + push[i].offset,
-				      push[i].length);
+			nv50_dma_push(chan, addr, length, prefetch);
 		}
 	} else
 	if (drm->client.device.info.chipset >= 0x25) {
diff --git a/include/uapi/drm/nouveau_drm.h b/include/uapi/drm/nouveau_drm.h
index b1ad9d5ffce8..8f16724b5d05 100644
--- a/include/uapi/drm/nouveau_drm.h
+++ b/include/uapi/drm/nouveau_drm.h
@@ -138,6 +138,7 @@  struct drm_nouveau_gem_pushbuf_push {
 	__u32 pad;
 	__u64 offset;
 	__u64 length;
+#define NOUVEAU_GEM_PUSHBUF_NO_PREFETCH (1 << 23)
 };
 
 struct drm_nouveau_gem_pushbuf {
@@ -338,7 +339,12 @@  struct drm_nouveau_exec_push {
 	/**
 	 * @va_len: the length of the push buffer mapping
 	 */
-	__u64 va_len;
+	__u32 va_len;
+	/**
+	 * @flags: the flags for this push buffer mapping
+	 */
+	__u32 flags;
+#define DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH 0x1
 };
 
 /**