drm/vc4: Add the DRM_IOCTL_VC4_GEM_MADVISE ioctl

Message ID 20170927141054.32728-1-boris.brezillon@free-electrons.com (mailing list archive)
State New, archived

Commit Message

Boris Brezillon Sept. 27, 2017, 2:10 p.m. UTC
This ioctl will allow us to purge inactive userspace buffers when the
system is running out of contiguous memory.

For now, the purge logic is rather dumb in that it does not try to
release only the number of BOs needed to satisfy the failing CMA
allocation, but instead purges all objects in the purgeable pool as
soon as we experience a CMA allocation failure.

Note that the in-kernel BO cache is always purged before the purgeable
pool, because those objects are known to be unused, while objects marked
as purgeable by a userspace application/library might have to be
restored when they are marked as unpurgeable again, which can be
expensive.

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
---
Hello,

Updates to libdrm, mesa and igt making use of this kernel feature can
be found on my github repos [1][2][3].

There's currently no debugfs hook to manually force a purge, but this
is being discussed and will probably be added in v2.

Regards,

Boris

[1] https://github.com/bbrezillon/drm/tree/vc4/purgeable
[2] https://github.com/bbrezillon/mesa/tree/vc4/purgeable
[3] https://github.com/bbrezillon/igt/tree/vc4/purgeable
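
For context, here is a minimal sketch of how a userspace client might
drive the new ioctl. It assumes the definitions from vc4_drm.h in this
patch and libdrm's drmIoctl() wrapper; the two helpers are illustrative
only, not the actual libdrm/mesa API:

#include <stdint.h>
#include <xf86drm.h>	/* drmIoctl() */
#include "vc4_drm.h"	/* uapi header added by this patch */

/* Mark an idle BO purgeable: the kernel may reclaim its backing memory
 * when it runs out of CMA space. Returns 0 on success.
 */
static int vc4_bo_mark_purgeable(int fd, uint32_t handle)
{
	struct drm_vc4_gem_madvise args = {
		.handle = handle,
		.madv = VC4_MADV_DONTNEED,
	};

	return drmIoctl(fd, DRM_IOCTL_VC4_GEM_MADVISE, &args);
}

/* Take a BO back before re-using it. Returns 1 if the old content
 * survived, 0 if it was purged (the caller must re-upload its data),
 * -1 on error.
 */
static int vc4_bo_mark_unpurgeable(int fd, uint32_t handle)
{
	struct drm_vc4_gem_madvise args = {
		.handle = handle,
		.madv = VC4_MADV_WILLNEED,
	};

	if (drmIoctl(fd, DRM_IOCTL_VC4_GEM_MADVISE, &args))
		return -1;

	return args.retained ? 1 : 0;
}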
---
 drivers/gpu/drm/vc4/vc4_bo.c        | 188 +++++++++++++++++++++++++++++--
 drivers/gpu/drm/vc4/vc4_drv.c       |   9 +-
 drivers/gpu/drm/vc4/vc4_drv.h       |  26 +++++
 drivers/gpu/drm/vc4/vc4_gem.c       | 213 +++++++++++++++++++++++++++++++++++-
 drivers/gpu/drm/vc4/vc4_plane.c     |  20 ++++
 drivers/gpu/drm/vc4/vc4_render_cl.c |   3 +
 include/uapi/drm/vc4_drm.h          |  12 ++
 7 files changed, 455 insertions(+), 16 deletions(-)

Comments

Eric Anholt Sept. 27, 2017, 6:49 p.m. UTC | #1
Boris Brezillon <boris.brezillon@free-electrons.com> writes:

> [...]
> 
> diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
> index afae87004963..c01b93d453db 100644
> --- a/include/uapi/drm/vc4_drm.h
> +++ b/include/uapi/drm/vc4_drm.h
> @@ -41,6 +41,7 @@ extern "C" {
>  #define DRM_VC4_SET_TILING                        0x08
>  #define DRM_VC4_GET_TILING                        0x09
>  #define DRM_VC4_LABEL_BO                          0x0a
> +#define DRM_VC4_GEM_MADVISE                       0x0b
>  
>  #define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
>  #define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
> @@ -53,6 +54,7 @@ extern "C" {
>  #define DRM_IOCTL_VC4_SET_TILING          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SET_TILING, struct drm_vc4_set_tiling)
>  #define DRM_IOCTL_VC4_GET_TILING          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_TILING, struct drm_vc4_get_tiling)
>  #define DRM_IOCTL_VC4_LABEL_BO            DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_LABEL_BO, struct drm_vc4_label_bo)
> +#define DRM_IOCTL_VC4_GEM_MADVISE         DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GEM_MADVISE, struct drm_vc4_gem_madvise)
>  
>  struct drm_vc4_submit_rcl_surface {
>  	__u32 hindex; /* Handle index, or ~0 if not present. */
> @@ -333,6 +335,16 @@ struct drm_vc4_label_bo {
>  	__u64 name;
>  };
>  
> +#define VC4_MADV_WILLNEED			0
> +#define VC4_MADV_DONTNEED			1
> +#define __VC4_MADV_PURGED			2
> +
> +struct drm_vc4_gem_madvise {
> +	__u32 handle;
> +	__u32 madv;
> +	__u32 retained;
> +};

danvet had a note in
http://blog.ffwll.ch/2013/11/botching-up-ioctls.html:

    Pad the entire struct to a multiple of 64bits - the structure size
    will otherwise differ on 32bit versus 64bit. Which hurts when
    passing arrays of structures to the kernel. Or with the ioctl
    structure size checking that e.g. the drm core does.

I'm surprised that i915's ioctl didn't do this or have compat code to
handle it.
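
For reference, the defensively padded layout that advice calls for would
look like this (a sketch only; the struct in the posted patch does not
include it):

struct drm_vc4_gem_madvise {
	__u32 handle;
	__u32 madv;
	__u32 retained;
	__u32 pad;	/* explicit padding: sizeof() is 16 bytes, a
			 * multiple of 64 bits, on both 32- and 64-bit */
};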
Daniel Vetter Sept. 28, 2017, 7:17 a.m. UTC | #2
On Wed, Sep 27, 2017 at 11:49:21AM -0700, Eric Anholt wrote:
> Boris Brezillon <boris.brezillon@free-electrons.com> writes:
> 
> [...]
> 
> > +#define VC4_MADV_WILLNEED			0
> > +#define VC4_MADV_DONTNEED			1
> > +#define __VC4_MADV_PURGED			2
> > +
> > +struct drm_vc4_gem_madvise {
> > +	__u32 handle;
> > +	__u32 madv;
> > +	__u32 retained;
> > +};
> 
> danvet had a note in
> http://blog.ffwll.ch/2013/11/botching-up-ioctls.html:
> 
>     Pad the entire struct to a multiple of 64bits - the structure size
>     will otherwise differ on 32bit versus 64bit. Which hurts when
>     passing arrays of structures to the kernel. Or with the ioctl
>     structure size checking that e.g. the drm core does.
> 
> I'm surprised that i915's ioctl didn't do this or have compat code to
> handle it.

This advice is defensive, just in case you ever make an array of any of
your uabi structs and there's a 64-bit value in there somewhere. It only
really matters for that case. But since GPUs have a few of those ioctls
(especially command submission) I figured better safe than sorry.
-Daniel
Eric Anholt Sept. 28, 2017, 5:24 p.m. UTC | #3
Daniel Vetter <daniel@ffwll.ch> writes:

> On Wed, Sep 27, 2017 at 11:49:21AM -0700, Eric Anholt wrote:
>> Boris Brezillon <boris.brezillon@free-electrons.com> writes:
>> 
>> [...]
>> 
>> > +#define VC4_MADV_WILLNEED			0
>> > +#define VC4_MADV_DONTNEED			1
>> > +#define __VC4_MADV_PURGED			2
>> > +
>> > +struct drm_vc4_gem_madvise {
>> > +	__u32 handle;
>> > +	__u32 madv;
>> > +	__u32 retained;
>> > +};
>> 
>> danvet had a note in
>> http://blog.ffwll.ch/2013/11/botching-up-ioctls.html:
>> 
>>     Pad the entire struct to a multiple of 64bits - the structure size
>>     will otherwise differ on 32bit versus 64bit. Which hurts when
>>     passing arrays of structures to the kernel. Or with the ioctl
>>     structure size checking that e.g. the drm core does.
>> 
>> I'm surprised that i915's ioctl didn't do this or have compat code to
>> handle it.
>
> This advice is defensive, just in case you ever make an array of any of
> your uabi structs and there's a 64-bit value in there somewhere. It only
> really matters for that case. But since GPUs have a few of those ioctls
> (especially command submission) I figured better safe than sorry.

It talked about there being some core ioctl size checking -- does that
not exist any more?  I've had other people comment on size alignment of
my (non-array) ioctl structs based on your post, so some clarification
would be nice.
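
(For context: the DRM core's drm_ioctl() handles size mismatches
tolerantly. Roughly — this is a simplified sketch of the pattern in
drm_ioctl(), not verbatim code — it copies in whatever size userspace
encoded in the ioctl number and zero-extends it up to the size the
driver registered:

	/* in_size: size encoded in the userspace ioctl number.
	 * drv_size: size of the struct the driver registered.
	 */
	ksize = max(in_size, drv_size);
	if (copy_from_user(kdata, (void __user *)arg, in_size))
		return -EFAULT;
	if (ksize > in_size)
		/* Userspace passed a smaller struct: zero the tail so
		 * the driver sees zeroed members instead of garbage.
		 */
		memset(kdata + in_size, 0, ksize - in_size);

So a bare size mismatch is handled; the padding advice mostly matters
for arrays of structs, as noted above.)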
Patch

diff --git a/drivers/gpu/drm/vc4/vc4_bo.c b/drivers/gpu/drm/vc4/vc4_bo.c
index 3afdbf4bc10b..f53aa508cb6f 100644
--- a/drivers/gpu/drm/vc4/vc4_bo.c
+++ b/drivers/gpu/drm/vc4/vc4_bo.c
@@ -233,7 +233,7 @@  static struct list_head *vc4_get_cache_list_for_size(struct drm_device *dev,
 	return &vc4->bo_cache.size_list[page_index];
 }
 
-static void vc4_bo_cache_purge(struct drm_device *dev)
+void vc4_bo_cache_purge(struct drm_device *dev)
 {
 	struct vc4_dev *vc4 = to_vc4_dev(dev);
 
@@ -293,6 +293,8 @@  struct drm_gem_object *vc4_create_object(struct drm_device *dev, size_t size)
 	if (!bo)
 		return ERR_PTR(-ENOMEM);
 
+	bo->madv = VC4_MADV_WILLNEED;
+	mutex_init(&bo->madv_lock);
 	mutex_lock(&vc4->bo_lock);
 	bo->label = VC4_BO_TYPE_KERNEL;
 	vc4->bo_labels[VC4_BO_TYPE_KERNEL].num_allocated++;
@@ -330,13 +332,29 @@  struct vc4_bo *vc4_bo_create(struct drm_device *dev, size_t unaligned_size,
 		 * CMA allocations we've got laying around and try again.
 		 */
 		vc4_bo_cache_purge(dev);
+		cma_obj = drm_gem_cma_create(dev, size);
+	}
 
+	if (IS_ERR(cma_obj)) {
+		/*
+		 * Still not enough CMA memory, purge the userspace BO
+		 * cache and retry.
+		 * This is sub-optimal since we purge the whole userspace
+		 * BO cache, which forces users that want to re-use the BO to
+		 * restore its initial content.
+		 * Ideally, we should purge entries one by one and retry
+		 * after each to see if CMA allocation succeeds. Or even
+		 * better, try to find an entry with at least the same
+		 * size.
+		 */
+		vc4_userspace_bo_cache_purge(dev);
 		cma_obj = drm_gem_cma_create(dev, size);
-		if (IS_ERR(cma_obj)) {
-			DRM_ERROR("Failed to allocate from CMA:\n");
-			vc4_bo_stats_dump(vc4);
-			return ERR_PTR(-ENOMEM);
-		}
+	}
+
+	if (IS_ERR(cma_obj)) {
+		DRM_ERROR("Failed to allocate from CMA:\n");
+		vc4_bo_stats_dump(vc4);
+		return ERR_PTR(-ENOMEM);
 	}
 	bo = to_vc4_bo(&cma_obj->base);
 
@@ -403,6 +421,15 @@  void vc4_free_object(struct drm_gem_object *gem_bo)
 	struct vc4_bo *bo = to_vc4_bo(gem_bo);
 	struct list_head *cache_list;
 
+	/* Remove the BO from the purgeable list. */
+	mutex_lock(&bo->madv_lock);
+	if (bo->madv == VC4_MADV_DONTNEED && !refcount_read(&bo->usecnt)) {
+		mutex_lock(&vc4->purgeable.lock);
+		list_del(&bo->size_head);
+		mutex_unlock(&vc4->purgeable.lock);
+	}
+	mutex_unlock(&bo->madv_lock);
+
 	mutex_lock(&vc4->bo_lock);
 	/* If the object references someone else's memory, we can't cache it.
 	 */
@@ -418,7 +445,8 @@  void vc4_free_object(struct drm_gem_object *gem_bo)
 	}
 
 	/* If this object was partially constructed but CMA allocation
-	 * had failed, just free it.
+	 * had failed, just free it. Can also happen when the BO has been
+	 * purged.
 	 */
 	if (!bo->base.vaddr) {
 		vc4_bo_destroy(bo);
@@ -437,6 +465,9 @@  void vc4_free_object(struct drm_gem_object *gem_bo)
 		bo->validated_shader = NULL;
 	}
 
+	/* Reset the madv status before adding the BO to the cache. */
+	bo->madv = VC4_MADV_WILLNEED;
+
 	bo->t_format = false;
 	bo->free_time = jiffies;
 	list_add(&bo->size_head, cache_list);
@@ -461,6 +492,64 @@  static void vc4_bo_cache_time_work(struct work_struct *work)
 	mutex_unlock(&vc4->bo_lock);
 }
 
+int vc4_bo_inc_usecnt(struct vc4_bo *bo)
+{
+	int ret;
+
+	/* Fast path: if the BO is already retained by someone, no need to
+	 * check the madv status.
+	 */
+	if (refcount_inc_not_zero(&bo->usecnt))
+		return 0;
+
+	mutex_lock(&bo->madv_lock);
+	switch (bo->madv) {
+	case VC4_MADV_WILLNEED:
+		refcount_inc(&bo->usecnt);
+		ret = 0;
+		break;
+	case VC4_MADV_DONTNEED:
+		/* We shouldn't use a BO marked as purgeable if at least
+		 * someone else retained its content by incrementing usecnt.
+		 * Luckily the BO hasn't been purged yet, but something wrong
+		 * is happening here. Just throw an error instead of
+		 * authorizing this use case.
+		 */
+	case __VC4_MADV_PURGED:
+		/* We can't use a purged BO. */
+	default:
+		/* Invalid madv value. */
+		ret = -EINVAL;
+		break;
+	}
+	mutex_unlock(&bo->madv_lock);
+
+	return ret;
+}
+
+void vc4_bo_dec_usecnt(struct vc4_bo *bo)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(bo->base.base.dev);
+
+	if (WARN_ON(!refcount_read(&bo->usecnt)))
+		return;
+
+	/* Fast path: if the BO is still retained by someone, no need to test
+	 * the madv value.
+	 */
+	if (!refcount_dec_not_one(&bo->usecnt))
+		return;
+
+	mutex_lock(&bo->madv_lock);
+	if (refcount_dec_and_test(&bo->usecnt) &&
+	    bo->madv == VC4_MADV_DONTNEED) {
+		mutex_lock(&vc4->purgeable.lock);
+		list_add_tail(&bo->size_head, &vc4->purgeable.list);
+		mutex_unlock(&vc4->purgeable.lock);
+	}
+	mutex_unlock(&bo->madv_lock);
+}
+
 static void vc4_bo_cache_time_timer(unsigned long data)
 {
 	struct drm_device *dev = (struct drm_device *)data;
@@ -480,18 +569,77 @@  struct dma_buf *
 vc4_prime_export(struct drm_device *dev, struct drm_gem_object *obj, int flags)
 {
 	struct vc4_bo *bo = to_vc4_bo(obj);
+	struct dma_buf *dmabuf;
+	int ret;
 
 	if (bo->validated_shader) {
 		DRM_DEBUG("Attempting to export shader BO\n");
 		return ERR_PTR(-EINVAL);
 	}
 
-	return drm_gem_prime_export(dev, obj, flags);
+	/* Note: as soon as the BO is exported it becomes unpurgeable because
+	 * no one ever decrements the usecnt even if the reference held by the
+	 * exported BO is released. This shouldn't be a problem since we don't
+	 * expect exported BOs to be marked as purgeable.
+	 */
+	ret = vc4_bo_inc_usecnt(bo);
+	if (ret) {
+		DRM_ERROR("Failed to increment BO usecnt\n");
+		return ERR_PTR(ret);
+	}
+
+	dmabuf = drm_gem_prime_export(dev, obj, flags);
+	if (IS_ERR(dmabuf))
+		vc4_bo_dec_usecnt(bo);
+
+	return dmabuf;
+}
+
+int vc4_fault(struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	struct drm_gem_object *obj = vma->vm_private_data;
+	unsigned long vm_pgoff;
+	struct vc4_bo *bo = to_vc4_bo(obj);
+	int ret;
+
+	mutex_lock(&bo->madv_lock);
+	if (WARN_ON(bo->madv != VC4_MADV_WILLNEED)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	vm_pgoff = vma->vm_pgoff;
+	vma->vm_pgoff = 0;
+	ret = dma_mmap_wc(obj->dev->dev, vma, bo->base.vaddr,
+			  bo->base.paddr, vma->vm_end - vma->vm_start);
+	vma->vm_pgoff = vm_pgoff;
+
+out_unlock:
+	mutex_unlock(&bo->madv_lock);
+
+	switch (ret) {
+	case -EAGAIN:
+	case 0:
+	case -ERESTARTSYS:
+	case -EINTR:
+	case -EBUSY:
+		/*
+		 * EBUSY is ok: this just means that another thread
+		 * already did the job.
+		 */
+		return VM_FAULT_NOPAGE;
+	case -ENOMEM:
+		return VM_FAULT_OOM;
+	default:
+		return VM_FAULT_SIGBUS;
+	}
 }
 
 int vc4_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	struct drm_gem_object *gem_obj;
+	unsigned long vm_pgoff;
 	struct vc4_bo *bo;
 	int ret;
 
@@ -504,6 +652,14 @@  int vc4_mmap(struct file *filp, struct vm_area_struct *vma)
 
 	if (bo->validated_shader && (vma->vm_flags & VM_WRITE)) {
 		DRM_DEBUG("mmaping of shader BOs for writing not allowed.\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (bo->madv != VC4_MADV_WILLNEED) {
+		DRM_DEBUG("mmaping of %s BO not allowed\n",
+			  bo->madv == VC4_MADV_DONTNEED ?
+			  "purgeable" : "purged");
 		return -EINVAL;
 	}
 
@@ -513,10 +669,24 @@  int vc4_mmap(struct file *filp, struct vm_area_struct *vma)
 	 * the whole buffer.
 	 */
 	vma->vm_flags &= ~VM_PFNMAP;
-	vma->vm_pgoff = 0;
 
+	/* This ->vm_pgoff dance is needed to make all parties happy:
+	 * - dma_mmap_wc() uses ->vm_pgoff as an offset within the allocated
+	 *   mem-region, hence the need to set it to zero (the value set by
+	 *   the DRM core is a virtual offset encoding the GEM object-id)
+	 * - the mmap() core logic needs ->vm_pgoff to be restored to its
+	 *   initial value before returning from this function as it encodes the
+	 *   offset of this GEM in the dev->anon_inode pseudo-file and this
+	 *   information will be used when we invalidate userspace mappings
+	 *   with drm_vma_node_unmap() (called from vc4_gem_purge()).
+	 */
+	vm_pgoff = vma->vm_pgoff;
+	vma->vm_pgoff = 0;
 	ret = dma_mmap_wc(bo->base.base.dev->dev, vma, bo->base.vaddr,
 			  bo->base.paddr, vma->vm_end - vma->vm_start);
+	vma->vm_pgoff = vm_pgoff;
+
+out:
 	if (ret)
 		drm_gem_vm_close(vma);
 
diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
index 1c96edcb302b..7a1f5223996f 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -117,6 +117,12 @@  static void vc4_lastclose(struct drm_device *dev)
 	drm_fbdev_cma_restore_mode(vc4->fbdev);
 }
 
+static const struct vm_operations_struct vm_ops = {
+	.fault = vc4_fault,
+	.open = drm_gem_vm_open,
+	.close = drm_gem_vm_close,
+};
+
 static const struct file_operations vc4_drm_fops = {
 	.owner = THIS_MODULE,
 	.open = drm_open,
@@ -142,6 +148,7 @@  static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(VC4_SET_TILING, vc4_set_tiling_ioctl, DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(VC4_GET_TILING, vc4_get_tiling_ioctl, DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(VC4_LABEL_BO, vc4_label_bo_ioctl, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(VC4_GEM_MADVISE, vc4_gem_madvise_ioctl, DRM_RENDER_ALLOW),
 };
 
 static struct drm_driver vc4_drm_driver = {
@@ -166,7 +173,7 @@  static struct drm_driver vc4_drm_driver = {
 
 	.gem_create_object = vc4_create_object,
 	.gem_free_object_unlocked = vc4_free_object,
-	.gem_vm_ops = &drm_gem_cma_vm_ops,
+	.gem_vm_ops = &vm_ops,
 
 	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
 	.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
index 87f2d8e5c134..5b9e66e3639e 100644
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -71,6 +71,14 @@  struct vc4_dev {
 		u32 size_allocated;
 	} *bo_labels;
 
+	/* Purgeable BO pool. All BOs in this pool can have their memory
+	 * reclaimed if the driver is unable to allocate new BOs.
+	 */
+	struct {
+		struct list_head list;
+		struct mutex lock;
+	} purgeable;
+
 	/* Protects bo_cache and bo_labels. */
 	struct mutex bo_lock;
 
@@ -192,6 +200,17 @@  struct vc4_bo {
 	 * for user-allocated labels.
 	 */
 	int label;
+
+	/* Count the number of users with this BO mapped in their address
+	 * space. This is needed to determine whether we can tag the BO as
+	 * purgeable or not (when the BO is used by the GPU or the display
+	 * engine we can't free it).
+	 */
+	refcount_t usecnt;
+
+	/* Store purgeable/purged state here */
+	u32 madv;
+	struct mutex madv_lock;
 };
 
 static inline struct vc4_bo *
@@ -503,6 +522,7 @@  int vc4_get_hang_state_ioctl(struct drm_device *dev, void *data,
 			     struct drm_file *file_priv);
 int vc4_label_bo_ioctl(struct drm_device *dev, void *data,
 		       struct drm_file *file_priv);
+int vc4_fault(struct vm_fault *vmf);
 int vc4_mmap(struct file *filp, struct vm_area_struct *vma);
 struct reservation_object *vc4_prime_res_obj(struct drm_gem_object *obj);
 int vc4_prime_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma);
@@ -513,6 +533,9 @@  void *vc4_prime_vmap(struct drm_gem_object *obj);
 int vc4_bo_cache_init(struct drm_device *dev);
 void vc4_bo_cache_destroy(struct drm_device *dev);
 int vc4_bo_stats_debugfs(struct seq_file *m, void *arg);
+int vc4_bo_inc_usecnt(struct vc4_bo *bo);
+void vc4_bo_dec_usecnt(struct vc4_bo *bo);
+void vc4_bo_cache_purge(struct drm_device *dev);
 
 /* vc4_crtc.c */
 extern struct platform_driver vc4_crtc_driver;
@@ -557,6 +580,9 @@  void vc4_job_handle_completed(struct vc4_dev *vc4);
 int vc4_queue_seqno_cb(struct drm_device *dev,
 		       struct vc4_seqno_cb *cb, uint64_t seqno,
 		       void (*func)(struct vc4_seqno_cb *cb));
+int vc4_gem_madvise_ioctl(struct drm_device *dev, void *data,
+			  struct drm_file *file_priv);
+void vc4_userspace_bo_cache_purge(struct drm_device *dev);
 
 /* vc4_hdmi.c */
 extern struct platform_driver vc4_hdmi_driver;
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
index d0c6bfb68c4e..ba447ab95c83 100644
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -54,8 +54,12 @@  vc4_free_hang_state(struct drm_device *dev, struct vc4_hang_state *state)
 {
 	unsigned int i;
 
-	for (i = 0; i < state->user_state.bo_count; i++)
+	for (i = 0; i < state->user_state.bo_count; i++) {
+		struct vc4_bo *bo = to_vc4_bo(state->bo[i]);
+
+		vc4_bo_dec_usecnt(bo);
 		drm_gem_object_put_unlocked(state->bo[i]);
+	}
 
 	kfree(state);
 }
@@ -188,11 +192,14 @@  vc4_save_hang_state(struct drm_device *dev)
 			continue;
 
 		for (j = 0; j < exec[i]->bo_count; j++) {
+			bo = to_vc4_bo(&exec[i]->bo[j]->base);
+			WARN_ON(vc4_bo_inc_usecnt(bo));
 			drm_gem_object_get(&exec[i]->bo[j]->base);
 			kernel_state->bo[j + prev_idx] = &exec[i]->bo[j]->base;
 		}
 
 		list_for_each_entry(bo, &exec[i]->unref_list, unref_head) {
+			WARN_ON(vc4_bo_inc_usecnt(bo));
 			drm_gem_object_get(&bo->base.base);
 			kernel_state->bo[j + prev_idx] = &bo->base.base;
 			j++;
@@ -689,18 +696,42 @@  vc4_cl_lookup_bos(struct drm_device *dev,
 	for (i = 0; i < exec->bo_count; i++) {
 		struct drm_gem_object *bo = idr_find(&file_priv->object_idr,
 						     handles[i]);
+		struct vc4_bo *vc4bo;
 		if (!bo) {
 			DRM_DEBUG("Failed to look up GEM BO %d: %d\n",
 				  i, handles[i]);
 			ret = -EINVAL;
-			spin_unlock(&file_priv->table_lock);
-			goto fail;
+			break;
 		}
+
+		vc4bo = to_vc4_bo(bo);
 		drm_gem_object_get(bo);
 		exec->bo[i] = (struct drm_gem_cma_object *)bo;
 	}
 	spin_unlock(&file_priv->table_lock);
 
+	if (ret)
+		goto fail_put_bo;
+
+	for (i = 0; i < exec->bo_count; i++) {
+		ret = vc4_bo_inc_usecnt(to_vc4_bo(&exec->bo[i]->base));
+		if (ret)
+			goto fail_dec_usecnt;
+	}
+
+	kvfree(handles);
+	return ret;
+
+fail_dec_usecnt:
+	/* Decrease usecnt on acquired objects. */
+	for (i-- ; i >= 0; i--)
+		vc4_bo_dec_usecnt(to_vc4_bo(&exec->bo[i]->base));
+
+fail_put_bo:
+	/* Release any reference to acquired objects. */
+	for (i = 0; i < exec->bo_count && exec->bo[i]; i++)
+		drm_gem_object_put(&exec->bo[i]->base);
+
 fail:
 	kvfree(handles);
 	return ret;
@@ -782,6 +813,11 @@  vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec)
 	}
 	exec->exec_bo = &bo->base;
 
+	ret = vc4_bo_inc_usecnt(bo);
+	if (WARN_ON(ret)) {
+		DRM_ERROR("Couldn't increment BO usecnt\n");
+		goto fail;
+	}
 	list_add_tail(&to_vc4_bo(&exec->exec_bo->base)->unref_head,
 		      &exec->unref_list);
 
@@ -802,17 +838,25 @@  vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec)
 				  bin,
 				  exec);
 	if (ret)
-		goto fail;
+		goto fail_dec_usecnt;
 
 	ret = vc4_validate_shader_recs(dev, exec);
 	if (ret)
-		goto fail;
+		goto fail_dec_usecnt;
 
 	/* Block waiting on any previous rendering into the CS's VBO,
 	 * IB, or textures, so that pixels are actually written by the
 	 * time we try to read them.
 	 */
 	ret = vc4_wait_for_seqno(dev, exec->bin_dep_seqno, ~0ull, true);
+	if (ret)
+		goto fail_dec_usecnt;
+
+	kvfree(temp);
+	return 0;
+
+fail_dec_usecnt:
+	vc4_bo_dec_usecnt(bo);
 
 fail:
 	kvfree(temp);
@@ -833,8 +877,12 @@  vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
 		dma_fence_signal(exec->fence);
 
 	if (exec->bo) {
-		for (i = 0; i < exec->bo_count; i++)
+		for (i = 0; i < exec->bo_count; i++) {
+			struct vc4_bo *bo = to_vc4_bo(&exec->bo[i]->base);
+
+			vc4_bo_dec_usecnt(bo);
 			drm_gem_object_put_unlocked(&exec->bo[i]->base);
+		}
 		kvfree(exec->bo);
 	}
 
@@ -842,6 +890,7 @@  vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
 		struct vc4_bo *bo = list_first_entry(&exec->unref_list,
 						     struct vc4_bo, unref_head);
 		list_del(&bo->unref_head);
+		vc4_bo_dec_usecnt(bo);
 		drm_gem_object_put_unlocked(&bo->base.base);
 	}
 
@@ -1098,6 +1147,9 @@  vc4_gem_init(struct drm_device *dev)
 	INIT_WORK(&vc4->job_done_work, vc4_job_done_work);
 
 	mutex_init(&vc4->power_lock);
+
+	INIT_LIST_HEAD(&vc4->purgeable.list);
+	mutex_init(&vc4->purgeable.lock);
 }
 
 void
@@ -1114,6 +1166,7 @@  vc4_gem_destroy(struct drm_device *dev)
 	 * the overflow allocation registers.  Now free the object.
 	 */
 	if (vc4->bin_bo) {
+		vc4_bo_dec_usecnt(vc4->bin_bo);
 		drm_gem_object_put_unlocked(&vc4->bin_bo->base.base);
 		vc4->bin_bo = NULL;
 	}
@@ -1121,3 +1174,151 @@  vc4_gem_destroy(struct drm_device *dev)
 	if (vc4->hang_state)
 		vc4_free_hang_state(dev, vc4->hang_state);
 }
+
+static void vc4_gem_purge(struct drm_gem_object *obj)
+{
+	struct vc4_bo *bo = to_vc4_bo(obj);
+	struct drm_device *dev = obj->dev;
+
+	WARN_ON(!mutex_is_locked(&bo->madv_lock));
+	WARN_ON(bo->madv != VC4_MADV_DONTNEED);
+
+	drm_vma_node_unmap(&obj->vma_node, dev->anon_inode->i_mapping);
+	/* Reset the BO content to not leak sensitive data. */
+	memset(bo->base.vaddr, 0, obj->size);
+
+	dma_free_wc(dev->dev, obj->size, bo->base.vaddr, bo->base.paddr);
+	bo->base.vaddr = NULL;
+	bo->madv = __VC4_MADV_PURGED;
+}
+
+void vc4_userspace_bo_cache_purge(struct drm_device *dev)
+{
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+
+	mutex_lock(&vc4->purgeable.lock);
+	while (!list_empty(&vc4->purgeable.list)) {
+		struct vc4_bo *bo = list_first_entry(&vc4->purgeable.list,
+						     struct vc4_bo, size_head);
+		struct drm_gem_object *obj = &bo->base.base;
+
+		/* list_del_init() is used here because we are about to release
+		 * the purgeable lock in order to acquire the madv one.
+		 * During this short period of time a user might decide to mark
+		 * the BO as unpurgeable, and if bo->madv is set to
+		 * VC4_MADV_DONTNEED it will try to remove the BO from the
+		 * purgeable list which will fail if the ->next/prev fields
+		 * are set to LIST_POISON1/LIST_POISON2 (which is what
+		 * list_del() does).
+		 * Re-initializing the list element guarantees that list_del()
+		 * will work correctly even if it's a NOP.
+		 */
+		list_del_init(&bo->size_head);
+
+		/* Release the purgeable lock while we're purging the BO so
+		 * that other people can continue inserting things in the
+		 * purgeable pool without having to wait for all BOs to be
+		 * purged.
+		 */
+		mutex_unlock(&vc4->purgeable.lock);
+		mutex_lock(&bo->madv_lock);
+
+		/* Since we released the purgeable pool lock before acquiring
+		 * the BO madv one, vc4_gem_madvise_ioctl() may have changed
+		 * the bo->madv status. In this case, just skip this entry.
+		 */
+		if (bo->madv == VC4_MADV_DONTNEED)
+			vc4_gem_purge(obj);
+		mutex_unlock(&bo->madv_lock);
+		mutex_lock(&vc4->purgeable.lock);
+	}
+	mutex_unlock(&vc4->purgeable.lock);
+}
+
+int vc4_gem_madvise_ioctl(struct drm_device *dev, void *data,
+			  struct drm_file *file_priv)
+{
+	struct drm_vc4_gem_madvise *args = data;
+	struct vc4_dev *vc4 = to_vc4_dev(dev);
+	struct drm_gem_object *gem_obj;
+	struct vc4_bo *bo;
+	int ret;
+
+	switch (args->madv) {
+	case VC4_MADV_DONTNEED:
+	case VC4_MADV_WILLNEED:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	gem_obj = drm_gem_object_lookup(file_priv, args->handle);
+	if (!gem_obj) {
+		DRM_ERROR("Failed to look up GEM BO %d\n", args->handle);
+		return -EINVAL;
+	}
+
+	bo = to_vc4_bo(gem_obj);
+
+	/* Not sure it's safe to purge imported BOs. Let's just assume it's
+	 * not until proven otherwise.
+	 */
+	if (args->madv == VC4_MADV_DONTNEED && gem_obj->import_attach) {
+		ret = -EINVAL;
+		goto out_put_gem;
+	}
+
+	mutex_lock(&bo->madv_lock);
+
+	if (args->madv == VC4_MADV_DONTNEED && bo->madv == VC4_MADV_WILLNEED &&
+	    !refcount_read(&bo->usecnt)) {
+		/* If the BO is about to be marked as purgeable, is not used
+		 * and is not already purgeable or purged, add it to the
+		 * purgeable list.
+		 */
+		mutex_lock(&vc4->purgeable.lock);
+		list_add_tail(&bo->size_head, &vc4->purgeable.list);
+		mutex_unlock(&vc4->purgeable.lock);
+	} else if (args->madv == VC4_MADV_WILLNEED &&
+		   bo->madv == VC4_MADV_DONTNEED &&
+		   !refcount_read(&bo->usecnt)) {
+		/* The BO has not been purged yet, just remove it from
+		 * the purgeable list.
+		 */
+		mutex_lock(&vc4->purgeable.lock);
+		list_del(&bo->size_head);
+		mutex_unlock(&vc4->purgeable.lock);
+	} else if (bo->madv == __VC4_MADV_PURGED) {
+		bo->base.vaddr = dma_alloc_wc(dev->dev, gem_obj->size,
+					      &bo->base.paddr,
+					      GFP_KERNEL | __GFP_NOWARN);
+		if (!bo->base.vaddr) {
+			/* CMA allocation failed, try to purge the in-kernel BO
+			 * cache.
+			 */
+			vc4_bo_cache_purge(dev);
+			bo->base.vaddr = dma_alloc_wc(dev->dev, gem_obj->size,
+						      &bo->base.paddr,
+						      GFP_KERNEL |
+						      __GFP_NOWARN);
+		}
+
+		if (!bo->base.vaddr) {
+			ret = -ENOMEM;
+			goto out_unlock;
+		}
+	}
+
+	/* Save the purged state. */
+	args->retained = bo->madv != __VC4_MADV_PURGED;
+	bo->madv = args->madv;
+	ret = 0;
+
+out_unlock:
+	mutex_unlock(&bo->madv_lock);
+
+out_put_gem:
+	drm_gem_object_put_unlocked(gem_obj);
+
+	return ret;
+}
diff --git a/drivers/gpu/drm/vc4/vc4_plane.c b/drivers/gpu/drm/vc4/vc4_plane.c
index 2968b3ebb895..83e789dd7b98 100644
--- a/drivers/gpu/drm/vc4/vc4_plane.c
+++ b/drivers/gpu/drm/vc4/vc4_plane.c
@@ -23,6 +23,7 @@ 
 #include <drm/drm_fb_cma_helper.h>
 #include <drm/drm_plane_helper.h>
 
+#include "uapi/drm/vc4_drm.h"
 #include "vc4_drv.h"
 #include "vc4_regs.h"
 
@@ -764,21 +765,40 @@  static int vc4_prepare_fb(struct drm_plane *plane,
 {
 	struct vc4_bo *bo;
 	struct dma_fence *fence;
+	int ret;
 
 	if ((plane->state->fb == state->fb) || !state->fb)
 		return 0;
 
 	bo = to_vc4_bo(&drm_fb_cma_get_gem_obj(state->fb, 0)->base);
+
+	ret = vc4_bo_inc_usecnt(bo);
+	if (ret)
+		return ret;
+
 	fence = reservation_object_get_excl_rcu(bo->resv);
 	drm_atomic_set_fence_for_plane(state, fence);
 
 	return 0;
 }
 
+static void vc4_cleanup_fb(struct drm_plane *plane,
+			   struct drm_plane_state *state)
+{
+	struct vc4_bo *bo;
+
+	if (plane->state->fb == state->fb || !state->fb)
+		return;
+
+	bo = to_vc4_bo(&drm_fb_cma_get_gem_obj(state->fb, 0)->base);
+	vc4_bo_dec_usecnt(bo);
+}
+
 static const struct drm_plane_helper_funcs vc4_plane_helper_funcs = {
 	.atomic_check = vc4_plane_atomic_check,
 	.atomic_update = vc4_plane_atomic_update,
 	.prepare_fb = vc4_prepare_fb,
+	.cleanup_fb = vc4_cleanup_fb,
 };
 
 static void vc4_plane_destroy(struct drm_plane *plane)
diff --git a/drivers/gpu/drm/vc4/vc4_render_cl.c b/drivers/gpu/drm/vc4/vc4_render_cl.c
index 273984f71ae2..78b0b5ecfdc7 100644
--- a/drivers/gpu/drm/vc4/vc4_render_cl.c
+++ b/drivers/gpu/drm/vc4/vc4_render_cl.c
@@ -332,6 +332,9 @@  static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
 	setup->rcl = &vc4_bo_create(dev, size, true, VC4_BO_TYPE_RCL)->base;
 	if (IS_ERR(setup->rcl))
 		return PTR_ERR(setup->rcl);
+
+	WARN_ON(vc4_bo_inc_usecnt(to_vc4_bo(&setup->rcl->base)));
+
 	list_add_tail(&to_vc4_bo(&setup->rcl->base)->unref_head,
 		      &exec->unref_list);
 
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
index afae87004963..c01b93d453db 100644
--- a/include/uapi/drm/vc4_drm.h
+++ b/include/uapi/drm/vc4_drm.h
@@ -41,6 +41,7 @@  extern "C" {
 #define DRM_VC4_SET_TILING                        0x08
 #define DRM_VC4_GET_TILING                        0x09
 #define DRM_VC4_LABEL_BO                          0x0a
+#define DRM_VC4_GEM_MADVISE                       0x0b
 
 #define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
 #define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
@@ -53,6 +54,7 @@  extern "C" {
 #define DRM_IOCTL_VC4_SET_TILING          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SET_TILING, struct drm_vc4_set_tiling)
 #define DRM_IOCTL_VC4_GET_TILING          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_TILING, struct drm_vc4_get_tiling)
 #define DRM_IOCTL_VC4_LABEL_BO            DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_LABEL_BO, struct drm_vc4_label_bo)
+#define DRM_IOCTL_VC4_GEM_MADVISE         DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GEM_MADVISE, struct drm_vc4_gem_madvise)
 
 struct drm_vc4_submit_rcl_surface {
 	__u32 hindex; /* Handle index, or ~0 if not present. */
@@ -333,6 +335,16 @@  struct drm_vc4_label_bo {
 	__u64 name;
 };
 
+#define VC4_MADV_WILLNEED			0
+#define VC4_MADV_DONTNEED			1
+#define __VC4_MADV_PURGED			2
+
+struct drm_vc4_gem_madvise {
+	__u32 handle;
+	__u32 madv;
+	__u32 retained;
+};
+
 #if defined(__cplusplus)
 }
 #endif