
[v16,15/20] drm/shmem-helper: Add memory shrinker

Message ID 20230903170736.513347-16-dmitry.osipenko@collabora.com (mailing list archive)
State New, archived
Series Add generic memory shrinker to VirtIO-GPU and Panfrost DRM drivers

Commit Message

Dmitry Osipenko Sept. 3, 2023, 5:07 p.m. UTC
Introduce a common drm-shmem shrinker for DRM drivers.

To start using the drm-shmem shrinker, drivers should do the following
(a rough usage sketch follows the list):

1. Implement the evict() callback of the GEM object, where the driver
   should check whether the object is purgeable or evictable using the
   drm-shmem helpers and perform the shrinking action

2. Initialize the drm-shmem internals using drmm_gem_shmem_init(drm_device),
   which will register the drm-shmem shrinker

3. Implement a madvise IOCTL that will use drm_gem_shmem_madvise()
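
A rough sketch of these three steps in a hypothetical driver is shown
below. Everything prefixed with foo_, the drm_foo_madvise uapi struct and
the teardown helper are made up for illustration; the evict() hook is
assumed to take a struct drm_gem_object * and return an int, and error
handling is trimmed:

static int foo_gem_evict(struct drm_gem_object *obj)
{
        struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj);

        /* Called by the shrinker with the GEM reservation lock held. */

        /* Step 1: drop driver-side (GPU MMU) mappings, then shrink. */
        foo_gem_teardown_mappings(obj);         /* hypothetical */

        if (drm_gem_shmem_is_purgeable(shmem))
                drm_gem_shmem_purge_locked(shmem);
        else
                drm_gem_shmem_evict_locked(shmem);

        return 0;
}

static int foo_device_init(struct drm_device *ddev)
{
        /* Step 2: register the common shmem shrinker for this device. */
        return drmm_gem_shmem_init(ddev);
}

static int foo_ioctl_madvise(struct drm_device *ddev, void *data,
                             struct drm_file *file)
{
        struct drm_foo_madvise *args = data;    /* hypothetical uapi */
        struct drm_gem_object *obj;
        int ret;

        obj = drm_gem_object_lookup(file, args->handle);
        if (!obj)
                return -ENOENT;

        /* Step 3: let the helper update the madvise state. */
        ret = drm_gem_shmem_madvise(to_drm_gem_shmem_obj(obj), args->madv);
        if (ret >= 0) {
                args->retained = ret;   /* pages still resident? */
                ret = 0;
        }

        drm_gem_object_put(obj);
        return ret;
}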

Signed-off-by: Daniel Almeida <daniel.almeida@collabora.com>
Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
---
 drivers/gpu/drm/drm_gem_shmem_helper.c        | 442 ++++++++++++++++--
 .../gpu/drm/panfrost/panfrost_gem_shrinker.c  |   9 +-
 include/drm/drm_device.h                      |  10 +-
 include/drm/drm_gem_shmem_helper.h            |  71 ++-
 4 files changed, 494 insertions(+), 38 deletions(-)

Comments

Boris Brezillon Sept. 5, 2023, 8:03 a.m. UTC | #1
On Sun,  3 Sep 2023 20:07:31 +0300
Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:

> Introduce common drm-shmem shrinker for DRM drivers.
> 
> To start using drm-shmem shrinker drivers should do the following:
> 
> 1. Implement evict() callback of GEM object where driver should check
>    whether object is purgeable or evictable using drm-shmem helpers and
>    perform the shrinking action
> 
> 2. Initialize drm-shmem internals using drmm_gem_shmem_init(drm_device),
>    which will register drm-shmem shrinker
> 
> 3. Implement madvise IOCTL that will use drm_gem_shmem_madvise()
> 
> Signed-off-by: Daniel Almeida <daniel.almeida@collabora.com>
> Signed-off-by: Dmitry Osipenko <dmitry.osipenko@collabora.com>
> ---
>  drivers/gpu/drm/drm_gem_shmem_helper.c        | 442 ++++++++++++++++--
>  .../gpu/drm/panfrost/panfrost_gem_shrinker.c  |   9 +-
>  include/drm/drm_device.h                      |  10 +-
>  include/drm/drm_gem_shmem_helper.h            |  71 ++-
>  4 files changed, 494 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c
> index 4633a418faba..a0a6c7e505c8 100644
> --- a/drivers/gpu/drm/drm_gem_shmem_helper.c
> +++ b/drivers/gpu/drm/drm_gem_shmem_helper.c
> @@ -20,6 +20,7 @@
>  #include <drm/drm_device.h>
>  #include <drm/drm_drv.h>
>  #include <drm/drm_gem_shmem_helper.h>
> +#include <drm/drm_managed.h>
>  #include <drm/drm_prime.h>
>  #include <drm/drm_print.h>
>  
> @@ -88,8 +89,6 @@ __drm_gem_shmem_create(struct drm_device *dev, size_t size, bool private)
>  	if (ret)
>  		goto err_release;
>  
> -	INIT_LIST_HEAD(&shmem->madv_list);
> -
>  	if (!private) {
>  		/*
>  		 * Our buffers are kept pinned, so allocating them
> @@ -128,6 +127,62 @@ struct drm_gem_shmem_object *drm_gem_shmem_create(struct drm_device *dev, size_t
>  }
>  EXPORT_SYMBOL_GPL(drm_gem_shmem_create);
>  
> +static bool drm_gem_shmem_is_evictable(struct drm_gem_shmem_object *shmem)
> +{
> +	dma_resv_assert_held(shmem->base.resv);
> +
> +	return (shmem->madv >= 0) && shmem->base.funcs->evict &&
> +		refcount_read(&shmem->pages_use_count) &&
> +		!refcount_read(&shmem->pages_pin_count) &&
> +		!shmem->base.dma_buf && !shmem->base.import_attach &&
> +		shmem->sgt && !shmem->evicted;
> +}
> +
> +static void
> +drm_gem_shmem_update_pages_state_locked(struct drm_gem_shmem_object *shmem)
> +{
> +	struct drm_gem_object *obj = &shmem->base;
> +	struct drm_gem_shmem *shmem_mm = obj->dev->shmem_mm;
> +	struct drm_gem_shmem_shrinker *shmem_shrinker = &shmem_mm->shrinker;
> +
> +	dma_resv_assert_held(shmem->base.resv);
> +
> +	if (!shmem_shrinker || obj->import_attach)
> +		return;
> +
> +	if (shmem->madv < 0)
> +		drm_gem_lru_remove(&shmem->base);
> +	else if (drm_gem_shmem_is_evictable(shmem) || drm_gem_shmem_is_purgeable(shmem))
> +		drm_gem_lru_move_tail(&shmem_shrinker->lru_evictable, &shmem->base);
> +	else if (shmem->evicted)
> +		drm_gem_lru_move_tail(&shmem_shrinker->lru_evicted, &shmem->base);
> +	else if (!shmem->pages)
> +		drm_gem_lru_remove(&shmem->base);
> +	else
> +		drm_gem_lru_move_tail(&shmem_shrinker->lru_pinned, &shmem->base);
> +}
> +
> +static void
> +drm_gem_shmem_do_release_pages_locked(struct drm_gem_shmem_object *shmem)
> +{
> +	struct drm_gem_object *obj = &shmem->base;
> +
> +	if (!shmem->pages) {
> +		drm_WARN_ON(obj->dev, !shmem->evicted && shmem->madv >= 0);
> +		return;
> +	}
> +
> +#ifdef CONFIG_X86
> +	if (shmem->map_wc)
> +		set_pages_array_wb(shmem->pages, obj->size >> PAGE_SHIFT);
> +#endif
> +
> +	drm_gem_put_pages(obj, shmem->pages,
> +			  shmem->pages_mark_dirty_on_put,
> +			  shmem->pages_mark_accessed_on_put);
> +	shmem->pages = NULL;
> +}
> +
>  /**
>   * drm_gem_shmem_free - Free resources associated with a shmem GEM object
>   * @shmem: shmem GEM object to free
> @@ -142,8 +197,6 @@ void drm_gem_shmem_free(struct drm_gem_shmem_object *shmem)
>  	if (obj->import_attach) {
>  		drm_prime_gem_destroy(obj, shmem->sgt);
>  	} else if (!shmem->imported_sgt) {
> -		dma_resv_lock(shmem->base.resv, NULL);
> -
>  		drm_WARN_ON(obj->dev, refcount_read(&shmem->vmap_use_count));
>  
>  		if (shmem->sgt) {
> @@ -152,14 +205,27 @@ void drm_gem_shmem_free(struct drm_gem_shmem_object *shmem)
>  			sg_free_table(shmem->sgt);
>  			kfree(shmem->sgt);
>  		}
> -		if (refcount_read(&shmem->pages_use_count)) {
> -			drm_gem_shmem_put_pages_locked(shmem);

It would be preferable to introduce
drm_gem_shmem_do_release_pages_locked() earlier in the series (not a
huge fan of the name BTW, especially since this function can be called
without the lock held in the free path), so we keep things bisectable.

> -			drm_WARN_ON(obj->dev, !shmem->got_pages_sgt);
> +
> +		/*
> +		 * Destroying the object is a special case.. drm_gem_shmem_free()
> +		 * calls many things that WARN_ON if the obj lock is not held.

If that's the case, that should be fixed the same way you're fixing the
put_pages_locked() situation here.

>                * But
> +		 * acquiring the obj lock in drm_gem_shmem_release_pages_locked() can
> +		 * cause a locking order inversion between reservation_ww_class_mutex
> +		 * and fs_reclaim.
> +		 *
> +		 * This deadlock is not actually possible, because no one should
> +		 * be already holding the lock when drm_gem_shmem_free() is called.
> +		 * Unfortunately lockdep is not aware of this detail.  So when the
> +		 * refcount drops to zero, don't touch the reservation lock.
> +		 */
> +		if (shmem->got_pages_sgt &&
> +		    refcount_dec_and_test(&shmem->pages_use_count)) {
> +			drm_gem_shmem_do_release_pages_locked(shmem);
> +			shmem->got_pages_sgt = false;
>  		}

Leaking memory is the right thing to do if pages_use_count > 1 (it's
better to leak than having someone access memory it no longer owns), but
I think it's worth mentioning in the above comment.

>  
>  		drm_WARN_ON(obj->dev, refcount_read(&shmem->pages_use_count));
> -
> -		dma_resv_unlock(shmem->base.resv);
> +		drm_WARN_ON(obj->dev, shmem->got_pages_sgt);
>  	}
>  
>  	drm_gem_object_release(obj);
> @@ -167,15 +233,26 @@ void drm_gem_shmem_free(struct drm_gem_shmem_object *shmem)
>  }
>  EXPORT_SYMBOL_GPL(drm_gem_shmem_free);
>  
> -static int drm_gem_shmem_get_pages_locked(struct drm_gem_shmem_object *shmem)
> +static int
> +drm_gem_shmem_acquire_pages(struct drm_gem_shmem_object *shmem, bool init)
>  {
>  	struct drm_gem_object *obj = &shmem->base;
>  	struct page **pages;
>  
>  	dma_resv_assert_held(shmem->base.resv);

I'd defer this check to the caller (AFAICT, all callers have this
already). This way the function name is consistent with its behavior.

>  
> -	if (refcount_inc_not_zero(&shmem->pages_use_count))
> +	if (shmem->madv < 0) {
> +		drm_WARN_ON(obj->dev, shmem->pages);
> +		return -ENOMEM;
> +	}
> +
> +	if (shmem->pages) {
> +		drm_WARN_ON(obj->dev, !shmem->evicted);
>  		return 0;
> +	}
> +
> +	if (drm_WARN_ON(obj->dev, !(init ^ refcount_read(&shmem->pages_use_count))))
> +		return -EINVAL;
>  
>  	pages = drm_gem_get_pages(obj);
>  	if (IS_ERR(pages)) {
> @@ -196,8 +273,36 @@ static int drm_gem_shmem_get_pages_locked(struct drm_gem_shmem_object *shmem)
>  
>  	shmem->pages = pages;
>  
> +	return 0;
> +}
> +
> +static void
> +drm_gem_shmem_release_pages_locked(struct drm_gem_shmem_object *shmem)
> +{
> +	dma_resv_assert_held(shmem->base.resv);
> +	drm_gem_shmem_do_release_pages_locked(shmem);
> +}

Do we really need an intermediate helper here? Can't we let the callers
ensure the resv is held (that's already the case BTW)? This way, you
simply have drm_gem_shmem_{release,acquire}_pages() helpers, and the
drm_gem_shmem_release_pages() helper can be used in
drm_gem_shmem_free().
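
For illustration, a minimal sketch of that shape (not the actual patch;
the helper body is simply drm_gem_shmem_do_release_pages_locked() from
above, with the locking left to the callers):

/* Callers are responsible for holding the reservation lock. */
static void drm_gem_shmem_release_pages(struct drm_gem_shmem_object *shmem)
{
        struct drm_gem_object *obj = &shmem->base;

        if (!shmem->pages) {
                drm_WARN_ON(obj->dev, !shmem->evicted && shmem->madv >= 0);
                return;
        }

#ifdef CONFIG_X86
        if (shmem->map_wc)
                set_pages_array_wb(shmem->pages, obj->size >> PAGE_SHIFT);
#endif

        drm_gem_put_pages(obj, shmem->pages,
                          shmem->pages_mark_dirty_on_put,
                          shmem->pages_mark_accessed_on_put);
        shmem->pages = NULL;
}

void drm_gem_shmem_put_pages_locked(struct drm_gem_shmem_object *shmem)
{
        dma_resv_assert_held(shmem->base.resv);

        if (refcount_dec_and_test(&shmem->pages_use_count)) {
                drm_gem_shmem_release_pages(shmem);
                drm_gem_shmem_update_pages_state_locked(shmem);
        }
}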

> +
> +static int drm_gem_shmem_get_pages_locked(struct drm_gem_shmem_object *shmem)
> +{
> +	int err;
> +
> +	dma_resv_assert_held(shmem->base.resv);
> +
> +	if (shmem->madv < 0)
> +		return -ENOMEM;
> +
> +	if (refcount_inc_not_zero(&shmem->pages_use_count))
> +		return 0;
> +
> +	err = drm_gem_shmem_acquire_pages(shmem, true);
> +	if (err)
> +		return err;
> +
>  	refcount_set(&shmem->pages_use_count, 1);
>  
> +	drm_gem_shmem_update_pages_state_locked(shmem);
> +
>  	return 0;
>  }
>  
> @@ -209,20 +314,11 @@ static int drm_gem_shmem_get_pages_locked(struct drm_gem_shmem_object *shmem)
>   */
>  void drm_gem_shmem_put_pages_locked(struct drm_gem_shmem_object *shmem)
>  {
> -	struct drm_gem_object *obj = &shmem->base;
> -
>  	dma_resv_assert_held(shmem->base.resv);
>  
>  	if (refcount_dec_and_test(&shmem->pages_use_count)) {
> -#ifdef CONFIG_X86
> -		if (shmem->map_wc)
> -			set_pages_array_wb(shmem->pages, obj->size >> PAGE_SHIFT);
> -#endif
> -
> -		drm_gem_put_pages(obj, shmem->pages,
> -				  shmem->pages_mark_dirty_on_put,
> -				  shmem->pages_mark_accessed_on_put);
> -		shmem->pages = NULL;
> +		drm_gem_shmem_release_pages_locked(shmem);
> +		drm_gem_shmem_update_pages_state_locked(shmem);
>  	}
>  }
>  EXPORT_SYMBOL_GPL(drm_gem_shmem_put_pages_locked);
> @@ -251,8 +347,15 @@ static int drm_gem_shmem_pin_locked(struct drm_gem_shmem_object *shmem)
>  		return 0;
>  
>  	ret = drm_gem_shmem_get_pages_locked(shmem);
> -	if (!ret)
> +	if (!ret) {
> +		ret = drm_gem_shmem_swapin_locked(shmem);
> +		if (ret) {
> +			drm_gem_shmem_put_pages_locked(shmem);
> +			return ret;
> +		}
> +
>  		refcount_set(&shmem->pages_pin_count, 1);
> +	}
>  
>  	return ret;
>  }
> @@ -448,29 +551,54 @@ int drm_gem_shmem_madvise_locked(struct drm_gem_shmem_object *shmem, int madv)
>  
>  	madv = shmem->madv;
>  
> +	drm_gem_shmem_update_pages_state_locked(shmem);
> +
>  	return (madv >= 0);
>  }
>  EXPORT_SYMBOL_GPL(drm_gem_shmem_madvise_locked);
>  
> -void drm_gem_shmem_purge_locked(struct drm_gem_shmem_object *shmem)
> +int drm_gem_shmem_madvise(struct drm_gem_shmem_object *shmem, int madv)
> +{
> +	struct drm_gem_object *obj = &shmem->base;
> +	int ret;
> +
> +	ret = dma_resv_lock_interruptible(obj->resv, NULL);
> +	if (ret)
> +		return ret;
> +
> +	ret = drm_gem_shmem_madvise_locked(shmem, madv);
> +	dma_resv_unlock(obj->resv);
> +
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(drm_gem_shmem_madvise);
> +
> +static void drm_gem_shmem_unpin_pages_locked(struct drm_gem_shmem_object *shmem)
>  {
>  	struct drm_gem_object *obj = &shmem->base;
>  	struct drm_device *dev = obj->dev;
>  
>  	dma_resv_assert_held(shmem->base.resv);
>  
> -	drm_WARN_ON(obj->dev, !drm_gem_shmem_is_purgeable(shmem));
> +	if (shmem->evicted)
> +		return;
>  
>  	dma_unmap_sgtable(dev->dev, shmem->sgt, DMA_BIDIRECTIONAL, 0);
> +	drm_gem_shmem_release_pages_locked(shmem);
> +	drm_vma_node_unmap(&obj->vma_node, dev->anon_inode->i_mapping);
> +
>  	sg_free_table(shmem->sgt);
>  	kfree(shmem->sgt);
>  	shmem->sgt = NULL;
> +}
>  
> -	drm_gem_shmem_put_pages_locked(shmem);
> +void drm_gem_shmem_purge_locked(struct drm_gem_shmem_object *shmem)
> +{
> +	struct drm_gem_object *obj = &shmem->base;
>  
> -	shmem->madv = -1;
> +	drm_WARN_ON(obj->dev, !drm_gem_shmem_is_purgeable(shmem));
>  
> -	drm_vma_node_unmap(&obj->vma_node, dev->anon_inode->i_mapping);
> +	drm_gem_shmem_unpin_pages_locked(shmem);
>  	drm_gem_free_mmap_offset(obj);
>  
>  	/* Our goal here is to return as much of the memory as
> @@ -481,9 +609,59 @@ void drm_gem_shmem_purge_locked(struct drm_gem_shmem_object *shmem)
>  	shmem_truncate_range(file_inode(obj->filp), 0, (loff_t)-1);
>  
>  	invalidate_mapping_pages(file_inode(obj->filp)->i_mapping, 0, (loff_t)-1);
> +
> +	shmem->madv = -1;
> +	shmem->evicted = false;
> +	drm_gem_shmem_update_pages_state_locked(shmem);
>  }
>  EXPORT_SYMBOL_GPL(drm_gem_shmem_purge_locked);
>  
> +/**
> + * drm_gem_shmem_swapin_locked() - Moves shmem GEM back to memory and enables
> + *                                 hardware access to the memory.
> + * @shmem: shmem GEM object
> + *
> + * This function moves shmem GEM back to memory if it was previously evicted
> + * by the memory shrinker. The GEM is ready to use on success.
> + *
> + * Returns:
> + * 0 on success or a negative error code on failure.
> + */
> +int drm_gem_shmem_swapin_locked(struct drm_gem_shmem_object *shmem)
> +{
> +	struct drm_gem_object *obj = &shmem->base;
> +	struct sg_table *sgt;
> +	int err;
> +
> +	dma_resv_assert_held(shmem->base.resv);
> +
> +	if (shmem->evicted) {
> +		err = drm_gem_shmem_acquire_pages(shmem, false);
> +		if (err)
> +			return err;
> +
> +		sgt = drm_gem_shmem_get_sg_table(shmem);
> +		if (IS_ERR(sgt))
> +			return PTR_ERR(sgt);
> +
> +		err = dma_map_sgtable(obj->dev->dev, sgt,
> +				      DMA_BIDIRECTIONAL, 0);
> +		if (err) {
> +			sg_free_table(sgt);
> +			kfree(sgt);
> +			return err;
> +		}
> +
> +		shmem->sgt = sgt;
> +		shmem->evicted = false;
> +
> +		drm_gem_shmem_update_pages_state_locked(shmem);
> +	}
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(drm_gem_shmem_swapin_locked);
> +
>  /**
>   * drm_gem_shmem_dumb_create - Create a dumb shmem buffer object
>   * @file: DRM file structure to create the dumb buffer for
> @@ -530,22 +708,34 @@ static vm_fault_t drm_gem_shmem_fault(struct vm_fault *vmf)
>  	vm_fault_t ret;
>  	struct page *page;
>  	pgoff_t page_offset;
> +	bool pages_unpinned;
> +	int err;
>  
>  	/* We don't use vmf->pgoff since that has the fake offset */
>  	page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
>  
>  	dma_resv_lock(shmem->base.resv, NULL);
>  
> -	if (page_offset >= num_pages ||
> -	    drm_WARN_ON_ONCE(obj->dev, !shmem->pages) ||
> -	    shmem->madv < 0) {
> +	/* Sanity-check that we have the pages pointer when it should present */
> +	pages_unpinned = (shmem->evicted || shmem->madv < 0 ||
> +			  !refcount_read(&shmem->pages_use_count));
> +	drm_WARN_ON_ONCE(obj->dev, !shmem->pages ^ pages_unpinned);
> +
> +	if (page_offset >= num_pages || (!shmem->pages && !shmem->evicted)) {
>  		ret = VM_FAULT_SIGBUS;
>  	} else {
> +		err = drm_gem_shmem_swapin_locked(shmem);
> +		if (err) {
> +			ret = VM_FAULT_OOM;
> +			goto unlock;
> +		}
> +
>  		page = shmem->pages[page_offset];
>  
>  		ret = vmf_insert_pfn(vma, vmf->address, page_to_pfn(page));
>  	}
>  
> +unlock:
>  	dma_resv_unlock(shmem->base.resv);
>  
>  	return ret;
> @@ -568,6 +758,7 @@ static void drm_gem_shmem_vm_open(struct vm_area_struct *vma)
>  	drm_WARN_ON_ONCE(obj->dev,
>  			 !refcount_inc_not_zero(&shmem->pages_use_count));
>  
> +	drm_gem_shmem_update_pages_state_locked(shmem);
>  	dma_resv_unlock(shmem->base.resv);
>  
>  	drm_gem_vm_open(vma);
> @@ -653,7 +844,9 @@ void drm_gem_shmem_print_info(const struct drm_gem_shmem_object *shmem,
>  	drm_printf_indent(p, indent, "pages_pin_count=%u\n", refcount_read(&shmem->pages_pin_count));
>  	drm_printf_indent(p, indent, "pages_use_count=%u\n", refcount_read(&shmem->pages_use_count));
>  	drm_printf_indent(p, indent, "vmap_use_count=%u\n", refcount_read(&shmem->vmap_use_count));
> +	drm_printf_indent(p, indent, "evicted=%d\n", shmem->evicted);
>  	drm_printf_indent(p, indent, "vaddr=%p\n", shmem->vaddr);
> +	drm_printf_indent(p, indent, "madv=%d\n", shmem->madv);
>  }
>  EXPORT_SYMBOL_GPL(drm_gem_shmem_print_info);
>  
> @@ -715,6 +908,8 @@ static struct sg_table *drm_gem_shmem_get_pages_sgt_locked(struct drm_gem_shmem_
>  
>  	shmem->sgt = sgt;
>  
> +	drm_gem_shmem_update_pages_state_locked(shmem);
> +
>  	return sgt;
>  
>  err_free_sgt:
> @@ -802,6 +997,191 @@ drm_gem_shmem_prime_import_sg_table(struct drm_device *dev,
>  }
>  EXPORT_SYMBOL_GPL(drm_gem_shmem_prime_import_sg_table);
>  
> +static struct drm_gem_shmem_shrinker *
> +to_drm_gem_shmem_shrinker(struct shrinker *shrinker)
> +{
> +	return container_of(shrinker, struct drm_gem_shmem_shrinker, base);
> +}
> +
> +static unsigned long
> +drm_gem_shmem_shrinker_count_objects(struct shrinker *shrinker,
> +				     struct shrink_control *sc)
> +{
> +	struct drm_gem_shmem_shrinker *shmem_shrinker =
> +					to_drm_gem_shmem_shrinker(shrinker);
> +	unsigned long count = shmem_shrinker->lru_evictable.count;
> +
> +	if (count >= SHRINK_EMPTY)
> +		return SHRINK_EMPTY - 1;
> +
> +	return count ?: SHRINK_EMPTY;
> +}
> +
> +void drm_gem_shmem_evict_locked(struct drm_gem_shmem_object *shmem)
> +{
> +	struct drm_gem_object *obj = &shmem->base;
> +
> +	drm_WARN_ON(obj->dev, !drm_gem_shmem_is_evictable(shmem));
> +	drm_WARN_ON(obj->dev, shmem->evicted);
> +
> +	drm_gem_shmem_unpin_pages_locked(shmem);
> +
> +	shmem->evicted = true;
> +	drm_gem_shmem_update_pages_state_locked(shmem);
> +}
> +EXPORT_SYMBOL_GPL(drm_gem_shmem_evict_locked);
> +
> +static bool drm_gem_shmem_shrinker_evict_locked(struct drm_gem_object *obj)
> +{
> +	struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj);
> +	int err;
> +
> +	if (!drm_gem_shmem_is_evictable(shmem) ||
> +	    get_nr_swap_pages() < obj->size >> PAGE_SHIFT)
> +		return false;
> +
> +	err = drm_gem_evict_locked(obj);
> +	if (err)
> +		return false;
> +
> +	return true;
> +}
> +
> +static bool drm_gem_shmem_shrinker_purge_locked(struct drm_gem_object *obj)
> +{
> +	struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj);
> +	int err;
> +
> +	if (!drm_gem_shmem_is_purgeable(shmem))
> +		return false;
> +
> +	err = drm_gem_evict_locked(obj);
> +	if (err)
> +		return false;
> +
> +	return true;
> +}
> +
> +static unsigned long
> +drm_gem_shmem_shrinker_scan_objects(struct shrinker *shrinker,
> +				    struct shrink_control *sc)
> +{
> +	struct drm_gem_shmem_shrinker *shmem_shrinker;
> +	unsigned long nr_to_scan = sc->nr_to_scan;
> +	unsigned long remaining = 0;
> +	unsigned long freed = 0;
> +
> +	shmem_shrinker = to_drm_gem_shmem_shrinker(shrinker);
> +
> +	/* purge as many objects as we can */
> +	freed += drm_gem_lru_scan(&shmem_shrinker->lru_evictable,
> +				  nr_to_scan, &remaining,
> +				  drm_gem_shmem_shrinker_purge_locked);
> +
> +	/* evict as many objects as we can */
> +	if (freed < nr_to_scan)
> +		freed += drm_gem_lru_scan(&shmem_shrinker->lru_evictable,
> +					  nr_to_scan - freed, &remaining,
> +					  drm_gem_shmem_shrinker_evict_locked);
> +
> +	return (freed > 0 && remaining > 0) ? freed : SHRINK_STOP;
> +}
> +
> +static int drm_gem_shmem_shrinker_init(struct drm_gem_shmem *shmem_mm,
> +				       const char *shrinker_name)
> +{
> +	struct drm_gem_shmem_shrinker *shmem_shrinker = &shmem_mm->shrinker;
> +	int err;
> +
> +	shmem_shrinker->base.count_objects = drm_gem_shmem_shrinker_count_objects;
> +	shmem_shrinker->base.scan_objects = drm_gem_shmem_shrinker_scan_objects;
> +	shmem_shrinker->base.seeks = DEFAULT_SEEKS;
> +
> +	mutex_init(&shmem_shrinker->lock);
> +	drm_gem_lru_init(&shmem_shrinker->lru_evictable, &shmem_shrinker->lock);
> +	drm_gem_lru_init(&shmem_shrinker->lru_evicted, &shmem_shrinker->lock);
> +	drm_gem_lru_init(&shmem_shrinker->lru_pinned, &shmem_shrinker->lock);
> +
> +	err = register_shrinker(&shmem_shrinker->base, shrinker_name);
> +	if (err) {
> +		mutex_destroy(&shmem_shrinker->lock);
> +		return err;
> +	}
> +
> +	return 0;
> +}
> +
> +static void drm_gem_shmem_shrinker_release(struct drm_device *dev,
> +					   struct drm_gem_shmem *shmem_mm)
> +{
> +	struct drm_gem_shmem_shrinker *shmem_shrinker = &shmem_mm->shrinker;
> +
> +	unregister_shrinker(&shmem_shrinker->base);
> +	drm_WARN_ON(dev, !list_empty(&shmem_shrinker->lru_evictable.list));
> +	drm_WARN_ON(dev, !list_empty(&shmem_shrinker->lru_evicted.list));
> +	drm_WARN_ON(dev, !list_empty(&shmem_shrinker->lru_pinned.list));
> +	mutex_destroy(&shmem_shrinker->lock);
> +}
> +
> +static int drm_gem_shmem_init(struct drm_device *dev)
> +{
> +	int err;
> +
> +	if (drm_WARN_ON(dev, dev->shmem_mm))
> +		return -EBUSY;
> +
> +	dev->shmem_mm = kzalloc(sizeof(*dev->shmem_mm), GFP_KERNEL);
> +	if (!dev->shmem_mm)
> +		return -ENOMEM;
> +
> +	err = drm_gem_shmem_shrinker_init(dev->shmem_mm, dev->unique);
> +	if (err)
> +		goto free_gem_shmem;
> +
> +	return 0;
> +
> +free_gem_shmem:
> +	kfree(dev->shmem_mm);
> +	dev->shmem_mm = NULL;
> +
> +	return err;
> +}
> +
> +static void drm_gem_shmem_release(struct drm_device *dev, void *ptr)
> +{
> +	struct drm_gem_shmem *shmem_mm = dev->shmem_mm;
> +
> +	drm_gem_shmem_shrinker_release(dev, shmem_mm);
> +	dev->shmem_mm = NULL;
> +	kfree(shmem_mm);
> +}
> +
> +/**
> + * drmm_gem_shmem_init() - Initialize drm-shmem internals
> + * @dev: DRM device
> + *
> + * Cleanup is automatically managed as part of DRM device releasing.
> + * Calling this function multiple times will result in a error.
> + *
> + * Returns:
> + * 0 on success or a negative error code on failure.
> + */
> +int drmm_gem_shmem_init(struct drm_device *dev)
> +{
> +	int err;
> +
> +	err = drm_gem_shmem_init(dev);
> +	if (err)
> +		return err;
> +
> +	err = drmm_add_action_or_reset(dev, drm_gem_shmem_release, NULL);
> +	if (err)
> +		return err;
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(drmm_gem_shmem_init);
> +
>  MODULE_DESCRIPTION("DRM SHMEM memory-management helpers");
>  MODULE_IMPORT_NS(DMA_BUF);
>  MODULE_LICENSE("GPL v2");
> diff --git a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c
> index 72193bd734e1..1aa94fff7072 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c
> +++ b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c
> @@ -15,6 +15,13 @@
>  #include "panfrost_gem.h"
>  #include "panfrost_mmu.h"
>  
> +static bool panfrost_gem_shmem_is_purgeable(struct drm_gem_shmem_object *shmem)
> +{
> +	return (shmem->madv > 0) &&
> +		!refcount_read(&shmem->pages_pin_count) && shmem->sgt &&
> +		!shmem->base.dma_buf && !shmem->base.import_attach;
> +}
> +
>  static unsigned long
>  panfrost_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc)
>  {
> @@ -27,7 +34,7 @@ panfrost_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc
>  		return 0;
>  
>  	list_for_each_entry(shmem, &pfdev->shrinker_list, madv_list) {
> -		if (drm_gem_shmem_is_purgeable(shmem))
> +		if (panfrost_gem_shmem_is_purgeable(shmem))
>  			count += shmem->base.size >> PAGE_SHIFT;
>  	}
>  
> diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h
> index 7cf4afae2e79..a978f0cb5e84 100644
> --- a/include/drm/drm_device.h
> +++ b/include/drm/drm_device.h
> @@ -16,6 +16,7 @@ struct drm_vblank_crtc;
>  struct drm_vma_offset_manager;
>  struct drm_vram_mm;
>  struct drm_fb_helper;
> +struct drm_gem_shmem_shrinker;
>  
>  struct inode;
>  
> @@ -290,8 +291,13 @@ struct drm_device {
>  	/** @vma_offset_manager: GEM information */
>  	struct drm_vma_offset_manager *vma_offset_manager;
>  
> -	/** @vram_mm: VRAM MM memory manager */
> -	struct drm_vram_mm *vram_mm;
> +	union {
> +		/** @vram_mm: VRAM MM memory manager */
> +		struct drm_vram_mm *vram_mm;
> +
> +		/** @shmem_mm: SHMEM GEM memory manager */
> +		struct drm_gem_shmem *shmem_mm;
> +	};
>  
>  	/**
>  	 * @switch_power_state:
> diff --git a/include/drm/drm_gem_shmem_helper.h b/include/drm/drm_gem_shmem_helper.h
> index 63e91e8f2d5c..65c99da25048 100644
> --- a/include/drm/drm_gem_shmem_helper.h
> +++ b/include/drm/drm_gem_shmem_helper.h
> @@ -6,6 +6,7 @@
>  #include <linux/fs.h>
>  #include <linux/mm.h>
>  #include <linux/mutex.h>
> +#include <linux/shrinker.h>
>  
>  #include <drm/drm_file.h>
>  #include <drm/drm_gem.h>
> @@ -13,6 +14,7 @@
>  #include <drm/drm_prime.h>
>  
>  struct dma_buf_attachment;
> +struct drm_device;
>  struct drm_mode_create_dumb;
>  struct drm_printer;
>  struct sg_table;
> @@ -53,8 +55,8 @@ struct drm_gem_shmem_object {
>  	 * @madv: State for madvise
>  	 *
>  	 * 0 is active/inuse.
> +	 * 1 is not-needed/can-be-purged
>  	 * A negative value is the object is purged.
> -	 * Positive values are driver specific and not used by the helpers.
>  	 */
>  	int madv;
>  
> @@ -115,6 +117,12 @@ struct drm_gem_shmem_object {
>  	 * @map_wc: map object write-combined (instead of using shmem defaults).
>  	 */
>  	bool map_wc : 1;
> +
> +	/**
> +	 * @evicted: True if shmem pages are evicted by the memory shrinker.
> +	 * Used internally by memory shrinker.
> +	 */
> +	bool evicted : 1;
>  };
>  
>  #define to_drm_gem_shmem_obj(obj) \
> @@ -133,14 +141,22 @@ void drm_gem_shmem_vunmap_locked(struct drm_gem_shmem_object *shmem,
>  int drm_gem_shmem_mmap(struct drm_gem_shmem_object *shmem, struct vm_area_struct *vma);
>  
>  int drm_gem_shmem_madvise_locked(struct drm_gem_shmem_object *shmem, int madv);
> +int drm_gem_shmem_madvise(struct drm_gem_shmem_object *shmem, int madv);
>  
>  static inline bool drm_gem_shmem_is_purgeable(struct drm_gem_shmem_object *shmem)
>  {
> -	return (shmem->madv > 0) &&
> -		!refcount_read(&shmem->pages_pin_count) && shmem->sgt &&
> -		!shmem->base.dma_buf && !shmem->base.import_attach;
> +	dma_resv_assert_held(shmem->base.resv);
> +
> +	return (shmem->madv > 0) && shmem->base.funcs->evict &&
> +		refcount_read(&shmem->pages_use_count) &&
> +		!refcount_read(&shmem->pages_pin_count) &&
> +		!shmem->base.dma_buf && !shmem->base.import_attach &&
> +		(shmem->sgt || shmem->evicted);
>  }
>  
> +int drm_gem_shmem_swapin_locked(struct drm_gem_shmem_object *shmem);
> +
> +void drm_gem_shmem_evict_locked(struct drm_gem_shmem_object *shmem);
>  void drm_gem_shmem_purge_locked(struct drm_gem_shmem_object *shmem);
>  
>  struct sg_table *drm_gem_shmem_get_sg_table(struct drm_gem_shmem_object *shmem);
> @@ -284,6 +300,53 @@ static inline int drm_gem_shmem_object_mmap(struct drm_gem_object *obj, struct v
>  	return drm_gem_shmem_mmap(shmem, vma);
>  }
>  
> +/**
> + * drm_gem_shmem_object_madvise - unlocked GEM object function for drm_gem_shmem_madvise_locked()
> + * @obj: GEM object
> + * @madv: Madvise value
> + *
> + * This function wraps drm_gem_shmem_madvise_locked(), providing unlocked variant.
> + *
> + * Returns:
> + * 0 on success or a negative error code on failure.
> + */
> +static inline int drm_gem_shmem_object_madvise(struct drm_gem_object *obj, int madv)
> +{
> +	struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj);
> +
> +	return drm_gem_shmem_madvise(shmem, madv);
> +}
> +
> +/**
> + * struct drm_gem_shmem_shrinker - Memory shrinker of GEM shmem memory manager
> + */
> +struct drm_gem_shmem_shrinker {
> +	/** @base: Shrinker for purging shmem GEM objects */
> +	struct shrinker base;
> +
> +	/** @lock: Protects @lru_* */
> +	struct mutex lock;
> +
> +	/** @lru_pinned: List of pinned shmem GEM objects */
> +	struct drm_gem_lru lru_pinned;
> +
> +	/** @lru_evictable: List of shmem GEM objects to be evicted */
> +	struct drm_gem_lru lru_evictable;
> +
> +	/** @lru_evicted: List of evicted shmem GEM objects */
> +	struct drm_gem_lru lru_evicted;
> +};
> +
> +/**
> + * struct drm_gem_shmem - GEM shmem memory manager
> + */
> +struct drm_gem_shmem {
> +	/** @shrinker: GEM shmem shrinker */
> +	struct drm_gem_shmem_shrinker shrinker;
> +};
> +
> +int drmm_gem_shmem_init(struct drm_device *dev);
> +
>  /*
>   * Driver ops
>   */
Dan Carpenter Sept. 7, 2023, 10:03 a.m. UTC | #2
Hi Dmitry,

kernel test robot noticed the following build warnings:


url:    https://github.com/intel-lab-lkp/linux/commits/Dmitry-Osipenko/drm-shmem-helper-Fix-UAF-in-error-path-when-freeing-SGT-of-imported-GEM/20230904-011037
base:   linus/master
patch link:    https://lore.kernel.org/r/20230903170736.513347-16-dmitry.osipenko%40collabora.com
patch subject: [PATCH v16 15/20] drm/shmem-helper: Add memory shrinker
config: x86_64-randconfig-161-20230907 (https://download.01.org/0day-ci/archive/20230907/202309070814.jyGJOJzY-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce: (https://download.01.org/0day-ci/archive/20230907/202309070814.jyGJOJzY-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
| Closes: https://lore.kernel.org/r/202309070814.jyGJOJzY-lkp@intel.com/

smatch warnings:
drivers/gpu/drm/drm_gem_shmem_helper.c:733 drm_gem_shmem_fault() error: we previously assumed 'shmem->pages' could be null (see line 724)

vim +733 drivers/gpu/drm/drm_gem_shmem_helper.c

2194a63a818db7 Noralf Trønnes  2019-03-12  702  static vm_fault_t drm_gem_shmem_fault(struct vm_fault *vmf)
2194a63a818db7 Noralf Trønnes  2019-03-12  703  {
2194a63a818db7 Noralf Trønnes  2019-03-12  704  	struct vm_area_struct *vma = vmf->vma;
2194a63a818db7 Noralf Trønnes  2019-03-12  705  	struct drm_gem_object *obj = vma->vm_private_data;
2194a63a818db7 Noralf Trønnes  2019-03-12  706  	struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj);
2194a63a818db7 Noralf Trønnes  2019-03-12  707  	loff_t num_pages = obj->size >> PAGE_SHIFT;
d611b4a0907cec Neil Roberts    2021-02-23  708  	vm_fault_t ret;
2194a63a818db7 Noralf Trønnes  2019-03-12  709  	struct page *page;
11d5a4745e00e7 Neil Roberts    2021-02-23  710  	pgoff_t page_offset;
2c607edf57db6a Dmitry Osipenko 2023-09-03  711  	bool pages_unpinned;
2c607edf57db6a Dmitry Osipenko 2023-09-03  712  	int err;
11d5a4745e00e7 Neil Roberts    2021-02-23  713  
11d5a4745e00e7 Neil Roberts    2021-02-23  714  	/* We don't use vmf->pgoff since that has the fake offset */
11d5a4745e00e7 Neil Roberts    2021-02-23  715  	page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
2194a63a818db7 Noralf Trønnes  2019-03-12  716  
21aa27ddc58269 Dmitry Osipenko 2023-05-30  717  	dma_resv_lock(shmem->base.resv, NULL);
2194a63a818db7 Noralf Trønnes  2019-03-12  718  
2c607edf57db6a Dmitry Osipenko 2023-09-03  719  	/* Sanity-check that we have the pages pointer when it should present */
2c607edf57db6a Dmitry Osipenko 2023-09-03  720  	pages_unpinned = (shmem->evicted || shmem->madv < 0 ||
2c607edf57db6a Dmitry Osipenko 2023-09-03  721  			  !refcount_read(&shmem->pages_use_count));
2c607edf57db6a Dmitry Osipenko 2023-09-03  722  	drm_WARN_ON_ONCE(obj->dev, !shmem->pages ^ pages_unpinned);
2c607edf57db6a Dmitry Osipenko 2023-09-03  723  
2c607edf57db6a Dmitry Osipenko 2023-09-03 @724  	if (page_offset >= num_pages || (!shmem->pages && !shmem->evicted)) {
                                                                                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Should this be || instead of &&?  (The other thing that people do is
add "!shmem->evicted" for readability even though it doesn't need to be
checked.  So maybe that's the issue, but the checker assumes it needs to
be checked).

d611b4a0907cec Neil Roberts    2021-02-23  725  		ret = VM_FAULT_SIGBUS;
d611b4a0907cec Neil Roberts    2021-02-23  726  	} else {
2c607edf57db6a Dmitry Osipenko 2023-09-03  727  		err = drm_gem_shmem_swapin_locked(shmem);

Or maybe it's because the kbuild bot can't use the cross function db
and shmem->pages is assigned here?

2c607edf57db6a Dmitry Osipenko 2023-09-03  728  		if (err) {
2c607edf57db6a Dmitry Osipenko 2023-09-03  729  			ret = VM_FAULT_OOM;
2c607edf57db6a Dmitry Osipenko 2023-09-03  730  			goto unlock;
2c607edf57db6a Dmitry Osipenko 2023-09-03  731  		}
2c607edf57db6a Dmitry Osipenko 2023-09-03  732  
11d5a4745e00e7 Neil Roberts    2021-02-23 @733  		page = shmem->pages[page_offset];
                                                                       ^^^^^^^^^^^^
Unchecked dereference

2194a63a818db7 Noralf Trønnes  2019-03-12  734  
8b93d1d7dbd578 Daniel Vetter   2021-08-12  735  		ret = vmf_insert_pfn(vma, vmf->address, page_to_pfn(page));
d611b4a0907cec Neil Roberts    2021-02-23  736  	}
d611b4a0907cec Neil Roberts    2021-02-23  737  
2c607edf57db6a Dmitry Osipenko 2023-09-03  738  unlock:
21aa27ddc58269 Dmitry Osipenko 2023-05-30  739  	dma_resv_unlock(shmem->base.resv);
d611b4a0907cec Neil Roberts    2021-02-23  740  
d611b4a0907cec Neil Roberts    2021-02-23  741  	return ret;
2194a63a818db7 Noralf Trønnes  2019-03-12  742  }
Dmitry Osipenko Sept. 11, 2023, 11:44 p.m. UTC | #3
On 9/7/23 13:03, Dan Carpenter wrote:
> 2c607edf57db6a Dmitry Osipenko 2023-09-03 @724  	if (page_offset >= num_pages || (!shmem->pages && !shmem->evicted)) {
>                                                                                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> Should this be || instead of &&?  (The other thing that people do is
> add "!shmem->evicted" for readability even though it doesn't need to be
> checked.  So maybe that's the issue, but the checker assumes it needs to
> be checked).
> 
> d611b4a0907cec Neil Roberts    2021-02-23  725  		ret = VM_FAULT_SIGBUS;
> d611b4a0907cec Neil Roberts    2021-02-23  726  	} else {
> 2c607edf57db6a Dmitry Osipenko 2023-09-03  727  		err = drm_gem_shmem_swapin_locked(shmem);
> 
> Or maybe it's because the kbuild bot can't use the cross function db
> and shmem->pages is assigned here?

Should be a function db problem. The shmem->pages won't be NULL if
drm_gem_shmem_swapin_locked() succeeds.
Dmitry Osipenko Sept. 13, 2023, 12:56 a.m. UTC | #4
On 9/5/23 11:03, Boris Brezillon wrote:
>>                * But
>> +		 * acquiring the obj lock in drm_gem_shmem_release_pages_locked() can
>> +		 * cause a locking order inversion between reservation_ww_class_mutex
>> +		 * and fs_reclaim.
>> +		 *
>> +		 * This deadlock is not actually possible, because no one should
>> +		 * be already holding the lock when drm_gem_shmem_free() is called.
>> +		 * Unfortunately lockdep is not aware of this detail.  So when the
>> +		 * refcount drops to zero, don't touch the reservation lock.
>> +		 */
>> +		if (shmem->got_pages_sgt &&
>> +		    refcount_dec_and_test(&shmem->pages_use_count)) {
>> +			drm_gem_shmem_do_release_pages_locked(shmem);
>> +			shmem->got_pages_sgt = false;
>>  		}
> Leaking memory is the right thing to do if pages_use_count > 1 (it's
> better to leak than having someone access memory it no longer owns), but
> I think it's worth mentioning in the above comment.

It's unlikely that it will be only a leak without a follow-up
use-after-free. Neither is acceptable.

drm_gem_shmem_free() could be changed such that the kernel won't blow up
on a refcnt bug, but that's not worthwhile doing because drivers
shouldn't have silly bugs.
Boris Brezillon Sept. 13, 2023, 7:48 a.m. UTC | #5
On Wed, 13 Sep 2023 03:56:14 +0300
Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:

> On 9/5/23 11:03, Boris Brezillon wrote:
> >>                * But
> >> +		 * acquiring the obj lock in drm_gem_shmem_release_pages_locked() can
> >> +		 * cause a locking order inversion between reservation_ww_class_mutex
> >> +		 * and fs_reclaim.
> >> +		 *
> >> +		 * This deadlock is not actually possible, because no one should
> >> +		 * be already holding the lock when drm_gem_shmem_free() is called.
> >> +		 * Unfortunately lockdep is not aware of this detail.  So when the
> >> +		 * refcount drops to zero, don't touch the reservation lock.
> >> +		 */
> >> +		if (shmem->got_pages_sgt &&
> >> +		    refcount_dec_and_test(&shmem->pages_use_count)) {
> >> +			drm_gem_shmem_do_release_pages_locked(shmem);
> >> +			shmem->got_pages_sgt = false;
> >>  		}  
> > Leaking memory is the right thing to do if pages_use_count > 1 (it's
> > better to leak than having someone access memory it no longer owns), but
> > I think it's worth mentioning in the above comment.  
> 
> It's unlikely that it will be only a leak without a following up
> use-after-free. Neither is acceptable.

Not necessarily, if you have a page leak, it could be that the GPU has
access to those pages, but doesn't need the GEM object anymore
(pages are mapped by the iommu, which doesn't need shmem->sgt or
shmem->pages after the mapping is created). Without a WARN_ON(), this
can go unnoticed and lead to memory corruptions/information leaks.

> 
> The drm_gem_shmem_free() could be changed such that kernel won't blow up
> on a refcnt bug, but that's not worthwhile doing because drivers
> shouldn't have silly bugs.

We definitely don't want to fix that, but we want to complain loudly
(WARN_ON()), and make sure the risk is limited (preventing memory from
being re-assigned to someone else by not freeing it).
Dmitry Osipenko Sept. 14, 2023, 4:02 a.m. UTC | #6
On 9/13/23 10:48, Boris Brezillon wrote:
> On Wed, 13 Sep 2023 03:56:14 +0300
> Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
> 
>> On 9/5/23 11:03, Boris Brezillon wrote:
>>>>                * But
>>>> +		 * acquiring the obj lock in drm_gem_shmem_release_pages_locked() can
>>>> +		 * cause a locking order inversion between reservation_ww_class_mutex
>>>> +		 * and fs_reclaim.
>>>> +		 *
>>>> +		 * This deadlock is not actually possible, because no one should
>>>> +		 * be already holding the lock when drm_gem_shmem_free() is called.
>>>> +		 * Unfortunately lockdep is not aware of this detail.  So when the
>>>> +		 * refcount drops to zero, don't touch the reservation lock.
>>>> +		 */
>>>> +		if (shmem->got_pages_sgt &&
>>>> +		    refcount_dec_and_test(&shmem->pages_use_count)) {
>>>> +			drm_gem_shmem_do_release_pages_locked(shmem);
>>>> +			shmem->got_pages_sgt = false;
>>>>  		}  
>>> Leaking memory is the right thing to do if pages_use_count > 1 (it's
>>> better to leak than having someone access memory it no longer owns), but
>>> I think it's worth mentioning in the above comment.  
>>
>> It's unlikely that it will be only a leak without a following up
>> use-after-free. Neither is acceptable.
> 
> Not necessarily, if you have a page leak, it could be that the GPU has
> access to those pages, but doesn't need the GEM object anymore
> (pages are mapped by the iommu, which doesn't need shmem->sgt or
> shmem->pages after the mapping is created). Without a WARN_ON(), this
> can go unnoticed and lead to memory corruptions/information leaks.
> 
>>
>> The drm_gem_shmem_free() could be changed such that kernel won't blow up
>> on a refcnt bug, but that's not worthwhile doing because drivers
>> shouldn't have silly bugs.
> 
> We definitely don't want to fix that, but we want to complain loudly
> (WARN_ON()), and make sure the risk is limited (preventing memory from
> being re-assigned to someone else by not freeing it).

That's what the code did and continues to do here. Not exactly sure what
you're trying to say. I'm going to relocate the comment in v17 to
put_pages(); we can continue discussing it there if I'm missing your point.
Boris Brezillon Sept. 14, 2023, 7:36 a.m. UTC | #7
On Thu, 14 Sep 2023 07:02:52 +0300
Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:

> On 9/13/23 10:48, Boris Brezillon wrote:
> > On Wed, 13 Sep 2023 03:56:14 +0300
> > Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
> >   
> >> On 9/5/23 11:03, Boris Brezillon wrote:  
> >>>>                * But
> >>>> +		 * acquiring the obj lock in drm_gem_shmem_release_pages_locked() can
> >>>> +		 * cause a locking order inversion between reservation_ww_class_mutex
> >>>> +		 * and fs_reclaim.
> >>>> +		 *
> >>>> +		 * This deadlock is not actually possible, because no one should
> >>>> +		 * be already holding the lock when drm_gem_shmem_free() is called.
> >>>> +		 * Unfortunately lockdep is not aware of this detail.  So when the
> >>>> +		 * refcount drops to zero, don't touch the reservation lock.
> >>>> +		 */
> >>>> +		if (shmem->got_pages_sgt &&
> >>>> +		    refcount_dec_and_test(&shmem->pages_use_count)) {
> >>>> +			drm_gem_shmem_do_release_pages_locked(shmem);
> >>>> +			shmem->got_pages_sgt = false;
> >>>>  		}    
> >>> Leaking memory is the right thing to do if pages_use_count > 1 (it's
> >>> better to leak than having someone access memory it no longer owns), but
> >>> I think it's worth mentioning in the above comment.    
> >>
> >> It's unlikely that it will be only a leak without a following up
> >> use-after-free. Neither is acceptable.  
> > 
> > Not necessarily, if you have a page leak, it could be that the GPU has
> > access to those pages, but doesn't need the GEM object anymore
> > (pages are mapped by the iommu, which doesn't need shmem->sgt or
> > shmem->pages after the mapping is created). Without a WARN_ON(), this
> > can go unnoticed and lead to memory corruptions/information leaks.
> >   
> >>
> >> The drm_gem_shmem_free() could be changed such that kernel won't blow up
> >> on a refcnt bug, but that's not worthwhile doing because drivers
> >> shouldn't have silly bugs.  
> > 
> > We definitely don't want to fix that, but we want to complain loudly
> > (WARN_ON()), and make sure the risk is limited (preventing memory from
> > being re-assigned to someone else by not freeing it).  
> 
> That's what the code did and continues to do here. Not exactly sure what
> you're trying to say. I'm going to relocate the comment in v17 to
> put_pages(), we can continue discussing it there if I'm missing yours point.
> 

I'm just saying it would be worth mentioning that we're intentionally
leaking memory if shmem->pages_use_count > 1. Something like:

	/**
	 * shmem->pages_use_count should be 1 when ->sgt != NULL and
	 * zero otherwise. If some users still hold a pages reference
	 * that's a bug, and we intentionally leak the pages so they
	 * can't be re-allocated to someone else while the GPU/CPU
	 * still have access to it.
	 */
	drm_WARN_ON(drm,
		    refcount_read(&shmem->pages_use_count) == (shmem->sgt ? 1 : 0));
	if (shmem->sgt && refcount_dec_and_test(&shmem->pages_use_count))
		drm_gem_shmem_free_pages(shmem);
Dmitry Osipenko Sept. 14, 2023, 7:50 a.m. UTC | #8
On 9/14/23 10:36, Boris Brezillon wrote:
> On Thu, 14 Sep 2023 07:02:52 +0300
> Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
> 
>> On 9/13/23 10:48, Boris Brezillon wrote:
>>> On Wed, 13 Sep 2023 03:56:14 +0300
>>> Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
>>>   
>>>> On 9/5/23 11:03, Boris Brezillon wrote:  
>>>>>>                * But
>>>>>> +		 * acquiring the obj lock in drm_gem_shmem_release_pages_locked() can
>>>>>> +		 * cause a locking order inversion between reservation_ww_class_mutex
>>>>>> +		 * and fs_reclaim.
>>>>>> +		 *
>>>>>> +		 * This deadlock is not actually possible, because no one should
>>>>>> +		 * be already holding the lock when drm_gem_shmem_free() is called.
>>>>>> +		 * Unfortunately lockdep is not aware of this detail.  So when the
>>>>>> +		 * refcount drops to zero, don't touch the reservation lock.
>>>>>> +		 */
>>>>>> +		if (shmem->got_pages_sgt &&
>>>>>> +		    refcount_dec_and_test(&shmem->pages_use_count)) {
>>>>>> +			drm_gem_shmem_do_release_pages_locked(shmem);
>>>>>> +			shmem->got_pages_sgt = false;
>>>>>>  		}    
>>>>> Leaking memory is the right thing to do if pages_use_count > 1 (it's
>>>>> better to leak than having someone access memory it no longer owns), but
>>>>> I think it's worth mentioning in the above comment.    
>>>>
>>>> It's unlikely that it will be only a leak without a following up
>>>> use-after-free. Neither is acceptable.  
>>>
>>> Not necessarily, if you have a page leak, it could be that the GPU has
>>> access to those pages, but doesn't need the GEM object anymore
>>> (pages are mapped by the iommu, which doesn't need shmem->sgt or
>>> shmem->pages after the mapping is created). Without a WARN_ON(), this
>>> can go unnoticed and lead to memory corruptions/information leaks.
>>>   
>>>>
>>>> The drm_gem_shmem_free() could be changed such that kernel won't blow up
>>>> on a refcnt bug, but that's not worthwhile doing because drivers
>>>> shouldn't have silly bugs.  
>>>
>>> We definitely don't want to fix that, but we want to complain loudly
>>> (WARN_ON()), and make sure the risk is limited (preventing memory from
>>> being re-assigned to someone else by not freeing it).  
>>
>> That's what the code did and continues to do here. Not exactly sure what
>> you're trying to say. I'm going to relocate the comment in v17 to
>> put_pages(), we can continue discussing it there if I'm missing yours point.
>>
> 
> I'm just saying it would be worth mentioning that we're intentionally
> leaking memory if shmem->pages_use_count > 1. Something like:
> 
> 	/**
> 	 * shmem->pages_use_count should be 1 when ->sgt != NULL and
> 	 * zero otherwise. If some users still hold a pages reference
> 	 * that's a bug, and we intentionally leak the pages so they
> 	 * can't be re-allocated to someone else while the GPU/CPU
> 	 * still have access to it.
> 	 */
> 	drm_WARN_ON(drm,
> 		    refcount_read(&shmem->pages_use_count) == (shmem->sgt ? 1 : 0));
> 	if (shmem->sgt && refcount_dec_and_test(&shmem->pages_use_count))
> 		drm_gem_shmem_free_pages(shmem);

That may be acceptable, but only once there is a driver using this
feature.
Boris Brezillon Sept. 14, 2023, 8:27 a.m. UTC | #9
On Thu, 14 Sep 2023 10:50:32 +0300
Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:

> On 9/14/23 10:36, Boris Brezillon wrote:
> > On Thu, 14 Sep 2023 07:02:52 +0300
> > Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
> >   
> >> On 9/13/23 10:48, Boris Brezillon wrote:  
> >>> On Wed, 13 Sep 2023 03:56:14 +0300
> >>> Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
> >>>     
> >>>> On 9/5/23 11:03, Boris Brezillon wrote:    
> >>>>>>                * But
> >>>>>> +		 * acquiring the obj lock in drm_gem_shmem_release_pages_locked() can
> >>>>>> +		 * cause a locking order inversion between reservation_ww_class_mutex
> >>>>>> +		 * and fs_reclaim.
> >>>>>> +		 *
> >>>>>> +		 * This deadlock is not actually possible, because no one should
> >>>>>> +		 * be already holding the lock when drm_gem_shmem_free() is called.
> >>>>>> +		 * Unfortunately lockdep is not aware of this detail.  So when the
> >>>>>> +		 * refcount drops to zero, don't touch the reservation lock.
> >>>>>> +		 */
> >>>>>> +		if (shmem->got_pages_sgt &&
> >>>>>> +		    refcount_dec_and_test(&shmem->pages_use_count)) {
> >>>>>> +			drm_gem_shmem_do_release_pages_locked(shmem);
> >>>>>> +			shmem->got_pages_sgt = false;
> >>>>>>  		}      
> >>>>> Leaking memory is the right thing to do if pages_use_count > 1 (it's
> >>>>> better to leak than having someone access memory it no longer owns), but
> >>>>> I think it's worth mentioning in the above comment.      
> >>>>
> >>>> It's unlikely that it will be only a leak without a following up
> >>>> use-after-free. Neither is acceptable.    
> >>>
> >>> Not necessarily, if you have a page leak, it could be that the GPU has
> >>> access to those pages, but doesn't need the GEM object anymore
> >>> (pages are mapped by the iommu, which doesn't need shmem->sgt or
> >>> shmem->pages after the mapping is created). Without a WARN_ON(), this
> >>> can go unnoticed and lead to memory corruptions/information leaks.
> >>>     
> >>>>
> >>>> The drm_gem_shmem_free() could be changed such that kernel won't blow up
> >>>> on a refcnt bug, but that's not worthwhile doing because drivers
> >>>> shouldn't have silly bugs.    
> >>>
> >>> We definitely don't want to fix that, but we want to complain loudly
> >>> (WARN_ON()), and make sure the risk is limited (preventing memory from
> >>> being re-assigned to someone else by not freeing it).    
> >>
> >> That's what the code did and continues to do here. Not exactly sure what
> >> you're trying to say. I'm going to relocate the comment in v17 to
> >> put_pages(), we can continue discussing it there if I'm missing yours point.
> >>  
> > 
> > I'm just saying it would be worth mentioning that we're intentionally
> > leaking memory if shmem->pages_use_count > 1. Something like:
> > 
> > 	/**
> > 	 * shmem->pages_use_count should be 1 when ->sgt != NULL and
> > 	 * zero otherwise. If some users still hold a pages reference
> > 	 * that's a bug, and we intentionally leak the pages so they
> > 	 * can't be re-allocated to someone else while the GPU/CPU
> > 	 * still have access to it.
> > 	 */
> > 	drm_WARN_ON(drm,
> > 		    refcount_read(&shmem->pages_use_count) == (shmem->sgt ? 1 : 0));
> > 	if (shmem->sgt && refcount_dec_and_test(&shmem->pages_use_count))
> > 		drm_gem_shmem_free_pages(shmem);  
> 
> That may be acceptable, but only once there will a driver using this
> feature.

Which feature? That's not related to a specific feature; that's just
how drm_gem_shmem_get_pages_sgt() works: it takes a pages ref that can
only be released in drm_gem_shmem_free(), because sgt users are not
refcounted and the sgt stays around until the GEM object is freed or
its pages are evicted. The only valid cases we have at the moment are:

- pages_use_count == 1 && sgt != NULL
- pages_use_count == 0

any other situations are buggy.
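
For illustration, that invariant could be written as a debug check along
these lines (just a restatement of the two valid states above, assuming
obj is the &shmem->base pointer already available in drm_gem_shmem_free(),
and with drm_gem_shmem_free_pages() standing in for whatever release
helper the final version ends up using):

        /*
         * At free time either pages_use_count == 1 and an sgt exists, or
         * pages_use_count == 0. Anything else is a refcounting bug: warn
         * and intentionally leak the pages so they can't be handed out
         * again while the GPU/CPU may still be able to access them.
         */
        drm_WARN_ON(obj->dev,
                    refcount_read(&shmem->pages_use_count) !=
                    (shmem->sgt ? 1 : 0));

        if (shmem->sgt && refcount_dec_and_test(&shmem->pages_use_count))
                drm_gem_shmem_free_pages(shmem);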
Dmitry Osipenko Sept. 14, 2023, 11:36 a.m. UTC | #10
On 9/14/23 11:27, Boris Brezillon wrote:
> On Thu, 14 Sep 2023 10:50:32 +0300
> Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
> 
>> On 9/14/23 10:36, Boris Brezillon wrote:
>>> On Thu, 14 Sep 2023 07:02:52 +0300
>>> Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
>>>   
>>>> On 9/13/23 10:48, Boris Brezillon wrote:  
>>>>> On Wed, 13 Sep 2023 03:56:14 +0300
>>>>> Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
>>>>>     
>>>>>> On 9/5/23 11:03, Boris Brezillon wrote:    
>>>>>>>>                * But
>>>>>>>> +		 * acquiring the obj lock in drm_gem_shmem_release_pages_locked() can
>>>>>>>> +		 * cause a locking order inversion between reservation_ww_class_mutex
>>>>>>>> +		 * and fs_reclaim.
>>>>>>>> +		 *
>>>>>>>> +		 * This deadlock is not actually possible, because no one should
>>>>>>>> +		 * be already holding the lock when drm_gem_shmem_free() is called.
>>>>>>>> +		 * Unfortunately lockdep is not aware of this detail.  So when the
>>>>>>>> +		 * refcount drops to zero, don't touch the reservation lock.
>>>>>>>> +		 */
>>>>>>>> +		if (shmem->got_pages_sgt &&
>>>>>>>> +		    refcount_dec_and_test(&shmem->pages_use_count)) {
>>>>>>>> +			drm_gem_shmem_do_release_pages_locked(shmem);
>>>>>>>> +			shmem->got_pages_sgt = false;
>>>>>>>>  		}      
>>>>>>> Leaking memory is the right thing to do if pages_use_count > 1 (it's
>>>>>>> better to leak than having someone access memory it no longer owns), but
>>>>>>> I think it's worth mentioning in the above comment.      
>>>>>>
>>>>>> It's unlikely that it will be only a leak without a following up
>>>>>> use-after-free. Neither is acceptable.    
>>>>>
>>>>> Not necessarily, if you have a page leak, it could be that the GPU has
>>>>> access to those pages, but doesn't need the GEM object anymore
>>>>> (pages are mapped by the iommu, which doesn't need shmem->sgt or
>>>>> shmem->pages after the mapping is created). Without a WARN_ON(), this
>>>>> can go unnoticed and lead to memory corruptions/information leaks.
>>>>>     
>>>>>>
>>>>>> The drm_gem_shmem_free() could be changed such that kernel won't blow up
>>>>>> on a refcnt bug, but that's not worthwhile doing because drivers
>>>>>> shouldn't have silly bugs.    
>>>>>
>>>>> We definitely don't want to fix that, but we want to complain loudly
>>>>> (WARN_ON()), and make sure the risk is limited (preventing memory from
>>>>> being re-assigned to someone else by not freeing it).    
>>>>
>>>> That's what the code did and continues to do here. Not exactly sure what
>>>> you're trying to say. I'm going to relocate the comment in v17 to
>>>> put_pages(), we can continue discussing it there if I'm missing yours point.
>>>>  
>>>
>>> I'm just saying it would be worth mentioning that we're intentionally
>>> leaking memory if shmem->pages_use_count > 1. Something like:
>>>
>>> 	/**
>>> 	 * shmem->pages_use_count should be 1 when ->sgt != NULL and
>>> 	 * zero otherwise. If some users still hold a pages reference
>>> 	 * that's a bug, and we intentionally leak the pages so they
>>> 	 * can't be re-allocated to someone else while the GPU/CPU
>>> 	 * still have access to it.
>>> 	 */
>>> 	drm_WARN_ON(drm,
>>> 		    refcount_read(&shmem->pages_use_count) == (shmem->sgt ? 1 : 0));
>>> 	if (shmem->sgt && refcount_dec_and_test(&shmem->pages_use_count))
>>> 		drm_gem_shmem_free_pages(shmem);  
>>
>> That may be acceptable, but only once there will a driver using this
>> feature.
> 
> Which feature? That's not related to a specific feature, that's just
> how drm_gem_shmem_get_pages_sgt() works, it takes a pages ref that can
> only be released in drm_gem_shmem_free(), because sgt users are not
> refcounted and the sgt stays around until the GEM object is freed or
> its pages are evicted. The only valid cases we have at the moment are:
> 
> - pages_use_count == 1 && sgt != NULL
> - pages_use_count == 0
> 
> any other situations are buggy.

The sgt may belong to a dma-buf for which pages_use_count=0; this can't be
done until the sgt mess is sorted out.
Boris Brezillon Sept. 14, 2023, 11:58 a.m. UTC | #11
On Thu, 14 Sep 2023 14:36:23 +0300
Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:

> On 9/14/23 11:27, Boris Brezillon wrote:
> > On Thu, 14 Sep 2023 10:50:32 +0300
> > Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
> >   
> >> On 9/14/23 10:36, Boris Brezillon wrote:  
> >>> On Thu, 14 Sep 2023 07:02:52 +0300
> >>> Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
> >>>     
> >>>> On 9/13/23 10:48, Boris Brezillon wrote:    
> >>>>> On Wed, 13 Sep 2023 03:56:14 +0300
> >>>>> Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
> >>>>>       
> >>>>>> On 9/5/23 11:03, Boris Brezillon wrote:      
> >>>>>>>>                * But
> >>>>>>>> +		 * acquiring the obj lock in drm_gem_shmem_release_pages_locked() can
> >>>>>>>> +		 * cause a locking order inversion between reservation_ww_class_mutex
> >>>>>>>> +		 * and fs_reclaim.
> >>>>>>>> +		 *
> >>>>>>>> +		 * This deadlock is not actually possible, because no one should
> >>>>>>>> +		 * be already holding the lock when drm_gem_shmem_free() is called.
> >>>>>>>> +		 * Unfortunately lockdep is not aware of this detail.  So when the
> >>>>>>>> +		 * refcount drops to zero, don't touch the reservation lock.
> >>>>>>>> +		 */
> >>>>>>>> +		if (shmem->got_pages_sgt &&
> >>>>>>>> +		    refcount_dec_and_test(&shmem->pages_use_count)) {
> >>>>>>>> +			drm_gem_shmem_do_release_pages_locked(shmem);
> >>>>>>>> +			shmem->got_pages_sgt = false;
> >>>>>>>>  		}        
> >>>>>>> Leaking memory is the right thing to do if pages_use_count > 1 (it's
> >>>>>>> better to leak than having someone access memory it no longer owns), but
> >>>>>>> I think it's worth mentioning in the above comment.        
> >>>>>>
> >>>>>> It's unlikely that it will be only a leak without a following up
> >>>>>> use-after-free. Neither is acceptable.      
> >>>>>
> >>>>> Not necessarily, if you have a page leak, it could be that the GPU has
> >>>>> access to those pages, but doesn't need the GEM object anymore
> >>>>> (pages are mapped by the iommu, which doesn't need shmem->sgt or
> >>>>> shmem->pages after the mapping is created). Without a WARN_ON(), this
> >>>>> can go unnoticed and lead to memory corruptions/information leaks.
> >>>>>       
> >>>>>>
> >>>>>> The drm_gem_shmem_free() could be changed such that kernel won't blow up
> >>>>>> on a refcnt bug, but that's not worthwhile doing because drivers
> >>>>>> shouldn't have silly bugs.      
> >>>>>
> >>>>> We definitely don't want to fix that, but we want to complain loudly
> >>>>> (WARN_ON()), and make sure the risk is limited (preventing memory from
> >>>>> being re-assigned to someone else by not freeing it).      
> >>>>
> >>>> That's what the code did and continues to do here. Not exactly sure what
> >>>> you're trying to say. I'm going to relocate the comment in v17 to
> >>>> put_pages(), we can continue discussing it there if I'm missing yours point.
> >>>>    
> >>>
> >>> I'm just saying it would be worth mentioning that we're intentionally
> >>> leaking memory if shmem->pages_use_count > 1. Something like:
> >>>
> >>> 	/**
> >>> 	 * shmem->pages_use_count should be 1 when ->sgt != NULL and
> >>> 	 * zero otherwise. If some users still hold a pages reference
> >>> 	 * that's a bug, and we intentionally leak the pages so they
> >>> 	 * can't be re-allocated to someone else while the GPU/CPU
> >>> 	 * still have access to it.
> >>> 	 */
> >>> 	drm_WARN_ON(drm,
> >>> 		    refcount_read(&shmem->pages_use_count) == (shmem->sgt ? 1 : 0));
> >>> 	if (shmem->sgt && refcount_dec_and_test(&shmem->pages_use_count))
> >>> 		drm_gem_shmem_free_pages(shmem);    
> >>
> >> That may be acceptable, but only once there will a driver using this
> >> feature.  
> > 
> > Which feature? That's not related to a specific feature, that's just
> > how drm_gem_shmem_get_pages_sgt() works, it takes a pages ref that can
> > only be released in drm_gem_shmem_free(), because sgt users are not
> > refcounted and the sgt stays around until the GEM object is freed or
> > its pages are evicted. The only valid cases we have at the moment are:
> > 
> > - pages_use_count == 1 && sgt != NULL
> > - pages_use_count == 0
> > 
> > any other situations are buggy.  
> 
> sgt may belong to dma-buf for which pages_use_count=0, this can't be
> done until sgt mess is sorted out

No it can't, not in that path, because the code you're adding is in the
if (!obj->import_attach) branch:


 	if (obj->import_attach) {
 		drm_prime_gem_destroy(obj, shmem->sgt);
 	} else {
		...
		// Your changes are here.
		...
	}
>
Dmitry Osipenko Sept. 14, 2023, 1:01 p.m. UTC | #12
On 9/14/23 14:58, Boris Brezillon wrote:
> On Thu, 14 Sep 2023 14:36:23 +0300
> Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
> 
>> On 9/14/23 11:27, Boris Brezillon wrote:
>>> On Thu, 14 Sep 2023 10:50:32 +0300
>>> Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
>>>   
>>>> On 9/14/23 10:36, Boris Brezillon wrote:  
>>>>> On Thu, 14 Sep 2023 07:02:52 +0300
>>>>> Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
>>>>>     
>>>>>> On 9/13/23 10:48, Boris Brezillon wrote:    
>>>>>>> On Wed, 13 Sep 2023 03:56:14 +0300
>>>>>>> Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
>>>>>>>       
>>>>>>>> On 9/5/23 11:03, Boris Brezillon wrote:      
>>>>>>>>>>                * But
>>>>>>>>>> +		 * acquiring the obj lock in drm_gem_shmem_release_pages_locked() can
>>>>>>>>>> +		 * cause a locking order inversion between reservation_ww_class_mutex
>>>>>>>>>> +		 * and fs_reclaim.
>>>>>>>>>> +		 *
>>>>>>>>>> +		 * This deadlock is not actually possible, because no one should
>>>>>>>>>> +		 * be already holding the lock when drm_gem_shmem_free() is called.
>>>>>>>>>> +		 * Unfortunately lockdep is not aware of this detail.  So when the
>>>>>>>>>> +		 * refcount drops to zero, don't touch the reservation lock.
>>>>>>>>>> +		 */
>>>>>>>>>> +		if (shmem->got_pages_sgt &&
>>>>>>>>>> +		    refcount_dec_and_test(&shmem->pages_use_count)) {
>>>>>>>>>> +			drm_gem_shmem_do_release_pages_locked(shmem);
>>>>>>>>>> +			shmem->got_pages_sgt = false;
>>>>>>>>>>  		}        
>>>>>>>>> Leaking memory is the right thing to do if pages_use_count > 1 (it's
>>>>>>>>> better to leak than having someone access memory it no longer owns), but
>>>>>>>>> I think it's worth mentioning in the above comment.        
>>>>>>>>
>>>>>>>> It's unlikely that it will be only a leak without a following up
>>>>>>>> use-after-free. Neither is acceptable.      
>>>>>>>
>>>>>>> Not necessarily, if you have a page leak, it could be that the GPU has
>>>>>>> access to those pages, but doesn't need the GEM object anymore
>>>>>>> (pages are mapped by the iommu, which doesn't need shmem->sgt or
>>>>>>> shmem->pages after the mapping is created). Without a WARN_ON(), this
>>>>>>> can go unnoticed and lead to memory corruptions/information leaks.
>>>>>>>       
>>>>>>>>
>>>>>>>> The drm_gem_shmem_free() could be changed such that kernel won't blow up
>>>>>>>> on a refcnt bug, but that's not worthwhile doing because drivers
>>>>>>>> shouldn't have silly bugs.      
>>>>>>>
>>>>>>> We definitely don't want to fix that, but we want to complain loudly
>>>>>>> (WARN_ON()), and make sure the risk is limited (preventing memory from
>>>>>>> being re-assigned to someone else by not freeing it).      
>>>>>>
>>>>>> That's what the code did and continues to do here. Not exactly sure what
>>>>>> you're trying to say. I'm going to relocate the comment in v17 to
>>>>>> put_pages(), we can continue discussing it there if I'm missing yours point.
>>>>>>    
>>>>>
>>>>> I'm just saying it would be worth mentioning that we're intentionally
>>>>> leaking memory if shmem->pages_use_count > 1. Something like:
>>>>>
>>>>> 	/**
>>>>> 	 * shmem->pages_use_count should be 1 when ->sgt != NULL and
>>>>> 	 * zero otherwise. If some users still hold a pages reference
>>>>> 	 * that's a bug, and we intentionally leak the pages so they
>>>>> 	 * can't be re-allocated to someone else while the GPU/CPU
>>>>> 	 * still have access to it.
>>>>> 	 */
>>>>> 	drm_WARN_ON(drm,
>>>>> 		    refcount_read(&shmem->pages_use_count) == (shmem->sgt ? 1 : 0));
>>>>> 	if (shmem->sgt && refcount_dec_and_test(&shmem->pages_use_count))
>>>>> 		drm_gem_shmem_free_pages(shmem);    
>>>>
>>>> That may be acceptable, but only once there will a driver using this
>>>> feature.  
>>>
>>> Which feature? That's not related to a specific feature, that's just
>>> how drm_gem_shmem_get_pages_sgt() works, it takes a pages ref that can
>>> only be released in drm_gem_shmem_free(), because sgt users are not
>>> refcounted and the sgt stays around until the GEM object is freed or
>>> its pages are evicted. The only valid cases we have at the moment are:
>>>
>>> - pages_use_count == 1 && sgt != NULL
>>> - pages_use_count == 0
>>>
>>> any other situations are buggy.  
>>
>> sgt may belong to dma-buf for which pages_use_count=0, this can't be
>> done until sgt mess is sorted out
> 
> No it can't, not in that path, because the code you're adding is in the
> if (!obj->import_branch) branch:
> 
> 
>  	if (obj->import_attach) {
>  		drm_prime_gem_destroy(obj, shmem->sgt);
>  	} else {
> 		...
> 		// Your changes are here.
> 		...

This branch is taken for the dma-buf in the prime import error code path. But yes, pages_use_count is 0 for the dma-buf, so it can be written as:

	if (obj->import_attach) {
		drm_prime_gem_destroy(obj, shmem->sgt);
	} else {
		drm_WARN_ON(obj->dev, refcount_read(&shmem->vmap_use_count));

		if (shmem->sgt && refcount_read(&shmem->pages_use_count)) {
			dma_unmap_sgtable(obj->dev->dev, shmem->sgt,
					  DMA_BIDIRECTIONAL, 0);
			sg_free_table(shmem->sgt);
			kfree(shmem->sgt);

			__drm_gem_shmem_put_pages(shmem);
		}

		drm_WARN_ON(obj->dev, refcount_read(&shmem->pages_use_count));

Alright, I'll check if it works as expected for fixing the error code path bug for v17
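
For reference, the same else branch spelled out with its closing braces; this is only a sketch, and it assumes __drm_gem_shmem_put_pages() drops the pages reference that was taken when the sgt was created:

	if (obj->import_attach) {
		drm_prime_gem_destroy(obj, shmem->sgt);
	} else {
		drm_WARN_ON(obj->dev, refcount_read(&shmem->vmap_use_count));

		if (shmem->sgt && refcount_read(&shmem->pages_use_count)) {
			/* detach the pages from the device and drop the sgt */
			dma_unmap_sgtable(obj->dev->dev, shmem->sgt,
					  DMA_BIDIRECTIONAL, 0);
			sg_free_table(shmem->sgt);
			kfree(shmem->sgt);

			/* releases the pages reference held for the sgt */
			__drm_gem_shmem_put_pages(shmem);
		}

		/* anything still referenced here is a driver refcounting bug */
		drm_WARN_ON(obj->dev, refcount_read(&shmem->pages_use_count));
	}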
Boris Brezillon Sept. 14, 2023, 1:27 p.m. UTC | #13
On Thu, 14 Sep 2023 16:01:37 +0300
Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:

> On 9/14/23 14:58, Boris Brezillon wrote:
> > On Thu, 14 Sep 2023 14:36:23 +0300
> > Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
> >   
> >> On 9/14/23 11:27, Boris Brezillon wrote:  
> >>> On Thu, 14 Sep 2023 10:50:32 +0300
> >>> Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
> >>>     
> >>>> On 9/14/23 10:36, Boris Brezillon wrote:    
> >>>>> On Thu, 14 Sep 2023 07:02:52 +0300
> >>>>> Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
> >>>>>       
> >>>>>> On 9/13/23 10:48, Boris Brezillon wrote:      
> >>>>>>> On Wed, 13 Sep 2023 03:56:14 +0300
> >>>>>>> Dmitry Osipenko <dmitry.osipenko@collabora.com> wrote:
> >>>>>>>         
> >>>>>>>> On 9/5/23 11:03, Boris Brezillon wrote:        
> >>>>>>>>>>                * But
> >>>>>>>>>> +		 * acquiring the obj lock in drm_gem_shmem_release_pages_locked() can
> >>>>>>>>>> +		 * cause a locking order inversion between reservation_ww_class_mutex
> >>>>>>>>>> +		 * and fs_reclaim.
> >>>>>>>>>> +		 *
> >>>>>>>>>> +		 * This deadlock is not actually possible, because no one should
> >>>>>>>>>> +		 * be already holding the lock when drm_gem_shmem_free() is called.
> >>>>>>>>>> +		 * Unfortunately lockdep is not aware of this detail.  So when the
> >>>>>>>>>> +		 * refcount drops to zero, don't touch the reservation lock.
> >>>>>>>>>> +		 */
> >>>>>>>>>> +		if (shmem->got_pages_sgt &&
> >>>>>>>>>> +		    refcount_dec_and_test(&shmem->pages_use_count)) {
> >>>>>>>>>> +			drm_gem_shmem_do_release_pages_locked(shmem);
> >>>>>>>>>> +			shmem->got_pages_sgt = false;
> >>>>>>>>>>  		}          
> >>>>>>>>> Leaking memory is the right thing to do if pages_use_count > 1 (it's
> >>>>>>>>> better to leak than having someone access memory it no longer owns), but
> >>>>>>>>> I think it's worth mentioning in the above comment.          
> >>>>>>>>
> >>>>>>>> It's unlikely that it will be only a leak without a following up
> >>>>>>>> use-after-free. Neither is acceptable.        
> >>>>>>>
> >>>>>>> Not necessarily, if you have a page leak, it could be that the GPU has
> >>>>>>> access to those pages, but doesn't need the GEM object anymore
> >>>>>>> (pages are mapped by the iommu, which doesn't need shmem->sgt or
> >>>>>>> shmem->pages after the mapping is created). Without a WARN_ON(), this
> >>>>>>> can go unnoticed and lead to memory corruptions/information leaks.
> >>>>>>>         
> >>>>>>>>
> >>>>>>>> The drm_gem_shmem_free() could be changed such that kernel won't blow up
> >>>>>>>> on a refcnt bug, but that's not worthwhile doing because drivers
> >>>>>>>> shouldn't have silly bugs.        
> >>>>>>>
> >>>>>>> We definitely don't want to fix that, but we want to complain loudly
> >>>>>>> (WARN_ON()), and make sure the risk is limited (preventing memory from
> >>>>>>> being re-assigned to someone else by not freeing it).        
> >>>>>>
> >>>>>> That's what the code did and continues to do here. Not exactly sure what
> >>>>>> you're trying to say. I'm going to relocate the comment in v17 to
> >>>>>> put_pages(), we can continue discussing it there if I'm missing yours point.
> >>>>>>      
> >>>>>
> >>>>> I'm just saying it would be worth mentioning that we're intentionally
> >>>>> leaking memory if shmem->pages_use_count > 1. Something like:
> >>>>>
> >>>>> 	/**
> >>>>> 	 * shmem->pages_use_count should be 1 when ->sgt != NULL and
> >>>>> 	 * zero otherwise. If some users still hold a pages reference
> >>>>> 	 * that's a bug, and we intentionally leak the pages so they
> >>>>> 	 * can't be re-allocated to someone else while the GPU/CPU
> >>>>> 	 * still have access to it.
> >>>>> 	 */
> >>>>> 	drm_WARN_ON(drm,
> >>>>> 		    refcount_read(&shmem->pages_use_count) == (shmem->sgt ? 1 : 0));
> >>>>> 	if (shmem->sgt && refcount_dec_and_test(&shmem->pages_use_count))
> >>>>> 		drm_gem_shmem_free_pages(shmem);      
> >>>>
> >>>> That may be acceptable, but only once there will a driver using this
> >>>> feature.    
> >>>
> >>> Which feature? That's not related to a specific feature, that's just
> >>> how drm_gem_shmem_get_pages_sgt() works, it takes a pages ref that can
> >>> only be released in drm_gem_shmem_free(), because sgt users are not
> >>> refcounted and the sgt stays around until the GEM object is freed or
> >>> its pages are evicted. The only valid cases we have at the moment are:
> >>>
> >>> - pages_use_count == 1 && sgt != NULL
> >>> - pages_use_count == 0
> >>>
> >>> any other situations are buggy.    
> >>
> >> sgt may belong to dma-buf for which pages_use_count=0, this can't be
> >> done until sgt mess is sorted out  
> > 
> > No it can't, not in that path, because the code you're adding is in the
> > if (!obj->import_branch) branch:
> > 
> > 
> >  	if (obj->import_attach) {
> >  		drm_prime_gem_destroy(obj, shmem->sgt);
> >  	} else {
> > 		...
> > 		// Your changes are here.
> > 		...  
> 
> This branch is taken for the dma-buf in the prime import error code path.

I suggested a fix for this error that didn't involve adding a new flag,
but that's orthogonal to the piece of code we're discussing anyway.

> But yes, the pages_use_count=0 for the dma-buf and then it can be
> written as:
> 
> 	if (obj->import_attach) {
> 		drm_prime_gem_destroy(obj, shmem->sgt);
> 	} else {
> 		drm_WARN_ON(obj->dev, refcount_read(&shmem->vmap_use_count));
> 
> 		if (shmem->sgt && refcount_read(&shmem->pages_use_count)) {

You should drop the '&& refcount_read(&shmem->pages_use_count)',
otherwise you'll never enter this branch (sgt allocation retained
a ref, so pages_use_count > 0 when ->sgt != NULL).

If you added this pages_use_count > 0 check to deal with the
'free-partially-imported-GEM' case, I keep thinking this is not
the right fix. You should just assume that obj->import_attach == NULL
means not-a-prime-buffer, and then make sure
partially-initialized-prime-GEMs have import_attach assigned (see the
oneliner I suggested in my review of
`[PATCH v15 01/23] drm/shmem-helper: Fix UAF in error path when
freeing SGT of imported GEM`).

> 			dma_unmap_sgtable(obj->dev->dev, shmem->sgt,
> 					  DMA_BIDIRECTIONAL, 0);
> 			sg_free_table(shmem->sgt);
> 			kfree(shmem->sgt);
> 
> 			__drm_gem_shmem_put_pages(shmem);

You need to decrement pages_use_count:

			/* shmem->pages_use_count should be 1 when ->sgt != NULL and
			 * zero otherwise. If some users still hold a pages reference
			 * that's a bug, and we intentionally leak the pages so they
			 * can't be re-allocated to someone else while the GPU/CPU
			 * still have access to it.
		 	 */
			if (refcount_dec_and_test(&shmem->pages_use_count))
				__drm_gem_shmem_put_pages(shmem);

> 		}
> 
> 		drm_WARN_ON(obj->dev, refcount_read(&shmem->pages_use_count));

And now this WARN_ON() ^ should catch unexpected page leaks.

> 
> Alright, I'll check if it works as expected for fixing the error code path bug for v17
>
Boris Brezillon Sept. 14, 2023, 1:30 p.m. UTC | #14
On Thu, 14 Sep 2023 15:27:03 +0200
Boris Brezillon <boris.brezillon@collabora.com> wrote:

> You should drop the '&& refcount_read(&shmem->pages_use_count)',
> otherwise you'll never enter this branch (sgt allocation retained
> a ref, so pages_use_count > 0 when ->sgt != NULL).

Sorry for the brain fart. You can drop this extra test because it's
redundant (->sgt != NULL implies pages_use_count > 0), but it shouldn't
prevent you from entering the branch.
Dmitry Osipenko Sept. 14, 2023, 1:58 p.m. UTC | #15
On 9/14/23 16:27, Boris Brezillon wrote:
...
> If you added this pages_use_count > 0 check to deal with the
> 'free-partially-imported-GEM' case, I keep thinking this is not
> the right fix. You should just assume that obj->import_attach == NULL
> means not-a-prime-buffer, and then make sure
> partially-initialized-prime-GEMs have import_attach assigned (see the
> oneliner I suggested in my review of
> `[PATCH v15 01/23] drm/shmem-helper: Fix UAF in error path when
> freeing SGT of imported GEM`).

Yes, I added it to deal with the partially imported GEM. The
obj->import_attach can't be set until the dma-buf is fully imported,
because setting it earlier would also cause trouble in the error code
path: the dma-buf would then be freed twice.

>> 			dma_unmap_sgtable(obj->dev->dev, shmem->sgt,
>> 					  DMA_BIDIRECTIONAL, 0);
>> 			sg_free_table(shmem->sgt);
>> 			kfree(shmem->sgt);
>>
>> 			__drm_gem_shmem_put_pages(shmem);
> You need to decrement pages_use_count:
> 
> 			/* shmem->pages_use_count should be 1 when ->sgt != NULL and
> 			 * zero otherwise. If some users still hold a pages reference
> 			 * that's a bug, and we intentionally leak the pages so they
> 			 * can't be re-allocated to someone else while the GPU/CPU
> 			 * still have access to it.
> 		 	 */
> 			if (refcount_dec_and_test(&shmem->pages_use_count))
> 				__drm_gem_shmem_put_pages(shmem);
> 

The put_pages() itself decrements the refcnt.

I'm going back to deferring all these questionable changes until later.
This is not an essential problem for this patchset.
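
For orientation, a minimal sketch of how a driver could hook into the new shrinker, based on the helpers added in the patch below. The foo_* names and the madvise uapi struct are purely illustrative assumptions; only the drm_gem_shmem_*() helpers come from the patch:

	/* GEM evict() callback: the shrinker calls this with the reservation
	 * lock held; the driver decides whether to purge or to evict.
	 */
	static int foo_gem_evict(struct drm_gem_object *obj)
	{
		struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj);

		/* a real driver would also tear down its GPU/MMU mappings here */

		if (drm_gem_shmem_is_purgeable(shmem))
			drm_gem_shmem_purge_locked(shmem);
		else
			drm_gem_shmem_evict_locked(shmem);

		return 0;
	}

	static const struct drm_gem_object_funcs foo_gem_funcs = {
		.free = drm_gem_shmem_object_free,
		.evict = foo_gem_evict,
		/* ... the usual shmem object funcs ... */
	};

	/* madvise ioctl: forwards the hint to the helper, which also updates
	 * the object's position on the shrinker LRUs.
	 */
	static int foo_ioctl_madvise(struct drm_device *dev, void *data,
				     struct drm_file *file)
	{
		struct foo_madvise_args *args = data;	/* hypothetical uapi */
		struct drm_gem_object *obj;
		int ret;

		obj = drm_gem_object_lookup(file, args->handle);
		if (!obj)
			return -ENOENT;

		ret = drm_gem_shmem_madvise(to_drm_gem_shmem_obj(obj), args->madv);
		if (ret >= 0) {
			args->retained = ret;
			ret = 0;
		}

		drm_gem_object_put(obj);

		return ret;
	}

	static int foo_device_init(struct drm_device *dev)
	{
		/* registers the drm-shmem shrinker; cleanup is drm-managed */
		return drmm_gem_shmem_init(dev);
	}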

Patch

diff --git a/drivers/gpu/drm/drm_gem_shmem_helper.c b/drivers/gpu/drm/drm_gem_shmem_helper.c
index 4633a418faba..a0a6c7e505c8 100644
--- a/drivers/gpu/drm/drm_gem_shmem_helper.c
+++ b/drivers/gpu/drm/drm_gem_shmem_helper.c
@@ -20,6 +20,7 @@ 
 #include <drm/drm_device.h>
 #include <drm/drm_drv.h>
 #include <drm/drm_gem_shmem_helper.h>
+#include <drm/drm_managed.h>
 #include <drm/drm_prime.h>
 #include <drm/drm_print.h>
 
@@ -88,8 +89,6 @@  __drm_gem_shmem_create(struct drm_device *dev, size_t size, bool private)
 	if (ret)
 		goto err_release;
 
-	INIT_LIST_HEAD(&shmem->madv_list);
-
 	if (!private) {
 		/*
 		 * Our buffers are kept pinned, so allocating them
@@ -128,6 +127,62 @@  struct drm_gem_shmem_object *drm_gem_shmem_create(struct drm_device *dev, size_t
 }
 EXPORT_SYMBOL_GPL(drm_gem_shmem_create);
 
+static bool drm_gem_shmem_is_evictable(struct drm_gem_shmem_object *shmem)
+{
+	dma_resv_assert_held(shmem->base.resv);
+
+	return (shmem->madv >= 0) && shmem->base.funcs->evict &&
+		refcount_read(&shmem->pages_use_count) &&
+		!refcount_read(&shmem->pages_pin_count) &&
+		!shmem->base.dma_buf && !shmem->base.import_attach &&
+		shmem->sgt && !shmem->evicted;
+}
+
+static void
+drm_gem_shmem_update_pages_state_locked(struct drm_gem_shmem_object *shmem)
+{
+	struct drm_gem_object *obj = &shmem->base;
+	struct drm_gem_shmem *shmem_mm = obj->dev->shmem_mm;
+	struct drm_gem_shmem_shrinker *shmem_shrinker = &shmem_mm->shrinker;
+
+	dma_resv_assert_held(shmem->base.resv);
+
+	if (!shmem_shrinker || obj->import_attach)
+		return;
+
+	if (shmem->madv < 0)
+		drm_gem_lru_remove(&shmem->base);
+	else if (drm_gem_shmem_is_evictable(shmem) || drm_gem_shmem_is_purgeable(shmem))
+		drm_gem_lru_move_tail(&shmem_shrinker->lru_evictable, &shmem->base);
+	else if (shmem->evicted)
+		drm_gem_lru_move_tail(&shmem_shrinker->lru_evicted, &shmem->base);
+	else if (!shmem->pages)
+		drm_gem_lru_remove(&shmem->base);
+	else
+		drm_gem_lru_move_tail(&shmem_shrinker->lru_pinned, &shmem->base);
+}
+
+static void
+drm_gem_shmem_do_release_pages_locked(struct drm_gem_shmem_object *shmem)
+{
+	struct drm_gem_object *obj = &shmem->base;
+
+	if (!shmem->pages) {
+		drm_WARN_ON(obj->dev, !shmem->evicted && shmem->madv >= 0);
+		return;
+	}
+
+#ifdef CONFIG_X86
+	if (shmem->map_wc)
+		set_pages_array_wb(shmem->pages, obj->size >> PAGE_SHIFT);
+#endif
+
+	drm_gem_put_pages(obj, shmem->pages,
+			  shmem->pages_mark_dirty_on_put,
+			  shmem->pages_mark_accessed_on_put);
+	shmem->pages = NULL;
+}
+
 /**
  * drm_gem_shmem_free - Free resources associated with a shmem GEM object
  * @shmem: shmem GEM object to free
@@ -142,8 +197,6 @@  void drm_gem_shmem_free(struct drm_gem_shmem_object *shmem)
 	if (obj->import_attach) {
 		drm_prime_gem_destroy(obj, shmem->sgt);
 	} else if (!shmem->imported_sgt) {
-		dma_resv_lock(shmem->base.resv, NULL);
-
 		drm_WARN_ON(obj->dev, refcount_read(&shmem->vmap_use_count));
 
 		if (shmem->sgt) {
@@ -152,14 +205,27 @@  void drm_gem_shmem_free(struct drm_gem_shmem_object *shmem)
 			sg_free_table(shmem->sgt);
 			kfree(shmem->sgt);
 		}
-		if (refcount_read(&shmem->pages_use_count)) {
-			drm_gem_shmem_put_pages_locked(shmem);
-			drm_WARN_ON(obj->dev, !shmem->got_pages_sgt);
+
+		/*
+		 * Destroying the object is a special case.. drm_gem_shmem_free()
+		 * calls many things that WARN_ON if the obj lock is not held.  But
+		 * acquiring the obj lock in drm_gem_shmem_release_pages_locked() can
+		 * cause a locking order inversion between reservation_ww_class_mutex
+		 * and fs_reclaim.
+		 *
+		 * This deadlock is not actually possible, because no one should
+		 * be already holding the lock when drm_gem_shmem_free() is called.
+		 * Unfortunately lockdep is not aware of this detail.  So when the
+		 * refcount drops to zero, don't touch the reservation lock.
+		 */
+		if (shmem->got_pages_sgt &&
+		    refcount_dec_and_test(&shmem->pages_use_count)) {
+			drm_gem_shmem_do_release_pages_locked(shmem);
+			shmem->got_pages_sgt = false;
 		}
 
 		drm_WARN_ON(obj->dev, refcount_read(&shmem->pages_use_count));
-
-		dma_resv_unlock(shmem->base.resv);
+		drm_WARN_ON(obj->dev, shmem->got_pages_sgt);
 	}
 
 	drm_gem_object_release(obj);
@@ -167,15 +233,26 @@  void drm_gem_shmem_free(struct drm_gem_shmem_object *shmem)
 }
 EXPORT_SYMBOL_GPL(drm_gem_shmem_free);
 
-static int drm_gem_shmem_get_pages_locked(struct drm_gem_shmem_object *shmem)
+static int
+drm_gem_shmem_acquire_pages(struct drm_gem_shmem_object *shmem, bool init)
 {
 	struct drm_gem_object *obj = &shmem->base;
 	struct page **pages;
 
 	dma_resv_assert_held(shmem->base.resv);
 
-	if (refcount_inc_not_zero(&shmem->pages_use_count))
+	if (shmem->madv < 0) {
+		drm_WARN_ON(obj->dev, shmem->pages);
+		return -ENOMEM;
+	}
+
+	if (shmem->pages) {
+		drm_WARN_ON(obj->dev, !shmem->evicted);
 		return 0;
+	}
+
+	if (drm_WARN_ON(obj->dev, !(init ^ refcount_read(&shmem->pages_use_count))))
+		return -EINVAL;
 
 	pages = drm_gem_get_pages(obj);
 	if (IS_ERR(pages)) {
@@ -196,8 +273,36 @@  static int drm_gem_shmem_get_pages_locked(struct drm_gem_shmem_object *shmem)
 
 	shmem->pages = pages;
 
+	return 0;
+}
+
+static void
+drm_gem_shmem_release_pages_locked(struct drm_gem_shmem_object *shmem)
+{
+	dma_resv_assert_held(shmem->base.resv);
+	drm_gem_shmem_do_release_pages_locked(shmem);
+}
+
+static int drm_gem_shmem_get_pages_locked(struct drm_gem_shmem_object *shmem)
+{
+	int err;
+
+	dma_resv_assert_held(shmem->base.resv);
+
+	if (shmem->madv < 0)
+		return -ENOMEM;
+
+	if (refcount_inc_not_zero(&shmem->pages_use_count))
+		return 0;
+
+	err = drm_gem_shmem_acquire_pages(shmem, true);
+	if (err)
+		return err;
+
 	refcount_set(&shmem->pages_use_count, 1);
 
+	drm_gem_shmem_update_pages_state_locked(shmem);
+
 	return 0;
 }
 
@@ -209,20 +314,11 @@  static int drm_gem_shmem_get_pages_locked(struct drm_gem_shmem_object *shmem)
  */
 void drm_gem_shmem_put_pages_locked(struct drm_gem_shmem_object *shmem)
 {
-	struct drm_gem_object *obj = &shmem->base;
-
 	dma_resv_assert_held(shmem->base.resv);
 
 	if (refcount_dec_and_test(&shmem->pages_use_count)) {
-#ifdef CONFIG_X86
-		if (shmem->map_wc)
-			set_pages_array_wb(shmem->pages, obj->size >> PAGE_SHIFT);
-#endif
-
-		drm_gem_put_pages(obj, shmem->pages,
-				  shmem->pages_mark_dirty_on_put,
-				  shmem->pages_mark_accessed_on_put);
-		shmem->pages = NULL;
+		drm_gem_shmem_release_pages_locked(shmem);
+		drm_gem_shmem_update_pages_state_locked(shmem);
 	}
 }
 EXPORT_SYMBOL_GPL(drm_gem_shmem_put_pages_locked);
@@ -251,8 +347,15 @@  static int drm_gem_shmem_pin_locked(struct drm_gem_shmem_object *shmem)
 		return 0;
 
 	ret = drm_gem_shmem_get_pages_locked(shmem);
-	if (!ret)
+	if (!ret) {
+		ret = drm_gem_shmem_swapin_locked(shmem);
+		if (ret) {
+			drm_gem_shmem_put_pages_locked(shmem);
+			return ret;
+		}
+
 		refcount_set(&shmem->pages_pin_count, 1);
+	}
 
 	return ret;
 }
@@ -448,29 +551,54 @@  int drm_gem_shmem_madvise_locked(struct drm_gem_shmem_object *shmem, int madv)
 
 	madv = shmem->madv;
 
+	drm_gem_shmem_update_pages_state_locked(shmem);
+
 	return (madv >= 0);
 }
 EXPORT_SYMBOL_GPL(drm_gem_shmem_madvise_locked);
 
-void drm_gem_shmem_purge_locked(struct drm_gem_shmem_object *shmem)
+int drm_gem_shmem_madvise(struct drm_gem_shmem_object *shmem, int madv)
+{
+	struct drm_gem_object *obj = &shmem->base;
+	int ret;
+
+	ret = dma_resv_lock_interruptible(obj->resv, NULL);
+	if (ret)
+		return ret;
+
+	ret = drm_gem_shmem_madvise_locked(shmem, madv);
+	dma_resv_unlock(obj->resv);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(drm_gem_shmem_madvise);
+
+static void drm_gem_shmem_unpin_pages_locked(struct drm_gem_shmem_object *shmem)
 {
 	struct drm_gem_object *obj = &shmem->base;
 	struct drm_device *dev = obj->dev;
 
 	dma_resv_assert_held(shmem->base.resv);
 
-	drm_WARN_ON(obj->dev, !drm_gem_shmem_is_purgeable(shmem));
+	if (shmem->evicted)
+		return;
 
 	dma_unmap_sgtable(dev->dev, shmem->sgt, DMA_BIDIRECTIONAL, 0);
+	drm_gem_shmem_release_pages_locked(shmem);
+	drm_vma_node_unmap(&obj->vma_node, dev->anon_inode->i_mapping);
+
 	sg_free_table(shmem->sgt);
 	kfree(shmem->sgt);
 	shmem->sgt = NULL;
+}
 
-	drm_gem_shmem_put_pages_locked(shmem);
+void drm_gem_shmem_purge_locked(struct drm_gem_shmem_object *shmem)
+{
+	struct drm_gem_object *obj = &shmem->base;
 
-	shmem->madv = -1;
+	drm_WARN_ON(obj->dev, !drm_gem_shmem_is_purgeable(shmem));
 
-	drm_vma_node_unmap(&obj->vma_node, dev->anon_inode->i_mapping);
+	drm_gem_shmem_unpin_pages_locked(shmem);
 	drm_gem_free_mmap_offset(obj);
 
 	/* Our goal here is to return as much of the memory as
@@ -481,9 +609,59 @@  void drm_gem_shmem_purge_locked(struct drm_gem_shmem_object *shmem)
 	shmem_truncate_range(file_inode(obj->filp), 0, (loff_t)-1);
 
 	invalidate_mapping_pages(file_inode(obj->filp)->i_mapping, 0, (loff_t)-1);
+
+	shmem->madv = -1;
+	shmem->evicted = false;
+	drm_gem_shmem_update_pages_state_locked(shmem);
 }
 EXPORT_SYMBOL_GPL(drm_gem_shmem_purge_locked);
 
+/**
+ * drm_gem_shmem_swapin_locked() - Moves shmem GEM back to memory and enables
+ *                                 hardware access to the memory.
+ * @shmem: shmem GEM object
+ *
+ * This function moves shmem GEM back to memory if it was previously evicted
+ * by the memory shrinker. The GEM is ready to use on success.
+ *
+ * Returns:
+ * 0 on success or a negative error code on failure.
+ */
+int drm_gem_shmem_swapin_locked(struct drm_gem_shmem_object *shmem)
+{
+	struct drm_gem_object *obj = &shmem->base;
+	struct sg_table *sgt;
+	int err;
+
+	dma_resv_assert_held(shmem->base.resv);
+
+	if (shmem->evicted) {
+		err = drm_gem_shmem_acquire_pages(shmem, false);
+		if (err)
+			return err;
+
+		sgt = drm_gem_shmem_get_sg_table(shmem);
+		if (IS_ERR(sgt))
+			return PTR_ERR(sgt);
+
+		err = dma_map_sgtable(obj->dev->dev, sgt,
+				      DMA_BIDIRECTIONAL, 0);
+		if (err) {
+			sg_free_table(sgt);
+			kfree(sgt);
+			return err;
+		}
+
+		shmem->sgt = sgt;
+		shmem->evicted = false;
+
+		drm_gem_shmem_update_pages_state_locked(shmem);
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(drm_gem_shmem_swapin_locked);
+
 /**
  * drm_gem_shmem_dumb_create - Create a dumb shmem buffer object
  * @file: DRM file structure to create the dumb buffer for
@@ -530,22 +708,34 @@  static vm_fault_t drm_gem_shmem_fault(struct vm_fault *vmf)
 	vm_fault_t ret;
 	struct page *page;
 	pgoff_t page_offset;
+	bool pages_unpinned;
+	int err;
 
 	/* We don't use vmf->pgoff since that has the fake offset */
 	page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
 
 	dma_resv_lock(shmem->base.resv, NULL);
 
-	if (page_offset >= num_pages ||
-	    drm_WARN_ON_ONCE(obj->dev, !shmem->pages) ||
-	    shmem->madv < 0) {
+	/* Sanity-check that we have the pages pointer when it should be present */
+	pages_unpinned = (shmem->evicted || shmem->madv < 0 ||
+			  !refcount_read(&shmem->pages_use_count));
+	drm_WARN_ON_ONCE(obj->dev, !shmem->pages ^ pages_unpinned);
+
+	if (page_offset >= num_pages || (!shmem->pages && !shmem->evicted)) {
 		ret = VM_FAULT_SIGBUS;
 	} else {
+		err = drm_gem_shmem_swapin_locked(shmem);
+		if (err) {
+			ret = VM_FAULT_OOM;
+			goto unlock;
+		}
+
 		page = shmem->pages[page_offset];
 
 		ret = vmf_insert_pfn(vma, vmf->address, page_to_pfn(page));
 	}
 
+unlock:
 	dma_resv_unlock(shmem->base.resv);
 
 	return ret;
@@ -568,6 +758,7 @@  static void drm_gem_shmem_vm_open(struct vm_area_struct *vma)
 	drm_WARN_ON_ONCE(obj->dev,
 			 !refcount_inc_not_zero(&shmem->pages_use_count));
 
+	drm_gem_shmem_update_pages_state_locked(shmem);
 	dma_resv_unlock(shmem->base.resv);
 
 	drm_gem_vm_open(vma);
@@ -653,7 +844,9 @@  void drm_gem_shmem_print_info(const struct drm_gem_shmem_object *shmem,
 	drm_printf_indent(p, indent, "pages_pin_count=%u\n", refcount_read(&shmem->pages_pin_count));
 	drm_printf_indent(p, indent, "pages_use_count=%u\n", refcount_read(&shmem->pages_use_count));
 	drm_printf_indent(p, indent, "vmap_use_count=%u\n", refcount_read(&shmem->vmap_use_count));
+	drm_printf_indent(p, indent, "evicted=%d\n", shmem->evicted);
 	drm_printf_indent(p, indent, "vaddr=%p\n", shmem->vaddr);
+	drm_printf_indent(p, indent, "madv=%d\n", shmem->madv);
 }
 EXPORT_SYMBOL_GPL(drm_gem_shmem_print_info);
 
@@ -715,6 +908,8 @@  static struct sg_table *drm_gem_shmem_get_pages_sgt_locked(struct drm_gem_shmem_
 
 	shmem->sgt = sgt;
 
+	drm_gem_shmem_update_pages_state_locked(shmem);
+
 	return sgt;
 
 err_free_sgt:
@@ -802,6 +997,191 @@  drm_gem_shmem_prime_import_sg_table(struct drm_device *dev,
 }
 EXPORT_SYMBOL_GPL(drm_gem_shmem_prime_import_sg_table);
 
+static struct drm_gem_shmem_shrinker *
+to_drm_gem_shmem_shrinker(struct shrinker *shrinker)
+{
+	return container_of(shrinker, struct drm_gem_shmem_shrinker, base);
+}
+
+static unsigned long
+drm_gem_shmem_shrinker_count_objects(struct shrinker *shrinker,
+				     struct shrink_control *sc)
+{
+	struct drm_gem_shmem_shrinker *shmem_shrinker =
+					to_drm_gem_shmem_shrinker(shrinker);
+	unsigned long count = shmem_shrinker->lru_evictable.count;
+
+	if (count >= SHRINK_EMPTY)
+		return SHRINK_EMPTY - 1;
+
+	return count ?: SHRINK_EMPTY;
+}
+
+void drm_gem_shmem_evict_locked(struct drm_gem_shmem_object *shmem)
+{
+	struct drm_gem_object *obj = &shmem->base;
+
+	drm_WARN_ON(obj->dev, !drm_gem_shmem_is_evictable(shmem));
+	drm_WARN_ON(obj->dev, shmem->evicted);
+
+	drm_gem_shmem_unpin_pages_locked(shmem);
+
+	shmem->evicted = true;
+	drm_gem_shmem_update_pages_state_locked(shmem);
+}
+EXPORT_SYMBOL_GPL(drm_gem_shmem_evict_locked);
+
+static bool drm_gem_shmem_shrinker_evict_locked(struct drm_gem_object *obj)
+{
+	struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj);
+	int err;
+
+	if (!drm_gem_shmem_is_evictable(shmem) ||
+	    get_nr_swap_pages() < obj->size >> PAGE_SHIFT)
+		return false;
+
+	err = drm_gem_evict_locked(obj);
+	if (err)
+		return false;
+
+	return true;
+}
+
+static bool drm_gem_shmem_shrinker_purge_locked(struct drm_gem_object *obj)
+{
+	struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj);
+	int err;
+
+	if (!drm_gem_shmem_is_purgeable(shmem))
+		return false;
+
+	err = drm_gem_evict_locked(obj);
+	if (err)
+		return false;
+
+	return true;
+}
+
+static unsigned long
+drm_gem_shmem_shrinker_scan_objects(struct shrinker *shrinker,
+				    struct shrink_control *sc)
+{
+	struct drm_gem_shmem_shrinker *shmem_shrinker;
+	unsigned long nr_to_scan = sc->nr_to_scan;
+	unsigned long remaining = 0;
+	unsigned long freed = 0;
+
+	shmem_shrinker = to_drm_gem_shmem_shrinker(shrinker);
+
+	/* purge as many objects as we can */
+	freed += drm_gem_lru_scan(&shmem_shrinker->lru_evictable,
+				  nr_to_scan, &remaining,
+				  drm_gem_shmem_shrinker_purge_locked);
+
+	/* evict as many objects as we can */
+	if (freed < nr_to_scan)
+		freed += drm_gem_lru_scan(&shmem_shrinker->lru_evictable,
+					  nr_to_scan - freed, &remaining,
+					  drm_gem_shmem_shrinker_evict_locked);
+
+	return (freed > 0 && remaining > 0) ? freed : SHRINK_STOP;
+}
+
+static int drm_gem_shmem_shrinker_init(struct drm_gem_shmem *shmem_mm,
+				       const char *shrinker_name)
+{
+	struct drm_gem_shmem_shrinker *shmem_shrinker = &shmem_mm->shrinker;
+	int err;
+
+	shmem_shrinker->base.count_objects = drm_gem_shmem_shrinker_count_objects;
+	shmem_shrinker->base.scan_objects = drm_gem_shmem_shrinker_scan_objects;
+	shmem_shrinker->base.seeks = DEFAULT_SEEKS;
+
+	mutex_init(&shmem_shrinker->lock);
+	drm_gem_lru_init(&shmem_shrinker->lru_evictable, &shmem_shrinker->lock);
+	drm_gem_lru_init(&shmem_shrinker->lru_evicted, &shmem_shrinker->lock);
+	drm_gem_lru_init(&shmem_shrinker->lru_pinned, &shmem_shrinker->lock);
+
+	err = register_shrinker(&shmem_shrinker->base, shrinker_name);
+	if (err) {
+		mutex_destroy(&shmem_shrinker->lock);
+		return err;
+	}
+
+	return 0;
+}
+
+static void drm_gem_shmem_shrinker_release(struct drm_device *dev,
+					   struct drm_gem_shmem *shmem_mm)
+{
+	struct drm_gem_shmem_shrinker *shmem_shrinker = &shmem_mm->shrinker;
+
+	unregister_shrinker(&shmem_shrinker->base);
+	drm_WARN_ON(dev, !list_empty(&shmem_shrinker->lru_evictable.list));
+	drm_WARN_ON(dev, !list_empty(&shmem_shrinker->lru_evicted.list));
+	drm_WARN_ON(dev, !list_empty(&shmem_shrinker->lru_pinned.list));
+	mutex_destroy(&shmem_shrinker->lock);
+}
+
+static int drm_gem_shmem_init(struct drm_device *dev)
+{
+	int err;
+
+	if (drm_WARN_ON(dev, dev->shmem_mm))
+		return -EBUSY;
+
+	dev->shmem_mm = kzalloc(sizeof(*dev->shmem_mm), GFP_KERNEL);
+	if (!dev->shmem_mm)
+		return -ENOMEM;
+
+	err = drm_gem_shmem_shrinker_init(dev->shmem_mm, dev->unique);
+	if (err)
+		goto free_gem_shmem;
+
+	return 0;
+
+free_gem_shmem:
+	kfree(dev->shmem_mm);
+	dev->shmem_mm = NULL;
+
+	return err;
+}
+
+static void drm_gem_shmem_release(struct drm_device *dev, void *ptr)
+{
+	struct drm_gem_shmem *shmem_mm = dev->shmem_mm;
+
+	drm_gem_shmem_shrinker_release(dev, shmem_mm);
+	dev->shmem_mm = NULL;
+	kfree(shmem_mm);
+}
+
+/**
+ * drmm_gem_shmem_init() - Initialize drm-shmem internals
+ * @dev: DRM device
+ *
+ * Cleanup is automatically managed as part of DRM device releasing.
+ * Calling this function multiple times will result in an error.
+ *
+ * Returns:
+ * 0 on success or a negative error code on failure.
+ */
+int drmm_gem_shmem_init(struct drm_device *dev)
+{
+	int err;
+
+	err = drm_gem_shmem_init(dev);
+	if (err)
+		return err;
+
+	err = drmm_add_action_or_reset(dev, drm_gem_shmem_release, NULL);
+	if (err)
+		return err;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(drmm_gem_shmem_init);
+
 MODULE_DESCRIPTION("DRM SHMEM memory-management helpers");
 MODULE_IMPORT_NS(DMA_BUF);
 MODULE_LICENSE("GPL v2");
diff --git a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c
index 72193bd734e1..1aa94fff7072 100644
--- a/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c
+++ b/drivers/gpu/drm/panfrost/panfrost_gem_shrinker.c
@@ -15,6 +15,13 @@ 
 #include "panfrost_gem.h"
 #include "panfrost_mmu.h"
 
+static bool panfrost_gem_shmem_is_purgeable(struct drm_gem_shmem_object *shmem)
+{
+	return (shmem->madv > 0) &&
+		!refcount_read(&shmem->pages_pin_count) && shmem->sgt &&
+		!shmem->base.dma_buf && !shmem->base.import_attach;
+}
+
 static unsigned long
 panfrost_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc)
 {
@@ -27,7 +34,7 @@  panfrost_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc
 		return 0;
 
 	list_for_each_entry(shmem, &pfdev->shrinker_list, madv_list) {
-		if (drm_gem_shmem_is_purgeable(shmem))
+		if (panfrost_gem_shmem_is_purgeable(shmem))
 			count += shmem->base.size >> PAGE_SHIFT;
 	}
 
diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h
index 7cf4afae2e79..a978f0cb5e84 100644
--- a/include/drm/drm_device.h
+++ b/include/drm/drm_device.h
@@ -16,6 +16,7 @@  struct drm_vblank_crtc;
 struct drm_vma_offset_manager;
 struct drm_vram_mm;
 struct drm_fb_helper;
+struct drm_gem_shmem_shrinker;
 
 struct inode;
 
@@ -290,8 +291,13 @@  struct drm_device {
 	/** @vma_offset_manager: GEM information */
 	struct drm_vma_offset_manager *vma_offset_manager;
 
-	/** @vram_mm: VRAM MM memory manager */
-	struct drm_vram_mm *vram_mm;
+	union {
+		/** @vram_mm: VRAM MM memory manager */
+		struct drm_vram_mm *vram_mm;
+
+		/** @shmem_mm: SHMEM GEM memory manager */
+		struct drm_gem_shmem *shmem_mm;
+	};
 
 	/**
 	 * @switch_power_state:
diff --git a/include/drm/drm_gem_shmem_helper.h b/include/drm/drm_gem_shmem_helper.h
index 63e91e8f2d5c..65c99da25048 100644
--- a/include/drm/drm_gem_shmem_helper.h
+++ b/include/drm/drm_gem_shmem_helper.h
@@ -6,6 +6,7 @@ 
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
+#include <linux/shrinker.h>
 
 #include <drm/drm_file.h>
 #include <drm/drm_gem.h>
@@ -13,6 +14,7 @@ 
 #include <drm/drm_prime.h>
 
 struct dma_buf_attachment;
+struct drm_device;
 struct drm_mode_create_dumb;
 struct drm_printer;
 struct sg_table;
@@ -53,8 +55,8 @@  struct drm_gem_shmem_object {
 	 * @madv: State for madvise
 	 *
 	 * 0 is active/inuse.
+	 * 1 is not-needed/can-be-purged
 	 * A negative value is the object is purged.
-	 * Positive values are driver specific and not used by the helpers.
 	 */
 	int madv;
 
@@ -115,6 +117,12 @@  struct drm_gem_shmem_object {
 	 * @map_wc: map object write-combined (instead of using shmem defaults).
 	 */
 	bool map_wc : 1;
+
+	/**
+	 * @evicted: True if shmem pages are evicted by the memory shrinker.
+	 * Used internally by memory shrinker.
+	 */
+	bool evicted : 1;
 };
 
 #define to_drm_gem_shmem_obj(obj) \
@@ -133,14 +141,22 @@  void drm_gem_shmem_vunmap_locked(struct drm_gem_shmem_object *shmem,
 int drm_gem_shmem_mmap(struct drm_gem_shmem_object *shmem, struct vm_area_struct *vma);
 
 int drm_gem_shmem_madvise_locked(struct drm_gem_shmem_object *shmem, int madv);
+int drm_gem_shmem_madvise(struct drm_gem_shmem_object *shmem, int madv);
 
 static inline bool drm_gem_shmem_is_purgeable(struct drm_gem_shmem_object *shmem)
 {
-	return (shmem->madv > 0) &&
-		!refcount_read(&shmem->pages_pin_count) && shmem->sgt &&
-		!shmem->base.dma_buf && !shmem->base.import_attach;
+	dma_resv_assert_held(shmem->base.resv);
+
+	return (shmem->madv > 0) && shmem->base.funcs->evict &&
+		refcount_read(&shmem->pages_use_count) &&
+		!refcount_read(&shmem->pages_pin_count) &&
+		!shmem->base.dma_buf && !shmem->base.import_attach &&
+		(shmem->sgt || shmem->evicted);
 }
 
+int drm_gem_shmem_swapin_locked(struct drm_gem_shmem_object *shmem);
+
+void drm_gem_shmem_evict_locked(struct drm_gem_shmem_object *shmem);
 void drm_gem_shmem_purge_locked(struct drm_gem_shmem_object *shmem);
 
 struct sg_table *drm_gem_shmem_get_sg_table(struct drm_gem_shmem_object *shmem);
@@ -284,6 +300,53 @@  static inline int drm_gem_shmem_object_mmap(struct drm_gem_object *obj, struct v
 	return drm_gem_shmem_mmap(shmem, vma);
 }
 
+/**
+ * drm_gem_shmem_object_madvise - unlocked GEM object function for drm_gem_shmem_madvise_locked()
+ * @obj: GEM object
+ * @madv: Madvise value
+ *
+ * This function wraps drm_gem_shmem_madvise_locked(), providing unlocked variant.
+ *
+ * Returns:
+ * 0 on success or a negative error code on failure.
+ */
+static inline int drm_gem_shmem_object_madvise(struct drm_gem_object *obj, int madv)
+{
+	struct drm_gem_shmem_object *shmem = to_drm_gem_shmem_obj(obj);
+
+	return drm_gem_shmem_madvise(shmem, madv);
+}
+
+/**
+ * struct drm_gem_shmem_shrinker - Memory shrinker of GEM shmem memory manager
+ */
+struct drm_gem_shmem_shrinker {
+	/** @base: Shrinker for purging shmem GEM objects */
+	struct shrinker base;
+
+	/** @lock: Protects @lru_* */
+	struct mutex lock;
+
+	/** @lru_pinned: List of pinned shmem GEM objects */
+	struct drm_gem_lru lru_pinned;
+
+	/** @lru_evictable: List of shmem GEM objects to be evicted */
+	struct drm_gem_lru lru_evictable;
+
+	/** @lru_evicted: List of evicted shmem GEM objects */
+	struct drm_gem_lru lru_evicted;
+};
+
+/**
+ * struct drm_gem_shmem - GEM shmem memory manager
+ */
+struct drm_gem_shmem {
+	/** @shrinker: GEM shmem shrinker */
+	struct drm_gem_shmem_shrinker shrinker;
+};
+
+int drmm_gem_shmem_init(struct drm_device *dev);
+
 /*
  * Driver ops
  */