
[RFC,v6,1/7] drm: Add a sharable drm page-pool implementation

Message ID 20210205080621.3102035-2-john.stultz@linaro.org
State New, archived
Series Generic page pool & deferred freeing for system dmabuf heap

Commit Message

John Stultz Feb. 5, 2021, 8:06 a.m. UTC
This adds a shrinker controlled page pool, closely
following the ttm_pool logic, which is abstracted out
a bit so it can be used by other non-ttm drivers.

Cc: Daniel Vetter <daniel@ffwll.ch>
Cc: Christian Koenig <christian.koenig@amd.com>
Cc: Sumit Semwal <sumit.semwal@linaro.org>
Cc: Liam Mark <lmark@codeaurora.org>
Cc: Chris Goldsworthy <cgoldswo@codeaurora.org>
Cc: Laura Abbott <labbott@kernel.org>
Cc: Brian Starkey <Brian.Starkey@arm.com>
Cc: Hridya Valsaraju <hridya@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Sandeep Patil <sspatil@google.com>
Cc: Daniel Mentz <danielmentz@google.com>
Cc: Ørjan Eide <orjan.eide@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Ezequiel Garcia <ezequiel@collabora.com>
Cc: Simon Ser <contact@emersion.fr>
Cc: James Jones <jajones@nvidia.com>
Cc: linux-media@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 drivers/gpu/drm/Kconfig     |   4 +
 drivers/gpu/drm/Makefile    |   1 +
 drivers/gpu/drm/page_pool.c | 220 ++++++++++++++++++++++++++++++++++++
 include/drm/page_pool.h     |  54 +++++++++
 4 files changed, 279 insertions(+)
 create mode 100644 drivers/gpu/drm/page_pool.c
 create mode 100644 include/drm/page_pool.h
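
For reference, a minimal sketch of how a driver might consume the API this
patch introduces. The my_heap_* names and the fallback allocation policy
below are hypothetical illustrations, not part of the patch:

#include <linux/gfp.h>
#include <drm/page_pool.h>

/* Hypothetical free callback; reports how many pages it released. */
static int my_heap_free_page(struct page *p, unsigned int order)
{
	__free_pages(p, order);
	return 1 << order;
}

static struct drm_page_pool *my_heap_pool;

static int my_heap_init(void)
{
	/* order-0 pool; the callback runs when the shrinker frees pages */
	my_heap_pool = drm_page_pool_create(0, my_heap_free_page);
	return my_heap_pool ? 0 : -ENOMEM;
}

static struct page *my_heap_alloc_page(void)
{
	/* try the cached pages first, fall back to the page allocator */
	struct page *page = drm_page_pool_fetch(my_heap_pool);

	return page ? page : alloc_pages(GFP_KERNEL, 0);
}

static void my_heap_release_page(struct page *page)
{
	/* cache the page for reuse instead of freeing it immediately */
	drm_page_pool_add(my_heap_pool, page);
}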

Comments

Christian König Feb. 5, 2021, 8:46 a.m. UTC | #1
On 05.02.21 at 09:06, John Stultz wrote:
> This adds a shrinker controlled page pool, closely
> following the ttm_pool logic, which is abstracted out
> a bit so it can be used by other non-ttm drivers.
>
> Cc: Daniel Vetter <daniel@ffwll.ch>
> Cc: Christian Koenig <christian.koenig@amd.com>
> Cc: Sumit Semwal <sumit.semwal@linaro.org>
> Cc: Liam Mark <lmark@codeaurora.org>
> Cc: Chris Goldsworthy <cgoldswo@codeaurora.org>
> Cc: Laura Abbott <labbott@kernel.org>
> Cc: Brian Starkey <Brian.Starkey@arm.com>
> Cc: Hridya Valsaraju <hridya@google.com>
> Cc: Suren Baghdasaryan <surenb@google.com>
> Cc: Sandeep Patil <sspatil@google.com>
> Cc: Daniel Mentz <danielmentz@google.com>
> Cc: Ørjan Eide <orjan.eide@arm.com>
> Cc: Robin Murphy <robin.murphy@arm.com>
> Cc: Ezequiel Garcia <ezequiel@collabora.com>
> Cc: Simon Ser <contact@emersion.fr>
> Cc: James Jones <jajones@nvidia.com>
> Cc: linux-media@vger.kernel.org
> Cc: dri-devel@lists.freedesktop.org
> Signed-off-by: John Stultz <john.stultz@linaro.org>
> ---
>   drivers/gpu/drm/Kconfig     |   4 +
>   drivers/gpu/drm/Makefile    |   1 +
>   drivers/gpu/drm/page_pool.c | 220 ++++++++++++++++++++++++++++++++++++
>   include/drm/page_pool.h     |  54 +++++++++
>   4 files changed, 279 insertions(+)
>   create mode 100644 drivers/gpu/drm/page_pool.c
>   create mode 100644 include/drm/page_pool.h
>
> diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
> index 0973f408d75f..d16bf340ed2e 100644
> --- a/drivers/gpu/drm/Kconfig
> +++ b/drivers/gpu/drm/Kconfig
> @@ -174,6 +174,10 @@ config DRM_DP_CEC
>   	  Note: not all adapters support this feature, and even for those
>   	  that do support this they often do not hook up the CEC pin.
>   
> +config DRM_PAGE_POOL
> +	bool
> +	depends on DRM
> +
>   config DRM_TTM
>   	tristate
>   	depends on DRM && MMU
> diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
> index fefaff4c832d..877e0111ed34 100644
> --- a/drivers/gpu/drm/Makefile
> +++ b/drivers/gpu/drm/Makefile
> @@ -32,6 +32,7 @@ drm-$(CONFIG_AGP) += drm_agpsupport.o
>   drm-$(CONFIG_PCI) += drm_pci.o
>   drm-$(CONFIG_DEBUG_FS) += drm_debugfs.o drm_debugfs_crc.o
>   drm-$(CONFIG_DRM_LOAD_EDID_FIRMWARE) += drm_edid_load.o
> +drm-$(CONFIG_DRM_PAGE_POOL) += page_pool.o
>   
>   drm_vram_helper-y := drm_gem_vram_helper.o
>   obj-$(CONFIG_DRM_VRAM_HELPER) += drm_vram_helper.o
> diff --git a/drivers/gpu/drm/page_pool.c b/drivers/gpu/drm/page_pool.c
> new file mode 100644
> index 000000000000..2139f86e6ca7
> --- /dev/null
> +++ b/drivers/gpu/drm/page_pool.c
> @@ -0,0 +1,220 @@
> +// SPDX-License-Identifier: GPL-2.0

Please use a BSD/MIT compatible license if you want to copy this from 
the TTM code.

> +/*
> + * DRM page pool system
> + *
> + * Copyright (C) 2020 Linaro Ltd.
> + *
> + * Based on the ION page pool code
> + * Copyright (C) 2011 Google, Inc.
> + * As well as the ttm_pool code
> + * Copyright (C) 2020 Advanced Micro Devices, Inc.
> + */
> +
> +#include <linux/freezer.h>
> +#include <linux/list.h>
> +#include <linux/slab.h>
> +#include <linux/swap.h>
> +#include <linux/sched/signal.h>
> +#include <drm/page_pool.h>
> +
> +static LIST_HEAD(pool_list);
> +static DEFINE_MUTEX(pool_list_lock);
> +static atomic_long_t total_pages;
> +static unsigned long page_pool_max;
> +MODULE_PARM_DESC(page_pool_max, "Number of pages in the WC/UC/DMA pool");
> +module_param(page_pool_max, ulong, 0644);
> +
> +void drm_page_pool_set_max(unsigned long max)
> +{
> +	/* only write once */
> +	if (!page_pool_max)
> +		page_pool_max = max;
> +}
> +
> +unsigned long drm_page_pool_get_max(void)
> +{
> +	return page_pool_max;
> +}
> +
> +unsigned long drm_page_pool_get_total(void)
> +{
> +	return atomic_long_read(&total_pages);
> +}
> +
> +int drm_page_pool_get_size(struct drm_page_pool *pool)
> +{
> +	int ret;
> +
> +	spin_lock(&pool->lock);
> +	ret = pool->count;
> +	spin_unlock(&pool->lock);

Maybe use an atomic for the count instead?

> +	return ret;
> +}
> +
> +static inline unsigned int drm_page_pool_free_pages(struct drm_page_pool *pool,
> +						    struct page *page)
> +{
> +	return pool->free(page, pool->order);
> +}
> +
> +static int drm_page_pool_shrink_one(void);
> +
> +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
> +{
> +	spin_lock(&pool->lock);
> +	list_add_tail(&page->lru, &pool->items);
> +	pool->count++;
> +	atomic_long_add(1 << pool->order, &total_pages);
> +	spin_unlock(&pool->lock);
> +
> +	mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE,
> +			    1 << pool->order);

Hui what? What should that be good for?

> +
> +	/* make sure we don't grow too large */
> +	while (page_pool_max && atomic_long_read(&total_pages) > page_pool_max)
> +		drm_page_pool_shrink_one();
> +}
> +EXPORT_SYMBOL_GPL(drm_page_pool_add);
> +
> +static struct page *drm_page_pool_remove(struct drm_page_pool *pool)
> +{
> +	struct page *page;
> +
> +	if (!pool->count)
> +		return NULL;

Better use list_first_entry_or_null instead of checking the count.

This way you can also pull the lock into the function.

> +
> +	page = list_first_entry(&pool->items, struct page, lru);
> +	pool->count--;
> +	atomic_long_sub(1 << pool->order, &total_pages);
> +
> +	list_del(&page->lru);
> +	mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE,
> +			    -(1 << pool->order));
> +	return page;
> +}
> +
> +struct page *drm_page_pool_fetch(struct drm_page_pool *pool)
> +{
> +	struct page *page = NULL;
> +
> +	if (!pool) {
> +		WARN_ON(!pool);
> +		return NULL;
> +	}
> +
> +	spin_lock(&pool->lock);
> +	page = drm_page_pool_remove(pool);
> +	spin_unlock(&pool->lock);
> +
> +	return page;
> +}
> +EXPORT_SYMBOL_GPL(drm_page_pool_fetch);
> +
> +struct drm_page_pool *drm_page_pool_create(unsigned int order,
> +					   int (*free_page)(struct page *p, unsigned int order))
> +{
> +	struct drm_page_pool *pool = kmalloc(sizeof(*pool), GFP_KERNEL);

Why not make this an embedded object? We should not see much dynamic 
pool creation.

> +
> +	if (!pool)
> +		return NULL;
> +
> +	pool->count = 0;
> +	INIT_LIST_HEAD(&pool->items);
> +	pool->order = order;
> +	pool->free = free_page;
> +	spin_lock_init(&pool->lock);
> +	INIT_LIST_HEAD(&pool->list);
> +
> +	mutex_lock(&pool_list_lock);
> +	list_add(&pool->list, &pool_list);
> +	mutex_unlock(&pool_list_lock);
> +
> +	return pool;
> +}
> +EXPORT_SYMBOL_GPL(drm_page_pool_create);
> +
> +void drm_page_pool_destroy(struct drm_page_pool *pool)
> +{
> +	struct page *page;
> +
> +	/* Remove us from the pool list */
> +	mutex_lock(&pool_list_lock);
> +	list_del(&pool->list);
> +	mutex_unlock(&pool_list_lock);
> +
> +	/* Free any remaining pages in the pool */
> +	spin_lock(&pool->lock);

Locking should be unnecessary when the pool is destroyed anyway.

> +	while (pool->count) {
> +		page = drm_page_pool_remove(pool);
> +		spin_unlock(&pool->lock);
> +		drm_page_pool_free_pages(pool, page);
> +		spin_lock(&pool->lock);
> +	}
> +	spin_unlock(&pool->lock);
> +
> +	kfree(pool);
> +}
> +EXPORT_SYMBOL_GPL(drm_page_pool_destroy);
> +
> +static int drm_page_pool_shrink_one(void)
> +{
> +	struct drm_page_pool *pool;
> +	struct page *page;
> +	int nr_freed = 0;
> +
> +	mutex_lock(&pool_list_lock);
> +	pool = list_first_entry(&pool_list, typeof(*pool), list);
> +
> +	spin_lock(&pool->lock);
> +	page = drm_page_pool_remove(pool);
> +	spin_unlock(&pool->lock);
> +
> +	if (page)
> +		nr_freed = drm_page_pool_free_pages(pool, page);
> +
> +	list_move_tail(&pool->list, &pool_list);

Better to move this up, directly after the list_first_entry().

> +	mutex_unlock(&pool_list_lock);
> +
> +	return nr_freed;
> +}
> +
> +static unsigned long drm_page_pool_shrink_count(struct shrinker *shrinker,
> +						struct shrink_control *sc)
> +{
> +	unsigned long count =  atomic_long_read(&total_pages);
> +
> +	return count ? count : SHRINK_EMPTY;
> +}
> +
> +static unsigned long drm_page_pool_shrink_scan(struct shrinker *shrinker,
> +					       struct shrink_control *sc)
> +{
> +	int to_scan = sc->nr_to_scan;
> +	int nr_total = 0;
> +
> +	if (to_scan == 0)
> +		return 0;
> +
> +	do {
> +		int nr_freed = drm_page_pool_shrink_one();
> +
> +		to_scan -= nr_freed;
> +		nr_total += nr_freed;
> +	} while (to_scan >= 0 && atomic_long_read(&total_pages));
> +
> +	return nr_total;
> +}
> +
> +static struct shrinker pool_shrinker = {
> +	.count_objects = drm_page_pool_shrink_count,
> +	.scan_objects = drm_page_pool_shrink_scan,
> +	.seeks = 1,
> +	.batch = 0,
> +};
> +
> +int drm_page_pool_init_shrinker(void)
> +{
> +	return register_shrinker(&pool_shrinker);
> +}
> +module_init(drm_page_pool_init_shrinker);
> +MODULE_LICENSE("GPL v2");
> diff --git a/include/drm/page_pool.h b/include/drm/page_pool.h
> new file mode 100644
> index 000000000000..47e240b2bc69
> --- /dev/null
> +++ b/include/drm/page_pool.h
> @@ -0,0 +1,54 @@
> +/* SPDX-License-Identifier: GPL-2.0 OR MIT */
> +/*
> + * Copyright 2020 Advanced Micro Devices, Inc.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> + * OTHER DEALINGS IN THE SOFTWARE.
> + *
> + * Authors: Christian König
> + */
> +
> +#ifndef _DRM_PAGE_POOL_H_
> +#define _DRM_PAGE_POOL_H_
> +
> +#include <linux/mmzone.h>
> +#include <linux/llist.h>
> +#include <linux/spinlock.h>
> +
> +struct drm_page_pool {
> +	int count;
> +	struct list_head items;
> +
> +	int order;
> +	int (*free)(struct page *p, unsigned int order);
> +
> +	spinlock_t lock;
> +	struct list_head list;
> +};
> +
> +void drm_page_pool_set_max(unsigned long max);
> +unsigned long drm_page_pool_get_max(void);
> +unsigned long drm_page_pool_get_total(void);
> +int drm_page_pool_get_size(struct drm_page_pool *pool);
> +struct page *drm_page_pool_fetch(struct drm_page_pool *pool);
> +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page);
> +struct drm_page_pool *drm_page_pool_create(unsigned int order,
> +					   int (*free_page)(struct page *p, unsigned int order));
> +void drm_page_pool_destroy(struct drm_page_pool *pool);
> +
> +#endif
John Stultz Feb. 5, 2021, 8:46 p.m. UTC | #2
On Fri, Feb 5, 2021 at 12:47 AM Christian König
<christian.koenig@amd.com> wrote:
> On 05.02.21 at 09:06, John Stultz wrote:
> > diff --git a/drivers/gpu/drm/page_pool.c b/drivers/gpu/drm/page_pool.c
> > new file mode 100644
> > index 000000000000..2139f86e6ca7
> > --- /dev/null
> > +++ b/drivers/gpu/drm/page_pool.c
> > @@ -0,0 +1,220 @@
> > +// SPDX-License-Identifier: GPL-2.0
>
> Please use a BSD/MIT compatible license if you want to copy this from
> the TTM code.

Hrm. This may be problematic, as it's not just TTM code, but some of
the TTM logic integrated into a page-pool implementation I wrote based
on logic from the ION code (which was GPL-2.0 before it was dropped).
So I don't think I can just make it MIT.  Any extra context on the
need for MIT, or suggestions on how to best resolve this?

> > +int drm_page_pool_get_size(struct drm_page_pool *pool)
> > +{
> > +     int ret;
> > +
> > +     spin_lock(&pool->lock);
> > +     ret = pool->count;
> > +     spin_unlock(&pool->lock);
>
> Maybe use an atomic for the count instead?
>

I can do that, but am curious as to the benefit? We are mostly using
count where we already have to take the pool->lock anyway, and this
drm_page_pool_get_size() is only used for debugfs output so far, so I
don't expect it to be a hot path.
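
For what it's worth, the conversion itself would be small; an untested
sketch, assuming pool->count is changed to an atomic_t:

int drm_page_pool_get_size(struct drm_page_pool *pool)
{
	/* no need to take pool->lock just to read the counter */
	return atomic_read(&pool->count);
}

with the add/remove paths switched to atomic_inc()/atomic_dec() on the
counter while still holding pool->lock for the list manipulation.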


> > +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
> > +{
> > +     spin_lock(&pool->lock);
> > +     list_add_tail(&page->lru, &pool->items);
> > +     pool->count++;
> > +     atomic_long_add(1 << pool->order, &total_pages);
> > +     spin_unlock(&pool->lock);
> > +
> > +     mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE,
> > +                         1 << pool->order);
>
> Hui what? What should that be good for?

This is a carryover from the ION page pool implementation:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/android/ion/ion_page_pool.c?h=v5.10#n28

My sense is it helps with the vmstat/meminfo accounting so folks can
see the cached pages are shrinkable/freeable. This maybe falls under
other dmabuf accounting/stats discussions, so I'm happy to remove it
for now, or let the drivers using the shared page pool logic handle
the accounting themselves?


> > +static struct page *drm_page_pool_remove(struct drm_page_pool *pool)
> > +{
> > +     struct page *page;
> > +
> > +     if (!pool->count)
> > +             return NULL;
>
> Better use list_first_entry_or_null instead of checking the count.
>
> This way you can also pull the lock into the function.

Yea, that cleans a number of things up nicely. Thank you!
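
Something like this, I think (untested sketch; the mod_node_page_state()
accounting debated elsewhere in this thread is left out):

static struct page *drm_page_pool_remove(struct drm_page_pool *pool)
{
	struct page *page;

	spin_lock(&pool->lock);
	page = list_first_entry_or_null(&pool->items, struct page, lru);
	if (page) {
		list_del(&page->lru);
		pool->count--;
		atomic_long_sub(1 << pool->order, &total_pages);
	}
	spin_unlock(&pool->lock);

	return page;
}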


> > +struct drm_page_pool *drm_page_pool_create(unsigned int order,
> > +                                        int (*free_page)(struct page *p, unsigned int order))
> > +{
> > +     struct drm_page_pool *pool = kmalloc(sizeof(*pool), GFP_KERNEL);
>
> Why not make this an embedded object? We should not see much dynamic
> pool creation.

Yea, it felt cleaner at the time this way, but I think I will need to
switch to an embedded object in order to resolve the memory usage
issue you pointed out with growing the ttm_pool_dma, so thank you for
the suggestion!
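
Roughly like the following, I imagine (untested sketch; drm_page_pool_init
is an assumed name, not something in the posted patch):

/* callers embed struct drm_page_pool in their own objects */
void drm_page_pool_init(struct drm_page_pool *pool, unsigned int order,
			int (*free_page)(struct page *p, unsigned int order))
{
	pool->count = 0;
	INIT_LIST_HEAD(&pool->items);
	pool->order = order;
	pool->free = free_page;
	spin_lock_init(&pool->lock);

	/* make the pool visible to the global shrinker */
	mutex_lock(&pool_list_lock);
	list_add(&pool->list, &pool_list);
	mutex_unlock(&pool_list_lock);
}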


> > +void drm_page_pool_destroy(struct drm_page_pool *pool)
> > +{
> > +     struct page *page;
> > +
> > +     /* Remove us from the pool list */
> > +     mutex_lock(&pool_list_lock);
> > +     list_del(&pool->list);
> > +     mutex_unlock(&pool_list_lock);
> > +
> > +     /* Free any remaining pages in the pool */
> > +     spin_lock(&pool->lock);
>
> Locking should be unnecessary when the pool is destroyed anyway.

I guess if we've already pruned ourselves from the pool list, then you're
right, we can't race with the shrinker and it's maybe not necessary.
But it also seems easier to consistently follow the locking rules in a
very unlikely path rather than leaning on subtlety.  Either way, I
think this becomes moot if I make the improvements you suggest to
drm_page_pool_remove().

> > +static int drm_page_pool_shrink_one(void)
> > +{
> > +     struct drm_page_pool *pool;
> > +     struct page *page;
> > +     int nr_freed = 0;
> > +
> > +     mutex_lock(&pool_list_lock);
> > +     pool = list_first_entry(&pool_list, typeof(*pool), list);
> > +
> > +     spin_lock(&pool->lock);
> > +     page = drm_page_pool_remove(pool);
> > +     spin_unlock(&pool->lock);
> > +
> > +     if (page)
> > +             nr_freed = drm_page_pool_free_pages(pool, page);
> > +
> > +     list_move_tail(&pool->list, &pool_list);
>
> Better to move this up, directly after the list_first_entry().

Sounds good!
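
That is, something along these lines (untested sketch, assuming
drm_page_pool_remove() takes pool->lock itself per your earlier
suggestion):

static int drm_page_pool_shrink_one(void)
{
	struct drm_page_pool *pool;
	struct page *page;
	int nr_freed = 0;

	mutex_lock(&pool_list_lock);
	pool = list_first_entry(&pool_list, typeof(*pool), list);
	/* rotate immediately so pools get shrunk round-robin */
	list_move_tail(&pool->list, &pool_list);

	page = drm_page_pool_remove(pool);
	if (page)
		nr_freed = drm_page_pool_free_pages(pool, page);
	mutex_unlock(&pool_list_lock);

	return nr_freed;
}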

Thanks so much for your review and feedback! I'll try to get some of
the easy suggestions integrated, and will have to figure out what to
do about the re-licensing request.

thanks
-john
Suren Baghdasaryan Feb. 5, 2021, 10:38 p.m. UTC | #3
On Fri, Feb 5, 2021 at 12:47 PM John Stultz <john.stultz@linaro.org> wrote:
>
> On Fri, Feb 5, 2021 at 12:47 AM Christian König
> <christian.koenig@amd.com> wrote:
> > On 05.02.21 at 09:06, John Stultz wrote:
> > > diff --git a/drivers/gpu/drm/page_pool.c b/drivers/gpu/drm/page_pool.c
> > > new file mode 100644
> > > index 000000000000..2139f86e6ca7
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/page_pool.c
> > > @@ -0,0 +1,220 @@
> > > +// SPDX-License-Identifier: GPL-2.0
> >
> > Please use a BSD/MIT compatible license if you want to copy this from
> > the TTM code.
>
> Hrm. This may be problematic, as it's not just TTM code, but some of
> the TTM logic integrated into a page-pool implementation I wrote based
> on logic from the ION code (which was GPL-2.0 before it was dropped).
> So I don't think I can just make it MIT.  Any extra context on the
> need for MIT, or suggestions on how to best resolve this?
>
> > > +int drm_page_pool_get_size(struct drm_page_pool *pool)
> > > +{
> > > +     int ret;
> > > +
> > > +     spin_lock(&pool->lock);
> > > +     ret = pool->count;
> > > +     spin_unlock(&pool->lock);
> >
> > Maybe use an atomic for the count instead?
> >
>
> I can do that, but am curious as to the benefit? We are mostly using
> count where we already have to take the pool->lock anyway, and this
> drm_page_pool_get_size() is only used for debugfs output so far, so I
> don't expect it to be a hot path.
>
>
> > > +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
> > > +{
> > > +     spin_lock(&pool->lock);
> > > +     list_add_tail(&page->lru, &pool->items);
> > > +     pool->count++;
> > > +     atomic_long_add(1 << pool->order, &total_pages);
> > > +     spin_unlock(&pool->lock);
> > > +
> > > +     mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE,
> > > +                         1 << pool->order);
> >
> > Hui what? What should that be good for?
>
> This is a carryover from the ION page pool implementation:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/android/ion/ion_page_pool.c?h=v5.10#n28
>
> My sense is it helps with the vmstat/meminfo accounting so folks can
> see the cached pages are shrinkable/freeable. This maybe falls under
> other dmabuf accounting/stats discussions, so I'm happy to remove it
> for now, or let the drivers using the shared page pool logic handle
> the accounting themselves?

Yep, ION pools were accounted for as reclaimable kernel memory because
they could be dropped when the system is under memory pressure.

>
>
> > > +static struct page *drm_page_pool_remove(struct drm_page_pool *pool)
> > > +{
> > > +     struct page *page;
> > > +
> > > +     if (!pool->count)
> > > +             return NULL;
> >
> > Better use list_first_entry_or_null instead of checking the count.
> >
> > This way you can also pull the lock into the function.
>
> Yea, that cleans a number of things up nicely. Thank you!
>
>
> > > +struct drm_page_pool *drm_page_pool_create(unsigned int order,
> > > +                                        int (*free_page)(struct page *p, unsigned int order))
> > > +{
> > > +     struct drm_page_pool *pool = kmalloc(sizeof(*pool), GFP_KERNEL);
> >
> > Why not make this an embedded object? We should not see much dynamic
> > pool creation.
>
> Yea, it felt cleaner at the time this way, but I think I will need to
> switch to an embedded object in order to resolve the memory usage
> issue you pointed out with growing the ttm_pool_dma, so thank you for
> the suggestion!
>
>
> > > +void drm_page_pool_destroy(struct drm_page_pool *pool)
> > > +{
> > > +     struct page *page;
> > > +
> > > +     /* Remove us from the pool list */
> > > +     mutex_lock(&pool_list_lock);
> > > +     list_del(&pool->list);
> > > +     mutex_unlock(&pool_list_lock);
> > > +
> > > +     /* Free any remaining pages in the pool */
> > > +     spin_lock(&pool->lock);
> >
> > Locking should be unnecessary when the pool is destroyed anyway.
>
> I guess if we've already pruned ourselves from the pool list, then you're
> right, we can't race with the shrinker and it's maybe not necessary.
> But it also seems easier to consistently follow the locking rules in a
> very unlikely path rather than leaning on subtlety.  Either way, I
> think this becomes moot if I make the improvements you suggest to
> drm_page_pool_remove().
>
> > > +static int drm_page_pool_shrink_one(void)
> > > +{
> > > +     struct drm_page_pool *pool;
> > > +     struct page *page;
> > > +     int nr_freed = 0;
> > > +
> > > +     mutex_lock(&pool_list_lock);
> > > +     pool = list_first_entry(&pool_list, typeof(*pool), list);
> > > +
> > > +     spin_lock(&pool->lock);
> > > +     page = drm_page_pool_remove(pool);
> > > +     spin_unlock(&pool->lock);
> > > +
> > > +     if (page)
> > > +             nr_freed = drm_page_pool_free_pages(pool, page);
> > > +
> > > +     list_move_tail(&pool->list, &pool_list);
> >
> > Better to move this up, directly after the list_first_entry().
>
> Sounds good!
>
> Thanks so much for your review and feedback! I'll try to get some of
> the easy suggestions integrated, and will have to figure out what to
> do about the re-licensing request.
>
> thanks
> -john
Christian König Feb. 9, 2021, 12:11 p.m. UTC | #4
On 05.02.21 at 21:46, John Stultz wrote:
> On Fri, Feb 5, 2021 at 12:47 AM Christian König
> <christian.koenig@amd.com> wrote:
>> On 05.02.21 at 09:06, John Stultz wrote:
>>> diff --git a/drivers/gpu/drm/page_pool.c b/drivers/gpu/drm/page_pool.c
>>> new file mode 100644
>>> index 000000000000..2139f86e6ca7
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/page_pool.c
>>> @@ -0,0 +1,220 @@
>>> +// SPDX-License-Identifier: GPL-2.0
>> Please use a BSD/MIT compatible license if you want to copy this from
>> the TTM code.
> Hrm. This may be problematic, as it's not just TTM code, but some of
> the TTM logic integrated into a page-pool implementation I wrote based
> on logic from the ION code (which was GPL-2.0 before it was dropped).
> So I don't think I can just make it MIT.  Any extra context on the
> need for MIT, or suggestions on how to best resolve this?

Most of the DRM/TTM code is also licensable under the BSD/MIT-style 
license since we want to enable the BSD guys to port it over as well.

What stuff do you need from the ION code that you can't just code 
yourself? As far as I have seen this is like 99% code from the TTM pool.

Christian.

>
>>> +int drm_page_pool_get_size(struct drm_page_pool *pool)
>>> +{
>>> +     int ret;
>>> +
>>> +     spin_lock(&pool->lock);
>>> +     ret = pool->count;
>>> +     spin_unlock(&pool->lock);
>> Maybe use an atomic for the count instead?
>>
> I can do that, but am curious as to the benefit? We are mostly using
> count where we already have to take the pool->lock anyway, and this
> drm_page_pool_get_size() is only used for debugfs output so far, so I
> don't expect it to be a hot path.

Yeah, it's not really important. IIRC I even walked over the linked list 
to count how many pages we had.

Christian.

>
>
>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
>>> +{
>>> +     spin_lock(&pool->lock);
>>> +     list_add_tail(&page->lru, &pool->items);
>>> +     pool->count++;
>>> +     atomic_long_add(1 << pool->order, &total_pages);
>>> +     spin_unlock(&pool->lock);
>>> +
>>> +     mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE,
>>> +                         1 << pool->order);
>> Hui what? What should that be good for?
> This is a carryover from the ION page pool implementation:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/android/ion/ion_page_pool.c?h=v5.10#n28
>
> My sense is it helps with the vmstat/meminfo accounting so folks can
> see the cached pages are shrinkable/freeable. This maybe falls under
> other dmabuf accounting/stats discussions, so I'm happy to remove it
> for now, or let the drivers using the shared page pool logic handle
> the accounting themselves?
>
>
>>> +static struct page *drm_page_pool_remove(struct drm_page_pool *pool)
>>> +{
>>> +     struct page *page;
>>> +
>>> +     if (!pool->count)
>>> +             return NULL;
>> Better use list_first_entry_or_null instead of checking the count.
>>
>> This way you can also pull the lock into the function.
> Yea, that cleans a number of things up nicely. Thank you!
>
>
>>> +struct drm_page_pool *drm_page_pool_create(unsigned int order,
>>> +                                        int (*free_page)(struct page *p, unsigned int order))
>>> +{
>>> +     struct drm_page_pool *pool = kmalloc(sizeof(*pool), GFP_KERNEL);
>> Why not make this an embedded object? We should not see much dynamic
>> pool creation.
> Yea, it felt cleaner at the time this way, but I think I will need to
> switch to an embedded object in order to resolve the memory usage
> issue you pointed out with growing the ttm_pool_dma, so thank you for
> the suggestion!
>
>
>>> +void drm_page_pool_destroy(struct drm_page_pool *pool)
>>> +{
>>> +     struct page *page;
>>> +
>>> +     /* Remove us from the pool list */
>>> +     mutex_lock(&pool_list_lock);
>>> +     list_del(&pool->list);
>>> +     mutex_unlock(&pool_list_lock);
>>> +
>>> +     /* Free any remaining pages in the pool */
>>> +     spin_lock(&pool->lock);
>> Locking should be unnecessary when the pool is destroyed anyway.
> I guess if we've already pruned ourselves from the pool list, then you're
> right, we can't race with the shrinker and it's maybe not necessary.
> But it also seems easier to consistently follow the locking rules in a
> very unlikely path rather than leaning on subtlety.  Either way, I
> think this becomes moot if I make the improvements you suggest to
> drm_page_pool_remove().
>
>>> +static int drm_page_pool_shrink_one(void)
>>> +{
>>> +     struct drm_page_pool *pool;
>>> +     struct page *page;
>>> +     int nr_freed = 0;
>>> +
>>> +     mutex_lock(&pool_list_lock);
>>> +     pool = list_first_entry(&pool_list, typeof(*pool), list);
>>> +
>>> +     spin_lock(&pool->lock);
>>> +     page = drm_page_pool_remove(pool);
>>> +     spin_unlock(&pool->lock);
>>> +
>>> +     if (page)
>>> +             nr_freed = drm_page_pool_free_pages(pool, page);
>>> +
>>> +     list_move_tail(&pool->list, &pool_list);
>> Better to move this up, directly after the list_first_entry().
> Sounds good!
>
> Thanks so much for your review and feedback! I'll try to get some of
> the easy suggestions integrated, and will have to figure out what to
> do about the re-licensing request.
>
> thanks
> -john
Christian König Feb. 9, 2021, 12:57 p.m. UTC | #5
On 09.02.21 at 13:11, Christian König wrote:
> [SNIP]
>>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
>>>> +{
>>>> +     spin_lock(&pool->lock);
>>>> +     list_add_tail(&page->lru, &pool->items);
>>>> +     pool->count++;
>>>> +     atomic_long_add(1 << pool->order, &total_pages);
>>>> +     spin_unlock(&pool->lock);
>>>> +
>>>> +     mod_node_page_state(page_pgdat(page), 
>>>> NR_KERNEL_MISC_RECLAIMABLE,
>>>> +                         1 << pool->order);
>>> Hui what? What should that be good for?
>> This is a carryover from the ION page pool implementation:
>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/android/ion/ion_page_pool.c?h=v5.10#n28
>>
>>
>> My sense is it helps with the vmstat/meminfo accounting so folks can
>> see the cached pages are shrinkable/freeable. This maybe falls under
>> other dmabuf accounting/stats discussions, so I'm happy to remove it
>> for now, or let the drivers using the shared page pool logic handle
>> the accounting themselves?

Intentionally separated the discussion for that here.

As far as I can see this is just bluntly incorrect.

Either the page is reclaimable or it is part of our pool and freeable 
through the shrinker, but never ever both.

In the best case this just messes up the accounting, in the worst case 
it can cause memory corruption.

Christian.
Suren Baghdasaryan Feb. 9, 2021, 5:33 p.m. UTC | #6
On Tue, Feb 9, 2021 at 4:57 AM Christian König <christian.koenig@amd.com> wrote:
>
> On 09.02.21 at 13:11, Christian König wrote:
> > [SNIP]
> >>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
> >>>> +{
> >>>> +     spin_lock(&pool->lock);
> >>>> +     list_add_tail(&page->lru, &pool->items);
> >>>> +     pool->count++;
> >>>> +     atomic_long_add(1 << pool->order, &total_pages);
> >>>> +     spin_unlock(&pool->lock);
> >>>> +
> >>>> +     mod_node_page_state(page_pgdat(page),
> >>>> NR_KERNEL_MISC_RECLAIMABLE,
> >>>> +                         1 << pool->order);
> >>> Hui what? What should that be good for?
> >> This is a carryover from the ION page pool implementation:
> >> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/android/ion/ion_page_pool.c?h=v5.10#n28
> >>
> >>
> >> My sense is it helps with the vmstat/meminfo accounting so folks can
> >> see the cached pages are shrinkable/freeable. This maybe falls under
> >> other dmabuf accounting/stats discussions, so I'm happy to remove it
> >> for now, or let the drivers using the shared page pool logic handle
> >> the accounting themselves?
>
> Intentionally separated the discussion for that here.
>
> As far as I can see this is just bluntly incorrect.
>
> Either the page is reclaimable or it is part of our pool and freeable
> through the shrinker, but never ever both.

IIRC the original motivation for counting ION pooled pages as
reclaimable was to include them into /proc/meminfo's MemAvailable
calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
non-slab kernel pages" seems like a good place to account for them but
I might be wrong.

>
> In the best case this just messes up the accounting, in the worst case
> it can cause memory corruption.
>
> Christian.
Christian König Feb. 9, 2021, 5:46 p.m. UTC | #7
On 09.02.21 at 18:33, Suren Baghdasaryan wrote:
> On Tue, Feb 9, 2021 at 4:57 AM Christian König <christian.koenig@amd.com> wrote:
>> On 09.02.21 at 13:11, Christian König wrote:
>>> [SNIP]
>>>>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
>>>>>> +{
>>>>>> +     spin_lock(&pool->lock);
>>>>>> +     list_add_tail(&page->lru, &pool->items);
>>>>>> +     pool->count++;
>>>>>> +     atomic_long_add(1 << pool->order, &total_pages);
>>>>>> +     spin_unlock(&pool->lock);
>>>>>> +
>>>>>> +     mod_node_page_state(page_pgdat(page),
>>>>>> NR_KERNEL_MISC_RECLAIMABLE,
>>>>>> +                         1 << pool->order);
>>>>> Hui what? What should that be good for?
>>>> This is a carryover from the ION page pool implementation:
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/android/ion/ion_page_pool.c?h=v5.10#n28
>>>>
>>>>
>>>> My sense is it helps with the vmstat/meminfo accounting so folks can
>>>> see the cached pages are shrinkable/freeable. This maybe falls under
>>>> other dmabuf accounting/stats discussions, so I'm happy to remove it
>>>> for now, or let the drivers using the shared page pool logic handle
>>>> the accounting themselves?
>> Intentionally separated the discussion for that here.
>>
>> As far as I can see this is just bluntly incorrect.
>>
>> Either the page is reclaimable or it is part of our pool and freeable
>> through the shrinker, but never ever both.
> IIRC the original motivation for counting ION pooled pages as
> reclaimable was to include them into /proc/meminfo's MemAvailable
> calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
> non-slab kernel pages" seems like a good place to account for them but
> I might be wrong.

Yeah, that's what I see here as well. But exactly that is utterly nonsense.

Those pages are not "free" in the sense that get_free_page could return 
them directly.

Regards,
Christian.

>
>> In the best case this just messes up the accounting, in the worst case
>> it can cause memory corruption.
>>
>> Christian.
John Stultz Feb. 9, 2021, 5:51 p.m. UTC | #8
On Tue, Feb 9, 2021 at 4:11 AM Christian König <christian.koenig@amd.com> wrote:
>
>
>
> On 05.02.21 at 21:46, John Stultz wrote:
> > On Fri, Feb 5, 2021 at 12:47 AM Christian König
> > <christian.koenig@amd.com> wrote:
> >> On 05.02.21 at 09:06, John Stultz wrote:
> >>> diff --git a/drivers/gpu/drm/page_pool.c b/drivers/gpu/drm/page_pool.c
> >>> new file mode 100644
> >>> index 000000000000..2139f86e6ca7
> >>> --- /dev/null
> >>> +++ b/drivers/gpu/drm/page_pool.c
> >>> @@ -0,0 +1,220 @@
> >>> +// SPDX-License-Identifier: GPL-2.0
> >> Please use a BSD/MIT compatible license if you want to copy this from
> >> the TTM code.
> > Hrm. This may be problematic, as it's not just TTM code, but some of
> > the TTM logic integrated into a page-pool implementation I wrote based
> > on logic from the ION code (which was GPL-2.0 before it was dropped).
> > So I don't think I can just make it MIT.  Any extra context on the
> > need for MIT, or suggestions on how to best resolve this?
>
> Most of the DRM/TTM code is also licensable under the BSD/MIT-style
> license since we want to enable the BSD guys to port it over as well.
>
> What stuff do you need from the ION code that you can't just code
> yourself? As far as I have seen this is like 99% code from the TTM pool.

Yea, it evolved into being mostly logic from the TTM pool (or code
that was very similar to begin with), but it's not where it started.
My old days at IBM make me wary of claiming it's no longer the Ship
of Theseus.

So instead I think I'll have to just throw out my patch and rewrite it
in full (so apologies in advance for any review issues I
introduce/reintroduce).

thanks
-john
Suren Baghdasaryan Feb. 9, 2021, 5:52 p.m. UTC | #9
On Tue, Feb 9, 2021 at 9:46 AM Christian König <christian.koenig@amd.com> wrote:
>
>
>
> On 09.02.21 at 18:33, Suren Baghdasaryan wrote:
> > On Tue, Feb 9, 2021 at 4:57 AM Christian König <christian.koenig@amd.com> wrote:
> >> On 09.02.21 at 13:11, Christian König wrote:
> >>> [SNIP]
> >>>>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
> >>>>>> +{
> >>>>>> +     spin_lock(&pool->lock);
> >>>>>> +     list_add_tail(&page->lru, &pool->items);
> >>>>>> +     pool->count++;
> >>>>>> +     atomic_long_add(1 << pool->order, &total_pages);
> >>>>>> +     spin_unlock(&pool->lock);
> >>>>>> +
> >>>>>> +     mod_node_page_state(page_pgdat(page),
> >>>>>> NR_KERNEL_MISC_RECLAIMABLE,
> >>>>>> +                         1 << pool->order);
> >>>>> Hui what? What should that be good for?
> >>>> This is a carryover from the ION page pool implementation:
> >>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/android/ion/ion_page_pool.c?h=v5.10#n28
> >>>>
> >>>>
> >>>> My sense is it helps with the vmstat/meminfo accounting so folks can
> >>>> see the cached pages are shrinkable/freeable. This maybe falls under
> >>>> other dmabuf accounting/stats discussions, so I'm happy to remove it
> >>>> for now, or let the drivers using the shared page pool logic handle
> >>>> the accounting themselves?
> >> Intentionally separated the discussion for that here.
> >>
> >> As far as I can see this is just bluntly incorrect.
> >>
> >> Either the page is reclaimable or it is part of our pool and freeable
> >> through the shrinker, but never ever both.
> > IIRC the original motivation for counting ION pooled pages as
> > reclaimable was to include them into /proc/meminfo's MemAvailable
> > calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
> > non-slab kernel pages" seems like a good place to account for them but
> > I might be wrong.
>
> Yeah, that's what I see here as well. But exactly that is utterly nonsense.
>
> Those pages are not "free" in the sense that get_free_page could return
> them directly.

Any ideas where these pages would fit better? We do want to know that
under memory pressure these pages can be made available (which is I
think what MemAvailable means).

>
> Regards,
> Christian.
>
> >
> >> In the best case this just messes up the accounting, in the worst case
> >> it can cause memory corruption.
> >>
> >> Christian.
>
Daniel Vetter Feb. 9, 2021, 8:03 p.m. UTC | #10
On Tue, Feb 9, 2021 at 6:46 PM Christian König <christian.koenig@amd.com> wrote:
>
>
>
> On 09.02.21 at 18:33, Suren Baghdasaryan wrote:
> > On Tue, Feb 9, 2021 at 4:57 AM Christian König <christian.koenig@amd.com> wrote:
> >> On 09.02.21 at 13:11, Christian König wrote:
> >>> [SNIP]
> >>>>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
> >>>>>> +{
> >>>>>> +     spin_lock(&pool->lock);
> >>>>>> +     list_add_tail(&page->lru, &pool->items);
> >>>>>> +     pool->count++;
> >>>>>> +     atomic_long_add(1 << pool->order, &total_pages);
> >>>>>> +     spin_unlock(&pool->lock);
> >>>>>> +
> >>>>>> +     mod_node_page_state(page_pgdat(page),
> >>>>>> NR_KERNEL_MISC_RECLAIMABLE,
> >>>>>> +                         1 << pool->order);
> >>>>> Hui what? What should that be good for?
> >>>> This is a carryover from the ION page pool implementation:
> >>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/android/ion/ion_page_pool.c?h=v5.10#n28
> >>>>
> >>>>
> >>>> My sense is it helps with the vmstat/meminfo accounting so folks can
> >>>> see the cached pages are shrinkable/freeable. This maybe falls under
> >>>> other dmabuf accounting/stats discussions, so I'm happy to remove it
> >>>> for now, or let the drivers using the shared page pool logic handle
> >>>> the accounting themselves?
> >> Intentionally separated the discussion for that here.
> >>
> >> As far as I can see this is just bluntly incorrect.
> >>
> >> Either the page is reclaimable or it is part of our pool and freeable
> >> through the shrinker, but never ever both.
> > IIRC the original motivation for counting ION pooled pages as
> > reclaimable was to include them into /proc/meminfo's MemAvailable
> > calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
> > non-slab kernel pages" seems like a good place to account for them but
> > I might be wrong.
>
> Yeah, that's what I see here as well. But exactly that is utterly nonsense.
>
> Those pages are not "free" in the sense that get_free_page could return
> them directly.

Well on Android that is kinda true, because Android has its
oom-killer (way back it was just a shrinker callback, not sure how it
works now), which just shot down all the background apps. So at least
some of that (everything used by background apps) is indeed
reclaimable on Android.

But that doesn't hold on Linux in general, so we can't really do this
for common code.

Also I had a long meeting with Suren, John and other googles
yesterday, and the aim is now to try and support all the Android gpu
memory accounting needs with cgroups. That should work, and it will
allow Android to handle all the Android-ism in a clean way in upstream
code. Or that's at least the plan.

I think the only thing we identified that Android still needs on top
is the dma-buf sysfs stuff, so that shared buffers (which on Android
are always dma-buf, and always stay around as dma-buf fd throughout
their lifetime) can be listed/analyzed with full detail.

But aside from this the plan for all the per-process or per-heap
account, oom-killer integration and everything else is planned to be
done with cgroups. Android (for now) only needs to account overall gpu
memory since none of it is swappable on android drivers anyway, plus
no vram, so not much needed.

Cheers, Daniel

>
> Regards,
> Christian.
>
> >
> >> In the best case this just messes up the accounting, in the worst case
> >> it can cause memory corruption.
> >>
> >> Christian.
>
Suren Baghdasaryan Feb. 9, 2021, 8:16 p.m. UTC | #11
On Tue, Feb 9, 2021 at 12:03 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Tue, Feb 9, 2021 at 6:46 PM Christian König <christian.koenig@amd.com> wrote:
> >
> >
> >
> > On 09.02.21 at 18:33, Suren Baghdasaryan wrote:
> > > On Tue, Feb 9, 2021 at 4:57 AM Christian König <christian.koenig@amd.com> wrote:
> > >> On 09.02.21 at 13:11, Christian König wrote:
> > >>> [SNIP]
> > >>>>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
> > >>>>>> +{
> > >>>>>> +     spin_lock(&pool->lock);
> > >>>>>> +     list_add_tail(&page->lru, &pool->items);
> > >>>>>> +     pool->count++;
> > >>>>>> +     atomic_long_add(1 << pool->order, &total_pages);
> > >>>>>> +     spin_unlock(&pool->lock);
> > >>>>>> +
> > >>>>>> +     mod_node_page_state(page_pgdat(page),
> > >>>>>> NR_KERNEL_MISC_RECLAIMABLE,
> > >>>>>> +                         1 << pool->order);
> > >>>>> Hui what? What should that be good for?
> > >>>> This is a carryover from the ION page pool implementation:
> > >>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/android/ion/ion_page_pool.c?h=v5.10#n28
> > >>>>
> > >>>>
> > >>>> My sense is it helps with the vmstat/meminfo accounting so folks can
> > >>>> see the cached pages are shrinkable/freeable. This maybe falls under
> > >>>> other dmabuf accounting/stats discussions, so I'm happy to remove it
> > >>>> for now, or let the drivers using the shared page pool logic handle
> > >>>> the accounting themselves?
> > >> Intentionally separated the discussion for that here.
> > >>
> > >> As far as I can see this is just bluntly incorrect.
> > >>
> > >> Either the page is reclaimable or it is part of our pool and freeable
> > >> through the shrinker, but never ever both.
> > > IIRC the original motivation for counting ION pooled pages as
> > > reclaimable was to include them into /proc/meminfo's MemAvailable
> > > calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
> > > non-slab kernel pages" seems like a good place to account for them but
> > > I might be wrong.
> >
> > Yeah, that's what I see here as well. But exactly that is utterly nonsense.
> >
> > Those pages are not "free" in the sense that get_free_page could return
> > them directly.
>
> Well on Android that is kinda true, because Android has its
> oom-killer (way back it was just a shrinker callback, not sure how it
> works now), which just shot down all the background apps. So at least
> some of that (everything used by background apps) is indeed
> reclaimable on Android.
>
> But that doesn't hold on Linux in general, so we can't really do this
> for common code.
>
> Also I had a long meeting with Suren, John and other googles
> yesterday, and the aim is now to try and support all the Android gpu
> memory accounting needs with cgroups. That should work, and it will
> allow Android to handle all the Android-ism in a clean way in upstream
> code. Or that's at least the plan.
>
> I think the only thing we identified that Android still needs on top
> is the dma-buf sysfs stuff, so that shared buffers (which on Android
> are always dma-buf, and always stay around as dma-buf fd throughout
> their lifetime) can be listed/analyzed with full detail.
>
> But aside from this the plan for all the per-process or per-heap
> account, oom-killer integration and everything else is planned to be
> done with cgroups.

Until cgroups are ready we probably will need to add a sysfs node to
report the total dmabuf pool size and I think that would cover our
current accounting need here.
As I mentioned, not including dmabuf pools into MemAvailable would
affect that stat and I'm wondering if pools should be considered as
part of MemAvailable or not. Since MemAvailable includes SReclaimable
I think it makes sense to include them but maybe there are other
considerations that I'm missing?

> Android (for now) only needs to account overall gpu
> memory since none of it is swappable on android drivers anyway, plus
> no vram, so not much needed.
>
> Cheers, Daniel
>
> >
> > Regards,
> > Christian.
> >
> > >
> > >> In the best case this just messes up the accounting, in the worst case
> > >> it can cause memory corruption.
> > >>
> > >> Christian.
> >
>
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Daniel Vetter Feb. 10, 2021, 1:06 p.m. UTC | #12
On Tue, Feb 09, 2021 at 12:16:51PM -0800, Suren Baghdasaryan wrote:
> On Tue, Feb 9, 2021 at 12:03 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > On Tue, Feb 9, 2021 at 6:46 PM Christian König <christian.koenig@amd.com> wrote:
> > >
> > >
> > >
> > > On 09.02.21 at 18:33, Suren Baghdasaryan wrote:
> > > > On Tue, Feb 9, 2021 at 4:57 AM Christian König <christian.koenig@amd.com> wrote:
> > > >> On 09.02.21 at 13:11, Christian König wrote:
> > > >>> [SNIP]
> > > >>>>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
> > > >>>>>> +{
> > > >>>>>> +     spin_lock(&pool->lock);
> > > >>>>>> +     list_add_tail(&page->lru, &pool->items);
> > > >>>>>> +     pool->count++;
> > > >>>>>> +     atomic_long_add(1 << pool->order, &total_pages);
> > > >>>>>> +     spin_unlock(&pool->lock);
> > > >>>>>> +
> > > >>>>>> +     mod_node_page_state(page_pgdat(page),
> > > >>>>>> NR_KERNEL_MISC_RECLAIMABLE,
> > > >>>>>> +                         1 << pool->order);
> > > >>>>> Hui what? What should that be good for?
> > > >>>> This is a carryover from the ION page pool implementation:
> > > >>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/android/ion/ion_page_pool.c?h=v5.10#n28
> > > >>>>
> > > >>>>
> > > >>>> My sense is it helps with the vmstat/meminfo accounting so folks can
> > > >>>> see the cached pages are shrinkable/freeable. This maybe falls under
> > > >>>> other dmabuf accounting/stats discussions, so I'm happy to remove it
> > > >>>> for now, or let the drivers using the shared page pool logic handle
> > > >>>> the accounting themselves?
> > > >> Intentionally separated the discussion for that here.
> > > >>
> > > >> As far as I can see this is just bluntly incorrect.
> > > >>
> > > >> Either the page is reclaimable or it is part of our pool and freeable
> > > >> through the shrinker, but never ever both.
> > > > IIRC the original motivation for counting ION pooled pages as
> > > > reclaimable was to include them into /proc/meminfo's MemAvailable
> > > > calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
> > > > non-slab kernel pages" seems like a good place to account for them but
> > > > I might be wrong.
> > >
> > > Yeah, that's what I see here as well. But exactly that is utterly nonsense.
> > >
> > > Those pages are not "free" in the sense that get_free_page could return
> > > them directly.
> >
> > Well on Android that is kinda true, because Android has its
> > oom-killer (way back it was just a shrinker callback, not sure how it
> > works now), which just shot down all the background apps. So at least
> > some of that (everything used by background apps) is indeed
> > reclaimable on Android.
> >
> > But that doesn't hold on Linux in general, so we can't really do this
> > for common code.
> >
> > Also I had a long meeting with Suren, John and other googles
> > yesterday, and the aim is now to try and support all the Android gpu
> > memory accounting needs with cgroups. That should work, and it will
> > allow Android to handle all the Android-ism in a clean way in upstream
> > code. Or that's at least the plan.
> >
> > I think the only thing we identified that Android still needs on top
> > is the dma-buf sysfs stuff, so that shared buffers (which on Android
> > are always dma-buf, and always stay around as dma-buf fd throughout
> > their lifetime) can be listed/analyzed with full detail.
> >
> > But aside from this the plan for all the per-process or per-heap
> > account, oom-killer integration and everything else is planned to be
> > done with cgroups.
> 
> Until cgroups are ready we probably will need to add a sysfs node to
> report the total dmabuf pool size and I think that would cover our
> current accounting need here.
> As I mentioned, not including dmabuf pools into MemAvailable would
> affect that stat and I'm wondering if pools should be considered as
> part of MemAvailable or not. Since MemAvailable includes SReclaimable
> I think it makes sense to include them but maybe there are other
> considerations that I'm missing?

On Android, yes, on upstream, not so much. Because upstream doesn't have
the android low memory killer cleaning up all the apps, so effectively we
can't reclaim that memory, and we shouldn't report it as such.
-Daniel

> 
> > Android (for now) only needs to account overall gpu
> > memory since none of it is swappable on android drivers anyway, plus
> > no vram, so not much needed.
> >
> > Cheers, Daniel
> >
> > >
> > > Regards,
> > > Christian.
> > >
> > > >
> > > >> In the best case this just messes up the accounting, in the worst case
> > > >> it can cause memory corruption.
> > > >>
> > > >> Christian.
> > >
> >
> >
> > --
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch
Suren Baghdasaryan Feb. 10, 2021, 4:39 p.m. UTC | #13
On Wed, Feb 10, 2021 at 5:06 AM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Tue, Feb 09, 2021 at 12:16:51PM -0800, Suren Baghdasaryan wrote:
> > On Tue, Feb 9, 2021 at 12:03 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > >
> > > On Tue, Feb 9, 2021 at 6:46 PM Christian König <christian.koenig@amd.com> wrote:
> > > >
> > > >
> > > >
> > > > On 09.02.21 at 18:33, Suren Baghdasaryan wrote:
> > > > > On Tue, Feb 9, 2021 at 4:57 AM Christian König <christian.koenig@amd.com> wrote:
> > > > >> On 09.02.21 at 13:11, Christian König wrote:
> > > > >>> [SNIP]
> > > > >>>>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
> > > > >>>>>> +{
> > > > >>>>>> +     spin_lock(&pool->lock);
> > > > >>>>>> +     list_add_tail(&page->lru, &pool->items);
> > > > >>>>>> +     pool->count++;
> > > > >>>>>> +     atomic_long_add(1 << pool->order, &total_pages);
> > > > >>>>>> +     spin_unlock(&pool->lock);
> > > > >>>>>> +
> > > > >>>>>> +     mod_node_page_state(page_pgdat(page),
> > > > >>>>>> NR_KERNEL_MISC_RECLAIMABLE,
> > > > >>>>>> +                         1 << pool->order);
> > > > >>>>> Hui what? What should that be good for?
> > > > >>>> This is a carryover from the ION page pool implementation:
> > > > >>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/android/ion/ion_page_pool.c?h=v5.10#n28
> > > > >>>>
> > > > >>>>
> > > > >>>> My sense is it helps with the vmstat/meminfo accounting so folks can
> > > > >>>> see the cached pages are shrinkable/freeable. This maybe falls under
> > > > >>>> other dmabuf accounting/stats discussions, so I'm happy to remove it
> > > > >>>> for now, or let the drivers using the shared page pool logic handle
> > > > >>>> the accounting themselves?
> > > > >> Intentionally separated the discussion for that here.
> > > > >>
> > > > >> As far as I can see this is just bluntly incorrect.
> > > > >>
> > > > >> Either the page is reclaimable or it is part of our pool and freeable
> > > > >> through the shrinker, but never ever both.
> > > > > IIRC the original motivation for counting ION pooled pages as
> > > > > reclaimable was to include them into /proc/meminfo's MemAvailable
> > > > > calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
> > > > > non-slab kernel pages" seems like a good place to account for them but
> > > > > I might be wrong.
> > > >
> > > > Yeah, that's what I see here as well. But exactly that is utterly nonsense.
> > > >
> > > > Those pages are not "free" in the sense that get_free_page could return
> > > > them directly.
> > >
> > > Well on Android that is kinda true, because Android has its
> > > oom-killer (way back it was just a shrinker callback, not sure how it
> > > works now), which just shot down all the background apps. So at least
> > > some of that (everything used by background apps) is indeed
> > > reclaimable on Android.
> > >
> > > But that doesn't hold on Linux in general, so we can't really do this
> > > for common code.
> > >
> > > Also I had a long meeting with Suren, John and other googles
> > > yesterday, and the aim is now to try and support all the Android gpu
> > > memory accounting needs with cgroups. That should work, and it will
> > > allow Android to handle all the Android-ism in a clean way in upstream
> > > code. Or that's at least the plan.
> > >
> > > I think the only thing we identified that Android still needs on top
> > > is the dma-buf sysfs stuff, so that shared buffers (which on Android
> > > are always dma-buf, and always stay around as dma-buf fd throughout
> > > their lifetime) can be listed/analyzed with full detail.
> > >
> > > But aside from this the plan for all the per-process or per-heap
> > > account, oom-killer integration and everything else is planned to be
> > > done with cgroups.
> >
> > Until cgroups are ready we probably will need to add a sysfs node to
> > report the total dmabuf pool size and I think that would cover our
> > current accounting need here.
> > As I mentioned, not including dmabuf pools into MemAvailable would
> > affect that stat and I'm wondering if pools should be considered as
> > part of MemAvailable or not. Since MemAvailable includes SReclaimable
> > I think it makes sense to include them but maybe there are other
> > considerations that I'm missing?
>
> On Android, yes, on upstream, not so much. Because upstream doesn't have
> the android low memory killer cleanup up all the apps, so effectively we
> can't reclaim that memory, and we shouldn't report it as such.
> -Daniel

Hmm. Sorry, I fail to see why Android's low memory killer makes a
difference here. In my mind, the pages in the pools are not used but
kept there in case heaps need them (maybe that's the part I'm wrong about?).
These pages can be freed by the shrinker if memory pressure rises. In
that sense I think it's very similar to reclaimable slabs which are
already accounted as part of MemAvailable. So it seems logical to me
to include unused pages in the pools here as well. What am I missing?

>
> >
> > > Android (for now) only needs to account overall gpu
> > > memory since none of it is swappable on android drivers anyway, plus
> > > no vram, so not much needed.
> > >
> > > Cheers, Daniel
> > >
> > > >
> > > > Regards,
> > > > Christian.
> > > >
> > > > >
> > > > >> In the best case this just messes up the accounting, in the worst case
> > > > >> it can cause memory corruption.
> > > > >>
> > > > >> Christian.
> > > >
> > >
> > >
> > > --
> > > Daniel Vetter
> > > Software Engineer, Intel Corporation
> > > http://blog.ffwll.ch
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Daniel Vetter Feb. 10, 2021, 5:21 p.m. UTC | #14
On Wed, Feb 10, 2021 at 5:39 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Wed, Feb 10, 2021 at 5:06 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> >
> > On Tue, Feb 09, 2021 at 12:16:51PM -0800, Suren Baghdasaryan wrote:
> > > On Tue, Feb 9, 2021 at 12:03 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > >
> > > > On Tue, Feb 9, 2021 at 6:46 PM Christian König <christian.koenig@amd.com> wrote:
> > > > >
> > > > >
> > > > >
> > > > > Am 09.02.21 um 18:33 schrieb Suren Baghdasaryan:
> > > > > > On Tue, Feb 9, 2021 at 4:57 AM Christian König <christian.koenig@amd.com> wrote:
> > > > > >> Am 09.02.21 um 13:11 schrieb Christian König:
> > > > > >>> [SNIP]
> > > > > >>>>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
> > > > > >>>>>> +{
> > > > > >>>>>> +     spin_lock(&pool->lock);
> > > > > >>>>>> +     list_add_tail(&page->lru, &pool->items);
> > > > > >>>>>> +     pool->count++;
> > > > > >>>>>> +     atomic_long_add(1 << pool->order, &total_pages);
> > > > > >>>>>> +     spin_unlock(&pool->lock);
> > > > > >>>>>> +
> > > > > >>>>>> +     mod_node_page_state(page_pgdat(page),
> > > > > >>>>>> NR_KERNEL_MISC_RECLAIMABLE,
> > > > > >>>>>> +                         1 << pool->order);
> > > > > >>>>> Hui what? What should that be good for?
> > > > > >>>> This is a carryover from the ION page pool implementation:
> > > > > >>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/android/ion/ion_page_pool.c?h=v5.10#n28
> > > > > >>>>
> > > > > >>>>
> > > > > >>>> My sense is it helps with the vmstat/meminfo accounting so folks can
> > > > > >>>> see the cached pages are shrinkable/freeable. This maybe falls under
> > > > > >>>> other dmabuf accounting/stats discussions, so I'm happy to remove it
> > > > > >>>> for now, or let the drivers using the shared page pool logic handle
> > > > > >>>> the accounting themselves?
> > > > > >> Intentionally separated the discussion for that here.
> > > > > >>
> > > > > >> As far as I can see this is just bluntly incorrect.
> > > > > >>
> > > > > >> Either the page is reclaimable or it is part of our pool and freeable
> > > > > >> through the shrinker, but never ever both.
> > > > > > IIRC the original motivation for counting ION pooled pages as
> > > > > > reclaimable was to include them into /proc/meminfo's MemAvailable
> > > > > > calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
> > > > > > non-slab kernel pages" seems like a good place to account for them but
> > > > > > I might be wrong.
> > > > >
> > > > > Yeah, that's what I see here as well. But exactly that is utterly nonsense.
> > > > >
> > > > > Those pages are not "free" in the sense that get_free_page could return
> > > > > them directly.
> > > >
> > > > Well on Android that is kinda true, because Android has it's
> > > > oom-killer (way back it was just a shrinker callback, not sure how it
> > > > works now), which just shot down all the background apps. So at least
> > > > some of that (everything used by background apps) is indeed
> > > > reclaimable on Android.
> > > >
> > > > But that doesn't hold on Linux in general, so we can't really do this
> > > > for common code.
> > > >
> > > > Also I had a long meeting with Suren, John and other googles
> > > > yesterday, and the aim is now to try and support all the Android gpu
> > > > memory accounting needs with cgroups. That should work, and it will
> > > > allow Android to handle all the Android-ism in a clean way in upstream
> > > > code. Or that's at least the plan.
> > > >
> > > > I think the only thing we identified that Android still needs on top
> > > > is the dma-buf sysfs stuff, so that shared buffers (which on Android
> > > > are always dma-buf, and always stay around as dma-buf fd throughout
> > > > their lifetime) can be listed/analyzed with full detail.
> > > >
> > > > But aside from this the plan for all the per-process or per-heap
> > > > account, oom-killer integration and everything else is planned to be
> > > > done with cgroups.
> > >
> > > Until cgroups are ready we probably will need to add a sysfs node to
> > > report the total dmabuf pool size and I think that would cover our
> > > current accounting need here.
> > > As I mentioned, not including dmabuf pools into MemAvailable would
> > > affect that stat and I'm wondering if pools should be considered as
> > > part of MemAvailable or not. Since MemAvailable includes SReclaimable
> > > I think it makes sense to include them but maybe there are other
> > > considerations that I'm missing?
> >
> > On Android, yes, on upstream, not so much. Because upstream doesn't have
> > the android low memory killer cleanup up all the apps, so effectively we
> > can't reclaim that memory, and we shouldn't report it as such.
> > -Daniel
>
> Hmm. Sorry, I fail to see why Android's low memory killer makes a
> difference here. In my mind, the pages in the pools are not used but
> kept there in case heaps need them (maybe that's the part I'm wrong about?).
> These pages can be freed by the shrinker if memory pressure rises. In
> that sense I think it's very similar to reclaimable slabs which are
> already accounted as part of MemAvailable. So it seems logical to me
> to include unused pages in the pools here as well. What am I missing?

Ah yes, those page pool pages we can list. But conceptually (at least
in the internals) they're shrunk through mm shrinker callbacks, like
slab cache memory. So not exactly sure where to list that.
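
(For context, the mechanism here is just the shrinker API with a
count/scan pair, the same way slab caches hook into reclaim. Rough
sketch with made-up names, pooled_pages and pool_drop_cached_pages()
standing in for whatever counter and helper the pool keeps; the patch
in this series registers essentially this shape:

static unsigned long pool_shrink_count(struct shrinker *s,
				       struct shrink_control *sc)
{
	unsigned long n = atomic_long_read(&pooled_pages);

	/* SHRINK_EMPTY tells the VM there is nothing to reclaim here */
	return n ? n : SHRINK_EMPTY;
}

static unsigned long pool_shrink_scan(struct shrinker *s,
				      struct shrink_control *sc)
{
	/* drop up to sc->nr_to_scan cached pages, return how many */
	return pool_drop_cached_pages(sc->nr_to_scan);
}

static struct shrinker pool_shrinker = {
	.count_objects = pool_shrink_count,
	.scan_objects  = pool_shrink_scan,
	.seeks = 1,
};
/* register_shrinker(&pool_shrinker) at init time */
)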

Since we have the same pools for gpu allocations on the ttm side and
John is looking into unifying those, maybe we could add that as a
patch on top? For some nice consistency across all gpu drivers from
android to discrete. I think if you, John and Christian from ttm side
can figure out how these page pools should be reported we'll have
something that fits? Maybe John can ping you on the other thread with
the shared pool rfc between ttm and dma-buf heaps (there's so much
going on right now all over the place that I'm a bit lost).

Cheers, Daniel

>
> >
> > >
> > > > Android (for now) only needs to account overall gpu
> > > > memory since none of it is swappable on android drivers anyway, plus
> > > > no vram, so not much needed.
> > > >
> > > > Cheers, Daniel
> > > >
> > > > >
> > > > > Regards,
> > > > > Christian.
> > > > >
> > > > > >
> > > > > >> In the best case this just messes up the accounting, in the worst case
> > > > > >> it can cause memory corruption.
> > > > > >>
> > > > > >> Christian.
> > > > >
> > > >
> > > >
> > > > --
> > > > Daniel Vetter
> > > > Software Engineer, Intel Corporation
> > > > http://blog.ffwll.ch
> >
> > --
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch
Suren Baghdasaryan Feb. 10, 2021, 5:28 p.m. UTC | #15
On Wed, Feb 10, 2021 at 9:21 AM Daniel Vetter <daniel@ffwll.ch> wrote:
>
> On Wed, Feb 10, 2021 at 5:39 PM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Wed, Feb 10, 2021 at 5:06 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> > >
> > > On Tue, Feb 09, 2021 at 12:16:51PM -0800, Suren Baghdasaryan wrote:
> > > > On Tue, Feb 9, 2021 at 12:03 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> > > > >
> > > > > On Tue, Feb 9, 2021 at 6:46 PM Christian König <christian.koenig@amd.com> wrote:
> > > > > >
> > > > > >
> > > > > >
> > > > > > Am 09.02.21 um 18:33 schrieb Suren Baghdasaryan:
> > > > > > > On Tue, Feb 9, 2021 at 4:57 AM Christian König <christian.koenig@amd.com> wrote:
> > > > > > >> Am 09.02.21 um 13:11 schrieb Christian König:
> > > > > > >>> [SNIP]
> > > > > > >>>>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
> > > > > > >>>>>> +{
> > > > > > >>>>>> +     spin_lock(&pool->lock);
> > > > > > >>>>>> +     list_add_tail(&page->lru, &pool->items);
> > > > > > >>>>>> +     pool->count++;
> > > > > > >>>>>> +     atomic_long_add(1 << pool->order, &total_pages);
> > > > > > >>>>>> +     spin_unlock(&pool->lock);
> > > > > > >>>>>> +
> > > > > > >>>>>> +     mod_node_page_state(page_pgdat(page),
> > > > > > >>>>>> NR_KERNEL_MISC_RECLAIMABLE,
> > > > > > >>>>>> +                         1 << pool->order);
> > > > > > >>>>> Hui what? What should that be good for?
> > > > > > >>>> This is a carryover from the ION page pool implementation:
> > > > > > >>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/android/ion/ion_page_pool.c?h=v5.10#n28
> > > > > > >>>>
> > > > > > >>>>
> > > > > > >>>> My sense is it helps with the vmstat/meminfo accounting so folks can
> > > > > > >>>> see the cached pages are shrinkable/freeable. This maybe falls under
> > > > > > >>>> other dmabuf accounting/stats discussions, so I'm happy to remove it
> > > > > > >>>> for now, or let the drivers using the shared page pool logic handle
> > > > > > >>>> the accounting themselves?
> > > > > > >> Intentionally separated the discussion for that here.
> > > > > > >>
> > > > > > >> As far as I can see this is just bluntly incorrect.
> > > > > > >>
> > > > > > >> Either the page is reclaimable or it is part of our pool and freeable
> > > > > > >> through the shrinker, but never ever both.
> > > > > > > IIRC the original motivation for counting ION pooled pages as
> > > > > > > reclaimable was to include them into /proc/meminfo's MemAvailable
> > > > > > > calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
> > > > > > > non-slab kernel pages" seems like a good place to account for them but
> > > > > > > I might be wrong.
> > > > > >
> > > > > > Yeah, that's what I see here as well. But exactly that is utterly nonsense.
> > > > > >
> > > > > > Those pages are not "free" in the sense that get_free_page could return
> > > > > > them directly.
> > > > >
> > > > > Well on Android that is kinda true, because Android has it's
> > > > > oom-killer (way back it was just a shrinker callback, not sure how it
> > > > > works now), which just shot down all the background apps. So at least
> > > > > some of that (everything used by background apps) is indeed
> > > > > reclaimable on Android.
> > > > >
> > > > > But that doesn't hold on Linux in general, so we can't really do this
> > > > > for common code.
> > > > >
> > > > > Also I had a long meeting with Suren, John and other googles
> > > > > yesterday, and the aim is now to try and support all the Android gpu
> > > > > memory accounting needs with cgroups. That should work, and it will
> > > > > allow Android to handle all the Android-ism in a clean way in upstream
> > > > > code. Or that's at least the plan.
> > > > >
> > > > > I think the only thing we identified that Android still needs on top
> > > > > is the dma-buf sysfs stuff, so that shared buffers (which on Android
> > > > > are always dma-buf, and always stay around as dma-buf fd throughout
> > > > > their lifetime) can be listed/analyzed with full detail.
> > > > >
> > > > > But aside from this the plan for all the per-process or per-heap
> > > > > account, oom-killer integration and everything else is planned to be
> > > > > done with cgroups.
> > > >
> > > > Until cgroups are ready we probably will need to add a sysfs node to
> > > > report the total dmabuf pool size and I think that would cover our
> > > > current accounting need here.
> > > > As I mentioned, not including dmabuf pools into MemAvailable would
> > > > affect that stat and I'm wondering if pools should be considered as
> > > > part of MemAvailable or not. Since MemAvailable includes SReclaimable
> > > > I think it makes sense to include them but maybe there are other
> > > > considerations that I'm missing?
> > >
> > > On Android, yes, on upstream, not so much. Because upstream doesn't have
> > > the android low memory killer cleanup up all the apps, so effectively we
> > > can't reclaim that memory, and we shouldn't report it as such.
> > > -Daniel
> >
> > Hmm. Sorry, I fail to see why Android's low memory killer makes a
> > difference here. In my mind, the pages in the pools are not used but
> > kept there in case heaps need them (maybe that's the part I'm wrong about?).
> > These pages can be freed by the shrinker if memory pressure rises. In
> > that sense I think it's very similar to reclaimable slabs which are
> > already accounted as part of MemAvailable. So it seems logical to me
> > to include unused pages in the pools here as well. What am I missing?
>
> Ah yes, those page pool pages we can list. But conceptually (at least
> in the internals) they're shrunk through mm shrinker callbacks, like
> slab cache memory. So not exactly sure where to list that.
>
> Since we have the same pools for gpu allocations on the ttm side and
> John is looking into unifying those, maybe we could add that as a
> patch on top? For some nice consistency across all gpu drivers from
> android to discrete. I think if you, John and Christian from ttm side
> can figure out how these page pools should be reported we'll have
> something that fits? Maybe John can ping you on the other thread with
> the shared pool rfc between ttm and dma-buf heaps (there's so much
> going on right now all over the place that I'm a bit lost).

Sounds good. I'll follow up with John to see where this discussion fits better.
Thanks!

>
> Cheers, Daniel
>
> >
> > >
> > > >
> > > > > Android (for now) only needs to account overall gpu
> > > > > memory since none of it is swappable on android drivers anyway, plus
> > > > > no vram, so not much needed.
> > > > >
> > > > > Cheers, Daniel
> > > > >
> > > > > >
> > > > > > Regards,
> > > > > > Christian.
> > > > > >
> > > > > > >
> > > > > > >> In the best case this just messes up the accounting, in the worst case
> > > > > > >> it can cause memory corruption.
> > > > > > >>
> > > > > > >> Christian.
> > > > > >
> > > > >
> > > > >
> > > > > --
> > > > > Daniel Vetter
> > > > > Software Engineer, Intel Corporation
> > > > > http://blog.ffwll.ch
> > >
> > > --
> > > Daniel Vetter
> > > Software Engineer, Intel Corporation
> > > http://blog.ffwll.ch
>
>
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Christian König Feb. 10, 2021, 6:32 p.m. UTC | #16
Am 10.02.21 um 17:39 schrieb Suren Baghdasaryan:
> On Wed, Feb 10, 2021 at 5:06 AM Daniel Vetter <daniel@ffwll.ch> wrote:
>> On Tue, Feb 09, 2021 at 12:16:51PM -0800, Suren Baghdasaryan wrote:
>>> On Tue, Feb 9, 2021 at 12:03 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>>>> On Tue, Feb 9, 2021 at 6:46 PM Christian König <christian.koenig@amd.com> wrote:
>>>>>
>>>>>
>>>>> Am 09.02.21 um 18:33 schrieb Suren Baghdasaryan:
>>>>>> On Tue, Feb 9, 2021 at 4:57 AM Christian König <christian.koenig@amd.com> wrote:
>>>>>>> Am 09.02.21 um 13:11 schrieb Christian König:
>>>>>>>> [SNIP]
>>>>>>>>>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
>>>>>>>>>>> +{
>>>>>>>>>>> +     spin_lock(&pool->lock);
>>>>>>>>>>> +     list_add_tail(&page->lru, &pool->items);
>>>>>>>>>>> +     pool->count++;
>>>>>>>>>>> +     atomic_long_add(1 << pool->order, &total_pages);
>>>>>>>>>>> +     spin_unlock(&pool->lock);
>>>>>>>>>>> +
>>>>>>>>>>> +     mod_node_page_state(page_pgdat(page),
>>>>>>>>>>> NR_KERNEL_MISC_RECLAIMABLE,
>>>>>>>>>>> +                         1 << pool->order);
>>>>>>>>>> Hui what? What should that be good for?
>>>>>>>>> This is a carryover from the ION page pool implementation:
>>>>>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/android/ion/ion_page_pool.c?h=v5.10#n28
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> My sense is it helps with the vmstat/meminfo accounting so folks can
>>>>>>>>> see the cached pages are shrinkable/freeable. This maybe falls under
>>>>>>>>> other dmabuf accounting/stats discussions, so I'm happy to remove it
>>>>>>>>> for now, or let the drivers using the shared page pool logic handle
>>>>>>>>> the accounting themselves?
>>>>>>> Intentionally separated the discussion for that here.
>>>>>>>
>>>>>>> As far as I can see this is just bluntly incorrect.
>>>>>>>
>>>>>>> Either the page is reclaimable or it is part of our pool and freeable
>>>>>>> through the shrinker, but never ever both.
>>>>>> IIRC the original motivation for counting ION pooled pages as
>>>>>> reclaimable was to include them into /proc/meminfo's MemAvailable
>>>>>> calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
>>>>>> non-slab kernel pages" seems like a good place to account for them but
>>>>>> I might be wrong.
>>>>> Yeah, that's what I see here as well. But exactly that is utterly nonsense.
>>>>>
>>>>> Those pages are not "free" in the sense that get_free_page could return
>>>>> them directly.
>>>> Well on Android that is kinda true, because Android has it's
>>>> oom-killer (way back it was just a shrinker callback, not sure how it
>>>> works now), which just shot down all the background apps. So at least
>>>> some of that (everything used by background apps) is indeed
>>>> reclaimable on Android.
>>>>
>>>> But that doesn't hold on Linux in general, so we can't really do this
>>>> for common code.
>>>>
>>>> Also I had a long meeting with Suren, John and other googles
>>>> yesterday, and the aim is now to try and support all the Android gpu
>>>> memory accounting needs with cgroups. That should work, and it will
>>>> allow Android to handle all the Android-ism in a clean way in upstream
>>>> code. Or that's at least the plan.
>>>>
>>>> I think the only thing we identified that Android still needs on top
>>>> is the dma-buf sysfs stuff, so that shared buffers (which on Android
>>>> are always dma-buf, and always stay around as dma-buf fd throughout
>>>> their lifetime) can be listed/analyzed with full detail.
>>>>
>>>> But aside from this the plan for all the per-process or per-heap
>>>> account, oom-killer integration and everything else is planned to be
>>>> done with cgroups.
>>> Until cgroups are ready we probably will need to add a sysfs node to
>>> report the total dmabuf pool size and I think that would cover our
>>> current accounting need here.
>>> As I mentioned, not including dmabuf pools into MemAvailable would
>>> affect that stat and I'm wondering if pools should be considered as
>>> part of MemAvailable or not. Since MemAvailable includes SReclaimable
>>> I think it makes sense to include them but maybe there are other
>>> considerations that I'm missing?
>> On Android, yes, on upstream, not so much. Because upstream doesn't have
>> the android low memory killer cleanup up all the apps, so effectively we
>> can't reclaim that memory, and we shouldn't report it as such.
>> -Daniel
> Hmm. Sorry, I fail to see why Android's low memory killer makes a
> difference here. In my mind, the pages in the pools are not used but
> kept there in case heaps need them (maybe that's the part I'm wrong about?).
> These pages can be freed by the shrinker if memory pressure rises.

And exactly that's the difference. They *can* be freed is not the same 
thing as they *are* free.

> In that sense I think it's very similar to reclaimable slabs which are
> already accounted as part of MemAvailable. So it seems logical to me
> to include unused pages in the pools here as well. What am I missing?

See the shrinkers are there because you need to do some action before 
you can re-use the memory.

In the case of the TTM/DRM pool for example you need to change the 
caching attributes which might cause sleeping for a TLB flush to finish.

By accounting those pages as free you mess up (for example) the handling 
which makes sure that there are enough emergency reserves. I can only 
strongly recommend not doing that.

What you could do is add a sysfs interface to expose the different 
shrinkers and the number of pages in them to userspace, similar to what 
/proc/slabinfo is doing.
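
Even something as simple as a read-only attribute would do as a 
starting point. Rough sketch, attribute name and placement completely 
made up:

static ssize_t pool_size_kb_show(struct kobject *kobj,
				 struct kobj_attribute *attr, char *buf)
{
	/* drm_page_pool_get_total() counts pages; report KiB like slabinfo */
	return sysfs_emit(buf, "%lu\n",
			  drm_page_pool_get_total() << (PAGE_SHIFT - 10));
}

static struct kobj_attribute pool_size_kb_attr = __ATTR_RO(pool_size_kb);

plus a sysfs_create_file() against a suitable kobject. Per-pool counts 
could be exposed the same way, analogous to the per-cache lines in 
/proc/slabinfo.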

Regards,
Christian.

>
>>>> Android (for now) only needs to account overall gpu
>>>> memory since none of it is swappable on android drivers anyway, plus
>>>> no vram, so not much needed.
>>>>
>>>> Cheers, Daniel
>>>>
>>>>> Regards,
>>>>> Christian.
>>>>>
>>>>>>> In the best case this just messes up the accounting, in the worst case
>>>>>>> it can cause memory corruption.
>>>>>>>
>>>>>>> Christian.
>>>>
>>>> --
>>>> Daniel Vetter
>>>> Software Engineer, Intel Corporation
>>>> http://blog.ffwll.ch
>> --
>> Daniel Vetter
>> Software Engineer, Intel Corporation
>> http://blog.ffwll.ch
Suren Baghdasaryan Feb. 10, 2021, 7:12 p.m. UTC | #17
On Wed, Feb 10, 2021 at 10:32 AM Christian König
<christian.koenig@amd.com> wrote:
>
>
>
> Am 10.02.21 um 17:39 schrieb Suren Baghdasaryan:
> > On Wed, Feb 10, 2021 at 5:06 AM Daniel Vetter <daniel@ffwll.ch> wrote:
> >> On Tue, Feb 09, 2021 at 12:16:51PM -0800, Suren Baghdasaryan wrote:
> >>> On Tue, Feb 9, 2021 at 12:03 PM Daniel Vetter <daniel@ffwll.ch> wrote:
> >>>> On Tue, Feb 9, 2021 at 6:46 PM Christian König <christian.koenig@amd.com> wrote:
> >>>>>
> >>>>>
> >>>>> Am 09.02.21 um 18:33 schrieb Suren Baghdasaryan:
> >>>>>> On Tue, Feb 9, 2021 at 4:57 AM Christian König <christian.koenig@amd.com> wrote:
> >>>>>>> Am 09.02.21 um 13:11 schrieb Christian König:
> >>>>>>>> [SNIP]
> >>>>>>>>>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
> >>>>>>>>>>> +{
> >>>>>>>>>>> +     spin_lock(&pool->lock);
> >>>>>>>>>>> +     list_add_tail(&page->lru, &pool->items);
> >>>>>>>>>>> +     pool->count++;
> >>>>>>>>>>> +     atomic_long_add(1 << pool->order, &total_pages);
> >>>>>>>>>>> +     spin_unlock(&pool->lock);
> >>>>>>>>>>> +
> >>>>>>>>>>> +     mod_node_page_state(page_pgdat(page),
> >>>>>>>>>>> NR_KERNEL_MISC_RECLAIMABLE,
> >>>>>>>>>>> +                         1 << pool->order);
> >>>>>>>>>> Hui what? What should that be good for?
> >>>>>>>>> This is a carryover from the ION page pool implementation:
> >>>>>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/android/ion/ion_page_pool.c?h=v5.10#n28
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> My sense is it helps with the vmstat/meminfo accounting so folks can
> >>>>>>>>> see the cached pages are shrinkable/freeable. This maybe falls under
> >>>>>>>>> other dmabuf accounting/stats discussions, so I'm happy to remove it
> >>>>>>>>> for now, or let the drivers using the shared page pool logic handle
> >>>>>>>>> the accounting themselves?
> >>>>>>> Intentionally separated the discussion for that here.
> >>>>>>>
> >>>>>>> As far as I can see this is just bluntly incorrect.
> >>>>>>>
> >>>>>>> Either the page is reclaimable or it is part of our pool and freeable
> >>>>>>> through the shrinker, but never ever both.
> >>>>>> IIRC the original motivation for counting ION pooled pages as
> >>>>>> reclaimable was to include them into /proc/meminfo's MemAvailable
> >>>>>> calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
> >>>>>> non-slab kernel pages" seems like a good place to account for them but
> >>>>>> I might be wrong.
> >>>>> Yeah, that's what I see here as well. But exactly that is utterly nonsense.
> >>>>>
> >>>>> Those pages are not "free" in the sense that get_free_page could return
> >>>>> them directly.
> >>>> Well on Android that is kinda true, because Android has it's
> >>>> oom-killer (way back it was just a shrinker callback, not sure how it
> >>>> works now), which just shot down all the background apps. So at least
> >>>> some of that (everything used by background apps) is indeed
> >>>> reclaimable on Android.
> >>>>
> >>>> But that doesn't hold on Linux in general, so we can't really do this
> >>>> for common code.
> >>>>
> >>>> Also I had a long meeting with Suren, John and other googles
> >>>> yesterday, and the aim is now to try and support all the Android gpu
> >>>> memory accounting needs with cgroups. That should work, and it will
> >>>> allow Android to handle all the Android-ism in a clean way in upstream
> >>>> code. Or that's at least the plan.
> >>>>
> >>>> I think the only thing we identified that Android still needs on top
> >>>> is the dma-buf sysfs stuff, so that shared buffers (which on Android
> >>>> are always dma-buf, and always stay around as dma-buf fd throughout
> >>>> their lifetime) can be listed/analyzed with full detail.
> >>>>
> >>>> But aside from this the plan for all the per-process or per-heap
> >>>> account, oom-killer integration and everything else is planned to be
> >>>> done with cgroups.
> >>> Until cgroups are ready we probably will need to add a sysfs node to
> >>> report the total dmabuf pool size and I think that would cover our
> >>> current accounting need here.
> >>> As I mentioned, not including dmabuf pools into MemAvailable would
> >>> affect that stat and I'm wondering if pools should be considered as
> >>> part of MemAvailable or not. Since MemAvailable includes SReclaimable
> >>> I think it makes sense to include them but maybe there are other
> >>> considerations that I'm missing?
> >> On Android, yes, on upstream, not so much. Because upstream doesn't have
> >> the android low memory killer cleanup up all the apps, so effectively we
> >> can't reclaim that memory, and we shouldn't report it as such.
> >> -Daniel
> > Hmm. Sorry, I fail to see why Android's low memory killer makes a
> > difference here. In my mind, the pages in the pools are not used but
> > kept there in case heaps need them (maybe that's the part I'm wrong about?).
> > These pages can be freed by the shrinker if memory pressure rises.
>
> And exactly that's the difference. They *can* be freed is not the same
> thing as they *are* free.

No argument there. That's why I think meminfo has two separate stats
for MemFree and MemAvailable. MemFree is self-explanatory. The
description of MemAvailable in
https://www.kernel.org/doc/Documentation/filesystems/proc.txt says "An
estimate of how much memory is available for starting new
applications, without swapping.". Since dropping unused pages from
slabs, caches and pools is less expensive than swapping, I would
assume that a well-behaved system would do that before resorting to
swapping. And if so, such memory should be included in MemAvailable
because VM will make it available before swapping. But again, that's
my interpretation. WDYT?
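
FWIW, this also matches what the kernel itself does: si_mem_available(),
which backs MemAvailable, folds NR_KERNEL_MISC_RECLAIMABLE in right next
to reclaimable slab. Paraphrasing mm/page_alloc.c from memory (details
elided, so don't quote me on the exact form):

	available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
	...
	reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
		global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
	available += reclaimable - min(reclaimable / 2, wmark_low);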

>
> > In that sense I think it's very similar to reclaimable slabs which are
> > already accounted as part of MemAvailable. So it seems logical to me
> > to include unused pages in the pools here as well. What am I missing?
>
> See the shrinkers are there because you need to do some action before
> you can re-use the memory.
>
> In the case of the TTM/DRM pool for example you need to change the
> caching attributes which might cause sleeping for a TLB flush to finish.

I see your point here. But how about caches/pools which can be easily
dropped? Shouldn't they be part of MemAvailable?

>
> By accounting those pages as free you mess up (for example) the handling
> which makes sure that there are enough emergency reserves. I can only
> strongly recommend not doing that.
>
> What you could do is add a sysfs interface to expose the different
> shrinkers and the number of pages in them to userspace, similar to what
> /proc/slabinfo is doing.

True, we can work around this. Just want to make sure whatever we do
really makes sense.
Thanks,
Suren.

>
> Regards,
> Christian.
>
> >
> >>>> Android (for now) only needs to account overall gpu
> >>>> memory since none of it is swappable on android drivers anyway, plus
> >>>> no vram, so not much needed.
> >>>>
> >>>> Cheers, Daniel
> >>>>
> >>>>> Regards,
> >>>>> Christian.
> >>>>>
> >>>>>>> In the best case this just messes up the accounting, in the worst case
> >>>>>>> it can cause memory corruption.
> >>>>>>>
> >>>>>>> Christian.
> >>>>
> >>>> --
> >>>> Daniel Vetter
> >>>> Software Engineer, Intel Corporation
> >>>> http://blog.ffwll.ch
> >> --
> >> Daniel Vetter
> >> Software Engineer, Intel Corporation
> >> http://blog.ffwll.ch
>
Christian König Feb. 10, 2021, 7:23 p.m. UTC | #18
Am 10.02.21 um 20:12 schrieb Suren Baghdasaryan:
> On Wed, Feb 10, 2021 at 10:32 AM Christian König
> <christian.koenig@amd.com> wrote:
>>
>>
>> Am 10.02.21 um 17:39 schrieb Suren Baghdasaryan:
>>> On Wed, Feb 10, 2021 at 5:06 AM Daniel Vetter <daniel@ffwll.ch> wrote:
>>>> On Tue, Feb 09, 2021 at 12:16:51PM -0800, Suren Baghdasaryan wrote:
>>>>> On Tue, Feb 9, 2021 at 12:03 PM Daniel Vetter <daniel@ffwll.ch> wrote:
>>>>>> On Tue, Feb 9, 2021 at 6:46 PM Christian König <christian.koenig@amd.com> wrote:
>>>>>>>
>>>>>>> Am 09.02.21 um 18:33 schrieb Suren Baghdasaryan:
>>>>>>>> On Tue, Feb 9, 2021 at 4:57 AM Christian König <christian.koenig@amd.com> wrote:
>>>>>>>>> Am 09.02.21 um 13:11 schrieb Christian König:
>>>>>>>>>> [SNIP]
>>>>>>>>>>>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
>>>>>>>>>>>>> +{
>>>>>>>>>>>>> +     spin_lock(&pool->lock);
>>>>>>>>>>>>> +     list_add_tail(&page->lru, &pool->items);
>>>>>>>>>>>>> +     pool->count++;
>>>>>>>>>>>>> +     atomic_long_add(1 << pool->order, &total_pages);
>>>>>>>>>>>>> +     spin_unlock(&pool->lock);
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +     mod_node_page_state(page_pgdat(page),
>>>>>>>>>>>>> NR_KERNEL_MISC_RECLAIMABLE,
>>>>>>>>>>>>> +                         1 << pool->order);
>>>>>>>>>>>> Hui what? What should that be good for?
>>>>>>>>>>> This is a carryover from the ION page pool implementation:
>>>>>>>>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/android/ion/ion_page_pool.c?h=v5.10#n28
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> My sense is it helps with the vmstat/meminfo accounting so folks can
>>>>>>>>>>> see the cached pages are shrinkable/freeable. This maybe falls under
>>>>>>>>>>> other dmabuf accounting/stats discussions, so I'm happy to remove it
>>>>>>>>>>> for now, or let the drivers using the shared page pool logic handle
>>>>>>>>>>> the accounting themselves?
>>>>>>>>> Intentionally separated the discussion for that here.
>>>>>>>>>
>>>>>>>>> As far as I can see this is just bluntly incorrect.
>>>>>>>>>
>>>>>>>>> Either the page is reclaimable or it is part of our pool and freeable
>>>>>>>>> through the shrinker, but never ever both.
>>>>>>>> IIRC the original motivation for counting ION pooled pages as
>>>>>>>> reclaimable was to include them into /proc/meminfo's MemAvailable
>>>>>>>> calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
>>>>>>>> non-slab kernel pages" seems like a good place to account for them but
>>>>>>>> I might be wrong.
>>>>>>> Yeah, that's what I see here as well. But exactly that is utterly nonsense.
>>>>>>>
>>>>>>> Those pages are not "free" in the sense that get_free_page could return
>>>>>>> them directly.
>>>>>> Well on Android that is kinda true, because Android has it's
>>>>>> oom-killer (way back it was just a shrinker callback, not sure how it
>>>>>> works now), which just shot down all the background apps. So at least
>>>>>> some of that (everything used by background apps) is indeed
>>>>>> reclaimable on Android.
>>>>>>
>>>>>> But that doesn't hold on Linux in general, so we can't really do this
>>>>>> for common code.
>>>>>>
>>>>>> Also I had a long meeting with Suren, John and other googles
>>>>>> yesterday, and the aim is now to try and support all the Android gpu
>>>>>> memory accounting needs with cgroups. That should work, and it will
>>>>>> allow Android to handle all the Android-ism in a clean way in upstream
>>>>>> code. Or that's at least the plan.
>>>>>>
>>>>>> I think the only thing we identified that Android still needs on top
>>>>>> is the dma-buf sysfs stuff, so that shared buffers (which on Android
>>>>>> are always dma-buf, and always stay around as dma-buf fd throughout
>>>>>> their lifetime) can be listed/analyzed with full detail.
>>>>>>
>>>>>> But aside from this the plan for all the per-process or per-heap
>>>>>> account, oom-killer integration and everything else is planned to be
>>>>>> done with cgroups.
>>>>> Until cgroups are ready we probably will need to add a sysfs node to
>>>>> report the total dmabuf pool size and I think that would cover our
>>>>> current accounting need here.
>>>>> As I mentioned, not including dmabuf pools into MemAvailable would
>>>>> affect that stat and I'm wondering if pools should be considered as
>>>>> part of MemAvailable or not. Since MemAvailable includes SReclaimable
>>>>> I think it makes sense to include them but maybe there are other
>>>>> considerations that I'm missing?
>>>> On Android, yes, on upstream, not so much. Because upstream doesn't have
>>>> the android low memory killer cleanup up all the apps, so effectively we
>>>> can't reclaim that memory, and we shouldn't report it as such.
>>>> -Daniel
>>> Hmm. Sorry, I fail to see why Android's low memory killer makes a
>>> difference here. In my mind, the pages in the pools are not used but
>>> kept there in case heaps need them (maybe that's the part I'm wrong about?).
>>> These pages can be freed by the shrinker if memory pressure rises.
>> And exactly that's the difference. They *can* be freed is not the same
>> thing as they *are* free.
> No argument there. That's why I think meminfo has two separate stats
> for MemFree and MemAvailable. MemFree is self-explanatory. The
> description of MemAvailable in
> https://www.kernel.org/doc/Documentation/filesystems/proc.txt says "An
> estimate of how much memory is available for starting new
> applications, without swapping.". Since dropping unused pages from
> slabs, caches and pools is less expensive than swapping, I would
> assume that a well-behaved system would do that before resorting to
> swapping. And if so, such memory should be included in MemAvailable
> because VM will make it available before swapping. But again, that's
> my interpretation. WDYT?
>
>>> In that sense I think it's very similar to reclaimable slabs which are
>>> already accounted as part of MemAvailable. So it seems logical to me
>>> to include unused pages in the pools here as well. What am I missing?
>> See the shrinkers are there because you need to do some action before
>> you can re-use the memory.
>>
>> In the case of the TTM/DRM pool for example you need to change the
>> caching attributes which might cause sleeping for a TLB flush to finish.
> I see your point here. But how about caches/pools which can be easily
> dropped? Shouldn't they be part of MemAvailable?

Well, if it could be easily dropped and reallocated we wouldn't have a pool 
for it in the first place :)

I mean, we created this page pool because the TLB shootdown for the 
caching change is extremely costly.
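
To make that concrete: for a hypothetical write-combined pool the free 
callback would look roughly like this (sketch only, x86 helpers):

static int wc_pool_free(struct page *p, unsigned int order)
{
	set_pages_wb(p, 1 << order);	/* global TLB shootdown, can sleep */
	__free_pages(p, order);
	return 1 << order;
}

It's that set_pages_wb() which makes dropping and re-allocating these 
pages expensive, and why they are cached in the first place.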

It could make sense for slab, but I'm not sure about that.

Regards,
Christian.

>
>> By accounting those pages as free you mess up (for example) the handling
>> which makes sure that there are enough emergency reserves. I can only
>> strongly recommend not doing that.
>>
>> What you could do is add a sysfs interface to expose the different
>> shrinkers and the number of pages in them to userspace, similar to what
>> /proc/slabinfo is doing.
> True, we can work around this. Just want to make sure whatever we do
> really makes sense.
> Thanks,
> Suren.
>
>> Regards,
>> Christian.
>>
>>>>>> Android (for now) only needs to account overall gpu
>>>>>> memory since none of it is swappable on android drivers anyway, plus
>>>>>> no vram, so not much needed.
>>>>>>
>>>>>> Cheers, Daniel
>>>>>>
>>>>>>> Regards,
>>>>>>> Christian.
>>>>>>>
>>>>>>>>> In the best case this just messes up the accounting, in the worst case
>>>>>>>>> it can cause memory corruption.
>>>>>>>>>
>>>>>>>>> Christian.
>>>>>> --
>>>>>> Daniel Vetter
>>>>>> Software Engineer, Intel Corporation
>>>>>> http://blog.ffwll.ch
>>>> --
>>>> Daniel Vetter
>>>> Software Engineer, Intel Corporation
>>>> http://blog.ffwll.ch
diff mbox series

Patch

diff --git a/drivers/gpu/drm/Kconfig b/drivers/gpu/drm/Kconfig
index 0973f408d75f..d16bf340ed2e 100644
--- a/drivers/gpu/drm/Kconfig
+++ b/drivers/gpu/drm/Kconfig
@@ -174,6 +174,10 @@  config DRM_DP_CEC
 	  Note: not all adapters support this feature, and even for those
 	  that do support this they often do not hook up the CEC pin.
 
+config DRM_PAGE_POOL
+	bool
+	depends on DRM
+
 config DRM_TTM
 	tristate
 	depends on DRM && MMU
diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index fefaff4c832d..877e0111ed34 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -32,6 +32,7 @@  drm-$(CONFIG_AGP) += drm_agpsupport.o
 drm-$(CONFIG_PCI) += drm_pci.o
 drm-$(CONFIG_DEBUG_FS) += drm_debugfs.o drm_debugfs_crc.o
 drm-$(CONFIG_DRM_LOAD_EDID_FIRMWARE) += drm_edid_load.o
+drm-$(CONFIG_DRM_PAGE_POOL) += page_pool.o
 
 drm_vram_helper-y := drm_gem_vram_helper.o
 obj-$(CONFIG_DRM_VRAM_HELPER) += drm_vram_helper.o
diff --git a/drivers/gpu/drm/page_pool.c b/drivers/gpu/drm/page_pool.c
new file mode 100644
index 000000000000..2139f86e6ca7
--- /dev/null
+++ b/drivers/gpu/drm/page_pool.c
@@ -0,0 +1,220 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DRM page pool system
+ *
+ * Copyright (C) 2020 Linaro Ltd.
+ *
+ * Based on the ION page pool code
+ * Copyright (C) 2011 Google, Inc.
+ * As well as the ttm_pool code
+ * Copyright (C) 2020 Advanced Micro Devices, Inc.
+ */
+
+#include <linux/freezer.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/sched/signal.h>
+#include <drm/page_pool.h>
+
+static LIST_HEAD(pool_list);
+static DEFINE_MUTEX(pool_list_lock);
+static atomic_long_t total_pages;
+static unsigned long page_pool_max;
+MODULE_PARM_DESC(page_pool_max, "Number of pages in the WC/UC/DMA pool");
+module_param(page_pool_max, ulong, 0644);
+
+void drm_page_pool_set_max(unsigned long max)
+{
+	/* only write once */
+	if (!page_pool_max)
+		page_pool_max = max;
+}
+
+unsigned long drm_page_pool_get_max(void)
+{
+	return page_pool_max;
+}
+
+unsigned long drm_page_pool_get_total(void)
+{
+	return atomic_long_read(&total_pages);
+}
+
+int drm_page_pool_get_size(struct drm_page_pool *pool)
+{
+	int ret;
+
+	spin_lock(&pool->lock);
+	ret = pool->count;
+	spin_unlock(&pool->lock);
+	return ret;
+}
+
+static inline unsigned int drm_page_pool_free_pages(struct drm_page_pool *pool,
+						    struct page *page)
+{
+	return pool->free(page, pool->order);
+}
+
+static int drm_page_pool_shrink_one(void);
+
+void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
+{
+	spin_lock(&pool->lock);
+	list_add_tail(&page->lru, &pool->items);
+	pool->count++;
+	atomic_long_add(1 << pool->order, &total_pages);
+	spin_unlock(&pool->lock);
+
+	mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE,
+			    1 << pool->order);
+
+	/* make sure we don't grow too large */
+	while (page_pool_max && atomic_long_read(&total_pages) > page_pool_max)
+		drm_page_pool_shrink_one();
+}
+EXPORT_SYMBOL_GPL(drm_page_pool_add);
+
+static struct page *drm_page_pool_remove(struct drm_page_pool *pool)
+{
+	struct page *page;
+
+	if (!pool->count)
+		return NULL;
+
+	page = list_first_entry(&pool->items, struct page, lru);
+	pool->count--;
+	atomic_long_sub(1 << pool->order, &total_pages);
+
+	list_del(&page->lru);
+	mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE,
+			    -(1 << pool->order));
+	return page;
+}
+
+struct page *drm_page_pool_fetch(struct drm_page_pool *pool)
+{
+	struct page *page = NULL;
+
+	if (!pool) {
+		WARN_ON(!pool);
+		return NULL;
+	}
+
+	spin_lock(&pool->lock);
+	page = drm_page_pool_remove(pool);
+	spin_unlock(&pool->lock);
+
+	return page;
+}
+EXPORT_SYMBOL_GPL(drm_page_pool_fetch);
+
+struct drm_page_pool *drm_page_pool_create(unsigned int order,
+					   int (*free_page)(struct page *p, unsigned int order))
+{
+	struct drm_page_pool *pool = kmalloc(sizeof(*pool), GFP_KERNEL);
+
+	if (!pool)
+		return NULL;
+
+	pool->count = 0;
+	INIT_LIST_HEAD(&pool->items);
+	pool->order = order;
+	pool->free = free_page;
+	spin_lock_init(&pool->lock);
+	INIT_LIST_HEAD(&pool->list);
+
+	mutex_lock(&pool_list_lock);
+	list_add(&pool->list, &pool_list);
+	mutex_unlock(&pool_list_lock);
+
+	return pool;
+}
+EXPORT_SYMBOL_GPL(drm_page_pool_create);
+
+void drm_page_pool_destroy(struct drm_page_pool *pool)
+{
+	struct page *page;
+
+	/* Remove us from the pool list */
+	mutex_lock(&pool_list_lock);
+	list_del(&pool->list);
+	mutex_unlock(&pool_list_lock);
+
+	/* Free any remaining pages in the pool */
+	spin_lock(&pool->lock);
+	while (pool->count) {
+		page = drm_page_pool_remove(pool);
+		spin_unlock(&pool->lock);
+		drm_page_pool_free_pages(pool, page);
+		spin_lock(&pool->lock);
+	}
+	spin_unlock(&pool->lock);
+
+	kfree(pool);
+}
+EXPORT_SYMBOL_GPL(drm_page_pool_destroy);
+
+static int drm_page_pool_shrink_one(void)
+{
+	struct drm_page_pool *pool;
+	struct page *page;
+	int nr_freed = 0;
+
+	mutex_lock(&pool_list_lock);
+	pool = list_first_entry(&pool_list, typeof(*pool), list);
+
+	spin_lock(&pool->lock);
+	page = drm_page_pool_remove(pool);
+	spin_unlock(&pool->lock);
+
+	if (page)
+		nr_freed = drm_page_pool_free_pages(pool, page);
+
+	list_move_tail(&pool->list, &pool_list);
+	mutex_unlock(&pool_list_lock);
+
+	return nr_freed;
+}
+
+static unsigned long drm_page_pool_shrink_count(struct shrinker *shrinker,
+						struct shrink_control *sc)
+{
+	unsigned long count = atomic_long_read(&total_pages);
+
+	return count ? count : SHRINK_EMPTY;
+}
+
+static unsigned long drm_page_pool_shrink_scan(struct shrinker *shrinker,
+					       struct shrink_control *sc)
+{
+	int to_scan = sc->nr_to_scan;
+	int nr_total = 0;
+
+	if (to_scan == 0)
+		return 0;
+
+	do {
+		int nr_freed = drm_page_pool_shrink_one();
+
+		to_scan -= nr_freed;
+		nr_total += nr_freed;
+	} while (to_scan >= 0 && atomic_long_read(&total_pages));
+
+	return nr_total;
+}
+
+static struct shrinker pool_shrinker = {
+	.count_objects = drm_page_pool_shrink_count,
+	.scan_objects = drm_page_pool_shrink_scan,
+	.seeks = 1,
+	.batch = 0,
+};
+
+int drm_page_pool_init_shrinker(void)
+{
+	return register_shrinker(&pool_shrinker);
+}
+module_init(drm_page_pool_init_shrinker);
+MODULE_LICENSE("GPL v2");
diff --git a/include/drm/page_pool.h b/include/drm/page_pool.h
new file mode 100644
index 000000000000..47e240b2bc69
--- /dev/null
+++ b/include/drm/page_pool.h
@@ -0,0 +1,54 @@ 
+/* SPDX-License-Identifier: GPL-2.0 OR MIT */
+/*
+ * Copyright 2020 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors: Christian König
+ */
+
+#ifndef _DRM_PAGE_POOL_H_
+#define _DRM_PAGE_POOL_H_
+
+#include <linux/mmzone.h>
+#include <linux/llist.h>
+#include <linux/spinlock.h>
+
+struct drm_page_pool {
+	int count;
+	struct list_head items;
+
+	int order;
+	int (*free)(struct page *p, unsigned int order);
+
+	spinlock_t lock;
+	struct list_head list;
+};
+
+void drm_page_pool_set_max(unsigned long max);
+unsigned long drm_page_pool_get_max(void);
+unsigned long drm_page_pool_get_total(void);
+int drm_page_pool_get_size(struct drm_page_pool *pool);
+struct page *drm_page_pool_fetch(struct drm_page_pool *pool);
+void drm_page_pool_add(struct drm_page_pool *pool, struct page *page);
+struct drm_page_pool *drm_page_pool_create(unsigned int order,
+					   int (*free_page)(struct page *p, unsigned int order));
+void drm_page_pool_destroy(struct drm_page_pool *pool);
+
+#endif
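
A minimal usage sketch of the API above from a driver's point of view
(hypothetical my_free() callback, error handling elided):

static int my_free(struct page *p, unsigned int order)
{
	__free_pages(p, order);
	return 1 << order;	/* pages released, as the shrinker expects */
}

static void example(void)
{
	struct drm_page_pool *pool = drm_page_pool_create(0, my_free);
	struct page *page = drm_page_pool_fetch(pool);	/* NULL when empty */

	if (!page)
		page = alloc_pages(GFP_KERNEL, 0);

	/* ... use the page ... */

	drm_page_pool_add(pool, page);	/* cache it instead of freeing */
	drm_page_pool_destroy(pool);	/* frees whatever is still pooled */
}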