[06/11] drm/i915: Allocate active tracking nodes from a slabcache

Message ID 20190130021906.17933-6-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Series [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset

Commit Message

Chris Wilson Jan. 30, 2019, 2:19 a.m. UTC
Wrap the active tracking for GPU references in a slabcache for faster
allocations, and keep track of inflight nodes so we can reap the
stale entries upon parking (thereby trimming our memory usage).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_active.c            | 55 ++++++++++++++++---
 drivers/gpu/drm/i915/i915_active.h            | 21 +++++--
 drivers/gpu/drm/i915/i915_active_types.h      | 12 +++-
 drivers/gpu/drm/i915/i915_drv.h               |  2 +
 drivers/gpu/drm/i915/i915_gem.c               | 16 +++++-
 drivers/gpu/drm/i915/i915_gem_gtt.c           |  2 +-
 drivers/gpu/drm/i915/i915_vma.c               |  3 +-
 drivers/gpu/drm/i915/selftests/i915_active.c  |  3 +-
 .../gpu/drm/i915/selftests/mock_gem_device.c  |  6 ++
 9 files changed, 100 insertions(+), 20 deletions(-)

Comments

Tvrtko Ursulin Jan. 30, 2019, 3:57 p.m. UTC | #1
On 30/01/2019 02:19, Chris Wilson wrote:
> Wrap the active tracking for GPU references in a slabcache for faster
> allocations, and keep track of inflight nodes so we can reap the
> stale entries upon parking (thereby trimming our memory usage).

I suggest a two-stage approach. The first patch adds a slab cache (you
can also call kmem_cache_shrink on park, as we do for other caches),
then a second patch adds the parking/reaping bit.
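
Something like this for the first stage (sketch only, borrowing the
i915_gt_active naming from this patch, minus the active_refs list):

void i915_gt_active_park(struct i915_gt_active *gt)
{
	/* Nothing to reap yet; just return unused slab pages to the
	 * system, as we do for the other caches on park.
	 */
	kmem_cache_shrink(gt->slab_cache);
}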

Under what scenarios do we end up not freeing active nodes
sufficiently? It would have to be some user that keeps many contexts
around, having only used them once?

> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_active.c            | 55 ++++++++++++++++---
>   drivers/gpu/drm/i915/i915_active.h            | 21 +++++--
>   drivers/gpu/drm/i915/i915_active_types.h      | 12 +++-
>   drivers/gpu/drm/i915/i915_drv.h               |  2 +
>   drivers/gpu/drm/i915/i915_gem.c               | 16 +++++-
>   drivers/gpu/drm/i915/i915_gem_gtt.c           |  2 +-
>   drivers/gpu/drm/i915/i915_vma.c               |  3 +-
>   drivers/gpu/drm/i915/selftests/i915_active.c  |  3 +-
>   .../gpu/drm/i915/selftests/mock_gem_device.c  |  6 ++
>   9 files changed, 100 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
> index e0182e19cb8b..3c7abbde42ac 100644
> --- a/drivers/gpu/drm/i915/i915_active.c
> +++ b/drivers/gpu/drm/i915/i915_active.c
> @@ -7,7 +7,9 @@
>   #include "i915_drv.h"
>   #include "i915_active.h"
>   
> -#define BKL(ref) (&(ref)->i915->drm.struct_mutex)
> +#define i915_from_gt(x) \
> +	container_of(x, struct drm_i915_private, gt.active_refs)
> +#define BKL(ref) (&i915_from_gt((ref)->gt)->drm.struct_mutex)
>   
>   struct active_node {
>   	struct i915_gem_active base;
> @@ -79,11 +81,11 @@ active_instance(struct i915_active *ref, u64 idx)
>   			p = &parent->rb_left;
>   	}
>   
> -	node = kmalloc(sizeof(*node), GFP_KERNEL);
> +	node = kmem_cache_alloc(ref->gt->slab_cache, GFP_KERNEL);
>   
>   	/* kmalloc may retire the ref->last (thanks shrinker)! */
>   	if (unlikely(!i915_gem_active_raw(&ref->last, BKL(ref)))) {
> -		kfree(node);
> +		kmem_cache_free(ref->gt->slab_cache, node);
>   		goto out;
>   	}
>   
> @@ -94,6 +96,9 @@ active_instance(struct i915_active *ref, u64 idx)
>   	node->ref = ref;
>   	node->timeline = idx;
>   
> +	if (RB_EMPTY_ROOT(&ref->tree))
> +		list_add(&ref->active_link, &ref->gt->active_refs);
> +
>   	rb_link_node(&node->node, parent, p);
>   	rb_insert_color(&node->node, &ref->tree);
>   
> @@ -119,11 +124,11 @@ active_instance(struct i915_active *ref, u64 idx)
>   	return &ref->last;
>   }
>   
> -void i915_active_init(struct drm_i915_private *i915,
> +void i915_active_init(struct i915_gt_active *gt,
>   		      struct i915_active *ref,
>   		      void (*retire)(struct i915_active *ref))
>   {
> -	ref->i915 = i915;
> +	ref->gt = gt;
>   	ref->retire = retire;
>   	ref->tree = RB_ROOT;
>   	init_request_active(&ref->last, last_retire);
> @@ -161,6 +166,7 @@ void i915_active_release(struct i915_active *ref)
>   
>   int i915_active_wait(struct i915_active *ref)
>   {
> +	struct kmem_cache *slab = ref->gt->slab_cache;
>   	struct active_node *it, *n;
>   	int ret;
>   
> @@ -168,15 +174,19 @@ int i915_active_wait(struct i915_active *ref)
>   	if (ret)
>   		return ret;
>   
> +	if (RB_EMPTY_ROOT(&ref->tree))
> +		return 0;
> +
>   	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
>   		ret = i915_gem_active_retire(&it->base, BKL(ref));
>   		if (ret)
>   			return ret;
>   
>   		GEM_BUG_ON(i915_gem_active_isset(&it->base));
> -		kfree(it);
> +		kmem_cache_free(slab, it);
>   	}
>   	ref->tree = RB_ROOT;
> +	list_del(&ref->active_link);
>   
>   	return 0;
>   }
> @@ -210,15 +220,46 @@ int i915_request_await_active(struct i915_request *rq, struct i915_active *ref)
>   
>   void i915_active_fini(struct i915_active *ref)
>   {
> +	struct kmem_cache *slab = ref->gt->slab_cache;
>   	struct active_node *it, *n;
>   
> +	lockdep_assert_held(BKL(ref));
>   	GEM_BUG_ON(i915_gem_active_isset(&ref->last));
>   
> +	if (RB_EMPTY_ROOT(&ref->tree))
> +		return;
> +
>   	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
>   		GEM_BUG_ON(i915_gem_active_isset(&it->base));
> -		kfree(it);
> +		kmem_cache_free(slab, it);
>   	}
>   	ref->tree = RB_ROOT;
> +	list_del(&ref->active_link);
> +}
> +
> +int i915_gt_active_init(struct i915_gt_active *gt)
> +{
> +	gt->slab_cache = KMEM_CACHE(active_node, SLAB_HWCACHE_ALIGN);
> +	if (!gt->slab_cache)
> +		return -ENOMEM;
> +
> +	INIT_LIST_HEAD(&gt->active_refs);
> +
> +	return 0;
> +}
> +
> +void i915_gt_active_park(struct i915_gt_active *gt)
> +{
> +	struct i915_active *it, *n;
> +
> +	list_for_each_entry_safe(it, n, &gt->active_refs, active_link)
> +		i915_active_fini(it);
> +}
> +
> +void i915_gt_active_fini(struct i915_gt_active *gt)
> +{
> +	GEM_BUG_ON(!list_empty(&gt->active_refs));
> +	kmem_cache_destroy(gt->slab_cache);
>   }
>   
>   #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h
> index c0729a046f98..41c4a5da84c8 100644
> --- a/drivers/gpu/drm/i915/i915_active.h
> +++ b/drivers/gpu/drm/i915/i915_active.h
> @@ -9,10 +9,6 @@
>   
>   #include "i915_active_types.h"
>   
> -#include <linux/rbtree.h>
> -
> -#include "i915_request.h"
> -
>   /*
>    * GPU activity tracking
>    *
> @@ -39,7 +35,7 @@
>    * synchronisation.
>    */
>   
> -void i915_active_init(struct drm_i915_private *i915,
> +void i915_active_init(struct i915_gt_active *gt,
>   		      struct i915_active *ref,
>   		      void (*retire)(struct i915_active *ref));
>   
> @@ -63,4 +59,19 @@ i915_active_is_idle(const struct i915_active *ref)
>   
>   void i915_active_fini(struct i915_active *ref);
>   
> +/*
> + * Active refs memory management
> + *
> + * To be more economical with memory, we reap all the i915_active trees on
> + * parking the GPU (when we know the GPU is inactive) and allocate the nodes
> + * from a local slab cache to hopefully reduce the fragmentation as we will
> + * then be able to free all pages en masse upon idling.
> + */
> +
> +int i915_gt_active_init(struct i915_gt_active *gt);
> +void i915_gt_active_park(struct i915_gt_active *gt);
> +void i915_gt_active_fini(struct i915_gt_active *gt);
> +
> +#define i915_gt_active(i915) (&(i915)->gt.active_refs)
> +
>   #endif /* _I915_ACTIVE_H_ */
> diff --git a/drivers/gpu/drm/i915/i915_active_types.h b/drivers/gpu/drm/i915/i915_active_types.h
> index 411e502ed8dd..3d41c33ca78c 100644
> --- a/drivers/gpu/drm/i915/i915_active_types.h
> +++ b/drivers/gpu/drm/i915/i915_active_types.h
> @@ -7,14 +7,17 @@
>   #ifndef _I915_ACTIVE_TYPES_H_
>   #define _I915_ACTIVE_TYPES_H_
>   
> +#include <linux/list.h>
>   #include <linux/rbtree.h>
>   
>   #include "i915_request.h"
>   
> -struct drm_i915_private;
> +struct i915_gt_active;
> +struct kmem_cache;
>   
>   struct i915_active {
> -	struct drm_i915_private *i915;
> +	struct i915_gt_active *gt;

gt_active would be better - gt is too vague.

> +	struct list_head active_link;
>   
>   	struct rb_root tree;
>   	struct i915_gem_active last;
> @@ -23,4 +26,9 @@ struct i915_active {
>   	void (*retire)(struct i915_active *ref);
>   };
>   
> +struct i915_gt_active {
> +	struct list_head active_refs;
> +	struct kmem_cache *slab_cache;
> +};
> +
>   #endif /* _I915_ACTIVE_TYPES_H_ */
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 8ec28a7f5452..480ab3e00ba8 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1984,6 +1984,8 @@ struct drm_i915_private {
>   			struct list_head hwsp_free_list;
>   		} timelines;
>   
> +		struct i915_gt_active active_refs;
> +
>   		struct list_head active_rings;
>   		struct list_head closed_vma;
>   		u32 active_requests;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index caccff87a2a1..2bc735df408b 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -130,6 +130,7 @@ static u32 __i915_gem_park(struct drm_i915_private *i915)
>   
>   	intel_engines_park(i915);
>   	i915_timelines_park(i915);
> +	i915_gt_active_park(i915_gt_active(i915));

The i915_gt_active macro is just too horrible IMHO. Why? :)

Make i915_gt_active_park take i915, or i915->gt_active.
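
i.e. a sketch of what I mean, hiding the lookup inside the callee:

void i915_gt_active_park(struct drm_i915_private *i915)
{
	struct i915_gt_active *gt = &i915->gt.active_refs;
	struct i915_active *it, *n;

	list_for_each_entry_safe(it, n, &gt->active_refs, active_link)
		i915_active_fini(it);
}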

>   
>   	i915_pmu_gt_parked(i915);
>   	i915_vma_parked(i915);
> @@ -4998,15 +4999,19 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
>   		dev_priv->gt.cleanup_engine = intel_engine_cleanup;
>   	}
>   
> +	ret = i915_gt_active_init(i915_gt_active(dev_priv));
> +	if (ret)
> +		return ret;
> +
>   	i915_timelines_init(dev_priv);
>   
>   	ret = i915_gem_init_userptr(dev_priv);
>   	if (ret)
> -		return ret;
> +		goto err_timelines;
>   
>   	ret = intel_uc_init_misc(dev_priv);
>   	if (ret)
> -		return ret;
> +		goto err_userptr;
>   
>   	ret = intel_wopcm_init(&dev_priv->wopcm);
>   	if (ret)
> @@ -5122,9 +5127,13 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
>   err_uc_misc:
>   	intel_uc_fini_misc(dev_priv);
>   
> -	if (ret != -EIO) {
> +err_userptr:
> +	if (ret != -EIO)
>   		i915_gem_cleanup_userptr(dev_priv);
> +err_timelines:
> +	if (ret != -EIO) {
>   		i915_timelines_fini(dev_priv);
> +		i915_gt_active_fini(i915_gt_active(dev_priv));
>   	}
>   
>   	if (ret == -EIO) {
> @@ -5177,6 +5186,7 @@ void i915_gem_fini(struct drm_i915_private *dev_priv)
>   	intel_uc_fini_misc(dev_priv);
>   	i915_gem_cleanup_userptr(dev_priv);
>   	i915_timelines_fini(dev_priv);
> +	i915_gt_active_fini(i915_gt_active(dev_priv));
>   
>   	i915_gem_drain_freed_objects(dev_priv);
>   
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index e625659c03a2..d8819de0d6ee 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -1917,7 +1917,7 @@ static struct i915_vma *pd_vma_create(struct gen6_hw_ppgtt *ppgtt, int size)
>   	if (!vma)
>   		return ERR_PTR(-ENOMEM);
>   
> -	i915_active_init(i915, &vma->active, NULL);
> +	i915_active_init(i915_gt_active(i915), &vma->active, NULL);
>   	init_request_active(&vma->last_fence, NULL);
>   
>   	vma->vm = &ggtt->vm;
> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
> index d4772061e642..2456bfb4877b 100644
> --- a/drivers/gpu/drm/i915/i915_vma.c
> +++ b/drivers/gpu/drm/i915/i915_vma.c
> @@ -119,7 +119,8 @@ vma_create(struct drm_i915_gem_object *obj,
>   	if (vma == NULL)
>   		return ERR_PTR(-ENOMEM);
>   
> -	i915_active_init(vm->i915, &vma->active, __i915_vma_retire);
> +	i915_active_init(i915_gt_active(vm->i915),
> +			 &vma->active, __i915_vma_retire);
>   	init_request_active(&vma->last_fence, NULL);
>   
>   	vma->vm = vm;
> diff --git a/drivers/gpu/drm/i915/selftests/i915_active.c b/drivers/gpu/drm/i915/selftests/i915_active.c
> index 7c5c3068565b..0e923476920e 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_active.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_active.c
> @@ -30,7 +30,8 @@ static int __live_active_setup(struct drm_i915_private *i915,
>   	unsigned int count = 0;
>   	int err = 0;
>   
> -	i915_active_init(i915, &active->base, __live_active_retire);
> +	i915_active_init(i915_gt_active(i915),
> +			 &active->base, __live_active_retire);
>   	active->retired = false;
>   
>   	if (!i915_active_acquire(&active->base)) {
> diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> index 074a0d9cbf26..5b88f74c1677 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> @@ -69,6 +69,7 @@ static void mock_device_release(struct drm_device *dev)
>   	mutex_unlock(&i915->drm.struct_mutex);
>   
>   	i915_timelines_fini(i915);
> +	i915_gt_active_fini(i915_gt_active(i915));
>   
>   	drain_workqueue(i915->wq);
>   	i915_gem_drain_freed_objects(i915);
> @@ -228,6 +229,9 @@ struct drm_i915_private *mock_gem_device(void)
>   	if (!i915->priorities)
>   		goto err_dependencies;
>   
> +	if (i915_gt_active_init(i915_gt_active(i915)))
> +		goto err_priorities;
> +
>   	i915_timelines_init(i915);
>   
>   	INIT_LIST_HEAD(&i915->gt.active_rings);
> @@ -257,6 +261,8 @@ struct drm_i915_private *mock_gem_device(void)
>   err_unlock:
>   	mutex_unlock(&i915->drm.struct_mutex);
>   	i915_timelines_fini(i915);
> +	i915_gt_active_fini(i915_gt_active(i915));
> +err_priorities:
>   	kmem_cache_destroy(i915->priorities);
>   err_dependencies:
>   	kmem_cache_destroy(i915->dependencies);
> 

Regards,

Tvrtko
Chris Wilson Jan. 30, 2019, 4:08 p.m. UTC | #2
Quoting Tvrtko Ursulin (2019-01-30 15:57:06)
> 
> On 30/01/2019 02:19, Chris Wilson wrote:
> > Wrap the active tracking for GPU references in a slabcache for faster
> > allocations, and keep track of inflight nodes so we can reap the
> > stale entries upon parking (thereby trimming our memory usage).
> 
> I suggest a two-stage approach. The first patch adds a slab cache (you
> can also call kmem_cache_shrink on park, as we do for other caches),
> then a second patch adds the parking/reaping bit.

Not really seeing the point; the lesson learnt is that we should be
tidying up on parking, and that's why having a slab makes sense in the
first place.

I can see one argument for not doing the idle reaping immediately, and
that's if we apply the reaping on vma idle instead. Then parking is
redundant.
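
Roughly (a sketch only; __active_reap would be a hypothetical helper
hoisted out of the i915_active_fini() walk and called once the tree
idles, e.g. from the retire callback):

static void __active_reap(struct i915_active *ref)
{
	struct kmem_cache *slab = ref->gt->slab_cache;
	struct active_node *it, *n;

	/* Free the per-timeline nodes back to the slab as soon as the
	 * last reference is retired, leaving nothing for parking to do.
	 */
	GEM_BUG_ON(!i915_active_is_idle(ref));

	if (RB_EMPTY_ROOT(&ref->tree))
		return;

	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node)
		kmem_cache_free(slab, it);
	ref->tree = RB_ROOT;
	list_del(&ref->active_link);
}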
 
> Under what scenarios do we end up not freeing active nodes
> sufficiently? It would have to be some user that keeps many contexts
> around, having only used them once?

Never in a typical scenario :) Almost every allocation is served from
the last_request slot; you need to have a pair of concurrent references
to a vma for it to activate, and that takes multiple engines reading
the same object within a context/ppgtt.
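
i.e. the common case never touches the slab at all; paraphrasing the
top of active_instance() (sketch, not the exact code):

	/* Fast path: a lone timeline reuses the embedded ref->last slot. */
	old = i915_gem_active_raw(&ref->last, BKL(ref));
	if (!old || old->fence.context == idx)
		return &ref->last;

	/* Slow path: only a second, concurrent timeline spills over into
	 * a new rbtree node allocated from the slab.
	 */
	node = kmem_cache_alloc(ref->gt->slab_cache, GFP_KERNEL);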

> >   struct i915_active {
> > -     struct drm_i915_private *i915;
> > +     struct i915_gt_active *gt;
> 
> gt_active would be better - gt is too vague.

As a backpointer, though, it's defined by its owner. Give me a better
name for everything.

> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index caccff87a2a1..2bc735df408b 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -130,6 +130,7 @@ static u32 __i915_gem_park(struct drm_i915_private *i915)
> >   
> >       intel_engines_park(i915);
> >       i915_timelines_park(i915);
> > +     i915_gt_active_park(i915_gt_active(i915));
> 
> The i915_gt_active macro is just too horrible IMHO. Why? :)

Because it was called gt.active_refs and that was irking me. I really
haven't settled on the name.

> Make i915_gt_active_park take i915, or i915->gt_active.

The pretense at encapsulation was nice, and I'd like to push harder on
that front.
-Chris
Chris Wilson Jan. 30, 2019, 9:58 p.m. UTC | #3
Quoting Chris Wilson (2019-01-30 16:08:59)
> Quoting Tvrtko Ursulin (2019-01-30 15:57:06)
> > Under what scenarios do we end up not freeing active nodes
> > sufficiently? It would have to be some user that keeps many contexts
> > around, having only used them once?
> 
> Never in a typical scenario :) Almost every allocation is served from
> the last_request slot; you need to have a pair of concurrent references
> to a vma for it to activate, and that takes multiple engines reading
> the same object within a context/ppgtt.

That being said, that applies to vma. HW semaphores are an example
where the cacheline could be shared far and wide with uncertain
lifetimes. Context barriers, in contrast, just have a few different
timelines and idle almost immediately.
-Chris
Patch

diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
index e0182e19cb8b..3c7abbde42ac 100644
--- a/drivers/gpu/drm/i915/i915_active.c
+++ b/drivers/gpu/drm/i915/i915_active.c
@@ -7,7 +7,9 @@ 
 #include "i915_drv.h"
 #include "i915_active.h"
 
-#define BKL(ref) (&(ref)->i915->drm.struct_mutex)
+#define i915_from_gt(x) \
+	container_of(x, struct drm_i915_private, gt.active_refs)
+#define BKL(ref) (&i915_from_gt((ref)->gt)->drm.struct_mutex)
 
 struct active_node {
 	struct i915_gem_active base;
@@ -79,11 +81,11 @@  active_instance(struct i915_active *ref, u64 idx)
 			p = &parent->rb_left;
 	}
 
-	node = kmalloc(sizeof(*node), GFP_KERNEL);
+	node = kmem_cache_alloc(ref->gt->slab_cache, GFP_KERNEL);
 
 	/* kmalloc may retire the ref->last (thanks shrinker)! */
 	if (unlikely(!i915_gem_active_raw(&ref->last, BKL(ref)))) {
-		kfree(node);
+		kmem_cache_free(ref->gt->slab_cache, node);
 		goto out;
 	}
 
@@ -94,6 +96,9 @@  active_instance(struct i915_active *ref, u64 idx)
 	node->ref = ref;
 	node->timeline = idx;
 
+	if (RB_EMPTY_ROOT(&ref->tree))
+		list_add(&ref->active_link, &ref->gt->active_refs);
+
 	rb_link_node(&node->node, parent, p);
 	rb_insert_color(&node->node, &ref->tree);
 
@@ -119,11 +124,11 @@  active_instance(struct i915_active *ref, u64 idx)
 	return &ref->last;
 }
 
-void i915_active_init(struct drm_i915_private *i915,
+void i915_active_init(struct i915_gt_active *gt,
 		      struct i915_active *ref,
 		      void (*retire)(struct i915_active *ref))
 {
-	ref->i915 = i915;
+	ref->gt = gt;
 	ref->retire = retire;
 	ref->tree = RB_ROOT;
 	init_request_active(&ref->last, last_retire);
@@ -161,6 +166,7 @@  void i915_active_release(struct i915_active *ref)
 
 int i915_active_wait(struct i915_active *ref)
 {
+	struct kmem_cache *slab = ref->gt->slab_cache;
 	struct active_node *it, *n;
 	int ret;
 
@@ -168,15 +174,19 @@  int i915_active_wait(struct i915_active *ref)
 	if (ret)
 		return ret;
 
+	if (RB_EMPTY_ROOT(&ref->tree))
+		return 0;
+
 	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
 		ret = i915_gem_active_retire(&it->base, BKL(ref));
 		if (ret)
 			return ret;
 
 		GEM_BUG_ON(i915_gem_active_isset(&it->base));
-		kfree(it);
+		kmem_cache_free(slab, it);
 	}
 	ref->tree = RB_ROOT;
+	list_del(&ref->active_link);
 
 	return 0;
 }
@@ -210,15 +220,46 @@  int i915_request_await_active(struct i915_request *rq, struct i915_active *ref)
 
 void i915_active_fini(struct i915_active *ref)
 {
+	struct kmem_cache *slab = ref->gt->slab_cache;
 	struct active_node *it, *n;
 
+	lockdep_assert_held(BKL(ref));
 	GEM_BUG_ON(i915_gem_active_isset(&ref->last));
 
+	if (RB_EMPTY_ROOT(&ref->tree))
+		return;
+
 	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
 		GEM_BUG_ON(i915_gem_active_isset(&it->base));
-		kfree(it);
+		kmem_cache_free(slab, it);
 	}
 	ref->tree = RB_ROOT;
+	list_del(&ref->active_link);
+}
+
+int i915_gt_active_init(struct i915_gt_active *gt)
+{
+	gt->slab_cache = KMEM_CACHE(active_node, SLAB_HWCACHE_ALIGN);
+	if (!gt->slab_cache)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&gt->active_refs);
+
+	return 0;
+}
+
+void i915_gt_active_park(struct i915_gt_active *gt)
+{
+	struct i915_active *it, *n;
+
+	list_for_each_entry_safe(it, n, &gt->active_refs, active_link)
+		i915_active_fini(it);
+}
+
+void i915_gt_active_fini(struct i915_gt_active *gt)
+{
+	GEM_BUG_ON(!list_empty(&gt->active_refs));
+	kmem_cache_destroy(gt->slab_cache);
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h
index c0729a046f98..41c4a5da84c8 100644
--- a/drivers/gpu/drm/i915/i915_active.h
+++ b/drivers/gpu/drm/i915/i915_active.h
@@ -9,10 +9,6 @@ 
 
 #include "i915_active_types.h"
 
-#include <linux/rbtree.h>
-
-#include "i915_request.h"
-
 /*
  * GPU activity tracking
  *
@@ -39,7 +35,7 @@ 
  * synchronisation.
  */
 
-void i915_active_init(struct drm_i915_private *i915,
+void i915_active_init(struct i915_gt_active *gt,
 		      struct i915_active *ref,
 		      void (*retire)(struct i915_active *ref));
 
@@ -63,4 +59,19 @@  i915_active_is_idle(const struct i915_active *ref)
 
 void i915_active_fini(struct i915_active *ref);
 
+/*
+ * Active refs memory management
+ *
+ * To be more economical with memory, we reap all the i915_active trees on
+ * parking the GPU (when we know the GPU is inactive) and allocate the nodes
+ * from a local slab cache to hopefully reduce the fragmentation as we will
+ * then be able to free all pages en masse upon idling.
+ */
+
+int i915_gt_active_init(struct i915_gt_active *gt);
+void i915_gt_active_park(struct i915_gt_active *gt);
+void i915_gt_active_fini(struct i915_gt_active *gt);
+
+#define i915_gt_active(i915) (&(i915)->gt.active_refs)
+
 #endif /* _I915_ACTIVE_H_ */
diff --git a/drivers/gpu/drm/i915/i915_active_types.h b/drivers/gpu/drm/i915/i915_active_types.h
index 411e502ed8dd..3d41c33ca78c 100644
--- a/drivers/gpu/drm/i915/i915_active_types.h
+++ b/drivers/gpu/drm/i915/i915_active_types.h
@@ -7,14 +7,17 @@ 
 #ifndef _I915_ACTIVE_TYPES_H_
 #define _I915_ACTIVE_TYPES_H_
 
+#include <linux/list.h>
 #include <linux/rbtree.h>
 
 #include "i915_request.h"
 
-struct drm_i915_private;
+struct i915_gt_active;
+struct kmem_cache;
 
 struct i915_active {
-	struct drm_i915_private *i915;
+	struct i915_gt_active *gt;
+	struct list_head active_link;
 
 	struct rb_root tree;
 	struct i915_gem_active last;
@@ -23,4 +26,9 @@  struct i915_active {
 	void (*retire)(struct i915_active *ref);
 };
 
+struct i915_gt_active {
+	struct list_head active_refs;
+	struct kmem_cache *slab_cache;
+};
+
 #endif /* _I915_ACTIVE_TYPES_H_ */
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 8ec28a7f5452..480ab3e00ba8 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1984,6 +1984,8 @@  struct drm_i915_private {
 			struct list_head hwsp_free_list;
 		} timelines;
 
+		struct i915_gt_active active_refs;
+
 		struct list_head active_rings;
 		struct list_head closed_vma;
 		u32 active_requests;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index caccff87a2a1..2bc735df408b 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -130,6 +130,7 @@  static u32 __i915_gem_park(struct drm_i915_private *i915)
 
 	intel_engines_park(i915);
 	i915_timelines_park(i915);
+	i915_gt_active_park(i915_gt_active(i915));
 
 	i915_pmu_gt_parked(i915);
 	i915_vma_parked(i915);
@@ -4998,15 +4999,19 @@  int i915_gem_init(struct drm_i915_private *dev_priv)
 		dev_priv->gt.cleanup_engine = intel_engine_cleanup;
 	}
 
+	ret = i915_gt_active_init(i915_gt_active(dev_priv));
+	if (ret)
+		return ret;
+
 	i915_timelines_init(dev_priv);
 
 	ret = i915_gem_init_userptr(dev_priv);
 	if (ret)
-		return ret;
+		goto err_timelines;
 
 	ret = intel_uc_init_misc(dev_priv);
 	if (ret)
-		return ret;
+		goto err_userptr;
 
 	ret = intel_wopcm_init(&dev_priv->wopcm);
 	if (ret)
@@ -5122,9 +5127,13 @@  int i915_gem_init(struct drm_i915_private *dev_priv)
 err_uc_misc:
 	intel_uc_fini_misc(dev_priv);
 
-	if (ret != -EIO) {
+err_userptr:
+	if (ret != -EIO)
 		i915_gem_cleanup_userptr(dev_priv);
+err_timelines:
+	if (ret != -EIO) {
 		i915_timelines_fini(dev_priv);
+		i915_gt_active_fini(i915_gt_active(dev_priv));
 	}
 
 	if (ret == -EIO) {
@@ -5177,6 +5186,7 @@  void i915_gem_fini(struct drm_i915_private *dev_priv)
 	intel_uc_fini_misc(dev_priv);
 	i915_gem_cleanup_userptr(dev_priv);
 	i915_timelines_fini(dev_priv);
+	i915_gt_active_fini(i915_gt_active(dev_priv));
 
 	i915_gem_drain_freed_objects(dev_priv);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index e625659c03a2..d8819de0d6ee 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1917,7 +1917,7 @@  static struct i915_vma *pd_vma_create(struct gen6_hw_ppgtt *ppgtt, int size)
 	if (!vma)
 		return ERR_PTR(-ENOMEM);
 
-	i915_active_init(i915, &vma->active, NULL);
+	i915_active_init(i915_gt_active(i915), &vma->active, NULL);
 	init_request_active(&vma->last_fence, NULL);
 
 	vma->vm = &ggtt->vm;
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index d4772061e642..2456bfb4877b 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -119,7 +119,8 @@  vma_create(struct drm_i915_gem_object *obj,
 	if (vma == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	i915_active_init(vm->i915, &vma->active, __i915_vma_retire);
+	i915_active_init(i915_gt_active(vm->i915),
+			 &vma->active, __i915_vma_retire);
 	init_request_active(&vma->last_fence, NULL);
 
 	vma->vm = vm;
diff --git a/drivers/gpu/drm/i915/selftests/i915_active.c b/drivers/gpu/drm/i915/selftests/i915_active.c
index 7c5c3068565b..0e923476920e 100644
--- a/drivers/gpu/drm/i915/selftests/i915_active.c
+++ b/drivers/gpu/drm/i915/selftests/i915_active.c
@@ -30,7 +30,8 @@  static int __live_active_setup(struct drm_i915_private *i915,
 	unsigned int count = 0;
 	int err = 0;
 
-	i915_active_init(i915, &active->base, __live_active_retire);
+	i915_active_init(i915_gt_active(i915),
+			 &active->base, __live_active_retire);
 	active->retired = false;
 
 	if (!i915_active_acquire(&active->base)) {
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 074a0d9cbf26..5b88f74c1677 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -69,6 +69,7 @@  static void mock_device_release(struct drm_device *dev)
 	mutex_unlock(&i915->drm.struct_mutex);
 
 	i915_timelines_fini(i915);
+	i915_gt_active_fini(i915_gt_active(i915));
 
 	drain_workqueue(i915->wq);
 	i915_gem_drain_freed_objects(i915);
@@ -228,6 +229,9 @@  struct drm_i915_private *mock_gem_device(void)
 	if (!i915->priorities)
 		goto err_dependencies;
 
+	if (i915_gt_active_init(i915_gt_active(i915)))
+		goto err_priorities;
+
 	i915_timelines_init(i915);
 
 	INIT_LIST_HEAD(&i915->gt.active_rings);
@@ -257,6 +261,8 @@  struct drm_i915_private *mock_gem_device(void)
 err_unlock:
 	mutex_unlock(&i915->drm.struct_mutex);
 	i915_timelines_fini(i915);
+	i915_gt_active_fini(i915_gt_active(i915));
+err_priorities:
 	kmem_cache_destroy(i915->priorities);
 err_dependencies:
 	kmem_cache_destroy(i915->dependencies);