[04/11] drm/i915: Make request allocation caches global
diff mbox series

Message ID 20190226102404.29153-4-chris@chris-wilson.co.uk
State New
Headers show
Series
  • [01/11] drm/i915: Skip scanning for signalers if we are already inflight
Related show

Commit Message

Chris Wilson Feb. 26, 2019, 10:23 a.m. UTC
As kmem_caches share the same properties (size, allocation/free behaviour)
for all potential devices, we can use global caches. While this
potentially has worse fragmentation behaviour (one can argue that
different devices would have different activity lifetimes, but you can
also argue that activity is temporal across the system) it is the
default behaviour of the system at large to amalgamate matching caches.

The benefit for us is much reduced pointer dancing along the frequent
allocation paths.

v2: Defer shrinking until after a global grace period for futureproofing
multiple consumers of the slab caches, similar to the current strategy
for avoiding shrinking too early.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/Makefile                 |   1 +
 drivers/gpu/drm/i915/i915_active.c            |   7 +-
 drivers/gpu/drm/i915/i915_active.h            |   1 +
 drivers/gpu/drm/i915/i915_drv.h               |   3 -
 drivers/gpu/drm/i915/i915_gem.c               |  34 +-----
 drivers/gpu/drm/i915/i915_globals.c           | 113 ++++++++++++++++++
 drivers/gpu/drm/i915/i915_globals.h           |  15 +++
 drivers/gpu/drm/i915/i915_pci.c               |   8 +-
 drivers/gpu/drm/i915/i915_request.c           |  53 ++++++--
 drivers/gpu/drm/i915/i915_request.h           |  10 ++
 drivers/gpu/drm/i915/i915_scheduler.c         |  66 +++++++---
 drivers/gpu/drm/i915/i915_scheduler.h         |  34 +++++-
 drivers/gpu/drm/i915/intel_guc_submission.c   |   3 +-
 drivers/gpu/drm/i915/intel_lrc.c              |   6 +-
 drivers/gpu/drm/i915/intel_ringbuffer.h       |  17 ---
 drivers/gpu/drm/i915/selftests/intel_lrc.c    |   2 +-
 drivers/gpu/drm/i915/selftests/mock_engine.c  |  45 ++++---
 .../gpu/drm/i915/selftests/mock_gem_device.c  |  26 ----
 drivers/gpu/drm/i915/selftests/mock_request.c |  12 +-
 drivers/gpu/drm/i915/selftests/mock_request.h |   7 --
 20 files changed, 313 insertions(+), 150 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/i915_globals.c
 create mode 100644 drivers/gpu/drm/i915/i915_globals.h

Comments

Tvrtko Ursulin Feb. 27, 2019, 10:29 a.m. UTC | #1
On 26/02/2019 10:23, Chris Wilson wrote:
> As kmem_caches share the same properties (size, allocation/free behaviour)
> for all potential devices, we can use global caches. While this
> potential has worse fragmentation behaviour (one can argue that
> different devices would have different activity lifetimes, but you can
> also argue that activity is temporal across the system) it is the
> default behaviour of the system at large to amalgamate matching caches.
> 
> The benefit for us is much reduced pointer dancing along the frequent
> allocation paths.
> 
> v2: Defer shrinking until after a global grace period for futureproofing
> multiple consumers of the slab caches, similar to the current strategy
> for avoiding shrinking too early.

I suggested to call i915_globals_park directly from __i915_gem_park for 
symmetry with how i915_gem_unpark calls i915_globals_unpark. 
i915_globals has its own delayed setup so I don't think it benefits 
from the double indirection courtesy of being called from shrink_caches.

Otherwise I had no other complaints, but this asymmetry for no reason 
is bugging me.

Regards,

Tvrtko

> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/Makefile                 |   1 +
>   drivers/gpu/drm/i915/i915_active.c            |   7 +-
>   drivers/gpu/drm/i915/i915_active.h            |   1 +
>   drivers/gpu/drm/i915/i915_drv.h               |   3 -
>   drivers/gpu/drm/i915/i915_gem.c               |  34 +-----
>   drivers/gpu/drm/i915/i915_globals.c           | 113 ++++++++++++++++++
>   drivers/gpu/drm/i915/i915_globals.h           |  15 +++
>   drivers/gpu/drm/i915/i915_pci.c               |   8 +-
>   drivers/gpu/drm/i915/i915_request.c           |  53 ++++++--
>   drivers/gpu/drm/i915/i915_request.h           |  10 ++
>   drivers/gpu/drm/i915/i915_scheduler.c         |  66 +++++++---
>   drivers/gpu/drm/i915/i915_scheduler.h         |  34 +++++-
>   drivers/gpu/drm/i915/intel_guc_submission.c   |   3 +-
>   drivers/gpu/drm/i915/intel_lrc.c              |   6 +-
>   drivers/gpu/drm/i915/intel_ringbuffer.h       |  17 ---
>   drivers/gpu/drm/i915/selftests/intel_lrc.c    |   2 +-
>   drivers/gpu/drm/i915/selftests/mock_engine.c  |  45 ++++---
>   .../gpu/drm/i915/selftests/mock_gem_device.c  |  26 ----
>   drivers/gpu/drm/i915/selftests/mock_request.c |  12 +-
>   drivers/gpu/drm/i915/selftests/mock_request.h |   7 --
>   20 files changed, 313 insertions(+), 150 deletions(-)
>   create mode 100644 drivers/gpu/drm/i915/i915_globals.c
>   create mode 100644 drivers/gpu/drm/i915/i915_globals.h
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 1787e1299b1b..a1d834068765 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -77,6 +77,7 @@ i915-y += \
>   	  i915_gem_tiling.o \
>   	  i915_gem_userptr.o \
>   	  i915_gemfs.o \
> +	  i915_globals.o \
>   	  i915_query.o \
>   	  i915_request.o \
>   	  i915_scheduler.o \
> diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
> index db7bb5bd5add..d9f6471ac16c 100644
> --- a/drivers/gpu/drm/i915/i915_active.c
> +++ b/drivers/gpu/drm/i915/i915_active.c
> @@ -294,7 +294,12 @@ int __init i915_global_active_init(void)
>   	return 0;
>   }
>   
> -void __exit i915_global_active_exit(void)
> +void i915_global_active_shrink(void)
> +{
> +	kmem_cache_shrink(global.slab_cache);
> +}
> +
> +void i915_global_active_exit(void)
>   {
>   	kmem_cache_destroy(global.slab_cache);
>   }
> diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h
> index 12b5c1d287d1..5fbd9102384b 100644
> --- a/drivers/gpu/drm/i915/i915_active.h
> +++ b/drivers/gpu/drm/i915/i915_active.h
> @@ -420,6 +420,7 @@ static inline void i915_active_fini(struct i915_active *ref) { }
>   #endif
>   
>   int i915_global_active_init(void);
> +void i915_global_active_shrink(void);
>   void i915_global_active_exit(void);
>   
>   #endif /* _I915_ACTIVE_H_ */
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index cc09caf3870e..f16016b330b3 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1473,9 +1473,6 @@ struct drm_i915_private {
>   	struct kmem_cache *objects;
>   	struct kmem_cache *vmas;
>   	struct kmem_cache *luts;
> -	struct kmem_cache *requests;
> -	struct kmem_cache *dependencies;
> -	struct kmem_cache *priorities;
>   
>   	const struct intel_device_info __info; /* Use INTEL_INFO() to access. */
>   	struct intel_runtime_info __runtime; /* Use RUNTIME_INFO() to access. */
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 2b261524cfa4..713ed6fbdcc8 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -42,6 +42,7 @@
>   #include "i915_drv.h"
>   #include "i915_gem_clflush.h"
>   #include "i915_gemfs.h"
> +#include "i915_globals.h"
>   #include "i915_reset.h"
>   #include "i915_trace.h"
>   #include "i915_vgpu.h"
> @@ -187,6 +188,8 @@ void i915_gem_unpark(struct drm_i915_private *i915)
>   	if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
>   		i915->gt.epoch = 1;
>   
> +	i915_globals_unpark();
> +
>   	intel_enable_gt_powersave(i915);
>   	i915_update_gfx_val(i915);
>   	if (INTEL_GEN(i915) >= 6)
> @@ -2892,12 +2895,11 @@ static void shrink_caches(struct drm_i915_private *i915)
>   	 * filled slabs to prioritise allocating from the mostly full slabs,
>   	 * with the aim of reducing fragmentation.
>   	 */
> -	kmem_cache_shrink(i915->priorities);
> -	kmem_cache_shrink(i915->dependencies);
> -	kmem_cache_shrink(i915->requests);
>   	kmem_cache_shrink(i915->luts);
>   	kmem_cache_shrink(i915->vmas);
>   	kmem_cache_shrink(i915->objects);
> +
> +	i915_globals_park();
>   }
>   
>   struct sleep_rcu_work {
> @@ -5235,23 +5237,6 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
>   	if (!dev_priv->luts)
>   		goto err_vmas;
>   
> -	dev_priv->requests = KMEM_CACHE(i915_request,
> -					SLAB_HWCACHE_ALIGN |
> -					SLAB_RECLAIM_ACCOUNT |
> -					SLAB_TYPESAFE_BY_RCU);
> -	if (!dev_priv->requests)
> -		goto err_luts;
> -
> -	dev_priv->dependencies = KMEM_CACHE(i915_dependency,
> -					    SLAB_HWCACHE_ALIGN |
> -					    SLAB_RECLAIM_ACCOUNT);
> -	if (!dev_priv->dependencies)
> -		goto err_requests;
> -
> -	dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
> -	if (!dev_priv->priorities)
> -		goto err_dependencies;
> -
>   	INIT_LIST_HEAD(&dev_priv->gt.active_rings);
>   	INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
>   
> @@ -5276,12 +5261,6 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
>   
>   	return 0;
>   
> -err_dependencies:
> -	kmem_cache_destroy(dev_priv->dependencies);
> -err_requests:
> -	kmem_cache_destroy(dev_priv->requests);
> -err_luts:
> -	kmem_cache_destroy(dev_priv->luts);
>   err_vmas:
>   	kmem_cache_destroy(dev_priv->vmas);
>   err_objects:
> @@ -5299,9 +5278,6 @@ void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
>   
>   	cleanup_srcu_struct(&dev_priv->gpu_error.reset_backoff_srcu);
>   
> -	kmem_cache_destroy(dev_priv->priorities);
> -	kmem_cache_destroy(dev_priv->dependencies);
> -	kmem_cache_destroy(dev_priv->requests);
>   	kmem_cache_destroy(dev_priv->luts);
>   	kmem_cache_destroy(dev_priv->vmas);
>   	kmem_cache_destroy(dev_priv->objects);
> diff --git a/drivers/gpu/drm/i915/i915_globals.c b/drivers/gpu/drm/i915/i915_globals.c
> new file mode 100644
> index 000000000000..7fd1b3945a04
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_globals.c
> @@ -0,0 +1,113 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#include <linux/slab.h>
> +#include <linux/workqueue.h>
> +
> +#include "i915_active.h"
> +#include "i915_globals.h"
> +#include "i915_request.h"
> +#include "i915_scheduler.h"
> +
> +int __init i915_globals_init(void)
> +{
> +	int err;
> +
> +	err = i915_global_active_init();
> +	if (err)
> +		return err;
> +
> +	err = i915_global_request_init();
> +	if (err)
> +		goto err_active;
> +
> +	err = i915_global_scheduler_init();
> +	if (err)
> +		goto err_request;
> +
> +	return 0;
> +
> +err_request:
> +	i915_global_request_exit();
> +err_active:
> +	i915_global_active_exit();
> +	return err;
> +}
> +
> +static void i915_globals_shrink(void)
> +{
> +	/*
> +	 * kmem_cache_shrink() discards empty slabs and reorders partially
> +	 * filled slabs to prioritise allocating from the mostly full slabs,
> +	 * with the aim of reducing fragmentation.
> +	 */
> +	i915_global_active_shrink();
> +	i915_global_request_shrink();
> +	i915_global_scheduler_shrink();
> +}
> +
> +static atomic_t active;
> +static atomic_t epoch;
> +struct park_work {
> +	struct rcu_work work;
> +	int epoch;
> +};
> +
> +static void __i915_globals_park(struct work_struct *work)
> +{
> +	struct park_work *wrk = container_of(work, typeof(*wrk), work.work);
> +
> +	/* Confirm nothing woke up in the last grace period */
> +	if (wrk->epoch == atomic_read(&epoch))
> +		i915_globals_shrink();
> +
> +	kfree(wrk);
> +}
> +
> +void i915_globals_park(void)
> +{
> +	struct park_work *wrk;
> +
> +	/*
> +	 * Defer shrinking the global slab caches (and other work) until
> +	 * after a RCU grace period has completed with no activity. This
> +	 * is to try and reduce the latency impact on the consumers caused
> +	 * by us shrinking the caches the same time as they are trying to
> +	 * allocate, with the assumption being that if we idle long enough
> +	 * for an RCU grace period to elapse since the last use, it is likely
> +	 * to be longer until we need the caches again.
> +	 */
> +	if (!atomic_dec_and_test(&active))
> +		return;
> +
> +	wrk = kmalloc(sizeof(*wrk), GFP_KERNEL);
> +	if (!wrk)
> +		return;
> +
> +	wrk->epoch = atomic_inc_return(&epoch);
> +	INIT_RCU_WORK(&wrk->work, __i915_globals_park);
> +	queue_rcu_work(system_wq, &wrk->work);
> +}
> +
> +void i915_globals_unpark(void)
> +{
> +	atomic_inc(&epoch);
> +	atomic_inc(&active);
> +}
> +
> +void __exit i915_globals_exit(void)
> +{
> +	/* Flush any residual park_work */
> +	rcu_barrier();
> +	flush_scheduled_work();
> +
> +	i915_global_scheduler_exit();
> +	i915_global_request_exit();
> +	i915_global_active_exit();
> +
> +	/* And ensure that our DESTROY_BY_RCU slabs are truly destroyed */
> +	rcu_barrier();
> +}
> diff --git a/drivers/gpu/drm/i915/i915_globals.h b/drivers/gpu/drm/i915/i915_globals.h
> new file mode 100644
> index 000000000000..e468f0413a73
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_globals.h
> @@ -0,0 +1,15 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#ifndef _I915_GLOBALS_H_
> +#define _I915_GLOBALS_H_
> +
> +int i915_globals_init(void);
> +void i915_globals_park(void);
> +void i915_globals_unpark(void);
> +void i915_globals_exit(void);
> +
> +#endif /* _I915_GLOBALS_H_ */
> diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
> index c4d6b8da9b03..a9211c370cd1 100644
> --- a/drivers/gpu/drm/i915/i915_pci.c
> +++ b/drivers/gpu/drm/i915/i915_pci.c
> @@ -28,8 +28,8 @@
>   
>   #include <drm/drm_drv.h>
>   
> -#include "i915_active.h"
>   #include "i915_drv.h"
> +#include "i915_globals.h"
>   #include "i915_selftest.h"
>   
>   #define PLATFORM(x) .platform = (x), .platform_mask = BIT(x)
> @@ -802,7 +802,9 @@ static int __init i915_init(void)
>   	bool use_kms = true;
>   	int err;
>   
> -	i915_global_active_init();
> +	err = i915_globals_init();
> +	if (err)
> +		return err;
>   
>   	err = i915_mock_selftests();
>   	if (err)
> @@ -835,7 +837,7 @@ static void __exit i915_exit(void)
>   		return;
>   
>   	pci_unregister_driver(&i915_pci_driver);
> -	i915_global_active_exit();
> +	i915_globals_exit();
>   }
>   
>   module_init(i915_init);
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 00a1ea7cd907..c65f6c990fdd 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -32,6 +32,11 @@
>   #include "i915_active.h"
>   #include "i915_reset.h"
>   
> +static struct i915_global_request {
> +	struct kmem_cache *slab_requests;
> +	struct kmem_cache *slab_dependencies;
> +} global;
> +
>   static const char *i915_fence_get_driver_name(struct dma_fence *fence)
>   {
>   	return "i915";
> @@ -86,7 +91,7 @@ static void i915_fence_release(struct dma_fence *fence)
>   	 */
>   	i915_sw_fence_fini(&rq->submit);
>   
> -	kmem_cache_free(rq->i915->requests, rq);
> +	kmem_cache_free(global.slab_requests, rq);
>   }
>   
>   const struct dma_fence_ops i915_fence_ops = {
> @@ -292,7 +297,7 @@ static void i915_request_retire(struct i915_request *request)
>   
>   	unreserve_gt(request->i915);
>   
> -	i915_sched_node_fini(request->i915, &request->sched);
> +	i915_sched_node_fini(&request->sched);
>   	i915_request_put(request);
>   }
>   
> @@ -506,7 +511,7 @@ i915_request_alloc_slow(struct intel_context *ce)
>   	ring_retire_requests(ring);
>   
>   out:
> -	return kmem_cache_alloc(ce->gem_context->i915->requests, GFP_KERNEL);
> +	return kmem_cache_alloc(global.slab_requests, GFP_KERNEL);
>   }
>   
>   static int add_timeline_barrier(struct i915_request *rq)
> @@ -594,7 +599,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	 *
>   	 * Do not use kmem_cache_zalloc() here!
>   	 */
> -	rq = kmem_cache_alloc(i915->requests,
> +	rq = kmem_cache_alloc(global.slab_requests,
>   			      GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
>   	if (unlikely(!rq)) {
>   		rq = i915_request_alloc_slow(ce);
> @@ -681,7 +686,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	GEM_BUG_ON(!list_empty(&rq->sched.signalers_list));
>   	GEM_BUG_ON(!list_empty(&rq->sched.waiters_list));
>   
> -	kmem_cache_free(i915->requests, rq);
> +	kmem_cache_free(global.slab_requests, rq);
>   err_unreserve:
>   	unreserve_gt(i915);
>   	intel_context_unpin(ce);
> @@ -700,9 +705,7 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
>   		return 0;
>   
>   	if (to->engine->schedule) {
> -		ret = i915_sched_node_add_dependency(to->i915,
> -						     &to->sched,
> -						     &from->sched);
> +		ret = i915_sched_node_add_dependency(&to->sched, &from->sched);
>   		if (ret < 0)
>   			return ret;
>   	}
> @@ -1190,3 +1193,37 @@ void i915_retire_requests(struct drm_i915_private *i915)
>   #include "selftests/mock_request.c"
>   #include "selftests/i915_request.c"
>   #endif
> +
> +int __init i915_global_request_init(void)
> +{
> +	global.slab_requests = KMEM_CACHE(i915_request,
> +					  SLAB_HWCACHE_ALIGN |
> +					  SLAB_RECLAIM_ACCOUNT |
> +					  SLAB_TYPESAFE_BY_RCU);
> +	if (!global.slab_requests)
> +		return -ENOMEM;
> +
> +	global.slab_dependencies = KMEM_CACHE(i915_dependency,
> +					      SLAB_HWCACHE_ALIGN |
> +					      SLAB_RECLAIM_ACCOUNT);
> +	if (!global.slab_dependencies)
> +		goto err_requests;
> +
> +	return 0;
> +
> +err_requests:
> +	kmem_cache_destroy(global.slab_requests);
> +	return -ENOMEM;
> +}
> +
> +void i915_global_request_shrink(void)
> +{
> +	kmem_cache_shrink(global.slab_dependencies);
> +	kmem_cache_shrink(global.slab_requests);
> +}
> +
> +void i915_global_request_exit(void)
> +{
> +	kmem_cache_destroy(global.slab_dependencies);
> +	kmem_cache_destroy(global.slab_requests);
> +}
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 1e127c1c53fa..be3ded6bcf56 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -29,6 +29,7 @@
>   
>   #include "i915_gem.h"
>   #include "i915_scheduler.h"
> +#include "i915_selftest.h"
>   #include "i915_sw_fence.h"
>   
>   #include <uapi/drm/i915_drm.h>
> @@ -196,6 +197,11 @@ struct i915_request {
>   	struct drm_i915_file_private *file_priv;
>   	/** file_priv list entry for this request */
>   	struct list_head client_link;
> +
> +	I915_SELFTEST_DECLARE(struct {
> +		struct list_head link;
> +		unsigned long delay;
> +	} mock;)
>   };
>   
>   #define I915_FENCE_GFP (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
> @@ -371,4 +377,8 @@ static inline void i915_request_mark_complete(struct i915_request *rq)
>   
>   void i915_retire_requests(struct drm_i915_private *i915);
>   
> +int i915_global_request_init(void);
> +void i915_global_request_shrink(void);
> +void i915_global_request_exit(void);
> +
>   #endif /* I915_REQUEST_H */
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index 9fb96ff57a29..50018ad30233 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -10,6 +10,11 @@
>   #include "i915_request.h"
>   #include "i915_scheduler.h"
>   
> +static struct i915_global_scheduler {
> +	struct kmem_cache *slab_dependencies;
> +	struct kmem_cache *slab_priorities;
> +} global;
> +
>   static DEFINE_SPINLOCK(schedule_lock);
>   
>   static const struct i915_request *
> @@ -37,16 +42,15 @@ void i915_sched_node_init(struct i915_sched_node *node)
>   }
>   
>   static struct i915_dependency *
> -i915_dependency_alloc(struct drm_i915_private *i915)
> +i915_dependency_alloc(void)
>   {
> -	return kmem_cache_alloc(i915->dependencies, GFP_KERNEL);
> +	return kmem_cache_alloc(global.slab_dependencies, GFP_KERNEL);
>   }
>   
>   static void
> -i915_dependency_free(struct drm_i915_private *i915,
> -		     struct i915_dependency *dep)
> +i915_dependency_free(struct i915_dependency *dep)
>   {
> -	kmem_cache_free(i915->dependencies, dep);
> +	kmem_cache_free(global.slab_dependencies, dep);
>   }
>   
>   bool __i915_sched_node_add_dependency(struct i915_sched_node *node,
> @@ -73,25 +77,23 @@ bool __i915_sched_node_add_dependency(struct i915_sched_node *node,
>   	return ret;
>   }
>   
> -int i915_sched_node_add_dependency(struct drm_i915_private *i915,
> -				   struct i915_sched_node *node,
> +int i915_sched_node_add_dependency(struct i915_sched_node *node,
>   				   struct i915_sched_node *signal)
>   {
>   	struct i915_dependency *dep;
>   
> -	dep = i915_dependency_alloc(i915);
> +	dep = i915_dependency_alloc();
>   	if (!dep)
>   		return -ENOMEM;
>   
>   	if (!__i915_sched_node_add_dependency(node, signal, dep,
>   					      I915_DEPENDENCY_ALLOC))
> -		i915_dependency_free(i915, dep);
> +		i915_dependency_free(dep);
>   
>   	return 0;
>   }
>   
> -void i915_sched_node_fini(struct drm_i915_private *i915,
> -			  struct i915_sched_node *node)
> +void i915_sched_node_fini(struct i915_sched_node *node)
>   {
>   	struct i915_dependency *dep, *tmp;
>   
> @@ -111,7 +113,7 @@ void i915_sched_node_fini(struct drm_i915_private *i915,
>   
>   		list_del(&dep->wait_link);
>   		if (dep->flags & I915_DEPENDENCY_ALLOC)
> -			i915_dependency_free(i915, dep);
> +			i915_dependency_free(dep);
>   	}
>   
>   	/* Remove ourselves from everyone who depends upon us */
> @@ -121,7 +123,7 @@ void i915_sched_node_fini(struct drm_i915_private *i915,
>   
>   		list_del(&dep->signal_link);
>   		if (dep->flags & I915_DEPENDENCY_ALLOC)
> -			i915_dependency_free(i915, dep);
> +			i915_dependency_free(dep);
>   	}
>   
>   	spin_unlock(&schedule_lock);
> @@ -198,7 +200,7 @@ i915_sched_lookup_priolist(struct intel_engine_cs *engine, int prio)
>   	if (prio == I915_PRIORITY_NORMAL) {
>   		p = &execlists->default_priolist;
>   	} else {
> -		p = kmem_cache_alloc(engine->i915->priorities, GFP_ATOMIC);
> +		p = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
>   		/* Convert an allocation failure to a priority bump */
>   		if (unlikely(!p)) {
>   			prio = I915_PRIORITY_NORMAL; /* recurses just once */
> @@ -423,3 +425,39 @@ void i915_schedule_bump_priority(struct i915_request *rq, unsigned int bump)
>   
>   	spin_unlock_bh(&schedule_lock);
>   }
> +
> +void __i915_priolist_free(struct i915_priolist *p)
> +{
> +	kmem_cache_free(global.slab_priorities, p);
> +}
> +
> +int __init i915_global_scheduler_init(void)
> +{
> +	global.slab_dependencies = KMEM_CACHE(i915_dependency,
> +					      SLAB_HWCACHE_ALIGN);
> +	if (!global.slab_dependencies)
> +		return -ENOMEM;
> +
> +	global.slab_priorities = KMEM_CACHE(i915_priolist,
> +					    SLAB_HWCACHE_ALIGN);
> +	if (!global.slab_priorities)
> +		goto err_priorities;
> +
> +	return 0;
> +
> +err_priorities:
> +	kmem_cache_destroy(global.slab_priorities);
> +	return -ENOMEM;
> +}
> +
> +void i915_global_scheduler_shrink(void)
> +{
> +	kmem_cache_shrink(global.slab_dependencies);
> +	kmem_cache_shrink(global.slab_priorities);
> +}
> +
> +void i915_global_scheduler_exit(void)
> +{
> +	kmem_cache_destroy(global.slab_dependencies);
> +	kmem_cache_destroy(global.slab_priorities);
> +}
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> index 54bd6c89817e..5196ce07b6c2 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> @@ -85,6 +85,23 @@ struct i915_dependency {
>   #define I915_DEPENDENCY_ALLOC BIT(0)
>   };
>   
> +struct i915_priolist {
> +	struct list_head requests[I915_PRIORITY_COUNT];
> +	struct rb_node node;
> +	unsigned long used;
> +	int priority;
> +};
> +
> +#define priolist_for_each_request(it, plist, idx) \
> +	for (idx = 0; idx < ARRAY_SIZE((plist)->requests); idx++) \
> +		list_for_each_entry(it, &(plist)->requests[idx], sched.link)
> +
> +#define priolist_for_each_request_consume(it, n, plist, idx) \
> +	for (; (idx = ffs((plist)->used)); (plist)->used &= ~BIT(idx - 1)) \
> +		list_for_each_entry_safe(it, n, \
> +					 &(plist)->requests[idx - 1], \
> +					 sched.link)
> +
>   void i915_sched_node_init(struct i915_sched_node *node);
>   
>   bool __i915_sched_node_add_dependency(struct i915_sched_node *node,
> @@ -92,12 +109,10 @@ bool __i915_sched_node_add_dependency(struct i915_sched_node *node,
>   				      struct i915_dependency *dep,
>   				      unsigned long flags);
>   
> -int i915_sched_node_add_dependency(struct drm_i915_private *i915,
> -				   struct i915_sched_node *node,
> +int i915_sched_node_add_dependency(struct i915_sched_node *node,
>   				   struct i915_sched_node *signal);
>   
> -void i915_sched_node_fini(struct drm_i915_private *i915,
> -			  struct i915_sched_node *node);
> +void i915_sched_node_fini(struct i915_sched_node *node);
>   
>   void i915_schedule(struct i915_request *request,
>   		   const struct i915_sched_attr *attr);
> @@ -107,4 +122,15 @@ void i915_schedule_bump_priority(struct i915_request *rq, unsigned int bump);
>   struct list_head *
>   i915_sched_lookup_priolist(struct intel_engine_cs *engine, int prio);
>   
> +void __i915_priolist_free(struct i915_priolist *p);
> +static inline void i915_priolist_free(struct i915_priolist *p)
> +{
> +	if (p->priority != I915_PRIORITY_NORMAL)
> +		__i915_priolist_free(p);
> +}
> +
> +int i915_global_scheduler_init(void);
> +void i915_global_scheduler_shrink(void);
> +void i915_global_scheduler_exit(void);
> +
>   #endif /* _I915_SCHEDULER_H_ */
> diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
> index a2846ea1e62c..56ba2fcbabe6 100644
> --- a/drivers/gpu/drm/i915/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/intel_guc_submission.c
> @@ -781,8 +781,7 @@ static bool __guc_dequeue(struct intel_engine_cs *engine)
>   		}
>   
>   		rb_erase_cached(&p->node, &execlists->queue);
> -		if (p->priority != I915_PRIORITY_NORMAL)
> -			kmem_cache_free(engine->i915->priorities, p);
> +		i915_priolist_free(p);
>   	}
>   done:
>   	execlists->queue_priority_hint =
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index dba19baf6808..29b2a2f34edb 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -818,8 +818,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   		}
>   
>   		rb_erase_cached(&p->node, &execlists->queue);
> -		if (p->priority != I915_PRIORITY_NORMAL)
> -			kmem_cache_free(engine->i915->priorities, p);
> +		i915_priolist_free(p);
>   	}
>   
>   done:
> @@ -972,8 +971,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
>   		}
>   
>   		rb_erase_cached(&p->node, &execlists->queue);
> -		if (p->priority != I915_PRIORITY_NORMAL)
> -			kmem_cache_free(engine->i915->priorities, p);
> +		i915_priolist_free(p);
>   	}
>   
>   	/* Remaining _unready_ requests will be nop'ed when submitted */
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index de8dba7565b0..5284f243931a 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -187,23 +187,6 @@ enum intel_engine_id {
>   #define _VECS(n) (VECS + (n))
>   };
>   
> -struct i915_priolist {
> -	struct list_head requests[I915_PRIORITY_COUNT];
> -	struct rb_node node;
> -	unsigned long used;
> -	int priority;
> -};
> -
> -#define priolist_for_each_request(it, plist, idx) \
> -	for (idx = 0; idx < ARRAY_SIZE((plist)->requests); idx++) \
> -		list_for_each_entry(it, &(plist)->requests[idx], sched.link)
> -
> -#define priolist_for_each_request_consume(it, n, plist, idx) \
> -	for (; (idx = ffs((plist)->used)); (plist)->used &= ~BIT(idx - 1)) \
> -		list_for_each_entry_safe(it, n, \
> -					 &(plist)->requests[idx - 1], \
> -					 sched.link)
> -
>   struct st_preempt_hang {
>   	struct completion completion;
>   	unsigned int count;
> diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> index 7172f6c7f25a..2d582a21eba9 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> @@ -441,7 +441,7 @@ static struct i915_request *dummy_request(struct intel_engine_cs *engine)
>   static void dummy_request_free(struct i915_request *dummy)
>   {
>   	i915_request_mark_complete(dummy);
> -	i915_sched_node_fini(dummy->engine->i915, &dummy->sched);
> +	i915_sched_node_fini(&dummy->sched);
>   	i915_sw_fence_fini(&dummy->submit);
>   
>   	dma_fence_free(&dummy->fence);
> diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
> index 6f3fb803c747..ec1ae948954c 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_engine.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
> @@ -76,26 +76,26 @@ static void mock_ring_free(struct intel_ring *base)
>   	kfree(ring);
>   }
>   
> -static struct mock_request *first_request(struct mock_engine *engine)
> +static struct i915_request *first_request(struct mock_engine *engine)
>   {
>   	return list_first_entry_or_null(&engine->hw_queue,
> -					struct mock_request,
> -					link);
> +					struct i915_request,
> +					mock.link);
>   }
>   
> -static void advance(struct mock_request *request)
> +static void advance(struct i915_request *request)
>   {
> -	list_del_init(&request->link);
> -	i915_request_mark_complete(&request->base);
> -	GEM_BUG_ON(!i915_request_completed(&request->base));
> +	list_del_init(&request->mock.link);
> +	i915_request_mark_complete(request);
> +	GEM_BUG_ON(!i915_request_completed(request));
>   
> -	intel_engine_queue_breadcrumbs(request->base.engine);
> +	intel_engine_queue_breadcrumbs(request->engine);
>   }
>   
>   static void hw_delay_complete(struct timer_list *t)
>   {
>   	struct mock_engine *engine = from_timer(engine, t, hw_delay);
> -	struct mock_request *request;
> +	struct i915_request *request;
>   	unsigned long flags;
>   
>   	spin_lock_irqsave(&engine->hw_lock, flags);
> @@ -110,8 +110,9 @@ static void hw_delay_complete(struct timer_list *t)
>   	 * requeue the timer for the next delayed request.
>   	 */
>   	while ((request = first_request(engine))) {
> -		if (request->delay) {
> -			mod_timer(&engine->hw_delay, jiffies + request->delay);
> +		if (request->mock.delay) {
> +			mod_timer(&engine->hw_delay,
> +				  jiffies + request->mock.delay);
>   			break;
>   		}
>   
> @@ -169,10 +170,8 @@ mock_context_pin(struct intel_engine_cs *engine,
>   
>   static int mock_request_alloc(struct i915_request *request)
>   {
> -	struct mock_request *mock = container_of(request, typeof(*mock), base);
> -
> -	INIT_LIST_HEAD(&mock->link);
> -	mock->delay = 0;
> +	INIT_LIST_HEAD(&request->mock.link);
> +	request->mock.delay = 0;
>   
>   	return 0;
>   }
> @@ -190,7 +189,6 @@ static u32 *mock_emit_breadcrumb(struct i915_request *request, u32 *cs)
>   
>   static void mock_submit_request(struct i915_request *request)
>   {
> -	struct mock_request *mock = container_of(request, typeof(*mock), base);
>   	struct mock_engine *engine =
>   		container_of(request->engine, typeof(*engine), base);
>   	unsigned long flags;
> @@ -198,12 +196,13 @@ static void mock_submit_request(struct i915_request *request)
>   	i915_request_submit(request);
>   
>   	spin_lock_irqsave(&engine->hw_lock, flags);
> -	list_add_tail(&mock->link, &engine->hw_queue);
> -	if (mock->link.prev == &engine->hw_queue) {
> -		if (mock->delay)
> -			mod_timer(&engine->hw_delay, jiffies + mock->delay);
> +	list_add_tail(&request->mock.link, &engine->hw_queue);
> +	if (list_is_first(&request->mock.link, &engine->hw_queue)) {
> +		if (request->mock.delay)
> +			mod_timer(&engine->hw_delay,
> +				  jiffies + request->mock.delay);
>   		else
> -			advance(mock);
> +			advance(request);
>   	}
>   	spin_unlock_irqrestore(&engine->hw_lock, flags);
>   }
> @@ -263,12 +262,12 @@ void mock_engine_flush(struct intel_engine_cs *engine)
>   {
>   	struct mock_engine *mock =
>   		container_of(engine, typeof(*mock), base);
> -	struct mock_request *request, *rn;
> +	struct i915_request *request, *rn;
>   
>   	del_timer_sync(&mock->hw_delay);
>   
>   	spin_lock_irq(&mock->hw_lock);
> -	list_for_each_entry_safe(request, rn, &mock->hw_queue, link)
> +	list_for_each_entry_safe(request, rn, &mock->hw_queue, mock.link)
>   		advance(request);
>   	spin_unlock_irq(&mock->hw_lock);
>   }
> diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> index fc516a2970f4..5a98caba6d69 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> @@ -79,9 +79,6 @@ static void mock_device_release(struct drm_device *dev)
>   
>   	destroy_workqueue(i915->wq);
>   
> -	kmem_cache_destroy(i915->priorities);
> -	kmem_cache_destroy(i915->dependencies);
> -	kmem_cache_destroy(i915->requests);
>   	kmem_cache_destroy(i915->vmas);
>   	kmem_cache_destroy(i915->objects);
>   
> @@ -211,23 +208,6 @@ struct drm_i915_private *mock_gem_device(void)
>   	if (!i915->vmas)
>   		goto err_objects;
>   
> -	i915->requests = KMEM_CACHE(mock_request,
> -				    SLAB_HWCACHE_ALIGN |
> -				    SLAB_RECLAIM_ACCOUNT |
> -				    SLAB_TYPESAFE_BY_RCU);
> -	if (!i915->requests)
> -		goto err_vmas;
> -
> -	i915->dependencies = KMEM_CACHE(i915_dependency,
> -					SLAB_HWCACHE_ALIGN |
> -					SLAB_RECLAIM_ACCOUNT);
> -	if (!i915->dependencies)
> -		goto err_requests;
> -
> -	i915->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
> -	if (!i915->priorities)
> -		goto err_dependencies;
> -
>   	i915_timelines_init(i915);
>   
>   	INIT_LIST_HEAD(&i915->gt.active_rings);
> @@ -257,12 +237,6 @@ struct drm_i915_private *mock_gem_device(void)
>   err_unlock:
>   	mutex_unlock(&i915->drm.struct_mutex);
>   	i915_timelines_fini(i915);
> -	kmem_cache_destroy(i915->priorities);
> -err_dependencies:
> -	kmem_cache_destroy(i915->dependencies);
> -err_requests:
> -	kmem_cache_destroy(i915->requests);
> -err_vmas:
>   	kmem_cache_destroy(i915->vmas);
>   err_objects:
>   	kmem_cache_destroy(i915->objects);
> diff --git a/drivers/gpu/drm/i915/selftests/mock_request.c b/drivers/gpu/drm/i915/selftests/mock_request.c
> index 0dc29e242597..d1a7c9608712 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_request.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_request.c
> @@ -31,29 +31,25 @@ mock_request(struct intel_engine_cs *engine,
>   	     unsigned long delay)
>   {
>   	struct i915_request *request;
> -	struct mock_request *mock;
>   
>   	/* NB the i915->requests slab cache is enlarged to fit mock_request */
>   	request = i915_request_alloc(engine, context);
>   	if (IS_ERR(request))
>   		return NULL;
>   
> -	mock = container_of(request, typeof(*mock), base);
> -	mock->delay = delay;
> -
> -	return &mock->base;
> +	request->mock.delay = delay;
> +	return request;
>   }
>   
>   bool mock_cancel_request(struct i915_request *request)
>   {
> -	struct mock_request *mock = container_of(request, typeof(*mock), base);
>   	struct mock_engine *engine =
>   		container_of(request->engine, typeof(*engine), base);
>   	bool was_queued;
>   
>   	spin_lock_irq(&engine->hw_lock);
> -	was_queued = !list_empty(&mock->link);
> -	list_del_init(&mock->link);
> +	was_queued = !list_empty(&request->mock.link);
> +	list_del_init(&request->mock.link);
>   	spin_unlock_irq(&engine->hw_lock);
>   
>   	if (was_queued)
> diff --git a/drivers/gpu/drm/i915/selftests/mock_request.h b/drivers/gpu/drm/i915/selftests/mock_request.h
> index 995fb728380c..4acf0211df20 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_request.h
> +++ b/drivers/gpu/drm/i915/selftests/mock_request.h
> @@ -29,13 +29,6 @@
>   
>   #include "../i915_request.h"
>   
> -struct mock_request {
> -	struct i915_request base;
> -
> -	struct list_head link;
> -	unsigned long delay;
> -};
> -
>   struct i915_request *
>   mock_request(struct intel_engine_cs *engine,
>   	     struct i915_gem_context *context,
>
Chris Wilson Feb. 27, 2019, 10:44 a.m. UTC | #2
Quoting Tvrtko Ursulin (2019-02-27 10:29:43)
> 
> On 26/02/2019 10:23, Chris Wilson wrote:
> > As kmem_caches share the same properties (size, allocation/free behaviour)
> > for all potential devices, we can use global caches. While this
> > potential has worse fragmentation behaviour (one can argue that
> > different devices would have different activity lifetimes, but you can
> > also argue that activity is temporal across the system) it is the
> > default behaviour of the system at large to amalgamate matching caches.
> > 
> > The benefit for us is much reduced pointer dancing along the frequent
> > allocation paths.
> > 
> > v2: Defer shrinking until after a global grace period for futureproofing
> > multiple consumers of the slab caches, similar to the current strategy
> > for avoiding shrinking too early.
> 
> I suggested to call i915_globals_park directly from __i915_gem_park for 
> symmetry with how i915_gem_unpark calls i915_globals_unpark. 
i915_globals has its own delayed setup so I don't think it benefits 
> from the double indirection courtesy of being called from shrink_caches.

I replied I left that change until a later patch after the final
conversions. Mostly so that we had a standalone patch to revert if the
rcu_work turns out badly. In this patch, it was to be the simple
translation over to global_shrink, except you asked for it to be truly
global and so we needed another layer of counters.
-Chris
Tvrtko Ursulin Feb. 27, 2019, 2:17 p.m. UTC | #3
On 27/02/2019 10:44, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-02-27 10:29:43)
>>
>> On 26/02/2019 10:23, Chris Wilson wrote:
>>> As kmem_caches share the same properties (size, allocation/free behaviour)
>>> for all potential devices, we can use global caches. While this
>>> potential has worse fragmentation behaviour (one can argue that
>>> different devices would have different activity lifetimes, but you can
>>> also argue that activity is temporal across the system) it is the
>>> default behaviour of the system at large to amalgamate matching caches.
>>>
>>> The benefit for us is much reduced pointer dancing along the frequent
>>> allocation paths.
>>>
>>> v2: Defer shrinking until after a global grace period for futureproofing
>>> multiple consumers of the slab caches, similar to the current strategy
>>> for avoiding shrinking too early.
>>
>> I suggested to call i915_globals_park directly from __i915_gem_park for
>> symmetry with how i915_gem_unpark calls i915_globals_unpark.
>> i915_globals has it's own delayed setup so I don't think it benefits
>> from the double indirection courtesy of being called from shrink_caches.
> 
> I replied I left that change until a later patch after the final
> conversions. Mostly so that we had a standalone patch to revert if the
> rcu_work turns out badly. In this patch, it was to be the simple
> translation over to global_shrink, except you asked for it to be truly
> global and so we needed another layer of counters.

It's a hard sell I think. Because why even have rcu work now in this 
case? You could make i915_globals_park just shrink if active counter 
dropped to zero. I don't see a benefit in a temporary asymmetric solution.

Regards,

Tvrtko
Chris Wilson Feb. 27, 2019, 2:43 p.m. UTC | #4
Quoting Tvrtko Ursulin (2019-02-27 14:17:25)
> 
> On 27/02/2019 10:44, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-02-27 10:29:43)
> >>
> >> On 26/02/2019 10:23, Chris Wilson wrote:
> >>> As kmem_caches share the same properties (size, allocation/free behaviour)
> >>> for all potential devices, we can use global caches. While this
> >>> potential has worse fragmentation behaviour (one can argue that
> >>> different devices would have different activity lifetimes, but you can
> >>> also argue that activity is temporal across the system) it is the
> >>> default behaviour of the system at large to amalgamate matching caches.
> >>>
> >>> The benefit for us is much reduced pointer dancing along the frequent
> >>> allocation paths.
> >>>
> >>> v2: Defer shrinking until after a global grace period for futureproofing
> >>> multiple consumers of the slab caches, similar to the current strategy
> >>> for avoiding shrinking too early.
> >>
> >> I suggested to call i915_globals_park directly from __i915_gem_park for
> >> symmetry with how i915_gem_unpark calls i915_globals_unpark.
> >> i915_globals has it's own delayed setup so I don't think it benefits
> >> from the double indirection courtesy of being called from shrink_caches.
> > 
> > I replied I left that change until a later patch after the final
> > conversions. Mostly so that we had a standalone patch to revert if the
> > rcu_work turns out badly. In this patch, it was to be the simple
> > translation over to global_shrink, except you asked for it to be truly
> > global and so we needed another layer of counters.
> 
> It's a hard sell I think. Because why even have rcu work now in this 
> case? You could make i915_globals_park just shrink if active counter 
> dropped to zero. I don't see a benefit in a temporary asymmetric solution.

I did do just that in v1!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-Chris

Patch
diff mbox series

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 1787e1299b1b..a1d834068765 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -77,6 +77,7 @@  i915-y += \
 	  i915_gem_tiling.o \
 	  i915_gem_userptr.o \
 	  i915_gemfs.o \
+	  i915_globals.o \
 	  i915_query.o \
 	  i915_request.o \
 	  i915_scheduler.o \
diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
index db7bb5bd5add..d9f6471ac16c 100644
--- a/drivers/gpu/drm/i915/i915_active.c
+++ b/drivers/gpu/drm/i915/i915_active.c
@@ -294,7 +294,12 @@  int __init i915_global_active_init(void)
 	return 0;
 }
 
-void __exit i915_global_active_exit(void)
+void i915_global_active_shrink(void)
+{
+	kmem_cache_shrink(global.slab_cache);
+}
+
+void i915_global_active_exit(void)
 {
 	kmem_cache_destroy(global.slab_cache);
 }
diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h
index 12b5c1d287d1..5fbd9102384b 100644
--- a/drivers/gpu/drm/i915/i915_active.h
+++ b/drivers/gpu/drm/i915/i915_active.h
@@ -420,6 +420,7 @@  static inline void i915_active_fini(struct i915_active *ref) { }
 #endif
 
 int i915_global_active_init(void);
+void i915_global_active_shrink(void);
 void i915_global_active_exit(void);
 
 #endif /* _I915_ACTIVE_H_ */
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index cc09caf3870e..f16016b330b3 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1473,9 +1473,6 @@  struct drm_i915_private {
 	struct kmem_cache *objects;
 	struct kmem_cache *vmas;
 	struct kmem_cache *luts;
-	struct kmem_cache *requests;
-	struct kmem_cache *dependencies;
-	struct kmem_cache *priorities;
 
 	const struct intel_device_info __info; /* Use INTEL_INFO() to access. */
 	struct intel_runtime_info __runtime; /* Use RUNTIME_INFO() to access. */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 2b261524cfa4..713ed6fbdcc8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -42,6 +42,7 @@ 
 #include "i915_drv.h"
 #include "i915_gem_clflush.h"
 #include "i915_gemfs.h"
+#include "i915_globals.h"
 #include "i915_reset.h"
 #include "i915_trace.h"
 #include "i915_vgpu.h"
@@ -187,6 +188,8 @@  void i915_gem_unpark(struct drm_i915_private *i915)
 	if (unlikely(++i915->gt.epoch == 0)) /* keep 0 as invalid */
 		i915->gt.epoch = 1;
 
+	i915_globals_unpark();
+
 	intel_enable_gt_powersave(i915);
 	i915_update_gfx_val(i915);
 	if (INTEL_GEN(i915) >= 6)
@@ -2892,12 +2895,11 @@  static void shrink_caches(struct drm_i915_private *i915)
 	 * filled slabs to prioritise allocating from the mostly full slabs,
 	 * with the aim of reducing fragmentation.
 	 */
-	kmem_cache_shrink(i915->priorities);
-	kmem_cache_shrink(i915->dependencies);
-	kmem_cache_shrink(i915->requests);
 	kmem_cache_shrink(i915->luts);
 	kmem_cache_shrink(i915->vmas);
 	kmem_cache_shrink(i915->objects);
+
+	i915_globals_park();
 }
 
 struct sleep_rcu_work {
@@ -5235,23 +5237,6 @@  int i915_gem_init_early(struct drm_i915_private *dev_priv)
 	if (!dev_priv->luts)
 		goto err_vmas;
 
-	dev_priv->requests = KMEM_CACHE(i915_request,
-					SLAB_HWCACHE_ALIGN |
-					SLAB_RECLAIM_ACCOUNT |
-					SLAB_TYPESAFE_BY_RCU);
-	if (!dev_priv->requests)
-		goto err_luts;
-
-	dev_priv->dependencies = KMEM_CACHE(i915_dependency,
-					    SLAB_HWCACHE_ALIGN |
-					    SLAB_RECLAIM_ACCOUNT);
-	if (!dev_priv->dependencies)
-		goto err_requests;
-
-	dev_priv->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
-	if (!dev_priv->priorities)
-		goto err_dependencies;
-
 	INIT_LIST_HEAD(&dev_priv->gt.active_rings);
 	INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
 
@@ -5276,12 +5261,6 @@  int i915_gem_init_early(struct drm_i915_private *dev_priv)
 
 	return 0;
 
-err_dependencies:
-	kmem_cache_destroy(dev_priv->dependencies);
-err_requests:
-	kmem_cache_destroy(dev_priv->requests);
-err_luts:
-	kmem_cache_destroy(dev_priv->luts);
 err_vmas:
 	kmem_cache_destroy(dev_priv->vmas);
 err_objects:
@@ -5299,9 +5278,6 @@  void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
 
 	cleanup_srcu_struct(&dev_priv->gpu_error.reset_backoff_srcu);
 
-	kmem_cache_destroy(dev_priv->priorities);
-	kmem_cache_destroy(dev_priv->dependencies);
-	kmem_cache_destroy(dev_priv->requests);
 	kmem_cache_destroy(dev_priv->luts);
 	kmem_cache_destroy(dev_priv->vmas);
 	kmem_cache_destroy(dev_priv->objects);
diff --git a/drivers/gpu/drm/i915/i915_globals.c b/drivers/gpu/drm/i915/i915_globals.c
new file mode 100644
index 000000000000..7fd1b3945a04
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_globals.c
@@ -0,0 +1,113 @@ 
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include "i915_active.h"
+#include "i915_globals.h"
+#include "i915_request.h"
+#include "i915_scheduler.h"
+
+int __init i915_globals_init(void)
+{
+	int err;
+
+	err = i915_global_active_init();
+	if (err)
+		return err;
+
+	err = i915_global_request_init();
+	if (err)
+		goto err_active;
+
+	err = i915_global_scheduler_init();
+	if (err)
+		goto err_request;
+
+	return 0;
+
+err_request:
+	i915_global_request_exit();
+err_active:
+	i915_global_active_exit();
+	return err;
+}
+
+static void i915_globals_shrink(void)
+{
+	/*
+	 * kmem_cache_shrink() discards empty slabs and reorders partially
+	 * filled slabs to prioritise allocating from the mostly full slabs,
+	 * with the aim of reducing fragmentation.
+	 */
+	i915_global_active_shrink();
+	i915_global_request_shrink();
+	i915_global_scheduler_shrink();
+}
+
+static atomic_t active;
+static atomic_t epoch;
+struct park_work {
+	struct rcu_work work;
+	int epoch;
+};
+
+static void __i915_globals_park(struct work_struct *work)
+{
+	struct park_work *wrk = container_of(work, typeof(*wrk), work.work);
+
+	/* Confirm nothing woke up in the last grace period */
+	if (wrk->epoch == atomic_read(&epoch))
+		i915_globals_shrink();
+
+	kfree(wrk);
+}
+
+void i915_globals_park(void)
+{
+	struct park_work *wrk;
+
+	/*
+	 * Defer shrinking the global slab caches (and other work) until
+	 * after a RCU grace period has completed with no activity. This
+	 * is to try and reduce the latency impact on the consumers caused
+	 * by us shrinking the caches the same time as they are trying to
+	 * allocate, with the assumption being that if we idle long enough
+	 * for an RCU grace period to elapse since the last use, it is likely
+	 * to be longer until we need the caches again.
+	 */
+	if (!atomic_dec_and_test(&active))
+		return;
+
+	wrk = kmalloc(sizeof(*wrk), GFP_KERNEL);
+	if (!wrk)
+		return;
+
+	wrk->epoch = atomic_inc_return(&epoch);
+	INIT_RCU_WORK(&wrk->work, __i915_globals_park);
+	queue_rcu_work(system_wq, &wrk->work);
+}
+
+void i915_globals_unpark(void)
+{
+	atomic_inc(&epoch);
+	atomic_inc(&active);
+}
+
+void __exit i915_globals_exit(void)
+{
+	/* Flush any residual park_work */
+	rcu_barrier();
+	flush_scheduled_work();
+
+	i915_global_scheduler_exit();
+	i915_global_request_exit();
+	i915_global_active_exit();
+
+	/* And ensure that our DESTROY_BY_RCU slabs are truly destroyed */
+	rcu_barrier();
+}
diff --git a/drivers/gpu/drm/i915/i915_globals.h b/drivers/gpu/drm/i915/i915_globals.h
new file mode 100644
index 000000000000..e468f0413a73
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_globals.h
@@ -0,0 +1,15 @@ 
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#ifndef _I915_GLOBALS_H_
+#define _I915_GLOBALS_H_
+
+int i915_globals_init(void);
+void i915_globals_park(void);
+void i915_globals_unpark(void);
+void i915_globals_exit(void);
+
+#endif /* _I915_GLOBALS_H_ */
diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
index c4d6b8da9b03..a9211c370cd1 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -28,8 +28,8 @@ 
 
 #include <drm/drm_drv.h>
 
-#include "i915_active.h"
 #include "i915_drv.h"
+#include "i915_globals.h"
 #include "i915_selftest.h"
 
 #define PLATFORM(x) .platform = (x), .platform_mask = BIT(x)
@@ -802,7 +802,9 @@  static int __init i915_init(void)
 	bool use_kms = true;
 	int err;
 
-	i915_global_active_init();
+	err = i915_globals_init();
+	if (err)
+		return err;
 
 	err = i915_mock_selftests();
 	if (err)
@@ -835,7 +837,7 @@  static void __exit i915_exit(void)
 		return;
 
 	pci_unregister_driver(&i915_pci_driver);
-	i915_global_active_exit();
+	i915_globals_exit();
 }
 
 module_init(i915_init);
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 00a1ea7cd907..c65f6c990fdd 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -32,6 +32,11 @@ 
 #include "i915_active.h"
 #include "i915_reset.h"
 
+static struct i915_global_request {
+	struct kmem_cache *slab_requests;
+	struct kmem_cache *slab_dependencies;
+} global;
+
 static const char *i915_fence_get_driver_name(struct dma_fence *fence)
 {
 	return "i915";
@@ -86,7 +91,7 @@  static void i915_fence_release(struct dma_fence *fence)
 	 */
 	i915_sw_fence_fini(&rq->submit);
 
-	kmem_cache_free(rq->i915->requests, rq);
+	kmem_cache_free(global.slab_requests, rq);
 }
 
 const struct dma_fence_ops i915_fence_ops = {
@@ -292,7 +297,7 @@  static void i915_request_retire(struct i915_request *request)
 
 	unreserve_gt(request->i915);
 
-	i915_sched_node_fini(request->i915, &request->sched);
+	i915_sched_node_fini(&request->sched);
 	i915_request_put(request);
 }
 
@@ -506,7 +511,7 @@  i915_request_alloc_slow(struct intel_context *ce)
 	ring_retire_requests(ring);
 
 out:
-	return kmem_cache_alloc(ce->gem_context->i915->requests, GFP_KERNEL);
+	return kmem_cache_alloc(global.slab_requests, GFP_KERNEL);
 }
 
 static int add_timeline_barrier(struct i915_request *rq)
@@ -594,7 +599,7 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	 *
 	 * Do not use kmem_cache_zalloc() here!
 	 */
-	rq = kmem_cache_alloc(i915->requests,
+	rq = kmem_cache_alloc(global.slab_requests,
 			      GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN);
 	if (unlikely(!rq)) {
 		rq = i915_request_alloc_slow(ce);
@@ -681,7 +686,7 @@  i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	GEM_BUG_ON(!list_empty(&rq->sched.signalers_list));
 	GEM_BUG_ON(!list_empty(&rq->sched.waiters_list));
 
-	kmem_cache_free(i915->requests, rq);
+	kmem_cache_free(global.slab_requests, rq);
 err_unreserve:
 	unreserve_gt(i915);
 	intel_context_unpin(ce);
@@ -700,9 +705,7 @@  i915_request_await_request(struct i915_request *to, struct i915_request *from)
 		return 0;
 
 	if (to->engine->schedule) {
-		ret = i915_sched_node_add_dependency(to->i915,
-						     &to->sched,
-						     &from->sched);
+		ret = i915_sched_node_add_dependency(&to->sched, &from->sched);
 		if (ret < 0)
 			return ret;
 	}
@@ -1190,3 +1193,37 @@  void i915_retire_requests(struct drm_i915_private *i915)
 #include "selftests/mock_request.c"
 #include "selftests/i915_request.c"
 #endif
+
+int __init i915_global_request_init(void)
+{
+	global.slab_requests = KMEM_CACHE(i915_request,
+					  SLAB_HWCACHE_ALIGN |
+					  SLAB_RECLAIM_ACCOUNT |
+					  SLAB_TYPESAFE_BY_RCU);
+	if (!global.slab_requests)
+		return -ENOMEM;
+
+	global.slab_dependencies = KMEM_CACHE(i915_dependency,
+					      SLAB_HWCACHE_ALIGN |
+					      SLAB_RECLAIM_ACCOUNT);
+	if (!global.slab_dependencies)
+		goto err_requests;
+
+	return 0;
+
+err_requests:
+	kmem_cache_destroy(global.slab_requests);
+	return -ENOMEM;
+}
+
+void i915_global_request_shrink(void)
+{
+	kmem_cache_shrink(global.slab_dependencies);
+	kmem_cache_shrink(global.slab_requests);
+}
+
+void i915_global_request_exit(void)
+{
+	kmem_cache_destroy(global.slab_dependencies);
+	kmem_cache_destroy(global.slab_requests);
+}
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 1e127c1c53fa..be3ded6bcf56 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -29,6 +29,7 @@ 
 
 #include "i915_gem.h"
 #include "i915_scheduler.h"
+#include "i915_selftest.h"
 #include "i915_sw_fence.h"
 
 #include <uapi/drm/i915_drm.h>
@@ -196,6 +197,11 @@  struct i915_request {
 	struct drm_i915_file_private *file_priv;
 	/** file_priv list entry for this request */
 	struct list_head client_link;
+
+	I915_SELFTEST_DECLARE(struct {
+		struct list_head link;
+		unsigned long delay;
+	} mock;)
 };
 
 #define I915_FENCE_GFP (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
@@ -371,4 +377,8 @@  static inline void i915_request_mark_complete(struct i915_request *rq)
 
 void i915_retire_requests(struct drm_i915_private *i915);
 
+int i915_global_request_init(void);
+void i915_global_request_shrink(void);
+void i915_global_request_exit(void);
+
 #endif /* I915_REQUEST_H */
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index 9fb96ff57a29..50018ad30233 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -10,6 +10,11 @@ 
 #include "i915_request.h"
 #include "i915_scheduler.h"
 
+static struct i915_global_scheduler {
+	struct kmem_cache *slab_dependencies;
+	struct kmem_cache *slab_priorities;
+} global;
+
 static DEFINE_SPINLOCK(schedule_lock);
 
 static const struct i915_request *
@@ -37,16 +42,15 @@  void i915_sched_node_init(struct i915_sched_node *node)
 }
 
 static struct i915_dependency *
-i915_dependency_alloc(struct drm_i915_private *i915)
+i915_dependency_alloc(void)
 {
-	return kmem_cache_alloc(i915->dependencies, GFP_KERNEL);
+	return kmem_cache_alloc(global.slab_dependencies, GFP_KERNEL);
 }
 
 static void
-i915_dependency_free(struct drm_i915_private *i915,
-		     struct i915_dependency *dep)
+i915_dependency_free(struct i915_dependency *dep)
 {
-	kmem_cache_free(i915->dependencies, dep);
+	kmem_cache_free(global.slab_dependencies, dep);
 }
 
 bool __i915_sched_node_add_dependency(struct i915_sched_node *node,
@@ -73,25 +77,23 @@  bool __i915_sched_node_add_dependency(struct i915_sched_node *node,
 	return ret;
 }
 
-int i915_sched_node_add_dependency(struct drm_i915_private *i915,
-				   struct i915_sched_node *node,
+int i915_sched_node_add_dependency(struct i915_sched_node *node,
 				   struct i915_sched_node *signal)
 {
 	struct i915_dependency *dep;
 
-	dep = i915_dependency_alloc(i915);
+	dep = i915_dependency_alloc();
 	if (!dep)
 		return -ENOMEM;
 
 	if (!__i915_sched_node_add_dependency(node, signal, dep,
 					      I915_DEPENDENCY_ALLOC))
-		i915_dependency_free(i915, dep);
+		i915_dependency_free(dep);
 
 	return 0;
 }
 
-void i915_sched_node_fini(struct drm_i915_private *i915,
-			  struct i915_sched_node *node)
+void i915_sched_node_fini(struct i915_sched_node *node)
 {
 	struct i915_dependency *dep, *tmp;
 
@@ -111,7 +113,7 @@  void i915_sched_node_fini(struct drm_i915_private *i915,
 
 		list_del(&dep->wait_link);
 		if (dep->flags & I915_DEPENDENCY_ALLOC)
-			i915_dependency_free(i915, dep);
+			i915_dependency_free(dep);
 	}
 
 	/* Remove ourselves from everyone who depends upon us */
@@ -121,7 +123,7 @@  void i915_sched_node_fini(struct drm_i915_private *i915,
 
 		list_del(&dep->signal_link);
 		if (dep->flags & I915_DEPENDENCY_ALLOC)
-			i915_dependency_free(i915, dep);
+			i915_dependency_free(dep);
 	}
 
 	spin_unlock(&schedule_lock);
@@ -198,7 +200,7 @@  i915_sched_lookup_priolist(struct intel_engine_cs *engine, int prio)
 	if (prio == I915_PRIORITY_NORMAL) {
 		p = &execlists->default_priolist;
 	} else {
-		p = kmem_cache_alloc(engine->i915->priorities, GFP_ATOMIC);
+		p = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
 		/* Convert an allocation failure to a priority bump */
 		if (unlikely(!p)) {
 			prio = I915_PRIORITY_NORMAL; /* recurses just once */
@@ -423,3 +425,39 @@  void i915_schedule_bump_priority(struct i915_request *rq, unsigned int bump)
 
 	spin_unlock_bh(&schedule_lock);
 }
+
+void __i915_priolist_free(struct i915_priolist *p)
+{
+	kmem_cache_free(global.slab_priorities, p);
+}
+
+int __init i915_global_scheduler_init(void)
+{
+	global.slab_dependencies = KMEM_CACHE(i915_dependency,
+					      SLAB_HWCACHE_ALIGN);
+	if (!global.slab_dependencies)
+		return -ENOMEM;
+
+	global.slab_priorities = KMEM_CACHE(i915_priolist,
+					    SLAB_HWCACHE_ALIGN);
+	if (!global.slab_priorities)
+		goto err_dependencies;
+
+	return 0;
+
+err_dependencies:
+	kmem_cache_destroy(global.slab_dependencies);
+	return -ENOMEM;
+}
+
+void i915_global_scheduler_shrink(void)
+{
+	kmem_cache_shrink(global.slab_dependencies);
+	kmem_cache_shrink(global.slab_priorities);
+}
+
+void i915_global_scheduler_exit(void)
+{
+	kmem_cache_destroy(global.slab_dependencies);
+	kmem_cache_destroy(global.slab_priorities);
+}
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index 54bd6c89817e..5196ce07b6c2 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -85,6 +85,23 @@  struct i915_dependency {
 #define I915_DEPENDENCY_ALLOC BIT(0)
 };
 
+struct i915_priolist {
+	struct list_head requests[I915_PRIORITY_COUNT];
+	struct rb_node node;
+	unsigned long used;
+	int priority;
+};
+
+#define priolist_for_each_request(it, plist, idx) \
+	for (idx = 0; idx < ARRAY_SIZE((plist)->requests); idx++) \
+		list_for_each_entry(it, &(plist)->requests[idx], sched.link)
+
+#define priolist_for_each_request_consume(it, n, plist, idx) \
+	for (; (idx = ffs((plist)->used)); (plist)->used &= ~BIT(idx - 1)) \
+		list_for_each_entry_safe(it, n, \
+					 &(plist)->requests[idx - 1], \
+					 sched.link)
+
 void i915_sched_node_init(struct i915_sched_node *node);
 
 bool __i915_sched_node_add_dependency(struct i915_sched_node *node,
@@ -92,12 +109,10 @@  bool __i915_sched_node_add_dependency(struct i915_sched_node *node,
 				      struct i915_dependency *dep,
 				      unsigned long flags);
 
-int i915_sched_node_add_dependency(struct drm_i915_private *i915,
-				   struct i915_sched_node *node,
+int i915_sched_node_add_dependency(struct i915_sched_node *node,
 				   struct i915_sched_node *signal);
 
-void i915_sched_node_fini(struct drm_i915_private *i915,
-			  struct i915_sched_node *node);
+void i915_sched_node_fini(struct i915_sched_node *node);
 
 void i915_schedule(struct i915_request *request,
 		   const struct i915_sched_attr *attr);
@@ -107,4 +122,15 @@  void i915_schedule_bump_priority(struct i915_request *rq, unsigned int bump);
 struct list_head *
 i915_sched_lookup_priolist(struct intel_engine_cs *engine, int prio);
 
+void __i915_priolist_free(struct i915_priolist *p);
+static inline void i915_priolist_free(struct i915_priolist *p)
+{
+	if (p->priority != I915_PRIORITY_NORMAL)
+		__i915_priolist_free(p);
+}
+
+int i915_global_scheduler_init(void);
+void i915_global_scheduler_shrink(void);
+void i915_global_scheduler_exit(void);
+
 #endif /* _I915_SCHEDULER_H_ */
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index a2846ea1e62c..56ba2fcbabe6 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -781,8 +781,7 @@  static bool __guc_dequeue(struct intel_engine_cs *engine)
 		}
 
 		rb_erase_cached(&p->node, &execlists->queue);
-		if (p->priority != I915_PRIORITY_NORMAL)
-			kmem_cache_free(engine->i915->priorities, p);
+		i915_priolist_free(p);
 	}
 done:
 	execlists->queue_priority_hint =
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index dba19baf6808..29b2a2f34edb 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -818,8 +818,7 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 		}
 
 		rb_erase_cached(&p->node, &execlists->queue);
-		if (p->priority != I915_PRIORITY_NORMAL)
-			kmem_cache_free(engine->i915->priorities, p);
+		i915_priolist_free(p);
 	}
 
 done:
@@ -972,8 +971,7 @@  static void execlists_cancel_requests(struct intel_engine_cs *engine)
 		}
 
 		rb_erase_cached(&p->node, &execlists->queue);
-		if (p->priority != I915_PRIORITY_NORMAL)
-			kmem_cache_free(engine->i915->priorities, p);
+		i915_priolist_free(p);
 	}
 
 	/* Remaining _unready_ requests will be nop'ed when submitted */
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index de8dba7565b0..5284f243931a 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -187,23 +187,6 @@  enum intel_engine_id {
 #define _VECS(n) (VECS + (n))
 };
 
-struct i915_priolist {
-	struct list_head requests[I915_PRIORITY_COUNT];
-	struct rb_node node;
-	unsigned long used;
-	int priority;
-};
-
-#define priolist_for_each_request(it, plist, idx) \
-	for (idx = 0; idx < ARRAY_SIZE((plist)->requests); idx++) \
-		list_for_each_entry(it, &(plist)->requests[idx], sched.link)
-
-#define priolist_for_each_request_consume(it, n, plist, idx) \
-	for (; (idx = ffs((plist)->used)); (plist)->used &= ~BIT(idx - 1)) \
-		list_for_each_entry_safe(it, n, \
-					 &(plist)->requests[idx - 1], \
-					 sched.link)
-
 struct st_preempt_hang {
 	struct completion completion;
 	unsigned int count;
diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
index 7172f6c7f25a..2d582a21eba9 100644
--- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
+++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
@@ -441,7 +441,7 @@  static struct i915_request *dummy_request(struct intel_engine_cs *engine)
 static void dummy_request_free(struct i915_request *dummy)
 {
 	i915_request_mark_complete(dummy);
-	i915_sched_node_fini(dummy->engine->i915, &dummy->sched);
+	i915_sched_node_fini(&dummy->sched);
 	i915_sw_fence_fini(&dummy->submit);
 
 	dma_fence_free(&dummy->fence);
diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
index 6f3fb803c747..ec1ae948954c 100644
--- a/drivers/gpu/drm/i915/selftests/mock_engine.c
+++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
@@ -76,26 +76,26 @@  static void mock_ring_free(struct intel_ring *base)
 	kfree(ring);
 }
 
-static struct mock_request *first_request(struct mock_engine *engine)
+static struct i915_request *first_request(struct mock_engine *engine)
 {
 	return list_first_entry_or_null(&engine->hw_queue,
-					struct mock_request,
-					link);
+					struct i915_request,
+					mock.link);
 }
 
-static void advance(struct mock_request *request)
+static void advance(struct i915_request *request)
 {
-	list_del_init(&request->link);
-	i915_request_mark_complete(&request->base);
-	GEM_BUG_ON(!i915_request_completed(&request->base));
+	list_del_init(&request->mock.link);
+	i915_request_mark_complete(request);
+	GEM_BUG_ON(!i915_request_completed(request));
 
-	intel_engine_queue_breadcrumbs(request->base.engine);
+	intel_engine_queue_breadcrumbs(request->engine);
 }
 
 static void hw_delay_complete(struct timer_list *t)
 {
 	struct mock_engine *engine = from_timer(engine, t, hw_delay);
-	struct mock_request *request;
+	struct i915_request *request;
 	unsigned long flags;
 
 	spin_lock_irqsave(&engine->hw_lock, flags);
@@ -110,8 +110,9 @@  static void hw_delay_complete(struct timer_list *t)
 	 * requeue the timer for the next delayed request.
 	 */
 	while ((request = first_request(engine))) {
-		if (request->delay) {
-			mod_timer(&engine->hw_delay, jiffies + request->delay);
+		if (request->mock.delay) {
+			mod_timer(&engine->hw_delay,
+				  jiffies + request->mock.delay);
 			break;
 		}
 
@@ -169,10 +170,8 @@  mock_context_pin(struct intel_engine_cs *engine,
 
 static int mock_request_alloc(struct i915_request *request)
 {
-	struct mock_request *mock = container_of(request, typeof(*mock), base);
-
-	INIT_LIST_HEAD(&mock->link);
-	mock->delay = 0;
+	INIT_LIST_HEAD(&request->mock.link);
+	request->mock.delay = 0;
 
 	return 0;
 }
@@ -190,7 +189,6 @@  static u32 *mock_emit_breadcrumb(struct i915_request *request, u32 *cs)
 
 static void mock_submit_request(struct i915_request *request)
 {
-	struct mock_request *mock = container_of(request, typeof(*mock), base);
 	struct mock_engine *engine =
 		container_of(request->engine, typeof(*engine), base);
 	unsigned long flags;
@@ -198,12 +196,13 @@  static void mock_submit_request(struct i915_request *request)
 	i915_request_submit(request);
 
 	spin_lock_irqsave(&engine->hw_lock, flags);
-	list_add_tail(&mock->link, &engine->hw_queue);
-	if (mock->link.prev == &engine->hw_queue) {
-		if (mock->delay)
-			mod_timer(&engine->hw_delay, jiffies + mock->delay);
+	list_add_tail(&request->mock.link, &engine->hw_queue);
+	if (list_is_first(&request->mock.link, &engine->hw_queue)) {
+		if (request->mock.delay)
+			mod_timer(&engine->hw_delay,
+				  jiffies + request->mock.delay);
 		else
-			advance(mock);
+			advance(request);
 	}
 	spin_unlock_irqrestore(&engine->hw_lock, flags);
 }
@@ -263,12 +262,12 @@  void mock_engine_flush(struct intel_engine_cs *engine)
 {
 	struct mock_engine *mock =
 		container_of(engine, typeof(*mock), base);
-	struct mock_request *request, *rn;
+	struct i915_request *request, *rn;
 
 	del_timer_sync(&mock->hw_delay);
 
 	spin_lock_irq(&mock->hw_lock);
-	list_for_each_entry_safe(request, rn, &mock->hw_queue, link)
+	list_for_each_entry_safe(request, rn, &mock->hw_queue, mock.link)
 		advance(request);
 	spin_unlock_irq(&mock->hw_lock);
 }
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index fc516a2970f4..5a98caba6d69 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -79,9 +79,6 @@  static void mock_device_release(struct drm_device *dev)
 
 	destroy_workqueue(i915->wq);
 
-	kmem_cache_destroy(i915->priorities);
-	kmem_cache_destroy(i915->dependencies);
-	kmem_cache_destroy(i915->requests);
 	kmem_cache_destroy(i915->vmas);
 	kmem_cache_destroy(i915->objects);
 
@@ -211,23 +208,6 @@  struct drm_i915_private *mock_gem_device(void)
 	if (!i915->vmas)
 		goto err_objects;
 
-	i915->requests = KMEM_CACHE(mock_request,
-				    SLAB_HWCACHE_ALIGN |
-				    SLAB_RECLAIM_ACCOUNT |
-				    SLAB_TYPESAFE_BY_RCU);
-	if (!i915->requests)
-		goto err_vmas;
-
-	i915->dependencies = KMEM_CACHE(i915_dependency,
-					SLAB_HWCACHE_ALIGN |
-					SLAB_RECLAIM_ACCOUNT);
-	if (!i915->dependencies)
-		goto err_requests;
-
-	i915->priorities = KMEM_CACHE(i915_priolist, SLAB_HWCACHE_ALIGN);
-	if (!i915->priorities)
-		goto err_dependencies;
-
 	i915_timelines_init(i915);
 
 	INIT_LIST_HEAD(&i915->gt.active_rings);
@@ -257,12 +237,6 @@  struct drm_i915_private *mock_gem_device(void)
 err_unlock:
 	mutex_unlock(&i915->drm.struct_mutex);
 	i915_timelines_fini(i915);
-	kmem_cache_destroy(i915->priorities);
-err_dependencies:
-	kmem_cache_destroy(i915->dependencies);
-err_requests:
-	kmem_cache_destroy(i915->requests);
-err_vmas:
 	kmem_cache_destroy(i915->vmas);
 err_objects:
 	kmem_cache_destroy(i915->objects);
diff --git a/drivers/gpu/drm/i915/selftests/mock_request.c b/drivers/gpu/drm/i915/selftests/mock_request.c
index 0dc29e242597..d1a7c9608712 100644
--- a/drivers/gpu/drm/i915/selftests/mock_request.c
+++ b/drivers/gpu/drm/i915/selftests/mock_request.c
@@ -31,29 +31,25 @@  mock_request(struct intel_engine_cs *engine,
 	     unsigned long delay)
 {
 	struct i915_request *request;
-	struct mock_request *mock;
 
-	/* NB the i915->requests slab cache is enlarged to fit mock_request */
+	/* NB the mock state (link, delay) is embedded in struct i915_request */
 	request = i915_request_alloc(engine, context);
 	if (IS_ERR(request))
 		return NULL;
 
-	mock = container_of(request, typeof(*mock), base);
-	mock->delay = delay;
-
-	return &mock->base;
+	request->mock.delay = delay;
+	return request;
 }
 
 bool mock_cancel_request(struct i915_request *request)
 {
-	struct mock_request *mock = container_of(request, typeof(*mock), base);
 	struct mock_engine *engine =
 		container_of(request->engine, typeof(*engine), base);
 	bool was_queued;
 
 	spin_lock_irq(&engine->hw_lock);
-	was_queued = !list_empty(&mock->link);
-	list_del_init(&mock->link);
+	was_queued = !list_empty(&request->mock.link);
+	list_del_init(&request->mock.link);
 	spin_unlock_irq(&engine->hw_lock);
 
 	if (was_queued)
diff --git a/drivers/gpu/drm/i915/selftests/mock_request.h b/drivers/gpu/drm/i915/selftests/mock_request.h
index 995fb728380c..4acf0211df20 100644
--- a/drivers/gpu/drm/i915/selftests/mock_request.h
+++ b/drivers/gpu/drm/i915/selftests/mock_request.h
@@ -29,13 +29,6 @@ 
 
 #include "../i915_request.h"
 
-struct mock_request {
-	struct i915_request base;
-
-	struct list_head link;
-	unsigned long delay;
-};
-
 struct i915_request *
 mock_request(struct intel_engine_cs *engine,
 	     struct i915_gem_context *context,