diff mbox series

[20/41] drm/i915: Replace priolist rbtree with a skiplist

Message ID 20210125140136.10494-20-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [01/41] drm/i915/selftests: Check for engine-reset errors in the middle of workarounds | expand

Commit Message

Chris Wilson Jan. 25, 2021, 2:01 p.m. UTC
Replace the priolist rbtree with a skiplist. The crucial difference is
that walking and removing the first element of a skiplist is O(1), but
O(lgN) for an rbtree, as we need to rebalance on remove. This is a
hindrance for submission latency as it occurs between picking a request
from the priolist and submitting it to hardware, as well as effectively
tripling the number of O(lgN) operations required under the irqoff lock.
This is critical to reducing the latency jitter with multiple clients.

The downsides to skiplists are that lookup/insertion is only
probabilistically O(lgN) and there is a significant memory penalty, as
each skip node is larger than the rbtree equivalent. Furthermore, we
don't use dynamic arrays for the skiplist, so the allocation is fixed,
and imposes an upper bound on the scalability wrt the number of
inflight requests.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 .../drm/i915/gt/intel_execlists_submission.c  |  63 +++--
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  30 +--
 drivers/gpu/drm/i915/i915_priolist_types.h    |  28 +-
 drivers/gpu/drm/i915/i915_scheduler.c         | 244 ++++++++++++++----
 drivers/gpu/drm/i915/i915_scheduler.h         |  11 +-
 drivers/gpu/drm/i915/i915_scheduler_types.h   |   2 +-
 .../drm/i915/selftests/i915_mock_selftests.h  |   1 +
 .../gpu/drm/i915/selftests/i915_scheduler.c   |  53 +++-
 8 files changed, 316 insertions(+), 116 deletions(-)

Comments

Tvrtko Ursulin Jan. 27, 2021, 3:10 p.m. UTC | #1
On 25/01/2021 14:01, Chris Wilson wrote:
> Replace the priolist rbtree with a skiplist. The crucial difference is
> that walking and removing the first element of a skiplist is O(1), but

I wasn't (and am not) familiar with them, but the Wikipedia page says 
removal is O(logN) average case and O(N) worst case.

If I understand correctly, O(1) could be ignoring the need to traverse 
from the top level to the bottom and remove the element from all of 
them. But since I915_PRIOLIST_HEIGHT is fixed, maybe it is okay to call 
it O(1).

I wonder though why this wouldn't mean a skip list would be worse for 
both lightly-loaded and highly-loaded scenarios? Presumably the height 
would need to be balanced to compensate for that.

In summary I have no idea for what number of in-flight requests they 
would be better.

How about putting this patch aside for now, since it doesn't sound like 
it is critical for deadline scheduling per se?

Regards,

Tvrtko

> O(lgN) for an rbtree, as we need to rebalance on remove. This is a
> hindrance for submission latency as it occurs between picking a request
> for the priolist and submitting it to hardware, as well effectively
> trippling the number of O(lgN) operations required under the irqoff lock.
> This is critical to reducing the latency jitter with multiple clients.
> 
> The downsides to skiplists are that lookup/insertion is only
> probablistically O(lgN) and there is a significant memory penalty to
> as each skip node is larger than the rbtree equivalent. Furthermore, we
> don't use dynamic arrays for the skiplist, so the allocation is fixed,
> and imposes an upper bound on the scalability wrt to the number of
> inflight requests.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   .../drm/i915/gt/intel_execlists_submission.c  |  63 +++--
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  30 +--
>   drivers/gpu/drm/i915/i915_priolist_types.h    |  28 +-
>   drivers/gpu/drm/i915/i915_scheduler.c         | 244 ++++++++++++++----
>   drivers/gpu/drm/i915/i915_scheduler.h         |  11 +-
>   drivers/gpu/drm/i915/i915_scheduler_types.h   |   2 +-
>   .../drm/i915/selftests/i915_mock_selftests.h  |   1 +
>   .../gpu/drm/i915/selftests/i915_scheduler.c   |  53 +++-
>   8 files changed, 316 insertions(+), 116 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 1103c8a00af1..129144dd86b0 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -244,11 +244,6 @@ static void ring_set_paused(const struct intel_engine_cs *engine, int state)
>   		wmb();
>   }
>   
> -static struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
>   static int rq_prio(const struct i915_request *rq)
>   {
>   	return READ_ONCE(rq->sched.attr.priority);
> @@ -272,15 +267,31 @@ static int effective_prio(const struct i915_request *rq)
>   	return prio;
>   }
>   
> -static int queue_prio(const struct i915_sched_engine *se)
> +static struct i915_request *first_request(struct i915_sched_engine *se)
>   {
> -	struct rb_node *rb;
> +	struct i915_priolist *pl;
>   
> -	rb = rb_first_cached(&se->queue);
> -	if (!rb)
> +	for_each_priolist(pl, &se->queue) {
> +		if (likely(!list_empty(&pl->requests)))
> +			return list_first_entry(&pl->requests,
> +						struct i915_request,
> +						sched.link);
> +
> +		i915_priolist_advance(&se->queue, pl);
> +	}
> +
> +	return NULL;
> +}
> +
> +static int queue_prio(struct i915_sched_engine *se)
> +{
> +	struct i915_request *rq;
> +
> +	rq = first_request(se);
> +	if (!rq)
>   		return INT_MIN;
>   
> -	return to_priolist(rb)->priority;
> +	return rq_prio(rq);
>   }
>   
>   static int virtual_prio(const struct intel_engine_execlists *el)
> @@ -290,7 +301,7 @@ static int virtual_prio(const struct intel_engine_execlists *el)
>   	return rb ? rb_entry(rb, struct ve_node, rb)->prio : INT_MIN;
>   }
>   
> -static bool need_preempt(const struct intel_engine_cs *engine,
> +static bool need_preempt(struct intel_engine_cs *engine,
>   			 const struct i915_request *rq)
>   {
>   	int last_prio;
> @@ -1136,6 +1147,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	struct i915_request ** const last_port = port + execlists->port_mask;
>   	struct i915_request *last, * const *active;
>   	struct virtual_engine *ve;
> +	struct i915_priolist *pl;
>   	struct rb_node *rb;
>   	bool submit = false;
>   
> @@ -1346,11 +1358,10 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   			break;
>   	}
>   
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> +	for_each_priolist(pl, &engine->active.queue) {
>   		struct i915_request *rq, *rn;
>   
> -		priolist_for_each_request_consume(rq, rn, p) {
> +		priolist_for_each_request_safe(rq, rn, pl) {
>   			bool merge = true;
>   
>   			/*
> @@ -1425,8 +1436,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   			}
>   		}
>   
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, pl);
>   	}
>   done:
>   	*port++ = i915_request_get(last);
> @@ -2631,6 +2641,7 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
>   	struct i915_request *rq, *rn;
> +	struct i915_priolist *pl;
>   	struct rb_node *rb;
>   	unsigned long flags;
>   
> @@ -2661,16 +2672,12 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>   	intel_engine_signal_breadcrumbs(engine);
>   
>   	/* Flush the queued requests to the timeline list (for retiring). */
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -
> -		priolist_for_each_request_consume(rq, rn, p) {
> +	for_each_priolist(pl, &engine->active.queue) {
> +		priolist_for_each_request_safe(rq, rn, pl) {
>   			i915_request_mark_eio(rq);
>   			__i915_request_submit(rq);
>   		}
> -
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, pl);
>   	}
>   	GEM_BUG_ON(!i915_sched_is_idle(&engine->active));
>   
> @@ -2703,7 +2710,6 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>   	/* Remaining _unready_ requests will be nop'ed when submitted */
>   
>   	execlists->queue_priority_hint = INT_MIN;
> -	engine->active.queue = RB_ROOT_CACHED;
>   
>   	GEM_BUG_ON(__tasklet_is_enabled(&engine->active.tasklet));
>   	engine->active.tasklet.func = nop_submission_tasklet;
> @@ -3089,6 +3095,8 @@ static void virtual_context_exit(struct intel_context *ce)
>   
>   	for (n = 0; n < ve->num_siblings; n++)
>   		intel_engine_pm_put(ve->siblings[n]);
> +
> +	i915_sched_park_engine(&ve->base.active);
>   }
>   
>   static const struct intel_context_ops virtual_context_ops = {
> @@ -3501,6 +3509,7 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
>   {
>   	const struct intel_engine_execlists *execlists = &engine->execlists;
>   	struct i915_request *rq, *last;
> +	struct i915_priolist *pl;
>   	unsigned long flags;
>   	unsigned int count;
>   	struct rb_node *rb;
> @@ -3530,10 +3539,8 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
>   
>   	last = NULL;
>   	count = 0;
> -	for (rb = rb_first_cached(&engine->active.queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> -
> -		priolist_for_each_request(rq, p) {
> +	for_each_priolist(pl, &engine->active.queue) {
> +		priolist_for_each_request(rq, pl) {
>   			if (count++ < max - 1)
>   				show_request(m, rq, "\t\t", 0);
>   			else
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 2d7339ef3b4c..8d0c6cd277b3 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -59,11 +59,6 @@
>   
>   #define GUC_REQUEST_SIZE 64 /* bytes */
>   
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
>   static struct guc_stage_desc *__get_stage_desc(struct intel_guc *guc, u32 id)
>   {
>   	struct guc_stage_desc *base = guc->stage_desc_pool_vaddr;
> @@ -185,8 +180,8 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   	struct i915_request ** const last_port = first + execlists->port_mask;
>   	struct i915_request *last = first[0];
>   	struct i915_request **port;
> +	struct i915_priolist *pl;
>   	bool submit = false;
> -	struct rb_node *rb;
>   
>   	lockdep_assert_held(&engine->active.lock);
>   
> @@ -203,11 +198,10 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   	 * event.
>   	 */
>   	port = first;
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> +	for_each_priolist(pl, &engine->active.queue) {
>   		struct i915_request *rq, *rn;
>   
> -		priolist_for_each_request_consume(rq, rn, p) {
> +		priolist_for_each_request_safe(rq, rn, pl) {
>   			if (last && rq->context != last->context) {
>   				if (port == last_port)
>   					goto done;
> @@ -223,12 +217,11 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   			last = rq;
>   		}
>   
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, pl);
>   	}
>   done:
>   	execlists->queue_priority_hint =
> -		rb ? to_priolist(rb)->priority : INT_MIN;
> +		pl != &engine->active.queue.sentinel ? pl->priority : INT_MIN;
>   	if (submit) {
>   		*port = schedule_in(last, port - execlists->inflight);
>   		*++port = NULL;
> @@ -327,7 +320,7 @@ static void guc_reset_cancel(struct intel_engine_cs *engine)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
>   	struct i915_request *rq, *rn;
> -	struct rb_node *rb;
> +	struct i915_priolist *p;
>   	unsigned long flags;
>   
>   	ENGINE_TRACE(engine, "\n");
> @@ -355,25 +348,20 @@ static void guc_reset_cancel(struct intel_engine_cs *engine)
>   	}
>   
>   	/* Flush the queued requests to the timeline list (for retiring). */
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -
> -		priolist_for_each_request_consume(rq, rn, p) {
> +	for_each_priolist(p, &engine->active.queue) {
> +		priolist_for_each_request_safe(rq, rn, p) {
>   			list_del_init(&rq->sched.link);
>   			__i915_request_submit(rq);
>   			dma_fence_set_error(&rq->fence, -EIO);
>   			i915_request_mark_complete(rq);
>   		}
> -
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, p);
>   	}
>   	GEM_BUG_ON(!i915_sched_is_idle(&engine->active));
>   
>   	/* Remaining _unready_ requests will be nop'ed when submitted */
>   
>   	execlists->queue_priority_hint = INT_MIN;
> -	engine->active.queue = RB_ROOT_CACHED;
>   
>   	spin_unlock_irqrestore(&engine->active.lock, flags);
>   }
> diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
> index bc2fa84f98a8..1200c3df6a4a 100644
> --- a/drivers/gpu/drm/i915/i915_priolist_types.h
> +++ b/drivers/gpu/drm/i915/i915_priolist_types.h
> @@ -38,10 +38,36 @@ enum {
>   #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
>   #define I915_PRIORITY_BARRIER (I915_PRIORITY_UNPREEMPTABLE - 1)
>   
> +#ifdef CONFIG_64BIT
> +#define I915_PRIOLIST_HEIGHT 12
> +#else
> +#define I915_PRIOLIST_HEIGHT 11
> +#endif
> +
>   struct i915_priolist {
>   	struct list_head requests;
> -	struct rb_node node;
>   	int priority;
> +
> +	int level;
> +	struct i915_priolist *next[I915_PRIOLIST_HEIGHT];
>   };
>   
> +struct i915_priolist_root {
> +	struct i915_priolist sentinel;
> +	u32 prng;
> +};
> +
> +#define i915_priolist_is_empty(root) ((root)->sentinel.level < 0)
> +
> +#define for_each_priolist(p, root) \
> +	for ((p) = (root)->sentinel.next[0]; \
> +	     (p) != &(root)->sentinel; \
> +	     (p) = (p)->next[0])
> +
> +#define priolist_for_each_request(it, plist) \
> +	list_for_each_entry(it, &(plist)->requests, sched.link)
> +
> +#define priolist_for_each_request_safe(it, n, plist) \
> +	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> +
>   #endif /* _I915_PRIOLIST_TYPES_H_ */
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index a3ee06cb66d7..74000d3eebb1 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -4,7 +4,9 @@
>    * Copyright © 2018 Intel Corporation
>    */
>   
> +#include <linux/bitops.h>
>   #include <linux/mutex.h>
> +#include <linux/prandom.h>
>   
>   #include "gt/intel_ring.h"
>   #include "gt/intel_lrc_reg.h"
> @@ -91,15 +93,24 @@ static void i915_sched_init_ipi(struct i915_sched_ipi *ipi)
>   	ipi->list = NULL;
>   }
>   
> +static void init_priolist(struct i915_priolist_root *const root)
> +{
> +	struct i915_priolist *pl = &root->sentinel;
> +
> +	memset_p((void **)pl->next, pl, ARRAY_SIZE(pl->next));
> +	pl->priority = INT_MIN;
> +	pl->level = -1;
> +}
> +
>   void i915_sched_init_engine(struct i915_sched_engine *se,
>   			    unsigned int subclass)
>   {
>   	spin_lock_init(&se->lock);
>   	lockdep_set_subclass(&se->lock, subclass);
>   
> +	init_priolist(&se->queue);
>   	INIT_LIST_HEAD(&se->requests);
>   	INIT_LIST_HEAD(&se->hold);
> -	se->queue = RB_ROOT_CACHED;
>   
>   	i915_sched_init_ipi(&se->ipi);
>   
> @@ -116,8 +127,57 @@ void i915_sched_init_engine(struct i915_sched_engine *se,
>   #endif
>   }
>   
> +__maybe_unused static bool priolist_idle(struct i915_priolist_root *root)
> +{
> +	struct i915_priolist *pl = &root->sentinel;
> +	int lvl;
> +
> +	for (lvl = 0; lvl < ARRAY_SIZE(pl->next); lvl++) {
> +		if (pl->next[lvl] != pl) {
> +			GEM_TRACE_ERR("root[%d] is not empty\n", lvl);
> +			return false;
> +		}
> +	}
> +
> +	if (pl->level != -1) {
> +		GEM_TRACE_ERR("root is not clear: %d\n", pl->level);
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static void pl_push(struct i915_priolist *pl, struct list_head *head)
> +{
> +	pl->requests.next = head->next;
> +	head->next = &pl->requests;
> +}
> +
> +static struct i915_priolist *pl_pop(struct list_head *head)
> +{
> +	struct i915_priolist *pl;
> +
> +	pl = container_of(head->next, typeof(*pl), requests);
> +	head->next = pl->requests.next;
> +
> +	return pl;
> +}
> +
> +static bool pl_empty(struct list_head *head)
> +{
> +	return !head->next;
> +}
> +
>   void i915_sched_park_engine(struct i915_sched_engine *se)
>   {
> +	struct i915_priolist_root *root = &se->queue;
> +	struct list_head *list = &root->sentinel.requests;
> +
> +	GEM_BUG_ON(!priolist_idle(root));
> +
> +	while (!pl_empty(list))
> +		kmem_cache_free(global.slab_priorities, pl_pop(list));
> +
>   	GEM_BUG_ON(!i915_sched_is_idle(se));
>   	se->no_priolist = false;
>   }
> @@ -183,71 +243,55 @@ static inline bool node_signaled(const struct i915_sched_node *node)
>   	return i915_request_completed(node_to_request(node));
>   }
>   
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +static inline unsigned int random_level(struct i915_priolist_root *root)
>   {
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
> -static void assert_priolists(struct i915_sched_engine * const se)
> -{
> -	struct rb_node *rb;
> -	long last_prio;
> -
> -	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> -		return;
> -
> -	GEM_BUG_ON(rb_first_cached(&se->queue) !=
> -		   rb_first(&se->queue.rb_root));
> -
> -	last_prio = INT_MAX;
> -	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> -		const struct i915_priolist *p = to_priolist(rb);
> -
> -		GEM_BUG_ON(p->priority > last_prio);
> -		last_prio = p->priority;
> -	}
> +	root->prng = next_pseudo_random32(root->prng);
> +	return  __ffs(root->prng) / 2;
>   }
>   
>   static struct list_head *
>   lookup_priolist(struct intel_engine_cs *engine, int prio)
>   {
> +	struct i915_priolist *update[I915_PRIOLIST_HEIGHT];
>   	struct i915_sched_engine * const se = &engine->active;
> -	struct i915_priolist *p;
> -	struct rb_node **parent, *rb;
> -	bool first = true;
> -
> -	lockdep_assert_held(&engine->active.lock);
> -	assert_priolists(se);
> +	struct i915_priolist_root *root = &se->queue;
> +	struct i915_priolist *pl, *tmp;
> +	int lvl;
>   
> +	lockdep_assert_held(&se->lock);
>   	if (unlikely(se->no_priolist))
>   		prio = I915_PRIORITY_NORMAL;
>   
> +	for_each_priolist(pl, root) { /* recycle any empty elements before us */
> +		if (pl->priority >= prio || !list_empty(&pl->requests))
> +			break;
> +
> +		i915_priolist_advance(root, pl);
> +	}
> +
>   find_priolist:
> -	/* most positive priority is scheduled first, equal priorities fifo */
> -	rb = NULL;
> -	parent = &se->queue.rb_root.rb_node;
> -	while (*parent) {
> -		rb = *parent;
> -		p = to_priolist(rb);
> -		if (prio > p->priority) {
> -			parent = &rb->rb_left;
> -		} else if (prio < p->priority) {
> -			parent = &rb->rb_right;
> -			first = false;
> -		} else {
> -			return &p->requests;
> -		}
> +	pl = &root->sentinel;
> +	lvl = pl->level;
> +	while (lvl >= 0) {
> +		while (tmp = pl->next[lvl], tmp->priority >= prio)
> +			pl = tmp;
> +		if (pl->priority == prio)
> +			goto out;
> +		update[lvl--] = pl;
>   	}
>   
>   	if (prio == I915_PRIORITY_NORMAL) {
> -		p = &se->default_priolist;
> +		pl = &se->default_priolist;
> +	} else if (!pl_empty(&root->sentinel.requests)) {
> +		pl = pl_pop(&root->sentinel.requests);
>   	} else {
> -		p = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
> +		pl = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
>   		/* Convert an allocation failure to a priority bump */
> -		if (unlikely(!p)) {
> +		if (unlikely(!pl)) {
>   			prio = I915_PRIORITY_NORMAL; /* recurses just once */
>   
> -			/* To maintain ordering with all rendering, after an
> +			/*
> +			 * To maintain ordering with all rendering, after an
>   			 * allocation failure we have to disable all scheduling.
>   			 * Requests will then be executed in fifo, and schedule
>   			 * will ensure that dependencies are emitted in fifo.
> @@ -260,18 +304,103 @@ lookup_priolist(struct intel_engine_cs *engine, int prio)
>   		}
>   	}
>   
> -	p->priority = prio;
> -	INIT_LIST_HEAD(&p->requests);
> +	pl->priority = prio;
> +	INIT_LIST_HEAD(&pl->requests);
>   
> -	rb_link_node(&p->node, rb, parent);
> -	rb_insert_color_cached(&p->node, &se->queue, first);
> +	lvl = random_level(root);
> +	if (lvl > root->sentinel.level) {
> +		if (root->sentinel.level < I915_PRIOLIST_HEIGHT - 1) {
> +			lvl = ++root->sentinel.level;
> +			update[lvl] = &root->sentinel;
> +		} else {
> +			lvl = I915_PRIOLIST_HEIGHT - 1;
> +		}
> +	}
> +	GEM_BUG_ON(lvl < 0);
> +	GEM_BUG_ON(lvl >= ARRAY_SIZE(pl->next));
>   
> -	return &p->requests;
> +	pl->level = lvl;
> +	do {
> +		tmp = update[lvl];
> +		pl->next[lvl] = update[lvl]->next[lvl];
> +		tmp->next[lvl] = pl;
> +	} while (--lvl >= 0);
> +
> +	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
> +		struct i915_priolist *chk;
> +
> +		chk = &root->sentinel;
> +		lvl = chk->level;
> +		do {
> +			while (tmp = chk->next[lvl], tmp->priority >= prio)
> +				chk = tmp;
> +		} while (--lvl >= 0);
> +
> +		GEM_BUG_ON(chk != pl);
> +	}
> +
> +out:
> +	GEM_BUG_ON(pl == &root->sentinel);
> +	return &pl->requests;
>   }
>   
> -void __i915_priolist_free(struct i915_priolist *p)
> +static void remove_priolist(struct intel_engine_cs *engine,
> +			    struct list_head *plist)
>   {
> -	kmem_cache_free(global.slab_priorities, p);
> +	struct i915_sched_engine * const se = &engine->active;
> +	struct i915_priolist_root *root = &se->queue;
> +	struct i915_priolist *pl, *tmp;
> +	struct i915_priolist *old =
> +		container_of(plist, struct i915_priolist, requests);
> +	int prio = old->priority;
> +	int lvl;
> +
> +	lockdep_assert_held(&se->lock);
> +	GEM_BUG_ON(!list_empty(plist));
> +
> +	pl = &root->sentinel;
> +	lvl = pl->level;
> +	GEM_BUG_ON(lvl < 0);
> +
> +	if (prio != I915_PRIORITY_NORMAL)
> +		pl_push(old, &pl->requests);
> +
> +	do {
> +		while (tmp = pl->next[lvl], tmp->priority > prio)
> +			pl = tmp;
> +		if (lvl <= old->level) {
> +			pl->next[lvl] = old->next[lvl];
> +			if (pl == &root->sentinel && old->next[lvl] == pl) {
> +				GEM_BUG_ON(pl->level != lvl);
> +				pl->level--;
> +			}
> +		}
> +	} while (--lvl >= 0);
> +	GEM_BUG_ON(tmp != old);
> +}
> +
> +void i915_priolist_advance(struct i915_priolist_root *root,
> +			   struct i915_priolist *pl)
> +{
> +	struct i915_priolist * const s = &root->sentinel;
> +	int lvl;
> +
> +	GEM_BUG_ON(!list_empty(&pl->requests));
> +	GEM_BUG_ON(pl != s->next[0]);
> +	GEM_BUG_ON(pl == s);
> +
> +	if (pl->priority != I915_PRIORITY_NORMAL)
> +		pl_push(pl, &s->requests);
> +
> +	lvl = pl->level;
> +	GEM_BUG_ON(lvl < 0);
> +	do {
> +		s->next[lvl] = pl->next[lvl];
> +		if (pl->next[lvl] == s) {
> +			GEM_BUG_ON(s->level != lvl);
> +			s->level--;
> +		}
> +	} while (--lvl >= 0);
>   }
>   
>   static struct i915_request *
> @@ -420,8 +549,13 @@ static void __i915_request_set_priority(struct i915_request *rq, int prio)
>   			continue;
>   
>   		GEM_BUG_ON(rq->engine != engine);
> -		if (i915_request_in_priority_queue(rq))
> +		if (i915_request_in_priority_queue(rq)) {
> +			struct list_head *prev = rq->sched.link.prev;
> +
>   			list_move_tail(&rq->sched.link, plist);
> +			if (list_empty(prev))
> +				remove_priolist(engine, prev);
> +		}
>   
>   		/* Defer (tasklet) submission until after all updates. */
>   		kick_submission(engine, rq, prio);
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> index 0ab47cbf0e9c..bca89a58d953 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> @@ -16,12 +16,6 @@
>   
>   struct drm_printer;
>   
> -#define priolist_for_each_request(it, plist) \
> -	list_for_each_entry(it, &(plist)->requests, sched.link)
> -
> -#define priolist_for_each_request_consume(it, n, plist) \
> -	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> -
>   void i915_sched_node_init(struct i915_sched_node *node);
>   void i915_sched_node_reinit(struct i915_sched_node *node);
>   
> @@ -69,7 +63,7 @@ static inline void i915_priolist_free(struct i915_priolist *p)
>   
>   static inline bool i915_sched_is_idle(const struct i915_sched_engine *se)
>   {
> -	return RB_EMPTY_ROOT(&se->queue.rb_root);
> +	return i915_priolist_is_empty(&se->queue);
>   }
>   
>   static inline bool
> @@ -99,6 +93,9 @@ static inline void i915_sched_kick(struct i915_sched_engine *se)
>   	tasklet_hi_schedule(&se->tasklet);
>   }
>   
> +void i915_priolist_advance(struct i915_priolist_root *root,
> +			   struct i915_priolist *old);
> +
>   void i915_request_show_with_schedule(struct drm_printer *m,
>   				     const struct i915_request *rq,
>   				     const char *prefix,
> diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h
> index f668c680d290..e64750be4e77 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler_types.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler_types.h
> @@ -89,7 +89,7 @@ struct i915_sched_engine {
>   	/**
>   	 * @queue: queue of requests, in priority lists
>   	 */
> -	struct rb_root_cached queue;
> +	struct i915_priolist_root queue;
>   
>   	struct i915_sched_ipi ipi;
>   
> diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> index 3db34d3eea58..946c93441c1f 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> @@ -25,6 +25,7 @@ selftest(ring, intel_ring_mock_selftests)
>   selftest(engine, intel_engine_cs_mock_selftests)
>   selftest(timelines, intel_timeline_mock_selftests)
>   selftest(requests, i915_request_mock_selftests)
> +selftest(scheduler, i915_scheduler_mock_selftests)
>   selftest(objects, i915_gem_object_mock_selftests)
>   selftest(phys, i915_gem_phys_mock_selftests)
>   selftest(dmabuf, i915_gem_dmabuf_mock_selftests)
> diff --git a/drivers/gpu/drm/i915/selftests/i915_scheduler.c b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> index de44a66210b7..de5b1443129b 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> @@ -12,6 +12,54 @@
>   #include "selftests/igt_spinner.h"
>   #include "selftests/i915_random.h"
>   
> +static int mock_skiplist_levels(void *dummy)
> +{
> +	struct i915_priolist_root root = {};
> +	struct i915_priolist *pl = &root.sentinel;
> +	IGT_TIMEOUT(end_time);
> +	unsigned long total;
> +	int count, lvl;
> +
> +	total = 0;
> +	do {
> +		for (count = 0; count < 16384; count++) {
> +			lvl = random_level(&root);
> +			if (lvl > pl->level) {
> +				if (lvl < I915_PRIOLIST_HEIGHT - 1)
> +					lvl = ++pl->level;
> +				else
> +					lvl = I915_PRIOLIST_HEIGHT - 1;
> +			}
> +
> +			pl->next[lvl] = ptr_inc(pl->next[lvl]);
> +		}
> +		total += count;
> +	} while (!__igt_timeout(end_time, NULL));
> +
> +	pr_info("Total %9lu\n", total);
> +	for (lvl = 0; lvl <= pl->level; lvl++) {
> +		int x = ilog2((unsigned long)pl->next[lvl]);
> +		char row[80];
> +
> +		memset(row, '*', x);
> +		row[x] = '\0';
> +
> +		pr_info(" [%2d] %9lu %s\n",
> +			lvl, (unsigned long)pl->next[lvl], row);
> +	}
> +
> +	return 0;
> +}
> +
> +int i915_scheduler_mock_selftests(void)
> +{
> +	static const struct i915_subtest tests[] = {
> +		SUBTEST(mock_skiplist_levels),
> +	};
> +
> +	return i915_subtests(tests, NULL);
> +}
> +
>   static void scheduling_disable(struct intel_engine_cs *engine)
>   {
>   	engine->props.preempt_timeout_ms = 0;
> @@ -80,9 +128,9 @@ static int all_engines(struct drm_i915_private *i915,
>   static bool check_context_order(struct intel_engine_cs *engine)
>   {
>   	u64 last_seqno, last_context;
> +	struct i915_priolist *p;
>   	unsigned long count;
>   	bool result = false;
> -	struct rb_node *rb;
>   	int last_prio;
>   
>   	/* We expect the execution order to follow ascending fence-context */
> @@ -92,8 +140,7 @@ static bool check_context_order(struct intel_engine_cs *engine)
>   	last_context = 0;
>   	last_seqno = 0;
>   	last_prio = 0;
> -	for (rb = rb_first_cached(&engine->active.queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +	for_each_priolist(p, &engine->active.queue) {
>   		struct i915_request *rq;
>   
>   		priolist_for_each_request(rq, p) {
>
Chris Wilson Jan. 27, 2021, 3:33 p.m. UTC | #2
Quoting Tvrtko Ursulin (2021-01-27 15:10:43)
> 
> On 25/01/2021 14:01, Chris Wilson wrote:
> > Replace the priolist rbtree with a skiplist. The crucial difference is
> > that walking and removing the first element of a skiplist is O(1), but
> 
> I wasn't (and am not) familiar with them, but wikipedia page says 
> removal is O(logN) average case to O(N) worst case.
> 
> If I understand correctly O(1) could be ignoring the need to traverse 
> from top to bottom level and removing the element from all. But since 
> I915_PRIOLIST_HEIGHT is fixed maybe it is okay to call it O(1).

Correct, since we are removing the first element, we do not need to do the
lgN search and can just move the next[I915_PRIOLIST_HEIGHT] pointers
forwards. (Although, I did start doing the lgN removal for timeslicing as
traversing the empty levels was showing up in the worst case lock hold
times.) But the primary means of removing from the skiplist is as we
consume the first request during the dequeue.
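
As a rough illustration (not the patch code itself; struct node, pop_first
and the fixed HEIGHT are just stand-ins for i915_priolist and
I915_PRIOLIST_HEIGHT), popping the head of the skiplist amounts to copying
the first node's next[] pointers back into the sentinel, bounded by the
node's height:

#define HEIGHT 12	/* stands in for I915_PRIOLIST_HEIGHT */

struct node {
	int level;			/* highest level this node occupies */
	struct node *next[HEIGHT];
};

static void pop_first(struct node *sentinel)
{
	struct node *first = sentinel->next[0];
	int lvl;

	/*
	 * The head is linked from the sentinel at every level it
	 * occupies, so unlinking it needs no search and no rebalancing.
	 */
	for (lvl = first->level; lvl >= 0; lvl--) {
		sentinel->next[lvl] = first->next[lvl];
		if (sentinel->next[lvl] == sentinel)
			sentinel->level--;	/* that level is now empty */
	}
}

That is essentially what i915_priolist_advance() in the patch does, with
the emptied node then recycled onto the sentinel's free list.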

> I wonder though why this wouldn't mean skip list would be worse for both 
> lightly loaded and highly-loaded scenarios? Presumably height would need 
> to be balanced to compensate for that.

I think the answer is yes. Skiplists use probabilistic balancing, so from
a bird's eye view they are only as good as an rbtree. If you look at the
perf tests, the skiplists are generally faster, but it's close overall.
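
For what it's worth, the level distribution from __ffs(prng) / 2 works
out to a p = 1/4 skiplist: level k is chosen with probability roughly
(3/4) * (1/4)^k, before being clamped against the current height and
I915_PRIOLIST_HEIGHT - 1 in lookup_priolist(). A quick userspace
approximation (rand() and __builtin_ctz() standing in for
next_pseudo_random32() and __ffs(), so only a sketch):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define HEIGHT 12	/* mirrors I915_PRIOLIST_HEIGHT on 64-bit */

static unsigned int random_level(void)
{
	uint32_t v = (uint32_t)rand();		/* stand-in for next_pseudo_random32() */

	return v ? __builtin_ctz(v) / 2 : 0;	/* __ffs() analogue */
}

int main(void)
{
	unsigned long hist[HEIGHT] = { 0 };
	int i;

	for (i = 0; i < 1 << 20; i++) {
		unsigned int lvl = random_level();

		if (lvl >= HEIGHT)
			lvl = HEIGHT - 1;
		hist[lvl]++;
	}

	/* Expect roughly a 4x drop per level: ~786k, ~197k, ~49k, ... */
	for (i = 0; i < HEIGHT; i++)
		printf("level %2d: %lu\n", i, hist[i]);

	return 0;
}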

What sold me was lock_stat, and that I could remove a few hacky patches
trying to hide some of the worst-case behaviour of the rbtree and the
frees within the critical submit path.
 
> In summary I have no idea for what number of in-flight requests would 
> they be better.
> 
> How about putting this patch aside for now since it doesn't sound it is 
> critical for deadline scheduling per se?

Oh no, we are not going back to the hacky patches like
https://patchwork.freedesktop.org/patch/407913/?series=84900&rev=1
https://patchwork.freedesktop.org/patch/407903/?series=84900&rev=1
-Chris
Chris Wilson Jan. 27, 2021, 3:44 p.m. UTC | #3
Quoting Chris Wilson (2021-01-27 15:33:05)
> Quoting Tvrtko Ursulin (2021-01-27 15:10:43)
> > 
> > On 25/01/2021 14:01, Chris Wilson wrote:
> > > Replace the priolist rbtree with a skiplist. The crucial difference is
> > > that walking and removing the first element of a skiplist is O(1), but
> > 
> > I wasn't (and am not) familiar with them, but wikipedia page says 
> > removal is O(logN) average case to O(N) worst case.
> > 
> > If I understand correctly O(1) could be ignoring the need to traverse 
> > from top to bottom level and removing the element from all. But since 
> > I915_PRIOLIST_HEIGHT is fixed maybe it is okay to call it O(1).
> 
> Correct, since we removing the first element, we do not need to do the
> lgN search and can just move the next[I915_PRIOLIST_HEIGHT] forwards.
> (Although, I did starting doing the lgN removal for timeslicing as
> traversing the empty levels were showing up in worst case lock hold
> times.) But the primary means of removing from the skiplist is as we
> consume the first request during the dequeue.
> 
> > I wonder though why this wouldn't mean skip list would be worse for both 
> > lightly loaded and highly-loaded scenarios? Presumably height would need 
> > to be balanced to compensate for that.
> 
> I think the answer is yes. skiplists uses probablistic balancing so they
> are only from a bird's eye view as good as a rbtree. If you look at the
> perf tests, the skiplists are generally faster, but it's close overall.
> 
> What sold me was lock_stat and that I could remove a few hacky patches
> trying to hide some of the worst case behaviour of rbtree and how we had
> frees within the critical submit path.
>  
> > In summary I have no idea for what number of in-flight requests would 
> > they be better.
> > 
> > How about putting this patch aside for now since it doesn't sound it is 
> > critical for deadline scheduling per se?
> 
> Oh no, we are not going back to the hacky patches like
> https://patchwork.freedesktop.org/patch/407913/?series=84900&rev=1
> https://patchwork.freedesktop.org/patch/407903/?series=84900&rev=1

To be extra clear, the biggest drawback in using deadlines as the sort
key is that they are very, very sparse in comparison to priorities.
Where we would typically have only a single priority level for every
request, with deadlines we typically have a new deadline with every
request (and it's not until we get into priority bumping or timeslice
deferring that we start to see the deadlines coalesce). In this situation,
the lgN list traversal of the rbtree during execlists_dequeue() was
abysmal, and so, as the skiplists give similar lgN insertion but O(1)
list traversal, the difference is enough to completely negate the
overhead of having more levels to process. It is a dramatic improvement.
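
Put in terms of the dequeue loops themselves (simplified excerpts from
execlists_dequeue() before and after this patch, with the merging and
error handling elided), the difference is roughly:

	/* rbtree: O(lgN) first-lookup and O(lgN) erase per node,
	 * under the irqoff lock, and with deadlines there is ~one
	 * node per ready request. */
	while ((rb = rb_first_cached(&engine->active.queue))) {
		struct i915_priolist *p = to_priolist(rb);

		priolist_for_each_request_consume(rq, rn, p) {
			/* ... submit ... */
		}

		rb_erase_cached(&p->node, &engine->active.queue);
		i915_priolist_free(p);
	}

	/* skiplist: each step is just pl = pl->next[0], and the unlink
	 * is folded into i915_priolist_advance(), bounded by the node's
	 * height with no rebalancing. */
	for_each_priolist(pl, &engine->active.queue) {
		priolist_for_each_request_safe(rq, rn, pl) {
			/* ... submit ... */
		}

		i915_priolist_advance(&engine->active.queue, pl);
	}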
-Chris
Tvrtko Ursulin Jan. 27, 2021, 3:58 p.m. UTC | #4
On 27/01/2021 15:44, Chris Wilson wrote:
> Quoting Chris Wilson (2021-01-27 15:33:05)
>> Quoting Tvrtko Ursulin (2021-01-27 15:10:43)
>>>
>>> On 25/01/2021 14:01, Chris Wilson wrote:
>>>> Replace the priolist rbtree with a skiplist. The crucial difference is
>>>> that walking and removing the first element of a skiplist is O(1), but
>>>
>>> I wasn't (and am not) familiar with them, but wikipedia page says
>>> removal is O(logN) average case to O(N) worst case.
>>>
>>> If I understand correctly O(1) could be ignoring the need to traverse
>>> from top to bottom level and removing the element from all. But since
>>> I915_PRIOLIST_HEIGHT is fixed maybe it is okay to call it O(1).
>>
>> Correct, since we removing the first element, we do not need to do the
>> lgN search and can just move the next[I915_PRIOLIST_HEIGHT] forwards.
>> (Although, I did starting doing the lgN removal for timeslicing as
>> traversing the empty levels were showing up in worst case lock hold
>> times.) But the primary means of removing from the skiplist is as we
>> consume the first request during the dequeue.
>>
>>> I wonder though why this wouldn't mean skip list would be worse for both
>>> lightly loaded and highly-loaded scenarios? Presumably height would need
>>> to be balanced to compensate for that.
>>
>> I think the answer is yes. skiplists uses probablistic balancing so they
>> are only from a bird's eye view as good as a rbtree. If you look at the
>> perf tests, the skiplists are generally faster, but it's close overall.
>>
>> What sold me was lock_stat and that I could remove a few hacky patches
>> trying to hide some of the worst case behaviour of rbtree and how we had
>> frees within the critical submit path.
>>   
>>> In summary I have no idea for what number of in-flight requests would
>>> they be better.
>>>
>>> How about putting this patch aside for now since it doesn't sound it is
>>> critical for deadline scheduling per se?
>>
>> Oh no, we are not going back to the hacky patches like
>> https://patchwork.freedesktop.org/patch/407913/?series=84900&rev=1
>> https://patchwork.freedesktop.org/patch/407903/?series=84900&rev=1
> 
> To be extra clear, the biggest drawback in using deadlines as the sort
> key is that they are very, very sparse in comparison to priorities.
> Where we would typically have only a single priority level for every
> request, with deadlines we typically have a new deadline with every
> request (and it's not until we get into priority bumping or timeslice
> deferring do we start to see the deadlines coalesce). In this situation,
> the lgN list traversal of rbtree during execlists_dequeue() was
> abyssmal, and so as the skiplists give similar lgN insertion but O(1)
> list traversal, the difference is enough to completely negate the
> overhead of having more levels to process. It is a dramatic improvement.

Okay, makes sense. The change in key drives the requirement, so please 
just mention it in the commit message and I'll tackle the skip list 
mechanics in the meantime.

Regards,

Tvrtko
Chris Wilson Jan. 28, 2021, 9:50 a.m. UTC | #5
Quoting Tvrtko Ursulin (2021-01-27 15:58:12)
> Okay makes sense. The change in key drives the requirement so just 
> please mention in the commit message and I'll tackle the skip list 
> mechanics in the meantime.

In the following patches, we introduce a new sort key to the scheduler,
a virtual deadline. This imposes a different structure on the tree.
Using a priority sort, we have very few priority levels active at any
time, most likely just the default priority, and so the rbtree degenerates
to a single element containing the list of all ready requests. The
deadlines, in contrast, are very sparse, and typically each request has a
unique deadline. Instead of being able to simply walk the list during
dequeue, with the deadline scheduler we have to iterate through the BST
on the critical submission path. Skiplists are vastly superior in this
instance due to the O(1) iteration during dequeue, with very similar
characteristics [on average] to the rbtree for insertion.

This means that by using skiplists we can introduce a sparse sort key
without degrading latency on the critical submission path.

As an example, one simple case where we try to do lots of
semi-independent work without any priority management (gem_exec_parallel),
the lock hold times were
[worst]        [total]    [avg]
 973.05     6301584.84     0.35 # plain rbtree
 559.82     5424915.25     0.33 # best rbtree with pruning
 208.21     3898784.09     0.24 # skiplist
  34.05     5784106.01     0.32 # rbtree without deadlines
  23.35     4152999.80     0.24 # skiplist without deadlines

-Chris
Tvrtko Ursulin Jan. 28, 2021, 3:56 p.m. UTC | #6
On 25/01/2021 14:01, Chris Wilson wrote:
> Replace the priolist rbtree with a skiplist. The crucial difference is
> that walking and removing the first element of a skiplist is O(1), but
> O(lgN) for an rbtree, as we need to rebalance on remove. This is a
> hindrance for submission latency as it occurs between picking a request
> for the priolist and submitting it to hardware, as well effectively
> trippling the number of O(lgN) operations required under the irqoff lock.
> This is critical to reducing the latency jitter with multiple clients.
> 
> The downsides to skiplists are that lookup/insertion is only
> probablistically O(lgN) and there is a significant memory penalty to
> as each skip node is larger than the rbtree equivalent. Furthermore, we
> don't use dynamic arrays for the skiplist, so the allocation is fixed,
> and imposes an upper bound on the scalability wrt to the number of
> inflight requests.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   .../drm/i915/gt/intel_execlists_submission.c  |  63 +++--
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  30 +--
>   drivers/gpu/drm/i915/i915_priolist_types.h    |  28 +-
>   drivers/gpu/drm/i915/i915_scheduler.c         | 244 ++++++++++++++----
>   drivers/gpu/drm/i915/i915_scheduler.h         |  11 +-
>   drivers/gpu/drm/i915/i915_scheduler_types.h   |   2 +-
>   .../drm/i915/selftests/i915_mock_selftests.h  |   1 +
>   .../gpu/drm/i915/selftests/i915_scheduler.c   |  53 +++-
>   8 files changed, 316 insertions(+), 116 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 1103c8a00af1..129144dd86b0 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -244,11 +244,6 @@ static void ring_set_paused(const struct intel_engine_cs *engine, int state)
>   		wmb();
>   }
>   
> -static struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
>   static int rq_prio(const struct i915_request *rq)
>   {
>   	return READ_ONCE(rq->sched.attr.priority);
> @@ -272,15 +267,31 @@ static int effective_prio(const struct i915_request *rq)
>   	return prio;
>   }
>   
> -static int queue_prio(const struct i915_sched_engine *se)
> +static struct i915_request *first_request(struct i915_sched_engine *se)
>   {
> -	struct rb_node *rb;
> +	struct i915_priolist *pl;
>   
> -	rb = rb_first_cached(&se->queue);
> -	if (!rb)
> +	for_each_priolist(pl, &se->queue) {
> +		if (likely(!list_empty(&pl->requests)))
> +			return list_first_entry(&pl->requests,
> +						struct i915_request,
> +						sched.link);
> +
> +		i915_priolist_advance(&se->queue, pl);
> +	}
> +
> +	return NULL;
> +}
> +
> +static int queue_prio(struct i915_sched_engine *se)
> +{
> +	struct i915_request *rq;
> +
> +	rq = first_request(se);
> +	if (!rq)
>   		return INT_MIN;
>   
> -	return to_priolist(rb)->priority;
> +	return rq_prio(rq);
>   }
>   
>   static int virtual_prio(const struct intel_engine_execlists *el)
> @@ -290,7 +301,7 @@ static int virtual_prio(const struct intel_engine_execlists *el)
>   	return rb ? rb_entry(rb, struct ve_node, rb)->prio : INT_MIN;
>   }
>   
> -static bool need_preempt(const struct intel_engine_cs *engine,
> +static bool need_preempt(struct intel_engine_cs *engine,
>   			 const struct i915_request *rq)
>   {
>   	int last_prio;
> @@ -1136,6 +1147,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	struct i915_request ** const last_port = port + execlists->port_mask;
>   	struct i915_request *last, * const *active;
>   	struct virtual_engine *ve;
> +	struct i915_priolist *pl;
>   	struct rb_node *rb;
>   	bool submit = false;
>   
> @@ -1346,11 +1358,10 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   			break;
>   	}
>   
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> +	for_each_priolist(pl, &engine->active.queue) {
>   		struct i915_request *rq, *rn;
>   
> -		priolist_for_each_request_consume(rq, rn, p) {
> +		priolist_for_each_request_safe(rq, rn, pl) {
>   			bool merge = true;
>   
>   			/*
> @@ -1425,8 +1436,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   			}
>   		}
>   
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, pl);
>   	}
>   done:
>   	*port++ = i915_request_get(last);
> @@ -2631,6 +2641,7 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
>   	struct i915_request *rq, *rn;
> +	struct i915_priolist *pl;
>   	struct rb_node *rb;
>   	unsigned long flags;
>   
> @@ -2661,16 +2672,12 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>   	intel_engine_signal_breadcrumbs(engine);
>   
>   	/* Flush the queued requests to the timeline list (for retiring). */
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -
> -		priolist_for_each_request_consume(rq, rn, p) {
> +	for_each_priolist(pl, &engine->active.queue) {
> +		priolist_for_each_request_safe(rq, rn, pl) {
>   			i915_request_mark_eio(rq);
>   			__i915_request_submit(rq);
>   		}
> -
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, pl);
>   	}
>   	GEM_BUG_ON(!i915_sched_is_idle(&engine->active));
>   
> @@ -2703,7 +2710,6 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>   	/* Remaining _unready_ requests will be nop'ed when submitted */
>   
>   	execlists->queue_priority_hint = INT_MIN;
> -	engine->active.queue = RB_ROOT_CACHED;
>   
>   	GEM_BUG_ON(__tasklet_is_enabled(&engine->active.tasklet));
>   	engine->active.tasklet.func = nop_submission_tasklet;
> @@ -3089,6 +3095,8 @@ static void virtual_context_exit(struct intel_context *ce)
>   
>   	for (n = 0; n < ve->num_siblings; n++)
>   		intel_engine_pm_put(ve->siblings[n]);
> +
> +	i915_sched_park_engine(&ve->base.active);
>   }
>   
>   static const struct intel_context_ops virtual_context_ops = {
> @@ -3501,6 +3509,7 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
>   {
>   	const struct intel_engine_execlists *execlists = &engine->execlists;
>   	struct i915_request *rq, *last;
> +	struct i915_priolist *pl;
>   	unsigned long flags;
>   	unsigned int count;
>   	struct rb_node *rb;
> @@ -3530,10 +3539,8 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
>   
>   	last = NULL;
>   	count = 0;
> -	for (rb = rb_first_cached(&engine->active.queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> -
> -		priolist_for_each_request(rq, p) {
> +	for_each_priolist(pl, &engine->active.queue) {
> +		priolist_for_each_request(rq, pl) {
>   			if (count++ < max - 1)
>   				show_request(m, rq, "\t\t", 0);
>   			else
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 2d7339ef3b4c..8d0c6cd277b3 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -59,11 +59,6 @@
>   
>   #define GUC_REQUEST_SIZE 64 /* bytes */
>   
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
>   static struct guc_stage_desc *__get_stage_desc(struct intel_guc *guc, u32 id)
>   {
>   	struct guc_stage_desc *base = guc->stage_desc_pool_vaddr;
> @@ -185,8 +180,8 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   	struct i915_request ** const last_port = first + execlists->port_mask;
>   	struct i915_request *last = first[0];
>   	struct i915_request **port;
> +	struct i915_priolist *pl;
>   	bool submit = false;
> -	struct rb_node *rb;
>   
>   	lockdep_assert_held(&engine->active.lock);
>   
> @@ -203,11 +198,10 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   	 * event.
>   	 */
>   	port = first;
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> +	for_each_priolist(pl, &engine->active.queue) {
>   		struct i915_request *rq, *rn;
>   
> -		priolist_for_each_request_consume(rq, rn, p) {
> +		priolist_for_each_request_safe(rq, rn, pl) {
>   			if (last && rq->context != last->context) {
>   				if (port == last_port)
>   					goto done;
> @@ -223,12 +217,11 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   			last = rq;
>   		}
>   
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, pl);
>   	}
>   done:
>   	execlists->queue_priority_hint =
> -		rb ? to_priolist(rb)->priority : INT_MIN;
> +		pl != &engine->active.queue.sentinel ? pl->priority : INT_MIN;
>   	if (submit) {
>   		*port = schedule_in(last, port - execlists->inflight);
>   		*++port = NULL;
> @@ -327,7 +320,7 @@ static void guc_reset_cancel(struct intel_engine_cs *engine)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
>   	struct i915_request *rq, *rn;
> -	struct rb_node *rb;
> +	struct i915_priolist *p;
>   	unsigned long flags;
>   
>   	ENGINE_TRACE(engine, "\n");
> @@ -355,25 +348,20 @@ static void guc_reset_cancel(struct intel_engine_cs *engine)
>   	}
>   
>   	/* Flush the queued requests to the timeline list (for retiring). */
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -
> -		priolist_for_each_request_consume(rq, rn, p) {
> +	for_each_priolist(p, &engine->active.queue) {
> +		priolist_for_each_request_safe(rq, rn, p) {
>   			list_del_init(&rq->sched.link);
>   			__i915_request_submit(rq);
>   			dma_fence_set_error(&rq->fence, -EIO);
>   			i915_request_mark_complete(rq);
>   		}
> -
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, p);
>   	}
>   	GEM_BUG_ON(!i915_sched_is_idle(&engine->active));
>   
>   	/* Remaining _unready_ requests will be nop'ed when submitted */
>   
>   	execlists->queue_priority_hint = INT_MIN;
> -	engine->active.queue = RB_ROOT_CACHED;
>   
>   	spin_unlock_irqrestore(&engine->active.lock, flags);
>   }
> diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
> index bc2fa84f98a8..1200c3df6a4a 100644
> --- a/drivers/gpu/drm/i915/i915_priolist_types.h
> +++ b/drivers/gpu/drm/i915/i915_priolist_types.h
> @@ -38,10 +38,36 @@ enum {
>   #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
>   #define I915_PRIORITY_BARRIER (I915_PRIORITY_UNPREEMPTABLE - 1)
>   
> +#ifdef CONFIG_64BIT
> +#define I915_PRIOLIST_HEIGHT 12
> +#else
> +#define I915_PRIOLIST_HEIGHT 11
> +#endif

I did not get this. On one hand I could think pointers are larger on 
64-bit, so go for fewer levels if size was a concern. But on the other 
hand 32-bit is less important these days, definitely much less so as a 
performance platform. So going for less memory use => worse performance 
on the less important platform, which typically could be more memory 
constrained? Not sure I see it as important enough either way to be 
distinctive, but a comment would satisfy me.

> +
>   struct i915_priolist {
>   	struct list_head requests;

What would be on this list? A request can only be on one at a time, so I 
was thinking these nodes would have pointers to the list of that priority, 
rather than lists themselves. Assuming there can be multiple nodes of 
the same priority in the 2d hierarchy. Possibly I don't understand the 
layout.

> -	struct rb_node node;
>   	int priority;
> +
> +	int level;
> +	struct i915_priolist *next[I915_PRIOLIST_HEIGHT];

Does every node need the maximum height, or could you allocate depending 
on the current height?

>   };
>   
> +struct i915_priolist_root {
> +	struct i915_priolist sentinel;
> +	u32 prng;
> +};
> +
> +#define i915_priolist_is_empty(root) ((root)->sentinel.level < 0)
> +
> +#define for_each_priolist(p, root) \
> +	for ((p) = (root)->sentinel.next[0]; \
> +	     (p) != &(root)->sentinel; \
> +	     (p) = (p)->next[0])
> +
> +#define priolist_for_each_request(it, plist) \
> +	list_for_each_entry(it, &(plist)->requests, sched.link)
> +
> +#define priolist_for_each_request_safe(it, n, plist) \
> +	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> +
>   #endif /* _I915_PRIOLIST_TYPES_H_ */
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index a3ee06cb66d7..74000d3eebb1 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -4,7 +4,9 @@
>    * Copyright © 2018 Intel Corporation
>    */
>   
> +#include <linux/bitops.h>
>   #include <linux/mutex.h>
> +#include <linux/prandom.h>
>   
>   #include "gt/intel_ring.h"
>   #include "gt/intel_lrc_reg.h"
> @@ -91,15 +93,24 @@ static void i915_sched_init_ipi(struct i915_sched_ipi *ipi)
>   	ipi->list = NULL;
>   }
>   
> +static void init_priolist(struct i915_priolist_root *const root)
> +{
> +	struct i915_priolist *pl = &root->sentinel;
> +
> +	memset_p((void **)pl->next, pl, ARRAY_SIZE(pl->next));
> +	pl->priority = INT_MIN;
> +	pl->level = -1;
> +}
> +
>   void i915_sched_init_engine(struct i915_sched_engine *se,
>   			    unsigned int subclass)
>   {
>   	spin_lock_init(&se->lock);
>   	lockdep_set_subclass(&se->lock, subclass);
>   
> +	init_priolist(&se->queue);
>   	INIT_LIST_HEAD(&se->requests);
>   	INIT_LIST_HEAD(&se->hold);
> -	se->queue = RB_ROOT_CACHED;
>   
>   	i915_sched_init_ipi(&se->ipi);
>   
> @@ -116,8 +127,57 @@ void i915_sched_init_engine(struct i915_sched_engine *se,
>   #endif
>   }
>   
> +__maybe_unused static bool priolist_idle(struct i915_priolist_root *root)
> +{
> +	struct i915_priolist *pl = &root->sentinel;
> +	int lvl;
> +
> +	for (lvl = 0; lvl < ARRAY_SIZE(pl->next); lvl++) {
> +		if (pl->next[lvl] != pl) {
> +			GEM_TRACE_ERR("root[%d] is not empty\n", lvl);
> +			return false;
> +		}
> +	}
> +
> +	if (pl->level != -1) {
> +		GEM_TRACE_ERR("root is not clear: %d\n", pl->level);
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static void pl_push(struct i915_priolist *pl, struct list_head *head)
> +{
> +	pl->requests.next = head->next;
> +	head->next = &pl->requests;
> +}
> +
> +static struct i915_priolist *pl_pop(struct list_head *head)
> +{
> +	struct i915_priolist *pl;
> +
> +	pl = container_of(head->next, typeof(*pl), requests);
> +	head->next = pl->requests.next;
> +
> +	return pl;
> +}
> +
> +static bool pl_empty(struct list_head *head)
> +{
> +	return !head->next;
> +}
> +
>   void i915_sched_park_engine(struct i915_sched_engine *se)
>   {
> +	struct i915_priolist_root *root = &se->queue;
> +	struct list_head *list = &root->sentinel.requests;
> +
> +	GEM_BUG_ON(!priolist_idle(root));
> +
> +	while (!pl_empty(list))
> +		kmem_cache_free(global.slab_priorities, pl_pop(list));
> +
>   	GEM_BUG_ON(!i915_sched_is_idle(se));
>   	se->no_priolist = false;
>   }
> @@ -183,71 +243,55 @@ static inline bool node_signaled(const struct i915_sched_node *node)
>   	return i915_request_completed(node_to_request(node));
>   }
>   
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +static inline unsigned int random_level(struct i915_priolist_root *root)
>   {
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
> -static void assert_priolists(struct i915_sched_engine * const se)
> -{
> -	struct rb_node *rb;
> -	long last_prio;
> -
> -	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> -		return;
> -
> -	GEM_BUG_ON(rb_first_cached(&se->queue) !=
> -		   rb_first(&se->queue.rb_root));
> -
> -	last_prio = INT_MAX;
> -	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> -		const struct i915_priolist *p = to_priolist(rb);
> -
> -		GEM_BUG_ON(p->priority > last_prio);
> -		last_prio = p->priority;
> -	}
> +	root->prng = next_pseudo_random32(root->prng);
> +	return  __ffs(root->prng) / 2;

Where is the relationship to I915_PRIOLIST_HEIGHT? It feels like 
root->prng % I915_PRIOLIST_HEIGHT would be more obvious here, unless I am 
terribly mistaken. Or at least put a comment saying why the hack.

>   }
>   
>   static struct list_head *
>   lookup_priolist(struct intel_engine_cs *engine, int prio)
>   {
> +	struct i915_priolist *update[I915_PRIOLIST_HEIGHT];
>   	struct i915_sched_engine * const se = &engine->active;
> -	struct i915_priolist *p;
> -	struct rb_node **parent, *rb;
> -	bool first = true;
> -
> -	lockdep_assert_held(&engine->active.lock);
> -	assert_priolists(se);
> +	struct i915_priolist_root *root = &se->queue;
> +	struct i915_priolist *pl, *tmp;
> +	int lvl;
>   
> +	lockdep_assert_held(&se->lock);
>   	if (unlikely(se->no_priolist))
>   		prio = I915_PRIORITY_NORMAL;
>   
> +	for_each_priolist(pl, root) { /* recycle any empty elements before us */
> +		if (pl->priority >= prio || !list_empty(&pl->requests))
> +			break;
> +
> +		i915_priolist_advance(root, pl);
> +	}
> +
>   find_priolist:
> -	/* most positive priority is scheduled first, equal priorities fifo */
> -	rb = NULL;
> -	parent = &se->queue.rb_root.rb_node;
> -	while (*parent) {
> -		rb = *parent;
> -		p = to_priolist(rb);
> -		if (prio > p->priority) {
> -			parent = &rb->rb_left;
> -		} else if (prio < p->priority) {
> -			parent = &rb->rb_right;
> -			first = false;
> -		} else {
> -			return &p->requests;
> -		}
> +	pl = &root->sentinel;
> +	lvl = pl->level;
> +	while (lvl >= 0) {
> +		while (tmp = pl->next[lvl], tmp->priority >= prio)
> +			pl = tmp;
> +		if (pl->priority == prio)
> +			goto out;
> +		update[lvl--] = pl;
>   	}
>   
>   	if (prio == I915_PRIORITY_NORMAL) {
> -		p = &se->default_priolist;
> +		pl = &se->default_priolist;
> +	} else if (!pl_empty(&root->sentinel.requests)) {
> +		pl = pl_pop(&root->sentinel.requests);
>   	} else {
> -		p = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
> +		pl = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
>   		/* Convert an allocation failure to a priority bump */
> -		if (unlikely(!p)) {
> +		if (unlikely(!pl)) {
>   			prio = I915_PRIORITY_NORMAL; /* recurses just once */
>   
> -			/* To maintain ordering with all rendering, after an
> +			/*
> +			 * To maintain ordering with all rendering, after an
>   			 * allocation failure we have to disable all scheduling.
>   			 * Requests will then be executed in fifo, and schedule
>   			 * will ensure that dependencies are emitted in fifo.
> @@ -260,18 +304,103 @@ lookup_priolist(struct intel_engine_cs *engine, int prio)
>   		}
>   	}
>   
> -	p->priority = prio;
> -	INIT_LIST_HEAD(&p->requests);
> +	pl->priority = prio;
> +	INIT_LIST_HEAD(&pl->requests);
>   
> -	rb_link_node(&p->node, rb, parent);
> -	rb_insert_color_cached(&p->node, &se->queue, first);
> +	lvl = random_level(root);
> +	if (lvl > root->sentinel.level) {
> +		if (root->sentinel.level < I915_PRIOLIST_HEIGHT - 1) {
> +			lvl = ++root->sentinel.level;

root->sentinel.level is the maximum currently populated height? So if
random_level said to insert at level 4 but there are currently only 2 levels,
the height will grow by one only?

> +			update[lvl] = &root->sentinel;
> +		} else {
> +			lvl = I915_PRIOLIST_HEIGHT - 1;

But if the maximum level has already been reached then this branch does not
set anything in update[], relying on the while loop earlier in the
function having populated it? How should I think of the update array?

Regards,

Tvrtko

> +		}
> +	}
> +	GEM_BUG_ON(lvl < 0);
> +	GEM_BUG_ON(lvl >= ARRAY_SIZE(pl->next));
>   
> -	return &p->requests;
> +	pl->level = lvl;
> +	do {
> +		tmp = update[lvl];
> +		pl->next[lvl] = update[lvl]->next[lvl];
> +		tmp->next[lvl] = pl;
> +	} while (--lvl >= 0);
> +
> +	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
> +		struct i915_priolist *chk;
> +
> +		chk = &root->sentinel;
> +		lvl = chk->level;
> +		do {
> +			while (tmp = chk->next[lvl], tmp->priority >= prio)
> +				chk = tmp;
> +		} while (--lvl >= 0);
> +
> +		GEM_BUG_ON(chk != pl);
> +	}
> +
> +out:
> +	GEM_BUG_ON(pl == &root->sentinel);
> +	return &pl->requests;
>   }
>   
> -void __i915_priolist_free(struct i915_priolist *p)
> +static void remove_priolist(struct intel_engine_cs *engine,
> +			    struct list_head *plist)
>   {
> -	kmem_cache_free(global.slab_priorities, p);
> +	struct i915_sched_engine * const se = &engine->active;
> +	struct i915_priolist_root *root = &se->queue;
> +	struct i915_priolist *pl, *tmp;
> +	struct i915_priolist *old =
> +		container_of(plist, struct i915_priolist, requests);
> +	int prio = old->priority;
> +	int lvl;
> +
> +	lockdep_assert_held(&se->lock);
> +	GEM_BUG_ON(!list_empty(plist));
> +
> +	pl = &root->sentinel;
> +	lvl = pl->level;
> +	GEM_BUG_ON(lvl < 0);
> +
> +	if (prio != I915_PRIORITY_NORMAL)
> +		pl_push(old, &pl->requests);
> +
> +	do {
> +		while (tmp = pl->next[lvl], tmp->priority > prio)
> +			pl = tmp;
> +		if (lvl <= old->level) {
> +			pl->next[lvl] = old->next[lvl];
> +			if (pl == &root->sentinel && old->next[lvl] == pl) {
> +				GEM_BUG_ON(pl->level != lvl);
> +				pl->level--;
> +			}
> +		}
> +	} while (--lvl >= 0);
> +	GEM_BUG_ON(tmp != old);
> +}
> +
> +void i915_priolist_advance(struct i915_priolist_root *root,
> +			   struct i915_priolist *pl)
> +{
> +	struct i915_priolist * const s = &root->sentinel;
> +	int lvl;
> +
> +	GEM_BUG_ON(!list_empty(&pl->requests));
> +	GEM_BUG_ON(pl != s->next[0]);
> +	GEM_BUG_ON(pl == s);
> +
> +	if (pl->priority != I915_PRIORITY_NORMAL)
> +		pl_push(pl, &s->requests);
> +
> +	lvl = pl->level;
> +	GEM_BUG_ON(lvl < 0);
> +	do {
> +		s->next[lvl] = pl->next[lvl];
> +		if (pl->next[lvl] == s) {
> +			GEM_BUG_ON(s->level != lvl);
> +			s->level--;
> +		}
> +	} while (--lvl >= 0);
>   }
>   
>   static struct i915_request *
> @@ -420,8 +549,13 @@ static void __i915_request_set_priority(struct i915_request *rq, int prio)
>   			continue;
>   
>   		GEM_BUG_ON(rq->engine != engine);
> -		if (i915_request_in_priority_queue(rq))
> +		if (i915_request_in_priority_queue(rq)) {
> +			struct list_head *prev = rq->sched.link.prev;
> +
>   			list_move_tail(&rq->sched.link, plist);
> +			if (list_empty(prev))
> +				remove_priolist(engine, prev);
> +		}
>   
>   		/* Defer (tasklet) submission until after all updates. */
>   		kick_submission(engine, rq, prio);
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> index 0ab47cbf0e9c..bca89a58d953 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> @@ -16,12 +16,6 @@
>   
>   struct drm_printer;
>   
> -#define priolist_for_each_request(it, plist) \
> -	list_for_each_entry(it, &(plist)->requests, sched.link)
> -
> -#define priolist_for_each_request_consume(it, n, plist) \
> -	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> -
>   void i915_sched_node_init(struct i915_sched_node *node);
>   void i915_sched_node_reinit(struct i915_sched_node *node);
>   
> @@ -69,7 +63,7 @@ static inline void i915_priolist_free(struct i915_priolist *p)
>   
>   static inline bool i915_sched_is_idle(const struct i915_sched_engine *se)
>   {
> -	return RB_EMPTY_ROOT(&se->queue.rb_root);
> +	return i915_priolist_is_empty(&se->queue);
>   }
>   
>   static inline bool
> @@ -99,6 +93,9 @@ static inline void i915_sched_kick(struct i915_sched_engine *se)
>   	tasklet_hi_schedule(&se->tasklet);
>   }
>   
> +void i915_priolist_advance(struct i915_priolist_root *root,
> +			   struct i915_priolist *old);
> +
>   void i915_request_show_with_schedule(struct drm_printer *m,
>   				     const struct i915_request *rq,
>   				     const char *prefix,
> diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h
> index f668c680d290..e64750be4e77 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler_types.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler_types.h
> @@ -89,7 +89,7 @@ struct i915_sched_engine {
>   	/**
>   	 * @queue: queue of requests, in priority lists
>   	 */
> -	struct rb_root_cached queue;
> +	struct i915_priolist_root queue;
>   
>   	struct i915_sched_ipi ipi;
>   
> diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> index 3db34d3eea58..946c93441c1f 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> @@ -25,6 +25,7 @@ selftest(ring, intel_ring_mock_selftests)
>   selftest(engine, intel_engine_cs_mock_selftests)
>   selftest(timelines, intel_timeline_mock_selftests)
>   selftest(requests, i915_request_mock_selftests)
> +selftest(scheduler, i915_scheduler_mock_selftests)
>   selftest(objects, i915_gem_object_mock_selftests)
>   selftest(phys, i915_gem_phys_mock_selftests)
>   selftest(dmabuf, i915_gem_dmabuf_mock_selftests)
> diff --git a/drivers/gpu/drm/i915/selftests/i915_scheduler.c b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> index de44a66210b7..de5b1443129b 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> @@ -12,6 +12,54 @@
>   #include "selftests/igt_spinner.h"
>   #include "selftests/i915_random.h"
>   
> +static int mock_skiplist_levels(void *dummy)
> +{
> +	struct i915_priolist_root root = {};
> +	struct i915_priolist *pl = &root.sentinel;
> +	IGT_TIMEOUT(end_time);
> +	unsigned long total;
> +	int count, lvl;
> +
> +	total = 0;
> +	do {
> +		for (count = 0; count < 16384; count++) {
> +			lvl = random_level(&root);
> +			if (lvl > pl->level) {
> +				if (lvl < I915_PRIOLIST_HEIGHT - 1)
> +					lvl = ++pl->level;
> +				else
> +					lvl = I915_PRIOLIST_HEIGHT - 1;
> +			}
> +
> +			pl->next[lvl] = ptr_inc(pl->next[lvl]);
> +		}
> +		total += count;
> +	} while (!__igt_timeout(end_time, NULL));
> +
> +	pr_info("Total %9lu\n", total);
> +	for (lvl = 0; lvl <= pl->level; lvl++) {
> +		int x = ilog2((unsigned long)pl->next[lvl]);
> +		char row[80];
> +
> +		memset(row, '*', x);
> +		row[x] = '\0';
> +
> +		pr_info(" [%2d] %9lu %s\n",
> +			lvl, (unsigned long)pl->next[lvl], row);
> +	}
> +
> +	return 0;
> +}
> +
> +int i915_scheduler_mock_selftests(void)
> +{
> +	static const struct i915_subtest tests[] = {
> +		SUBTEST(mock_skiplist_levels),
> +	};
> +
> +	return i915_subtests(tests, NULL);
> +}
> +
>   static void scheduling_disable(struct intel_engine_cs *engine)
>   {
>   	engine->props.preempt_timeout_ms = 0;
> @@ -80,9 +128,9 @@ static int all_engines(struct drm_i915_private *i915,
>   static bool check_context_order(struct intel_engine_cs *engine)
>   {
>   	u64 last_seqno, last_context;
> +	struct i915_priolist *p;
>   	unsigned long count;
>   	bool result = false;
> -	struct rb_node *rb;
>   	int last_prio;
>   
>   	/* We expect the execution order to follow ascending fence-context */
> @@ -92,8 +140,7 @@ static bool check_context_order(struct intel_engine_cs *engine)
>   	last_context = 0;
>   	last_seqno = 0;
>   	last_prio = 0;
> -	for (rb = rb_first_cached(&engine->active.queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +	for_each_priolist(p, &engine->active.queue) {
>   		struct i915_request *rq;
>   
>   		priolist_for_each_request(rq, p) {
>
Chris Wilson Jan. 28, 2021, 4:26 p.m. UTC | #7
Quoting Tvrtko Ursulin (2021-01-28 15:56:19)
> On 25/01/2021 14:01, Chris Wilson wrote:
> > diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
> > index bc2fa84f98a8..1200c3df6a4a 100644
> > --- a/drivers/gpu/drm/i915/i915_priolist_types.h
> > +++ b/drivers/gpu/drm/i915/i915_priolist_types.h
> > @@ -38,10 +38,36 @@ enum {
> >   #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
> >   #define I915_PRIORITY_BARRIER (I915_PRIORITY_UNPREEMPTABLE - 1)
> >   
> > +#ifdef CONFIG_64BIT
> > +#define I915_PRIOLIST_HEIGHT 12
> > +#else
> > +#define I915_PRIOLIST_HEIGHT 11
> > +#endif
> 
> I did not get this. On one hand I could think pointers are larger on 
> 64-bit so go for fewer levels, if size was a concern. But on the other 
> hand 32-bit is less important these days, definitely much less as a 
> performance platform. So going for less memory use => worse performance 
> on a less important platform, which typically could be more memory 
> constrained? Not sure I see it as that important either way to be 
> distinctive but a comment would satisfy me.

Just aligned to the cacheline. The struct is 128B on 64b and 64B on 32b.
On 64b, we will scale to around 16 million requests in flight and 4
million on 32b, which should be enough.

If we shrunk the 64b struct to a 64B node, we would only scale to 256
requests, a limit we will definitely exceed.
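
(Rough arithmetic behind those figures, assuming the usual rule of thumb
that a skiplist with P=1/4 promotion comfortably indexes on the order of
4^height nodes: 4^12 ~ 16.7M for the 64b HEIGHT of 12, 4^11 ~ 4.2M for
32b, and a height-4 node would top out around 4^4 = 256.)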

> >   struct i915_priolist {
> >       struct list_head requests;
> 
> What would be on this list? Request can only be on one at a time, so I 
> was thinking these nodes would have pointers to list of that priority, 
> rather than lists themselves. Assuming there can be multiple nodes of 
> the same priority in the 2d hierarcy. Possibly I don't understand the 
> layout.

A request is only on one list (queue, active, hold). But we may still
have more than one request at the same deadline, though that will likely
be limited to priority-inheritance and timeslice deferrals.

Since we would need a pointer to the request, we could only reclaim a
single pointer here, which is not enough to warrant reducing the overall
node size. And while there is at least one user of request->sched.link,
the list maintenance will still be incurred. Using request->sched.link
remains a convenient interface.

> 
> > -     struct rb_node node;
> >       int priority;
> > +
> > +     int level;
> > +     struct i915_priolist *next[I915_PRIOLIST_HEIGHT];
> 
> Does every node need maximum height or you could allocated depending on 
> current height?

Every slab allocation here is a power of 2, so there are only a few
different options that are worthwhile (on 64b the only other choice is
[4], unless you want to go larger to [28]). It did not feel like enough
benefit to justify the extra code.
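
(For reference, the rough sizeof arithmetic, ignoring padding details:
on 64b, list_head (16) + priority (4) + level (4) + next[12] (96) = 120,
rounded up to 128B, i.e. two 64B cachelines; dropping to next[4] gives
56 -> 64B. On 32b, 8 + 4 + 4 + next[11] (44) = 60 -> 64B.)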

> > -static void assert_priolists(struct i915_sched_engine * const se)
> > -{
> > -     struct rb_node *rb;
> > -     long last_prio;
> > -
> > -     if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> > -             return;
> > -
> > -     GEM_BUG_ON(rb_first_cached(&se->queue) !=
> > -                rb_first(&se->queue.rb_root));
> > -
> > -     last_prio = INT_MAX;
> > -     for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> > -             const struct i915_priolist *p = to_priolist(rb);
> > -
> > -             GEM_BUG_ON(p->priority > last_prio);
> > -             last_prio = p->priority;
> > -     }
> > +     root->prng = next_pseudo_random32(root->prng);
> > +     return  __ffs(root->prng) / 2;
> 
> Where is the relationship to I915_PRIOLIST_HEIGHT? Feels root->prng % 
> I915_PRIOLIST_HEIGHT would be more obvious here unless I am terribly 
> mistaken. Or at least put a comment saying why the hack.

HEIGHT is the maximum possible for our struct. Skiplists only want to
increment the height of the tree one step at a time. So we choose a level
with decreasing probability, and then limit that to the maximum height of
the current tree + 1, clamped to HEIGHT.

You might notice that, unlike traditional skiplists, this uses a
probability of 0.25 for each additional level. It's a neat trick discovered
by Con Kolivas (I haven't found it mentioned elsewhere): the cost of the
extra level (using P=.5) is the same as the cost of the extra chain length
with P=.25. So you can scale to a higher number of requests by packing more
requests into each level.

So that is split between randomly choosing a level and then working out
the height of the node.
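
To spell out the distribution, here is random_level() again, annotated
(this is the same code as in the patch, just with the probabilities
written out; the clamp against the current tree height happens at the
caller):

	static inline unsigned int random_level(struct i915_priolist_root *root)
	{
		/* cheap LCG; only used for picking node heights */
		root->prng = next_pseudo_random32(root->prng);

		/*
		 * __ffs() gives the index of the lowest set bit. For a
		 * uniform random word, P(__ffs >= 2k) = 1/4^k, so after
		 * dividing by two we land on level L with probability
		 * (3/4) * (1/4)^L, i.e. each extra level is taken with
		 * P = 0.25.
		 */
		return __ffs(root->prng) / 2;
	}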

> >   static struct list_head *
> >   lookup_priolist(struct intel_engine_cs *engine, int prio)
> >   {
> > +     struct i915_priolist *update[I915_PRIOLIST_HEIGHT];
> >       struct i915_sched_engine * const se = &engine->active;
> > -     struct i915_priolist *p;
> > -     struct rb_node **parent, *rb;
> > -     bool first = true;
> > -
> > -     lockdep_assert_held(&engine->active.lock);
> > -     assert_priolists(se);
> > +     struct i915_priolist_root *root = &se->queue;
> > +     struct i915_priolist *pl, *tmp;
> > +     int lvl;
> >   
> > +     lockdep_assert_held(&se->lock);
> >       if (unlikely(se->no_priolist))
> >               prio = I915_PRIORITY_NORMAL;
> >   
> > +     for_each_priolist(pl, root) { /* recycle any empty elements before us */
> > +             if (pl->priority >= prio || !list_empty(&pl->requests))
> > +                     break;
> > +
> > +             i915_priolist_advance(root, pl);
> > +     }
> > +
> >   find_priolist:
> > -     /* most positive priority is scheduled first, equal priorities fifo */
> > -     rb = NULL;
> > -     parent = &se->queue.rb_root.rb_node;
> > -     while (*parent) {
> > -             rb = *parent;
> > -             p = to_priolist(rb);
> > -             if (prio > p->priority) {
> > -                     parent = &rb->rb_left;
> > -             } else if (prio < p->priority) {
> > -                     parent = &rb->rb_right;
> > -                     first = false;
> > -             } else {
> > -                     return &p->requests;
> > -             }
> > +     pl = &root->sentinel;
> > +     lvl = pl->level;
> > +     while (lvl >= 0) {
> > +             while (tmp = pl->next[lvl], tmp->priority >= prio)
> > +                     pl = tmp;
> > +             if (pl->priority == prio)
> > +                     goto out;
> > +             update[lvl--] = pl;
> >       }
> >   
> >       if (prio == I915_PRIORITY_NORMAL) {
> > -             p = &se->default_priolist;
> > +             pl = &se->default_priolist;
> > +     } else if (!pl_empty(&root->sentinel.requests)) {
> > +             pl = pl_pop(&root->sentinel.requests);
> >       } else {
> > -             p = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
> > +             pl = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
> >               /* Convert an allocation failure to a priority bump */
> > -             if (unlikely(!p)) {
> > +             if (unlikely(!pl)) {
> >                       prio = I915_PRIORITY_NORMAL; /* recurses just once */
> >   
> > -                     /* To maintain ordering with all rendering, after an
> > +                     /*
> > +                      * To maintain ordering with all rendering, after an
> >                        * allocation failure we have to disable all scheduling.
> >                        * Requests will then be executed in fifo, and schedule
> >                        * will ensure that dependencies are emitted in fifo.
> > @@ -260,18 +304,103 @@ lookup_priolist(struct intel_engine_cs *engine, int prio)
> >               }
> >       }
> >   
> > -     p->priority = prio;
> > -     INIT_LIST_HEAD(&p->requests);
> > +     pl->priority = prio;
> > +     INIT_LIST_HEAD(&pl->requests);
> >   
> > -     rb_link_node(&p->node, rb, parent);
> > -     rb_insert_color_cached(&p->node, &se->queue, first);
> > +     lvl = random_level(root);
> > +     if (lvl > root->sentinel.level) {
> > +             if (root->sentinel.level < I915_PRIOLIST_HEIGHT - 1) {
> > +                     lvl = ++root->sentinel.level;
> 
> root->sentinel.level is maximum currently populated height? So if 
> random_level said insert at 4 but there are currently only 2 levels, 
> height will grow by one only?

Yes. The idea is to keep the number of next[] levels as small as possible
for the number of nodes in the tree. (The height of the tree is the
constant overhead in list traversal.)

> > +                     update[lvl] = &root->sentinel;
> > +             } else {
> > +                     lvl = I915_PRIOLIST_HEIGHT - 1;
> 
> But if maximum level already has been reached then this branch does not 
> set anything to update[],

at the next level.

> relying on the while loop earlier in the 
> function has populated it? How should I think of the update array?

The update[] is the array of nodes just before the position we need to
insert at. So update[] need only cover the height of the tree at that time,
and if we decide to grow the tree, update[height] will be the root node,
as we will be the first entry in that level.
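
Condensed to just the role of update[] (a sketch only: the allocation, the
equal-priority early-out and the grow-by-one bookkeeping from
lookup_priolist() are elided, and assume node->level has already been
chosen):

	struct i915_priolist *pl = &root->sentinel, *tmp;
	int lvl;

	/*
	 * Descend from the top: on each level walk right while the next
	 * node still has priority >= prio, then record where we stopped
	 * as the insertion point for that level before dropping down.
	 */
	for (lvl = pl->level; lvl >= 0; lvl--) {
		while (tmp = pl->next[lvl], tmp->priority >= prio)
			pl = tmp;
		update[lvl] = pl;
	}

	/* Splice the new node in after update[lvl] on every level it occupies */
	for (lvl = node->level; lvl >= 0; lvl--) {
		node->next[lvl] = update[lvl]->next[lvl];
		update[lvl]->next[lvl] = node;
	}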
-Chris
Tvrtko Ursulin Jan. 28, 2021, 4:42 p.m. UTC | #8
On 28/01/2021 16:26, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2021-01-28 15:56:19)
>> On 25/01/2021 14:01, Chris Wilson wrote:
>>> diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
>>> index bc2fa84f98a8..1200c3df6a4a 100644
>>> --- a/drivers/gpu/drm/i915/i915_priolist_types.h
>>> +++ b/drivers/gpu/drm/i915/i915_priolist_types.h
>>> @@ -38,10 +38,36 @@ enum {
>>>    #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
>>>    #define I915_PRIORITY_BARRIER (I915_PRIORITY_UNPREEMPTABLE - 1)
>>>    
>>> +#ifdef CONFIG_64BIT
>>> +#define I915_PRIOLIST_HEIGHT 12
>>> +#else
>>> +#define I915_PRIOLIST_HEIGHT 11
>>> +#endif
>>
>> I did not get this. On one hand I could think pointers are larger on
>> 64-bit so go for fewer levels, if size was a concern. But on the other
>> hand 32-bit is less important these days, definitely much less as a
>> performance platform. So going for less memory use => worse performance
>> on a less important platform, which typically could be more memory
>> constrained? Not sure I see it as that important either way to be
>> distinctive but a comment would satisfy me.
> 
> Just aligned to the cacheline. The struct is 128B on 64b and 64B on 32b.
> On 64B, we will scale to around 16 million requests in flight and 4
> million on 32b. Which should be enough.
> 
> If we shrunk 64b to a 64B node, we would only scale to 256 requests
> which limit we definitely will exceed.

Ok thanks, pouring it into a comment is implied.

> 
>>>    struct i915_priolist {
>>>        struct list_head requests;
>>
>> What would be on this list? Request can only be on one at a time, so I
>> was thinking these nodes would have pointers to list of that priority,
>> rather than lists themselves. Assuming there can be multiple nodes of
>> the same priority in the 2d hierarcy. Possibly I don't understand the
>> layout.
> 
> A request is only on one list (queue, active, hold). But we may still
> have more than one request at the same deadline, though that will likely
> be limited to priority-inheritance and timeslice deferrals.
> 
> Since we would need pointer to the request, we could only reclaim a
> single pointer here, which is not enough to warrant reducing the overall
> node size. And while there is at least one user of request->sched.link,
> the list maintenance will still be incurred. Using request->sched.link
> remains a convenient interface.

Lost you.

Is the data structure like this? I will limit it to priorities for
simplicity:

    Level1:	[-1]------------->[1]
    Level0: 	[-1]---->[0]----->[1]
[SENTINEL]

Each of the boxes is struct i915_priolist?

The sentinel contains pointers to the first i915_priolist for each level. Or
maybe it could contain just a single pointer to the highest level (most
sparse) list.

And then each box is an i915_priolist, singly linked to the next, in order.

But it should also have a single pointer for down, or up (or both)? I 
don't understand why you have up to "max levels" pointers in each.

And each box should then contain a pointer to a list of requests. They
cannot each have their own list since there are duplicates.

But obviously I am understanding something way wrong.

> 
>>
>>> -     struct rb_node node;
>>>        int priority;
>>> +
>>> +     int level;
>>> +     struct i915_priolist *next[I915_PRIOLIST_HEIGHT];
>>
>> Does every node need maximum height or you could allocated depending on
>> current height?
> 
> Every slab allocation here is a power of 2, so there are only a few
> different options that are worthwhile (on 64b the only other choice is
> [4], unless you want to go larger to [28]). It did not feel like enough
> benefit to justify the extra code.
> 
>>> -static void assert_priolists(struct i915_sched_engine * const se)
>>> -{
>>> -     struct rb_node *rb;
>>> -     long last_prio;
>>> -
>>> -     if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
>>> -             return;
>>> -
>>> -     GEM_BUG_ON(rb_first_cached(&se->queue) !=
>>> -                rb_first(&se->queue.rb_root));
>>> -
>>> -     last_prio = INT_MAX;
>>> -     for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
>>> -             const struct i915_priolist *p = to_priolist(rb);
>>> -
>>> -             GEM_BUG_ON(p->priority > last_prio);
>>> -             last_prio = p->priority;
>>> -     }
>>> +     root->prng = next_pseudo_random32(root->prng);
>>> +     return  __ffs(root->prng) / 2;
>>
>> Where is the relationship to I915_PRIOLIST_HEIGHT? Feels root->prng %
>> I915_PRIOLIST_HEIGHT would be more obvious here unless I am terribly
>> mistaken. Or at least put a comment saying why the hack.
> 
> HEIGHT is the maximum possible for our struct. skiplists only want to
> increment the height of the tree one step at a time. So we choose a level
> with decreasing probability, and then limit that to the maximum height of
> the current tree + 1, clamped to HEIGHT.
> 
> You might notice that unlike traditional skiplists, this uses a

That's optimistic, that I would notice that. I'll stick to the basics 
for now. :)

Regards,

Tvrtko

> probability of 0.25 for each additional level. A neat trick discovered by
> Con Kolivas (I haven't found it mentioned elsewhere) as the cost of the
> extra level (using P=.5) is the same as the extra chain length with
> P=.25. So you can scale to higher number of requests by packing more
> requests into each level.
> 
> So that is split between randomly choosing a level and then working out
> the height of the node.
> 
>>>    static struct list_head *
>>>    lookup_priolist(struct intel_engine_cs *engine, int prio)
>>>    {
>>> +     struct i915_priolist *update[I915_PRIOLIST_HEIGHT];
>>>        struct i915_sched_engine * const se = &engine->active;
>>> -     struct i915_priolist *p;
>>> -     struct rb_node **parent, *rb;
>>> -     bool first = true;
>>> -
>>> -     lockdep_assert_held(&engine->active.lock);
>>> -     assert_priolists(se);
>>> +     struct i915_priolist_root *root = &se->queue;
>>> +     struct i915_priolist *pl, *tmp;
>>> +     int lvl;
>>>    
>>> +     lockdep_assert_held(&se->lock);
>>>        if (unlikely(se->no_priolist))
>>>                prio = I915_PRIORITY_NORMAL;
>>>    
>>> +     for_each_priolist(pl, root) { /* recycle any empty elements before us */
>>> +             if (pl->priority >= prio || !list_empty(&pl->requests))
>>> +                     break;
>>> +
>>> +             i915_priolist_advance(root, pl);
>>> +     }
>>> +
>>>    find_priolist:
>>> -     /* most positive priority is scheduled first, equal priorities fifo */
>>> -     rb = NULL;
>>> -     parent = &se->queue.rb_root.rb_node;
>>> -     while (*parent) {
>>> -             rb = *parent;
>>> -             p = to_priolist(rb);
>>> -             if (prio > p->priority) {
>>> -                     parent = &rb->rb_left;
>>> -             } else if (prio < p->priority) {
>>> -                     parent = &rb->rb_right;
>>> -                     first = false;
>>> -             } else {
>>> -                     return &p->requests;
>>> -             }
>>> +     pl = &root->sentinel;
>>> +     lvl = pl->level;
>>> +     while (lvl >= 0) {
>>> +             while (tmp = pl->next[lvl], tmp->priority >= prio)
>>> +                     pl = tmp;
>>> +             if (pl->priority == prio)
>>> +                     goto out;
>>> +             update[lvl--] = pl;
>>>        }
>>>    
>>>        if (prio == I915_PRIORITY_NORMAL) {
>>> -             p = &se->default_priolist;
>>> +             pl = &se->default_priolist;
>>> +     } else if (!pl_empty(&root->sentinel.requests)) {
>>> +             pl = pl_pop(&root->sentinel.requests);
>>>        } else {
>>> -             p = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
>>> +             pl = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
>>>                /* Convert an allocation failure to a priority bump */
>>> -             if (unlikely(!p)) {
>>> +             if (unlikely(!pl)) {
>>>                        prio = I915_PRIORITY_NORMAL; /* recurses just once */
>>>    
>>> -                     /* To maintain ordering with all rendering, after an
>>> +                     /*
>>> +                      * To maintain ordering with all rendering, after an
>>>                         * allocation failure we have to disable all scheduling.
>>>                         * Requests will then be executed in fifo, and schedule
>>>                         * will ensure that dependencies are emitted in fifo.
>>> @@ -260,18 +304,103 @@ lookup_priolist(struct intel_engine_cs *engine, int prio)
>>>                }
>>>        }
>>>    
>>> -     p->priority = prio;
>>> -     INIT_LIST_HEAD(&p->requests);
>>> +     pl->priority = prio;
>>> +     INIT_LIST_HEAD(&pl->requests);
>>>    
>>> -     rb_link_node(&p->node, rb, parent);
>>> -     rb_insert_color_cached(&p->node, &se->queue, first);
>>> +     lvl = random_level(root);
>>> +     if (lvl > root->sentinel.level) {
>>> +             if (root->sentinel.level < I915_PRIOLIST_HEIGHT - 1) {
>>> +                     lvl = ++root->sentinel.level;
>>
>> root->sentinel.level is maximum currently populated height? So if
>> random_level said insert at 4 but there are currently only 2 levels,
>> height will grow by one only?
> 
> Yes. The idea is keep the number of next[] as small as possible for the
> number of nodes in the tree. (Since the height of the tree is the
> constant overhead in list traversal.)
> 
>>> +                     update[lvl] = &root->sentinel;
>>> +             } else {
>>> +                     lvl = I915_PRIOLIST_HEIGHT - 1;
>>
>> But if maximum level already has been reached then this branch does not
>> set anything to update[],
> 
> at the next level.
> 
>> relying on the while loop earlier in the
>> function has populated it? How should I think of the update array?
> 
> The update[] is the array of nodes just before the position we need to
> insert. So update[] needs only be the height of the tree at that time,
> and if we decide to grow the tree, update[height] will be the root node,
> as we will be the first in that level.
> -Chris
>
Chris Wilson Jan. 28, 2021, 10:20 p.m. UTC | #9
Quoting Tvrtko Ursulin (2021-01-28 16:42:44)
> 
> On 28/01/2021 16:26, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2021-01-28 15:56:19)
> >> On 25/01/2021 14:01, Chris Wilson wrote:
> >>>    struct i915_priolist {
> >>>        struct list_head requests;
> >>
> >> What would be on this list? Request can only be on one at a time, so I
> >> was thinking these nodes would have pointers to list of that priority,
> >> rather than lists themselves. Assuming there can be multiple nodes of
> >> the same priority in the 2d hierarcy. Possibly I don't understand the
> >> layout.
> > 
> > A request is only on one list (queue, active, hold). But we may still
> > have more than one request at the same deadline, though that will likely
> > be limited to priority-inheritance and timeslice deferrals.
> > 
> > Since we would need pointer to the request, we could only reclaim a
> > single pointer here, which is not enough to warrant reducing the overall
> > node size. And while there is at least one user of request->sched.link,
> > the list maintenance will still be incurred. Using request->sched.link
> > remains a convenient interface.
> 
> Lost you.
> 
> Is the data structure like this and I will limit to priorities for 
> simplicity:
> 
>     Level1:     [-1]------------->[1]
>     Level0:     [-1]---->[0]----->[1]
> [SENTINEL]
> 
> Each of the boxes is struct i915_priolist?

Although each level is circular.

1: SENTINEL -> [-1] --------> [1] -> SENTINEL
0: SENTINEL -> [-1] -> [0] -> [1] -> SENTINEL

Ah. I think I see the cause of confusion here. Each column, not each
box, is an i915_priolist.

So the skiplist is really a set of [HEIGHT] singly linked lists, with
each list containing a sorted subset of the whole. And each descending
level includes every member from the level above, until we reach a
linked list of all i915_priolist in [0].

[skip, hopefully I caught the central point]

SENTINEL[2] is a list of all i915_priolist of level >= 2
SENTINEL[1] is a list of all i915_priolist of level >= 1
SENTINEL[0] is a list of all i915_priolist.

As we randomly assign i915_priolist.level (with P=0.25 per extra level),
SENTINEL[1] should have about a quarter of the elements of SENTINEL[0], and
SENTINEL[2] a quarter again of SENTINEL[1] (hence its ability to do a
binary/lgN-style search for a key, each level being a thinning of the last).
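
(Putting numbers on it: with N nodes, level k is expected to hold about
N/4^k of them, so a lookup/insert does O(1) expected horizontal steps per
level over ~log4(N) levels, which is the probabilistic O(lgN) from the
commit message. And since sentinel->next[0] is always the head of the
highest-priority list, picking the next request to submit stays O(1).)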
-Chris
Chris Wilson Jan. 28, 2021, 10:44 p.m. UTC | #10
Quoting Tvrtko Ursulin (2021-01-28 16:42:44)
> 
> On 28/01/2021 16:26, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2021-01-28 15:56:19)
> >> On 25/01/2021 14:01, Chris Wilson wrote:
> >>> diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
> >>> index bc2fa84f98a8..1200c3df6a4a 100644
> >>> --- a/drivers/gpu/drm/i915/i915_priolist_types.h
> >>> +++ b/drivers/gpu/drm/i915/i915_priolist_types.h
> >>> @@ -38,10 +38,36 @@ enum {
> >>>    #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
> >>>    #define I915_PRIORITY_BARRIER (I915_PRIORITY_UNPREEMPTABLE - 1)
> >>>    
> >>> +#ifdef CONFIG_64BIT
> >>> +#define I915_PRIOLIST_HEIGHT 12
> >>> +#else
> >>> +#define I915_PRIOLIST_HEIGHT 11
> >>> +#endif
> >>
> >> I did not get this. On one hand I could think pointers are larger on
> >> 64-bit so go for fewer levels, if size was a concern. But on the other
> >> hand 32-bit is less important these days, definitely much less as a
> >> performance platform. So going for less memory use => worse performance
> >> on a less important platform, which typically could be more memory
> >> constrained? Not sure I see it as that important either way to be
> >> distinctive but a comment would satisfy me.
> > 
> > Just aligned to the cacheline. The struct is 128B on 64b and 64B on 32b.
> > On 64B, we will scale to around 16 million requests in flight and 4
> > million on 32b. Which should be enough.
> > 
> > If we shrunk 64b to a 64B node, we would only scale to 256 requests
> > which limit we definitely will exceed.
> 
> Ok thanks, pouring it into a comment is implied.
> 
> > 
> >>>    struct i915_priolist {
> >>>        struct list_head requests;
> >>
> >> What would be on this list? Request can only be on one at a time, so I
> >> was thinking these nodes would have pointers to list of that priority,
> >> rather than lists themselves. Assuming there can be multiple nodes of
> >> the same priority in the 2d hierarcy. Possibly I don't understand the
> >> layout.
> > 
> > A request is only on one list (queue, active, hold). But we may still
> > have more than one request at the same deadline, though that will likely
> > be limited to priority-inheritance and timeslice deferrals.
> > 
> > Since we would need pointer to the request, we could only reclaim a
> > single pointer here, which is not enough to warrant reducing the overall
> > node size. And while there is at least one user of request->sched.link,
> > the list maintenance will still be incurred. Using request->sched.link
> > remains a convenient interface.
> 
> Lost you.

/*
 * i915_priolist forms a skiplist. The skiplist is built in layers,
 * starting at the base [0] is a singly linked list of all i915_priolist.
 * Each higher layer contains a fraction of the i915_priolist from the
 * previous layer:
 *
 * S[0] 0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF S
 * E[1] >1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F E
 * N[2] -->3-->7-->B-->F-->3-->7-->B-->F-->3-->7-->B-->F-->3-->7-->B-->F N
 * T[3] ------->7----->F-------7------>F------>7------>F------>7------>F T
 * I[4] -------------->F-------------->F-------------->F-------------->F I
 * N[5] ------------------------------>F------------------------------>F N
 * E[6] ------------------------------>F-------------------------------> E
 * L[7] ---------------------------------------------------------------> L
 *
 * To iterate through all active i915_priolist, we only need to follow
 * the chain in i915_priolist.next[0] (see for_each_priolist).
 *
 * To quickly find a specific key (or insert point), we can perform a binary
 * search by starting at the highest level and following the linked list
 * at that level until we either find the node, or have gone passed the key.
 * Then we descend a level, and start walking the list again starting from
 * the current position, until eventually we find our key, or we run out of
 * levels.
 */
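
i.e. the consumers only ever follow next[0]; every dequeue loop in the
patch reduces to this shape (a sketch, mirroring the execlists_dequeue()
hunk above):

	struct i915_priolist *pl;
	struct i915_request *rq, *rn;

	for_each_priolist(pl, &engine->active.queue) { /* highest priority first */
		priolist_for_each_request_safe(rq, rn, pl) {
			/* ... pick/submit rq ... */
		}
		/* unlink the emptied node from every level it occupies */
		i915_priolist_advance(&engine->active.queue, pl);
	}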
-Chris
Matthew Brost Jan. 28, 2021, 10:56 p.m. UTC | #11
On Mon, Jan 25, 2021 at 02:01:15PM +0000, Chris Wilson wrote:
> Replace the priolist rbtree with a skiplist. The crucial difference is
> that walking and removing the first element of a skiplist is O(1), but
> O(lgN) for an rbtree, as we need to rebalance on remove. This is a
> hindrance for submission latency as it occurs between picking a request
> for the priolist and submitting it to hardware, as well effectively
> trippling the number of O(lgN) operations required under the irqoff lock.
> This is critical to reducing the latency jitter with multiple clients.
> 
> The downsides to skiplists are that lookup/insertion is only
> probablistically O(lgN) and there is a significant memory penalty to
> as each skip node is larger than the rbtree equivalent. Furthermore, we
> don't use dynamic arrays for the skiplist, so the allocation is fixed,
> and imposes an upper bound on the scalability wrt to the number of
> inflight requests.
> 

This is a fun data structure but IMO it might be overkill to maintain this
code in the i915. The UMDs have effectively agreed to use only 3 levels, so
is O(lgN) where N == 3 really a big deal? With GuC submission we will
statically map all user levels into 3 buckets. If we are doing that, do
we even need a complex data structure, i.e. could we just use an
array of linked lists?
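
Something along these lines is all I'm picturing (purely a sketch of the
alternative; the struct, function and bucket names are made up, not
proposed code):

	/* hypothetical fixed-bucket queue, if userspace only ever uses 3 levels */
	#define NUM_SCHED_BUCKETS 3 /* high / normal / low */

	struct fixed_prio_queue {
		struct list_head requests[NUM_SCHED_BUCKETS];
	};

	static struct i915_request *
	fixed_prio_first_request(struct fixed_prio_queue *q)
	{
		int i;

		for (i = 0; i < NUM_SCHED_BUCKETS; i++) { /* highest bucket first */
			if (!list_empty(&q->requests[i]))
				return list_first_entry(&q->requests[i],
							struct i915_request,
							sched.link);
		}

		return NULL;
	}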

Also BTW, it seems like people are having a hard time understanding what a
skip list is; it might have been worth starting with the link below, which
explains it quite nicely:
https://en.wikipedia.org/wiki/Skip_list

Matt

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  .../drm/i915/gt/intel_execlists_submission.c  |  63 +++--
>  .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  30 +--
>  drivers/gpu/drm/i915/i915_priolist_types.h    |  28 +-
>  drivers/gpu/drm/i915/i915_scheduler.c         | 244 ++++++++++++++----
>  drivers/gpu/drm/i915/i915_scheduler.h         |  11 +-
>  drivers/gpu/drm/i915/i915_scheduler_types.h   |   2 +-
>  .../drm/i915/selftests/i915_mock_selftests.h  |   1 +
>  .../gpu/drm/i915/selftests/i915_scheduler.c   |  53 +++-
>  8 files changed, 316 insertions(+), 116 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 1103c8a00af1..129144dd86b0 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -244,11 +244,6 @@ static void ring_set_paused(const struct intel_engine_cs *engine, int state)
>  		wmb();
>  }
>  
> -static struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
>  static int rq_prio(const struct i915_request *rq)
>  {
>  	return READ_ONCE(rq->sched.attr.priority);
> @@ -272,15 +267,31 @@ static int effective_prio(const struct i915_request *rq)
>  	return prio;
>  }
>  
> -static int queue_prio(const struct i915_sched_engine *se)
> +static struct i915_request *first_request(struct i915_sched_engine *se)
>  {
> -	struct rb_node *rb;
> +	struct i915_priolist *pl;
>  
> -	rb = rb_first_cached(&se->queue);
> -	if (!rb)
> +	for_each_priolist(pl, &se->queue) {
> +		if (likely(!list_empty(&pl->requests)))
> +			return list_first_entry(&pl->requests,
> +						struct i915_request,
> +						sched.link);
> +
> +		i915_priolist_advance(&se->queue, pl);
> +	}
> +
> +	return NULL;
> +}
> +
> +static int queue_prio(struct i915_sched_engine *se)
> +{
> +	struct i915_request *rq;
> +
> +	rq = first_request(se);
> +	if (!rq)
>  		return INT_MIN;
>  
> -	return to_priolist(rb)->priority;
> +	return rq_prio(rq);
>  }
>  
>  static int virtual_prio(const struct intel_engine_execlists *el)
> @@ -290,7 +301,7 @@ static int virtual_prio(const struct intel_engine_execlists *el)
>  	return rb ? rb_entry(rb, struct ve_node, rb)->prio : INT_MIN;
>  }
>  
> -static bool need_preempt(const struct intel_engine_cs *engine,
> +static bool need_preempt(struct intel_engine_cs *engine,
>  			 const struct i915_request *rq)
>  {
>  	int last_prio;
> @@ -1136,6 +1147,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  	struct i915_request ** const last_port = port + execlists->port_mask;
>  	struct i915_request *last, * const *active;
>  	struct virtual_engine *ve;
> +	struct i915_priolist *pl;
>  	struct rb_node *rb;
>  	bool submit = false;
>  
> @@ -1346,11 +1358,10 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  			break;
>  	}
>  
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> +	for_each_priolist(pl, &engine->active.queue) {
>  		struct i915_request *rq, *rn;
>  
> -		priolist_for_each_request_consume(rq, rn, p) {
> +		priolist_for_each_request_safe(rq, rn, pl) {
>  			bool merge = true;
>  
>  			/*
> @@ -1425,8 +1436,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  			}
>  		}
>  
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, pl);
>  	}
>  done:
>  	*port++ = i915_request_get(last);
> @@ -2631,6 +2641,7 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>  {
>  	struct intel_engine_execlists * const execlists = &engine->execlists;
>  	struct i915_request *rq, *rn;
> +	struct i915_priolist *pl;
>  	struct rb_node *rb;
>  	unsigned long flags;
>  
> @@ -2661,16 +2672,12 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>  	intel_engine_signal_breadcrumbs(engine);
>  
>  	/* Flush the queued requests to the timeline list (for retiring). */
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -
> -		priolist_for_each_request_consume(rq, rn, p) {
> +	for_each_priolist(pl, &engine->active.queue) {
> +		priolist_for_each_request_safe(rq, rn, pl) {
>  			i915_request_mark_eio(rq);
>  			__i915_request_submit(rq);
>  		}
> -
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, pl);
>  	}
>  	GEM_BUG_ON(!i915_sched_is_idle(&engine->active));
>  
> @@ -2703,7 +2710,6 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>  	/* Remaining _unready_ requests will be nop'ed when submitted */
>  
>  	execlists->queue_priority_hint = INT_MIN;
> -	engine->active.queue = RB_ROOT_CACHED;
>  
>  	GEM_BUG_ON(__tasklet_is_enabled(&engine->active.tasklet));
>  	engine->active.tasklet.func = nop_submission_tasklet;
> @@ -3089,6 +3095,8 @@ static void virtual_context_exit(struct intel_context *ce)
>  
>  	for (n = 0; n < ve->num_siblings; n++)
>  		intel_engine_pm_put(ve->siblings[n]);
> +
> +	i915_sched_park_engine(&ve->base.active);
>  }
>  
>  static const struct intel_context_ops virtual_context_ops = {
> @@ -3501,6 +3509,7 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
>  {
>  	const struct intel_engine_execlists *execlists = &engine->execlists;
>  	struct i915_request *rq, *last;
> +	struct i915_priolist *pl;
>  	unsigned long flags;
>  	unsigned int count;
>  	struct rb_node *rb;
> @@ -3530,10 +3539,8 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
>  
>  	last = NULL;
>  	count = 0;
> -	for (rb = rb_first_cached(&engine->active.queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> -
> -		priolist_for_each_request(rq, p) {
> +	for_each_priolist(pl, &engine->active.queue) {
> +		priolist_for_each_request(rq, pl) {
>  			if (count++ < max - 1)
>  				show_request(m, rq, "\t\t", 0);
>  			else
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 2d7339ef3b4c..8d0c6cd277b3 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -59,11 +59,6 @@
>  
>  #define GUC_REQUEST_SIZE 64 /* bytes */
>  
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
>  static struct guc_stage_desc *__get_stage_desc(struct intel_guc *guc, u32 id)
>  {
>  	struct guc_stage_desc *base = guc->stage_desc_pool_vaddr;
> @@ -185,8 +180,8 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>  	struct i915_request ** const last_port = first + execlists->port_mask;
>  	struct i915_request *last = first[0];
>  	struct i915_request **port;
> +	struct i915_priolist *pl;
>  	bool submit = false;
> -	struct rb_node *rb;
>  
>  	lockdep_assert_held(&engine->active.lock);
>  
> @@ -203,11 +198,10 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>  	 * event.
>  	 */
>  	port = first;
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> +	for_each_priolist(pl, &engine->active.queue) {
>  		struct i915_request *rq, *rn;
>  
> -		priolist_for_each_request_consume(rq, rn, p) {
> +		priolist_for_each_request_safe(rq, rn, pl) {
>  			if (last && rq->context != last->context) {
>  				if (port == last_port)
>  					goto done;
> @@ -223,12 +217,11 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>  			last = rq;
>  		}
>  
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, pl);
>  	}
>  done:
>  	execlists->queue_priority_hint =
> -		rb ? to_priolist(rb)->priority : INT_MIN;
> +		pl != &engine->active.queue.sentinel ? pl->priority : INT_MIN;
>  	if (submit) {
>  		*port = schedule_in(last, port - execlists->inflight);
>  		*++port = NULL;
> @@ -327,7 +320,7 @@ static void guc_reset_cancel(struct intel_engine_cs *engine)
>  {
>  	struct intel_engine_execlists * const execlists = &engine->execlists;
>  	struct i915_request *rq, *rn;
> -	struct rb_node *rb;
> +	struct i915_priolist *p;
>  	unsigned long flags;
>  
>  	ENGINE_TRACE(engine, "\n");
> @@ -355,25 +348,20 @@ static void guc_reset_cancel(struct intel_engine_cs *engine)
>  	}
>  
>  	/* Flush the queued requests to the timeline list (for retiring). */
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -
> -		priolist_for_each_request_consume(rq, rn, p) {
> +	for_each_priolist(p, &engine->active.queue) {
> +		priolist_for_each_request_safe(rq, rn, p) {
>  			list_del_init(&rq->sched.link);
>  			__i915_request_submit(rq);
>  			dma_fence_set_error(&rq->fence, -EIO);
>  			i915_request_mark_complete(rq);
>  		}
> -
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, p);
>  	}
>  	GEM_BUG_ON(!i915_sched_is_idle(&engine->active));
>  
>  	/* Remaining _unready_ requests will be nop'ed when submitted */
>  
>  	execlists->queue_priority_hint = INT_MIN;
> -	engine->active.queue = RB_ROOT_CACHED;
>  
>  	spin_unlock_irqrestore(&engine->active.lock, flags);
>  }
> diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
> index bc2fa84f98a8..1200c3df6a4a 100644
> --- a/drivers/gpu/drm/i915/i915_priolist_types.h
> +++ b/drivers/gpu/drm/i915/i915_priolist_types.h
> @@ -38,10 +38,36 @@ enum {
>  #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
>  #define I915_PRIORITY_BARRIER (I915_PRIORITY_UNPREEMPTABLE - 1)
>  
> +#ifdef CONFIG_64BIT
> +#define I915_PRIOLIST_HEIGHT 12
> +#else
> +#define I915_PRIOLIST_HEIGHT 11
> +#endif
> +
>  struct i915_priolist {
>  	struct list_head requests;
> -	struct rb_node node;
>  	int priority;
> +
> +	int level;
> +	struct i915_priolist *next[I915_PRIOLIST_HEIGHT];
>  };
>  
> +struct i915_priolist_root {
> +	struct i915_priolist sentinel;
> +	u32 prng;
> +};
> +
> +#define i915_priolist_is_empty(root) ((root)->sentinel.level < 0)
> +
> +#define for_each_priolist(p, root) \
> +	for ((p) = (root)->sentinel.next[0]; \
> +	     (p) != &(root)->sentinel; \
> +	     (p) = (p)->next[0])
> +
> +#define priolist_for_each_request(it, plist) \
> +	list_for_each_entry(it, &(plist)->requests, sched.link)
> +
> +#define priolist_for_each_request_safe(it, n, plist) \
> +	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> +
>  #endif /* _I915_PRIOLIST_TYPES_H_ */
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index a3ee06cb66d7..74000d3eebb1 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -4,7 +4,9 @@
>   * Copyright © 2018 Intel Corporation
>   */
>  
> +#include <linux/bitops.h>
>  #include <linux/mutex.h>
> +#include <linux/prandom.h>
>  
>  #include "gt/intel_ring.h"
>  #include "gt/intel_lrc_reg.h"
> @@ -91,15 +93,24 @@ static void i915_sched_init_ipi(struct i915_sched_ipi *ipi)
>  	ipi->list = NULL;
>  }
>  
> +static void init_priolist(struct i915_priolist_root *const root)
> +{
> +	struct i915_priolist *pl = &root->sentinel;
> +
> +	memset_p((void **)pl->next, pl, ARRAY_SIZE(pl->next));
> +	pl->priority = INT_MIN;
> +	pl->level = -1;
> +}
> +
>  void i915_sched_init_engine(struct i915_sched_engine *se,
>  			    unsigned int subclass)
>  {
>  	spin_lock_init(&se->lock);
>  	lockdep_set_subclass(&se->lock, subclass);
>  
> +	init_priolist(&se->queue);
>  	INIT_LIST_HEAD(&se->requests);
>  	INIT_LIST_HEAD(&se->hold);
> -	se->queue = RB_ROOT_CACHED;
>  
>  	i915_sched_init_ipi(&se->ipi);
>  
> @@ -116,8 +127,57 @@ void i915_sched_init_engine(struct i915_sched_engine *se,
>  #endif
>  }
>  
> +__maybe_unused static bool priolist_idle(struct i915_priolist_root *root)
> +{
> +	struct i915_priolist *pl = &root->sentinel;
> +	int lvl;
> +
> +	for (lvl = 0; lvl < ARRAY_SIZE(pl->next); lvl++) {
> +		if (pl->next[lvl] != pl) {
> +			GEM_TRACE_ERR("root[%d] is not empty\n", lvl);
> +			return false;
> +		}
> +	}
> +
> +	if (pl->level != -1) {
> +		GEM_TRACE_ERR("root is not clear: %d\n", pl->level);
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static void pl_push(struct i915_priolist *pl, struct list_head *head)
> +{
> +	pl->requests.next = head->next;
> +	head->next = &pl->requests;
> +}
> +
> +static struct i915_priolist *pl_pop(struct list_head *head)
> +{
> +	struct i915_priolist *pl;
> +
> +	pl = container_of(head->next, typeof(*pl), requests);
> +	head->next = pl->requests.next;
> +
> +	return pl;
> +}
> +
> +static bool pl_empty(struct list_head *head)
> +{
> +	return !head->next;
> +}
> +
>  void i915_sched_park_engine(struct i915_sched_engine *se)
>  {
> +	struct i915_priolist_root *root = &se->queue;
> +	struct list_head *list = &root->sentinel.requests;
> +
> +	GEM_BUG_ON(!priolist_idle(root));
> +
> +	while (!pl_empty(list))
> +		kmem_cache_free(global.slab_priorities, pl_pop(list));
> +
>  	GEM_BUG_ON(!i915_sched_is_idle(se));
>  	se->no_priolist = false;
>  }
> @@ -183,71 +243,55 @@ static inline bool node_signaled(const struct i915_sched_node *node)
>  	return i915_request_completed(node_to_request(node));
>  }
>  
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +static inline unsigned int random_level(struct i915_priolist_root *root)
>  {
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
> -static void assert_priolists(struct i915_sched_engine * const se)
> -{
> -	struct rb_node *rb;
> -	long last_prio;
> -
> -	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> -		return;
> -
> -	GEM_BUG_ON(rb_first_cached(&se->queue) !=
> -		   rb_first(&se->queue.rb_root));
> -
> -	last_prio = INT_MAX;
> -	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> -		const struct i915_priolist *p = to_priolist(rb);
> -
> -		GEM_BUG_ON(p->priority > last_prio);
> -		last_prio = p->priority;
> -	}
> +	root->prng = next_pseudo_random32(root->prng);
> +	return  __ffs(root->prng) / 2;
>  }
>  
>  static struct list_head *
>  lookup_priolist(struct intel_engine_cs *engine, int prio)
>  {
> +	struct i915_priolist *update[I915_PRIOLIST_HEIGHT];
>  	struct i915_sched_engine * const se = &engine->active;
> -	struct i915_priolist *p;
> -	struct rb_node **parent, *rb;
> -	bool first = true;
> -
> -	lockdep_assert_held(&engine->active.lock);
> -	assert_priolists(se);
> +	struct i915_priolist_root *root = &se->queue;
> +	struct i915_priolist *pl, *tmp;
> +	int lvl;
>  
> +	lockdep_assert_held(&se->lock);
>  	if (unlikely(se->no_priolist))
>  		prio = I915_PRIORITY_NORMAL;
>  
> +	for_each_priolist(pl, root) { /* recycle any empty elements before us */
> +		if (pl->priority >= prio || !list_empty(&pl->requests))
> +			break;
> +
> +		i915_priolist_advance(root, pl);
> +	}
> +
>  find_priolist:
> -	/* most positive priority is scheduled first, equal priorities fifo */
> -	rb = NULL;
> -	parent = &se->queue.rb_root.rb_node;
> -	while (*parent) {
> -		rb = *parent;
> -		p = to_priolist(rb);
> -		if (prio > p->priority) {
> -			parent = &rb->rb_left;
> -		} else if (prio < p->priority) {
> -			parent = &rb->rb_right;
> -			first = false;
> -		} else {
> -			return &p->requests;
> -		}
> +	pl = &root->sentinel;
> +	lvl = pl->level;
> +	while (lvl >= 0) {
> +		while (tmp = pl->next[lvl], tmp->priority >= prio)
> +			pl = tmp;
> +		if (pl->priority == prio)
> +			goto out;
> +		update[lvl--] = pl;
>  	}
>  
>  	if (prio == I915_PRIORITY_NORMAL) {
> -		p = &se->default_priolist;
> +		pl = &se->default_priolist;
> +	} else if (!pl_empty(&root->sentinel.requests)) {
> +		pl = pl_pop(&root->sentinel.requests);
>  	} else {
> -		p = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
> +		pl = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
>  		/* Convert an allocation failure to a priority bump */
> -		if (unlikely(!p)) {
> +		if (unlikely(!pl)) {
>  			prio = I915_PRIORITY_NORMAL; /* recurses just once */
>  
> -			/* To maintain ordering with all rendering, after an
> +			/*
> +			 * To maintain ordering with all rendering, after an
>  			 * allocation failure we have to disable all scheduling.
>  			 * Requests will then be executed in fifo, and schedule
>  			 * will ensure that dependencies are emitted in fifo.
> @@ -260,18 +304,103 @@ lookup_priolist(struct intel_engine_cs *engine, int prio)
>  		}
>  	}
>  
> -	p->priority = prio;
> -	INIT_LIST_HEAD(&p->requests);
> +	pl->priority = prio;
> +	INIT_LIST_HEAD(&pl->requests);
>  
> -	rb_link_node(&p->node, rb, parent);
> -	rb_insert_color_cached(&p->node, &se->queue, first);
> +	lvl = random_level(root);
> +	if (lvl > root->sentinel.level) {
> +		if (root->sentinel.level < I915_PRIOLIST_HEIGHT - 1) {
> +			lvl = ++root->sentinel.level;
> +			update[lvl] = &root->sentinel;
> +		} else {
> +			lvl = I915_PRIOLIST_HEIGHT - 1;
> +		}
> +	}
> +	GEM_BUG_ON(lvl < 0);
> +	GEM_BUG_ON(lvl >= ARRAY_SIZE(pl->next));
>  
> -	return &p->requests;
> +	pl->level = lvl;
> +	do {
> +		tmp = update[lvl];
> +		pl->next[lvl] = update[lvl]->next[lvl];
> +		tmp->next[lvl] = pl;
> +	} while (--lvl >= 0);
> +
> +	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
> +		struct i915_priolist *chk;
> +
> +		chk = &root->sentinel;
> +		lvl = chk->level;
> +		do {
> +			while (tmp = chk->next[lvl], tmp->priority >= prio)
> +				chk = tmp;
> +		} while (--lvl >= 0);
> +
> +		GEM_BUG_ON(chk != pl);
> +	}
> +
> +out:
> +	GEM_BUG_ON(pl == &root->sentinel);
> +	return &pl->requests;
>  }
>  
> -void __i915_priolist_free(struct i915_priolist *p)
> +static void remove_priolist(struct intel_engine_cs *engine,
> +			    struct list_head *plist)
>  {
> -	kmem_cache_free(global.slab_priorities, p);
> +	struct i915_sched_engine * const se = &engine->active;
> +	struct i915_priolist_root *root = &se->queue;
> +	struct i915_priolist *pl, *tmp;
> +	struct i915_priolist *old =
> +		container_of(plist, struct i915_priolist, requests);
> +	int prio = old->priority;
> +	int lvl;
> +
> +	lockdep_assert_held(&se->lock);
> +	GEM_BUG_ON(!list_empty(plist));
> +
> +	pl = &root->sentinel;
> +	lvl = pl->level;
> +	GEM_BUG_ON(lvl < 0);
> +
> +	if (prio != I915_PRIORITY_NORMAL)
> +		pl_push(old, &pl->requests);
> +
> +	do {
> +		while (tmp = pl->next[lvl], tmp->priority > prio)
> +			pl = tmp;
> +		if (lvl <= old->level) {
> +			pl->next[lvl] = old->next[lvl];
> +			if (pl == &root->sentinel && old->next[lvl] == pl) {
> +				GEM_BUG_ON(pl->level != lvl);
> +				pl->level--;
> +			}
> +		}
> +	} while (--lvl >= 0);
> +	GEM_BUG_ON(tmp != old);
> +}
> +
> +void i915_priolist_advance(struct i915_priolist_root *root,
> +			   struct i915_priolist *pl)
> +{
> +	struct i915_priolist * const s = &root->sentinel;
> +	int lvl;
> +
> +	GEM_BUG_ON(!list_empty(&pl->requests));
> +	GEM_BUG_ON(pl != s->next[0]);
> +	GEM_BUG_ON(pl == s);
> +
> +	if (pl->priority != I915_PRIORITY_NORMAL)
> +		pl_push(pl, &s->requests);
> +
> +	lvl = pl->level;
> +	GEM_BUG_ON(lvl < 0);
> +	do {
> +		s->next[lvl] = pl->next[lvl];
> +		if (pl->next[lvl] == s) {
> +			GEM_BUG_ON(s->level != lvl);
> +			s->level--;
> +		}
> +	} while (--lvl >= 0);
>  }
>  
>  static struct i915_request *
> @@ -420,8 +549,13 @@ static void __i915_request_set_priority(struct i915_request *rq, int prio)
>  			continue;
>  
>  		GEM_BUG_ON(rq->engine != engine);
> -		if (i915_request_in_priority_queue(rq))
> +		if (i915_request_in_priority_queue(rq)) {
> +			struct list_head *prev = rq->sched.link.prev;
> +
>  			list_move_tail(&rq->sched.link, plist);
> +			if (list_empty(prev))
> +				remove_priolist(engine, prev);
> +		}
>  
>  		/* Defer (tasklet) submission until after all updates. */
>  		kick_submission(engine, rq, prio);
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> index 0ab47cbf0e9c..bca89a58d953 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> @@ -16,12 +16,6 @@
>  
>  struct drm_printer;
>  
> -#define priolist_for_each_request(it, plist) \
> -	list_for_each_entry(it, &(plist)->requests, sched.link)
> -
> -#define priolist_for_each_request_consume(it, n, plist) \
> -	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> -
>  void i915_sched_node_init(struct i915_sched_node *node);
>  void i915_sched_node_reinit(struct i915_sched_node *node);
>  
> @@ -69,7 +63,7 @@ static inline void i915_priolist_free(struct i915_priolist *p)
>  
>  static inline bool i915_sched_is_idle(const struct i915_sched_engine *se)
>  {
> -	return RB_EMPTY_ROOT(&se->queue.rb_root);
> +	return i915_priolist_is_empty(&se->queue);
>  }
>  
>  static inline bool
> @@ -99,6 +93,9 @@ static inline void i915_sched_kick(struct i915_sched_engine *se)
>  	tasklet_hi_schedule(&se->tasklet);
>  }
>  
> +void i915_priolist_advance(struct i915_priolist_root *root,
> +			   struct i915_priolist *old);
> +
>  void i915_request_show_with_schedule(struct drm_printer *m,
>  				     const struct i915_request *rq,
>  				     const char *prefix,
> diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h
> index f668c680d290..e64750be4e77 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler_types.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler_types.h
> @@ -89,7 +89,7 @@ struct i915_sched_engine {
>  	/**
>  	 * @queue: queue of requests, in priority lists
>  	 */
> -	struct rb_root_cached queue;
> +	struct i915_priolist_root queue;
>  
>  	struct i915_sched_ipi ipi;
>  
> diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> index 3db34d3eea58..946c93441c1f 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> @@ -25,6 +25,7 @@ selftest(ring, intel_ring_mock_selftests)
>  selftest(engine, intel_engine_cs_mock_selftests)
>  selftest(timelines, intel_timeline_mock_selftests)
>  selftest(requests, i915_request_mock_selftests)
> +selftest(scheduler, i915_scheduler_mock_selftests)
>  selftest(objects, i915_gem_object_mock_selftests)
>  selftest(phys, i915_gem_phys_mock_selftests)
>  selftest(dmabuf, i915_gem_dmabuf_mock_selftests)
> diff --git a/drivers/gpu/drm/i915/selftests/i915_scheduler.c b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> index de44a66210b7..de5b1443129b 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> @@ -12,6 +12,54 @@
>  #include "selftests/igt_spinner.h"
>  #include "selftests/i915_random.h"
>  
> +static int mock_skiplist_levels(void *dummy)
> +{
> +	struct i915_priolist_root root = {};
> +	struct i915_priolist *pl = &root.sentinel;
> +	IGT_TIMEOUT(end_time);
> +	unsigned long total;
> +	int count, lvl;
> +
> +	total = 0;
> +	do {
> +		for (count = 0; count < 16384; count++) {
> +			lvl = random_level(&root);
> +			if (lvl > pl->level) {
> +				if (lvl < I915_PRIOLIST_HEIGHT - 1)
> +					lvl = ++pl->level;
> +				else
> +					lvl = I915_PRIOLIST_HEIGHT - 1;
> +			}
> +
> +			pl->next[lvl] = ptr_inc(pl->next[lvl]);
> +		}
> +		total += count;
> +	} while (!__igt_timeout(end_time, NULL));
> +
> +	pr_info("Total %9lu\n", total);
> +	for (lvl = 0; lvl <= pl->level; lvl++) {
> +		int x = ilog2((unsigned long)pl->next[lvl]);
> +		char row[80];
> +
> +		memset(row, '*', x);
> +		row[x] = '\0';
> +
> +		pr_info(" [%2d] %9lu %s\n",
> +			lvl, (unsigned long)pl->next[lvl], row);
> +	}
> +
> +	return 0;
> +}
> +
> +int i915_scheduler_mock_selftests(void)
> +{
> +	static const struct i915_subtest tests[] = {
> +		SUBTEST(mock_skiplist_levels),
> +	};
> +
> +	return i915_subtests(tests, NULL);
> +}
> +
>  static void scheduling_disable(struct intel_engine_cs *engine)
>  {
>  	engine->props.preempt_timeout_ms = 0;
> @@ -80,9 +128,9 @@ static int all_engines(struct drm_i915_private *i915,
>  static bool check_context_order(struct intel_engine_cs *engine)
>  {
>  	u64 last_seqno, last_context;
> +	struct i915_priolist *p;
>  	unsigned long count;
>  	bool result = false;
> -	struct rb_node *rb;
>  	int last_prio;
>  
>  	/* We expect the execution order to follow ascending fence-context */
> @@ -92,8 +140,7 @@ static bool check_context_order(struct intel_engine_cs *engine)
>  	last_context = 0;
>  	last_seqno = 0;
>  	last_prio = 0;
> -	for (rb = rb_first_cached(&engine->active.queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +	for_each_priolist(p, &engine->active.queue) {
>  		struct i915_request *rq;
>  
>  		priolist_for_each_request(rq, p) {
> -- 
> 2.20.1
> 
Tvrtko Ursulin Jan. 29, 2021, 9:24 a.m. UTC | #12
On 28/01/2021 22:44, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2021-01-28 16:42:44)
>>
>> On 28/01/2021 16:26, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2021-01-28 15:56:19)
>>>> On 25/01/2021 14:01, Chris Wilson wrote:
>>>>> diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
>>>>> index bc2fa84f98a8..1200c3df6a4a 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_priolist_types.h
>>>>> +++ b/drivers/gpu/drm/i915/i915_priolist_types.h
>>>>> @@ -38,10 +38,36 @@ enum {
>>>>>     #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
>>>>>     #define I915_PRIORITY_BARRIER (I915_PRIORITY_UNPREEMPTABLE - 1)
>>>>>     
>>>>> +#ifdef CONFIG_64BIT
>>>>> +#define I915_PRIOLIST_HEIGHT 12
>>>>> +#else
>>>>> +#define I915_PRIOLIST_HEIGHT 11
>>>>> +#endif
>>>>
>>>> I did not get this. On one hand I could think pointers are larger on
>>>> 64-bit so go for fewer levels, if size was a concern. But on the other
>>>> hand 32-bit is less important these days, definitely much less as a
>>>> performance platform. So going for less memory use => worse performance
>>>> on a less important platform, which typically could be more memory
>>>> constrained? Not sure I see it as that important either way to be
>>>> distinctive but a comment would satisfy me.
>>>
>>> Just aligned to the cacheline. The struct is 128B on 64b and 64B on 32b.
>>> On 64B, we will scale to around 16 million requests in flight and 4
>>> million on 32b. Which should be enough.
>>>
>>> If we shrunk 64b to a 64B node, we would only scale to 256 requests
>>> which limit we definitely will exceed.
>>
>> Ok thanks, pouring it into a comment is implied.
>>
>>>
>>>>>     struct i915_priolist {
>>>>>         struct list_head requests;
>>>>
>>>> What would be on this list? Request can only be on one at a time, so I
>>>> was thinking these nodes would have pointers to list of that priority,
>>>> rather than lists themselves. Assuming there can be multiple nodes of
>>>> the same priority in the 2d hierarcy. Possibly I don't understand the
>>>> layout.
>>>
>>> A request is only on one list (queue, active, hold). But we may still
>>> have more than one request at the same deadline, though that will likely
>>> be limited to priority-inheritance and timeslice deferrals.
>>>
>>> Since we would need pointer to the request, we could only reclaim a
>>> single pointer here, which is not enough to warrant reducing the overall
>>> node size. And while there is at least one user of request->sched.link,
>>> the list maintenance will still be incurred. Using request->sched.link
>>> remains a convenient interface.
>>
>> Lost you.
> 
> /*
>   * i915_priolist forms a skiplist. The skiplist is built in layers,
>   * starting at the base [0] is a singly linked list of all i915_priolist.
>   * Each higher layer contains a fraction of the i915_priolist from the
>   * previous layer:
>   *
>   * S[0] 0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF S
>   * E[1] >1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F E
>   * N[2] -->3-->7-->B-->F-->3-->7-->B-->F-->3-->7-->B-->F-->3-->7-->B-->F N
>   * T[3] ------->7----->F-------7------>F------>7------>F------>7------>F T

Just align this first 7.

>   * I[4] -------------->F-------------->F-------------->F-------------->F I
>   * N[5] ------------------------------>F------------------------------>F N
>   * E[6] ------------------------------>F-------------------------------> E
>   * L[7] ---------------------------------------------------------------> L
>   *
>   * To iterate through all active i915_priolist, we only need to follow
>   * the chain in i915_priolist.next[0] (see for_each_priolist).
>   *
>   * To quickly find a specific key (or insert point), we can perform a binary
>   * search by starting at the highest level and following the linked list
>   * at that level until we either find the node, or have gone passed the key.
>   * Then we descend a level, and start walking the list again starting from
>   * the current position, until eventually we find our key, or we run out of

From the previous node on the current level, not the current one, I
believe. So go back to the previous node before descending, right?

Very useful diagram, thank you.
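
For my own reference, a minimal find-only sketch of that top-down walk,
following the same shape as the descent loop in lookup_priolist() but
without the update[] bookkeeping (priolist_find is a made-up name, not
part of the patch):

static struct i915_priolist *
priolist_find(struct i915_priolist_root *root, int prio)
{
	struct i915_priolist *pl = &root->sentinel;
	struct i915_priolist *tmp;
	int lvl = pl->level;

	while (lvl >= 0) {
		/* walk right while the next node still sorts at or above prio */
		while (tmp = pl->next[lvl], tmp->priority >= prio)
			pl = tmp;
		if (pl->priority == prio)
			return pl; /* exact priority level found */
		lvl--; /* drop a level, continue from where we stopped */
	}

	return NULL; /* no node with this exact priority */
}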

Regards,

Tvrtko
Tvrtko Ursulin Jan. 29, 2021, 9:37 a.m. UTC | #13
On 28/01/2021 16:26, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2021-01-28 15:56:19)

>>> -static void assert_priolists(struct i915_sched_engine * const se)
>>> -{
>>> -     struct rb_node *rb;
>>> -     long last_prio;
>>> -
>>> -     if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
>>> -             return;
>>> -
>>> -     GEM_BUG_ON(rb_first_cached(&se->queue) !=
>>> -                rb_first(&se->queue.rb_root));
>>> -
>>> -     last_prio = INT_MAX;
>>> -     for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
>>> -             const struct i915_priolist *p = to_priolist(rb);
>>> -
>>> -             GEM_BUG_ON(p->priority > last_prio);
>>> -             last_prio = p->priority;
>>> -     }
>>> +     root->prng = next_pseudo_random32(root->prng);
>>> +     return  __ffs(root->prng) / 2;
>>
>> Where is the relationship to I915_PRIOLIST_HEIGHT? Feels root->prng %
>> I915_PRIOLIST_HEIGHT would be more obvious here unless I am terribly
>> mistaken. Or at least put a comment saying why the hack.
> 
> HEIGHT is the maximum possible for our struct. skiplists only want to
> increment the height of the tree one step at a time. So we choose a level
> with decreasing probability, and then limit that to the maximum height of
> the current tree + 1, clamped to HEIGHT.
> 
> You might notice that unlike traditional skiplists, this uses a
> probability of 0.25 for each additional level. A neat trick discovered by
> Con Kolivas (I haven't found it mentioned elsewhere) as the cost of the
> extra level (using P=.5) is the same as the extra chain length with
> P=.25. So you can scale to higher number of requests by packing more
> requests into each level.
> 
> So that is split between randomly choosing a level and then working out
> the height of the node.

Choosing levels with decreasing probability by virtue of using ffs on a
random number? Or because (BITS_PER_TYPE(u32) / 2) is greater than
I915_PRIOLIST_HEIGHT?

Regards,

Tvrtko
Tvrtko Ursulin Jan. 29, 2021, 10:22 a.m. UTC | #14
On 25/01/2021 14:01, Chris Wilson wrote:
> Replace the priolist rbtree with a skiplist. The crucial difference is
> that walking and removing the first element of a skiplist is O(1), but
> O(lgN) for an rbtree, as we need to rebalance on remove. This is a
> hindrance for submission latency as it occurs between picking a request
> for the priolist and submitting it to hardware, as well effectively
> trippling the number of O(lgN) operations required under the irqoff lock.
> This is critical to reducing the latency jitter with multiple clients.
> 
> The downsides to skiplists are that lookup/insertion is only
> probablistically O(lgN) and there is a significant memory penalty to
> as each skip node is larger than the rbtree equivalent. Furthermore, we
> don't use dynamic arrays for the skiplist, so the allocation is fixed,
> and imposes an upper bound on the scalability wrt to the number of
> inflight requests.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   .../drm/i915/gt/intel_execlists_submission.c  |  63 +++--
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  30 +--
>   drivers/gpu/drm/i915/i915_priolist_types.h    |  28 +-
>   drivers/gpu/drm/i915/i915_scheduler.c         | 244 ++++++++++++++----
>   drivers/gpu/drm/i915/i915_scheduler.h         |  11 +-
>   drivers/gpu/drm/i915/i915_scheduler_types.h   |   2 +-
>   .../drm/i915/selftests/i915_mock_selftests.h  |   1 +
>   .../gpu/drm/i915/selftests/i915_scheduler.c   |  53 +++-
>   8 files changed, 316 insertions(+), 116 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 1103c8a00af1..129144dd86b0 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -244,11 +244,6 @@ static void ring_set_paused(const struct intel_engine_cs *engine, int state)
>   		wmb();
>   }
>   
> -static struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
>   static int rq_prio(const struct i915_request *rq)
>   {
>   	return READ_ONCE(rq->sched.attr.priority);
> @@ -272,15 +267,31 @@ static int effective_prio(const struct i915_request *rq)
>   	return prio;
>   }
>   
> -static int queue_prio(const struct i915_sched_engine *se)
> +static struct i915_request *first_request(struct i915_sched_engine *se)
>   {
> -	struct rb_node *rb;
> +	struct i915_priolist *pl;
>   
> -	rb = rb_first_cached(&se->queue);
> -	if (!rb)
> +	for_each_priolist(pl, &se->queue) {
> +		if (likely(!list_empty(&pl->requests)))
> +			return list_first_entry(&pl->requests,
> +						struct i915_request,
> +						sched.link);
> +
> +		i915_priolist_advance(&se->queue, pl);

Why is a "peek" type call site doing tree modifications? Couldn't that 
be limited to places which add/remove?
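
Just to illustrate what I mean, something like this (untested, simply
dropping the advance from first_request() in the patch and leaving the
empty nodes alone):

static struct i915_request *first_request(struct i915_sched_engine *se)
{
	struct i915_priolist *pl;

	for_each_priolist(pl, &se->queue) {
		if (!list_empty(&pl->requests))
			return list_first_entry(&pl->requests,
						struct i915_request,
						sched.link);
		/* skip the empty node, leave reaping to the dequeue path */
	}

	return NULL;
}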

> +	}
> +
> +	return NULL;
> +}
> +
> +static int queue_prio(struct i915_sched_engine *se)
> +{
> +	struct i915_request *rq;
> +
> +	rq = first_request(se);
> +	if (!rq)
>   		return INT_MIN;
>   
> -	return to_priolist(rb)->priority;
> +	return rq_prio(rq);
>   }
>   
>   static int virtual_prio(const struct intel_engine_execlists *el)
> @@ -290,7 +301,7 @@ static int virtual_prio(const struct intel_engine_execlists *el)
>   	return rb ? rb_entry(rb, struct ve_node, rb)->prio : INT_MIN;
>   }
>   
> -static bool need_preempt(const struct intel_engine_cs *engine,
> +static bool need_preempt(struct intel_engine_cs *engine,
>   			 const struct i915_request *rq)
>   {
>   	int last_prio;
> @@ -1136,6 +1147,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	struct i915_request ** const last_port = port + execlists->port_mask;
>   	struct i915_request *last, * const *active;
>   	struct virtual_engine *ve;
> +	struct i915_priolist *pl;
>   	struct rb_node *rb;
>   	bool submit = false;
>   
> @@ -1346,11 +1358,10 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   			break;
>   	}
>   
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> +	for_each_priolist(pl, &engine->active.queue) {
>   		struct i915_request *rq, *rn;
>   
> -		priolist_for_each_request_consume(rq, rn, p) {
> +		priolist_for_each_request_safe(rq, rn, pl) {
>   			bool merge = true;
>   
>   			/*
> @@ -1425,8 +1436,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   			}
>   		}
>   
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, pl);

There must be someone doing a list_del on this request in this block, so
I suppose it is hidden somewhere in the chain; it must be
__i915_request_submit. I guess it was the same with the rbtree, so I just
wonder if there is a way to document this better. Nothing comes to mind.

>   	}
>   done:
>   	*port++ = i915_request_get(last);
> @@ -2631,6 +2641,7 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
>   	struct i915_request *rq, *rn;
> +	struct i915_priolist *pl;
>   	struct rb_node *rb;
>   	unsigned long flags;
>   
> @@ -2661,16 +2672,12 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>   	intel_engine_signal_breadcrumbs(engine);
>   
>   	/* Flush the queued requests to the timeline list (for retiring). */
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -
> -		priolist_for_each_request_consume(rq, rn, p) {
> +	for_each_priolist(pl, &engine->active.queue) {
> +		priolist_for_each_request_safe(rq, rn, pl) {
>   			i915_request_mark_eio(rq);
>   			__i915_request_submit(rq);
>   		}
> -
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, pl);
>   	}
>   	GEM_BUG_ON(!i915_sched_is_idle(&engine->active));
>   
> @@ -2703,7 +2710,6 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>   	/* Remaining _unready_ requests will be nop'ed when submitted */
>   
>   	execlists->queue_priority_hint = INT_MIN;
> -	engine->active.queue = RB_ROOT_CACHED;
>   
>   	GEM_BUG_ON(__tasklet_is_enabled(&engine->active.tasklet));
>   	engine->active.tasklet.func = nop_submission_tasklet;
> @@ -3089,6 +3095,8 @@ static void virtual_context_exit(struct intel_context *ce)
>   
>   	for (n = 0; n < ve->num_siblings; n++)
>   		intel_engine_pm_put(ve->siblings[n]);
> +
> +	i915_sched_park_engine(&ve->base.active);
>   }
>   
>   static const struct intel_context_ops virtual_context_ops = {
> @@ -3501,6 +3509,7 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
>   {
>   	const struct intel_engine_execlists *execlists = &engine->execlists;
>   	struct i915_request *rq, *last;
> +	struct i915_priolist *pl;
>   	unsigned long flags;
>   	unsigned int count;
>   	struct rb_node *rb;
> @@ -3530,10 +3539,8 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
>   
>   	last = NULL;
>   	count = 0;
> -	for (rb = rb_first_cached(&engine->active.queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> -
> -		priolist_for_each_request(rq, p) {
> +	for_each_priolist(pl, &engine->active.queue) {
> +		priolist_for_each_request(rq, pl) {
>   			if (count++ < max - 1)
>   				show_request(m, rq, "\t\t", 0);
>   			else
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index 2d7339ef3b4c..8d0c6cd277b3 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -59,11 +59,6 @@
>   
>   #define GUC_REQUEST_SIZE 64 /* bytes */
>   
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
>   static struct guc_stage_desc *__get_stage_desc(struct intel_guc *guc, u32 id)
>   {
>   	struct guc_stage_desc *base = guc->stage_desc_pool_vaddr;
> @@ -185,8 +180,8 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   	struct i915_request ** const last_port = first + execlists->port_mask;
>   	struct i915_request *last = first[0];
>   	struct i915_request **port;
> +	struct i915_priolist *pl;
>   	bool submit = false;
> -	struct rb_node *rb;
>   
>   	lockdep_assert_held(&engine->active.lock);
>   
> @@ -203,11 +198,10 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   	 * event.
>   	 */
>   	port = first;
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> +	for_each_priolist(pl, &engine->active.queue) {
>   		struct i915_request *rq, *rn;
>   
> -		priolist_for_each_request_consume(rq, rn, p) {
> +		priolist_for_each_request_safe(rq, rn, pl) {
>   			if (last && rq->context != last->context) {
>   				if (port == last_port)
>   					goto done;
> @@ -223,12 +217,11 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   			last = rq;
>   		}
>   
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, pl);
>   	}
>   done:
>   	execlists->queue_priority_hint =
> -		rb ? to_priolist(rb)->priority : INT_MIN;
> +		pl != &engine->active.queue.sentinel ? pl->priority : INT_MIN;
>   	if (submit) {
>   		*port = schedule_in(last, port - execlists->inflight);
>   		*++port = NULL;
> @@ -327,7 +320,7 @@ static void guc_reset_cancel(struct intel_engine_cs *engine)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
>   	struct i915_request *rq, *rn;
> -	struct rb_node *rb;
> +	struct i915_priolist *p;
>   	unsigned long flags;
>   
>   	ENGINE_TRACE(engine, "\n");
> @@ -355,25 +348,20 @@ static void guc_reset_cancel(struct intel_engine_cs *engine)
>   	}
>   
>   	/* Flush the queued requests to the timeline list (for retiring). */
> -	while ((rb = rb_first_cached(&engine->active.queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -
> -		priolist_for_each_request_consume(rq, rn, p) {
> +	for_each_priolist(p, &engine->active.queue) {
> +		priolist_for_each_request_safe(rq, rn, p) {
>   			list_del_init(&rq->sched.link);
>   			__i915_request_submit(rq);
>   			dma_fence_set_error(&rq->fence, -EIO);
>   			i915_request_mark_complete(rq);
>   		}
> -
> -		rb_erase_cached(&p->node, &engine->active.queue);
> -		i915_priolist_free(p);
> +		i915_priolist_advance(&engine->active.queue, p);
>   	}
>   	GEM_BUG_ON(!i915_sched_is_idle(&engine->active));
>   
>   	/* Remaining _unready_ requests will be nop'ed when submitted */
>   
>   	execlists->queue_priority_hint = INT_MIN;
> -	engine->active.queue = RB_ROOT_CACHED;
>   
>   	spin_unlock_irqrestore(&engine->active.lock, flags);
>   }
> diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
> index bc2fa84f98a8..1200c3df6a4a 100644
> --- a/drivers/gpu/drm/i915/i915_priolist_types.h
> +++ b/drivers/gpu/drm/i915/i915_priolist_types.h
> @@ -38,10 +38,36 @@ enum {
>   #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
>   #define I915_PRIORITY_BARRIER (I915_PRIORITY_UNPREEMPTABLE - 1)
>   
> +#ifdef CONFIG_64BIT
> +#define I915_PRIOLIST_HEIGHT 12
> +#else
> +#define I915_PRIOLIST_HEIGHT 11
> +#endif
> +
>   struct i915_priolist {
>   	struct list_head requests;
> -	struct rb_node node;
>   	int priority;
> +
> +	int level;
> +	struct i915_priolist *next[I915_PRIOLIST_HEIGHT];
>   };
>   
> +struct i915_priolist_root {
> +	struct i915_priolist sentinel;
> +	u32 prng;
> +};
> +
> +#define i915_priolist_is_empty(root) ((root)->sentinel.level < 0)
> +
> +#define for_each_priolist(p, root) \
> +	for ((p) = (root)->sentinel.next[0]; \
> +	     (p) != &(root)->sentinel; \
> +	     (p) = (p)->next[0])
> +
> +#define priolist_for_each_request(it, plist) \
> +	list_for_each_entry(it, &(plist)->requests, sched.link)
> +
> +#define priolist_for_each_request_safe(it, n, plist) \
> +	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> +
>   #endif /* _I915_PRIOLIST_TYPES_H_ */
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index a3ee06cb66d7..74000d3eebb1 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -4,7 +4,9 @@
>    * Copyright © 2018 Intel Corporation
>    */
>   
> +#include <linux/bitops.h>
>   #include <linux/mutex.h>
> +#include <linux/prandom.h>
>   
>   #include "gt/intel_ring.h"
>   #include "gt/intel_lrc_reg.h"
> @@ -91,15 +93,24 @@ static void i915_sched_init_ipi(struct i915_sched_ipi *ipi)
>   	ipi->list = NULL;
>   }
>   
> +static void init_priolist(struct i915_priolist_root *const root)
> +{
> +	struct i915_priolist *pl = &root->sentinel;
> +
> +	memset_p((void **)pl->next, pl, ARRAY_SIZE(pl->next));
> +	pl->priority = INT_MIN;
> +	pl->level = -1;
> +}
> +
>   void i915_sched_init_engine(struct i915_sched_engine *se,
>   			    unsigned int subclass)
>   {
>   	spin_lock_init(&se->lock);
>   	lockdep_set_subclass(&se->lock, subclass);
>   
> +	init_priolist(&se->queue);
>   	INIT_LIST_HEAD(&se->requests);
>   	INIT_LIST_HEAD(&se->hold);
> -	se->queue = RB_ROOT_CACHED;
>   
>   	i915_sched_init_ipi(&se->ipi);
>   
> @@ -116,8 +127,57 @@ void i915_sched_init_engine(struct i915_sched_engine *se,
>   #endif
>   }
>   
> +__maybe_unused static bool priolist_idle(struct i915_priolist_root *root)
> +{
> +	struct i915_priolist *pl = &root->sentinel;
> +	int lvl;
> +
> +	for (lvl = 0; lvl < ARRAY_SIZE(pl->next); lvl++) {
> +		if (pl->next[lvl] != pl) {
> +			GEM_TRACE_ERR("root[%d] is not empty\n", lvl);
> +			return false;
> +		}
> +	}
> +
> +	if (pl->level != -1) {
> +		GEM_TRACE_ERR("root is not clear: %d\n", pl->level);
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static void pl_push(struct i915_priolist *pl, struct list_head *head)
> +{
> +	pl->requests.next = head->next;
> +	head->next = &pl->requests;
> +}
> +
> +static struct i915_priolist *pl_pop(struct list_head *head)
> +{
> +	struct i915_priolist *pl;
> +
> +	pl = container_of(head->next, typeof(*pl), requests);
> +	head->next = pl->requests.next;
> +
> +	return pl;
> +}
> +
> +static bool pl_empty(struct list_head *head)
> +{
> +	return !head->next;
> +}
> +
>   void i915_sched_park_engine(struct i915_sched_engine *se)
>   {
> +	struct i915_priolist_root *root = &se->queue;
> +	struct list_head *list = &root->sentinel.requests;
> +
> +	GEM_BUG_ON(!priolist_idle(root));
> +
> +	while (!pl_empty(list))
> +		kmem_cache_free(global.slab_priorities, pl_pop(list));

If I follow correctly, you could just unlink the head and free the rest
with a plain walk, no need to pop along the way.
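
Something along these lines, perhaps (untested sketch, keeping the
singly-linked reuse of requests.next from pl_push/pl_pop):

	struct list_head *pos = list->next;

	list->next = NULL;	/* unlink the whole stash in one go */
	while (pos) {
		struct i915_priolist *pl =
			container_of(pos, typeof(*pl), requests);

		pos = pos->next;	/* read before freeing the node */
		kmem_cache_free(global.slab_priorities, pl);
	}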

> +
>   	GEM_BUG_ON(!i915_sched_is_idle(se));
>   	se->no_priolist = false;
>   }
> @@ -183,71 +243,55 @@ static inline bool node_signaled(const struct i915_sched_node *node)
>   	return i915_request_completed(node_to_request(node));
>   }
>   
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +static inline unsigned int random_level(struct i915_priolist_root *root)
>   {
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
> -static void assert_priolists(struct i915_sched_engine * const se)
> -{
> -	struct rb_node *rb;
> -	long last_prio;
> -
> -	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> -		return;
> -
> -	GEM_BUG_ON(rb_first_cached(&se->queue) !=
> -		   rb_first(&se->queue.rb_root));
> -
> -	last_prio = INT_MAX;
> -	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> -		const struct i915_priolist *p = to_priolist(rb);
> -
> -		GEM_BUG_ON(p->priority > last_prio);
> -		last_prio = p->priority;
> -	}
> +	root->prng = next_pseudo_random32(root->prng);
> +	return  __ffs(root->prng) / 2;
>   }
>   
>   static struct list_head *
>   lookup_priolist(struct intel_engine_cs *engine, int prio)
>   {
> +	struct i915_priolist *update[I915_PRIOLIST_HEIGHT];
>   	struct i915_sched_engine * const se = &engine->active;
> -	struct i915_priolist *p;
> -	struct rb_node **parent, *rb;
> -	bool first = true;
> -
> -	lockdep_assert_held(&engine->active.lock);
> -	assert_priolists(se);
> +	struct i915_priolist_root *root = &se->queue;
> +	struct i915_priolist *pl, *tmp;
> +	int lvl;
>   
> +	lockdep_assert_held(&se->lock);
>   	if (unlikely(se->no_priolist))
>   		prio = I915_PRIORITY_NORMAL;
>   
> +	for_each_priolist(pl, root) { /* recycle any empty elements before us */
> +		if (pl->priority >= prio || !list_empty(&pl->requests))
> +			break;
> +
> +		i915_priolist_advance(root, pl);
> +	}
> +
>   find_priolist:
> -	/* most positive priority is scheduled first, equal priorities fifo */
> -	rb = NULL;
> -	parent = &se->queue.rb_root.rb_node;
> -	while (*parent) {
> -		rb = *parent;
> -		p = to_priolist(rb);
> -		if (prio > p->priority) {
> -			parent = &rb->rb_left;
> -		} else if (prio < p->priority) {
> -			parent = &rb->rb_right;
> -			first = false;
> -		} else {
> -			return &p->requests;
> -		}
> +	pl = &root->sentinel;
> +	lvl = pl->level;
> +	while (lvl >= 0) {
> +		while (tmp = pl->next[lvl], tmp->priority >= prio)
> +			pl = tmp;
> +		if (pl->priority == prio)
> +			goto out;
> +		update[lvl--] = pl;
>   	}
>   
>   	if (prio == I915_PRIORITY_NORMAL) {
> -		p = &se->default_priolist;
> +		pl = &se->default_priolist;
> +	} else if (!pl_empty(&root->sentinel.requests)) {
> +		pl = pl_pop(&root->sentinel.requests);
>   	} else {
> -		p = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
> +		pl = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
>   		/* Convert an allocation failure to a priority bump */
> -		if (unlikely(!p)) {
> +		if (unlikely(!pl)) {
>   			prio = I915_PRIORITY_NORMAL; /* recurses just once */
>   
> -			/* To maintain ordering with all rendering, after an
> +			/*
> +			 * To maintain ordering with all rendering, after an
>   			 * allocation failure we have to disable all scheduling.
>   			 * Requests will then be executed in fifo, and schedule
>   			 * will ensure that dependencies are emitted in fifo.
> @@ -260,18 +304,103 @@ lookup_priolist(struct intel_engine_cs *engine, int prio)
>   		}
>   	}
>   
> -	p->priority = prio;
> -	INIT_LIST_HEAD(&p->requests);
> +	pl->priority = prio;
> +	INIT_LIST_HEAD(&pl->requests);
>   
> -	rb_link_node(&p->node, rb, parent);
> -	rb_insert_color_cached(&p->node, &se->queue, first);
> +	lvl = random_level(root);
> +	if (lvl > root->sentinel.level) {
> +		if (root->sentinel.level < I915_PRIOLIST_HEIGHT - 1) {
> +			lvl = ++root->sentinel.level;
> +			update[lvl] = &root->sentinel;
> +		} else {
> +			lvl = I915_PRIOLIST_HEIGHT - 1;
> +		}
> +	}
> +	GEM_BUG_ON(lvl < 0);
> +	GEM_BUG_ON(lvl >= ARRAY_SIZE(pl->next));
>   
> -	return &p->requests;
> +	pl->level = lvl;
> +	do {
> +		tmp = update[lvl];
> +		pl->next[lvl] = update[lvl]->next[lvl];
> +		tmp->next[lvl] = pl;
> +	} while (--lvl >= 0);
> +
> +	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
> +		struct i915_priolist *chk;
> +
> +		chk = &root->sentinel;
> +		lvl = chk->level;
> +		do {
> +			while (tmp = chk->next[lvl], tmp->priority >= prio)
> +				chk = tmp;
> +		} while (--lvl >= 0);
> +
> +		GEM_BUG_ON(chk != pl);
> +	}
> +
> +out:
> +	GEM_BUG_ON(pl == &root->sentinel);
> +	return &pl->requests;
>   }
>   
> -void __i915_priolist_free(struct i915_priolist *p)
> +static void remove_priolist(struct intel_engine_cs *engine,
> +			    struct list_head *plist)
>   {
> -	kmem_cache_free(global.slab_priorities, p);
> +	struct i915_sched_engine * const se = &engine->active;
> +	struct i915_priolist_root *root = &se->queue;
> +	struct i915_priolist *pl, *tmp;
> +	struct i915_priolist *old =
> +		container_of(plist, struct i915_priolist, requests);
> +	int prio = old->priority;
> +	int lvl;
> +
> +	lockdep_assert_held(&se->lock);
> +	GEM_BUG_ON(!list_empty(plist));
> +
> +	pl = &root->sentinel;
> +	lvl = pl->level;
> +	GEM_BUG_ON(lvl < 0);
> +
> +	if (prio != I915_PRIORITY_NORMAL)
> +		pl_push(old, &pl->requests);
> +
> +	do {
> +		while (tmp = pl->next[lvl], tmp->priority > prio)
> +			pl = tmp;
> +		if (lvl <= old->level) {
> +			pl->next[lvl] = old->next[lvl];
> +			if (pl == &root->sentinel && old->next[lvl] == pl) {
> +				GEM_BUG_ON(pl->level != lvl);
> +				pl->level--;
> +			}
> +		}
> +	} while (--lvl >= 0);
> +	GEM_BUG_ON(tmp != old);
> +}

Any chance to extract some commonality between remove and advance? Not 
fully getting it yet but they both seem to be removing nodes and then 
decreasing the height.

> +
> +void i915_priolist_advance(struct i915_priolist_root *root,
> +			   struct i915_priolist *pl)
> +{

This seems to be called when the lowest priority entry has no requests,
right? Some sort of trim? I am not getting the "advance" idea.

> +	struct i915_priolist * const s = &root->sentinel;
> +	int lvl;
> +
> +	GEM_BUG_ON(!list_empty(&pl->requests));
> +	GEM_BUG_ON(pl != s->next[0]);
> +	GEM_BUG_ON(pl == s);
> +
> +	if (pl->priority != I915_PRIORITY_NORMAL)
> +		pl_push(pl, &s->requests);

s->requests is just a store of empty priolist nodes? If so please put a 
comment, maybe in struct i915_priolist_root.
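
Something along these lines, maybe (wording is only a suggestion):

struct i915_priolist_root {
	/*
	 * sentinel.requests doubles up as a singly-linked stash of unused
	 * i915_priolist nodes (see pl_push/pl_pop): empty nodes are parked
	 * here for reuse by lookup_priolist() and only freed once the
	 * engine itself is parked.
	 */
	struct i915_priolist sentinel;
	u32 prng;
};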

> +
> +	lvl = pl->level;
> +	GEM_BUG_ON(lvl < 0);
> +	do {
> +		s->next[lvl] = pl->next[lvl];

So it unlinks the empty pl from each layer starting from the top.

> +		if (pl->next[lvl] == s) {

If the layer is completely empty..

> +			GEM_BUG_ON(s->level != lvl);
> +			s->level--;

.. decreases the max height by one. But it also expects the max height to
be equal to the height of the empty level. Because the layer was empty it
has to be, okay.

> +		}
> +	} while (--lvl >= 0);
>   }
>   
>   static struct i915_request *
> @@ -420,8 +549,13 @@ static void __i915_request_set_priority(struct i915_request *rq, int prio)
>   			continue;
>   
>   		GEM_BUG_ON(rq->engine != engine);
> -		if (i915_request_in_priority_queue(rq))
> +		if (i915_request_in_priority_queue(rq)) {
> +			struct list_head *prev = rq->sched.link.prev;
> +
>   			list_move_tail(&rq->sched.link, plist);
> +			if (list_empty(prev))
> +				remove_priolist(engine, prev);
> +		}
>   
>   		/* Defer (tasklet) submission until after all updates. */
>   		kick_submission(engine, rq, prio);
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> index 0ab47cbf0e9c..bca89a58d953 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> @@ -16,12 +16,6 @@
>   
>   struct drm_printer;
>   
> -#define priolist_for_each_request(it, plist) \
> -	list_for_each_entry(it, &(plist)->requests, sched.link)
> -
> -#define priolist_for_each_request_consume(it, n, plist) \
> -	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> -
>   void i915_sched_node_init(struct i915_sched_node *node);
>   void i915_sched_node_reinit(struct i915_sched_node *node);
>   
> @@ -69,7 +63,7 @@ static inline void i915_priolist_free(struct i915_priolist *p)
>   
>   static inline bool i915_sched_is_idle(const struct i915_sched_engine *se)
>   {
> -	return RB_EMPTY_ROOT(&se->queue.rb_root);
> +	return i915_priolist_is_empty(&se->queue);
>   }
>   
>   static inline bool
> @@ -99,6 +93,9 @@ static inline void i915_sched_kick(struct i915_sched_engine *se)
>   	tasklet_hi_schedule(&se->tasklet);
>   }
>   
> +void i915_priolist_advance(struct i915_priolist_root *root,
> +			   struct i915_priolist *old);
> +
>   void i915_request_show_with_schedule(struct drm_printer *m,
>   				     const struct i915_request *rq,
>   				     const char *prefix,
> diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h
> index f668c680d290..e64750be4e77 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler_types.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler_types.h
> @@ -89,7 +89,7 @@ struct i915_sched_engine {
>   	/**
>   	 * @queue: queue of requests, in priority lists
>   	 */
> -	struct rb_root_cached queue;
> +	struct i915_priolist_root queue;
>   
>   	struct i915_sched_ipi ipi;
>   
> diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> index 3db34d3eea58..946c93441c1f 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> @@ -25,6 +25,7 @@ selftest(ring, intel_ring_mock_selftests)
>   selftest(engine, intel_engine_cs_mock_selftests)
>   selftest(timelines, intel_timeline_mock_selftests)
>   selftest(requests, i915_request_mock_selftests)
> +selftest(scheduler, i915_scheduler_mock_selftests)
>   selftest(objects, i915_gem_object_mock_selftests)
>   selftest(phys, i915_gem_phys_mock_selftests)
>   selftest(dmabuf, i915_gem_dmabuf_mock_selftests)
> diff --git a/drivers/gpu/drm/i915/selftests/i915_scheduler.c b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> index de44a66210b7..de5b1443129b 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> @@ -12,6 +12,54 @@
>   #include "selftests/igt_spinner.h"
>   #include "selftests/i915_random.h"
>   
> +static int mock_skiplist_levels(void *dummy)
> +{
> +	struct i915_priolist_root root = {};
> +	struct i915_priolist *pl = &root.sentinel;
> +	IGT_TIMEOUT(end_time);
> +	unsigned long total;
> +	int count, lvl;
> +
> +	total = 0;
> +	do {
> +		for (count = 0; count < 16384; count++) {
> +			lvl = random_level(&root);
> +			if (lvl > pl->level) {
> +				if (lvl < I915_PRIOLIST_HEIGHT - 1)
> +					lvl = ++pl->level;
> +				else
> +					lvl = I915_PRIOLIST_HEIGHT - 1;
> +			}
> +
> +			pl->next[lvl] = ptr_inc(pl->next[lvl]);
> +		}
> +		total += count;
> +	} while (!__igt_timeout(end_time, NULL));
> +
> +	pr_info("Total %9lu\n", total);
> +	for (lvl = 0; lvl <= pl->level; lvl++) {
> +		int x = ilog2((unsigned long)pl->next[lvl]);
> +		char row[80];
> +
> +		memset(row, '*', x);
> +		row[x] = '\0';
> +
> +		pr_info(" [%2d] %9lu %s\n",
> +			lvl, (unsigned long)pl->next[lvl], row);
> +	}
> +
> +	return 0;
> +}
> +
> +int i915_scheduler_mock_selftests(void)
> +{
> +	static const struct i915_subtest tests[] = {
> +		SUBTEST(mock_skiplist_levels),
> +	};
> +
> +	return i915_subtests(tests, NULL);
> +}
> +
>   static void scheduling_disable(struct intel_engine_cs *engine)
>   {
>   	engine->props.preempt_timeout_ms = 0;
> @@ -80,9 +128,9 @@ static int all_engines(struct drm_i915_private *i915,
>   static bool check_context_order(struct intel_engine_cs *engine)
>   {
>   	u64 last_seqno, last_context;
> +	struct i915_priolist *p;
>   	unsigned long count;
>   	bool result = false;
> -	struct rb_node *rb;
>   	int last_prio;
>   
>   	/* We expect the execution order to follow ascending fence-context */
> @@ -92,8 +140,7 @@ static bool check_context_order(struct intel_engine_cs *engine)
>   	last_context = 0;
>   	last_seqno = 0;
>   	last_prio = 0;
> -	for (rb = rb_first_cached(&engine->active.queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +	for_each_priolist(p, &engine->active.queue) {
>   		struct i915_request *rq;
>   
>   		priolist_for_each_request(rq, p) {
> 

Regards,

Tvrtko
Chris Wilson Jan. 29, 2021, 10:26 a.m. UTC | #15
Quoting Tvrtko Ursulin (2021-01-29 09:37:27)
> 
> On 28/01/2021 16:26, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2021-01-28 15:56:19)
> 
> >>> -static void assert_priolists(struct i915_sched_engine * const se)
> >>> -{
> >>> -     struct rb_node *rb;
> >>> -     long last_prio;
> >>> -
> >>> -     if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> >>> -             return;
> >>> -
> >>> -     GEM_BUG_ON(rb_first_cached(&se->queue) !=
> >>> -                rb_first(&se->queue.rb_root));
> >>> -
> >>> -     last_prio = INT_MAX;
> >>> -     for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> >>> -             const struct i915_priolist *p = to_priolist(rb);
> >>> -
> >>> -             GEM_BUG_ON(p->priority > last_prio);
> >>> -             last_prio = p->priority;
> >>> -     }
> >>> +     root->prng = next_pseudo_random32(root->prng);
> >>> +     return  __ffs(root->prng) / 2;
> >>
> >> Where is the relationship to I915_PRIOLIST_HEIGHT? Feels root->prng %
> >> I915_PRIOLIST_HEIGHT would be more obvious here unless I am terribly
> >> mistaken. Or at least put a comment saying why the hack.
> > 
> > HEIGHT is the maximum possible for our struct. skiplists only want to
> > increment the height of the tree one step at a time. So we choose a level
> > with decreasing probability, and then limit that to the maximum height of
> > the current tree + 1, clamped to HEIGHT.
> > 
> > You might notice that unlike traditional skiplists, this uses a
> > probability of 0.25 for each additional level. A neat trick discovered by
> > Con Kolivas (I haven't found it mentioned elsewhere) as the cost of the
> > extra level (using P=.5) is the same as the extra chain length with
> > P=.25. So you can scale to higher number of requests by packing more
> > requests into each level.
> > 
> > So that is split between randomly choosing a level and then working out
> > the height of the node.
> 
> Choosing levels with decreasing probability by the virtue of using ffs 
> on a random number? Or because (BITS_PER_TYPE(u32) / 2) is greater than 
> I915_PRIOLIST_HEIGHT?

        /*
         * Given a uniform distribution of random numbers over the u32, then
         * the probability each bit is unset is P=0.5. The probability of a
         * successive sequence of bits being unset is P(n) = 0.5^n [n > 0].
         *   P(level:1) = 0.5
         *   P(level:2) = 0.25
         *   P(level:3) = 0.125
         *   P(level:4) = 0.0625
         *   ...
         * So we can use ffs() on a good random number generator to pick our
         * level. We divide by two to reduce the probability of choosing a
         * level to .25, as the cost of descending a level is the same as
         * following an extra link in the chain at that level (so we can
         * pack more nodes into fewer levels without incurring extra cost,
         * and allow scaling to higher volumes of requests without expanding
         * the height of the skiplist).
         */
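
And as a standalone sketch of how that feeds into the node height
(choose_level is a made-up wrapper; it is just random_level() plus the
clamp from lookup_priolist() in the patch):

static int choose_level(struct i915_priolist_root *root)
{
	int lvl;

	root->prng = next_pseudo_random32(root->prng);
	lvl = __ffs(root->prng) / 2; /* P(lvl >= n) = 0.25^n */

	/* only ever grow the skiplist by one level at a time */
	if (lvl > root->sentinel.level) {
		if (root->sentinel.level < I915_PRIOLIST_HEIGHT - 1)
			lvl = ++root->sentinel.level;
		else
			lvl = I915_PRIOLIST_HEIGHT - 1;
	}

	return lvl;
}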

-Chris
Chris Wilson Jan. 29, 2021, 10:30 a.m. UTC | #16
Quoting Matthew Brost (2021-01-28 22:56:04)
> On Mon, Jan 25, 2021 at 02:01:15PM +0000, Chris Wilson wrote:
> > Replace the priolist rbtree with a skiplist. The crucial difference is
> > that walking and removing the first element of a skiplist is O(1), but
> > O(lgN) for an rbtree, as we need to rebalance on remove. This is a
> > hindrance for submission latency as it occurs between picking a request
> > for the priolist and submitting it to hardware, as well effectively
> > trippling the number of O(lgN) operations required under the irqoff lock.
> > This is critical to reducing the latency jitter with multiple clients.
> > 
> > The downsides to skiplists are that lookup/insertion is only
> > probablistically O(lgN) and there is a significant memory penalty to
> > as each skip node is larger than the rbtree equivalent. Furthermore, we
> > don't use dynamic arrays for the skiplist, so the allocation is fixed,
> > and imposes an upper bound on the scalability wrt to the number of
> > inflight requests.
> > 
> 
> This is a fun data structure but IMO might be overkill to maintain this
> code in the i915. The UMDs have effectively agreed to use only 3 levels,
> is O(lgN) where N == 3 really a big deal? With GuC submission we will
> statically map all user levels into 3 buckets. If we are doing that, do
> we even need a complex data structure? i.e. could we just use an
> array of linked lists?

Because we need to scale the bst to handle a unique key per request with
thousands of requests [this is not only about priorities]. And as you
will see from the results, even with just a single priority in the system
(so one entry in either the skiplist or rbtree), the skiplist is beating 
the rbtree as measured by the lock hold time around insert/dequeue of
requests. That surprised me.
-Chris
Matthew Brost Jan. 29, 2021, 5:01 p.m. UTC | #17
On Fri, Jan 29, 2021 at 10:30:58AM +0000, Chris Wilson wrote:
> Quoting Matthew Brost (2021-01-28 22:56:04)
> > On Mon, Jan 25, 2021 at 02:01:15PM +0000, Chris Wilson wrote:
> > > Replace the priolist rbtree with a skiplist. The crucial difference is
> > > that walking and removing the first element of a skiplist is O(1), but
> > > O(lgN) for an rbtree, as we need to rebalance on remove. This is a
> > > hindrance for submission latency as it occurs between picking a request
> > > for the priolist and submitting it to hardware, as well effectively
> > > trippling the number of O(lgN) operations required under the irqoff lock.
> > > This is critical to reducing the latency jitter with multiple clients.
> > > 
> > > The downsides to skiplists are that lookup/insertion is only
> > > probablistically O(lgN) and there is a significant memory penalty to
> > > as each skip node is larger than the rbtree equivalent. Furthermore, we
> > > don't use dynamic arrays for the skiplist, so the allocation is fixed,
> > > and imposes an upper bound on the scalability wrt to the number of
> > > inflight requests.
> > > 
> > 
> > This is a fun data structure but IMO might be overkill to maintain this
> > code in the i915. The UMDs have effectively agreed to use only 3 levels,
> > is O(lgN) where N == 3 really a big deal? With GuC submission we will
> > statically map all user levels into 3 buckets. If we are doing that, do
> > we even need a complex data structure? i.e. could we just use an
> > array of linked lists?
> 
> Because we need to scale the bst to handle a unique key per request with
> thousands of requests [this is not only about priorities]. And as you
> will see from the results, even with just a single priority in the system
> (so one entry in either the skiplist or rbtree), the skiplist is beating 
> the rbtree as measured by the lock hold time around insert/dequeue of
> requests. That surprised me.

Ok, seems reasonable. Skip lists are pretty cool; I am wondering if at some
point we should make the skip list code a bit more generic so it can
possibly be leveraged in other parts of the i915 / kernel.

Matt

> -Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 1103c8a00af1..129144dd86b0 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -244,11 +244,6 @@  static void ring_set_paused(const struct intel_engine_cs *engine, int state)
 		wmb();
 }
 
-static struct i915_priolist *to_priolist(struct rb_node *rb)
-{
-	return rb_entry(rb, struct i915_priolist, node);
-}
-
 static int rq_prio(const struct i915_request *rq)
 {
 	return READ_ONCE(rq->sched.attr.priority);
@@ -272,15 +267,31 @@  static int effective_prio(const struct i915_request *rq)
 	return prio;
 }
 
-static int queue_prio(const struct i915_sched_engine *se)
+static struct i915_request *first_request(struct i915_sched_engine *se)
 {
-	struct rb_node *rb;
+	struct i915_priolist *pl;
 
-	rb = rb_first_cached(&se->queue);
-	if (!rb)
+	for_each_priolist(pl, &se->queue) {
+		if (likely(!list_empty(&pl->requests)))
+			return list_first_entry(&pl->requests,
+						struct i915_request,
+						sched.link);
+
+		i915_priolist_advance(&se->queue, pl);
+	}
+
+	return NULL;
+}
+
+static int queue_prio(struct i915_sched_engine *se)
+{
+	struct i915_request *rq;
+
+	rq = first_request(se);
+	if (!rq)
 		return INT_MIN;
 
-	return to_priolist(rb)->priority;
+	return rq_prio(rq);
 }
 
 static int virtual_prio(const struct intel_engine_execlists *el)
@@ -290,7 +301,7 @@  static int virtual_prio(const struct intel_engine_execlists *el)
 	return rb ? rb_entry(rb, struct ve_node, rb)->prio : INT_MIN;
 }
 
-static bool need_preempt(const struct intel_engine_cs *engine,
+static bool need_preempt(struct intel_engine_cs *engine,
 			 const struct i915_request *rq)
 {
 	int last_prio;
@@ -1136,6 +1147,7 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 	struct i915_request ** const last_port = port + execlists->port_mask;
 	struct i915_request *last, * const *active;
 	struct virtual_engine *ve;
+	struct i915_priolist *pl;
 	struct rb_node *rb;
 	bool submit = false;
 
@@ -1346,11 +1358,10 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 			break;
 	}
 
-	while ((rb = rb_first_cached(&engine->active.queue))) {
-		struct i915_priolist *p = to_priolist(rb);
+	for_each_priolist(pl, &engine->active.queue) {
 		struct i915_request *rq, *rn;
 
-		priolist_for_each_request_consume(rq, rn, p) {
+		priolist_for_each_request_safe(rq, rn, pl) {
 			bool merge = true;
 
 			/*
@@ -1425,8 +1436,7 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 			}
 		}
 
-		rb_erase_cached(&p->node, &engine->active.queue);
-		i915_priolist_free(p);
+		i915_priolist_advance(&engine->active.queue, pl);
 	}
 done:
 	*port++ = i915_request_get(last);
@@ -2631,6 +2641,7 @@  static void execlists_reset_cancel(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 	struct i915_request *rq, *rn;
+	struct i915_priolist *pl;
 	struct rb_node *rb;
 	unsigned long flags;
 
@@ -2661,16 +2672,12 @@  static void execlists_reset_cancel(struct intel_engine_cs *engine)
 	intel_engine_signal_breadcrumbs(engine);
 
 	/* Flush the queued requests to the timeline list (for retiring). */
-	while ((rb = rb_first_cached(&engine->active.queue))) {
-		struct i915_priolist *p = to_priolist(rb);
-
-		priolist_for_each_request_consume(rq, rn, p) {
+	for_each_priolist(pl, &engine->active.queue) {
+		priolist_for_each_request_safe(rq, rn, pl) {
 			i915_request_mark_eio(rq);
 			__i915_request_submit(rq);
 		}
-
-		rb_erase_cached(&p->node, &engine->active.queue);
-		i915_priolist_free(p);
+		i915_priolist_advance(&engine->active.queue, pl);
 	}
 	GEM_BUG_ON(!i915_sched_is_idle(&engine->active));
 
@@ -2703,7 +2710,6 @@  static void execlists_reset_cancel(struct intel_engine_cs *engine)
 	/* Remaining _unready_ requests will be nop'ed when submitted */
 
 	execlists->queue_priority_hint = INT_MIN;
-	engine->active.queue = RB_ROOT_CACHED;
 
 	GEM_BUG_ON(__tasklet_is_enabled(&engine->active.tasklet));
 	engine->active.tasklet.func = nop_submission_tasklet;
@@ -3089,6 +3095,8 @@  static void virtual_context_exit(struct intel_context *ce)
 
 	for (n = 0; n < ve->num_siblings; n++)
 		intel_engine_pm_put(ve->siblings[n]);
+
+	i915_sched_park_engine(&ve->base.active);
 }
 
 static const struct intel_context_ops virtual_context_ops = {
@@ -3501,6 +3509,7 @@  void intel_execlists_show_requests(struct intel_engine_cs *engine,
 {
 	const struct intel_engine_execlists *execlists = &engine->execlists;
 	struct i915_request *rq, *last;
+	struct i915_priolist *pl;
 	unsigned long flags;
 	unsigned int count;
 	struct rb_node *rb;
@@ -3530,10 +3539,8 @@  void intel_execlists_show_requests(struct intel_engine_cs *engine,
 
 	last = NULL;
 	count = 0;
-	for (rb = rb_first_cached(&engine->active.queue); rb; rb = rb_next(rb)) {
-		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
-
-		priolist_for_each_request(rq, p) {
+	for_each_priolist(pl, &engine->active.queue) {
+		priolist_for_each_request(rq, pl) {
 			if (count++ < max - 1)
 				show_request(m, rq, "\t\t", 0);
 			else
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index 2d7339ef3b4c..8d0c6cd277b3 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -59,11 +59,6 @@ 
 
 #define GUC_REQUEST_SIZE 64 /* bytes */
 
-static inline struct i915_priolist *to_priolist(struct rb_node *rb)
-{
-	return rb_entry(rb, struct i915_priolist, node);
-}
-
 static struct guc_stage_desc *__get_stage_desc(struct intel_guc *guc, u32 id)
 {
 	struct guc_stage_desc *base = guc->stage_desc_pool_vaddr;
@@ -185,8 +180,8 @@  static void __guc_dequeue(struct intel_engine_cs *engine)
 	struct i915_request ** const last_port = first + execlists->port_mask;
 	struct i915_request *last = first[0];
 	struct i915_request **port;
+	struct i915_priolist *pl;
 	bool submit = false;
-	struct rb_node *rb;
 
 	lockdep_assert_held(&engine->active.lock);
 
@@ -203,11 +198,10 @@  static void __guc_dequeue(struct intel_engine_cs *engine)
 	 * event.
 	 */
 	port = first;
-	while ((rb = rb_first_cached(&engine->active.queue))) {
-		struct i915_priolist *p = to_priolist(rb);
+	for_each_priolist(pl, &engine->active.queue) {
 		struct i915_request *rq, *rn;
 
-		priolist_for_each_request_consume(rq, rn, p) {
+		priolist_for_each_request_safe(rq, rn, pl) {
 			if (last && rq->context != last->context) {
 				if (port == last_port)
 					goto done;
@@ -223,12 +217,11 @@  static void __guc_dequeue(struct intel_engine_cs *engine)
 			last = rq;
 		}
 
-		rb_erase_cached(&p->node, &engine->active.queue);
-		i915_priolist_free(p);
+		i915_priolist_advance(&engine->active.queue, pl);
 	}
 done:
 	execlists->queue_priority_hint =
-		rb ? to_priolist(rb)->priority : INT_MIN;
+		pl != &engine->active.queue.sentinel ? pl->priority : INT_MIN;
 	if (submit) {
 		*port = schedule_in(last, port - execlists->inflight);
 		*++port = NULL;
@@ -327,7 +320,7 @@  static void guc_reset_cancel(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 	struct i915_request *rq, *rn;
-	struct rb_node *rb;
+	struct i915_priolist *p;
 	unsigned long flags;
 
 	ENGINE_TRACE(engine, "\n");
@@ -355,25 +348,20 @@  static void guc_reset_cancel(struct intel_engine_cs *engine)
 	}
 
 	/* Flush the queued requests to the timeline list (for retiring). */
-	while ((rb = rb_first_cached(&engine->active.queue))) {
-		struct i915_priolist *p = to_priolist(rb);
-
-		priolist_for_each_request_consume(rq, rn, p) {
+	for_each_priolist(p, &engine->active.queue) {
+		priolist_for_each_request_safe(rq, rn, p) {
 			list_del_init(&rq->sched.link);
 			__i915_request_submit(rq);
 			dma_fence_set_error(&rq->fence, -EIO);
 			i915_request_mark_complete(rq);
 		}
-
-		rb_erase_cached(&p->node, &engine->active.queue);
-		i915_priolist_free(p);
+		i915_priolist_advance(&engine->active.queue, p);
 	}
 	GEM_BUG_ON(!i915_sched_is_idle(&engine->active));
 
 	/* Remaining _unready_ requests will be nop'ed when submitted */
 
 	execlists->queue_priority_hint = INT_MIN;
-	engine->active.queue = RB_ROOT_CACHED;
 
 	spin_unlock_irqrestore(&engine->active.lock, flags);
 }
diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
index bc2fa84f98a8..1200c3df6a4a 100644
--- a/drivers/gpu/drm/i915/i915_priolist_types.h
+++ b/drivers/gpu/drm/i915/i915_priolist_types.h
@@ -38,10 +38,36 @@  enum {
 #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
 #define I915_PRIORITY_BARRIER (I915_PRIORITY_UNPREEMPTABLE - 1)
 
+#ifdef CONFIG_64BIT
+#define I915_PRIOLIST_HEIGHT 12
+#else
+#define I915_PRIOLIST_HEIGHT 11
+#endif
+
 struct i915_priolist {
 	struct list_head requests;
-	struct rb_node node;
 	int priority;
+
+	int level;
+	struct i915_priolist *next[I915_PRIOLIST_HEIGHT];
 };
 
+struct i915_priolist_root {
+	struct i915_priolist sentinel;
+	u32 prng;
+};
+
+#define i915_priolist_is_empty(root) ((root)->sentinel.level < 0)
+
+#define for_each_priolist(p, root) \
+	for ((p) = (root)->sentinel.next[0]; \
+	     (p) != &(root)->sentinel; \
+	     (p) = (p)->next[0])
+
+#define priolist_for_each_request(it, plist) \
+	list_for_each_entry(it, &(plist)->requests, sched.link)
+
+#define priolist_for_each_request_safe(it, n, plist) \
+	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
+
 #endif /* _I915_PRIOLIST_TYPES_H_ */
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index a3ee06cb66d7..74000d3eebb1 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -4,7 +4,9 @@ 
  * Copyright © 2018 Intel Corporation
  */
 
+#include <linux/bitops.h>
 #include <linux/mutex.h>
+#include <linux/prandom.h>
 
 #include "gt/intel_ring.h"
 #include "gt/intel_lrc_reg.h"
@@ -91,15 +93,24 @@  static void i915_sched_init_ipi(struct i915_sched_ipi *ipi)
 	ipi->list = NULL;
 }
 
+static void init_priolist(struct i915_priolist_root *const root)
+{
+	struct i915_priolist *pl = &root->sentinel;
+
+	memset_p((void **)pl->next, pl, ARRAY_SIZE(pl->next));
+	pl->priority = INT_MIN;
+	pl->level = -1;
+}
+
 void i915_sched_init_engine(struct i915_sched_engine *se,
 			    unsigned int subclass)
 {
 	spin_lock_init(&se->lock);
 	lockdep_set_subclass(&se->lock, subclass);
 
+	init_priolist(&se->queue);
 	INIT_LIST_HEAD(&se->requests);
 	INIT_LIST_HEAD(&se->hold);
-	se->queue = RB_ROOT_CACHED;
 
 	i915_sched_init_ipi(&se->ipi);
 
@@ -116,8 +127,57 @@  void i915_sched_init_engine(struct i915_sched_engine *se,
 #endif
 }
 
+__maybe_unused static bool priolist_idle(struct i915_priolist_root *root)
+{
+	struct i915_priolist *pl = &root->sentinel;
+	int lvl;
+
+	for (lvl = 0; lvl < ARRAY_SIZE(pl->next); lvl++) {
+		if (pl->next[lvl] != pl) {
+			GEM_TRACE_ERR("root[%d] is not empty\n", lvl);
+			return false;
+		}
+	}
+
+	if (pl->level != -1) {
+		GEM_TRACE_ERR("root is not clear: %d\n", pl->level);
+		return false;
+	}
+
+	return true;
+}
+
+static void pl_push(struct i915_priolist *pl, struct list_head *head)
+{
+	pl->requests.next = head->next;
+	head->next = &pl->requests;
+}
+
+static struct i915_priolist *pl_pop(struct list_head *head)
+{
+	struct i915_priolist *pl;
+
+	pl = container_of(head->next, typeof(*pl), requests);
+	head->next = pl->requests.next;
+
+	return pl;
+}
+
+static bool pl_empty(struct list_head *head)
+{
+	return !head->next;
+}
+
 void i915_sched_park_engine(struct i915_sched_engine *se)
 {
+	struct i915_priolist_root *root = &se->queue;
+	struct list_head *list = &root->sentinel.requests;
+
+	GEM_BUG_ON(!priolist_idle(root));
+
+	while (!pl_empty(list))
+		kmem_cache_free(global.slab_priorities, pl_pop(list));
+
 	GEM_BUG_ON(!i915_sched_is_idle(se));
 	se->no_priolist = false;
 }
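
pl_push()/pl_pop()/pl_empty() above implement a tiny allocation cache: a retired priolist is threaded through its (momentarily unused) requests.next pointer onto a singly linked LIFO headed at the sentinel's requests field, which is never list-initialised, so a NULL next means the cache is empty; lookup_priolist() further below prefers this cache to kmem_cache_alloc(), and i915_sched_park_engine() drains it once the engine idles. A stand-alone sketch of the same pattern, with stand-in names and calloc() in place of the slab cache:

	#include <stdlib.h>
	#include <stddef.h>

	struct list_head { struct list_head *next, *prev; };

	struct pl {
		struct list_head requests;	/* doubles as the free-list link */
		int priority;
	};

	static void cache_push(struct pl *node, struct list_head *head)
	{
		node->requests.next = head->next;	/* singly linked, last-in first-out */
		head->next = &node->requests;
	}

	static struct pl *cache_pop(struct list_head *head)
	{
		struct pl *node = (struct pl *)((char *)head->next -
						offsetof(struct pl, requests));

		head->next = node->requests.next;
		return node;
	}

	static int cache_empty(const struct list_head *head)
	{
		return !head->next;		/* head starts zeroed, never re-initialised */
	}

	/* Allocation policy as in lookup_priolist(): recycle before allocating. */
	static struct pl *get_priolist(struct list_head *cache)
	{
		if (!cache_empty(cache))
			return cache_pop(cache);

		return calloc(1, sizeof(struct pl));
	}
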
@@ -183,71 +243,55 @@  static inline bool node_signaled(const struct i915_sched_node *node)
 	return i915_request_completed(node_to_request(node));
 }
 
-static inline struct i915_priolist *to_priolist(struct rb_node *rb)
+static inline unsigned int random_level(struct i915_priolist_root *root)
 {
-	return rb_entry(rb, struct i915_priolist, node);
-}
-
-static void assert_priolists(struct i915_sched_engine * const se)
-{
-	struct rb_node *rb;
-	long last_prio;
-
-	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
-		return;
-
-	GEM_BUG_ON(rb_first_cached(&se->queue) !=
-		   rb_first(&se->queue.rb_root));
-
-	last_prio = INT_MAX;
-	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
-		const struct i915_priolist *p = to_priolist(rb);
-
-		GEM_BUG_ON(p->priority > last_prio);
-		last_prio = p->priority;
-	}
+	root->prng = next_pseudo_random32(root->prng);
+	return __ffs(root->prng) / 2;
 }
 
 static struct list_head *
 lookup_priolist(struct intel_engine_cs *engine, int prio)
 {
+	struct i915_priolist *update[I915_PRIOLIST_HEIGHT];
 	struct i915_sched_engine * const se = &engine->active;
-	struct i915_priolist *p;
-	struct rb_node **parent, *rb;
-	bool first = true;
-
-	lockdep_assert_held(&engine->active.lock);
-	assert_priolists(se);
+	struct i915_priolist_root *root = &se->queue;
+	struct i915_priolist *pl, *tmp;
+	int lvl;
 
+	lockdep_assert_held(&se->lock);
 	if (unlikely(se->no_priolist))
 		prio = I915_PRIORITY_NORMAL;
 
+	for_each_priolist(pl, root) { /* recycle any empty elements before us */
+		if (pl->priority >= prio || !list_empty(&pl->requests))
+			break;
+
+		i915_priolist_advance(root, pl);
+	}
+
 find_priolist:
-	/* most positive priority is scheduled first, equal priorities fifo */
-	rb = NULL;
-	parent = &se->queue.rb_root.rb_node;
-	while (*parent) {
-		rb = *parent;
-		p = to_priolist(rb);
-		if (prio > p->priority) {
-			parent = &rb->rb_left;
-		} else if (prio < p->priority) {
-			parent = &rb->rb_right;
-			first = false;
-		} else {
-			return &p->requests;
-		}
+	pl = &root->sentinel;
+	lvl = pl->level;
+	while (lvl >= 0) {
+		while (tmp = pl->next[lvl], tmp->priority >= prio)
+			pl = tmp;
+		if (pl->priority == prio)
+			goto out;
+		update[lvl--] = pl;
 	}
 
 	if (prio == I915_PRIORITY_NORMAL) {
-		p = &se->default_priolist;
+		pl = &se->default_priolist;
+	} else if (!pl_empty(&root->sentinel.requests)) {
+		pl = pl_pop(&root->sentinel.requests);
 	} else {
-		p = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
+		pl = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
 		/* Convert an allocation failure to a priority bump */
-		if (unlikely(!p)) {
+		if (unlikely(!pl)) {
 			prio = I915_PRIORITY_NORMAL; /* recurses just once */
 
-			/* To maintain ordering with all rendering, after an
+			/*
+			 * To maintain ordering with all rendering, after an
 			 * allocation failure we have to disable all scheduling.
 			 * Requests will then be executed in fifo, and schedule
 			 * will ensure that dependencies are emitted in fifo.
@@ -260,18 +304,103 @@  lookup_priolist(struct intel_engine_cs *engine, int prio)
 		}
 	}
 
-	p->priority = prio;
-	INIT_LIST_HEAD(&p->requests);
+	pl->priority = prio;
+	INIT_LIST_HEAD(&pl->requests);
 
-	rb_link_node(&p->node, rb, parent);
-	rb_insert_color_cached(&p->node, &se->queue, first);
+	lvl = random_level(root);
+	if (lvl > root->sentinel.level) {
+		if (root->sentinel.level < I915_PRIOLIST_HEIGHT - 1) {
+			lvl = ++root->sentinel.level;
+			update[lvl] = &root->sentinel;
+		} else {
+			lvl = I915_PRIOLIST_HEIGHT - 1;
+		}
+	}
+	GEM_BUG_ON(lvl < 0);
+	GEM_BUG_ON(lvl >= ARRAY_SIZE(pl->next));
 
-	return &p->requests;
+	pl->level = lvl;
+	do {
+		tmp = update[lvl];
+		pl->next[lvl] = update[lvl]->next[lvl];
+		tmp->next[lvl] = pl;
+	} while (--lvl >= 0);
+
+	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
+		struct i915_priolist *chk;
+
+		chk = &root->sentinel;
+		lvl = chk->level;
+		do {
+			while (tmp = chk->next[lvl], tmp->priority >= prio)
+				chk = tmp;
+		} while (--lvl >= 0);
+
+		GEM_BUG_ON(chk != pl);
+	}
+
+out:
+	GEM_BUG_ON(pl == &root->sentinel);
+	return &pl->requests;
 }
 
-void __i915_priolist_free(struct i915_priolist *p)
+static void remove_priolist(struct intel_engine_cs *engine,
+			    struct list_head *plist)
 {
-	kmem_cache_free(global.slab_priorities, p);
+	struct i915_sched_engine * const se = &engine->active;
+	struct i915_priolist_root *root = &se->queue;
+	struct i915_priolist *pl, *tmp;
+	struct i915_priolist *old =
+		container_of(plist, struct i915_priolist, requests);
+	int prio = old->priority;
+	int lvl;
+
+	lockdep_assert_held(&se->lock);
+	GEM_BUG_ON(!list_empty(plist));
+
+	pl = &root->sentinel;
+	lvl = pl->level;
+	GEM_BUG_ON(lvl < 0);
+
+	if (prio != I915_PRIORITY_NORMAL)
+		pl_push(old, &pl->requests);
+
+	do {
+		while (tmp = pl->next[lvl], tmp->priority > prio)
+			pl = tmp;
+		if (lvl <= old->level) {
+			pl->next[lvl] = old->next[lvl];
+			if (pl == &root->sentinel && old->next[lvl] == pl) {
+				GEM_BUG_ON(pl->level != lvl);
+				pl->level--;
+			}
+		}
+	} while (--lvl >= 0);
+	GEM_BUG_ON(tmp != old);
+}
+
+void i915_priolist_advance(struct i915_priolist_root *root,
+			   struct i915_priolist *pl)
+{
+	struct i915_priolist * const s = &root->sentinel;
+	int lvl;
+
+	GEM_BUG_ON(!list_empty(&pl->requests));
+	GEM_BUG_ON(pl != s->next[0]);
+	GEM_BUG_ON(pl == s);
+
+	if (pl->priority != I915_PRIORITY_NORMAL)
+		pl_push(pl, &s->requests);
+
+	lvl = pl->level;
+	GEM_BUG_ON(lvl < 0);
+	do {
+		s->next[lvl] = pl->next[lvl];
+		if (pl->next[lvl] == s) {
+			GEM_BUG_ON(s->level != lvl);
+			s->level--;
+		}
+	} while (--lvl >= 0);
 }
 
 static struct i915_request *
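
Taken together, the descent above records in update[] the last node visited at each level, random_level() draws a height with P(level >= k) = 4^-k (roughly one node in four is promoted to each successive level), and the splice links the new node in behind each recorded predecessor. Removal at the head in i915_priolist_advance() then costs only the fixed height, because the departing node is the sentinel's immediate successor at every level it occupies. A rough user-space sketch of the insertion path, reusing the skip_node/skip_root types from the earlier sketch (xorshift32() stands in for next_pseudo_random32(); illustrative only):

	/* Seed must be non-zero; xorshift32 then never returns zero. */
	static unsigned int xorshift32(unsigned int *state)
	{
		*state ^= *state << 13;
		*state ^= *state >> 17;
		*state ^= *state << 5;
		return *state;
	}

	/*
	 * Assumes node->priority is set, the node is not yet linked, and no
	 * node with this priority already exists (the driver returns the
	 * existing list in that case instead of inserting a duplicate).
	 */
	static void skip_insert(struct skip_root *root, struct skip_node *node,
				unsigned int *prng)
	{
		struct skip_node *update[HEIGHT];
		struct skip_node *pl = &root->sentinel, *tmp;
		int lvl;

		/* Record the rightmost node with priority >= node->priority per level. */
		for (lvl = root->sentinel.level; lvl >= 0; lvl--) {
			while (tmp = pl->next[lvl], tmp->priority >= node->priority)
				pl = tmp;
			update[lvl] = pl;
		}

		/* __builtin_ctz matches the kernel's 0-based __ffs for non-zero values. */
		lvl = __builtin_ctz(xorshift32(prng)) / 2;
		if (lvl > root->sentinel.level) {
			if (root->sentinel.level < HEIGHT - 1) {
				lvl = ++root->sentinel.level;	/* grow by one level at most */
				update[lvl] = &root->sentinel;	/* new level has no nodes yet */
			} else {
				lvl = HEIGHT - 1;		/* clamp over-tall draws */
			}
		}

		node->level = lvl;
		for (; lvl >= 0; lvl--) {
			node->next[lvl] = update[lvl]->next[lvl];
			update[lvl]->next[lvl] = node;
		}
	}
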
@@ -420,8 +549,13 @@  static void __i915_request_set_priority(struct i915_request *rq, int prio)
 			continue;
 
 		GEM_BUG_ON(rq->engine != engine);
-		if (i915_request_in_priority_queue(rq))
+		if (i915_request_in_priority_queue(rq)) {
+			struct list_head *prev = rq->sched.link.prev;
+
 			list_move_tail(&rq->sched.link, plist);
+			if (list_empty(prev))
+				remove_priolist(engine, prev);
+		}
 
 		/* Defer (tasklet) submission until after all updates. */
 		kick_submission(engine, rq, prio);
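
The change above is what keeps empty priority levels from accumulating during priority bumps: rq->sched.link.prev is saved before the list_move_tail(), and afterwards list_empty(prev) can only be true if prev was the head of the request's old priolist and that list is now empty, in which case remove_priolist() unlinks it from the skiplist. A small self-contained illustration of that saved-prev trick, using toy list helpers rather than <linux/list.h>:

	#include <assert.h>
	#include <stdio.h>

	struct node { struct node *prev, *next; };

	static void list_init(struct node *h) { h->prev = h->next = h; }
	static int list_empty(const struct node *h) { return h->next == h; }

	static void list_del(struct node *n)
	{
		n->prev->next = n->next;
		n->next->prev = n->prev;
	}

	static void list_add_tail(struct node *n, struct node *h)
	{
		n->prev = h->prev;
		n->next = h;
		h->prev->next = n;
		h->prev = n;
	}

	int main(void)
	{
		struct node old_head, new_head, rq;
		struct node *prev;

		list_init(&old_head);
		list_init(&new_head);
		list_add_tail(&rq, &old_head);

		/* Mirror __i915_request_set_priority(): remember prev, then move. */
		prev = rq.prev;
		list_del(&rq);
		list_add_tail(&rq, &new_head);

		/* prev was the old head and rq its only entry, so it now reads empty. */
		printf("old list empty: %d\n", list_empty(prev));
		assert(list_empty(prev));
		return 0;
	}
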
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index 0ab47cbf0e9c..bca89a58d953 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -16,12 +16,6 @@ 
 
 struct drm_printer;
 
-#define priolist_for_each_request(it, plist) \
-	list_for_each_entry(it, &(plist)->requests, sched.link)
-
-#define priolist_for_each_request_consume(it, n, plist) \
-	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
-
 void i915_sched_node_init(struct i915_sched_node *node);
 void i915_sched_node_reinit(struct i915_sched_node *node);
 
@@ -69,7 +63,7 @@  static inline void i915_priolist_free(struct i915_priolist *p)
 
 static inline bool i915_sched_is_idle(const struct i915_sched_engine *se)
 {
-	return RB_EMPTY_ROOT(&se->queue.rb_root);
+	return i915_priolist_is_empty(&se->queue);
 }
 
 static inline bool
@@ -99,6 +93,9 @@  static inline void i915_sched_kick(struct i915_sched_engine *se)
 	tasklet_hi_schedule(&se->tasklet);
 }
 
+void i915_priolist_advance(struct i915_priolist_root *root,
+			   struct i915_priolist *old);
+
 void i915_request_show_with_schedule(struct drm_printer *m,
 				     const struct i915_request *rq,
 				     const char *prefix,
diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h
index f668c680d290..e64750be4e77 100644
--- a/drivers/gpu/drm/i915/i915_scheduler_types.h
+++ b/drivers/gpu/drm/i915/i915_scheduler_types.h
@@ -89,7 +89,7 @@  struct i915_sched_engine {
 	/**
 	 * @queue: queue of requests, in priority lists
 	 */
-	struct rb_root_cached queue;
+	struct i915_priolist_root queue;
 
 	struct i915_sched_ipi ipi;
 
diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
index 3db34d3eea58..946c93441c1f 100644
--- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
+++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
@@ -25,6 +25,7 @@  selftest(ring, intel_ring_mock_selftests)
 selftest(engine, intel_engine_cs_mock_selftests)
 selftest(timelines, intel_timeline_mock_selftests)
 selftest(requests, i915_request_mock_selftests)
+selftest(scheduler, i915_scheduler_mock_selftests)
 selftest(objects, i915_gem_object_mock_selftests)
 selftest(phys, i915_gem_phys_mock_selftests)
 selftest(dmabuf, i915_gem_dmabuf_mock_selftests)
diff --git a/drivers/gpu/drm/i915/selftests/i915_scheduler.c b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
index de44a66210b7..de5b1443129b 100644
--- a/drivers/gpu/drm/i915/selftests/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
@@ -12,6 +12,54 @@ 
 #include "selftests/igt_spinner.h"
 #include "selftests/i915_random.h"
 
+static int mock_skiplist_levels(void *dummy)
+{
+	struct i915_priolist_root root = {};
+	struct i915_priolist *pl = &root.sentinel;
+	IGT_TIMEOUT(end_time);
+	unsigned long total;
+	int count, lvl;
+
+	total = 0;
+	do {
+		for (count = 0; count < 16384; count++) {
+			lvl = random_level(&root);
+			if (lvl > pl->level) {
+				if (lvl < I915_PRIOLIST_HEIGHT - 1)
+					lvl = ++pl->level;
+				else
+					lvl = I915_PRIOLIST_HEIGHT - 1;
+			}
+
+			pl->next[lvl] = ptr_inc(pl->next[lvl]);
+		}
+		total += count;
+	} while (!__igt_timeout(end_time, NULL));
+
+	pr_info("Total %9lu\n", total);
+	for (lvl = 0; lvl <= pl->level; lvl++) {
+		int x = ilog2((unsigned long)pl->next[lvl]);
+		char row[80];
+
+		memset(row, '*', x);
+		row[x] = '\0';
+
+		pr_info(" [%2d] %9lu %s\n",
+			lvl, (unsigned long)pl->next[lvl], row);
+	}
+
+	return 0;
+}
+
+int i915_scheduler_mock_selftests(void)
+{
+	static const struct i915_subtest tests[] = {
+		SUBTEST(mock_skiplist_levels),
+	};
+
+	return i915_subtests(tests, NULL);
+}
+
 static void scheduling_disable(struct intel_engine_cs *engine)
 {
 	engine->props.preempt_timeout_ms = 0;
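
Returning to mock_skiplist_levels() above: it histograms random_level() by reusing the sentinel's next[] slots as plain counters via ptr_inc() (the sentinel is never linked into a real list there). Apart from the ramp-up while the height grows one level at a time and the clamped top bucket, each level should end up with roughly a quarter of the count of the level below it, since P(level == L) works out to 3 * 4^-(L+1) for __ffs(prng) / 2. A quick stand-alone check of that expected shape (not part of the selftest):

	#include <stdio.h>

	int main(void)
	{
		const double samples = 16384 * 1024;	/* arbitrary total draw count */
		int lvl;

		for (lvl = 0; lvl < 12; lvl++)		/* 12 == I915_PRIOLIST_HEIGHT on 64-bit */
			printf("[%2d] %14.1f\n", lvl, samples * 3.0 / (4u << (2 * lvl)));

		return 0;
	}
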
@@ -80,9 +128,9 @@  static int all_engines(struct drm_i915_private *i915,
 static bool check_context_order(struct intel_engine_cs *engine)
 {
 	u64 last_seqno, last_context;
+	struct i915_priolist *p;
 	unsigned long count;
 	bool result = false;
-	struct rb_node *rb;
 	int last_prio;
 
 	/* We expect the execution order to follow ascending fence-context */
@@ -92,8 +140,7 @@  static bool check_context_order(struct intel_engine_cs *engine)
 	last_context = 0;
 	last_seqno = 0;
 	last_prio = 0;
-	for (rb = rb_first_cached(&engine->active.queue); rb; rb = rb_next(rb)) {
-		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
+	for_each_priolist(p, &engine->active.queue) {
 		struct i915_request *rq;
 
 		priolist_for_each_request(rq, p) {