
[09/31] drm/i915: Replace priolist rbtree with a skiplist

Message ID 20210208105236.28498-9-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Series [01/31] drm/i915/gt: Ratelimit heartbeat completion probing

Commit Message

Chris Wilson Feb. 8, 2021, 10:52 a.m. UTC
Replace the priolist rbtree with a skiplist. The crucial difference is
that walking and removing the first element of a skiplist is O(1), but
O(lgN) for an rbtree, as we need to rebalance on remove. This is a
hindrance for submission latency as it occurs between picking a request
from the priolist and submitting it to hardware, as well as effectively
tripling the number of O(lgN) operations required under the irqoff lock.
This is critical to reducing the latency jitter with multiple clients.

The downsides to skiplists are that lookup/insertion is only
probabilistically O(lgN), and there is a significant memory penalty as
each skip node is larger than the rbtree equivalent. Furthermore, we
don't use dynamic arrays for the skiplist, so the allocation is fixed
and imposes an upper bound on scalability wrt the number of inflight
requests.
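
To put a rough number on that bound: random_level() (added below) divides
__ffs() of the PRNG output by two, so each successive level holds about a
quarter of the nodes of the level beneath it, and a skiplist of height H
comfortably indexes on the order of 4^H nodes. That is where the
"11 => 4 million, 12 => 16 million" estimate in the i915_priolist_types.h
comment comes from; a quick standalone check (editorial sketch, not driver
code):

#include <stdio.h>

int main(void)
{
	/* With P(promotion) = 1/4, height H covers roughly 4^H == 2^(2H) nodes */
	for (int height = 11; height <= 12; height++)
		printf("height %d => ~%llu requests\n",
		       height, 1ull << (2 * height));
	return 0;
}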

In the following patches, we introduce a new sort key to the scheduler,
a virtual deadline. This imposes a different structure on the tree.
Using a priority sort, we have very few priority levels active at any
time, most likely just the default priority, and so the rbtree degenerates
to a single element containing the list of all ready requests. The
deadlines, in contrast, are very sparse, and typically each request has a
unique deadline. Instead of being able to simply walk the list during
dequeue, with the deadline scheduler we have to iterate through the BST
on the critical submission path. Skiplists are vastly superior in this
instance due to the O(1) iteration during dequeue, with very similar
characteristics [on average] to the rbtree for insertion.

This means that by using skiplists we can introduce a sparse sort key
without degrading latency on the critical submission path.
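
To illustrate the O(1) dequeue claim, below is a minimal userspace sketch of
the same structure (illustrative names only, not the i915 code). The head
element of a skiplist is, by construction, also the first element on every
level it occupies, so removing it only advances the sentinel's next[]
pointers; there is no rebalancing step as there would be for rb_erase():

/* Minimal skiplist sketch: expected O(lgN) insert, O(1) pop of the head. */
#include <stdio.h>
#include <stdlib.h>

#define HEIGHT 8

struct node {
	int key;			/* e.g. priority, highest first */
	int level;			/* topmost level this node occupies */
	struct node *next[HEIGHT];
};

static struct node sentinel;

static int random_level(void)
{
	unsigned int r = rand() | 0x80000000u;	/* avoid ctz(0) */

	return __builtin_ctz(r) / 2;		/* P(level >= k) ~= 4^-k */
}

static void skiplist_init(void)
{
	for (int lvl = 0; lvl < HEIGHT; lvl++)
		sentinel.next[lvl] = &sentinel;
}

/* Expected O(lgN): record the rightmost node per level, then link in. */
static void skiplist_insert(struct node *n)
{
	struct node *update[HEIGHT], *p = &sentinel;

	for (int lvl = HEIGHT - 1; lvl >= 0; lvl--) {
		while (p->next[lvl] != &sentinel && p->next[lvl]->key >= n->key)
			p = p->next[lvl];	/* equal keys stay FIFO */
		update[lvl] = p;
	}

	n->level = random_level();
	if (n->level >= HEIGHT)
		n->level = HEIGHT - 1;
	for (int lvl = n->level; lvl >= 0; lvl--) {
		n->next[lvl] = update[lvl]->next[lvl];
		update[lvl]->next[lvl] = n;
	}
}

/* O(1): the head is first on every level it occupies, so just unlink it. */
static struct node *skiplist_pop_head(void)
{
	struct node *n = sentinel.next[0];

	if (n == &sentinel)
		return NULL;
	for (int lvl = n->level; lvl >= 0; lvl--)
		sentinel.next[lvl] = n->next[lvl];
	return n;
}

int main(void)
{
	skiplist_init();
	for (int i = 0; i < 8; i++) {
		struct node *n = calloc(1, sizeof(*n));

		n->key = rand() % 100;
		skiplist_insert(n);
	}
	for (struct node *n; (n = skiplist_pop_head()); free(n))
		printf("%d ", n->key);	/* drains in descending key order */
	printf("\n");
	return 0;
}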

As an example, in one simple case where we try to do lots of
semi-independent work without any priority management (gem_exec_parallel),
the lock hold times were:
[worst]        [total]    [avg]
 973.05     6301584.84     0.35 # plain rbtree
 559.82     5424915.25     0.33 # best rbtree with pruning
 208.21     3898784.09     0.24 # skiplist
  34.05     5784106.01     0.32 # rbtree without deadlines
  23.35     4152999.80     0.24 # skiplist without deadlines

Based on the skiplist implementation by Dr Con Kolivas for MuQSS.

References: https://en.wikipedia.org/wiki/Skip_list
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 .../drm/i915/gt/intel_execlists_submission.c  | 168 +++++-----
 .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  41 +--
 drivers/gpu/drm/i915/i915_priolist_types.h    |  64 +++-
 drivers/gpu/drm/i915/i915_scheduler.c         | 304 +++++++++++++-----
 drivers/gpu/drm/i915/i915_scheduler.h         |  16 +-
 drivers/gpu/drm/i915/i915_scheduler_types.h   |   2 +-
 .../drm/i915/selftests/i915_mock_selftests.h  |   1 +
 .../gpu/drm/i915/selftests/i915_scheduler.c   |  53 ++-
 8 files changed, 454 insertions(+), 195 deletions(-)

Comments

Tvrtko Ursulin Feb. 8, 2021, 12:29 p.m. UTC | #1
On 08/02/2021 10:52, Chris Wilson wrote:
> Replace the priolist rbtree with a skiplist. The crucial difference is
> that walking and removing the first element of a skiplist is O(1), but
> O(lgN) for an rbtree, as we need to rebalance on remove. This is a
> hindrance for submission latency as it occurs between picking a request
> for the priolist and submitting it to hardware, as well effectively
> tripling the number of O(lgN) operations required under the irqoff lock.
> This is critical to reducing the latency jitter with multiple clients.
> 
> The downsides to skiplists are that lookup/insertion is only
> probabilistically O(lgN) and there is a significant memory penalty to
> as each skip node is larger than the rbtree equivalent. Furthermore, we
> don't use dynamic arrays for the skiplist, so the allocation is fixed,
> and imposes an upper bound on the scalability wrt to the number of
> inflight requests.
> 
> In the following patches, we introduce a new sort key to the scheduler,
> a virtual deadline. This imposes a different structure to the tree.
> Using a priority sort, we have very few priority levels active at any
> time, most likely just the default priority and so the rbtree degenerates
> to a single elements containing the list of all ready requests. The
> deadlines in contrast are very sparse, and typically each request has a
> unique deadline. Instead of being able to simply walk the list during
> dequeue, with the deadline scheduler we have to iterate through the bst
> on the critical submission path. Skiplists are vastly superior in this
> instance due to the O(1) iteration during dequeue, with very similar
> characteristics [on average] to the rbtree for insertion.
> 
> This means that by using skiplists we can introduce a sparse sort key
> without degrading latency on the critical submission path.
> 
> As an example, one simple case where we try to do lots of
> semi-independent work without any priority management (gem_exec_parallel),
> the lock hold times were:
> [worst]        [total]    [avg]
>   973.05     6301584.84     0.35 # plain rbtree
>   559.82     5424915.25     0.33 # best rbtree with pruning
>   208.21     3898784.09     0.24 # skiplist
>    34.05     5784106.01     0.32 # rbtree without deadlines
>    23.35     4152999.80     0.24 # skiplist without deadlines
> 
> Based on the skiplist implementation by Dr Con Kolivas for MuQSS.
> 
> References: https://en.wikipedia.org/wiki/Skip_list
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   .../drm/i915/gt/intel_execlists_submission.c  | 168 +++++-----
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  41 +--
>   drivers/gpu/drm/i915/i915_priolist_types.h    |  64 +++-
>   drivers/gpu/drm/i915/i915_scheduler.c         | 304 +++++++++++++-----
>   drivers/gpu/drm/i915/i915_scheduler.h         |  16 +-
>   drivers/gpu/drm/i915/i915_scheduler_types.h   |   2 +-
>   .../drm/i915/selftests/i915_mock_selftests.h  |   1 +
>   .../gpu/drm/i915/selftests/i915_scheduler.c   |  53 ++-
>   8 files changed, 454 insertions(+), 195 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 78fda9b4f626..4a0258347c10 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -254,11 +254,6 @@ static void ring_set_paused(const struct intel_engine_cs *engine, int state)
>   		wmb();
>   }
>   
> -static struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
>   static int rq_prio(const struct i915_request *rq)
>   {
>   	return READ_ONCE(rq->sched.attr.priority);
> @@ -282,15 +277,27 @@ static int effective_prio(const struct i915_request *rq)
>   	return prio;
>   }
>   
> +static struct i915_request *first_request(const struct i915_sched *se)
> +{
> +	struct i915_priolist *pl = se->queue.sentinel.next[0];
> +
> +	if (pl == &se->queue.sentinel)
> +		return NULL;
> +
> +	return list_first_entry_or_null(&pl->requests,
> +					struct i915_request,
> +					sched.link);
> +}
> +
>   static int queue_prio(const struct i915_sched *se)
>   {
> -	struct rb_node *rb;
> +	struct i915_request *rq;
>   
> -	rb = rb_first_cached(&se->queue);
> -	if (!rb)
> +	rq = first_request(se);
> +	if (!rq)
>   		return INT_MIN;
>   
> -	return to_priolist(rb)->priority;
> +	return rq_prio(rq);
>   }
>   
>   static int virtual_prio(const struct intel_engine_execlists *el)
> @@ -300,7 +307,7 @@ static int virtual_prio(const struct intel_engine_execlists *el)
>   	return rb ? rb_entry(rb, struct ve_node, rb)->prio : INT_MIN;
>   }
>   
> -static bool need_preempt(const struct intel_engine_cs *engine,
> +static bool need_preempt(struct intel_engine_cs *engine,
>   			 const struct i915_request *rq)
>   {
>   	const struct i915_sched *se = &engine->sched;
> @@ -1144,7 +1151,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	struct i915_request **port = execlists->pending;
>   	struct i915_request ** const last_port = port + execlists->port_mask;
>   	struct i915_request *last, * const *active;
> +	struct i915_request *rq, *rn;
>   	struct virtual_engine *ve;
> +	struct i915_priolist *pl;
>   	struct rb_node *rb;
>   	bool submit = false;
>   
> @@ -1355,87 +1364,79 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   			break;
>   	}
>   
> -	while ((rb = rb_first_cached(&se->queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -		struct i915_request *rq, *rn;
> +	i915_sched_dequeue(se, pl, rq, rn) {
> +		bool merge = true;
>   
> -		priolist_for_each_request_consume(rq, rn, p) {
> -			bool merge = true;
> +		/*
> +		 * Can we combine this request with the current port?
> +		 * It has to be the same context/ringbuffer and not
> +		 * have any exceptions (e.g. GVT saying never to
> +		 * combine contexts).
> +		 *
> +		 * If we can combine the requests, we can execute both
> +		 * by updating the RING_TAIL to point to the end of the
> +		 * second request, and so we never need to tell the
> +		 * hardware about the first.
> +		 */
> +		if (last && !can_merge_rq(last, rq)) {
> +			/*
> +			 * If we are on the second port and cannot
> +			 * combine this request with the last, then we
> +			 * are done.
> +			 */
> +			if (port == last_port)
> +				goto done;
>   
>   			/*
> -			 * Can we combine this request with the current port?
> -			 * It has to be the same context/ringbuffer and not
> -			 * have any exceptions (e.g. GVT saying never to
> -			 * combine contexts).
> -			 *
> -			 * If we can combine the requests, we can execute both
> -			 * by updating the RING_TAIL to point to the end of the
> -			 * second request, and so we never need to tell the
> -			 * hardware about the first.
> +			 * We must not populate both ELSP[] with the
> +			 * same LRCA, i.e. we must submit 2 different
> +			 * contexts if we submit 2 ELSP.
>   			 */
> -			if (last && !can_merge_rq(last, rq)) {
> -				/*
> -				 * If we are on the second port and cannot
> -				 * combine this request with the last, then we
> -				 * are done.
> -				 */
> -				if (port == last_port)
> -					goto done;
> +			if (last->context == rq->context)
> +				goto done;
>   
> -				/*
> -				 * We must not populate both ELSP[] with the
> -				 * same LRCA, i.e. we must submit 2 different
> -				 * contexts if we submit 2 ELSP.
> -				 */
> -				if (last->context == rq->context)
> -					goto done;
> +			if (i915_request_has_sentinel(last))
> +				goto done;
>   
> -				if (i915_request_has_sentinel(last))
> -					goto done;
> +			/*
> +			 * We avoid submitting virtual requests into
> +			 * the secondary ports so that we can migrate
> +			 * the request immediately to another engine
> +			 * rather than wait for the primary request.
> +			 */
> +			if (rq->execution_mask != engine->mask)
> +				goto done;
>   
> -				/*
> -				 * We avoid submitting virtual requests into
> -				 * the secondary ports so that we can migrate
> -				 * the request immediately to another engine
> -				 * rather than wait for the primary request.
> -				 */
> -				if (rq->execution_mask != engine->mask)
> -					goto done;
> +			/*
> +			 * If GVT overrides us we only ever submit
> +			 * port[0], leaving port[1] empty. Note that we
> +			 * also have to be careful that we don't queue
> +			 * the same context (even though a different
> +			 * request) to the second port.
> +			 */
> +			if (ctx_single_port_submission(last->context) ||
> +			    ctx_single_port_submission(rq->context))
> +				goto done;
>   
> -				/*
> -				 * If GVT overrides us we only ever submit
> -				 * port[0], leaving port[1] empty. Note that we
> -				 * also have to be careful that we don't queue
> -				 * the same context (even though a different
> -				 * request) to the second port.
> -				 */
> -				if (ctx_single_port_submission(last->context) ||
> -				    ctx_single_port_submission(rq->context))
> -					goto done;
> -
> -				merge = false;
> -			}
> -
> -			if (__i915_request_submit(rq)) {
> -				if (!merge) {
> -					*port++ = i915_request_get(last);
> -					last = NULL;
> -				}
> -
> -				GEM_BUG_ON(last &&
> -					   !can_merge_ctx(last->context,
> -							  rq->context));
> -				GEM_BUG_ON(last &&
> -					   i915_seqno_passed(last->fence.seqno,
> -							     rq->fence.seqno));
> -
> -				submit = true;
> -				last = rq;
> -			}
> +			merge = false;
>   		}
>   
> -		rb_erase_cached(&p->node, &se->queue);
> -		i915_priolist_free(p);
> +		if (__i915_request_submit(rq)) {
> +			if (!merge) {
> +				*port++ = i915_request_get(last);
> +				last = NULL;
> +			}
> +
> +			GEM_BUG_ON(last &&
> +				   !can_merge_ctx(last->context,
> +						  rq->context));
> +			GEM_BUG_ON(last &&
> +				   i915_seqno_passed(last->fence.seqno,
> +						     rq->fence.seqno));
> +
> +			submit = true;
> +			last = rq;
> +		}
>   	}
>   done:
>   	*port++ = i915_request_get(last);
> @@ -1456,7 +1457,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	 * request triggering preemption on the next dequeue (or subsequent
>   	 * interrupt for secondary ports).
>   	 */
> -	execlists->queue_priority_hint = queue_prio(se);
> +	execlists->queue_priority_hint = pl->priority;
>   	spin_unlock(&se->lock);
>   
>   	/*
> @@ -2716,7 +2717,6 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>   	}
>   
>   	execlists->queue_priority_hint = INT_MIN;
> -	se->queue = RB_ROOT_CACHED;
>   
>   	GEM_BUG_ON(__tasklet_is_enabled(&se->tasklet));
>   	se->tasklet.callback = nop_submission_tasklet;
> @@ -3173,6 +3173,8 @@ static void virtual_context_exit(struct intel_context *ce)
>   
>   	for (n = 0; n < ve->num_siblings; n++)
>   		intel_engine_pm_put(ve->siblings[n]);
> +
> +	i915_sched_park(intel_engine_get_scheduler(&ve->base));
>   }
>   
>   static const struct intel_context_ops virtual_context_ops = {
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index d14b9db77df8..c16393df42a0 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -60,11 +60,6 @@
>   
>   #define GUC_REQUEST_SIZE 64 /* bytes */
>   
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
>   static struct guc_stage_desc *__get_stage_desc(struct intel_guc *guc, u32 id)
>   {
>   	struct guc_stage_desc *base = guc->stage_desc_pool_vaddr;
> @@ -186,9 +181,10 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   	struct i915_request **first = execlists->inflight;
>   	struct i915_request ** const last_port = first + execlists->port_mask;
>   	struct i915_request *last = first[0];
> +	struct i915_request *rq, *rn;
>   	struct i915_request **port;
> +	struct i915_priolist *pl;
>   	bool submit = false;
> -	struct rb_node *rb;
>   
>   	lockdep_assert_held(&se->lock);
>   
> @@ -205,32 +201,22 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   	 * event.
>   	 */
>   	port = first;
> -	while ((rb = rb_first_cached(&se->queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -		struct i915_request *rq, *rn;
> +	i915_sched_dequeue(se, pl, rq, rn) {
> +		if (last && rq->context != last->context) {
> +			if (port == last_port)
> +				goto done;
>   
> -		priolist_for_each_request_consume(rq, rn, p) {
> -			if (last && rq->context != last->context) {
> -				if (port == last_port)
> -					goto done;
> -
> -				*port = schedule_in(last,
> -						    port - execlists->inflight);
> -				port++;
> -			}
> -
> -			list_del_init(&rq->sched.link);
> -			__i915_request_submit(rq);
> -			submit = true;
> -			last = rq;
> +			*port = schedule_in(last, port - execlists->inflight);
> +			port++;
>   		}
>   
> -		rb_erase_cached(&p->node, &se->queue);
> -		i915_priolist_free(p);
> +		list_del_init(&rq->sched.link);
> +		__i915_request_submit(rq);
> +		submit = true;
> +		last = rq;
>   	}
>   done:
> -	execlists->queue_priority_hint =
> -		rb ? to_priolist(rb)->priority : INT_MIN;
> +	execlists->queue_priority_hint = pl->priority;
>   	if (submit) {
>   		*port = schedule_in(last, port - execlists->inflight);
>   		*++port = NULL;
> @@ -361,7 +347,6 @@ static void guc_reset_cancel(struct intel_engine_cs *engine)
>   	__i915_sched_cancel_queue(se);
>   
>   	execlists->queue_priority_hint = INT_MIN;
> -	se->queue = RB_ROOT_CACHED;
>   
>   	spin_unlock_irqrestore(&se->lock, flags);
>   	intel_engine_signal_breadcrumbs(engine);
> diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
> index bc2fa84f98a8..ee7482b9c813 100644
> --- a/drivers/gpu/drm/i915/i915_priolist_types.h
> +++ b/drivers/gpu/drm/i915/i915_priolist_types.h
> @@ -38,10 +38,72 @@ enum {
>   #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
>   #define I915_PRIORITY_BARRIER (I915_PRIORITY_UNPREEMPTABLE - 1)
>   
> +/*
> + * The slab returns power-of-two chunks of memory, so fill out the
> + * node to the next cacheline.
> + *
> + * We can estimate how many requests the skiplist will scale to based
> + * on its height:
> + *   11 =>  4 million requests
> + *   12 => 16 million requests
> + */
> +#ifdef CONFIG_64BIT
> +#define I915_PRIOLIST_HEIGHT 12
> +#else
> +#define I915_PRIOLIST_HEIGHT 11
> +#endif
> +
> +/*
> + * i915_priolist forms a skiplist. The skiplist is built in layers,
> + * starting at the base [0] is a singly linked list of all i915_priolist.
> + * Each higher layer contains a fraction of the i915_priolist from the
> + * previous layer:
> + *
> + * S[0] 0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF S
> + * E[1] >1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F E
> + * N[2] -->3-->7-->B-->F-->3-->7-->B-->F-->3-->7-->B-->F-->3-->7-->B-->F N
> + * T[3] ------>7------>F-------7------>F------>7------>F------>7------>F T
> + * I[4] -------------->F-------------->F-------------->F-------------->F I
> + * N[5] ------------------------------>F------------------------------>F N
> + * E[6] ------------------------------>F-------------------------------> E
> + * L[7] ---------------------------------------------------------------> L
> + *
> + * To iterate through all active i915_priolist, we only need to follow
> + * the chain in i915_priolist.next[0] (see for_each_priolist()).
> + *
> + * To quickly find a specific key (or insert point), we can perform a binary
> + * search by starting at the highest level and following the linked list
> + * at that level until we either find the node, or have gone passed the key.
> + * Then we descend a level, and start walking the list again starting from
> + * the current position, until eventually we find our key, or we run out of
> + * levels.
> + *
> + * https://en.wikipedia.org/wiki/Skip_list
> + */
>   struct i915_priolist {
>   	struct list_head requests;
> -	struct rb_node node;
>   	int priority;
> +
> +	int level;
> +	struct i915_priolist *next[I915_PRIOLIST_HEIGHT];
>   };
>   
> +struct i915_priolist_root {
> +	struct i915_priolist sentinel;
> +	u32 prng;
> +};
> +
> +#define i915_priolist_is_empty(root) ((root)->sentinel.level < 0)
> +
> +#define for_each_priolist(p, root) \
> +	for ((p) = (root)->sentinel.next[0]; \
> +	     (p) != &(root)->sentinel; \
> +	     (p) = (p)->next[0])
> +
> +#define priolist_for_each_request(it, plist) \
> +	list_for_each_entry(it, &(plist)->requests, sched.link)
> +
> +#define priolist_for_each_request_safe(it, n, plist) \
> +	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> +
>   #endif /* _I915_PRIOLIST_TYPES_H_ */
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index 312e1538d001..518eac67959e 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -4,7 +4,9 @@
>    * Copyright © 2018 Intel Corporation
>    */
>   
> +#include <linux/bitops.h>
>   #include <linux/mutex.h>
> +#include <linux/prandom.h>
>   
>   #include "gt/intel_ring.h"
>   #include "gt/intel_lrc_reg.h"
> @@ -168,6 +170,16 @@ void i915_sched_select_mode(struct i915_sched *se, enum i915_sched_mode mode)
>   	}
>   }
>   
> +static void init_priolist(struct i915_priolist_root *const root)
> +{
> +	struct i915_priolist *pl = &root->sentinel;
> +
> +	memset_p((void **)pl->next, pl, ARRAY_SIZE(pl->next));
> +	pl->requests.prev = NULL;
> +	pl->priority = INT_MIN;
> +	pl->level = -1;
> +}
> +
>   void i915_sched_init(struct i915_sched *se,
>   		     struct device *dev,
>   		     const char *name,
> @@ -183,9 +195,9 @@ void i915_sched_init(struct i915_sched *se,
>   
>   	se->mask = mask;
>   
> +	init_priolist(&se->queue);
>   	INIT_LIST_HEAD(&se->requests);
>   	INIT_LIST_HEAD(&se->hold);
> -	se->queue = RB_ROOT_CACHED;
>   
>   	init_ipi(&se->ipi);
>   
> @@ -194,8 +206,60 @@ void i915_sched_init(struct i915_sched *se,
>   	se->revoke_context = i915_sched_default_revoke_context;
>   }
>   
> +__maybe_unused static bool priolist_idle(struct i915_priolist_root *root)
> +{
> +	struct i915_priolist *pl = &root->sentinel;
> +	int lvl;
> +
> +	for (lvl = 0; lvl < ARRAY_SIZE(pl->next); lvl++) {
> +		if (pl->next[lvl] != pl) {
> +			GEM_TRACE_ERR("root[%d] is not empty\n", lvl);
> +			return false;
> +		}
> +	}
> +
> +	if (pl->level != -1) {
> +		GEM_TRACE_ERR("root is not clear: %d\n", pl->level);
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static bool pl_empty(struct list_head *st)
> +{
> +	return !st->prev;
> +}
> +
> +static void pl_push(struct i915_priolist *pl, struct list_head *st)
> +{
> +	/* Keep list_empty(&pl->requests) valid for concurrent readers */
> +	pl->requests.prev = st->prev;
> +	st->prev = &pl->requests;
> +	GEM_BUG_ON(pl_empty(st));
> +}
> +
> +static struct i915_priolist *pl_pop(struct list_head *st)
> +{
> +	struct i915_priolist *pl;
> +
> +	GEM_BUG_ON(pl_empty(st));
> +	pl = container_of(st->prev, typeof(*pl), requests);
> +	st->prev = pl->requests.prev;
> +
> +	return pl;
> +}
> +
>   void i915_sched_park(struct i915_sched *se)
>   {
> +	struct i915_priolist_root *root = &se->queue;
> +	struct list_head *list = &root->sentinel.requests;
> +
> +	GEM_BUG_ON(!priolist_idle(root));
> +
> +	while (!pl_empty(list))
> +		kmem_cache_free(global.slab_priorities, pl_pop(list));
> +
>   	GEM_BUG_ON(!i915_sched_is_idle(se));
>   	se->no_priolist = false;
>   }
> @@ -251,70 +315,71 @@ static inline bool node_signaled(const struct i915_sched_node *node)
>   	return i915_request_completed(node_to_request(node));
>   }
>   
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +static inline unsigned int random_level(struct i915_priolist_root *root)
>   {
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
> -static void assert_priolists(struct i915_sched * const se)
> -{
> -	struct rb_node *rb;
> -	long last_prio;
> -
> -	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> -		return;
> -
> -	GEM_BUG_ON(rb_first_cached(&se->queue) !=
> -		   rb_first(&se->queue.rb_root));
> -
> -	last_prio = INT_MAX;
> -	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> -		const struct i915_priolist *p = to_priolist(rb);
> -
> -		GEM_BUG_ON(p->priority > last_prio);
> -		last_prio = p->priority;
> -	}
> +	/*
> +	 * Given a uniform distribution of random numbers over the u32, then
> +	 * the probability each bit being unset is P=0.5. The probability of a
> +	 * successive sequence of bits being unset is P(n) = 0.5^n [n > 0].
> +	 *   P(level:1) = 0.5
> +	 *   P(level:2) = 0.25
> +	 *   P(level:3) = 0.125
> +	 *   P(level:4) = 0.0625
> +	 *   ...
> +	 * So we can use ffs() on a good random number generator to pick our
> +	 * level. We divide by two to reduce the probability of choosing a
> +	 * level to .25, as the cost of descending a level is the same as
> +	 * following an extra link in the chain at that level (so we can
> +	 * pack more nodes into fewer levels without incurring extra cost,
> +	 * and allow scaling to higher volumes of requests without expanding
> +	 * the height of the skiplist).
> +	 */
> +	root->prng = next_pseudo_random32(root->prng);
> +	return  __ffs(root->prng) / 2;
>   }
>   
>   static struct list_head *
>   lookup_priolist(struct i915_sched *se, int prio)
>   {
> -	struct i915_priolist *p;
> -	struct rb_node **parent, *rb;
> -	bool first = true;
> +	struct i915_priolist *update[I915_PRIOLIST_HEIGHT];
> +	struct i915_priolist_root *const root = &se->queue;
> +	struct i915_priolist *pl, *tmp;
> +	int lvl;
>   
>   	lockdep_assert_held(&se->lock);
> -	assert_priolists(se);
> -
>   	if (unlikely(se->no_priolist))
>   		prio = I915_PRIORITY_NORMAL;
>   
> +	for_each_priolist(pl, root) { /* recycle any empty elements before us */
> +		if (pl->priority <= prio || !list_empty(&pl->requests))
> +			break;
> +
> +		__i915_sched_dequeue_next(se);
> +	}
> +
>   find_priolist:
> -	/* most positive priority is scheduled first, equal priorities fifo */
> -	rb = NULL;
> -	parent = &se->queue.rb_root.rb_node;
> -	while (*parent) {
> -		rb = *parent;
> -		p = to_priolist(rb);
> -		if (prio > p->priority) {
> -			parent = &rb->rb_left;
> -		} else if (prio < p->priority) {
> -			parent = &rb->rb_right;
> -			first = false;
> -		} else {
> -			return &p->requests;
> -		}
> +	pl = &root->sentinel;
> +	lvl = pl->level;
> +	while (lvl >= 0) {
> +		while (tmp = pl->next[lvl], tmp->priority >= prio)
> +			pl = tmp;
> +		if (pl->priority == prio)
> +			goto out;
> +		update[lvl--] = pl;
>   	}
>   
>   	if (prio == I915_PRIORITY_NORMAL) {
> -		p = &se->default_priolist;
> +		pl = &se->default_priolist;
> +	} else if (!pl_empty(&root->sentinel.requests)) {
> +		pl = pl_pop(&root->sentinel.requests);
>   	} else {
> -		p = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
> +		pl = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
>   		/* Convert an allocation failure to a priority bump */
> -		if (unlikely(!p)) {
> +		if (unlikely(!pl)) {
>   			prio = I915_PRIORITY_NORMAL; /* recurses just once */
>   
> -			/* To maintain ordering with all rendering, after an
> +			/*
> +			 * To maintain ordering with all rendering, after an
>   			 * allocation failure we have to disable all scheduling.
>   			 * Requests will then be executed in fifo, and schedule
>   			 * will ensure that dependencies are emitted in fifo.
> @@ -327,18 +392,123 @@ lookup_priolist(struct i915_sched *se, int prio)
>   		}
>   	}
>   
> -	p->priority = prio;
> -	INIT_LIST_HEAD(&p->requests);
> +	pl->priority = prio;
> +	INIT_LIST_HEAD(&pl->requests);
>   
> -	rb_link_node(&p->node, rb, parent);
> -	rb_insert_color_cached(&p->node, &se->queue, first);
> +	lvl = random_level(root);
> +	if (lvl > root->sentinel.level) {
> +		if (root->sentinel.level < I915_PRIOLIST_HEIGHT - 1) {
> +			lvl = ++root->sentinel.level;
> +			update[lvl] = &root->sentinel;
> +		} else {
> +			lvl = I915_PRIOLIST_HEIGHT - 1;
> +		}
> +	}
> +	GEM_BUG_ON(lvl < 0);
> +	GEM_BUG_ON(lvl >= ARRAY_SIZE(pl->next));
>   
> -	return &p->requests;
> +	pl->level = lvl;
> +	do {
> +		tmp = update[lvl];
> +		pl->next[lvl] = tmp->next[lvl];
> +		tmp->next[lvl] = pl;
> +	} while (--lvl >= 0);
> +
> +	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
> +		struct i915_priolist *chk;
> +
> +		chk = &root->sentinel;
> +		lvl = chk->level;
> +		do {
> +			while (tmp = chk->next[lvl], tmp->priority >= prio)
> +				chk = tmp;
> +		} while (--lvl >= 0);
> +
> +		GEM_BUG_ON(chk != pl);
> +	}
> +
> +out:
> +	GEM_BUG_ON(pl == &root->sentinel);
> +	return &pl->requests;
>   }
>   
> -void __i915_priolist_free(struct i915_priolist *p)
> +static void __remove_priolist(struct i915_sched *se, struct list_head *plist)
>   {
> -	kmem_cache_free(global.slab_priorities, p);
> +	struct i915_priolist_root *root = &se->queue;
> +	struct i915_priolist *pl, *tmp;
> +	struct i915_priolist *old =
> +		container_of(plist, struct i915_priolist, requests);
> +	int prio = old->priority;
> +	int lvl;
> +
> +	lockdep_assert_held(&se->lock);
> +	GEM_BUG_ON(!list_empty(plist));
> +
> +	pl = &root->sentinel;
> +	lvl = pl->level;
> +	GEM_BUG_ON(lvl < 0);
> +
> +	if (prio != I915_PRIORITY_NORMAL)
> +		pl_push(old, &pl->requests);
> +
> +	do {
> +		while (tmp = pl->next[lvl], tmp->priority > prio)
> +			pl = tmp;
> +		if (lvl <= old->level) {
> +			pl->next[lvl] = old->next[lvl];
> +			if (pl == &root->sentinel && old->next[lvl] == pl) {
> +				GEM_BUG_ON(pl->level != lvl);
> +				pl->level--;
> +			}
> +		}
> +	} while (--lvl >= 0);
> +	GEM_BUG_ON(tmp != old);
> +}
> +
> +static void remove_from_priolist(struct i915_sched *se,
> +				 struct i915_request *rq,
> +				 struct list_head *list,
> +				 bool tail)
> +{
> +	struct list_head *prev = rq->sched.link.prev;

This depends on rq being at the head of its list?

> +
> +	GEM_BUG_ON(!i915_request_in_priority_queue(rq));
> +
> +	__list_del_entry(&rq->sched.link);
> +	if (tail)
> +		list_add_tail(&rq->sched.link, list);
> +	else
> +		list_add(&rq->sched.link, list);

So it is more a move than a remove(_from_priolist)?

> +
> +	/* If we just removed the last element in the old plist, delete it */
> +	if (list_empty(prev))
> +		__remove_priolist(se, prev);
> +}
> +
> +struct i915_priolist *__i915_sched_dequeue_next(struct i915_sched *se)
> +{
> +	struct i915_priolist * const s = &se->queue.sentinel;
> +	struct i915_priolist *pl = s->next[0];
> +	int lvl;
> +
> +	GEM_BUG_ON(!list_empty(&pl->requests));

Lost as to why pl->requests has to be empty at this point. Considering:

+#define i915_sched_dequeue(se, pl, rq, rn) \
+	for ((pl) = (se)->queue.sentinel.next[0]; \
+	     (pl) != &(se)->queue.sentinel; \
+	     (pl) = __i915_sched_dequeue_next(se)) \
+		priolist_for_each_request_safe(rq, rn, pl)
+

I also don't understand what it would de-queue. A whole priolist worth of
requests at a time? But it can't be empty to dequeue something. And who
puts any unconsumed requests back somewhere in this case.

Regards,

Tvrtko

> +	GEM_BUG_ON(pl == s);
> +
> +	/* Keep pl->next[0] valid for for_each_priolist iteration */
> +	if (pl->priority != I915_PRIORITY_NORMAL)
> +		pl_push(pl, &s->requests);
> +
> +	lvl = pl->level;
> +	GEM_BUG_ON(lvl < 0);
> +	do {
> +		s->next[lvl] = pl->next[lvl];
> +		if (pl->next[lvl] == s) {
> +			GEM_BUG_ON(s->level != lvl);
> +			s->level--;
> +		}
> +	} while (--lvl >= 0);
> +
> +	return pl->next[0];
>   }
>   
>   static struct i915_request *
> @@ -491,7 +661,7 @@ static void __i915_request_set_priority(struct i915_request *rq, int prio)
>   
>   		GEM_BUG_ON(rq->engine != engine);
>   		if (i915_request_in_priority_queue(rq))
> -			list_move_tail(&rq->sched.link, plist);
> +			remove_from_priolist(se, rq, plist, true);
>   
>   		/* Defer (tasklet) submission until after all updates. */
>   		kick_submission(engine, rq, prio);
> @@ -627,8 +797,7 @@ void __i915_sched_defer_request(struct intel_engine_cs *engine,
>   
>   		/* Note list is reversed for waiters wrt signal hierarchy */
>   		GEM_BUG_ON(rq->engine != engine);
> -		GEM_BUG_ON(!i915_request_in_priority_queue(rq));
> -		list_move(&rq->sched.link, &dfs);
> +		remove_from_priolist(se, rq, &dfs, false);
>   
>   		/* Track our visit, and prevent duplicate processing */
>   		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
> @@ -927,7 +1096,7 @@ void i915_sched_resume_request(struct intel_engine_cs *engine,
>   void __i915_sched_cancel_queue(struct i915_sched *se)
>   {
>   	struct i915_request *rq, *rn;
> -	struct rb_node *rb;
> +	struct i915_priolist *pl;
>   
>   	lockdep_assert_held(&se->lock);
>   
> @@ -936,16 +1105,9 @@ void __i915_sched_cancel_queue(struct i915_sched *se)
>   		i915_request_put(i915_request_mark_eio(rq));
>   
>   	/* Flush the queued requests to the timeline list (for retiring). */
> -	while ((rb = rb_first_cached(&se->queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -
> -		priolist_for_each_request_consume(rq, rn, p) {
> -			i915_request_put(i915_request_mark_eio(rq));
> -			__i915_request_submit(rq);
> -		}
> -
> -		rb_erase_cached(&p->node, &se->queue);
> -		i915_priolist_free(p);
> +	i915_sched_dequeue(se, pl, rq, rn) {
> +		i915_request_put(i915_request_mark_eio(rq));
> +		__i915_request_submit(rq);
>   	}
>   	GEM_BUG_ON(!i915_sched_is_idle(se));
>   
> @@ -1225,9 +1387,9 @@ void i915_sched_show(struct drm_printer *m,
>   		     unsigned int max)
>   {
>   	const struct i915_request *rq, *last;
> +	struct i915_priolist *pl;
>   	unsigned long flags;
>   	unsigned int count;
> -	struct rb_node *rb;
>   
>   	rcu_read_lock();
>   	spin_lock_irqsave(&se->lock, flags);
> @@ -1282,10 +1444,8 @@ void i915_sched_show(struct drm_printer *m,
>   
>   	last = NULL;
>   	count = 0;
> -	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> -
> -		priolist_for_each_request(rq, p) {
> +	for_each_priolist(pl, &se->queue) {
> +		priolist_for_each_request(rq, pl) {
>   			if (count++ < max - 1)
>   				show_request(m, rq, "\t", 0);
>   			else
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> index fe392109b112..872d221f6ba7 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> @@ -24,12 +24,6 @@ struct intel_engine_cs;
>   		  ##__VA_ARGS__);					\
>   } while (0)
>   
> -#define priolist_for_each_request(it, plist) \
> -	list_for_each_entry(it, &(plist)->requests, sched.link)
> -
> -#define priolist_for_each_request_consume(it, n, plist) \
> -	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> -
>   void i915_sched_node_init(struct i915_sched_node *node);
>   void i915_sched_node_reinit(struct i915_sched_node *node);
>   
> @@ -100,7 +94,7 @@ static inline void i915_priolist_free(struct i915_priolist *p)
>   
>   static inline bool i915_sched_is_idle(const struct i915_sched *se)
>   {
> -	return RB_EMPTY_ROOT(&se->queue.rb_root);
> +	return i915_priolist_is_empty(&se->queue);
>   }
>   
>   static inline bool
> @@ -168,6 +162,14 @@ i915_sched_get_active_request(const struct i915_sched *se)
>   	return NULL;
>   }
>   
> +/* Walk the scheduler queue of requests (in submission order) and remove them */
> +struct i915_priolist *__i915_sched_dequeue_next(struct i915_sched *se);
> +#define i915_sched_dequeue(se, pl, rq, rn) \
> +	for ((pl) = (se)->queue.sentinel.next[0]; \
> +	     (pl) != &(se)->queue.sentinel; \
> +	     (pl) = __i915_sched_dequeue_next(se)) \
> +		priolist_for_each_request_safe(rq, rn, pl)
> +
>   void i915_request_show_with_schedule(struct drm_printer *m,
>   				     const struct i915_request *rq,
>   				     const char *prefix,
> diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h
> index 5ca2dc1b4fb5..bc668f375097 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler_types.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler_types.h
> @@ -115,7 +115,7 @@ struct i915_sched {
>   	 * @queue is only used to transfer requests from the scheduler
>   	 * frontend to the back.
>   	 */
> -	struct rb_root_cached queue;
> +	struct i915_priolist_root queue;
>   
>   	/**
>   	 * @tasklet: softirq tasklet for bottom half
> diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> index 3db34d3eea58..946c93441c1f 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> @@ -25,6 +25,7 @@ selftest(ring, intel_ring_mock_selftests)
>   selftest(engine, intel_engine_cs_mock_selftests)
>   selftest(timelines, intel_timeline_mock_selftests)
>   selftest(requests, i915_request_mock_selftests)
> +selftest(scheduler, i915_scheduler_mock_selftests)
>   selftest(objects, i915_gem_object_mock_selftests)
>   selftest(phys, i915_gem_phys_mock_selftests)
>   selftest(dmabuf, i915_gem_dmabuf_mock_selftests)
> diff --git a/drivers/gpu/drm/i915/selftests/i915_scheduler.c b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> index f54bdbeaa48b..2bb2d3d07d06 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> @@ -12,6 +12,54 @@
>   #include "selftests/igt_spinner.h"
>   #include "selftests/i915_random.h"
>   
> +static int mock_skiplist_levels(void *dummy)
> +{
> +	struct i915_priolist_root root = {};
> +	struct i915_priolist *pl = &root.sentinel;
> +	IGT_TIMEOUT(end_time);
> +	unsigned long total;
> +	int count, lvl;
> +
> +	total = 0;
> +	do {
> +		for (count = 0; count < 16384; count++) {
> +			lvl = random_level(&root);
> +			if (lvl > pl->level) {
> +				if (lvl < I915_PRIOLIST_HEIGHT - 1)
> +					lvl = ++pl->level;
> +				else
> +					lvl = I915_PRIOLIST_HEIGHT - 1;
> +			}
> +
> +			pl->next[lvl] = ptr_inc(pl->next[lvl]);
> +		}
> +		total += count;
> +	} while (!__igt_timeout(end_time, NULL));
> +
> +	pr_info("Total %9lu\n", total);
> +	for (lvl = 0; lvl <= pl->level; lvl++) {
> +		int x = ilog2((unsigned long)pl->next[lvl]);
> +		char row[80];
> +
> +		memset(row, '*', x);
> +		row[x] = '\0';
> +
> +		pr_info(" [%2d] %9lu %s\n",
> +			lvl, (unsigned long)pl->next[lvl], row);
> +	}
> +
> +	return 0;
> +}
> +
> +int i915_scheduler_mock_selftests(void)
> +{
> +	static const struct i915_subtest tests[] = {
> +		SUBTEST(mock_skiplist_levels),
> +	};
> +
> +	return i915_subtests(tests, NULL);
> +}
> +
>   static void scheduling_disable(struct intel_engine_cs *engine)
>   {
>   	engine->props.preempt_timeout_ms = 0;
> @@ -80,9 +128,9 @@ static int all_engines(struct drm_i915_private *i915,
>   static bool check_context_order(struct i915_sched *se)
>   {
>   	u64 last_seqno, last_context;
> +	struct i915_priolist *p;
>   	unsigned long count;
>   	bool result = false;
> -	struct rb_node *rb;
>   	int last_prio;
>   
>   	/* We expect the execution order to follow ascending fence-context */
> @@ -92,8 +140,7 @@ static bool check_context_order(struct i915_sched *se)
>   	last_context = 0;
>   	last_seqno = 0;
>   	last_prio = 0;
> -	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +	for_each_priolist(p, &se->queue) {
>   		struct i915_request *rq;
>   
>   		priolist_for_each_request(rq, p) {
>
Chris Wilson Feb. 8, 2021, 12:46 p.m. UTC | #2
Quoting Tvrtko Ursulin (2021-02-08 12:29:14)
> 
> On 08/02/2021 10:52, Chris Wilson wrote:
> > +static void remove_from_priolist(struct i915_sched *se,
> > +                              struct i915_request *rq,
> > +                              struct list_head *list,
> > +                              bool tail)
> > +{
> > +     struct list_head *prev = rq->sched.link.prev;
> 
> This depends on rq being at the head of it's list?

It doesn't depend on that. We are testing whether the list is singular,
that is, whether removing this request from i915_priolist.requests leaves
that list empty, so that the i915_priolist can be removed from the
skiplist.
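
A small standalone sketch of that idiom, using a stripped-down list_head
work-alike rather than the kernel's (illustrative only): prev is sampled
before the deletion, so list_empty(prev) afterwards can only be true when
prev is the i915_priolist.requests head itself and rq was its sole entry;
if rq had a neighbour, prev still links to that survivor.

#include <stdio.h>

struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *head)
{
	head->prev = head->next = head;
}

static void list_add_tail(struct list_head *entry, struct list_head *head)
{
	entry->prev = head->prev;
	entry->next = head;
	head->prev->next = entry;
	head->prev = entry;
}

static void __list_del_entry(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
}

static int list_empty(const struct list_head *head)
{
	return head->next == head;
}

int main(void)
{
	struct list_head plist, rq0, rq1;
	struct list_head *prev;

	/* Singular list: prev is the head, which is empty after the del. */
	list_init(&plist);
	list_add_tail(&rq0, &plist);
	prev = rq0.prev;
	__list_del_entry(&rq0);
	printf("singular:     list_empty(prev) = %d\n", list_empty(prev));

	/* Two entries: prev is the surviving request, so never "empty". */
	list_init(&plist);
	list_add_tail(&rq0, &plist);
	list_add_tail(&rq1, &plist);
	prev = rq1.prev;			/* == &rq0, not &plist */
	__list_del_entry(&rq1);
	printf("non-singular: list_empty(prev) = %d\n", list_empty(prev));

	return 0;
}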

> > +
> > +     GEM_BUG_ON(!i915_request_in_priority_queue(rq));
> > +
> > +     __list_del_entry(&rq->sched.link);
> > +     if (tail)
> > +             list_add_tail(&rq->sched.link, list);
> > +     else
> > +             list_add(&rq->sched.link, list);
> 
> So it is more move than remove(_from_priolist) ?

Yes, we can quite happily just keep the list_move(), except we then end
up with lots of empty levels. At first I thought the walk through those
(during dequeue) would be cheaper than removing. The max lock hold time
strongly favours removal as we move requests around (which will happen
in dribs and drabs) over doing a bulk remove at dequeue.

> > +     /* If we just removed the last element in the old plist, delete it */
> > +     if (list_empty(prev))
> > +             __remove_priolist(se, prev);
> > +}
> > +
> > +struct i915_priolist *__i915_sched_dequeue_next(struct i915_sched *se)
> > +{
> > +     struct i915_priolist * const s = &se->queue.sentinel;
> > +     struct i915_priolist *pl = s->next[0];
> > +     int lvl;
> > +
> > +     GEM_BUG_ON(!list_empty(&pl->requests));
> 
> Lost as to why pl->requests has to be empty at this point. Considering:
> 
> +#define i915_sched_dequeue(se, pl, rq, rn) \
> +       for ((pl) = (se)->queue.sentinel.next[0]; \
> +            (pl) != &(se)->queue.sentinel; \
> +            (pl) = __i915_sched_dequeue_next(se)) \
> +               priolist_for_each_request_safe(rq, rn, pl)
> +
> 
> I also don't understand what it would de-queue. Whole priolist woth of 
> requests at a time? But it can't be empty to dequeue something. And who 
> puts any unconsumed requests back on somewhere in this case.

It's a double for-loop. I think the flattening of the logic is worth it.

During dequeue, we always move the very first request onto the next list
(i.e. i915_sched.active). Then when we have finished with all the
requests in one priority level, we move onto the next i915_priolist
(calling __i915_sched_dequeue_next).

So in __i915_sched_dequeue_next, we are always dealing with an empty
i915_priolist and want to advance the start of the skiplist to the next.

I was thinking that, in order to hide the double for-loop, we could
handle the non-empty i915_priolist case by causing it to break out of the
outer loop. Then we could get rid of the goto done.
-Chris
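
For readers following the thread: expanding i915_sched_dequeue() together
with priolist_for_each_request_safe() (both quoted above) gives roughly the
shape below. This is a paraphrase for reading, not a literal preprocessor
expansion, and not compilable on its own:

for (pl = se->queue.sentinel.next[0];
     pl != &se->queue.sentinel;
     /*
      * The "increment" only runs once the inner loop has finished, i.e.
      * once the caller's body has submitted or moved away every request
      * on pl. That is why __i915_sched_dequeue_next() can assert
      * list_empty(&pl->requests) before unlinking pl from the skiplist
      * and returning the next i915_priolist.
      */
     pl = __i915_sched_dequeue_next(se))
	list_for_each_entry_safe(rq, rn, &pl->requests, sched.link) {
		/* body supplied at the i915_sched_dequeue() call site */
	}
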
Tvrtko Ursulin Feb. 8, 2021, 3:10 p.m. UTC | #3
On 08/02/2021 12:46, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2021-02-08 12:29:14)
>>
>> On 08/02/2021 10:52, Chris Wilson wrote:
>>> +static void remove_from_priolist(struct i915_sched *se,
>>> +                              struct i915_request *rq,
>>> +                              struct list_head *list,
>>> +                              bool tail)
>>> +{
>>> +     struct list_head *prev = rq->sched.link.prev;
>>
>> This depends on rq being at the head of it's list?
> 
> Not depends. We are testing if the list is singular, that is by removing
> this request from the i915_priolist.requests that list becomes empty,
> and so the i915_priolist can be removed from the skiplist.

Ah so obvious now, thanks.

> 
>>> +
>>> +     GEM_BUG_ON(!i915_request_in_priority_queue(rq));
>>> +
>>> +     __list_del_entry(&rq->sched.link);
>>> +     if (tail)
>>> +             list_add_tail(&rq->sched.link, list);
>>> +     else
>>> +             list_add(&rq->sched.link, list);
>>
>> So it is more move than remove(_from_priolist) ?
> 
> Yes, we can quite happily just keep the list_move(), except we then end
> up with lots of empty levels. At first I thought the walk through those
> (during dequeue) would be cheaper than removing. The max lock holdtime
> strongly favours the removal as we move requests around (which will
> happen in dribs-and-drabs) over doing a bulk remove at dequeue.

Give it a name that reflects it is a move, like move_to_priolist?

> 
>>> +     /* If we just removed the last element in the old plist, delete it */
>>> +     if (list_empty(prev))
>>> +             __remove_priolist(se, prev);
>>> +}
>>> +
>>> +struct i915_priolist *__i915_sched_dequeue_next(struct i915_sched *se)
>>> +{
>>> +     struct i915_priolist * const s = &se->queue.sentinel;
>>> +     struct i915_priolist *pl = s->next[0];
>>> +     int lvl;
>>> +
>>> +     GEM_BUG_ON(!list_empty(&pl->requests));
>>
>> Lost as to why pl->requests has to be empty at this point. Considering:
>>
>> +#define i915_sched_dequeue(se, pl, rq, rn) \
>> +       for ((pl) = (se)->queue.sentinel.next[0]; \
>> +            (pl) != &(se)->queue.sentinel; \
>> +            (pl) = __i915_sched_dequeue_next(se)) \
>> +               priolist_for_each_request_safe(rq, rn, pl)
>> +
>>
>> I also don't understand what it would de-queue. Whole priolist woth of
>> requests at a time? But it can't be empty to dequeue something. And who
>> puts any unconsumed requests back on somewhere in this case.
> 
> It's a double for-loop. I think the flattening of the logic is worth it.
> 
> During dequeue, we always move the very first request onto the next list
> (i.e. i915_sched.active). Then when we have finished with all the
> requests in one priority level, we move onto the next i915_priolist
> (calling __i915_sched_dequeue_next).
> 
> So in __i915_sched_dequeue_next, we are always dealing with an empty
> i915_priolist and want to advance the start of the skiplist to the next.

Ah yes, __i915_sched_dequeue_next is only reached if there isn't a "goto
done" from within the inner loop (priolist_for_each_request_safe). Well,
it's a bit fragile if someone does a break one day. But I guess the
GEM_BUG_ON will be hit then, so it's okay.

Right, I have some more questions for which I'll start a new sub-thread.

Regards,

Tvrtko

> 
> I was thinking that in order to hide the double for-loop, we could
> handle the non-empty i915_priolist case causing it to break out of the
> outer loop. So we could get rid of the goto done.
> -Chris
>
Tvrtko Ursulin Feb. 8, 2021, 3:23 p.m. UTC | #4
On 08/02/2021 10:52, Chris Wilson wrote:
> Replace the priolist rbtree with a skiplist. The crucial difference is
> that walking and removing the first element of a skiplist is O(1), but
> O(lgN) for an rbtree, as we need to rebalance on remove. This is a
> hindrance for submission latency as it occurs between picking a request
> for the priolist and submitting it to hardware, as well effectively
> tripling the number of O(lgN) operations required under the irqoff lock.
> This is critical to reducing the latency jitter with multiple clients.
> 
> The downsides to skiplists are that lookup/insertion is only
> probabilistically O(lgN) and there is a significant memory penalty to
> as each skip node is larger than the rbtree equivalent. Furthermore, we
> don't use dynamic arrays for the skiplist, so the allocation is fixed,
> and imposes an upper bound on the scalability wrt to the number of
> inflight requests.
> 
> In the following patches, we introduce a new sort key to the scheduler,
> a virtual deadline. This imposes a different structure to the tree.
> Using a priority sort, we have very few priority levels active at any
> time, most likely just the default priority and so the rbtree degenerates
> to a single elements containing the list of all ready requests. The
> deadlines in contrast are very sparse, and typically each request has a
> unique deadline. Instead of being able to simply walk the list during
> dequeue, with the deadline scheduler we have to iterate through the bst
> on the critical submission path. Skiplists are vastly superior in this
> instance due to the O(1) iteration during dequeue, with very similar
> characteristics [on average] to the rbtree for insertion.
> 
> This means that by using skiplists we can introduce a sparse sort key
> without degrading latency on the critical submission path.
> 
> As an example, one simple case where we try to do lots of
> semi-independent work without any priority management (gem_exec_parallel),
> the lock hold times were:
> [worst]        [total]    [avg]
>   973.05     6301584.84     0.35 # plain rbtree
>   559.82     5424915.25     0.33 # best rbtree with pruning
>   208.21     3898784.09     0.24 # skiplist
>    34.05     5784106.01     0.32 # rbtree without deadlines
>    23.35     4152999.80     0.24 # skiplist without deadlines
> 
> Based on the skiplist implementation by Dr Con Kolivas for MuQSS.
> 
> References: https://en.wikipedia.org/wiki/Skip_list
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   .../drm/i915/gt/intel_execlists_submission.c  | 168 +++++-----
>   .../gpu/drm/i915/gt/uc/intel_guc_submission.c |  41 +--
>   drivers/gpu/drm/i915/i915_priolist_types.h    |  64 +++-
>   drivers/gpu/drm/i915/i915_scheduler.c         | 304 +++++++++++++-----
>   drivers/gpu/drm/i915/i915_scheduler.h         |  16 +-
>   drivers/gpu/drm/i915/i915_scheduler_types.h   |   2 +-
>   .../drm/i915/selftests/i915_mock_selftests.h  |   1 +
>   .../gpu/drm/i915/selftests/i915_scheduler.c   |  53 ++-
>   8 files changed, 454 insertions(+), 195 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> index 78fda9b4f626..4a0258347c10 100644
> --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
> @@ -254,11 +254,6 @@ static void ring_set_paused(const struct intel_engine_cs *engine, int state)
>   		wmb();
>   }
>   
> -static struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
>   static int rq_prio(const struct i915_request *rq)
>   {
>   	return READ_ONCE(rq->sched.attr.priority);
> @@ -282,15 +277,27 @@ static int effective_prio(const struct i915_request *rq)
>   	return prio;
>   }
>   
> +static struct i915_request *first_request(const struct i915_sched *se)
> +{
> +	struct i915_priolist *pl = se->queue.sentinel.next[0];
> +
> +	if (pl == &se->queue.sentinel)
> +		return NULL;
> +
> +	return list_first_entry_or_null(&pl->requests,
> +					struct i915_request,
> +					sched.link);
> +}
> +
>   static int queue_prio(const struct i915_sched *se)
>   {
> -	struct rb_node *rb;
> +	struct i915_request *rq;
>   
> -	rb = rb_first_cached(&se->queue);
> -	if (!rb)
> +	rq = first_request(se);
> +	if (!rq)
>   		return INT_MIN;
>   
> -	return to_priolist(rb)->priority;
> +	return rq_prio(rq);
>   }
>   
>   static int virtual_prio(const struct intel_engine_execlists *el)
> @@ -300,7 +307,7 @@ static int virtual_prio(const struct intel_engine_execlists *el)
>   	return rb ? rb_entry(rb, struct ve_node, rb)->prio : INT_MIN;
>   }
>   
> -static bool need_preempt(const struct intel_engine_cs *engine,
> +static bool need_preempt(struct intel_engine_cs *engine,
>   			 const struct i915_request *rq)
>   {
>   	const struct i915_sched *se = &engine->sched;
> @@ -1144,7 +1151,9 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	struct i915_request **port = execlists->pending;
>   	struct i915_request ** const last_port = port + execlists->port_mask;
>   	struct i915_request *last, * const *active;
> +	struct i915_request *rq, *rn;
>   	struct virtual_engine *ve;
> +	struct i915_priolist *pl;
>   	struct rb_node *rb;
>   	bool submit = false;
>   
> @@ -1355,87 +1364,79 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   			break;
>   	}
>   
> -	while ((rb = rb_first_cached(&se->queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -		struct i915_request *rq, *rn;
> +	i915_sched_dequeue(se, pl, rq, rn) {
> +		bool merge = true;
>   
> -		priolist_for_each_request_consume(rq, rn, p) {
> -			bool merge = true;
> +		/*
> +		 * Can we combine this request with the current port?
> +		 * It has to be the same context/ringbuffer and not
> +		 * have any exceptions (e.g. GVT saying never to
> +		 * combine contexts).
> +		 *
> +		 * If we can combine the requests, we can execute both
> +		 * by updating the RING_TAIL to point to the end of the
> +		 * second request, and so we never need to tell the
> +		 * hardware about the first.
> +		 */
> +		if (last && !can_merge_rq(last, rq)) {
> +			/*
> +			 * If we are on the second port and cannot
> +			 * combine this request with the last, then we
> +			 * are done.
> +			 */
> +			if (port == last_port)
> +				goto done;
>   
>   			/*
> -			 * Can we combine this request with the current port?
> -			 * It has to be the same context/ringbuffer and not
> -			 * have any exceptions (e.g. GVT saying never to
> -			 * combine contexts).
> -			 *
> -			 * If we can combine the requests, we can execute both
> -			 * by updating the RING_TAIL to point to the end of the
> -			 * second request, and so we never need to tell the
> -			 * hardware about the first.
> +			 * We must not populate both ELSP[] with the
> +			 * same LRCA, i.e. we must submit 2 different
> +			 * contexts if we submit 2 ELSP.
>   			 */
> -			if (last && !can_merge_rq(last, rq)) {
> -				/*
> -				 * If we are on the second port and cannot
> -				 * combine this request with the last, then we
> -				 * are done.
> -				 */
> -				if (port == last_port)
> -					goto done;
> +			if (last->context == rq->context)
> +				goto done;
>   
> -				/*
> -				 * We must not populate both ELSP[] with the
> -				 * same LRCA, i.e. we must submit 2 different
> -				 * contexts if we submit 2 ELSP.
> -				 */
> -				if (last->context == rq->context)
> -					goto done;
> +			if (i915_request_has_sentinel(last))
> +				goto done;
>   
> -				if (i915_request_has_sentinel(last))
> -					goto done;
> +			/*
> +			 * We avoid submitting virtual requests into
> +			 * the secondary ports so that we can migrate
> +			 * the request immediately to another engine
> +			 * rather than wait for the primary request.
> +			 */
> +			if (rq->execution_mask != engine->mask)
> +				goto done;
>   
> -				/*
> -				 * We avoid submitting virtual requests into
> -				 * the secondary ports so that we can migrate
> -				 * the request immediately to another engine
> -				 * rather than wait for the primary request.
> -				 */
> -				if (rq->execution_mask != engine->mask)
> -					goto done;
> +			/*
> +			 * If GVT overrides us we only ever submit
> +			 * port[0], leaving port[1] empty. Note that we
> +			 * also have to be careful that we don't queue
> +			 * the same context (even though a different
> +			 * request) to the second port.
> +			 */
> +			if (ctx_single_port_submission(last->context) ||
> +			    ctx_single_port_submission(rq->context))
> +				goto done;
>   
> -				/*
> -				 * If GVT overrides us we only ever submit
> -				 * port[0], leaving port[1] empty. Note that we
> -				 * also have to be careful that we don't queue
> -				 * the same context (even though a different
> -				 * request) to the second port.
> -				 */
> -				if (ctx_single_port_submission(last->context) ||
> -				    ctx_single_port_submission(rq->context))
> -					goto done;
> -
> -				merge = false;
> -			}
> -
> -			if (__i915_request_submit(rq)) {
> -				if (!merge) {
> -					*port++ = i915_request_get(last);
> -					last = NULL;
> -				}
> -
> -				GEM_BUG_ON(last &&
> -					   !can_merge_ctx(last->context,
> -							  rq->context));
> -				GEM_BUG_ON(last &&
> -					   i915_seqno_passed(last->fence.seqno,
> -							     rq->fence.seqno));
> -
> -				submit = true;
> -				last = rq;
> -			}
> +			merge = false;
>   		}
>   
> -		rb_erase_cached(&p->node, &se->queue);
> -		i915_priolist_free(p);
> +		if (__i915_request_submit(rq)) {
> +			if (!merge) {
> +				*port++ = i915_request_get(last);
> +				last = NULL;
> +			}
> +
> +			GEM_BUG_ON(last &&
> +				   !can_merge_ctx(last->context,
> +						  rq->context));
> +			GEM_BUG_ON(last &&
> +				   i915_seqno_passed(last->fence.seqno,
> +						     rq->fence.seqno));
> +
> +			submit = true;
> +			last = rq;
> +		}
>   	}
>   done:
>   	*port++ = i915_request_get(last);
> @@ -1456,7 +1457,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	 * request triggering preemption on the next dequeue (or subsequent
>   	 * interrupt for secondary ports).
>   	 */
> -	execlists->queue_priority_hint = queue_prio(se);
> +	execlists->queue_priority_hint = pl->priority;
>   	spin_unlock(&se->lock);
>   
>   	/*
> @@ -2716,7 +2717,6 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
>   	}
>   
>   	execlists->queue_priority_hint = INT_MIN;
> -	se->queue = RB_ROOT_CACHED;
>   
>   	GEM_BUG_ON(__tasklet_is_enabled(&se->tasklet));
>   	se->tasklet.callback = nop_submission_tasklet;
> @@ -3173,6 +3173,8 @@ static void virtual_context_exit(struct intel_context *ce)
>   
>   	for (n = 0; n < ve->num_siblings; n++)
>   		intel_engine_pm_put(ve->siblings[n]);
> +
> +	i915_sched_park(intel_engine_get_scheduler(&ve->base));
>   }
>   
>   static const struct intel_context_ops virtual_context_ops = {
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> index d14b9db77df8..c16393df42a0 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
> @@ -60,11 +60,6 @@
>   
>   #define GUC_REQUEST_SIZE 64 /* bytes */
>   
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> -{
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
>   static struct guc_stage_desc *__get_stage_desc(struct intel_guc *guc, u32 id)
>   {
>   	struct guc_stage_desc *base = guc->stage_desc_pool_vaddr;
> @@ -186,9 +181,10 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   	struct i915_request **first = execlists->inflight;
>   	struct i915_request ** const last_port = first + execlists->port_mask;
>   	struct i915_request *last = first[0];
> +	struct i915_request *rq, *rn;
>   	struct i915_request **port;
> +	struct i915_priolist *pl;
>   	bool submit = false;
> -	struct rb_node *rb;
>   
>   	lockdep_assert_held(&se->lock);
>   
> @@ -205,32 +201,22 @@ static void __guc_dequeue(struct intel_engine_cs *engine)
>   	 * event.
>   	 */
>   	port = first;
> -	while ((rb = rb_first_cached(&se->queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -		struct i915_request *rq, *rn;
> +	i915_sched_dequeue(se, pl, rq, rn) {
> +		if (last && rq->context != last->context) {
> +			if (port == last_port)
> +				goto done;
>   
> -		priolist_for_each_request_consume(rq, rn, p) {
> -			if (last && rq->context != last->context) {
> -				if (port == last_port)
> -					goto done;
> -
> -				*port = schedule_in(last,
> -						    port - execlists->inflight);
> -				port++;
> -			}
> -
> -			list_del_init(&rq->sched.link);
> -			__i915_request_submit(rq);
> -			submit = true;
> -			last = rq;
> +			*port = schedule_in(last, port - execlists->inflight);
> +			port++;
>   		}
>   
> -		rb_erase_cached(&p->node, &se->queue);
> -		i915_priolist_free(p);
> +		list_del_init(&rq->sched.link);
> +		__i915_request_submit(rq);
> +		submit = true;
> +		last = rq;
>   	}
>   done:
> -	execlists->queue_priority_hint =
> -		rb ? to_priolist(rb)->priority : INT_MIN;
> +	execlists->queue_priority_hint = pl->priority;
>   	if (submit) {
>   		*port = schedule_in(last, port - execlists->inflight);
>   		*++port = NULL;
> @@ -361,7 +347,6 @@ static void guc_reset_cancel(struct intel_engine_cs *engine)
>   	__i915_sched_cancel_queue(se);
>   
>   	execlists->queue_priority_hint = INT_MIN;
> -	se->queue = RB_ROOT_CACHED;
>   
>   	spin_unlock_irqrestore(&se->lock, flags);
>   	intel_engine_signal_breadcrumbs(engine);
> diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
> index bc2fa84f98a8..ee7482b9c813 100644
> --- a/drivers/gpu/drm/i915/i915_priolist_types.h
> +++ b/drivers/gpu/drm/i915/i915_priolist_types.h
> @@ -38,10 +38,72 @@ enum {
>   #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
>   #define I915_PRIORITY_BARRIER (I915_PRIORITY_UNPREEMPTABLE - 1)
>   
> +/*
> + * The slab returns power-of-two chunks of memory, so fill out the
> + * node to the next cacheline.
> + *
> + * We can estimate how many requests the skiplist will scale to based
> + * on its height:
> + *   11 =>  4 million requests
> + *   12 => 16 million requests
> + */
> +#ifdef CONFIG_64BIT
> +#define I915_PRIOLIST_HEIGHT 12
> +#else
> +#define I915_PRIOLIST_HEIGHT 11
> +#endif
> +
> +/*
> + * i915_priolist forms a skiplist. The skiplist is built in layers,
> + * starting at the base [0] is a singly linked list of all i915_priolist.
> + * Each higher layer contains a fraction of the i915_priolist from the
> + * previous layer:
> + *
> + * S[0] 0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF S
> + * E[1] >1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F E
> + * N[2] -->3-->7-->B-->F-->3-->7-->B-->F-->3-->7-->B-->F-->3-->7-->B-->F N
> + * T[3] ------>7------>F-------7------>F------>7------>F------>7------>F T
> + * I[4] -------------->F-------------->F-------------->F-------------->F I
> + * N[5] ------------------------------>F------------------------------>F N
> + * E[6] ------------------------------>F-------------------------------> E
> + * L[7] ---------------------------------------------------------------> L
> + *
> + * To iterate through all active i915_priolist, we only need to follow
> + * the chain in i915_priolist.next[0] (see for_each_priolist()).
> + *
> + * To quickly find a specific key (or insert point), we can perform a binary
> + * search by starting at the highest level and following the linked list
> + * at that level until we either find the node, or have gone past the key.
> + * Then we descend a level, and start walking the list again starting from
> + * the current position, until eventually we find our key, or we run out of
> + * levels.
> + *
> + * https://en.wikipedia.org/wiki/Skip_list
> + */
>   struct i915_priolist {
>   	struct list_head requests;
> -	struct rb_node node;
>   	int priority;
> +
> +	int level;
> +	struct i915_priolist *next[I915_PRIOLIST_HEIGHT];
>   };
>   
> +struct i915_priolist_root {
> +	struct i915_priolist sentinel;
> +	u32 prng;
> +};
> +
> +#define i915_priolist_is_empty(root) ((root)->sentinel.level < 0)
> +
> +#define for_each_priolist(p, root) \
> +	for ((p) = (root)->sentinel.next[0]; \
> +	     (p) != &(root)->sentinel; \
> +	     (p) = (p)->next[0])
> +
> +#define priolist_for_each_request(it, plist) \
> +	list_for_each_entry(it, &(plist)->requests, sched.link)
> +
> +#define priolist_for_each_request_safe(it, n, plist) \
> +	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> +
>   #endif /* _I915_PRIOLIST_TYPES_H_ */
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index 312e1538d001..518eac67959e 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -4,7 +4,9 @@
>    * Copyright © 2018 Intel Corporation
>    */
>   
> +#include <linux/bitops.h>
>   #include <linux/mutex.h>
> +#include <linux/prandom.h>
>   
>   #include "gt/intel_ring.h"
>   #include "gt/intel_lrc_reg.h"
> @@ -168,6 +170,16 @@ void i915_sched_select_mode(struct i915_sched *se, enum i915_sched_mode mode)
>   	}
>   }
>   
> +static void init_priolist(struct i915_priolist_root *const root)
> +{
> +	struct i915_priolist *pl = &root->sentinel;
> +
> +	memset_p((void **)pl->next, pl, ARRAY_SIZE(pl->next));
> +	pl->requests.prev = NULL;
> +	pl->priority = INT_MIN;
> +	pl->level = -1;
> +}
> +
>   void i915_sched_init(struct i915_sched *se,
>   		     struct device *dev,
>   		     const char *name,
> @@ -183,9 +195,9 @@ void i915_sched_init(struct i915_sched *se,
>   
>   	se->mask = mask;
>   
> +	init_priolist(&se->queue);
>   	INIT_LIST_HEAD(&se->requests);
>   	INIT_LIST_HEAD(&se->hold);
> -	se->queue = RB_ROOT_CACHED;
>   
>   	init_ipi(&se->ipi);
>   
> @@ -194,8 +206,60 @@ void i915_sched_init(struct i915_sched *se,
>   	se->revoke_context = i915_sched_default_revoke_context;
>   }
>   
> +__maybe_unused static bool priolist_idle(struct i915_priolist_root *root)
> +{
> +	struct i915_priolist *pl = &root->sentinel;
> +	int lvl;
> +
> +	for (lvl = 0; lvl < ARRAY_SIZE(pl->next); lvl++) {
> +		if (pl->next[lvl] != pl) {
> +			GEM_TRACE_ERR("root[%d] is not empty\n", lvl);
> +			return false;
> +		}
> +	}
> +
> +	if (pl->level != -1) {
> +		GEM_TRACE_ERR("root is not clear: %d\n", pl->level);
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +static bool pl_empty(struct list_head *st)
> +{
> +	return !st->prev;
> +}
> +
> +static void pl_push(struct i915_priolist *pl, struct list_head *st)
> +{
> +	/* Keep list_empty(&pl->requests) valid for concurrent readers */
> +	pl->requests.prev = st->prev;
> +	st->prev = &pl->requests;
> +	GEM_BUG_ON(pl_empty(st));
> +}
> +
> +static struct i915_priolist *pl_pop(struct list_head *st)
> +{
> +	struct i915_priolist *pl;
> +
> +	GEM_BUG_ON(pl_empty(st));
> +	pl = container_of(st->prev, typeof(*pl), requests);
> +	st->prev = pl->requests.prev;
> +
> +	return pl;
> +}
> +
>   void i915_sched_park(struct i915_sched *se)
>   {
> +	struct i915_priolist_root *root = &se->queue;
> +	struct list_head *list = &root->sentinel.requests;
> +
> +	GEM_BUG_ON(!priolist_idle(root));
> +
> +	while (!pl_empty(list))
> +		kmem_cache_free(global.slab_priorities, pl_pop(list));
> +
>   	GEM_BUG_ON(!i915_sched_is_idle(se));
>   	se->no_priolist = false;
>   }
> @@ -251,70 +315,71 @@ static inline bool node_signaled(const struct i915_sched_node *node)
>   	return i915_request_completed(node_to_request(node));
>   }
>   
> -static inline struct i915_priolist *to_priolist(struct rb_node *rb)
> +static inline unsigned int random_level(struct i915_priolist_root *root)
>   {
> -	return rb_entry(rb, struct i915_priolist, node);
> -}
> -
> -static void assert_priolists(struct i915_sched * const se)
> -{
> -	struct rb_node *rb;
> -	long last_prio;
> -
> -	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
> -		return;
> -
> -	GEM_BUG_ON(rb_first_cached(&se->queue) !=
> -		   rb_first(&se->queue.rb_root));
> -
> -	last_prio = INT_MAX;
> -	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> -		const struct i915_priolist *p = to_priolist(rb);
> -
> -		GEM_BUG_ON(p->priority > last_prio);
> -		last_prio = p->priority;
> -	}
> +	/*
> +	 * Given a uniform distribution of random numbers over the u32, then
> +	 * the probability each bit being unset is P=0.5. The probability of a
> +	 * successive sequence of bits being unset is P(n) = 0.5^n [n > 0].
> +	 *   P(level:1) = 0.5
> +	 *   P(level:2) = 0.25
> +	 *   P(level:3) = 0.125
> +	 *   P(level:4) = 0.0625
> +	 *   ...
> +	 * So we can use ffs() on a good random number generator to pick our
> +	 * level. We divide by two to reduce the probability of choosing a
> +	 * level to .25, as the cost of descending a level is the same as
> +	 * following an extra link in the chain at that level (so we can
> +	 * pack more nodes into fewer levels without incurring extra cost,
> +	 * and allow scaling to higher volumes of requests without expanding
> +	 * the height of the skiplist).
> +	 */
> +	root->prng = next_pseudo_random32(root->prng);
> +	return  __ffs(root->prng) / 2;
>   }
>   
>   static struct list_head *
>   lookup_priolist(struct i915_sched *se, int prio)
>   {
> -	struct i915_priolist *p;
> -	struct rb_node **parent, *rb;
> -	bool first = true;
> +	struct i915_priolist *update[I915_PRIOLIST_HEIGHT];
> +	struct i915_priolist_root *const root = &se->queue;
> +	struct i915_priolist *pl, *tmp;
> +	int lvl;
>   
>   	lockdep_assert_held(&se->lock);
> -	assert_priolists(se);
> -
>   	if (unlikely(se->no_priolist))
>   		prio = I915_PRIORITY_NORMAL;
>   
> +	for_each_priolist(pl, root) { /* recycle any empty elements before us */
> +		if (pl->priority <= prio || !list_empty(&pl->requests))
> +			break;

This 'less' part of the less-or-equal condition keeps confusing me as a 
break criterion. If the premise is cleaning up, why break on the first 
smaller prio? Would the idea be to prune all empty lists up to, but not 
including, the lookup prio?

> +
> +		__i915_sched_dequeue_next(se);
> +	}
> +
>   find_priolist:
> -	/* most positive priority is scheduled first, equal priorities fifo */
> -	rb = NULL;
> -	parent = &se->queue.rb_root.rb_node;
> -	while (*parent) {
> -		rb = *parent;
> -		p = to_priolist(rb);
> -		if (prio > p->priority) {
> -			parent = &rb->rb_left;
> -		} else if (prio < p->priority) {
> -			parent = &rb->rb_right;
> -			first = false;
> -		} else {
> -			return &p->requests;
> -		}
> +	pl = &root->sentinel;
> +	lvl = pl->level;
> +	while (lvl >= 0) {
> +		while (tmp = pl->next[lvl], tmp->priority >= prio)
> +			pl = tmp;
> +		if (pl->priority == prio)
> +			goto out;
> +		update[lvl--] = pl;
>   	}
>   
>   	if (prio == I915_PRIORITY_NORMAL) {
> -		p = &se->default_priolist;
> +		pl = &se->default_priolist;
> +	} else if (!pl_empty(&root->sentinel.requests)) {
> +		pl = pl_pop(&root->sentinel.requests);
>   	} else {
> -		p = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
> +		pl = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
>   		/* Convert an allocation failure to a priority bump */
> -		if (unlikely(!p)) {
> +		if (unlikely(!pl)) {
>   			prio = I915_PRIORITY_NORMAL; /* recurses just once */
>   
> -			/* To maintain ordering with all rendering, after an
> +			/*
> +			 * To maintain ordering with all rendering, after an
>   			 * allocation failure we have to disable all scheduling.
>   			 * Requests will then be executed in fifo, and schedule
>   			 * will ensure that dependencies are emitted in fifo.
> @@ -327,18 +392,123 @@ lookup_priolist(struct i915_sched *se, int prio)
>   		}
>   	}
>   
> -	p->priority = prio;
> -	INIT_LIST_HEAD(&p->requests);
> +	pl->priority = prio;
> +	INIT_LIST_HEAD(&pl->requests);
>   
> -	rb_link_node(&p->node, rb, parent);
> -	rb_insert_color_cached(&p->node, &se->queue, first);
> +	lvl = random_level(root);
> +	if (lvl > root->sentinel.level) {
> +		if (root->sentinel.level < I915_PRIOLIST_HEIGHT - 1) {
> +			lvl = ++root->sentinel.level;
> +			update[lvl] = &root->sentinel;
> +		} else {
> +			lvl = I915_PRIOLIST_HEIGHT - 1;
> +		}
> +	}
> +	GEM_BUG_ON(lvl < 0);
> +	GEM_BUG_ON(lvl >= ARRAY_SIZE(pl->next));
>   
> -	return &p->requests;
> +	pl->level = lvl;
> +	do {
> +		tmp = update[lvl];
> +		pl->next[lvl] = tmp->next[lvl];
> +		tmp->next[lvl] = pl;
> +	} while (--lvl >= 0);
> +
> +	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
> +		struct i915_priolist *chk;
> +
> +		chk = &root->sentinel;
> +		lvl = chk->level;
> +		do {
> +			while (tmp = chk->next[lvl], tmp->priority >= prio)
> +				chk = tmp;
> +		} while (--lvl >= 0);
> +
> +		GEM_BUG_ON(chk != pl);
> +	}
> +
> +out:
> +	GEM_BUG_ON(pl == &root->sentinel);
> +	return &pl->requests;
>   }
>   
> -void __i915_priolist_free(struct i915_priolist *p)
> +static void __remove_priolist(struct i915_sched *se, struct list_head *plist)
>   {
> -	kmem_cache_free(global.slab_priorities, p);
> +	struct i915_priolist_root *root = &se->queue;
> +	struct i915_priolist *pl, *tmp;
> +	struct i915_priolist *old =
> +		container_of(plist, struct i915_priolist, requests);
> +	int prio = old->priority;
> +	int lvl;
> +
> +	lockdep_assert_held(&se->lock);
> +	GEM_BUG_ON(!list_empty(plist));
> +
> +	pl = &root->sentinel;
> +	lvl = pl->level;
> +	GEM_BUG_ON(lvl < 0);
> +
> +	if (prio != I915_PRIORITY_NORMAL)
> +		pl_push(old, &pl->requests);
> +
> +	do {
> +		while (tmp = pl->next[lvl], tmp->priority > prio)
> +			pl = tmp;
> +		if (lvl <= old->level) {
> +			pl->next[lvl] = old->next[lvl];
> +			if (pl == &root->sentinel && old->next[lvl] == pl) {
> +				GEM_BUG_ON(pl->level != lvl);
> +				pl->level--;
> +			}
> +		}
> +	} while (--lvl >= 0);
> +	GEM_BUG_ON(tmp != old);
> +}
> +
> +static void remove_from_priolist(struct i915_sched *se,
> +				 struct i915_request *rq,
> +				 struct list_head *list,
> +				 bool tail)
> +{
> +	struct list_head *prev = rq->sched.link.prev;
> +
> +	GEM_BUG_ON(!i915_request_in_priority_queue(rq));
> +
> +	__list_del_entry(&rq->sched.link);
> +	if (tail)
> +		list_add_tail(&rq->sched.link, list);
> +	else
> +		list_add(&rq->sched.link, list);
> +
> +	/* If we just removed the last element in the old plist, delete it */
> +	if (list_empty(prev))
> +		__remove_priolist(se, prev);
> +}
> +
> +struct i915_priolist *__i915_sched_dequeue_next(struct i915_sched *se)
> +{
> +	struct i915_priolist * const s = &se->queue.sentinel;
> +	struct i915_priolist *pl = s->next[0];
> +	int lvl;
> +
> +	GEM_BUG_ON(!list_empty(&pl->requests));
> +	GEM_BUG_ON(pl == s);
> +
> +	/* Keep pl->next[0] valid for for_each_priolist iteration */
> +	if (pl->priority != I915_PRIORITY_NORMAL)
> +		pl_push(pl, &s->requests);
> +
> +	lvl = pl->level;
> +	GEM_BUG_ON(lvl < 0);
> +	do {
> +		s->next[lvl] = pl->next[lvl];
> +		if (pl->next[lvl] == s) {
> +			GEM_BUG_ON(s->level != lvl);
> +			s->level--;
> +		}
> +	} while (--lvl >= 0);
> +
> +	return pl->next[0];
>   }

If both __i915_sched_dequeue_next and __remove_priolist are removing an 
empty list from the hierarchy, why can't they share some code?

Regards,

Tvrtko

>   
>   static struct i915_request *
> @@ -491,7 +661,7 @@ static void __i915_request_set_priority(struct i915_request *rq, int prio)
>   
>   		GEM_BUG_ON(rq->engine != engine);
>   		if (i915_request_in_priority_queue(rq))
> -			list_move_tail(&rq->sched.link, plist);
> +			remove_from_priolist(se, rq, plist, true);
>   
>   		/* Defer (tasklet) submission until after all updates. */
>   		kick_submission(engine, rq, prio);
> @@ -627,8 +797,7 @@ void __i915_sched_defer_request(struct intel_engine_cs *engine,
>   
>   		/* Note list is reversed for waiters wrt signal hierarchy */
>   		GEM_BUG_ON(rq->engine != engine);
> -		GEM_BUG_ON(!i915_request_in_priority_queue(rq));
> -		list_move(&rq->sched.link, &dfs);
> +		remove_from_priolist(se, rq, &dfs, false);
>   
>   		/* Track our visit, and prevent duplicate processing */
>   		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
> @@ -927,7 +1096,7 @@ void i915_sched_resume_request(struct intel_engine_cs *engine,
>   void __i915_sched_cancel_queue(struct i915_sched *se)
>   {
>   	struct i915_request *rq, *rn;
> -	struct rb_node *rb;
> +	struct i915_priolist *pl;
>   
>   	lockdep_assert_held(&se->lock);
>   
> @@ -936,16 +1105,9 @@ void __i915_sched_cancel_queue(struct i915_sched *se)
>   		i915_request_put(i915_request_mark_eio(rq));
>   
>   	/* Flush the queued requests to the timeline list (for retiring). */
> -	while ((rb = rb_first_cached(&se->queue))) {
> -		struct i915_priolist *p = to_priolist(rb);
> -
> -		priolist_for_each_request_consume(rq, rn, p) {
> -			i915_request_put(i915_request_mark_eio(rq));
> -			__i915_request_submit(rq);
> -		}
> -
> -		rb_erase_cached(&p->node, &se->queue);
> -		i915_priolist_free(p);
> +	i915_sched_dequeue(se, pl, rq, rn) {
> +		i915_request_put(i915_request_mark_eio(rq));
> +		__i915_request_submit(rq);
>   	}
>   	GEM_BUG_ON(!i915_sched_is_idle(se));
>   
> @@ -1225,9 +1387,9 @@ void i915_sched_show(struct drm_printer *m,
>   		     unsigned int max)
>   {
>   	const struct i915_request *rq, *last;
> +	struct i915_priolist *pl;
>   	unsigned long flags;
>   	unsigned int count;
> -	struct rb_node *rb;
>   
>   	rcu_read_lock();
>   	spin_lock_irqsave(&se->lock, flags);
> @@ -1282,10 +1444,8 @@ void i915_sched_show(struct drm_printer *m,
>   
>   	last = NULL;
>   	count = 0;
> -	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> -
> -		priolist_for_each_request(rq, p) {
> +	for_each_priolist(pl, &se->queue) {
> +		priolist_for_each_request(rq, pl) {
>   			if (count++ < max - 1)
>   				show_request(m, rq, "\t", 0);
>   			else
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> index fe392109b112..872d221f6ba7 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> @@ -24,12 +24,6 @@ struct intel_engine_cs;
>   		  ##__VA_ARGS__);					\
>   } while (0)
>   
> -#define priolist_for_each_request(it, plist) \
> -	list_for_each_entry(it, &(plist)->requests, sched.link)
> -
> -#define priolist_for_each_request_consume(it, n, plist) \
> -	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
> -
>   void i915_sched_node_init(struct i915_sched_node *node);
>   void i915_sched_node_reinit(struct i915_sched_node *node);
>   
> @@ -100,7 +94,7 @@ static inline void i915_priolist_free(struct i915_priolist *p)
>   
>   static inline bool i915_sched_is_idle(const struct i915_sched *se)
>   {
> -	return RB_EMPTY_ROOT(&se->queue.rb_root);
> +	return i915_priolist_is_empty(&se->queue);
>   }
>   
>   static inline bool
> @@ -168,6 +162,14 @@ i915_sched_get_active_request(const struct i915_sched *se)
>   	return NULL;
>   }
>   
> +/* Walk the scheduler queue of requests (in submission order) and remove them */
> +struct i915_priolist *__i915_sched_dequeue_next(struct i915_sched *se);
> +#define i915_sched_dequeue(se, pl, rq, rn) \
> +	for ((pl) = (se)->queue.sentinel.next[0]; \
> +	     (pl) != &(se)->queue.sentinel; \
> +	     (pl) = __i915_sched_dequeue_next(se)) \
> +		priolist_for_each_request_safe(rq, rn, pl)
> +
>   void i915_request_show_with_schedule(struct drm_printer *m,
>   				     const struct i915_request *rq,
>   				     const char *prefix,
> diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h
> index 5ca2dc1b4fb5..bc668f375097 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler_types.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler_types.h
> @@ -115,7 +115,7 @@ struct i915_sched {
>   	 * @queue is only used to transfer requests from the scheduler
>   	 * frontend to the back.
>   	 */
> -	struct rb_root_cached queue;
> +	struct i915_priolist_root queue;
>   
>   	/**
>   	 * @tasklet: softirq tasklet for bottom half
> diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> index 3db34d3eea58..946c93441c1f 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> @@ -25,6 +25,7 @@ selftest(ring, intel_ring_mock_selftests)
>   selftest(engine, intel_engine_cs_mock_selftests)
>   selftest(timelines, intel_timeline_mock_selftests)
>   selftest(requests, i915_request_mock_selftests)
> +selftest(scheduler, i915_scheduler_mock_selftests)
>   selftest(objects, i915_gem_object_mock_selftests)
>   selftest(phys, i915_gem_phys_mock_selftests)
>   selftest(dmabuf, i915_gem_dmabuf_mock_selftests)
> diff --git a/drivers/gpu/drm/i915/selftests/i915_scheduler.c b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> index f54bdbeaa48b..2bb2d3d07d06 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
> @@ -12,6 +12,54 @@
>   #include "selftests/igt_spinner.h"
>   #include "selftests/i915_random.h"
>   
> +static int mock_skiplist_levels(void *dummy)
> +{
> +	struct i915_priolist_root root = {};
> +	struct i915_priolist *pl = &root.sentinel;
> +	IGT_TIMEOUT(end_time);
> +	unsigned long total;
> +	int count, lvl;
> +
> +	total = 0;
> +	do {
> +		for (count = 0; count < 16384; count++) {
> +			lvl = random_level(&root);
> +			if (lvl > pl->level) {
> +				if (lvl < I915_PRIOLIST_HEIGHT - 1)
> +					lvl = ++pl->level;
> +				else
> +					lvl = I915_PRIOLIST_HEIGHT - 1;
> +			}
> +
> +			pl->next[lvl] = ptr_inc(pl->next[lvl]);
> +		}
> +		total += count;
> +	} while (!__igt_timeout(end_time, NULL));
> +
> +	pr_info("Total %9lu\n", total);
> +	for (lvl = 0; lvl <= pl->level; lvl++) {
> +		int x = ilog2((unsigned long)pl->next[lvl]);
> +		char row[80];
> +
> +		memset(row, '*', x);
> +		row[x] = '\0';
> +
> +		pr_info(" [%2d] %9lu %s\n",
> +			lvl, (unsigned long)pl->next[lvl], row);
> +	}
> +
> +	return 0;
> +}
> +
> +int i915_scheduler_mock_selftests(void)
> +{
> +	static const struct i915_subtest tests[] = {
> +		SUBTEST(mock_skiplist_levels),
> +	};
> +
> +	return i915_subtests(tests, NULL);
> +}
> +
>   static void scheduling_disable(struct intel_engine_cs *engine)
>   {
>   	engine->props.preempt_timeout_ms = 0;
> @@ -80,9 +128,9 @@ static int all_engines(struct drm_i915_private *i915,
>   static bool check_context_order(struct i915_sched *se)
>   {
>   	u64 last_seqno, last_context;
> +	struct i915_priolist *p;
>   	unsigned long count;
>   	bool result = false;
> -	struct rb_node *rb;
>   	int last_prio;
>   
>   	/* We expect the execution order to follow ascending fence-context */
> @@ -92,8 +140,7 @@ static bool check_context_order(struct i915_sched *se)
>   	last_context = 0;
>   	last_seqno = 0;
>   	last_prio = 0;
> -	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
> -		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
> +	for_each_priolist(p, &se->queue) {
>   		struct i915_request *rq;
>   
>   		priolist_for_each_request(rq, p) {
>
Chris Wilson Feb. 8, 2021, 4:19 p.m. UTC | #5
Quoting Tvrtko Ursulin (2021-02-08 15:23:17)
> 
> On 08/02/2021 10:52, Chris Wilson wrote:
> >   static struct list_head *
> >   lookup_priolist(struct i915_sched *se, int prio)
> >   {
> > -     struct i915_priolist *p;
> > -     struct rb_node **parent, *rb;
> > -     bool first = true;
> > +     struct i915_priolist *update[I915_PRIOLIST_HEIGHT];
> > +     struct i915_priolist_root *const root = &se->queue;
> > +     struct i915_priolist *pl, *tmp;
> > +     int lvl;
> >   
> >       lockdep_assert_held(&se->lock);
> > -     assert_priolists(se);
> > -
> >       if (unlikely(se->no_priolist))
> >               prio = I915_PRIORITY_NORMAL;
> >   
> > +     for_each_priolist(pl, root) { /* recycle any empty elements before us */
> > +             if (pl->priority <= prio || !list_empty(&pl->requests))
> > +                     break;
> 
> This 'less' part of the less-or-equal condition keeps confusing me as a 
> break criterion. If the premise is cleaning up, why break on the first 
> smaller prio? Would the idea be to prune all empty lists up to, but not 
> including, the lookup prio?

Just parcelling up the work. If we tidy up all the unused nodes before
us, we insert ourselves at the head of the tree, and all the cheap
checks to see if this is the first request, or to find the first
request are happy.

It's not expected to find anything unused with the tweaks to tidy up
empty elements as we move between i915_priolist.requests, but it seems
sensible to keep, as then it should just be checking the first
i915_priolist and breaking out.
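
To make that break condition concrete, here is a minimal standalone
sketch of the recycle loop (simplified, illustrative types and names,
not the i915 code; only the level-0 link is shown):

struct node {
	struct node *next;	/* level-0 link; higher levels omitted */
	int priority;		/* nodes are kept in descending priority */
	int nr_requests;	/* stand-in for !list_empty(&pl->requests) */
};

struct root {
	struct node sentinel;	/* circular: sentinel.next is the head */
};

static void recycle_empty_head(struct root *root, int prio)
{
	struct node *n;

	for (n = root->sentinel.next; n != &root->sentinel;
	     n = root->sentinel.next) {
		/*
		 * Stop at the insertion point for prio, or at the first
		 * node that still has requests queued on it.
		 */
		if (n->priority <= prio || n->nr_requests)
			break;

		/* O(1) unlink from the head; the real code recycles it */
		root->sentinel.next = n->next;
	}
}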

> > -void __i915_priolist_free(struct i915_priolist *p)
> > +static void __remove_priolist(struct i915_sched *se, struct list_head *plist)
> >   {
> > -     kmem_cache_free(global.slab_priorities, p);
> > +     struct i915_priolist_root *root = &se->queue;
> > +     struct i915_priolist *pl, *tmp;
> > +     struct i915_priolist *old =
> > +             container_of(plist, struct i915_priolist, requests);
> > +     int prio = old->priority;
> > +     int lvl;
> > +
> > +     lockdep_assert_held(&se->lock);
> > +     GEM_BUG_ON(!list_empty(plist));
> > +
> > +     pl = &root->sentinel;
> > +     lvl = pl->level;
> > +     GEM_BUG_ON(lvl < 0);
> > +
> > +     if (prio != I915_PRIORITY_NORMAL)
> > +             pl_push(old, &pl->requests);
> > +
> > +     do {
> > +             while (tmp = pl->next[lvl], tmp->priority > prio)
> > +                     pl = tmp;
> > +             if (lvl <= old->level) {
> > +                     pl->next[lvl] = old->next[lvl];
> > +                     if (pl == &root->sentinel && old->next[lvl] == pl) {
> > +                             GEM_BUG_ON(pl->level != lvl);
> > +                             pl->level--;
> > +                     }
> > +             }
> > +     } while (--lvl >= 0);
> > +     GEM_BUG_ON(tmp != old);
> > +}

> > +struct i915_priolist *__i915_sched_dequeue_next(struct i915_sched *se)
> > +{
> > +     struct i915_priolist * const s = &se->queue.sentinel;
> > +     struct i915_priolist *pl = s->next[0];
> > +     int lvl;
> > +
> > +     GEM_BUG_ON(!list_empty(&pl->requests));
> > +     GEM_BUG_ON(pl == s);
> > +
> > +     /* Keep pl->next[0] valid for for_each_priolist iteration */
> > +     if (pl->priority != I915_PRIORITY_NORMAL)
> > +             pl_push(pl, &s->requests);
> > +
> > +     lvl = pl->level;
> > +     GEM_BUG_ON(lvl < 0);
> > +     do {
> > +             s->next[lvl] = pl->next[lvl];
> > +             if (pl->next[lvl] == s) {
> > +                     GEM_BUG_ON(s->level != lvl);
> > +                     s->level--;
> > +             }
> > +     } while (--lvl >= 0);
> > +
> > +     return pl->next[0];
> >   }
> 
> If both __i915_sched_dequeue_next and __remove_priolist are removing an 
> empty list from the hierarchy, why can't they share some code?

The __remove_priolist does the general search and remove, whereas
dequeue_next is trying to keep O(1) remove-from-head. dequeue_next is
meant to be called many, many more times than __remove_priolist.
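
As a rough illustration of why the two paths differ, a standalone
sketch with made-up names (not the i915 code; node recycling and the
GEM_BUG_ON asserts are omitted, and sentinel.priority is assumed to be
INT_MIN so the walks terminate):

#define HEIGHT 12

struct node {
	int priority;			/* unique per node, descending order */
	int level;			/* highest valid index into next[] */
	struct node *next[HEIGHT];
};

struct root {
	struct node sentinel;		/* sentinel.level == current height - 1 */
};

/* O(1): the head's predecessor at every level it occupies is the sentinel */
static struct node *dequeue_head(struct root *root)
{
	struct node *s = &root->sentinel;
	struct node *head = s->next[0];
	int lvl;

	for (lvl = head->level; lvl >= 0; lvl--) {
		s->next[lvl] = head->next[lvl];
		if (s->next[lvl] == s && s->level == lvl)
			s->level--;	/* that level is now empty */
	}

	return head;
}

/* O(lgN) expected: walk top-down to find the predecessor at each level */
static void remove_node(struct root *root, struct node *old)
{
	struct node *p = &root->sentinel;
	int lvl;

	for (lvl = root->sentinel.level; lvl >= 0; lvl--) {
		while (p->next[lvl]->priority > old->priority)
			p = p->next[lvl];
		if (lvl <= old->level)
			p->next[lvl] = old->next[lvl];
		/* shrinking sentinel.level when a level empties is omitted */
	}
}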
-Chris
Tvrtko Ursulin Feb. 9, 2021, 4:11 p.m. UTC | #6
On 08/02/2021 16:19, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2021-02-08 15:23:17)
>>
>> On 08/02/2021 10:52, Chris Wilson wrote:
>>>    static struct list_head *
>>>    lookup_priolist(struct i915_sched *se, int prio)
>>>    {
>>> -     struct i915_priolist *p;
>>> -     struct rb_node **parent, *rb;
>>> -     bool first = true;
>>> +     struct i915_priolist *update[I915_PRIOLIST_HEIGHT];
>>> +     struct i915_priolist_root *const root = &se->queue;
>>> +     struct i915_priolist *pl, *tmp;
>>> +     int lvl;
>>>    
>>>        lockdep_assert_held(&se->lock);
>>> -     assert_priolists(se);
>>> -
>>>        if (unlikely(se->no_priolist))
>>>                prio = I915_PRIORITY_NORMAL;
>>>    
>>> +     for_each_priolist(pl, root) { /* recycle any empty elements before us */
>>> +             if (pl->priority <= prio || !list_empty(&pl->requests))
>>> +                     break;
>>
>> This 'less' part of the less-or-equal condition keeps confusing me as a
>> break criterion. If the premise is cleaning up, why break on the first
>> smaller prio? Would the idea be to prune all empty lists up to, but not
>> including, the lookup prio?
> 
> Just parcelling up the work. If we tidy up all the unused nodes before
> us, we insert ourselves at the head of the tree, and all the cheap
> checks to see if this is the first request, or to find the first
> request are happy.
> 
> It's not expected to find anything unused with the tweaks to tidy up
> empty elements as we move between i915_priolist.requests, but it seems
> sensible to keep, as then it should just be checking the first
> i915_priolist and breaking out.

It's fine; for some reason I missed that the order is descending. Probably 
thinking about deadlines already. Need to see how that works there, then. 
But a comment indicating the order would be cool.

>>> -void __i915_priolist_free(struct i915_priolist *p)
>>> +static void __remove_priolist(struct i915_sched *se, struct list_head *plist)
>>>    {
>>> -     kmem_cache_free(global.slab_priorities, p);
>>> +     struct i915_priolist_root *root = &se->queue;
>>> +     struct i915_priolist *pl, *tmp;
>>> +     struct i915_priolist *old =
>>> +             container_of(plist, struct i915_priolist, requests);
>>> +     int prio = old->priority;
>>> +     int lvl;
>>> +
>>> +     lockdep_assert_held(&se->lock);
>>> +     GEM_BUG_ON(!list_empty(plist));
>>> +
>>> +     pl = &root->sentinel;
>>> +     lvl = pl->level;
>>> +     GEM_BUG_ON(lvl < 0);
>>> +
>>> +     if (prio != I915_PRIORITY_NORMAL)
>>> +             pl_push(old, &pl->requests);
>>> +
>>> +     do {
>>> +             while (tmp = pl->next[lvl], tmp->priority > prio)
>>> +                     pl = tmp;

Ah okay, this is needed because the list is singly linked. I suggest a 
comment.

Would a doubly linked list not be interesting?

>>> +             if (lvl <= old->level) {
>>> +                     pl->next[lvl] = old->next[lvl];
>>> +                     if (pl == &root->sentinel && old->next[lvl] == pl) {
>>> +                             GEM_BUG_ON(pl->level != lvl);
>>> +                             pl->level--;
>>> +                     }
>>> +             }
>>> +     } while (--lvl >= 0);
>>> +     GEM_BUG_ON(tmp != old);
>>> +}
> 
>>> +struct i915_priolist *__i915_sched_dequeue_next(struct i915_sched *se)
>>> +{
>>> +     struct i915_priolist * const s = &se->queue.sentinel;
>>> +     struct i915_priolist *pl = s->next[0];
>>> +     int lvl;
>>> +
>>> +     GEM_BUG_ON(!list_empty(&pl->requests));
>>> +     GEM_BUG_ON(pl == s);
>>> +
>>> +     /* Keep pl->next[0] valid for for_each_priolist iteration */
>>> +     if (pl->priority != I915_PRIORITY_NORMAL)
>>> +             pl_push(pl, &s->requests);
>>> +
>>> +     lvl = pl->level;
>>> +     GEM_BUG_ON(lvl < 0);
>>> +     do {
>>> +             s->next[lvl] = pl->next[lvl];
>>> +             if (pl->next[lvl] == s) {
>>> +                     GEM_BUG_ON(s->level != lvl);
>>> +                     s->level--;
>>> +             }
>>> +     } while (--lvl >= 0);
>>> +
>>> +     return pl->next[0];
>>>    }
>>
>> If both __i915_sched_dequeue_next and __remove_priolist are removing an
>> empty list from the hierarchy, why can't they share some code?
> 
> The __remove_priolist does the general search and remove, whereas
> dequeue_next is trying to keep O(1) remove-from-head. dequeue_next is
> meant to be called many, many more times than __remove_priolist.

Ok.

Regards,

Tvrtko
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
index 78fda9b4f626..4a0258347c10 100644
--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
+++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c
@@ -254,11 +254,6 @@  static void ring_set_paused(const struct intel_engine_cs *engine, int state)
 		wmb();
 }
 
-static struct i915_priolist *to_priolist(struct rb_node *rb)
-{
-	return rb_entry(rb, struct i915_priolist, node);
-}
-
 static int rq_prio(const struct i915_request *rq)
 {
 	return READ_ONCE(rq->sched.attr.priority);
@@ -282,15 +277,27 @@  static int effective_prio(const struct i915_request *rq)
 	return prio;
 }
 
+static struct i915_request *first_request(const struct i915_sched *se)
+{
+	struct i915_priolist *pl = se->queue.sentinel.next[0];
+
+	if (pl == &se->queue.sentinel)
+		return NULL;
+
+	return list_first_entry_or_null(&pl->requests,
+					struct i915_request,
+					sched.link);
+}
+
 static int queue_prio(const struct i915_sched *se)
 {
-	struct rb_node *rb;
+	struct i915_request *rq;
 
-	rb = rb_first_cached(&se->queue);
-	if (!rb)
+	rq = first_request(se);
+	if (!rq)
 		return INT_MIN;
 
-	return to_priolist(rb)->priority;
+	return rq_prio(rq);
 }
 
 static int virtual_prio(const struct intel_engine_execlists *el)
@@ -300,7 +307,7 @@  static int virtual_prio(const struct intel_engine_execlists *el)
 	return rb ? rb_entry(rb, struct ve_node, rb)->prio : INT_MIN;
 }
 
-static bool need_preempt(const struct intel_engine_cs *engine,
+static bool need_preempt(struct intel_engine_cs *engine,
 			 const struct i915_request *rq)
 {
 	const struct i915_sched *se = &engine->sched;
@@ -1144,7 +1151,9 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 	struct i915_request **port = execlists->pending;
 	struct i915_request ** const last_port = port + execlists->port_mask;
 	struct i915_request *last, * const *active;
+	struct i915_request *rq, *rn;
 	struct virtual_engine *ve;
+	struct i915_priolist *pl;
 	struct rb_node *rb;
 	bool submit = false;
 
@@ -1355,87 +1364,79 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 			break;
 	}
 
-	while ((rb = rb_first_cached(&se->queue))) {
-		struct i915_priolist *p = to_priolist(rb);
-		struct i915_request *rq, *rn;
+	i915_sched_dequeue(se, pl, rq, rn) {
+		bool merge = true;
 
-		priolist_for_each_request_consume(rq, rn, p) {
-			bool merge = true;
+		/*
+		 * Can we combine this request with the current port?
+		 * It has to be the same context/ringbuffer and not
+		 * have any exceptions (e.g. GVT saying never to
+		 * combine contexts).
+		 *
+		 * If we can combine the requests, we can execute both
+		 * by updating the RING_TAIL to point to the end of the
+		 * second request, and so we never need to tell the
+		 * hardware about the first.
+		 */
+		if (last && !can_merge_rq(last, rq)) {
+			/*
+			 * If we are on the second port and cannot
+			 * combine this request with the last, then we
+			 * are done.
+			 */
+			if (port == last_port)
+				goto done;
 
 			/*
-			 * Can we combine this request with the current port?
-			 * It has to be the same context/ringbuffer and not
-			 * have any exceptions (e.g. GVT saying never to
-			 * combine contexts).
-			 *
-			 * If we can combine the requests, we can execute both
-			 * by updating the RING_TAIL to point to the end of the
-			 * second request, and so we never need to tell the
-			 * hardware about the first.
+			 * We must not populate both ELSP[] with the
+			 * same LRCA, i.e. we must submit 2 different
+			 * contexts if we submit 2 ELSP.
 			 */
-			if (last && !can_merge_rq(last, rq)) {
-				/*
-				 * If we are on the second port and cannot
-				 * combine this request with the last, then we
-				 * are done.
-				 */
-				if (port == last_port)
-					goto done;
+			if (last->context == rq->context)
+				goto done;
 
-				/*
-				 * We must not populate both ELSP[] with the
-				 * same LRCA, i.e. we must submit 2 different
-				 * contexts if we submit 2 ELSP.
-				 */
-				if (last->context == rq->context)
-					goto done;
+			if (i915_request_has_sentinel(last))
+				goto done;
 
-				if (i915_request_has_sentinel(last))
-					goto done;
+			/*
+			 * We avoid submitting virtual requests into
+			 * the secondary ports so that we can migrate
+			 * the request immediately to another engine
+			 * rather than wait for the primary request.
+			 */
+			if (rq->execution_mask != engine->mask)
+				goto done;
 
-				/*
-				 * We avoid submitting virtual requests into
-				 * the secondary ports so that we can migrate
-				 * the request immediately to another engine
-				 * rather than wait for the primary request.
-				 */
-				if (rq->execution_mask != engine->mask)
-					goto done;
+			/*
+			 * If GVT overrides us we only ever submit
+			 * port[0], leaving port[1] empty. Note that we
+			 * also have to be careful that we don't queue
+			 * the same context (even though a different
+			 * request) to the second port.
+			 */
+			if (ctx_single_port_submission(last->context) ||
+			    ctx_single_port_submission(rq->context))
+				goto done;
 
-				/*
-				 * If GVT overrides us we only ever submit
-				 * port[0], leaving port[1] empty. Note that we
-				 * also have to be careful that we don't queue
-				 * the same context (even though a different
-				 * request) to the second port.
-				 */
-				if (ctx_single_port_submission(last->context) ||
-				    ctx_single_port_submission(rq->context))
-					goto done;
-
-				merge = false;
-			}
-
-			if (__i915_request_submit(rq)) {
-				if (!merge) {
-					*port++ = i915_request_get(last);
-					last = NULL;
-				}
-
-				GEM_BUG_ON(last &&
-					   !can_merge_ctx(last->context,
-							  rq->context));
-				GEM_BUG_ON(last &&
-					   i915_seqno_passed(last->fence.seqno,
-							     rq->fence.seqno));
-
-				submit = true;
-				last = rq;
-			}
+			merge = false;
 		}
 
-		rb_erase_cached(&p->node, &se->queue);
-		i915_priolist_free(p);
+		if (__i915_request_submit(rq)) {
+			if (!merge) {
+				*port++ = i915_request_get(last);
+				last = NULL;
+			}
+
+			GEM_BUG_ON(last &&
+				   !can_merge_ctx(last->context,
+						  rq->context));
+			GEM_BUG_ON(last &&
+				   i915_seqno_passed(last->fence.seqno,
+						     rq->fence.seqno));
+
+			submit = true;
+			last = rq;
+		}
 	}
 done:
 	*port++ = i915_request_get(last);
@@ -1456,7 +1457,7 @@  static void execlists_dequeue(struct intel_engine_cs *engine)
 	 * request triggering preemption on the next dequeue (or subsequent
 	 * interrupt for secondary ports).
 	 */
-	execlists->queue_priority_hint = queue_prio(se);
+	execlists->queue_priority_hint = pl->priority;
 	spin_unlock(&se->lock);
 
 	/*
@@ -2716,7 +2717,6 @@  static void execlists_reset_cancel(struct intel_engine_cs *engine)
 	}
 
 	execlists->queue_priority_hint = INT_MIN;
-	se->queue = RB_ROOT_CACHED;
 
 	GEM_BUG_ON(__tasklet_is_enabled(&se->tasklet));
 	se->tasklet.callback = nop_submission_tasklet;
@@ -3173,6 +3173,8 @@  static void virtual_context_exit(struct intel_context *ce)
 
 	for (n = 0; n < ve->num_siblings; n++)
 		intel_engine_pm_put(ve->siblings[n]);
+
+	i915_sched_park(intel_engine_get_scheduler(&ve->base));
 }
 
 static const struct intel_context_ops virtual_context_ops = {
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index d14b9db77df8..c16393df42a0 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
@@ -60,11 +60,6 @@ 
 
 #define GUC_REQUEST_SIZE 64 /* bytes */
 
-static inline struct i915_priolist *to_priolist(struct rb_node *rb)
-{
-	return rb_entry(rb, struct i915_priolist, node);
-}
-
 static struct guc_stage_desc *__get_stage_desc(struct intel_guc *guc, u32 id)
 {
 	struct guc_stage_desc *base = guc->stage_desc_pool_vaddr;
@@ -186,9 +181,10 @@  static void __guc_dequeue(struct intel_engine_cs *engine)
 	struct i915_request **first = execlists->inflight;
 	struct i915_request ** const last_port = first + execlists->port_mask;
 	struct i915_request *last = first[0];
+	struct i915_request *rq, *rn;
 	struct i915_request **port;
+	struct i915_priolist *pl;
 	bool submit = false;
-	struct rb_node *rb;
 
 	lockdep_assert_held(&se->lock);
 
@@ -205,32 +201,22 @@  static void __guc_dequeue(struct intel_engine_cs *engine)
 	 * event.
 	 */
 	port = first;
-	while ((rb = rb_first_cached(&se->queue))) {
-		struct i915_priolist *p = to_priolist(rb);
-		struct i915_request *rq, *rn;
+	i915_sched_dequeue(se, pl, rq, rn) {
+		if (last && rq->context != last->context) {
+			if (port == last_port)
+				goto done;
 
-		priolist_for_each_request_consume(rq, rn, p) {
-			if (last && rq->context != last->context) {
-				if (port == last_port)
-					goto done;
-
-				*port = schedule_in(last,
-						    port - execlists->inflight);
-				port++;
-			}
-
-			list_del_init(&rq->sched.link);
-			__i915_request_submit(rq);
-			submit = true;
-			last = rq;
+			*port = schedule_in(last, port - execlists->inflight);
+			port++;
 		}
 
-		rb_erase_cached(&p->node, &se->queue);
-		i915_priolist_free(p);
+		list_del_init(&rq->sched.link);
+		__i915_request_submit(rq);
+		submit = true;
+		last = rq;
 	}
 done:
-	execlists->queue_priority_hint =
-		rb ? to_priolist(rb)->priority : INT_MIN;
+	execlists->queue_priority_hint = pl->priority;
 	if (submit) {
 		*port = schedule_in(last, port - execlists->inflight);
 		*++port = NULL;
@@ -361,7 +347,6 @@  static void guc_reset_cancel(struct intel_engine_cs *engine)
 	__i915_sched_cancel_queue(se);
 
 	execlists->queue_priority_hint = INT_MIN;
-	se->queue = RB_ROOT_CACHED;
 
 	spin_unlock_irqrestore(&se->lock, flags);
 	intel_engine_signal_breadcrumbs(engine);
diff --git a/drivers/gpu/drm/i915/i915_priolist_types.h b/drivers/gpu/drm/i915/i915_priolist_types.h
index bc2fa84f98a8..ee7482b9c813 100644
--- a/drivers/gpu/drm/i915/i915_priolist_types.h
+++ b/drivers/gpu/drm/i915/i915_priolist_types.h
@@ -38,10 +38,72 @@  enum {
 #define I915_PRIORITY_UNPREEMPTABLE INT_MAX
 #define I915_PRIORITY_BARRIER (I915_PRIORITY_UNPREEMPTABLE - 1)
 
+/*
+ * The slab returns power-of-two chunks of memory, so fill out the
+ * node to the next cacheline.
+ *
+ * We can estimate how many requests the skiplist will scale to based
+ * on its height:
+ *   11 =>  4 million requests
+ *   12 => 16 million requests
+ */
+#ifdef CONFIG_64BIT
+#define I915_PRIOLIST_HEIGHT 12
+#else
+#define I915_PRIOLIST_HEIGHT 11
+#endif
+
+/*
+ * i915_priolist forms a skiplist. The skiplist is built in layers,
+ * starting at the base [0] is a singly linked list of all i915_priolist.
+ * Each higher layer contains a fraction of the i915_priolist from the
+ * previous layer:
+ *
+ * S[0] 0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF0123456789ABCDEF S
+ * E[1] >1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F>1>3>5>7>9>B>D>F E
+ * N[2] -->3-->7-->B-->F-->3-->7-->B-->F-->3-->7-->B-->F-->3-->7-->B-->F N
+ * T[3] ------>7------>F-------7------>F------>7------>F------>7------>F T
+ * I[4] -------------->F-------------->F-------------->F-------------->F I
+ * N[5] ------------------------------>F------------------------------>F N
+ * E[6] ------------------------------>F-------------------------------> E
+ * L[7] ---------------------------------------------------------------> L
+ *
+ * To iterate through all active i915_priolist, we only need to follow
+ * the chain in i915_priolist.next[0] (see for_each_priolist()).
+ *
+ * To quickly find a specific key (or insert point), we can perform a binary
+ * search by starting at the highest level and following the linked list
+ * at that level until we either find the node, or have gone passed the key.
+ * Then we descend a level, and start walking the list again starting from
+ * the current position, until eventually we find our key, or we run out of
+ * levels.
+ *
+ * https://en.wikipedia.org/wiki/Skip_list
+ */
 struct i915_priolist {
 	struct list_head requests;
-	struct rb_node node;
 	int priority;
+
+	int level;
+	struct i915_priolist *next[I915_PRIOLIST_HEIGHT];
 };
 
+struct i915_priolist_root {
+	struct i915_priolist sentinel;
+	u32 prng;
+};
+
+#define i915_priolist_is_empty(root) ((root)->sentinel.level < 0)
+
+#define for_each_priolist(p, root) \
+	for ((p) = (root)->sentinel.next[0]; \
+	     (p) != &(root)->sentinel; \
+	     (p) = (p)->next[0])
+
+#define priolist_for_each_request(it, plist) \
+	list_for_each_entry(it, &(plist)->requests, sched.link)
+
+#define priolist_for_each_request_safe(it, n, plist) \
+	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
+
 #endif /* _I915_PRIOLIST_TYPES_H_ */
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index 312e1538d001..518eac67959e 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -4,7 +4,9 @@ 
  * Copyright © 2018 Intel Corporation
  */
 
+#include <linux/bitops.h>
 #include <linux/mutex.h>
+#include <linux/prandom.h>
 
 #include "gt/intel_ring.h"
 #include "gt/intel_lrc_reg.h"
@@ -168,6 +170,16 @@  void i915_sched_select_mode(struct i915_sched *se, enum i915_sched_mode mode)
 	}
 }
 
+static void init_priolist(struct i915_priolist_root *const root)
+{
+	struct i915_priolist *pl = &root->sentinel;
+
+	memset_p((void **)pl->next, pl, ARRAY_SIZE(pl->next));
+	pl->requests.prev = NULL;
+	pl->priority = INT_MIN;
+	pl->level = -1;
+}
+
 void i915_sched_init(struct i915_sched *se,
 		     struct device *dev,
 		     const char *name,
@@ -183,9 +195,9 @@  void i915_sched_init(struct i915_sched *se,
 
 	se->mask = mask;
 
+	init_priolist(&se->queue);
 	INIT_LIST_HEAD(&se->requests);
 	INIT_LIST_HEAD(&se->hold);
-	se->queue = RB_ROOT_CACHED;
 
 	init_ipi(&se->ipi);
 
@@ -194,8 +206,60 @@  void i915_sched_init(struct i915_sched *se,
 	se->revoke_context = i915_sched_default_revoke_context;
 }
 
+__maybe_unused static bool priolist_idle(struct i915_priolist_root *root)
+{
+	struct i915_priolist *pl = &root->sentinel;
+	int lvl;
+
+	for (lvl = 0; lvl < ARRAY_SIZE(pl->next); lvl++) {
+		if (pl->next[lvl] != pl) {
+			GEM_TRACE_ERR("root[%d] is not empty\n", lvl);
+			return false;
+		}
+	}
+
+	if (pl->level != -1) {
+		GEM_TRACE_ERR("root is not clear: %d\n", pl->level);
+		return false;
+	}
+
+	return true;
+}
+
+static bool pl_empty(struct list_head *st)
+{
+	return !st->prev;
+}
+
+static void pl_push(struct i915_priolist *pl, struct list_head *st)
+{
+	/* Keep list_empty(&pl->requests) valid for concurrent readers */
+	pl->requests.prev = st->prev;
+	st->prev = &pl->requests;
+	GEM_BUG_ON(pl_empty(st));
+}
+
+static struct i915_priolist *pl_pop(struct list_head *st)
+{
+	struct i915_priolist *pl;
+
+	GEM_BUG_ON(pl_empty(st));
+	pl = container_of(st->prev, typeof(*pl), requests);
+	st->prev = pl->requests.prev;
+
+	return pl;
+}
+
 void i915_sched_park(struct i915_sched *se)
 {
+	struct i915_priolist_root *root = &se->queue;
+	struct list_head *list = &root->sentinel.requests;
+
+	GEM_BUG_ON(!priolist_idle(root));
+
+	while (!pl_empty(list))
+		kmem_cache_free(global.slab_priorities, pl_pop(list));
+
 	GEM_BUG_ON(!i915_sched_is_idle(se));
 	se->no_priolist = false;
 }
@@ -251,70 +315,71 @@  static inline bool node_signaled(const struct i915_sched_node *node)
 	return i915_request_completed(node_to_request(node));
 }
 
-static inline struct i915_priolist *to_priolist(struct rb_node *rb)
+static inline unsigned int random_level(struct i915_priolist_root *root)
 {
-	return rb_entry(rb, struct i915_priolist, node);
-}
-
-static void assert_priolists(struct i915_sched * const se)
-{
-	struct rb_node *rb;
-	long last_prio;
-
-	if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
-		return;
-
-	GEM_BUG_ON(rb_first_cached(&se->queue) !=
-		   rb_first(&se->queue.rb_root));
-
-	last_prio = INT_MAX;
-	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
-		const struct i915_priolist *p = to_priolist(rb);
-
-		GEM_BUG_ON(p->priority > last_prio);
-		last_prio = p->priority;
-	}
+	/*
+	 * Given a uniform distribution of random numbers over the u32, then
+	 * the probability each bit being unset is P=0.5. The probability of a
+	 * successive sequence of bits being unset is P(n) = 0.5^n [n > 0].
+	 *   P(level:1) = 0.5
+	 *   P(level:2) = 0.25
+	 *   P(level:3) = 0.125
+	 *   P(level:4) = 0.0625
+	 *   ...
+	 * So we can use ffs() on a good random number generator to pick our
+	 * level. We divide by two to reduce the probability of choosing a
+	 * level to .25, as the cost of descending a level is the same as
+	 * following an extra link in the chain at that level (so we can
+	 * pack more nodes into fewer levels without incurring extra cost,
+	 * and allow scaling to higher volumes of requests without expanding
+	 * the height of the skiplist).
+	 */
+	root->prng = next_pseudo_random32(root->prng);
+	return  __ffs(root->prng) / 2;
 }
 
 static struct list_head *
 lookup_priolist(struct i915_sched *se, int prio)
 {
-	struct i915_priolist *p;
-	struct rb_node **parent, *rb;
-	bool first = true;
+	struct i915_priolist *update[I915_PRIOLIST_HEIGHT];
+	struct i915_priolist_root *const root = &se->queue;
+	struct i915_priolist *pl, *tmp;
+	int lvl;
 
 	lockdep_assert_held(&se->lock);
-	assert_priolists(se);
-
 	if (unlikely(se->no_priolist))
 		prio = I915_PRIORITY_NORMAL;
 
+	for_each_priolist(pl, root) { /* recycle any empty elements before us */
+		if (pl->priority <= prio || !list_empty(&pl->requests))
+			break;
+
+		__i915_sched_dequeue_next(se);
+	}
+
 find_priolist:
-	/* most positive priority is scheduled first, equal priorities fifo */
-	rb = NULL;
-	parent = &se->queue.rb_root.rb_node;
-	while (*parent) {
-		rb = *parent;
-		p = to_priolist(rb);
-		if (prio > p->priority) {
-			parent = &rb->rb_left;
-		} else if (prio < p->priority) {
-			parent = &rb->rb_right;
-			first = false;
-		} else {
-			return &p->requests;
-		}
+	pl = &root->sentinel;
+	lvl = pl->level;
+	while (lvl >= 0) {
+		while (tmp = pl->next[lvl], tmp->priority >= prio)
+			pl = tmp;
+		if (pl->priority == prio)
+			goto out;
+		update[lvl--] = pl;
 	}
 
 	if (prio == I915_PRIORITY_NORMAL) {
-		p = &se->default_priolist;
+		pl = &se->default_priolist;
+	} else if (!pl_empty(&root->sentinel.requests)) {
+		pl = pl_pop(&root->sentinel.requests);
 	} else {
-		p = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
+		pl = kmem_cache_alloc(global.slab_priorities, GFP_ATOMIC);
 		/* Convert an allocation failure to a priority bump */
-		if (unlikely(!p)) {
+		if (unlikely(!pl)) {
 			prio = I915_PRIORITY_NORMAL; /* recurses just once */
 
-			/* To maintain ordering with all rendering, after an
+			/*
+			 * To maintain ordering with all rendering, after an
 			 * allocation failure we have to disable all scheduling.
 			 * Requests will then be executed in fifo, and schedule
 			 * will ensure that dependencies are emitted in fifo.
@@ -327,18 +392,123 @@  lookup_priolist(struct i915_sched *se, int prio)
 		}
 	}
 
-	p->priority = prio;
-	INIT_LIST_HEAD(&p->requests);
+	pl->priority = prio;
+	INIT_LIST_HEAD(&pl->requests);
 
-	rb_link_node(&p->node, rb, parent);
-	rb_insert_color_cached(&p->node, &se->queue, first);
+	lvl = random_level(root);
+	if (lvl > root->sentinel.level) {
+		if (root->sentinel.level < I915_PRIOLIST_HEIGHT - 1) {
+			lvl = ++root->sentinel.level;
+			update[lvl] = &root->sentinel;
+		} else {
+			lvl = I915_PRIOLIST_HEIGHT - 1;
+		}
+	}
+	GEM_BUG_ON(lvl < 0);
+	GEM_BUG_ON(lvl >= ARRAY_SIZE(pl->next));
 
-	return &p->requests;
+	pl->level = lvl;
+	do {
+		tmp = update[lvl];
+		pl->next[lvl] = tmp->next[lvl];
+		tmp->next[lvl] = pl;
+	} while (--lvl >= 0);
+
+	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)) {
+		struct i915_priolist *chk;
+
+		chk = &root->sentinel;
+		lvl = chk->level;
+		do {
+			while (tmp = chk->next[lvl], tmp->priority >= prio)
+				chk = tmp;
+		} while (--lvl >= 0);
+
+		GEM_BUG_ON(chk != pl);
+	}
+
+out:
+	GEM_BUG_ON(pl == &root->sentinel);
+	return &pl->requests;
 }
 
-void __i915_priolist_free(struct i915_priolist *p)
+static void __remove_priolist(struct i915_sched *se, struct list_head *plist)
 {
-	kmem_cache_free(global.slab_priorities, p);
+	struct i915_priolist_root *root = &se->queue;
+	struct i915_priolist *pl, *tmp;
+	struct i915_priolist *old =
+		container_of(plist, struct i915_priolist, requests);
+	int prio = old->priority;
+	int lvl;
+
+	lockdep_assert_held(&se->lock);
+	GEM_BUG_ON(!list_empty(plist));
+
+	pl = &root->sentinel;
+	lvl = pl->level;
+	GEM_BUG_ON(lvl < 0);
+
+	if (prio != I915_PRIORITY_NORMAL)
+		pl_push(old, &pl->requests);
+
+	do {
+		while (tmp = pl->next[lvl], tmp->priority > prio)
+			pl = tmp;
+		if (lvl <= old->level) {
+			pl->next[lvl] = old->next[lvl];
+			if (pl == &root->sentinel && old->next[lvl] == pl) {
+				GEM_BUG_ON(pl->level != lvl);
+				pl->level--;
+			}
+		}
+	} while (--lvl >= 0);
+	GEM_BUG_ON(tmp != old);
+}
+
+static void remove_from_priolist(struct i915_sched *se,
+				 struct i915_request *rq,
+				 struct list_head *list,
+				 bool tail)
+{
+	struct list_head *prev = rq->sched.link.prev;
+
+	GEM_BUG_ON(!i915_request_in_priority_queue(rq));
+
+	__list_del_entry(&rq->sched.link);
+	if (tail)
+		list_add_tail(&rq->sched.link, list);
+	else
+		list_add(&rq->sched.link, list);
+
+	/* If we just removed the last element in the old plist, delete it */
+	if (list_empty(prev))
+		__remove_priolist(se, prev);
+}
+
+struct i915_priolist *__i915_sched_dequeue_next(struct i915_sched *se)
+{
+	struct i915_priolist * const s = &se->queue.sentinel;
+	struct i915_priolist *pl = s->next[0];
+	int lvl;
+
+	GEM_BUG_ON(!list_empty(&pl->requests));
+	GEM_BUG_ON(pl == s);
+
+	/* Keep pl->next[0] valid for for_each_priolist iteration */
+	if (pl->priority != I915_PRIORITY_NORMAL)
+		pl_push(pl, &s->requests);
+
+	lvl = pl->level;
+	GEM_BUG_ON(lvl < 0);
+	do {
+		s->next[lvl] = pl->next[lvl];
+		if (pl->next[lvl] == s) {
+			GEM_BUG_ON(s->level != lvl);
+			s->level--;
+		}
+	} while (--lvl >= 0);
+
+	return pl->next[0];
 }
 
 static struct i915_request *
@@ -491,7 +661,7 @@  static void __i915_request_set_priority(struct i915_request *rq, int prio)
 
 		GEM_BUG_ON(rq->engine != engine);
 		if (i915_request_in_priority_queue(rq))
-			list_move_tail(&rq->sched.link, plist);
+			remove_from_priolist(se, rq, plist, true);
 
 		/* Defer (tasklet) submission until after all updates. */
 		kick_submission(engine, rq, prio);
@@ -627,8 +797,7 @@  void __i915_sched_defer_request(struct intel_engine_cs *engine,
 
 		/* Note list is reversed for waiters wrt signal hierarchy */
 		GEM_BUG_ON(rq->engine != engine);
-		GEM_BUG_ON(!i915_request_in_priority_queue(rq));
-		list_move(&rq->sched.link, &dfs);
+		remove_from_priolist(se, rq, &dfs, false);
 
 		/* Track our visit, and prevent duplicate processing */
 		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
@@ -927,7 +1096,7 @@  void i915_sched_resume_request(struct intel_engine_cs *engine,
 void __i915_sched_cancel_queue(struct i915_sched *se)
 {
 	struct i915_request *rq, *rn;
-	struct rb_node *rb;
+	struct i915_priolist *pl;
 
 	lockdep_assert_held(&se->lock);
 
@@ -936,16 +1105,9 @@  void __i915_sched_cancel_queue(struct i915_sched *se)
 		i915_request_put(i915_request_mark_eio(rq));
 
 	/* Flush the queued requests to the timeline list (for retiring). */
-	while ((rb = rb_first_cached(&se->queue))) {
-		struct i915_priolist *p = to_priolist(rb);
-
-		priolist_for_each_request_consume(rq, rn, p) {
-			i915_request_put(i915_request_mark_eio(rq));
-			__i915_request_submit(rq);
-		}
-
-		rb_erase_cached(&p->node, &se->queue);
-		i915_priolist_free(p);
+	i915_sched_dequeue(se, pl, rq, rn) {
+		i915_request_put(i915_request_mark_eio(rq));
+		__i915_request_submit(rq);
 	}
 	GEM_BUG_ON(!i915_sched_is_idle(se));
 
@@ -1225,9 +1387,9 @@  void i915_sched_show(struct drm_printer *m,
 		     unsigned int max)
 {
 	const struct i915_request *rq, *last;
+	struct i915_priolist *pl;
 	unsigned long flags;
 	unsigned int count;
-	struct rb_node *rb;
 
 	rcu_read_lock();
 	spin_lock_irqsave(&se->lock, flags);
@@ -1282,10 +1444,8 @@  void i915_sched_show(struct drm_printer *m,
 
 	last = NULL;
 	count = 0;
-	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
-		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
-
-		priolist_for_each_request(rq, p) {
+	for_each_priolist(pl, &se->queue) {
+		priolist_for_each_request(rq, pl) {
 			if (count++ < max - 1)
 				show_request(m, rq, "\t", 0);
 			else
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index fe392109b112..872d221f6ba7 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -24,12 +24,6 @@  struct intel_engine_cs;
 		  ##__VA_ARGS__);					\
 } while (0)
 
-#define priolist_for_each_request(it, plist) \
-	list_for_each_entry(it, &(plist)->requests, sched.link)
-
-#define priolist_for_each_request_consume(it, n, plist) \
-	list_for_each_entry_safe(it, n, &(plist)->requests, sched.link)
-
 void i915_sched_node_init(struct i915_sched_node *node);
 void i915_sched_node_reinit(struct i915_sched_node *node);
 
@@ -100,7 +94,7 @@  static inline void i915_priolist_free(struct i915_priolist *p)
 
 static inline bool i915_sched_is_idle(const struct i915_sched *se)
 {
-	return RB_EMPTY_ROOT(&se->queue.rb_root);
+	return i915_priolist_is_empty(&se->queue);
 }
 
 static inline bool
@@ -168,6 +162,14 @@  i915_sched_get_active_request(const struct i915_sched *se)
 	return NULL;
 }
 
+/* Walk the queue in submission order, unlinking each priolist as its requests are consumed */
+struct i915_priolist *__i915_sched_dequeue_next(struct i915_sched *se);
+#define i915_sched_dequeue(se, pl, rq, rn) \
+	for ((pl) = (se)->queue.sentinel.next[0]; \
+	     (pl) != &(se)->queue.sentinel; \
+	     (pl) = __i915_sched_dequeue_next(se)) \
+		priolist_for_each_request_safe(rq, rn, pl)
+
 void i915_request_show_with_schedule(struct drm_printer *m,
 				     const struct i915_request *rq,
 				     const char *prefix,
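The i915_sched_dequeue() macro above expands to two nested loops, so a caller's body runs once per queued request in submission order, and the body is expected to take each request off its priolist before the outer loop advances (__i915_sched_dequeue_next() asserts the list is empty). A hypothetical consumer, shaped like __i915_sched_cancel_queue() in i915_scheduler.c and shown purely as a usage sketch:

/*
 * Hypothetical caller of i915_sched_dequeue(), for illustration only; it is
 * not part of this patch.  Because the macro is two nested for loops, a bare
 * break in the body only leaves the inner priolist_for_each_request_safe()
 * walk, not the whole traversal.
 */
static void flush_queue(struct i915_sched *se)
{
        struct i915_priolist *pl;
        struct i915_request *rq, *rn;

        lockdep_assert_held(&se->lock);

        i915_sched_dequeue(se, pl, rq, rn) {
                /*
                 * The body must unlink rq from pl (here via
                 * __i915_request_submit()) so the priolist is empty by the
                 * time __i915_sched_dequeue_next() drops it.
                 */
                __i915_request_submit(rq);
        }
        GEM_BUG_ON(!i915_sched_is_idle(se));
}
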
diff --git a/drivers/gpu/drm/i915/i915_scheduler_types.h b/drivers/gpu/drm/i915/i915_scheduler_types.h
index 5ca2dc1b4fb5..bc668f375097 100644
--- a/drivers/gpu/drm/i915/i915_scheduler_types.h
+++ b/drivers/gpu/drm/i915/i915_scheduler_types.h
@@ -115,7 +115,7 @@  struct i915_sched {
 	 * @queue is only used to transfer requests from the scheduler
 	 * frontend to the back.
 	 */
-	struct rb_root_cached queue;
+	struct i915_priolist_root queue;
 
 	/**
 	 * @tasklet: softirq tasklet for bottom half
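The i915_priolist_root and i915_priolist definitions live in i915_priolist_types.h earlier in this patch; pieced together from the accessors used above (sentinel, next[], level, priority, requests), the shape is roughly the following. Field order, comments and anything not named in these hunks are assumptions:

/*
 * Approximate shape only; the authoritative definitions (including
 * I915_PRIOLIST_HEIGHT) are in i915_priolist_types.h.
 */
struct i915_priolist {
        struct list_head requests;      /* ready requests at this priority */
        int priority;
        int level;                      /* highest populated next[] index */
        struct i915_priolist *next[I915_PRIOLIST_HEIGHT];
};

struct i915_priolist_root {
        struct i915_priolist sentinel;  /* next[] point back here when empty;
                                         * retired nodes are collected on
                                         * sentinel.requests via pl_push()
                                         */
};
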
diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
index 3db34d3eea58..946c93441c1f 100644
--- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
+++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
@@ -25,6 +25,7 @@  selftest(ring, intel_ring_mock_selftests)
 selftest(engine, intel_engine_cs_mock_selftests)
 selftest(timelines, intel_timeline_mock_selftests)
 selftest(requests, i915_request_mock_selftests)
+selftest(scheduler, i915_scheduler_mock_selftests)
 selftest(objects, i915_gem_object_mock_selftests)
 selftest(phys, i915_gem_phys_mock_selftests)
 selftest(dmabuf, i915_gem_dmabuf_mock_selftests)
diff --git a/drivers/gpu/drm/i915/selftests/i915_scheduler.c b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
index f54bdbeaa48b..2bb2d3d07d06 100644
--- a/drivers/gpu/drm/i915/selftests/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/selftests/i915_scheduler.c
@@ -12,6 +12,54 @@ 
 #include "selftests/igt_spinner.h"
 #include "selftests/i915_random.h"
 
+static int mock_skiplist_levels(void *dummy)
+{
+	struct i915_priolist_root root = {};
+	struct i915_priolist *pl = &root.sentinel;
+	IGT_TIMEOUT(end_time);
+	unsigned long total;
+	int count, lvl;
+
+	total = 0;
+	do {
+		for (count = 0; count < 16384; count++) {
+			lvl = random_level(&root);
+			if (lvl > pl->level) {
+				if (lvl < I915_PRIOLIST_HEIGHT - 1)
+					lvl = ++pl->level;
+				else
+					lvl = I915_PRIOLIST_HEIGHT - 1;
+			}
+
+			pl->next[lvl] = ptr_inc(pl->next[lvl]);
+		}
+		total += count;
+	} while (!__igt_timeout(end_time, NULL));
+
+	pr_info("Total %9lu\n", total);
+	for (lvl = 0; lvl <= pl->level; lvl++) {
+		int x = ilog2((unsigned long)pl->next[lvl]);
+		char row[80];
+
+		memset(row, '*', x);
+		row[x] = '\0';
+
+		pr_info(" [%2d] %9lu %s\n",
+			lvl, (unsigned long)pl->next[lvl], row);
+	}
+
+	return 0;
+}
+
+int i915_scheduler_mock_selftests(void)
+{
+	static const struct i915_subtest tests[] = {
+		SUBTEST(mock_skiplist_levels),
+	};
+
+	return i915_subtests(tests, NULL);
+}
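
mock_skiplist_levels() reuses next[] as plain hit counters to histogram the levels handed out by random_level(), which is defined earlier in this patch and not shown in this hunk. With a conventional coin-flip level generator, each level is chosen about half as often as the one below it, so each printed row of '*' (ilog2 of the count) should be roughly one character shorter than the previous row. A toy generator of that kind, given only as an assumption about what random_level() resembles:

/*
 * Toy coin-flip level generator (userspace, illustrative); not the
 * random_level() implementation from this patch.  P(level >= n) = 2^-n,
 * so a histogram of its output halves at each level.
 */
#include <stdlib.h>

#define HEIGHT 8        /* stand-in for I915_PRIOLIST_HEIGHT */

static int toy_random_level(void)
{
        int lvl = 0;

        while (lvl < HEIGHT - 1 && (rand() & 1))
                lvl++;

        return lvl;
}
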
+
 static void scheduling_disable(struct intel_engine_cs *engine)
 {
 	engine->props.preempt_timeout_ms = 0;
@@ -80,9 +128,9 @@  static int all_engines(struct drm_i915_private *i915,
 static bool check_context_order(struct i915_sched *se)
 {
 	u64 last_seqno, last_context;
+	struct i915_priolist *p;
 	unsigned long count;
 	bool result = false;
-	struct rb_node *rb;
 	int last_prio;
 
 	/* We expect the execution order to follow ascending fence-context */
@@ -92,8 +140,7 @@  static bool check_context_order(struct i915_sched *se)
 	last_context = 0;
 	last_seqno = 0;
 	last_prio = 0;
-	for (rb = rb_first_cached(&se->queue); rb; rb = rb_next(rb)) {
-		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
+	for_each_priolist(p, &se->queue) {
 		struct i915_request *rq;
 
 		priolist_for_each_request(rq, p) {