diff mbox

[RFC,31/44] drm/i915: Implemented the GPU scheduler

Message ID 1403803475-16337-32-git-send-email-John.C.Harrison@Intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

John Harrison June 26, 2014, 5:24 p.m. UTC
From: John Harrison <John.C.Harrison@Intel.com>

Filled in all the 'do stuff here' blanks...

The general theory of operation is that when batch buffers are submitted to the
driver, the execbuffer() code assigns a unique seqno value and then packages up
all the information required to execute the batch buffer at a later time. This
package is given over to the scheduler which adds it to an internal node list.
The scheduler also scans the list of objects associated with the batch buffer
and compares them against the objects already in use by other buffers in the
node list. If matches are found then the new batch buffer node is marked as
being dependent upon the matching node. The same is done for the context object.
The scheduler also bumps up the priority of such matching nodes on the grounds
that the more dependencies a given batch buffer has the more important it is
likely to be.

The scheduler aims to have a given (tuneable) number of batch buffers in flight
on the hardware at any given time. If fewer than this are currently executing
when a new node is queued, then the node is passed straight through to the
submit function. Otherwise it is simply added to the queue and the driver
returns back to user land.

As each batch buffer completes, it raises an interrupt which wakes up the
scheduler. Note that it is possible for multiple buffers to complete before the
IRQ handler gets to run. Further, the seqno values of the individual buffers are
not necessarily incrementing as the scheduler may have re-ordered their
submission. However, the scheduler keeps the list of executing buffers in order
of hardware submission. Thus it can scan through the list until a matching seqno
is found and then mark all in flight nodes from that point on as completed.

A deferred work queue is also poked by the interrupt handler. When this wakes up
it can do more involved processing such as actually removing completed nodes
from the queue and freeing up the resources associated with them (internal
memory allocations, DRM object references, context reference, etc.). The work
handler also checks the in flight count and calls the submission code if a new
slot has appeared.

When the scheduler's submit code is called, it scans the queued node list for
the highest priority node that has no unmet dependencies. Note that the
dependency calculation is complex as it must take inter-ring dependencies and
potential preemptions into account. Note also that in the future this will be
extended to include external dependencies such as the Android Native Sync file
descriptors and/or the Linux dma-buf synchronisation scheme.

If a suitable node is found then it is sent to execbuff_final() for submission
to the hardware. The in flight count is then re-checked and a new node popped
from the list if appropriate.

Note that this change does not implement pre-emptive scheduling. Only basic
scheduling by re-ordering batch buffer submission is currently implemented.
---
 drivers/gpu/drm/i915/i915_scheduler.c |  945 +++++++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_scheduler.h |   59 +-
 2 files changed, 965 insertions(+), 39 deletions(-)
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index 6a10a76..1816f1d 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -41,6 +41,7 @@  int i915_scheduler_init(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
 	struct i915_scheduler   *scheduler = dev_priv->scheduler;
+	int                     r;
 
 	if (scheduler)
 		return 0;
@@ -51,8 +52,16 @@  int i915_scheduler_init(struct drm_device *dev)
 
 	spin_lock_init(&scheduler->lock);
 
+	for (r = 0; r < I915_NUM_RINGS; r++)
+		INIT_LIST_HEAD(&scheduler->node_queue[r]);
+
 	scheduler->index = 1;
 
+	/* Default tuning values: */
+	scheduler->priority_level_max     = ~0U;
+	scheduler->priority_level_preempt = 900;
+	scheduler->min_flying             = 2;
+
 	dev_priv->scheduler = scheduler;
 
 	return 0;
@@ -60,50 +69,371 @@  int i915_scheduler_init(struct drm_device *dev)
 
 int i915_scheduler_queue_execbuffer(struct i915_scheduler_queue_entry *qe)
 {
-	struct drm_i915_private     *dev_priv = qe->params.dev->dev_private;
-	struct i915_scheduler       *scheduler = dev_priv->scheduler;
-	int ret, i;
+	struct drm_i915_private *dev_priv = qe->params.dev->dev_private;
+	struct i915_scheduler   *scheduler = dev_priv->scheduler;
+	struct intel_engine_cs  *ring = qe->params.ring;
+	struct i915_scheduler_queue_entry  *node;
+	struct i915_scheduler_queue_entry  *test;
+	struct timespec     stamp;
+	unsigned long       flags;
+	bool                not_flying, found;
+	int                 i, j, r, got_batch = 0;
+	int                 incomplete = 0;
 
 	BUG_ON(!scheduler);
 
-	qe->params.scheduler_index = scheduler->index++;
+	if (i915.scheduler_override & i915_so_direct_submit) {
+		int ret;
 
-	ret = i915_gem_do_execbuffer_final(&qe->params);
+		qe->params.scheduler_index = scheduler->index++;
 
-	/* Need to release the objects: */
-	for (i = 0; i < qe->num_objs; i++) {
-		if (!qe->saved_objects[i].obj)
-			continue;
+		scheduler->flags[qe->params.ring->id] |= i915_sf_submitting;
+		ret = i915_gem_do_execbuffer_final(&qe->params);
+		scheduler->flags[qe->params.ring->id] &= ~i915_sf_submitting;
+
+		/* Need to release the objects: */
+		for (i = 0; i < qe->num_objs; i++) {
+			if (!qe->saved_objects[i].obj)
+				continue;
 
-		drm_gem_object_unreference(&qe->saved_objects[i].obj->base);
+			drm_gem_object_unreference(&qe->saved_objects[i].obj->base);
+		}
+
+		kfree(qe->saved_objects);
+		qe->saved_objects = NULL;
+		qe->num_objs = 0;
+
+		/* Free the context object too: */
+		if (qe->params.ctx)
+			i915_gem_context_unreference(qe->params.ctx);
+
+		/* And anything else owned by the QE structure: */
+		kfree(qe->params.cliprects);
+
+		return ret;
 	}
 
-	kfree(qe->saved_objects);
-	qe->saved_objects = NULL;
-	qe->num_objs = 0;
+	getrawmonotonic(&stamp);
 
-	/* Free the context object too: */
-	if (qe->params.ctx)
-		i915_gem_context_unreference(qe->params.ctx);
+	node = kmalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
 
-	/* And anything else owned by the QE structure: */
-	kfree(qe->params.cliprects);
+	*node = *qe;
+	INIT_LIST_HEAD(&node->link);
+	node->status = i915_sqs_queued;
+	node->stamp  = stamp;
 
-	return ret;
+	/*
+	 * Verify that the batch buffer itself is included in the object list.
+	 */
+	for (i = 0; i < node->num_objs; i++) {
+		if (node->saved_objects[i].obj == node->params.batch_obj)
+			got_batch++;
+	}
+
+	BUG_ON(got_batch != 1);
+
+	/* Need to determine the number of incomplete entries in the list as
+	 * that will be the maximum size of the dependency list.
+	 *
+	 * Note that the allocation must not be made with the spinlock acquired
+	 * as kmalloc can sleep. However, the unlock/relock is safe because no
+	 * new entries can be queued up during the unlock as the i915 driver
+	 * mutex is still held. Entries could be removed from the list but that
+	 * just means the dep_list will be over-allocated which is fine.
+	 */
+	spin_lock_irqsave(&scheduler->lock, flags);
+	for (r = 0; r < I915_NUM_RINGS; r++) {
+		list_for_each_entry(test, &scheduler->node_queue[r], link) {
+			if (I915_SQS_IS_COMPLETE(test))
+				continue;
+
+			incomplete++;
+		}
+	}
+
+	/* Temporarily unlock to allocate memory: */
+	spin_unlock_irqrestore(&scheduler->lock, flags);
+	if (incomplete) {
+		node->dep_list = kmalloc(sizeof(node->dep_list[0]) * incomplete,
+					 GFP_KERNEL);
+		if (!node->dep_list) {
+			kfree(node);
+			return -ENOMEM;
+		}
+	} else
+		node->dep_list = NULL;
+
+	spin_lock_irqsave(&scheduler->lock, flags);
+	node->num_deps = 0;
+
+	if (node->dep_list) {
+		for (r = 0; r < I915_NUM_RINGS; r++) {
+			list_for_each_entry(test, &scheduler->node_queue[r], link) {
+				if (I915_SQS_IS_COMPLETE(test))
+					continue;
+
+				found = (node->params.ctx == test->params.ctx);
+
+				for (i = 0; (i < node->num_objs) && !found; i++) {
+					for (j = 0; j < test->num_objs; j++) {
+						if (node->saved_objects[i].obj !=
+							    test->saved_objects[j].obj)
+							continue;
+
+						found = true;
+						break;
+					}
+				}
+
+				if (found) {
+					node->dep_list[node->num_deps] = test;
+					node->num_deps++;
+				}
+			}
+		}
+
+		BUG_ON(node->num_deps > incomplete);
+	}
+
+	if (node->priority && node->num_deps) {
+		i915_scheduler_priority_bump_clear(scheduler, ring);
+
+		for (i = 0; i < node->num_deps; i++)
+			i915_scheduler_priority_bump(scheduler,
+					node->dep_list[i], node->priority);
+	}
+
+	node->params.scheduler_index = scheduler->index++;
+
+	list_add_tail(&node->link, &scheduler->node_queue[ring->id]);
+
+	not_flying = i915_scheduler_count_flying(scheduler, ring) <
+						 scheduler->min_flying;
+
+	spin_unlock_irqrestore(&scheduler->lock, flags);
+
+	if (not_flying)
+		i915_scheduler_submit(ring, true);
+
+	return 0;
 }
 
 int i915_scheduler_fly_seqno(struct intel_engine_cs *ring, uint32_t seqno)
 {
-	/* Do stuff... */
+	struct i915_scheduler_queue_entry *node;
+	struct drm_i915_private           *dev_priv = ring->dev->dev_private;
+	struct i915_scheduler             *scheduler = dev_priv->scheduler;
+	struct timespec stamp;
+	unsigned long   flags;
+	int             ret;
+
+	BUG_ON(!scheduler);
+
+	/* No need to add if this request is due to a scheduler submission */
+	if (scheduler->flags[ring->id] & i915_sf_submitting)
+		return 0;
+
+	getrawmonotonic(&stamp);
+
+	/* Need to allocate a new node. Note that kzalloc can sleep
+	 * thus the spinlock must not be held yet. */
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&node->link);
+	node->params.ring  = ring;
+	node->params.seqno = seqno;
+	node->params.dev   = ring->dev;
+	node->stamp        = stamp;
+	node->status       = i915_sqs_none;
+
+	spin_lock_irqsave(&scheduler->lock, flags);
+	ret = i915_scheduler_fly_node(node);
+	spin_unlock_irqrestore(&scheduler->lock, flags);
+
+	return ret;
+}
+
+int i915_scheduler_fly_node(struct i915_scheduler_queue_entry *node)
+{
+	struct drm_i915_private *dev_priv = node->params.dev->dev_private;
+	struct i915_scheduler   *scheduler = dev_priv->scheduler;
+	struct intel_engine_cs  *ring;
+
+	BUG_ON(!scheduler);
+	BUG_ON(!node);
+	BUG_ON(node->status != i915_sqs_none);
+
+	ring = node->params.ring;
+
+	/* Add the node (which should currently be in state none) to the front
+	 * of the queue. This ensures that flying nodes are always held in
+	 * hardware submission order. */
+	list_add(&node->link, &scheduler->node_queue[ring->id]);
+
+	node->status = i915_sqs_flying;
+
+	if (!(scheduler->flags[ring->id] & i915_sf_interrupts_enabled)) {
+		bool    success = true;
+
+		success = ring->irq_get(ring);
+		if (success)
+			scheduler->flags[ring->id] |= i915_sf_interrupts_enabled;
+		else
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Nodes are considered valid dependencies if they are queued on any ring or
+ * if they are in flight on a different ring. In flight on the same ring is no
+ * longer interesting for non-preemptive nodes as the ring serialises execution.
+ * For pre-empting nodes, all in flight dependencies are valid as they must not
+ * be jumped by the act of pre-empting.
+ *
+ * Anything that is neither queued nor flying is uninteresting.
+ */
+static inline bool i915_scheduler_is_dependency_valid(
+			struct i915_scheduler_queue_entry *node, uint32_t idx)
+{
+	struct i915_scheduler_queue_entry *dep;
+
+	dep = node->dep_list[idx];
+	if (!dep)
+		return false;
+
+	if (I915_SQS_IS_QUEUED(dep))
+		return true;
+
+	if (I915_SQS_IS_FLYING(dep)) {
+		if (node->params.ring != dep->params.ring)
+			return true;
+	}
+
+	return false;
+}
+
+uint32_t i915_scheduler_count_flying(struct i915_scheduler *scheduler,
+				     struct intel_engine_cs *ring)
+{
+	struct i915_scheduler_queue_entry *node;
+	uint32_t                          flying = 0;
+
+	list_for_each_entry(node, &scheduler->node_queue[ring->id], link)
+		if (I915_SQS_IS_FLYING(node))
+			flying++;
+
+	return flying;
+}
+
+/* Add a popped node back in to the queue. For example, because the ring
+ * was hung when execbuff_final() was called and thus the ring submission
+ * needs to be retried later. */
+static void i915_scheduler_node_requeue(struct i915_scheduler_queue_entry *node)
+{
+	BUG_ON(!node);
+	BUG_ON(!I915_SQS_IS_FLYING(node));
+
+	node->status = i915_sqs_queued;
+}
+
+/* Give up on a popped node completely. For example, because it is causing the
+ * ring to hang or is using some resource that no longer exists. */
+static void i915_scheduler_node_kill(struct i915_scheduler_queue_entry *node)
+{
+	BUG_ON(!node);
+	BUG_ON(!I915_SQS_IS_FLYING(node));
+
+	node->status = i915_sqs_complete;
+}
+
+/*
+ * The batch tagged with the indicated sequence number has completed.
+ * Search the queue for it, update its status and those of any batches
+ * submitted earlier, which must also have completed or been preempted
+ * as appropriate.
+ *
+ * Called with spinlock already held.
+ */
+static int i915_scheduler_seqno_complete(struct intel_engine_cs *ring, uint32_t seqno)
+{
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	struct i915_scheduler   *scheduler = dev_priv->scheduler;
+	struct i915_scheduler_queue_entry *node;
+
+	/*
+	 * Batch buffers are added to the head of the list in execution order,
+	 * thus seqno values, although not necessarily incrementing, will be
+	 * met in completion order when scanning the list. So when a match is
+	 * found, all subsequent entries must have also popped out. Conversely,
+	 * if a completed entry is found then there is no need to scan further.
+	 */
+	list_for_each_entry(node, &scheduler->node_queue[ring->id], link) {
+		if (I915_SQS_IS_COMPLETE(node))
+			goto done;
+
+		if (seqno == node->params.seqno)
+			break;
+	}
+
+	/*
+	 * NB: Lots of extra seqnos get added to the ring to track things
+	 * like cache flushes and page flips. So don't complain if
+	 * no node was found.
+	 */
+	if (&node->link == &scheduler->node_queue[ring->id])
+		goto done;
+
+	BUG_ON(!I915_SQS_IS_FLYING(node));
+
+	/* Everything from here can be marked as done: */
+	list_for_each_entry_from(node, &scheduler->node_queue[ring->id], link) {
+		/* Check if the marking has already been done: */
+		if (I915_SQS_IS_COMPLETE(node))
+			break;
+
+		if (!I915_SQS_IS_FLYING(node))
+			continue;
+
+		/* Node was in flight so mark it as complete. */
+		node->status = i915_sqs_complete;
+	}
+
+	/* Should submit new work here if flight list is empty but the DRM
+	 * mutex lock might not be available if a '__wait_seqno()' call is
+	 * blocking the system. */
 
+done:
 	return 0;
 }
 
 int i915_scheduler_handle_IRQ(struct intel_engine_cs *ring)
 {
 	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	struct i915_scheduler   *scheduler = dev_priv->scheduler;
+	unsigned long       flags;
+	static uint32_t     last_seqno;
+	uint32_t            seqno;
+
+	seqno = ring->get_seqno(ring, false);
+
+	if (i915.scheduler_override & i915_so_direct_submit)
+		return 0;
+
+	if (seqno == last_seqno) {
+		/* Why are there sometimes multiple interrupts per seqno? */
+		return 0;
+	}
+	last_seqno = seqno;
 
-	/* Do stuff... */
+	spin_lock_irqsave(&scheduler->lock, flags);
+	i915_scheduler_seqno_complete(ring, seqno);
+	spin_unlock_irqrestore(&scheduler->lock, flags);
 
 	queue_work(dev_priv->wq, &dev_priv->mm.scheduler_work);
 
@@ -112,22 +442,506 @@  int i915_scheduler_handle_IRQ(struct intel_engine_cs *ring)
 
 int i915_scheduler_remove(struct intel_engine_cs *ring)
 {
-	/* Do stuff... */
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+	struct i915_scheduler   *scheduler = dev_priv->scheduler;
+	struct i915_scheduler_queue_entry  *node, *node_next;
+	unsigned long       flags;
+	int                 flying = 0, queued = 0;
+	int                 ret = 0;
+	bool                do_submit;
+	uint32_t            i, min_seqno;
+	struct list_head    remove;
 
-	return 0;
+	if (list_empty(&scheduler->node_queue[ring->id]))
+		return 0;
+
+	spin_lock_irqsave(&scheduler->lock, flags);
+
+	/* /i915_scheduler_dump_locked(ring, "remove/pre");/ */
+
+	/*
+	 * In the case where the system is idle, starting 'min_seqno' from a big
+	 * number will cause all nodes to be removed as they are now back to
+	 * being in-order. However, this will be a problem if the last one to
+	 * complete was actually out-of-order as the ring seqno value will be
+	 * lower than one or more completed buffers. Thus code looking for the
+	 * completion of said buffers will wait forever.
+	 * Instead, use the hardware seqno as the starting point. This means
+	 * that some buffers might be kept around even in a completely idle
+	 * system but it should guarantee that no-one ever gets confused when
+	 * waiting for buffer completion.
+	 */
+	min_seqno = ring->get_seqno(ring, true);
+
+	list_for_each_entry(node, &scheduler->node_queue[ring->id], link) {
+		if (I915_SQS_IS_QUEUED(node))
+			queued++;
+		else if (I915_SQS_IS_FLYING(node))
+			flying++;
+		else if (I915_SQS_IS_COMPLETE(node))
+			continue;
+
+		if (i915_compare_seqno_values(node->params.seqno, min_seqno) < 0)
+			min_seqno = node->params.seqno;
+	}
+
+	INIT_LIST_HEAD(&remove);
+	list_for_each_entry_safe(node, node_next, &scheduler->node_queue[ring->id], link) {
+		/*
+		 * Only remove completed nodes which have a lower seqno than
+		 * all pending nodes. While there is the possibility of the
+		 * ring's seqno counting backwards, all higher buffers must
+		 * be remembered so that the 'i915_seqno_passed()' test can
+		 * report that they have in fact passed.
+		 */
+		if (!I915_SQS_IS_COMPLETE(node))
+			continue;
+
+		if (i915_compare_seqno_values(node->params.seqno, min_seqno) > 0)
+			continue;
+
+		list_del(&node->link);
+		list_add(&node->link, &remove);
+
+		/* Strip the dependency info while the mutex is still locked */
+		i915_scheduler_remove_dependent(scheduler, node);
+
+		continue;
+	}
+
+	/*
+	 * No idea why but this seems to cause problems occasionally.
+	 * Note that the 'irq_put' code is internally reference counted
+	 * and spin_locked so it should be safe to call.
+	 */
+	/*if ((scheduler->flags[ring->id] & i915_sf_interrupts_enabled) &&
+	    (first_flight[ring->id] == NULL)) {
+		ring->irq_put(ring);
+		scheduler->flags[ring->id] &= ~i915_sf_interrupts_enabled;
+	}*/
+
+	/* Launch more packets now? */
+	do_submit = (queued > 0) && (flying < scheduler->min_flying);
+
+	spin_unlock_irqrestore(&scheduler->lock, flags);
+
+	if (do_submit)
+		ret = i915_scheduler_submit(ring, true);
+
+	while (!list_empty(&remove)) {
+		node = list_first_entry(&remove, typeof(*node), link);
+		list_del(&node->link);
+
+		/* Release the locked buffers: */
+		for (i = 0; i < node->num_objs; i++) {
+			drm_gem_object_unreference(
+					    &node->saved_objects[i].obj->base);
+		}
+		kfree(node->saved_objects);
+
+		/* Context too: */
+		if (node->params.ctx)
+			i915_gem_context_unreference(node->params.ctx);
+
+		/* And anything else owned by the node: */
+		kfree(node->params.cliprects);
+		kfree(node->dep_list);
+		kfree(node);
+	}
+
+	return ret;
 }
 
 int i915_scheduler_flush_seqno(struct intel_engine_cs *ring, bool is_locked,
 			       uint32_t seqno)
 {
-	/* Do stuff... */
+	struct i915_scheduler_queue_entry  *node;
+	struct drm_i915_private            *dev_priv;
+	struct i915_scheduler              *scheduler;
+	unsigned long       flags;
+	int                 flush_count = 0;
 
-	return 0;
+	if (!ring)
+		return -EINVAL;
+
+	dev_priv  = ring->dev->dev_private;
+	scheduler = dev_priv->scheduler;
+
+	if (!scheduler)
+		return 0;
+
+	BUG_ON(is_locked && (scheduler->flags[ring->id] & i915_sf_submitting));
+
+	if (list_empty(&scheduler->node_queue[ring->id]))
+		return 0;
+
+	spin_lock_irqsave(&scheduler->lock, flags);
+
+	i915_scheduler_priority_bump_clear(scheduler, ring);
+
+	list_for_each_entry(node, &scheduler->node_queue[ring->id], link) {
+		if (!I915_SQS_IS_QUEUED(node))
+			continue;
+
+		if (node->params.seqno != seqno)
+			continue;
+
+		flush_count += i915_scheduler_priority_bump(scheduler,
+					node, scheduler->priority_level_max);
+	}
+
+	spin_unlock_irqrestore(&scheduler->lock, flags);
+
+	if (flush_count) {
+		DRM_DEBUG_SCHED("<%s> Bumped %d entries\n", ring->name, flush_count);
+		flush_count = i915_scheduler_submit_max_priority(ring, is_locked);
+	}
+
+	return flush_count;
 }
 
 int i915_scheduler_flush(struct intel_engine_cs *ring, bool is_locked)
 {
-	/* Do stuff... */
+	struct i915_scheduler_queue_entry *node;
+	struct drm_i915_private           *dev_priv;
+	struct i915_scheduler             *scheduler;
+	unsigned long       flags;
+	bool        found;
+	int         ret;
+	uint32_t    count = 0;
+
+	if (!ring)
+		return -EINVAL;
+
+	dev_priv  = ring->dev->dev_private;
+	scheduler = dev_priv->scheduler;
+
+	if (!scheduler)
+		return 0;
+
+	BUG_ON(is_locked && (scheduler->flags[ring->id] & i915_sf_submitting));
+
+	do {
+		found = false;
+		spin_lock_irqsave(&scheduler->lock, flags);
+		list_for_each_entry(node, &scheduler->node_queue[ring->id], link) {
+			if (!I915_SQS_IS_QUEUED(node))
+				continue;
+
+			found = true;
+			break;
+		}
+		spin_unlock_irqrestore(&scheduler->lock, flags);
+
+		if (found) {
+			ret = i915_scheduler_submit(ring, is_locked);
+			if (ret < 0)
+				return ret;
+
+			count += ret;
+		}
+	} while (found);
+
+	return count;
+}
+
+void i915_scheduler_priority_bump_clear(struct i915_scheduler *scheduler,
+					struct intel_engine_cs *ring)
+{
+	struct i915_scheduler_queue_entry *node;
+	int i;
+
+	/*
+	 * Ensure circular dependencies don't cause problems and that a bump
+	 * by object usage only bumps each using buffer once:
+	 */
+	for (i = 0; i < I915_NUM_RINGS; i++) {
+		list_for_each_entry(node, &scheduler->node_queue[i], link)
+			node->bumped = false;
+	}
+}
+
+int i915_scheduler_priority_bump(struct i915_scheduler *scheduler,
+				 struct i915_scheduler_queue_entry *target,
+				 uint32_t bump)
+{
+	uint32_t new_priority;
+	int      i, count;
+
+	if (target->priority >= scheduler->priority_level_max)
+		return 1;
+
+	if (target->bumped)
+		return 0;
+
+	new_priority = target->priority + bump;
+	if ((new_priority <= target->priority) ||
+	    (new_priority > scheduler->priority_level_max))
+		target->priority = scheduler->priority_level_max;
+	else
+		target->priority = new_priority;
+
+	count = 1;
+	target->bumped = true;
+
+	for (i = 0; i < target->num_deps; i++) {
+		if (!target->dep_list[i])
+			continue;
+
+		if (target->dep_list[i]->bumped)
+			continue;
+
+		count += i915_scheduler_priority_bump(scheduler,
+						      target->dep_list[i],
+						      bump);
+	}
+
+	return count;
+}
+
+int i915_scheduler_submit_max_priority(struct intel_engine_cs *ring,
+				       bool is_locked)
+{
+	struct i915_scheduler_queue_entry  *node;
+	struct drm_i915_private            *dev_priv = ring->dev->dev_private;
+	struct i915_scheduler              *scheduler = dev_priv->scheduler;
+	unsigned long	flags;
+	int             ret, count = 0;
+	bool            found;
+
+	do {
+		found = false;
+		spin_lock_irqsave(&scheduler->lock, flags);
+		list_for_each_entry(node, &scheduler->node_queue[ring->id], link) {
+			if (!I915_SQS_IS_QUEUED(node))
+				continue;
+
+			if (node->priority < scheduler->priority_level_max)
+				continue;
+
+			found = true;
+			break;
+		}
+		spin_unlock_irqrestore(&scheduler->lock, flags);
+
+		if (!found)
+			break;
+
+		ret = i915_scheduler_submit(ring, is_locked);
+		if (ret < 0)
+			return ret;
+
+		count += ret;
+	} while (found);
+
+	return count;
+}
+
+static int i915_scheduler_pop_from_queue_locked(struct intel_engine_cs *ring,
+				    struct i915_scheduler_queue_entry **pop_node,
+				    unsigned long *flags)
+{
+	struct drm_i915_private            *dev_priv = ring->dev->dev_private;
+	struct i915_scheduler              *scheduler = dev_priv->scheduler;
+	struct i915_scheduler_queue_entry  *best;
+	struct i915_scheduler_queue_entry  *node;
+	int     ret;
+	int     i;
+	bool	any_queued;
+	bool	has_local, has_remote, only_remote;
+
+	*pop_node = NULL;
+	ret = -ENODATA;
+
+	any_queued = false;
+	only_remote = false;
+	best = NULL;
+
+	list_for_each_entry(node, &scheduler->node_queue[ring->id], link) {
+		if (!I915_SQS_IS_QUEUED(node))
+			continue;
+		any_queued = true;
+
+		has_local  = false;
+		has_remote = false;
+		for (i = 0; i < node->num_deps; i++) {
+			if (!i915_scheduler_is_dependency_valid(node, i))
+				continue;
+
+			if (node->dep_list[i]->params.ring == node->params.ring)
+				has_local = true;
+			else
+				has_remote = true;
+		}
+
+		if (has_remote && !has_local)
+			only_remote = true;
+
+		if (!has_local && !has_remote) {
+			if (!best ||
+			    (node->priority > best->priority))
+				best = node;
+		}
+	}
+
+	if (best) {
+		list_del(&best->link);
+
+		INIT_LIST_HEAD(&best->link);
+		best->status  = i915_sqs_none;
+
+		ret = 0;
+	} else {
+		/* Can only get here if:
+		 * (a) there are no buffers in the queue
+		 * (b) all queued buffers are dependent on other buffers
+		 *     e.g. on a buffer that is in flight on a different ring
+		 */
+		if (only_remote) {
+			/* The only dependent buffers are on another ring. */
+			ret = -EAGAIN;
+		} else if (any_queued) {
+			/* It seems that something has gone horribly wrong! */
+			DRM_ERROR("Broken dependency tracking on ring %d!\n",
+				  (int) ring->id);
+		}
+	}
+
+	/* i915_scheduler_dump_queue_pop(ring, best); */
+
+	*pop_node = best;
+	return ret;
+}
+
+int i915_scheduler_submit(struct intel_engine_cs *ring, bool was_locked)
+{
+	struct drm_device   *dev = ring->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct i915_scheduler   *scheduler = dev_priv->scheduler;
+	struct i915_scheduler_queue_entry  *node;
+	unsigned long       flags;
+	int                 ret = 0, count = 0;
+
+	if (!was_locked) {
+		ret = i915_mutex_lock_interruptible(dev);
+		if (ret)
+			return ret;
+	}
+
+	BUG_ON(!mutex_is_locked(&dev->struct_mutex));
+
+	spin_lock_irqsave(&scheduler->lock, flags);
+
+	/* First time around, complain if anything unexpected occurs: */
+	ret = i915_scheduler_pop_from_queue_locked(ring, &node, &flags);
+	if (ret) {
+		spin_unlock_irqrestore(&scheduler->lock, flags);
+
+		if (!was_locked)
+			mutex_unlock(&dev->struct_mutex);
+
+		return ret;
+	}
+
+	do {
+		BUG_ON(!node);
+		BUG_ON(node->params.ring != ring);
+		BUG_ON(node->status != i915_sqs_none);
+		count++;
+
+		/* The call to pop above will have removed the node from the
+		 * list. So add it back in and mark it as in flight. */
+		i915_scheduler_fly_node(node);
+
+		scheduler->flags[ring->id] |= i915_sf_submitting;
+		spin_unlock_irqrestore(&scheduler->lock, flags);
+		ret = i915_gem_do_execbuffer_final(&node->params);
+		spin_lock_irqsave(&scheduler->lock, flags);
+		scheduler->flags[ring->id] &= ~i915_sf_submitting;
+
+		if (ret) {
+			bool requeue = false;
+
+			/* Oh dear! Either the node is broken or the ring is
+			 * busy. So need to kill the node or requeue it and try
+			 * again later as appropriate. */
+
+			switch (-ret) {
+			case EAGAIN:
+			case EBUSY:
+			case EIO:
+			case ENOMEM:
+			case ERESTARTSYS:
+				/* Supposedly recoverable errors. */
+				requeue = true;
+			break;
+
+			case ENODEV:
+			case ENOENT:
+				/* Fatal errors. Kill the node. */
+			break;
+
+			default:
+				DRM_DEBUG_SCHED("<%s> Got unexpected error from execbuff_final(): %d!\n",
+						ring->name, ret);
+				/* Assume it is recoverable and hope for the best. */
+				requeue = true;
+			break;
+			}
+
+			if (requeue) {
+				i915_scheduler_node_requeue(node);
+				/* No point spinning if the ring is currently
+				 * unavailable so just give up and come back
+				 * later. */
+				break;
+			} else
+				i915_scheduler_node_kill(node);
+		}
+
+		/* Keep launching until the sky is sufficiently full. */
+		if (i915_scheduler_count_flying(scheduler, ring) >=
+						scheduler->min_flying)
+			break;
+
+		ret = i915_scheduler_pop_from_queue_locked(ring, &node, &flags);
+	} while (ret == 0);
+
+	spin_unlock_irqrestore(&scheduler->lock, flags);
+
+	if (!was_locked)
+		mutex_unlock(&dev->struct_mutex);
+
+	/* Don't complain about not being able to submit extra entries */
+	if (ret == -ENODATA)
+		ret = 0;
+
+	return (ret < 0) ? ret : count;
+}
+
+int i915_scheduler_remove_dependent(struct i915_scheduler *scheduler,
+				    struct i915_scheduler_queue_entry *remove)
+{
+	struct i915_scheduler_queue_entry  *node;
+	int     i, r;
+	int     count = 0;
+
+	for (i = 0; i < remove->num_deps; i++)
+		if ((remove->dep_list[i]) &&
+		    (!I915_SQS_IS_COMPLETE(remove->dep_list[i])))
+			count++;
+	BUG_ON(count);
+
+	for (r = 0; r < I915_NUM_RINGS; r++) {
+		list_for_each_entry(node, &scheduler->node_queue[r], link) {
+			for (i = 0; i < node->num_deps; i++) {
+				if (node->dep_list[i] != remove)
+					continue;
+
+				node->dep_list[i] = NULL;
+			}
+		}
+	}
 
 	return 0;
 }
@@ -135,17 +949,25 @@  int i915_scheduler_flush(struct intel_engine_cs *ring, bool is_locked)
 bool i915_scheduler_is_seqno_in_flight(struct intel_engine_cs *ring,
 			       uint32_t seqno, bool *completed)
 {
-	struct drm_i915_private *dev_priv = ring->dev->dev_private;
-	struct i915_scheduler   *scheduler = dev_priv->scheduler;
-	bool                    found = false;
-	unsigned long           flags;
+	struct i915_scheduler_queue_entry  *node;
+	struct drm_i915_private            *dev_priv = ring->dev->dev_private;
+	struct i915_scheduler              *scheduler = dev_priv->scheduler;
+	bool            found = false;
+	unsigned long   flags;
 
 	if (!scheduler)
 		return false;
 
 	spin_lock_irqsave(&scheduler->lock, flags);
 
-	/* Do stuff... */
+	list_for_each_entry(node, &scheduler->node_queue[ring->id], link) {
+		if (node->params.seqno != seqno)
+			continue;
+
+		found = true;
+		*completed = I915_SQS_IS_COMPLETE(node);
+		break;
+	}
 
 	spin_unlock_irqrestore(&scheduler->lock, flags);
 
@@ -154,20 +976,73 @@  bool i915_scheduler_is_seqno_in_flight(struct intel_engine_cs *ring,
 
 int i915_scheduler_closefile(struct drm_device *dev, struct drm_file *file)
 {
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	struct i915_scheduler   *scheduler = dev_priv->scheduler;
+	struct i915_scheduler_queue_entry  *node;
+	struct drm_i915_private            *dev_priv = dev->dev_private;
+	struct i915_scheduler              *scheduler = dev_priv->scheduler;
+	struct intel_engine_cs  *ring;
+	int                     i, ret;
+	uint32_t                seqno;
+	unsigned long           flags;
+	bool                    found;
 
 	if (!scheduler)
 		return 0;
 
-	/* Do stuff... */
+	for_each_ring(ring, dev_priv, i) {
+		do {
+			spin_lock_irqsave(&scheduler->lock, flags);
+
+			found = false;
+			list_for_each_entry(node, &scheduler->node_queue[ring->id], link) {
+				if (I915_SQS_IS_COMPLETE(node))
+					continue;
+
+				if (node->params.file != file)
+					continue;
+
+				found = true;
+				seqno = node->params.seqno;
+				break;
+			}
+
+			spin_unlock_irqrestore(&scheduler->lock, flags);
+
+			if (found) {
+				do {
+					mutex_lock(&dev->struct_mutex);
+					ret = i915_wait_seqno(ring, seqno);
+					mutex_unlock(&dev->struct_mutex);
+				} while (ret == -EAGAIN);
+			}
+		} while (found);
+	}
+
+	spin_lock_irqsave(&scheduler->lock, flags);
+	for_each_ring(ring, dev_priv, i) {
+		list_for_each_entry(node, &scheduler->node_queue[ring->id], link) {
+			if (node->params.file != file)
+				continue;
+
+			WARN_ON(!I915_SQS_IS_COMPLETE(node));
+
+			node->params.file = NULL;
+		}
+	}
+	spin_unlock_irqrestore(&scheduler->lock, flags);
 
 	return 0;
 }
 
 bool i915_scheduler_is_idle(struct intel_engine_cs *ring)
 {
-	/* Do stuff... */
+	struct i915_scheduler_queue_entry *node;
+	struct drm_device       *dev = ring->dev;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	struct i915_scheduler   *scheduler = dev_priv->scheduler;
+
+	list_for_each_entry(node, &scheduler->node_queue[ring->id], link)
+		if (!I915_SQS_IS_COMPLETE(node))
+			return false;
 
 	return true;
 }
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index 6dd4fea..f93d57d 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -47,14 +47,38 @@  struct i915_execbuffer_params {
 	uint32_t                        scheduler_index;
 };
 
+/* Lifecycle state of a batch buffer node in the scheduler's queues.
+ * NOTE(review): presumably nodes advance none -> queued -> flying ->
+ * complete; confirm no backwards transitions occur (e.g. on preemption). */
+enum i915_scheduler_queue_status {
+	/* Limbo, between other states: */
+	i915_sqs_none = 0,
+	/* Not yet submitted to hardware: */
+	i915_sqs_queued,
+	/* Sent to hardware for processing: */
+	i915_sqs_flying,
+	/* Finished processing on the hardware: */
+	i915_sqs_complete,
+	/* Limit value for use with arrays/loops */
+	i915_sqs_MAX
+};
+
+/* State-test helpers; each evaluates 'node' exactly once. The first two
+ * originally carried a redundant extra set of outer parentheses,
+ * inconsistent with I915_SQS_IS_COMPLETE — normalized here. */
+#define I915_SQS_IS_QUEUED(node)	((node)->status == i915_sqs_queued)
+#define I915_SQS_IS_FLYING(node)	((node)->status == i915_sqs_flying)
+#define I915_SQS_IS_COMPLETE(node)	((node)->status == i915_sqs_complete)
+
 struct i915_scheduler_obj_entry {
 	struct drm_i915_gem_object          *obj;
 };
 
 struct i915_scheduler_queue_entry {
 	struct i915_execbuffer_params       params;
+	/* Scheduling priority; bumped when other batches depend on this one
+	 * (per the commit message: more dependents => more important). */
+	uint32_t                            priority;
 	struct i915_scheduler_obj_entry     *saved_objects;
 	int                                 num_objs;
+	/* Marks this node as already bumped during the current priority
+	 * pass so it is not counted twice. NOTE(review): inferred from the
+	 * name and i915_scheduler_priority_bump_clear() — confirm. */
+	bool                                bumped;
+	/* Array of nodes this batch depends on (shared objects/context). */
+	struct i915_scheduler_queue_entry   **dep_list;
+	int                                 num_deps;
+	enum i915_scheduler_queue_status    status;
+	/* Time of queueing. NOTE(review): struct timespec is not
+	 * y2038-safe; consider ktime_t for in-kernel timestamps. */
+	struct timespec                     stamp;
+	/* Entry in i915_scheduler::node_queue[ring id]. */
+	struct list_head                    link;
 };
 
 #ifdef CONFIG_DRM_I915_SCHEDULER
@@ -79,21 +103,48 @@  bool        i915_scheduler_is_idle(struct intel_engine_cs *ring);
 #ifdef CONFIG_DRM_I915_SCHEDULER
 
 struct i915_scheduler {
-	uint32_t    flags[I915_NUM_RINGS];
-	spinlock_t  lock;
-	uint32_t    index;
+	/* Per-ring list of batch buffer nodes, kept in order of hardware
+	 * submission (the commit message relies on this ordering to mark
+	 * completions by seqno scan). */
+	struct list_head    node_queue[I915_NUM_RINGS];
+	uint32_t            flags[I915_NUM_RINGS];
+	/* Protects node_queue[] and node state. NOTE(review): inferred from
+	 * the lock/unlock pairs around the list walks — confirm coverage. */
+	spinlock_t          lock;
+	uint32_t            index;
+
+	/* Tuning parameters: */
+	uint32_t            priority_level_max;
+	uint32_t            priority_level_preempt;
+	/* Target number of batches to keep in flight on the hardware. */
+	uint32_t            min_flying;
+};
+
+/* Flag bits for i915_scheduler::flags */
+enum {
+	/* Completion interrupts currently enabled for this ring — presumed
+	 * from the name; confirm against the IRQ enable/disable paths. */
+	i915_sf_interrupts_enabled  = (1 << 0),
+	/* A hardware submission is in progress on this ring (name-based
+	 * reading; verify usage in the submit path). */
+	i915_sf_submitting          = (1 << 1),
 };
 
 /* Options for 'scheduler_override' module parameter: */
 enum {
-	i915_so_normal              = 0,
+	/* Bypass the scheduler: execbuffer submits directly to hardware.
+	 * NOTE(review): the named zero value (i915_so_normal) was dropped;
+	 * 0 now implicitly means "normal scheduling" — confirm this is
+	 * reflected in the module parameter's documentation. */
+	i915_so_direct_submit       = (1 << 0),
 };
 
+/* NOTE(review): the one-line summaries below are inferred from the names
+ * and the commit message — confirm against i915_scheduler.c. */
+/* True when the ring already has its quota of batches in flight. */
+bool        i915_scheduler_is_busy(struct intel_engine_cs *ring);
+/* Hand a queued node over to the hardware (queued -> flying). */
+int         i915_scheduler_fly_node(struct i915_scheduler_queue_entry *node);
 int         i915_scheduler_fly_seqno(struct intel_engine_cs *ring, uint32_t seqno);
 int         i915_scheduler_remove(struct intel_engine_cs *ring);
+/* Drop 'remove' from the dependency lists of all other nodes. */
+int         i915_scheduler_remove_dependent(struct i915_scheduler *scheduler,
+				struct i915_scheduler_queue_entry *remove);
 int         i915_scheduler_flush(struct intel_engine_cs *ring, bool is_locked);
 int         i915_scheduler_flush_seqno(struct intel_engine_cs *ring,
 				       bool is_locked, uint32_t seqno);
+int         i915_scheduler_submit(struct intel_engine_cs *ring,
+				  bool is_locked);
+int         i915_scheduler_submit_max_priority(struct intel_engine_cs *ring,
+					       bool is_locked);
+/* Count of nodes currently in the 'flying' state on 'ring'. */
+uint32_t    i915_scheduler_count_flying(struct i915_scheduler *scheduler,
+					struct intel_engine_cs *ring);
+/* Reset the 'bumped' markers before a new priority-bump pass. */
+void        i915_scheduler_priority_bump_clear(struct i915_scheduler *scheduler,
+					       struct intel_engine_cs *ring);
+int         i915_scheduler_priority_bump(struct i915_scheduler *scheduler,
+				struct i915_scheduler_queue_entry *target,
+				uint32_t bump);
 bool        i915_scheduler_is_seqno_in_flight(struct intel_engine_cs *ring,
 					      uint32_t seqno, bool *completed);