[v7] drm/i915: Slaughter the thundering i915_wait_request herd

Message ID 1449159727-27526-1-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived

Commit Message

Chris Wilson Dec. 3, 2015, 4:22 p.m. UTC
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.

Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on.

v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
---
 drivers/gpu/drm/i915/Makefile            |   1 +
 drivers/gpu/drm/i915/i915_debugfs.c      |   2 +-
 drivers/gpu/drm/i915/i915_drv.h          |   3 +-
 drivers/gpu/drm/i915/i915_gem.c          | 108 ++++++-----------
 drivers/gpu/drm/i915/i915_gpu_error.c    |   2 +-
 drivers/gpu/drm/i915/i915_irq.c          |  17 +--
 drivers/gpu/drm/i915/intel_breadcrumbs.c | 197 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_lrc.c         |   5 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c  |   5 +-
 drivers/gpu/drm/i915/intel_ringbuffer.h  |  47 +++++++-
 10 files changed, 297 insertions(+), 90 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/intel_breadcrumbs.c

Comments

Tvrtko Ursulin Dec. 7, 2015, 3:08 p.m. UTC | #1
Hi,

On 03/12/15 16:22, Chris Wilson wrote:
> One particularly stressful scenario consists of many independent tasks
> all competing for GPU time and waiting upon the results (e.g. realtime
> transcoding of many, many streams). One bottleneck in particular is that
> each client waits on its own results, but every client is woken up after
> every batchbuffer - hence the thunder of hooves as then every client must
> do its heavyweight dance to read a coherent seqno to see if it is the
> lucky one.
>
> Ideally, we only want one client to wake up after the interrupt and
> check its request for completion. Since the requests must retire in
> order, we can select the first client on the oldest request to be woken.
> Once that client has completed his wait, we can then wake up the
> next client and so on.
>
> v2: Convert from a kworker per engine into a dedicated kthread for the
> bottom-half.
> v3: Rename request members and tweak comments.
> v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
> v5: Fix race in locklessly checking waiter status and kicking the task on
> adding a new waiter.
> v6: Fix deciding when to force the timer to hide missing interrupts.
> v7: Move the bottom-half from the kthread to the first client process.

I like the idea a lot so I'll make some comments even before you post v9 
or later. :)

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
> Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> Cc: Dave Gordon <david.s.gordon@intel.com>
> ---
>   drivers/gpu/drm/i915/Makefile            |   1 +
>   drivers/gpu/drm/i915/i915_debugfs.c      |   2 +-
>   drivers/gpu/drm/i915/i915_drv.h          |   3 +-
>   drivers/gpu/drm/i915/i915_gem.c          | 108 ++++++-----------
>   drivers/gpu/drm/i915/i915_gpu_error.c    |   2 +-
>   drivers/gpu/drm/i915/i915_irq.c          |  17 +--
>   drivers/gpu/drm/i915/intel_breadcrumbs.c | 197 +++++++++++++++++++++++++++++++
>   drivers/gpu/drm/i915/intel_lrc.c         |   5 +-
>   drivers/gpu/drm/i915/intel_ringbuffer.c  |   5 +-
>   drivers/gpu/drm/i915/intel_ringbuffer.h  |  47 +++++++-
>   10 files changed, 297 insertions(+), 90 deletions(-)
>   create mode 100644 drivers/gpu/drm/i915/intel_breadcrumbs.c
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 0851de07bd13..d3b9d3618719 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -35,6 +35,7 @@ i915-y += i915_cmd_parser.o \
>   	  i915_gem_userptr.o \
>   	  i915_gpu_error.o \
>   	  i915_trace_points.o \
> +	  intel_breadcrumbs.o \
>   	  intel_lrc.o \
>   	  intel_mocs.o \
>   	  intel_ringbuffer.o \
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 0583eda642d7..68fbe4d1f91d 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -2323,7 +2323,7 @@ static int count_irq_waiters(struct drm_i915_private *i915)
>   	int i;
>
>   	for_each_ring(ring, i915, i)
> -		count += ring->irq_refcount;
> +		count += intel_engine_has_waiter(ring);

Don't care how many any longer? Okay in debugfs, but also in error capture?

>   	return count;
>   }
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 2acf2cf0b836..30d470934874 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1383,7 +1383,7 @@ struct i915_gpu_error {
>   #define I915_STOP_RING_ALLOW_WARN      (1 << 30)
>
>   	/* For missed irq/seqno simulation. */
> -	unsigned int test_irq_rings;
> +	unsigned long test_irq_rings;
>   };
>
>   enum modeset_restore {
> @@ -2800,7 +2800,6 @@ ibx_disable_display_interrupt(struct drm_i915_private *dev_priv, uint32_t bits)
>   	ibx_display_interrupt_update(dev_priv, bits, 0);
>   }
>
> -
>   /* i915_gem.c */
>   int i915_gem_create_ioctl(struct drm_device *dev, void *data,
>   			  struct drm_file *file_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 6e051402e340..8dcbac584e84 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1125,17 +1125,6 @@ i915_gem_check_wedge(unsigned reset_counter, bool interruptible)
>   	return 0;
>   }
>
> -static void fake_irq(unsigned long data)
> -{
> -	wake_up_process((struct task_struct *)data);
> -}
> -
> -static bool missed_irq(struct drm_i915_private *dev_priv,
> -		       struct intel_engine_cs *ring)
> -{
> -	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
> -}
> -
>   static unsigned long local_clock_us(unsigned *cpu)
>   {
>   	unsigned long t;
> @@ -1183,9 +1172,6 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>   	 * takes to sleep on a request, on the order of a microsecond.
>   	 */
>
> -	if (req->ring->irq_refcount)
> -		return -EBUSY;
> -
>   	/* Only spin if we know the GPU is processing this request */
>   	if (!i915_gem_request_started(req, false))
>   		return -EAGAIN;
> @@ -1204,9 +1190,6 @@ static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
>   		cpu_relax_lowlatency();
>   	}
>
> -	if (i915_gem_request_completed(req, false))
> -		return 0;
> -
>   	return -EAGAIN;
>   }
>
> @@ -1231,26 +1214,19 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   			s64 *timeout,
>   			struct intel_rps_client *rps)
>   {
> -	struct intel_engine_cs *ring = i915_gem_request_get_ring(req);
> -	struct drm_device *dev = ring->dev;
> -	struct drm_i915_private *dev_priv = dev->dev_private;
> -	const bool irq_test_in_progress =
> -		ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) & intel_ring_flag(ring);
>   	int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
> -	DEFINE_WAIT(wait);
> -	unsigned long timeout_expire;
> +	struct intel_breadcrumb wait;
> +	unsigned long timeout_remain;
>   	s64 before, now;
>   	int ret;
>
> -	WARN(!intel_irqs_enabled(dev_priv), "IRQs disabled");
> -
>   	if (list_empty(&req->list))
>   		return 0;
>
>   	if (i915_gem_request_completed(req, true))
>   		return 0;
>
> -	timeout_expire = 0;
> +	timeout_remain = MAX_SCHEDULE_TIMEOUT;
>   	if (timeout) {
>   		if (WARN_ON(*timeout < 0))
>   			return -EINVAL;
> @@ -1258,34 +1234,43 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   		if (*timeout == 0)
>   			return -ETIME;
>
> -		timeout_expire = jiffies + nsecs_to_jiffies_timeout(*timeout);
> +		timeout_remain = nsecs_to_jiffies_timeout(*timeout);
>   	}
>
> -	if (INTEL_INFO(dev_priv)->gen >= 6)
> -		gen6_rps_boost(dev_priv, rps, req->emitted_jiffies);
> -
>   	/* Record current time in case interrupted by signal, or wedged */
>   	trace_i915_gem_request_wait_begin(req);
>   	before = ktime_get_raw_ns();
>
> -	/* Optimistic spin for the next jiffie before touching IRQs */
> -	ret = __i915_spin_request(req, state);
> -	if (ret == 0)
> -		goto out;
> +	if (INTEL_INFO(req->i915)->gen >= 6)
> +		gen6_rps_boost(req->i915, rps, req->emitted_jiffies);
>
> -	if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring))) {
> -		ret = -ENODEV;
> -		goto out;
> +	/* Optimistic spin for the next jiffie before touching IRQs */

Exactly the opposite! :)

> +	if (intel_breadcrumbs_add_waiter(req, &wait)) {
> +		ret = __i915_spin_request(req, state);
> +		if (ret == 0)
> +			goto out;
>   	}
>
>   	for (;;) {
> -		struct timer_list timer;
> -
> -		prepare_to_wait(&ring->irq_queue, &wait, state);
> +		set_task_state(wait.task, state);
> +
> +		/* Ensure our read of the seqno is coherent so that we
> +		 * do not "miss an interrupt" (i.e. if this is the last
> +		 * request and the seqno write from the GPU is not visible
> +		 * by the time the interrupt fires, we will see that the
> +		 * request is incomplete and go back to sleep awaiting
> +		 * another interrupt that will never come.)
> +		 *
> +		 * Strictly, we only need to do this once after an interrupt,
> +		 * but it is easier and safer to do it every time the waiter
> +		 * is woken.
> +		 */
> +		if (i915_gem_request_completed(req, false)) {
> +			ret = 0;
> +			break;
> +		}
>
> -		/* We need to check whether any gpu reset happened in between
> -		 * the caller grabbing the seqno and now ... */
> -		if (req->reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
> +		if (unlikely(req->reset_counter != i915_reset_counter(&req->i915->gpu_error))) {
>   			/* As we do not requeue the request over a GPU reset,
>   			 * if one does occur we know that the request is
>   			 * effectively complete.
> @@ -1294,46 +1279,21 @@ int __i915_wait_request(struct drm_i915_gem_request *req,
>   			break;
>   		}
>
> -		if (i915_gem_request_completed(req, false)) {
> -			ret = 0;
> -			break;
> -		}
> -
> -		if (signal_pending_state(state, current)) {
> +		if (signal_pending_state(state, wait.task)) {
>   			ret = -ERESTARTSYS;
>   			break;
>   		}
>
> -		if (timeout && time_after_eq(jiffies, timeout_expire)) {
> +		timeout_remain = io_schedule_timeout(timeout_remain);
> +		if (timeout_remain == 0) {
>   			ret = -ETIME;
>   			break;
>   		}
> -
> -		i915_queue_hangcheck(dev_priv);
> -
> -		timer.function = NULL;
> -		if (timeout || missed_irq(dev_priv, ring)) {
> -			unsigned long expire;
> -
> -			setup_timer_on_stack(&timer, fake_irq, (unsigned long)current);
> -			expire = missed_irq(dev_priv, ring) ? jiffies + 1 : timeout_expire;
> -			mod_timer(&timer, expire);
> -		}
> -
> -		io_schedule();
> -
> -		if (timer.function) {
> -			del_singleshot_timer_sync(&timer);
> -			destroy_timer_on_stack(&timer);
> -		}
>   	}
> -	if (!irq_test_in_progress)
> -		ring->irq_put(ring);
> -
> -	finish_wait(&ring->irq_queue, &wait);
> -
> +	__set_task_state(wait.task, TASK_RUNNING);
>   out:
>   	now = ktime_get_raw_ns();
> +	intel_breadcrumbs_remove_waiter(req, &wait);
>   	trace_i915_gem_request_wait_end(req);
>
>   	if (timeout) {
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 06ca4082735b..f805d117f3d1 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -900,7 +900,7 @@ static void i915_record_ring_state(struct drm_device *dev,
>   		ering->instdone = I915_READ(GEN2_INSTDONE);
>   	}
>
> -	ering->waiting = waitqueue_active(&ring->irq_queue);
> +	ering->waiting = intel_engine_has_waiter(ring);
>   	ering->instpm = I915_READ(RING_INSTPM(ring->mmio_base));
>   	ering->seqno = ring->get_seqno(ring, false);
>   	ering->acthd = intel_ring_get_active_head(ring);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 28c0329f8281..57b113132fd7 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -996,12 +996,11 @@ static void ironlake_rps_change_irq_handler(struct drm_device *dev)
>
>   static void notify_ring(struct intel_engine_cs *ring)
>   {
> -	if (!intel_ring_initialized(ring))
> +	if (ring->i915 == NULL)

Glanced something about this in some other thread, maybe best to keep it 
consolidated in one patch?

>   		return;
>
>   	trace_i915_gem_request_notify(ring);
> -
> -	wake_up_all(&ring->irq_queue);
> +	intel_engine_wakeup(ring);
>   }
>
>   static void vlv_c0_read(struct drm_i915_private *dev_priv,
> @@ -1083,7 +1082,7 @@ static bool any_waiters(struct drm_i915_private *dev_priv)
>   	int i;
>
>   	for_each_ring(ring, dev_priv, i)
> -		if (ring->irq_refcount)
> +		if (intel_engine_has_waiter(ring))
>   			return true;
>
>   	return false;
> @@ -2411,7 +2410,7 @@ static void i915_error_wake_up(struct drm_i915_private *dev_priv,
>
>   	/* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
>   	for_each_ring(ring, dev_priv, i)
> -		wake_up_all(&ring->irq_queue);
> +		intel_engine_wakeup(ring);
>
>   	/* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
>   	wake_up_all(&dev_priv->pending_flip_queue);
> @@ -2986,16 +2985,18 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
>   			if (ring_idle(ring, seqno)) {
>   				ring->hangcheck.action = HANGCHECK_IDLE;
>
> -				if (waitqueue_active(&ring->irq_queue)) {
> +				if (intel_engine_has_waiter(ring)) {
>   					/* Issue a wake-up to catch stuck h/w. */
>   					if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
> -						if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)))
> +						if (!test_bit(ring->id, &dev_priv->gpu_error.test_irq_rings))
>   							DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
>   								  ring->name);
>   						else
>   							DRM_INFO("Fake missed irq on %s\n",
>   								 ring->name);
> -						wake_up_all(&ring->irq_queue);
> +						mod_delayed_work(system_highpri_wq,
> +								 &ring->breadcrumbs.irq,
> +								 0);
>   					}
>   					/* Safeguard against driver failure */
>   					ring->hangcheck.score += BUSY;
> diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> new file mode 100644
> index 000000000000..17d51269b774
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
> @@ -0,0 +1,197 @@
> +/*
> + * Copyright © 2015 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#include <linux/kthread.h>

Not any more.

> +
> +#include "i915_drv.h"
> +
> +static bool irq_get(struct intel_engine_cs *engine)
> +{
> +	if (test_bit(engine->id, &engine->i915->gpu_error.test_irq_rings))
> +		return false;

I don't understand why knowledge of this debugfs interface needs to be 
sprinkled around? I mean, I am not even 100% sure what the debugfs 
interface is for. But it looks to be about software masking user 
interrupts on a ring? Why not just mask the interrupt delivery based on 
this mask at the source?

> +
> +	if (!intel_irqs_enabled(engine->i915))
> +		return false;
> +
> +	return engine->irq_get(engine);
> +}
> +
> +static void intel_breadcrumbs_irq(struct work_struct *work)
> +{
> +	struct intel_breadcrumbs *b =
> +		container_of(work, struct intel_breadcrumbs, irq.work);
> +	struct intel_engine_cs *engine =
> +		container_of(b, struct intel_engine_cs, breadcrumbs);
> +
> +	/* We unmask the user-interrupt in the background so that the
> +	 * first-waiter can do a little busywaiting before sleeping and
> +	 * not incur additional latency from setting up the IRQ.
> +	 *
> +	 * The worker also persists in case we cannot enable interrupts,
> +	 * or if we have previously seen seqno/interrupt incoherency
> +	 * ("missed interrupt" syndrome). Here the worker will wake up
> +	 * every jiffie in order to kick the original waiter to do the
> +	 * coherent seqno check. (Not as efficient as having a timer
> +	 * source kick the waiter directly, but this should only have to
> +	 * be used in rare cases where we try to workaround some
> +	 * hardware/driver failures.)
> +	 */
> +
> +	spin_lock(&b->lock);
> +	if (b->first_waiter) {
> +		if (!b->rpm_wakelock) {
> +			intel_runtime_pm_get(engine->i915);
> +			b->rpm_wakelock = true;
> +		}
> +
> +		if (!b->irq_enabled) {
> +			b->irq_enabled = irq_get(engine);
> +			/* If we were delayed, the original waiter may have
> +			 * gone to sleep already and we may have missed the
> +			 * interrupt. So after enabling the interrupt,
> +			 * kick the waiter.
> +			 */
> +			wake_up_process(b->first_waiter);
> +		}
> +
> +		/* No interrupts? Kick the waiter every jiffie! */
> +		if (!b->irq_enabled ||
> +		    test_bit(engine->id,
> +			     &engine->i915->gpu_error.missed_irq_rings)) {
> +			wake_up_process(b->first_waiter);
> +			queue_delayed_work(system_highpri_wq, &b->irq, 1);
> +		}
> +	} else {
> +		if (b->irq_enabled) {
> +			engine->irq_put(engine);
> +			b->irq_enabled = false;
> +		}
> +		if (b->rpm_wakelock) {
> +			intel_runtime_pm_put(engine->i915);
> +			b->rpm_wakelock = false;
> +		}
> +	}
> +	spin_unlock(&b->lock);
> +}
> +
> +inline struct intel_breadcrumb *to_wait(struct rb_node *node)
> +{
> +	if (node == NULL)
> +		return NULL;
> +
> +	return container_of(node, struct intel_breadcrumb, node);
> +}
> +
> +bool intel_breadcrumbs_add_waiter(struct drm_i915_gem_request *request,
> +				  struct intel_breadcrumb *wait)
> +{
> +	struct intel_breadcrumbs *b = &request->ring->breadcrumbs;
> +	struct rb_node **p, *parent;
> +	bool first = false;
> +
> +	wait->task = current;
> +	wait->seqno = request->seqno;
> +
> +	spin_lock(&b->lock);
> +
> +	/* First? Unmask the user interrupt */
> +	if (b->first_waiter == NULL)
> +		queue_delayed_work(system_highpri_wq, &b->irq, 0);

Who enables interrupts if the worker gets scheduled and completes before 
setting b->first_waiter below?

> +
> +	/* Insert the request into the retirement ordered list
> +	 * of waiters by walking the rbtree. If we are the oldest
> +	 * seqno in the tree (the first to be retired), then
> +	 * set ourselves as the bottom-half.
> +	 */
> +	first = true;
> +	parent = NULL;
> +	p = &b->requests.rb_node;
> +	while (*p) {
> +		parent = *p;
> +		if (i915_seqno_passed(wait->seqno, to_wait(parent)->seqno)) {
> +			p = &parent->rb_right;
> +			first = false;
> +		} else
> +			p = &parent->rb_left;
> +	}
> +	rb_link_node(&wait->node, parent, p);
> +	rb_insert_color(&wait->node, &b->requests);

WARN_ON(!first && !b->first_waiter) maybe?

> +	if (first)
> +		b->first_waiter = wait->task;
> +
> +	spin_unlock(&b->lock);
> +
> +	return first;
> +}
> +
> +void intel_breadcrumbs_remove_waiter(struct drm_i915_gem_request *request,
> +				     struct intel_breadcrumb *wait)
> +{
> +	struct intel_breadcrumbs *b = &request->ring->breadcrumbs;
> +
> +	spin_lock(&b->lock);
> +
> +	if (b->first_waiter == wait->task) {
> +		struct intel_breadcrumb *next;
> +		struct task_struct *task;
> +
> +		/* We are the current bottom-half. Find the next candidate,
> +		 * the first waiter in the queue on the remaining oldest
> +		 * request. As multiple seqnos may complete in the time it
> +		 * takes us to wake up and find the next waiter, we have to
> +		 * wake up that waiter for it to perform its own coherent
> +		 * completion check.
> +		 */

Equally, why wouldn't we wake up all waiters for which the requests have 
been completed?

Would be a cheap check here and it would save a cascading growing 
latency as one task wakes up the next one.

Even more benefit for multiple waiters on the same request.

> +		task = NULL;
> +		next = to_wait(rb_next(&wait->node));
> +		if (next)
> +			task = next->task;
> +
> +		b->first_waiter = task;
> +		if (task)
> +			wake_up_process(task);
> +		else
> +			/* Disable the user interrupts */
> +			mod_delayed_work(system_highpri_wq, &b->irq, 1);
> +	}
> +
> +	rb_erase(&wait->node, &b->requests);
> +	spin_unlock(&b->lock);
> +}
> +
> +void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
> +{
> +	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> +
> +	spin_lock_init(&b->lock);
> +	INIT_DELAYED_WORK(&b->irq, intel_breadcrumbs_irq);
> +}
> +
> +void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
> +{
> +	struct intel_breadcrumbs *b = &engine->breadcrumbs;
> +
> +	flush_delayed_work(&b->irq);
> +}
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 08b982a6ae81..ce5119e2bcef 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1904,6 +1904,8 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
>   	i915_cmd_parser_fini_ring(ring);
>   	i915_gem_batch_pool_fini(&ring->batch_pool);
>
> +	intel_engine_fini_breadcrumbs(ring);
> +
>   	if (ring->status_page.obj) {
>   		kunmap(sg_page(ring->status_page.obj->pages->sgl));
>   		ring->status_page.obj = NULL;
> @@ -1920,10 +1922,11 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
>   	ring->buffer = NULL;
>
>   	ring->dev = dev;
> +	ring->i915 = to_i915(dev);
>   	INIT_LIST_HEAD(&ring->active_list);
>   	INIT_LIST_HEAD(&ring->request_list);
>   	i915_gem_batch_pool_init(dev, &ring->batch_pool);
> -	init_waitqueue_head(&ring->irq_queue);
> +	intel_engine_init_breadcrumbs(ring);
>
>   	INIT_LIST_HEAD(&ring->buffers);
>   	INIT_LIST_HEAD(&ring->execlist_queue);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 511efe556d73..8a0f02e73a4e 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -2157,6 +2157,7 @@ static int intel_init_ring_buffer(struct drm_device *dev,
>   	WARN_ON(ring->buffer);
>
>   	ring->dev = dev;
> +	ring->i915 = to_i915(dev);
>   	INIT_LIST_HEAD(&ring->active_list);
>   	INIT_LIST_HEAD(&ring->request_list);
>   	INIT_LIST_HEAD(&ring->execlist_queue);
> @@ -2164,7 +2165,7 @@ static int intel_init_ring_buffer(struct drm_device *dev,
>   	i915_gem_batch_pool_init(dev, &ring->batch_pool);
>   	memset(ring->semaphore.sync_seqno, 0, sizeof(ring->semaphore.sync_seqno));
>
> -	init_waitqueue_head(&ring->irq_queue);
> +	intel_engine_init_breadcrumbs(ring);
>
>   	ringbuf = intel_engine_create_ringbuffer(ring, 32 * PAGE_SIZE);
>   	if (IS_ERR(ringbuf))
> @@ -2225,6 +2226,8 @@ void intel_cleanup_ring_buffer(struct intel_engine_cs *ring)
>
>   	i915_cmd_parser_fini_ring(ring);
>   	i915_gem_batch_pool_fini(&ring->batch_pool);
> +
> +	intel_engine_fini_breadcrumbs(ring);
>   }
>
>   static int ring_wait_for_space(struct intel_engine_cs *ring, int n)
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 5d1eb206151d..7dde9310df1b 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -157,9 +157,31 @@ struct  intel_engine_cs {
>   #define LAST_USER_RING (VECS + 1)
>   	u32		mmio_base;
>   	struct		drm_device *dev;
> +	struct drm_i915_private *i915;
>   	struct intel_ringbuffer *buffer;
>   	struct list_head buffers;
>
> +	/* Rather than have every client wait upon all user interrupts,
> +	 * with the herd waking after every interrupt and each doing the
> +	 * heavyweight seqno dance, we delegate the task to the first client.
> +	 * After the interrupt, we wake up one client, who does the heavyweight
> +	 * coherent seqno read and either goes back to sleep (if incomplete),
> +	 * or wakes up the next client in the queue and so forth.
> +	 *
> +	 * With respect to walking the entire list of waiters in a single

s/With respect to/In contrast to/ or "Compared to"?

> +	 * dedicated bottom-half, we reduce the latency of the first waiter
> +	 * by avoiding a context switch, but incur additional coherent seqno
> +	 * reads when following the chain of request breadcrumbs.
> +	 */
> +	struct intel_breadcrumbs {
> +		spinlock_t lock; /* protects the lists of requests */
> +		struct rb_root requests; /* sorted by retirement */
> +		struct task_struct *first_waiter; /* bh for user interrupts */
> +		struct delayed_work irq; /* used to enable or fake interrupts */
> +		bool irq_enabled : 1;
> +		bool rpm_wakelock : 1;

Are bitfields worth it? There aren't that many engines, so maybe it loses 
more in terms of instructions generated.
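
For comparison, the usual alternative would be a flags word with explicit 
bit definitions - a hypothetical sketch, not something from this patch:

/* Hypothetical alternative to the bool bitfields: one flags word
 * manipulated with plain bit operations instead of compiler-generated
 * bitfield accesses.
 */
#define INTEL_BREADCRUMBS_IRQ_ENABLED  (1 << 0)
#define INTEL_BREADCRUMBS_RPM_WAKELOCK (1 << 1)

struct intel_breadcrumbs {
	spinlock_t lock;                  /* protects the lists of requests */
	struct rb_root requests;          /* sorted by retirement */
	struct task_struct *first_waiter; /* bh for user interrupts */
	struct delayed_work irq;          /* used to enable or fake interrupts */
	unsigned int flags;               /* INTEL_BREADCRUMBS_* */
};

/* e.g. b->flags |= INTEL_BREADCRUMBS_IRQ_ENABLED;
 *      if (b->flags & INTEL_BREADCRUMBS_RPM_WAKELOCK) ...
 */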

> +	} breadcrumbs;
> +
>   	/*
>   	 * A pool of objects to use as shadow copies of client batch buffers
>   	 * when the command parser is enabled. Prevents the client from
> @@ -303,8 +325,6 @@ struct  intel_engine_cs {
>
>   	bool gpu_caches_dirty;
>
> -	wait_queue_head_t irq_queue;
> -
>   	struct intel_context *default_context;
>   	struct intel_context *last_context;
>
> @@ -506,4 +526,27 @@ void intel_ring_reserved_space_end(struct intel_ringbuffer *ringbuf);
>   /* Legacy ringbuffer specific portion of reservation code: */
>   int intel_ring_reserve_space(struct drm_i915_gem_request *request);
>
> +/* intel_breadcrumbs.c -- user interrupt bottom-half for waiters */
> +struct intel_breadcrumb {
> +	struct rb_node node;
> +	struct task_struct *task;
> +	u32 seqno;
> +};
> +void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine);
> +bool intel_breadcrumbs_add_waiter(struct drm_i915_gem_request *request,
> +				  struct intel_breadcrumb *wait);
> +void intel_breadcrumbs_remove_waiter(struct drm_i915_gem_request *request,
> +				     struct intel_breadcrumb *wait);
> +static inline bool intel_engine_has_waiter(struct intel_engine_cs *engine)
> +{
> +	return READ_ONCE(engine->breadcrumbs.first_waiter);
> +}
> +static inline void intel_engine_wakeup(struct intel_engine_cs *engine)
> +{
> +	struct task_struct *task = READ_ONCE(engine->breadcrumbs.first_waiter);
> +	if (task)
> +		wake_up_process(task);

What guarantees task is a valid task at this point?

I know it is so extremely unlikely, but it can theoretically sample the 
first_waiter which then exists and disappears before the wake_up_process 
call.

> +}
> +void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine);
> +
>   #endif /* _INTEL_RINGBUFFER_H_ */
>

Regards,

Tvrtko
Chris Wilson Dec. 8, 2015, 10:44 a.m. UTC | #2
On Mon, Dec 07, 2015 at 03:08:28PM +0000, Tvrtko Ursulin wrote:
> 
> Hi,
> 
> On 03/12/15 16:22, Chris Wilson wrote:
> >One particularly stressful scenario consists of many independent tasks
> >all competing for GPU time and waiting upon the results (e.g. realtime
> >transcoding of many, many streams). One bottleneck in particular is that
> >each client waits on its own results, but every client is woken up after
> >every batchbuffer - hence the thunder of hooves as then every client must
> >do its heavyweight dance to read a coherent seqno to see if it is the
> >lucky one.
> >
> >Ideally, we only want one client to wake up after the interrupt and
> >check its request for completion. Since the requests must retire in
> >order, we can select the first client on the oldest request to be woken.
> >Once that client has completed his wait, we can then wake up the
> >next client and so on.
> >
> >v2: Convert from a kworker per engine into a dedicated kthread for the
> >bottom-half.
> >v3: Rename request members and tweak comments.
> >v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
> >v5: Fix race in locklessly checking waiter status and kicking the task on
> >adding a new waiter.
> >v6: Fix deciding when to force the timer to hide missing interrupts.
> >v7: Move the bottom-half from the kthread to the first client process.
> 
> I like the idea a lot so I'll make some comments even before you
> post v9 or later. :)
> 
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
> >Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
> >Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> >Cc: Dave Gordon <david.s.gordon@intel.com>
> >---
> >  drivers/gpu/drm/i915/Makefile            |   1 +
> >  drivers/gpu/drm/i915/i915_debugfs.c      |   2 +-
> >  drivers/gpu/drm/i915/i915_drv.h          |   3 +-
> >  drivers/gpu/drm/i915/i915_gem.c          | 108 ++++++-----------
> >  drivers/gpu/drm/i915/i915_gpu_error.c    |   2 +-
> >  drivers/gpu/drm/i915/i915_irq.c          |  17 +--
> >  drivers/gpu/drm/i915/intel_breadcrumbs.c | 197 +++++++++++++++++++++++++++++++
> >  drivers/gpu/drm/i915/intel_lrc.c         |   5 +-
> >  drivers/gpu/drm/i915/intel_ringbuffer.c  |   5 +-
> >  drivers/gpu/drm/i915/intel_ringbuffer.h  |  47 +++++++-
> >  10 files changed, 297 insertions(+), 90 deletions(-)
> >  create mode 100644 drivers/gpu/drm/i915/intel_breadcrumbs.c
> >
> >diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> >index 0851de07bd13..d3b9d3618719 100644
> >--- a/drivers/gpu/drm/i915/Makefile
> >+++ b/drivers/gpu/drm/i915/Makefile
> >@@ -35,6 +35,7 @@ i915-y += i915_cmd_parser.o \
> >  	  i915_gem_userptr.o \
> >  	  i915_gpu_error.o \
> >  	  i915_trace_points.o \
> >+	  intel_breadcrumbs.o \
> >  	  intel_lrc.o \
> >  	  intel_mocs.o \
> >  	  intel_ringbuffer.o \
> >diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> >index 0583eda642d7..68fbe4d1f91d 100644
> >--- a/drivers/gpu/drm/i915/i915_debugfs.c
> >+++ b/drivers/gpu/drm/i915/i915_debugfs.c
> >@@ -2323,7 +2323,7 @@ static int count_irq_waiters(struct drm_i915_private *i915)
> >  	int i;
> >
> >  	for_each_ring(ring, i915, i)
> >-		count += ring->irq_refcount;
> >+		count += intel_engine_has_waiter(ring);
> 
> Don't care how many any longer? Okay in debugfs, but also in error capture?

It was nice that we were able to mark the requests with waiters for
free. We could add that, but it is no longer a side-effect of this
patch. Here I was tempted to do an intel_engine_count_breadcrumbs(), but
that doesn't match how the information is used by RPS so adding it to
debugfs for RPS is misleading.
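
For reference, such a helper could be as simple as walking the rbtree 
under the lock - a hypothetical sketch, not part of this patch:

/* Hypothetical intel_engine_count_breadcrumbs(): count the waiters on
 * an engine by walking the retirement-ordered tree under the lock.
 */
static int intel_engine_count_breadcrumbs(struct intel_engine_cs *engine)
{
	struct intel_breadcrumbs *b = &engine->breadcrumbs;
	struct rb_node *node;
	int count = 0;

	spin_lock(&b->lock);
	for (node = rb_first(&b->requests); node; node = rb_next(node))
		count++;
	spin_unlock(&b->lock);

	return count;
}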

> >-	/* Optimistic spin for the next jiffie before touching IRQs */
> >-	ret = __i915_spin_request(req, state);
> >-	if (ret == 0)
> >-		goto out;
> >+	if (INTEL_INFO(req->i915)->gen >= 6)
> >+		gen6_rps_boost(req->i915, rps, req->emitted_jiffies);
> >
> >-	if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring))) {
> >-		ret = -ENODEV;
> >-		goto out;
> >+	/* Optimistic spin for the next jiffie before touching IRQs */
> 
> Exactly the opposite! :)

Bah. It's close, it certainly succinctly captures the intent if not the
implementation! s/jiffie/~jiffie/!

> >@@ -996,12 +996,11 @@ static void ironlake_rps_change_irq_handler(struct drm_device *dev)
> >
> >  static void notify_ring(struct intel_engine_cs *ring)
> >  {
> >-	if (!intel_ring_initialized(ring))
> >+	if (ring->i915 == NULL)
> 
> Glanced something about this in some other thread, maybe best to
> keep it consolidated in one patch?

This can die now. It was justifiable when I was using ring->i915
directly here, but now it is just (mostly) pointless churn.

> >+static bool irq_get(struct intel_engine_cs *engine)
> >+{
> >+	if (test_bit(engine->id, &engine->i915->gpu_error.test_irq_rings))
> >+		return false;
> 
> I don't understand why knowledge of this debugfs interface needs to
> be sprinkled around?

It's not though. This is the only use - the interface is to inject
faults into the waiter that ensure we cannot wake up after an
interrupt, and so force the hangcheck to intervene. Are you thinking of
its counterpart missed_irq, which we use to track when we need to force
the fake interrupt?

> I mean, I am not even 100% sure what the
> debugfs interface is for. But it looks to be about software masking
> user interrupts on a ring? Why not just mask the interrupt delivery
> based on this mask at the source?

It is to simulate a broken wait, as opposed to masking interrupts.

> >+bool intel_breadcrumbs_add_waiter(struct drm_i915_gem_request *request,
> >+				  struct intel_breadcrumb *wait)
> >+{
> >+	struct intel_breadcrumbs *b = &request->ring->breadcrumbs;
> >+	struct rb_node **p, *parent;
> >+	bool first = false;
> >+
> >+	wait->task = current;
> >+	wait->seqno = request->seqno;
> >+
> >+	spin_lock(&b->lock);
> >+
> >+	/* First? Unmask the user interrupt */
> >+	if (b->first_waiter == NULL)
> >+		queue_delayed_work(system_highpri_wq, &b->irq, 0);
> 
> Who enables interrupts if the worker gets scheduled and completes
> before setting b->first_waiter below?

In this version, it is serialised by the spinlock. In the next version,
the ordering is explicit (no more delayed worker).
 
> >+	/* Insert the request into the retirement ordered list
> >+	 * of waiters by walking the rbtree. If we are the oldest
> >+	 * seqno in the tree (the first to be retired), then
> >+	 * set ourselves as the bottom-half.
> >+	 */
> >+	first = true;
> >+	parent = NULL;
> >+	p = &b->requests.rb_node;
> >+	while (*p) {
> >+		parent = *p;
> >+		if (i915_seqno_passed(wait->seqno, to_wait(parent)->seqno)) {
> >+			p = &parent->rb_right;
> >+			first = false;
> >+		} else
> >+			p = &parent->rb_left;
> >+	}
> >+	rb_link_node(&wait->node, parent, p);
> >+	rb_insert_color(&wait->node, &b->requests);
> 
> WARN_ON(!first && !b->first_waiter) maybe?

We will get a GPF on b->first_waiter soon enough :)

> 
> >+	if (first)
> >+		b->first_waiter = wait->task;

BUG_ON(b->first_waiter == NULL);

would be clearer I guess.

> >+
> >+	spin_unlock(&b->lock);
> >+
> >+	return first;
> >+}
> >+
> >+void intel_breadcrumbs_remove_waiter(struct drm_i915_gem_request *request,
> >+				     struct intel_breadcrumb *wait)
> >+{
> >+	struct intel_breadcrumbs *b = &request->ring->breadcrumbs;
> >+
> >+	spin_lock(&b->lock);
> >+
> >+	if (b->first_waiter == wait->task) {
> >+		struct intel_breadcrumb *next;
> >+		struct task_struct *task;
> >+
> >+		/* We are the current bottom-half. Find the next candidate,
> >+		 * the first waiter in the queue on the remaining oldest
> >+		 * request. As multiple seqnos may complete in the time it
> >+		 * takes us to wake up and find the next waiter, we have to
> >+		 * wake up that waiter for it to perform its own coherent
> >+		 * completion check.
> >+		 */
> 
> Equally, why wouldn't we wake up all waiters for which the requests
> have been completed?

Because we no longer track the requests to be completed, having moved to
a chain of waiting processes instead of a chain of requests. I could
insert a waitqueue into the intel_breadcrumb and that would indeed
necessitate locking in the irq handler (and irq locks everywhere :(.
 
> Would be a cheap check here and it would save a cascading growing
> latency as one task wakes up the next one.

Well, it can't be here since we may remove_waiter after a signal
(incomplete wait). So this part has to walk the chain of processes. Ugh,
and have to move the waitqueue from one waiter to the next...

> Even more benefit for multiple waiters on the same request.

Yes. That's what I liked about the previous version with the independent
waitqueue. However, latency of a single waiter is a compelling argument.

My head is still full of cold, maybe embedding a waitqueue into the
breadcrumb is an improvement - I am not sure yet.
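
For the record, embedding one would look something like this (hypothetical, 
and it drags irq-safe locking into the wakeup path):

/* Hypothetical variant: a waitqueue per breadcrumb so that every
 * waiter on the same request wakes together, at the cost of taking
 * an irq-safe lock when the interrupt handler performs the wakeup.
 */
struct intel_breadcrumb {
	struct rb_node node;
	wait_queue_head_t wq; /* all waiters on this seqno */
	u32 seqno;
};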

> >+	/* Rather than have every client wait upon all user interrupts,
> >+	 * with the herd waking after every interrupt and each doing the
> >+	 * heavyweight seqno dance, we delegate the task to the first client.
> >+	 * After the interrupt, we wake up one client, who does the heavyweight
> >+	 * coherent seqno read and either goes back to sleep (if incomplete),
> >+	 * or wakes up the next client in the queue and so forth.
> >+	 *
> >+	 * With respect to walking the entire list of waiters in a single
> 
> s/With respect to/In contrast to/ or "Compared to"?

"Compared to", thanks.
 
> >+	 * dedicated bottom-half, we reduce the latency of the first waiter
> >+	 * by avoiding a context switch, but incur additional coherent seqno
> >+	 * reads when following the chain of request breadcrumbs.
> >+	 */
> >+	struct intel_breadcrumbs {
> >+		spinlock_t lock; /* protects the lists of requests */
> >+		struct rb_root requests; /* sorted by retirement */
> >+		struct task_struct *first_waiter; /* bh for user interrupts */
> >+		struct delayed_work irq; /* used to enable or fake interrupts */
> >+		bool irq_enabled : 1;
> >+		bool rpm_wakelock : 1;
> 
> Are bitfields worth it? There aren't that many engines, so maybe it
> loses more in terms of instructions generated.

I'm always optimistic that gcc gets its act together. Not that I don't
have patches converting from bitfields to flags when gcc hurts. Here,
the potential extra instruction is going to be dwarfed by the irq spinlocks
and worse along these paths.

> >+static inline void intel_engine_wakeup(struct intel_engine_cs *engine)
> >+{
> >+	struct task_struct *task = READ_ONCE(engine->breadcrumbs.first_waiter);
> >+	if (task)
> >+		wake_up_process(task);
> 
> What guarantees task is a valid task at this point?

Not an awful lot! Indirectly, I can point to the fact that the task struct
cannot be freed whilst we are in an irq (thanks to rcu), but other than
that it is simply that there is a much longer path between clearing the
first_waiter and freeing the task_struct than doing a wake_up_process on
the running task.
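
If we wanted to make that explicit, a hypothetical variant could pin the 
task under the RCU read lock:

/* Hypothetical hardening: hold the RCU read lock across the lockless
 * dereference so the task_struct (which is freed via RCU after the
 * task exits) cannot disappear between sampling first_waiter and
 * calling wake_up_process().
 */
static inline void intel_engine_wakeup(struct intel_engine_cs *engine)
{
	struct task_struct *task;

	rcu_read_lock();
	task = READ_ONCE(engine->breadcrumbs.first_waiter);
	if (task)
		wake_up_process(task);
	rcu_read_unlock();
}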
 
> I know it is so extremely unlikely, but it can theoretically sample
> the first_waiter which then exists and disappears before the
> wake_up_process call.

I can leave a comment: "I told you so -- Tvrtko" :)
-Chris
Tvrtko Ursulin Dec. 8, 2015, 2:03 p.m. UTC | #3
On 08/12/15 10:44, Chris Wilson wrote:
> On Mon, Dec 07, 2015 at 03:08:28PM +0000, Tvrtko Ursulin wrote:
>>
>> Hi,
>>
>> On 03/12/15 16:22, Chris Wilson wrote:
>>> One particularly stressful scenario consists of many independent tasks
>>> all competing for GPU time and waiting upon the results (e.g. realtime
>>> transcoding of many, many streams). One bottleneck in particular is that
>>> each client waits on its own results, but every client is woken up after
>>> every batchbuffer - hence the thunder of hooves as then every client must
>>> do its heavyweight dance to read a coherent seqno to see if it is the
>>> lucky one.
>>>
>>> Ideally, we only want one client to wake up after the interrupt and
>>> check its request for completion. Since the requests must retire in
>>> order, we can select the first client on the oldest request to be woken.
>>> Once that client has completed his wait, we can then wake up the
>>> next client and so on.
>>>
>>> v2: Convert from a kworker per engine into a dedicated kthread for the
>>> bottom-half.
>>> v3: Rename request members and tweak comments.
>>> v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
>>> v5: Fix race in locklessly checking waiter status and kicking the task on
>>> adding a new waiter.
>>> v6: Fix deciding when to force the timer to hide missing interrupts.
>>> v7: Move the bottom-half from the kthread to the first client process.
>>
>> I like the idea a lot so I'll make some comments even before you
>> post v9 or later. :)
>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
>>> Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
>>> Cc: Dave Gordon <david.s.gordon@intel.com>
>>> ---
>>>   drivers/gpu/drm/i915/Makefile            |   1 +
>>>   drivers/gpu/drm/i915/i915_debugfs.c      |   2 +-
>>>   drivers/gpu/drm/i915/i915_drv.h          |   3 +-
>>>   drivers/gpu/drm/i915/i915_gem.c          | 108 ++++++-----------
>>>   drivers/gpu/drm/i915/i915_gpu_error.c    |   2 +-
>>>   drivers/gpu/drm/i915/i915_irq.c          |  17 +--
>>>   drivers/gpu/drm/i915/intel_breadcrumbs.c | 197 +++++++++++++++++++++++++++++++
>>>   drivers/gpu/drm/i915/intel_lrc.c         |   5 +-
>>>   drivers/gpu/drm/i915/intel_ringbuffer.c  |   5 +-
>>>   drivers/gpu/drm/i915/intel_ringbuffer.h  |  47 +++++++-
>>>   10 files changed, 297 insertions(+), 90 deletions(-)
>>>   create mode 100644 drivers/gpu/drm/i915/intel_breadcrumbs.c
>>>
>>> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
>>> index 0851de07bd13..d3b9d3618719 100644
>>> --- a/drivers/gpu/drm/i915/Makefile
>>> +++ b/drivers/gpu/drm/i915/Makefile
>>> @@ -35,6 +35,7 @@ i915-y += i915_cmd_parser.o \
>>>   	  i915_gem_userptr.o \
>>>   	  i915_gpu_error.o \
>>>   	  i915_trace_points.o \
>>> +	  intel_breadcrumbs.o \
>>>   	  intel_lrc.o \
>>>   	  intel_mocs.o \
>>>   	  intel_ringbuffer.o \
>>> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
>>> index 0583eda642d7..68fbe4d1f91d 100644
>>> --- a/drivers/gpu/drm/i915/i915_debugfs.c
>>> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
>>> @@ -2323,7 +2323,7 @@ static int count_irq_waiters(struct drm_i915_private *i915)
>>>   	int i;
>>>
>>>   	for_each_ring(ring, i915, i)
>>> -		count += ring->irq_refcount;
>>> +		count += intel_engine_has_waiter(ring);
>>
>> Don't care how many any longer? Okay in debugfs, but also in error capture?
>
> It was nice that we were able to mark the requests with waiters for
> free. We could add that, but it is no longer a side-effect of this
> patch. Here I was tempted to do an intel_engine_count_breadcrumbs(), but
> that doesn't match how the information is used by RPS so adding it to
> debugfs for RPS is misleading.
>
>>> -	/* Optimistic spin for the next jiffie before touching IRQs */
>>> -	ret = __i915_spin_request(req, state);
>>> -	if (ret == 0)
>>> -		goto out;
>>> +	if (INTEL_INFO(req->i915)->gen >= 6)
>>> +		gen6_rps_boost(req->i915, rps, req->emitted_jiffies);
>>>
>>> -	if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring))) {
>>> -		ret = -ENODEV;
>>> -		goto out;
>>> +	/* Optimistic spin for the next jiffie before touching IRQs */
>>
>> Exactly the opposite! :)
>
> Bah. It's close, it certainly succinctly captures the intent if not the
> implementation! s/jiffie/~jiffie/!
>
>>> @@ -996,12 +996,11 @@ static void ironlake_rps_change_irq_handler(struct drm_device *dev)
>>>
>>>   static void notify_ring(struct intel_engine_cs *ring)
>>>   {
>>> -	if (!intel_ring_initialized(ring))
>>> +	if (ring->i915 == NULL)
>>
>> Glanced something about this in some other thread, maybe best to
>> keep it consolidated in one patch?
>
> This can die now. It was justifiable when I was using ring->i915
> directly here, but now it is just (mostly) pointless churn.
>
>>> +static bool irq_get(struct intel_engine_cs *engine)
>>> +{
>>> +	if (test_bit(engine->id, &engine->i915->gpu_error.test_irq_rings))
>>> +		return false;
>>
>> I don't understand why knowledge of this debugfs interface needs to
>> be sprinkled around?
>
> It's not though. This is the only use - the interface is to inject
> faults into the waiter that ensure we cannot wake up after an
> interrupt, and so force the hangcheck to intervene. Are you thinking of
> its counterpart missed_irq, which we use to track when we need to force
> the fake interrupt?
>
>> I mean, I am not even 100% sure what the
>> debugfs interface is for. But it looks to be about software masking
>> user interrupts on a ring? Why not just mask the interrupt delivery
>> based on this mask at the source?
>
> It is to simulate a broken wait, as opposed to masking interrupts.

I don't see the difference between a broken wait and masking interrupts?

It is masking interrupts before and after this patch. A broken wait could 
have been accomplished by doing the test from the wait path when 
request_completed is called.

Anyway, I don't get it.

>>> +bool intel_breadcrumbs_add_waiter(struct drm_i915_gem_request *request,
>>> +				  struct intel_breadcrumb *wait)
>>> +{
>>> +	struct intel_breadcrumbs *b = &request->ring->breadcrumbs;
>>> +	struct rb_node **p, *parent;
>>> +	bool first = false;
>>> +
>>> +	wait->task = current;
>>> +	wait->seqno = request->seqno;
>>> +
>>> +	spin_lock(&b->lock);
>>> +
>>> +	/* First? Unmask the user interrupt */
>>> +	if (b->first_waiter == NULL)
>>> +		queue_delayed_work(system_highpri_wq, &b->irq, 0);
>>
>> Who enables interrupts if the worker gets scheduled and completes
>> before setting b->first_waiter below?
>
> In this version, it is serialised by the spinlock. In the next version,
> the ordering is explicit (no more delayed worker).
>
>>> +	/* Insert the request into the retirement ordered list
>>> +	 * of waiters by walking the rbtree. If we are the oldest
>>> +	 * seqno in the tree (the first to be retired), then
>>> +	 * set ourselves as the bottom-half.
>>> +	 */
>>> +	first = true;
>>> +	parent = NULL;
>>> +	p = &b->requests.rb_node;
>>> +	while (*p) {
>>> +		parent = *p;
>>> +		if (i915_seqno_passed(wait->seqno, to_wait(parent)->seqno)) {
>>> +			p = &parent->rb_right;
>>> +			first = false;
>>> +		} else
>>> +			p = &parent->rb_left;
>>> +	}
>>> +	rb_link_node(&wait->node, parent, p);
>>> +	rb_insert_color(&wait->node, &b->requests);
>>
>> WARN_ON(!first && !b->first_waiter) maybe?
>
> We will get a GPF on b->first_waiter soon enough :)
>
>>
>>> +	if (first)
>>> +		b->first_waiter = wait->task;
>
> BUG_ON(b->first_waiter == NULL);
>
> would be clearer I guess.
>
>>> +
>>> +	spin_unlock(&b->lock);
>>> +
>>> +	return first;
>>> +}
>>> +
>>> +void intel_breadcrumbs_remove_waiter(struct drm_i915_gem_request *request,
>>> +				     struct intel_breadcrumb *wait)
>>> +{
>>> +	struct intel_breadcrumbs *b = &request->ring->breadcrumbs;
>>> +
>>> +	spin_lock(&b->lock);
>>> +
>>> +	if (b->first_waiter == wait->task) {
>>> +		struct intel_breadcrumb *next;
>>> +		struct task_struct *task;
>>> +
>>> +		/* We are the current bottom-half. Find the next candidate,
>>> +		 * the first waiter in the queue on the remaining oldest
>>> +		 * request. As multiple seqnos may complete in the time it
>>> +		 * takes us to wake up and find the next waiter, we have to
>>> +		 * wake up that waiter for it to perform its own coherent
>>> +		 * completion check.
>>> +		 */
>>
>> Equally, why wouldn't we wake up all waiters for which the requests
>> have been completed?
>
> Because we no longer track the requests to be completed, having moved to
> a chain of waiting processes instead of a chain of requests. I could
> insert a waitqueue into the intel_breadcrumb and that would indeed
> necessitate locking in the irq handler (and irq locks everywhere :(.

You have a tree of seqnos, each with a wait->task, and the current seqno 
on the engine can be queried. So I don't see where the problem is?

>> Would be a cheap check here and it would save a cascading growing
>> latency as one task wakes up the next one.
>
> Well, it can't be here since we may remove_waiter after a signal
> (incomplete wait). So this part has to walk the chain of processes. Ugh,
> and have to move the waitqueue from one waiter to the next...

Ok on interrupted waiters it makes no sense, but on normal waiter 
removal it would just mean comparing engine->get_seqno() vs the first 
waiter seqno and waking up all until the uncompleted one is found?
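
Roughly like this hypothetical sketch, reusing to_wait() and 
i915_seqno_passed() from the patch (caller holding b->lock):

/* Hypothetical: on normal waiter removal, wake every waiter whose
 * seqno has already passed, stopping at the first incomplete one.
 * Caller must hold b->lock.
 */
static void wake_completed_waiters(struct intel_engine_cs *engine)
{
	struct intel_breadcrumbs *b = &engine->breadcrumbs;
	u32 seqno = engine->get_seqno(engine, false);
	struct rb_node *node;

	for (node = rb_first(&b->requests); node; node = rb_next(node)) {
		struct intel_breadcrumb *w = to_wait(node);

		if (!i915_seqno_passed(seqno, w->seqno))
			break;

		wake_up_process(w->task);
	}
}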

>> Even more benefit for multiple waiters on the same request.
>
> Yes. That's what I liked about the previous version with the independent
> waitqueue. However, latency of a single waiter is a compelling argument.
>
> My head is still full of cold, maybe embedding a waitqueue into the
> breadcrumb is an improvement - I am not sure yet.

Leave it for some other day then.

>>> +	/* Rather than have every client wait upon all user interrupts,
>>> +	 * with the herd waking after every interrupt and each doing the
>>> +	 * heavyweight seqno dance, we delegate the task to the first client.
>>> +	 * After the interrupt, we wake up one client, who does the heavyweight
>>> +	 * coherent seqno read and either goes back to sleep (if incomplete),
>>> +	 * or wakes up the next client in the queue and so forth.
>>> +	 *
>>> +	 * With respect to walking the entire list of waiters in a single
>>
>> s/With respect to/In contrast to/ or "Compared to"?
>
> "Compared to", thanks.
>
>>> +	 * dedicated bottom-half, we reduce the latency of the first waiter
>>> +	 * by avoiding a context switch, but incur additional coherent seqno
>>> +	 * reads when following the chain of request breadcrumbs.
>>> +	 */
>>> +	struct intel_breadcrumbs {
>>> +		spinlock_t lock; /* protects the lists of requests */
>>> +		struct rb_root requests; /* sorted by retirement */
>>> +		struct task_struct *first_waiter; /* bh for user interrupts */
>>> +		struct delayed_work irq; /* used to enable or fake interrupts */
>>> +		bool irq_enabled : 1;
>>> +		bool rpm_wakelock : 1;
>>
>> Are bitfields worth it? There aren't that many engines, so maybe it
>> loses more in terms of instructions generated.
>
> I'm always optimistic that gcc gets its act together. Not that I don't
> have patches converting from bitfields to flags when gcc hurts. Here,
> the potential extra instruction is going to be dwarfed by the irq spinlocks
> and worse along these paths.
>
>>> +static inline void intel_engine_wakeup(struct intel_engine_cs *engine)
>>> +{
>>> +	struct task_struct *task = READ_ONCE(engine->breadcrumbs.first_waiter);
>>> +	if (task)
>>> +		wake_up_process(task);
>>
>> What guarantees task is a valid task at this point?
>
> Not an awful lot! Indirectly, I can point to the fact that the task struct
> cannot be freed whilst we are in an irq (thanks to rcu), but other than
> that it is simply that there is a much longer path between clearing the
> first_waiter and freeing the task_struct than doing a wake_up_process on
> the running task.
>
>> I know it is so extremely unlikely, but it can theoretically sample
>> the first_waiter which then exists and disappears before the
>> wake_up_process call.
>
> I can leave a comment: "I told you so -- Tvrtko" :)

Hm, it possibly needs a synchronize_irq before remove_waiter exits then?
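
Something along these lines, called after dropping b->lock at the end of 
remove_waiter (hypothetical; assumes the irq number is reachable via 
dev->irq):

/* Hypothetical ordering fix: wait for any in-flight interrupt handler
 * that may have sampled the stale first_waiter, before the departing
 * waiter's task_struct can go away.
 */
static void breadcrumbs_synchronize(struct intel_engine_cs *engine)
{
	struct drm_device *dev = engine->dev;

	if (dev->irq_enabled)
		synchronize_irq(dev->irq);
}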

Regards,

Tvrtko
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 0851de07bd13..d3b9d3618719 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -35,6 +35,7 @@  i915-y += i915_cmd_parser.o \
 	  i915_gem_userptr.o \
 	  i915_gpu_error.o \
 	  i915_trace_points.o \
+	  intel_breadcrumbs.o \
 	  intel_lrc.o \
 	  intel_mocs.o \
 	  intel_ringbuffer.o \
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 0583eda642d7..68fbe4d1f91d 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2323,7 +2323,7 @@  static int count_irq_waiters(struct drm_i915_private *i915)
 	int i;
 
 	for_each_ring(ring, i915, i)
-		count += ring->irq_refcount;
+		count += intel_engine_has_waiter(ring);
 
 	return count;
 }
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2acf2cf0b836..30d470934874 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1383,7 +1383,7 @@  struct i915_gpu_error {
 #define I915_STOP_RING_ALLOW_WARN      (1 << 30)
 
 	/* For missed irq/seqno simulation. */
-	unsigned int test_irq_rings;
+	unsigned long test_irq_rings;
 };
 
 enum modeset_restore {
@@ -2800,7 +2800,6 @@  ibx_disable_display_interrupt(struct drm_i915_private *dev_priv, uint32_t bits)
 	ibx_display_interrupt_update(dev_priv, bits, 0);
 }
 
-
 /* i915_gem.c */
 int i915_gem_create_ioctl(struct drm_device *dev, void *data,
 			  struct drm_file *file_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 6e051402e340..8dcbac584e84 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1125,17 +1125,6 @@  i915_gem_check_wedge(unsigned reset_counter, bool interruptible)
 	return 0;
 }
 
-static void fake_irq(unsigned long data)
-{
-	wake_up_process((struct task_struct *)data);
-}
-
-static bool missed_irq(struct drm_i915_private *dev_priv,
-		       struct intel_engine_cs *ring)
-{
-	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
-}
-
 static unsigned long local_clock_us(unsigned *cpu)
 {
 	unsigned long t;
@@ -1183,9 +1172,6 @@  static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
 	 * takes to sleep on a request, on the order of a microsecond.
 	 */
 
-	if (req->ring->irq_refcount)
-		return -EBUSY;
-
 	/* Only spin if we know the GPU is processing this request */
 	if (!i915_gem_request_started(req, false))
 		return -EAGAIN;
@@ -1204,9 +1190,6 @@  static int __i915_spin_request(struct drm_i915_gem_request *req, int state)
 		cpu_relax_lowlatency();
 	}
 
-	if (i915_gem_request_completed(req, false))
-		return 0;
-
 	return -EAGAIN;
 }
 
@@ -1231,26 +1214,19 @@  int __i915_wait_request(struct drm_i915_gem_request *req,
 			s64 *timeout,
 			struct intel_rps_client *rps)
 {
-	struct intel_engine_cs *ring = i915_gem_request_get_ring(req);
-	struct drm_device *dev = ring->dev;
-	struct drm_i915_private *dev_priv = dev->dev_private;
-	const bool irq_test_in_progress =
-		ACCESS_ONCE(dev_priv->gpu_error.test_irq_rings) & intel_ring_flag(ring);
 	int state = interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
-	DEFINE_WAIT(wait);
-	unsigned long timeout_expire;
+	struct intel_breadcrumb wait;
+	unsigned long timeout_remain;
 	s64 before, now;
 	int ret;
 
-	WARN(!intel_irqs_enabled(dev_priv), "IRQs disabled");
-
 	if (list_empty(&req->list))
 		return 0;
 
 	if (i915_gem_request_completed(req, true))
 		return 0;
 
-	timeout_expire = 0;
+	timeout_remain = MAX_SCHEDULE_TIMEOUT;
 	if (timeout) {
 		if (WARN_ON(*timeout < 0))
 			return -EINVAL;
@@ -1258,34 +1234,43 @@  int __i915_wait_request(struct drm_i915_gem_request *req,
 		if (*timeout == 0)
 			return -ETIME;
 
-		timeout_expire = jiffies + nsecs_to_jiffies_timeout(*timeout);
+		timeout_remain = nsecs_to_jiffies_timeout(*timeout);
 	}
 
-	if (INTEL_INFO(dev_priv)->gen >= 6)
-		gen6_rps_boost(dev_priv, rps, req->emitted_jiffies);
-
 	/* Record current time in case interrupted by signal, or wedged */
 	trace_i915_gem_request_wait_begin(req);
 	before = ktime_get_raw_ns();
 
-	/* Optimistic spin for the next jiffie before touching IRQs */
-	ret = __i915_spin_request(req, state);
-	if (ret == 0)
-		goto out;
+	if (INTEL_INFO(req->i915)->gen >= 6)
+		gen6_rps_boost(req->i915, rps, req->emitted_jiffies);
 
-	if (!irq_test_in_progress && WARN_ON(!ring->irq_get(ring))) {
-		ret = -ENODEV;
-		goto out;
+	/* Optimistic spin for the next jiffie before touching IRQs */
+	if (intel_breadcrumbs_add_waiter(req, &wait)) {
+		ret = __i915_spin_request(req, state);
+		if (ret == 0)
+			goto out;
 	}
 
 	for (;;) {
-		struct timer_list timer;
-
-		prepare_to_wait(&ring->irq_queue, &wait, state);
+		set_task_state(wait.task, state);
+
+		/* Ensure our read of the seqno is coherent so that we
+		 * do not "miss an interrupt" (i.e. if this is the last
+		 * request and the seqno write from the GPU is not visible
+		 * by the time the interrupt fires, we will see that the
+		 * request is incomplete and go back to sleep awaiting
+		 * another interrupt that will never come.)
+		 *
+		 * Strictly, we only need to do this once after an interrupt,
+		 * but it is easier and safer to do it every time the waiter
+		 * is woken.
+		 */
+		if (i915_gem_request_completed(req, false)) {
+			ret = 0;
+			break;
+		}
 
-		/* We need to check whether any gpu reset happened in between
-		 * the caller grabbing the seqno and now ... */
-		if (req->reset_counter != i915_reset_counter(&dev_priv->gpu_error)) {
+		if (unlikely(req->reset_counter != i915_reset_counter(&req->i915->gpu_error))) {
 			/* As we do not requeue the request over a GPU reset,
 			 * if one does occur we know that the request is
 			 * effectively complete.
@@ -1294,46 +1279,21 @@  int __i915_wait_request(struct drm_i915_gem_request *req,
 			break;
 		}
 
-		if (i915_gem_request_completed(req, false)) {
-			ret = 0;
-			break;
-		}
-
-		if (signal_pending_state(state, current)) {
+		if (signal_pending_state(state, wait.task)) {
 			ret = -ERESTARTSYS;
 			break;
 		}
 
-		if (timeout && time_after_eq(jiffies, timeout_expire)) {
+		timeout_remain = io_schedule_timeout(timeout_remain);
+		if (timeout_remain == 0) {
 			ret = -ETIME;
 			break;
 		}
-
-		i915_queue_hangcheck(dev_priv);
-
-		timer.function = NULL;
-		if (timeout || missed_irq(dev_priv, ring)) {
-			unsigned long expire;
-
-			setup_timer_on_stack(&timer, fake_irq, (unsigned long)current);
-			expire = missed_irq(dev_priv, ring) ? jiffies + 1 : timeout_expire;
-			mod_timer(&timer, expire);
-		}
-
-		io_schedule();
-
-		if (timer.function) {
-			del_singleshot_timer_sync(&timer);
-			destroy_timer_on_stack(&timer);
-		}
 	}
-	if (!irq_test_in_progress)
-		ring->irq_put(ring);
-
-	finish_wait(&ring->irq_queue, &wait);
-
+	__set_task_state(wait.task, TASK_RUNNING);
 out:
 	now = ktime_get_raw_ns();
+	intel_breadcrumbs_remove_waiter(req, &wait);
 	trace_i915_gem_request_wait_end(req);
 
 	if (timeout) {
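
The rewritten wait loop above follows the publish-then-check sleep
pattern: the task state is set before the coherent seqno read, so a
wakeup issued between the check and io_schedule_timeout() is not lost.
A condensed sketch of the pattern (abbreviated from the hunk above,
not the literal code; reset handling elided):

	for (;;) {
		set_task_state(wait.task, state);	/* publish first */
		if (i915_gem_request_completed(req, false))
			break;			/* coherent seqno re-check */
		if (signal_pending_state(state, wait.task))
			break;			/* -ERESTARTSYS */
		timeout_remain = io_schedule_timeout(timeout_remain);
		if (timeout_remain == 0)
			break;			/* -ETIME */
	}
	__set_task_state(wait.task, TASK_RUNNING);
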
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 06ca4082735b..f805d117f3d1 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -900,7 +900,7 @@  static void i915_record_ring_state(struct drm_device *dev,
 		ering->instdone = I915_READ(GEN2_INSTDONE);
 	}
 
-	ering->waiting = waitqueue_active(&ring->irq_queue);
+	ering->waiting = intel_engine_has_waiter(ring);
 	ering->instpm = I915_READ(RING_INSTPM(ring->mmio_base));
 	ering->seqno = ring->get_seqno(ring, false);
 	ering->acthd = intel_ring_get_active_head(ring);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 28c0329f8281..57b113132fd7 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -996,12 +996,11 @@  static void ironlake_rps_change_irq_handler(struct drm_device *dev)
 
 static void notify_ring(struct intel_engine_cs *ring)
 {
-	if (!intel_ring_initialized(ring))
+	if (ring->i915 == NULL)
 		return;
 
 	trace_i915_gem_request_notify(ring);
-
-	wake_up_all(&ring->irq_queue);
+	intel_engine_wakeup(ring);
 }
 
 static void vlv_c0_read(struct drm_i915_private *dev_priv,
@@ -1083,7 +1082,7 @@  static bool any_waiters(struct drm_i915_private *dev_priv)
 	int i;
 
 	for_each_ring(ring, dev_priv, i)
-		if (ring->irq_refcount)
+		if (intel_engine_has_waiter(ring))
 			return true;
 
 	return false;
@@ -2411,7 +2410,7 @@  static void i915_error_wake_up(struct drm_i915_private *dev_priv,
 
 	/* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
 	for_each_ring(ring, dev_priv, i)
-		wake_up_all(&ring->irq_queue);
+		intel_engine_wakeup(ring);
 
 	/* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
 	wake_up_all(&dev_priv->pending_flip_queue);
@@ -2986,16 +2985,18 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 			if (ring_idle(ring, seqno)) {
 				ring->hangcheck.action = HANGCHECK_IDLE;
 
-				if (waitqueue_active(&ring->irq_queue)) {
+				if (intel_engine_has_waiter(ring)) {
 					/* Issue a wake-up to catch stuck h/w. */
 					if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
-						if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)))
+						if (!test_bit(ring->id, &dev_priv->gpu_error.test_irq_rings))
 							DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
 								  ring->name);
 						else
 							DRM_INFO("Fake missed irq on %s\n",
 								 ring->name);
-						wake_up_all(&ring->irq_queue);
+						mod_delayed_work(system_highpri_wq,
+								 &ring->breadcrumbs.irq,
+								 0);
 					}
 					/* Safeguard against driver failure */
 					ring->hangcheck.score += BUSY;
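
Note that hangcheck no longer kicks a waitqueue directly: scheduling
the per-engine breadcrumbs worker with zero delay runs
intel_breadcrumbs_irq() (added below) promptly, which wakes the
bottom-half and, for as long as the missed_irq bit remains set,
re-arms itself every jiffie. Condensed from that worker (b->lock
held):

	if (!b->irq_enabled ||
	    test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) {
		wake_up_process(b->first_waiter);	/* force a seqno check */
		queue_delayed_work(system_highpri_wq, &b->irq, 1);
	}
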
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
new file mode 100644
index 000000000000..17d51269b774
--- /dev/null
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -0,0 +1,197 @@ 
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#include <linux/kthread.h>
+
+#include "i915_drv.h"
+
+static bool irq_get(struct intel_engine_cs *engine)
+{
+	if (test_bit(engine->id, &engine->i915->gpu_error.test_irq_rings))
+		return false;
+
+	if (!intel_irqs_enabled(engine->i915))
+		return false;
+
+	return engine->irq_get(engine);
+}
+
+static void intel_breadcrumbs_irq(struct work_struct *work)
+{
+	struct intel_breadcrumbs *b =
+		container_of(work, struct intel_breadcrumbs, irq.work);
+	struct intel_engine_cs *engine =
+		container_of(b, struct intel_engine_cs, breadcrumbs);
+
+	/* We unmask the user-interrupt in the background so that the
+	 * first-waiter can do a little busywaiting before sleeping and
+	 * not incur additional latency from setting up the IRQ.
+	 *
+	 * The worker also persists in case we cannot enable interrupts,
+	 * or if we have previously seen seqno/interrupt incoherency
+	 * ("missed interrupt" syndrome). Here the worker will wake up
+	 * every jiffie in order to kick the original waiter to do the
+	 * coherent seqno check. (Not as efficient as having a timer
+	 * source kick the waiter directly, but this should only have to
+	 * be used in rare cases where we try to workaround some
+	 * hardware/driver failures.)
+	 */
+
+	spin_lock(&b->lock);
+	if (b->first_waiter) {
+		if (!b->rpm_wakelock) {
+			intel_runtime_pm_get(engine->i915);
+			b->rpm_wakelock = true;
+		}
+
+		if (!b->irq_enabled) {
+			b->irq_enabled = irq_get(engine);
+			/* If we were delayed, the original waiter may have
+			 * gone to sleep already and we may have missed the
+			 * interrupt. So after enabling the interrupt,
+			 * kick the waiter.
+			 */
+			wake_up_process(b->first_waiter);
+		}
+
+		/* No interrupts? Kick the waiter every jiffie! */
+		if (!b->irq_enabled ||
+		    test_bit(engine->id,
+			     &engine->i915->gpu_error.missed_irq_rings)) {
+			wake_up_process(b->first_waiter);
+			queue_delayed_work(system_highpri_wq, &b->irq, 1);
+		}
+	} else {
+		if (b->irq_enabled) {
+			engine->irq_put(engine);
+			b->irq_enabled = false;
+		}
+		if (b->rpm_wakelock) {
+			intel_runtime_pm_put(engine->i915);
+			b->rpm_wakelock = false;
+		}
+	}
+	spin_unlock(&b->lock);
+}
+
+static inline struct intel_breadcrumb *to_wait(struct rb_node *node)
+{
+	if (node == NULL)
+		return NULL;
+
+	return container_of(node, struct intel_breadcrumb, node);
+}
+
+bool intel_breadcrumbs_add_waiter(struct drm_i915_gem_request *request,
+				  struct intel_breadcrumb *wait)
+{
+	struct intel_breadcrumbs *b = &request->ring->breadcrumbs;
+	struct rb_node **p, *parent;
+	bool first = false;
+
+	wait->task = current;
+	wait->seqno = request->seqno;
+
+	spin_lock(&b->lock);
+
+	/* First? Unmask the user interrupt */
+	if (b->first_waiter == NULL)
+		queue_delayed_work(system_highpri_wq, &b->irq, 0);
+
+	/* Insert the request into the retirement-ordered list
+	 * of waiters by walking the rbtree. If we are the oldest
+	 * seqno in the tree (the first to be retired), then
+	 * set ourselves as the bottom-half.
+	 */
+	first = true;
+	parent = NULL;
+	p = &b->requests.rb_node;
+	while (*p) {
+		parent = *p;
+		if (i915_seqno_passed(wait->seqno, to_wait(parent)->seqno)) {
+			p = &parent->rb_right;
+			first = false;
+		} else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&wait->node, parent, p);
+	rb_insert_color(&wait->node, &b->requests);
+
+	if (first)
+		b->first_waiter = wait->task;
+
+	spin_unlock(&b->lock);
+
+	return first;
+}
+
+void intel_breadcrumbs_remove_waiter(struct drm_i915_gem_request *request,
+				     struct intel_breadcrumb *wait)
+{
+	struct intel_breadcrumbs *b = &request->ring->breadcrumbs;
+
+	spin_lock(&b->lock);
+
+	if (b->first_waiter == wait->task) {
+		struct intel_breadcrumb *next;
+		struct task_struct *task;
+
+		/* We are the current bottom-half. Find the next candidate,
+		 * the first waiter in the queue on the remaining oldest
+		 * request. As multiple seqnos may complete in the time it
+		 * takes us to wake up and find the next waiter, we have to
+		 * wake up that waiter for it to perform its own coherent
+		 * completion check.
+		 */
+		task = NULL;
+		next = to_wait(rb_next(&wait->node));
+		if (next)
+			task = next->task;
+
+		b->first_waiter = task;
+		if (task)
+			wake_up_process(task);
+		else
+			/* Disable the user interrupts */
+			mod_delayed_work(system_highpri_wq, &b->irq, 1);
+	}
+
+	rb_erase(&wait->node, &b->requests);
+	spin_unlock(&b->lock);
+}
+
+void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
+{
+	struct intel_breadcrumbs *b = &engine->breadcrumbs;
+
+	spin_lock_init(&b->lock);
+	INIT_DELAYED_WORK(&b->irq, intel_breadcrumbs_irq);
+}
+
+void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
+{
+	struct intel_breadcrumbs *b = &engine->breadcrumbs;
+
+	flush_delayed_work(&b->irq);
+}
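
The rbtree above is keyed by i915_seqno_passed(), so the leftmost node
is always the waiter on the oldest outstanding request, i.e. the
bottom-half. On removal, the baton is passed to rb_next(); a condensed
sketch of that handoff (from intel_breadcrumbs_remove_waiter() above,
under b->lock, not a drop-in):

	if (b->first_waiter == wait->task) {
		struct intel_breadcrumb *next = to_wait(rb_next(&wait->node));

		b->first_waiter = next ? next->task : NULL;
		if (next)
			wake_up_process(next->task); /* it re-checks its seqno */
		else	/* no waiters left: let the worker disable the irq */
			mod_delayed_work(system_highpri_wq, &b->irq, 1);
	}
	rb_erase(&wait->node, &b->requests);
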
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 08b982a6ae81..ce5119e2bcef 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1904,6 +1904,8 @@  void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
 	i915_cmd_parser_fini_ring(ring);
 	i915_gem_batch_pool_fini(&ring->batch_pool);
 
+	intel_engine_fini_breadcrumbs(ring);
+
 	if (ring->status_page.obj) {
 		kunmap(sg_page(ring->status_page.obj->pages->sgl));
 		ring->status_page.obj = NULL;
@@ -1920,10 +1922,11 @@  static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin
 	ring->buffer = NULL;
 
 	ring->dev = dev;
+	ring->i915 = to_i915(dev);
 	INIT_LIST_HEAD(&ring->active_list);
 	INIT_LIST_HEAD(&ring->request_list);
 	i915_gem_batch_pool_init(dev, &ring->batch_pool);
-	init_waitqueue_head(&ring->irq_queue);
+	intel_engine_init_breadcrumbs(ring);
 
 	INIT_LIST_HEAD(&ring->buffers);
 	INIT_LIST_HEAD(&ring->execlist_queue);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 511efe556d73..8a0f02e73a4e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2157,6 +2157,7 @@  static int intel_init_ring_buffer(struct drm_device *dev,
 	WARN_ON(ring->buffer);
 
 	ring->dev = dev;
+	ring->i915 = to_i915(dev);
 	INIT_LIST_HEAD(&ring->active_list);
 	INIT_LIST_HEAD(&ring->request_list);
 	INIT_LIST_HEAD(&ring->execlist_queue);
@@ -2164,7 +2165,7 @@  static int intel_init_ring_buffer(struct drm_device *dev,
 	i915_gem_batch_pool_init(dev, &ring->batch_pool);
 	memset(ring->semaphore.sync_seqno, 0, sizeof(ring->semaphore.sync_seqno));
 
-	init_waitqueue_head(&ring->irq_queue);
+	intel_engine_init_breadcrumbs(ring);
 
 	ringbuf = intel_engine_create_ringbuffer(ring, 32 * PAGE_SIZE);
 	if (IS_ERR(ringbuf))
@@ -2225,6 +2226,8 @@  void intel_cleanup_ring_buffer(struct intel_engine_cs *ring)
 
 	i915_cmd_parser_fini_ring(ring);
 	i915_gem_batch_pool_fini(&ring->batch_pool);
+
+	intel_engine_fini_breadcrumbs(ring);
 }
 
 static int ring_wait_for_space(struct intel_engine_cs *ring, int n)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 5d1eb206151d..7dde9310df1b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -157,9 +157,31 @@  struct  intel_engine_cs {
 #define LAST_USER_RING (VECS + 1)
 	u32		mmio_base;
 	struct		drm_device *dev;
+	struct drm_i915_private *i915;
 	struct intel_ringbuffer *buffer;
 	struct list_head buffers;
 
+	/* Rather than have every client wait upon all user interrupts,
+	 * with the herd waking after every interrupt and each doing the
+	 * heavyweight seqno dance, we delegate the task to the first client.
+	 * After the interrupt, we wake up one client, who does the heavyweight
+	 * coherent seqno read and either goes back to sleep (if incomplete),
+	 * or wakes up the next client in the queue and so forth.
+	 *
+	 * Compared with walking the entire list of waiters in a single
+	 * dedicated bottom-half, we reduce the latency of the first waiter
+	 * by avoiding a context switch, but incur additional coherent seqno
+	 * reads when following the chain of request breadcrumbs.
+	 */
+	struct intel_breadcrumbs {
+		spinlock_t lock; /* protects the lists of requests */
+		struct rb_root requests; /* sorted by retirement */
+		struct task_struct *first_waiter; /* bh for user interrupts */
+		struct delayed_work irq; /* used to enable or fake interrupts */
+		bool irq_enabled : 1;
+		bool rpm_wakelock : 1;
+	} breadcrumbs;
+
 	/*
 	 * A pool of objects to use as shadow copies of client batch buffers
 	 * when the command parser is enabled. Prevents the client from
@@ -303,8 +325,6 @@  struct  intel_engine_cs {
 
 	bool gpu_caches_dirty;
 
-	wait_queue_head_t irq_queue;
-
 	struct intel_context *default_context;
 	struct intel_context *last_context;
 
@@ -506,4 +526,27 @@  void intel_ring_reserved_space_end(struct intel_ringbuffer *ringbuf);
 /* Legacy ringbuffer specific portion of reservation code: */
 int intel_ring_reserve_space(struct drm_i915_gem_request *request);
 
+/* intel_breadcrumbs.c -- user interrupt bottom-half for waiters */
+struct intel_breadcrumb {
+	struct rb_node node;
+	struct task_struct *task;
+	u32 seqno;
+};
+void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine);
+bool intel_breadcrumbs_add_waiter(struct drm_i915_gem_request *request,
+				  struct intel_breadcrumb *wait);
+void intel_breadcrumbs_remove_waiter(struct drm_i915_gem_request *request,
+				     struct intel_breadcrumb *wait);
+static inline bool intel_engine_has_waiter(struct intel_engine_cs *engine)
+{
+	return READ_ONCE(engine->breadcrumbs.first_waiter);
+}
+static inline void intel_engine_wakeup(struct intel_engine_cs *engine)
+{
+	struct task_struct *task = READ_ONCE(engine->breadcrumbs.first_waiter);
+	if (task)
+		wake_up_process(task);
+}
+void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine);
+
 #endif /* _INTEL_RINGBUFFER_H_ */
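
For reference, a hedged usage sketch of the three-call interface
exported above, mirroring __i915_wait_request() (error and timeout
handling elided):

	struct intel_breadcrumb wait;

	if (intel_breadcrumbs_add_waiter(req, &wait)) {
		/* We are the bottom-half and may busywait before
		 * sleeping, as no irq has been set up yet. */
	}
	/* sleep/wake loop: every wakeup does a coherent seqno check,
	 * then either completes or hands the wakeup down the chain */
	intel_breadcrumbs_remove_waiter(req, &wait);
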