diff mbox

[1/2] drm/i915: Fix __wait_seqno to use true infinite timeouts

Message ID 1377525114-8976-2-git-send-email-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson Aug. 26, 2013, 1:51 p.m. UTC
When we switched to always using a timeout in conjunction with
wait_seqno, we lost the ability to detect missed interrupts. Since, we
have had issues with interrupts on a number of generations, and they are
required to be delivered in a timely fashion for a smooth UX, it is
important that we do log errors found in the wild and prevent the
display stalling for upwards of 1s every time the seqno interrupt is
missed.

Rather than continue to fix up the timeouts to work around the interface
impedence in wait_event_*(), open code the combination of
wait_event[_interruptible][_timeout], and use the exposed timer to
poll for seqno should we detect a lost interrupt.

v2: In order to satisfy the debug requirement of logging missed
interrupts with the real world requirments of making machines work even
if interrupts are hosed, we revert to polling after detecting a missed
interrupt.

v3: Throw in a debugfs interface to simulate broken hw not reporting
interrupts.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c   |  68 ++++++++++++++++++++
 drivers/gpu/drm/i915/i915_drv.h       |   5 ++
 drivers/gpu/drm/i915/i915_gem.c       | 114 ++++++++++++++++++++--------------
 drivers/gpu/drm/i915/i915_gpu_error.c |   1 +
 drivers/gpu/drm/i915/i915_irq.c       |  11 ++--
 5 files changed, 148 insertions(+), 51 deletions(-)

Comments

Ben Widawsky Aug. 27, 2013, 6:11 a.m. UTC | #1
On Mon, Aug 26, 2013 at 02:51:53PM +0100, Chris Wilson wrote:
> When we switched to always using a timeout in conjunction with
> wait_seqno, we lost the ability to detect missed interrupts. Since, we
> have had issues with interrupts on a number of generations, and they are
> required to be delivered in a timely fashion for a smooth UX, it is
> important that we do log errors found in the wild and prevent the
> display stalling for upwards of 1s every time the seqno interrupt is
> missed.
> 
> Rather than continue to fix up the timeouts to work around the interface
> impedence in wait_event_*(), open code the combination of
> wait_event[_interruptible][_timeout], and use the exposed timer to
> poll for seqno should we detect a lost interrupt.
> 
> v2: In order to satisfy the debug requirement of logging missed
> interrupts with the real world requirments of making machines work even
> if interrupts are hosed, we revert to polling after detecting a missed
> interrupt.
> 
> v3: Throw in a debugfs interface to simulate broken hw not reporting
> interrupts.

This interface should be a separate patch.

> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c   |  68 ++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_drv.h       |   5 ++
>  drivers/gpu/drm/i915/i915_gem.c       | 114 ++++++++++++++++++++--------------
>  drivers/gpu/drm/i915/i915_gpu_error.c |   1 +
>  drivers/gpu/drm/i915/i915_irq.c       |  11 ++--
>  5 files changed, 148 insertions(+), 51 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 55ab924..e823169 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1876,6 +1876,72 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_ring_stop_fops,
>  			i915_ring_stop_get, i915_ring_stop_set,
>  			"0x%08llx\n");
>  
> +static int
> +i915_ring_missed_irq_get(void *data, u64 *val)
> +{
> +	struct drm_device *dev = data;
> +	drm_i915_private_t *dev_priv = dev->dev_private;
> +
> +	*val = dev_priv->gpu_error.missed_irq_rings;
> +	return 0;
> +}
> +
> +static int
> +i915_ring_missed_irq_set(void *data, u64 val)
> +{
> +	struct drm_device *dev = data;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	int ret;
> +
> +	/* Lock against concurrent debugfs callers */
> +	ret = mutex_lock_interruptible(&dev->struct_mutex);
> +	if (ret)
> +		return ret;
> +	dev_priv->gpu_error.missed_irq_rings = val;
> +	mutex_unlock(&dev->struct_mutex);
> +
> +	return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
> +			i915_ring_missed_irq_get, i915_ring_missed_irq_set,
> +			"0x%08llx\n");
> +
> +static int
> +i915_ring_test_irq_get(void *data, u64 *val)
> +{
> +	struct drm_device *dev = data;
> +	drm_i915_private_t *dev_priv = dev->dev_private;
> +
> +	*val = dev_priv->gpu_error.test_irq_rings;
> +
> +	return 0;
> +}
> +
> +static int
> +i915_ring_test_irq_set(void *data, u64 val)
> +{
> +	struct drm_device *dev = data;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	int ret;
> +
> +	DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
> +
> +	/* Lock against concurrent debugfs callers */
> +	ret = mutex_lock_interruptible(&dev->struct_mutex);
> +	if (ret)
> +		return ret;
> +
> +	dev_priv->gpu_error.test_irq_rings = val;
> +	mutex_unlock(&dev->struct_mutex);
> +
> +	return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
> +			i915_ring_test_irq_get, i915_ring_test_irq_set,
> +			"0x%08llx\n");
> +
>  #define DROP_UNBOUND 0x1
>  #define DROP_BOUND 0x2
>  #define DROP_RETIRE 0x4
> @@ -2269,6 +2335,8 @@ static struct i915_debugfs_files {
>  	{"i915_min_freq", &i915_min_freq_fops},
>  	{"i915_cache_sharing", &i915_cache_sharing_fops},
>  	{"i915_ring_stop", &i915_ring_stop_fops},
> +	{"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
> +	{"i915_ring_test_irq", &i915_ring_test_irq_fops},
>  	{"i915_gem_drop_caches", &i915_drop_caches_fops},
>  	{"i915_error_state", &i915_error_state_fops},
>  	{"i915_next_seqno", &i915_next_seqno_fops},
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 84b95b1..ac6db6e 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -993,6 +993,8 @@ struct i915_gpu_error {
>  
>  	unsigned long last_reset;
>  
> +	unsigned long missed_irq_rings;
> +
>  	/**
>  	 * State variable and reset counter controlling the reset flow
>  	 *
> @@ -1031,6 +1033,9 @@ struct i915_gpu_error {
>  
>  	/* For gpu hang simulation. */
>  	unsigned int stop_rings;
> +
> +	/* For missed irq/seqno simulation. */
> +	unsigned int test_irq_rings;
>  };
>  
>  enum modeset_restore {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index ccfebf5..4ed782f 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -970,6 +970,17 @@ i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno)
>  	return ret;
>  }
>  
> +static void fake_irq(unsigned long data)
> +{
> +	wake_up_process((struct task_struct *)data);
> +}
> +
> +static bool missed_irq(struct drm_i915_private *dev_priv,
> +		       struct intel_ring_buffer *ring)
> +{
> +	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
> +}
> +
>  /**
>   * __wait_seqno - wait until execution of seqno has finished
>   * @ring: the ring expected to report seqno
> @@ -993,10 +1004,9 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
>  			bool interruptible, struct timespec *timeout)
>  {
>  	drm_i915_private_t *dev_priv = ring->dev->dev_private;
> -	struct timespec before, now, wait_time={1,0};
> -	unsigned long timeout_jiffies;
> -	long end;
> -	bool wait_forever = true;
> +	struct timespec before, now;
> +	DEFINE_WAIT(wait);
> +	long timeout_jiffies;
>  	int ret;
>  
>  	WARN(dev_priv->pc8.irqs_disabled, "IRQs disabled\n");
> @@ -1004,51 +1014,71 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
>  	if (i915_seqno_passed(ring->get_seqno(ring, true), seqno))
>  		return 0;
>  
> -	trace_i915_gem_request_wait_begin(ring, seqno);
> -
> -	if (timeout != NULL) {
> -		wait_time = *timeout;
> -		wait_forever = false;
> -	}
> -
> -	timeout_jiffies = timespec_to_jiffies_timeout(&wait_time);
> +	timeout_jiffies = timeout ? timespec_to_jiffies_timeout(timeout) : 1;
>  
> -	if (WARN_ON(!ring->irq_get(ring)))
> +	if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)) &&
> +	    WARN_ON(!ring->irq_get(ring)))
>  		return -ENODEV;
>  
> -	/* Record current time in case interrupted by signal, or wedged * */
> +	/* Record current time in case interrupted by signal, or wedged */
> +	trace_i915_gem_request_wait_begin(ring, seqno);
>  	getrawmonotonic(&before);
> +	for (;;) {
> +		struct timer_list timer;
> +		unsigned long expire;
>  
> -#define EXIT_COND \
> -	(i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \
> -	 i915_reset_in_progress(&dev_priv->gpu_error) || \
> -	 reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))
> -	do {
> -		if (interruptible)
> -			end = wait_event_interruptible_timeout(ring->irq_queue,
> -							       EXIT_COND,
> -							       timeout_jiffies);
> -		else
> -			end = wait_event_timeout(ring->irq_queue, EXIT_COND,
> -						 timeout_jiffies);
> +		prepare_to_wait(&ring->irq_queue, &wait,
> +				interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
>  
>  		/* We need to check whether any gpu reset happened in between
>  		 * the caller grabbing the seqno and now ... */
> -		if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))
> -			end = -EAGAIN;
> +		if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
> +			/* ... but upgrade the -EGAIN to an -EIO if the gpu
                                                ^ EAGAIN (I realize it's
                                                copypasta)
> +			 * is truely gone. */
                                ^ truly
> +			ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
> +			if (ret == 0)
> +				ret = -EAGAIN;
> +			break;
> +		}
>  
> -		/* ... but upgrade the -EGAIN to an -EIO if the gpu is truely
> -		 * gone. */
> -		ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
> -		if (ret)
> -			end = ret;
> -	} while (end == 0 && wait_forever);
> +		if (i915_seqno_passed(ring->get_seqno(ring, false), seqno)) {
> +			ret = 0;
> +			break;
> +		}
> +
> +		if (interruptible && signal_pending(current)) {
> +			ret = -ERESTARTSYS;
> +			break;
> +		}
> +

Not sure, but I think the signal check should come first.  As long as
you've thought about it, I'm okay with whatever.

> +		if (timeout_jiffies <= 0) {
> +			ret = -ETIME;
> +			break;
> +		}
>  
> +		timer.function = NULL;
> +		if (timeout || missed_irq(dev_priv, ring)) {
> +			setup_timer_on_stack(&timer, fake_irq, (unsigned long)current);
> +			expire = jiffies + (missed_irq(dev_priv, ring) ? 1: timeout_jiffies);
> +			mod_timer(&timer, expire);
> +		}

Isn't this just a fancy schedule_timeout_interruptible?

> +
> +		schedule();
> +
> +		if (timeout)
> +			timeout_jiffies = expire - jiffies;
> +
> +		if (timer.function) {
> +			del_singleshot_timer_sync(&timer);
> +			destroy_timer_on_stack(&timer);
> +		}
> +	}
>  	getrawmonotonic(&now);
> +	trace_i915_gem_request_wait_end(ring, seqno);
>  
>  	ring->irq_put(ring);
> -	trace_i915_gem_request_wait_end(ring, seqno);
> -#undef EXIT_COND
> +
> +	finish_wait(&ring->irq_queue, &wait);
>  
>  	if (timeout) {
>  		struct timespec sleep_time = timespec_sub(now, before);
> @@ -1057,17 +1087,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
>  			set_normalized_timespec(timeout, 0, 0);
>  	}
>  
> -	switch (end) {
> -	case -EIO:
> -	case -EAGAIN: /* Wedged */
> -	case -ERESTARTSYS: /* Signal */
> -		return (int)end;
> -	case 0: /* Timeout */
> -		return -ETIME;
> -	default: /* Completed */
> -		WARN_ON(end < 0); /* We're not aware of other errors */
> -		return 0;
> -	}
> +	return ret;
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 558e568..62368f4 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -288,6 +288,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
>  	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
>  	err_printf(m, "CCID: 0x%08x\n", error->ccid);
> +	err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings);
>  
>  	for (i = 0; i < dev_priv->num_fence_regs; i++)
>  		err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index a03b445..7ebf105 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1972,10 +1972,13 @@ static void i915_hangcheck_elapsed(unsigned long data)
>  			if (ring_idle(ring, seqno)) {
>  				if (waitqueue_active(&ring->irq_queue)) {
>  					/* Issue a wake-up to catch stuck h/w. */
> -					DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
> -						  ring->name);
> -					wake_up_all(&ring->irq_queue);
> -					ring->hangcheck.score += HUNG;
> +					if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
> +						DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
> +							  ring->name);
> +						wake_up_all(&ring->irq_queue);
> +					}
> +					/* Safeguard against driver failure */
> +					ring->hangcheck.score += BUSY;
>  				} else
>  					busy = false;
>  			} else {

Honestly, I don't think using the scheduler APIs is a step in the right
direction wrt readability - but I have no better solution, and it seems
to do what you want.

Anyway, I tried to review it, and failed. If you want to split it up
more, I can try again.

Meanwhile, the idea seems okay to me.
Imre Deak Sept. 5, 2013, 2:23 p.m. UTC | #2
On Mon, 2013-08-26 at 14:51 +0100, Chris Wilson wrote:
> When we switched to always using a timeout in conjunction with
> wait_seqno, we lost the ability to detect missed interrupts. Since, we
> have had issues with interrupts on a number of generations, and they are
> required to be delivered in a timely fashion for a smooth UX, it is
> important that we do log errors found in the wild and prevent the
> display stalling for upwards of 1s every time the seqno interrupt is
> missed.
> 
> Rather than continue to fix up the timeouts to work around the interface
> impedence in wait_event_*(), open code the combination of
> wait_event[_interruptible][_timeout], and use the exposed timer to
> poll for seqno should we detect a lost interrupt.
> 
> v2: In order to satisfy the debug requirement of logging missed
> interrupts with the real world requirments of making machines work even
> if interrupts are hosed, we revert to polling after detecting a missed
> interrupt.
> 
> v3: Throw in a debugfs interface to simulate broken hw not reporting
> interrupts.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c   |  68 ++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_drv.h       |   5 ++
>  drivers/gpu/drm/i915/i915_gem.c       | 114 ++++++++++++++++++++--------------
>  drivers/gpu/drm/i915/i915_gpu_error.c |   1 +
>  drivers/gpu/drm/i915/i915_irq.c       |  11 ++--
>  5 files changed, 148 insertions(+), 51 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 55ab924..e823169 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1876,6 +1876,72 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_ring_stop_fops,
>  			i915_ring_stop_get, i915_ring_stop_set,
>  			"0x%08llx\n");
>  
> +static int
> +i915_ring_missed_irq_get(void *data, u64 *val)
> +{
> +	struct drm_device *dev = data;
> +	drm_i915_private_t *dev_priv = dev->dev_private;
> +
> +	*val = dev_priv->gpu_error.missed_irq_rings;
> +	return 0;
> +}
> +
> +static int
> +i915_ring_missed_irq_set(void *data, u64 val)
> +{
> +	struct drm_device *dev = data;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	int ret;
> +
> +	/* Lock against concurrent debugfs callers */
> +	ret = mutex_lock_interruptible(&dev->struct_mutex);
> +	if (ret)
> +		return ret;
> +	dev_priv->gpu_error.missed_irq_rings = val;
> +	mutex_unlock(&dev->struct_mutex);
> +
> +	return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
> +			i915_ring_missed_irq_get, i915_ring_missed_irq_set,
> +			"0x%08llx\n");
> +
> +static int
> +i915_ring_test_irq_get(void *data, u64 *val)
> +{
> +	struct drm_device *dev = data;
> +	drm_i915_private_t *dev_priv = dev->dev_private;
> +
> +	*val = dev_priv->gpu_error.test_irq_rings;
> +
> +	return 0;
> +}
> +
> +static int
> +i915_ring_test_irq_set(void *data, u64 val)
> +{
> +	struct drm_device *dev = data;
> +	struct drm_i915_private *dev_priv = dev->dev_private;
> +	int ret;
> +
> +	DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
> +
> +	/* Lock against concurrent debugfs callers */
> +	ret = mutex_lock_interruptible(&dev->struct_mutex);
> +	if (ret)
> +		return ret;
> +
> +	dev_priv->gpu_error.test_irq_rings = val;
> +	mutex_unlock(&dev->struct_mutex);
> +
> +	return 0;
> +}
> +
> +DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
> +			i915_ring_test_irq_get, i915_ring_test_irq_set,
> +			"0x%08llx\n");
> +
>  #define DROP_UNBOUND 0x1
>  #define DROP_BOUND 0x2
>  #define DROP_RETIRE 0x4
> @@ -2269,6 +2335,8 @@ static struct i915_debugfs_files {
>  	{"i915_min_freq", &i915_min_freq_fops},
>  	{"i915_cache_sharing", &i915_cache_sharing_fops},
>  	{"i915_ring_stop", &i915_ring_stop_fops},
> +	{"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
> +	{"i915_ring_test_irq", &i915_ring_test_irq_fops},
>  	{"i915_gem_drop_caches", &i915_drop_caches_fops},
>  	{"i915_error_state", &i915_error_state_fops},
>  	{"i915_next_seqno", &i915_next_seqno_fops},
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 84b95b1..ac6db6e 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -993,6 +993,8 @@ struct i915_gpu_error {
>  
>  	unsigned long last_reset;
>  
> +	unsigned long missed_irq_rings;
> +
>  	/**
>  	 * State variable and reset counter controlling the reset flow
>  	 *
> @@ -1031,6 +1033,9 @@ struct i915_gpu_error {
>  
>  	/* For gpu hang simulation. */
>  	unsigned int stop_rings;
> +
> +	/* For missed irq/seqno simulation. */
> +	unsigned int test_irq_rings;
>  };
>  
>  enum modeset_restore {
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index ccfebf5..4ed782f 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -970,6 +970,17 @@ i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno)
>  	return ret;
>  }
>  
> +static void fake_irq(unsigned long data)
> +{
> +	wake_up_process((struct task_struct *)data);
> +}
> +
> +static bool missed_irq(struct drm_i915_private *dev_priv,
> +		       struct intel_ring_buffer *ring)
> +{
> +	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
> +}
> +
>  /**
>   * __wait_seqno - wait until execution of seqno has finished
>   * @ring: the ring expected to report seqno
> @@ -993,10 +1004,9 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
>  			bool interruptible, struct timespec *timeout)
>  {
>  	drm_i915_private_t *dev_priv = ring->dev->dev_private;
> -	struct timespec before, now, wait_time={1,0};
> -	unsigned long timeout_jiffies;
> -	long end;
> -	bool wait_forever = true;
> +	struct timespec before, now;
> +	DEFINE_WAIT(wait);
> +	long timeout_jiffies;
>  	int ret;
>  
>  	WARN(dev_priv->pc8.irqs_disabled, "IRQs disabled\n");
> @@ -1004,51 +1014,71 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
>  	if (i915_seqno_passed(ring->get_seqno(ring, true), seqno))
>  		return 0;
>  
> -	trace_i915_gem_request_wait_begin(ring, seqno);
> -
> -	if (timeout != NULL) {
> -		wait_time = *timeout;
> -		wait_forever = false;
> -	}
> -
> -	timeout_jiffies = timespec_to_jiffies_timeout(&wait_time);
> +	timeout_jiffies = timeout ? timespec_to_jiffies_timeout(timeout) : 1;
>  
> -	if (WARN_ON(!ring->irq_get(ring)))
> +	if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)) &&
> +	    WARN_ON(!ring->irq_get(ring)))
>  		return -ENODEV;
>  
> -	/* Record current time in case interrupted by signal, or wedged * */
> +	/* Record current time in case interrupted by signal, or wedged */
> +	trace_i915_gem_request_wait_begin(ring, seqno);
>  	getrawmonotonic(&before);
> +	for (;;) {
> +		struct timer_list timer;
> +		unsigned long expire;
>  
> -#define EXIT_COND \
> -	(i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \
> -	 i915_reset_in_progress(&dev_priv->gpu_error) || \
> -	 reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))
> -	do {
> -		if (interruptible)
> -			end = wait_event_interruptible_timeout(ring->irq_queue,
> -							       EXIT_COND,
> -							       timeout_jiffies);
> -		else
> -			end = wait_event_timeout(ring->irq_queue, EXIT_COND,
> -						 timeout_jiffies);
> +		prepare_to_wait(&ring->irq_queue, &wait,
> +				interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
>  
>  		/* We need to check whether any gpu reset happened in between
>  		 * the caller grabbing the seqno and now ... */
> -		if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))
> -			end = -EAGAIN;
> +		if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
> +			/* ... but upgrade the -EGAIN to an -EIO if the gpu

Should be -EAGAIN.

Otherwise looks ok to me:
Reviewed-by: Imre Deak <imre.deak@intel.com>


> +			 * is truely gone. */
> +			ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
> +			if (ret == 0)
> +				ret = -EAGAIN;
> +			break;
> +		}
>  
> -		/* ... but upgrade the -EGAIN to an -EIO if the gpu is truely
> -		 * gone. */
> -		ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
> -		if (ret)
> -			end = ret;
> -	} while (end == 0 && wait_forever);
> +		if (i915_seqno_passed(ring->get_seqno(ring, false), seqno)) {
> +			ret = 0;
> +			break;
> +		}
> +
> +		if (interruptible && signal_pending(current)) {
> +			ret = -ERESTARTSYS;
> +			break;
> +		}
> +
> +		if (timeout_jiffies <= 0) {
> +			ret = -ETIME;
> +			break;
> +		}
>  
> +		timer.function = NULL;
> +		if (timeout || missed_irq(dev_priv, ring)) {
> +			setup_timer_on_stack(&timer, fake_irq, (unsigned long)current);
> +			expire = jiffies + (missed_irq(dev_priv, ring) ? 1: timeout_jiffies);
> +			mod_timer(&timer, expire);
> +		}
> +
> +		schedule();
> +
> +		if (timeout)
> +			timeout_jiffies = expire - jiffies;
> +
> +		if (timer.function) {
> +			del_singleshot_timer_sync(&timer);
> +			destroy_timer_on_stack(&timer);
> +		}
> +	}
>  	getrawmonotonic(&now);
> +	trace_i915_gem_request_wait_end(ring, seqno);
>  
>  	ring->irq_put(ring);
> -	trace_i915_gem_request_wait_end(ring, seqno);
> -#undef EXIT_COND
> +
> +	finish_wait(&ring->irq_queue, &wait);
>  
>  	if (timeout) {
>  		struct timespec sleep_time = timespec_sub(now, before);
> @@ -1057,17 +1087,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
>  			set_normalized_timespec(timeout, 0, 0);
>  	}
>  
> -	switch (end) {
> -	case -EIO:
> -	case -EAGAIN: /* Wedged */
> -	case -ERESTARTSYS: /* Signal */
> -		return (int)end;
> -	case 0: /* Timeout */
> -		return -ETIME;
> -	default: /* Completed */
> -		WARN_ON(end < 0); /* We're not aware of other errors */
> -		return 0;
> -	}
> +	return ret;
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 558e568..62368f4 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -288,6 +288,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
>  	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
>  	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
>  	err_printf(m, "CCID: 0x%08x\n", error->ccid);
> +	err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings);
>  
>  	for (i = 0; i < dev_priv->num_fence_regs; i++)
>  		err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index a03b445..7ebf105 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1972,10 +1972,13 @@ static void i915_hangcheck_elapsed(unsigned long data)
>  			if (ring_idle(ring, seqno)) {
>  				if (waitqueue_active(&ring->irq_queue)) {
>  					/* Issue a wake-up to catch stuck h/w. */
> -					DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
> -						  ring->name);
> -					wake_up_all(&ring->irq_queue);
> -					ring->hangcheck.score += HUNG;
> +					if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
> +						DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
> +							  ring->name);
> +						wake_up_all(&ring->irq_queue);
> +					}
> +					/* Safeguard against driver failure */
> +					ring->hangcheck.score += BUSY;
>  				} else
>  					busy = false;
>  			} else {
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 55ab924..e823169 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1876,6 +1876,72 @@  DEFINE_SIMPLE_ATTRIBUTE(i915_ring_stop_fops,
 			i915_ring_stop_get, i915_ring_stop_set,
 			"0x%08llx\n");
 
+static int
+i915_ring_missed_irq_get(void *data, u64 *val)
+{
+	struct drm_device *dev = data;
+	drm_i915_private_t *dev_priv = dev->dev_private;
+
+	*val = dev_priv->gpu_error.missed_irq_rings;
+	return 0;
+}
+
+static int
+i915_ring_missed_irq_set(void *data, u64 val)
+{
+	struct drm_device *dev = data;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	int ret;
+
+	/* Lock against concurrent debugfs callers */
+	ret = mutex_lock_interruptible(&dev->struct_mutex);
+	if (ret)
+		return ret;
+	dev_priv->gpu_error.missed_irq_rings = val;
+	mutex_unlock(&dev->struct_mutex);
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
+			i915_ring_missed_irq_get, i915_ring_missed_irq_set,
+			"0x%08llx\n");
+
+static int
+i915_ring_test_irq_get(void *data, u64 *val)
+{
+	struct drm_device *dev = data;
+	drm_i915_private_t *dev_priv = dev->dev_private;
+
+	*val = dev_priv->gpu_error.test_irq_rings;
+
+	return 0;
+}
+
+static int
+i915_ring_test_irq_set(void *data, u64 val)
+{
+	struct drm_device *dev = data;
+	struct drm_i915_private *dev_priv = dev->dev_private;
+	int ret;
+
+	DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
+
+	/* Lock against concurrent debugfs callers */
+	ret = mutex_lock_interruptible(&dev->struct_mutex);
+	if (ret)
+		return ret;
+
+	dev_priv->gpu_error.test_irq_rings = val;
+	mutex_unlock(&dev->struct_mutex);
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
+			i915_ring_test_irq_get, i915_ring_test_irq_set,
+			"0x%08llx\n");
+
 #define DROP_UNBOUND 0x1
 #define DROP_BOUND 0x2
 #define DROP_RETIRE 0x4
@@ -2269,6 +2335,8 @@  static struct i915_debugfs_files {
 	{"i915_min_freq", &i915_min_freq_fops},
 	{"i915_cache_sharing", &i915_cache_sharing_fops},
 	{"i915_ring_stop", &i915_ring_stop_fops},
+	{"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
+	{"i915_ring_test_irq", &i915_ring_test_irq_fops},
 	{"i915_gem_drop_caches", &i915_drop_caches_fops},
 	{"i915_error_state", &i915_error_state_fops},
 	{"i915_next_seqno", &i915_next_seqno_fops},
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 84b95b1..ac6db6e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -993,6 +993,8 @@  struct i915_gpu_error {
 
 	unsigned long last_reset;
 
+	unsigned long missed_irq_rings;
+
 	/**
 	 * State variable and reset counter controlling the reset flow
 	 *
@@ -1031,6 +1033,9 @@  struct i915_gpu_error {
 
 	/* For gpu hang simulation. */
 	unsigned int stop_rings;
+
+	/* For missed irq/seqno simulation. */
+	unsigned int test_irq_rings;
 };
 
 enum modeset_restore {
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index ccfebf5..4ed782f 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -970,6 +970,17 @@  i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno)
 	return ret;
 }
 
+static void fake_irq(unsigned long data)
+{
+	wake_up_process((struct task_struct *)data);
+}
+
+static bool missed_irq(struct drm_i915_private *dev_priv,
+		       struct intel_ring_buffer *ring)
+{
+	return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings);
+}
+
 /**
  * __wait_seqno - wait until execution of seqno has finished
  * @ring: the ring expected to report seqno
@@ -993,10 +1004,9 @@  static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
 			bool interruptible, struct timespec *timeout)
 {
 	drm_i915_private_t *dev_priv = ring->dev->dev_private;
-	struct timespec before, now, wait_time={1,0};
-	unsigned long timeout_jiffies;
-	long end;
-	bool wait_forever = true;
+	struct timespec before, now;
+	DEFINE_WAIT(wait);
+	long timeout_jiffies;
 	int ret;
 
 	WARN(dev_priv->pc8.irqs_disabled, "IRQs disabled\n");
@@ -1004,51 +1014,71 @@  static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
 	if (i915_seqno_passed(ring->get_seqno(ring, true), seqno))
 		return 0;
 
-	trace_i915_gem_request_wait_begin(ring, seqno);
-
-	if (timeout != NULL) {
-		wait_time = *timeout;
-		wait_forever = false;
-	}
-
-	timeout_jiffies = timespec_to_jiffies_timeout(&wait_time);
+	timeout_jiffies = timeout ? timespec_to_jiffies_timeout(timeout) : 1;
 
-	if (WARN_ON(!ring->irq_get(ring)))
+	if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)) &&
+	    WARN_ON(!ring->irq_get(ring)))
 		return -ENODEV;
 
-	/* Record current time in case interrupted by signal, or wedged * */
+	/* Record current time in case interrupted by signal, or wedged */
+	trace_i915_gem_request_wait_begin(ring, seqno);
 	getrawmonotonic(&before);
+	for (;;) {
+		struct timer_list timer;
+		unsigned long expire;
 
-#define EXIT_COND \
-	(i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \
-	 i915_reset_in_progress(&dev_priv->gpu_error) || \
-	 reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))
-	do {
-		if (interruptible)
-			end = wait_event_interruptible_timeout(ring->irq_queue,
-							       EXIT_COND,
-							       timeout_jiffies);
-		else
-			end = wait_event_timeout(ring->irq_queue, EXIT_COND,
-						 timeout_jiffies);
+		prepare_to_wait(&ring->irq_queue, &wait,
+				interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
 
 		/* We need to check whether any gpu reset happened in between
 		 * the caller grabbing the seqno and now ... */
-		if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter))
-			end = -EAGAIN;
+		if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
+			/* ... but upgrade the -EGAIN to an -EIO if the gpu
+			 * is truely gone. */
+			ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
+			if (ret == 0)
+				ret = -EAGAIN;
+			break;
+		}
 
-		/* ... but upgrade the -EGAIN to an -EIO if the gpu is truely
-		 * gone. */
-		ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
-		if (ret)
-			end = ret;
-	} while (end == 0 && wait_forever);
+		if (i915_seqno_passed(ring->get_seqno(ring, false), seqno)) {
+			ret = 0;
+			break;
+		}
+
+		if (interruptible && signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		if (timeout_jiffies <= 0) {
+			ret = -ETIME;
+			break;
+		}
 
+		timer.function = NULL;
+		if (timeout || missed_irq(dev_priv, ring)) {
+			setup_timer_on_stack(&timer, fake_irq, (unsigned long)current);
+			expire = jiffies + (missed_irq(dev_priv, ring) ? 1: timeout_jiffies);
+			mod_timer(&timer, expire);
+		}
+
+		schedule();
+
+		if (timeout)
+			timeout_jiffies = expire - jiffies;
+
+		if (timer.function) {
+			del_singleshot_timer_sync(&timer);
+			destroy_timer_on_stack(&timer);
+		}
+	}
 	getrawmonotonic(&now);
+	trace_i915_gem_request_wait_end(ring, seqno);
 
 	ring->irq_put(ring);
-	trace_i915_gem_request_wait_end(ring, seqno);
-#undef EXIT_COND
+
+	finish_wait(&ring->irq_queue, &wait);
 
 	if (timeout) {
 		struct timespec sleep_time = timespec_sub(now, before);
@@ -1057,17 +1087,7 @@  static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
 			set_normalized_timespec(timeout, 0, 0);
 	}
 
-	switch (end) {
-	case -EIO:
-	case -EAGAIN: /* Wedged */
-	case -ERESTARTSYS: /* Signal */
-		return (int)end;
-	case 0: /* Timeout */
-		return -ETIME;
-	default: /* Completed */
-		WARN_ON(end < 0); /* We're not aware of other errors */
-		return 0;
-	}
+	return ret;
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 558e568..62368f4 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -288,6 +288,7 @@  int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
 	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
 	err_printf(m, "CCID: 0x%08x\n", error->ccid);
+	err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings);
 
 	for (i = 0; i < dev_priv->num_fence_regs; i++)
 		err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index a03b445..7ebf105 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1972,10 +1972,13 @@  static void i915_hangcheck_elapsed(unsigned long data)
 			if (ring_idle(ring, seqno)) {
 				if (waitqueue_active(&ring->irq_queue)) {
 					/* Issue a wake-up to catch stuck h/w. */
-					DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
-						  ring->name);
-					wake_up_all(&ring->irq_queue);
-					ring->hangcheck.score += HUNG;
+					if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
+						DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
+							  ring->name);
+						wake_up_all(&ring->irq_queue);
+					}
+					/* Safeguard against driver failure */
+					ring->hangcheck.score += BUSY;
 				} else
 					busy = false;
 			} else {