Message ID | 1377525114-8976-2-git-send-email-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Mon, Aug 26, 2013 at 02:51:53PM +0100, Chris Wilson wrote: > When we switched to always using a timeout in conjunction with > wait_seqno, we lost the ability to detect missed interrupts. Since, we > have had issues with interrupts on a number of generations, and they are > required to be delivered in a timely fashion for a smooth UX, it is > important that we do log errors found in the wild and prevent the > display stalling for upwards of 1s every time the seqno interrupt is > missed. > > Rather than continue to fix up the timeouts to work around the interface > impedence in wait_event_*(), open code the combination of > wait_event[_interruptible][_timeout], and use the exposed timer to > poll for seqno should we detect a lost interrupt. > > v2: In order to satisfy the debug requirement of logging missed > interrupts with the real world requirments of making machines work even > if interrupts are hosed, we revert to polling after detecting a missed > interrupt. > > v3: Throw in a debugfs interface to simulate broken hw not reporting > interrupts. This interface should be a separate patch. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > --- > drivers/gpu/drm/i915/i915_debugfs.c | 68 ++++++++++++++++++++ > drivers/gpu/drm/i915/i915_drv.h | 5 ++ > drivers/gpu/drm/i915/i915_gem.c | 114 ++++++++++++++++++++-------------- > drivers/gpu/drm/i915/i915_gpu_error.c | 1 + > drivers/gpu/drm/i915/i915_irq.c | 11 ++-- > 5 files changed, 148 insertions(+), 51 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > index 55ab924..e823169 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -1876,6 +1876,72 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_ring_stop_fops, > i915_ring_stop_get, i915_ring_stop_set, > "0x%08llx\n"); > > +static int > +i915_ring_missed_irq_get(void *data, u64 *val) > +{ > + struct drm_device *dev = data; > + drm_i915_private_t *dev_priv = dev->dev_private; > + > + *val = dev_priv->gpu_error.missed_irq_rings; > + return 0; > +} > + > +static int > +i915_ring_missed_irq_set(void *data, u64 val) > +{ > + struct drm_device *dev = data; > + struct drm_i915_private *dev_priv = dev->dev_private; > + int ret; > + > + /* Lock against concurrent debugfs callers */ > + ret = mutex_lock_interruptible(&dev->struct_mutex); > + if (ret) > + return ret; > + dev_priv->gpu_error.missed_irq_rings = val; > + mutex_unlock(&dev->struct_mutex); > + > + return 0; > +} > + > +DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops, > + i915_ring_missed_irq_get, i915_ring_missed_irq_set, > + "0x%08llx\n"); > + > +static int > +i915_ring_test_irq_get(void *data, u64 *val) > +{ > + struct drm_device *dev = data; > + drm_i915_private_t *dev_priv = dev->dev_private; > + > + *val = dev_priv->gpu_error.test_irq_rings; > + > + return 0; > +} > + > +static int > +i915_ring_test_irq_set(void *data, u64 val) > +{ > + struct drm_device *dev = data; > + struct drm_i915_private *dev_priv = dev->dev_private; > + int ret; > + > + DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val); > + > + /* Lock against concurrent debugfs callers */ > + ret = mutex_lock_interruptible(&dev->struct_mutex); > + if (ret) > + return ret; > + > + dev_priv->gpu_error.test_irq_rings = val; > + mutex_unlock(&dev->struct_mutex); > + > + return 0; > +} > + > +DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops, > + i915_ring_test_irq_get, i915_ring_test_irq_set, > + "0x%08llx\n"); > + > #define DROP_UNBOUND 0x1 > #define DROP_BOUND 0x2 > #define DROP_RETIRE 0x4 > @@ -2269,6 +2335,8 @@ static struct i915_debugfs_files { > {"i915_min_freq", &i915_min_freq_fops}, > {"i915_cache_sharing", &i915_cache_sharing_fops}, > {"i915_ring_stop", &i915_ring_stop_fops}, > + {"i915_ring_missed_irq", &i915_ring_missed_irq_fops}, > + {"i915_ring_test_irq", &i915_ring_test_irq_fops}, > {"i915_gem_drop_caches", &i915_drop_caches_fops}, > {"i915_error_state", &i915_error_state_fops}, > {"i915_next_seqno", &i915_next_seqno_fops}, > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 84b95b1..ac6db6e 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -993,6 +993,8 @@ struct i915_gpu_error { > > unsigned long last_reset; > > + unsigned long missed_irq_rings; > + > /** > * State variable and reset counter controlling the reset flow > * > @@ -1031,6 +1033,9 @@ struct i915_gpu_error { > > /* For gpu hang simulation. */ > unsigned int stop_rings; > + > + /* For missed irq/seqno simulation. */ > + unsigned int test_irq_rings; > }; > > enum modeset_restore { > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index ccfebf5..4ed782f 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -970,6 +970,17 @@ i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno) > return ret; > } > > +static void fake_irq(unsigned long data) > +{ > + wake_up_process((struct task_struct *)data); > +} > + > +static bool missed_irq(struct drm_i915_private *dev_priv, > + struct intel_ring_buffer *ring) > +{ > + return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings); > +} > + > /** > * __wait_seqno - wait until execution of seqno has finished > * @ring: the ring expected to report seqno > @@ -993,10 +1004,9 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno, > bool interruptible, struct timespec *timeout) > { > drm_i915_private_t *dev_priv = ring->dev->dev_private; > - struct timespec before, now, wait_time={1,0}; > - unsigned long timeout_jiffies; > - long end; > - bool wait_forever = true; > + struct timespec before, now; > + DEFINE_WAIT(wait); > + long timeout_jiffies; > int ret; > > WARN(dev_priv->pc8.irqs_disabled, "IRQs disabled\n"); > @@ -1004,51 +1014,71 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno, > if (i915_seqno_passed(ring->get_seqno(ring, true), seqno)) > return 0; > > - trace_i915_gem_request_wait_begin(ring, seqno); > - > - if (timeout != NULL) { > - wait_time = *timeout; > - wait_forever = false; > - } > - > - timeout_jiffies = timespec_to_jiffies_timeout(&wait_time); > + timeout_jiffies = timeout ? timespec_to_jiffies_timeout(timeout) : 1; > > - if (WARN_ON(!ring->irq_get(ring))) > + if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)) && > + WARN_ON(!ring->irq_get(ring))) > return -ENODEV; > > - /* Record current time in case interrupted by signal, or wedged * */ > + /* Record current time in case interrupted by signal, or wedged */ > + trace_i915_gem_request_wait_begin(ring, seqno); > getrawmonotonic(&before); > + for (;;) { > + struct timer_list timer; > + unsigned long expire; > > -#define EXIT_COND \ > - (i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \ > - i915_reset_in_progress(&dev_priv->gpu_error) || \ > - reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) > - do { > - if (interruptible) > - end = wait_event_interruptible_timeout(ring->irq_queue, > - EXIT_COND, > - timeout_jiffies); > - else > - end = wait_event_timeout(ring->irq_queue, EXIT_COND, > - timeout_jiffies); > + prepare_to_wait(&ring->irq_queue, &wait, > + interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); > > /* We need to check whether any gpu reset happened in between > * the caller grabbing the seqno and now ... */ > - if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) > - end = -EAGAIN; > + if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) { > + /* ... but upgrade the -EGAIN to an -EIO if the gpu ^ EAGAIN (I realize it's copypasta) > + * is truely gone. */ ^ truly > + ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible); > + if (ret == 0) > + ret = -EAGAIN; > + break; > + } > > - /* ... but upgrade the -EGAIN to an -EIO if the gpu is truely > - * gone. */ > - ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible); > - if (ret) > - end = ret; > - } while (end == 0 && wait_forever); > + if (i915_seqno_passed(ring->get_seqno(ring, false), seqno)) { > + ret = 0; > + break; > + } > + > + if (interruptible && signal_pending(current)) { > + ret = -ERESTARTSYS; > + break; > + } > + Not sure, but I think the signal check should come first. As long as you've thought about it, I'm okay with whatever. > + if (timeout_jiffies <= 0) { > + ret = -ETIME; > + break; > + } > > + timer.function = NULL; > + if (timeout || missed_irq(dev_priv, ring)) { > + setup_timer_on_stack(&timer, fake_irq, (unsigned long)current); > + expire = jiffies + (missed_irq(dev_priv, ring) ? 1: timeout_jiffies); > + mod_timer(&timer, expire); > + } Isn't this just a fancy schedule_timeout_interruptible? > + > + schedule(); > + > + if (timeout) > + timeout_jiffies = expire - jiffies; > + > + if (timer.function) { > + del_singleshot_timer_sync(&timer); > + destroy_timer_on_stack(&timer); > + } > + } > getrawmonotonic(&now); > + trace_i915_gem_request_wait_end(ring, seqno); > > ring->irq_put(ring); > - trace_i915_gem_request_wait_end(ring, seqno); > -#undef EXIT_COND > + > + finish_wait(&ring->irq_queue, &wait); > > if (timeout) { > struct timespec sleep_time = timespec_sub(now, before); > @@ -1057,17 +1087,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno, > set_normalized_timespec(timeout, 0, 0); > } > > - switch (end) { > - case -EIO: > - case -EAGAIN: /* Wedged */ > - case -ERESTARTSYS: /* Signal */ > - return (int)end; > - case 0: /* Timeout */ > - return -ETIME; > - default: /* Completed */ > - WARN_ON(end < 0); /* We're not aware of other errors */ > - return 0; > - } > + return ret; > } > > /** > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > index 558e568..62368f4 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -288,6 +288,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, > err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); > err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); > err_printf(m, "CCID: 0x%08x\n", error->ccid); > + err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings); > > for (i = 0; i < dev_priv->num_fence_regs; i++) > err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index a03b445..7ebf105 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -1972,10 +1972,13 @@ static void i915_hangcheck_elapsed(unsigned long data) > if (ring_idle(ring, seqno)) { > if (waitqueue_active(&ring->irq_queue)) { > /* Issue a wake-up to catch stuck h/w. */ > - DRM_ERROR("Hangcheck timer elapsed... %s idle\n", > - ring->name); > - wake_up_all(&ring->irq_queue); > - ring->hangcheck.score += HUNG; > + if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) { > + DRM_ERROR("Hangcheck timer elapsed... %s idle\n", > + ring->name); > + wake_up_all(&ring->irq_queue); > + } > + /* Safeguard against driver failure */ > + ring->hangcheck.score += BUSY; > } else > busy = false; > } else { Honestly, I don't think using the scheduler APIs is a step in the right direction wrt readability - but I have no better solution, and it seems to do what you want. Anyway, I tried to review it, and failed. If you want to split it up more, I can try again. Meanwhile, the idea seems okay to me.
On Mon, 2013-08-26 at 14:51 +0100, Chris Wilson wrote: > When we switched to always using a timeout in conjunction with > wait_seqno, we lost the ability to detect missed interrupts. Since, we > have had issues with interrupts on a number of generations, and they are > required to be delivered in a timely fashion for a smooth UX, it is > important that we do log errors found in the wild and prevent the > display stalling for upwards of 1s every time the seqno interrupt is > missed. > > Rather than continue to fix up the timeouts to work around the interface > impedence in wait_event_*(), open code the combination of > wait_event[_interruptible][_timeout], and use the exposed timer to > poll for seqno should we detect a lost interrupt. > > v2: In order to satisfy the debug requirement of logging missed > interrupts with the real world requirments of making machines work even > if interrupts are hosed, we revert to polling after detecting a missed > interrupt. > > v3: Throw in a debugfs interface to simulate broken hw not reporting > interrupts. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > --- > drivers/gpu/drm/i915/i915_debugfs.c | 68 ++++++++++++++++++++ > drivers/gpu/drm/i915/i915_drv.h | 5 ++ > drivers/gpu/drm/i915/i915_gem.c | 114 ++++++++++++++++++++-------------- > drivers/gpu/drm/i915/i915_gpu_error.c | 1 + > drivers/gpu/drm/i915/i915_irq.c | 11 ++-- > 5 files changed, 148 insertions(+), 51 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c > index 55ab924..e823169 100644 > --- a/drivers/gpu/drm/i915/i915_debugfs.c > +++ b/drivers/gpu/drm/i915/i915_debugfs.c > @@ -1876,6 +1876,72 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_ring_stop_fops, > i915_ring_stop_get, i915_ring_stop_set, > "0x%08llx\n"); > > +static int > +i915_ring_missed_irq_get(void *data, u64 *val) > +{ > + struct drm_device *dev = data; > + drm_i915_private_t *dev_priv = dev->dev_private; > + > + *val = dev_priv->gpu_error.missed_irq_rings; > + return 0; > +} > + > +static int > +i915_ring_missed_irq_set(void *data, u64 val) > +{ > + struct drm_device *dev = data; > + struct drm_i915_private *dev_priv = dev->dev_private; > + int ret; > + > + /* Lock against concurrent debugfs callers */ > + ret = mutex_lock_interruptible(&dev->struct_mutex); > + if (ret) > + return ret; > + dev_priv->gpu_error.missed_irq_rings = val; > + mutex_unlock(&dev->struct_mutex); > + > + return 0; > +} > + > +DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops, > + i915_ring_missed_irq_get, i915_ring_missed_irq_set, > + "0x%08llx\n"); > + > +static int > +i915_ring_test_irq_get(void *data, u64 *val) > +{ > + struct drm_device *dev = data; > + drm_i915_private_t *dev_priv = dev->dev_private; > + > + *val = dev_priv->gpu_error.test_irq_rings; > + > + return 0; > +} > + > +static int > +i915_ring_test_irq_set(void *data, u64 val) > +{ > + struct drm_device *dev = data; > + struct drm_i915_private *dev_priv = dev->dev_private; > + int ret; > + > + DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val); > + > + /* Lock against concurrent debugfs callers */ > + ret = mutex_lock_interruptible(&dev->struct_mutex); > + if (ret) > + return ret; > + > + dev_priv->gpu_error.test_irq_rings = val; > + mutex_unlock(&dev->struct_mutex); > + > + return 0; > +} > + > +DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops, > + i915_ring_test_irq_get, i915_ring_test_irq_set, > + "0x%08llx\n"); > + > #define DROP_UNBOUND 0x1 > #define DROP_BOUND 0x2 > #define DROP_RETIRE 0x4 > @@ -2269,6 +2335,8 @@ static struct i915_debugfs_files { > {"i915_min_freq", &i915_min_freq_fops}, > {"i915_cache_sharing", &i915_cache_sharing_fops}, > {"i915_ring_stop", &i915_ring_stop_fops}, > + {"i915_ring_missed_irq", &i915_ring_missed_irq_fops}, > + {"i915_ring_test_irq", &i915_ring_test_irq_fops}, > {"i915_gem_drop_caches", &i915_drop_caches_fops}, > {"i915_error_state", &i915_error_state_fops}, > {"i915_next_seqno", &i915_next_seqno_fops}, > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h > index 84b95b1..ac6db6e 100644 > --- a/drivers/gpu/drm/i915/i915_drv.h > +++ b/drivers/gpu/drm/i915/i915_drv.h > @@ -993,6 +993,8 @@ struct i915_gpu_error { > > unsigned long last_reset; > > + unsigned long missed_irq_rings; > + > /** > * State variable and reset counter controlling the reset flow > * > @@ -1031,6 +1033,9 @@ struct i915_gpu_error { > > /* For gpu hang simulation. */ > unsigned int stop_rings; > + > + /* For missed irq/seqno simulation. */ > + unsigned int test_irq_rings; > }; > > enum modeset_restore { > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c > index ccfebf5..4ed782f 100644 > --- a/drivers/gpu/drm/i915/i915_gem.c > +++ b/drivers/gpu/drm/i915/i915_gem.c > @@ -970,6 +970,17 @@ i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno) > return ret; > } > > +static void fake_irq(unsigned long data) > +{ > + wake_up_process((struct task_struct *)data); > +} > + > +static bool missed_irq(struct drm_i915_private *dev_priv, > + struct intel_ring_buffer *ring) > +{ > + return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings); > +} > + > /** > * __wait_seqno - wait until execution of seqno has finished > * @ring: the ring expected to report seqno > @@ -993,10 +1004,9 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno, > bool interruptible, struct timespec *timeout) > { > drm_i915_private_t *dev_priv = ring->dev->dev_private; > - struct timespec before, now, wait_time={1,0}; > - unsigned long timeout_jiffies; > - long end; > - bool wait_forever = true; > + struct timespec before, now; > + DEFINE_WAIT(wait); > + long timeout_jiffies; > int ret; > > WARN(dev_priv->pc8.irqs_disabled, "IRQs disabled\n"); > @@ -1004,51 +1014,71 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno, > if (i915_seqno_passed(ring->get_seqno(ring, true), seqno)) > return 0; > > - trace_i915_gem_request_wait_begin(ring, seqno); > - > - if (timeout != NULL) { > - wait_time = *timeout; > - wait_forever = false; > - } > - > - timeout_jiffies = timespec_to_jiffies_timeout(&wait_time); > + timeout_jiffies = timeout ? timespec_to_jiffies_timeout(timeout) : 1; > > - if (WARN_ON(!ring->irq_get(ring))) > + if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)) && > + WARN_ON(!ring->irq_get(ring))) > return -ENODEV; > > - /* Record current time in case interrupted by signal, or wedged * */ > + /* Record current time in case interrupted by signal, or wedged */ > + trace_i915_gem_request_wait_begin(ring, seqno); > getrawmonotonic(&before); > + for (;;) { > + struct timer_list timer; > + unsigned long expire; > > -#define EXIT_COND \ > - (i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \ > - i915_reset_in_progress(&dev_priv->gpu_error) || \ > - reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) > - do { > - if (interruptible) > - end = wait_event_interruptible_timeout(ring->irq_queue, > - EXIT_COND, > - timeout_jiffies); > - else > - end = wait_event_timeout(ring->irq_queue, EXIT_COND, > - timeout_jiffies); > + prepare_to_wait(&ring->irq_queue, &wait, > + interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); > > /* We need to check whether any gpu reset happened in between > * the caller grabbing the seqno and now ... */ > - if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) > - end = -EAGAIN; > + if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) { > + /* ... but upgrade the -EGAIN to an -EIO if the gpu Should be -EAGAIN. Otherwise looks ok to me: Reviewed-by: Imre Deak <imre.deak@intel.com> > + * is truely gone. */ > + ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible); > + if (ret == 0) > + ret = -EAGAIN; > + break; > + } > > - /* ... but upgrade the -EGAIN to an -EIO if the gpu is truely > - * gone. */ > - ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible); > - if (ret) > - end = ret; > - } while (end == 0 && wait_forever); > + if (i915_seqno_passed(ring->get_seqno(ring, false), seqno)) { > + ret = 0; > + break; > + } > + > + if (interruptible && signal_pending(current)) { > + ret = -ERESTARTSYS; > + break; > + } > + > + if (timeout_jiffies <= 0) { > + ret = -ETIME; > + break; > + } > > + timer.function = NULL; > + if (timeout || missed_irq(dev_priv, ring)) { > + setup_timer_on_stack(&timer, fake_irq, (unsigned long)current); > + expire = jiffies + (missed_irq(dev_priv, ring) ? 1: timeout_jiffies); > + mod_timer(&timer, expire); > + } > + > + schedule(); > + > + if (timeout) > + timeout_jiffies = expire - jiffies; > + > + if (timer.function) { > + del_singleshot_timer_sync(&timer); > + destroy_timer_on_stack(&timer); > + } > + } > getrawmonotonic(&now); > + trace_i915_gem_request_wait_end(ring, seqno); > > ring->irq_put(ring); > - trace_i915_gem_request_wait_end(ring, seqno); > -#undef EXIT_COND > + > + finish_wait(&ring->irq_queue, &wait); > > if (timeout) { > struct timespec sleep_time = timespec_sub(now, before); > @@ -1057,17 +1087,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno, > set_normalized_timespec(timeout, 0, 0); > } > > - switch (end) { > - case -EIO: > - case -EAGAIN: /* Wedged */ > - case -ERESTARTSYS: /* Signal */ > - return (int)end; > - case 0: /* Timeout */ > - return -ETIME; > - default: /* Completed */ > - WARN_ON(end < 0); /* We're not aware of other errors */ > - return 0; > - } > + return ret; > } > > /** > diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c > index 558e568..62368f4 100644 > --- a/drivers/gpu/drm/i915/i915_gpu_error.c > +++ b/drivers/gpu/drm/i915/i915_gpu_error.c > @@ -288,6 +288,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, > err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); > err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); > err_printf(m, "CCID: 0x%08x\n", error->ccid); > + err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings); > > for (i = 0; i < dev_priv->num_fence_regs; i++) > err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c > index a03b445..7ebf105 100644 > --- a/drivers/gpu/drm/i915/i915_irq.c > +++ b/drivers/gpu/drm/i915/i915_irq.c > @@ -1972,10 +1972,13 @@ static void i915_hangcheck_elapsed(unsigned long data) > if (ring_idle(ring, seqno)) { > if (waitqueue_active(&ring->irq_queue)) { > /* Issue a wake-up to catch stuck h/w. */ > - DRM_ERROR("Hangcheck timer elapsed... %s idle\n", > - ring->name); > - wake_up_all(&ring->irq_queue); > - ring->hangcheck.score += HUNG; > + if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) { > + DRM_ERROR("Hangcheck timer elapsed... %s idle\n", > + ring->name); > + wake_up_all(&ring->irq_queue); > + } > + /* Safeguard against driver failure */ > + ring->hangcheck.score += BUSY; > } else > busy = false; > } else {
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 55ab924..e823169 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -1876,6 +1876,72 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_ring_stop_fops, i915_ring_stop_get, i915_ring_stop_set, "0x%08llx\n"); +static int +i915_ring_missed_irq_get(void *data, u64 *val) +{ + struct drm_device *dev = data; + drm_i915_private_t *dev_priv = dev->dev_private; + + *val = dev_priv->gpu_error.missed_irq_rings; + return 0; +} + +static int +i915_ring_missed_irq_set(void *data, u64 val) +{ + struct drm_device *dev = data; + struct drm_i915_private *dev_priv = dev->dev_private; + int ret; + + /* Lock against concurrent debugfs callers */ + ret = mutex_lock_interruptible(&dev->struct_mutex); + if (ret) + return ret; + dev_priv->gpu_error.missed_irq_rings = val; + mutex_unlock(&dev->struct_mutex); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops, + i915_ring_missed_irq_get, i915_ring_missed_irq_set, + "0x%08llx\n"); + +static int +i915_ring_test_irq_get(void *data, u64 *val) +{ + struct drm_device *dev = data; + drm_i915_private_t *dev_priv = dev->dev_private; + + *val = dev_priv->gpu_error.test_irq_rings; + + return 0; +} + +static int +i915_ring_test_irq_set(void *data, u64 val) +{ + struct drm_device *dev = data; + struct drm_i915_private *dev_priv = dev->dev_private; + int ret; + + DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val); + + /* Lock against concurrent debugfs callers */ + ret = mutex_lock_interruptible(&dev->struct_mutex); + if (ret) + return ret; + + dev_priv->gpu_error.test_irq_rings = val; + mutex_unlock(&dev->struct_mutex); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops, + i915_ring_test_irq_get, i915_ring_test_irq_set, + "0x%08llx\n"); + #define DROP_UNBOUND 0x1 #define DROP_BOUND 0x2 #define DROP_RETIRE 0x4 @@ -2269,6 +2335,8 @@ static struct i915_debugfs_files { {"i915_min_freq", &i915_min_freq_fops}, {"i915_cache_sharing", &i915_cache_sharing_fops}, {"i915_ring_stop", &i915_ring_stop_fops}, + {"i915_ring_missed_irq", &i915_ring_missed_irq_fops}, + {"i915_ring_test_irq", &i915_ring_test_irq_fops}, {"i915_gem_drop_caches", &i915_drop_caches_fops}, {"i915_error_state", &i915_error_state_fops}, {"i915_next_seqno", &i915_next_seqno_fops}, diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 84b95b1..ac6db6e 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -993,6 +993,8 @@ struct i915_gpu_error { unsigned long last_reset; + unsigned long missed_irq_rings; + /** * State variable and reset counter controlling the reset flow * @@ -1031,6 +1033,9 @@ struct i915_gpu_error { /* For gpu hang simulation. */ unsigned int stop_rings; + + /* For missed irq/seqno simulation. */ + unsigned int test_irq_rings; }; enum modeset_restore { diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index ccfebf5..4ed782f 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -970,6 +970,17 @@ i915_gem_check_olr(struct intel_ring_buffer *ring, u32 seqno) return ret; } +static void fake_irq(unsigned long data) +{ + wake_up_process((struct task_struct *)data); +} + +static bool missed_irq(struct drm_i915_private *dev_priv, + struct intel_ring_buffer *ring) +{ + return test_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings); +} + /** * __wait_seqno - wait until execution of seqno has finished * @ring: the ring expected to report seqno @@ -993,10 +1004,9 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno, bool interruptible, struct timespec *timeout) { drm_i915_private_t *dev_priv = ring->dev->dev_private; - struct timespec before, now, wait_time={1,0}; - unsigned long timeout_jiffies; - long end; - bool wait_forever = true; + struct timespec before, now; + DEFINE_WAIT(wait); + long timeout_jiffies; int ret; WARN(dev_priv->pc8.irqs_disabled, "IRQs disabled\n"); @@ -1004,51 +1014,71 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno, if (i915_seqno_passed(ring->get_seqno(ring, true), seqno)) return 0; - trace_i915_gem_request_wait_begin(ring, seqno); - - if (timeout != NULL) { - wait_time = *timeout; - wait_forever = false; - } - - timeout_jiffies = timespec_to_jiffies_timeout(&wait_time); + timeout_jiffies = timeout ? timespec_to_jiffies_timeout(timeout) : 1; - if (WARN_ON(!ring->irq_get(ring))) + if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)) && + WARN_ON(!ring->irq_get(ring))) return -ENODEV; - /* Record current time in case interrupted by signal, or wedged * */ + /* Record current time in case interrupted by signal, or wedged */ + trace_i915_gem_request_wait_begin(ring, seqno); getrawmonotonic(&before); + for (;;) { + struct timer_list timer; + unsigned long expire; -#define EXIT_COND \ - (i915_seqno_passed(ring->get_seqno(ring, false), seqno) || \ - i915_reset_in_progress(&dev_priv->gpu_error) || \ - reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) - do { - if (interruptible) - end = wait_event_interruptible_timeout(ring->irq_queue, - EXIT_COND, - timeout_jiffies); - else - end = wait_event_timeout(ring->irq_queue, EXIT_COND, - timeout_jiffies); + prepare_to_wait(&ring->irq_queue, &wait, + interruptible ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE); /* We need to check whether any gpu reset happened in between * the caller grabbing the seqno and now ... */ - if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) - end = -EAGAIN; + if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) { + /* ... but upgrade the -EGAIN to an -EIO if the gpu + * is truely gone. */ + ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible); + if (ret == 0) + ret = -EAGAIN; + break; + } - /* ... but upgrade the -EGAIN to an -EIO if the gpu is truely - * gone. */ - ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible); - if (ret) - end = ret; - } while (end == 0 && wait_forever); + if (i915_seqno_passed(ring->get_seqno(ring, false), seqno)) { + ret = 0; + break; + } + + if (interruptible && signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + if (timeout_jiffies <= 0) { + ret = -ETIME; + break; + } + timer.function = NULL; + if (timeout || missed_irq(dev_priv, ring)) { + setup_timer_on_stack(&timer, fake_irq, (unsigned long)current); + expire = jiffies + (missed_irq(dev_priv, ring) ? 1: timeout_jiffies); + mod_timer(&timer, expire); + } + + schedule(); + + if (timeout) + timeout_jiffies = expire - jiffies; + + if (timer.function) { + del_singleshot_timer_sync(&timer); + destroy_timer_on_stack(&timer); + } + } getrawmonotonic(&now); + trace_i915_gem_request_wait_end(ring, seqno); ring->irq_put(ring); - trace_i915_gem_request_wait_end(ring, seqno); -#undef EXIT_COND + + finish_wait(&ring->irq_queue, &wait); if (timeout) { struct timespec sleep_time = timespec_sub(now, before); @@ -1057,17 +1087,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno, set_normalized_timespec(timeout, 0, 0); } - switch (end) { - case -EIO: - case -EAGAIN: /* Wedged */ - case -ERESTARTSYS: /* Signal */ - return (int)end; - case 0: /* Timeout */ - return -ETIME; - default: /* Completed */ - WARN_ON(end < 0); /* We're not aware of other errors */ - return 0; - } + return ret; } /** diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 558e568..62368f4 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -288,6 +288,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m, err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake); err_printf(m, "DERRMR: 0x%08x\n", error->derrmr); err_printf(m, "CCID: 0x%08x\n", error->ccid); + err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings); for (i = 0; i < dev_priv->num_fence_regs; i++) err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index a03b445..7ebf105 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -1972,10 +1972,13 @@ static void i915_hangcheck_elapsed(unsigned long data) if (ring_idle(ring, seqno)) { if (waitqueue_active(&ring->irq_queue)) { /* Issue a wake-up to catch stuck h/w. */ - DRM_ERROR("Hangcheck timer elapsed... %s idle\n", - ring->name); - wake_up_all(&ring->irq_queue); - ring->hangcheck.score += HUNG; + if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) { + DRM_ERROR("Hangcheck timer elapsed... %s idle\n", + ring->name); + wake_up_all(&ring->irq_queue); + } + /* Safeguard against driver failure */ + ring->hangcheck.score += BUSY; } else busy = false; } else {
When we switched to always using a timeout in conjunction with wait_seqno, we lost the ability to detect missed interrupts. Since, we have had issues with interrupts on a number of generations, and they are required to be delivered in a timely fashion for a smooth UX, it is important that we do log errors found in the wild and prevent the display stalling for upwards of 1s every time the seqno interrupt is missed. Rather than continue to fix up the timeouts to work around the interface impedence in wait_event_*(), open code the combination of wait_event[_interruptible][_timeout], and use the exposed timer to poll for seqno should we detect a lost interrupt. v2: In order to satisfy the debug requirement of logging missed interrupts with the real world requirments of making machines work even if interrupts are hosed, we revert to polling after detecting a missed interrupt. v3: Throw in a debugfs interface to simulate broken hw not reporting interrupts. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> --- drivers/gpu/drm/i915/i915_debugfs.c | 68 ++++++++++++++++++++ drivers/gpu/drm/i915/i915_drv.h | 5 ++ drivers/gpu/drm/i915/i915_gem.c | 114 ++++++++++++++++++++-------------- drivers/gpu/drm/i915/i915_gpu_error.c | 1 + drivers/gpu/drm/i915/i915_irq.c | 11 ++-- 5 files changed, 148 insertions(+), 51 deletions(-)