@@ -2604,9 +2604,13 @@ static int
i915_wedged_set(void *data, u64 val)
{
struct drm_device *dev = data;
+ drm_i915_private_t *dev_priv = dev->dev_private;
DRM_INFO("Manually setting wedged to %llu\n", val);
- i915_handle_error(dev, val);
+	if (val && !i915_reset_in_progress(&dev_priv->gpu_error))
+		i915_handle_error(dev, NULL);
return 0;
}
@@ -1466,6 +1466,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
struct intel_device_info *info;
int ret = 0, mmio_bar, mmio_size;
uint32_t aperture_size;
+ uint32_t i;
info = (struct intel_device_info *) flags;
@@ -1661,6 +1662,17 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
acpi_video_register();
}
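+	/* Initialise one hangcheck timer per ring so each engine is
+	 * monitored (and, on gen7+, reset) independently of the others. */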
+ for (i = 0; i < I915_NUM_RINGS; i++) {
+ dev_priv->hangcheck[i].count = 0;
+ dev_priv->hangcheck[i].last_acthd = 0;
+ dev_priv->hangcheck[i].ringid = i;
+ dev_priv->hangcheck[i].dev = dev;
+
+ setup_timer(&dev_priv->hangcheck[i].timer,
+ i915_hangcheck_sample,
+ (unsigned long) &dev_priv->hangcheck[i]);
+ }
+
if (IS_GEN5(dev))
intel_gpu_ips_init(dev_priv);
@@ -1703,6 +1715,7 @@ int i915_driver_unload(struct drm_device *dev)
{
struct drm_i915_private *dev_priv = dev->dev_private;
int ret;
+ uint32_t i;
intel_gpu_ips_teardown();
@@ -1748,9 +1761,10 @@ int i915_driver_unload(struct drm_device *dev)
}
/* Free error state after interrupts are fully disabled. */
- del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
cancel_work_sync(&dev_priv->gpu_error.work);
i915_destroy_error_state(dev);
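+	/* Each ring owns its own hangcheck timer, so all of them must be
+	 * stopped individually during teardown. */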
+ for (i = 0; i < I915_NUM_RINGS; i++)
+ del_timer_sync(&dev_priv->hangcheck[i].timer);
cancel_delayed_work_sync(&dev_priv->pc8.enable_work);
@@ -113,6 +113,59 @@ MODULE_PARM_DESC(enable_hangcheck,
"WARNING: Disabling this can cause system wide hangs. "
"(default: true)");
+unsigned int i915_hangcheck_period __read_mostly = 1000;
+
+int hangcheck_period_set(const char *val, const struct kernel_param *kp)
+{
+	/* Custom set function so we can validate the range */
+ unsigned long num;
+ int ret;
+
+ ret = kstrtoul(val, 0, &num);
+
+ if (ret)
+ return ret;
+
+	/* Enforce minimum and maximum hangcheck period in ms */
+	if ((num >= MINIMUM_HANGCHECK_PERIOD) &&
+	    (num <= MAXIMUM_HANGCHECK_PERIOD)) {
+ i915_hangcheck_period = num;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static const struct kernel_param_ops hangcheck_ops = {
+ .set = hangcheck_period_set,
+ .get = param_get_uint,
+};
+
+module_param_cb(i915_hangcheck_period, &hangcheck_ops,
+ &i915_hangcheck_period, 0644);
+MODULE_PARM_DESC(i915_hangcheck_period,
+ "The hangcheck timer period in milliseconds. "
+ "The actual time to detect a hang may be 3 - 4 times "
+ "this value (default = 1000ms)");
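+/* The 0644 permissions above also expose the period for runtime tuning via
+ * /sys/module/i915/parameters/i915_hangcheck_period. */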
+
+unsigned int i915_ring_reset_min_alive_period __read_mostly;
+module_param_named(i915_ring_reset_min_alive_period,
+ i915_ring_reset_min_alive_period, int, 0644);
+MODULE_PARM_DESC(i915_ring_reset_min_alive_period,
+ "Catch excessive ring resets. Each ring maintains a timestamp of "
+ "the last time it was reset. If it hangs again within this period "
+	"then switch to full GPU reset to try and clear the hang. "
+ "default=0 seconds (disabled)");
+
+unsigned int i915_gpu_reset_min_alive_period __read_mostly;
+module_param_named(i915_gpu_reset_min_alive_period,
+ i915_gpu_reset_min_alive_period, int, 0644);
+MODULE_PARM_DESC(i915_gpu_reset_min_alive_period,
+ "Catch excessive GPU resets. If the GPU hangs again within this period "
+ "following the previous GPU reset then declare it wedged and "
+ "prevent further resets. "
+ "default=0 seconds (disabled)");
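+/* Both alive periods are compared against get_seconds() timestamps, so
+ * their effective granularity is one second. */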
+
int i915_enable_ppgtt __read_mostly = -1;
module_param_named(i915_enable_ppgtt, i915_enable_ppgtt, int, 0600);
MODULE_PARM_DESC(i915_enable_ppgtt,
@@ -726,6 +779,142 @@ int i915_resume(struct drm_device *dev)
return 0;
}
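+/* Top-level per-ring recovery: the sequence below is disable -> save ->
+ * reset -> restore -> resample -> enable, then release anything (waiters,
+ * stuck page flips) that the hang left blocked. */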
+int i915_handle_hung_ring(struct drm_device *dev, uint32_t ringid)
+{
+ /* TDR Version 1:
+ * Reset the ring that is hung
+ *
+ * WARNING: Hold dev->struct_mutex before entering
+ * this function
+ */
+ drm_i915_private_t *dev_priv = dev->dev_private;
+ struct intel_ring_buffer *ring = &dev_priv->ring[ringid];
+ struct drm_crtc *crtc;
+ struct intel_crtc *intel_crtc;
+ int ret = 0;
+ int pipe = 0;
+ struct intel_unpin_work *unpin_work;
+ uint32_t ring_flags = 0;
+ uint32_t head;
+
+ BUG_ON(!mutex_is_locked(&dev->struct_mutex));
+
+ /* Take wake lock to prevent power saving mode */
+ gen6_gt_force_wake_get(dev_priv);
+
+ /* Check if the ring has hung on a MI_DISPLAY_FLIP command.
+ * The pipe value will be stored in the HWS page if it has.
+ * At the moment this should only happen for the blitter but
+ * each ring has its own status page so this should work for
+	 * all rings. */
+ pipe = intel_read_status_page(ring, I915_GEM_PGFLIP_INDEX);
+ if (pipe) {
+		/* Clear it to avoid responding to it twice */
+ intel_write_status_page(ring, I915_GEM_PGFLIP_INDEX, 0);
+ }
+
+ /* Clear any simulated hang flags */
+ if (dev_priv->stop_rings) {
+ DRM_DEBUG_TDR("Simulated gpu hang, rst stop_rings bits %08x\n",
+ (0x1 << ringid));
+ dev_priv->stop_rings &= ~(0x1 << ringid);
+ }
+
+ DRM_DEBUG_TDR("Resetting ring %d\n", ringid);
+
+ ret = intel_ring_disable(ring);
+ if (ret != 0) {
+ DRM_ERROR("Failed to disable ring %d\n", ringid);
+ goto handle_hung_ring_error;
+ }
+
+ /* Sample the current ring head position */
+ head = I915_READ(RING_HEAD(ring->mmio_base)) & HEAD_ADDR;
+ DRM_DEBUG_TDR("head 0x%08X, last_head 0x%08X\n",
+ head, dev_priv->hangcheck[ringid].last_head);
+ if (head == dev_priv->hangcheck[ringid].last_head) {
+ /* The ring has not advanced since the last
+ * time it hung so force it to advance to the
+ * next QWORD. In most cases the ring head
+ * pointer will automatically advance to the
+ * next instruction as soon as it has read the
+ * current instruction, without waiting for it
+ * to complete. This seems to be the default
+ * behaviour, however an MBOX wait inserted
+ * directly to the VCS/BCS rings does not behave
+ * in the same way, instead the head pointer
+ * will still be pointing at the MBOX instruction
+	 * until it completes. */
+ ring_flags = FORCE_ADVANCE;
+ DRM_DEBUG_TDR("Force ring head to advance\n");
+ }
+ dev_priv->hangcheck[ringid].last_head = head;
+
+ ret = intel_ring_save(ring, ring_flags);
+ if (ret != 0) {
+ DRM_ERROR("Failed to save ring state\n");
+ goto handle_hung_ring_error;
+ }
+
+ ret = intel_ring_reset(ring);
+ if (ret != 0) {
+ DRM_ERROR("Failed to reset ring\n");
+ goto handle_hung_ring_error;
+ }
+
+ DRM_ERROR("Reset ring %d (GPU Hang)\n", ringid);
+
+ /* Clear last_acthd in hangcheck timer for this ring */
+ dev_priv->hangcheck[ringid].last_acthd = 0;
+
+ /* Clear reset to allow future hangchecks */
+ atomic_set(&dev_priv->hangcheck[ringid].reset, 0);
+
+ ret = intel_ring_restore(ring);
+ if (ret != 0) {
+ DRM_ERROR("Failed to restore ring state\n");
+ goto handle_hung_ring_error;
+ }
+
+ /* Correct driver state */
+ intel_ring_resample(ring);
+
+ ret = intel_ring_enable(ring);
+ if (ret != 0) {
+ DRM_ERROR("Failed to enable ring\n");
+ goto handle_hung_ring_error;
+ }
+
+ /* Wake up anything waiting on this rings queue */
+ wake_up_all(&ring->irq_queue);
+
+ if (pipe &&
+ ((pipe - 1) < ARRAY_SIZE(dev_priv->pipe_to_crtc_mapping))) {
+ /* The pipe value in the status page is offset by 1 */
+ pipe -= 1;
+
+ /* The ring hung on a page flip command so we
+ * must manually release the pending flip queue */
+ crtc = dev_priv->pipe_to_crtc_mapping[pipe];
+ intel_crtc = to_intel_crtc(crtc);
+ unpin_work = intel_crtc->unpin_work;
+
+ if (unpin_work && unpin_work->pending_flip_obj) {
+ intel_prepare_page_flip(dev, intel_crtc->pipe);
+ intel_finish_page_flip(dev, intel_crtc->pipe);
+ DRM_DEBUG_TDR("Released stuck page flip for pipe %d\n",
+ pipe);
+ }
+ }
+
+handle_hung_ring_error:
+
+ /* Release power lock */
+ gen6_gt_force_wake_put(dev_priv);
+
+ return ret;
+}
+
/**
* i915_reset - reset chip after a hang
* @dev: drm device to reset
@@ -759,7 +948,11 @@ int i915_reset(struct drm_device *dev)
ret = intel_gpu_reset(dev);
/* Also reset the gpu hangman. */
- if (simulated) {
+ if (!simulated && (get_seconds() - dev_priv->gpu_error.last_reset)
+ < i915_gpu_reset_min_alive_period) {
+ DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
+ ret = -ENODEV;
+ } else if (simulated) {
DRM_INFO("Simulated gpu hang, resetting stop_rings\n");
dev_priv->gpu_error.stop_rings = 0;
if (ret == -ENODEV) {
@@ -307,7 +307,7 @@ struct drm_i915_error_state {
u32 ctl[I915_NUM_RINGS];
u32 ipeir[I915_NUM_RINGS];
u32 ipehr[I915_NUM_RINGS];
- u32 instdone[I915_NUM_RINGS];
+ u32 instdone[I915_NUM_RINGS][I915_NUM_INSTDONE_REG];
u32 acthd[I915_NUM_RINGS];
u32 semaphore_mboxes[I915_NUM_RINGS][I915_NUM_RINGS - 1];
u32 semaphore_seqno[I915_NUM_RINGS][I915_NUM_RINGS - 1];
@@ -1042,6 +1042,13 @@ struct i915_gem_mm {
*/
bool interruptible;
+ /**
+ * This is set when the error_recovery function is running.
+ * It prevents command submission from occurring and makes
+ * every pending request fail
+ */
+ atomic_t wedged;
+
/** Bit 6 swizzling required for X tiling */
uint32_t bit_6_swizzle_x;
/** Bit 6 swizzling required for Y tiling */
@@ -1072,8 +1079,10 @@ struct i915_error_state_file_priv {
struct i915_gpu_error {
/* For hangcheck timer */
+#define MINIMUM_HANGCHECK_PERIOD 100 /* 100ms */
+#define MAXIMUM_HANGCHECK_PERIOD 30000 /* 30s */
#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
-#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
+#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(i915_hangcheck_period)
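+/* Evaluated at each use, so runtime changes to i915_hangcheck_period take
+ * effect the next time a hangcheck timer is rearmed. */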
/* Hang gpu twice in this window and your context gets banned */
#define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
@@ -1087,6 +1096,7 @@ struct i915_gpu_error {
unsigned long missed_irq_rings;
+ unsigned long last_reset;
/**
* State variable and reset counter controlling the reset flow
@@ -1304,6 +1314,41 @@ struct intel_pipe_crc {
wait_queue_head_t wq;
};
+struct intel_hangcheck {
+	/* The ring being monitored */
+	uint32_t ringid;
+
+	/* Parent drm_device */
+	struct drm_device *dev;
+
+	/* Timer for this ring only */
+	struct timer_list timer;
+
+	/* Count of consecutive hang detections
+	 * (reset flag set once count exceeds threshold) */
+#define HANGCHECK_THRESHOLD 1
+#define MBOX_HANGCHECK_THRESHOLD 4
+	int count;
+
+	/* Last sampled active head and ring head */
+	uint32_t last_acthd;
+	uint32_t last_hd;
+
+	/* Last recorded ring head index from the previous ring hang.
+	 * This is only ever a ring index, whereas the active
+	 * head may be a graphics address in a ring buffer */
+	uint32_t last_head;
+
+	/* Last recorded instdone */
+	uint32_t prev_instdone[I915_NUM_INSTDONE_REG];
+
+	/* Flag to indicate if a ring reset is required */
+	atomic_t reset;
+
+	/* Keep a record of the last time the ring was reset */
+	unsigned long last_reset;
+};
+
typedef struct drm_i915_private {
struct drm_device *dev;
struct kmem_cache *slab;
@@ -1372,6 +1417,17 @@ typedef struct drm_i915_private {
int num_plane;
+ /* For hangcheck timer */
+ struct intel_hangcheck hangcheck[I915_NUM_RINGS];
+
+ unsigned int stop_rings;
+
+ unsigned long cfb_size;
+ unsigned int cfb_fb;
+ enum plane cfb_plane;
+ int cfb_y;
+ struct intel_fbc_work *fbc_work;
+
struct i915_fbc fbc;
struct intel_opregion opregion;
struct intel_vbt_data vbt;
@@ -1397,6 +1453,11 @@ typedef struct drm_i915_private {
unsigned int fsb_freq, mem_freq, is_ddr3;
+	/* TDR recovery state: work item, full-reset flag and statistics */
+	struct work_struct error_work;
+	atomic_t full_reset;
+	uint32_t total_resets;
+
+	/* Woken once error handling completes */
+	wait_queue_head_t error_queue;
/**
* wq - Driver workqueue for GEM.
*
@@ -1864,6 +1925,9 @@ extern int i915_vbt_sdvo_panel_type __read_mostly;
extern int i915_enable_rc6 __read_mostly;
extern int i915_enable_fbc __read_mostly;
extern bool i915_enable_hangcheck __read_mostly;
+extern unsigned int i915_hangcheck_period __read_mostly;
+extern unsigned int i915_ring_reset_min_alive_period __read_mostly;
+extern unsigned int i915_gpu_reset_min_alive_period __read_mostly;
extern int i915_enable_ppgtt __read_mostly;
extern int i915_enable_psr __read_mostly;
extern unsigned int i915_preliminary_hw_support __read_mostly;
@@ -1899,6 +1963,7 @@ extern int i915_emit_box(struct drm_device *dev,
struct drm_clip_rect *box,
int DR1, int DR4);
extern int intel_gpu_reset(struct drm_device *dev);
+extern int i915_handle_hung_ring(struct drm_device *dev, uint32_t ringid);
extern int i915_reset(struct drm_device *dev);
extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv);
@@ -1909,7 +1974,10 @@ extern void intel_console_resume(struct work_struct *work);
/* i915_irq.c */
void i915_queue_hangcheck(struct drm_device *dev);
-void i915_handle_error(struct drm_device *dev, bool wedged);
+void i915_hangcheck_sample(unsigned long data);
+void i915_handle_error(struct drm_device *dev, struct intel_hangcheck *hc);
+
extern void intel_irq_init(struct drm_device *dev);
extern void intel_pm_init(struct drm_device *dev);
@@ -2067,7 +2135,8 @@ i915_gem_object_unpin_fence(struct drm_i915_gem_object *obj)
bool i915_gem_retire_requests(struct drm_device *dev);
void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
- bool interruptible);
+ bool interruptible,
+ struct intel_ring_buffer *ring);
static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
{
return unlikely(atomic_read(&error->reset_counter)
@@ -2312,7 +2381,8 @@ void i915_error_state_get(struct drm_device *dev,
void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
void i915_destroy_error_state(struct drm_device *dev);
-void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone);
+void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone,
+ struct intel_ring_buffer *ring);
const char *i915_cache_level_str(int type);
/* i915_suspend.c */
@@ -142,14 +142,38 @@ i915_gem_wait_for_error(struct i915_gpu_error *error)
return 0;
}
-int i915_mutex_lock_interruptible(struct drm_device *dev)
+int i915_gem_wedged(struct drm_device *dev, bool interruptible)
{
+	/* Warning: this function can only give an indication of whether
+	 * the GPU is wedged at a particular instant in time.
+	 * The hangcheck process is asynchronous, so a hang
+	 * may be detected just after the flags have been sampled. */
+ unsigned i;
struct drm_i915_private *dev_priv = dev->dev_private;
+ int err = !interruptible ? -EIO : -EAGAIN;
+
+ /* Full reset requested */
+ if (i915_reset_in_progress(&dev_priv->gpu_error))
+ return err;
+
+ /* Check for an individual ring which has hung */
+ for (i = 0; i < I915_NUM_RINGS; i++) {
+ if (atomic_read(&dev_priv->hangcheck[i].reset))
+ return err;
+ }
+
+ return 0;
+}
+
+int i915_mutex_lock_interruptible(struct drm_device *dev)
+{
int ret;
- ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
- if (ret)
- return ret;
+	/* There should be no need to call i915_gem_wait_for_error here:
+	 * the error recovery handler takes dev->struct_mutex, so if
+	 * recovery is active we will wait on the
+	 * mutex_lock_interruptible call below instead.
+	 */
ret = mutex_lock_interruptible(&dev->struct_mutex);
if (ret)
@@ -935,9 +959,15 @@ unlock:
int
i915_gem_check_wedge(struct i915_gpu_error *error,
- bool interruptible)
+ bool interruptible,
+ struct intel_ring_buffer *ring)
{
- if (i915_reset_in_progress(error)) {
+ drm_i915_private_t *dev_priv;
+
+ dev_priv = container_of(error, drm_i915_private_t, gpu_error);
+
+ if ((ring && atomic_read(&dev_priv->hangcheck[ring->id].reset)) ||
+ i915_reset_in_progress(error)) {
/* Non-interruptible callers can't handle -EAGAIN, hence return
* -EIO unconditionally for these. */
if (!interruptible)
@@ -1054,7 +1084,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
/* ... but upgrade the -EAGAIN to an -EIO if the gpu
	 * is truly gone. */
- ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
+ ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible, ring);
if (ret == 0)
ret = -EAGAIN;
break;
@@ -1124,7 +1154,7 @@ i915_wait_seqno(struct intel_ring_buffer *ring, uint32_t seqno)
BUG_ON(!mutex_is_locked(&dev->struct_mutex));
BUG_ON(seqno == 0);
- ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
+ ret = i915_gem_wedged(dev, interruptible);
if (ret)
return ret;
@@ -1201,7 +1231,7 @@ i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
if (seqno == 0)
return 0;
- ret = i915_gem_check_wedge(&dev_priv->gpu_error, true);
+ ret = i915_gem_check_wedge(&dev_priv->gpu_error, true, ring);
if (ret)
return ret;
@@ -1213,8 +1243,9 @@ i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
mutex_unlock(&dev->struct_mutex);
ret = __wait_seqno(ring, seqno, reset_counter, true, NULL, file->driver_priv);
mutex_lock(&dev->struct_mutex);
 	if (ret)
 		return ret;
return i915_gem_object_wait_rendering__tail(obj, ring);
}
@@ -2180,8 +2211,6 @@ int __i915_add_request(struct intel_ring_buffer *ring,
ring->preallocated_lazy_request = NULL;
if (!dev_priv->ums.mm_suspended) {
- i915_queue_hangcheck(ring->dev);
-
if (was_empty) {
cancel_delayed_work_sync(&dev_priv->mm.idle_work);
queue_delayed_work(dev_priv->wq,
@@ -3810,7 +3839,7 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
if (ret)
return ret;
- ret = i915_gem_check_wedge(&dev_priv->gpu_error, false);
+ ret = i915_gem_check_wedge(&dev_priv->gpu_error, false, NULL);
if (ret)
return ret;
@@ -3828,9 +3857,16 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
if (seqno == 0)
return 0;
- ret = __wait_seqno(ring, seqno, reset_counter, true, NULL, NULL);
- if (ret == 0)
- queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work, 0);
+ if (ring) {
+		if (i915_gem_wedged(dev, true) != 0)
+ return -EIO;
+
+ ret = __wait_seqno(ring, seqno, reset_counter, true, NULL,
+ file->driver_priv);
+ if (ret == 0)
+ queue_delayed_work(dev_priv->wq,
+ &dev_priv->mm.retire_work, 0);
+ }
return ret;
}
@@ -4275,6 +4311,7 @@ i915_gem_suspend(struct drm_device *dev)
{
drm_i915_private_t *dev_priv = dev->dev_private;
int ret = 0;
+ int i;
mutex_lock(&dev->struct_mutex);
if (dev_priv->ums.mm_suspended)
@@ -4301,7 +4338,8 @@ i915_gem_suspend(struct drm_device *dev)
DRIVER_MODESET);
mutex_unlock(&dev->struct_mutex);
- del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+ for (i = 0; i < I915_NUM_RINGS; i++)
+ del_timer_sync(&dev_priv->hangcheck[i].timer);
cancel_delayed_work_sync(&dev_priv->mm.retire_work);
cancel_delayed_work_sync(&dev_priv->mm.idle_work);
@@ -4530,6 +4568,7 @@ i915_gem_entervt_ioctl(struct drm_device *dev, void *data,
{
struct drm_i915_private *dev_priv = dev->dev_private;
int ret;
+ int i;
if (drm_core_check_feature(dev, DRIVER_MODESET))
return 0;
@@ -4537,6 +4576,10 @@ i915_gem_entervt_ioctl(struct drm_device *dev, void *data,
if (i915_reset_in_progress(&dev_priv->gpu_error)) {
DRM_ERROR("Reenabling wedged hardware, good luck\n");
atomic_set(&dev_priv->gpu_error.reset_counter, 0);
+ for (i = 0; i < I915_NUM_RINGS; i++) {
+ /* Clear the reset flag */
+ atomic_set(&dev_priv->hangcheck[i].reset, 0);
+ }
}
mutex_lock(&dev->struct_mutex);
@@ -723,7 +723,6 @@ static void i915_record_ring_state(struct drm_device *dev,
error->faddr[ring->id] = I915_READ(RING_DMA_FADD(ring->mmio_base));
error->ipeir[ring->id] = I915_READ(RING_IPEIR(ring->mmio_base));
error->ipehr[ring->id] = I915_READ(RING_IPEHR(ring->mmio_base));
- error->instdone[ring->id] = I915_READ(RING_INSTDONE(ring->mmio_base));
error->instps[ring->id] = I915_READ(RING_INSTPS(ring->mmio_base));
if (ring->id == RCS)
error->bbaddr = I915_READ64(BB_ADDR);
@@ -732,9 +731,10 @@ static void i915_record_ring_state(struct drm_device *dev,
error->faddr[ring->id] = I915_READ(DMA_FADD_I8XX);
error->ipeir[ring->id] = I915_READ(IPEIR);
error->ipehr[ring->id] = I915_READ(IPEHR);
- error->instdone[ring->id] = I915_READ(INSTDONE);
}
+ i915_get_extra_instdone(dev, error->instdone[ring->id],
+ &dev_priv->ring[ring->id]);
error->waiting[ring->id] = waitqueue_active(&ring->irq_queue);
error->instpm[ring->id] = I915_READ(RING_INSTPM(ring->mmio_base));
error->seqno[ring->id] = ring->get_seqno(ring, false);
@@ -899,6 +899,7 @@ void i915_capture_error_state(struct drm_device *dev)
struct drm_i915_error_state *error;
unsigned long flags;
int pipe;
+ int i;
spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
error = dev_priv->gpu_error.first_error;
@@ -957,7 +958,9 @@ void i915_capture_error_state(struct drm_device *dev)
if (INTEL_INFO(dev)->gen == 7)
error->err_int = I915_READ(GEN7_ERR_INT);
- i915_get_extra_instdone(dev, error->extra_instdone);
+ for (i = 0; i < I915_NUM_RINGS; i++)
+ i915_get_extra_instdone(dev, error->instdone[i],
+ &dev_priv->ring[i]);
i915_gem_capture_buffers(dev_priv, error);
i915_gem_record_fences(dev, error);
@@ -1026,7 +1029,9 @@ const char *i915_cache_level_str(int type)
}
/* NB: please notice the memset */
-void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone)
+void i915_get_extra_instdone(struct drm_device *dev,
+ uint32_t *instdone,
+ struct intel_ring_buffer *ring)
{
struct drm_i915_private *dev_priv = dev->dev_private;
memset(instdone, 0, sizeof(*instdone) * I915_NUM_INSTDONE_REG);
@@ -1046,10 +1051,14 @@ void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone)
WARN_ONCE(1, "Unsupported platform\n");
case 7:
case 8:
- instdone[0] = I915_READ(GEN7_INSTDONE_1);
- instdone[1] = I915_READ(GEN7_SC_INSTDONE);
- instdone[2] = I915_READ(GEN7_SAMPLER_INSTDONE);
- instdone[3] = I915_READ(GEN7_ROW_INSTDONE);
+		instdone[0] = I915_READ(RING_INSTDONE(ring->mmio_base));
+
+ if (ring->id == RCS) {
+ instdone[1] = I915_READ(GEN7_SC_INSTDONE);
+ instdone[2] = I915_READ(GEN7_SAMPLER_INSTDONE);
+ instdone[3] = I915_READ(GEN7_ROW_INSTDONE);
+ }
break;
}
}
@@ -957,7 +957,6 @@ static void notify_ring(struct drm_device *dev,
trace_i915_gem_request_complete(ring);
wake_up_all(&ring->irq_queue);
- i915_queue_hangcheck(dev);
}
static void gen6_pm_rps_work(struct work_struct *work)
@@ -1155,12 +1154,14 @@ static void snb_gt_irq_handler(struct drm_device *dev,
if (gt_iir & GT_BLT_USER_INTERRUPT)
notify_ring(dev, &dev_priv->ring[BCS]);
- if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
- GT_BSD_CS_ERROR_INTERRUPT |
- GT_RENDER_CS_MASTER_ERROR_INTERRUPT)) {
- DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir);
- i915_handle_error(dev, false);
- }
+ if (gt_iir & GT_RENDER_CS_MASTER_ERROR_INTERRUPT)
+ i915_handle_error(dev, &dev_priv->hangcheck[RCS]);
+
+ if (gt_iir & GT_BSD_CS_ERROR_INTERRUPT)
+ i915_handle_error(dev, &dev_priv->hangcheck[VCS]);
+
+ if (gt_iir & GT_BLT_CS_ERROR_INTERRUPT)
+ i915_handle_error(dev, &dev_priv->hangcheck[BCS]);
if (gt_iir & GT_PARITY_ERROR(dev))
ivybridge_parity_error_irq_handler(dev, gt_iir);
@@ -1403,7 +1404,7 @@ static void gen6_rps_irq_handler(struct drm_i915_private *dev_priv, u32 pm_iir)
if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT) {
DRM_ERROR("VEBOX CS error interrupt 0x%08x\n", pm_iir);
- i915_handle_error(dev_priv->dev, false);
+ i915_handle_error(dev_priv->dev, &dev_priv->hangcheck[VECS]);
}
}
}
@@ -1946,9 +1947,41 @@ static void i915_error_work_func(struct work_struct *work)
char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
int ret;
+ int i;
+ int pipe;
+ struct drm_crtc *crtc;
+ struct intel_crtc *intel_crtc;
+ struct intel_unpin_work *unpin_work;
+
+	/* Set this flag to force any waiting processes to release
+	 * dev->struct_mutex if they are holding it */
+ atomic_set(&dev_priv->mm.wedged, 1);
+
+ mutex_lock(&dev->struct_mutex);
kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, error_event);
+	/* Skip individual ring reset requests if a full reset was requested */
+ if (!i915_reset_in_progress(error)) {
+ /* Check each ring for a pending reset condition */
+ for (i = 0; i < I915_NUM_RINGS; i++) {
+ if (atomic_read(&dev_priv->hangcheck[i].reset)) {
+ DRM_DEBUG_TDR("resetting ring %d\n", i);
+
+ if (i915_handle_hung_ring(dev, i) != 0) {
+					DRM_ERROR("ring %d reset failed\n", i);
+ atomic_set_mask(
+ I915_RESET_IN_PROGRESS_FLAG,
+ &dev_priv->gpu_error.reset_counter);
+ break;
+ }
+ }
+ }
+ }
+	/* Release dev->struct_mutex before the full GPU reset;
+	 * i915_reset() takes it itself when it needs it */
+ mutex_unlock(&dev->struct_mutex);
+
/*
* Note that there's only one work item which does gpu resets, so we
* need not worry about concurrent gpu resets potentially incrementing
@@ -1988,8 +2021,35 @@ static void i915_error_work_func(struct work_struct *work)
smp_mb__before_atomic_inc();
atomic_inc(&dev_priv->gpu_error.reset_counter);
- kobject_uevent_env(&dev->primary->kdev->kobj,
- KOBJ_CHANGE, reset_done_event);
+ for (i = 0; i < I915_NUM_RINGS; i++) {
+ /* Clear individual ring reset flags*/
+ atomic_set(&dev_priv->hangcheck[i].reset, 0);
+ }
+
+ mutex_lock(&dev->mode_config.mutex);
+	/* Release any pending page flip.
+	 * This is particularly important if stop_rings was set.
+	 *
+	 * WARNING: This code could retire a page flip that
+	 * arrives just after reset. In that case we will get
+	 * an extra page flip interrupt that is not expected.
+	 * If another page flip request arrives before the interrupt
+	 * then the unpin work could happen sooner than expected.
+	 */
+	for_each_pipe(pipe) {
+		crtc = dev_priv->pipe_to_crtc_mapping[pipe];
+		intel_crtc = to_intel_crtc(crtc);
+		unpin_work = intel_crtc->unpin_work;
+
+		if (unpin_work && unpin_work->pending_flip_obj) {
+			intel_prepare_page_flip(dev, pipe);
+			intel_finish_page_flip(dev, pipe);
+			DRM_DEBUG_TDR("Cleared stuck page flip for pipe %d\n",
+				      pipe);
+		}
+	}
+ mutex_unlock(&dev->mode_config.mutex);
} else {
atomic_set(&error->reset_counter, I915_WEDGED);
}
@@ -2000,21 +2060,36 @@ static void i915_error_work_func(struct work_struct *work)
*/
i915_error_wake_up(dev_priv, true);
}
+
+	/* Clear wedged condition and wake up waiters */
+ atomic_set(&dev_priv->mm.wedged, 0);
+
+ kobject_uevent_env(&dev->primary->kdev->kobj,
+ KOBJ_CHANGE, reset_done_event);
+
+	/* Wake anyone waiting on error handling completion */
+ wake_up_all(&dev_priv->error_queue);
+
+ DRM_DEBUG_TDR("End recovery work\n\n");
}
static void i915_report_and_clear_eir(struct drm_device *dev)
{
struct drm_i915_private *dev_priv = dev->dev_private;
uint32_t instdone[I915_NUM_INSTDONE_REG];
- u32 eir = I915_READ(EIR);
+ u32 eir;
int pipe, i;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
+ eir = I915_READ(EIR);
if (!eir)
- return;
+ goto i915_report_and_clear_eir_exit;
pr_err("render error detected, EIR: 0x%08x\n", eir);
- i915_get_extra_instdone(dev, instdone);
+ i915_get_extra_instdone(dev, instdone, &dev_priv->ring[RCS]);
if (IS_G4X(dev)) {
if (eir & (GM45_ERROR_MEM_PRIV | GM45_ERROR_CP_PRIV)) {
@@ -2092,6 +2167,9 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
I915_WRITE(EMR, I915_READ(EMR) | eir);
I915_WRITE(IIR, I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT);
}
+
+i915_report_and_clear_eir_exit:
+ spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
}
/**
@@ -2104,39 +2182,74 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
* so userspace knows something bad happened (should trigger collection
* of a ring dump etc.).
*/
-void i915_handle_error(struct drm_device *dev, bool wedged)
+void i915_handle_error(struct drm_device *dev, struct intel_hangcheck *hc)
{
struct drm_i915_private *dev_priv = dev->dev_private;
+ int full_reset = 0;
+ unsigned long cur_time;
+ unsigned long last_reset;
i915_capture_error_state(dev);
i915_report_and_clear_eir(dev);
- if (wedged) {
+	/* Currently we only support individual ring reset for GEN7 onwards;
+	 * older chips will revert to a full reset.
+	 * Error interrupts trigger a full reset (hc == NULL). */
+ if ((INTEL_INFO(dev)->gen >= 7) && hc) {
+ cur_time = get_seconds();
+ last_reset = hc->last_reset;
+ hc->last_reset = cur_time;
+
+ if ((cur_time - last_reset)
+ < i915_ring_reset_min_alive_period) {
+ /* This ring is hanging too frequently.
+ * Opt for full-reset instead */
+			DRM_DEBUG_TDR("Ring %d hanging too quickly...\n",
+				      hc->ringid);
+ full_reset = 1;
+ } else {
+ if (atomic_read(&hc->reset)) {
+ /* Reset already in progress for this ring */
+ return;
+ }
+
+ atomic_set(&hc->reset, 1);
+ DRM_DEBUG_TDR("Reset Ring %d\n", hc->ringid);
+ }
+	} else {
+		full_reset = 1;
+	}
+
+ if (!hc || full_reset) {
+ if (i915_reset_in_progress(&dev_priv->gpu_error))
+ return;
+
atomic_set_mask(I915_RESET_IN_PROGRESS_FLAG,
&dev_priv->gpu_error.reset_counter);
-
- /*
- * Wakeup waiting processes so that the reset work function
- * i915_error_work_func doesn't deadlock trying to grab various
- * locks. By bumping the reset counter first, the woken
- * processes will see a reset in progress and back off,
- * releasing their locks and then wait for the reset completion.
- * We must do this for _all_ gpu waiters that might hold locks
- * that the reset work needs to acquire.
- *
- * Note: The wake_up serves as the required memory barrier to
- * ensure that the waiters see the updated value of the reset
- * counter atomic_t.
- */
- i915_error_wake_up(dev_priv, false);
+ DRM_DEBUG_TDR("Full reset of GPU requested\n");
}
/*
+ * Wakeup waiting processes so that the reset work function
+ * i915_error_work_func doesn't deadlock trying to grab various
+ * locks. By bumping the reset counter first, the woken
+ * processes will see a reset in progress and back off,
+ * releasing their locks and then wait for the reset completion.
+ * We must do this for _all_ gpu waiters that might hold locks
+ * that the reset work needs to acquire.
+ *
+ * Note: The wake_up serves as the required memory barrier to
+ * ensure that the waiters see the updated value of the reset
+ * counter atomic_t.
+ */
+ i915_error_wake_up(dev_priv, false);
+
+ /*
* Our reset work can grab modeset locks (since it needs to reset the
	 * state of outstanding pageflips). Hence it must not be run on our own
* dev-priv->wq work queue for otherwise the flush_work in the pageflip
* code will deadlock.
*/
+ DRM_DEBUG_TDR("Schedule error recovery work\n");
schedule_work(&dev_priv->gpu_error.work);
}
@@ -2339,245 +2452,191 @@ ring_last_seqno(struct intel_ring_buffer *ring)
struct drm_i915_gem_request, list)->seqno;
}
-static bool
-ring_idle(struct intel_ring_buffer *ring, u32 seqno)
-{
- return (list_empty(&ring->request_list) ||
- i915_seqno_passed(seqno, ring_last_seqno(ring)));
-}
-
-static struct intel_ring_buffer *
-semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno)
-{
- struct drm_i915_private *dev_priv = ring->dev->dev_private;
- u32 cmd, ipehr, acthd, acthd_min;
-
- ipehr = I915_READ(RING_IPEHR(ring->mmio_base));
- if ((ipehr & ~(0x3 << 16)) !=
- (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER))
- return NULL;
-
- /* ACTHD is likely pointing to the dword after the actual command,
- * so scan backwards until we find the MBOX.
- */
- acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
- acthd_min = max((int)acthd - 3 * 4, 0);
- do {
- cmd = ioread32(ring->virtual_start + acthd);
- if (cmd == ipehr)
- break;
-
- acthd -= 4;
- if (acthd < acthd_min)
- return NULL;
- } while (1);
-
- *seqno = ioread32(ring->virtual_start+acthd+4)+1;
- return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
-}
-
-static int semaphore_passed(struct intel_ring_buffer *ring)
-{
- struct drm_i915_private *dev_priv = ring->dev->dev_private;
- struct intel_ring_buffer *signaller;
- u32 seqno, ctl;
-
- ring->hangcheck.deadlock = true;
-
- signaller = semaphore_waits_for(ring, &seqno);
- if (signaller == NULL || signaller->hangcheck.deadlock)
- return -1;
-
- /* cursory check for an unkickable deadlock */
- ctl = I915_READ_CTL(signaller);
- if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0)
- return -1;
-
- return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
-}
-
-static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
+void i915_queue_hangcheck(struct drm_device *dev)
{
- struct intel_ring_buffer *ring;
- int i;
+ struct drm_i915_private *dev_priv = dev->dev_private;
+ if (!i915_enable_hangcheck)
+ return;
- for_each_ring(ring, dev_priv, i)
- ring->hangcheck.deadlock = false;
+ mod_timer(&dev_priv->gpu_error.hangcheck_timer,
+ round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
}
-static enum intel_ring_hangcheck_action
-ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
+static bool kick_ring(struct intel_ring_buffer *ring)
{
struct drm_device *dev = ring->dev;
struct drm_i915_private *dev_priv = dev->dev_private;
- u32 tmp;
-
- if (ring->hangcheck.acthd != acthd)
- return HANGCHECK_ACTIVE;
-
- if (IS_GEN2(dev))
- return HANGCHECK_HUNG;
-
- /* Is the chip hanging on a WAIT_FOR_EVENT?
- * If so we can simply poke the RB_WAIT bit
- * and break the hang. This should work on
- * all but the second generation chipsets.
- */
- tmp = I915_READ_CTL(ring);
+ u32 tmp = I915_READ_CTL(ring);
if (tmp & RING_WAIT) {
DRM_ERROR("Kicking stuck wait on %s\n",
ring->name);
- i915_handle_error(dev, false);
I915_WRITE_CTL(ring, tmp);
- return HANGCHECK_KICK;
+ return true;
}
+ return false;
+}
- if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
- switch (semaphore_passed(ring)) {
- default:
- return HANGCHECK_HUNG;
- case 1:
- DRM_ERROR("Kicking stuck semaphore on %s\n",
- ring->name);
- i915_handle_error(dev, false);
- I915_WRITE_CTL(ring, tmp);
- return HANGCHECK_KICK;
- case 0:
- return HANGCHECK_WAIT;
+/* This function is called when the TDR algorithm detects that the hardware
+ * has not advanced during the last sampling period. */
+static bool i915_hangcheck_hung(struct intel_hangcheck *hc)
+{
+ struct drm_device *dev = hc->dev;
+ drm_i915_private_t *dev_priv = dev->dev_private;
+ uint32_t mbox_wait;
+ uint32_t threshold;
+ struct intel_ring_buffer *ring;
+
+ DRM_DEBUG_TDR("Ring [%d] hc->count = %d\n", hc->ringid, hc->count);
+
+ ring = &dev_priv->ring[hc->ringid];
+
+ /* Is this ring waiting on a semaphore mbox?
+ * If so, give it a bit longer as it may be waiting on another
+ * ring which has actually hung. Give the other ring chance to
+ * reset and clear the hang.
+ */
+ mbox_wait = ((I915_READ(RING_CTL(ring->mmio_base)) >> 10) & 0x1);
+ threshold = mbox_wait ? MBOX_HANGCHECK_THRESHOLD : HANGCHECK_THRESHOLD;
+
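+	/* RING_CTL bit 10 is the gen6+ semaphore-wait status bit
+	 * (RING_WAIT_SEMAPHORE). Note the post-increment below: with
+	 * HANGCHECK_THRESHOLD == 1 the reset path fires on the third
+	 * consecutive stalled sample, which is where the "3 - 4 times"
+	 * figure in the module parameter description comes from. */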
+ if (hc->count++ > threshold) {
+ bool hung = true;
+
+ DRM_DEBUG_TDR("Hangcheck timer elapsed... ring %d hung\n",
+ hc->ringid);
+ /* Reset the counter*/
+ hc->count = 0;
+
+ if (!IS_GEN2(dev)) {
+ /* If the ring is hanging on a WAIT_FOR_EVENT
+ * then simply poke the RB_WAIT bit
+ * and break the hang. This should work on
+ * all but the second generation chipsets.
+ */
+ hung &= !kick_ring(ring);
+ DRM_DEBUG_TDR("hung=%d after kick ring\n", hung);
+ }
+ if (hung) {
+ i915_handle_error(dev, hc);
}
+ return hung;
}
-
- return HANGCHECK_HUNG;
-}
+ return false;
+}
/**
- * This is called when the chip hasn't reported back with completed
- * batchbuffers in a long time. We keep track per ring seqno progress and
- * if there are no progress, hangcheck score for that ring is increased.
- * Further, acthd is inspected to see if the ring is stuck. On stuck case
- * we kick the ring. If we see no progress on three subsequent calls
- * we assume chip is wedged and try to fix it by resetting the chip.
+ * This is called from the hangcheck timer for each ring.
+ * It samples the current state of the hardware to make
+ * sure that it is progressing.
*/
-static void i915_hangcheck_elapsed(unsigned long data)
-{
- struct drm_device *dev = (struct drm_device *)data;
- drm_i915_private_t *dev_priv = dev->dev_private;
+void i915_hangcheck_sample(unsigned long data)
+{
+ struct intel_hangcheck *hc = (struct intel_hangcheck *)data;
+ struct drm_device *dev;
+ drm_i915_private_t *dev_priv;
+ uint32_t head, tail, acthd, instdone[I915_NUM_INSTDONE_REG];
+ uint32_t cur_seqno = 0;
+ uint32_t last_seqno = 0;
struct intel_ring_buffer *ring;
- int i;
- int busy_count = 0, rings_hung = 0;
- bool stuck[I915_NUM_RINGS] = { 0 };
-#define BUSY 1
-#define KICK 5
-#define HUNG 20
-#define FIRE 30
-
- if (!i915_enable_hangcheck)
- return;
-
- for_each_ring(ring, dev_priv, i) {
- u32 seqno, acthd;
- bool busy = true;
-
- semaphore_clear_deadlocks(dev_priv);
-
- seqno = ring->get_seqno(ring, false);
- acthd = intel_ring_get_active_head(ring);
-
- if (ring->hangcheck.seqno == seqno) {
- if (ring_idle(ring, seqno)) {
- ring->hangcheck.action = HANGCHECK_IDLE;
-
- if (waitqueue_active(&ring->irq_queue)) {
- /* Issue a wake-up to catch stuck h/w. */
- if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
- if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)))
- DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
- ring->name);
- else
- DRM_INFO("Fake missed irq on %s\n",
- ring->name);
- wake_up_all(&ring->irq_queue);
- }
- /* Safeguard against driver failure */
- ring->hangcheck.score += BUSY;
- } else
- busy = false;
+ bool idle;
+ int instdone_cmp;
+ int pending_work = 1;
+ int resched_timer = 1;
+ int empty;
+
+ if (!i915_enable_hangcheck || !hc)
+ return;
+
+ dev = hc->dev;
+ dev_priv = dev->dev_private;
+
+ ring = &dev_priv->ring[hc->ringid];
+
+ /* Sample the current state */
+ head = I915_READ_HEAD(ring) & HEAD_ADDR;
+ tail = I915_READ_TAIL(ring) & TAIL_ADDR;
+ acthd = intel_ring_get_active_head(ring);
+ empty = list_empty(&ring->request_list);
+
+ i915_get_extra_instdone(dev, instdone, ring);
+ instdone_cmp = (memcmp(hc->prev_instdone,
+ instdone, sizeof(instdone)) == 0) ? 1 : 0;
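+	/* INSTDONE reflects activity inside the execution units, so a
+	 * changing snapshot indicates progress even while head and acthd
+	 * stand still (e.g. during a long-running shader). */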
+
+ if (!empty) {
+		/* Examine the seqnos to see where the HW has got to
+		 * (only call ring_last_seqno when the list is non-empty) */
+ cur_seqno = ring->get_seqno(ring, false);
+ last_seqno = ring_last_seqno(ring);
+ }
+
+ if (empty || i915_seqno_passed(cur_seqno, last_seqno)) {
+ /* If the request list is empty or the HW has passed the
+ * last seqno of the last item in the request list then the
+ * HW is considered idle.
+ * The driver may not have cleaned up the request list yet */
+ pending_work = 0;
+ }
+
+ idle = ((head == tail) && (pending_work == 0));
+
+ DRM_DEBUG_TDR("[%d] HD: 0x%08x 0x%08x, ACTHD: 0x%08x 0x%08x IC: %d\n",
+ ring->id, head, hc->last_hd, acthd, hc->last_acthd,
+ instdone_cmp);
+ DRM_DEBUG_TDR("E:%d PW:%d TL:0x%08x Csq:0x%08x Lsq:0x%08x Idle: %d\n",
+ empty, pending_work, tail, cur_seqno, last_seqno, idle);
+
+ /* Check both head and active head.
+ * Neither is enough on its own - acthd can be pointing within the
+ * batch buffer so is more likely to be moving, but the same
+ * underlying buffer object could be submitted more than once.
+ * If it happens to pause at exactly the same place in the batch
+ * buffer and we sample it at that moment then we could see it as
+ * hung over 3 sample periods that do not belong to the same
+ * batch submission - this would result in a false positive.
+ * We know that the head pointer will have advanced for each
+ * batch buffer as the ring has to contain a new MI_BATCH_BUFFER_START
+ * for every do_exec call, so by combining head and active head we can
+	 * ensure that the hang detection distinguishes between batch buffers. */
+	if ((hc->last_acthd == acthd) &&
+	    (hc->last_hd == head) && instdone_cmp) {
+ /* Ring hasn't advanced in this sampling period */
+ if (idle) {
+ /* The hardware is idle */
+ if (waitqueue_active(&ring->irq_queue)) {
+ /* We expect the wait queue to drain
+ * if the hardware has remained idle
+ * for 3 consecutive samples. Wake up
+ * the queue on each sample to try and
+ * release it, but if it persists then
+ * trigger a reset */
+
+ DRM_DEBUG_TDR("Possible stuck wait (0x%08x)\n",
+ ring->last_irq_seqno);
+ wake_up_all(&ring->irq_queue);
+ i915_hangcheck_hung(hc);
} else {
- /* We always increment the hangcheck score
- * if the ring is busy and still processing
- * the same request, so that no single request
- * can run indefinitely (such as a chain of
- * batches). The only time we do not increment
- * the hangcheck score on this ring, if this
- * ring is in a legitimate wait for another
- * ring. In that case the waiting ring is a
- * victim and we want to be sure we catch the
- * right culprit. Then every time we do kick
- * the ring, add a small increment to the
- * score so that we can catch a batch that is
- * being repeatedly kicked and so responsible
- * for stalling the machine.
- */
- ring->hangcheck.action = ring_stuck(ring,
- acthd);
-
- switch (ring->hangcheck.action) {
- case HANGCHECK_IDLE:
- case HANGCHECK_WAIT:
- break;
- case HANGCHECK_ACTIVE:
- ring->hangcheck.score += BUSY;
- break;
- case HANGCHECK_KICK:
- ring->hangcheck.score += KICK;
- break;
- case HANGCHECK_HUNG:
- ring->hangcheck.score += HUNG;
- stuck[i] = true;
- break;
- }
+ /* Hardware and driver both idle */
+ hc->count = 0;
+ resched_timer = 0;
}
} else {
- ring->hangcheck.action = HANGCHECK_ACTIVE;
-
- /* Gradually reduce the count so that we catch DoS
- * attempts across multiple batches.
- */
- if (ring->hangcheck.score > 0)
- ring->hangcheck.score--;
- }
-
- ring->hangcheck.seqno = seqno;
- ring->hangcheck.acthd = acthd;
- busy_count += busy;
- }
-
- for_each_ring(ring, dev_priv, i) {
- if (ring->hangcheck.score > FIRE) {
- DRM_INFO("%s on %s\n",
- stuck[i] ? "stuck" : "no progress",
- ring->name);
- rings_hung++;
+ /* The hardware is busy but has not advanced
+ * since the last sample - possible hang*/
+ i915_hangcheck_hung(hc);
}
+ } else {
+ /* The state has changed so the hardware is active */
+ hc->count = 0;
}
- if (rings_hung)
- return i915_handle_error(dev, true);
-
- if (busy_count)
- /* Reset timer case chip hangs without another request
- * being added */
- i915_queue_hangcheck(dev);
-}
-
-void i915_queue_hangcheck(struct drm_device *dev)
-{
- struct drm_i915_private *dev_priv = dev->dev_private;
- if (!i915_enable_hangcheck)
- return;
+ /* Always update last sampled state */
+ hc->last_hd = head;
+ hc->last_acthd = acthd;
+ memcpy(hc->prev_instdone, instdone, sizeof(instdone));
- mod_timer(&dev_priv->gpu_error.hangcheck_timer,
- round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+ if (resched_timer)
+ mod_timer(&hc->timer, jiffies + DRM_I915_HANGCHECK_JIFFIES);
}
static void ibx_irq_preinstall(struct drm_device *dev)
@@ -3189,7 +3248,7 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg)
*/
spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
- i915_handle_error(dev, false);
+ i915_handle_error(dev, NULL);
for_each_pipe(pipe) {
int reg = PIPESTAT(pipe);
@@ -3371,7 +3430,7 @@ static irqreturn_t i915_irq_handler(int irq, void *arg)
*/
spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
- i915_handle_error(dev, false);
+ i915_handle_error(dev, NULL);
for_each_pipe(pipe) {
int reg = PIPESTAT(pipe);
@@ -3616,7 +3675,7 @@ static irqreturn_t i965_irq_handler(int irq, void *arg)
*/
spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
- i915_handle_error(dev, false);
+ i915_handle_error(dev, NULL);
for_each_pipe(pipe) {
int reg = PIPESTAT(pipe);
@@ -3775,10 +3834,8 @@ void intel_irq_init(struct drm_device *dev)
INIT_WORK(&dev_priv->gpu_error.work, i915_error_work_func);
INIT_WORK(&dev_priv->rps.work, gen6_pm_rps_work);
INIT_WORK(&dev_priv->l3_parity.error_work, ivybridge_parity_work);
+ init_waitqueue_head(&dev_priv->error_queue);
- setup_timer(&dev_priv->gpu_error.hangcheck_timer,
- i915_hangcheck_elapsed,
- (unsigned long) dev);
setup_timer(&dev_priv->hotplug_reenable_timer, i915_reenable_hotplug_timer_func,
(unsigned long) dev_priv);
@@ -8484,7 +8484,7 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
goto err_unpin;
}
- len = 4;
+ len = 12;
if (ring->id == RCS)
len += 6;
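+	/* 12 = the original 4 flip dwords plus two 4-dword
+	 * MI_STORE_DWORD_INDEX sequences emitted around the flip below. */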
@@ -8512,11 +8512,34 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
intel_ring_emit(ring, ring->scratch.gtt_offset + 256);
}
+ /* Set a flag to indicate that a page flip interrupt is expected.
+ * The flag is used by the TDR logic to detect whether the blitter hung
+ * on a page flip command, in which case it will need to manually
+ * complete the page flip.
+ * The 'flag' is actually the pipe value associated with this page
+ * flip + 1 so that the TDR code knows which pipe failed to flip.
+ * A value of 0 indicates that a flip is not currently in progress on
+	 * the HW. */
+ intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
+ intel_ring_emit(ring, I915_GEM_PGFLIP_INDEX <<
+ MI_STORE_DWORD_INDEX_SHIFT);
+ intel_ring_emit(ring, intel_crtc->pipe + 1);
+ intel_ring_emit(ring, MI_NOOP);
+
intel_ring_emit(ring, MI_DISPLAY_FLIP_I915 | plane_bit);
intel_ring_emit(ring, (fb->pitches[0] | obj->tiling_mode));
intel_ring_emit(ring, i915_gem_obj_ggtt_offset(obj) + intel_crtc->dspaddr_offset);
intel_ring_emit(ring, (MI_NOOP));
+	/* Clear the flag as soon as we pass over the page flip command.
+	 * If we passed over the command without hanging then an interrupt
+	 * should be received to complete the page flip. */
+ intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
+ intel_ring_emit(ring, I915_GEM_PGFLIP_INDEX <<
+ MI_STORE_DWORD_INDEX_SHIFT);
+ intel_ring_emit(ring, 0);
+ intel_ring_emit(ring, MI_NOOP);
+
intel_mark_page_flip_active(intel_crtc);
__intel_ring_advance(ring);
return 0;
@@ -47,6 +47,14 @@ void __intel_ring_advance(struct intel_ring_buffer *ring)
ring->tail &= ring->size - 1;
+ /* Re-schedule the hangcheck timer each time the ring is given new work
+ * so that we can detect hangs caused by commands inserted directly
+ * to the ring as well as bad batch buffers */
+ if (!dev_priv->ums.mm_suspended && i915_enable_hangcheck) {
+ mod_timer(&dev_priv->hangcheck[ring->id].timer,
+ jiffies + DRM_I915_HANGCHECK_JIFFIES);
+ }
+
if (dev_priv->gpu_error.stop_rings & intel_ring_flag(ring))
return;
ring->write_tail(ring, ring->tail);
@@ -1591,7 +1599,7 @@ static int ring_wait_for_space(struct intel_ring_buffer *ring, int n)
msleep(1);
ret = i915_gem_check_wedge(&dev_priv->gpu_error,
- dev_priv->mm.interruptible);
+ dev_priv->mm.interruptible, ring);
if (ret)
return ret;
} while (!time_after(jiffies, end));
@@ -1691,7 +1699,7 @@ int intel_ring_begin(struct intel_ring_buffer *ring,
int ret;
ret = i915_gem_check_wedge(&dev_priv->gpu_error,
- dev_priv->mm.interruptible);
+ dev_priv->mm.interruptible, ring);
if (ret)
return ret;
@@ -2010,7 +2018,7 @@ gen6_ring_reset(struct intel_ring_buffer *ring)
/* Spin waiting for the device to ack the reset request */
ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST)
& GEN6_GRDOM_RENDER) == 0, 500);
- DRM_DEBUG("RCS Reset\n");
+ DRM_DEBUG_TDR("RCS Reset\n");
break;
@@ -2020,7 +2028,7 @@ gen6_ring_reset(struct intel_ring_buffer *ring)
/* Spin waiting for the device to ack the reset request */
ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST)
& GEN6_GRDOM_BLT) == 0, 500);
- DRM_DEBUG("BCS Reset\n");
+ DRM_DEBUG_TDR("BCS Reset\n");
break;
@@ -2030,7 +2038,7 @@ gen6_ring_reset(struct intel_ring_buffer *ring)
/* Spin waiting for the device to ack the reset request */
ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST)
& GEN6_GRDOM_MEDIA) == 0, 500);
- DRM_DEBUG("VCS Reset\n");
+ DRM_DEBUG_TDR("VCS Reset\n");
break;
case VECS:
@@ -2039,7 +2047,7 @@ gen6_ring_reset(struct intel_ring_buffer *ring)
/* Spin waiting for the device to ack the reset request */
ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST)
& GEN6_GRDOM_VEBOX) == 0, 500);
- DRM_DEBUG("VECS Reset\n");
+ DRM_DEBUG_TDR("VECS Reset\n");
break;
default:
@@ -198,6 +198,8 @@ struct intel_ring_buffer {
u32 saved_state[I915_RING_CONTEXT_SIZE];
struct intel_ring_hangcheck hangcheck;
+ uint32_t last_irq_seqno;
+
struct {
struct drm_i915_gem_object *obj;
u32 gtt_offset;
@@ -806,9 +806,13 @@ static int gen6_do_reset(struct drm_device *dev)
int intel_gpu_reset(struct drm_device *dev)
{
+ drm_i915_private_t *dev_priv = dev->dev_private;
switch (INTEL_INFO(dev)->gen) {
case 7:
- case 6: return gen6_do_reset(dev);
+ case 6:
+ dev_priv->total_resets++;
+ DRM_DEBUG_TDR("total_resets %d\n", dev_priv->total_resets);
+ return gen6_do_reset(dev);
case 5: return ironlake_do_reset(dev);
case 4: return i965_do_reset(dev);
default: return -ENODEV;
@@ -90,6 +90,7 @@ struct videomode;
#define DRM_UT_DRIVER 0x02
#define DRM_UT_KMS 0x04
#define DRM_UT_PRIME 0x08
+#define DRM_UT_TDR 0x10
/*
* Three debug levels are defined.
* drm_core, drm_driver, drm_kms
@@ -211,6 +212,11 @@ int drm_err(const char *func, const char *format, ...);
drm_ut_debug_printk(DRM_UT_PRIME, DRM_NAME, \
__func__, fmt, ##args); \
} while (0)
+#define DRM_DEBUG_TDR(fmt, args...) \
+ do { \
+ drm_ut_debug_printk(DRM_UT_TDR, DRM_NAME, \
+ __func__, fmt, ##args); \
+ } while (0)
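+/* TDR messages are emitted when the DRM_UT_TDR bit is set in the drm
+ * module's debug bitmask, e.g. booting with drm.debug=0x10. */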
#define DRM_LOG(fmt, args...) \
do { \
drm_ut_debug_printk(DRM_UT_CORE, NULL, \
@@ -235,6 +241,7 @@ int drm_err(const char *func, const char *format, ...);
#define DRM_DEBUG_DRIVER(fmt, args...) do { } while (0)
#define DRM_DEBUG_KMS(fmt, args...) do { } while (0)
#define DRM_DEBUG_PRIME(fmt, args...) do { } while (0)
+#define DRM_DEBUG_TDR(fmt, args...) do { } while (0)
#define DRM_DEBUG(fmt, arg...) do { } while (0)
#define DRM_LOG(fmt, arg...) do { } while (0)
#define DRM_LOG_KMS(fmt, args...) do { } while (0)