@@ -2604,9 +2604,13 @@ static int
i915_wedged_set(void *data, u64 val)
{
struct drm_device *dev = data;
+ drm_i915_private_t *dev_priv = dev->dev_private;
DRM_INFO("Manually setting wedged to %llu\n", val);
- i915_handle_error(dev, val);
+	if (val && !i915_reset_in_progress(&dev_priv->gpu_error))
+		i915_handle_error(dev, NULL);
return 0;
}
@@ -1466,6 +1466,7 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
struct intel_device_info *info;
int ret = 0, mmio_bar, mmio_size;
uint32_t aperture_size;
+ uint32_t i;
info = (struct intel_device_info *) flags;
@@ -1661,6 +1662,17 @@ int i915_driver_load(struct drm_device *dev, unsigned long flags)
acpi_video_register();
}
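+	/* Initialise one hangcheck timer per ring so each engine is
+	 * monitored (and, on gen7+, reset) independently of the others. */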
+ for (i = 0; i < I915_NUM_RINGS; i++) {
+ dev_priv->hangcheck[i].count = 0;
+ dev_priv->hangcheck[i].last_acthd = 0;
+ dev_priv->hangcheck[i].ringid = i;
+ dev_priv->hangcheck[i].dev = dev;
+
+ setup_timer(&dev_priv->hangcheck[i].timer,
+ i915_hangcheck_sample,
+ (unsigned long) &dev_priv->hangcheck[i]);
+ }
+
if (IS_GEN5(dev))
intel_gpu_ips_init(dev_priv);
@@ -1703,6 +1715,7 @@ int i915_driver_unload(struct drm_device *dev)
{
struct drm_i915_private *dev_priv = dev->dev_private;
int ret;
+ uint32_t i;
intel_gpu_ips_teardown();
@@ -1748,9 +1761,10 @@ int i915_driver_unload(struct drm_device *dev)
}
/* Free error state after interrupts are fully disabled. */
- del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
cancel_work_sync(&dev_priv->gpu_error.work);
i915_destroy_error_state(dev);
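+	/* Each ring owns its own hangcheck timer, so all of them must be
+	 * stopped individually during teardown. */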
+ for (i = 0; i < I915_NUM_RINGS; i++)
+ del_timer_sync(&dev_priv->hangcheck[i].timer);
cancel_delayed_work_sync(&dev_priv->pc8.enable_work);
@@ -113,6 +113,59 @@ MODULE_PARM_DESC(enable_hangcheck,
"WARNING: Disabling this can cause system wide hangs. "
"(default: true)");
+unsigned int i915_hangcheck_period __read_mostly = 1000;
+
+int hangcheck_period_set(const char *val, const struct kernel_param *kp)
+{
+	/* Custom set function so we can validate the range */
+ unsigned long num;
+ int ret;
+
+ ret = kstrtoul(val, 0, &num);
+
+ if (ret)
+ return ret;
+
+	/* Enforce minimum and maximum hangcheck period in ms */
+	if ((num >= MINIMUM_HANGCHECK_PERIOD) &&
+	    (num <= MAXIMUM_HANGCHECK_PERIOD)) {
+ i915_hangcheck_period = num;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static const struct kernel_param_ops hangcheck_ops = {
+ .set = hangcheck_period_set,
+ .get = param_get_uint,
+};
+
+module_param_cb(i915_hangcheck_period, &hangcheck_ops,
+ &i915_hangcheck_period, 0644);
+MODULE_PARM_DESC(i915_hangcheck_period,
+ "The hangcheck timer period in milliseconds. "
+ "The actual time to detect a hang may be 3 - 4 times "
+ "this value (default = 1000ms)");
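+/* The 0644 permissions above also expose the period for runtime tuning via
+ * /sys/module/i915/parameters/i915_hangcheck_period. */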
+
+unsigned int i915_ring_reset_min_alive_period __read_mostly;
+module_param_named(i915_ring_reset_min_alive_period,
+ i915_ring_reset_min_alive_period, int, 0644);
+MODULE_PARM_DESC(i915_ring_reset_min_alive_period,
+ "Catch excessive ring resets. Each ring maintains a timestamp of "
+ "the last time it was reset. If it hangs again within this period "
+	"then switch to full GPU reset to try and clear the hang. "
+ "default=0 seconds (disabled)");
+
+unsigned int i915_gpu_reset_min_alive_period __read_mostly;
+module_param_named(i915_gpu_reset_min_alive_period,
+ i915_gpu_reset_min_alive_period, int, 0644);
+MODULE_PARM_DESC(i915_gpu_reset_min_alive_period,
+ "Catch excessive GPU resets. If the GPU hangs again within this period "
+ "following the previous GPU reset then declare it wedged and "
+ "prevent further resets. "
+ "default=0 seconds (disabled)");
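+/* Both alive periods are compared against get_seconds() timestamps, so
+ * their effective granularity is one second. */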
+
int i915_enable_ppgtt __read_mostly = -1;
module_param_named(i915_enable_ppgtt, i915_enable_ppgtt, int, 0600);
MODULE_PARM_DESC(i915_enable_ppgtt,
@@ -726,6 +779,142 @@ int i915_resume(struct drm_device *dev)
return 0;
}
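+/* Top-level per-ring recovery: the sequence below is disable -> save ->
+ * reset -> restore -> resample -> enable, then release anything (waiters,
+ * stuck page flips) that the hang left blocked. */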
+int i915_handle_hung_ring(struct drm_device *dev, uint32_t ringid)
+{
+ /* TDR Version 1:
+ * Reset the ring that is hung
+ *
+ * WARNING: Hold dev->struct_mutex before entering
+ * this function
+ */
+ drm_i915_private_t *dev_priv = dev->dev_private;
+ struct intel_ring_buffer *ring = &dev_priv->ring[ringid];
+ struct drm_crtc *crtc;
+ struct intel_crtc *intel_crtc;
+ int ret = 0;
+ int pipe = 0;
+ struct intel_unpin_work *unpin_work;
+ uint32_t ring_flags = 0;
+ uint32_t head;
+
+ BUG_ON(!mutex_is_locked(&dev->struct_mutex));
+
+ /* Take wake lock to prevent power saving mode */
+ gen6_gt_force_wake_get(dev_priv);
+
+ /* Check if the ring has hung on a MI_DISPLAY_FLIP command.
+ * The pipe value will be stored in the HWS page if it has.
+ * At the moment this should only happen for the blitter but
+ * each ring has its own status page so this should work for
+	 * all rings. */
+ pipe = intel_read_status_page(ring, I915_GEM_PGFLIP_INDEX);
+ if (pipe) {
+		/* Clear it to avoid responding to it twice */
+ intel_write_status_page(ring, I915_GEM_PGFLIP_INDEX, 0);
+ }
+
+ /* Clear any simulated hang flags */
+ if (dev_priv->stop_rings) {
+ DRM_DEBUG_TDR("Simulated gpu hang, rst stop_rings bits %08x\n",
+ (0x1 << ringid));
+ dev_priv->stop_rings &= ~(0x1 << ringid);
+ }
+
+ DRM_DEBUG_TDR("Resetting ring %d\n", ringid);
+
+ ret = intel_ring_disable(ring);
+ if (ret != 0) {
+ DRM_ERROR("Failed to disable ring %d\n", ringid);
+ goto handle_hung_ring_error;
+ }
+
+ /* Sample the current ring head position */
+ head = I915_READ(RING_HEAD(ring->mmio_base)) & HEAD_ADDR;
+ DRM_DEBUG_TDR("head 0x%08X, last_head 0x%08X\n",
+ head, dev_priv->hangcheck[ringid].last_head);
+ if (head == dev_priv->hangcheck[ringid].last_head) {
+ /* The ring has not advanced since the last
+ * time it hung so force it to advance to the
+ * next QWORD. In most cases the ring head
+ * pointer will automatically advance to the
+ * next instruction as soon as it has read the
+ * current instruction, without waiting for it
+ * to complete. This seems to be the default
+ * behaviour, however an MBOX wait inserted
+ * directly to the VCS/BCS rings does not behave
+ * in the same way, instead the head pointer
+ * will still be pointing at the MBOX instruction
+	 * until it completes. */
+ ring_flags = FORCE_ADVANCE;
+ DRM_DEBUG_TDR("Force ring head to advance\n");
+ }
+ dev_priv->hangcheck[ringid].last_head = head;
+
+ ret = intel_ring_save(ring, ring_flags);
+ if (ret != 0) {
+ DRM_ERROR("Failed to save ring state\n");
+ goto handle_hung_ring_error;
+ }
+
+ ret = intel_ring_reset(ring);
+ if (ret != 0) {
+ DRM_ERROR("Failed to reset ring\n");
+ goto handle_hung_ring_error;
+ }
+
+ DRM_ERROR("Reset ring %d (GPU Hang)\n", ringid);
+
+ /* Clear last_acthd in hangcheck timer for this ring */
+ dev_priv->hangcheck[ringid].last_acthd = 0;
+
+ /* Clear reset to allow future hangchecks */
+ atomic_set(&dev_priv->hangcheck[ringid].reset, 0);
+
+ ret = intel_ring_restore(ring);
+ if (ret != 0) {
+ DRM_ERROR("Failed to restore ring state\n");
+ goto handle_hung_ring_error;
+ }
+
+ /* Correct driver state */
+ intel_ring_resample(ring);
+
+ ret = intel_ring_enable(ring);
+ if (ret != 0) {
+ DRM_ERROR("Failed to enable ring\n");
+ goto handle_hung_ring_error;
+ }
+
+ /* Wake up anything waiting on this rings queue */
+ wake_up_all(&ring->irq_queue);
+
+ if (pipe &&
+ ((pipe - 1) < ARRAY_SIZE(dev_priv->pipe_to_crtc_mapping))) {
+ /* The pipe value in the status page is offset by 1 */
+ pipe -= 1;
+
+ /* The ring hung on a page flip command so we
+ * must manually release the pending flip queue */
+ crtc = dev_priv->pipe_to_crtc_mapping[pipe];
+ intel_crtc = to_intel_crtc(crtc);
+ unpin_work = intel_crtc->unpin_work;
+
+ if (unpin_work && unpin_work->pending_flip_obj) {
+ intel_prepare_page_flip(dev, intel_crtc->pipe);
+ intel_finish_page_flip(dev, intel_crtc->pipe);
+ DRM_DEBUG_TDR("Released stuck page flip for pipe %d\n",
+ pipe);
+ }
+ }
+
+handle_hung_ring_error:
+
+ /* Release power lock */
+ gen6_gt_force_wake_put(dev_priv);
+
+ return ret;
+}
+
/**
* i915_reset - reset chip after a hang
* @dev: drm device to reset
@@ -759,7 +948,11 @@ int i915_reset(struct drm_device *dev)
ret = intel_gpu_reset(dev);
/* Also reset the gpu hangman. */
- if (simulated) {
+ if (!simulated && (get_seconds() - dev_priv->gpu_error.last_reset)
+ < i915_gpu_reset_min_alive_period) {
+ DRM_ERROR("GPU hanging too fast, declaring wedged!\n");
+ ret = -ENODEV;
+ } else if (simulated) {
DRM_INFO("Simulated gpu hang, resetting stop_rings\n");
dev_priv->gpu_error.stop_rings = 0;
if (ret == -ENODEV) {
@@ -307,7 +307,7 @@ struct drm_i915_error_state {
u32 ctl[I915_NUM_RINGS];
u32 ipeir[I915_NUM_RINGS];
u32 ipehr[I915_NUM_RINGS];
- u32 instdone[I915_NUM_RINGS];
+ u32 instdone[I915_NUM_RINGS][I915_NUM_INSTDONE_REG];
u32 acthd[I915_NUM_RINGS];
u32 semaphore_mboxes[I915_NUM_RINGS][I915_NUM_RINGS - 1];
u32 semaphore_seqno[I915_NUM_RINGS][I915_NUM_RINGS - 1];
@@ -1042,6 +1042,13 @@ struct i915_gem_mm {
*/
bool interruptible;
+ /**
+ * This is set when the error_recovery function is running.
+ * It prevents command submission from occurring and makes
+ * every pending request fail
+ */
+ atomic_t wedged;
+
/** Bit 6 swizzling required for X tiling */
uint32_t bit_6_swizzle_x;
/** Bit 6 swizzling required for Y tiling */
@@ -1072,8 +1079,10 @@ struct i915_error_state_file_priv {
struct i915_gpu_error {
/* For hangcheck timer */
+#define MINIMUM_HANGCHECK_PERIOD 100 /* 100ms */
+#define MAXIMUM_HANGCHECK_PERIOD 30000 /* 30s */
#define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
-#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_I915_HANGCHECK_PERIOD)
+#define DRM_I915_HANGCHECK_JIFFIES msecs_to_jiffies(i915_hangcheck_period)
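+/* Evaluated at each use, so runtime changes to i915_hangcheck_period take
+ * effect the next time a hangcheck timer is rearmed. */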
/* Hang gpu twice in this window and your context gets banned */
#define DRM_I915_CTX_BAN_PERIOD DIV_ROUND_UP(8*DRM_I915_HANGCHECK_PERIOD, 1000)
@@ -1087,6 +1096,7 @@ struct i915_gpu_error {
unsigned long missed_irq_rings;
+ unsigned long last_reset;
/**
* State variable and reset counter controlling the reset flow
@@ -1304,6 +1314,41 @@ struct intel_pipe_crc {
wait_queue_head_t wq;
};
+struct intel_hangcheck {
+	/* The ring being monitored */
+	uint32_t ringid;
+
+	/* Parent drm_device */
+	struct drm_device *dev;
+
+	/* Timer for this ring only */
+	struct timer_list timer;
+
+	/* Count of consecutive hang detections
+	 * (reset flag set once count exceeds threshold) */
+#define HANGCHECK_THRESHOLD 1
+#define MBOX_HANGCHECK_THRESHOLD 4
+	int count;
+
+	/* Last sampled active head and ring head */
+	uint32_t last_acthd;
+	uint32_t last_hd;
+
+	/* Last recorded ring head index from the previous ring hang.
+	 * This is only ever a ring index, whereas the active
+	 * head may be a graphics address in a ring buffer */
+	uint32_t last_head;
+
+	/* Last recorded instdone */
+	uint32_t prev_instdone[I915_NUM_INSTDONE_REG];
+
+	/* Flag to indicate if a ring reset is required */
+	atomic_t reset;
+
+	/* Keep a record of the last time the ring was reset */
+	unsigned long last_reset;
+};
+
typedef struct drm_i915_private {
struct drm_device *dev;
struct kmem_cache *slab;
@@ -1372,6 +1417,17 @@ typedef struct drm_i915_private {
int num_plane;
+ /* For hangcheck timer */
+ struct intel_hangcheck hangcheck[I915_NUM_RINGS];
+
+ unsigned int stop_rings;
+
+ unsigned long cfb_size;
+ unsigned int cfb_fb;
+ enum plane cfb_plane;
+ int cfb_y;
+ struct intel_fbc_work *fbc_work;
+
struct i915_fbc fbc;
struct intel_opregion opregion;
struct intel_vbt_data vbt;
@@ -1397,6 +1453,11 @@ typedef struct drm_i915_private {
unsigned int fsb_freq, mem_freq, is_ddr3;
+	/* TDR recovery state: work item, full-reset flag and statistics */
+	struct work_struct error_work;
+	atomic_t full_reset;
+	uint32_t total_resets;
+
+	/* Woken once error handling completes */
+	wait_queue_head_t error_queue;
/**
* wq - Driver workqueue for GEM.
*
@@ -1864,6 +1925,9 @@ extern int i915_vbt_sdvo_panel_type __read_mostly;
extern int i915_enable_rc6 __read_mostly;
extern int i915_enable_fbc __read_mostly;
extern bool i915_enable_hangcheck __read_mostly;
+extern unsigned int i915_hangcheck_period __read_mostly;
+extern unsigned int i915_ring_reset_min_alive_period __read_mostly;
+extern unsigned int i915_gpu_reset_min_alive_period __read_mostly;
extern int i915_enable_ppgtt __read_mostly;
extern int i915_enable_psr __read_mostly;
extern unsigned int i915_preliminary_hw_support __read_mostly;
@@ -1899,6 +1963,7 @@ extern int i915_emit_box(struct drm_device *dev,
struct drm_clip_rect *box,
int DR1, int DR4);
extern int intel_gpu_reset(struct drm_device *dev);
+extern int i915_handle_hung_ring(struct drm_device *dev, uint32_t ringid);
extern int i915_reset(struct drm_device *dev);
extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
extern unsigned long i915_mch_val(struct drm_i915_private *dev_priv);
@@ -1909,7 +1974,10 @@ extern void intel_console_resume(struct work_struct *work);
/* i915_irq.c */
void i915_queue_hangcheck(struct drm_device *dev);
-void i915_handle_error(struct drm_device *dev, bool wedged);
+void i915_hangcheck_sample(unsigned long data);
+void i915_handle_error(struct drm_device *dev, struct intel_hangcheck *hc);
+
extern void intel_irq_init(struct drm_device *dev);
extern void intel_pm_init(struct drm_device *dev);
@@ -2067,7 +2135,8 @@ i915_gem_object_unpin_fence(struct drm_i915_gem_object *obj)
bool i915_gem_retire_requests(struct drm_device *dev);
void i915_gem_retire_requests_ring(struct intel_ring_buffer *ring);
int __must_check i915_gem_check_wedge(struct i915_gpu_error *error,
- bool interruptible);
+ bool interruptible,
+ struct intel_ring_buffer *ring);
static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
{
return unlikely(atomic_read(&error->reset_counter)
@@ -2312,7 +2381,8 @@ void i915_error_state_get(struct drm_device *dev,
void i915_error_state_put(struct i915_error_state_file_priv *error_priv);
void i915_destroy_error_state(struct drm_device *dev);
-void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone);
+void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone,
+ struct intel_ring_buffer *ring);
const char *i915_cache_level_str(int type);
/* i915_suspend.c */
@@ -142,14 +142,38 @@ i915_gem_wait_for_error(struct i915_gpu_error *error)
return 0;
}
-int i915_mutex_lock_interruptible(struct drm_device *dev)
+int i915_gem_wedged(struct drm_device *dev, bool interruptible)
{
+	/* Warning: this function can only give an indication of whether
+	 * the GPU is wedged at a particular instant in time.
+	 * The hangcheck process is asynchronous, so a hang
+	 * may be detected just after the flags have been sampled. */
+ unsigned i;
struct drm_i915_private *dev_priv = dev->dev_private;
+ int err = !interruptible ? -EIO : -EAGAIN;
+
+ /* Full reset requested */
+ if (i915_reset_in_progress(&dev_priv->gpu_error))
+ return err;
+
+ /* Check for an individual ring which has hung */
+ for (i = 0; i < I915_NUM_RINGS; i++) {
+ if (atomic_read(&dev_priv->hangcheck[i].reset))
+ return err;
+ }
+
+ return 0;
+}
+
+int i915_mutex_lock_interruptible(struct drm_device *dev)
+{
int ret;
- ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
- if (ret)
- return ret;
+	/* There should be no need to call i915_gem_wait_for_error here:
+	 * the error recovery handler takes dev->struct_mutex, so if
+	 * recovery is active we will wait on the
+	 * mutex_lock_interruptible call below instead.
+	 */
ret = mutex_lock_interruptible(&dev->struct_mutex);
if (ret)
@@ -935,9 +959,15 @@ unlock:
int
i915_gem_check_wedge(struct i915_gpu_error *error,
- bool interruptible)
+ bool interruptible,
+ struct intel_ring_buffer *ring)
{
- if (i915_reset_in_progress(error)) {
+ drm_i915_private_t *dev_priv;
+
+ dev_priv = container_of(error, drm_i915_private_t, gpu_error);
+
+ if ((ring && atomic_read(&dev_priv->hangcheck[ring->id].reset)) ||
+ i915_reset_in_progress(error)) {
/* Non-interruptible callers can't handle -EAGAIN, hence return
* -EIO unconditionally for these. */
if (!interruptible)
@@ -1054,7 +1084,7 @@ static int __wait_seqno(struct intel_ring_buffer *ring, u32 seqno,
if (reset_counter != atomic_read(&dev_priv->gpu_error.reset_counter)) {
/* ... but upgrade the -EAGAIN to an -EIO if the gpu
	 * is truly gone. */
- ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
+ ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible, ring);
if (ret == 0)
ret = -EAGAIN;
break;
@@ -1124,7 +1154,7 @@ i915_wait_seqno(struct intel_ring_buffer *ring, uint32_t seqno)
BUG_ON(!mutex_is_locked(&dev->struct_mutex));
BUG_ON(seqno == 0);
- ret = i915_gem_check_wedge(&dev_priv->gpu_error, interruptible);
+ ret = i915_gem_wedged(dev, interruptible);
if (ret)
return ret;
@@ -1201,7 +1231,7 @@ i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
if (seqno == 0)
return 0;
- ret = i915_gem_check_wedge(&dev_priv->gpu_error, true);
+ ret = i915_gem_check_wedge(&dev_priv->gpu_error, true, ring);
if (ret)
return ret;
@@ -1213,8 +1243,9 @@ i915_gem_object_wait_rendering__nonblocking(struct drm_i915_gem_object *obj,
mutex_unlock(&dev->struct_mutex);
ret = __wait_seqno(ring, seqno, reset_counter, true, NULL, file->driver_priv);
mutex_lock(&dev->struct_mutex);
 	if (ret)
 		return ret;
return i915_gem_object_wait_rendering__tail(obj, ring);
}
@@ -2180,8 +2211,6 @@ int __i915_add_request(struct intel_ring_buffer *ring,
ring->preallocated_lazy_request = NULL;
if (!dev_priv->ums.mm_suspended) {
- i915_queue_hangcheck(ring->dev);
-
if (was_empty) {
cancel_delayed_work_sync(&dev_priv->mm.idle_work);
queue_delayed_work(dev_priv->wq,
@@ -3810,7 +3839,7 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
if (ret)
return ret;
- ret = i915_gem_check_wedge(&dev_priv->gpu_error, false);
+ ret = i915_gem_check_wedge(&dev_priv->gpu_error, false, NULL);
if (ret)
return ret;
@@ -3828,9 +3857,16 @@ i915_gem_ring_throttle(struct drm_device *dev, struct drm_file *file)
if (seqno == 0)
return 0;
- ret = __wait_seqno(ring, seqno, reset_counter, true, NULL, NULL);
- if (ret == 0)
- queue_delayed_work(dev_priv->wq, &dev_priv->mm.retire_work, 0);
+ if (ring) {
+		if (i915_gem_wedged(dev, true) != 0)
+ return -EIO;
+
+ ret = __wait_seqno(ring, seqno, reset_counter, true, NULL,
+ file->driver_priv);
+ if (ret == 0)
+ queue_delayed_work(dev_priv->wq,
+ &dev_priv->mm.retire_work, 0);
+ }
return ret;
}
@@ -4275,6 +4311,7 @@ i915_gem_suspend(struct drm_device *dev)
{
drm_i915_private_t *dev_priv = dev->dev_private;
int ret = 0;
+ int i;
mutex_lock(&dev->struct_mutex);
if (dev_priv->ums.mm_suspended)
@@ -4301,7 +4338,8 @@ i915_gem_suspend(struct drm_device *dev)
DRIVER_MODESET);
mutex_unlock(&dev->struct_mutex);
- del_timer_sync(&dev_priv->gpu_error.hangcheck_timer);
+ for (i = 0; i < I915_NUM_RINGS; i++)
+ del_timer_sync(&dev_priv->hangcheck[i].timer);
cancel_delayed_work_sync(&dev_priv->mm.retire_work);
cancel_delayed_work_sync(&dev_priv->mm.idle_work);
@@ -4530,6 +4568,7 @@ i915_gem_entervt_ioctl(struct drm_device *dev, void *data,
{
struct drm_i915_private *dev_priv = dev->dev_private;
int ret;
+ int i;
if (drm_core_check_feature(dev, DRIVER_MODESET))
return 0;
@@ -4537,6 +4576,10 @@ i915_gem_entervt_ioctl(struct drm_device *dev, void *data,
if (i915_reset_in_progress(&dev_priv->gpu_error)) {
DRM_ERROR("Reenabling wedged hardware, good luck\n");
atomic_set(&dev_priv->gpu_error.reset_counter, 0);
+ for (i = 0; i < I915_NUM_RINGS; i++) {
+ /* Clear the reset flag */
+ atomic_set(&dev_priv->hangcheck[i].reset, 0);
+ }
}
mutex_lock(&dev->struct_mutex);
@@ -723,7 +723,6 @@ static void i915_record_ring_state(struct drm_device *dev,
error->faddr[ring->id] = I915_READ(RING_DMA_FADD(ring->mmio_base));
error->ipeir[ring->id] = I915_READ(RING_IPEIR(ring->mmio_base));
error->ipehr[ring->id] = I915_READ(RING_IPEHR(ring->mmio_base));
- error->instdone[ring->id] = I915_READ(RING_INSTDONE(ring->mmio_base));
error->instps[ring->id] = I915_READ(RING_INSTPS(ring->mmio_base));
if (ring->id == RCS)
error->bbaddr = I915_READ64(BB_ADDR);
@@ -732,9 +731,10 @@ static void i915_record_ring_state(struct drm_device *dev,
error->faddr[ring->id] = I915_READ(DMA_FADD_I8XX);
error->ipeir[ring->id] = I915_READ(IPEIR);
error->ipehr[ring->id] = I915_READ(IPEHR);
- error->instdone[ring->id] = I915_READ(INSTDONE);
}
+ i915_get_extra_instdone(dev, error->instdone[ring->id],
+ &dev_priv->ring[ring->id]);
error->waiting[ring->id] = waitqueue_active(&ring->irq_queue);
error->instpm[ring->id] = I915_READ(RING_INSTPM(ring->mmio_base));
error->seqno[ring->id] = ring->get_seqno(ring, false);
@@ -899,6 +899,7 @@ void i915_capture_error_state(struct drm_device *dev)
struct drm_i915_error_state *error;
unsigned long flags;
int pipe;
+ int i;
spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
error = dev_priv->gpu_error.first_error;
@@ -957,7 +958,9 @@ void i915_capture_error_state(struct drm_device *dev)
if (INTEL_INFO(dev)->gen == 7)
error->err_int = I915_READ(GEN7_ERR_INT);
- i915_get_extra_instdone(dev, error->extra_instdone);
+ for (i = 0; i < I915_NUM_RINGS; i++)
+ i915_get_extra_instdone(dev, error->instdone[i],
+ &dev_priv->ring[i]);
i915_gem_capture_buffers(dev_priv, error);
i915_gem_record_fences(dev, error);
@@ -1026,7 +1029,9 @@ const char *i915_cache_level_str(int type)
}
/* NB: please notice the memset */
-void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone)
+void i915_get_extra_instdone(struct drm_device *dev,
+ uint32_t *instdone,
+ struct intel_ring_buffer *ring)
{
struct drm_i915_private *dev_priv = dev->dev_private;
memset(instdone, 0, sizeof(*instdone) * I915_NUM_INSTDONE_REG);
@@ -1046,10 +1051,14 @@ void i915_get_extra_instdone(struct drm_device *dev, uint32_t *instdone)
WARN_ONCE(1, "Unsupported platform\n");
case 7:
case 8:
- instdone[0] = I915_READ(GEN7_INSTDONE_1);
- instdone[1] = I915_READ(GEN7_SC_INSTDONE);
- instdone[2] = I915_READ(GEN7_SAMPLER_INSTDONE);
- instdone[3] = I915_READ(GEN7_ROW_INSTDONE);
+		instdone[0] = I915_READ(RING_INSTDONE(ring->mmio_base));
+
+ if (ring->id == RCS) {
+ instdone[1] = I915_READ(GEN7_SC_INSTDONE);
+ instdone[2] = I915_READ(GEN7_SAMPLER_INSTDONE);
+ instdone[3] = I915_READ(GEN7_ROW_INSTDONE);
+ }
break;
}
}
@@ -957,7 +957,6 @@ static void notify_ring(struct drm_device *dev,
trace_i915_gem_request_complete(ring);
wake_up_all(&ring->irq_queue);
- i915_queue_hangcheck(dev);
}
static void gen6_pm_rps_work(struct work_struct *work)
@@ -1155,12 +1154,14 @@ static void snb_gt_irq_handler(struct drm_device *dev,
if (gt_iir & GT_BLT_USER_INTERRUPT)
notify_ring(dev, &dev_priv->ring[BCS]);
- if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
- GT_BSD_CS_ERROR_INTERRUPT |
- GT_RENDER_CS_MASTER_ERROR_INTERRUPT)) {
- DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir);
- i915_handle_error(dev, false);
- }
+ if (gt_iir & GT_RENDER_CS_MASTER_ERROR_INTERRUPT)
+ i915_handle_error(dev, &dev_priv->hangcheck[RCS]);
+
+ if (gt_iir & GT_BSD_CS_ERROR_INTERRUPT)
+ i915_handle_error(dev, &dev_priv->hangcheck[VCS]);
+
+ if (gt_iir & GT_BLT_CS_ERROR_INTERRUPT)
+ i915_handle_error(dev, &dev_priv->hangcheck[BCS]);
if (gt_iir & GT_PARITY_ERROR(dev))
ivybridge_parity_error_irq_handler(dev, gt_iir);
@@ -1403,7 +1404,7 @@ static void gen6_rps_irq_handler(struct drm_i915_private *dev_priv, u32 pm_iir)
if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT) {
DRM_ERROR("VEBOX CS error interrupt 0x%08x\n", pm_iir);
- i915_handle_error(dev_priv->dev, false);
+ i915_handle_error(dev_priv->dev, &dev_priv->hangcheck[VECS]);
}
}
}
@@ -1946,9 +1947,41 @@ static void i915_error_work_func(struct work_struct *work)
char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
int ret;
+ int i;
+ int pipe;
+ struct drm_crtc *crtc;
+ struct intel_crtc *intel_crtc;
+ struct intel_unpin_work *unpin_work;
+
+	/* Set this flag to force any waiting processes to release
+	 * dev->struct_mutex if they are holding it */
+ atomic_set(&dev_priv->mm.wedged, 1);
+
+ mutex_lock(&dev->struct_mutex);
kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, error_event);
+	/* Skip individual ring reset requests if a full reset was requested */
+ if (!i915_reset_in_progress(error)) {
+ /* Check each ring for a pending reset condition */
+ for (i = 0; i < I915_NUM_RINGS; i++) {
+ if (atomic_read(&dev_priv->hangcheck[i].reset)) {
+ DRM_DEBUG_TDR("resetting ring %d\n", i);
+
+ if (i915_handle_hung_ring(dev, i) != 0) {
+					DRM_ERROR("ring %d reset failed\n", i);
+ atomic_set_mask(
+ I915_RESET_IN_PROGRESS_FLAG,
+ &dev_priv->gpu_error.reset_counter);
+ break;
+ }
+ }
+ }
+ }
+	/* Release dev->struct_mutex before the full GPU reset;
+	 * i915_reset() takes it itself when it needs it */
+ mutex_unlock(&dev->struct_mutex);
+
/*
* Note that there's only one work item which does gpu resets, so we
* need not worry about concurrent gpu resets potentially incrementing
@@ -1988,8 +2021,35 @@ static void i915_error_work_func(struct work_struct *work)
smp_mb__before_atomic_inc();
atomic_inc(&dev_priv->gpu_error.reset_counter);
- kobject_uevent_env(&dev->primary->kdev->kobj,
- KOBJ_CHANGE, reset_done_event);
+ for (i = 0; i < I915_NUM_RINGS; i++) {
+ /* Clear individual ring reset flags*/
+ atomic_set(&dev_priv->hangcheck[i].reset, 0);
+ }
+
+ mutex_lock(&dev->mode_config.mutex);
+	/* Release any pending page flip.
+	 * This is particularly important if stop_rings was set.
+	 *
+	 * WARNING: This code could retire a page flip that
+	 * arrives just after reset. In that case we will get
+	 * an extra page flip interrupt that is not expected.
+	 * If another page flip request arrives before the interrupt
+	 * then the unpin work could happen sooner than expected.
+	 */
+	for_each_pipe(pipe) {
+		crtc = dev_priv->pipe_to_crtc_mapping[pipe];
+		intel_crtc = to_intel_crtc(crtc);
+		unpin_work = intel_crtc->unpin_work;
+
+		if (unpin_work && unpin_work->pending_flip_obj) {
+			intel_prepare_page_flip(dev, pipe);
+			intel_finish_page_flip(dev, pipe);
+			DRM_DEBUG_TDR("Cleared stuck page flip for pipe %d\n",
+				      pipe);
+		}
+	}
+ mutex_unlock(&dev->mode_config.mutex);
} else {
atomic_set(&error->reset_counter, I915_WEDGED);
}
@@ -2000,21 +2060,36 @@ static void i915_error_work_func(struct work_struct *work)
*/
i915_error_wake_up(dev_priv, true);
}
+
+	/* Clear wedged condition and wake up waiters */
+ atomic_set(&dev_priv->mm.wedged, 0);
+
+ kobject_uevent_env(&dev->primary->kdev->kobj,
+ KOBJ_CHANGE, reset_done_event);
+
+	/* Wake anyone waiting on error handling completion */
+ wake_up_all(&dev_priv->error_queue);
+
+ DRM_DEBUG_TDR("End recovery work\n\n");
}
static void i915_report_and_clear_eir(struct drm_device *dev)
{
struct drm_i915_private *dev_priv = dev->dev_private;
uint32_t instdone[I915_NUM_INSTDONE_REG];
- u32 eir = I915_READ(EIR);
+ u32 eir;
int pipe, i;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
+ eir = I915_READ(EIR);
if (!eir)
- return;
+ goto i915_report_and_clear_eir_exit;
pr_err("render error detected, EIR: 0x%08x\n", eir);
- i915_get_extra_instdone(dev, instdone);
+ i915_get_extra_instdone(dev, instdone, &dev_priv->ring[RCS]);
if (IS_G4X(dev)) {
if (eir & (GM45_ERROR_MEM_PRIV | GM45_ERROR_CP_PRIV)) {
@@ -2092,6 +2167,9 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
I915_WRITE(EMR, I915_READ(EMR) | eir);
I915_WRITE(IIR, I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT);
}
+
+i915_report_and_clear_eir_exit:
+ spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
}
/**
@@ -2104,39 +2182,74 @@ static void i915_report_and_clear_eir(struct drm_device *dev)
* so userspace knows something bad happened (should trigger collection
* of a ring dump etc.).
*/
-void i915_handle_error(struct drm_device *dev, bool wedged)
+void i915_handle_error(struct drm_device *dev, struct intel_hangcheck *hc)
{
struct drm_i915_private *dev_priv = dev->dev_private;
+ int full_reset = 0;
+ unsigned long cur_time;
+ unsigned long last_reset;
i915_capture_error_state(dev);
i915_report_and_clear_eir(dev);
- if (wedged) {
+	/* Currently we only support individual ring reset for GEN7 onwards;
+	 * older chips will revert to a full reset.
+	 * Error interrupts trigger a full reset (hc == NULL). */
+ if ((INTEL_INFO(dev)->gen >= 7) && hc) {
+ cur_time = get_seconds();
+ last_reset = hc->last_reset;
+ hc->last_reset = cur_time;
+
+ if ((cur_time - last_reset)
+ < i915_ring_reset_min_alive_period) {
+ /* This ring is hanging too frequently.
+ * Opt for full-reset instead */
+			DRM_DEBUG_TDR("Ring %d hanging too quickly...\n",
+				      hc->ringid);
+ full_reset = 1;
+ } else {
+ if (atomic_read(&hc->reset)) {
+ /* Reset already in progress for this ring */
+ return;
+ }
+
+ atomic_set(&hc->reset, 1);
+ DRM_DEBUG_TDR("Reset Ring %d\n", hc->ringid);
+ }
+	} else {
+		full_reset = 1;
+	}
+
+ if (!hc || full_reset) {
+ if (i915_reset_in_progress(&dev_priv->gpu_error))
+ return;
+
atomic_set_mask(I915_RESET_IN_PROGRESS_FLAG,
&dev_priv->gpu_error.reset_counter);
-
- /*
- * Wakeup waiting processes so that the reset work function
- * i915_error_work_func doesn't deadlock trying to grab various
- * locks. By bumping the reset counter first, the woken
- * processes will see a reset in progress and back off,
- * releasing their locks and then wait for the reset completion.
- * We must do this for _all_ gpu waiters that might hold locks
- * that the reset work needs to acquire.
- *
- * Note: The wake_up serves as the required memory barrier to
- * ensure that the waiters see the updated value of the reset
- * counter atomic_t.
- */
- i915_error_wake_up(dev_priv, false);
+ DRM_DEBUG_TDR("Full reset of GPU requested\n");
}
/*
+ * Wakeup waiting processes so that the reset work function
+ * i915_error_work_func doesn't deadlock trying to grab various
+ * locks. By bumping the reset counter first, the woken
+ * processes will see a reset in progress and back off,
+ * releasing their locks and then wait for the reset completion.
+ * We must do this for _all_ gpu waiters that might hold locks
+ * that the reset work needs to acquire.
+ *
+ * Note: The wake_up serves as the required memory barrier to
+ * ensure that the waiters see the updated value of the reset
+ * counter atomic_t.
+ */
+ i915_error_wake_up(dev_priv, false);
+
+ /*
* Our reset work can grab modeset locks (since it needs to reset the
	 * state of outstanding pageflips). Hence it must not be run on our own
* dev-priv->wq work queue for otherwise the flush_work in the pageflip
* code will deadlock.
*/
+ DRM_DEBUG_TDR("Schedule error recovery work\n");
schedule_work(&dev_priv->gpu_error.work);
}
@@ -2339,245 +2452,191 @@ ring_last_seqno(struct intel_ring_buffer *ring)
struct drm_i915_gem_request, list)->seqno;
}
-static bool
-ring_idle(struct intel_ring_buffer *ring, u32 seqno)
-{
- return (list_empty(&ring->request_list) ||
- i915_seqno_passed(seqno, ring_last_seqno(ring)));
-}
-
-static struct intel_ring_buffer *
-semaphore_waits_for(struct intel_ring_buffer *ring, u32 *seqno)
-{
- struct drm_i915_private *dev_priv = ring->dev->dev_private;
- u32 cmd, ipehr, acthd, acthd_min;
-
- ipehr = I915_READ(RING_IPEHR(ring->mmio_base));
- if ((ipehr & ~(0x3 << 16)) !=
- (MI_SEMAPHORE_MBOX | MI_SEMAPHORE_COMPARE | MI_SEMAPHORE_REGISTER))
- return NULL;
-
- /* ACTHD is likely pointing to the dword after the actual command,
- * so scan backwards until we find the MBOX.
- */
- acthd = intel_ring_get_active_head(ring) & HEAD_ADDR;
- acthd_min = max((int)acthd - 3 * 4, 0);
- do {
- cmd = ioread32(ring->virtual_start + acthd);
- if (cmd == ipehr)
- break;
-
- acthd -= 4;
- if (acthd < acthd_min)
- return NULL;
- } while (1);
-
- *seqno = ioread32(ring->virtual_start+acthd+4)+1;
- return &dev_priv->ring[(ring->id + (((ipehr >> 17) & 1) + 1)) % 3];
-}
-
-static int semaphore_passed(struct intel_ring_buffer *ring)
-{
- struct drm_i915_private *dev_priv = ring->dev->dev_private;
- struct intel_ring_buffer *signaller;
- u32 seqno, ctl;
-
- ring->hangcheck.deadlock = true;
-
- signaller = semaphore_waits_for(ring, &seqno);
- if (signaller == NULL || signaller->hangcheck.deadlock)
- return -1;
-
- /* cursory check for an unkickable deadlock */
- ctl = I915_READ_CTL(signaller);
- if (ctl & RING_WAIT_SEMAPHORE && semaphore_passed(signaller) < 0)
- return -1;
-
- return i915_seqno_passed(signaller->get_seqno(signaller, false), seqno);
-}
-
-static void semaphore_clear_deadlocks(struct drm_i915_private *dev_priv)
+void i915_queue_hangcheck(struct drm_device *dev)
{
- struct intel_ring_buffer *ring;
- int i;
+ struct drm_i915_private *dev_priv = dev->dev_private;
+ if (!i915_enable_hangcheck)
+ return;
- for_each_ring(ring, dev_priv, i)
- ring->hangcheck.deadlock = false;
+ mod_timer(&dev_priv->gpu_error.hangcheck_timer,
+ round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
}
-static enum intel_ring_hangcheck_action
-ring_stuck(struct intel_ring_buffer *ring, u32 acthd)
+static bool kick_ring(struct intel_ring_buffer *ring)
{
struct drm_device *dev = ring->dev;
struct drm_i915_private *dev_priv = dev->dev_private;
- u32 tmp;
-
- if (ring->hangcheck.acthd != acthd)
- return HANGCHECK_ACTIVE;
-
- if (IS_GEN2(dev))
- return HANGCHECK_HUNG;
-
- /* Is the chip hanging on a WAIT_FOR_EVENT?
- * If so we can simply poke the RB_WAIT bit
- * and break the hang. This should work on
- * all but the second generation chipsets.
- */
- tmp = I915_READ_CTL(ring);
+ u32 tmp = I915_READ_CTL(ring);
if (tmp & RING_WAIT) {
DRM_ERROR("Kicking stuck wait on %s\n",
ring->name);
- i915_handle_error(dev, false);
I915_WRITE_CTL(ring, tmp);
- return HANGCHECK_KICK;
+ return true;
}
+ return false;
+}
- if (INTEL_INFO(dev)->gen >= 6 && tmp & RING_WAIT_SEMAPHORE) {
- switch (semaphore_passed(ring)) {
- default:
- return HANGCHECK_HUNG;
- case 1:
- DRM_ERROR("Kicking stuck semaphore on %s\n",
- ring->name);
- i915_handle_error(dev, false);
- I915_WRITE_CTL(ring, tmp);
- return HANGCHECK_KICK;
- case 0:
- return HANGCHECK_WAIT;
+/* This function is called when the TDR algorithm detects that the hardware
+ * has not advanced during the last sampling period. */
+static bool i915_hangcheck_hung(struct intel_hangcheck *hc)
+{
+ struct drm_device *dev = hc->dev;
+ drm_i915_private_t *dev_priv = dev->dev_private;
+ uint32_t mbox_wait;
+ uint32_t threshold;
+ struct intel_ring_buffer *ring;
+
+ DRM_DEBUG_TDR("Ring [%d] hc->count = %d\n", hc->ringid, hc->count);
+
+ ring = &dev_priv->ring[hc->ringid];
+
+ /* Is this ring waiting on a semaphore mbox?
+ * If so, give it a bit longer as it may be waiting on another
+ * ring which has actually hung. Give the other ring chance to
+ * reset and clear the hang.
+ */
+ mbox_wait = ((I915_READ(RING_CTL(ring->mmio_base)) >> 10) & 0x1);
+ threshold = mbox_wait ? MBOX_HANGCHECK_THRESHOLD : HANGCHECK_THRESHOLD;
+
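+	/* RING_CTL bit 10 is the gen6+ semaphore-wait status bit
+	 * (RING_WAIT_SEMAPHORE). Note the post-increment below: with
+	 * HANGCHECK_THRESHOLD == 1 the reset path fires on the third
+	 * consecutive stalled sample, which is where the "3 - 4 times"
+	 * figure in the module parameter description comes from. */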
+ if (hc->count++ > threshold) {
+ bool hung = true;
+
+ DRM_DEBUG_TDR("Hangcheck timer elapsed... ring %d hung\n",
+ hc->ringid);
+ /* Reset the counter*/
+ hc->count = 0;
+
+ if (!IS_GEN2(dev)) {
+ /* If the ring is hanging on a WAIT_FOR_EVENT
+ * then simply poke the RB_WAIT bit
+ * and break the hang. This should work on
+ * all but the second generation chipsets.
+ */
+ hung &= !kick_ring(ring);
+ DRM_DEBUG_TDR("hung=%d after kick ring\n", hung);
+ }
+ if (hung) {
+ i915_handle_error(dev, hc);
}
+ return hung;
}
-
- return HANGCHECK_HUNG;
-}
+ return false;
+}
/**
- * This is called when the chip hasn't reported back with completed
- * batchbuffers in a long time. We keep track per ring seqno progress and
- * if there are no progress, hangcheck score for that ring is increased.
- * Further, acthd is inspected to see if the ring is stuck. On stuck case
- * we kick the ring. If we see no progress on three subsequent calls
- * we assume chip is wedged and try to fix it by resetting the chip.
+ * This is called from the hangcheck timer for each ring.
+ * It samples the current state of the hardware to make
+ * sure that it is progressing.
*/
-static void i915_hangcheck_elapsed(unsigned long data)
-{
- struct drm_device *dev = (struct drm_device *)data;
- drm_i915_private_t *dev_priv = dev->dev_private;
+void i915_hangcheck_sample(unsigned long data)
+{
+ struct intel_hangcheck *hc = (struct intel_hangcheck *)data;
+ struct drm_device *dev;
+ drm_i915_private_t *dev_priv;
+ uint32_t head, tail, acthd, instdone[I915_NUM_INSTDONE_REG];
+ uint32_t cur_seqno = 0;
+ uint32_t last_seqno = 0;
struct intel_ring_buffer *ring;
- int i;
- int busy_count = 0, rings_hung = 0;
- bool stuck[I915_NUM_RINGS] = { 0 };
-#define BUSY 1
-#define KICK 5
-#define HUNG 20
-#define FIRE 30
-
- if (!i915_enable_hangcheck)
- return;
-
- for_each_ring(ring, dev_priv, i) {
- u32 seqno, acthd;
- bool busy = true;
-
- semaphore_clear_deadlocks(dev_priv);
-
- seqno = ring->get_seqno(ring, false);
- acthd = intel_ring_get_active_head(ring);
-
- if (ring->hangcheck.seqno == seqno) {
- if (ring_idle(ring, seqno)) {
- ring->hangcheck.action = HANGCHECK_IDLE;
-
- if (waitqueue_active(&ring->irq_queue)) {
- /* Issue a wake-up to catch stuck h/w. */
- if (!test_and_set_bit(ring->id, &dev_priv->gpu_error.missed_irq_rings)) {
- if (!(dev_priv->gpu_error.test_irq_rings & intel_ring_flag(ring)))
- DRM_ERROR("Hangcheck timer elapsed... %s idle\n",
- ring->name);
- else
- DRM_INFO("Fake missed irq on %s\n",
- ring->name);
- wake_up_all(&ring->irq_queue);
- }
- /* Safeguard against driver failure */
- ring->hangcheck.score += BUSY;
- } else
- busy = false;
+ bool idle;
+ int instdone_cmp;
+ int pending_work = 1;
+ int resched_timer = 1;
+ int empty;
+
+ if (!i915_enable_hangcheck || !hc)
+ return;
+
+ dev = hc->dev;
+ dev_priv = dev->dev_private;
+
+ ring = &dev_priv->ring[hc->ringid];
+
+ /* Sample the current state */
+ head = I915_READ_HEAD(ring) & HEAD_ADDR;
+ tail = I915_READ_TAIL(ring) & TAIL_ADDR;
+ acthd = intel_ring_get_active_head(ring);
+ empty = list_empty(&ring->request_list);
+
+ i915_get_extra_instdone(dev, instdone, ring);
+ instdone_cmp = (memcmp(hc->prev_instdone,
+ instdone, sizeof(instdone)) == 0) ? 1 : 0;
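+	/* INSTDONE reflects activity inside the execution units, so a
+	 * changing snapshot indicates progress even while head and acthd
+	 * stand still (e.g. during a long-running shader). */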
+
+ if (!empty) {
+		/* Examine the seqnos to see where the HW has got to
+		 * (only call ring_last_seqno when the list is non-empty) */
+ cur_seqno = ring->get_seqno(ring, false);
+ last_seqno = ring_last_seqno(ring);
+ }
+
+ if (empty || i915_seqno_passed(cur_seqno, last_seqno)) {
+ /* If the request list is empty or the HW has passed the
+ * last seqno of the last item in the request list then the
+ * HW is considered idle.
+ * The driver may not have cleaned up the request list yet */
+ pending_work = 0;
+ }
+
+ idle = ((head == tail) && (pending_work == 0));
+
+ DRM_DEBUG_TDR("[%d] HD: 0x%08x 0x%08x, ACTHD: 0x%08x 0x%08x IC: %d\n",
+ ring->id, head, hc->last_hd, acthd, hc->last_acthd,
+ instdone_cmp);
+ DRM_DEBUG_TDR("E:%d PW:%d TL:0x%08x Csq:0x%08x Lsq:0x%08x Idle: %d\n",
+ empty, pending_work, tail, cur_seqno, last_seqno, idle);
+
+ /* Check both head and active head.
+ * Neither is enough on its own - acthd can be pointing within the
+ * batch buffer so is more likely to be moving, but the same
+ * underlying buffer object could be submitted more than once.
+ * If it happens to pause at exactly the same place in the batch
+ * buffer and we sample it at that moment then we could see it as
+ * hung over 3 sample periods that do not belong to the same
+ * batch submission - this would result in a false positive.
+ * We know that the head pointer will have advanced for each
+ * batch buffer as the ring has to contain a new MI_BATCH_BUFFER_START
+ * for every do_exec call, so by combining head and active head we can
+	 * ensure that the hang detection distinguishes between batch buffers. */
+	if ((hc->last_acthd == acthd) &&
+	    (hc->last_hd == head) && instdone_cmp) {
+ /* Ring hasn't advanced in this sampling period */
+ if (idle) {
+ /* The hardware is idle */
+ if (waitqueue_active(&ring->irq_queue)) {
+ /* We expect the wait queue to drain
+ * if the hardware has remained idle
+ * for 3 consecutive samples. Wake up
+ * the queue on each sample to try and
+ * release it, but if it persists then
+ * trigger a reset */
+
+ DRM_DEBUG_TDR("Possible stuck wait (0x%08x)\n",
+ ring->last_irq_seqno);
+ wake_up_all(&ring->irq_queue);
+ i915_hangcheck_hung(hc);
} else {
- /* We always increment the hangcheck score
- * if the ring is busy and still processing
- * the same request, so that no single request
- * can run indefinitely (such as a chain of
- * batches). The only time we do not increment
- * the hangcheck score on this ring, if this
- * ring is in a legitimate wait for another
- * ring. In that case the waiting ring is a
- * victim and we want to be sure we catch the
- * right culprit. Then every time we do kick
- * the ring, add a small increment to the
- * score so that we can catch a batch that is
- * being repeatedly kicked and so responsible
- * for stalling the machine.
- */
- ring->hangcheck.action = ring_stuck(ring,
- acthd);
-
- switch (ring->hangcheck.action) {
- case HANGCHECK_IDLE:
- case HANGCHECK_WAIT:
- break;
- case HANGCHECK_ACTIVE:
- ring->hangcheck.score += BUSY;
- break;
- case HANGCHECK_KICK:
- ring->hangcheck.score += KICK;
- break;
- case HANGCHECK_HUNG:
- ring->hangcheck.score += HUNG;
- stuck[i] = true;
- break;
- }
+ /* Hardware and driver both idle */
+ hc->count = 0;
+ resched_timer = 0;
}
} else {
- ring->hangcheck.action = HANGCHECK_ACTIVE;
-
- /* Gradually reduce the count so that we catch DoS
- * attempts across multiple batches.
- */
- if (ring->hangcheck.score > 0)
- ring->hangcheck.score--;
- }
-
- ring->hangcheck.seqno = seqno;
- ring->hangcheck.acthd = acthd;
- busy_count += busy;
- }
-
- for_each_ring(ring, dev_priv, i) {
- if (ring->hangcheck.score > FIRE) {
- DRM_INFO("%s on %s\n",
- stuck[i] ? "stuck" : "no progress",
- ring->name);
- rings_hung++;
+ /* The hardware is busy but has not advanced
+ * since the last sample - possible hang*/
+ i915_hangcheck_hung(hc);
}
+ } else {
+ /* The state has changed so the hardware is active */
+ hc->count = 0;
}
- if (rings_hung)
- return i915_handle_error(dev, true);
-
- if (busy_count)
- /* Reset timer case chip hangs without another request
- * being added */
- i915_queue_hangcheck(dev);
-}
-
-void i915_queue_hangcheck(struct drm_device *dev)
-{
- struct drm_i915_private *dev_priv = dev->dev_private;
- if (!i915_enable_hangcheck)
- return;
+ /* Always update last sampled state */
+ hc->last_hd = head;
+ hc->last_acthd = acthd;
+ memcpy(hc->prev_instdone, instdone, sizeof(instdone));
- mod_timer(&dev_priv->gpu_error.hangcheck_timer,
- round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES));
+ if (resched_timer)
+ mod_timer(&hc->timer, jiffies + DRM_I915_HANGCHECK_JIFFIES);
}
static void ibx_irq_preinstall(struct drm_device *dev)
@@ -3189,7 +3248,7 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg)
*/
spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
- i915_handle_error(dev, false);
+ i915_handle_error(dev, NULL);
for_each_pipe(pipe) {
int reg = PIPESTAT(pipe);
@@ -3371,7 +3430,7 @@ static irqreturn_t i915_irq_handler(int irq, void *arg)
*/
spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
- i915_handle_error(dev, false);
+ i915_handle_error(dev, NULL);
for_each_pipe(pipe) {
int reg = PIPESTAT(pipe);
@@ -3616,7 +3675,7 @@ static irqreturn_t i965_irq_handler(int irq, void *arg)
*/
spin_lock_irqsave(&dev_priv->irq_lock, irqflags);
if (iir & I915_RENDER_COMMAND_PARSER_ERROR_INTERRUPT)
- i915_handle_error(dev, false);
+ i915_handle_error(dev, NULL);
for_each_pipe(pipe) {
int reg = PIPESTAT(pipe);
@@ -3775,10 +3834,8 @@ void intel_irq_init(struct drm_device *dev)
INIT_WORK(&dev_priv->gpu_error.work, i915_error_work_func);
INIT_WORK(&dev_priv->rps.work, gen6_pm_rps_work);
INIT_WORK(&dev_priv->l3_parity.error_work, ivybridge_parity_work);
+ init_waitqueue_head(&dev_priv->error_queue);
- setup_timer(&dev_priv->gpu_error.hangcheck_timer,
- i915_hangcheck_elapsed,
- (unsigned long) dev);
setup_timer(&dev_priv->hotplug_reenable_timer, i915_reenable_hotplug_timer_func,
(unsigned long) dev_priv);
@@ -8484,7 +8484,7 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
goto err_unpin;
}
- len = 4;
+ len = 12;
if (ring->id == RCS)
len += 6;
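+	/* 12 = the original 4 flip dwords plus two 4-dword
+	 * MI_STORE_DWORD_INDEX sequences emitted around the flip below. */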
@@ -8512,11 +8512,34 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
intel_ring_emit(ring, ring->scratch.gtt_offset + 256);
}
+ /* Set a flag to indicate that a page flip interrupt is expected.
+ * The flag is used by the TDR logic to detect whether the blitter hung
+ * on a page flip command, in which case it will need to manually
+ * complete the page flip.
+ * The 'flag' is actually the pipe value associated with this page
+ * flip + 1 so that the TDR code knows which pipe failed to flip.
+ * A value of 0 indicates that a flip is not currently in progress on
+	 * the HW. */
+ intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
+ intel_ring_emit(ring, I915_GEM_PGFLIP_INDEX <<
+ MI_STORE_DWORD_INDEX_SHIFT);
+ intel_ring_emit(ring, intel_crtc->pipe + 1);
+ intel_ring_emit(ring, MI_NOOP);
+
intel_ring_emit(ring, MI_DISPLAY_FLIP_I915 | plane_bit);
intel_ring_emit(ring, (fb->pitches[0] | obj->tiling_mode));
intel_ring_emit(ring, i915_gem_obj_ggtt_offset(obj) + intel_crtc->dspaddr_offset);
intel_ring_emit(ring, (MI_NOOP));
+	/* Clear the flag as soon as we pass over the page flip command.
+	 * If we passed over the command without hanging then an interrupt
+	 * should be received to complete the page flip. */
+ intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
+ intel_ring_emit(ring, I915_GEM_PGFLIP_INDEX <<
+ MI_STORE_DWORD_INDEX_SHIFT);
+ intel_ring_emit(ring, 0);
+ intel_ring_emit(ring, MI_NOOP);
+
intel_mark_page_flip_active(intel_crtc);
__intel_ring_advance(ring);
return 0;
@@ -47,6 +47,14 @@ void __intel_ring_advance(struct intel_ring_buffer *ring)
ring->tail &= ring->size - 1;
+ /* Re-schedule the hangcheck timer each time the ring is given new work
+ * so that we can detect hangs caused by commands inserted directly
+ * to the ring as well as bad batch buffers */
+ if (!dev_priv->ums.mm_suspended && i915_enable_hangcheck) {
+ mod_timer(&dev_priv->hangcheck[ring->id].timer,
+ jiffies + DRM_I915_HANGCHECK_JIFFIES);
+ }
+
if (dev_priv->gpu_error.stop_rings & intel_ring_flag(ring))
return;
ring->write_tail(ring, ring->tail);
@@ -1591,7 +1599,7 @@ static int ring_wait_for_space(struct intel_ring_buffer *ring, int n)
msleep(1);
ret = i915_gem_check_wedge(&dev_priv->gpu_error,
- dev_priv->mm.interruptible);
+ dev_priv->mm.interruptible, ring);
if (ret)
return ret;
} while (!time_after(jiffies, end));
@@ -1691,7 +1699,7 @@ int intel_ring_begin(struct intel_ring_buffer *ring,
int ret;
ret = i915_gem_check_wedge(&dev_priv->gpu_error,
- dev_priv->mm.interruptible);
+ dev_priv->mm.interruptible, ring);
if (ret)
return ret;
@@ -2010,7 +2018,7 @@ gen6_ring_reset(struct intel_ring_buffer *ring)
/* Spin waiting for the device to ack the reset request */
ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST)
& GEN6_GRDOM_RENDER) == 0, 500);
- DRM_DEBUG("RCS Reset\n");
+ DRM_DEBUG_TDR("RCS Reset\n");
break;
@@ -2020,7 +2028,7 @@ gen6_ring_reset(struct intel_ring_buffer *ring)
/* Spin waiting for the device to ack the reset request */
ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST)
& GEN6_GRDOM_BLT) == 0, 500);
- DRM_DEBUG("BCS Reset\n");
+ DRM_DEBUG_TDR("BCS Reset\n");
break;
@@ -2030,7 +2038,7 @@ gen6_ring_reset(struct intel_ring_buffer *ring)
/* Spin waiting for the device to ack the reset request */
ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST)
& GEN6_GRDOM_MEDIA) == 0, 500);
- DRM_DEBUG("VCS Reset\n");
+ DRM_DEBUG_TDR("VCS Reset\n");
break;
case VECS:
@@ -2039,7 +2047,7 @@ gen6_ring_reset(struct intel_ring_buffer *ring)
/* Spin waiting for the device to ack the reset request */
ret = wait_for((__raw_i915_read32(dev_priv, GEN6_GDRST)
& GEN6_GRDOM_VEBOX) == 0, 500);
- DRM_DEBUG("VECS Reset\n");
+ DRM_DEBUG_TDR("VECS Reset\n");
break;
default:
@@ -198,6 +198,8 @@ struct intel_ring_buffer {
u32 saved_state[I915_RING_CONTEXT_SIZE];
struct intel_ring_hangcheck hangcheck;
+ uint32_t last_irq_seqno;
+
struct {
struct drm_i915_gem_object *obj;
u32 gtt_offset;
@@ -806,9 +806,13 @@ static int gen6_do_reset(struct drm_device *dev)
int intel_gpu_reset(struct drm_device *dev)
{
+ drm_i915_private_t *dev_priv = dev->dev_private;
switch (INTEL_INFO(dev)->gen) {
case 7:
- case 6: return gen6_do_reset(dev);
+ case 6:
+ dev_priv->total_resets++;
+ DRM_DEBUG_TDR("total_resets %d\n", dev_priv->total_resets);
+ return gen6_do_reset(dev);
case 5: return ironlake_do_reset(dev);
case 4: return i965_do_reset(dev);
default: return -ENODEV;
@@ -90,6 +90,7 @@ struct videomode;
#define DRM_UT_DRIVER 0x02
#define DRM_UT_KMS 0x04
#define DRM_UT_PRIME 0x08
+#define DRM_UT_TDR 0x10
/*
* Three debug levels are defined.
* drm_core, drm_driver, drm_kms
@@ -211,6 +212,11 @@ int drm_err(const char *func, const char *format, ...);
drm_ut_debug_printk(DRM_UT_PRIME, DRM_NAME, \
__func__, fmt, ##args); \
} while (0)
+#define DRM_DEBUG_TDR(fmt, args...) \
+ do { \
+ drm_ut_debug_printk(DRM_UT_TDR, DRM_NAME, \
+ __func__, fmt, ##args); \
+ } while (0)
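+/* TDR messages are emitted when the DRM_UT_TDR bit is set in the drm
+ * module's debug bitmask, e.g. booting with drm.debug=0x10. */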
#define DRM_LOG(fmt, args...) \
do { \
drm_ut_debug_printk(DRM_UT_CORE, NULL, \
@@ -235,6 +241,7 @@ int drm_err(const char *func, const char *format, ...);
#define DRM_DEBUG_DRIVER(fmt, args...) do { } while (0)
#define DRM_DEBUG_KMS(fmt, args...) do { } while (0)
#define DRM_DEBUG_PRIME(fmt, args...) do { } while (0)
+#define DRM_DEBUG_TDR(fmt, args...) do { } while (0)
#define DRM_DEBUG(fmt, arg...) do { } while (0)
#define DRM_LOG(fmt, arg...) do { } while (0)
#define DRM_LOG_KMS(fmt, args...) do { } while (0)