diff mbox

[16/16] drm/i915: Get the error state over the wire (HACKish)

Message ID 1404238671-18760-17-git-send-email-benjamin.widawsky@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Ben Widawsky July 1, 2014, 6:17 p.m. UTC
I was dealing with a bug recently where the system would hard hang
somewhere between hangcheck and reset. There was time after error
collection to actually get my error state out, but I couldn't get the
reads to work.

This patch is also useful for when reset kills the machine, and you want
to keep reset enabled but still get error state.

Since I found the patch pretty useful, I decided to clean it up and
submit it. It was mostly meant as a one-off hack originally though.

If a maintainer decides it's useful, then here it is.

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_debugfs.c   |  2 +-
 drivers/gpu/drm/i915/i915_drv.h       |  3 ++-
 drivers/gpu/drm/i915/i915_gpu_error.c | 31 +++++++++++++++++++++++++------
 drivers/gpu/drm/i915/i915_sysfs.c     |  2 +-
 4 files changed, 29 insertions(+), 9 deletions(-)

Comments

Chris Wilson July 4, 2014, 8:02 a.m. UTC | #1
On Tue, Jul 01, 2014 at 11:17:51AM -0700, Ben Widawsky wrote:
> I was dealing with a bug recently where the system would hard hang
> somewhere between hangcheck and reset. There was time after error
> collection to actually get my error state out, but I couldn't get the
> reads to work.
> 
> This patch is also useful for when reset kills the machine, and you want
> to keep reset enabled but still get error state.
> 
> Since I found the patch pretty useful, I decided to clean it up and
> submit it. It was mostly meant as a one-off hack originally though.
> 
> If a maintainer decides it's useful, then here it is.

I think we could certainly push the register dump into DRM_DEBUG. That's
usually enough to identify duplicates and so may be useful from the dev
perspective. Or throw in a QA parameter that dumps an interesting subset to
KERN_ERR. Most people seem to manage to include the dmesg so getting the
right information into it would reduce some of the annoyance in vague
bug reports. (Too much of the wrong information though is equally
annoying.)
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 6b7b32b..2daad46 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -929,7 +929,7 @@  static ssize_t i915_error_state_read(struct file *file, char __user *userbuf,
 	if (ret)
 		return ret;
 
-	ret = i915_error_state_to_str(&error_str, error_priv);
+	ret = i915_error_state_to_str(&error_str, error_priv->dev, error_priv->error);
 	if (ret)
 		goto out;
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 1045006..b6a4f1e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2544,7 +2544,8 @@  static inline void intel_display_crc_init(struct drm_device *dev) {}
 __printf(2, 3)
 void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
 int i915_error_state_to_str(struct drm_i915_error_state_buf *estr,
-			    const struct i915_error_state_file_priv *error);
+			    struct drm_device *dev,
+			    const struct drm_i915_error_state *error);
 int i915_error_state_buf_init(struct drm_i915_error_state_buf *eb,
 			      size_t count, loff_t pos);
 static inline void i915_error_state_buf_release(
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index e82e590..1540bf6 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -184,8 +184,22 @@  static void i915_error_puts(struct drm_i915_error_state_buf *e,
 	__i915_error_advance(e, len);
 }
 
-#define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
-#define err_puts(e, s) i915_error_puts(e, s)
+
+static bool wire = false;
+#define err_printf(e, ...) do {				\
+	if (wire) {					\
+		printk(__VA_ARGS__);			\
+	} else {					\
+		i915_error_printf(e, __VA_ARGS__);	\
+	}						\
+} while (0)
+#define err_puts(e, s) do {				\
+	if (wire) {					\
+		printk(s);				\
+	} else {					\
+		i915_error_puts(e, s);			\
+	}						\
+} while (0)
 
 static void print_error_buffers(struct drm_i915_error_state_buf *m,
 				const char *name,
@@ -240,7 +254,7 @@  static const char *hangcheck_action_to_str(enum intel_ring_hangcheck_action a)
 
 static void i915_ring_error_state(struct drm_i915_error_state_buf *m,
 				  struct drm_device *dev,
-				  struct drm_i915_error_ring *ring)
+				  const struct drm_i915_error_ring *ring)
 {
 	if (!ring->valid)
 		return;
@@ -322,11 +336,10 @@  static void print_error_obj(struct drm_i915_error_state_buf *m,
 }
 
 int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
-			    const struct i915_error_state_file_priv *error_priv)
+			    struct drm_device *dev,
+			    const struct drm_i915_error_state *error)
 {
-	struct drm_device *dev = error_priv->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	struct drm_i915_error_state *error = error_priv->error;
 	int i, j, offset, elt;
 	int max_hangcheck_score;
 
@@ -1197,6 +1210,12 @@  void i915_capture_error_state(struct drm_device *dev, bool wedged,
 	spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
 	if (dev_priv->gpu_error.first_error == NULL) {
 		dev_priv->gpu_error.first_error = error;
+#ifdef PUSH_TO_WIRE
+		/* Probably racy, but this is emergency debug */
+		wire = true;
+		i915_error_state_to_str(NULL, dev, error);
+		wire = false;
+#endif
 		error = NULL;
 	}
 	spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
diff --git a/drivers/gpu/drm/i915/i915_sysfs.c b/drivers/gpu/drm/i915/i915_sysfs.c
index 86ce39a..6f4be9d 100644
--- a/drivers/gpu/drm/i915/i915_sysfs.c
+++ b/drivers/gpu/drm/i915/i915_sysfs.c
@@ -512,7 +512,7 @@  static ssize_t error_state_read(struct file *filp, struct kobject *kobj,
 	error_priv.dev = dev;
 	i915_error_state_get(dev, &error_priv);
 
-	ret = i915_error_state_to_str(&error_str, &error_priv);
+	ret = i915_error_state_to_str(&error_str, dev, error_priv.error);
 	if (ret)
 		goto out;