@@ -374,6 +374,9 @@ struct drm_i915_error_state {
u32 pp_dir_base;
};
} vm_info;
+
+ pid_t pid;
+ char comm[TASK_COMM_LEN];
} ring[I915_NUM_RINGS];
struct drm_i915_error_buffer {
@@ -1825,6 +1828,7 @@ struct drm_i915_gem_request {
struct drm_i915_file_private {
struct drm_i915_private *dev_priv;
+ struct drm_file *file;
struct {
spinlock_t lock;
@@ -4981,6 +4981,7 @@ int i915_gem_open(struct drm_device *dev, struct drm_file *file)
file->driver_priv = file_priv;
file_priv->dev_priv = dev->dev_private;
+ file_priv->file = file;
spin_lock_init(&file_priv->mm.lock);
INIT_LIST_HEAD(&file_priv->mm.request_list);
@@ -309,6 +309,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
drm_i915_private_t *dev_priv = dev->dev_private;
struct drm_i915_error_state *error = error_priv->error;
int i, j, page, offset, elt;
+ int max_hangcheck_score;
if (!error) {
err_printf(m, "no error state collected\n");
@@ -318,6 +319,20 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
error->time.tv_usec);
err_printf(m, "Kernel: " UTS_RELEASE "\n");
+ max_hangcheck_score = 0;
+ for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
+ if (error->ring[i].hangcheck_score > max_hangcheck_score)
+ max_hangcheck_score = error->ring[i].hangcheck_score;
+ }
+ for (i = 0; i < ARRAY_SIZE(error->ring); i++) {
+ if (error->ring[i].hangcheck_score == max_hangcheck_score &&
+ error->ring[i].pid != -1) {
+ err_printf(m, "Active process (on ring %s): %s [%d]\n",
+ ring_str(i),
+ error->ring[i].comm,
+ error->ring[i].pid);
+ }
+ }
err_printf(m, "PCI ID: 0x%04x\n", dev->pdev->device);
err_printf(m, "EIR: 0x%08x\n", error->eir);
err_printf(m, "IER: 0x%08x\n", error->ier);
@@ -363,8 +378,11 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
struct drm_i915_error_object *obj;
if ((obj = error->ring[i].batchbuffer)) {
- err_printf(m, "%s --- gtt_offset = 0x%08x\n",
- dev_priv->ring[i].name,
+ err_puts(m, dev_priv->ring[i].name);
+ if (error->ring[i].pid != -1)
+ err_printf(m, " (submitted by %s [%d])",
+ error->ring[i].comm, error->ring[i].pid);
+ err_printf(m, " --- gtt_offset = 0x%08x\n",
obj->gtt_offset);
offset = 0;
for (page = 0; page < obj->page_count; page++) {
@@ -698,9 +716,9 @@ static void i915_gem_record_fences(struct drm_device *dev,
}
}
-static struct drm_i915_error_object *
-i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
- struct intel_ring_buffer *ring)
+static struct drm_i915_gem_request *
+i915_error_first_request(struct drm_i915_private *dev_priv,
+ struct intel_ring_buffer *ring)
{
struct drm_i915_gem_request *request;
u32 seqno;
@@ -708,29 +726,12 @@ i915_error_first_batchbuffer(struct drm_i915_private *dev_priv,
if (!ring->get_seqno)
return NULL;
- if (HAS_BROKEN_CS_TLB(dev_priv->dev)) {
- struct drm_i915_gem_object *obj;
- u32 acthd = I915_READ(ACTHD);
-
- if (WARN_ON(ring->id != RCS))
- return NULL;
-
- obj = ring->scratch.obj;
- if (obj != NULL &&
- acthd >= i915_gem_obj_ggtt_offset(obj) &&
- acthd < i915_gem_obj_ggtt_offset(obj) + obj->base.size)
- return i915_error_ggtt_object_create(dev_priv, obj);
- }
-
seqno = ring->get_seqno(ring, false);
list_for_each_entry(request, &ring->request_list, list) {
if (i915_seqno_passed(seqno, request->seqno))
continue;
- /* We need to copy these to an anonymous buffer as the simplest
- * method to avoid being overwritten by userspace.
- */
- return i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
+ return request;
}
return NULL;
@@ -884,8 +885,26 @@ static void i915_gem_record_rings(struct drm_device *dev,
i915_record_ring_state(dev, ring, &error->ring[i]);
- error->ring[i].batchbuffer =
- i915_error_first_batchbuffer(dev_priv, ring);
+ error->ring[i].pid = -1;
+ request = i915_error_first_request(dev_priv, ring);
+ if (request) {
+ /* We need to copy these to an anonymous buffer as the simplest
+ * method to avoid being overwritten by userspace.
+ */
+ error->ring[i].batchbuffer =
+ i915_error_object_create(dev_priv, request->batch_obj, request->ctx->vm);
+ if (request->file_priv) {
+ struct task_struct *task;
+
+ rcu_read_lock();
+ task = pid_task(request->file_priv->file->pid, PIDTYPE_PID);
+ if (task) {
+ strcpy(error->ring[i].comm, task->comm);
+ error->ring[i].pid = task->pid;
+ }
+ rcu_read_unlock();
+ }
+ }
error->ring[i].ringbuffer =
i915_error_ggtt_object_create(dev_priv, ring->obj);
After finding the guilty batch and request, we can use the request to find the
process that submitted the batch and then add the culprit into the error state.

This is a slightly different approach from Ben's in that instead of adding the
extra information into the struct i915_hw_context, we use the information
already captured in struct drm_file, which is then referenced from the request.

Link: http://lists.freedesktop.org/archives/intel-gfx/2013-August/032280.html
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_drv.h       |  4 ++
 drivers/gpu/drm/i915/i915_gem.c       |  1 +
 drivers/gpu/drm/i915/i915_gpu_error.c | 69 ++++++++++++++++++++++-------------
 3 files changed, 49 insertions(+), 25 deletions(-)