
drm/i915: Remove nested work in gpu error handling

Message ID 1422457394-27331-1-git-send-email-mika.kuoppala@intel.com (mailing list archive)
State New, archived

Commit Message

Mika Kuoppala Jan. 28, 2015, 3:03 p.m. UTC
Now that we declare gpu errors only through our own dedicated
hangcheck workqueue, there is no need for a separate workqueue
to handle the reset and wake up the clients, as the deadlock
concerns no longer apply.

The only exception is i915_debugfs::i915_set_wedged, which triggers
error handling from process context. However, as this is only used by the
test harness, it is the test harness's responsibility not to inject hangs
through both the debug interface and the hangcheck mechanism at the same time.

Remove gpu_error.work and let the hangcheck work do the tasks it used to.

v2: Add a big warning sign into i915_debugfs::i915_set_wedged (Chris)

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c | 11 +++++++++++
 drivers/gpu/drm/i915/i915_dma.c     |  1 -
 drivers/gpu/drm/i915/i915_drv.h     |  2 --
 drivers/gpu/drm/i915/i915_irq.c     | 34 +++++++++++++---------------------
 4 files changed, 24 insertions(+), 24 deletions(-)
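
The shape of the change can be sketched as a minimal user-space analogy (hypothetical names; the real code is in the patch below). Before the patch, i915_handle_error() finished by queuing gpu_error.work for a second work item to run; after it, the reset-and-wakeup step is a plain function called directly from the single hangcheck work context:

```c
#include <stdbool.h>

/* Hypothetical stand-ins for the driver entry points, showing only the
 * control-flow change. The nested work item is gone; the error path now
 * performs the reset and wakeup inline. */

static int resets_completed;

static void reset_and_wakeup(void)      /* i915_reset_and_wakeup() analogue */
{
        resets_completed++;             /* reset, uevents, waking clients */
}

static void handle_error(bool wedged)   /* i915_handle_error() analogue */
{
        if (wedged)
                reset_and_wakeup();     /* was: schedule_work(&error->work) */
}
```

The direct call is safe only because hangcheck is the sole producer of errors; that invariant is exactly what the i915_set_wedged caveat above is about.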

Comments

Chris Wilson Jan. 28, 2015, 3:30 p.m. UTC | #1
On Wed, Jan 28, 2015 at 05:03:14PM +0200, Mika Kuoppala wrote:
> Now when we declare gpu errors only through our own dedicated
> hangcheck workqueue there is no need to have a separate workqueue
> for handling the resetting and waking up the clients as the deadlock
> concerns are no more.
> 
> The only exception is i915_debugfs::i915_set_wedged, which triggers
> error handling through process context. However as this is only used through
> test harness it is responsibility for test harness not to introduce hangs
> through both debug interface and through hangcheck mechanism at the same time.
> 
> Remove gpu_error.work and let the hangcheck work do the tasks it used to.
> 
> v2: Add a big warning sign into i915_debugfs::i915_set_wedged (Chris)
> 
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris
Daniel Vetter Jan. 29, 2015, 5:03 p.m. UTC | #2
On Wed, Jan 28, 2015 at 03:30:35PM +0000, Chris Wilson wrote:
> On Wed, Jan 28, 2015 at 05:03:14PM +0200, Mika Kuoppala wrote:
> > Now when we declare gpu errors only through our own dedicated
> > hangcheck workqueue there is no need to have a separate workqueue
> > for handling the resetting and waking up the clients as the deadlock
> > concerns are no more.
> > 
> > The only exception is i915_debugfs::i915_set_wedged, which triggers
> > error handling through process context. However as this is only used through
> > test harness it is responsibility for test harness not to introduce hangs
> > through both debug interface and through hangcheck mechanism at the same time.
> > 
> > Remove gpu_error.work and let the hangcheck work do the tasks it used to.
> > 
> > v2: Add a big warning sign into i915_debugfs::i915_set_wedged (Chris)
> > 
> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
> > Signed-off-by: Mika Kuoppala <mika.kuoppala@intel.com>
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>

Queued for -next, thanks for the patch.
-Daniel
Chris Wilson Feb. 2, 2015, 9:17 a.m. UTC | #3
On Wed, Jan 28, 2015 at 05:03:14PM +0200, Mika Kuoppala wrote:
> @@ -2616,6 +2612,9 @@ void i915_handle_error(struct drm_device *dev, bool wedged,
>  	va_list args;
>  	char error_msg[80];
>  
> +	if (WARN_ON(mutex_is_locked(&dev_priv->dev->struct_mutex)))
> +		return;
> +

Oops, sorry, I should have realised this was wrong earlier. The mutex
breaking occurs later in i915_handle_error.
-Chris
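
Chris's objection can be illustrated with a small sketch (hypothetical names; plain atomics and a flag instead of the real waitqueues and struct_mutex). Waiters holding the lock only drop it *after* the handler bumps the reset counter and wakes them, so bailing out early because the mutex is currently held prevents the very lock breaking the handler is supposed to perform:

```c
#include <stdatomic.h>
#include <stdbool.h>

/* Analogy for the "mutex breaking" inside i915_handle_error(): an odd
 * reset_counter value means a reset is in progress; a woken waiter that
 * holds the lock notices this, backs off, and releases the lock. */

static atomic_uint reset_counter;
static bool mutex_held;                 /* struct_mutex stand-in */

static bool reset_in_progress(void)
{
        return atomic_load(&reset_counter) & 1;   /* odd = in progress */
}

static void wake_waiters(void)          /* i915_error_wake_up() analogue */
{
        if (mutex_held && reset_in_progress())
                mutex_held = false;     /* waiter drops the lock, retries later */
}

static void handle_error(void)
{
        /* Wrong place for "if (mutex_held) return;": the hold is released
         * as a *consequence* of the counter bump and wakeup below. */
        atomic_fetch_add(&reset_counter, 1);  /* declare reset in progress */
        wake_waiters();                       /* waiters see it and back off */
}
```

An early return on mutex_is_locked() therefore skips both the counter bump and the wakeup, which is why the stress tests below go from fail to timeout: the reset they depend on never runs.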
Daniel Vetter Feb. 2, 2015, 9:38 a.m. UTC | #4
On Mon, Feb 02, 2015 at 09:17:14AM +0000, Chris Wilson wrote:
> On Wed, Jan 28, 2015 at 05:03:14PM +0200, Mika Kuoppala wrote:
> > @@ -2616,6 +2612,9 @@ void i915_handle_error(struct drm_device *dev, bool wedged,
> >  	va_list args;
> >  	char error_msg[80];
> >  
> > +	if (WARN_ON(mutex_is_locked(&dev_priv->dev->struct_mutex)))
> > +		return;
> > +
> 
> Oops, sorry, I should have realised this was wrong earlier. The mutex
> breaking occurs later in i915_handle_error.

Oh well, already merged. Also, prts seems to complain that a bunch of
hang stress-tests changed from fail to timeout because of this one here.
Did this patch accidentally fix a bug, meaning we just need to tune the tests,
or is there some new deadlock now? prts results are really thin, per usual
:(

Thanks, Daniel
Chris Wilson Feb. 2, 2015, 10:08 a.m. UTC | #5
On Mon, Feb 02, 2015 at 10:38:19AM +0100, Daniel Vetter wrote:
> On Mon, Feb 02, 2015 at 09:17:14AM +0000, Chris Wilson wrote:
> > On Wed, Jan 28, 2015 at 05:03:14PM +0200, Mika Kuoppala wrote:
> > > @@ -2616,6 +2612,9 @@ void i915_handle_error(struct drm_device *dev, bool wedged,
> > >  	va_list args;
> > >  	char error_msg[80];
> > >  
> > > +	if (WARN_ON(mutex_is_locked(&dev_priv->dev->struct_mutex)))
> > > +		return;
> > > +
> > 
> > Oops, sorry, I should have realised this was wrong earlier. The mutex
> > breaking occurs later in i915_handle_error.
> 
> Oh well, already merged. Also, prts seems to complain that a bunch of
> hang stress-tests changed from fail to timeout because of this one here.
> Is this patch accidentally fix a bug and we just need to tune the tests,
> or is there some new deadlock now? prts results are really thin, per usual
> :(

Yes. It will also prevent the gpu reset which those tests depend upon.
-Chris
Mika Kuoppala Feb. 3, 2015, 2:05 p.m. UTC | #6
Chris Wilson <chris@chris-wilson.co.uk> writes:

> On Mon, Feb 02, 2015 at 10:38:19AM +0100, Daniel Vetter wrote:
>> On Mon, Feb 02, 2015 at 09:17:14AM +0000, Chris Wilson wrote:
>> > On Wed, Jan 28, 2015 at 05:03:14PM +0200, Mika Kuoppala wrote:
>> > > @@ -2616,6 +2612,9 @@ void i915_handle_error(struct drm_device *dev, bool wedged,
>> > >  	va_list args;
>> > >  	char error_msg[80];
>> > >  
>> > > +	if (WARN_ON(mutex_is_locked(&dev_priv->dev->struct_mutex)))
>> > > +		return;
>> > > +
>> > 
>> > Oops, sorry, I should have realised this was wrong earlier. The mutex
>> > breaking occurs later in i915_handle_error.
>> 
>> Oh well, already merged. Also, prts seems to complain that a bunch of
>> hang stress-tests changed from fail to timeout because of this one here.
>> Is this patch accidentally fix a bug and we just need to tune the tests,
>> or is there some new deadlock now? prts results are really thin, per usual
>> :(
>
> Yes. It will also prevent the gpu reset which those tests depend upon.

Jani pointed me at the dups. There will be more, I think. The check
should have been deeper in the reset handling, and instead of
bailing out we should at least have requeued ourselves.

Sorry.
-Mika

> -Chris
>
> -- 
> Chris Wilson, Intel Open Source Technology Centre

Patch

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 3b332a4..211d494 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3969,6 +3969,17 @@  i915_wedged_set(void *data, u64 val)
 	struct drm_device *dev = data;
 	struct drm_i915_private *dev_priv = dev->dev_private;
 
+	/*
+	 * There is no safeguard against this debugfs entry colliding
+	 * with the hangcheck calling same i915_handle_error() in
+	 * parallel, causing an explosion. For now we assume that the
+	 * test harness is responsible enough not to inject gpu hangs
+	 * while it is writing to 'i915_wedged'
+	 */
+
+	if (i915_reset_in_progress(&dev_priv->gpu_error))
+		return -EAGAIN;
+
 	intel_runtime_pm_get(dev_priv);
 
 	i915_handle_error(dev, val,
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 6eaf795..1a46787 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -945,7 +945,6 @@  int i915_driver_unload(struct drm_device *dev)
 
 	/* Free error state after interrupts are fully disabled. */
 	cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
-	cancel_work_sync(&dev_priv->gpu_error.work);
 	i915_destroy_error_state(dev);
 
 	if (dev->pdev->msi_enabled)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index b09173f..07f99ca 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1352,8 +1352,6 @@  struct i915_gpu_error {
 	spinlock_t lock;
 	/* Protected by the above dev->gpu_error.lock. */
 	struct drm_i915_error_state *first_error;
-	struct work_struct work;
-
 
 	unsigned long missed_irq_rings;
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 234b1f7..44dbf78 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2421,19 +2421,15 @@  static void i915_error_wake_up(struct drm_i915_private *dev_priv,
 }
 
 /**
- * i915_error_work_func - do process context error handling work
- * @work: work struct
+ * i915_reset_and_wakeup - do process context error handling work
  *
  * Fire an error uevent so userspace can see that a hang or error
  * was detected.
  */
-static void i915_error_work_func(struct work_struct *work)
+static void i915_reset_and_wakeup(struct drm_device *dev)
 {
-	struct i915_gpu_error *error = container_of(work, struct i915_gpu_error,
-						    work);
-	struct drm_i915_private *dev_priv =
-		container_of(error, struct drm_i915_private, gpu_error);
-	struct drm_device *dev = dev_priv->dev;
+	struct drm_i915_private *dev_priv = to_i915(dev);
+	struct i915_gpu_error *error = &dev_priv->gpu_error;
 	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
 	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
 	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
@@ -2600,10 +2596,10 @@  static void i915_report_and_clear_eir(struct drm_device *dev)
 }
 
 /**
- * i915_handle_error - handle an error interrupt
+ * i915_handle_error - handle a gpu error
  * @dev: drm device
  *
- * Do some basic checking of regsiter state at error interrupt time and
+ * Do some basic checking of regsiter state at error time and
  * dump it to the syslog.  Also call i915_capture_error_state() to make
  * sure we get a record and make it available in debugfs.  Fire a uevent
  * so userspace knows something bad happened (should trigger collection
@@ -2616,6 +2612,9 @@  void i915_handle_error(struct drm_device *dev, bool wedged,
 	va_list args;
 	char error_msg[80];
 
+	if (WARN_ON(mutex_is_locked(&dev_priv->dev->struct_mutex)))
+		return;
+
 	va_start(args, fmt);
 	vscnprintf(error_msg, sizeof(error_msg), fmt, args);
 	va_end(args);
@@ -2628,9 +2627,9 @@  void i915_handle_error(struct drm_device *dev, bool wedged,
 				&dev_priv->gpu_error.reset_counter);
 
 		/*
-		 * Wakeup waiting processes so that the reset work function
-		 * i915_error_work_func doesn't deadlock trying to grab various
-		 * locks. By bumping the reset counter first, the woken
+		 * Wakeup waiting processes so that the reset function
+		 * i915_reset_and_wakeup doesn't deadlock trying to grab
+		 * various locks. By bumping the reset counter first, the woken
 		 * processes will see a reset in progress and back off,
 		 * releasing their locks and then wait for the reset completion.
 		 * We must do this for _all_ gpu waiters that might hold locks
@@ -2643,13 +2642,7 @@  void i915_handle_error(struct drm_device *dev, bool wedged,
 		i915_error_wake_up(dev_priv, false);
 	}
 
-	/*
-	 * Our reset work can grab modeset locks (since it needs to reset the
-	 * state of outstanding pagelips). Hence it must not be run on our own
-	 * dev-priv->wq work queue for otherwise the flush_work in the pageflip
-	 * code will deadlock.
-	 */
-	schedule_work(&dev_priv->gpu_error.work);
+	i915_reset_and_wakeup(dev);
 }
 
 /* Called from drm generic code, passed 'crtc' which
@@ -4345,7 +4338,6 @@  void intel_irq_init(struct drm_i915_private *dev_priv)
 
 	INIT_WORK(&dev_priv->hotplug_work, i915_hotplug_work_func);
 	INIT_WORK(&dev_priv->dig_port_work, i915_digport_work_func);
-	INIT_WORK(&dev_priv->gpu_error.work, i915_error_work_func);
 	INIT_WORK(&dev_priv->rps.work, gen6_pm_rps_work);
 	INIT_WORK(&dev_priv->l3_parity.error_work, ivybridge_parity_work);