diff mbox

drm/i915/selftests: Add a safety net to live_workarounds

Message ID 20180711095950.4689-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson July 11, 2018, 9:59 a.m. UTC
Since live_workarounds pokes around the w/a registers and checks to see
if they survive across a reset, we are prone to fouling the machine and
leaving it in a non-recoverable state. Wrap the probe inside a timeout
to abort the test if the reset fails.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107188
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/selftests/igt_wedge_me.h | 48 +++++++++++++++++++
 .../drm/i915/selftests/intel_workarounds.c    |  8 +++-
 2 files changed, 55 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/i915/selftests/igt_wedge_me.h

Comments

Mika Kuoppala July 11, 2018, 11:27 a.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Since live_workarounds pokes around the w/a registers and checks to see
> if they survive across a reset, we are prone to fouling the machine and
> leaving it in a non-recoverable state. Wrap the probe inside a timeout
> to abort the test if the reset fails.
>
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107188

How can we tell it is not about just reset flakiness but
associated whitelist poking?

-Mika

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/selftests/igt_wedge_me.h | 48 +++++++++++++++++++
>  .../drm/i915/selftests/intel_workarounds.c    |  8 +++-
>  2 files changed, 55 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/gpu/drm/i915/selftests/igt_wedge_me.h
>
> diff --git a/drivers/gpu/drm/i915/selftests/igt_wedge_me.h b/drivers/gpu/drm/i915/selftests/igt_wedge_me.h
> new file mode 100644
> index 000000000000..d2518cf9a5c8
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/selftests/igt_wedge_me.h
> @@ -0,0 +1,48 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2018 Intel Corporation
> + */
> +
> +#ifndef IGT_WEDGE_ME_H
> +#define IGT_WEDGE_ME_H
> +
> +struct igt_wedge_me {
> +	struct delayed_work work;
> +	struct drm_i915_private *i915;
> +	const char *name;
> +};
> +
> +static void __igt_wedge_me(struct work_struct *work)
> +{
> +	struct igt_wedge_me *w = container_of(work, typeof(*w), work.work);
> +
> +	pr_err("%s timed out, cancelling test.\n", w->name);
> +	i915_gem_set_wedged(w->i915);
> +}
> +
> +static void __igt_init_wedge(struct igt_wedge_me *w,
> +			     struct drm_i915_private *i915,
> +			     long timeout,
> +			     const char *name)
> +{
> +	w->i915 = i915;
> +	w->name = name;
> +
> +	INIT_DELAYED_WORK_ONSTACK(&w->work, __igt_wedge_me);
> +	schedule_delayed_work(&w->work, timeout);
> +}
> +
> +static void __igt_fini_wedge(struct igt_wedge_me *w)
> +{
> +	cancel_delayed_work_sync(&w->work);
> +	destroy_delayed_work_on_stack(&w->work);
> +	w->i915 = NULL;
> +}
> +
> +#define igt_wedge_on_timeout(W, DEV, TIMEOUT)				\
> +	for (__igt_init_wedge((W), (DEV), (TIMEOUT), __func__);		\
> +	     (W)->i915;							\
> +	     __igt_fini_wedge((W)))
> +
> +#endif /* IGT_WEDGE_ME_H */
> diff --git a/drivers/gpu/drm/i915/selftests/intel_workarounds.c b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
> index fafdec3fe83e..0d39b3bf0c0d 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_workarounds.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
> @@ -6,6 +6,7 @@
>  
>  #include "../i915_selftest.h"
>  
> +#include "igt_wedge_me.h"
>  #include "mock_context.h"
>  
>  static struct drm_i915_gem_object *
> @@ -111,6 +112,7 @@ static int check_whitelist(const struct whitelist *w,
>  			   struct intel_engine_cs *engine)
>  {
>  	struct drm_i915_gem_object *results;
> +	struct igt_wedge_me wedge;
>  	u32 *vaddr;
>  	int err;
>  	int i;
> @@ -119,7 +121,11 @@ static int check_whitelist(const struct whitelist *w,
>  	if (IS_ERR(results))
>  		return PTR_ERR(results);
>  
> -	err = i915_gem_object_set_to_cpu_domain(results, false);
> +	err = 0;
> +	igt_wedge_on_timeout(&wedge, ctx->i915, HZ / 5) /* a safety net! */
> +		err = i915_gem_object_set_to_cpu_domain(results, false);
> +	if (i915_terminally_wedged(&ctx->i915->gpu_error))
> +		err = -EIO;
>  	if (err)
>  		goto out_put;
>  
> -- 
> 2.18.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson July 11, 2018, 11:39 a.m. UTC | #2
Quoting Mika Kuoppala (2018-07-11 12:27:49)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Since live_workarounds pokes around the w/a registers and checks to see
> > if they survive across a reset, we are prone to fouling the machine and
> > leaving it in a non-recoverable state. Wrap the probe inside a timeout
> > to abort the test if the reset fails.
> >
> > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107188
> 
> How can we tell it is not about just reset flakiness but
> associated whitelist poking?

By reading the dmesg. All we are doing here is breaking the indefinite
wait (as we've disabled hangcheck so as to not interfere with our own
resets) before owatch declares INCOMPLETE.
-Chris
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/selftests/igt_wedge_me.h b/drivers/gpu/drm/i915/selftests/igt_wedge_me.h
new file mode 100644
index 000000000000..d2518cf9a5c8
--- /dev/null
+++ b/drivers/gpu/drm/i915/selftests/igt_wedge_me.h
@@ -0,0 +1,48 @@ 
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2018 Intel Corporation
+ */
+
+#ifndef IGT_WEDGE_ME_H
+#define IGT_WEDGE_ME_H
+
+struct igt_wedge_me {
+	struct delayed_work work;
+	struct drm_i915_private *i915;
+	const char *name;
+};
+
+static void __igt_wedge_me(struct work_struct *work)
+{
+	struct igt_wedge_me *w = container_of(work, typeof(*w), work.work);
+
+	pr_err("%s timed out, cancelling test.\n", w->name);
+	i915_gem_set_wedged(w->i915);
+}
+
+static void __igt_init_wedge(struct igt_wedge_me *w,
+			     struct drm_i915_private *i915,
+			     long timeout,
+			     const char *name)
+{
+	w->i915 = i915;
+	w->name = name;
+
+	INIT_DELAYED_WORK_ONSTACK(&w->work, __igt_wedge_me);
+	schedule_delayed_work(&w->work, timeout);
+}
+
+static void __igt_fini_wedge(struct igt_wedge_me *w)
+{
+	cancel_delayed_work_sync(&w->work);
+	destroy_delayed_work_on_stack(&w->work);
+	w->i915 = NULL;
+}
+
+#define igt_wedge_on_timeout(W, DEV, TIMEOUT)				\
+	for (__igt_init_wedge((W), (DEV), (TIMEOUT), __func__);		\
+	     (W)->i915;							\
+	     __igt_fini_wedge((W)))
+
+#endif /* IGT_WEDGE_ME_H */
diff --git a/drivers/gpu/drm/i915/selftests/intel_workarounds.c b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
index fafdec3fe83e..0d39b3bf0c0d 100644
--- a/drivers/gpu/drm/i915/selftests/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
@@ -6,6 +6,7 @@ 
 
 #include "../i915_selftest.h"
 
+#include "igt_wedge_me.h"
 #include "mock_context.h"
 
 static struct drm_i915_gem_object *
@@ -111,6 +112,7 @@  static int check_whitelist(const struct whitelist *w,
 			   struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_object *results;
+	struct igt_wedge_me wedge;
 	u32 *vaddr;
 	int err;
 	int i;
@@ -119,7 +121,11 @@  static int check_whitelist(const struct whitelist *w,
 	if (IS_ERR(results))
 		return PTR_ERR(results);
 
-	err = i915_gem_object_set_to_cpu_domain(results, false);
+	err = 0;
+	igt_wedge_on_timeout(&wedge, ctx->i915, HZ / 5) /* a safety net! */
+		err = i915_gem_object_set_to_cpu_domain(results, false);
+	if (i915_terminally_wedged(&ctx->i915->gpu_error))
+		err = -EIO;
 	if (err)
 		goto out_put;