diff mbox series

[32/33] drm/i915/gt: Expose heartbeat interval via sysfs

Message ID 20191212140459.1307617-32-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [01/33] drm/i915: Use EAGAIN for trylock failures | expand

Commit Message

Chris Wilson Dec. 12, 2019, 2:04 p.m. UTC
We monitor the health of the system via periodic heartbeat pulses. The
pulses also provide the opportunity to perform garbage collection.
However, we interpret an incomplete pulse (a missed heartbeat) as an
indication that the system is no longer responsive, i.e. hung, and
perform an engine or full GPU reset. Given that the preemption
granularity can be very coarse on a system, we let the sysadmin override
our legacy timeouts which were "optimised" for desktop applications.

The heartbeat interval can be adjusted per-engine using,

	/sys/class/drm/card?/engine/*/heartbeat_interval_ms

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/Kconfig.profile         |  3 ++
 drivers/gpu/drm/i915/gt/intel_engine_sysfs.c | 47 ++++++++++++++++++++
 2 files changed, 50 insertions(+)

Comments

Steve Carbonari Jan. 22, 2020, 9:38 p.m. UTC | #1
On Thu, Dec 12, 2019 at 02:04:58PM +0000, Chris Wilson wrote:
> We monitor the health of the system via periodic heartbeat pulses. The
> pulses also provide the opportunity to perform garbage collection.
> However, we interpret an incomplete pulse (a missed heartbeat) as an
> indication that the system is no longer responsive, i.e. hung, and
> perform an engine or full GPU reset. Given that the preemption
> granularity can be very coarse on a system, we let the sysadmin override
> our legacy timeouts which were "optimised" for desktop applications.
> 
> The heartbeat interval can be adjusted per-engine using,
> 
> 	/sys/class/drm/card?/engine/*/heartbeat_interval_ms
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Code looks good.
Performed light testing with other sysfs related patches in the series.
The heartbeat_interval_ms file exists and can be modified.

Reviewed-by: Steve Carbonari <steven.carbonari@intel.com>
Tested-by: Steve Carbonari <steven.carbonari@intel.com


> ---
>  drivers/gpu/drm/i915/Kconfig.profile         |  3 ++
>  drivers/gpu/drm/i915/gt/intel_engine_sysfs.c | 47 ++++++++++++++++++++
>  2 files changed, 50 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/Kconfig.profile b/drivers/gpu/drm/i915/Kconfig.profile
> index 1f4e98a8532f..ba8767fc0d6e 100644
> --- a/drivers/gpu/drm/i915/Kconfig.profile
> +++ b/drivers/gpu/drm/i915/Kconfig.profile
> @@ -20,6 +20,9 @@ config DRM_I915_HEARTBEAT_INTERVAL
>  	  check the health of the GPU and undertake regular house-keeping of
>  	  internal driver state.
>  
> +	  This is adjustable via
> +	  /sys/class/drm/card?/engine/*/heartbeat_interval_ms
> +
>  	  May be 0 to disable heartbeats and therefore disable automatic GPU
>  	  hang detection.
>  
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c b/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
> index d299c66cf7ec..33b4c00b93f2 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
> @@ -9,6 +9,7 @@
>  
>  #include "i915_drv.h"
>  #include "intel_engine.h"
> +#include "intel_engine_heartbeat.h"
>  #include "intel_engine_sysfs.h"
>  
>  struct kobj_engine {
> @@ -315,6 +316,49 @@ preempt_timeout_show(struct kobject *kobj, struct kobj_attribute *attr,
>  static struct kobj_attribute preempt_timeout_attr =
>  __ATTR(preempt_timeout_ms, 0644, preempt_timeout_show, preempt_timeout_store);
>  
> +static ssize_t
> +heartbeat_store(struct kobject *kobj, struct kobj_attribute *attr,
> +		const char *buf, size_t count)
> +{
> +	struct intel_engine_cs *engine = kobj_to_engine(kobj);
> +	unsigned long long delay;
> +	int err;
> +
> +	/*
> +	 * We monitor the health of the system via periodic heartbeat pulses.
> +	 * The pulses also provide the opportunity to perform garbage
> +	 * collection.  However, we interpret an incomplete pulse (a missed
> +	 * heartbeat) as an indication that the system is no longer responsive,
> +	 * i.e. hung, and perform an engine or full GPU reset. Given that the
> +	 * preemption granularity can be very coarse on a system, the optimal
> +	 * value for any workload is unknowable!
> +	 */
> +
> +	err = kstrtoull(buf, 0, &delay);
> +	if (err)
> +		return err;
> +
> +	if (delay >= jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT))
> +		return -EINVAL;
> +
> +	err = intel_engine_set_heartbeat(engine, delay);
> +	if (err)
> +		return err;
> +
> +	return count;
> +}
> +
> +static ssize_t
> +heartbeat_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
> +{
> +	struct intel_engine_cs *engine = kobj_to_engine(kobj);
> +
> +	return sprintf(buf, "%lu\n", engine->props.heartbeat_interval_ms);
> +}
> +
> +static struct kobj_attribute heartbeat_interval_attr =
> +__ATTR(heartbeat_interval_ms, 0644, heartbeat_show, heartbeat_store);
> +
>  static void kobj_engine_release(struct kobject *kobj)
>  {
>  	kfree(kobj);
> @@ -357,6 +401,9 @@ void intel_engines_add_sysfs(struct drm_i915_private *i915)
>  		&all_caps_attr.attr,
>  		&max_spin_attr.attr,
>  		&stop_timeout_attr.attr,
> +#if CONFIG_DRM_I915_HEARTBEAT_INTERVAL
> +		&heartbeat_interval_attr.attr,
> +#endif
>  		NULL
>  	};
>
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/Kconfig.profile b/drivers/gpu/drm/i915/Kconfig.profile
index 1f4e98a8532f..ba8767fc0d6e 100644
--- a/drivers/gpu/drm/i915/Kconfig.profile
+++ b/drivers/gpu/drm/i915/Kconfig.profile
@@ -20,6 +20,9 @@  config DRM_I915_HEARTBEAT_INTERVAL
 	  check the health of the GPU and undertake regular house-keeping of
 	  internal driver state.
 
+	  This is adjustable via
+	  /sys/class/drm/card?/engine/*/heartbeat_interval_ms
+
 	  May be 0 to disable heartbeats and therefore disable automatic GPU
 	  hang detection.
 
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c b/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
index d299c66cf7ec..33b4c00b93f2 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_sysfs.c
@@ -9,6 +9,7 @@ 
 
 #include "i915_drv.h"
 #include "intel_engine.h"
+#include "intel_engine_heartbeat.h"
 #include "intel_engine_sysfs.h"
 
 struct kobj_engine {
@@ -315,6 +316,49 @@  preempt_timeout_show(struct kobject *kobj, struct kobj_attribute *attr,
 static struct kobj_attribute preempt_timeout_attr =
 __ATTR(preempt_timeout_ms, 0644, preempt_timeout_show, preempt_timeout_store);
 
+static ssize_t
+heartbeat_store(struct kobject *kobj, struct kobj_attribute *attr,
+		const char *buf, size_t count)
+{
+	struct intel_engine_cs *engine = kobj_to_engine(kobj);
+	unsigned long long delay;
+	int err;
+
+	/*
+	 * We monitor the health of the system via periodic heartbeat pulses.
+	 * The pulses also provide the opportunity to perform garbage
+	 * collection.  However, we interpret an incomplete pulse (a missed
+	 * heartbeat) as an indication that the system is no longer responsive,
+	 * i.e. hung, and perform an engine or full GPU reset. Given that the
+	 * preemption granularity can be very coarse on a system, the optimal
+	 * value for any workload is unknowable!
+	 */
+
+	err = kstrtoull(buf, 0, &delay);
+	if (err)
+		return err;
+
+	if (delay >= jiffies_to_msecs(MAX_SCHEDULE_TIMEOUT))
+		return -EINVAL;
+
+	err = intel_engine_set_heartbeat(engine, delay);
+	if (err)
+		return err;
+
+	return count;
+}
+
+static ssize_t
+heartbeat_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	struct intel_engine_cs *engine = kobj_to_engine(kobj);
+
+	return sprintf(buf, "%lu\n", engine->props.heartbeat_interval_ms);
+}
+
+static struct kobj_attribute heartbeat_interval_attr =
+__ATTR(heartbeat_interval_ms, 0644, heartbeat_show, heartbeat_store);
+
 static void kobj_engine_release(struct kobject *kobj)
 {
 	kfree(kobj);
@@ -357,6 +401,9 @@  void intel_engines_add_sysfs(struct drm_i915_private *i915)
 		&all_caps_attr.attr,
 		&max_spin_attr.attr,
 		&stop_timeout_attr.attr,
+#if CONFIG_DRM_I915_HEARTBEAT_INTERVAL
+		&heartbeat_interval_attr.attr,
+#endif
 		NULL
 	};