diff mbox series

[v3,2/3] drm/i915/gt: Double check heartbeat timeout before resetting

Message ID 20210204110620.32422-2-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show
Series [v3,1/3] drm/i915/selftests: Restore previous heartbeat interval | expand

Commit Message

Chris Wilson Feb. 4, 2021, 11:06 a.m. UTC
Check that we have actually passed the heartbeat interval since last
checking the request before resetting the device.

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2780
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

Comments

Mika Kuoppala Feb. 4, 2021, 12:57 p.m. UTC | #1
Chris Wilson <chris@chris-wilson.co.uk> writes:

> Check that we have actually passed the heartbeat interval since last
> checking the request before resetting the device.
>
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2780
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c | 11 ++++++++++-
>  1 file changed, 10 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> index 48a91c0dbad6..93741a65924a 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> @@ -31,7 +31,7 @@ static bool next_heartbeat(struct intel_engine_cs *engine)
>  	delay = msecs_to_jiffies_timeout(delay);
>  	if (delay >= HZ)
>  		delay = round_jiffies_up_relative(delay);
> -	mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay);
> +	mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);
>  
>  	return true;
>  }
> @@ -103,6 +103,13 @@ static void heartbeat(struct work_struct *wrk)
>  		goto out;
>  
>  	if (engine->heartbeat.systole) {
> +		long delay = READ_ONCE(engine->props.heartbeat_interval_ms);
> +
> +		/* Safeguard against too-fast worker invocations */
> +		if (!time_after(jiffies,
> +				rq->emitted_jiffies + msecs_to_jiffies(delay)))
> +			goto out;
> +
>  		if (!i915_sw_fence_signaled(&rq->submit)) {
>  			/*
>  			 * Not yet submitted, system is stalled.
> @@ -139,6 +146,8 @@ static void heartbeat(struct work_struct *wrk)
>  					      "stopped heartbeat on %s",
>  					      engine->name);
>  		}
> +
> +		rq->emitted_jiffies = jiffies;

This could possibly interfere with throttling. But who would get a handle
to an internal request?

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

>  		goto out;
>  	}
>  
> -- 
> 2.20.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
Chris Wilson Feb. 4, 2021, 1:02 p.m. UTC | #2
Quoting Mika Kuoppala (2021-02-04 12:57:46)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Check that we have actually passed the heartbeat interval since last
> > checking the request before resetting the device.
> >
> > Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2780
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c | 11 ++++++++++-
> >  1 file changed, 10 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> > index 48a91c0dbad6..93741a65924a 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
> > @@ -31,7 +31,7 @@ static bool next_heartbeat(struct intel_engine_cs *engine)
> >       delay = msecs_to_jiffies_timeout(delay);
> >       if (delay >= HZ)
> >               delay = round_jiffies_up_relative(delay);
> > -     mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay);
> > +     mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);
> >  
> >       return true;
> >  }
> > @@ -103,6 +103,13 @@ static void heartbeat(struct work_struct *wrk)
> >               goto out;
> >  
> >       if (engine->heartbeat.systole) {
> > +             long delay = READ_ONCE(engine->props.heartbeat_interval_ms);
> > +
> > +             /* Safeguard against too-fast worker invocations */
> > +             if (!time_after(jiffies,
> > +                             rq->emitted_jiffies + msecs_to_jiffies(delay)))
> > +                     goto out;
> > +
> >               if (!i915_sw_fence_signaled(&rq->submit)) {
> >                       /*
> >                        * Not yet submitted, system is stalled.
> > @@ -139,6 +146,8 @@ static void heartbeat(struct work_struct *wrk)
> >                                             "stopped heartbeat on %s",
> >                                             engine->name);
> >               }
> > +
> > +             rq->emitted_jiffies = jiffies;
> 
> This could possibly interfere with throttling. But who would get a handle
> to an internal request?

Indeed. And it changes the meaning of the pretty printing in the debug
message, but I can live with that.
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
index 48a91c0dbad6..93741a65924a 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
@@ -31,7 +31,7 @@  static bool next_heartbeat(struct intel_engine_cs *engine)
 	delay = msecs_to_jiffies_timeout(delay);
 	if (delay >= HZ)
 		delay = round_jiffies_up_relative(delay);
-	mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay);
+	mod_delayed_work(system_highpri_wq, &engine->heartbeat.work, delay + 1);
 
 	return true;
 }
@@ -103,6 +103,13 @@  static void heartbeat(struct work_struct *wrk)
 		goto out;
 
 	if (engine->heartbeat.systole) {
+		long delay = READ_ONCE(engine->props.heartbeat_interval_ms);
+
+		/* Safeguard against too-fast worker invocations */
+		if (!time_after(jiffies,
+				rq->emitted_jiffies + msecs_to_jiffies(delay)))
+			goto out;
+
 		if (!i915_sw_fence_signaled(&rq->submit)) {
 			/*
 			 * Not yet submitted, system is stalled.
@@ -139,6 +146,8 @@  static void heartbeat(struct work_struct *wrk)
 					      "stopped heartbeat on %s",
 					      engine->name);
 		}
+
+		rq->emitted_jiffies = jiffies;
 		goto out;
 	}