diff mbox series

drm/i915: Watchdog timeout: Blindly trust watchdog timeout for reset?

Message ID 20190105024001.37629-9-carlos.santa@intel.com (mailing list archive)
State New, archived
Headers show
Series drm/i915: Watchdog timeout: Blindly trust watchdog timeout for reset? | expand

Commit Message

Santa, Carlos Jan. 5, 2019, 2:40 a.m. UTC
From: Michel Thierry <michel.thierry@intel.com>

XXX: What to do when the watchdog irq fired twice but our hangcheck
logic thinks the engine is not hung? For example, what if the
active-head moved since the irq handler?

One option is to just ignore the watchdog, if the engine is really hung,
then the driver will detect the hang by itself later on (I'm inclined to
this).

But the other option is to blindly trust the HW, which is what this patch
does...

CC: Antonio Argenziano <antonio.argenziano@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Signed-off-by: Michel Thierry <michel.thierry@intel.com>
Signed-off-by: Carlos Santa <carlos.santa@intel.com>
---
 drivers/gpu/drm/i915/intel_hangcheck.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

Comments

kernel test robot Jan. 5, 2019, 4:15 a.m. UTC | #1
Hi Michel,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on drm-intel/for-linux-next]
[also build test ERROR on v4.20 next-20190103]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Carlos-Santa/drm-i915-Watchdog-timeout-Blindly-trust-watchdog-timeout-for-reset/20190105-111445
base:   git://anongit.freedesktop.org/drm-intel for-linux-next
config: x86_64-randconfig-x012-201900 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   drivers/gpu/drm/i915/intel_hangcheck.c: In function 'i915_hangcheck_elapsed':
>> drivers/gpu/drm/i915/intel_hangcheck.c:443:24: error: 'struct intel_engine_hangcheck' has no member named 'watchdog'
          engine->hangcheck.watchdog == intel_engine_get_seqno(engine)) {
                           ^

vim +443 drivers/gpu/drm/i915/intel_hangcheck.c

   400	
   401	/*
   402	 * This is called when the chip hasn't reported back with completed
   403	 * batchbuffers in a long time. We keep track per ring seqno progress and
   404	 * if there are no progress, hangcheck score for that ring is increased.
   405	 * Further, acthd is inspected to see if the ring is stuck. On stuck case
   406	 * we kick the ring. If we see no progress on three subsequent calls
   407	 * we assume chip is wedged and try to fix it by resetting the chip.
   408	 */
   409	static void i915_hangcheck_elapsed(struct work_struct *work)
   410	{
   411		struct drm_i915_private *dev_priv =
   412			container_of(work, typeof(*dev_priv),
   413				     gpu_error.hangcheck_work.work);
   414		struct intel_engine_cs *engine;
   415		enum intel_engine_id id;
   416		unsigned int hung = 0, stuck = 0, wedged = 0;
   417	
   418		if (!i915_modparams.enable_hangcheck)
   419			return;
   420	
   421		if (!READ_ONCE(dev_priv->gt.awake))
   422			return;
   423	
   424		if (i915_terminally_wedged(&dev_priv->gpu_error))
   425			return;
   426	
   427		/* As enabling the GPU requires fairly extensive mmio access,
   428		 * periodically arm the mmio checker to see if we are triggering
   429		 * any invalid access.
   430		 */
   431		intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
   432	
   433		for_each_engine(engine, dev_priv, id) {
   434			struct intel_engine_hangcheck hc;
   435	
   436			semaphore_clear_deadlocks(dev_priv);
   437	
   438			hangcheck_load_sample(engine, &hc);
   439			hangcheck_accumulate_sample(engine, &hc);
   440			hangcheck_store_sample(engine, &hc);
   441	
   442			if (engine->hangcheck.stalled ||
 > 443			    engine->hangcheck.watchdog == intel_engine_get_seqno(engine)) {
   444				hung |= intel_engine_flag(engine);
   445				if (hc.action != ENGINE_DEAD)
   446					stuck |= intel_engine_flag(engine);
   447			}
   448	
   449			if (engine->hangcheck.wedged)
   450				wedged |= intel_engine_flag(engine);
   451		}
   452	
   453		if (wedged) {
   454			dev_err(dev_priv->drm.dev,
   455				"GPU recovery timed out,"
   456				" cancelling all in-flight rendering.\n");
   457			GEM_TRACE_DUMP();
   458			i915_gem_set_wedged(dev_priv);
   459		}
   460	
   461		if (hung)
   462			hangcheck_declare_hang(dev_priv, hung, stuck);
   463	
   464		/* Reset timer in case GPU hangs without another request being added */
   465		i915_queue_hangcheck(dev_priv);
   466	}
   467	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
kernel test robot Jan. 5, 2019, 1:32 p.m. UTC | #2
Hi Michel,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on drm-intel/for-linux-next]
[also build test WARNING on v4.20 next-20190103]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Carlos-Santa/drm-i915-Watchdog-timeout-Blindly-trust-watchdog-timeout-for-reset/20190105-111445
base:   git://anongit.freedesktop.org/drm-intel for-linux-next
config: x86_64-randconfig-s4-01052002 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All warnings (new ones prefixed by >>):

   In file included from include/asm-generic/bug.h:5:0,
                    from arch/x86/include/asm/bug.h:47,
                    from include/linux/bug.h:5,
                    from include/linux/mmdebug.h:5,
                    from include/linux/gfp.h:5,
                    from include/linux/slab.h:15,
                    from include/linux/io-mapping.h:22,
                    from drivers/gpu/drm/i915/i915_drv.h:36,
                    from drivers/gpu/drm/i915/intel_hangcheck.c:25:
   drivers/gpu/drm/i915/intel_hangcheck.c: In function 'i915_hangcheck_elapsed':
   drivers/gpu/drm/i915/intel_hangcheck.c:443:24: error: 'struct intel_engine_hangcheck' has no member named 'watchdog'
          engine->hangcheck.watchdog == intel_engine_get_seqno(engine)) {
                           ^
   include/linux/compiler.h:58:30: note: in definition of macro '__trace_if'
     if (__builtin_constant_p(!!(cond)) ? !!(cond) :   \
                                 ^~~~
>> drivers/gpu/drm/i915/intel_hangcheck.c:442:3: note: in expansion of macro 'if'
      if (engine->hangcheck.stalled ||
      ^~
   drivers/gpu/drm/i915/intel_hangcheck.c:443:24: error: 'struct intel_engine_hangcheck' has no member named 'watchdog'
          engine->hangcheck.watchdog == intel_engine_get_seqno(engine)) {
                           ^
   include/linux/compiler.h:58:42: note: in definition of macro '__trace_if'
     if (__builtin_constant_p(!!(cond)) ? !!(cond) :   \
                                             ^~~~
>> drivers/gpu/drm/i915/intel_hangcheck.c:442:3: note: in expansion of macro 'if'
      if (engine->hangcheck.stalled ||
      ^~
   drivers/gpu/drm/i915/intel_hangcheck.c:443:24: error: 'struct intel_engine_hangcheck' has no member named 'watchdog'
          engine->hangcheck.watchdog == intel_engine_get_seqno(engine)) {
                           ^
   include/linux/compiler.h:69:16: note: in definition of macro '__trace_if'
      ______r = !!(cond);     \
                   ^~~~
>> drivers/gpu/drm/i915/intel_hangcheck.c:442:3: note: in expansion of macro 'if'
      if (engine->hangcheck.stalled ||
      ^~

vim +/if +442 drivers/gpu/drm/i915/intel_hangcheck.c

   400	
   401	/*
   402	 * This is called when the chip hasn't reported back with completed
   403	 * batchbuffers in a long time. We keep track per ring seqno progress and
   404	 * if there are no progress, hangcheck score for that ring is increased.
   405	 * Further, acthd is inspected to see if the ring is stuck. On stuck case
   406	 * we kick the ring. If we see no progress on three subsequent calls
   407	 * we assume chip is wedged and try to fix it by resetting the chip.
   408	 */
   409	static void i915_hangcheck_elapsed(struct work_struct *work)
   410	{
   411		struct drm_i915_private *dev_priv =
   412			container_of(work, typeof(*dev_priv),
   413				     gpu_error.hangcheck_work.work);
   414		struct intel_engine_cs *engine;
   415		enum intel_engine_id id;
   416		unsigned int hung = 0, stuck = 0, wedged = 0;
   417	
   418		if (!i915_modparams.enable_hangcheck)
   419			return;
   420	
   421		if (!READ_ONCE(dev_priv->gt.awake))
   422			return;
   423	
   424		if (i915_terminally_wedged(&dev_priv->gpu_error))
   425			return;
   426	
   427		/* As enabling the GPU requires fairly extensive mmio access,
   428		 * periodically arm the mmio checker to see if we are triggering
   429		 * any invalid access.
   430		 */
   431		intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
   432	
   433		for_each_engine(engine, dev_priv, id) {
   434			struct intel_engine_hangcheck hc;
   435	
   436			semaphore_clear_deadlocks(dev_priv);
   437	
   438			hangcheck_load_sample(engine, &hc);
   439			hangcheck_accumulate_sample(engine, &hc);
   440			hangcheck_store_sample(engine, &hc);
   441	
 > 442			if (engine->hangcheck.stalled ||
   443			    engine->hangcheck.watchdog == intel_engine_get_seqno(engine)) {
   444				hung |= intel_engine_flag(engine);
   445				if (hc.action != ENGINE_DEAD)
   446					stuck |= intel_engine_flag(engine);
   447			}
   448	
   449			if (engine->hangcheck.wedged)
   450				wedged |= intel_engine_flag(engine);
   451		}
   452	
   453		if (wedged) {
   454			dev_err(dev_priv->drm.dev,
   455				"GPU recovery timed out,"
   456				" cancelling all in-flight rendering.\n");
   457			GEM_TRACE_DUMP();
   458			i915_gem_set_wedged(dev_priv);
   459		}
   460	
   461		if (hung)
   462			hangcheck_declare_hang(dev_priv, hung, stuck);
   463	
   464		/* Reset timer in case GPU hangs without another request being added */
   465		i915_queue_hangcheck(dev_priv);
   466	}
   467	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 2906f0ef3d77..1947baa20022 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -281,7 +281,8 @@  static void i915_hangcheck_elapsed(struct work_struct *work)
 		hangcheck_accumulate_sample(engine, &hc);
 		hangcheck_store_sample(engine, &hc);
 
-		if (engine->hangcheck.stalled) {
+		if (engine->hangcheck.stalled ||
+		    engine->hangcheck.watchdog == intel_engine_get_seqno(engine)) {
 			hung |= intel_engine_flag(engine);
 			if (hc.action != ENGINE_DEAD)
 				stuck |= intel_engine_flag(engine);