Message ID | 20190105024001.37629-9-carlos.santa@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | drm/i915: Watchdog timeout: Blindly trust watchdog timeout for reset? | expand |
Hi Michel, Thank you for the patch! Yet something to improve: [auto build test ERROR on drm-intel/for-linux-next] [also build test ERROR on v4.20 next-20190103] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Carlos-Santa/drm-i915-Watchdog-timeout-Blindly-trust-watchdog-timeout-for-reset/20190105-111445 base: git://anongit.freedesktop.org/drm-intel for-linux-next config: x86_64-randconfig-x012-201900 (attached as .config) compiler: gcc-7 (Debian 7.3.0-1) 7.3.0 reproduce: # save the attached .config to linux build tree make ARCH=x86_64 All errors (new ones prefixed by >>): drivers/gpu/drm/i915/intel_hangcheck.c: In function 'i915_hangcheck_elapsed': >> drivers/gpu/drm/i915/intel_hangcheck.c:443:24: error: 'struct intel_engine_hangcheck' has no member named 'watchdog' engine->hangcheck.watchdog == intel_engine_get_seqno(engine)) { ^ vim +443 drivers/gpu/drm/i915/intel_hangcheck.c 400 401 /* 402 * This is called when the chip hasn't reported back with completed 403 * batchbuffers in a long time. We keep track per ring seqno progress and 404 * if there are no progress, hangcheck score for that ring is increased. 405 * Further, acthd is inspected to see if the ring is stuck. On stuck case 406 * we kick the ring. If we see no progress on three subsequent calls 407 * we assume chip is wedged and try to fix it by resetting the chip. 408 */ 409 static void i915_hangcheck_elapsed(struct work_struct *work) 410 { 411 struct drm_i915_private *dev_priv = 412 container_of(work, typeof(*dev_priv), 413 gpu_error.hangcheck_work.work); 414 struct intel_engine_cs *engine; 415 enum intel_engine_id id; 416 unsigned int hung = 0, stuck = 0, wedged = 0; 417 418 if (!i915_modparams.enable_hangcheck) 419 return; 420 421 if (!READ_ONCE(dev_priv->gt.awake)) 422 return; 423 424 if (i915_terminally_wedged(&dev_priv->gpu_error)) 425 return; 426 427 /* As enabling the GPU requires fairly extensive mmio access, 428 * periodically arm the mmio checker to see if we are triggering 429 * any invalid access. 430 */ 431 intel_uncore_arm_unclaimed_mmio_detection(dev_priv); 432 433 for_each_engine(engine, dev_priv, id) { 434 struct intel_engine_hangcheck hc; 435 436 semaphore_clear_deadlocks(dev_priv); 437 438 hangcheck_load_sample(engine, &hc); 439 hangcheck_accumulate_sample(engine, &hc); 440 hangcheck_store_sample(engine, &hc); 441 442 if (engine->hangcheck.stalled || > 443 engine->hangcheck.watchdog == intel_engine_get_seqno(engine)) { 444 hung |= intel_engine_flag(engine); 445 if (hc.action != ENGINE_DEAD) 446 stuck |= intel_engine_flag(engine); 447 } 448 449 if (engine->hangcheck.wedged) 450 wedged |= intel_engine_flag(engine); 451 } 452 453 if (wedged) { 454 dev_err(dev_priv->drm.dev, 455 "GPU recovery timed out," 456 " cancelling all in-flight rendering.\n"); 457 GEM_TRACE_DUMP(); 458 i915_gem_set_wedged(dev_priv); 459 } 460 461 if (hung) 462 hangcheck_declare_hang(dev_priv, hung, stuck); 463 464 /* Reset timer in case GPU hangs without another request being added */ 465 i915_queue_hangcheck(dev_priv); 466 } 467 --- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
Hi Michel, Thank you for the patch! Perhaps something to improve: [auto build test WARNING on drm-intel/for-linux-next] [also build test WARNING on v4.20 next-20190103] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Carlos-Santa/drm-i915-Watchdog-timeout-Blindly-trust-watchdog-timeout-for-reset/20190105-111445 base: git://anongit.freedesktop.org/drm-intel for-linux-next config: x86_64-randconfig-s4-01052002 (attached as .config) compiler: gcc-7 (Debian 7.3.0-1) 7.3.0 reproduce: # save the attached .config to linux build tree make ARCH=x86_64 All warnings (new ones prefixed by >>): In file included from include/asm-generic/bug.h:5:0, from arch/x86/include/asm/bug.h:47, from include/linux/bug.h:5, from include/linux/mmdebug.h:5, from include/linux/gfp.h:5, from include/linux/slab.h:15, from include/linux/io-mapping.h:22, from drivers/gpu/drm/i915/i915_drv.h:36, from drivers/gpu/drm/i915/intel_hangcheck.c:25: drivers/gpu/drm/i915/intel_hangcheck.c: In function 'i915_hangcheck_elapsed': drivers/gpu/drm/i915/intel_hangcheck.c:443:24: error: 'struct intel_engine_hangcheck' has no member named 'watchdog' engine->hangcheck.watchdog == intel_engine_get_seqno(engine)) { ^ include/linux/compiler.h:58:30: note: in definition of macro '__trace_if' if (__builtin_constant_p(!!(cond)) ? !!(cond) : \ ^~~~ >> drivers/gpu/drm/i915/intel_hangcheck.c:442:3: note: in expansion of macro 'if' if (engine->hangcheck.stalled || ^~ drivers/gpu/drm/i915/intel_hangcheck.c:443:24: error: 'struct intel_engine_hangcheck' has no member named 'watchdog' engine->hangcheck.watchdog == intel_engine_get_seqno(engine)) { ^ include/linux/compiler.h:58:42: note: in definition of macro '__trace_if' if (__builtin_constant_p(!!(cond)) ? !!(cond) : \ ^~~~ >> drivers/gpu/drm/i915/intel_hangcheck.c:442:3: note: in expansion of macro 'if' if (engine->hangcheck.stalled || ^~ drivers/gpu/drm/i915/intel_hangcheck.c:443:24: error: 'struct intel_engine_hangcheck' has no member named 'watchdog' engine->hangcheck.watchdog == intel_engine_get_seqno(engine)) { ^ include/linux/compiler.h:69:16: note: in definition of macro '__trace_if' ______r = !!(cond); \ ^~~~ >> drivers/gpu/drm/i915/intel_hangcheck.c:442:3: note: in expansion of macro 'if' if (engine->hangcheck.stalled || ^~ vim +/if +442 drivers/gpu/drm/i915/intel_hangcheck.c 400 401 /* 402 * This is called when the chip hasn't reported back with completed 403 * batchbuffers in a long time. We keep track per ring seqno progress and 404 * if there are no progress, hangcheck score for that ring is increased. 405 * Further, acthd is inspected to see if the ring is stuck. On stuck case 406 * we kick the ring. If we see no progress on three subsequent calls 407 * we assume chip is wedged and try to fix it by resetting the chip. 408 */ 409 static void i915_hangcheck_elapsed(struct work_struct *work) 410 { 411 struct drm_i915_private *dev_priv = 412 container_of(work, typeof(*dev_priv), 413 gpu_error.hangcheck_work.work); 414 struct intel_engine_cs *engine; 415 enum intel_engine_id id; 416 unsigned int hung = 0, stuck = 0, wedged = 0; 417 418 if (!i915_modparams.enable_hangcheck) 419 return; 420 421 if (!READ_ONCE(dev_priv->gt.awake)) 422 return; 423 424 if (i915_terminally_wedged(&dev_priv->gpu_error)) 425 return; 426 427 /* As enabling the GPU requires fairly extensive mmio access, 428 * periodically arm the mmio checker to see if we are triggering 429 * any invalid access. 430 */ 431 intel_uncore_arm_unclaimed_mmio_detection(dev_priv); 432 433 for_each_engine(engine, dev_priv, id) { 434 struct intel_engine_hangcheck hc; 435 436 semaphore_clear_deadlocks(dev_priv); 437 438 hangcheck_load_sample(engine, &hc); 439 hangcheck_accumulate_sample(engine, &hc); 440 hangcheck_store_sample(engine, &hc); 441 > 442 if (engine->hangcheck.stalled || 443 engine->hangcheck.watchdog == intel_engine_get_seqno(engine)) { 444 hung |= intel_engine_flag(engine); 445 if (hc.action != ENGINE_DEAD) 446 stuck |= intel_engine_flag(engine); 447 } 448 449 if (engine->hangcheck.wedged) 450 wedged |= intel_engine_flag(engine); 451 } 452 453 if (wedged) { 454 dev_err(dev_priv->drm.dev, 455 "GPU recovery timed out," 456 " cancelling all in-flight rendering.\n"); 457 GEM_TRACE_DUMP(); 458 i915_gem_set_wedged(dev_priv); 459 } 460 461 if (hung) 462 hangcheck_declare_hang(dev_priv, hung, stuck); 463 464 /* Reset timer in case GPU hangs without another request being added */ 465 i915_queue_hangcheck(dev_priv); 466 } 467 --- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c index 2906f0ef3d77..1947baa20022 100644 --- a/drivers/gpu/drm/i915/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/intel_hangcheck.c @@ -281,7 +281,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work) hangcheck_accumulate_sample(engine, &hc); hangcheck_store_sample(engine, &hc); - if (engine->hangcheck.stalled) { + if (engine->hangcheck.stalled || + engine->hangcheck.watchdog == intel_engine_get_seqno(engine)) { hung |= intel_engine_flag(engine); if (hc.action != ENGINE_DEAD) stuck |= intel_engine_flag(engine);