[14/14] drm/i915/execlists: Offline error capture
diff mbox series

Message ID 20200109085839.873553-14-chris@chris-wilson.co.uk
State New
Headers show
Series
  • [01/14] drm/i915/gt: Push context state allocation earlier
Related show

Commit Message

Chris Wilson Jan. 9, 2020, 8:58 a.m. UTC
Currently, we skip error capture upon forced preemption. We apply forced
preemption when there is a higher priority request that should be
running but is being blocked, and we skip inline error capture so that
the preemption request is not further delayed by a user controlled
capture -- extending the denial of service.

However, preemption reset is also used for heartbeats and regular GPU
hangs. By skipping the error capture, we remove the ability to debug GPU
hangs.

In order to capture the error without delaying the preemption request
further, we can do an out-of-line capture by removing the guilty request
from the execution queue and scheduling a work to dump that request.
When removing a request, we need to remove the entire context and all
descendants from the execution queue, so that they do not jump past.

Closes: https://gitlab.freedesktop.org/drm/intel/issues/738
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/intel_lrc.c | 102 +++++++++++++++++++++++++++-
 1 file changed, 100 insertions(+), 2 deletions(-)

Comments

kernel test robot Jan. 10, 2020, 10:46 p.m. UTC | #1
Hi Chris,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on next-20200109]
[cannot apply to drm-intel/for-linux-next drm-tip/drm-tip v5.5-rc5 v5.5-rc4 v5.5-rc3 v5.5-rc5]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Chris-Wilson/drm-i915-gt-Push-context-state-allocation-earlier/20200110-090110
base:    85cff1ab64327cee3090050b3dd6b5f1df3e5e1f
config: x86_64-randconfig-a003-20200109 (attached as .config)
compiler: gcc-5 (Ubuntu 5.5.0-12ubuntu1) 5.5.0 20171010
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from drivers/gpu/drm/i915/i915_drv.h:97:0,
                    from drivers/gpu/drm/i915/gt/intel_lrc.c:136:
   drivers/gpu/drm/i915/i915_gpu_error.h:312:6: error: conflicting types for 'i915_vma_compress_prepare'
    void i915_vma_compress_prepare(struct i915_vma_compress *compress)
         ^
   drivers/gpu/drm/i915/i915_gpu_error.h:307:1: note: previous definition of 'i915_vma_compress_prepare' was here
    i915_vma_compress_prepare(struct intel_gt_coredump *gt)
    ^
   drivers/gpu/drm/i915/gt/intel_lrc.c: In function 'execlists_capture_work':
>> drivers/gpu/drm/i915/gt/intel_lrc.c:2471:10: error: implicit declaration of function 'i915_vma_capture_prepare' [-Werror=implicit-function-declaration]
      comp = i915_vma_capture_prepare(gt);
             ^
   drivers/gpu/drm/i915/gt/intel_lrc.c:2471:8: warning: assignment makes pointer from integer without a cast [-Wint-conversion]
      comp = i915_vma_capture_prepare(gt);
           ^
>> drivers/gpu/drm/i915/gt/intel_lrc.c:2474:4: error: implicit declaration of function 'i915_vma_capture_finish' [-Werror=implicit-function-declaration]
       i915_vma_capture_finish(gt, comp);
       ^
   drivers/gpu/drm/i915/gt/intel_lrc.c:2481:25: error: passing argument 1 of 'i915_error_state_store' from incompatible pointer type [-Werror=incompatible-pointer-types]
     i915_error_state_store(cap->error);
                            ^
   In file included from drivers/gpu/drm/i915/i915_drv.h:97:0,
                    from drivers/gpu/drm/i915/gt/intel_lrc.c:136:
   drivers/gpu/drm/i915/i915_gpu_error.h:317:1: note: expected 'struct drm_i915_private *' but argument is of type 'struct i915_gpu_coredump *'
    i915_error_state_store(struct drm_i915_private *i915,
    ^
   drivers/gpu/drm/i915/gt/intel_lrc.c:2481:2: error: too few arguments to function 'i915_error_state_store'
     i915_error_state_store(cap->error);
     ^
   In file included from drivers/gpu/drm/i915/i915_drv.h:97:0,
                    from drivers/gpu/drm/i915/gt/intel_lrc.c:136:
   drivers/gpu/drm/i915/i915_gpu_error.h:317:1: note: declared here
    i915_error_state_store(struct drm_i915_private *i915,
    ^
>> drivers/gpu/drm/i915/gt/intel_lrc.c:2482:2: error: implicit declaration of function 'i915_gpu_coredump_put' [-Werror=implicit-function-declaration]
     i915_gpu_coredump_put(cap->error);
     ^
   cc1: some warnings being treated as errors

vim +/i915_vma_capture_prepare +2471 drivers/gpu/drm/i915/gt/intel_lrc.c

  2458	
  2459	static void execlists_capture_work(struct work_struct *work)
  2460	{
  2461		struct execlists_capture *cap = container_of(work, typeof(*cap), work);
  2462		const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
  2463		struct intel_engine_cs *engine = cap->rq->engine;
  2464		struct intel_gt_coredump *gt = cap->error->gt;
  2465		struct intel_engine_capture_vma *vma;
  2466	
  2467		vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
  2468		if (vma) {
  2469			struct i915_vma_compress *comp;
  2470	
> 2471			comp = i915_vma_capture_prepare(gt);
  2472			if (comp) {
  2473				intel_engine_coredump_add_vma(gt->engine, vma, comp);
> 2474				i915_vma_capture_finish(gt, comp);
  2475			}
  2476		}
  2477	
  2478		gt->simulated = gt->engine->simulated;
  2479		cap->error->simulated = gt->simulated;
  2480	
  2481		i915_error_state_store(cap->error);
> 2482		i915_gpu_coredump_put(cap->error);
  2483	
  2484		execlists_unhold(engine, cap->rq);
  2485	
  2486		kfree(cap);
  2487	}
  2488	

---
0-DAY kernel test infrastructure                 Open Source Technology Center
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org Intel Corporation
kernel test robot Jan. 11, 2020, 10:33 p.m. UTC | #2
Hi Chris,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on next-20200109]
[cannot apply to drm-intel/for-linux-next drm-tip/drm-tip v5.5-rc5 v5.5-rc4 v5.5-rc3 v5.5-rc5]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:    https://github.com/0day-ci/linux/commits/Chris-Wilson/drm-i915-gt-Push-context-state-allocation-earlier/20200110-090110
base:    85cff1ab64327cee3090050b3dd6b5f1df3e5e1f
config: x86_64-randconfig-g001-20200109 (attached as .config)
compiler: gcc-7 (Debian 7.5.0-3) 7.5.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All error/warnings (new ones prefixed by >>):

   In file included from drivers/gpu/drm/i915/i915_drv.h:97:0,
                    from drivers/gpu/drm/i915/gt/intel_lrc.c:136:
   drivers/gpu/drm/i915/i915_gpu_error.h:312:6: error: conflicting types for 'i915_vma_compress_prepare'
    void i915_vma_compress_prepare(struct i915_vma_compress *compress)
         ^~~~~~~~~~~~~~~~~~~~~~~~~
   drivers/gpu/drm/i915/i915_gpu_error.h:307:1: note: previous definition of 'i915_vma_compress_prepare' was here
    i915_vma_compress_prepare(struct intel_gt_coredump *gt)
    ^~~~~~~~~~~~~~~~~~~~~~~~~
   drivers/gpu/drm/i915/gt/intel_lrc.c: In function 'execlists_capture_work':
>> drivers/gpu/drm/i915/gt/intel_lrc.c:2471:10: error: implicit declaration of function 'i915_vma_capture_prepare'; did you mean 'i915_vma_compress_prepare'? [-Werror=implicit-function-declaration]
      comp = i915_vma_capture_prepare(gt);
             ^~~~~~~~~~~~~~~~~~~~~~~~
             i915_vma_compress_prepare
>> drivers/gpu/drm/i915/gt/intel_lrc.c:2471:8: warning: assignment makes pointer from integer without a cast [-Wint-conversion]
      comp = i915_vma_capture_prepare(gt);
           ^
>> drivers/gpu/drm/i915/gt/intel_lrc.c:2474:4: error: implicit declaration of function 'i915_vma_capture_finish'; did you mean 'i915_vma_clock_flush'? [-Werror=implicit-function-declaration]
       i915_vma_capture_finish(gt, comp);
       ^~~~~~~~~~~~~~~~~~~~~~~
       i915_vma_clock_flush
>> drivers/gpu/drm/i915/gt/intel_lrc.c:2481:25: error: passing argument 1 of 'i915_error_state_store' from incompatible pointer type [-Werror=incompatible-pointer-types]
     i915_error_state_store(cap->error);
                            ^~~
   In file included from drivers/gpu/drm/i915/i915_drv.h:97:0,
                    from drivers/gpu/drm/i915/gt/intel_lrc.c:136:
   drivers/gpu/drm/i915/i915_gpu_error.h:317:1: note: expected 'struct drm_i915_private *' but argument is of type 'struct i915_gpu_coredump *'
    i915_error_state_store(struct drm_i915_private *i915,
    ^~~~~~~~~~~~~~~~~~~~~~
>> drivers/gpu/drm/i915/gt/intel_lrc.c:2481:2: error: too few arguments to function 'i915_error_state_store'
     i915_error_state_store(cap->error);
     ^~~~~~~~~~~~~~~~~~~~~~
   In file included from drivers/gpu/drm/i915/i915_drv.h:97:0,
                    from drivers/gpu/drm/i915/gt/intel_lrc.c:136:
   drivers/gpu/drm/i915/i915_gpu_error.h:317:1: note: declared here
    i915_error_state_store(struct drm_i915_private *i915,
    ^~~~~~~~~~~~~~~~~~~~~~
>> drivers/gpu/drm/i915/gt/intel_lrc.c:2482:2: error: implicit declaration of function 'i915_gpu_coredump_put'; did you mean 'i915_gpu_coredump_alloc'? [-Werror=implicit-function-declaration]
     i915_gpu_coredump_put(cap->error);
     ^~~~~~~~~~~~~~~~~~~~~
     i915_gpu_coredump_alloc
   cc1: some warnings being treated as errors

vim +2471 drivers/gpu/drm/i915/gt/intel_lrc.c

  2458	
  2459	static void execlists_capture_work(struct work_struct *work)
  2460	{
  2461		struct execlists_capture *cap = container_of(work, typeof(*cap), work);
  2462		const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
  2463		struct intel_engine_cs *engine = cap->rq->engine;
  2464		struct intel_gt_coredump *gt = cap->error->gt;
  2465		struct intel_engine_capture_vma *vma;
  2466	
  2467		vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
  2468		if (vma) {
  2469			struct i915_vma_compress *comp;
  2470	
> 2471			comp = i915_vma_capture_prepare(gt);
  2472			if (comp) {
  2473				intel_engine_coredump_add_vma(gt->engine, vma, comp);
> 2474				i915_vma_capture_finish(gt, comp);
  2475			}
  2476		}
  2477	
  2478		gt->simulated = gt->engine->simulated;
  2479		cap->error->simulated = gt->simulated;
  2480	
> 2481		i915_error_state_store(cap->error);
> 2482		i915_gpu_coredump_put(cap->error);
  2483	
  2484		execlists_unhold(engine, cap->rq);
  2485	
  2486		kfree(cap);
  2487	}
  2488	

---
0-DAY kernel test infrastructure                 Open Source Technology Center
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org Intel Corporation

Patch
diff mbox series

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index f2791f98ea68..883d6d46e4e8 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2406,7 +2406,6 @@  static void __execlists_hold(struct i915_request *rq)
 	}
 }
 
-__maybe_unused
 static void execlists_hold(struct intel_engine_cs *engine,
 			   struct i915_request *rq)
 {
@@ -2439,7 +2438,12 @@  static void __execlists_unhold(struct i915_request *rq)
 	}
 }
 
-__maybe_unused
+struct execlists_capture {
+	struct work_struct work;
+	struct i915_request *rq;
+	struct i915_gpu_coredump *error;
+};
+
 static void execlists_unhold(struct intel_engine_cs *engine,
 			     struct i915_request *rq)
 {
@@ -2452,6 +2456,99 @@  static void execlists_unhold(struct intel_engine_cs *engine,
 	spin_unlock_irq(&engine->active.lock);
 }
 
+static void execlists_capture_work(struct work_struct *work)
+{
+	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
+	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
+	struct intel_engine_cs *engine = cap->rq->engine;
+	struct intel_gt_coredump *gt = cap->error->gt;
+	struct intel_engine_capture_vma *vma;
+
+	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
+	if (vma) {
+		struct i915_vma_compress *comp;
+
+		comp = i915_vma_capture_prepare(gt);
+		if (comp) {
+			intel_engine_coredump_add_vma(gt->engine, vma, comp);
+			i915_vma_capture_finish(gt, comp);
+		}
+	}
+
+	gt->simulated = gt->engine->simulated;
+	cap->error->simulated = gt->simulated;
+
+	i915_error_state_store(cap->error);
+	i915_gpu_coredump_put(cap->error);
+
+	execlists_unhold(engine, cap->rq);
+
+	kfree(cap);
+}
+
+static struct i915_gpu_coredump *capture_regs(struct intel_engine_cs *engine)
+{
+	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
+	struct i915_gpu_coredump *e;
+
+	e = i915_gpu_coredump_alloc(engine->i915, gfp);
+	if (!e)
+		return NULL;
+
+	e->gt = intel_gt_coredump_alloc(engine->gt, gfp);
+	if (!e->gt)
+		goto err;
+
+	e->gt->engine = intel_engine_coredump_alloc(engine, gfp);
+	if (!e->gt->engine)
+		goto err_gt;
+
+	return e;
+
+err_gt:
+	kfree(e->gt);
+err:
+	kfree(e);
+	return NULL;
+}
+
+static void execlists_capture(struct intel_engine_cs *engine)
+{
+	struct execlists_capture *cap;
+
+	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
+		return;
+
+	cap = kmalloc(sizeof(*cap), GFP_ATOMIC);
+	if (!cap)
+		return;
+
+	cap->rq = execlists_active(&engine->execlists);
+	GEM_BUG_ON(!cap->rq);
+
+	/*
+	 * We need to _quickly_ capture the engine state before we reset.
+	 * We are inside an atomic section (softirq) here and we are delaying
+	 * the forced preemption event.
+	 */
+	cap->error = capture_regs(engine);
+	if (!cap->error) {
+		kfree(cap);
+		return;
+	}
+
+	/*
+	 * Remove the request from the execlists queue, and take ownership
+	 * of the request. We pass it to our worker who will _slowly_ compress
+	 * all the pages the _user_ requested for debugging their batch, after
+	 * which we return it to the queue for signaling.
+	 */
+	execlists_hold(engine, cap->rq);
+
+	INIT_WORK(&cap->work, execlists_capture_work);
+	schedule_work(&cap->work);
+}
+
 static noinline void preempt_reset(struct intel_engine_cs *engine)
 {
 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
@@ -2469,6 +2566,7 @@  static noinline void preempt_reset(struct intel_engine_cs *engine)
 	ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
 		     READ_ONCE(engine->props.preempt_timeout_ms),
 		     jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
+	execlists_capture(engine);
 	intel_engine_reset(engine, "preemption time out");
 
 	tasklet_enable(&engine->execlists.tasklet);