diff mbox

drm/i915/selftests: Avoid repeatedly harming the same innocent context

Message ID 20180330131801.18327-1-chris@chris-wilson.co.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Chris Wilson March 30, 2018, 1:18 p.m. UTC
We don't handle resetting the kernel context very well, or presumably any
context executing its breadcrumb commands in the ring as opposed to the
batchbuffer and flush. If we trigger a device reset twice in quick
succession while the kernel context is executing, we may end up skipping
the breadcrumb.  This is really only a problem for the selftest as
normally there is a large interlude between resets (hangcheck), or we
focus on resetting just one engine and so avoid repeatedly resetting
innocents.

Something to try would be a preempt-to-idle to quiesce the engine before
reset, so that innocent contexts would be spared the reset.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
CC: Michel Thierry <michel.thierry@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c                  |  3 ++
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 48 ++++++++++++++++++++++--
 2 files changed, 47 insertions(+), 4 deletions(-)

Comments

kernel test robot March 31, 2018, 8:24 p.m. UTC | #1
Hi Chris,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on drm-intel/for-linux-next]
[also build test ERROR on v4.16-rc7 next-20180329]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Chris-Wilson/drm-i915-selftests-Avoid-repeatedly-harming-the-same-innocent-context/20180401-022503
base:   git://anongit.freedesktop.org/drm-intel for-linux-next
config: i386-randconfig-b0-04010137 (attached as .config)
compiler: gcc-4.9 (Debian 4.9.4-2) 4.9.4
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   In file included from drivers/gpu/drm/i915/intel_hangcheck.c:465:0:
   drivers/gpu/drm/i915/selftests/intel_hangcheck.c: In function 'igt_reset_queue':
>> drivers/gpu/drm/i915/selftests/intel_hangcheck.c:988:5: error: implicit declaration of function 'GEM_TRACE_DUMP' [-Werror=implicit-function-declaration]
        GEM_TRACE_DUMP();
        ^
   cc1: all warnings being treated as errors

vim +/GEM_TRACE_DUMP +988 drivers/gpu/drm/i915/selftests/intel_hangcheck.c

   922	
   923	static int igt_reset_queue(void *arg)
   924	{
   925		struct drm_i915_private *i915 = arg;
   926		struct intel_engine_cs *engine;
   927		enum intel_engine_id id;
   928		struct hang h;
   929		int err;
   930	
   931		/* Check that we replay pending requests following a hang */
   932	
   933		global_reset_lock(i915);
   934	
   935		mutex_lock(&i915->drm.struct_mutex);
   936		err = hang_init(&h, i915);
   937		if (err)
   938			goto unlock;
   939	
   940		for_each_engine(engine, i915, id) {
   941			struct i915_request *prev;
   942			IGT_TIMEOUT(end_time);
   943			unsigned int count;
   944	
   945			if (!intel_engine_can_store_dword(engine))
   946				continue;
   947	
   948			prev = hang_create_request(&h, engine);
   949			if (IS_ERR(prev)) {
   950				err = PTR_ERR(prev);
   951				goto fini;
   952			}
   953	
   954			i915_request_get(prev);
   955			__i915_request_add(prev, true);
   956	
   957			count = 0;
   958			do {
   959				struct i915_request *rq;
   960				unsigned int reset_count;
   961	
   962				rq = hang_create_request(&h, engine);
   963				if (IS_ERR(rq)) {
   964					err = PTR_ERR(rq);
   965					goto fini;
   966				}
   967	
   968				i915_request_get(rq);
   969				__i915_request_add(rq, true);
   970	
   971				/*
   972				 * XXX We don't handle resetting the kernel context
   973				 * very well. If we trigger a device reset twice in
   974				 * quick succession while the kernel context is
   975				 * executing, we may end up skipping the breadcrumb.
   976				 * This is really only a problem for the selftest as
   977				 * normally there is a large interlude between resets
   978				 * (hangcheck), or we focus on resetting just one
   979				 * engine and so avoid repeatedly resetting innocents.
   980				 */
   981				err = wait_for_others(i915, engine);
   982				if (err) {
   983					pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
   984					       __func__, engine->name);
   985					i915_request_put(rq);
   986					i915_request_put(prev);
   987	
 > 988					GEM_TRACE_DUMP();
   989					i915_gem_set_wedged(i915);
   990					goto fini;
   991				}
   992	
   993				if (!wait_for_hang(&h, prev)) {
   994					struct drm_printer p = drm_info_printer(i915->drm.dev);
   995	
   996					pr_err("%s(%s): Failed to start request %x, at %x\n",
   997					       __func__, engine->name,
   998					       prev->fence.seqno, hws_seqno(&h, prev));
   999					intel_engine_dump(engine, &p,
  1000							  "%s\n", engine->name);
  1001	
  1002					i915_request_put(rq);
  1003					i915_request_put(prev);
  1004	
  1005					i915_reset(i915, 0);
  1006					i915_gem_set_wedged(i915);
  1007	
  1008					err = -EIO;
  1009					goto fini;
  1010				}
  1011	
  1012				reset_count = fake_hangcheck(prev);
  1013	
  1014				i915_reset(i915, I915_RESET_QUIET);
  1015	
  1016				GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
  1017						    &i915->gpu_error.flags));
  1018	
  1019				if (prev->fence.error != -EIO) {
  1020					pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
  1021					       prev->fence.error);
  1022					i915_request_put(rq);
  1023					i915_request_put(prev);
  1024					err = -EINVAL;
  1025					goto fini;
  1026				}
  1027	
  1028				if (rq->fence.error) {
  1029					pr_err("Fence error status not zero [%d] after unrelated reset\n",
  1030					       rq->fence.error);
  1031					i915_request_put(rq);
  1032					i915_request_put(prev);
  1033					err = -EINVAL;
  1034					goto fini;
  1035				}
  1036	
  1037				if (i915_reset_count(&i915->gpu_error) == reset_count) {
  1038					pr_err("No GPU reset recorded!\n");
  1039					i915_request_put(rq);
  1040					i915_request_put(prev);
  1041					err = -EINVAL;
  1042					goto fini;
  1043				}
  1044	
  1045				i915_request_put(prev);
  1046				prev = rq;
  1047				count++;
  1048			} while (time_before(jiffies, end_time));
  1049			pr_info("%s: Completed %d resets\n", engine->name, count);
  1050	
  1051			*h.batch = MI_BATCH_BUFFER_END;
  1052			i915_gem_chipset_flush(i915);
  1053	
  1054			i915_request_put(prev);
  1055	
  1056			err = flush_test(i915, I915_WAIT_LOCKED);
  1057			if (err)
  1058				break;
  1059		}
  1060	
  1061	fini:
  1062		hang_fini(&h);
  1063	unlock:
  1064		mutex_unlock(&i915->drm.struct_mutex);
  1065		global_reset_unlock(i915);
  1066	
  1067		if (i915_terminally_wedged(&i915->gpu_error))
  1068			return -EIO;
  1069	
  1070		return err;
  1071	}
  1072	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
kernel test robot March 31, 2018, 8:41 p.m. UTC | #2
Hi Chris,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on drm-intel/for-linux-next]
[also build test ERROR on v4.16-rc7 next-20180329]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Chris-Wilson/drm-i915-selftests-Avoid-repeatedly-harming-the-same-innocent-context/20180401-022503
base:   git://anongit.freedesktop.org/drm-intel for-linux-next
config: x86_64-randconfig-x011-201813 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-1) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=x86_64 

All errors (new ones prefixed by >>):

   In file included from drivers/gpu/drm/i915/intel_hangcheck.c:465:0:
   drivers/gpu/drm/i915/selftests/intel_hangcheck.c: In function 'igt_reset_queue':
>> drivers/gpu/drm/i915/selftests/intel_hangcheck.c:988:5: error: implicit declaration of function 'GEM_TRACE_DUMP'; did you mean 'GEM_TRACE'? [-Werror=implicit-function-declaration]
        GEM_TRACE_DUMP();
        ^~~~~~~~~~~~~~
        GEM_TRACE
   cc1: some warnings being treated as errors

vim +988 drivers/gpu/drm/i915/selftests/intel_hangcheck.c

   922	
   923	static int igt_reset_queue(void *arg)
   924	{
   925		struct drm_i915_private *i915 = arg;
   926		struct intel_engine_cs *engine;
   927		enum intel_engine_id id;
   928		struct hang h;
   929		int err;
   930	
   931		/* Check that we replay pending requests following a hang */
   932	
   933		global_reset_lock(i915);
   934	
   935		mutex_lock(&i915->drm.struct_mutex);
   936		err = hang_init(&h, i915);
   937		if (err)
   938			goto unlock;
   939	
   940		for_each_engine(engine, i915, id) {
   941			struct i915_request *prev;
   942			IGT_TIMEOUT(end_time);
   943			unsigned int count;
   944	
   945			if (!intel_engine_can_store_dword(engine))
   946				continue;
   947	
   948			prev = hang_create_request(&h, engine);
   949			if (IS_ERR(prev)) {
   950				err = PTR_ERR(prev);
   951				goto fini;
   952			}
   953	
   954			i915_request_get(prev);
   955			__i915_request_add(prev, true);
   956	
   957			count = 0;
   958			do {
   959				struct i915_request *rq;
   960				unsigned int reset_count;
   961	
   962				rq = hang_create_request(&h, engine);
   963				if (IS_ERR(rq)) {
   964					err = PTR_ERR(rq);
   965					goto fini;
   966				}
   967	
   968				i915_request_get(rq);
   969				__i915_request_add(rq, true);
   970	
   971				/*
   972				 * XXX We don't handle resetting the kernel context
   973				 * very well. If we trigger a device reset twice in
   974				 * quick succession while the kernel context is
   975				 * executing, we may end up skipping the breadcrumb.
   976				 * This is really only a problem for the selftest as
   977				 * normally there is a large interlude between resets
   978				 * (hangcheck), or we focus on resetting just one
   979				 * engine and so avoid repeatedly resetting innocents.
   980				 */
   981				err = wait_for_others(i915, engine);
   982				if (err) {
   983					pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
   984					       __func__, engine->name);
   985					i915_request_put(rq);
   986					i915_request_put(prev);
   987	
 > 988					GEM_TRACE_DUMP();
   989					i915_gem_set_wedged(i915);
   990					goto fini;
   991				}
   992	
   993				if (!wait_for_hang(&h, prev)) {
   994					struct drm_printer p = drm_info_printer(i915->drm.dev);
   995	
   996					pr_err("%s(%s): Failed to start request %x, at %x\n",
   997					       __func__, engine->name,
   998					       prev->fence.seqno, hws_seqno(&h, prev));
   999					intel_engine_dump(engine, &p,
  1000							  "%s\n", engine->name);
  1001	
  1002					i915_request_put(rq);
  1003					i915_request_put(prev);
  1004	
  1005					i915_reset(i915, 0);
  1006					i915_gem_set_wedged(i915);
  1007	
  1008					err = -EIO;
  1009					goto fini;
  1010				}
  1011	
  1012				reset_count = fake_hangcheck(prev);
  1013	
  1014				i915_reset(i915, I915_RESET_QUIET);
  1015	
  1016				GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
  1017						    &i915->gpu_error.flags));
  1018	
  1019				if (prev->fence.error != -EIO) {
  1020					pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
  1021					       prev->fence.error);
  1022					i915_request_put(rq);
  1023					i915_request_put(prev);
  1024					err = -EINVAL;
  1025					goto fini;
  1026				}
  1027	
  1028				if (rq->fence.error) {
  1029					pr_err("Fence error status not zero [%d] after unrelated reset\n",
  1030					       rq->fence.error);
  1031					i915_request_put(rq);
  1032					i915_request_put(prev);
  1033					err = -EINVAL;
  1034					goto fini;
  1035				}
  1036	
  1037				if (i915_reset_count(&i915->gpu_error) == reset_count) {
  1038					pr_err("No GPU reset recorded!\n");
  1039					i915_request_put(rq);
  1040					i915_request_put(prev);
  1041					err = -EINVAL;
  1042					goto fini;
  1043				}
  1044	
  1045				i915_request_put(prev);
  1046				prev = rq;
  1047				count++;
  1048			} while (time_before(jiffies, end_time));
  1049			pr_info("%s: Completed %d resets\n", engine->name, count);
  1050	
  1051			*h.batch = MI_BATCH_BUFFER_END;
  1052			i915_gem_chipset_flush(i915);
  1053	
  1054			i915_request_put(prev);
  1055	
  1056			err = flush_test(i915, I915_WAIT_LOCKED);
  1057			if (err)
  1058				break;
  1059		}
  1060	
  1061	fini:
  1062		hang_fini(&h);
  1063	unlock:
  1064		mutex_unlock(&i915->drm.struct_mutex);
  1065		global_reset_unlock(i915);
  1066	
  1067		if (i915_terminally_wedged(&i915->gpu_error))
  1068			return -EIO;
  1069	
  1070		return err;
  1071	}
  1072	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
kernel test robot April 1, 2018, 2:45 a.m. UTC | #3
Hi Chris,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on drm-intel/for-linux-next]
[also build test WARNING on v4.16-rc7 next-20180329]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Chris-Wilson/drm-i915-selftests-Avoid-repeatedly-harming-the-same-innocent-context/20180401-022503
base:   git://anongit.freedesktop.org/drm-intel for-linux-next
reproduce:
        # apt-get install sparse
        make ARCH=x86_64 allmodconfig
        make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)

   drivers/gpu/drm/i915/selftests/intel_hangcheck.c:988:33: sparse: undefined identifier 'GEM_TRACE_DUMP'
>> drivers/gpu/drm/i915/selftests/intel_hangcheck.c:988:47: sparse: call with no type!
   In file included from drivers/gpu/drm/i915/intel_hangcheck.c:465:0:
   drivers/gpu/drm/i915/selftests/intel_hangcheck.c: In function 'igt_reset_queue':
   drivers/gpu/drm/i915/selftests/intel_hangcheck.c:988:5: error: implicit declaration of function 'GEM_TRACE_DUMP'; did you mean 'GEM_TRACE'? [-Werror=implicit-function-declaration]
        GEM_TRACE_DUMP();
        ^~~~~~~~~~~~~~
        GEM_TRACE
   cc1: some warnings being treated as errors

vim +988 drivers/gpu/drm/i915/selftests/intel_hangcheck.c

   922	
   923	static int igt_reset_queue(void *arg)
   924	{
   925		struct drm_i915_private *i915 = arg;
   926		struct intel_engine_cs *engine;
   927		enum intel_engine_id id;
   928		struct hang h;
   929		int err;
   930	
   931		/* Check that we replay pending requests following a hang */
   932	
   933		global_reset_lock(i915);
   934	
   935		mutex_lock(&i915->drm.struct_mutex);
   936		err = hang_init(&h, i915);
   937		if (err)
   938			goto unlock;
   939	
   940		for_each_engine(engine, i915, id) {
   941			struct i915_request *prev;
   942			IGT_TIMEOUT(end_time);
   943			unsigned int count;
   944	
   945			if (!intel_engine_can_store_dword(engine))
   946				continue;
   947	
   948			prev = hang_create_request(&h, engine);
   949			if (IS_ERR(prev)) {
   950				err = PTR_ERR(prev);
   951				goto fini;
   952			}
   953	
   954			i915_request_get(prev);
   955			__i915_request_add(prev, true);
   956	
   957			count = 0;
   958			do {
   959				struct i915_request *rq;
   960				unsigned int reset_count;
   961	
   962				rq = hang_create_request(&h, engine);
   963				if (IS_ERR(rq)) {
   964					err = PTR_ERR(rq);
   965					goto fini;
   966				}
   967	
   968				i915_request_get(rq);
   969				__i915_request_add(rq, true);
   970	
   971				/*
   972				 * XXX We don't handle resetting the kernel context
   973				 * very well. If we trigger a device reset twice in
   974				 * quick succession while the kernel context is
   975				 * executing, we may end up skipping the breadcrumb.
   976				 * This is really only a problem for the selftest as
   977				 * normally there is a large interlude between resets
   978				 * (hangcheck), or we focus on resetting just one
   979				 * engine and so avoid repeatedly resetting innocents.
   980				 */
   981				err = wait_for_others(i915, engine);
   982				if (err) {
   983					pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
   984					       __func__, engine->name);
   985					i915_request_put(rq);
   986					i915_request_put(prev);
   987	
 > 988					GEM_TRACE_DUMP();
   989					i915_gem_set_wedged(i915);
   990					goto fini;
   991				}
   992	
   993				if (!wait_for_hang(&h, prev)) {
   994					struct drm_printer p = drm_info_printer(i915->drm.dev);
   995	
   996					pr_err("%s(%s): Failed to start request %x, at %x\n",
   997					       __func__, engine->name,
   998					       prev->fence.seqno, hws_seqno(&h, prev));
   999					intel_engine_dump(engine, &p,
  1000							  "%s\n", engine->name);
  1001	
  1002					i915_request_put(rq);
  1003					i915_request_put(prev);
  1004	
  1005					i915_reset(i915, 0);
  1006					i915_gem_set_wedged(i915);
  1007	
  1008					err = -EIO;
  1009					goto fini;
  1010				}
  1011	
  1012				reset_count = fake_hangcheck(prev);
  1013	
  1014				i915_reset(i915, I915_RESET_QUIET);
  1015	
  1016				GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
  1017						    &i915->gpu_error.flags));
  1018	
  1019				if (prev->fence.error != -EIO) {
  1020					pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
  1021					       prev->fence.error);
  1022					i915_request_put(rq);
  1023					i915_request_put(prev);
  1024					err = -EINVAL;
  1025					goto fini;
  1026				}
  1027	
  1028				if (rq->fence.error) {
  1029					pr_err("Fence error status not zero [%d] after unrelated reset\n",
  1030					       rq->fence.error);
  1031					i915_request_put(rq);
  1032					i915_request_put(prev);
  1033					err = -EINVAL;
  1034					goto fini;
  1035				}
  1036	
  1037				if (i915_reset_count(&i915->gpu_error) == reset_count) {
  1038					pr_err("No GPU reset recorded!\n");
  1039					i915_request_put(rq);
  1040					i915_request_put(prev);
  1041					err = -EINVAL;
  1042					goto fini;
  1043				}
  1044	
  1045				i915_request_put(prev);
  1046				prev = rq;
  1047				count++;
  1048			} while (time_before(jiffies, end_time));
  1049			pr_info("%s: Completed %d resets\n", engine->name, count);
  1050	
  1051			*h.batch = MI_BATCH_BUFFER_END;
  1052			i915_gem_chipset_flush(i915);
  1053	
  1054			i915_request_put(prev);
  1055	
  1056			err = flush_test(i915, I915_WAIT_LOCKED);
  1057			if (err)
  1058				break;
  1059		}
  1060	
  1061	fini:
  1062		hang_fini(&h);
  1063	unlock:
  1064		mutex_unlock(&i915->drm.struct_mutex);
  1065		global_reset_unlock(i915);
  1066	
  1067		if (i915_terminally_wedged(&i915->gpu_error))
  1068			return -EIO;
  1069	
  1070		return err;
  1071	}
  1072	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Chris Wilson April 5, 2018, 7:29 p.m. UTC | #4
Quoting Chris Wilson (2018-03-30 14:18:01)
> We don't handle resetting the kernel context very well, or presumably any
> context executing its breadcrumb commands in the ring as opposed to the
> batchbuffer and flush. If we trigger a device reset twice in quick
> succession while the kernel context is executing, we may end up skipping
> the breadcrumb.  This is really only a problem for the selftest as
> normally there is a large interlude between resets (hangcheck), or we
> focus on resetting just one engine and so avoid repeatedly resetting
> innocents.
> 
> Something to try would be a preempt-to-idle to quiesce the engine before
> reset, so that innocent contexts would be spared the reset.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> CC: Michel Thierry <michel.thierry@intel.com>

Anyone?

> ---
>  drivers/gpu/drm/i915/i915_drv.c                  |  3 ++
>  drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 48 ++++++++++++++++++++++--
>  2 files changed, 47 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index d354627882e3..684060ed8db6 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1886,6 +1886,8 @@ void i915_reset(struct drm_i915_private *i915)
>         int ret;
>         int i;
>  
> +       GEM_TRACE("flags=%lx\n", error->flags);
> +
>         might_sleep();
>         lockdep_assert_held(&i915->drm.struct_mutex);
>         GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
> @@ -2016,6 +2018,7 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
>         struct i915_request *active_request;
>         int ret;
>  
> +       GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
>         GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
>  
>         active_request = i915_gem_reset_prepare_engine(engine);
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index 9e4e0ad62724..122a32e0a69d 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -979,6 +979,23 @@ static int igt_wait_reset(void *arg)
>         return err;
>  }
>  
> +static int wait_for_others(struct drm_i915_private *i915,
> +                           struct intel_engine_cs *exclude)
> +{
> +       struct intel_engine_cs *engine;
> +       enum intel_engine_id id;
> +
> +       for_each_engine(engine, i915, id) {
> +               if (engine == exclude)
> +                       continue;
> +
> +               if (wait_for(intel_engine_is_idle(engine), 10))
> +                       return -EIO;
> +       }
> +
> +       return 0;
> +}
> +
>  static int igt_reset_queue(void *arg)
>  {
>         struct drm_i915_private *i915 = arg;
> @@ -1027,13 +1044,36 @@ static int igt_reset_queue(void *arg)
>                         i915_request_get(rq);
>                         __i915_request_add(rq, true);
>  
> +                       /*
> +                        * XXX We don't handle resetting the kernel context
> +                        * very well. If we trigger a device reset twice in
> +                        * quick succession while the kernel context is
> +                        * executing, we may end up skipping the breadcrumb.
> +                        * This is really only a problem for the selftest as
> +                        * normally there is a large interlude between resets
> +                        * (hangcheck), or we focus on resetting just one
> +                        * engine and so avoid repeatedly resetting innocents.
> +                        */
> +                       err = wait_for_others(i915, engine);
> +                       if (err) {
> +                               pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
> +                                      __func__, engine->name);
> +                               i915_request_put(rq);
> +                               i915_request_put(prev);
> +
> +                               GEM_TRACE_DUMP();
> +                               i915_gem_set_wedged(i915);
> +                               goto fini;
> +                       }
> +
>                         if (!wait_for_hang(&h, prev)) {
>                                 struct drm_printer p = drm_info_printer(i915->drm.dev);
>  
> -                               pr_err("%s: Failed to start request %x, at %x\n",
> -                                      __func__, prev->fence.seqno, hws_seqno(&h, prev));
> -                               intel_engine_dump(prev->engine, &p,
> -                                                 "%s\n", prev->engine->name);
> +                               pr_err("%s(%s): Failed to start request %x, at %x\n",
> +                                      __func__, engine->name,
> +                                      prev->fence.seqno, hws_seqno(&h, prev));
> +                               intel_engine_dump(engine, &p,
> +                                                 "%s\n", engine->name);
>  
>                                 i915_request_put(rq);
>                                 i915_request_put(prev);
> -- 
> 2.16.3
>
Tvrtko Ursulin April 6, 2018, 9:11 a.m. UTC | #5
On 30/03/2018 14:18, Chris Wilson wrote:
> We don't handle resetting the kernel context very well, or presumably any
> context executing its breadcrumb commands in the ring as opposed to the
> batchbuffer and flush. If we trigger a device reset twice in quick
> succession while the kernel context is executing, we may end up skipping
> the breadcrumb.  This is really only a problem for the selftest as
> normally there is a large interlude between resets (hangcheck), or we
> focus on resetting just one engine and so avoid repeatedly resetting
> innocents.
> 
> Something to try would be a preempt-to-idle to quiesce the engine before
> reset, so that innocent contexts would be spared the reset.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> CC: Michel Thierry <michel.thierry@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.c                  |  3 ++
>   drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 48 ++++++++++++++++++++++--
>   2 files changed, 47 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index d354627882e3..684060ed8db6 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1886,6 +1886,8 @@ void i915_reset(struct drm_i915_private *i915)
>   	int ret;
>   	int i;
>   
> +	GEM_TRACE("flags=%lx\n", error->flags);
> +
>   	might_sleep();
>   	lockdep_assert_held(&i915->drm.struct_mutex);
>   	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
> @@ -2016,6 +2018,7 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
>   	struct i915_request *active_request;
>   	int ret;
>   
> +	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
>   	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
>   
>   	active_request = i915_gem_reset_prepare_engine(engine);
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index 9e4e0ad62724..122a32e0a69d 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -979,6 +979,23 @@ static int igt_wait_reset(void *arg)
>   	return err;
>   }
>   
> +static int wait_for_others(struct drm_i915_private *i915,
> +			    struct intel_engine_cs *exclude)
> +{
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	for_each_engine(engine, i915, id) {
> +		if (engine == exclude)
> +			continue;

Bike-shedder in me says:

for_each_engine_masked(engine, i915, ~BIT(exclude->id), id)

:)

Although for_each_engine_masked failed to enclose its arguments in 
parentheses so never mind.

> +
> +		if (wait_for(intel_engine_is_idle(engine), 10))
> +			return -EIO;
> +	}
> +
> +	return 0;
> +}
> +
>   static int igt_reset_queue(void *arg)
>   {
>   	struct drm_i915_private *i915 = arg;
> @@ -1027,13 +1044,36 @@ static int igt_reset_queue(void *arg)
>   			i915_request_get(rq);
>   			__i915_request_add(rq, true);
>   
> +			/*
> +			 * XXX We don't handle resetting the kernel context
> +			 * very well. If we trigger a device reset twice in
> +			 * quick succession while the kernel context is
> +			 * executing, we may end up skipping the breadcrumb.
> +			 * This is really only a problem for the selftest as
> +			 * normally there is a large interlude between resets
> +			 * (hangcheck), or we focus on resetting just one
> +			 * engine and so avoid repeatedly resetting innocents.
> +			 */
> +			err = wait_for_others(i915, engine);
> +			if (err) {
> +				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
> +				       __func__, engine->name);
> +				i915_request_put(rq);
> +				i915_request_put(prev);
> +
> +				GEM_TRACE_DUMP();
> +				i915_gem_set_wedged(i915);
> +				goto fini;
> +			}
> +
>   			if (!wait_for_hang(&h, prev)) {

This was confusing me for a period since I was assuming the seqno check is 
against the breadcrumb, but it is actually about the same number written 
to a different place. So it actually means 
wait_for_request_to_start_executing.

>   				struct drm_printer p = drm_info_printer(i915->drm.dev);
>   
> -				pr_err("%s: Failed to start request %x, at %x\n",
> -				       __func__, prev->fence.seqno, hws_seqno(&h, prev));
> -				intel_engine_dump(prev->engine, &p,
> -						  "%s\n", prev->engine->name);
> +				pr_err("%s(%s): Failed to start request %x, at %x\n",
> +				       __func__, engine->name,
> +				       prev->fence.seqno, hws_seqno(&h, prev));
> +				intel_engine_dump(engine, &p,
> +						  "%s\n", engine->name);
>   
>   				i915_request_put(rq);
>   				i915_request_put(prev);
> 

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
Chris Wilson April 6, 2018, 9:40 a.m. UTC | #6
Quoting Tvrtko Ursulin (2018-04-06 10:11:57)
> 
> On 30/03/2018 14:18, Chris Wilson wrote:
> >                       if (!wait_for_hang(&h, prev)) {
> 
> This was confusing me for a period since I was assuming the seqno check is 
> against the breadcrumb, but it is actually about the same number written 
> to a different place. So it actually means 
> wait_for_request_to_start_executing.

Bonus points for a better name than struct hang.

struct spinner;

spinner_wait_until_active() ?
-Chris
Tvrtko Ursulin April 6, 2018, 9:59 a.m. UTC | #7
On 06/04/2018 10:40, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-04-06 10:11:57)
>>
>> On 30/03/2018 14:18, Chris Wilson wrote:
>>>                        if (!wait_for_hang(&h, prev)) {
>>
>> This was confusing me for a period since I was assuming the seqno check is
>> against the breadcrumb, but it is actually about the same number written
>> to a different place. So it actually means
>> wait_for_request_to_start_executing.
> 
> Bonus points for a better name than struct hang.
> 
> struct spinner;
> 
> spinner_wait_until_active() ?

Maybe just rename the helper to wait_until_running if you feel like it. 
To keep things simple and scope in check.

Regards,

Tvrtko
diff mbox

Patch

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index d354627882e3..684060ed8db6 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1886,6 +1886,8 @@  void i915_reset(struct drm_i915_private *i915)
 	int ret;
 	int i;
 
+	GEM_TRACE("flags=%lx\n", error->flags);
+
 	might_sleep();
 	lockdep_assert_held(&i915->drm.struct_mutex);
 	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
@@ -2016,6 +2018,7 @@  int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
 	struct i915_request *active_request;
 	int ret;
 
+	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
 	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
 
 	active_request = i915_gem_reset_prepare_engine(engine);
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 9e4e0ad62724..122a32e0a69d 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -979,6 +979,23 @@  static int igt_wait_reset(void *arg)
 	return err;
 }
 
+static int wait_for_others(struct drm_i915_private *i915,
+			    struct intel_engine_cs *exclude)
+{
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	for_each_engine(engine, i915, id) {
+		if (engine == exclude)
+			continue;
+
+		if (wait_for(intel_engine_is_idle(engine), 10))
+			return -EIO;
+	}
+
+	return 0;
+}
+
 static int igt_reset_queue(void *arg)
 {
 	struct drm_i915_private *i915 = arg;
@@ -1027,13 +1044,36 @@  static int igt_reset_queue(void *arg)
 			i915_request_get(rq);
 			__i915_request_add(rq, true);
 
+			/*
+			 * XXX We don't handle resetting the kernel context
+			 * very well. If we trigger a device reset twice in
+			 * quick succession while the kernel context is
+			 * executing, we may end up skipping the breadcrumb.
+			 * This is really only a problem for the selftest as
+			 * normally there is a large interlude between resets
+			 * (hangcheck), or we focus on resetting just one
+			 * engine and so avoid repeatedly resetting innocents.
+			 */
+			err = wait_for_others(i915, engine);
+			if (err) {
+				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
+				       __func__, engine->name);
+				i915_request_put(rq);
+				i915_request_put(prev);
+
+				GEM_TRACE_DUMP();
+				i915_gem_set_wedged(i915);
+				goto fini;
+			}
+
 			if (!wait_for_hang(&h, prev)) {
 				struct drm_printer p = drm_info_printer(i915->drm.dev);
 
-				pr_err("%s: Failed to start request %x, at %x\n",
-				       __func__, prev->fence.seqno, hws_seqno(&h, prev));
-				intel_engine_dump(prev->engine, &p,
-						  "%s\n", prev->engine->name);
+				pr_err("%s(%s): Failed to start request %x, at %x\n",
+				       __func__, engine->name,
+				       prev->fence.seqno, hws_seqno(&h, prev));
+				intel_engine_dump(engine, &p,
+						  "%s\n", engine->name);
 
 				i915_request_put(rq);
 				i915_request_put(prev);