Message ID | 20180330131801.18327-1-chris@chris-wilson.co.uk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Hi Chris, Thank you for the patch! Yet something to improve: [auto build test ERROR on drm-intel/for-linux-next] [also build test ERROR on v4.16-rc7 next-20180329] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Chris-Wilson/drm-i915-selftests-Avoid-repeatedly-harming-the-same-innocent-context/20180401-022503 base: git://anongit.freedesktop.org/drm-intel for-linux-next config: i386-randconfig-b0-04010137 (attached as .config) compiler: gcc-4.9 (Debian 4.9.4-2) 4.9.4 reproduce: # save the attached .config to linux build tree make ARCH=i386 All errors (new ones prefixed by >>): In file included from drivers/gpu/drm/i915/intel_hangcheck.c:465:0: drivers/gpu/drm/i915/selftests/intel_hangcheck.c: In function 'igt_reset_queue': >> drivers/gpu/drm/i915/selftests/intel_hangcheck.c:988:5: error: implicit declaration of function 'GEM_TRACE_DUMP' [-Werror=implicit-function-declaration] GEM_TRACE_DUMP(); ^ cc1: all warnings being treated as errors vim +/GEM_TRACE_DUMP +988 drivers/gpu/drm/i915/selftests/intel_hangcheck.c 922 923 static int igt_reset_queue(void *arg) 924 { 925 struct drm_i915_private *i915 = arg; 926 struct intel_engine_cs *engine; 927 enum intel_engine_id id; 928 struct hang h; 929 int err; 930 931 /* Check that we replay pending requests following a hang */ 932 933 global_reset_lock(i915); 934 935 mutex_lock(&i915->drm.struct_mutex); 936 err = hang_init(&h, i915); 937 if (err) 938 goto unlock; 939 940 for_each_engine(engine, i915, id) { 941 struct i915_request *prev; 942 IGT_TIMEOUT(end_time); 943 unsigned int count; 944 945 if (!intel_engine_can_store_dword(engine)) 946 continue; 947 948 prev = hang_create_request(&h, engine); 949 if (IS_ERR(prev)) { 950 err = PTR_ERR(prev); 951 goto fini; 952 } 953 954 i915_request_get(prev); 955 __i915_request_add(prev, true); 956 957 count = 0; 958 do { 959 struct i915_request *rq; 960 unsigned int reset_count; 961 962 rq = 
hang_create_request(&h, engine); 963 if (IS_ERR(rq)) { 964 err = PTR_ERR(rq); 965 goto fini; 966 } 967 968 i915_request_get(rq); 969 __i915_request_add(rq, true); 970 971 /* 972 * XXX We don't handle resetting the kernel context 973 * very well. If we trigger a device reset twice in 974 * quick succession while the kernel context is 975 * executing, we may end up skipping the breadcrumb. 976 * This is really only a problem for the selftest as 977 * normally there is a large interlude between resets 978 * (hangcheck), or we focus on resetting just one 979 * engine and so avoid repeatedly resetting innocents. 980 */ 981 err = wait_for_others(i915, engine); 982 if (err) { 983 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", 984 __func__, engine->name); 985 i915_request_put(rq); 986 i915_request_put(prev); 987 > 988 GEM_TRACE_DUMP(); 989 i915_gem_set_wedged(i915); 990 goto fini; 991 } 992 993 if (!wait_for_hang(&h, prev)) { 994 struct drm_printer p = drm_info_printer(i915->drm.dev); 995 996 pr_err("%s(%s): Failed to start request %x, at %x\n", 997 __func__, engine->name, 998 prev->fence.seqno, hws_seqno(&h, prev)); 999 intel_engine_dump(engine, &p, 1000 "%s\n", engine->name); 1001 1002 i915_request_put(rq); 1003 i915_request_put(prev); 1004 1005 i915_reset(i915, 0); 1006 i915_gem_set_wedged(i915); 1007 1008 err = -EIO; 1009 goto fini; 1010 } 1011 1012 reset_count = fake_hangcheck(prev); 1013 1014 i915_reset(i915, I915_RESET_QUIET); 1015 1016 GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, 1017 &i915->gpu_error.flags)); 1018 1019 if (prev->fence.error != -EIO) { 1020 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", 1021 prev->fence.error); 1022 i915_request_put(rq); 1023 i915_request_put(prev); 1024 err = -EINVAL; 1025 goto fini; 1026 } 1027 1028 if (rq->fence.error) { 1029 pr_err("Fence error status not zero [%d] after unrelated reset\n", 1030 rq->fence.error); 1031 i915_request_put(rq); 1032 i915_request_put(prev); 1033 err = 
-EINVAL; 1034 goto fini; 1035 } 1036 1037 if (i915_reset_count(&i915->gpu_error) == reset_count) { 1038 pr_err("No GPU reset recorded!\n"); 1039 i915_request_put(rq); 1040 i915_request_put(prev); 1041 err = -EINVAL; 1042 goto fini; 1043 } 1044 1045 i915_request_put(prev); 1046 prev = rq; 1047 count++; 1048 } while (time_before(jiffies, end_time)); 1049 pr_info("%s: Completed %d resets\n", engine->name, count); 1050 1051 *h.batch = MI_BATCH_BUFFER_END; 1052 i915_gem_chipset_flush(i915); 1053 1054 i915_request_put(prev); 1055 1056 err = flush_test(i915, I915_WAIT_LOCKED); 1057 if (err) 1058 break; 1059 } 1060 1061 fini: 1062 hang_fini(&h); 1063 unlock: 1064 mutex_unlock(&i915->drm.struct_mutex); 1065 global_reset_unlock(i915); 1066 1067 if (i915_terminally_wedged(&i915->gpu_error)) 1068 return -EIO; 1069 1070 return err; 1071 } 1072 --- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
Hi Chris, Thank you for the patch! Yet something to improve: [auto build test ERROR on drm-intel/for-linux-next] [also build test ERROR on v4.16-rc7 next-20180329] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Chris-Wilson/drm-i915-selftests-Avoid-repeatedly-harming-the-same-innocent-context/20180401-022503 base: git://anongit.freedesktop.org/drm-intel for-linux-next config: x86_64-randconfig-x011-201813 (attached as .config) compiler: gcc-7 (Debian 7.3.0-1) 7.3.0 reproduce: # save the attached .config to linux build tree make ARCH=x86_64 All errors (new ones prefixed by >>): In file included from drivers/gpu/drm/i915/intel_hangcheck.c:465:0: drivers/gpu/drm/i915/selftests/intel_hangcheck.c: In function 'igt_reset_queue': >> drivers/gpu/drm/i915/selftests/intel_hangcheck.c:988:5: error: implicit declaration of function 'GEM_TRACE_DUMP'; did you mean 'GEM_TRACE'? [-Werror=implicit-function-declaration] GEM_TRACE_DUMP(); ^~~~~~~~~~~~~~ GEM_TRACE cc1: some warnings being treated as errors vim +988 drivers/gpu/drm/i915/selftests/intel_hangcheck.c 922 923 static int igt_reset_queue(void *arg) 924 { 925 struct drm_i915_private *i915 = arg; 926 struct intel_engine_cs *engine; 927 enum intel_engine_id id; 928 struct hang h; 929 int err; 930 931 /* Check that we replay pending requests following a hang */ 932 933 global_reset_lock(i915); 934 935 mutex_lock(&i915->drm.struct_mutex); 936 err = hang_init(&h, i915); 937 if (err) 938 goto unlock; 939 940 for_each_engine(engine, i915, id) { 941 struct i915_request *prev; 942 IGT_TIMEOUT(end_time); 943 unsigned int count; 944 945 if (!intel_engine_can_store_dword(engine)) 946 continue; 947 948 prev = hang_create_request(&h, engine); 949 if (IS_ERR(prev)) { 950 err = PTR_ERR(prev); 951 goto fini; 952 } 953 954 i915_request_get(prev); 955 __i915_request_add(prev, true); 956 957 count = 0; 958 do { 959 struct i915_request *rq; 960 
unsigned int reset_count; 961 962 rq = hang_create_request(&h, engine); 963 if (IS_ERR(rq)) { 964 err = PTR_ERR(rq); 965 goto fini; 966 } 967 968 i915_request_get(rq); 969 __i915_request_add(rq, true); 970 971 /* 972 * XXX We don't handle resetting the kernel context 973 * very well. If we trigger a device reset twice in 974 * quick succession while the kernel context is 975 * executing, we may end up skipping the breadcrumb. 976 * This is really only a problem for the selftest as 977 * normally there is a large interlude between resets 978 * (hangcheck), or we focus on resetting just one 979 * engine and so avoid repeatedly resetting innocents. 980 */ 981 err = wait_for_others(i915, engine); 982 if (err) { 983 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", 984 __func__, engine->name); 985 i915_request_put(rq); 986 i915_request_put(prev); 987 > 988 GEM_TRACE_DUMP(); 989 i915_gem_set_wedged(i915); 990 goto fini; 991 } 992 993 if (!wait_for_hang(&h, prev)) { 994 struct drm_printer p = drm_info_printer(i915->drm.dev); 995 996 pr_err("%s(%s): Failed to start request %x, at %x\n", 997 __func__, engine->name, 998 prev->fence.seqno, hws_seqno(&h, prev)); 999 intel_engine_dump(engine, &p, 1000 "%s\n", engine->name); 1001 1002 i915_request_put(rq); 1003 i915_request_put(prev); 1004 1005 i915_reset(i915, 0); 1006 i915_gem_set_wedged(i915); 1007 1008 err = -EIO; 1009 goto fini; 1010 } 1011 1012 reset_count = fake_hangcheck(prev); 1013 1014 i915_reset(i915, I915_RESET_QUIET); 1015 1016 GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, 1017 &i915->gpu_error.flags)); 1018 1019 if (prev->fence.error != -EIO) { 1020 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", 1021 prev->fence.error); 1022 i915_request_put(rq); 1023 i915_request_put(prev); 1024 err = -EINVAL; 1025 goto fini; 1026 } 1027 1028 if (rq->fence.error) { 1029 pr_err("Fence error status not zero [%d] after unrelated reset\n", 1030 rq->fence.error); 1031 i915_request_put(rq); 
1032 i915_request_put(prev); 1033 err = -EINVAL; 1034 goto fini; 1035 } 1036 1037 if (i915_reset_count(&i915->gpu_error) == reset_count) { 1038 pr_err("No GPU reset recorded!\n"); 1039 i915_request_put(rq); 1040 i915_request_put(prev); 1041 err = -EINVAL; 1042 goto fini; 1043 } 1044 1045 i915_request_put(prev); 1046 prev = rq; 1047 count++; 1048 } while (time_before(jiffies, end_time)); 1049 pr_info("%s: Completed %d resets\n", engine->name, count); 1050 1051 *h.batch = MI_BATCH_BUFFER_END; 1052 i915_gem_chipset_flush(i915); 1053 1054 i915_request_put(prev); 1055 1056 err = flush_test(i915, I915_WAIT_LOCKED); 1057 if (err) 1058 break; 1059 } 1060 1061 fini: 1062 hang_fini(&h); 1063 unlock: 1064 mutex_unlock(&i915->drm.struct_mutex); 1065 global_reset_unlock(i915); 1066 1067 if (i915_terminally_wedged(&i915->gpu_error)) 1068 return -EIO; 1069 1070 return err; 1071 } 1072 --- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
Hi Chris, Thank you for the patch! Perhaps something to improve: [auto build test WARNING on drm-intel/for-linux-next] [also build test WARNING on v4.16-rc7 next-20180329] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Chris-Wilson/drm-i915-selftests-Avoid-repeatedly-harming-the-same-innocent-context/20180401-022503 base: git://anongit.freedesktop.org/drm-intel for-linux-next reproduce: # apt-get install sparse make ARCH=x86_64 allmodconfig make C=1 CF=-D__CHECK_ENDIAN__ sparse warnings: (new ones prefixed by >>) drivers/gpu/drm/i915/selftests/intel_hangcheck.c:988:33: sparse: undefined identifier 'GEM_TRACE_DUMP' >> drivers/gpu/drm/i915/selftests/intel_hangcheck.c:988:47: sparse: call with no type! In file included from drivers/gpu/drm/i915/intel_hangcheck.c:465:0: drivers/gpu/drm/i915/selftests/intel_hangcheck.c: In function 'igt_reset_queue': drivers/gpu/drm/i915/selftests/intel_hangcheck.c:988:5: error: implicit declaration of function 'GEM_TRACE_DUMP'; did you mean 'GEM_TRACE'? 
[-Werror=implicit-function-declaration] GEM_TRACE_DUMP(); ^~~~~~~~~~~~~~ GEM_TRACE cc1: some warnings being treated as errors vim +988 drivers/gpu/drm/i915/selftests/intel_hangcheck.c 922 923 static int igt_reset_queue(void *arg) 924 { 925 struct drm_i915_private *i915 = arg; 926 struct intel_engine_cs *engine; 927 enum intel_engine_id id; 928 struct hang h; 929 int err; 930 931 /* Check that we replay pending requests following a hang */ 932 933 global_reset_lock(i915); 934 935 mutex_lock(&i915->drm.struct_mutex); 936 err = hang_init(&h, i915); 937 if (err) 938 goto unlock; 939 940 for_each_engine(engine, i915, id) { 941 struct i915_request *prev; 942 IGT_TIMEOUT(end_time); 943 unsigned int count; 944 945 if (!intel_engine_can_store_dword(engine)) 946 continue; 947 948 prev = hang_create_request(&h, engine); 949 if (IS_ERR(prev)) { 950 err = PTR_ERR(prev); 951 goto fini; 952 } 953 954 i915_request_get(prev); 955 __i915_request_add(prev, true); 956 957 count = 0; 958 do { 959 struct i915_request *rq; 960 unsigned int reset_count; 961 962 rq = hang_create_request(&h, engine); 963 if (IS_ERR(rq)) { 964 err = PTR_ERR(rq); 965 goto fini; 966 } 967 968 i915_request_get(rq); 969 __i915_request_add(rq, true); 970 971 /* 972 * XXX We don't handle resetting the kernel context 973 * very well. If we trigger a device reset twice in 974 * quick succession while the kernel context is 975 * executing, we may end up skipping the breadcrumb. 976 * This is really only a problem for the selftest as 977 * normally there is a large interlude between resets 978 * (hangcheck), or we focus on resetting just one 979 * engine and so avoid repeatedly resetting innocents. 
980 */ 981 err = wait_for_others(i915, engine); 982 if (err) { 983 pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", 984 __func__, engine->name); 985 i915_request_put(rq); 986 i915_request_put(prev); 987 > 988 GEM_TRACE_DUMP(); 989 i915_gem_set_wedged(i915); 990 goto fini; 991 } 992 993 if (!wait_for_hang(&h, prev)) { 994 struct drm_printer p = drm_info_printer(i915->drm.dev); 995 996 pr_err("%s(%s): Failed to start request %x, at %x\n", 997 __func__, engine->name, 998 prev->fence.seqno, hws_seqno(&h, prev)); 999 intel_engine_dump(engine, &p, 1000 "%s\n", engine->name); 1001 1002 i915_request_put(rq); 1003 i915_request_put(prev); 1004 1005 i915_reset(i915, 0); 1006 i915_gem_set_wedged(i915); 1007 1008 err = -EIO; 1009 goto fini; 1010 } 1011 1012 reset_count = fake_hangcheck(prev); 1013 1014 i915_reset(i915, I915_RESET_QUIET); 1015 1016 GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, 1017 &i915->gpu_error.flags)); 1018 1019 if (prev->fence.error != -EIO) { 1020 pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n", 1021 prev->fence.error); 1022 i915_request_put(rq); 1023 i915_request_put(prev); 1024 err = -EINVAL; 1025 goto fini; 1026 } 1027 1028 if (rq->fence.error) { 1029 pr_err("Fence error status not zero [%d] after unrelated reset\n", 1030 rq->fence.error); 1031 i915_request_put(rq); 1032 i915_request_put(prev); 1033 err = -EINVAL; 1034 goto fini; 1035 } 1036 1037 if (i915_reset_count(&i915->gpu_error) == reset_count) { 1038 pr_err("No GPU reset recorded!\n"); 1039 i915_request_put(rq); 1040 i915_request_put(prev); 1041 err = -EINVAL; 1042 goto fini; 1043 } 1044 1045 i915_request_put(prev); 1046 prev = rq; 1047 count++; 1048 } while (time_before(jiffies, end_time)); 1049 pr_info("%s: Completed %d resets\n", engine->name, count); 1050 1051 *h.batch = MI_BATCH_BUFFER_END; 1052 i915_gem_chipset_flush(i915); 1053 1054 i915_request_put(prev); 1055 1056 err = flush_test(i915, I915_WAIT_LOCKED); 1057 if (err) 1058 break; 1059 } 1060 
1061 fini: 1062 hang_fini(&h); 1063 unlock: 1064 mutex_unlock(&i915->drm.struct_mutex); 1065 global_reset_unlock(i915); 1066 1067 if (i915_terminally_wedged(&i915->gpu_error)) 1068 return -EIO; 1069 1070 return err; 1071 } 1072 --- 0-DAY kernel test infrastructure Open Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation
Quoting Chris Wilson (2018-03-30 14:18:01) > We don't handle resetting the kernel context very well, or presumably any > context executing its breadcrumb commands in the ring as opposed to the > batchbuffer and flush. If we trigger a device reset twice in quick > succession while the kernel context is executing, we may end up skipping > the breadcrumb. This is really only a problem for the selftest as > normally there is a large interlude between resets (hangcheck), or we > focus on resetting just one engine and so avoid repeatedly resetting > innocents. > > Something to try would be a preempt-to-idle to quiesce the engine before > reset, so that innocent contexts would be spared the reset. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> > Cc: Michał Winiarski <michal.winiarski@intel.com> > CC: Michel Thierry <michel.thierry@intel.com> Anyone? > --- > drivers/gpu/drm/i915/i915_drv.c | 3 ++ > drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 48 ++++++++++++++++++++++-- > 2 files changed, 47 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c > index d354627882e3..684060ed8db6 100644 > --- a/drivers/gpu/drm/i915/i915_drv.c > +++ b/drivers/gpu/drm/i915/i915_drv.c > @@ -1886,6 +1886,8 @@ void i915_reset(struct drm_i915_private *i915) > int ret; > int i; > > + GEM_TRACE("flags=%lx\n", error->flags); > + > might_sleep(); > lockdep_assert_held(&i915->drm.struct_mutex); > GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags)); > @@ -2016,6 +2018,7 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg) > struct i915_request *active_request; > int ret; > > + GEM_TRACE("%s flags=%lx\n", engine->name, error->flags); > GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags)); > > active_request = i915_gem_reset_prepare_engine(engine); > diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c 
b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > index 9e4e0ad62724..122a32e0a69d 100644 > --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > @@ -979,6 +979,23 @@ static int igt_wait_reset(void *arg) > return err; > } > > +static int wait_for_others(struct drm_i915_private *i915, > + struct intel_engine_cs *exclude) > +{ > + struct intel_engine_cs *engine; > + enum intel_engine_id id; > + > + for_each_engine(engine, i915, id) { > + if (engine == exclude) > + continue; > + > + if (wait_for(intel_engine_is_idle(engine), 10)) > + return -EIO; > + } > + > + return 0; > +} > + > static int igt_reset_queue(void *arg) > { > struct drm_i915_private *i915 = arg; > @@ -1027,13 +1044,36 @@ static int igt_reset_queue(void *arg) > i915_request_get(rq); > __i915_request_add(rq, true); > > + /* > + * XXX We don't handle resetting the kernel context > + * very well. If we trigger a device reset twice in > + * quick succession while the kernel context is > + * executing, we may end up skipping the breadcrumb. > + * This is really only a problem for the selftest as > + * normally there is a large interlude between resets > + * (hangcheck), or we focus on resetting just one > + * engine and so avoid repeatedly resetting innocents. 
> + */ > + err = wait_for_others(i915, engine); > + if (err) { > + pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", > + __func__, engine->name); > + i915_request_put(rq); > + i915_request_put(prev); > + > + GEM_TRACE_DUMP(); > + i915_gem_set_wedged(i915); > + goto fini; > + } > + > if (!wait_for_hang(&h, prev)) { > struct drm_printer p = drm_info_printer(i915->drm.dev); > > - pr_err("%s: Failed to start request %x, at %x\n", > - __func__, prev->fence.seqno, hws_seqno(&h, prev)); > - intel_engine_dump(prev->engine, &p, > - "%s\n", prev->engine->name); > + pr_err("%s(%s): Failed to start request %x, at %x\n", > + __func__, engine->name, > + prev->fence.seqno, hws_seqno(&h, prev)); > + intel_engine_dump(engine, &p, > + "%s\n", engine->name); > > i915_request_put(rq); > i915_request_put(prev); > -- > 2.16.3 >
On 30/03/2018 14:18, Chris Wilson wrote: > We don't handle resetting the kernel context very well, or presumably any > context executing its breadcrumb commands in the ring as opposed to the > batchbuffer and flush. If we trigger a device reset twice in quick > succession while the kernel context is executing, we may end up skipping > the breadcrumb. This is really only a problem for the selftest as > normally there is a large interlude between resets (hangcheck), or we > focus on resetting just one engine and so avoid repeatedly resetting > innocents. > > Something to try would be a preempt-to-idle to quiesce the engine before > reset, so that innocent contexts would be spared the reset. > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> > Cc: Michał Winiarski <michal.winiarski@intel.com> > CC: Michel Thierry <michel.thierry@intel.com> > --- > drivers/gpu/drm/i915/i915_drv.c | 3 ++ > drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 48 ++++++++++++++++++++++-- > 2 files changed, 47 insertions(+), 4 deletions(-) > > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c > index d354627882e3..684060ed8db6 100644 > --- a/drivers/gpu/drm/i915/i915_drv.c > +++ b/drivers/gpu/drm/i915/i915_drv.c > @@ -1886,6 +1886,8 @@ void i915_reset(struct drm_i915_private *i915) > int ret; > int i; > > + GEM_TRACE("flags=%lx\n", error->flags); > + > might_sleep(); > lockdep_assert_held(&i915->drm.struct_mutex); > GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags)); > @@ -2016,6 +2018,7 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg) > struct i915_request *active_request; > int ret; > > + GEM_TRACE("%s flags=%lx\n", engine->name, error->flags); > GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags)); > > active_request = i915_gem_reset_prepare_engine(engine); > diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c 
b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > index 9e4e0ad62724..122a32e0a69d 100644 > --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c > @@ -979,6 +979,23 @@ static int igt_wait_reset(void *arg) > return err; > } > > +static int wait_for_others(struct drm_i915_private *i915, > + struct intel_engine_cs *exclude) > +{ > + struct intel_engine_cs *engine; > + enum intel_engine_id id; > + > + for_each_engine(engine, i915, id) { > + if (engine == exclude)> + continue; Bike-shedder in me says: for_each_engine_masked(engine, i915, ~BIT(exclude->id), id) :) Although for_each_engine_masked failed to enclose its arguments in parentheses so never mind. > + > + if (wait_for(intel_engine_is_idle(engine), 10)) > + return -EIO; > + } > + > + return 0; > +} > + > static int igt_reset_queue(void *arg) > { > struct drm_i915_private *i915 = arg; > @@ -1027,13 +1044,36 @@ static int igt_reset_queue(void *arg) > i915_request_get(rq); > __i915_request_add(rq, true); > > + /* > + * XXX We don't handle resetting the kernel context > + * very well. If we trigger a device reset twice in > + * quick succession while the kernel context is > + * executing, we may end up skipping the breadcrumb. > + * This is really only a problem for the selftest as > + * normally there is a large interlude between resets > + * (hangcheck), or we focus on resetting just one > + * engine and so avoid repeatedly resetting innocents. > + */ > + err = wait_for_others(i915, engine); > + if (err) { > + pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", > + __func__, engine->name); > + i915_request_put(rq); > + i915_request_put(prev); > + > + GEM_TRACE_DUMP(); > + i915_gem_set_wedged(i915); > + goto fini; > + } > + > if (!wait_for_hang(&h, prev)) { This was confusing me a period since I was assuming the seqno check is against the breadcrumb, but it is actually about the same number written to a different place. 
So it actually means wait_for_request_to_start_executing. > struct drm_printer p = drm_info_printer(i915->drm.dev); > > - pr_err("%s: Failed to start request %x, at %x\n", > - __func__, prev->fence.seqno, hws_seqno(&h, prev)); > - intel_engine_dump(prev->engine, &p, > - "%s\n", prev->engine->name); > + pr_err("%s(%s): Failed to start request %x, at %x\n", > + __func__, engine->name, > + prev->fence.seqno, hws_seqno(&h, prev)); > + intel_engine_dump(engine, &p, > + "%s\n", engine->name); > > i915_request_put(rq); > i915_request_put(prev); > Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Regards, Tvrtko
Quoting Tvrtko Ursulin (2018-04-06 10:11:57) > > On 30/03/2018 14:18, Chris Wilson wrote: > > if (!wait_for_hang(&h, prev)) { > > This was confusing me for a period since I was assuming the seqno check is > against the breadcrumb, but it is actually about the same number written > to a different place. So it actually means > wait_for_request_to_start_executing. Bonus points for a better name than struct hang. struct spinner; spinner_wait_until_active() ? -Chris
On 06/04/2018 10:40, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2018-04-06 10:11:57) >> >> On 30/03/2018 14:18, Chris Wilson wrote: >>> if (!wait_for_hang(&h, prev)) { >> >> This was confusing me for a period since I was assuming the seqno check is >> against the breadcrumb, but it is actually about the same number written >> to a different place. So it actually means >> wait_for_request_to_start_executing. > > Bonus points for a better name than struct hang. > > struct spinner; > > spinner_wait_until_active() ? Maybe just rename the helper to wait_until_running if you feel like it. To keep things simple and scope in check. Regards, Tvrtko
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index d354627882e3..684060ed8db6 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -1886,6 +1886,8 @@ void i915_reset(struct drm_i915_private *i915) int ret; int i; + GEM_TRACE("flags=%lx\n", error->flags); + might_sleep(); lockdep_assert_held(&i915->drm.struct_mutex); GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags)); @@ -2016,6 +2018,7 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg) struct i915_request *active_request; int ret; + GEM_TRACE("%s flags=%lx\n", engine->name, error->flags); GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags)); active_request = i915_gem_reset_prepare_engine(engine); diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c index 9e4e0ad62724..122a32e0a69d 100644 --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c @@ -979,6 +979,23 @@ static int igt_wait_reset(void *arg) return err; } +static int wait_for_others(struct drm_i915_private *i915, + struct intel_engine_cs *exclude) +{ + struct intel_engine_cs *engine; + enum intel_engine_id id; + + for_each_engine(engine, i915, id) { + if (engine == exclude) + continue; + + if (wait_for(intel_engine_is_idle(engine), 10)) + return -EIO; + } + + return 0; +} + static int igt_reset_queue(void *arg) { struct drm_i915_private *i915 = arg; @@ -1027,13 +1044,36 @@ static int igt_reset_queue(void *arg) i915_request_get(rq); __i915_request_add(rq, true); + /* + * XXX We don't handle resetting the kernel context + * very well. If we trigger a device reset twice in + * quick succession while the kernel context is + * executing, we may end up skipping the breadcrumb. 
+ * This is really only a problem for the selftest as + * normally there is a large interlude between resets + * (hangcheck), or we focus on resetting just one + * engine and so avoid repeatedly resetting innocents. + */ + err = wait_for_others(i915, engine); + if (err) { + pr_err("%s(%s): Failed to idle other inactive engines after device reset\n", + __func__, engine->name); + i915_request_put(rq); + i915_request_put(prev); + + GEM_TRACE_DUMP(); + i915_gem_set_wedged(i915); + goto fini; + } + if (!wait_for_hang(&h, prev)) { struct drm_printer p = drm_info_printer(i915->drm.dev); - pr_err("%s: Failed to start request %x, at %x\n", - __func__, prev->fence.seqno, hws_seqno(&h, prev)); - intel_engine_dump(prev->engine, &p, - "%s\n", prev->engine->name); + pr_err("%s(%s): Failed to start request %x, at %x\n", + __func__, engine->name, + prev->fence.seqno, hws_seqno(&h, prev)); + intel_engine_dump(engine, &p, + "%s\n", engine->name); i915_request_put(rq); i915_request_put(prev);
We don't handle resetting the kernel context very well, or presumably any context executing its breadcrumb commands in the ring as opposed to the batchbuffer and flush. If we trigger a device reset twice in quick succession while the kernel context is executing, we may end up skipping the breadcrumb. This is really only a problem for the selftest as normally there is a large interlude between resets (hangcheck), or we focus on resetting just one engine and so avoid repeatedly resetting innocents. Something to try would be a preempt-to-idle to quiesce the engine before reset, so that innocent contexts would be spared the reset. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> Cc: Michał Winiarski <michal.winiarski@intel.com> CC: Michel Thierry <michel.thierry@intel.com> --- drivers/gpu/drm/i915/i915_drv.c | 3 ++ drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 48 ++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 4 deletions(-)