Message ID | wlnlppj2mmqqv2rrsboy2uddzkxiuxcsevgfkqjvo3f7hdtkxs@arkfgtgqrqyx (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | i915/gt/selftest_lrc: Remove timestamp test | expand |
On 04/03/2025 13:09, Mikolaj Wasiak wrote: > This test exposes bug in tigerlake hardware which prevents it from > succeeding. Since the tested feature is only available on bugged hardware > and we won't support any new hardware, this test is obsolete and > should be removed. I randomly clicked on one TGL, one DG2, one MTL and one RKL in the CI and only saw test passes. Then I looked at the patch below to see if there is a skip condition but don't see one. So I end up confused since commit message is making it sound like this only exists on Tigerlake and it's failing all the time. Is it perhaps a sporadic failure? On all platforms or just TGL? What am I missing? Regards, Tvrtko > Signed-off-by: Mikolaj Wasiak <mikolaj.wasiak@intel.com> > --- > drivers/gpu/drm/i915/gt/selftest_lrc.c | 215 ------------------------- > 1 file changed, 215 deletions(-) > > diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c > index 22e750108c5f..aa9b8af61ba6 100644 > --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c > +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c > @@ -105,32 +105,6 @@ static int emit_semaphore_signal(struct intel_context *ce, void *slot) > return 0; > } > > -static int context_flush(struct intel_context *ce, long timeout) > -{ > - struct i915_request *rq; > - struct dma_fence *fence; > - int err = 0; > - > - rq = intel_engine_create_kernel_request(ce->engine); > - if (IS_ERR(rq)) > - return PTR_ERR(rq); > - > - fence = i915_active_fence_get(&ce->timeline->last_request); > - if (fence) { > - i915_request_await_dma_fence(rq, fence); > - dma_fence_put(fence); > - } > - > - rq = i915_request_get(rq); > - i915_request_add(rq); > - if (i915_request_wait(rq, 0, timeout) < 0) > - err = -ETIME; > - i915_request_put(rq); > - > - rmb(); /* We know the request is written, make sure all state is too! */ > - return err; > -} > - > static int get_lri_mask(struct intel_engine_cs *engine, u32 lri) > { > if ((lri & MI_LRI_LRM_CS_MMIO) == 0) > @@ -733,194 +707,6 @@ static int live_lrc_gpr(void *arg) > return err; > } > > -static struct i915_request * > -create_timestamp(struct intel_context *ce, void *slot, int idx) > -{ > - const u32 offset = > - i915_ggtt_offset(ce->engine->status_page.vma) + > - offset_in_page(slot); > - struct i915_request *rq; > - u32 *cs; > - int err; > - > - rq = intel_context_create_request(ce); > - if (IS_ERR(rq)) > - return rq; > - > - cs = intel_ring_begin(rq, 10); > - if (IS_ERR(cs)) { > - err = PTR_ERR(cs); > - goto err; > - } > - > - *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; > - *cs++ = MI_NOOP; > - > - *cs++ = MI_SEMAPHORE_WAIT | > - MI_SEMAPHORE_GLOBAL_GTT | > - MI_SEMAPHORE_POLL | > - MI_SEMAPHORE_SAD_NEQ_SDD; > - *cs++ = 0; > - *cs++ = offset; > - *cs++ = 0; > - > - *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; > - *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(rq->engine->mmio_base)); > - *cs++ = offset + idx * sizeof(u32); > - *cs++ = 0; > - > - intel_ring_advance(rq, cs); > - > - err = 0; > -err: > - i915_request_get(rq); > - i915_request_add(rq); > - if (err) { > - i915_request_put(rq); > - return ERR_PTR(err); > - } > - > - return rq; > -} > - > -struct lrc_timestamp { > - struct intel_engine_cs *engine; > - struct intel_context *ce[2]; > - u32 poison; > -}; > - > -static bool timestamp_advanced(u32 start, u32 end) > -{ > - return (s32)(end - start) > 0; > -} > - > -static int __lrc_timestamp(const struct lrc_timestamp *arg, bool preempt) > -{ > - u32 *slot = memset32(arg->engine->status_page.addr + 1000, 0, 4); > - struct i915_request *rq; > - u32 timestamp; > - int err = 0; > - > - arg->ce[0]->lrc_reg_state[CTX_TIMESTAMP] = arg->poison; > - rq = create_timestamp(arg->ce[0], slot, 1); > - if (IS_ERR(rq)) > - return PTR_ERR(rq); > - > - err = wait_for_submit(rq->engine, rq, HZ / 2); > - if (err) > - goto err; > - > - if (preempt) { > - arg->ce[1]->lrc_reg_state[CTX_TIMESTAMP] = 0xdeadbeef; > - err = emit_semaphore_signal(arg->ce[1], slot); > - if (err) > - goto err; > - } else { > - slot[0] = 1; > - wmb(); > - } > - > - /* And wait for switch to kernel (to save our context to memory) */ > - err = context_flush(arg->ce[0], HZ / 2); > - if (err) > - goto err; > - > - if (!timestamp_advanced(arg->poison, slot[1])) { > - pr_err("%s(%s): invalid timestamp on restore, context:%x, request:%x\n", > - arg->engine->name, preempt ? "preempt" : "simple", > - arg->poison, slot[1]); > - err = -EINVAL; > - } > - > - timestamp = READ_ONCE(arg->ce[0]->lrc_reg_state[CTX_TIMESTAMP]); > - if (!timestamp_advanced(slot[1], timestamp)) { > - pr_err("%s(%s): invalid timestamp on save, request:%x, context:%x\n", > - arg->engine->name, preempt ? "preempt" : "simple", > - slot[1], timestamp); > - err = -EINVAL; > - } > - > -err: > - memset32(slot, -1, 4); > - i915_request_put(rq); > - return err; > -} > - > -static int live_lrc_timestamp(void *arg) > -{ > - struct lrc_timestamp data = {}; > - struct intel_gt *gt = arg; > - enum intel_engine_id id; > - const u32 poison[] = { > - 0, > - S32_MAX, > - (u32)S32_MAX + 1, > - U32_MAX, > - }; > - > - /* > - * We want to verify that the timestamp is saved and restore across > - * context switches and is monotonic. > - * > - * So we do this with a little bit of LRC poisoning to check various > - * boundary conditions, and see what happens if we preempt the context > - * with a second request (carrying more poison into the timestamp). > - */ > - > - for_each_engine(data.engine, gt, id) { > - int i, err = 0; > - > - st_engine_heartbeat_disable(data.engine); > - > - for (i = 0; i < ARRAY_SIZE(data.ce); i++) { > - struct intel_context *tmp; > - > - tmp = intel_context_create(data.engine); > - if (IS_ERR(tmp)) { > - err = PTR_ERR(tmp); > - goto err; > - } > - > - err = intel_context_pin(tmp); > - if (err) { > - intel_context_put(tmp); > - goto err; > - } > - > - data.ce[i] = tmp; > - } > - > - for (i = 0; i < ARRAY_SIZE(poison); i++) { > - data.poison = poison[i]; > - > - err = __lrc_timestamp(&data, false); > - if (err) > - break; > - > - err = __lrc_timestamp(&data, true); > - if (err) > - break; > - } > - > -err: > - st_engine_heartbeat_enable(data.engine); > - for (i = 0; i < ARRAY_SIZE(data.ce); i++) { > - if (!data.ce[i]) > - break; > - > - intel_context_unpin(data.ce[i]); > - intel_context_put(data.ce[i]); > - } > - > - if (igt_flush_test(gt->i915)) > - err = -EIO; > - if (err) > - return err; > - } > - > - return 0; > -} > - > static struct i915_vma * > create_user_vma(struct i915_address_space *vm, unsigned long size) > { > @@ -1971,7 +1757,6 @@ int intel_lrc_live_selftests(struct drm_i915_private *i915) > SUBTEST(live_lrc_state), > SUBTEST(live_lrc_gpr), > SUBTEST(live_lrc_isolation), > - SUBTEST(live_lrc_timestamp), > SUBTEST(live_lrc_garbage), > SUBTEST(live_pphwsp_runtime), > SUBTEST(live_lrc_indirect_ctx_bb),
Hi Mikolaj On Tue Mar 4, 2025 at 1:09 PM UTC, Mikolaj Wasiak wrote: > This test exposes bug in tigerlake hardware which prevents it from > succeeding. Since the tested feature is only available on bugged hardware > and we won't support any new hardware, this test is obsolete and > should be removed. > > Signed-off-by: Mikolaj Wasiak <mikolaj.wasiak@intel.com> Could you please, add some more explanation. At last for me it's not clear why test expose bug on tigerlake and at the same time you claim that it's not supported. Also, will be nice if you explain what bug are you fixing exacly.
Quoting Tvrtko Ursulin (2025-03-04 16:43:45) > > On 04/03/2025 13:09, Mikolaj Wasiak wrote: > > This test exposes bug in tigerlake hardware which prevents it from > > succeeding. Since the tested feature is only available on bugged hardware > > and we won't support any new hardware, this test is obsolete and > > should be removed. > > I randomly clicked on one TGL, one DG2, one MTL and one RKL in the CI > and only saw test passes. Then I looked at the patch below to see if > there is a skip condition but don't see one. So I end up confused since > commit message is making it sound like this only exists on Tigerlake and > it's failing all the time. Is it perhaps a sporadic failure? On all > platforms or just TGL? What am I missing? The HW issue affects all gen12 platforms currently supported by i915. I don't have any data for derivatives, so I cannot confirm if this bug was fixed. The lrc_timestamp test was written to demonstrate this HW bug, to isolate it from (and explain) the pphwsp runtime discrepancies, covered by another selftest. The question is whether we want to keep a selftest that is expected to sporadically fail, that exists purely to hunt for those failures. In the past, we have kept such selftests, but hidden them behind !IS_ENABLED(CONFIG_DRM_I915_SELFTEST_BROKEN). So, - keep the selftest and expect sporadic failures in BAT, or - remove the selftest and completely forget about the HW issue, or - hide the selftest and stop it running on known bad platforms? -Chris
Hi, > Quoting Tvrtko Ursulin (2025-03-04 16:43:45) > > > > On 04/03/2025 13:09, Mikolaj Wasiak wrote: > > > This test exposes bug in tigerlake hardware which prevents it from > > > succeeding. Since the tested feature is only available on bugged hardware > > > and we won't support any new hardware, this test is obsolete and > > > should be removed. > > > > I randomly clicked on one TGL, one DG2, one MTL and one RKL in the CI > > and only saw test passes. Then I looked at the patch below to see if > > there is a skip condition but don't see one. So I end up confused since > > commit message is making it sound like this only exists on Tigerlake and > > it's failing all the time. Is it perhaps a sporadic failure? On all > > platforms or just TGL? What am I missing? > > The HW issue affects all gen12 platforms currently supported by i915. I > don't have any data for derivatives, so I cannot confirm if this bug was > fixed. The lrc_timestamp test was written to demonstrate this HW bug, to > isolate it from (and explain) the pphwsp runtime discrepancies, covered > by another selftest. The question is whether we want to keep a selftest > that is expected to sporadically fail, that exists purely to hunt for > those failures. > > In the past, we have kept such selftests, but hidden them behind > !IS_ENABLED(CONFIG_DRM_I915_SELFTEST_BROKEN). > > So, > - keep the selftest and expect sporadic failures in BAT, or We cannot rely on such test that "sometimes" fails. If we cannot ensure it works properly and provides predictable results in our environment then we should not run it, I believe. Furthermore, this may cause new bug reports to be filled for the same issue over and over again in the future. > - remove the selftest and completely forget about the HW issue, or Can we do anything about that HW issue? :) > - hide the selftest and stop it running on known bad platforms? This seems like it could be a solution here. I have a question though: would that render the test hidden behind that setting unused in CI? Or is it a similar situation to "FIXME" notes that tell us that somebody is aware of the issue, but could not address it at the time? Best Regards, Krzysztof
Quoting Krzysztof Karas (2025-03-06 16:07:02) > > - remove the selftest and completely forget about the HW issue, or > Can we do anything about that HW issue? :) The root cause was found. > > - hide the selftest and stop it running on known bad platforms? > This seems like it could be a solution here. I have a question > though: would that render the test hidden behind that setting > unused in CI? Or is it a similar situation to "FIXME" notes that > tell us that somebody is aware of the issue, but could not > address it at the time? We don't enable CONFIG_DRM_I915_SELFTEST_BROKEN for CI builds. -Chris
On 06/03/2025 10:37, Chris Wilson wrote: > Quoting Tvrtko Ursulin (2025-03-04 16:43:45) >> >> On 04/03/2025 13:09, Mikolaj Wasiak wrote: >>> This test exposes bug in tigerlake hardware which prevents it from >>> succeeding. Since the tested feature is only available on bugged hardware >>> and we won't support any new hardware, this test is obsolete and >>> should be removed. >> >> I randomly clicked on one TGL, one DG2, one MTL and one RKL in the CI >> and only saw test passes. Then I looked at the patch below to see if >> there is a skip condition but don't see one. So I end up confused since >> commit message is making it sound like this only exists on Tigerlake and >> it's failing all the time. Is it perhaps a sporadic failure? On all >> platforms or just TGL? What am I missing? > > The HW issue affects all gen12 platforms currently supported by i915. I > don't have any data for derivatives, so I cannot confirm if this bug was > fixed. The lrc_timestamp test was written to demonstrate this HW bug, to > isolate it from (and explain) the pphwsp runtime discrepancies, covered > by another selftest. The question is whether we want to keep a selftest > that is expected to sporadically fail, that exists purely to hunt for > those failures. > > In the past, we have kept such selftests, but hidden them behind > !IS_ENABLED(CONFIG_DRM_I915_SELFTEST_BROKEN). > > So, > - keep the selftest and expect sporadic failures in BAT, or Up to Intel - it's not the first sporadically failing test and in the past at least those were handled. > - remove the selftest and completely forget about the HW issue, or > - hide the selftest and stop it running on known bad platforms? Either of these two are also fine I think, as long as, if the removal is chosen, it is made sure that either we already have the comment briefly explaining the above somewhere in code, at a suitable location, or that a brief comment is added with the removal. And commit message improved to be less misleading about the failure frequency. Regards, Tvrtko
diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c index 22e750108c5f..aa9b8af61ba6 100644 --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c @@ -105,32 +105,6 @@ static int emit_semaphore_signal(struct intel_context *ce, void *slot) return 0; } -static int context_flush(struct intel_context *ce, long timeout) -{ - struct i915_request *rq; - struct dma_fence *fence; - int err = 0; - - rq = intel_engine_create_kernel_request(ce->engine); - if (IS_ERR(rq)) - return PTR_ERR(rq); - - fence = i915_active_fence_get(&ce->timeline->last_request); - if (fence) { - i915_request_await_dma_fence(rq, fence); - dma_fence_put(fence); - } - - rq = i915_request_get(rq); - i915_request_add(rq); - if (i915_request_wait(rq, 0, timeout) < 0) - err = -ETIME; - i915_request_put(rq); - - rmb(); /* We know the request is written, make sure all state is too! */ - return err; -} - static int get_lri_mask(struct intel_engine_cs *engine, u32 lri) { if ((lri & MI_LRI_LRM_CS_MMIO) == 0) @@ -733,194 +707,6 @@ static int live_lrc_gpr(void *arg) return err; } -static struct i915_request * -create_timestamp(struct intel_context *ce, void *slot, int idx) -{ - const u32 offset = - i915_ggtt_offset(ce->engine->status_page.vma) + - offset_in_page(slot); - struct i915_request *rq; - u32 *cs; - int err; - - rq = intel_context_create_request(ce); - if (IS_ERR(rq)) - return rq; - - cs = intel_ring_begin(rq, 10); - if (IS_ERR(cs)) { - err = PTR_ERR(cs); - goto err; - } - - *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; - *cs++ = MI_NOOP; - - *cs++ = MI_SEMAPHORE_WAIT | - MI_SEMAPHORE_GLOBAL_GTT | - MI_SEMAPHORE_POLL | - MI_SEMAPHORE_SAD_NEQ_SDD; - *cs++ = 0; - *cs++ = offset; - *cs++ = 0; - - *cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT; - *cs++ = i915_mmio_reg_offset(RING_CTX_TIMESTAMP(rq->engine->mmio_base)); - *cs++ = offset + idx * sizeof(u32); - *cs++ = 0; - - intel_ring_advance(rq, cs); - - err = 0; -err: - i915_request_get(rq); - i915_request_add(rq); - if (err) { - i915_request_put(rq); - return ERR_PTR(err); - } - - return rq; -} - -struct lrc_timestamp { - struct intel_engine_cs *engine; - struct intel_context *ce[2]; - u32 poison; -}; - -static bool timestamp_advanced(u32 start, u32 end) -{ - return (s32)(end - start) > 0; -} - -static int __lrc_timestamp(const struct lrc_timestamp *arg, bool preempt) -{ - u32 *slot = memset32(arg->engine->status_page.addr + 1000, 0, 4); - struct i915_request *rq; - u32 timestamp; - int err = 0; - - arg->ce[0]->lrc_reg_state[CTX_TIMESTAMP] = arg->poison; - rq = create_timestamp(arg->ce[0], slot, 1); - if (IS_ERR(rq)) - return PTR_ERR(rq); - - err = wait_for_submit(rq->engine, rq, HZ / 2); - if (err) - goto err; - - if (preempt) { - arg->ce[1]->lrc_reg_state[CTX_TIMESTAMP] = 0xdeadbeef; - err = emit_semaphore_signal(arg->ce[1], slot); - if (err) - goto err; - } else { - slot[0] = 1; - wmb(); - } - - /* And wait for switch to kernel (to save our context to memory) */ - err = context_flush(arg->ce[0], HZ / 2); - if (err) - goto err; - - if (!timestamp_advanced(arg->poison, slot[1])) { - pr_err("%s(%s): invalid timestamp on restore, context:%x, request:%x\n", - arg->engine->name, preempt ? "preempt" : "simple", - arg->poison, slot[1]); - err = -EINVAL; - } - - timestamp = READ_ONCE(arg->ce[0]->lrc_reg_state[CTX_TIMESTAMP]); - if (!timestamp_advanced(slot[1], timestamp)) { - pr_err("%s(%s): invalid timestamp on save, request:%x, context:%x\n", - arg->engine->name, preempt ? "preempt" : "simple", - slot[1], timestamp); - err = -EINVAL; - } - -err: - memset32(slot, -1, 4); - i915_request_put(rq); - return err; -} - -static int live_lrc_timestamp(void *arg) -{ - struct lrc_timestamp data = {}; - struct intel_gt *gt = arg; - enum intel_engine_id id; - const u32 poison[] = { - 0, - S32_MAX, - (u32)S32_MAX + 1, - U32_MAX, - }; - - /* - * We want to verify that the timestamp is saved and restore across - * context switches and is monotonic. - * - * So we do this with a little bit of LRC poisoning to check various - * boundary conditions, and see what happens if we preempt the context - * with a second request (carrying more poison into the timestamp). - */ - - for_each_engine(data.engine, gt, id) { - int i, err = 0; - - st_engine_heartbeat_disable(data.engine); - - for (i = 0; i < ARRAY_SIZE(data.ce); i++) { - struct intel_context *tmp; - - tmp = intel_context_create(data.engine); - if (IS_ERR(tmp)) { - err = PTR_ERR(tmp); - goto err; - } - - err = intel_context_pin(tmp); - if (err) { - intel_context_put(tmp); - goto err; - } - - data.ce[i] = tmp; - } - - for (i = 0; i < ARRAY_SIZE(poison); i++) { - data.poison = poison[i]; - - err = __lrc_timestamp(&data, false); - if (err) - break; - - err = __lrc_timestamp(&data, true); - if (err) - break; - } - -err: - st_engine_heartbeat_enable(data.engine); - for (i = 0; i < ARRAY_SIZE(data.ce); i++) { - if (!data.ce[i]) - break; - - intel_context_unpin(data.ce[i]); - intel_context_put(data.ce[i]); - } - - if (igt_flush_test(gt->i915)) - err = -EIO; - if (err) - return err; - } - - return 0; -} - static struct i915_vma * create_user_vma(struct i915_address_space *vm, unsigned long size) { @@ -1971,7 +1757,6 @@ int intel_lrc_live_selftests(struct drm_i915_private *i915) SUBTEST(live_lrc_state), SUBTEST(live_lrc_gpr), SUBTEST(live_lrc_isolation), - SUBTEST(live_lrc_timestamp), SUBTEST(live_lrc_garbage), SUBTEST(live_pphwsp_runtime), SUBTEST(live_lrc_indirect_ctx_bb),
This test exposes bug in tigerlake hardware which prevents it from succeeding. Since the tested feature is only available on bugged hardware and we won't support any new hardware, this test is obsolete and should be removed. Signed-off-by: Mikolaj Wasiak <mikolaj.wasiak@intel.com> --- drivers/gpu/drm/i915/gt/selftest_lrc.c | 215 ------------------------- 1 file changed, 215 deletions(-)