diff mbox series

[2/3] drm/i915/selftests: Fix memory corruption in live_lrc_isolation

Message ID 20210808180757.81440-3-matthew.brost@intel.com (mailing list archive)
State New, archived
Headers show
Series Clean up some CI failures for GuC submission | expand

Commit Message

Matthew Brost Aug. 8, 2021, 6:07 p.m. UTC
GuC submission has exposed an existing memory corruption in
live_lrc_isolation. We believe that some writes to the watchdog offsets
in the LRC (0x178 & 0x17c) can result in trashing of portions of the
address space. With GuC submission there are additional objects which
can move the context redzone into the space that is trashed. To
workaround this avoid poisoning the watchdog.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 29 +++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

Comments

Daniel Vetter Aug. 9, 2021, 1:38 p.m. UTC | #1
On Sun, Aug 08, 2021 at 11:07:56AM -0700, Matthew Brost wrote:
> GuC submission has exposed an existing memory corruption in
> live_lrc_isolation. We believe that some writes to the watchdog offsets
> in the LRC (0x178 & 0x17c) can result in trashing of portions of the
> address space. With GuC submission there are additional objects which
> can move the context redzone into the space that is trashed. To
> workaround this avoid poisoning the watchdog.

A Bspec reference here would be good (we can quote anything that's marked
for public release, so doesn't have one of the IP markers).

Also I think the above should be replicated in condensed form instead of
the XXX comment.

With those: Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch> since I
definitely have enough clue here for a detailed review.
-Daniel

> 
> Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> ---
>  drivers/gpu/drm/i915/gt/selftest_lrc.c | 29 +++++++++++++++++++++++++-
>  1 file changed, 28 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index b0977a3b699b..6500e9fce8a0 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -1074,6 +1074,32 @@ record_registers(struct intel_context *ce,
>  	goto err_after;
>  }
>  
> +static u32 safe_offset(u32 offset, u32 reg)
> +{
> +	/* XXX skip testing of watchdog */
> +	if (offset == 0x178 || offset == 0x17c)
> +		reg = 0;
> +
> +	return reg;
> +}
> +
> +static int get_offset_mask(struct intel_engine_cs *engine)
> +{
> +	if (GRAPHICS_VER(engine->i915) < 12)
> +		return 0xfff;
> +
> +	switch (engine->class) {
> +	default:
> +	case RENDER_CLASS:
> +		return 0x07ff;
> +	case COPY_ENGINE_CLASS:
> +		return 0x0fff;
> +	case VIDEO_DECODE_CLASS:
> +	case VIDEO_ENHANCEMENT_CLASS:
> +		return 0x3fff;
> +	}
> +}
> +
>  static struct i915_vma *load_context(struct intel_context *ce, u32 poison)
>  {
>  	struct i915_vma *batch;
> @@ -1117,7 +1143,8 @@ static struct i915_vma *load_context(struct intel_context *ce, u32 poison)
>  		len = (len + 1) / 2;
>  		*cs++ = MI_LOAD_REGISTER_IMM(len);
>  		while (len--) {
> -			*cs++ = hw[dw];
> +			*cs++ = safe_offset(hw[dw] & get_offset_mask(ce->engine),
> +					    hw[dw]);
>  			*cs++ = poison;
>  			dw += 2;
>  		}
> -- 
> 2.28.0
>
Matthew Brost Aug. 9, 2021, 7:37 p.m. UTC | #2
On Mon, Aug 09, 2021 at 03:38:38PM +0200, Daniel Vetter wrote:
> On Sun, Aug 08, 2021 at 11:07:56AM -0700, Matthew Brost wrote:
> > GuC submission has exposed an existing memory corruption in
> > live_lrc_isolation. We believe that some writes to the watchdog offsets
> > in the LRC (0x178 & 0x17c) can result in trashing of portions of the
> > address space. With GuC submission there are additional objects which
> > can move the context redzone into the space that is trashed. To
> > workaround this avoid poisoning the watchdog.
> 
> A Bspec reference here would be good (we can quote anything that's marked
> for public release, so doesn't have one of the IP markers).
>

Let me see what I dig up in the bspec.

BTW - Hopefully we can root cause this soon with a proper fix.
 
> Also I think the above should be replicated in condensed form instead of
> the XXX comment.
>

Sure.

Matt

> With those: Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch> since I
> definitely have enough clue here for a detailed review.
> -Daniel
> 
> > 
> > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > ---
> >  drivers/gpu/drm/i915/gt/selftest_lrc.c | 29 +++++++++++++++++++++++++-
> >  1 file changed, 28 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> > index b0977a3b699b..6500e9fce8a0 100644
> > --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> > +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> > @@ -1074,6 +1074,32 @@ record_registers(struct intel_context *ce,
> >  	goto err_after;
> >  }
> >  
> > +static u32 safe_offset(u32 offset, u32 reg)
> > +{
> > +	/* XXX skip testing of watchdog */
> > +	if (offset == 0x178 || offset == 0x17c)
> > +		reg = 0;
> > +
> > +	return reg;
> > +}
> > +
> > +static int get_offset_mask(struct intel_engine_cs *engine)
> > +{
> > +	if (GRAPHICS_VER(engine->i915) < 12)
> > +		return 0xfff;
> > +
> > +	switch (engine->class) {
> > +	default:
> > +	case RENDER_CLASS:
> > +		return 0x07ff;
> > +	case COPY_ENGINE_CLASS:
> > +		return 0x0fff;
> > +	case VIDEO_DECODE_CLASS:
> > +	case VIDEO_ENHANCEMENT_CLASS:
> > +		return 0x3fff;
> > +	}
> > +}
> > +
> >  static struct i915_vma *load_context(struct intel_context *ce, u32 poison)
> >  {
> >  	struct i915_vma *batch;
> > @@ -1117,7 +1143,8 @@ static struct i915_vma *load_context(struct intel_context *ce, u32 poison)
> >  		len = (len + 1) / 2;
> >  		*cs++ = MI_LOAD_REGISTER_IMM(len);
> >  		while (len--) {
> > -			*cs++ = hw[dw];
> > +			*cs++ = safe_offset(hw[dw] & get_offset_mask(ce->engine),
> > +					    hw[dw]);
> >  			*cs++ = poison;
> >  			dw += 2;
> >  		}
> > -- 
> > 2.28.0
> > 
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
Daniel Vetter Aug. 10, 2021, 10:33 a.m. UTC | #3
On Mon, Aug 09, 2021 at 07:37:39PM +0000, Matthew Brost wrote:
> On Mon, Aug 09, 2021 at 03:38:38PM +0200, Daniel Vetter wrote:
> > On Sun, Aug 08, 2021 at 11:07:56AM -0700, Matthew Brost wrote:
> > > GuC submission has exposed an existing memory corruption in
> > > live_lrc_isolation. We believe that some writes to the watchdog offsets
> > > in the LRC (0x178 & 0x17c) can result in trashing of portions of the
> > > address space. With GuC submission there are additional objects which
> > > can move the context redzone into the space that is trashed. To
> > > workaround this avoid poisoning the watchdog.
> > 
> > A Bspec reference here would be good (we can quote anything that's marked
> > for public release, so doesn't have one of the IP markers).
> >
> 
> Let me see what I dig up in the bspec.
> 
> BTW - Hopefully we can root cause this soon with a proper fix.

Well if it's work-in-progress duct-tape without reference that's fine
too, then perhaps sprinkle a JIRA number here (just not the full link,
intel IT doesn't like those leaking). Just something that in a few months
when someone reads that code they can stitch together the story again.
-Daniel

>  
> > Also I think the above should be replicated in condensed form instead of
> > the XXX comment.
> >
> 
> Sure.
> 
> Matt
> 
> > With those: Acked-by: Daniel Vetter <daniel.vetter@ffwll.ch> since I
> > definitely have enough clue here for a detailed review.
> > -Daniel
> > 
> > > 
> > > Signed-off-by: Matthew Brost <matthew.brost@intel.com>
> > > ---
> > >  drivers/gpu/drm/i915/gt/selftest_lrc.c | 29 +++++++++++++++++++++++++-
> > >  1 file changed, 28 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> > > index b0977a3b699b..6500e9fce8a0 100644
> > > --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> > > +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> > > @@ -1074,6 +1074,32 @@ record_registers(struct intel_context *ce,
> > >  	goto err_after;
> > >  }
> > >  
> > > +static u32 safe_offset(u32 offset, u32 reg)
> > > +{
> > > +	/* XXX skip testing of watchdog */
> > > +	if (offset == 0x178 || offset == 0x17c)
> > > +		reg = 0;
> > > +
> > > +	return reg;
> > > +}
> > > +
> > > +static int get_offset_mask(struct intel_engine_cs *engine)
> > > +{
> > > +	if (GRAPHICS_VER(engine->i915) < 12)
> > > +		return 0xfff;
> > > +
> > > +	switch (engine->class) {
> > > +	default:
> > > +	case RENDER_CLASS:
> > > +		return 0x07ff;
> > > +	case COPY_ENGINE_CLASS:
> > > +		return 0x0fff;
> > > +	case VIDEO_DECODE_CLASS:
> > > +	case VIDEO_ENHANCEMENT_CLASS:
> > > +		return 0x3fff;
> > > +	}
> > > +}
> > > +
> > >  static struct i915_vma *load_context(struct intel_context *ce, u32 poison)
> > >  {
> > >  	struct i915_vma *batch;
> > > @@ -1117,7 +1143,8 @@ static struct i915_vma *load_context(struct intel_context *ce, u32 poison)
> > >  		len = (len + 1) / 2;
> > >  		*cs++ = MI_LOAD_REGISTER_IMM(len);
> > >  		while (len--) {
> > > -			*cs++ = hw[dw];
> > > +			*cs++ = safe_offset(hw[dw] & get_offset_mask(ce->engine),
> > > +					    hw[dw]);
> > >  			*cs++ = poison;
> > >  			dw += 2;
> > >  		}
> > > -- 
> > > 2.28.0
> > > 
> > 
> > -- 
> > Daniel Vetter
> > Software Engineer, Intel Corporation
> > http://blog.ffwll.ch
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index b0977a3b699b..6500e9fce8a0 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -1074,6 +1074,32 @@  record_registers(struct intel_context *ce,
 	goto err_after;
 }
 
+static u32 safe_offset(u32 offset, u32 reg)
+{
+	/* XXX skip testing of watchdog */
+	if (offset == 0x178 || offset == 0x17c)
+		reg = 0;
+
+	return reg;
+}
+
+static int get_offset_mask(struct intel_engine_cs *engine)
+{
+	if (GRAPHICS_VER(engine->i915) < 12)
+		return 0xfff;
+
+	switch (engine->class) {
+	default:
+	case RENDER_CLASS:
+		return 0x07ff;
+	case COPY_ENGINE_CLASS:
+		return 0x0fff;
+	case VIDEO_DECODE_CLASS:
+	case VIDEO_ENHANCEMENT_CLASS:
+		return 0x3fff;
+	}
+}
+
 static struct i915_vma *load_context(struct intel_context *ce, u32 poison)
 {
 	struct i915_vma *batch;
@@ -1117,7 +1143,8 @@  static struct i915_vma *load_context(struct intel_context *ce, u32 poison)
 		len = (len + 1) / 2;
 		*cs++ = MI_LOAD_REGISTER_IMM(len);
 		while (len--) {
-			*cs++ = hw[dw];
+			*cs++ = safe_offset(hw[dw] & get_offset_mask(ce->engine),
+					    hw[dw]);
 			*cs++ = poison;
 			dw += 2;
 		}