diff mbox series

[3/4] drm/i915/uc: Inject load errors into intel_uc_init_hw

Message ID 20190729152301.22588-4-michal.wajdeczko@intel.com (mailing list archive)
State New, archived
Headers show
Series add more probe failures | expand

Commit Message

Michal Wajdeczko July 29, 2019, 3:23 p.m. UTC
Inject load errors into intel_uc_init_hw to make sure we
correctly handle uC initialization failures.

To avoid complains from CI about inserted errors or warnings,
use helper macro that checks if there was an error injection.

Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/uc/intel_uc.c | 8 ++++++++
 drivers/gpu/drm/i915/i915_drv.h       | 7 ++++++-
 drivers/gpu/drm/i915/i915_gem.c       | 2 +-
 3 files changed, 15 insertions(+), 2 deletions(-)

Comments

Chris Wilson July 30, 2019, 8:47 a.m. UTC | #1
Quoting Michal Wajdeczko (2019-07-29 16:23:00)
> Inject load errors into intel_uc_init_hw to make sure we
> correctly handle uC initialization failures.
> 
> To avoid complains from CI about inserted errors or warnings,
> use helper macro that checks if there was an error injection.
> 
> Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/gt/uc/intel_uc.c | 8 ++++++++
>  drivers/gpu/drm/i915/i915_drv.h       | 7 ++++++-
>  drivers/gpu/drm/i915/i915_gem.c       | 2 +-
>  3 files changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> index fafa9be1e12a..9e1156c29cb1 100644
> --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> @@ -400,6 +400,14 @@ int intel_uc_init_hw(struct intel_uc *uc)
>         if (!intel_uc_is_using_guc(uc))
>                 return 0;
>  
> +       ret = i915_inject_load_error(i915, -EIO);
> +       if (ret)
> +               return ret;
> +
> +       ret = i915_inject_load_error(i915, -ENXIO);
> +       if (ret)
> +               return ret;
> +
>         GEM_BUG_ON(!intel_uc_fw_supported(&guc->fw));
>  
>         guc_reset_interrupts(guc);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 6b059d51aaff..36f7a146f06a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -137,9 +137,14 @@ bool i915_error_injected(void);
>  
>  #define i915_inject_probe_failure(i915) i915_inject_load_error((i915), -ENODEV)
>  
> -#define i915_probe_error(i915, fmt, ...)                                  \
> +#define I915_ERROR(i915, fmt, ...) \
>         __i915_printk(i915, i915_error_injected() ? KERN_DEBUG : KERN_ERR, \
>                       fmt, ##__VA_ARGS__)
> +#define I915_WARN(i915, fmt, ...) \
> +       __i915_printk(i915, i915_error_injected() ? KERN_DEBUG : KERN_WARNING, \
> +                     fmt, ##__VA_ARGS__)

I didn't see I915_WARN be used in this series. Is it likely? We either
abort the module load, in which it is an error, or we are quite happy to
continue in which case I'd vote for a "normal but significant condition"
i.e. KERN_NOTICE.
-Chris
Chris Wilson July 30, 2019, 9:27 a.m. UTC | #2
Quoting Chris Wilson (2019-07-30 09:47:37)
> Quoting Michal Wajdeczko (2019-07-29 16:23:00)
> > Inject load errors into intel_uc_init_hw to make sure we
> > correctly handle uC initialization failures.
> > 
> > To avoid complains from CI about inserted errors or warnings,
> > use helper macro that checks if there was an error injection.
> > 
> > Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
> > Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  drivers/gpu/drm/i915/gt/uc/intel_uc.c | 8 ++++++++
> >  drivers/gpu/drm/i915/i915_drv.h       | 7 ++++++-
> >  drivers/gpu/drm/i915/i915_gem.c       | 2 +-
> >  3 files changed, 15 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > index fafa9be1e12a..9e1156c29cb1 100644
> > --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> > @@ -400,6 +400,14 @@ int intel_uc_init_hw(struct intel_uc *uc)
> >         if (!intel_uc_is_using_guc(uc))
> >                 return 0;
> >  
> > +       ret = i915_inject_load_error(i915, -EIO);
> > +       if (ret)
> > +               return ret;
> > +
> > +       ret = i915_inject_load_error(i915, -ENXIO);
> > +       if (ret)
> > +               return ret;
> > +
> >         GEM_BUG_ON(!intel_uc_fw_supported(&guc->fw));
> >  
> >         guc_reset_interrupts(guc);
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index 6b059d51aaff..36f7a146f06a 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -137,9 +137,14 @@ bool i915_error_injected(void);
> >  
> >  #define i915_inject_probe_failure(i915) i915_inject_load_error((i915), -ENODEV)
> >  
> > -#define i915_probe_error(i915, fmt, ...)                                  \
> > +#define I915_ERROR(i915, fmt, ...) \
> >         __i915_printk(i915, i915_error_injected() ? KERN_DEBUG : KERN_ERR, \
> >                       fmt, ##__VA_ARGS__)
> > +#define I915_WARN(i915, fmt, ...) \
> > +       __i915_printk(i915, i915_error_injected() ? KERN_DEBUG : KERN_WARNING, \
> > +                     fmt, ##__VA_ARGS__)
> 
> I didn't see I915_WARN be used in this series. Is it likely? We either
> abort the module load, in which it is an error, or we are quite happy to
> continue in which case I'd vote for a "normal but significant condition"
> i.e. KERN_NOTICE.

Spotted the I915_WARN, I guess that's reasonable.

However, I do dislike I915_ERROR / I915_WARN on principle. Especially as
they have the coupling to the probe-failure magic and I don't feel like
it is a good idea to have that spread too far.
-chris
Michal Wajdeczko July 30, 2019, 10:56 a.m. UTC | #3
On Tue, 30 Jul 2019 11:27:51 +0200, Chris Wilson  
<chris@chris-wilson.co.uk> wrote:

> Quoting Chris Wilson (2019-07-30 09:47:37)
>> Quoting Michal Wajdeczko (2019-07-29 16:23:00)
>> > Inject load errors into intel_uc_init_hw to make sure we
>> > correctly handle uC initialization failures.
>> >
>> > To avoid complains from CI about inserted errors or warnings,
>> > use helper macro that checks if there was an error injection.
>> >
>> > Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
>> > Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
>> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> > ---
>> >  drivers/gpu/drm/i915/gt/uc/intel_uc.c | 8 ++++++++
>> >  drivers/gpu/drm/i915/i915_drv.h       | 7 ++++++-
>> >  drivers/gpu/drm/i915/i915_gem.c       | 2 +-
>> >  3 files changed, 15 insertions(+), 2 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c  
>> b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>> > index fafa9be1e12a..9e1156c29cb1 100644
>> > --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
>> > @@ -400,6 +400,14 @@ int intel_uc_init_hw(struct intel_uc *uc)
>> >         if (!intel_uc_is_using_guc(uc))
>> >                 return 0;
>> >
>> > +       ret = i915_inject_load_error(i915, -EIO);
>> > +       if (ret)
>> > +               return ret;
>> > +
>> > +       ret = i915_inject_load_error(i915, -ENXIO);
>> > +       if (ret)
>> > +               return ret;
>> > +
>> >         GEM_BUG_ON(!intel_uc_fw_supported(&guc->fw));
>> >
>> >         guc_reset_interrupts(guc);
>> > diff --git a/drivers/gpu/drm/i915/i915_drv.h  
>> b/drivers/gpu/drm/i915/i915_drv.h
>> > index 6b059d51aaff..36f7a146f06a 100644
>> > --- a/drivers/gpu/drm/i915/i915_drv.h
>> > +++ b/drivers/gpu/drm/i915/i915_drv.h
>> > @@ -137,9 +137,14 @@ bool i915_error_injected(void);
>> >
>> >  #define i915_inject_probe_failure(i915)  
>> i915_inject_load_error((i915), -ENODEV)
>> >
>> > -#define i915_probe_error(i915, fmt,  
>> ...)                                  \
>> > +#define I915_ERROR(i915, fmt, ...) \
>> >         __i915_printk(i915, i915_error_injected() ? KERN_DEBUG :  
>> KERN_ERR, \
>> >                       fmt, ##__VA_ARGS__)
>> > +#define I915_WARN(i915, fmt, ...) \
>> > +       __i915_printk(i915, i915_error_injected() ? KERN_DEBUG :  
>> KERN_WARNING, \
>> > +                     fmt, ##__VA_ARGS__)
>>
>> I didn't see I915_WARN be used in this series. Is it likely? We either
>> abort the module load, in which it is an error, or we are quite happy to
>> continue in which case I'd vote for a "normal but significant condition"
>> i.e. KERN_NOTICE.
>
> Spotted the I915_WARN, I guess that's reasonable.
>
> However, I do dislike I915_ERROR / I915_WARN on principle. Especially as
> they have the coupling to the probe-failure magic and I don't feel like
> it is a good idea to have that spread too far.

But what else can we do in i915 alone to
a) have messages that show origin of the problem as soon as possible,
b) ignore these fake errors if caused by injected failures ?

otherwise we risk that all new injected errors will be immediately
reported as regressions and we stop at CI.BAT

There are other options but we need some buy-in from the CI team:

- we can use existing messages to temporary ignore errors between
   "injecting" and "failed"

i915 0000:00:02.0: Injecting failure %d at checkpoint %u [file:line]
i915 0000:00:02.0: Device initialization failed (%d)

- we can use existing messages to temporary ignore errors between
   "injecting" and new "completed"

i915 0000:00:02.0: Injecting failure %d at checkpoint %u [file:line]
i915 0000:00:02.0: Checkpoint %u failure recovery completed

btw, I'm also thinking about introducing 'recoverable' failures, which
could be used not only to make sure that we don't break the driver while
unwinding, but also to make sure that driver was finally able handle such
failures in expected way:

i915_inject_load_error(i915, -ENOMEM); // probe must fail
i915_inject_recoverable_error(i915, -EIO, WEDGED);  // probe ok but gpu  
wedged
i915_inject_recoverable_error(i915, -ENOEXEC, NORMAL); // probe must be ok
Chris Wilson July 30, 2019, 11:09 a.m. UTC | #4
Quoting Michal Wajdeczko (2019-07-30 11:56:11)
> On Tue, 30 Jul 2019 11:27:51 +0200, Chris Wilson  
> <chris@chris-wilson.co.uk> wrote:
> 
> > Quoting Chris Wilson (2019-07-30 09:47:37)
> >> Quoting Michal Wajdeczko (2019-07-29 16:23:00)
> >> > Inject load errors into intel_uc_init_hw to make sure we
> >> > correctly handle uC initialization failures.
> >> >
> >> > To avoid complains from CI about inserted errors or warnings,
> >> > use helper macro that checks if there was an error injection.
> >> >
> >> > Signed-off-by: Michal Wajdeczko <michal.wajdeczko@intel.com>
> >> > Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> >> > Cc: Chris Wilson <chris@chris-wilson.co.uk>
> >> > ---
> >> >  drivers/gpu/drm/i915/gt/uc/intel_uc.c | 8 ++++++++
> >> >  drivers/gpu/drm/i915/i915_drv.h       | 7 ++++++-
> >> >  drivers/gpu/drm/i915/i915_gem.c       | 2 +-
> >> >  3 files changed, 15 insertions(+), 2 deletions(-)
> >> >
> >> > diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c  
> >> b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> >> > index fafa9be1e12a..9e1156c29cb1 100644
> >> > --- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> >> > +++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
> >> > @@ -400,6 +400,14 @@ int intel_uc_init_hw(struct intel_uc *uc)
> >> >         if (!intel_uc_is_using_guc(uc))
> >> >                 return 0;
> >> >
> >> > +       ret = i915_inject_load_error(i915, -EIO);
> >> > +       if (ret)
> >> > +               return ret;
> >> > +
> >> > +       ret = i915_inject_load_error(i915, -ENXIO);
> >> > +       if (ret)
> >> > +               return ret;
> >> > +
> >> >         GEM_BUG_ON(!intel_uc_fw_supported(&guc->fw));
> >> >
> >> >         guc_reset_interrupts(guc);
> >> > diff --git a/drivers/gpu/drm/i915/i915_drv.h  
> >> b/drivers/gpu/drm/i915/i915_drv.h
> >> > index 6b059d51aaff..36f7a146f06a 100644
> >> > --- a/drivers/gpu/drm/i915/i915_drv.h
> >> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> >> > @@ -137,9 +137,14 @@ bool i915_error_injected(void);
> >> >
> >> >  #define i915_inject_probe_failure(i915)  
> >> i915_inject_load_error((i915), -ENODEV)
> >> >
> >> > -#define i915_probe_error(i915, fmt,  
> >> ...)                                  \
> >> > +#define I915_ERROR(i915, fmt, ...) \
> >> >         __i915_printk(i915, i915_error_injected() ? KERN_DEBUG :  
> >> KERN_ERR, \
> >> >                       fmt, ##__VA_ARGS__)
> >> > +#define I915_WARN(i915, fmt, ...) \
> >> > +       __i915_printk(i915, i915_error_injected() ? KERN_DEBUG :  
> >> KERN_WARNING, \
> >> > +                     fmt, ##__VA_ARGS__)
> >>
> >> I didn't see I915_WARN be used in this series. Is it likely? We either
> >> abort the module load, in which it is an error, or we are quite happy to
> >> continue in which case I'd vote for a "normal but significant condition"
> >> i.e. KERN_NOTICE.
> >
> > Spotted the I915_WARN, I guess that's reasonable.
> >
> > However, I do dislike I915_ERROR / I915_WARN on principle. Especially as
> > they have the coupling to the probe-failure magic and I don't feel like
> > it is a good idea to have that spread too far.
> 
> But what else can we do in i915 alone to
> a) have messages that show origin of the problem as soon as possible,

(Actually, is it not that the issue here is the opposite, it's errors
being reported later on after the injection.)

> b) ignore these fake errors if caused by injected failures ?

My issue is that I915_ERROR() is simply too generic (and too reminiscent
of DRM_ERROR), it invites use outside of the probe paths. If probe/PROBE
is kept in the name, it should help stop it being applied where it makes
no sense (and I fear could cause confusion if message levels did get
flipped).

> otherwise we risk that all new injected errors will be immediately
> reported as regressions and we stop at CI.BAT

What I thought was a very very simple solution was to teach that test to
simply ignore the expected errors. I'm still waiting on igt being
standalone, or at least being able to control the runner from within the
test.
-Chris
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/gt/uc/intel_uc.c b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
index fafa9be1e12a..9e1156c29cb1 100644
--- a/drivers/gpu/drm/i915/gt/uc/intel_uc.c
+++ b/drivers/gpu/drm/i915/gt/uc/intel_uc.c
@@ -400,6 +400,14 @@  int intel_uc_init_hw(struct intel_uc *uc)
 	if (!intel_uc_is_using_guc(uc))
 		return 0;
 
+	ret = i915_inject_load_error(i915, -EIO);
+	if (ret)
+		return ret;
+
+	ret = i915_inject_load_error(i915, -ENXIO);
+	if (ret)
+		return ret;
+
 	GEM_BUG_ON(!intel_uc_fw_supported(&guc->fw));
 
 	guc_reset_interrupts(guc);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 6b059d51aaff..36f7a146f06a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -137,9 +137,14 @@  bool i915_error_injected(void);
 
 #define i915_inject_probe_failure(i915) i915_inject_load_error((i915), -ENODEV)
 
-#define i915_probe_error(i915, fmt, ...)				   \
+#define I915_ERROR(i915, fmt, ...) \
 	__i915_printk(i915, i915_error_injected() ? KERN_DEBUG : KERN_ERR, \
 		      fmt, ##__VA_ARGS__)
+#define I915_WARN(i915, fmt, ...) \
+	__i915_printk(i915, i915_error_injected() ? KERN_DEBUG : KERN_WARNING, \
+		      fmt, ##__VA_ARGS__)
+
+#define i915_probe_error(i915, fmt, ...) I915_ERROR(i915, fmt, ##__VA_ARGS__)
 
 struct drm_i915_gem_object;
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 32b4fa5c579c..c437ab5554ec 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1248,7 +1248,7 @@  int i915_gem_init_hw(struct drm_i915_private *i915)
 	/* We can't enable contexts until all firmware is loaded */
 	ret = intel_uc_init_hw(&i915->gt.uc);
 	if (ret) {
-		DRM_ERROR("Enabling uc failed (%d)\n", ret);
+		I915_ERROR(i915, "Enabling uc failed (%d)\n", ret);
 		goto out;
 	}