diff mbox series

drm/i915: Don't enable hwmon for selftests

Message ID 20240410042855.130262-1-ashutosh.dixit@intel.com (mailing list archive)
State New
Headers show
Series drm/i915: Don't enable hwmon for selftests | expand

Commit Message

Dixit, Ashutosh April 10, 2024, 4:28 a.m. UTC
There are no hwmon selftests so there is no need to enable hwmon for
selftests. So enable hwmon only for real driver load.

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10366
Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
---
 drivers/gpu/drm/i915/i915_driver.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

Comments

Dixit, Ashutosh April 10, 2024, 4:37 a.m. UTC | #1
On Tue, 09 Apr 2024 21:28:55 -0700, Ashutosh Dixit wrote:
>
> There are no hwmon selftests so there is no need to enable hwmon for
> selftests. So enable hwmon only for real driver load.
>
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10366

This will resolve this CI issue for now and give us time to study the issue
further. If this is not acceptable in the main tree, we could consider
merging this to the topic/core-for-CI branch as a temporary fix to
eliminate noise in CI due to this issue.

Regards,
Ashutosh
Nilawar, Badal April 10, 2024, 4:56 a.m. UTC | #2
On 10-04-2024 09:58, Ashutosh Dixit wrote:
> There are no hwmon selftests so there is no need to enable hwmon for
> selftests. So enable hwmon only for real driver load.
> 
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10366
> Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
LGTM.
Reviewed-by: Badal Nilawar <badal.nilawar@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_driver.c | 16 ++++++++++++++--
>   1 file changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c
> index 9ee902d5b72c..6fa6d2c8109f 100644
> --- a/drivers/gpu/drm/i915/i915_driver.c
> +++ b/drivers/gpu/drm/i915/i915_driver.c
> @@ -94,6 +94,7 @@
>   #include "i915_memcpy.h"
>   #include "i915_perf.h"
>   #include "i915_query.h"
> +#include "i915_selftest.h"
>   #include "i915_suspend.h"
>   #include "i915_switcheroo.h"
>   #include "i915_sysfs.h"
> @@ -589,6 +590,15 @@ static void i915_driver_hw_remove(struct drm_i915_private *dev_priv)
>   		pci_disable_msi(pdev);
>   }
>   
> +static bool is_selftest(void)
> +{
> +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> +	return i915_selftest.live || i915_selftest.perf || i915_selftest.mock;
> +#else
> +	return false;
> +#endif
> +}
> +
>   /**
>    * i915_driver_register - register the driver with the rest of the system
>    * @dev_priv: device private
> @@ -624,7 +634,8 @@ static void i915_driver_register(struct drm_i915_private *dev_priv)
>   
>   	intel_pxp_debugfs_register(dev_priv->pxp);
>   
> -	i915_hwmon_register(dev_priv);
> +	if (!is_selftest())
> +		i915_hwmon_register(dev_priv);
>   
>   	intel_display_driver_register(dev_priv);
>   
> @@ -660,7 +671,8 @@ static void i915_driver_unregister(struct drm_i915_private *dev_priv)
>   	for_each_gt(gt, dev_priv, i)
>   		intel_gt_driver_unregister(gt);
>   
> -	i915_hwmon_unregister(dev_priv);
> +	if (!is_selftest())
> +		i915_hwmon_unregister(dev_priv);
>   
>   	i915_perf_unregister(dev_priv);
>   	i915_pmu_unregister(dev_priv);
Ville Syrjala April 10, 2024, 11:42 a.m. UTC | #3
On Tue, Apr 09, 2024 at 09:28:55PM -0700, Ashutosh Dixit wrote:
> There are no hwmon selftests so there is no need to enable hwmon for
> selftests. So enable hwmon only for real driver load.
> 
> Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10366
> Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>

Why are we adding duct tape instead of fixing it properly?

> ---
>  drivers/gpu/drm/i915/i915_driver.c | 16 ++++++++++++++--
>  1 file changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c
> index 9ee902d5b72c..6fa6d2c8109f 100644
> --- a/drivers/gpu/drm/i915/i915_driver.c
> +++ b/drivers/gpu/drm/i915/i915_driver.c
> @@ -94,6 +94,7 @@
>  #include "i915_memcpy.h"
>  #include "i915_perf.h"
>  #include "i915_query.h"
> +#include "i915_selftest.h"
>  #include "i915_suspend.h"
>  #include "i915_switcheroo.h"
>  #include "i915_sysfs.h"
> @@ -589,6 +590,15 @@ static void i915_driver_hw_remove(struct drm_i915_private *dev_priv)
>  		pci_disable_msi(pdev);
>  }
>  
> +static bool is_selftest(void)
> +{
> +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> +	return i915_selftest.live || i915_selftest.perf || i915_selftest.mock;
> +#else
> +	return false;
> +#endif
> +}
> +
>  /**
>   * i915_driver_register - register the driver with the rest of the system
>   * @dev_priv: device private
> @@ -624,7 +634,8 @@ static void i915_driver_register(struct drm_i915_private *dev_priv)
>  
>  	intel_pxp_debugfs_register(dev_priv->pxp);
>  
> -	i915_hwmon_register(dev_priv);
> +	if (!is_selftest())
> +		i915_hwmon_register(dev_priv);
>  
>  	intel_display_driver_register(dev_priv);
>  
> @@ -660,7 +671,8 @@ static void i915_driver_unregister(struct drm_i915_private *dev_priv)
>  	for_each_gt(gt, dev_priv, i)
>  		intel_gt_driver_unregister(gt);
>  
> -	i915_hwmon_unregister(dev_priv);
> +	if (!is_selftest())
> +		i915_hwmon_unregister(dev_priv);
>  
>  	i915_perf_unregister(dev_priv);
>  	i915_pmu_unregister(dev_priv);
> -- 
> 2.41.0
Dixit, Ashutosh April 11, 2024, 5:09 a.m. UTC | #4
On Wed, 10 Apr 2024 04:42:46 -0700, Ville Syrjälä wrote:
>
> On Tue, Apr 09, 2024 at 09:28:55PM -0700, Ashutosh Dixit wrote:
> > There are no hwmon selftests so there is no need to enable hwmon for
> > selftests. So enable hwmon only for real driver load.
> >
> > Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10366
> > Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
>
> Why are we adding duct tape instead of fixing it properly?

Yeah pretty much what I said here myself:

https://patchwork.freedesktop.org/patch/588585/?series=132243&rev=1#comment_1071014

The issue has been difficult to root-cause. My last effort can be seen here:

https://patchwork.freedesktop.org/patch/584859/?series=131630&rev=1#comment_1067888

Though Badal went further and saw that occasionaly the memory would get
freed first and hwmon would get unregistered as much as 2 seconds later,
which will cause the crash if anyone touched hwmon sysfs in those final 2
seconds. So not sure what is causing that 2 second delay.

I am not sure if it is worth root-causing further. I am pretty sure if we
get rid of the devm_ stuff, that will fix the issue too. So if this patch
is not acceptable, we could just go that route (get rid of devm_ in hwmon).

Thanks.
--
Ashutosh

> > ---
> >  drivers/gpu/drm/i915/i915_driver.c | 16 ++++++++++++++--
> >  1 file changed, 14 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c
> > index 9ee902d5b72c..6fa6d2c8109f 100644
> > --- a/drivers/gpu/drm/i915/i915_driver.c
> > +++ b/drivers/gpu/drm/i915/i915_driver.c
> > @@ -94,6 +94,7 @@
> >  #include "i915_memcpy.h"
> >  #include "i915_perf.h"
> >  #include "i915_query.h"
> > +#include "i915_selftest.h"
> >  #include "i915_suspend.h"
> >  #include "i915_switcheroo.h"
> >  #include "i915_sysfs.h"
> > @@ -589,6 +590,15 @@ static void i915_driver_hw_remove(struct drm_i915_private *dev_priv)
> >		pci_disable_msi(pdev);
> >  }
> >
> > +static bool is_selftest(void)
> > +{
> > +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> > +	return i915_selftest.live || i915_selftest.perf || i915_selftest.mock;
> > +#else
> > +	return false;
> > +#endif
> > +}
> > +
> >  /**
> >   * i915_driver_register - register the driver with the rest of the system
> >   * @dev_priv: device private
> > @@ -624,7 +634,8 @@ static void i915_driver_register(struct drm_i915_private *dev_priv)
> >
> >	intel_pxp_debugfs_register(dev_priv->pxp);
> >
> > -	i915_hwmon_register(dev_priv);
> > +	if (!is_selftest())
> > +		i915_hwmon_register(dev_priv);
> >
> >	intel_display_driver_register(dev_priv);
> >
> > @@ -660,7 +671,8 @@ static void i915_driver_unregister(struct drm_i915_private *dev_priv)
> >	for_each_gt(gt, dev_priv, i)
> >		intel_gt_driver_unregister(gt);
> >
> > -	i915_hwmon_unregister(dev_priv);
> > +	if (!is_selftest())
> > +		i915_hwmon_unregister(dev_priv);
> >
> >	i915_perf_unregister(dev_priv);
> >	i915_pmu_unregister(dev_priv);
> > --
> > 2.41.0
>
> --
> Ville Syrjälä
> Intel
Ville Syrjala April 11, 2024, 10:47 a.m. UTC | #5
On Wed, Apr 10, 2024 at 10:09:32PM -0700, Dixit, Ashutosh wrote:
> On Wed, 10 Apr 2024 04:42:46 -0700, Ville Syrjälä wrote:
> >
> > On Tue, Apr 09, 2024 at 09:28:55PM -0700, Ashutosh Dixit wrote:
> > > There are no hwmon selftests so there is no need to enable hwmon for
> > > selftests. So enable hwmon only for real driver load.
> > >
> > > Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10366
> > > Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> >
> > Why are we adding duct tape instead of fixing it properly?
> 
> Yeah pretty much what I said here myself:
> 
> https://patchwork.freedesktop.org/patch/588585/?series=132243&rev=1#comment_1071014
> 
> The issue has been difficult to root-cause. My last effort can be seen here:
> 
> https://patchwork.freedesktop.org/patch/584859/?series=131630&rev=1#comment_1067888
> 
> Though Badal went further and saw that occasionaly the memory would get
> freed first and hwmon would get unregistered as much as 2 seconds later,
> which will cause the crash if anyone touched hwmon sysfs in those final 2
> seconds. So not sure what is causing that 2 second delay.

Sounds like someone holding a sysfs file/etc. open. Should be trivial
to do that by hand and see what happens.

> 
> I am not sure if it is worth root-causing further. I am pretty sure if we
> get rid of the devm_ stuff, that will fix the issue too. So if this patch
> is not acceptable, we could just go that route (get rid of devm_ in hwmon).
> 
> Thanks.
> --
> Ashutosh
> 
> > > ---
> > >  drivers/gpu/drm/i915/i915_driver.c | 16 ++++++++++++++--
> > >  1 file changed, 14 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c
> > > index 9ee902d5b72c..6fa6d2c8109f 100644
> > > --- a/drivers/gpu/drm/i915/i915_driver.c
> > > +++ b/drivers/gpu/drm/i915/i915_driver.c
> > > @@ -94,6 +94,7 @@
> > >  #include "i915_memcpy.h"
> > >  #include "i915_perf.h"
> > >  #include "i915_query.h"
> > > +#include "i915_selftest.h"
> > >  #include "i915_suspend.h"
> > >  #include "i915_switcheroo.h"
> > >  #include "i915_sysfs.h"
> > > @@ -589,6 +590,15 @@ static void i915_driver_hw_remove(struct drm_i915_private *dev_priv)
> > >		pci_disable_msi(pdev);
> > >  }
> > >
> > > +static bool is_selftest(void)
> > > +{
> > > +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> > > +	return i915_selftest.live || i915_selftest.perf || i915_selftest.mock;
> > > +#else
> > > +	return false;
> > > +#endif
> > > +}
> > > +
> > >  /**
> > >   * i915_driver_register - register the driver with the rest of the system
> > >   * @dev_priv: device private
> > > @@ -624,7 +634,8 @@ static void i915_driver_register(struct drm_i915_private *dev_priv)
> > >
> > >	intel_pxp_debugfs_register(dev_priv->pxp);
> > >
> > > -	i915_hwmon_register(dev_priv);
> > > +	if (!is_selftest())
> > > +		i915_hwmon_register(dev_priv);
> > >
> > >	intel_display_driver_register(dev_priv);
> > >
> > > @@ -660,7 +671,8 @@ static void i915_driver_unregister(struct drm_i915_private *dev_priv)
> > >	for_each_gt(gt, dev_priv, i)
> > >		intel_gt_driver_unregister(gt);
> > >
> > > -	i915_hwmon_unregister(dev_priv);
> > > +	if (!is_selftest())
> > > +		i915_hwmon_unregister(dev_priv);
> > >
> > >	i915_perf_unregister(dev_priv);
> > >	i915_pmu_unregister(dev_priv);
> > > --
> > > 2.41.0
> >
> > --
> > Ville Syrjälä
> > Intel
Dixit, Ashutosh April 13, 2024, 12:35 a.m. UTC | #6
On Thu, 11 Apr 2024 03:47:13 -0700, Ville Syrjälä wrote:
>
> On Wed, Apr 10, 2024 at 10:09:32PM -0700, Dixit, Ashutosh wrote:
> > On Wed, 10 Apr 2024 04:42:46 -0700, Ville Syrjälä wrote:
> > >
> > > On Tue, Apr 09, 2024 at 09:28:55PM -0700, Ashutosh Dixit wrote:
> > > > There are no hwmon selftests so there is no need to enable hwmon for
> > > > selftests. So enable hwmon only for real driver load.
> > > >
> > > > Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10366
> > > > Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> > >
> > > Why are we adding duct tape instead of fixing it properly?
> >
> > Yeah pretty much what I said here myself:
> >
> > https://patchwork.freedesktop.org/patch/588585/?series=132243&rev=1#comment_1071014
> >
> > The issue has been difficult to root-cause. My last effort can be seen here:
> >
> > https://patchwork.freedesktop.org/patch/584859/?series=131630&rev=1#comment_1067888
> >
> > Though Badal went further and saw that occasionaly the memory would get
> > freed first and hwmon would get unregistered as much as 2 seconds later,
> > which will cause the crash if anyone touched hwmon sysfs in those final 2
> > seconds. So not sure what is causing that 2 second delay.
>
> Sounds like someone holding a sysfs file/etc. open. Should be trivial
> to do that by hand and see what happens.

I checked this out. We see the memory being released before hwmon even when
we don't access the sysfs, so it has norhing to do with holding a sysfs
file open. Holding a sysfs file open also takes a reference on the module
which will prevent the module from being unloaded, which is also what we
don't see.

So the reordering seems to be happening in devres itself occasionally for
some reason.

So anyway, I have submitted a new patch getting rid of devm and freeing
everything explicitly and verified that that fixes the issue:

https://patchwork.freedesktop.org/series/132400/

I have also update https://patchwork.freedesktop.org/series/132400/ with
more details.

Regards,
Ashutosh

>
> >
> > I am not sure if it is worth root-causing further. I am pretty sure if we
> > get rid of the devm_ stuff, that will fix the issue too. So if this patch
> > is not acceptable, we could just go that route (get rid of devm_ in hwmon).
> >
> >
> > > > ---
> > > >  drivers/gpu/drm/i915/i915_driver.c | 16 ++++++++++++++--
> > > >  1 file changed, 14 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c
> > > > index 9ee902d5b72c..6fa6d2c8109f 100644
> > > > --- a/drivers/gpu/drm/i915/i915_driver.c
> > > > +++ b/drivers/gpu/drm/i915/i915_driver.c
> > > > @@ -94,6 +94,7 @@
> > > >  #include "i915_memcpy.h"
> > > >  #include "i915_perf.h"
> > > >  #include "i915_query.h"
> > > > +#include "i915_selftest.h"
> > > >  #include "i915_suspend.h"
> > > >  #include "i915_switcheroo.h"
> > > >  #include "i915_sysfs.h"
> > > > @@ -589,6 +590,15 @@ static void i915_driver_hw_remove(struct drm_i915_private *dev_priv)
> > > >		pci_disable_msi(pdev);
> > > >  }
> > > >
> > > > +static bool is_selftest(void)
> > > > +{
> > > > +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> > > > +	return i915_selftest.live || i915_selftest.perf || i915_selftest.mock;
> > > > +#else
> > > > +	return false;
> > > > +#endif
> > > > +}
> > > > +
> > > >  /**
> > > >   * i915_driver_register - register the driver with the rest of the system
> > > >   * @dev_priv: device private
> > > > @@ -624,7 +634,8 @@ static void i915_driver_register(struct drm_i915_private *dev_priv)
> > > >
> > > >	intel_pxp_debugfs_register(dev_priv->pxp);
> > > >
> > > > -	i915_hwmon_register(dev_priv);
> > > > +	if (!is_selftest())
> > > > +		i915_hwmon_register(dev_priv);
> > > >
> > > >	intel_display_driver_register(dev_priv);
> > > >
> > > > @@ -660,7 +671,8 @@ static void i915_driver_unregister(struct drm_i915_private *dev_priv)
> > > >	for_each_gt(gt, dev_priv, i)
> > > >		intel_gt_driver_unregister(gt);
> > > >
> > > > -	i915_hwmon_unregister(dev_priv);
> > > > +	if (!is_selftest())
> > > > +		i915_hwmon_unregister(dev_priv);
> > > >
> > > >	i915_perf_unregister(dev_priv);
> > > >	i915_pmu_unregister(dev_priv);
> > > > --
> > > > 2.41.0
> > >
> > > --
> > > Ville Syrjälä
> > > Intel
>
> --
> Ville Syrjälä
> Intel
Dixit, Ashutosh April 13, 2024, 12:37 a.m. UTC | #7
On Fri, 12 Apr 2024 17:35:15 -0700, Dixit, Ashutosh wrote:
>
> On Thu, 11 Apr 2024 03:47:13 -0700, Ville Syrjälä wrote:
> >
> > On Wed, Apr 10, 2024 at 10:09:32PM -0700, Dixit, Ashutosh wrote:
> > > On Wed, 10 Apr 2024 04:42:46 -0700, Ville Syrjälä wrote:
> > > >
> > > > On Tue, Apr 09, 2024 at 09:28:55PM -0700, Ashutosh Dixit wrote:
> > > > > There are no hwmon selftests so there is no need to enable hwmon for
> > > > > selftests. So enable hwmon only for real driver load.
> > > > >
> > > > > Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/10366
> > > > > Signed-off-by: Ashutosh Dixit <ashutosh.dixit@intel.com>
> > > >
> > > > Why are we adding duct tape instead of fixing it properly?
> > >
> > > Yeah pretty much what I said here myself:
> > >
> > > https://patchwork.freedesktop.org/patch/588585/?series=132243&rev=1#comment_1071014
> > >
> > > The issue has been difficult to root-cause. My last effort can be seen here:
> > >
> > > https://patchwork.freedesktop.org/patch/584859/?series=131630&rev=1#comment_1067888
> > >
> > > Though Badal went further and saw that occasionaly the memory would get
> > > freed first and hwmon would get unregistered as much as 2 seconds later,
> > > which will cause the crash if anyone touched hwmon sysfs in those final 2
> > > seconds. So not sure what is causing that 2 second delay.
> >
> > Sounds like someone holding a sysfs file/etc. open. Should be trivial
> > to do that by hand and see what happens.
>
> I checked this out. We see the memory being released before hwmon even when
> we don't access the sysfs, so it has norhing to do with holding a sysfs
> file open. Holding a sysfs file open also takes a reference on the module
> which will prevent the module from being unloaded, which is also what we
> don't see.
>
> So the reordering seems to be happening in devres itself occasionally for
> some reason.
>
> So anyway, I have submitted a new patch getting rid of devm and freeing
> everything explicitly and verified that that fixes the issue:
>
> https://patchwork.freedesktop.org/series/132400/
>
> I have also update https://patchwork.freedesktop.org/series/132400/ with
> more details.

Sorry I meant: https://gitlab.freedesktop.org/drm/intel/-/issues/10366
diff mbox series

Patch

diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c
index 9ee902d5b72c..6fa6d2c8109f 100644
--- a/drivers/gpu/drm/i915/i915_driver.c
+++ b/drivers/gpu/drm/i915/i915_driver.c
@@ -94,6 +94,7 @@ 
 #include "i915_memcpy.h"
 #include "i915_perf.h"
 #include "i915_query.h"
+#include "i915_selftest.h"
 #include "i915_suspend.h"
 #include "i915_switcheroo.h"
 #include "i915_sysfs.h"
@@ -589,6 +590,15 @@  static void i915_driver_hw_remove(struct drm_i915_private *dev_priv)
 		pci_disable_msi(pdev);
 }
 
+static bool is_selftest(void)
+{
+#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+	return i915_selftest.live || i915_selftest.perf || i915_selftest.mock;
+#else
+	return false;
+#endif
+}
+
 /**
  * i915_driver_register - register the driver with the rest of the system
  * @dev_priv: device private
@@ -624,7 +634,8 @@  static void i915_driver_register(struct drm_i915_private *dev_priv)
 
 	intel_pxp_debugfs_register(dev_priv->pxp);
 
-	i915_hwmon_register(dev_priv);
+	if (!is_selftest())
+		i915_hwmon_register(dev_priv);
 
 	intel_display_driver_register(dev_priv);
 
@@ -660,7 +671,8 @@  static void i915_driver_unregister(struct drm_i915_private *dev_priv)
 	for_each_gt(gt, dev_priv, i)
 		intel_gt_driver_unregister(gt);
 
-	i915_hwmon_unregister(dev_priv);
+	if (!is_selftest())
+		i915_hwmon_unregister(dev_priv);
 
 	i915_perf_unregister(dev_priv);
 	i915_pmu_unregister(dev_priv);