diff mbox series

thermal: sun8i: Be loud when probe fails

Message ID 20200708105527.868987-1-megous@megous.com (mailing list archive)
State New, archived
Headers show
Series thermal: sun8i: Be loud when probe fails | expand

Commit Message

Ondřej Jirman July 8, 2020, 10:55 a.m. UTC
I noticed several mobile Linux distributions failing to enable the
thermal regulation correctly, because the kernel is silent
when thermal driver fails to probe. Add enough error reporting
to debug issues and warn users in case thermal sensor is failing
to probe.

Failing to notify users means, that SoC can easily overheat under
load.

Signed-off-by: Ondrej Jirman <megous@megous.com>
---
 drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
 1 file changed, 43 insertions(+), 12 deletions(-)

Comments

Russell King (Oracle) July 8, 2020, 11:03 a.m. UTC | #1
On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> I noticed several mobile Linux distributions failing to enable the
> thermal regulation correctly, because the kernel is silent
> when thermal driver fails to probe. Add enough error reporting
> to debug issues and warn users in case thermal sensor is failing
> to probe.
> 
> Failing to notify users means, that SoC can easily overheat under
> load.
> 
> Signed-off-by: Ondrej Jirman <megous@megous.com>
> ---
>  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
>  1 file changed, 43 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> index 74d73be16496..9065e79ae743 100644
> --- a/drivers/thermal/sun8i_thermal.c
> +++ b/drivers/thermal/sun8i_thermal.c
> @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
>  
>  	calcell = devm_nvmem_cell_get(dev, "calibration");
>  	if (IS_ERR(calcell)) {
> +		dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> +			PTR_ERR(calcell));

Consider using:

		dev_err(dev, "Failed to get calibration nvmem cell (%pe)\n",
			calcell);

which means the kernel can print the symbolic errno value.
Ondřej Jirman July 8, 2020, 11:10 a.m. UTC | #2
On Wed, Jul 08, 2020 at 12:03:01PM +0100, Russell King - ARM Linux admin wrote:
> On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> > I noticed several mobile Linux distributions failing to enable the
> > thermal regulation correctly, because the kernel is silent
> > when thermal driver fails to probe. Add enough error reporting
> > to debug issues and warn users in case thermal sensor is failing
> > to probe.
> > 
> > Failing to notify users means, that SoC can easily overheat under
> > load.
> > 
> > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > ---
> >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
> >  1 file changed, 43 insertions(+), 12 deletions(-)
> > 
> > diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> > index 74d73be16496..9065e79ae743 100644
> > --- a/drivers/thermal/sun8i_thermal.c
> > +++ b/drivers/thermal/sun8i_thermal.c
> > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> >  
> >  	calcell = devm_nvmem_cell_get(dev, "calibration");
> >  	if (IS_ERR(calcell)) {
> > +		dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> > +			PTR_ERR(calcell));
> 
> Consider using:
> 
> 		dev_err(dev, "Failed to get calibration nvmem cell (%pe)\n",
> 			calcell);
> 
> which means the kernel can print the symbolic errno value.

Thank you, I'll change it in v2. :)

regards,
	o.

> -- 
> RMK's Patch system: https://www.armlinux.org.uk/developer/patches/
> FTTP is here! 40Mbps down 10Mbps up. Decent connectivity at last!
Yangtao Li July 8, 2020, 11:55 a.m. UTC | #3
HI Ondrej,
On Wed, Jul 8, 2020 at 6:55 PM Ondrej Jirman <megous@megous.com> wrote:
>
> I noticed several mobile Linux distributions failing to enable the
> thermal regulation correctly, because the kernel is silent
> when thermal driver fails to probe. Add enough error reporting
> to debug issues and warn users in case thermal sensor is failing
> to probe.
>
> Failing to notify users means, that SoC can easily overheat under
> load.
>
> Signed-off-by: Ondrej Jirman <megous@megous.com>
> ---
>  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
>  1 file changed, 43 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> index 74d73be16496..9065e79ae743 100644
> --- a/drivers/thermal/sun8i_thermal.c
> +++ b/drivers/thermal/sun8i_thermal.c
> @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
>
>         calcell = devm_nvmem_cell_get(dev, "calibration");
>         if (IS_ERR(calcell)) {
> +               dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> +                       PTR_ERR(calcell));
> +
>                 if (PTR_ERR(calcell) == -EPROBE_DEFER)
>                         return -EPROBE_DEFER;
> +
>                 /*
>                  * Even if the external calibration data stored in sid is
>                  * not accessible, the THS hardware can still work, although
> @@ -308,6 +312,8 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
>         caldata = nvmem_cell_read(calcell, &callen);
>         if (IS_ERR(caldata)) {
>                 ret = PTR_ERR(caldata);
> +               dev_err(dev, "Failed to read calibration data (%d)\n",
> +                       ret);
>                 goto out;
>         }
>
> @@ -330,23 +336,35 @@ static int sun8i_ths_resource_init(struct ths_device *tmdev)
>                 return PTR_ERR(base);
>
>         tmdev->regmap = devm_regmap_init_mmio(dev, base, &config);
> -       if (IS_ERR(tmdev->regmap))
> +       if (IS_ERR(tmdev->regmap)) {
> +               dev_err(dev, "Failed to init regmap (%ld)\n",
> +                       PTR_ERR(tmdev->regmap));
>                 return PTR_ERR(tmdev->regmap);
> +       }
>
>         if (tmdev->chip->has_bus_clk_reset) {
>                 tmdev->reset = devm_reset_control_get(dev, NULL);
> -               if (IS_ERR(tmdev->reset))
> +               if (IS_ERR(tmdev->reset)) {
> +                       dev_err(dev, "Failed to get reset (%ld)\n",
> +                               PTR_ERR(tmdev->reset));
>                         return PTR_ERR(tmdev->reset);
> +               }
>
>                 tmdev->bus_clk = devm_clk_get(&pdev->dev, "bus");
> -               if (IS_ERR(tmdev->bus_clk))
> +               if (IS_ERR(tmdev->bus_clk)) {
> +                       dev_err(dev, "Failed to get bus clock (%ld)\n",
> +                               PTR_ERR(tmdev->bus_clk));
>                         return PTR_ERR(tmdev->bus_clk);
> +               }
>         }
>
>         if (tmdev->chip->has_mod_clk) {
>                 tmdev->mod_clk = devm_clk_get(&pdev->dev, "mod");
> -               if (IS_ERR(tmdev->mod_clk))
> +               if (IS_ERR(tmdev->mod_clk)) {
> +                       dev_err(dev, "Failed to get mod clock (%ld)\n",
> +                               PTR_ERR(tmdev->mod_clk));
>                         return PTR_ERR(tmdev->mod_clk);
> +               }
>         }
>
>         ret = reset_control_deassert(tmdev->reset);
> @@ -471,8 +489,12 @@ static int sun8i_ths_register(struct ths_device *tmdev)
>                                                              i,
>                                                              &tmdev->sensor[i],
>                                                              &ths_ops);
> -               if (IS_ERR(tmdev->sensor[i].tzd))
> +               if (IS_ERR(tmdev->sensor[i].tzd)) {
> +                       dev_err(tmdev->dev,
> +                               "Failed to register sensor %d (%ld)\n",
> +                               i, PTR_ERR(tmdev->sensor[i].tzd));
>                         return PTR_ERR(tmdev->sensor[i].tzd);
> +               }
>
>                 if (devm_thermal_add_hwmon_sysfs(tmdev->sensor[i].tzd))
>                         dev_warn(tmdev->dev,
> @@ -501,19 +523,21 @@ static int sun8i_ths_probe(struct platform_device *pdev)
>
>         ret = sun8i_ths_resource_init(tmdev);
>         if (ret)
> -               return ret;
> +               goto err_out;
>
>         irq = platform_get_irq(pdev, 0);
> -       if (irq < 0)
> -               return irq;
> +       if (irq < 0) {
> +               ret = irq;
> +               goto err_out;
> +       }
>
>         ret = tmdev->chip->init(tmdev);
>         if (ret)
> -               return ret;
> +               goto err_out;
>
>         ret = sun8i_ths_register(tmdev);
>         if (ret)
> -               return ret;
> +               goto err_out;
>
>         /*
>          * Avoid entering the interrupt handler, the thermal device is not
> @@ -523,10 +547,17 @@ static int sun8i_ths_probe(struct platform_device *pdev)
>         ret = devm_request_threaded_irq(dev, irq, NULL,
>                                         sun8i_irq_thread,
>                                         IRQF_ONESHOT, "ths", tmdev);
> -       if (ret)
> -               return ret;
> +       if (ret) {
> +               dev_err(dev, "Failed to request irq (%d)\n", ret);
> +               goto err_out;
> +       }
>
> +       dev_info(dev, "Thermal sensor ready!\n");
>         return 0;
> +
> +err_out:
> +       dev_err(dev, "Failed to probe thermal sensor (%d)\n", ret);

When the driver fails, there will be this print. Isn't it superfluous
for you to add these?

sun8i-thermal: probe of 5070400.thermal-sensor failed with error


Yangtao
Maxime Ripard July 8, 2020, 12:25 p.m. UTC | #4
Hi,

On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> I noticed several mobile Linux distributions failing to enable the
> thermal regulation correctly, because the kernel is silent
> when thermal driver fails to probe. Add enough error reporting
> to debug issues and warn users in case thermal sensor is failing
> to probe.
> 
> Failing to notify users means, that SoC can easily overheat under
> load.
> 
> Signed-off-by: Ondrej Jirman <megous@megous.com>
> ---
>  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
>  1 file changed, 43 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> index 74d73be16496..9065e79ae743 100644
> --- a/drivers/thermal/sun8i_thermal.c
> +++ b/drivers/thermal/sun8i_thermal.c
> @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
>  
>  	calcell = devm_nvmem_cell_get(dev, "calibration");
>  	if (IS_ERR(calcell)) {
> +		dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> +			PTR_ERR(calcell));
> +
>  		if (PTR_ERR(calcell) == -EPROBE_DEFER)
>  			return -EPROBE_DEFER;
> +

The rest of the patch makes sense, but we should probably put the error
message after the EPROBE_DEFER return so that we don't print any extra
noise that isn't necessarily useful

Maxime
Ondřej Jirman July 8, 2020, 1:21 p.m. UTC | #5
On Wed, Jul 08, 2020 at 07:55:40PM +0800, Frank Lee wrote:
> HI Ondrej,
> On Wed, Jul 8, 2020 at 6:55 PM Ondrej Jirman <megous@megous.com> wrote:
> >
> > I noticed several mobile Linux distributions failing to enable the
> > thermal regulation correctly, because the kernel is silent
> > when thermal driver fails to probe. Add enough error reporting
> > to debug issues and warn users in case thermal sensor is failing
> > to probe.
> >
> > Failing to notify users means, that SoC can easily overheat under
> > load.
> >
> > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > ---
> >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
> >  1 file changed, 43 insertions(+), 12 deletions(-)
> >
> > diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> > index 74d73be16496..9065e79ae743 100644
> > --- a/drivers/thermal/sun8i_thermal.c
> > +++ b/drivers/thermal/sun8i_thermal.c
> > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> >
> >         calcell = devm_nvmem_cell_get(dev, "calibration");
> >         if (IS_ERR(calcell)) {
> > +               dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> > +                       PTR_ERR(calcell));
> > +
> >                 if (PTR_ERR(calcell) == -EPROBE_DEFER)
> >                         return -EPROBE_DEFER;
> > +
> >                 /*
> >                  * Even if the external calibration data stored in sid is
> >                  * not accessible, the THS hardware can still work, although
> > @@ -308,6 +312,8 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> >         caldata = nvmem_cell_read(calcell, &callen);
> >         if (IS_ERR(caldata)) {
> >                 ret = PTR_ERR(caldata);
> > +               dev_err(dev, "Failed to read calibration data (%d)\n",
> > +                       ret);
> >                 goto out;
> >         }
> >
> > @@ -330,23 +336,35 @@ static int sun8i_ths_resource_init(struct ths_device *tmdev)
> >                 return PTR_ERR(base);
> >
> >         tmdev->regmap = devm_regmap_init_mmio(dev, base, &config);
> > -       if (IS_ERR(tmdev->regmap))
> > +       if (IS_ERR(tmdev->regmap)) {
> > +               dev_err(dev, "Failed to init regmap (%ld)\n",
> > +                       PTR_ERR(tmdev->regmap));
> >                 return PTR_ERR(tmdev->regmap);
> > +       }
> >
> >         if (tmdev->chip->has_bus_clk_reset) {
> >                 tmdev->reset = devm_reset_control_get(dev, NULL);
> > -               if (IS_ERR(tmdev->reset))
> > +               if (IS_ERR(tmdev->reset)) {
> > +                       dev_err(dev, "Failed to get reset (%ld)\n",
> > +                               PTR_ERR(tmdev->reset));
> >                         return PTR_ERR(tmdev->reset);
> > +               }
> >
> >                 tmdev->bus_clk = devm_clk_get(&pdev->dev, "bus");
> > -               if (IS_ERR(tmdev->bus_clk))
> > +               if (IS_ERR(tmdev->bus_clk)) {
> > +                       dev_err(dev, "Failed to get bus clock (%ld)\n",
> > +                               PTR_ERR(tmdev->bus_clk));
> >                         return PTR_ERR(tmdev->bus_clk);
> > +               }
> >         }
> >
> >         if (tmdev->chip->has_mod_clk) {
> >                 tmdev->mod_clk = devm_clk_get(&pdev->dev, "mod");
> > -               if (IS_ERR(tmdev->mod_clk))
> > +               if (IS_ERR(tmdev->mod_clk)) {
> > +                       dev_err(dev, "Failed to get mod clock (%ld)\n",
> > +                               PTR_ERR(tmdev->mod_clk));
> >                         return PTR_ERR(tmdev->mod_clk);
> > +               }
> >         }
> >
> >         ret = reset_control_deassert(tmdev->reset);
> > @@ -471,8 +489,12 @@ static int sun8i_ths_register(struct ths_device *tmdev)
> >                                                              i,
> >                                                              &tmdev->sensor[i],
> >                                                              &ths_ops);
> > -               if (IS_ERR(tmdev->sensor[i].tzd))
> > +               if (IS_ERR(tmdev->sensor[i].tzd)) {
> > +                       dev_err(tmdev->dev,
> > +                               "Failed to register sensor %d (%ld)\n",
> > +                               i, PTR_ERR(tmdev->sensor[i].tzd));
> >                         return PTR_ERR(tmdev->sensor[i].tzd);
> > +               }
> >
> >                 if (devm_thermal_add_hwmon_sysfs(tmdev->sensor[i].tzd))
> >                         dev_warn(tmdev->dev,
> > @@ -501,19 +523,21 @@ static int sun8i_ths_probe(struct platform_device *pdev)
> >
> >         ret = sun8i_ths_resource_init(tmdev);
> >         if (ret)
> > -               return ret;
> > +               goto err_out;
> >
> >         irq = platform_get_irq(pdev, 0);
> > -       if (irq < 0)
> > -               return irq;
> > +       if (irq < 0) {
> > +               ret = irq;
> > +               goto err_out;
> > +       }
> >
> >         ret = tmdev->chip->init(tmdev);
> >         if (ret)
> > -               return ret;
> > +               goto err_out;
> >
> >         ret = sun8i_ths_register(tmdev);
> >         if (ret)
> > -               return ret;
> > +               goto err_out;
> >
> >         /*
> >          * Avoid entering the interrupt handler, the thermal device is not
> > @@ -523,10 +547,17 @@ static int sun8i_ths_probe(struct platform_device *pdev)
> >         ret = devm_request_threaded_irq(dev, irq, NULL,
> >                                         sun8i_irq_thread,
> >                                         IRQF_ONESHOT, "ths", tmdev);
> > -       if (ret)
> > -               return ret;
> > +       if (ret) {
> > +               dev_err(dev, "Failed to request irq (%d)\n", ret);
> > +               goto err_out;
> > +       }
> >
> > +       dev_info(dev, "Thermal sensor ready!\n");
> >         return 0;
> > +
> > +err_out:
> > +       dev_err(dev, "Failed to probe thermal sensor (%d)\n", ret);
> 
> When the driver fails, there will be this print. Isn't it superfluous
> for you to add these?
> 
> sun8i-thermal: probe of 5070400.thermal-sensor failed with error

There's no such failure message in the case I investigated, which is
EPROBE_DEFER failure waiting for nvmem driver that never loads,
because it's not configured by the user to build.

regards,
	o.

> 
> Yangtao
Ondřej Jirman July 8, 2020, 1:29 p.m. UTC | #6
Hello Maxime,

On Wed, Jul 08, 2020 at 02:25:42PM +0200, Maxime Ripard wrote:
> Hi,
> 
> On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> > I noticed several mobile Linux distributions failing to enable the
> > thermal regulation correctly, because the kernel is silent
> > when thermal driver fails to probe. Add enough error reporting
> > to debug issues and warn users in case thermal sensor is failing
> > to probe.
> > 
> > Failing to notify users means, that SoC can easily overheat under
> > load.
> > 
> > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > ---
> >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
> >  1 file changed, 43 insertions(+), 12 deletions(-)
> > 
> > diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> > index 74d73be16496..9065e79ae743 100644
> > --- a/drivers/thermal/sun8i_thermal.c
> > +++ b/drivers/thermal/sun8i_thermal.c
> > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> >  
> >  	calcell = devm_nvmem_cell_get(dev, "calibration");
> >  	if (IS_ERR(calcell)) {
> > +		dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> > +			PTR_ERR(calcell));
> > +
> >  		if (PTR_ERR(calcell) == -EPROBE_DEFER)
> >  			return -EPROBE_DEFER;
> > +
> 
> The rest of the patch makes sense, but we should probably put the error
> message after the EPROBE_DEFER return so that we don't print any extra
> noise that isn't necessarily useful

I thought about that, but in this case this would have helped, see my other
e-mail. Though lack of "probe success" message may be enough for me, to
debug the issue, I'm not sure the user will notice that a message is missing, while
he'll surely notice if there's a flood of repeated EPROBE_DEFER messages.

And people run several distros for 3-4 months without anyone noticing any
issues and that thermal regulation doesn't work. So it seems that lack of a
success message is not enough.

Other solution may be to select CONFIG_NVMEM_SUNXI_SID if this driver
is enabled. That may get rid of this error scenario of waiting infinitely
for calibration data with EPROBE_DEFER. And other potential EPROBE_DEFER sources
will probably be quite visible even without this driver telling the user.
So this message may not be necessary in that case.

thank you and regards,
	o.

> Maxime
Maxime Ripard July 8, 2020, 1:29 p.m. UTC | #7
On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> @@ -523,10 +547,17 @@ static int sun8i_ths_probe(struct platform_device *pdev)
>  	ret = devm_request_threaded_irq(dev, irq, NULL,
>  					sun8i_irq_thread,
>  					IRQF_ONESHOT, "ths", tmdev);
> -	if (ret)
> -		return ret;
> +	if (ret) {
> +		dev_err(dev, "Failed to request irq (%d)\n", ret);
> +		goto err_out;
> +	}
>  
> +	dev_info(dev, "Thermal sensor ready!\n");
>  	return 0;

I missed that in my first mail, but I'm not sure we want to print
anything on success. This doesn't bring any value and that will only
make it harder to find errors in other drivers.

Maxime
Ondřej Jirman July 8, 2020, 1:33 p.m. UTC | #8
On Wed, Jul 08, 2020 at 07:55:40PM +0800, Frank Lee wrote:
> HI Ondrej,
> On Wed, Jul 8, 2020 at 6:55 PM Ondrej Jirman <megous@megous.com> wrote:
> >
> > I noticed several mobile Linux distributions failing to enable the
> > thermal regulation correctly, because the kernel is silent
> > when thermal driver fails to probe. Add enough error reporting
> > to debug issues and warn users in case thermal sensor is failing
> > to probe.
> >
> > Failing to notify users means, that SoC can easily overheat under
> > load.
> >
> > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > ---
> >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
> >  1 file changed, 43 insertions(+), 12 deletions(-)
> >
> > diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> > index 74d73be16496..9065e79ae743 100644
> > --- a/drivers/thermal/sun8i_thermal.c
> > +++ b/drivers/thermal/sun8i_thermal.c
> > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> >
> >         calcell = devm_nvmem_cell_get(dev, "calibration");
> >         if (IS_ERR(calcell)) {
> > +               dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> > +                       PTR_ERR(calcell));
> > +
> >                 if (PTR_ERR(calcell) == -EPROBE_DEFER)
> >                         return -EPROBE_DEFER;
> > +
> >                 /*
> >                  * Even if the external calibration data stored in sid is
> >                  * not accessible, the THS hardware can still work, although
> > @@ -308,6 +312,8 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> >         caldata = nvmem_cell_read(calcell, &callen);
> >         if (IS_ERR(caldata)) {
> >                 ret = PTR_ERR(caldata);
> > +               dev_err(dev, "Failed to read calibration data (%d)\n",
> > +                       ret);
> >                 goto out;
> >         }
> >
> > @@ -330,23 +336,35 @@ static int sun8i_ths_resource_init(struct ths_device *tmdev)
> >                 return PTR_ERR(base);
> >
> >         tmdev->regmap = devm_regmap_init_mmio(dev, base, &config);
> > -       if (IS_ERR(tmdev->regmap))
> > +       if (IS_ERR(tmdev->regmap)) {
> > +               dev_err(dev, "Failed to init regmap (%ld)\n",
> > +                       PTR_ERR(tmdev->regmap));
> >                 return PTR_ERR(tmdev->regmap);
> > +       }
> >
> >         if (tmdev->chip->has_bus_clk_reset) {
> >                 tmdev->reset = devm_reset_control_get(dev, NULL);
> > -               if (IS_ERR(tmdev->reset))
> > +               if (IS_ERR(tmdev->reset)) {
> > +                       dev_err(dev, "Failed to get reset (%ld)\n",
> > +                               PTR_ERR(tmdev->reset));
> >                         return PTR_ERR(tmdev->reset);
> > +               }
> >
> >                 tmdev->bus_clk = devm_clk_get(&pdev->dev, "bus");
> > -               if (IS_ERR(tmdev->bus_clk))
> > +               if (IS_ERR(tmdev->bus_clk)) {
> > +                       dev_err(dev, "Failed to get bus clock (%ld)\n",
> > +                               PTR_ERR(tmdev->bus_clk));
> >                         return PTR_ERR(tmdev->bus_clk);
> > +               }
> >         }
> >
> >         if (tmdev->chip->has_mod_clk) {
> >                 tmdev->mod_clk = devm_clk_get(&pdev->dev, "mod");
> > -               if (IS_ERR(tmdev->mod_clk))
> > +               if (IS_ERR(tmdev->mod_clk)) {
> > +                       dev_err(dev, "Failed to get mod clock (%ld)\n",
> > +                               PTR_ERR(tmdev->mod_clk));
> >                         return PTR_ERR(tmdev->mod_clk);
> > +               }
> >         }
> >
> >         ret = reset_control_deassert(tmdev->reset);
> > @@ -471,8 +489,12 @@ static int sun8i_ths_register(struct ths_device *tmdev)
> >                                                              i,
> >                                                              &tmdev->sensor[i],
> >                                                              &ths_ops);
> > -               if (IS_ERR(tmdev->sensor[i].tzd))
> > +               if (IS_ERR(tmdev->sensor[i].tzd)) {
> > +                       dev_err(tmdev->dev,
> > +                               "Failed to register sensor %d (%ld)\n",
> > +                               i, PTR_ERR(tmdev->sensor[i].tzd));
> >                         return PTR_ERR(tmdev->sensor[i].tzd);
> > +               }
> >
> >                 if (devm_thermal_add_hwmon_sysfs(tmdev->sensor[i].tzd))
> >                         dev_warn(tmdev->dev,
> > @@ -501,19 +523,21 @@ static int sun8i_ths_probe(struct platform_device *pdev)
> >
> >         ret = sun8i_ths_resource_init(tmdev);
> >         if (ret)
> > -               return ret;
> > +               goto err_out;
> >
> >         irq = platform_get_irq(pdev, 0);
> > -       if (irq < 0)
> > -               return irq;
> > +       if (irq < 0) {
> > +               ret = irq;
> > +               goto err_out;
> > +       }
> >
> >         ret = tmdev->chip->init(tmdev);
> >         if (ret)
> > -               return ret;
> > +               goto err_out;
> >
> >         ret = sun8i_ths_register(tmdev);
> >         if (ret)
> > -               return ret;
> > +               goto err_out;
> >
> >         /*
> >          * Avoid entering the interrupt handler, the thermal device is not
> > @@ -523,10 +547,17 @@ static int sun8i_ths_probe(struct platform_device *pdev)
> >         ret = devm_request_threaded_irq(dev, irq, NULL,
> >                                         sun8i_irq_thread,
> >                                         IRQF_ONESHOT, "ths", tmdev);
> > -       if (ret)
> > -               return ret;
> > +       if (ret) {
> > +               dev_err(dev, "Failed to request irq (%d)\n", ret);
> > +               goto err_out;
> > +       }
> >
> > +       dev_info(dev, "Thermal sensor ready!\n");
> >         return 0;
> > +
> > +err_out:
> > +       dev_err(dev, "Failed to probe thermal sensor (%d)\n", ret);
> 
> When the driver fails, there will be this print. Isn't it superfluous
> for you to add these?
> 
> sun8i-thermal: probe of 5070400.thermal-sensor failed with error

Thinking more about it. You're right. This is probably only shown for
non-EPROBE_DEFER errors. So this printk is superfluous, since EPROBE_DEFER
from nvmem/sid is already shown elsewhere in this patch.

thanks,
	o.

> 
> Yangtao
Maxime Ripard July 8, 2020, 1:36 p.m. UTC | #9
On Wed, Jul 08, 2020 at 03:29:24PM +0200, Ondřej Jirman wrote:
> Hello Maxime,
> 
> On Wed, Jul 08, 2020 at 02:25:42PM +0200, Maxime Ripard wrote:
> > Hi,
> > 
> > On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> > > I noticed several mobile Linux distributions failing to enable the
> > > thermal regulation correctly, because the kernel is silent
> > > when thermal driver fails to probe. Add enough error reporting
> > > to debug issues and warn users in case thermal sensor is failing
> > > to probe.
> > > 
> > > Failing to notify users means, that SoC can easily overheat under
> > > load.
> > > 
> > > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > > ---
> > >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
> > >  1 file changed, 43 insertions(+), 12 deletions(-)
> > > 
> > > diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> > > index 74d73be16496..9065e79ae743 100644
> > > --- a/drivers/thermal/sun8i_thermal.c
> > > +++ b/drivers/thermal/sun8i_thermal.c
> > > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> > >  
> > >  	calcell = devm_nvmem_cell_get(dev, "calibration");
> > >  	if (IS_ERR(calcell)) {
> > > +		dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> > > +			PTR_ERR(calcell));
> > > +
> > >  		if (PTR_ERR(calcell) == -EPROBE_DEFER)
> > >  			return -EPROBE_DEFER;
> > > +
> > 
> > The rest of the patch makes sense, but we should probably put the error
> > message after the EPROBE_DEFER return so that we don't print any extra
> > noise that isn't necessarily useful
> 
> I thought about that, but in this case this would have helped, see my other
> e-mail. Though lack of "probe success" message may be enough for me, to
> debug the issue, I'm not sure the user will notice that a message is missing, while
> he'll surely notice if there's a flood of repeated EPROBE_DEFER messages.

Yeah, but on the other hand, we regularly have people that come up and
ask if a "legitimate" EPROBE_DEFER error message (as in: the driver
wasn't there on the first attempt but was there on the second) is a
cause of concern or not.

> And people run several distros for 3-4 months without anyone noticing any
> issues and that thermal regulation doesn't work. So it seems that lack of a
> success message is not enough.

I understand what the issue is, but do you really expect phone users to
monitor the kernel logs every time they boot their phone to see if the
thermal throttling is enabled?

If anything, it looks like a distro problem, and the notification /
policy to deal with that should be implemented in userspace.

> Other solution may be to select CONFIG_NVMEM_SUNXI_SID if this driver
> is enabled. That may get rid of this error scenario of waiting infinitely
> for calibration data with EPROBE_DEFER. And other potential EPROBE_DEFER sources
> will probably be quite visible even without this driver telling the user.
> So this message may not be necessary in that case.

That would only partially solve your issue. If the nvmem driver doesn't
load for some reason, you would end up in a similar situation.

Maxime
Robin Murphy July 8, 2020, 1:42 p.m. UTC | #10
On 2020-07-08 14:21, Ondřej Jirman wrote:
[...]
>>> @@ -523,10 +547,17 @@ static int sun8i_ths_probe(struct platform_device *pdev)
>>>          ret = devm_request_threaded_irq(dev, irq, NULL,
>>>                                          sun8i_irq_thread,
>>>                                          IRQF_ONESHOT, "ths", tmdev);
>>> -       if (ret)
>>> -               return ret;
>>> +       if (ret) {
>>> +               dev_err(dev, "Failed to request irq (%d)\n", ret);
>>> +               goto err_out;
>>> +       }
>>>
>>> +       dev_info(dev, "Thermal sensor ready!\n");
>>>          return 0;
>>> +
>>> +err_out:
>>> +       dev_err(dev, "Failed to probe thermal sensor (%d)\n", ret);
>>
>> When the driver fails, there will be this print. Isn't it superfluous
>> for you to add these?
>>
>> sun8i-thermal: probe of 5070400.thermal-sensor failed with error
> 
> There's no such failure message in the case I investigated, which is
> EPROBE_DEFER failure waiting for nvmem driver that never loads,
> because it's not configured by the user to build.

Ah, in that case this was a bit misleading, since "probe failure" isn't 
really the problem at all. As it happens, there's a whole other 
discussion ongoing around making probe deferral issues easier to debug:

https://lore.kernel.org/linux-arm-kernel/20200626100103.18879-1-a.hajda@samsung.com/

Robin.
Ondřej Jirman July 8, 2020, 1:44 p.m. UTC | #11
On Wed, Jul 08, 2020 at 03:36:54PM +0200, Maxime Ripard wrote:
> On Wed, Jul 08, 2020 at 03:29:24PM +0200, Ondřej Jirman wrote:
> > Hello Maxime,
> > 
> > On Wed, Jul 08, 2020 at 02:25:42PM +0200, Maxime Ripard wrote:
> > > Hi,
> > > 
> > > On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> > > > I noticed several mobile Linux distributions failing to enable the
> > > > thermal regulation correctly, because the kernel is silent
> > > > when thermal driver fails to probe. Add enough error reporting
> > > > to debug issues and warn users in case thermal sensor is failing
> > > > to probe.
> > > > 
> > > > Failing to notify users means, that SoC can easily overheat under
> > > > load.
> > > > 
> > > > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > > > ---
> > > >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
> > > >  1 file changed, 43 insertions(+), 12 deletions(-)
> > > > 
> > > > diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> > > > index 74d73be16496..9065e79ae743 100644
> > > > --- a/drivers/thermal/sun8i_thermal.c
> > > > +++ b/drivers/thermal/sun8i_thermal.c
> > > > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> > > >  
> > > >  	calcell = devm_nvmem_cell_get(dev, "calibration");
> > > >  	if (IS_ERR(calcell)) {
> > > > +		dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> > > > +			PTR_ERR(calcell));
> > > > +
> > > >  		if (PTR_ERR(calcell) == -EPROBE_DEFER)
> > > >  			return -EPROBE_DEFER;
> > > > +
> > > 
> > > The rest of the patch makes sense, but we should probably put the error
> > > message after the EPROBE_DEFER return so that we don't print any extra
> > > noise that isn't necessarily useful
> > 
> > I thought about that, but in this case this would have helped, see my other
> > e-mail. Though lack of "probe success" message may be enough for me, to
> > debug the issue, I'm not sure the user will notice that a message is missing, while
> > he'll surely notice if there's a flood of repeated EPROBE_DEFER messages.
> 
> Yeah, but on the other hand, we regularly have people that come up and
> ask if a "legitimate" EPROBE_DEFER error message (as in: the driver
> wasn't there on the first attempt but was there on the second) is a
> cause of concern or not.

That's why I also added a success message, to distinguish this case. 

> > And people run several distros for 3-4 months without anyone noticing any
> > issues and that thermal regulation doesn't work. So it seems that lack of a
> > success message is not enough.
> 
> I understand what the issue is, but do you really expect phone users to
> monitor the kernel logs every time they boot their phone to see if the
> thermal throttling is enabled?

Not phone users, but people making their own kernels/distributions. Those people
monitor dmesg, and out of 4 distros or more nobody noticed there was an issue
(despite the complaints of overheating by their users).

So I thought some warning may be in order, so that distro people more easily
notice they have misconfigured the kernel or sometging.

End users really don't care about dmesg.

regards,
	o.

> If anything, it looks like a distro problem, and the notification /
> policy to deal with that should be implemented in userspace.
> 
> > Other solution may be to select CONFIG_NVMEM_SUNXI_SID if this driver
> > is enabled. That may get rid of this error scenario of waiting infinitely
> > for calibration data with EPROBE_DEFER. And other potential EPROBE_DEFER sources
> > will probably be quite visible even without this driver telling the user.
> > So this message may not be necessary in that case.
> 
> That would only partially solve your issue. If the nvmem driver doesn't
> load for some reason, you would end up in a similar situation.
> 
> Maxime
Maxime Ripard July 8, 2020, 1:57 p.m. UTC | #12
On Wed, Jul 08, 2020 at 03:44:41PM +0200, Ondřej Jirman wrote:
> On Wed, Jul 08, 2020 at 03:36:54PM +0200, Maxime Ripard wrote:
> > On Wed, Jul 08, 2020 at 03:29:24PM +0200, Ondřej Jirman wrote:
> > > Hello Maxime,
> > > 
> > > On Wed, Jul 08, 2020 at 02:25:42PM +0200, Maxime Ripard wrote:
> > > > Hi,
> > > > 
> > > > On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> > > > > I noticed several mobile Linux distributions failing to enable the
> > > > > thermal regulation correctly, because the kernel is silent
> > > > > when thermal driver fails to probe. Add enough error reporting
> > > > > to debug issues and warn users in case thermal sensor is failing
> > > > > to probe.
> > > > > 
> > > > > Failing to notify users means, that SoC can easily overheat under
> > > > > load.
> > > > > 
> > > > > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > > > > ---
> > > > >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++-------
> > > > >  1 file changed, 43 insertions(+), 12 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
> > > > > index 74d73be16496..9065e79ae743 100644
> > > > > --- a/drivers/thermal/sun8i_thermal.c
> > > > > +++ b/drivers/thermal/sun8i_thermal.c
> > > > > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct ths_device *tmdev)
> > > > >  
> > > > >  	calcell = devm_nvmem_cell_get(dev, "calibration");
> > > > >  	if (IS_ERR(calcell)) {
> > > > > +		dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
> > > > > +			PTR_ERR(calcell));
> > > > > +
> > > > >  		if (PTR_ERR(calcell) == -EPROBE_DEFER)
> > > > >  			return -EPROBE_DEFER;
> > > > > +
> > > > 
> > > > The rest of the patch makes sense, but we should probably put the error
> > > > message after the EPROBE_DEFER return so that we don't print any extra
> > > > noise that isn't necessarily useful
> > > 
> > > I thought about that, but in this case this would have helped, see my other
> > > e-mail. Though lack of "probe success" message may be enough for me, to
> > > debug the issue, I'm not sure the user will notice that a message is missing, while
> > > he'll surely notice if there's a flood of repeated EPROBE_DEFER messages.
> > 
> > Yeah, but on the other hand, we regularly have people that come up and
> > ask if a "legitimate" EPROBE_DEFER error message (as in: the driver
> > wasn't there on the first attempt but was there on the second) is a
> > cause of concern or not.
> 
> That's why I also added a success message, to distinguish this case. 

That doesn't really help though. We have plenty of drivers that have
some sort of success message and people will still ask about that error
message earlier.

> > > And people run several distros for 3-4 months without anyone noticing any
> > > issues and that thermal regulation doesn't work. So it seems that lack of a
> > > success message is not enough.
> > 
> > I understand what the issue is, but do you really expect phone users to
> > monitor the kernel logs every time they boot their phone to see if the
> > thermal throttling is enabled?
> 
> Not phone users, but people making their own kernels/distributions. Those people
> monitor dmesg, and out of 4 distros or more nobody noticed there was an issue
> (despite the complaints of overheating by their users).
> 
> So I thought some warning may be in order, so that distro people more easily
> notice they have misconfigured the kernel or sometging.

I mean, then there's nothing we can do to properly address that then.

The configuration system is a gun, we can point at the target, but
anyone is definitely free to shot themself in the foot.

You would have exactly the same result if you left the thermal driver
disabled, or if you didn't have cpufreq support.

Maxime
Ondřej Jirman July 12, 2020, 11:29 p.m. UTC | #13
Hi Maxime,

On Wed, Jul 08, 2020 at 03:57:48PM +0200, Maxime Ripard wrote:
> On Wed, Jul 08, 2020 at 03:44:41PM +0200, Ondřej Jirman wrote:
> > >

[...]

> > > Yeah, but on the other hand, we regularly have people that come up and
> > > ask if a "legitimate" EPROBE_DEFER error message (as in: the driver
> > > wasn't there on the first attempt but was there on the second) is a
> > > cause of concern or not.
> > 
> > That's why I also added a success message, to distinguish this case. 
> 
> That doesn't really help though. We have plenty of drivers that have
> some sort of success message and people will still ask about that error
> message earlier.
> 
> > > > And people run several distros for 3-4 months without anyone noticing any
> > > > issues and that thermal regulation doesn't work. So it seems that lack of a
> > > > success message is not enough.
> > > 
> > > I understand what the issue is, but do you really expect phone users to
> > > monitor the kernel logs every time they boot their phone to see if the
> > > thermal throttling is enabled?
> > 
> > Not phone users, but people making their own kernels/distributions. Those people
> > monitor dmesg, and out of 4 distros or more nobody noticed there was an issue
> > (despite the complaints of overheating by their users).
> > 
> > So I thought some warning may be in order, so that distro people more easily
> > notice they have misconfigured the kernel or sometging.
> 
> I mean, then there's nothing we can do to properly address that then.
> 
> The configuration system is a gun, we can point at the target, but
> anyone is definitely free to shot themself in the foot.
> 
> You would have exactly the same result if you left the thermal driver
> disabled, or if you didn't have cpufreq support.

Right. Though I hope there's some middle ground. I mean all of those dev_err
in error paths of many drivers are there mostly to help debugging stuff.

And even though I was part of this driver's development, it took me quite
some time to figure out it was the missing sunxi-sid driver causing the issue,
with complete silence from the driver.

Maybe this can/will be solved at another level entirely, like having a device
core report devices probes that failed with EPROBE_DEFER some time after
the boot finished and modules had a chance to load, instead of immediately
for each probe retry.

regards,
	o.

> Maxime
Icenowy Zheng July 20, 2020, 7:55 a.m. UTC | #14
在 2020-07-08星期三的 12:03 +0100,Russell King - ARM Linux admin写道:
> On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> > I noticed several mobile Linux distributions failing to enable the
> > thermal regulation correctly, because the kernel is silent
> > when thermal driver fails to probe. Add enough error reporting
> > to debug issues and warn users in case thermal sensor is failing
> > to probe.
> > 
> > Failing to notify users means, that SoC can easily overheat under
> > load.
> > 
> > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > ---
> >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++---
> > ----
> >  1 file changed, 43 insertions(+), 12 deletions(-)
> > 
> > diff --git a/drivers/thermal/sun8i_thermal.c
> > b/drivers/thermal/sun8i_thermal.c
> > index 74d73be16496..9065e79ae743 100644
> > --- a/drivers/thermal/sun8i_thermal.c
> > +++ b/drivers/thermal/sun8i_thermal.c
> > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct
> > ths_device *tmdev)
> >  
> >  	calcell = devm_nvmem_cell_get(dev, "calibration");
> >  	if (IS_ERR(calcell)) {
> > +		dev_err(dev, "Failed to get calibration nvmem cell
> > (%ld)\n",
> > +			PTR_ERR(calcell));
> 
> Consider using:
> 
> 		dev_err(dev, "Failed to get calibration nvmem cell
> (%pe)\n",
> 			calcell);
> 
> which means the kernel can print the symbolic errno value.

Oh interesting format here.

When we need to deal with a int return value, is it "%e"?

Thanks

>
Russell King (Oracle) July 20, 2020, 8:28 a.m. UTC | #15
On Mon, Jul 20, 2020 at 03:55:26PM +0800, Icenowy Zheng wrote:
> 在 2020-07-08星期三的 12:03 +0100,Russell King - ARM Linux admin写道:
> > On Wed, Jul 08, 2020 at 12:55:27PM +0200, Ondrej Jirman wrote:
> > > I noticed several mobile Linux distributions failing to enable the
> > > thermal regulation correctly, because the kernel is silent
> > > when thermal driver fails to probe. Add enough error reporting
> > > to debug issues and warn users in case thermal sensor is failing
> > > to probe.
> > > 
> > > Failing to notify users means, that SoC can easily overheat under
> > > load.
> > > 
> > > Signed-off-by: Ondrej Jirman <megous@megous.com>
> > > ---
> > >  drivers/thermal/sun8i_thermal.c | 55 ++++++++++++++++++++++++++---
> > > ----
> > >  1 file changed, 43 insertions(+), 12 deletions(-)
> > > 
> > > diff --git a/drivers/thermal/sun8i_thermal.c
> > > b/drivers/thermal/sun8i_thermal.c
> > > index 74d73be16496..9065e79ae743 100644
> > > --- a/drivers/thermal/sun8i_thermal.c
> > > +++ b/drivers/thermal/sun8i_thermal.c
> > > @@ -287,8 +287,12 @@ static int sun8i_ths_calibrate(struct
> > > ths_device *tmdev)
> > >  
> > >  	calcell = devm_nvmem_cell_get(dev, "calibration");
> > >  	if (IS_ERR(calcell)) {
> > > +		dev_err(dev, "Failed to get calibration nvmem cell
> > > (%ld)\n",
> > > +			PTR_ERR(calcell));
> > 
> > Consider using:
> > 
> > 		dev_err(dev, "Failed to get calibration nvmem cell
> > (%pe)\n",
> > 			calcell);
> > 
> > which means the kernel can print the symbolic errno value.
> 
> Oh interesting format here.
> 
> When we need to deal with a int return value, is it "%e"?

No, because that will lose the ability for the compiler to check the
format string and arguments correspond.  All the extensions are
documented at Documentation/core-api/printk-formats.rst.

Use %pe and ERR_PTR(...) to print an integer -ve errno return value.
Maxime Ripard July 23, 2020, 3:20 p.m. UTC | #16
On Mon, Jul 13, 2020 at 01:29:42AM +0200, Ondřej Jirman wrote:
> Hi Maxime,
> 
> On Wed, Jul 08, 2020 at 03:57:48PM +0200, Maxime Ripard wrote:
> > On Wed, Jul 08, 2020 at 03:44:41PM +0200, Ondřej Jirman wrote:
> > > >
> 
> [...]
> 
> > > > Yeah, but on the other hand, we regularly have people that come up and
> > > > ask if a "legitimate" EPROBE_DEFER error message (as in: the driver
> > > > wasn't there on the first attempt but was there on the second) is a
> > > > cause of concern or not.
> > > 
> > > That's why I also added a success message, to distinguish this case. 
> > 
> > That doesn't really help though. We have plenty of drivers that have
> > some sort of success message and people will still ask about that error
> > message earlier.
> > 
> > > > > And people run several distros for 3-4 months without anyone noticing any
> > > > > issues and that thermal regulation doesn't work. So it seems that lack of a
> > > > > success message is not enough.
> > > > 
> > > > I understand what the issue is, but do you really expect phone users to
> > > > monitor the kernel logs every time they boot their phone to see if the
> > > > thermal throttling is enabled?
> > > 
> > > Not phone users, but people making their own kernels/distributions. Those people
> > > monitor dmesg, and out of 4 distros or more nobody noticed there was an issue
> > > (despite the complaints of overheating by their users).
> > > 
> > > So I thought some warning may be in order, so that distro people more easily
> > > notice they have misconfigured the kernel or sometging.
> > 
> > I mean, then there's nothing we can do to properly address that then.
> > 
> > The configuration system is a gun, we can point at the target, but
> > anyone is definitely free to shot themself in the foot.
> > 
> > You would have exactly the same result if you left the thermal driver
> > disabled, or if you didn't have cpufreq support.
> 
> Right. Though I hope there's some middle ground. I mean all of those dev_err
> in error paths of many drivers are there mostly to help debugging stuff.

Adding all the error messages you have in that patch seems like a good
middle ground to me, and we could definitely use more of them in some
other drivers (like the USB PHY)

> And even though I was part of this driver's development, it took me quite
> some time to figure out it was the missing sunxi-sid driver causing the issue,
> with complete silence from the driver.
> 
> Maybe this can/will be solved at another level entirely, like having a device
> core report devices probes that failed with EPROBE_DEFER some time after
> the boot finished and modules had a chance to load, instead of immediately
> for each probe retry.

The thing is that there's never a point in time where "all the modules
had a chance to load". If you're loading the modules on demand and have
an hotpluggable bus, that might happen after a second or after a year,
you can't say.

The actual fix for this would be to use the on demand probing that seems
to be in the works and avoid EPROBE_DEFER entirely, but that probably
won't happen in a near future.

Maxime
diff mbox series

Patch

diff --git a/drivers/thermal/sun8i_thermal.c b/drivers/thermal/sun8i_thermal.c
index 74d73be16496..9065e79ae743 100644
--- a/drivers/thermal/sun8i_thermal.c
+++ b/drivers/thermal/sun8i_thermal.c
@@ -287,8 +287,12 @@  static int sun8i_ths_calibrate(struct ths_device *tmdev)
 
 	calcell = devm_nvmem_cell_get(dev, "calibration");
 	if (IS_ERR(calcell)) {
+		dev_err(dev, "Failed to get calibration nvmem cell (%ld)\n",
+			PTR_ERR(calcell));
+
 		if (PTR_ERR(calcell) == -EPROBE_DEFER)
 			return -EPROBE_DEFER;
+
 		/*
 		 * Even if the external calibration data stored in sid is
 		 * not accessible, the THS hardware can still work, although
@@ -308,6 +312,8 @@  static int sun8i_ths_calibrate(struct ths_device *tmdev)
 	caldata = nvmem_cell_read(calcell, &callen);
 	if (IS_ERR(caldata)) {
 		ret = PTR_ERR(caldata);
+		dev_err(dev, "Failed to read calibration data (%d)\n",
+			ret);
 		goto out;
 	}
 
@@ -330,23 +336,35 @@  static int sun8i_ths_resource_init(struct ths_device *tmdev)
 		return PTR_ERR(base);
 
 	tmdev->regmap = devm_regmap_init_mmio(dev, base, &config);
-	if (IS_ERR(tmdev->regmap))
+	if (IS_ERR(tmdev->regmap)) {
+		dev_err(dev, "Failed to init regmap (%ld)\n",
+			PTR_ERR(tmdev->regmap));
 		return PTR_ERR(tmdev->regmap);
+	}
 
 	if (tmdev->chip->has_bus_clk_reset) {
 		tmdev->reset = devm_reset_control_get(dev, NULL);
-		if (IS_ERR(tmdev->reset))
+		if (IS_ERR(tmdev->reset)) {
+			dev_err(dev, "Failed to get reset (%ld)\n",
+				PTR_ERR(tmdev->reset));
 			return PTR_ERR(tmdev->reset);
+		}
 
 		tmdev->bus_clk = devm_clk_get(&pdev->dev, "bus");
-		if (IS_ERR(tmdev->bus_clk))
+		if (IS_ERR(tmdev->bus_clk)) {
+			dev_err(dev, "Failed to get bus clock (%ld)\n",
+				PTR_ERR(tmdev->bus_clk));
 			return PTR_ERR(tmdev->bus_clk);
+		}
 	}
 
 	if (tmdev->chip->has_mod_clk) {
 		tmdev->mod_clk = devm_clk_get(&pdev->dev, "mod");
-		if (IS_ERR(tmdev->mod_clk))
+		if (IS_ERR(tmdev->mod_clk)) {
+			dev_err(dev, "Failed to get mod clock (%ld)\n",
+				PTR_ERR(tmdev->mod_clk));
 			return PTR_ERR(tmdev->mod_clk);
+		}
 	}
 
 	ret = reset_control_deassert(tmdev->reset);
@@ -471,8 +489,12 @@  static int sun8i_ths_register(struct ths_device *tmdev)
 							     i,
 							     &tmdev->sensor[i],
 							     &ths_ops);
-		if (IS_ERR(tmdev->sensor[i].tzd))
+		if (IS_ERR(tmdev->sensor[i].tzd)) {
+			dev_err(tmdev->dev,
+				"Failed to register sensor %d (%ld)\n",
+				i, PTR_ERR(tmdev->sensor[i].tzd));
 			return PTR_ERR(tmdev->sensor[i].tzd);
+		}
 
 		if (devm_thermal_add_hwmon_sysfs(tmdev->sensor[i].tzd))
 			dev_warn(tmdev->dev,
@@ -501,19 +523,21 @@  static int sun8i_ths_probe(struct platform_device *pdev)
 
 	ret = sun8i_ths_resource_init(tmdev);
 	if (ret)
-		return ret;
+		goto err_out;
 
 	irq = platform_get_irq(pdev, 0);
-	if (irq < 0)
-		return irq;
+	if (irq < 0) {
+		ret = irq;
+		goto err_out;
+	}
 
 	ret = tmdev->chip->init(tmdev);
 	if (ret)
-		return ret;
+		goto err_out;
 
 	ret = sun8i_ths_register(tmdev);
 	if (ret)
-		return ret;
+		goto err_out;
 
 	/*
 	 * Avoid entering the interrupt handler, the thermal device is not
@@ -523,10 +547,17 @@  static int sun8i_ths_probe(struct platform_device *pdev)
 	ret = devm_request_threaded_irq(dev, irq, NULL,
 					sun8i_irq_thread,
 					IRQF_ONESHOT, "ths", tmdev);
-	if (ret)
-		return ret;
+	if (ret) {
+		dev_err(dev, "Failed to request irq (%d)\n", ret);
+		goto err_out;
+	}
 
+	dev_info(dev, "Thermal sensor ready!\n");
 	return 0;
+
+err_out:
+	dev_err(dev, "Failed to probe thermal sensor (%d)\n", ret);
+	return ret;
 }
 
 static int sun8i_ths_remove(struct platform_device *pdev)