diff mbox series

[v4,05/14] soc: mediatek: mtk-svs: use svs clk control APIs

Message ID 20230111074528.29354-6-roger.lu@mediatek.com (mailing list archive)
State New, archived
Headers show
Series Enahance SVS's robustness | expand

Commit Message

Roger Lu Jan. 11, 2023, 7:45 a.m. UTC
In MediaTek HW design, svs and thermal both use the same clk source.
It means that svs clk reference count from CCF includes thermal control
counts. That makes svs driver confuse whether it disabled svs's main clk
or not from CCF's perspective and lead to turn off their shared clk
unexpectedly. Therefore, we add svs clk control APIs to make sure svs's
main clk is controlled well by svs driver itself.

Here is a NG example. Rely on CCF's reference count and cause problem.

thermal probe (clk ref = 1)
-> svs probe (clk ref = 2)
   -> svs suspend (clk ref = 1)
      -> thermal suspend (clk ref = 0)
      -> thermal resume (clk ref = 1)
   -> svs resume (encounter error, clk ref = 1)
   -> svs suspend (clk ref = 0)
      -> thermal suspend (Fail here, thermal HW control w/o clk)

Fixes: a825d72f74a3 ("soc: mediatek: fix missing clk_disable_unprepare() on err in svs_resume()")
Signed-off-by: Roger Lu <roger.lu@mediatek.com>
---
 drivers/soc/mediatek/mtk-svs.c | 57 ++++++++++++++++++++++++++--------
 1 file changed, 44 insertions(+), 13 deletions(-)

Comments

Matthias Brugger Jan. 31, 2023, 1:19 p.m. UTC | #1
On 11/01/2023 08:45, Roger Lu wrote:
> In MediaTek HW design, svs and thermal both use the same clk source.
> It means that svs clk reference count from CCF includes thermal control
> counts. That makes svs driver confuse whether it disabled svs's main clk
> or not from CCF's perspective and lead to turn off their shared clk
> unexpectedly. Therefore, we add svs clk control APIs to make sure svs's
> main clk is controlled well by svs driver itself.
> 
> Here is a NG example. Rely on CCF's reference count and cause problem.
> 
> thermal probe (clk ref = 1)
> -> svs probe (clk ref = 2)
>     -> svs suspend (clk ref = 1)
>        -> thermal suspend (clk ref = 0)
>        -> thermal resume (clk ref = 1)
>     -> svs resume (encounter error, clk ref = 1)
>     -> svs suspend (clk ref = 0)
>        -> thermal suspend (Fail here, thermal HW control w/o clk)
> 
> Fixes: a825d72f74a3 ("soc: mediatek: fix missing clk_disable_unprepare() on err in svs_resume()")
> Signed-off-by: Roger Lu <roger.lu@mediatek.com>

That looks wrong. Although I don't out of my mind, there should be a way to tell 
the clock framework that this clock is shared between several devices.

I wonder if using clk_enable and clk_disable in svs_resume/suspend wouldn't be 
enough.

Regards,
Matthias

> ---
>   drivers/soc/mediatek/mtk-svs.c | 57 ++++++++++++++++++++++++++--------
>   1 file changed, 44 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/soc/mediatek/mtk-svs.c b/drivers/soc/mediatek/mtk-svs.c
> index 9575aa645643..830263bad81e 100644
> --- a/drivers/soc/mediatek/mtk-svs.c
> +++ b/drivers/soc/mediatek/mtk-svs.c
> @@ -326,6 +326,7 @@ static const u32 svs_regs_v2[] = {
>    * @bank_max: total number of svs banks
>    * @efuse: svs efuse data received from NVMEM framework
>    * @tefuse: thermal efuse data received from NVMEM framework
> + * @clk_cnt: clock count shows the clk enable/disable times by svs driver
>    */
>   struct svs_platform {
>   	char *name;
> @@ -343,6 +344,7 @@ struct svs_platform {
>   	u32 bank_max;
>   	u32 *efuse;
>   	u32 *tefuse;
> +	s32 clk_cnt;
>   };
>   
>   struct svs_platform_data {
> @@ -502,6 +504,32 @@ static void svs_switch_bank(struct svs_platform *svsp)
>   	svs_writel_relaxed(svsp, svsb->core_sel, CORESEL);
>   }
>   
> +static bool svs_is_clk_enabled(struct svs_platform *svsp)
> +{
> +	return svsp->clk_cnt > 0 ? true : false;
> +}
> +
> +static int svs_clk_enable(struct svs_platform *svsp)
> +{
> +	int ret;
> +
> +	ret = clk_prepare_enable(svsp->main_clk);
> +	if (ret) {
> +		dev_err(svsp->dev, "cannot enable main_clk: %d\n", ret);
> +		return ret;
> +	}
> +
> +	svsp->clk_cnt++;
> +
> +	return 0;
> +}
> +
> +static void svs_clk_disable(struct svs_platform *svsp)
> +{
> +	clk_disable_unprepare(svsp->main_clk);
> +	svsp->clk_cnt--;
> +}
> +
>   static u32 svs_bank_volt_to_opp_volt(u32 svsb_volt, u32 svsb_volt_step,
>   				     u32 svsb_volt_base)
>   {
> @@ -1569,6 +1597,12 @@ static int svs_suspend(struct device *dev)
>   	int ret;
>   	u32 idx;
>   
> +	if (!svs_is_clk_enabled(svsp)) {
> +		dev_err(svsp->dev, "svs clk is disabled already (%d)\n",
> +			svsp->clk_cnt);
> +		return 0;
> +	}
> +
>   	for (idx = 0; idx < svsp->bank_max; idx++) {
>   		svsb = &svsp->banks[idx];
>   
> @@ -1590,7 +1624,7 @@ static int svs_suspend(struct device *dev)
>   		return ret;
>   	}
>   
> -	clk_disable_unprepare(svsp->main_clk);
> +	svs_clk_disable(svsp);
>   
>   	return 0;
>   }
> @@ -1600,16 +1634,14 @@ static int svs_resume(struct device *dev)
>   	struct svs_platform *svsp = dev_get_drvdata(dev);
>   	int ret;
>   
> -	ret = clk_prepare_enable(svsp->main_clk);
> -	if (ret) {
> -		dev_err(svsp->dev, "cannot enable main_clk, disable svs\n");
> +	ret = svs_clk_enable(svsp);
> +	if (ret)
>   		return ret;
> -	}
>   
>   	ret = reset_control_deassert(svsp->rst);
>   	if (ret) {
>   		dev_err(svsp->dev, "cannot deassert reset %d\n", ret);
> -		goto out_of_resume;
> +		goto svs_resume_clk_disable;
>   	}
>   
>   	ret = svs_init02(svsp);
> @@ -1624,8 +1656,9 @@ static int svs_resume(struct device *dev)
>   	dev_err(svsp->dev, "assert reset: %d\n",
>   		reset_control_assert(svsp->rst));
>   
> -out_of_resume:
> -	clk_disable_unprepare(svsp->main_clk);
> +svs_resume_clk_disable:
> +	svs_clk_disable(svsp);
> +
>   	return ret;
>   }
>   
> @@ -2411,11 +2444,9 @@ static int svs_probe(struct platform_device *pdev)
>   		goto svs_probe_free_resource;
>   	}
>   
> -	ret = clk_prepare_enable(svsp->main_clk);
> -	if (ret) {
> -		dev_err(svsp->dev, "cannot enable main clk: %d\n", ret);
> +	ret = svs_clk_enable(svsp);
> +	if (ret)
>   		goto svs_probe_free_resource;
> -	}
>   
>   	svsp->base = of_iomap(svsp->dev->of_node, 0);
>   	if (IS_ERR_OR_NULL(svsp->base)) {
> @@ -2456,7 +2487,7 @@ static int svs_probe(struct platform_device *pdev)
>   	iounmap(svsp->base);
>   
>   svs_probe_clk_disable:
> -	clk_disable_unprepare(svsp->main_clk);
> +	svs_clk_disable(svsp);
>   
>   svs_probe_free_resource:
>   	if (!IS_ERR_OR_NULL(svsp->efuse))
Roger Lu Feb. 1, 2023, 12:28 p.m. UTC | #2
Hi Matthias Sir,

On Tue, 2023-01-31 at 14:19 +0100, Matthias Brugger wrote:
> 
> On 11/01/2023 08:45, Roger Lu wrote:
> > In MediaTek HW design, svs and thermal both use the same clk source.
> > It means that svs clk reference count from CCF includes thermal control
> > counts. That makes svs driver confuse whether it disabled svs's main clk
> > or not from CCF's perspective and lead to turn off their shared clk
> > unexpectedly. Therefore, we add svs clk control APIs to make sure svs's
> > main clk is controlled well by svs driver itself.
> > 
> > Here is a NG example. Rely on CCF's reference count and cause problem.
> > 
> > thermal probe (clk ref = 1)
> > -> svs probe (clk ref = 2)
> >     -> svs suspend (clk ref = 1)
> >        -> thermal suspend (clk ref = 0)
> >        -> thermal resume (clk ref = 1)
> >     -> svs resume (encounter error, clk ref = 1)
> >     -> svs suspend (clk ref = 0)
> >        -> thermal suspend (Fail here, thermal HW control w/o clk)
> > 
> > Fixes: a825d72f74a3 ("soc: mediatek: fix missing clk_disable_unprepare() on
> > err in svs_resume()")
> > Signed-off-by: Roger Lu <roger.lu@mediatek.com>
> 
> That looks wrong. Although I don't out of my mind, there should be a way to
> tell 
> the clock framework that this clock is shared between several devices.
> 
> I wonder if using clk_enable and clk_disable in svs_resume/suspend wouldn't
> be 
> enough.

Oh yes, Common Clock Framework (CCF) knows the clock shared between several
devices and maintains clock "on/off" by reference count.

We concern how to stop running svs_suspend() when svs clk is already disabled by
svs_resume(). Take an example as below, if we refers to __clk_is_enabled()
result for knowing svs clk status, it will return "true" all the time because
thermal clk is still on. This causes the problem mentioned in commit message.

static int svs_suspend(struct device *dev)
{
...
	if (!__clk_is_enabled(svsp->main_clk)) //always get `true`
		return 0;
...
}

> 
> Regards,
> Matthias

... [snip] ...
Matthias Brugger Feb. 2, 2023, 10:29 a.m. UTC | #3
你好,

On 01/02/2023 13:28, Roger Lu (陸瑞傑) wrote:
> Hi Matthias Sir,
> 
> On Tue, 2023-01-31 at 14:19 +0100, Matthias Brugger wrote:
>>
>> On 11/01/2023 08:45, Roger Lu wrote:
>>> In MediaTek HW design, svs and thermal both use the same clk source.
>>> It means that svs clk reference count from CCF includes thermal control
>>> counts. That makes svs driver confuse whether it disabled svs's main clk
>>> or not from CCF's perspective and lead to turn off their shared clk
>>> unexpectedly. Therefore, we add svs clk control APIs to make sure svs's
>>> main clk is controlled well by svs driver itself.
>>>
>>> Here is a NG example. Rely on CCF's reference count and cause problem.
>>>
>>> thermal probe (clk ref = 1)
>>> -> svs probe (clk ref = 2)
>>>      -> svs suspend (clk ref = 1)
>>>         -> thermal suspend (clk ref = 0)
>>>         -> thermal resume (clk ref = 1)
>>>      -> svs resume (encounter error, clk ref = 1)
>>>      -> svs suspend (clk ref = 0)
>>>         -> thermal suspend (Fail here, thermal HW control w/o clk)
>>>
>>> Fixes: a825d72f74a3 ("soc: mediatek: fix missing clk_disable_unprepare() on
>>> err in svs_resume()")
>>> Signed-off-by: Roger Lu <roger.lu@mediatek.com>
>>
>> That looks wrong. Although I don't out of my mind, there should be a way to
>> tell
>> the clock framework that this clock is shared between several devices.
>>
>> I wonder if using clk_enable and clk_disable in svs_resume/suspend wouldn't
>> be
>> enough.
> 
> Oh yes, Common Clock Framework (CCF) knows the clock shared between several
> devices and maintains clock "on/off" by reference count.
> 

The thing is if you use clk_prepare_enable then the clock framework check's if 
the clock is already prepared, which could happen like you described in the 
svs_resume (encount error) case in the commit message. The question is, can't we 
just use clk_enable and clk_disable in resume/suspend and only prepare the clock 
in the probe function?

> We concern how to stop running svs_suspend() when svs clk is already disabled by
> svs_resume(). Take an example as below, if we refers to __clk_is_enabled()
> result for knowing svs clk status, it will return "true" all the time because
> thermal clk is still on. This causes the problem mentioned in commit message.
> 

I would expect that the kernel takes care that we can't enter a resume path for 
a device before the suspend path has finished. Honestly I don't really 
understand the problem here. It seems something different then what you 
described in the commit message.

Please help me understand better.

谢谢,再见

Matthias

> static int svs_suspend(struct device *dev)
> {
> ...
> 	if (!__clk_is_enabled(svsp->main_clk)) //always get `true`
> 		return 0;
> ...
> }
> 
>>
>> Regards,
>> Matthias
> 
> ... [snip] ...
Roger Lu Feb. 6, 2023, 2:01 a.m. UTC | #4
Hi Matthias Sir,


On Thu, 2023-02-02 at 11:29 +0100, Matthias Brugger wrote:
> 你好,

I got shock and thought someone used your name to reply. However,
your email account helps me clear my mind. Haha.. Nice and warm to see mandarin
on patchwork. It's so fresh and exciting :-).

> 
> On 01/02/2023 13:28, Roger Lu (陸瑞傑) wrote:
> > Hi Matthias Sir,
> > 
> > On Tue, 2023-01-31 at 14:19 +0100, Matthias Brugger wrote:
> > > 
> > > On 11/01/2023 08:45, Roger Lu wrote:
> > > > In MediaTek HW design, svs and thermal both use the same clk source.
> > > > It means that svs clk reference count from CCF includes thermal control
> > > > counts. That makes svs driver confuse whether it disabled svs's main clk
> > > > or not from CCF's perspective and lead to turn off their shared clk
> > > > unexpectedly. Therefore, we add svs clk control APIs to make sure svs's
> > > > main clk is controlled well by svs driver itself.
> > > > 
> > > > Here is a NG example. Rely on CCF's reference count and cause problem.
> > > > 
> > > > thermal probe (clk ref = 1)
> > > > -> svs probe (clk ref = 2)
> > > >      -> svs suspend (clk ref = 1)
> > > >         -> thermal suspend (clk ref = 0)
> > > >         -> thermal resume (clk ref = 1)
> > > >      -> svs resume (encounter error, clk ref = 1)
> > > >      -> svs suspend (clk ref = 0)
> > > >         -> thermal suspend (Fail here, thermal HW control w/o clk)
> > > > 
> > > > Fixes: a825d72f74a3 ("soc: mediatek: fix missing clk_disable_unprepare()
> > > > on
> > > > err in svs_resume()")
> > > > Signed-off-by: Roger Lu <roger.lu@mediatek.com>
> > > 
> > > That looks wrong. Although I don't out of my mind, there should be a way
> > > to
> > > tell
> > > the clock framework that this clock is shared between several devices.
> > > 
> > > I wonder if using clk_enable and clk_disable in svs_resume/suspend
> > > wouldn't
> > > be
> > > enough.
> > 
> > Oh yes, Common Clock Framework (CCF) knows the clock shared between several
> > devices and maintains clock "on/off" by reference count.
> > 
> 
> The thing is if you use clk_prepare_enable then the clock framework check's
> if 
> the clock is already prepared, which could happen like you described in the 
> svs_resume (encount error) case in the commit message. The question is, can't
> we 
> just use clk_enable and clk_disable in resume/suspend and only prepare the
> clock 
> in the probe function?

We'll think if this can fix the problem. Thanks for the advice very much.

> 
> > We concern how to stop running svs_suspend() when svs clk is already
> > disabled by
> > svs_resume(). Take an example as below, if we refers to __clk_is_enabled()
> > result for knowing svs clk status, it will return "true" all the time
> > because
> > thermal clk is still on. This causes the problem mentioned in commit
> > message.
> > 
> 
> I would expect that the kernel takes care that we can't enter a resume path
> for 
> a device before the suspend path has finished. Honestly I don't really 
> understand the problem here. It seems something different then what you 
> described in the commit message.
> 
> Please help me understand better.

I see. This patch title needs to be changed to "avoid turning off svs clk twice
unexpectedly" for pointing out the problem precisely. We saw a loophole that svs
clk might be turned off in svs_resume() first and in svs_suspend() again without
enabling svs clk during these the process. Therefore, we try to fix it by this
patch. Below is our thinking process to explain how we got here.

1. (abandoned) We add __clk_is_enabled() check in svs_suspend() to prevent svs
clk from being turned off twice when svs_resume() turned off svs clk in the
error-handling process. Nonetheless, we met the NG case in the commit message.
2. (current patch) We add svs clk control hint to understand if we need to run
svs_suspend() or not if svs_resume() turned off svs clk before.

> 
> 谢谢,再见

:-)


Sincerely,
Roger Lu
Matthias Brugger Feb. 6, 2023, 12:09 p.m. UTC | #5
On 06/02/2023 03:01, Roger Lu (陸瑞傑) wrote:
> Hi Matthias Sir,
> 
> 
> On Thu, 2023-02-02 at 11:29 +0100, Matthias Brugger wrote:
>> 你好,
> 
> I got shock and thought someone used your name to reply. However,
> your email account helps me clear my mind. Haha.. Nice and warm to see mandarin
> on patchwork. It's so fresh and exciting :-).
> 

谢谢。 I'm learning mainland Chinese for a few month now, I also learned that you 
use different symbols in Taiwan, which I don't know. 对不起。

>>
>> On 01/02/2023 13:28, Roger Lu (陸瑞傑) wrote:
>>> Hi Matthias Sir,
>>>
>>> On Tue, 2023-01-31 at 14:19 +0100, Matthias Brugger wrote:
>>>>
>>>> On 11/01/2023 08:45, Roger Lu wrote:
>>>>> In MediaTek HW design, svs and thermal both use the same clk source.
>>>>> It means that svs clk reference count from CCF includes thermal control
>>>>> counts. That makes svs driver confuse whether it disabled svs's main clk
>>>>> or not from CCF's perspective and lead to turn off their shared clk
>>>>> unexpectedly. Therefore, we add svs clk control APIs to make sure svs's
>>>>> main clk is controlled well by svs driver itself.
>>>>>
>>>>> Here is a NG example. Rely on CCF's reference count and cause problem.
>>>>>
>>>>> thermal probe (clk ref = 1)
>>>>> -> svs probe (clk ref = 2)
>>>>>       -> svs suspend (clk ref = 1)
>>>>>          -> thermal suspend (clk ref = 0)
>>>>>          -> thermal resume (clk ref = 1)
>>>>>       -> svs resume (encounter error, clk ref = 1)
>>>>>       -> svs suspend (clk ref = 0)
>>>>>          -> thermal suspend (Fail here, thermal HW control w/o clk)
>>>>>
>>>>> Fixes: a825d72f74a3 ("soc: mediatek: fix missing clk_disable_unprepare()
>>>>> on
>>>>> err in svs_resume()")
>>>>> Signed-off-by: Roger Lu <roger.lu@mediatek.com>
>>>>
>>>> That looks wrong. Although I don't out of my mind, there should be a way
>>>> to
>>>> tell
>>>> the clock framework that this clock is shared between several devices.
>>>>
>>>> I wonder if using clk_enable and clk_disable in svs_resume/suspend
>>>> wouldn't
>>>> be
>>>> enough.
>>>
>>> Oh yes, Common Clock Framework (CCF) knows the clock shared between several
>>> devices and maintains clock "on/off" by reference count.
>>>
>>
>> The thing is if you use clk_prepare_enable then the clock framework check's
>> if
>> the clock is already prepared, which could happen like you described in the
>> svs_resume (encount error) case in the commit message. The question is, can't
>> we
>> just use clk_enable and clk_disable in resume/suspend and only prepare the
>> clock
>> in the probe function?
> 
> We'll think if this can fix the problem. Thanks for the advice very much.
> 
>>
>>> We concern how to stop running svs_suspend() when svs clk is already
>>> disabled by
>>> svs_resume(). Take an example as below, if we refers to __clk_is_enabled()
>>> result for knowing svs clk status, it will return "true" all the time
>>> because
>>> thermal clk is still on. This causes the problem mentioned in commit
>>> message.
>>>
>>
>> I would expect that the kernel takes care that we can't enter a resume path
>> for
>> a device before the suspend path has finished. Honestly I don't really
>> understand the problem here. It seems something different then what you
>> described in the commit message.
>>
>> Please help me understand better.
> 
> I see. This patch title needs to be changed to "avoid turning off svs clk twice
> unexpectedly" for pointing out the problem precisely. We saw a loophole that svs
> clk might be turned off in svs_resume() first and in svs_suspend() again without
> enabling svs clk during these the process. Therefore, we try to fix it by this
> patch. Below is our thinking process to explain how we got here.
> 
> 1. (abandoned) We add __clk_is_enabled() check in svs_suspend() to prevent svs
> clk from being turned off twice when svs_resume() turned off svs clk in the
> error-handling process. Nonetheless, we met the NG case in the commit message.
> 2. (current patch) We add svs clk control hint to understand if we need to run
> svs_suspend() or not if svs_resume() turned off svs clk before.
> 

Did you had a look on the dev_pm_ops? Maybe we can use suspend_late, 
resume_early to make sure there is no race condition. I wonder also if we can't 
make sure that this does not happen using device links. Sorry, I can't give 
better guidance on how to use this technologies, but I have the feeling we can 
fix this with existing infrastructure.

再见。

Matthias

>>
>> 谢谢,再见
> 
> :-)
> 
> 
> Sincerely,
> Roger Lu
Roger Lu Feb. 11, 2023, 11:34 a.m. UTC | #6
Hi Matthias Sir,

On Mon, 2023-02-06 at 13:09 +0100, Matthias Brugger wrote:
> 
> On 06/02/2023 03:01, Roger Lu (陸瑞傑) wrote:
> > Hi Matthias Sir,
> > 
> > 
> > On Thu, 2023-02-02 at 11:29 +0100, Matthias Brugger wrote:
> > > 你好,
> > 
> > I got shock and thought someone used your name to reply. However,
> > your email account helps me clear my mind. Haha.. Nice and warm to see
> > mandarin
> > on patchwork. It's so fresh and exciting :-).
> > 
> 
> 谢谢。 I'm learning mainland Chinese for a few month now, I also learned that
> you 
> use different symbols in Taiwan, which I don't know. 对不起。

Ha. Both symbols are welcome to me.  :-)

> 
> > > 
> > > On 01/02/2023 13:28, Roger Lu (陸瑞傑) wrote:
> > > > Hi Matthias Sir,
> > > > 
> > > > On Tue, 2023-01-31 at 14:19 +0100, Matthias Brugger wrote:
> > > > > 
> > > > > On 11/01/2023 08:45, Roger Lu wrote:
> > > > > > In MediaTek HW design, svs and thermal both use the same clk source.
> > > > > > It means that svs clk reference count from CCF includes thermal
> > > > > > control
> > > > > > counts. That makes svs driver confuse whether it disabled svs's main
> > > > > > clk
> > > > > > or not from CCF's perspective and lead to turn off their shared clk
> > > > > > unexpectedly. Therefore, we add svs clk control APIs to make sure
> > > > > > svs's
> > > > > > main clk is controlled well by svs driver itself.
> > > > > > 
> > > > > > Here is a NG example. Rely on CCF's reference count and cause
> > > > > > problem.
> > > > > > 
> > > > > > thermal probe (clk ref = 1)
> > > > > > -> svs probe (clk ref = 2)
> > > > > >       -> svs suspend (clk ref = 1)
> > > > > >          -> thermal suspend (clk ref = 0)
> > > > > >          -> thermal resume (clk ref = 1)
> > > > > >       -> svs resume (encounter error, clk ref = 1)
> > > > > >       -> svs suspend (clk ref = 0)
> > > > > >          -> thermal suspend (Fail here, thermal HW control w/o clk)
> > > > > > 
> > > > > > Fixes: a825d72f74a3 ("soc: mediatek: fix missing
> > > > > > clk_disable_unprepare()
> > > > > > on
> > > > > > err in svs_resume()")
> > > > > > Signed-off-by: Roger Lu <roger.lu@mediatek.com>
> > > > > 
> > > > > That looks wrong. Although I don't out of my mind, there should be a
> > > > > way
> > > > > to
> > > > > tell
> > > > > the clock framework that this clock is shared between several devices.
> > > > > 
> > > > > I wonder if using clk_enable and clk_disable in svs_resume/suspend
> > > > > wouldn't
> > > > > be
> > > > > enough.
> > > > 
> > > > Oh yes, Common Clock Framework (CCF) knows the clock shared between
> > > > several
> > > > devices and maintains clock "on/off" by reference count.
> > > > 
> > > 
> > > The thing is if you use clk_prepare_enable then the clock framework
> > > check's
> > > if
> > > the clock is already prepared, which could happen like you described in
> > > the
> > > svs_resume (encount error) case in the commit message. The question is,
> > > can't
> > > we
> > > just use clk_enable and clk_disable in resume/suspend and only prepare the
> > > clock
> > > in the probe function?
> > 
> > We'll think if this can fix the problem. Thanks for the advice very much.
> > 
> > > 
> > > > We concern how to stop running svs_suspend() when svs clk is already
> > > > disabled by
> > > > svs_resume(). Take an example as below, if we refers to
> > > > __clk_is_enabled()
> > > > result for knowing svs clk status, it will return "true" all the time
> > > > because
> > > > thermal clk is still on. This causes the problem mentioned in commit
> > > > message.
> > > > 
> > > 
> > > I would expect that the kernel takes care that we can't enter a resume
> > > path
> > > for
> > > a device before the suspend path has finished. Honestly I don't really
> > > understand the problem here. It seems something different then what you
> > > described in the commit message.
> > > 
> > > Please help me understand better.
> > 
> > I see. This patch title needs to be changed to "avoid turning off svs clk
> > twice
> > unexpectedly" for pointing out the problem precisely. We saw a loophole that
> > svs
> > clk might be turned off in svs_resume() first and in svs_suspend() again
> > without
> > enabling svs clk during these the process. Therefore, we try to fix it by
> > this
> > patch. Below is our thinking process to explain how we got here.
> > 
> > 1. (abandoned) We add __clk_is_enabled() check in svs_suspend() to prevent
> > svs
> > clk from being turned off twice when svs_resume() turned off svs clk in the
> > error-handling process. Nonetheless, we met the NG case in the commit
> > message.
> > 2. (current patch) We add svs clk control hint to understand if we need to
> > run
> > svs_suspend() or not if svs_resume() turned off svs clk before.
> > 
> 
> Did you had a look on the dev_pm_ops? Maybe we can use suspend_late, 
> resume_early to make sure there is no race condition. I wonder also if we
> can't 
> make sure that this does not happen using device links. Sorry, I can't give 
> better guidance on how to use this technologies, but I have the feeling we
> can 
> fix this with existing infrastructure.

No, we didn't look on dev_pm_ops and it seems like another way to fix this issue
with __clk_is_enabled() check in svs_suspend(). Thanks for the advice again.
we'll keep looking for any possible answers to this issue.

Sincerely,
Roger Lu.
diff mbox series

Patch

diff --git a/drivers/soc/mediatek/mtk-svs.c b/drivers/soc/mediatek/mtk-svs.c
index 9575aa645643..830263bad81e 100644
--- a/drivers/soc/mediatek/mtk-svs.c
+++ b/drivers/soc/mediatek/mtk-svs.c
@@ -326,6 +326,7 @@  static const u32 svs_regs_v2[] = {
  * @bank_max: total number of svs banks
  * @efuse: svs efuse data received from NVMEM framework
  * @tefuse: thermal efuse data received from NVMEM framework
+ * @clk_cnt: clock count shows the clk enable/disable times by svs driver
  */
 struct svs_platform {
 	char *name;
@@ -343,6 +344,7 @@  struct svs_platform {
 	u32 bank_max;
 	u32 *efuse;
 	u32 *tefuse;
+	s32 clk_cnt;
 };
 
 struct svs_platform_data {
@@ -502,6 +504,32 @@  static void svs_switch_bank(struct svs_platform *svsp)
 	svs_writel_relaxed(svsp, svsb->core_sel, CORESEL);
 }
 
+static bool svs_is_clk_enabled(struct svs_platform *svsp)
+{
+	return svsp->clk_cnt > 0 ? true : false;
+}
+
+static int svs_clk_enable(struct svs_platform *svsp)
+{
+	int ret;
+
+	ret = clk_prepare_enable(svsp->main_clk);
+	if (ret) {
+		dev_err(svsp->dev, "cannot enable main_clk: %d\n", ret);
+		return ret;
+	}
+
+	svsp->clk_cnt++;
+
+	return 0;
+}
+
+static void svs_clk_disable(struct svs_platform *svsp)
+{
+	clk_disable_unprepare(svsp->main_clk);
+	svsp->clk_cnt--;
+}
+
 static u32 svs_bank_volt_to_opp_volt(u32 svsb_volt, u32 svsb_volt_step,
 				     u32 svsb_volt_base)
 {
@@ -1569,6 +1597,12 @@  static int svs_suspend(struct device *dev)
 	int ret;
 	u32 idx;
 
+	if (!svs_is_clk_enabled(svsp)) {
+		dev_err(svsp->dev, "svs clk is disabled already (%d)\n",
+			svsp->clk_cnt);
+		return 0;
+	}
+
 	for (idx = 0; idx < svsp->bank_max; idx++) {
 		svsb = &svsp->banks[idx];
 
@@ -1590,7 +1624,7 @@  static int svs_suspend(struct device *dev)
 		return ret;
 	}
 
-	clk_disable_unprepare(svsp->main_clk);
+	svs_clk_disable(svsp);
 
 	return 0;
 }
@@ -1600,16 +1634,14 @@  static int svs_resume(struct device *dev)
 	struct svs_platform *svsp = dev_get_drvdata(dev);
 	int ret;
 
-	ret = clk_prepare_enable(svsp->main_clk);
-	if (ret) {
-		dev_err(svsp->dev, "cannot enable main_clk, disable svs\n");
+	ret = svs_clk_enable(svsp);
+	if (ret)
 		return ret;
-	}
 
 	ret = reset_control_deassert(svsp->rst);
 	if (ret) {
 		dev_err(svsp->dev, "cannot deassert reset %d\n", ret);
-		goto out_of_resume;
+		goto svs_resume_clk_disable;
 	}
 
 	ret = svs_init02(svsp);
@@ -1624,8 +1656,9 @@  static int svs_resume(struct device *dev)
 	dev_err(svsp->dev, "assert reset: %d\n",
 		reset_control_assert(svsp->rst));
 
-out_of_resume:
-	clk_disable_unprepare(svsp->main_clk);
+svs_resume_clk_disable:
+	svs_clk_disable(svsp);
+
 	return ret;
 }
 
@@ -2411,11 +2444,9 @@  static int svs_probe(struct platform_device *pdev)
 		goto svs_probe_free_resource;
 	}
 
-	ret = clk_prepare_enable(svsp->main_clk);
-	if (ret) {
-		dev_err(svsp->dev, "cannot enable main clk: %d\n", ret);
+	ret = svs_clk_enable(svsp);
+	if (ret)
 		goto svs_probe_free_resource;
-	}
 
 	svsp->base = of_iomap(svsp->dev->of_node, 0);
 	if (IS_ERR_OR_NULL(svsp->base)) {
@@ -2456,7 +2487,7 @@  static int svs_probe(struct platform_device *pdev)
 	iounmap(svsp->base);
 
 svs_probe_clk_disable:
-	clk_disable_unprepare(svsp->main_clk);
+	svs_clk_disable(svsp);
 
 svs_probe_free_resource:
 	if (!IS_ERR_OR_NULL(svsp->efuse))