diff mbox series

[v13,06/35] clk: tegra: Support runtime PM and power domain

Message ID 20210926224058.1252-7-digetx@gmail.com (mailing list archive)
State New, archived
Headers show
Series NVIDIA Tegra power management patches for 5.16 | expand

Commit Message

Dmitry Osipenko Sept. 26, 2021, 10:40 p.m. UTC
The Clock-and-Reset controller resides in a core power domain on NVIDIA
Tegra SoCs.  In order to support voltage scaling of the core power domain,
we hook up DVFS-capable clocks to the core GENPD for managing of the
GENPD's performance state based on the clock changes.

Some clocks don't have any specific physical hardware unit that backs
them, like root PLLs and system clock and they have theirs own voltage
requirements.  This patch adds new clk-device driver that backs the clocks
and provides runtime PM functionality for them.  A virtual clk-device is
created for each such DVFS-capable clock at the clock's registration time
by the new tegra_clk_register() helper.  Driver changes clock's device
GENPD performance state based on clk-rate notifications.

In result we have this sequence of events:

  1. Clock driver creates virtual device for selective clocks, enables
     runtime PM for the created device and registers the clock.
  2. Clk-device driver starts to listen to clock rate changes.
  3. Something changes clk rate or enables/disables clk.
  4. CCF core propagates the change through the clk tree.
  5. Clk-device driver gets clock rate-change notification or GENPD core
     handles prepare/unprepare of the clock.
  6. Clk-device driver changes GENPD performance state on clock rate
     change.
  7. GENPD driver changes voltage regulator state change.
  8. The regulator state is committed to hardware via I2C.

We rely on fact that DVFS is not needed for Tegra I2C and that Tegra I2C
driver already keeps clock always-prepared.  Hence I2C subsystem stays
independent from the clk power management and there are no deadlock spots
in the sequence.

Currently all clocks are registered very early during kernel boot when the
device driver core isn't available yet.  The clk-device can't be created
at that time.  This patch splits the registration of the clocks in two
phases:

  1. Register all essential clocks which don't use RPM and are needed
     during early boot.

  2. Register at a later boot time the rest of clocks.

This patch adds power management support for Tegra20 and Tegra30 clocks.

Tested-by: Peter Geis <pgwipeout@gmail.com> # Ouya T30
Tested-by: Paul Fertser <fercerpav@gmail.com> # PAZ00 T20
Tested-by: Nicolas Chauvet <kwizart@gmail.com> # PAZ00 T20 and TK1 T124
Tested-by: Matt Merhar <mattmerhar@protonmail.com> # Ouya T30
Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
---
 drivers/clk/tegra/Makefile      |   1 +
 drivers/clk/tegra/clk-device.c  | 230 ++++++++++++++++++++++++++++++++
 drivers/clk/tegra/clk-pll.c     |   2 +-
 drivers/clk/tegra/clk-super.c   |   2 +-
 drivers/clk/tegra/clk-tegra20.c |  77 ++++++++---
 drivers/clk/tegra/clk-tegra30.c | 116 +++++++++++-----
 drivers/clk/tegra/clk.c         |  75 ++++++++++-
 drivers/clk/tegra/clk.h         |   2 +
 8 files changed, 451 insertions(+), 54 deletions(-)
 create mode 100644 drivers/clk/tegra/clk-device.c

Comments

Ulf Hansson Oct. 1, 2021, 12:32 p.m. UTC | #1
On Mon, 27 Sept 2021 at 00:42, Dmitry Osipenko <digetx@gmail.com> wrote:
>
> The Clock-and-Reset controller resides in a core power domain on NVIDIA
> Tegra SoCs.  In order to support voltage scaling of the core power domain,
> we hook up DVFS-capable clocks to the core GENPD for managing of the
> GENPD's performance state based on the clock changes.
>
> Some clocks don't have any specific physical hardware unit that backs
> them, like root PLLs and system clock and they have theirs own voltage
> requirements.  This patch adds new clk-device driver that backs the clocks
> and provides runtime PM functionality for them.  A virtual clk-device is
> created for each such DVFS-capable clock at the clock's registration time
> by the new tegra_clk_register() helper.  Driver changes clock's device
> GENPD performance state based on clk-rate notifications.
>
> In result we have this sequence of events:
>
>   1. Clock driver creates virtual device for selective clocks, enables
>      runtime PM for the created device and registers the clock.
>   2. Clk-device driver starts to listen to clock rate changes.
>   3. Something changes clk rate or enables/disables clk.
>   4. CCF core propagates the change through the clk tree.
>   5. Clk-device driver gets clock rate-change notification or GENPD core
>      handles prepare/unprepare of the clock.
>   6. Clk-device driver changes GENPD performance state on clock rate
>      change.
>   7. GENPD driver changes voltage regulator state change.
>   8. The regulator state is committed to hardware via I2C.
>
> We rely on fact that DVFS is not needed for Tegra I2C and that Tegra I2C
> driver already keeps clock always-prepared.  Hence I2C subsystem stays
> independent from the clk power management and there are no deadlock spots
> in the sequence.
>
> Currently all clocks are registered very early during kernel boot when the
> device driver core isn't available yet.  The clk-device can't be created
> at that time.  This patch splits the registration of the clocks in two
> phases:
>
>   1. Register all essential clocks which don't use RPM and are needed
>      during early boot.
>
>   2. Register at a later boot time the rest of clocks.
>
> This patch adds power management support for Tegra20 and Tegra30 clocks.
>
> Tested-by: Peter Geis <pgwipeout@gmail.com> # Ouya T30
> Tested-by: Paul Fertser <fercerpav@gmail.com> # PAZ00 T20
> Tested-by: Nicolas Chauvet <kwizart@gmail.com> # PAZ00 T20 and TK1 T124
> Tested-by: Matt Merhar <mattmerhar@protonmail.com> # Ouya T30
> Signed-off-by: Dmitry Osipenko <digetx@gmail.com>
> ---
>  drivers/clk/tegra/Makefile      |   1 +
>  drivers/clk/tegra/clk-device.c  | 230 ++++++++++++++++++++++++++++++++
>  drivers/clk/tegra/clk-pll.c     |   2 +-
>  drivers/clk/tegra/clk-super.c   |   2 +-
>  drivers/clk/tegra/clk-tegra20.c |  77 ++++++++---
>  drivers/clk/tegra/clk-tegra30.c | 116 +++++++++++-----
>  drivers/clk/tegra/clk.c         |  75 ++++++++++-
>  drivers/clk/tegra/clk.h         |   2 +
>  8 files changed, 451 insertions(+), 54 deletions(-)
>  create mode 100644 drivers/clk/tegra/clk-device.c
>
> diff --git a/drivers/clk/tegra/Makefile b/drivers/clk/tegra/Makefile
> index 7b1816856eb5..a0715cdfc1a4 100644
> --- a/drivers/clk/tegra/Makefile
> +++ b/drivers/clk/tegra/Makefile
> @@ -1,6 +1,7 @@
>  # SPDX-License-Identifier: GPL-2.0
>  obj-y                                  += clk.o
>  obj-y                                  += clk-audio-sync.o
> +obj-y                                  += clk-device.o
>  obj-y                                  += clk-dfll.o
>  obj-y                                  += clk-divider.o
>  obj-y                                  += clk-periph.o
> diff --git a/drivers/clk/tegra/clk-device.c b/drivers/clk/tegra/clk-device.c
> new file mode 100644
> index 000000000000..830bc0ba25d3
> --- /dev/null
> +++ b/drivers/clk/tegra/clk-device.c
> @@ -0,0 +1,230 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +
> +#include <linux/clk.h>
> +#include <linux/clk-provider.h>
> +#include <linux/mutex.h>
> +#include <linux/of_device.h>
> +#include <linux/platform_device.h>
> +#include <linux/pm_domain.h>
> +#include <linux/pm_opp.h>
> +#include <linux/pm_runtime.h>
> +#include <linux/slab.h>
> +
> +#include <soc/tegra/common.h>
> +
> +#include "clk.h"
> +
> +/*
> + * This driver manages performance state of the core power domain for the
> + * independent PLLs and system clocks.  We created a virtual clock device
> + * for such clocks, see tegra_clk_dev_register().
> + */
> +
> +struct tegra_clk_device {
> +       struct notifier_block clk_nb;
> +       struct device *dev;
> +       struct clk_hw *hw;
> +       struct mutex lock;
> +};
> +
> +static int tegra_clock_set_pd_state(struct tegra_clk_device *clk_dev,
> +                                   unsigned long rate)
> +{
> +       struct device *dev = clk_dev->dev;
> +       struct dev_pm_opp *opp;
> +       unsigned int pstate;
> +
> +       opp = dev_pm_opp_find_freq_ceil(dev, &rate);
> +       if (opp == ERR_PTR(-ERANGE)) {
> +               dev_dbg(dev, "failed to find ceil OPP for %luHz\n", rate);
> +               opp = dev_pm_opp_find_freq_floor(dev, &rate);
> +       }
> +
> +       if (IS_ERR(opp)) {
> +               dev_err(dev, "failed to find OPP for %luHz: %pe\n", rate, opp);
> +               return PTR_ERR(opp);
> +       }
> +
> +       pstate = dev_pm_opp_get_required_pstate(opp, 0);
> +       dev_pm_opp_put(opp);
> +
> +       return dev_pm_genpd_set_performance_state(dev, pstate);

The above code certainly looks like it can be made generic through a
common opp helper. I know we have discussed this before, so I am not
saying you should change right now.

Let's instead see what I think (and Viresh), when I have reviewed the
entire series.

> +}
> +
> +static int tegra_clock_change_notify(struct notifier_block *nb,
> +                                    unsigned long msg, void *data)
> +{
> +       struct clk_notifier_data *cnd = data;
> +       struct tegra_clk_device *clk_dev;
> +       int err = 0;
> +
> +       clk_dev = container_of(nb, struct tegra_clk_device, clk_nb);
> +
> +       mutex_lock(&clk_dev->lock);
> +       switch (msg) {
> +       case PRE_RATE_CHANGE:
> +               if (cnd->new_rate > cnd->old_rate)
> +                       err = tegra_clock_set_pd_state(clk_dev, cnd->new_rate);
> +               break;
> +
> +       case ABORT_RATE_CHANGE:
> +               err = tegra_clock_set_pd_state(clk_dev, cnd->old_rate);
> +               break;
> +
> +       case POST_RATE_CHANGE:
> +               if (cnd->new_rate < cnd->old_rate)
> +                       err = tegra_clock_set_pd_state(clk_dev, cnd->new_rate);
> +               break;
> +
> +       default:
> +               break;
> +       }
> +       mutex_unlock(&clk_dev->lock);
> +
> +       return notifier_from_errno(err);
> +}
> +
> +static int tegra_clock_sync_pd_state(struct tegra_clk_device *clk_dev)
> +{
> +       unsigned long rate;
> +       int ret = 0;
> +
> +       mutex_lock(&clk_dev->lock);
> +
> +       if (!pm_runtime_status_suspended(clk_dev->dev)) {
> +               rate = clk_hw_get_rate(clk_dev->hw);
> +               ret = tegra_clock_set_pd_state(clk_dev, rate);

Don't we need to sync the performance state even when the device is
runtime suspended?

Perhaps the clock, via a child-clock for example, can get
prepared/enabled (hence its device gets runtime resumed) before there
is a clock rate update for it. Then there is no performance state set
for it, right? Or maybe that isn't a problem?

> +       }
> +
> +       mutex_unlock(&clk_dev->lock);
> +
> +       return ret;
> +}
> +
> +static int tegra_clock_probe(struct platform_device *pdev)
> +{
> +       struct tegra_core_opp_params opp_params = {};
> +       struct tegra_clk_device *clk_dev;
> +       struct device *dev = &pdev->dev;
> +       struct clk *clk;
> +       int err;
> +
> +       if (!dev->pm_domain)
> +               return -EINVAL;
> +
> +       clk_dev = devm_kzalloc(dev, sizeof(*clk_dev), GFP_KERNEL);
> +       if (!clk_dev)
> +               return -ENOMEM;
> +
> +       clk = devm_clk_get(dev, NULL);
> +       if (IS_ERR(clk))
> +               return PTR_ERR(clk);
> +
> +       clk_dev->dev = dev;
> +       clk_dev->hw = __clk_get_hw(clk);
> +       clk_dev->clk_nb.notifier_call = tegra_clock_change_notify;
> +       mutex_init(&clk_dev->lock);
> +
> +       platform_set_drvdata(pdev, clk_dev);
> +
> +       /*
> +        * Runtime PM was already enabled for this device by the parent clk
> +        * driver and power domain state should be synced under clk_dev lock,
> +        * hence we don't use the common OPP helper that initializes OPP
> +        * state. For some clocks common OPP helper may fail to find ceil
> +        * rate, it's handled by this driver.
> +        */
> +       err = devm_tegra_core_dev_init_opp_table(dev, &opp_params);
> +       if (err)
> +               return err;
> +
> +       err = clk_notifier_register(clk, &clk_dev->clk_nb);
> +       if (err) {
> +               dev_err(dev, "failed to register clk notifier: %d\n", err);
> +               return err;
> +       }
> +
> +       /*
> +        * The driver is attaching to a potentially active/resumed clock, hence
> +        * we need to sync the power domain performance state in a accordance to
> +        * the clock rate if clock is resumed.
> +        */
> +       err = tegra_clock_sync_pd_state(clk_dev);
> +       if (err)
> +               goto unreg_clk;
> +
> +       return 0;
> +
> +unreg_clk:
> +       clk_notifier_unregister(clk, &clk_dev->clk_nb);
> +
> +       return err;
> +}
> +
> +static __maybe_unused int tegra_clock_pm_suspend(struct device *dev)
> +{
> +       struct tegra_clk_device *clk_dev = dev_get_drvdata(dev);
> +
> +       /*
> +        * Power management of the clock is entangled with the Tegra PMC
> +        * GENPD because PMC driver enables/disables clocks for toggling
> +        * of the PD's on/off state.
> +        *
> +        * The PMC GENPD is resumed in NOIRQ phase, before RPM of the clocks
> +        * becomes available, hence PMC can't use clocks at the early resume
> +        * phase if RPM is involved. For example when 3d clock is enabled,
> +        * it may enable the parent PLL clock that needs to be RPM-resumed.
> +        *
> +        * Secondly, the PLL clocks may be enabled by the low level suspend
> +        * code, so we need to assume that PLL is in enabled state during
> +        * suspend.
> +        *
> +        * We will keep PLLs and system clock resumed during suspend time.
> +        * All PLLs on all SoCs are low power and system clock is always-on,
> +        * so practically not much is changed here.
> +        */
> +
> +       return clk_prepare(clk_dev->hw->clk);

I am trying to understand, more exactly, what you intend to achieve
with the clk_prepare() here. It looks a bit weird, to me. Can you try
to elaborate a bit more on the use case?

Is this rather about making sure that the clock's corresponding PM
domain stays powered on during system suspend? In that case, I think
there may be an alternative option....

> +}
> +
> +static __maybe_unused int tegra_clock_pm_resume(struct device *dev)
> +{
> +       struct tegra_clk_device *clk_dev = dev_get_drvdata(dev);
> +
> +       clk_unprepare(clk_dev->hw->clk);
> +
> +       return 0;
> +}
> +
> +static void tegra_clock_shutdown(struct platform_device *pdev)
> +{
> +       struct tegra_clk_device *clk_dev = platform_get_drvdata(pdev);
> +
> +       clk_prepare(clk_dev->hw->clk);
> +}
> +
> +static const struct dev_pm_ops tegra_clock_pm = {
> +       SET_SYSTEM_SLEEP_PM_OPS(tegra_clock_pm_suspend,
> +                               tegra_clock_pm_resume)
> +};

[...]

Kind regards
Uffe
Dmitry Osipenko Oct. 1, 2021, 7:50 p.m. UTC | #2
01.10.2021 15:32, Ulf Hansson пишет:
>> +static int tegra_clock_sync_pd_state(struct tegra_clk_device *clk_dev)
>> +{
>> +       unsigned long rate;
>> +       int ret = 0;
>> +
>> +       mutex_lock(&clk_dev->lock);
>> +
>> +       if (!pm_runtime_status_suspended(clk_dev->dev)) {
>> +               rate = clk_hw_get_rate(clk_dev->hw);
>> +               ret = tegra_clock_set_pd_state(clk_dev, rate);
> Don't we need to sync the performance state even when the device is
> runtime suspended?
> 
> Perhaps the clock, via a child-clock for example, can get
> prepared/enabled (hence its device gets runtime resumed) before there
> is a clock rate update for it. Then there is no performance state set
> for it, right? Or maybe that isn't a problem?
> 

Good catch! Older versions of this patch had a special handling for clk
enable/disable. I just forgot to update this function, it's now not a
problem to change performance state of a suspended device and it
actually needs to be done. I'll correct it, thanks!
Dmitry Osipenko Oct. 2, 2021, 8:44 p.m. UTC | #3
01.10.2021 15:32, Ulf Hansson пишет:
>> +static __maybe_unused int tegra_clock_pm_suspend(struct device *dev)
>> +{
>> +       struct tegra_clk_device *clk_dev = dev_get_drvdata(dev);
>> +
>> +       /*
>> +        * Power management of the clock is entangled with the Tegra PMC
>> +        * GENPD because PMC driver enables/disables clocks for toggling
>> +        * of the PD's on/off state.
>> +        *
>> +        * The PMC GENPD is resumed in NOIRQ phase, before RPM of the clocks
>> +        * becomes available, hence PMC can't use clocks at the early resume
>> +        * phase if RPM is involved. For example when 3d clock is enabled,
>> +        * it may enable the parent PLL clock that needs to be RPM-resumed.
>> +        *
>> +        * Secondly, the PLL clocks may be enabled by the low level suspend
>> +        * code, so we need to assume that PLL is in enabled state during
>> +        * suspend.
>> +        *
>> +        * We will keep PLLs and system clock resumed during suspend time.
>> +        * All PLLs on all SoCs are low power and system clock is always-on,
>> +        * so practically not much is changed here.
>> +        */
>> +
>> +       return clk_prepare(clk_dev->hw->clk);
> I am trying to understand, more exactly, what you intend to achieve
> with the clk_prepare() here. It looks a bit weird, to me. Can you try
> to elaborate a bit more on the use case?

The Tegra GENPD driver enable/disable clocks when domain is turned on.
This can't be done during early system resume, when domains are getting
turned on by the drivers core, because when clock is enabled, it's
getting prepared (RPM-resumed) and this preparation fails because
performance state of the clock goes up and it doesn't work during the
early resume time since I2C, which applies the state to hardware, is
suspended and can't work at that early time.

Secondly, Tegra has arch-specific low level assembly which touches
clocks during last phase of system suspend and in the beginning of
resume. Hence, clocks should stay prepared during suspend just because
technically clock should be prepared before it can be enabled.

> Is this rather about making sure that the clock's corresponding PM
> domain stays powered on during system suspend? In that case, I think
> there may be an alternative option....
> 

This is not about domain staying powered on, this is about keeping the
performance state of the domain high during suspend.
Ulf Hansson Oct. 5, 2021, 1:10 p.m. UTC | #4
On Sat, 2 Oct 2021 at 22:44, Dmitry Osipenko <digetx@gmail.com> wrote:
>
> 01.10.2021 15:32, Ulf Hansson пишет:
> >> +static __maybe_unused int tegra_clock_pm_suspend(struct device *dev)
> >> +{
> >> +       struct tegra_clk_device *clk_dev = dev_get_drvdata(dev);
> >> +
> >> +       /*
> >> +        * Power management of the clock is entangled with the Tegra PMC
> >> +        * GENPD because PMC driver enables/disables clocks for toggling
> >> +        * of the PD's on/off state.
> >> +        *
> >> +        * The PMC GENPD is resumed in NOIRQ phase, before RPM of the clocks
> >> +        * becomes available, hence PMC can't use clocks at the early resume
> >> +        * phase if RPM is involved. For example when 3d clock is enabled,
> >> +        * it may enable the parent PLL clock that needs to be RPM-resumed.
> >> +        *
> >> +        * Secondly, the PLL clocks may be enabled by the low level suspend
> >> +        * code, so we need to assume that PLL is in enabled state during
> >> +        * suspend.
> >> +        *
> >> +        * We will keep PLLs and system clock resumed during suspend time.
> >> +        * All PLLs on all SoCs are low power and system clock is always-on,
> >> +        * so practically not much is changed here.
> >> +        */
> >> +
> >> +       return clk_prepare(clk_dev->hw->clk);
> > I am trying to understand, more exactly, what you intend to achieve
> > with the clk_prepare() here. It looks a bit weird, to me. Can you try
> > to elaborate a bit more on the use case?
>
> The Tegra GENPD driver enable/disable clocks when domain is turned on.

Okay. I noticed that in tegra_genpd_power_on(). And the same clocks
are enabled/disabled also in tegra_genpd_power_off(), when powering
off the PM domain.

So I guess the problem kind of exists for tegra_genpd_power_off() too?

> This can't be done during early system resume, when domains are getting
> turned on by the drivers core, because when clock is enabled, it's
> getting prepared (RPM-resumed) and this preparation fails because
> performance state of the clock goes up and it doesn't work during the
> early resume time since I2C, which applies the state to hardware, is
> suspended and can't work at that early time.

This sounds complicated and I still don't quite follow all of it, sorry.

So, tegra_genpd_power_on() gets called from genpd_resume_noirq(), when
the first device of the attached devices to genpd gets resumed. And
vice versa for tegra_genpd_power_off() and genpd_suspend_noirq().

Are you saying that trying to enable/disable clocks from
tegra_genpd_power_on|off() in these paths doesn't work, because it
would also require the performance state to be changed, which would
fail because the I2C bus/driver is suspended?

>
> Secondly, Tegra has arch-specific low level assembly which touches
> clocks during last phase of system suspend and in the beginning of
> resume. Hence, clocks should stay prepared during suspend just because
> technically clock should be prepared before it can be enabled.

So the low level code is gating and ungating the clock behind the back
of the clock driver then? Why is that done like that, more exactly?

>
> > Is this rather about making sure that the clock's corresponding PM
> > domain stays powered on during system suspend? In that case, I think
> > there may be an alternative option....
> >
>
> This is not about domain staying powered on, this is about keeping the
> performance state of the domain high during suspend.

Right, so the PM domain managed in tegra_genpd_power_on|off() can
still be powered on/off, as long as the clock remains ungated?

Kind regards
Uffe
Dmitry Osipenko Oct. 5, 2021, 10:19 p.m. UTC | #5
05.10.2021 16:10, Ulf Hansson пишет:
> On Sat, 2 Oct 2021 at 22:44, Dmitry Osipenko <digetx@gmail.com> wrote:
>>
>> 01.10.2021 15:32, Ulf Hansson пишет:
>>>> +static __maybe_unused int tegra_clock_pm_suspend(struct device *dev)
>>>> +{
>>>> +       struct tegra_clk_device *clk_dev = dev_get_drvdata(dev);
>>>> +
>>>> +       /*
>>>> +        * Power management of the clock is entangled with the Tegra PMC
>>>> +        * GENPD because PMC driver enables/disables clocks for toggling
>>>> +        * of the PD's on/off state.
>>>> +        *
>>>> +        * The PMC GENPD is resumed in NOIRQ phase, before RPM of the clocks
>>>> +        * becomes available, hence PMC can't use clocks at the early resume
>>>> +        * phase if RPM is involved. For example when 3d clock is enabled,
>>>> +        * it may enable the parent PLL clock that needs to be RPM-resumed.
>>>> +        *
>>>> +        * Secondly, the PLL clocks may be enabled by the low level suspend
>>>> +        * code, so we need to assume that PLL is in enabled state during
>>>> +        * suspend.
>>>> +        *
>>>> +        * We will keep PLLs and system clock resumed during suspend time.
>>>> +        * All PLLs on all SoCs are low power and system clock is always-on,
>>>> +        * so practically not much is changed here.
>>>> +        */
>>>> +
>>>> +       return clk_prepare(clk_dev->hw->clk);
>>> I am trying to understand, more exactly, what you intend to achieve
>>> with the clk_prepare() here. It looks a bit weird, to me. Can you try
>>> to elaborate a bit more on the use case?
>>
>> The Tegra GENPD driver enable/disable clocks when domain is turned on.
> 
> Okay. I noticed that in tegra_genpd_power_on(). And the same clocks
> are enabled/disabled also in tegra_genpd_power_off(), when powering
> off the PM domain.
> 
> So I guess the problem kind of exists for tegra_genpd_power_off() too?

Both OFF/ON are affected by the same problem. If domain was already
turned OFF before genpd_suspend_noirq(), then the OFF problem isn't visible.

I reproduced the OFF problem by removing the clk prepare/unprepare from
the suspend/resume of the clk driver and making some extra changes to
clock tree topology and etc to trigger the problem on Nexus 7.

tegra-pmc 7000e400.pmc: failed to turn off PM domain heg: -13

I happens from genpd_suspend_noirq() -> tegra_genpd_power_off() -> clk
-> GENPD -> I2C -> runtime-pm.

-13 is EACCES, it comes from the runtime PM of I2C device. RPM is
prohibited/disabled during late (NOIRQ) suspend by the drivers core.

>> This can't be done during early system resume, when domains are getting
>> turned on by the drivers core, because when clock is enabled, it's
>> getting prepared (RPM-resumed) and this preparation fails because
>> performance state of the clock goes up and it doesn't work during the
>> early resume time since I2C, which applies the state to hardware, is
>> suspended and can't work at that early time.
> 
> This sounds complicated and I still don't quite follow all of it, sorry.
> 
> So, tegra_genpd_power_on() gets called from genpd_resume_noirq(), when
> the first device of the attached devices to genpd gets resumed. And
> vice versa for tegra_genpd_power_off() and genpd_suspend_noirq().
> 
> Are you saying that trying to enable/disable clocks from
> tegra_genpd_power_on|off() in these paths doesn't work, because it
> would also require the performance state to be changed, which would
> fail because the I2C bus/driver is suspended?

Yes, but it's actually not I2C bus/driver that is suspended, it's
runtime PM that is unavailable during NOIRQ. The I2C driver itself is
suspended after domains are turned OFF and resumed before they are
enabled. It's just runtime PM API that is unavailable. I'm wondering if
this could be changed.

I'm also wondering if we could add some 'was_enabled' flag to GENPDs,
setting it by genpd_suspend_noirq() for the enabled domains, and then
powering-on GENPDs from genpd_resume_noirq() only if they were in the
enabled state during genpd_suspend_noirq() time. It actually puzzled me
for a quite long time why GENPD core enables domains unconditionally
during early resume. This should solve a part of the problem and it
makes suspend/resume a bit safer because there is a smaller chance to
crash hardware during suspend, at least it's easier to debug.

>> Secondly, Tegra has arch-specific low level assembly which touches
>> clocks during last phase of system suspend and in the beginning of
>> resume. Hence, clocks should stay prepared during suspend just because
>> technically clock should be prepared before it can be enabled.
> 
> So the low level code is gating and ungating the clock behind the back
> of the clock driver then? Why is that done like that, more exactly?

I revisited that code again, and it shouldn't touch the clocks.
I changed that code to not toggle the clocks [1] sometime ago, but
forgot about it.

[1] https://git.kernel.org/linus/680ae4452

>>> Is this rather about making sure that the clock's corresponding PM
>>> domain stays powered on during system suspend? In that case, I think
>>> there may be an alternative option....
>>>
>>
>> This is not about domain staying powered on, this is about keeping the
>> performance state of the domain high during suspend.
> 
> Right, so the PM domain managed in tegra_genpd_power_on|off() can
> still be powered on/off, as long as the clock remains ungated?

Not ungated, but prepared.
Dmitry Osipenko Oct. 5, 2021, 10:43 p.m. UTC | #6
06.10.2021 01:19, Dmitry Osipenko пишет:
...
> I reproduced the OFF problem by removing the clk prepare/unprepare from
> the suspend/resume of the clk driver and making some extra changes to
> clock tree topology and etc to trigger the problem on Nexus 7.
> 
> tegra-pmc 7000e400.pmc: failed to turn off PM domain heg: -13
> 
> It happens from genpd_suspend_noirq() -> tegra_genpd_power_off() -> clk
> -> GENPD -> I2C -> runtime-pm.
> 
> -13 is EACCES, it comes from the runtime PM of I2C device. RPM is
> prohibited/disabled during late (NOIRQ) suspend by the drivers core.

My bad, I double-checked and it's not I2C RPM that is failing now, but
the clock's RPM [1], which is also unavailable during NOIRQ.

[1]
https://elixir.free-electrons.com/linux/v5.15-rc4/source/drivers/clk/clk.c#L116

Previously it was I2C RPM that was failing in a similar way, but code
changed a tad since that time.
Dmitry Osipenko Oct. 6, 2021, 2:40 a.m. UTC | #7
06.10.2021 01:43, Dmitry Osipenko пишет:
> 06.10.2021 01:19, Dmitry Osipenko пишет:
> ...
>> I reproduced the OFF problem by removing the clk prepare/unprepare from
>> the suspend/resume of the clk driver and making some extra changes to
>> clock tree topology and etc to trigger the problem on Nexus 7.
>>
>> tegra-pmc 7000e400.pmc: failed to turn off PM domain heg: -13
>>
>> It happens from genpd_suspend_noirq() -> tegra_genpd_power_off() -> clk
>> -> GENPD -> I2C -> runtime-pm.
>>
>> -13 is EACCES, it comes from the runtime PM of I2C device. RPM is
>> prohibited/disabled during late (NOIRQ) suspend by the drivers core.
> 
> My bad, I double-checked and it's not I2C RPM that is failing now, but
> the clock's RPM [1], which is also unavailable during NOIRQ.
> 
> [1]
> https://elixir.free-electrons.com/linux/v5.15-rc4/source/drivers/clk/clk.c#L116
> 
> Previously it was I2C RPM that was failing in a similar way, but code
> changed a tad since that time.
> 

Just in case, I checked that the suspension order isn't somehow the
source of the problem by adding links to device tree in order to always
suspend clocks after the rest of devices and still GENPD gets -EACCESS
from clk_pm_runtime_get().

RPM is disabled by dpm_suspend_late(), which is invoked before
dpm_suspend_noirq() [1]. Hence RPM is unavailable in NOIRQ phase in any
case.

[1]
https://elixir.bootlin.com/linux/v5.15-rc4/source/kernel/power/suspend.c#L399
Ulf Hansson Oct. 6, 2021, 12:38 p.m. UTC | #8
On Wed, 6 Oct 2021 at 00:19, Dmitry Osipenko <digetx@gmail.com> wrote:
>
> 05.10.2021 16:10, Ulf Hansson пишет:
> > On Sat, 2 Oct 2021 at 22:44, Dmitry Osipenko <digetx@gmail.com> wrote:
> >>
> >> 01.10.2021 15:32, Ulf Hansson пишет:
> >>>> +static __maybe_unused int tegra_clock_pm_suspend(struct device *dev)
> >>>> +{
> >>>> +       struct tegra_clk_device *clk_dev = dev_get_drvdata(dev);
> >>>> +
> >>>> +       /*
> >>>> +        * Power management of the clock is entangled with the Tegra PMC
> >>>> +        * GENPD because PMC driver enables/disables clocks for toggling
> >>>> +        * of the PD's on/off state.
> >>>> +        *
> >>>> +        * The PMC GENPD is resumed in NOIRQ phase, before RPM of the clocks
> >>>> +        * becomes available, hence PMC can't use clocks at the early resume
> >>>> +        * phase if RPM is involved. For example when 3d clock is enabled,
> >>>> +        * it may enable the parent PLL clock that needs to be RPM-resumed.
> >>>> +        *
> >>>> +        * Secondly, the PLL clocks may be enabled by the low level suspend
> >>>> +        * code, so we need to assume that PLL is in enabled state during
> >>>> +        * suspend.
> >>>> +        *
> >>>> +        * We will keep PLLs and system clock resumed during suspend time.
> >>>> +        * All PLLs on all SoCs are low power and system clock is always-on,
> >>>> +        * so practically not much is changed here.
> >>>> +        */
> >>>> +
> >>>> +       return clk_prepare(clk_dev->hw->clk);
> >>> I am trying to understand, more exactly, what you intend to achieve
> >>> with the clk_prepare() here. It looks a bit weird, to me. Can you try
> >>> to elaborate a bit more on the use case?
> >>
> >> The Tegra GENPD driver enable/disable clocks when domain is turned on.
> >
> > Okay. I noticed that in tegra_genpd_power_on(). And the same clocks
> > are enabled/disabled also in tegra_genpd_power_off(), when powering
> > off the PM domain.
> >
> > So I guess the problem kind of exists for tegra_genpd_power_off() too?
>
> Both OFF/ON are affected by the same problem. If domain was already
> turned OFF before genpd_suspend_noirq(), then the OFF problem isn't visible.
>
> I reproduced the OFF problem by removing the clk prepare/unprepare from
> the suspend/resume of the clk driver and making some extra changes to
> clock tree topology and etc to trigger the problem on Nexus 7.
>
> tegra-pmc 7000e400.pmc: failed to turn off PM domain heg: -13
>
> I happens from genpd_suspend_noirq() -> tegra_genpd_power_off() -> clk
> -> GENPD -> I2C -> runtime-pm.
>
> -13 is EACCES, it comes from the runtime PM of I2C device. RPM is
> prohibited/disabled during late (NOIRQ) suspend by the drivers core.

Thanks for the clarification!

>
> >> This can't be done during early system resume, when domains are getting
> >> turned on by the drivers core, because when clock is enabled, it's
> >> getting prepared (RPM-resumed) and this preparation fails because
> >> performance state of the clock goes up and it doesn't work during the
> >> early resume time since I2C, which applies the state to hardware, is
> >> suspended and can't work at that early time.
> >
> > This sounds complicated and I still don't quite follow all of it, sorry.
> >
> > So, tegra_genpd_power_on() gets called from genpd_resume_noirq(), when
> > the first device of the attached devices to genpd gets resumed. And
> > vice versa for tegra_genpd_power_off() and genpd_suspend_noirq().
> >
> > Are you saying that trying to enable/disable clocks from
> > tegra_genpd_power_on|off() in these paths doesn't work, because it
> > would also require the performance state to be changed, which would
> > fail because the I2C bus/driver is suspended?
>
> Yes, but it's actually not I2C bus/driver that is suspended, it's
> runtime PM that is unavailable during NOIRQ. The I2C driver itself is
> suspended after domains are turned OFF and resumed before they are
> enabled. It's just runtime PM API that is unavailable. I'm wondering if
> this could be changed.

In principle what you ask for, is if we can avoid calling
__pm_runtime_disable() in __device_suspend_late() (and vice versa in
device_resume_early()).

I think the short answer is no, at least from a generic point of view.
Maybe we can figure out a way to allow this on a per device basis, as
an opt-in solution. I am not sure what Rafael would think about this,
let's see.

Another option to address the problem is already available to use for
these kinds of cases. This would be to add also a pair of
->suspend|resume() callbacks to I2C driver. Along the lines of the
below.

diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c
index c883044715f3..589bf872ab25 100644
--- a/drivers/i2c/busses/i2c-tegra.c
+++ b/drivers/i2c/busses/i2c-tegra.c
@@ -1918,6 +1918,7 @@ static int __maybe_unused
tegra_i2c_resume(struct device *dev)
 }

 static const struct dev_pm_ops tegra_i2c_pm = {
+       SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_put_noidle, pm_runtime_get_sync)
        SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(tegra_i2c_suspend, tegra_i2c_resume)
        SET_RUNTIME_PM_OPS(tegra_i2c_runtime_suspend, tegra_i2c_runtime_resume,
                           NULL)

In this way, the device would already be runtime resumed, if there is
call to pm_runtime_get_sync() from the clock framework due to the
clk_prepare|unprepare() being called. If that also turns out to happen
*after* runtime PM has been disabled for the device, the call to
pm_runtime_get_sync() would still succeed (returning 1, see
rpm_resume()), rather than a negative error code.

Yes, we may end up runtime resuming the device during system suspend,
even if it turns out not to be needed. Although, that doesn't seem to
be the case for the Tegra I2C driver, right?

>
> I'm also wondering if we could add some 'was_enabled' flag to GENPDs,
> setting it by genpd_suspend_noirq() for the enabled domains, and then
> powering-on GENPDs from genpd_resume_noirq() only if they were in the
> enabled state during genpd_suspend_noirq() time. It actually puzzled me
> for a quite long time why GENPD core enables domains unconditionally
> during early resume. This should solve a part of the problem and it
> makes suspend/resume a bit safer because there is a smaller chance to
> crash hardware during suspend, at least it's easier to debug.

Just because the PM domain was already off at genpd_suspend_noirq(),
doesn't mean that it can stay powered off at genpd_resume_noirq(). At
least as is today.

The main reason why genpd_resume_noirq() powers on the PM domain, is
because it's not possible for the consumer drivers to rely on runtime
PM to do it (because runtime PM has been disabled by the PM core).

>
> >> Secondly, Tegra has arch-specific low level assembly which touches
> >> clocks during last phase of system suspend and in the beginning of
> >> resume. Hence, clocks should stay prepared during suspend just because
> >> technically clock should be prepared before it can be enabled.
> >
> > So the low level code is gating and ungating the clock behind the back
> > of the clock driver then? Why is that done like that, more exactly?
>
> I revisited that code again, and it shouldn't touch the clocks.
> I changed that code to not toggle the clocks [1] sometime ago, but
> forgot about it.
>
> [1] https://git.kernel.org/linus/680ae4452
>
> >>> Is this rather about making sure that the clock's corresponding PM
> >>> domain stays powered on during system suspend? In that case, I think
> >>> there may be an alternative option....
> >>>
> >>
> >> This is not about domain staying powered on, this is about keeping the
> >> performance state of the domain high during suspend.
> >
> > Right, so the PM domain managed in tegra_genpd_power_on|off() can
> > still be powered on/off, as long as the clock remains ungated?
>
> Not ungated, but prepared.

Okay, thanks for clarifying!

In summary, it sounds like you should be able to fix this problem in
the I2C driver as I suggested above. If that works, that seems much
better.

Moreover, it would leave the clocks gated/unprepared when the system
is fully suspended, which I guess is better from an energy point of
view?

Kind regards
Uffe
Ulf Hansson Oct. 6, 2021, 12:43 p.m. UTC | #9
On Wed, 6 Oct 2021 at 00:43, Dmitry Osipenko <digetx@gmail.com> wrote:
>
> 06.10.2021 01:19, Dmitry Osipenko пишет:
> ...
> > I reproduced the OFF problem by removing the clk prepare/unprepare from
> > the suspend/resume of the clk driver and making some extra changes to
> > clock tree topology and etc to trigger the problem on Nexus 7.
> >
> > tegra-pmc 7000e400.pmc: failed to turn off PM domain heg: -13
> >
> > It happens from genpd_suspend_noirq() -> tegra_genpd_power_off() -> clk
> > -> GENPD -> I2C -> runtime-pm.
> >
> > -13 is EACCES, it comes from the runtime PM of I2C device. RPM is
> > prohibited/disabled during late (NOIRQ) suspend by the drivers core.
>
> My bad, I double-checked and it's not I2C RPM that is failing now, but
> the clock's RPM [1], which is also unavailable during NOIRQ.

Yes, that sounds reasonable.

You would then need a similar patch for the tegra clock driver as I
suggested for tegra I2C driver. That should solve the problem, I
think.

>
> [1]
> https://elixir.free-electrons.com/linux/v5.15-rc4/source/drivers/clk/clk.c#L116
>
> Previously it was I2C RPM that was failing in a similar way, but code
> changed a tad since that time.

Alright. In any case, as long as the devices gets suspended in the
correct order, I think it should be fine to cook a patch along the
lines of what I suggest for the I2C driver as well.

It should work, I think. Although, maybe you want to avoid runtime
resuming the I2C device, unless it's the device belonging to the PMIC
interface, if there is a way to distinguish that for the driver.

Kind regards
Uffe
Dmitry Osipenko Oct. 6, 2021, 9:14 p.m. UTC | #10
06.10.2021 15:43, Ulf Hansson пишет:
> On Wed, 6 Oct 2021 at 00:43, Dmitry Osipenko <digetx@gmail.com> wrote:
>>
>> 06.10.2021 01:19, Dmitry Osipenko пишет:
>> ...
>>> I reproduced the OFF problem by removing the clk prepare/unprepare from
>>> the suspend/resume of the clk driver and making some extra changes to
>>> clock tree topology and etc to trigger the problem on Nexus 7.
>>>
>>> tegra-pmc 7000e400.pmc: failed to turn off PM domain heg: -13
>>>
>>> It happens from genpd_suspend_noirq() -> tegra_genpd_power_off() -> clk
>>> -> GENPD -> I2C -> runtime-pm.
>>>
>>> -13 is EACCES, it comes from the runtime PM of I2C device. RPM is
>>> prohibited/disabled during late (NOIRQ) suspend by the drivers core.
>>
>> My bad, I double-checked and it's not I2C RPM that is failing now, but
>> the clock's RPM [1], which is also unavailable during NOIRQ.
> 
> Yes, that sounds reasonable.
> 
> You would then need a similar patch for the tegra clock driver as I
> suggested for tegra I2C driver. That should solve the problem, I
> think.
> 
>>
>> [1]
>> https://elixir.free-electrons.com/linux/v5.15-rc4/source/drivers/clk/clk.c#L116
>>
>> Previously it was I2C RPM that was failing in a similar way, but code
>> changed a tad since that time.
> 
> Alright. In any case, as long as the devices gets suspended in the
> correct order, I think it should be fine to cook a patch along the
> lines of what I suggest for the I2C driver as well.
> 
> It should work, I think. Although, maybe you want to avoid runtime
> resuming the I2C device, unless it's the device belonging to the PMIC
> interface, if there is a way to distinguish that for the driver.

Ulf, thank you very much for the suggestions! I was thinking about this
all once again and concluded that the simplest variant will be to just
remove the suspend ops from the clk driver since neither of PLLs require
high voltage. We now have voltage bumped to a nominal level during
suspend by Tegra's regulator-coupler driver and it's much higher than
voltage needed by PLLs. So the problem I was trying to work around
doesn't really exist anymore.
Dmitry Osipenko Oct. 6, 2021, 9:20 p.m. UTC | #11
06.10.2021 15:38, Ulf Hansson пишет:
> In principle what you ask for, is if we can avoid calling
> __pm_runtime_disable() in __device_suspend_late() (and vice versa in
> device_resume_early()).
> 
> I think the short answer is no, at least from a generic point of view.
> Maybe we can figure out a way to allow this on a per device basis, as
> an opt-in solution. I am not sure what Rafael would think about this,
> let's see.
> 
> Another option to address the problem is already available to use for
> these kinds of cases. This would be to add also a pair of
> ->suspend|resume() callbacks to I2C driver. Along the lines of the
> below.
> 
> diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c
> index c883044715f3..589bf872ab25 100644
> --- a/drivers/i2c/busses/i2c-tegra.c
> +++ b/drivers/i2c/busses/i2c-tegra.c
> @@ -1918,6 +1918,7 @@ static int __maybe_unused
> tegra_i2c_resume(struct device *dev)
>  }
> 
>  static const struct dev_pm_ops tegra_i2c_pm = {
> +       SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_put_noidle, pm_runtime_get_sync)
>         SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(tegra_i2c_suspend, tegra_i2c_resume)
>         SET_RUNTIME_PM_OPS(tegra_i2c_runtime_suspend, tegra_i2c_runtime_resume,
>                            NULL)
> 
> In this way, the device would already be runtime resumed, if there is
> call to pm_runtime_get_sync() from the clock framework due to the
> clk_prepare|unprepare() being called. If that also turns out to happen
> *after* runtime PM has been disabled for the device, the call to
> pm_runtime_get_sync() would still succeed (returning 1, see
> rpm_resume()), rather than a negative error code.
> 
> Yes, we may end up runtime resuming the device during system suspend,
> even if it turns out not to be needed. Although, that doesn't seem to
> be the case for the Tegra I2C driver, right?

Tegra I2C will turn off clocks on suspend regardless of RPM state.
Overall, it's a plausible solution, thank you!

As I said in the other reply, I'll simply remove the suspend ops from
clk driver, they are not needed anymore. The problem is gone.
Dmitry Osipenko Oct. 6, 2021, 9:25 p.m. UTC | #12
06.10.2021 15:38, Ulf Hansson пишет:
>> I'm also wondering if we could add some 'was_enabled' flag to GENPDs,
>> setting it by genpd_suspend_noirq() for the enabled domains, and then
>> powering-on GENPDs from genpd_resume_noirq() only if they were in the
>> enabled state during genpd_suspend_noirq() time. It actually puzzled me
>> for a quite long time why GENPD core enables domains unconditionally
>> during early resume. This should solve a part of the problem and it
>> makes suspend/resume a bit safer because there is a smaller chance to
>> crash hardware during suspend, at least it's easier to debug.
> Just because the PM domain was already off at genpd_suspend_noirq(),
> doesn't mean that it can stay powered off at genpd_resume_noirq(). At
> least as is today.
> 
> The main reason why genpd_resume_noirq() powers on the PM domain, is
> because it's not possible for the consumer drivers to rely on runtime
> PM to do it (because runtime PM has been disabled by the PM core).

At least Tegra doesn't need to have domains force-resumed. This should
be a platform-specific behaviour. We may add a new flag for that, I
suppose. I'll try to keep this in mind for a future improvement. Thank
you for the clarification.
Dmitry Osipenko Oct. 6, 2021, 10:01 p.m. UTC | #13
07.10.2021 00:14, Dmitry Osipenko пишет:
> 06.10.2021 15:43, Ulf Hansson пишет:
>> On Wed, 6 Oct 2021 at 00:43, Dmitry Osipenko <digetx@gmail.com> wrote:
>>>
>>> 06.10.2021 01:19, Dmitry Osipenko пишет:
>>> ...
>>>> I reproduced the OFF problem by removing the clk prepare/unprepare from
>>>> the suspend/resume of the clk driver and making some extra changes to
>>>> clock tree topology and etc to trigger the problem on Nexus 7.
>>>>
>>>> tegra-pmc 7000e400.pmc: failed to turn off PM domain heg: -13
>>>>
>>>> It happens from genpd_suspend_noirq() -> tegra_genpd_power_off() -> clk
>>>> -> GENPD -> I2C -> runtime-pm.
>>>>
>>>> -13 is EACCES, it comes from the runtime PM of I2C device. RPM is
>>>> prohibited/disabled during late (NOIRQ) suspend by the drivers core.
>>>
>>> My bad, I double-checked and it's not I2C RPM that is failing now, but
>>> the clock's RPM [1], which is also unavailable during NOIRQ.
>>
>> Yes, that sounds reasonable.
>>
>> You would then need a similar patch for the tegra clock driver as I
>> suggested for tegra I2C driver. That should solve the problem, I
>> think.
>>
>>>
>>> [1]
>>> https://elixir.free-electrons.com/linux/v5.15-rc4/source/drivers/clk/clk.c#L116
>>>
>>> Previously it was I2C RPM that was failing in a similar way, but code
>>> changed a tad since that time.
>>
>> Alright. In any case, as long as the devices gets suspended in the
>> correct order, I think it should be fine to cook a patch along the
>> lines of what I suggest for the I2C driver as well.
>>
>> It should work, I think. Although, maybe you want to avoid runtime
>> resuming the I2C device, unless it's the device belonging to the PMIC
>> interface, if there is a way to distinguish that for the driver.
> 
> Ulf, thank you very much for the suggestions! I was thinking about this
> all once again and concluded that the simplest variant will be to just
> remove the suspend ops from the clk driver since neither of PLLs require
> high voltage. We now have voltage bumped to a nominal level during
> suspend by Tegra's regulator-coupler driver and it's much higher than
> voltage needed by PLLs. So the problem I was trying to work around
> doesn't really exist anymore.

I hurried a bit with the conclusion, keep forgetting that I need to
change the clock tree in order to test it all properly :/ It's not fixed
yet.
Dmitry Osipenko Oct. 6, 2021, 10:03 p.m. UTC | #14
06.10.2021 15:38, Ulf Hansson пишет:
>>> Right, so the PM domain managed in tegra_genpd_power_on|off() can
>>> still be powered on/off, as long as the clock remains ungated?
>> Not ungated, but prepared.
> Okay, thanks for clarifying!
> 
> In summary, it sounds like you should be able to fix this problem in
> the I2C driver as I suggested above. If that works, that seems much
> better.

I'll try this variant, thank you.

> Moreover, it would leave the clocks gated/unprepared when the system
> is fully suspended, which I guess is better from an energy point of
> view?

The clocks are kept gated, it wasn't a problem. The problem was that
clocks were needed to be enabled temporarily. In order to enable a
clock, it needs to be prepared first. When clock is prepared, it resumes
clock's device RPM.

Keeping clocks prepared shouldn't make a noticeable difference from the
energy POV since clocks are gated. It's only voltage that is kept high,
but we need to keep it high during suspend anyways in order to resume
successfully. The hardware is mostly gated during suspend, depending on
suspend mode, so the power consumption difference is negligible. At
least I haven't seen any problems, battery doesn't drain during suspend.
Dmitry Osipenko Oct. 6, 2021, 11:21 p.m. UTC | #15
07.10.2021 01:01, Dmitry Osipenko пишет:
> 07.10.2021 00:14, Dmitry Osipenko пишет:
>> 06.10.2021 15:43, Ulf Hansson пишет:
>>> On Wed, 6 Oct 2021 at 00:43, Dmitry Osipenko <digetx@gmail.com> wrote:
>>>>
>>>> 06.10.2021 01:19, Dmitry Osipenko пишет:
>>>> ...
>>>>> I reproduced the OFF problem by removing the clk prepare/unprepare from
>>>>> the suspend/resume of the clk driver and making some extra changes to
>>>>> clock tree topology and etc to trigger the problem on Nexus 7.
>>>>>
>>>>> tegra-pmc 7000e400.pmc: failed to turn off PM domain heg: -13
>>>>>
>>>>> It happens from genpd_suspend_noirq() -> tegra_genpd_power_off() -> clk
>>>>> -> GENPD -> I2C -> runtime-pm.
>>>>>
>>>>> -13 is EACCES, it comes from the runtime PM of I2C device. RPM is
>>>>> prohibited/disabled during late (NOIRQ) suspend by the drivers core.
>>>>
>>>> My bad, I double-checked and it's not I2C RPM that is failing now, but
>>>> the clock's RPM [1], which is also unavailable during NOIRQ.
>>>
>>> Yes, that sounds reasonable.
>>>
>>> You would then need a similar patch for the tegra clock driver as I
>>> suggested for tegra I2C driver. That should solve the problem, I
>>> think.
>>>
>>>>
>>>> [1]
>>>> https://elixir.free-electrons.com/linux/v5.15-rc4/source/drivers/clk/clk.c#L116
>>>>
>>>> Previously it was I2C RPM that was failing in a similar way, but code
>>>> changed a tad since that time.
>>>
>>> Alright. In any case, as long as the devices gets suspended in the
>>> correct order, I think it should be fine to cook a patch along the
>>> lines of what I suggest for the I2C driver as well.
>>>
>>> It should work, I think. Although, maybe you want to avoid runtime
>>> resuming the I2C device, unless it's the device belonging to the PMIC
>>> interface, if there is a way to distinguish that for the driver.
>>
>> Ulf, thank you very much for the suggestions! I was thinking about this
>> all once again and concluded that the simplest variant will be to just
>> remove the suspend ops from the clk driver since neither of PLLs require
>> high voltage. We now have voltage bumped to a nominal level during
>> suspend by Tegra's regulator-coupler driver and it's much higher than
>> voltage needed by PLLs. So the problem I was trying to work around
>> doesn't really exist anymore.
> 
> I hurried a bit with the conclusion, keep forgetting that I need to
> change the clock tree in order to test it all properly :/ It's not fixed
> yet.
> 

Please let me iterate once again. The problem we currently have is that
clock may be enabled during NOIRQ time. In order to enable clock, it
needs to be prepared. In order to prepare clock, the clock's device
needs to be runtime-resumed. The runtime PM is unavailable at the NOIRQ
time.

To solve this problem we need to prepare clock beforehand.

The clock will stay prepared during suspend, but this is not a problem
since all the clocks we care about don't require high voltage and
voltage is guaranteed to be bumped high during suspend by Tegra's
regulator-coupler driver anyways.

So everything we need to do is to keep clocks prepared. There are two
options how to do that:

[1] this patch which explicitly prepares clocks using clk API.

[2] Use runtime PM API, like this:

static const struct dev_pm_ops tegra_clock_pm = {
	SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_resume_and_get, pm_runtime_put)
};

Ulf, are you now okay with the current variant [1] of the patch or you
prefer the second [2] option more?
Ulf Hansson Oct. 7, 2021, 9:18 a.m. UTC | #16
On Thu, 7 Oct 2021 at 01:21, Dmitry Osipenko <digetx@gmail.com> wrote:
>
> 07.10.2021 01:01, Dmitry Osipenko пишет:
> > 07.10.2021 00:14, Dmitry Osipenko пишет:
> >> 06.10.2021 15:43, Ulf Hansson пишет:
> >>> On Wed, 6 Oct 2021 at 00:43, Dmitry Osipenko <digetx@gmail.com> wrote:
> >>>>
> >>>> 06.10.2021 01:19, Dmitry Osipenko пишет:
> >>>> ...
> >>>>> I reproduced the OFF problem by removing the clk prepare/unprepare from
> >>>>> the suspend/resume of the clk driver and making some extra changes to
> >>>>> clock tree topology and etc to trigger the problem on Nexus 7.
> >>>>>
> >>>>> tegra-pmc 7000e400.pmc: failed to turn off PM domain heg: -13
> >>>>>
> >>>>> It happens from genpd_suspend_noirq() -> tegra_genpd_power_off() -> clk
> >>>>> -> GENPD -> I2C -> runtime-pm.
> >>>>>
> >>>>> -13 is EACCES, it comes from the runtime PM of I2C device. RPM is
> >>>>> prohibited/disabled during late (NOIRQ) suspend by the drivers core.
> >>>>
> >>>> My bad, I double-checked and it's not I2C RPM that is failing now, but
> >>>> the clock's RPM [1], which is also unavailable during NOIRQ.
> >>>
> >>> Yes, that sounds reasonable.
> >>>
> >>> You would then need a similar patch for the tegra clock driver as I
> >>> suggested for tegra I2C driver. That should solve the problem, I
> >>> think.
> >>>
> >>>>
> >>>> [1]
> >>>> https://elixir.free-electrons.com/linux/v5.15-rc4/source/drivers/clk/clk.c#L116
> >>>>
> >>>> Previously it was I2C RPM that was failing in a similar way, but code
> >>>> changed a tad since that time.
> >>>
> >>> Alright. In any case, as long as the devices gets suspended in the
> >>> correct order, I think it should be fine to cook a patch along the
> >>> lines of what I suggest for the I2C driver as well.
> >>>
> >>> It should work, I think. Although, maybe you want to avoid runtime
> >>> resuming the I2C device, unless it's the device belonging to the PMIC
> >>> interface, if there is a way to distinguish that for the driver.
> >>
> >> Ulf, thank you very much for the suggestions! I was thinking about this
> >> all once again and concluded that the simplest variant will be to just
> >> remove the suspend ops from the clk driver since neither of PLLs require
> >> high voltage. We now have voltage bumped to a nominal level during
> >> suspend by Tegra's regulator-coupler driver and it's much higher than
> >> voltage needed by PLLs. So the problem I was trying to work around
> >> doesn't really exist anymore.
> >
> > I hurried a bit with the conclusion, keep forgetting that I need to
> > change the clock tree in order to test it all properly :/ It's not fixed
> > yet.
> >
>
> Please let me iterate once again. The problem we currently have is that
> clock may be enabled during NOIRQ time. In order to enable clock, it
> needs to be prepared. In order to prepare clock, the clock's device
> needs to be runtime-resumed. The runtime PM is unavailable at the NOIRQ
> time.
>
> To solve this problem we need to prepare clock beforehand.
>
> The clock will stay prepared during suspend, but this is not a problem
> since all the clocks we care about don't require high voltage and
> voltage is guaranteed to be bumped high during suspend by Tegra's
> regulator-coupler driver anyways.
>
> So everything we need to do is to keep clocks prepared. There are two
> options how to do that:
>
> [1] this patch which explicitly prepares clocks using clk API.
>
> [2] Use runtime PM API, like this:
>
> static const struct dev_pm_ops tegra_clock_pm = {
>         SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_resume_and_get, pm_runtime_put)
> };
>
> Ulf, are you now okay with the current variant [1] of the patch or you
> prefer the second [2] option more?

I prefer option [2]. The clock_prepare|unprepare() thingy in option
[1], looks more like an odd workaround to me.

Does that make sense to you as well?

Kind regards
Uffe
Dmitry Osipenko Oct. 7, 2021, 10:36 a.m. UTC | #17
07.10.2021 12:18, Ulf Hansson пишет:
>> Please let me iterate once again. The problem we currently have is that
>> clock may be enabled during NOIRQ time. In order to enable clock, it
>> needs to be prepared. In order to prepare clock, the clock's device
>> needs to be runtime-resumed. The runtime PM is unavailable at the NOIRQ
>> time.
>>
>> To solve this problem we need to prepare clock beforehand.
>>
>> The clock will stay prepared during suspend, but this is not a problem
>> since all the clocks we care about don't require high voltage and
>> voltage is guaranteed to be bumped high during suspend by Tegra's
>> regulator-coupler driver anyways.
>>
>> So everything we need to do is to keep clocks prepared. There are two
>> options how to do that:
>>
>> [1] this patch which explicitly prepares clocks using clk API.
>>
>> [2] Use runtime PM API, like this:
>>
>> static const struct dev_pm_ops tegra_clock_pm = {
>>         SET_SYSTEM_SLEEP_PM_OPS(pm_runtime_resume_and_get, pm_runtime_put)
>> };
>>
>> Ulf, are you now okay with the current variant [1] of the patch or you
>> prefer the second [2] option more?
> I prefer option [2]. The clock_prepare|unprepare() thingy in option
> [1], looks more like an odd workaround to me.
> 
> Does that make sense to you as well?

I don't have a strong preference since both variants give the same
effect. I'll keep testing option [2] and will use it in the next version
if no problem will be found. Thank you!
diff mbox series

Patch

diff --git a/drivers/clk/tegra/Makefile b/drivers/clk/tegra/Makefile
index 7b1816856eb5..a0715cdfc1a4 100644
--- a/drivers/clk/tegra/Makefile
+++ b/drivers/clk/tegra/Makefile
@@ -1,6 +1,7 @@ 
 # SPDX-License-Identifier: GPL-2.0
 obj-y					+= clk.o
 obj-y					+= clk-audio-sync.o
+obj-y					+= clk-device.o
 obj-y					+= clk-dfll.o
 obj-y					+= clk-divider.o
 obj-y					+= clk-periph.o
diff --git a/drivers/clk/tegra/clk-device.c b/drivers/clk/tegra/clk-device.c
new file mode 100644
index 000000000000..830bc0ba25d3
--- /dev/null
+++ b/drivers/clk/tegra/clk-device.c
@@ -0,0 +1,230 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/clk.h>
+#include <linux/clk-provider.h>
+#include <linux/mutex.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/pm_domain.h>
+#include <linux/pm_opp.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+
+#include <soc/tegra/common.h>
+
+#include "clk.h"
+
+/*
+ * This driver manages performance state of the core power domain for the
+ * independent PLLs and system clocks.  We created a virtual clock device
+ * for such clocks, see tegra_clk_dev_register().
+ */
+
+struct tegra_clk_device {
+	struct notifier_block clk_nb;
+	struct device *dev;
+	struct clk_hw *hw;
+	struct mutex lock;
+};
+
+static int tegra_clock_set_pd_state(struct tegra_clk_device *clk_dev,
+				    unsigned long rate)
+{
+	struct device *dev = clk_dev->dev;
+	struct dev_pm_opp *opp;
+	unsigned int pstate;
+
+	opp = dev_pm_opp_find_freq_ceil(dev, &rate);
+	if (opp == ERR_PTR(-ERANGE)) {
+		dev_dbg(dev, "failed to find ceil OPP for %luHz\n", rate);
+		opp = dev_pm_opp_find_freq_floor(dev, &rate);
+	}
+
+	if (IS_ERR(opp)) {
+		dev_err(dev, "failed to find OPP for %luHz: %pe\n", rate, opp);
+		return PTR_ERR(opp);
+	}
+
+	pstate = dev_pm_opp_get_required_pstate(opp, 0);
+	dev_pm_opp_put(opp);
+
+	return dev_pm_genpd_set_performance_state(dev, pstate);
+}
+
+static int tegra_clock_change_notify(struct notifier_block *nb,
+				     unsigned long msg, void *data)
+{
+	struct clk_notifier_data *cnd = data;
+	struct tegra_clk_device *clk_dev;
+	int err = 0;
+
+	clk_dev = container_of(nb, struct tegra_clk_device, clk_nb);
+
+	mutex_lock(&clk_dev->lock);
+	switch (msg) {
+	case PRE_RATE_CHANGE:
+		if (cnd->new_rate > cnd->old_rate)
+			err = tegra_clock_set_pd_state(clk_dev, cnd->new_rate);
+		break;
+
+	case ABORT_RATE_CHANGE:
+		err = tegra_clock_set_pd_state(clk_dev, cnd->old_rate);
+		break;
+
+	case POST_RATE_CHANGE:
+		if (cnd->new_rate < cnd->old_rate)
+			err = tegra_clock_set_pd_state(clk_dev, cnd->new_rate);
+		break;
+
+	default:
+		break;
+	}
+	mutex_unlock(&clk_dev->lock);
+
+	return notifier_from_errno(err);
+}
+
+static int tegra_clock_sync_pd_state(struct tegra_clk_device *clk_dev)
+{
+	unsigned long rate;
+	int ret = 0;
+
+	mutex_lock(&clk_dev->lock);
+
+	if (!pm_runtime_status_suspended(clk_dev->dev)) {
+		rate = clk_hw_get_rate(clk_dev->hw);
+		ret = tegra_clock_set_pd_state(clk_dev, rate);
+	}
+
+	mutex_unlock(&clk_dev->lock);
+
+	return ret;
+}
+
+static int tegra_clock_probe(struct platform_device *pdev)
+{
+	struct tegra_core_opp_params opp_params = {};
+	struct tegra_clk_device *clk_dev;
+	struct device *dev = &pdev->dev;
+	struct clk *clk;
+	int err;
+
+	if (!dev->pm_domain)
+		return -EINVAL;
+
+	clk_dev = devm_kzalloc(dev, sizeof(*clk_dev), GFP_KERNEL);
+	if (!clk_dev)
+		return -ENOMEM;
+
+	clk = devm_clk_get(dev, NULL);
+	if (IS_ERR(clk))
+		return PTR_ERR(clk);
+
+	clk_dev->dev = dev;
+	clk_dev->hw = __clk_get_hw(clk);
+	clk_dev->clk_nb.notifier_call = tegra_clock_change_notify;
+	mutex_init(&clk_dev->lock);
+
+	platform_set_drvdata(pdev, clk_dev);
+
+	/*
+	 * Runtime PM was already enabled for this device by the parent clk
+	 * driver and power domain state should be synced under clk_dev lock,
+	 * hence we don't use the common OPP helper that initializes OPP
+	 * state. For some clocks common OPP helper may fail to find ceil
+	 * rate, it's handled by this driver.
+	 */
+	err = devm_tegra_core_dev_init_opp_table(dev, &opp_params);
+	if (err)
+		return err;
+
+	err = clk_notifier_register(clk, &clk_dev->clk_nb);
+	if (err) {
+		dev_err(dev, "failed to register clk notifier: %d\n", err);
+		return err;
+	}
+
+	/*
+	 * The driver is attaching to a potentially active/resumed clock, hence
+	 * we need to sync the power domain performance state in a accordance to
+	 * the clock rate if clock is resumed.
+	 */
+	err = tegra_clock_sync_pd_state(clk_dev);
+	if (err)
+		goto unreg_clk;
+
+	return 0;
+
+unreg_clk:
+	clk_notifier_unregister(clk, &clk_dev->clk_nb);
+
+	return err;
+}
+
+static __maybe_unused int tegra_clock_pm_suspend(struct device *dev)
+{
+	struct tegra_clk_device *clk_dev = dev_get_drvdata(dev);
+
+	/*
+	 * Power management of the clock is entangled with the Tegra PMC
+	 * GENPD because PMC driver enables/disables clocks for toggling
+	 * of the PD's on/off state.
+	 *
+	 * The PMC GENPD is resumed in NOIRQ phase, before RPM of the clocks
+	 * becomes available, hence PMC can't use clocks at the early resume
+	 * phase if RPM is involved. For example when 3d clock is enabled,
+	 * it may enable the parent PLL clock that needs to be RPM-resumed.
+	 *
+	 * Secondly, the PLL clocks may be enabled by the low level suspend
+	 * code, so we need to assume that PLL is in enabled state during
+	 * suspend.
+	 *
+	 * We will keep PLLs and system clock resumed during suspend time.
+	 * All PLLs on all SoCs are low power and system clock is always-on,
+	 * so practically not much is changed here.
+	 */
+
+	return clk_prepare(clk_dev->hw->clk);
+}
+
+static __maybe_unused int tegra_clock_pm_resume(struct device *dev)
+{
+	struct tegra_clk_device *clk_dev = dev_get_drvdata(dev);
+
+	clk_unprepare(clk_dev->hw->clk);
+
+	return 0;
+}
+
+static void tegra_clock_shutdown(struct platform_device *pdev)
+{
+	struct tegra_clk_device *clk_dev = platform_get_drvdata(pdev);
+
+	clk_prepare(clk_dev->hw->clk);
+}
+
+static const struct dev_pm_ops tegra_clock_pm = {
+	SET_SYSTEM_SLEEP_PM_OPS(tegra_clock_pm_suspend,
+				tegra_clock_pm_resume)
+};
+
+static const struct of_device_id tegra_clock_match[] = {
+	{ .compatible = "nvidia,tegra20-sclk" },
+	{ .compatible = "nvidia,tegra30-sclk" },
+	{ .compatible = "nvidia,tegra30-pllc" },
+	{ .compatible = "nvidia,tegra30-plle" },
+	{ .compatible = "nvidia,tegra30-pllm" },
+	{ }
+};
+
+static struct platform_driver tegra_clock_driver = {
+	.driver = {
+		.name = "tegra-clock",
+		.of_match_table = tegra_clock_match,
+		.pm = &tegra_clock_pm,
+		.suppress_bind_attrs = true,
+	},
+	.probe = tegra_clock_probe,
+	.shutdown = tegra_clock_shutdown,
+};
+builtin_platform_driver(tegra_clock_driver);
diff --git a/drivers/clk/tegra/clk-pll.c b/drivers/clk/tegra/clk-pll.c
index eaa079c177c3..100b5d9b7e26 100644
--- a/drivers/clk/tegra/clk-pll.c
+++ b/drivers/clk/tegra/clk-pll.c
@@ -1914,7 +1914,7 @@  static struct clk *_tegra_clk_register_pll(struct tegra_clk_pll *pll,
 	/* Data in .init is copied by clk_register(), so stack variable OK */
 	pll->hw.init = &init;
 
-	return clk_register(NULL, &pll->hw);
+	return tegra_clk_dev_register(&pll->hw);
 }
 
 struct clk *tegra_clk_register_pll(const char *name, const char *parent_name,
diff --git a/drivers/clk/tegra/clk-super.c b/drivers/clk/tegra/clk-super.c
index 6099c6e9acd4..a98a420398fa 100644
--- a/drivers/clk/tegra/clk-super.c
+++ b/drivers/clk/tegra/clk-super.c
@@ -226,7 +226,7 @@  struct clk *tegra_clk_register_super_mux(const char *name,
 	/* Data in .init is copied by clk_register(), so stack variable OK */
 	super->hw.init = &init;
 
-	clk = clk_register(NULL, &super->hw);
+	clk = tegra_clk_dev_register(&super->hw);
 	if (IS_ERR(clk))
 		kfree(super);
 
diff --git a/drivers/clk/tegra/clk-tegra20.c b/drivers/clk/tegra/clk-tegra20.c
index 3664593a5ba4..be3c33441cfc 100644
--- a/drivers/clk/tegra/clk-tegra20.c
+++ b/drivers/clk/tegra/clk-tegra20.c
@@ -6,8 +6,11 @@ 
 #include <linux/io.h>
 #include <linux/clk-provider.h>
 #include <linux/clkdev.h>
+#include <linux/init.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
 #include <linux/clk/tegra.h>
 #include <linux/delay.h>
 #include <dt-bindings/clock/tegra20-car.h>
@@ -414,7 +417,7 @@  static struct tegra_clk_pll_params pll_e_params = {
 	.fixed_rate = 100000000,
 };
 
-static struct tegra_devclk devclks[] __initdata = {
+static struct tegra_devclk devclks[] = {
 	{ .con_id = "pll_c", .dt_id = TEGRA20_CLK_PLL_C },
 	{ .con_id = "pll_c_out1", .dt_id = TEGRA20_CLK_PLL_C_OUT1 },
 	{ .con_id = "pll_p", .dt_id = TEGRA20_CLK_PLL_P },
@@ -710,13 +713,6 @@  static void tegra20_super_clk_init(void)
 			      NULL);
 	clks[TEGRA20_CLK_CCLK] = clk;
 
-	/* SCLK */
-	clk = tegra_clk_register_super_mux("sclk", sclk_parents,
-			      ARRAY_SIZE(sclk_parents),
-			      CLK_SET_RATE_PARENT | CLK_IS_CRITICAL,
-			      clk_base + SCLK_BURST_POLICY, 0, 4, 0, 0, NULL);
-	clks[TEGRA20_CLK_SCLK] = clk;
-
 	/* twd */
 	clk = clk_register_fixed_factor(NULL, "twd", "cclk", 0, 1, 4);
 	clks[TEGRA20_CLK_TWD] = clk;
@@ -1014,7 +1010,7 @@  static struct tegra_cpu_car_ops tegra20_cpu_car_ops = {
 #endif
 };
 
-static struct tegra_clk_init_table init_table[] __initdata = {
+static struct tegra_clk_init_table init_table[] = {
 	{ TEGRA20_CLK_PLL_P, TEGRA20_CLK_CLK_MAX, 216000000, 1 },
 	{ TEGRA20_CLK_PLL_P_OUT1, TEGRA20_CLK_CLK_MAX, 28800000, 1 },
 	{ TEGRA20_CLK_PLL_P_OUT2, TEGRA20_CLK_CLK_MAX, 48000000, 1 },
@@ -1052,11 +1048,6 @@  static struct tegra_clk_init_table init_table[] __initdata = {
 	{ TEGRA20_CLK_CLK_MAX, TEGRA20_CLK_CLK_MAX, 0, 0 },
 };
 
-static void __init tegra20_clock_apply_init_table(void)
-{
-	tegra_init_from_table(init_table, clks, TEGRA20_CLK_CLK_MAX);
-}
-
 /*
  * Some clocks may be used by different drivers depending on the board
  * configuration.  List those here to register them twice in the clock lookup
@@ -1076,6 +1067,8 @@  static const struct of_device_id pmc_match[] __initconst = {
 	{ },
 };
 
+static bool tegra20_car_initialized;
+
 static struct clk *tegra20_clk_src_onecell_get(struct of_phandle_args *clkspec,
 					       void *data)
 {
@@ -1083,6 +1076,16 @@  static struct clk *tegra20_clk_src_onecell_get(struct of_phandle_args *clkspec,
 	struct clk_hw *hw;
 	struct clk *clk;
 
+	/*
+	 * Timer clocks are needed early, the rest of the clocks shouldn't be
+	 * available to device drivers until clock tree is fully initialized.
+	 */
+	if (clkspec->args[0] != TEGRA20_CLK_RTC &&
+	    clkspec->args[0] != TEGRA20_CLK_TWD &&
+	    clkspec->args[0] != TEGRA20_CLK_TIMER &&
+	    !tegra20_car_initialized)
+		return ERR_PTR(-EPROBE_DEFER);
+
 	clk = of_clk_src_onecell_get(clkspec, data);
 	if (IS_ERR(clk))
 		return clk;
@@ -1149,10 +1152,48 @@  static void __init tegra20_clock_init(struct device_node *np)
 	tegra_init_dup_clks(tegra_clk_duplicates, clks, TEGRA20_CLK_CLK_MAX);
 
 	tegra_add_of_provider(np, tegra20_clk_src_onecell_get);
-	tegra_register_devclks(devclks, ARRAY_SIZE(devclks));
-
-	tegra_clk_apply_init_table = tegra20_clock_apply_init_table;
 
 	tegra_cpu_car_ops = &tegra20_cpu_car_ops;
 }
-CLK_OF_DECLARE(tegra20, "nvidia,tegra20-car", tegra20_clock_init);
+CLK_OF_DECLARE_DRIVER(tegra20, "nvidia,tegra20-car", tegra20_clock_init);
+
+/*
+ * Clocks that use runtime PM can't be created at the tegra20_clock_init
+ * time because drivers' base isn't initialized yet, and thus platform
+ * devices can't be created for the clocks.  Hence we need to split the
+ * registration of the clocks into two phases.  The first phase registers
+ * essential clocks which don't require RPM and are actually used during
+ * early boot.  The second phase registers clocks which use RPM and this
+ * is done when device drivers' core API is ready.
+ */
+static int tegra20_car_probe(struct platform_device *pdev)
+{
+	struct clk *clk;
+
+	clk = tegra_clk_register_super_mux("sclk", sclk_parents,
+			      ARRAY_SIZE(sclk_parents),
+			      CLK_SET_RATE_PARENT | CLK_IS_CRITICAL,
+			      clk_base + SCLK_BURST_POLICY, 0, 4, 0, 0, NULL);
+	clks[TEGRA20_CLK_SCLK] = clk;
+
+	tegra_register_devclks(devclks, ARRAY_SIZE(devclks));
+	tegra_init_from_table(init_table, clks, TEGRA20_CLK_CLK_MAX);
+	tegra20_car_initialized = true;
+
+	return 0;
+}
+
+static const struct of_device_id tegra20_car_match[] = {
+	{ .compatible = "nvidia,tegra20-car" },
+	{ }
+};
+
+static struct platform_driver tegra20_car_driver = {
+	.driver = {
+		.name = "tegra20-car",
+		.of_match_table = tegra20_car_match,
+		.suppress_bind_attrs = true,
+	},
+	.probe = tegra20_car_probe,
+};
+builtin_platform_driver(tegra20_car_driver);
diff --git a/drivers/clk/tegra/clk-tegra30.c b/drivers/clk/tegra/clk-tegra30.c
index 64121bc66d85..04b496123820 100644
--- a/drivers/clk/tegra/clk-tegra30.c
+++ b/drivers/clk/tegra/clk-tegra30.c
@@ -7,8 +7,11 @@ 
 #include <linux/delay.h>
 #include <linux/clk-provider.h>
 #include <linux/clkdev.h>
+#include <linux/init.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
 #include <linux/clk/tegra.h>
 
 #include <soc/tegra/pmc.h>
@@ -532,7 +535,7 @@  static unsigned long tegra30_input_freq[] = {
 	[12] = 26000000,
 };
 
-static struct tegra_devclk devclks[] __initdata = {
+static struct tegra_devclk devclks[] = {
 	{ .con_id = "pll_c", .dt_id = TEGRA30_CLK_PLL_C },
 	{ .con_id = "pll_c_out1", .dt_id = TEGRA30_CLK_PLL_C_OUT1 },
 	{ .con_id = "pll_p", .dt_id = TEGRA30_CLK_PLL_P },
@@ -812,11 +815,6 @@  static void __init tegra30_pll_init(void)
 {
 	struct clk *clk;
 
-	/* PLLC */
-	clk = tegra_clk_register_pll("pll_c", "pll_ref", clk_base, pmc_base, 0,
-				     &pll_c_params, NULL);
-	clks[TEGRA30_CLK_PLL_C] = clk;
-
 	/* PLLC_OUT1 */
 	clk = tegra_clk_register_divider("pll_c_out1_div", "pll_c",
 				clk_base + PLLC_OUT, 0, TEGRA_DIVIDER_ROUND_UP,
@@ -826,11 +824,6 @@  static void __init tegra30_pll_init(void)
 				0, NULL);
 	clks[TEGRA30_CLK_PLL_C_OUT1] = clk;
 
-	/* PLLM */
-	clk = tegra_clk_register_pll("pll_m", "pll_ref", clk_base, pmc_base,
-			    CLK_SET_RATE_GATE, &pll_m_params, NULL);
-	clks[TEGRA30_CLK_PLL_M] = clk;
-
 	/* PLLM_OUT1 */
 	clk = tegra_clk_register_divider("pll_m_out1_div", "pll_m",
 				clk_base + PLLM_OUT, 0, TEGRA_DIVIDER_ROUND_UP,
@@ -880,9 +873,6 @@  static void __init tegra30_pll_init(void)
 			       ARRAY_SIZE(pll_e_parents),
 			       CLK_SET_RATE_NO_REPARENT,
 			       clk_base + PLLE_AUX, 2, 1, 0, NULL);
-	clk = tegra_clk_register_plle("pll_e", "pll_e_mux", clk_base, pmc_base,
-			     CLK_GET_RATE_NOCACHE, &pll_e_params, NULL);
-	clks[TEGRA30_CLK_PLL_E] = clk;
 }
 
 static const char *cclk_g_parents[] = { "clk_m", "pll_c", "clk_32k", "pll_m",
@@ -971,14 +961,6 @@  static void __init tegra30_super_clk_init(void)
 			      NULL);
 	clks[TEGRA30_CLK_CCLK_LP] = clk;
 
-	/* SCLK */
-	clk = tegra_clk_register_super_mux("sclk", sclk_parents,
-				  ARRAY_SIZE(sclk_parents),
-				  CLK_SET_RATE_PARENT | CLK_IS_CRITICAL,
-				  clk_base + SCLK_BURST_POLICY,
-				  0, 4, 0, 0, NULL);
-	clks[TEGRA30_CLK_SCLK] = clk;
-
 	/* twd */
 	clk = clk_register_fixed_factor(NULL, "twd", "cclk_g",
 					CLK_SET_RATE_PARENT, 1, 2);
@@ -1214,7 +1196,7 @@  static struct tegra_cpu_car_ops tegra30_cpu_car_ops = {
 #endif
 };
 
-static struct tegra_clk_init_table init_table[] __initdata = {
+static struct tegra_clk_init_table init_table[] = {
 	{ TEGRA30_CLK_UARTA, TEGRA30_CLK_PLL_P, 408000000, 0 },
 	{ TEGRA30_CLK_UARTB, TEGRA30_CLK_PLL_P, 408000000, 0 },
 	{ TEGRA30_CLK_UARTC, TEGRA30_CLK_PLL_P, 408000000, 0 },
@@ -1259,11 +1241,6 @@  static struct tegra_clk_init_table init_table[] __initdata = {
 	{ TEGRA30_CLK_CLK_MAX, TEGRA30_CLK_CLK_MAX, 0, 0 },
 };
 
-static void __init tegra30_clock_apply_init_table(void)
-{
-	tegra_init_from_table(init_table, clks, TEGRA30_CLK_CLK_MAX);
-}
-
 /*
  * Some clocks may be used by different drivers depending on the board
  * configuration.  List those here to register them twice in the clock lookup
@@ -1294,12 +1271,24 @@  static struct tegra_audio_clk_info tegra30_audio_plls[] = {
 	{ "pll_a", &pll_a_params, tegra_clk_pll_a, "pll_p_out1" },
 };
 
+static bool tegra30_car_initialized;
+
 static struct clk *tegra30_clk_src_onecell_get(struct of_phandle_args *clkspec,
 					       void *data)
 {
 	struct clk_hw *hw;
 	struct clk *clk;
 
+	/*
+	 * Timer clocks are needed early, the rest of the clocks shouldn't be
+	 * available to device drivers until clock tree is fully initialized.
+	 */
+	if (clkspec->args[0] != TEGRA30_CLK_RTC &&
+	    clkspec->args[0] != TEGRA30_CLK_TWD &&
+	    clkspec->args[0] != TEGRA30_CLK_TIMER &&
+	    !tegra30_car_initialized)
+		return ERR_PTR(-EPROBE_DEFER);
+
 	clk = of_clk_src_onecell_get(clkspec, data);
 	if (IS_ERR(clk))
 		return clk;
@@ -1357,10 +1346,75 @@  static void __init tegra30_clock_init(struct device_node *np)
 	tegra_init_dup_clks(tegra_clk_duplicates, clks, TEGRA30_CLK_CLK_MAX);
 
 	tegra_add_of_provider(np, tegra30_clk_src_onecell_get);
+
+	tegra_cpu_car_ops = &tegra30_cpu_car_ops;
+}
+CLK_OF_DECLARE_DRIVER(tegra30, "nvidia,tegra30-car", tegra30_clock_init);
+
+/*
+ * Clocks that use runtime PM can't be created at the tegra30_clock_init
+ * time because drivers' base isn't initialized yet, and thus platform
+ * devices can't be created for the clocks.  Hence we need to split the
+ * registration of the clocks into two phases.  The first phase registers
+ * essential clocks which don't require RPM and are actually used during
+ * early boot.  The second phase registers clocks which use RPM and this
+ * is done when device drivers' core API is ready.
+ */
+static int tegra30_car_probe(struct platform_device *pdev)
+{
+	struct clk *clk;
+
+	/* PLLC */
+	clk = tegra_clk_register_pll("pll_c", "pll_ref", clk_base, pmc_base, 0,
+				     &pll_c_params, NULL);
+	clks[TEGRA30_CLK_PLL_C] = clk;
+
+	/* PLLE */
+	clk = tegra_clk_register_plle("pll_e", "pll_e_mux", clk_base, pmc_base,
+				      CLK_GET_RATE_NOCACHE, &pll_e_params, NULL);
+	clks[TEGRA30_CLK_PLL_E] = clk;
+
+	/* PLLM */
+	clk = tegra_clk_register_pll("pll_m", "pll_ref", clk_base, pmc_base,
+				     CLK_SET_RATE_GATE, &pll_m_params, NULL);
+	clks[TEGRA30_CLK_PLL_M] = clk;
+
+	/* SCLK */
+	clk = tegra_clk_register_super_mux("sclk", sclk_parents,
+					   ARRAY_SIZE(sclk_parents),
+					   CLK_SET_RATE_PARENT | CLK_IS_CRITICAL,
+					   clk_base + SCLK_BURST_POLICY,
+					   0, 4, 0, 0, NULL);
+	clks[TEGRA30_CLK_SCLK] = clk;
+
 	tegra_register_devclks(devclks, ARRAY_SIZE(devclks));
+	tegra_init_from_table(init_table, clks, TEGRA30_CLK_CLK_MAX);
+	tegra30_car_initialized = true;
 
-	tegra_clk_apply_init_table = tegra30_clock_apply_init_table;
+	return 0;
+}
 
-	tegra_cpu_car_ops = &tegra30_cpu_car_ops;
+static const struct of_device_id tegra30_car_match[] = {
+	{ .compatible = "nvidia,tegra30-car" },
+	{ }
+};
+
+static struct platform_driver tegra30_car_driver = {
+	.driver = {
+		.name = "tegra30-car",
+		.of_match_table = tegra30_car_match,
+		.suppress_bind_attrs = true,
+	},
+	.probe = tegra30_car_probe,
+};
+
+/*
+ * Clock driver must be registered before memory controller driver,
+ * which doesn't support deferred probing for today and is registered
+ * from arch init-level.
+ */
+static int tegra30_car_init(void)
+{
+	return platform_driver_register(&tegra30_car_driver);
 }
-CLK_OF_DECLARE(tegra30, "nvidia,tegra30-car", tegra30_clock_init);
+postcore_initcall(tegra30_car_init);
diff --git a/drivers/clk/tegra/clk.c b/drivers/clk/tegra/clk.c
index f6cdce441cf7..26bda45813c0 100644
--- a/drivers/clk/tegra/clk.c
+++ b/drivers/clk/tegra/clk.c
@@ -9,14 +9,19 @@ 
 #include <linux/delay.h>
 #include <linux/io.h>
 #include <linux/of.h>
+#include <linux/of_device.h>
 #include <linux/clk/tegra.h>
+#include <linux/platform_device.h>
+#include <linux/pm_runtime.h>
 #include <linux/reset-controller.h>
+#include <linux/string.h>
 
 #include <soc/tegra/fuse.h>
 
 #include "clk.h"
 
 /* Global data of Tegra CPU CAR ops */
+static struct device_node *tegra_car_np;
 static struct tegra_cpu_car_ops dummy_car_ops;
 struct tegra_cpu_car_ops *tegra_cpu_car_ops = &dummy_car_ops;
 
@@ -261,8 +266,8 @@  void __init tegra_init_dup_clks(struct tegra_clk_duplicate *dup_list,
 	}
 }
 
-void __init tegra_init_from_table(struct tegra_clk_init_table *tbl,
-				  struct clk *clks[], int clk_max)
+void tegra_init_from_table(struct tegra_clk_init_table *tbl,
+			   struct clk *clks[], int clk_max)
 {
 	struct clk *clk;
 
@@ -320,6 +325,8 @@  void __init tegra_add_of_provider(struct device_node *np,
 {
 	int i;
 
+	tegra_car_np = np;
+
 	for (i = 0; i < clk_num; i++) {
 		if (IS_ERR(clks[i])) {
 			pr_err
@@ -348,7 +355,7 @@  void __init tegra_init_special_resets(unsigned int num,
 	special_reset_deassert = deassert;
 }
 
-void __init tegra_register_devclks(struct tegra_devclk *dev_clks, int num)
+void tegra_register_devclks(struct tegra_devclk *dev_clks, int num)
 {
 	int i;
 
@@ -372,6 +379,68 @@  struct clk ** __init tegra_lookup_dt_id(int clk_id,
 		return NULL;
 }
 
+static struct device_node *tegra_clk_get_of_node(struct clk_hw *hw)
+{
+	struct device_node *np;
+	char *node_name;
+
+	node_name = kstrdup(hw->init->name, GFP_KERNEL);
+	if (!node_name)
+		return NULL;
+
+	strreplace(node_name, '_', '-');
+
+	for_each_child_of_node(tegra_car_np, np) {
+		if (!strcmp(np->name, node_name))
+			break;
+	}
+
+	kfree(node_name);
+
+	return np;
+}
+
+struct clk *tegra_clk_dev_register(struct clk_hw *hw)
+{
+	struct platform_device *pdev, *parent;
+	const char *dev_name = NULL;
+	struct device *dev = NULL;
+	struct device_node *np;
+
+	np = tegra_clk_get_of_node(hw);
+
+	if (!of_device_is_available(np))
+		goto put_node;
+
+	dev_name = kasprintf(GFP_KERNEL, "tegra_clk_%s", hw->init->name);
+	if (!dev_name)
+		goto put_node;
+
+	parent = of_find_device_by_node(tegra_car_np);
+	if (parent) {
+		pdev = of_platform_device_create(np, dev_name, &parent->dev);
+		put_device(&parent->dev);
+
+		if (!pdev) {
+			pr_err("%s: failed to create device for %pOF\n",
+			       __func__, np);
+			goto free_name;
+		}
+
+		dev = &pdev->dev;
+		pm_runtime_enable(dev);
+	} else {
+		WARN(1, "failed to find device for %pOF\n", tegra_car_np);
+	}
+
+free_name:
+	kfree(dev_name);
+put_node:
+	of_node_put(np);
+
+	return clk_register(dev, hw);
+}
+
 tegra_clk_apply_init_table_func tegra_clk_apply_init_table;
 
 static int __init tegra_clocks_apply_init_table(void)
diff --git a/drivers/clk/tegra/clk.h b/drivers/clk/tegra/clk.h
index 0c3ba0ccce1a..5d80d8b79b8e 100644
--- a/drivers/clk/tegra/clk.h
+++ b/drivers/clk/tegra/clk.h
@@ -927,4 +927,6 @@  struct clk *tegra20_clk_register_emc(void __iomem *ioaddr, bool low_jitter);
 struct clk *tegra210_clk_register_emc(struct device_node *np,
 				      void __iomem *regs);
 
+struct clk *tegra_clk_dev_register(struct clk_hw *hw);
+
 #endif /* TEGRA_CLK_H */