diff mbox series

[v2] opp: Power on (virtual) power domains managed by the OPP core

Message ID 20200826093328.88268-1-stephan@gerhold.net
State New
Delegated to: viresh kumar
Headers show
Series [v2] opp: Power on (virtual) power domains managed by the OPP core | expand

Commit Message

Stephan Gerhold Aug. 26, 2020, 9:33 a.m. UTC
dev_pm_opp_attach_genpd() allows attaching an arbitrary number of
power domains to an OPP table. In that case, the genpd core will
create a virtual device for each of the power domains.

At the moment, the OPP core only calls
dev_pm_genpd_set_performance_state() on these virtual devices.
It does not attempt to power on the power domains. Therefore
the required power domain might never get turned on.

So far, dev_pm_opp_attach_genpd() is only used in qcom-cpufreq-nvmem.c
to attach the CPR power domain to the CPU OPP table. The CPR driver
does not check if it was actually powered on so this did not cause
any problems. However, other drivers (e.g. rpmpd) might ignore the
performance state until the power domain is actually powered on.

Since these virtual devices are managed exclusively by the OPP core,
I would say that it should also be responsible to ensure they are
enabled.

This commit implements this similar to the non-virtual power domains;
we create device links for each of attached power domains so that they
are turned on whenever the consumer device is active.

Signed-off-by: Stephan Gerhold <stephan@gerhold.net>
---
Related discussion: https://lore.kernel.org/linux-arm-msm/20200426123140.GA190483@gerhold.net/

v1: https://lore.kernel.org/linux-pm/20200730080146.25185-4-stephan@gerhold.net/
Changes in v2:
  - Use device links instead of enabling the power domains in
    dev_pm_opp_set_rate()
---
 drivers/opp/core.c | 34 +++++++++++++++++++++++++++++++++-
 drivers/opp/opp.h  |  1 +
 2 files changed, 34 insertions(+), 1 deletion(-)

Comments

Viresh Kumar Aug. 27, 2020, 10:01 a.m. UTC | #1
On 26-08-20, 11:33, Stephan Gerhold wrote:
> diff --git a/drivers/opp/core.c b/drivers/opp/core.c
> index 8b3c3986f589..7e53a7b94c59 100644
> --- a/drivers/opp/core.c
> +++ b/drivers/opp/core.c
> @@ -17,6 +17,7 @@
>  #include <linux/device.h>
>  #include <linux/export.h>
>  #include <linux/pm_domain.h>
> +#include <linux/pm_runtime.h>
>  #include <linux/regulator/consumer.h>
>  
>  #include "opp.h"
> @@ -1964,10 +1965,13 @@ static void _opp_detach_genpd(struct opp_table *opp_table)
>  		if (!opp_table->genpd_virt_devs[index])
>  			continue;
>  
> +		if (opp_table->genpd_virt_links && opp_table->genpd_virt_links[index])
> +			device_link_del(opp_table->genpd_virt_links[index]);
>  		dev_pm_domain_detach(opp_table->genpd_virt_devs[index], false);
>  		opp_table->genpd_virt_devs[index] = NULL;
>  	}
>  
> +	kfree(opp_table->genpd_virt_links);
>  	kfree(opp_table->genpd_virt_devs);
>  	opp_table->genpd_virt_devs = NULL;
>  }
> @@ -1999,8 +2003,10 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,
>  {
>  	struct opp_table *opp_table;
>  	struct device *virt_dev;
> -	int index = 0, ret = -EINVAL;
> +	struct device_link *dev_link;
> +	int index = 0, ret = -EINVAL, num_devs;
>  	const char **name = names;
> +	u32 flags;
>  
>  	opp_table = dev_pm_opp_get_opp_table(dev);
>  	if (IS_ERR(opp_table))
> @@ -2049,6 +2055,32 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,

I was about to apply this patch when I noticed that this routine does
return the array of virtual devices back to the caller, like the qcom
cpufreq driver in this case. IIRC we did it this way as a generic
solution for this in the OPP core wasn't preferable.

And so I think again if this patch should be picked instead of letting
the platform handle this ?
Stephan Gerhold Aug. 27, 2020, 11:44 a.m. UTC | #2
On Thu, Aug 27, 2020 at 03:31:04PM +0530, Viresh Kumar wrote:
> On 26-08-20, 11:33, Stephan Gerhold wrote:
> > diff --git a/drivers/opp/core.c b/drivers/opp/core.c
> > index 8b3c3986f589..7e53a7b94c59 100644
> > --- a/drivers/opp/core.c
> > +++ b/drivers/opp/core.c
> > @@ -17,6 +17,7 @@
> >  #include <linux/device.h>
> >  #include <linux/export.h>
> >  #include <linux/pm_domain.h>
> > +#include <linux/pm_runtime.h>
> >  #include <linux/regulator/consumer.h>
> >  
> >  #include "opp.h"
> > @@ -1964,10 +1965,13 @@ static void _opp_detach_genpd(struct opp_table *opp_table)
> >  		if (!opp_table->genpd_virt_devs[index])
> >  			continue;
> >  
> > +		if (opp_table->genpd_virt_links && opp_table->genpd_virt_links[index])
> > +			device_link_del(opp_table->genpd_virt_links[index]);
> >  		dev_pm_domain_detach(opp_table->genpd_virt_devs[index], false);
> >  		opp_table->genpd_virt_devs[index] = NULL;
> >  	}
> >  
> > +	kfree(opp_table->genpd_virt_links);
> >  	kfree(opp_table->genpd_virt_devs);
> >  	opp_table->genpd_virt_devs = NULL;
> >  }
> > @@ -1999,8 +2003,10 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,
> >  {
> >  	struct opp_table *opp_table;
> >  	struct device *virt_dev;
> > -	int index = 0, ret = -EINVAL;
> > +	struct device_link *dev_link;
> > +	int index = 0, ret = -EINVAL, num_devs;
> >  	const char **name = names;
> > +	u32 flags;
> >  
> >  	opp_table = dev_pm_opp_get_opp_table(dev);
> >  	if (IS_ERR(opp_table))
> > @@ -2049,6 +2055,32 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,
> 
> I was about to apply this patch when I noticed that this routine does
> return the array of virtual devices back to the caller, like the qcom
> cpufreq driver in this case. IIRC we did it this way as a generic
> solution for this in the OPP core wasn't preferable.
> 

Hmm. Actually I was using this parameter for initial testing, and forced
on the power domains from the qcom-cpufreq-nvmem driver. For my v1 patch
I wanted to enable the power domains in dev_pm_opp_set_rate(), so there
using the virt_devs parameter was not possible.

On the other hand, creating the device links would be possible from the
platform driver by using the parameter.

> And so I think again if this patch should be picked instead of letting
> the platform handle this ?
> 

It seems like originally the motivation for the parameter was that
cpufreq consumers do *not* need to power on the power domains:

Commit 17a8f868ae3e ("opp: Return genpd virtual devices from dev_pm_opp_attach_genpd()"):
 "The cpufreq drivers don't need to do runtime PM operations on
  the virtual devices returned by dev_pm_domain_attach_by_name() and so
  the virtual devices weren't shared with the callers of
  dev_pm_opp_attach_genpd() earlier.

  But the IO device drivers would want to do that. This patch updates
  the prototype of dev_pm_opp_attach_genpd() to accept another argument
  to return the pointer to the array of genpd virtual devices."

But the reason why I made this patch is that actually something *should*
enable the power domains even for the cpufreq case.
If every user of dev_pm_opp_attach_genpd() ends up creating these device
links we might as well manage those directly from the OPP core.

I cannot think of any use case where you would not want to manage those
power domains using device links. And if there is such a use case,
chances are good that this use case is so special that using the OPP API
to set the performance states would not work either. In either case,
this seems like something that should be discussed once there is such a
use case.

At the moment, there are only two users of dev_pm_opp_attach_genpd():

  - cpufreq (qcom-cpufreq-nvmem)
  - I/O (venus, recently added in linux-next [1])

[1]: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=9a538b83612c8b5848bf840c2ddcd86dda1c8c76

In fact, venus adds the device link exactly the same way as in my patch.
So we could move that to the OPP core, simplify the venus code and
remove the virt_devs parameter. That would be my suggestion.

I can submit a v3 with that if you agree (or we take this patch as-is
and remove the parameter separately - I just checked and creating a
device link twice does not seem to cause any problems...)

Thanks!
Stephan
Viresh Kumar Aug. 28, 2020, 6:35 a.m. UTC | #3
On 27-08-20, 13:44, Stephan Gerhold wrote:
> Hmm. Actually I was using this parameter for initial testing, and forced
> on the power domains from the qcom-cpufreq-nvmem driver. For my v1 patch
> I wanted to enable the power domains in dev_pm_opp_set_rate(), so there
> using the virt_devs parameter was not possible.

Right, as we really do not want to enable it there and leave it for
the real consumers to handle.

> On the other hand, creating the device links would be possible from the
> platform driver by using the parameter.

Right.

> > And so I think again if this patch should be picked instead of letting
> > the platform handle this ?
> 
> It seems like originally the motivation for the parameter was that
> cpufreq consumers do *not* need to power on the power domains:
> 
> Commit 17a8f868ae3e ("opp: Return genpd virtual devices from dev_pm_opp_attach_genpd()"):
>  "The cpufreq drivers don't need to do runtime PM operations on
>   the virtual devices returned by dev_pm_domain_attach_by_name() and so
>   the virtual devices weren't shared with the callers of
>   dev_pm_opp_attach_genpd() earlier.
> 
>   But the IO device drivers would want to do that. This patch updates
>   the prototype of dev_pm_opp_attach_genpd() to accept another argument
>   to return the pointer to the array of genpd virtual devices."

Not just that I believe. There were also arguments that only the real
consumer knows how to handle multiple power domains. For example for a
USB or Camera module which can work in multiple modes, we may want to
enable only one power domain in say slow mode and another power domain
in fast mode. And so these kind of complex behavior/choices better be
left for the end consumer and not try to handle this generically in
the OPP core.

> But the reason why I made this patch is that actually something *should*
> enable the power domains even for the cpufreq case.

Ulf, what do you think about this ? IIRC from our previous discussions
someone asked me not to do so.

> If every user of dev_pm_opp_attach_genpd() ends up creating these device
> links we might as well manage those directly from the OPP core.

Sure, I am all in for reducing code duplication, but ...

> I cannot think of any use case where you would not want to manage those
> power domains using device links. And if there is such a use case,
> chances are good that this use case is so special that using the OPP API
> to set the performance states would not work either. In either case,
> this seems like something that should be discussed once there is such a
> use case.

The example I gave earlier shows a common case where we need to handle
this at the end users which still want to use the OPP API.

> At the moment, there are only two users of dev_pm_opp_attach_genpd():
> 
>   - cpufreq (qcom-cpufreq-nvmem)
>   - I/O (venus, recently added in linux-next [1])
> 
> [1]: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=9a538b83612c8b5848bf840c2ddcd86dda1c8c76
> 
> In fact, venus adds the device link exactly the same way as in my patch.
> So we could move that to the OPP core, simplify the venus code and
> remove the virt_devs parameter. That would be my suggestion.
> 
> I can submit a v3 with that if you agree (or we take this patch as-is
> and remove the parameter separately - I just checked and creating a
> device link twice does not seem to cause any problems...)

I normally tend to agree with the logic that let's only focus on what's
upstream and not think of virtual cases which may never happen. But I
was told that this is too common of a scenario and so it made sense to
do it this way.

Maybe Ulf can again throw some light here :)
Ulf Hansson Aug. 28, 2020, 9:49 a.m. UTC | #4
+ Daniel

> > Commit 17a8f868ae3e ("opp: Return genpd virtual devices from dev_pm_opp_attach_genpd()"):
> >  "The cpufreq drivers don't need to do runtime PM operations on
> >   the virtual devices returned by dev_pm_domain_attach_by_name() and so
> >   the virtual devices weren't shared with the callers of
> >   dev_pm_opp_attach_genpd() earlier.
> >
> >   But the IO device drivers would want to do that. This patch updates
> >   the prototype of dev_pm_opp_attach_genpd() to accept another argument
> >   to return the pointer to the array of genpd virtual devices."
>
> Not just that I believe. There were also arguments that only the real
> consumer knows how to handle multiple power domains. For example for a
> USB or Camera module which can work in multiple modes, we may want to
> enable only one power domain in say slow mode and another power domain
> in fast mode. And so these kind of complex behavior/choices better be
> left for the end consumer and not try to handle this generically in
> the OPP core.
>
> > But the reason why I made this patch is that actually something *should*
> > enable the power domains even for the cpufreq case.
>
> Ulf, what do you think about this ? IIRC from our previous discussions
> someone asked me not to do so.

Yes, that's correct, I recall that now as well.

In some cases I have been told that, depending on the running use
case, one of the PM domains could stay off while the other needed to
be on. I was trying to find some thread in the archive, but I failed.
Sorry.

>
> > If every user of dev_pm_opp_attach_genpd() ends up creating these device
> > links we might as well manage those directly from the OPP core.
>
> Sure, I am all in for reducing code duplication, but ...
>
> > I cannot think of any use case where you would not want to manage those
> > power domains using device links. And if there is such a use case,
> > chances are good that this use case is so special that using the OPP API
> > to set the performance states would not work either. In either case,
> > this seems like something that should be discussed once there is such a
> > use case.
>
> The example I gave earlier shows a common case where we need to handle
> this at the end users which still want to use the OPP API.
>
> > At the moment, there are only two users of dev_pm_opp_attach_genpd():
> >
> >   - cpufreq (qcom-cpufreq-nvmem)
> >   - I/O (venus, recently added in linux-next [1])
> >
> > [1]: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=9a538b83612c8b5848bf840c2ddcd86dda1c8c76
> >
> > In fact, venus adds the device link exactly the same way as in my patch.
> > So we could move that to the OPP core, simplify the venus code and
> > remove the virt_devs parameter. That would be my suggestion.
> >
> > I can submit a v3 with that if you agree (or we take this patch as-is
> > and remove the parameter separately - I just checked and creating a
> > device link twice does not seem to cause any problems...)
>
> I normally tend to agree with the logic that let's only focus on what's
> upstream and not think of virtual cases which may never happen. But I
> was told that this is too common of a scenario and so it made sense to
> do it this way.
>
> Maybe Ulf can again throw some light here :)

There is another series that is being discussed [1], which could be
used by the consumer driver to help manage the device links. Maybe
that is the way we should go, to leave room for flexibility.

[1]
[PATCH v3 0/2] Introduce multi PM domains helpers
https://www.spinics.net/lists/kernel/msg3565672.html
Stephan Gerhold Aug. 28, 2020, 9:57 a.m. UTC | #5
On Fri, Aug 28, 2020 at 12:05:11PM +0530, Viresh Kumar wrote:
> On 27-08-20, 13:44, Stephan Gerhold wrote:
> > Hmm. Actually I was using this parameter for initial testing, and forced
> > on the power domains from the qcom-cpufreq-nvmem driver. For my v1 patch
> > I wanted to enable the power domains in dev_pm_opp_set_rate(), so there
> > using the virt_devs parameter was not possible.
> 
> Right, as we really do not want to enable it there and leave it for
> the real consumers to handle.
> 
> > On the other hand, creating the device links would be possible from the
> > platform driver by using the parameter.
> 
> Right.
> 
> > > And so I think again if this patch should be picked instead of letting
> > > the platform handle this ?
> > 
> > It seems like originally the motivation for the parameter was that
> > cpufreq consumers do *not* need to power on the power domains:
> > 
> > Commit 17a8f868ae3e ("opp: Return genpd virtual devices from dev_pm_opp_attach_genpd()"):
> >  "The cpufreq drivers don't need to do runtime PM operations on
> >   the virtual devices returned by dev_pm_domain_attach_by_name() and so
> >   the virtual devices weren't shared with the callers of
> >   dev_pm_opp_attach_genpd() earlier.
> > 
> >   But the IO device drivers would want to do that. This patch updates
> >   the prototype of dev_pm_opp_attach_genpd() to accept another argument
> >   to return the pointer to the array of genpd virtual devices."
> 
> Not just that I believe. There were also arguments that only the real
> consumer knows how to handle multiple power domains. For example for a
> USB or Camera module which can work in multiple modes, we may want to
> enable only one power domain in say slow mode and another power domain
> in fast mode. And so these kind of complex behavior/choices better be
> left for the end consumer and not try to handle this generically in
> the OPP core.
> 

I was thinking about something like that, but can you actually drop
your performance state vote for one of the power domains using
"required-opps"?

At the moment it does not seem possible. I tried adding a special OPP
using opp-level = <0> to reference that from required-opps, but the OPP
core does not allow this:

	vddcx: Not all nodes have performance state set (7: 6)
	vddcx: Failed to add OPP table for index 0: -2

So the "virt_devs" parameter would allow you to disable the power
domain, but not to drop your performance state vote (you could only vote
for the lowest OPP, not 0...)

This is why I said: "And if there is such a use case, chances are good
that this use case is so special that using the OPP API to set the
performance states would not work either."

It seems to me that there is more work needed to make such a use case
really work, but it's hard to speculate without a real example.

> > But the reason why I made this patch is that actually something *should*
> > enable the power domains even for the cpufreq case.
> 
> Ulf, what do you think about this ? IIRC from our previous discussions
> > someone asked me not to do so.
> 
> > If every user of dev_pm_opp_attach_genpd() ends up creating these device
> > links we might as well manage those directly from the OPP core.
> 
> Sure, I am all in for reducing code duplication, but ...
> 
> > I cannot think of any use case where you would not want to manage those
> > power domains using device links. And if there is such a use case,
> > chances are good that this use case is so special that using the OPP API
> > to set the performance states would not work either. In either case,
> > this seems like something that should be discussed once there is such a
> > use case.
> 
> The example I gave earlier shows a common case where we need to handle
> this at the end users which still want to use the OPP API.
> 
> > At the moment, there are only two users of dev_pm_opp_attach_genpd():
> > 
> >   - cpufreq (qcom-cpufreq-nvmem)
> >   - I/O (venus, recently added in linux-next [1])
> > 
> > [1]: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=9a538b83612c8b5848bf840c2ddcd86dda1c8c76
> > 
> > In fact, venus adds the device link exactly the same way as in my patch.
> > So we could move that to the OPP core, simplify the venus code and
> > remove the virt_devs parameter. That would be my suggestion.
> > 
> > I can submit a v3 with that if you agree (or we take this patch as-is
> > and remove the parameter separately - I just checked and creating a
> > device link twice does not seem to cause any problems...)
> 
> > I normally tend to agree with the logic that let's only focus on what's
> upstream and not think of virtual cases which may never happen. But I
> was told that this is too common of a scenario and so it made sense to
> do it this way.
> 

Well, if we're talking about theoretical use cases it's even hard to
make assumptions for the CPUFreq use case.

Theoretically speaking you would assume that you can disable or at least
reduce the performance votes for a power domain used by a CPU if it is
turned off (hotplug / idle). In practice this is probably hard because
the CPU will need these resources until it's actually off, so if the CPU
enters the idle state itself we don't really have a chance to do that
from Linux.

On qcom this is handled by voting for resources controlled by a remote
processor (which knows when we go to idle), so having the power domains
always on is just what I need.
But who knows what you would need on other platforms...

Honestly, I don't know what would be best in this case. I suppose if in
doubt we should rather duplicate some code for now and de-duplicate later
when we (might) know more about other use cases.

FWIW, I think my original motivation to enable the power domains from
the OPP core was that I wanted to suggest making it possible for the OPP
core to manage power domains without involving the consumer driver.

For example, a device tree property like "required-power-domains":

	cpu-opp-table {
		compatible = "operating-points-v2";
		required-power-domains = "mx";

		opp-998400000 {
			opp-hz = /bits/ 64 <998400000>;
			required-opps = <&rpmpd_opp_super_turbo>;
		};
	};

Main motivation for this change is something I mentioned a while ago [1]:
Since the order and names of the power domains is currently hardcoded
in the consumer driver, the device tree does not give any hint
which power domain we are actually controlling with the "required-opps".

Plus, with the current approach it would be quite hard to add new power
domains to an OPP table, or even to change their order. Not sure if this
would really happen that often, though.

But with all the discussions about whether it's a good idea to let the
OPP core enable the power domains or not, I'm not sure if this is really
something that would work properly...

Well, I wanted to mention it :)

Thanks!
Stephan

[1]: https://lore.kernel.org/linux-arm-msm/20200526205403.GA7256@gerhold.net/
Viresh Kumar Aug. 31, 2020, 12:14 p.m. UTC | #6
On 26-08-20, 11:33, Stephan Gerhold wrote:
> dev_pm_opp_attach_genpd() allows attaching an arbitrary number of
> power domains to an OPP table. In that case, the genpd core will
> create a virtual device for each of the power domains.
> 
> At the moment, the OPP core only calls
> dev_pm_genpd_set_performance_state() on these virtual devices.
> It does not attempt to power on the power domains. Therefore
> the required power domain might never get turned on.
> 
> So far, dev_pm_opp_attach_genpd() is only used in qcom-cpufreq-nvmem.c
> to attach the CPR power domain to the CPU OPP table. The CPR driver
> does not check if it was actually powered on so this did not cause
> any problems. However, other drivers (e.g. rpmpd) might ignore the
> performance state until the power domain is actually powered on.
> 
> Since these virtual devices are managed exclusively by the OPP core,
> I would say that it should also be responsible to ensure they are
> enabled.
> 
> This commit implements this similar to the non-virtual power domains;
> we create device links for each of attached power domains so that they
> are turned on whenever the consumer device is active.
> 
> Signed-off-by: Stephan Gerhold <stephan@gerhold.net>
> ---

Applied with some changes, hope that is fine:

Signed-off-by: Stephan Gerhold <stephan@gerhold.net>
[ Viresh: Rearranged the code to remove extra loop and minor cleanup ]
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/opp/core.c | 37 ++++++++++++++++++++++++++++++++++++-
 drivers/opp/opp.h  |  1 +
 2 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index e65174725a4d..b608b0253079 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -17,6 +17,7 @@
 #include <linux/device.h>
 #include <linux/export.h>
 #include <linux/pm_domain.h>
+#include <linux/pm_runtime.h>
 #include <linux/regulator/consumer.h>
 
 #include "opp.h"
@@ -1967,10 +1968,15 @@ static void _opp_detach_genpd(struct opp_table *opp_table)
 		if (!opp_table->genpd_virt_devs[index])
 			continue;
 
+		if (opp_table->genpd_virt_links[index])
+			device_link_del(opp_table->genpd_virt_links[index]);
+
 		dev_pm_domain_detach(opp_table->genpd_virt_devs[index], false);
 		opp_table->genpd_virt_devs[index] = NULL;
 	}
 
+	kfree(opp_table->genpd_virt_links);
+	opp_table->genpd_virt_links = NULL;
 	kfree(opp_table->genpd_virt_devs);
 	opp_table->genpd_virt_devs = NULL;
 }
@@ -2002,8 +2008,10 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,
 {
 	struct opp_table *opp_table;
 	struct device *virt_dev;
+	struct device_link *dev_link;
 	int index = 0, ret = -EINVAL;
 	const char **name = names;
+	u32 flags;
 
 	opp_table = dev_pm_opp_get_opp_table(dev);
 	if (IS_ERR(opp_table))
@@ -2030,6 +2038,21 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,
 	if (!opp_table->genpd_virt_devs)
 		goto unlock;
 
+	opp_table->genpd_virt_links = kcalloc(opp_table->required_opp_count,
+					      sizeof(*opp_table->genpd_virt_links),
+					      GFP_KERNEL);
+	if (!opp_table->genpd_virt_links) {
+		kfree(opp_table->genpd_virt_devs);
+		opp_table->genpd_virt_devs = NULL;
+		goto unlock;
+	}
+
+	/* Turn on power domain initially if consumer is active */
+	pm_runtime_get_noresume(dev);
+	flags = DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS;
+	if (pm_runtime_active(dev))
+		flags |= DL_FLAG_RPM_ACTIVE;
+
 	while (*name) {
 		if (index >= opp_table->required_opp_count) {
 			dev_err(dev, "Index can't be greater than required-opp-count - 1, %s (%d : %d)\n",
@@ -2043,12 +2066,23 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,
 			dev_err(dev, "Couldn't attach to pm_domain: %d\n", ret);
 			goto err;
 		}
-
 		opp_table->genpd_virt_devs[index] = virt_dev;
+
+		/* Create device links to manage runtime PM */
+		dev_link = device_link_add(dev, opp_table->genpd_virt_devs[index],
+					   flags);
+		if (!dev_link) {
+			dev_err(dev, "Failed to create device link\n");
+			goto err;
+		}
+		opp_table->genpd_virt_links[index] = dev_link;
+
 		index++;
 		name++;
 	}
 
+	pm_runtime_put(dev);
+
 	if (virt_devs)
 		*virt_devs = opp_table->genpd_virt_devs;
 	mutex_unlock(&opp_table->genpd_virt_dev_lock);
@@ -2056,6 +2090,7 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,
 	return opp_table;
 
 err:
+	pm_runtime_put(dev);
 	_opp_detach_genpd(opp_table);
 unlock:
 	mutex_unlock(&opp_table->genpd_virt_dev_lock);
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index 78e876ec803e..be5526cdbdba 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -186,6 +186,7 @@ struct opp_table {
 
 	struct mutex genpd_virt_dev_lock;
 	struct device **genpd_virt_devs;
+	struct device_link **genpd_virt_links;
 	struct opp_table **required_opp_tables;
 	unsigned int required_opp_count;
Stephan Gerhold Aug. 31, 2020, 3:49 p.m. UTC | #7
Hi Viresh,

On Mon, Aug 31, 2020 at 05:44:57PM +0530, Viresh Kumar wrote:
> On 26-08-20, 11:33, Stephan Gerhold wrote:
> > dev_pm_opp_attach_genpd() allows attaching an arbitrary number of
> > power domains to an OPP table. In that case, the genpd core will
> > create a virtual device for each of the power domains.
> > 
> > At the moment, the OPP core only calls
> > dev_pm_genpd_set_performance_state() on these virtual devices.
> > It does not attempt to power on the power domains. Therefore
> > the required power domain might never get turned on.
> > 
> > So far, dev_pm_opp_attach_genpd() is only used in qcom-cpufreq-nvmem.c
> > to attach the CPR power domain to the CPU OPP table. The CPR driver
> > does not check if it was actually powered on so this did not cause
> > any problems. However, other drivers (e.g. rpmpd) might ignore the
> > performance state until the power domain is actually powered on.
> > 
> > Since these virtual devices are managed exclusively by the OPP core,
> > I would say that it should also be responsible to ensure they are
> > enabled.
> > 
> > This commit implements this similar to the non-virtual power domains;
> > we create device links for each of attached power domains so that they
> > are turned on whenever the consumer device is active.
> > 
> > Signed-off-by: Stephan Gerhold <stephan@gerhold.net>
> > ---
> 
> Applied with some changes, hope that is fine:
> 

I appreciate it, thank you! But actually after our discussion regarding
the "manage multiple power domains which might not always need to be on"
use case I would like to explore that a bit further before we decide on
a final solution that complicates changes later.

The reason I mention this is that after our discussion I have been
(again) staring at the vendor implementation of CPU frequency scaling of
the platform I'm working on (qcom msm8916). Actually there seems to be yet
another power domain that is scaled only for some CPU frequencies within
the clock driver. (The vendor driver implies this is a requirement of
the PLL clock that is used for higher CPU frequencies, but not sure...)

I don't fully understand how to implement this in mainline yet. I have
started some research on these "voltage requirements" for clocks, and
based on previous discussions [1] and patches [2] it seems like I'm
*not* supposed to add this to the clock driver, but rather as another
required-opp to the CPU OPP table.

If that is the case, we would pretty much have a situation like you
described, a power domain that should only be scaled for some of the
OPPs (the higher CPU frequencies).

But first I need to do some more research, and probably discuss how to
handle that power domain separately first. I think it would be easier if
we postpone this patch till then. I don't think anyone else needs this
patch at the moment.

[1]: https://lore.kernel.org/linux-clk/9439bd29e3ccd5424a8e9b464c8c7bd9@codeaurora.org/
[2]: https://lore.kernel.org/linux-pm/20190320094918.20234-1-rnayak@codeaurora.org/

> Signed-off-by: Stephan Gerhold <stephan@gerhold.net>
> [ Viresh: Rearranged the code to remove extra loop and minor cleanup ]

FWIW, the reason I put this into an extra loop is that the
dev_pm_domain_attach_by_name() might return -EPROBE_DEFER (or some other
error) for some of the power domains. If you create the device links
before attaching all domains you might unnecessarily turn on+off some of
them.

Having it in a separate loop avoids that, because we only start powering
on the power domains when we know that all the power domains are
available.

Thanks!
Stephan

> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> ---
>  drivers/opp/core.c | 37 ++++++++++++++++++++++++++++++++++++-
>  drivers/opp/opp.h  |  1 +
>  2 files changed, 37 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/opp/core.c b/drivers/opp/core.c
> index e65174725a4d..b608b0253079 100644
> --- a/drivers/opp/core.c
> +++ b/drivers/opp/core.c
> @@ -17,6 +17,7 @@
>  #include <linux/device.h>
>  #include <linux/export.h>
>  #include <linux/pm_domain.h>
> +#include <linux/pm_runtime.h>
>  #include <linux/regulator/consumer.h>
>  
>  #include "opp.h"
> @@ -1967,10 +1968,15 @@ static void _opp_detach_genpd(struct opp_table *opp_table)
>  		if (!opp_table->genpd_virt_devs[index])
>  			continue;
>  
> +		if (opp_table->genpd_virt_links[index])
> +			device_link_del(opp_table->genpd_virt_links[index]);
> +
>  		dev_pm_domain_detach(opp_table->genpd_virt_devs[index], false);
>  		opp_table->genpd_virt_devs[index] = NULL;
>  	}
>  
> +	kfree(opp_table->genpd_virt_links);
> +	opp_table->genpd_virt_links = NULL;
>  	kfree(opp_table->genpd_virt_devs);
>  	opp_table->genpd_virt_devs = NULL;
>  }
> @@ -2002,8 +2008,10 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,
>  {
>  	struct opp_table *opp_table;
>  	struct device *virt_dev;
> +	struct device_link *dev_link;
>  	int index = 0, ret = -EINVAL;
>  	const char **name = names;
> +	u32 flags;
>  
>  	opp_table = dev_pm_opp_get_opp_table(dev);
>  	if (IS_ERR(opp_table))
> @@ -2030,6 +2038,21 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,
>  	if (!opp_table->genpd_virt_devs)
>  		goto unlock;
>  
> +	opp_table->genpd_virt_links = kcalloc(opp_table->required_opp_count,
> +					      sizeof(*opp_table->genpd_virt_links),
> +					      GFP_KERNEL);
> +	if (!opp_table->genpd_virt_links) {
> +		kfree(opp_table->genpd_virt_devs);
> +		opp_table->genpd_virt_devs = NULL;
> +		goto unlock;
> +	}
> +
> +	/* Turn on power domain initially if consumer is active */
> +	pm_runtime_get_noresume(dev);
> +	flags = DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS;
> +	if (pm_runtime_active(dev))
> +		flags |= DL_FLAG_RPM_ACTIVE;
> +
>  	while (*name) {
>  		if (index >= opp_table->required_opp_count) {
>  			dev_err(dev, "Index can't be greater than required-opp-count - 1, %s (%d : %d)\n",
> @@ -2043,12 +2066,23 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,
>  			dev_err(dev, "Couldn't attach to pm_domain: %d\n", ret);
>  			goto err;
>  		}
> -
>  		opp_table->genpd_virt_devs[index] = virt_dev;
> +
> +		/* Create device links to manage runtime PM */
> +		dev_link = device_link_add(dev, opp_table->genpd_virt_devs[index],
> +					   flags);
> +		if (!dev_link) {
> +			dev_err(dev, "Failed to create device link\n");
> +			goto err;
> +		}
> +		opp_table->genpd_virt_links[index] = dev_link;
> +
>  		index++;
>  		name++;
>  	}
>  
> +	pm_runtime_put(dev);
> +
>  	if (virt_devs)
>  		*virt_devs = opp_table->genpd_virt_devs;
>  	mutex_unlock(&opp_table->genpd_virt_dev_lock);
> @@ -2056,6 +2090,7 @@ struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,
>  	return opp_table;
>  
>  err:
> +	pm_runtime_put(dev);
>  	_opp_detach_genpd(opp_table);
>  unlock:
>  	mutex_unlock(&opp_table->genpd_virt_dev_lock);
> diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
> index 78e876ec803e..be5526cdbdba 100644
> --- a/drivers/opp/opp.h
> +++ b/drivers/opp/opp.h
> @@ -186,6 +186,7 @@ struct opp_table {
>  
>  	struct mutex genpd_virt_dev_lock;
>  	struct device **genpd_virt_devs;
> +	struct device_link **genpd_virt_links;
>  	struct opp_table **required_opp_tables;
>  	unsigned int required_opp_count;
Viresh Kumar Sept. 1, 2020, 6 a.m. UTC | #8
On 31-08-20, 17:49, Stephan Gerhold wrote:
> I appreciate it, thank you! But actually after our discussion regarding
> the "manage multiple power domains which might not always need to be on"
> use case I would like to explore that a bit further before we decide on
> a final solution that complicates changes later.
> 
> The reason I mention this is that after our discussion I have been
> (again) staring at the vendor implementation of CPU frequency scaling of
> the platform I'm working on (qcom msm8916). Actually there seems to be yet
> another power domain that is scaled only for some CPU frequencies within
> the clock driver. (The vendor driver implies this is a requirement of
> the PLL clock that is used for higher CPU frequencies, but not sure...)
> 
> I don't fully understand how to implement this in mainline yet. I have
> started some research on these "voltage requirements" for clocks, and
> based on previous discussions [1] and patches [2] it seems like I'm
> *not* supposed to add this to the clock driver, but rather as another
> required-opp to the CPU OPP table.
> 
> If that is the case, we would pretty much have a situation like you
> described, a power domain that should only be scaled for some of the
> OPPs (the higher CPU frequencies).
> 
> But first I need to do some more research, and probably discuss how to
> handle that power domain separately first. I think it would be easier if
> we postpone this patch till then. I don't think anyone else needs this
> patch at the moment.

Heh, okay, I can keep it out of my tree then :)

> [1]: https://lore.kernel.org/linux-clk/9439bd29e3ccd5424a8e9b464c8c7bd9@codeaurora.org/
> [2]: https://lore.kernel.org/linux-pm/20190320094918.20234-1-rnayak@codeaurora.org/
> 
> > Signed-off-by: Stephan Gerhold <stephan@gerhold.net>
> > [ Viresh: Rearranged the code to remove extra loop and minor cleanup ]
> 
> FWIW, the reason I put this into an extra loop is that the
> dev_pm_domain_attach_by_name() might return -EPROBE_DEFER (or some other
> error) for some of the power domains. If you create the device links
> before attaching all domains you might unnecessarily turn on+off some of
> them.

Hmm, but that shouldn't have any significant side effects, right ? And
shouldn't result in some other sort of error. It makes the code look
more sensible/readable and so I would prefer to keep a single loop if
it doesn't make something not-work.

> Having it in a separate loop avoids that, because we only start powering
> on the power domains when we know that all the power domains are
> available.
Stephan Gerhold Sept. 11, 2020, 8:34 a.m. UTC | #9
Hi Viresh,

On Fri, Aug 28, 2020 at 11:57:28AM +0200, Stephan Gerhold wrote:
> On Fri, Aug 28, 2020 at 12:05:11PM +0530, Viresh Kumar wrote:
> > On 27-08-20, 13:44, Stephan Gerhold wrote:
> > > Hmm. Actually I was using this parameter for initial testing, and forced
> > > on the power domains from the qcom-cpufreq-nvmem driver. For my v1 patch
> > > I wanted to enable the power domains in dev_pm_opp_set_rate(), so there
> > > using the virt_devs parameter was not possible.
> > 
> > Right, as we really do not want to enable it there and leave it for
> > the real consumers to handle.
> > 
> > > On the other hand, creating the device links would be possible from the
> > > platform driver by using the parameter.
> > 
> > Right.
> > 
> > > > And so I think again if this patch should be picked instead of letting
> > > > the platform handle this ?
> > > 
> > > It seems like originally the motivation for the parameter was that
> > > cpufreq consumers do *not* need to power on the power domains:
> > > 
> > > Commit 17a8f868ae3e ("opp: Return genpd virtual devices from dev_pm_opp_attach_genpd()"):
> > >  "The cpufreq drivers don't need to do runtime PM operations on
> > >   the virtual devices returned by dev_pm_domain_attach_by_name() and so
> > >   the virtual devices weren't shared with the callers of
> > >   dev_pm_opp_attach_genpd() earlier.
> > > 
> > >   But the IO device drivers would want to do that. This patch updates
> > >   the prototype of dev_pm_opp_attach_genpd() to accept another argument
> > >   to return the pointer to the array of genpd virtual devices."
> > 
> > Not just that I believe. There were also arguments that only the real
> > consumer knows how to handle multiple power domains. For example for a
> > USB or Camera module which can work in multiple modes, we may want to
> > enable only one power domain in say slow mode and another power domain
> > in fast mode. And so these kind of complex behavior/choices better be
> > left for the end consumer and not try to handle this generically in
> > the OPP core.
> > 
> [...]
> 
> It seems to me that there is more work needed to make such a use case
> really work, but it's hard to speculate without a real example.
> 

So it seems like we have a real example now. :)

As mentioned in my other mail [1] it turns out I actually have such a
use case. I briefly explained it in [2], basically the clock that
provides higher CPU frequencies has some voltage requirements that
should be voted for using a power domain.

The clock that provides the lower CPU frequencies has no such
requirement, so I need to scale (and power on) a power domain only for
some of the OPPs.

[1]: https://lore.kernel.org/linux-pm/20200831154938.GA33622@gerhold.net/
[2]: https://lore.kernel.org/linux-arm-msm/20200910162610.GA7008@gerhold.net/

So I think it would be good to discuss this use case first before we
decide on this patch (how to enable power domains managed by the OPP
core). I think there are two problems that need to be solved:

1. How can we drop our performance state votes for some of the OPPs?
   I explained that problem earlier already:

> 
> I was thinking about something like that, but can you actually drop
> your performance state vote for one of the power domains using
> "required-opps"?
> 
> At the moment it does not seem possible. I tried adding a special OPP
> using opp-level = <0> to reference that from required-opps, but the OPP
> core does not allow this:
> 
> 	vddcx: Not all nodes have performance state set (7: 6)
> 	vddcx: Failed to add OPP table for index 0: -2
> 
> So the "virt_devs" parameter would allow you to disable the power
> domain, but not to drop your performance state vote (you could only vote
> for the lowest OPP, not 0...)

Not sure if it makes sense but I think somehow allowing the additional
opp-level = <0> would be a simple solution. For example:

	rpmpd: power-controller {
		rpmpd_opp_table: opp-table {
			compatible = "operating-points-v2";

            /*
             * This one can be referenced to drop the performance state
             * vote within required-opps.
             */
            rpmpd_opp_none: opp0 {
                opp-level = <0>;
            };

			rpmpd_opp_retention: opp1 {
				opp-level = <1>;
			};

            /* ... */
        };
    };                

	cpu_opp_table: cpu-opp-table {
		compatible = "operating-points-v2";
		opp-shared;

        /* Power domain is only needed for frequencies >= 998 MHz */
		opp-200000000 {
			opp-hz = /bits/ 64 <200000000>;
			required-opps = <&rpmpd_opp_none>; /* = drop perf state */
		};
		opp-998400000 {
			opp-hz = /bits/ 64 <998400000>;
			required-opps = <&rpmpd_opp_svs_soc>;
		};
		opp-1209600000 {
			opp-hz = /bits/ 64 <1209600000>;
			required-opps = <&rpmpd_opp_nominal>;
		};
	};	

2. Where/when to enable the power domains: I need to enable the power
   domain whenever I have a vote for a performance state. In the example
   above the power domain should get enabled for >= 998 MHz and disabled
   otherwise.

   At least for the CPUFreq case the "virt_devs" parameter does not
   really help in this case...
   dev_pm_opp_set_rate() is called within cpufreq-dt which is supposed
   to be generic. So I can't enable the power domains myself from there.
   Even if I made a custom cpufreq driver that has control over the
   dev_pm_opp_set_rate() call - I don't really know exactly in the
   driver for which frequencies a power domain is needed.

   On the other hand, the OPP core does have that information.
   This brings me back to my PATCH v1 (where I used runtime PM functions
   instead of device links). If I modify it to enable the power domain
   whenever we have a performance state vote > 0 when setting an OPP,
   it would do exactly what I need...

   I don't think it makes sense to do performance state votes without
   enabling a power domain, so this approach sounds good to me...

What do you think?

Thanks!
Stephan
Viresh Kumar Sept. 11, 2020, 9:25 a.m. UTC | #10
On 11-09-20, 10:34, Stephan Gerhold wrote:
> On Fri, Aug 28, 2020 at 11:57:28AM +0200, Stephan Gerhold wrote:
> > It seems to me that there is more work needed to make such a use case
> > really work, but it's hard to speculate without a real example.
> > 
> 
> So it seems like we have a real example now. :)

We told you :)

> As mentioned in my other mail [1] it turns out I actually have such a
> use case. I briefly explained it in [2], basically the clock that
> provides higher CPU frequencies has some voltage requirements that
> should be voted for using a power domain.
> 
> The clock that provides the lower CPU frequencies has no such
> requirement, so I need to scale (and power on) a power domain only for
> some of the OPPs.
> 
> [1]: https://lore.kernel.org/linux-pm/20200831154938.GA33622@gerhold.net/
> [2]: https://lore.kernel.org/linux-arm-msm/20200910162610.GA7008@gerhold.net/
> 
> So I think it would be good to discuss this use case first before we
> decide on this patch (how to enable power domains managed by the OPP
> core). I think there are two problems that need to be solved:
> 
> 1. How can we drop our performance state votes for some of the OPPs?
>    I explained that problem earlier already:
> 
> > 
> > I was thinking about something like that, but can you actually drop
> > your performance state vote for one of the power domains using
> > "required-opps"?
> > 
> > At the moment it does not seem possible. I tried adding a special OPP
> > using opp-level = <0> to reference that from required-opps, but the OPP
> > core does not allow this:
> > 
> > 	vddcx: Not all nodes have performance state set (7: 6)
> > 	vddcx: Failed to add OPP table for index 0: -2

This should fix it.

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 214c1619b445..2483e765318a 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -2117,9 +2117,6 @@ int dev_pm_opp_xlate_performance_state(struct opp_table *src_table,
        int dest_pstate = -EINVAL;
        int i;
 
-       if (!pstate)
-               return 0;
-
        /*
         * Normally the src_table will have the "required_opps" property set to
         * point to one of the OPPs in the dst_table, but in some cases the
diff --git a/drivers/opp/of.c b/drivers/opp/of.c
index e72753be7dc7..1a9cb96484bb 100644
--- a/drivers/opp/of.c
+++ b/drivers/opp/of.c
@@ -842,7 +842,7 @@ static struct dev_pm_opp *_opp_add_static_v2(struct opp_table *opp_table,
 static int _of_add_opp_table_v2(struct device *dev, struct opp_table *opp_table)
 {
        struct device_node *np;
-       int ret, count = 0, pstate_count = 0;
+       int ret, count = 0;
        struct dev_pm_opp *opp;
 
        /* OPP table is already initialized for the device */
@@ -876,20 +876,13 @@ static int _of_add_opp_table_v2(struct device *dev, struct opp_table *opp_table)
                goto remove_static_opp;
        }
 
-       list_for_each_entry(opp, &opp_table->opp_list, node)
-               pstate_count += !!opp->pstate;
-
-       /* Either all or none of the nodes shall have performance state set */
-       if (pstate_count && pstate_count != count) {
-               dev_err(dev, "Not all nodes have performance state set (%d: %d)\n",
-                       count, pstate_count);
-               ret = -ENOENT;
-               goto remove_static_opp;
+       list_for_each_entry(opp, &opp_table->opp_list, node) {
+               if (opp->pstate) {
+                       opp_table->genpd_performance_state = true;
+                       break;
+               }
        }
 
-       if (pstate_count)
-               opp_table->genpd_performance_state = true;
-
        return 0;
 
 remove_static_opp:


> Not sure if it makes sense but I think somehow allowing the additional
> opp-level = <0> would be a simple solution. For example:
> 
> 	rpmpd: power-controller {
> 		rpmpd_opp_table: opp-table {
> 			compatible = "operating-points-v2";
> 
>             /*
>              * This one can be referenced to drop the performance state
>              * vote within required-opps.
>              */
>             rpmpd_opp_none: opp0 {
>                 opp-level = <0>;
>             };
> 
> 			rpmpd_opp_retention: opp1 {
> 				opp-level = <1>;
> 			};
> 
>             /* ... */
>         };
>     };                
> 
> 	cpu_opp_table: cpu-opp-table {
> 		compatible = "operating-points-v2";
> 		opp-shared;
> 
>         /* Power domain is only needed for frequencies >= 998 MHz */
> 		opp-200000000 {
> 			opp-hz = /bits/ 64 <200000000>;
> 			required-opps = <&rpmpd_opp_none>; /* = drop perf state */
> 		};
> 		opp-998400000 {
> 			opp-hz = /bits/ 64 <998400000>;
> 			required-opps = <&rpmpd_opp_svs_soc>;
> 		};
> 		opp-1209600000 {
> 			opp-hz = /bits/ 64 <1209600000>;
> 			required-opps = <&rpmpd_opp_nominal>;
> 		};
> 	};	

Yes, makes sense.

> 2. Where/when to enable the power domains: I need to enable the power
>    domain whenever I have a vote for a performance state. In the example
>    above the power domain should get enabled for >= 998 MHz and disabled
>    otherwise.

- Why do you need to enable these power domains like this ?
- What will happen if you don't enable them at all ?
- What will happen if you enable them for ever ?

AFAIU, these are kind of virtual domains which are there just to vote on behalf
of the OS. Only if the accumulated vote is greater than zero, the actual power
domain will start consuming power. Otherwise it should be disabled.

Or is that wrong ?

>    At least for the CPUFreq case the "virt_devs" parameter does not
>    really help in this case...
>    dev_pm_opp_set_rate() is called within cpufreq-dt which is supposed
>    to be generic. So I can't enable the power domains myself from there.
>    Even if I made a custom cpufreq driver that has control over the
>    dev_pm_opp_set_rate() call - I don't really know exactly in the
>    driver for which frequencies a power domain is needed.
> 
>    On the other hand, the OPP core does have that information.
>    This brings me back to my PATCH v1 (where I used runtime PM functions
>    instead of device links). If I modify it to enable the power domain
>    whenever we have a performance state vote > 0 when setting an OPP,
>    it would do exactly what I need...
> 
>    I don't think it makes sense to do performance state votes without
>    enabling a power domain, so this approach sounds good to me...
Stephan Gerhold Sept. 11, 2020, 3:37 p.m. UTC | #11
On Fri, Sep 11, 2020 at 02:55:38PM +0530, Viresh Kumar wrote:
> > As mentioned in my other mail [1] it turns out I actually have such a
> > use case. I briefly explained it in [2], basically the clock that
> > provides higher CPU frequencies has some voltage requirements that
> > should be voted for using a power domain.
> > 
> > The clock that provides the lower CPU frequencies has no such
> > requirement, so I need to scale (and power on) a power domain only for
> > some of the OPPs.
> > 
> > [1]: https://lore.kernel.org/linux-pm/20200831154938.GA33622@gerhold.net/
> > [2]: https://lore.kernel.org/linux-arm-msm/20200910162610.GA7008@gerhold.net/
> > 
> > So I think it would be good to discuss this use case first before we
> > decide on this patch (how to enable power domains managed by the OPP
> > core). I think there are two problems that need to be solved:
> > 
> > 1. How can we drop our performance state votes for some of the OPPs?
> >    I explained that problem earlier already:
> > 
> > > 
> > > I was thinking about something like that, but can you actually drop
> > > your performance state vote for one of the power domains using
> > > "required-opps"?
> > > 
> > > At the moment it does not seem possible. I tried adding a special OPP
> > > using opp-level = <0> to reference that from required-opps, but the OPP
> > > core does not allow this:
> > > 
> > > 	vddcx: Not all nodes have performance state set (7: 6)
> > > 	vddcx: Failed to add OPP table for index 0: -2
> 
> This should fix it.
> 
> diff --git a/drivers/opp/core.c b/drivers/opp/core.c
> index 214c1619b445..2483e765318a 100644
> --- a/drivers/opp/core.c
> +++ b/drivers/opp/core.c
> @@ -2117,9 +2117,6 @@ int dev_pm_opp_xlate_performance_state(struct opp_table *src_table,
>         int dest_pstate = -EINVAL;
>         int i;
>  
> -       if (!pstate)
> -               return 0;
> -
>         /*
>          * Normally the src_table will have the "required_opps" property set to
>          * point to one of the OPPs in the dst_table, but in some cases the
> diff --git a/drivers/opp/of.c b/drivers/opp/of.c
> index e72753be7dc7..1a9cb96484bb 100644
> --- a/drivers/opp/of.c
> +++ b/drivers/opp/of.c
> @@ -842,7 +842,7 @@ static struct dev_pm_opp *_opp_add_static_v2(struct opp_table *opp_table,
>  static int _of_add_opp_table_v2(struct device *dev, struct opp_table *opp_table)
>  {
>         struct device_node *np;
> -       int ret, count = 0, pstate_count = 0;
> +       int ret, count = 0;
>         struct dev_pm_opp *opp;
>  
>         /* OPP table is already initialized for the device */
> @@ -876,20 +876,13 @@ static int _of_add_opp_table_v2(struct device *dev, struct opp_table *opp_table)
>                 goto remove_static_opp;
>         }
>  
> -       list_for_each_entry(opp, &opp_table->opp_list, node)
> -               pstate_count += !!opp->pstate;
> -
> -       /* Either all or none of the nodes shall have performance state set */
> -       if (pstate_count && pstate_count != count) {
> -               dev_err(dev, "Not all nodes have performance state set (%d: %d)\n",
> -                       count, pstate_count);
> -               ret = -ENOENT;
> -               goto remove_static_opp;
> +       list_for_each_entry(opp, &opp_table->opp_list, node) {
> +               if (opp->pstate) {
> +                       opp_table->genpd_performance_state = true;
> +                       break;
> +               }
>         }
>  
> -       if (pstate_count)
> -               opp_table->genpd_performance_state = true;
> -
>         return 0;
>  
>  remove_static_opp:
> 

Thanks, I will test this early next week!

> > 2. Where/when to enable the power domains: I need to enable the power
> >    domain whenever I have a vote for a performance state. In the example
> >    above the power domain should get enabled for >= 998 MHz and disabled
> >    otherwise.
> 

I will answer your questions for my use case, maybe Ulf or someone else
who knows more about power domains can chime in and say more about the
general case:

> - Why do you need to enable these power domains like this ?

At the moment power domains seem to have two independent states:
they can be powered on/off and they can have a performance state.
Both seem to be managed independently, e.g. a power domain can have a
non-zero performance state set but at the same time powered off.

The OPP core only calls dev_pm_genpd_set_performance_state(). No matter
if the power domain is powered on or not this will end up in the
set_performance_state() callback of the power domain driver.

And this is where behavior varies depending on the power domain driver.
Some drivers (e.g. qcom-cpr) will set the performance state even if
powered off, others (in my case: qcom rpmpd) will ignore the performance
state until their power_on() callback is called.

In general, I would expect that a power domain should be powered on
whenever there is at least one performance state vote > 0.
This does not seem to be the case at the moment, though.

> - What will happen if you don't enable them at all ?

If I add the power domains needed for CPUFreq to the CPU OPP table,
without further changes, nothing will power on the power domains.
There is no difference for qcom-cpr, but at least rpmpd will ignore all
the performance state votes.

> - What will happen if you enable them for ever ?
> 
> AFAIU, these are kind of virtual domains which are there just to vote in behalf
> of the OS. Only if the accumulated vote is greater than zero, the actual power
> domain will start consuming power. Otherwise it should be disabled.
> 
> Or is that wrong ?
> 

Right, at least in case of rpmpd the actual power domain is managed by
some "RPM" remote processor which accumulates votes from all the CPUs
running in the SoC.

I believe the power domains in my case are even always-on in the RPM,
but still the RPM interface seems to provide a way to vote for
enabling/disabling the power domains. Sadly it is barely documented,
so I'm not sure what happens if we keep the power domains permanently
powered on. Maybe it will block certain low power modes?

The current implementation in drivers/soc/qcom/rpmpd.c does suggest that
we should vote for "power off" when the power domains are no longer
needed by Linux.

(+CC Bjorn, Andy and linux-arm-msm, maybe they know more...)

Thanks!
Stephan
diff mbox series

Patch

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 8b3c3986f589..7e53a7b94c59 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -17,6 +17,7 @@ 
 #include <linux/device.h>
 #include <linux/export.h>
 #include <linux/pm_domain.h>
+#include <linux/pm_runtime.h>
 #include <linux/regulator/consumer.h>
 
 #include "opp.h"
@@ -1964,10 +1965,13 @@  static void _opp_detach_genpd(struct opp_table *opp_table)
 		if (!opp_table->genpd_virt_devs[index])
 			continue;
 
+		if (opp_table->genpd_virt_links && opp_table->genpd_virt_links[index])
+			device_link_del(opp_table->genpd_virt_links[index]);
 		dev_pm_domain_detach(opp_table->genpd_virt_devs[index], false);
 		opp_table->genpd_virt_devs[index] = NULL;
 	}
 
+	kfree(opp_table->genpd_virt_links);
 	kfree(opp_table->genpd_virt_devs);
 	opp_table->genpd_virt_devs = NULL;
 }
@@ -1999,8 +2003,10 @@  struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,
 {
 	struct opp_table *opp_table;
 	struct device *virt_dev;
-	int index = 0, ret = -EINVAL;
+	struct device_link *dev_link;
+	int index = 0, ret = -EINVAL, num_devs;
 	const char **name = names;
+	u32 flags;
 
 	opp_table = dev_pm_opp_get_opp_table(dev);
 	if (IS_ERR(opp_table))
@@ -2049,6 +2055,32 @@  struct opp_table *dev_pm_opp_attach_genpd(struct device *dev,
 		name++;
 	}
 
+	/* Create device links to enable the power domains when necessary */
+	opp_table->genpd_virt_links = kcalloc(opp_table->required_opp_count,
+					      sizeof(*opp_table->genpd_virt_links),
+					      GFP_KERNEL);
+	if (!opp_table->genpd_virt_links)
+		goto err;
+
+	/* Turn on power domain initially if consumer is active */
+	pm_runtime_get_noresume(dev);
+	flags = DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS;
+	if (pm_runtime_active(dev))
+		flags |= DL_FLAG_RPM_ACTIVE;
+
+	num_devs = index;
+	for (index = 0; index < num_devs; index++) {
+		dev_link = device_link_add(dev, opp_table->genpd_virt_devs[index],
+					   flags);
+		if (!dev_link) {
+			dev_err(dev, "Failed to create device link\n");
+			pm_runtime_put(dev);
+			goto err;
+		}
+		opp_table->genpd_virt_links[index] = dev_link;
+	}
+	pm_runtime_put(dev);
+
 	if (virt_devs)
 		*virt_devs = opp_table->genpd_virt_devs;
 	mutex_unlock(&opp_table->genpd_virt_dev_lock);
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index 78e876ec803e..be5526cdbdba 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -186,6 +186,7 @@  struct opp_table {
 
 	struct mutex genpd_virt_dev_lock;
 	struct device **genpd_virt_devs;
+	struct device_link **genpd_virt_links;
 	struct opp_table **required_opp_tables;
 	unsigned int required_opp_count;