
[v5,2/2] PM / Domains: use device's next wakeup to determine domain idle state

Message ID: 20201106164811.3698-3-ilina@codeaurora.org (mailing list archive)
State: Superseded
Series: Better domain idle from device wakeup patterns

Commit Message

Lina Iyer Nov. 6, 2020, 4:48 p.m. UTC
Currently, a PM domain's idle state is determined based on whether the
QoS requirements are met. This may not save power if the idle state
residency requirements are not met.

CPU PM domains use the next timer wakeup for the CPUs in the domain to
determine the sleep duration of the domain. This is compared with the
idle state residencies to determine the optimal idle state. For other PM
domains, determining the sleep length is not that straightforward. But
if the device's next wakeup is available, we can use that to determine
the sleep duration of the PM domain.

Let's update the domain governor logic to check for idle state residency
based on the next wakeup of devices as well as QoS constraints.

Signed-off-by: Lina Iyer <ilina@codeaurora.org>
---
Changes in v5:
	- Minor code changes suggested by Rafael
Changes in v4:
	- Update to use next_wakeup from struct generic_pm_domain_data.
Changes in v3:
	- None
Changes in v2:
	- Fix state_idx type to hold negative value.
	- Update commit text.
---
 drivers/base/power/domain_governor.c | 81 ++++++++++++++++++++++++++--
 include/linux/pm_domain.h            |  1 +
 2 files changed, 77 insertions(+), 5 deletions(-)

Comments

Ulf Hansson Nov. 9, 2020, 3:26 p.m. UTC | #1
On Fri, 6 Nov 2020 at 17:48, Lina Iyer <ilina@codeaurora.org> wrote:
>
> Currently, a PM domain's idle state is determined based on whether the
> QoS requirements are met. This may not save power, if the idle state
> residency requirements are not met.
>
> CPU PM domains use the next timer wakeup for the CPUs in the domain to
> determine the sleep duration of the domain. This is compared with the
> idle state residencies to determine the optimal idle state. For other PM
> domains, determining the sleep length is not that straight forward. But
> if the device's next_event is available, we can use that to determine
> the sleep duration of the PM domain.
>
> Let's update the domain governor logic to check for idle state residency
> based on the next wakeup of devices as well as QoS constraints.
>
> Signed-off-by: Lina Iyer <ilina@codeaurora.org>
> ---
> Changes in v5:
>         - Minor code changes suggested by Rafel
> Changes in v4:
>         - Update to use next_wakeup from struct generic_pm_domain_data.
> Changes in v3:
>         - None
> Changes in v2:
>         - Fix state_idx type to hold negative value.
>         - Update commit text.
> ---
>  drivers/base/power/domain_governor.c | 81 ++++++++++++++++++++++++++--
>  include/linux/pm_domain.h            |  1 +
>  2 files changed, 77 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/base/power/domain_governor.c b/drivers/base/power/domain_governor.c
> index 490ed7deb99a..fee050c6ea6a 100644
> --- a/drivers/base/power/domain_governor.c
> +++ b/drivers/base/power/domain_governor.c
> @@ -117,6 +117,47 @@ static bool default_suspend_ok(struct device *dev)
>         return td->cached_suspend_ok;
>  }
>
> +static void update_domain_next_wakeup(struct generic_pm_domain *genpd, ktime_t now)
> +{
> +       ktime_t domain_wakeup = KTIME_MAX;
> +       ktime_t next_wakeup;
> +       struct pm_domain_data *pdd;
> +       struct gpd_link *link;
> +
> +       /* Find the earliest wakeup for all devices in the domain */
> +       list_for_each_entry(pdd, &genpd->dev_list, list_node) {
> +               next_wakeup = to_gpd_data(pdd)->next_wakeup;
> +               if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now))
> +                       if (ktime_before(next_wakeup, domain_wakeup))
> +                               domain_wakeup = next_wakeup;

If it turns out that one of the devices' next_wakeup is before "now",
causing ktime_before() above to return true - then I think you should
bail out, no?

At least, we shouldn't just continue and ignore this case, right?

> +       }
> +
> +       /* Then find the earliest wakeup of from all the child domains */
> +       list_for_each_entry(link, &genpd->parent_links, parent_node) {
> +               next_wakeup = link->child->next_wakeup;
> +               if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now))
> +                       if (ktime_before(next_wakeup, domain_wakeup))
> +                               domain_wakeup = next_wakeup;
> +       }
> +
> +       genpd->next_wakeup = domain_wakeup;
> +}
> +
> +static bool next_wakeup_allows_state(struct generic_pm_domain *genpd,
> +                                    unsigned int state, ktime_t now)
> +{
> +       ktime_t domain_wakeup = genpd->next_wakeup;
> +       s64 idle_time_ns, min_sleep_ns;
> +
> +       min_sleep_ns = genpd->states[state].power_off_latency_ns +
> +                      genpd->states[state].power_on_latency_ns +
> +                      genpd->states[state].residency_ns;
> +

I don't think you should include the power_on_latency_ns here.

The validation isn't about QoS constraints, but whether we can meet
the residency time to make it worth entering the state, from an energy
point of view. Right?

> +       idle_time_ns = ktime_to_ns(ktime_sub(domain_wakeup, now));
> +
> +       return idle_time_ns >= min_sleep_ns;
> +}
> +
>  static bool __default_power_down_ok(struct dev_pm_domain *pd,
>                                      unsigned int state)
>  {
> @@ -209,8 +250,34 @@ static bool __default_power_down_ok(struct dev_pm_domain *pd,
>  static bool default_power_down_ok(struct dev_pm_domain *pd)
>  {
>         struct generic_pm_domain *genpd = pd_to_genpd(pd);
> +       int state_idx = genpd->state_count - 1;
> +       ktime_t now = ktime_get();
>         struct gpd_link *link;
>
> +       /*
> +        * Find the next wakeup from devices that can determine their own wakeup
> +        * to find when the domain would wakeup and do it for every device down
> +        * the hierarchy. It is not worth while to sleep if the state's residency
> +        * cannot be met.
> +        */
> +       update_domain_next_wakeup(genpd, now);
> +       if (genpd->next_wakeup != KTIME_MAX) {
> +               /* Let's find out the deepest domain idle state, the devices prefer */
> +               while (state_idx >= 0) {
> +                       if (next_wakeup_allows_state(genpd, state_idx, now)) {
> +                               genpd->max_off_time_changed = true;
> +                               break;
> +                       }
> +                       state_idx--;
> +               }
> +
> +               if (state_idx < 0) {
> +                       state_idx = 0;
> +                       genpd->cached_power_down_ok = false;
> +                       goto done;
> +               }
> +       }
> +

The above would introduce unnecessary overhead, as it may become
executed in cases when it's not needed.

For example, there's no point doing the above, if the domain doesn't
specify residency values for its idle states.

Additionally, we don't need to recompute the domain's next wakeup,
unless a device has got a new next_wakeup value set for it. In this
case, we can just select a state based upon a previously computed
value, thus avoiding the recomputation.

>         if (!genpd->max_off_time_changed) {
>                 genpd->state_idx = genpd->cached_power_down_state_idx;
>                 return genpd->cached_power_down_ok;
> @@ -228,17 +295,21 @@ static bool default_power_down_ok(struct dev_pm_domain *pd)
>         genpd->max_off_time_ns = -1;
>         genpd->max_off_time_changed = false;
>         genpd->cached_power_down_ok = true;
> -       genpd->state_idx = genpd->state_count - 1;
>
> -       /* Find a state to power down to, starting from the deepest. */
> -       while (!__default_power_down_ok(pd, genpd->state_idx)) {
> -               if (genpd->state_idx == 0) {
> +       /*
> +        * Find a state to power down to, starting from the state
> +        * determined by the next wakeup.
> +        */
> +       while (!__default_power_down_ok(pd, state_idx)) {
> +               if (state_idx == 0) {
>                         genpd->cached_power_down_ok = false;
>                         break;
>                 }
> -               genpd->state_idx--;
> +               state_idx--;
>         }
>
> +done:
> +       genpd->state_idx = state_idx;
>         genpd->cached_power_down_state_idx = genpd->state_idx;
>         return genpd->cached_power_down_ok;
>  }

Another thing to consider for the above changes, is that the
cpu_power_down_ok() calls into default_power_down_ok().

Even if I am fine with the approach taken in the $subject patch, I think
it's important to try to keep the path as slim as possible, as it's also
executed in the CPU idle path. For example, the $subject change also means
that we end up calling ktime_get() twice in the same path, introducing
unnecessary overhead. We can do better and avoid that by restructuring
the code a bit, don't you think?

> diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
> index d7875bfde7f4..49982cd58bfd 100644
> --- a/include/linux/pm_domain.h
> +++ b/include/linux/pm_domain.h
> @@ -130,6 +130,7 @@ struct generic_pm_domain {
>                                      unsigned int state);
>         struct gpd_dev_ops dev_ops;
>         s64 max_off_time_ns;    /* Maximum allowed "suspended" time. */
> +       ktime_t next_wakeup;    /* Maintained by the domain governor */
>         bool max_off_time_changed;
>         bool cached_power_down_ok;
>         bool cached_power_down_state_idx;
> --
> The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
> a Linux Foundation Collaborative Project
>

Kind regards
Uffe
Lina Iyer Nov. 9, 2020, 5:41 p.m. UTC | #2
On Mon, Nov 09 2020 at 08:27 -0700, Ulf Hansson wrote:
>On Fri, 6 Nov 2020 at 17:48, Lina Iyer <ilina@codeaurora.org> wrote:
>>
[...]
>> +static void update_domain_next_wakeup(struct generic_pm_domain *genpd, ktime_t now)
>> +{
>> +       ktime_t domain_wakeup = KTIME_MAX;
>> +       ktime_t next_wakeup;
>> +       struct pm_domain_data *pdd;
>> +       struct gpd_link *link;
>> +
>> +       /* Find the earliest wakeup for all devices in the domain */
>> +       list_for_each_entry(pdd, &genpd->dev_list, list_node) {
>> +               next_wakeup = to_gpd_data(pdd)->next_wakeup;
>> +               if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now))
>> +                       if (ktime_before(next_wakeup, domain_wakeup))
>> +                               domain_wakeup = next_wakeup;
>
>If it turns out that one of the device's next_wakeup is before "now",
>leading to ktime_before() above returns true - then I think you should
>bail out, no?
>
>At least, we shouldn't just continue and ignore this case, right?
>
No, that could be a very common case. Drivers are not expected to clean
up the next wakeup by setting it to KTIME_MAX. The best we can do is
to make a choice with the valid information we have. This also maps to
the current behavior: if all the next wakeup information provided for the
devices lies in the past, we are no worse (or better) off than we are
without this change.

>> +       }
>> +
>> +       /* Then find the earliest wakeup of from all the child domains */
>> +       list_for_each_entry(link, &genpd->parent_links, parent_node) {
>> +               next_wakeup = link->child->next_wakeup;
>> +               if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now))
>> +                       if (ktime_before(next_wakeup, domain_wakeup))
>> +                               domain_wakeup = next_wakeup;
>> +       }
>> +
>> +       genpd->next_wakeup = domain_wakeup;
>> +}
>> +
>> +static bool next_wakeup_allows_state(struct generic_pm_domain *genpd,
>> +                                    unsigned int state, ktime_t now)
>> +{
>> +       ktime_t domain_wakeup = genpd->next_wakeup;
>> +       s64 idle_time_ns, min_sleep_ns;
>> +
>> +       min_sleep_ns = genpd->states[state].power_off_latency_ns +
>> +                      genpd->states[state].power_on_latency_ns +
>> +                      genpd->states[state].residency_ns;
>> +
>
>I don't think you should include the power_on_latency_ns here.
>
>The validation isn't about QoS constraints, but whether we can meet
>the residency time to make it worth entering the state, from an energy
>point of view. Right?
>
Fair point. I will remove the power_on_latency_ns.
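
For reference, a minimal sketch of how the check might look with
power_on_latency_ns dropped (illustrative only, not the reposted patch):

static bool next_wakeup_allows_state(struct generic_pm_domain *genpd,
                                     unsigned int state, ktime_t now)
{
        ktime_t domain_wakeup = genpd->next_wakeup;
        s64 idle_time_ns, min_sleep_ns;

        /* Energy break-even: only the power-off latency and residency matter */
        min_sleep_ns = genpd->states[state].power_off_latency_ns +
                       genpd->states[state].residency_ns;

        idle_time_ns = ktime_to_ns(ktime_sub(domain_wakeup, now));

        return idle_time_ns >= min_sleep_ns;
}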

>> +       idle_time_ns = ktime_to_ns(ktime_sub(domain_wakeup, now));
>> +
>> +       return idle_time_ns >= min_sleep_ns;
>> +}
>> +
>>  static bool __default_power_down_ok(struct dev_pm_domain *pd,
>>                                      unsigned int state)
>>  {
>> @@ -209,8 +250,34 @@ static bool __default_power_down_ok(struct dev_pm_domain *pd,
>>  static bool default_power_down_ok(struct dev_pm_domain *pd)
>>  {
>>         struct generic_pm_domain *genpd = pd_to_genpd(pd);
>> +       int state_idx = genpd->state_count - 1;
>> +       ktime_t now = ktime_get();
>>         struct gpd_link *link;
>>
>> +       /*
>> +        * Find the next wakeup from devices that can determine their own wakeup
>> +        * to find when the domain would wakeup and do it for every device down
>> +        * the hierarchy. It is not worth while to sleep if the state's residency
>> +        * cannot be met.
>> +        */
>> +       update_domain_next_wakeup(genpd, now);
>> +       if (genpd->next_wakeup != KTIME_MAX) {
>> +               /* Let's find out the deepest domain idle state, the devices prefer */
>> +               while (state_idx >= 0) {
>> +                       if (next_wakeup_allows_state(genpd, state_idx, now)) {
>> +                               genpd->max_off_time_changed = true;
>> +                               break;
>> +                       }
>> +                       state_idx--;
>> +               }
>> +
>> +               if (state_idx < 0) {
>> +                       state_idx = 0;
>> +                       genpd->cached_power_down_ok = false;
>> +                       goto done;
>> +               }
>> +       }
>> +
>
>The above would introduce unnecessary overhead, as it may become
>executed in cases when it's not needed.
>
>For example, there's no point doing the above, if the domain doesn't
>specify residency values for its idle states.
>
We would still need to ensure that the next wakeup is after the
power_off_latency, if specified.

>Additionally, we don't need to recompute the domain's next wakup,
>unless a device has got a new next_wakeup value set for it. In this
>case, we can just select a state based upon an previously computed
>value, thus avoiding the recomputation.
>
If the domain's next wakeup was in the past, then using our previously
computed state may be incorrect.

>>         if (!genpd->max_off_time_changed) {
>>                 genpd->state_idx = genpd->cached_power_down_state_idx;
>>                 return genpd->cached_power_down_ok;
>> @@ -228,17 +295,21 @@ static bool default_power_down_ok(struct dev_pm_domain *pd)
>>         genpd->max_off_time_ns = -1;
>>         genpd->max_off_time_changed = false;
>>         genpd->cached_power_down_ok = true;
>> -       genpd->state_idx = genpd->state_count - 1;
>>
>> -       /* Find a state to power down to, starting from the deepest. */
>> -       while (!__default_power_down_ok(pd, genpd->state_idx)) {
>> -               if (genpd->state_idx == 0) {
>> +       /*
>> +        * Find a state to power down to, starting from the state
>> +        * determined by the next wakeup.
>> +        */
>> +       while (!__default_power_down_ok(pd, state_idx)) {
>> +               if (state_idx == 0) {
>>                         genpd->cached_power_down_ok = false;
>>                         break;
>>                 }
>> -               genpd->state_idx--;
>> +               state_idx--;
>>         }
>>
>> +done:
>> +       genpd->state_idx = state_idx;
>>         genpd->cached_power_down_state_idx = genpd->state_idx;
>>         return genpd->cached_power_down_ok;
>>  }
>
>Another thing to consider for the above changes, is that the
>cpu_power_down_ok() calls into default_power_down_ok().
>
>Even if I am fine with the approach taken in $subject patch, I think
>it's important to try to keep the path a slim as possible as it's also
>executed in the CPU idlepath. 
Wouldn't this be called only when the last CPU is powering down, and only
if the domain is ready to power down?

>For example, $subject change means also
>that we end up calling ktime_get() twice in the same path, introducing
>unnecessary overhead. We can do better and avoid that by restructuring
>the code a bit, don't you think?
>
Hmmm, we could. I will submit a follow-on patch to reorganize the code
so that ktime_get() is called only once for either of the
power_down_ok callbacks.

Thanks for your review, Ulf.

-- Lina
Ulf Hansson Nov. 10, 2020, 10:01 a.m. UTC | #3
On Mon, 9 Nov 2020 at 18:41, Lina Iyer <ilina@codeaurora.org> wrote:
>
> On Mon, Nov 09 2020 at 08:27 -0700, Ulf Hansson wrote:
> >On Fri, 6 Nov 2020 at 17:48, Lina Iyer <ilina@codeaurora.org> wrote:
> >>
> [...]
> >> +static void update_domain_next_wakeup(struct generic_pm_domain *genpd, ktime_t now)
> >> +{
> >> +       ktime_t domain_wakeup = KTIME_MAX;
> >> +       ktime_t next_wakeup;
> >> +       struct pm_domain_data *pdd;
> >> +       struct gpd_link *link;
> >> +
> >> +       /* Find the earliest wakeup for all devices in the domain */
> >> +       list_for_each_entry(pdd, &genpd->dev_list, list_node) {
> >> +               next_wakeup = to_gpd_data(pdd)->next_wakeup;
> >> +               if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now))
> >> +                       if (ktime_before(next_wakeup, domain_wakeup))
> >> +                               domain_wakeup = next_wakeup;
> >
> >If it turns out that one of the device's next_wakeup is before "now",
> >leading to ktime_before() above returns true - then I think you should
> >bail out, no?
> >
> >At least, we shouldn't just continue and ignore this case, right?
> >
> No, that could be a very common case. Drivers are not expected to clean
> up the next wakeup by setting it to KTIME_MAX. The best we can do is
> to make a choice with the valid information we have. This will also map
> to the current behavior. Say if all next wakeup information provided to
> the devices were in the past, we would be no worse (or better) than what
> we do without this change.

Well, I don't quite agree (at least not yet), but let me elaborate, as
I think we can do better without having to add complexity.

Normally, I don't think a driver needs to clean up its device's next
wakeup in between remote wakeups; instead, it should just set a new
value.

That's because, even if the driver acts on a remote wakeup or deals
with a request entering a queue, it needs to runtime resume
its device during this period. This prevents genpd from powering off the
PM domain, and hence also prevents the genpd governor from potentially
looking at "invalid" wakeup information for its attached devices.

Of course, I assume there are situations where a driver actually
needs to clean up its device's next wakeup, but those should be
less frequent, and likely occur when a remote wakeup is disabled (for
whatever reason).

>
> >> +       }
> >> +
> >> +       /* Then find the earliest wakeup of from all the child domains */
> >> +       list_for_each_entry(link, &genpd->parent_links, parent_node) {
> >> +               next_wakeup = link->child->next_wakeup;
> >> +               if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now))
> >> +                       if (ktime_before(next_wakeup, domain_wakeup))
> >> +                               domain_wakeup = next_wakeup;
> >> +       }
> >> +
> >> +       genpd->next_wakeup = domain_wakeup;
> >> +}
> >> +
> >> +static bool next_wakeup_allows_state(struct generic_pm_domain *genpd,
> >> +                                    unsigned int state, ktime_t now)
> >> +{
> >> +       ktime_t domain_wakeup = genpd->next_wakeup;
> >> +       s64 idle_time_ns, min_sleep_ns;
> >> +
> >> +       min_sleep_ns = genpd->states[state].power_off_latency_ns +
> >> +                      genpd->states[state].power_on_latency_ns +
> >> +                      genpd->states[state].residency_ns;
> >> +
> >
> >I don't think you should include the power_on_latency_ns here.
> >
> >The validation isn't about QoS constraints, but whether we can meet
> >the residency time to make it worth entering the state, from an energy
> >point of view. Right?
> >
> Fair point. I will remove the power_on_latency_ns.
>
> >> +       idle_time_ns = ktime_to_ns(ktime_sub(domain_wakeup, now));
> >> +
> >> +       return idle_time_ns >= min_sleep_ns;
> >> +}
> >> +
> >>  static bool __default_power_down_ok(struct dev_pm_domain *pd,
> >>                                      unsigned int state)
> >>  {
> >> @@ -209,8 +250,34 @@ static bool __default_power_down_ok(struct dev_pm_domain *pd,
> >>  static bool default_power_down_ok(struct dev_pm_domain *pd)
> >>  {
> >>         struct generic_pm_domain *genpd = pd_to_genpd(pd);
> >> +       int state_idx = genpd->state_count - 1;
> >> +       ktime_t now = ktime_get();
> >>         struct gpd_link *link;
> >>
> >> +       /*
> >> +        * Find the next wakeup from devices that can determine their own wakeup
> >> +        * to find when the domain would wakeup and do it for every device down
> >> +        * the hierarchy. It is not worth while to sleep if the state's residency
> >> +        * cannot be met.
> >> +        */
> >> +       update_domain_next_wakeup(genpd, now);
> >> +       if (genpd->next_wakeup != KTIME_MAX) {
> >> +               /* Let's find out the deepest domain idle state, the devices prefer */
> >> +               while (state_idx >= 0) {
> >> +                       if (next_wakeup_allows_state(genpd, state_idx, now)) {
> >> +                               genpd->max_off_time_changed = true;
> >> +                               break;
> >> +                       }
> >> +                       state_idx--;
> >> +               }
> >> +
> >> +               if (state_idx < 0) {
> >> +                       state_idx = 0;
> >> +                       genpd->cached_power_down_ok = false;
> >> +                       goto done;
> >> +               }
> >> +       }
> >> +
> >
> >The above would introduce unnecessary overhead, as it may become
> >executed in cases when it's not needed.
> >
> >For example, there's no point doing the above, if the domain doesn't
> >specify residency values for its idle states.
> >
> We would still need to ensure that the next wakeup is after the
> power_off_latency, if specified.

Good point! Although, I would rather avoid adding the overhead, unless
the residency is specified. Do you see a problem with this approach?

Another option is to add a new governor, but if possible, I would like
to avoid that.

>
> >Additionally, we don't need to recompute the domain's next wakup,
> >unless a device has got a new next_wakeup value set for it. In this
> >case, we can just select a state based upon an previously computed
> >value, thus avoiding the recomputation.
> >
> If the domain's next wakeup was in the past, then using our previously
> computed state may be incorrect.

I am not saying you should use the previously computed *idlestate*,
but the previously computed next wakeup.

>
> >>         if (!genpd->max_off_time_changed) {
> >>                 genpd->state_idx = genpd->cached_power_down_state_idx;
> >>                 return genpd->cached_power_down_ok;
> >> @@ -228,17 +295,21 @@ static bool default_power_down_ok(struct dev_pm_domain *pd)
> >>         genpd->max_off_time_ns = -1;
> >>         genpd->max_off_time_changed = false;
> >>         genpd->cached_power_down_ok = true;
> >> -       genpd->state_idx = genpd->state_count - 1;
> >>
> >> -       /* Find a state to power down to, starting from the deepest. */
> >> -       while (!__default_power_down_ok(pd, genpd->state_idx)) {
> >> -               if (genpd->state_idx == 0) {
> >> +       /*
> >> +        * Find a state to power down to, starting from the state
> >> +        * determined by the next wakeup.
> >> +        */
> >> +       while (!__default_power_down_ok(pd, state_idx)) {
> >> +               if (state_idx == 0) {
> >>                         genpd->cached_power_down_ok = false;
> >>                         break;
> >>                 }
> >> -               genpd->state_idx--;
> >> +               state_idx--;
> >>         }
> >>
> >> +done:
> >> +       genpd->state_idx = state_idx;
> >>         genpd->cached_power_down_state_idx = genpd->state_idx;
> >>         return genpd->cached_power_down_ok;
> >>  }
> >
> >Another thing to consider for the above changes, is that the
> >cpu_power_down_ok() calls into default_power_down_ok().
> >
> >Even if I am fine with the approach taken in $subject patch, I think
> >it's important to try to keep the path a slim as possible as it's also
> >executed in the CPU idlepath.
> Wouldn't this be called only the last CPU is powering down and only if
> the domain is ready to power down?
>
> >For example, $subject change means also
> >that we end up calling ktime_get() twice in the same path, introducing
> >unnecessary overhead. We can do better and avoid that by restructuring
> >the code a bit, don't you think?
> >
> Hmmm, we could. I will submit a follow on patch to reorganize the code
> so the ktime_get() would be called only once for either of the
> power_down_ok callbacks.

If possible, I would rather make it part of the series. Just fold in
a "rework" patch before extending the governor; that should be
possible, I think.

Kind regards
Uffe
Lina Iyer Nov. 11, 2020, 4:27 p.m. UTC | #4
On Tue, Nov 10 2020 at 03:02 -0700, Ulf Hansson wrote:
>On Mon, 9 Nov 2020 at 18:41, Lina Iyer <ilina@codeaurora.org> wrote:
>>
>> On Mon, Nov 09 2020 at 08:27 -0700, Ulf Hansson wrote:
>> >On Fri, 6 Nov 2020 at 17:48, Lina Iyer <ilina@codeaurora.org> wrote:
>> >>
>> [...]
>> >> +static void update_domain_next_wakeup(struct generic_pm_domain *genpd, ktime_t now)
>> >> +{
>> >> +       ktime_t domain_wakeup = KTIME_MAX;
>> >> +       ktime_t next_wakeup;
>> >> +       struct pm_domain_data *pdd;
>> >> +       struct gpd_link *link;
>> >> +
>> >> +       /* Find the earliest wakeup for all devices in the domain */
>> >> +       list_for_each_entry(pdd, &genpd->dev_list, list_node) {
>> >> +               next_wakeup = to_gpd_data(pdd)->next_wakeup;
>> >> +               if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now))
>> >> +                       if (ktime_before(next_wakeup, domain_wakeup))
>> >> +                               domain_wakeup = next_wakeup;
>> >
>> >If it turns out that one of the device's next_wakeup is before "now",
>> >leading to ktime_before() above returns true - then I think you should
>> >bail out, no?
>> >
>> >At least, we shouldn't just continue and ignore this case, right?
>> >
>> No, that could be a very common case. Drivers are not expected to clean
>> up the next wakeup by setting it to KTIME_MAX. The best we can do is
>> to make a choice with the valid information we have. This will also map
>> to the current behavior. Say if all next wakeup information provided to
>> the devices were in the past, we would be no worse (or better) than what
>> we do without this change.
>
>Well, I don't quite agree (at least not yet), but let me elaborate, as
>I think we can do better without having to add complexity.
>
>Normally, I don't think a driver needs to clean up its device's next
>wakeup in between the remote wakeups, instead it should just set a new
>value.
>
>That's because, even if the driver acts to a remote wakeup or deals
>with a request entering a queue, the driver needs to runtime resume
>its device during this period. This prevents genpd from power off the
>PM domain, hence also the genpd governor from potentially looking at
>"invalid" wakeup information for its attached devices.
>
Could you elaborate a bit? Why would a remote wakeup affect the next
wakeup? I'm afraid I'm not getting the situation correctly.

>Of course, I assume there are situations, where a driver actually
>needs to do a clean up of its device's next wakeup, but that should be
>less frequent and likely when a remote wakeup is disabled (for
>whatever reason).
>
A common case would be that the driver does not know when the use case is
being turned off and therefore may not be able to set the next wakeup to
max. If the stale value continues to exist, then we may never power off
the domain.

>> >> +       /*
>> >> +        * Find the next wakeup from devices that can determine their own wakeup
>> >> +        * to find when the domain would wakeup and do it for every device down
>> >> +        * the hierarchy. It is not worth while to sleep if the state's residency
>> >> +        * cannot be met.
>> >> +        */
>> >> +       update_domain_next_wakeup(genpd, now);
>> >> +       if (genpd->next_wakeup != KTIME_MAX) {
>> >> +               /* Let's find out the deepest domain idle state, the devices prefer */
>> >> +               while (state_idx >= 0) {
>> >> +                       if (next_wakeup_allows_state(genpd, state_idx, now)) {
>> >> +                               genpd->max_off_time_changed = true;
>> >> +                               break;
>> >> +                       }
>> >> +                       state_idx--;
>> >> +               }
>> >> +
>> >> +               if (state_idx < 0) {
>> >> +                       state_idx = 0;
>> >> +                       genpd->cached_power_down_ok = false;
>> >> +                       goto done;
>> >> +               }
>> >> +       }
>> >> +
>> >
>> >The above would introduce unnecessary overhead, as it may become
>> >executed in cases when it's not needed.
>> >
>> >For example, there's no point doing the above, if the domain doesn't
>> >specify residency values for its idle states.
>> >
>> We would still need to ensure that the next wakeup is after the
>> power_off_latency, if specified.
>
>Good point! Although, I would rather avoid adding the overhead, unless
>the residency is specified. Do you see a problem with this approach?
>
Hmmm, no strong objections. However, we would still need to run through the
states to check whether a residency is set, and have a variable track
that. The devices wouldn't know that and would still continue to set the
next wakeup, unless we find a way to let them know we are not using this
feature for the domain.

>Another option is to add a new governor, but if possible, I would like
>to avoid that.
>
Absolutely, we should avoid that.

>>
>> >Additionally, we don't need to recompute the domain's next wakup,
>> >unless a device has got a new next_wakeup value set for it. In this
>> >case, we can just select a state based upon an previously computed
>> >value, thus avoiding the recomputation.
>> >
>> If the domain's next wakeup was in the past, then using our previously
>> computed state may be incorrect.
>
>I am not saying you should use the previously computed *idlestate*,
>but the previously computed next wakeup.
>
I guess this falls into the first discussion: should we use a next
wakeup that lies in the past?

>>
>> >>         if (!genpd->max_off_time_changed) {
>> >>                 genpd->state_idx = genpd->cached_power_down_state_idx;
>> >>                 return genpd->cached_power_down_ok;
>> >> @@ -228,17 +295,21 @@ static bool default_power_down_ok(struct dev_pm_domain *pd)
>> >>         genpd->max_off_time_ns = -1;
>> >>         genpd->max_off_time_changed = false;
>> >>         genpd->cached_power_down_ok = true;
>> >> -       genpd->state_idx = genpd->state_count - 1;
>> >>
>> >> -       /* Find a state to power down to, starting from the deepest. */
>> >> -       while (!__default_power_down_ok(pd, genpd->state_idx)) {
>> >> -               if (genpd->state_idx == 0) {
>> >> +       /*
>> >> +        * Find a state to power down to, starting from the state
>> >> +        * determined by the next wakeup.
>> >> +        */
>> >> +       while (!__default_power_down_ok(pd, state_idx)) {
>> >> +               if (state_idx == 0) {
>> >>                         genpd->cached_power_down_ok = false;
>> >>                         break;
>> >>                 }
>> >> -               genpd->state_idx--;
>> >> +               state_idx--;
>> >>         }
>> >>
>> >> +done:
>> >> +       genpd->state_idx = state_idx;
>> >>         genpd->cached_power_down_state_idx = genpd->state_idx;
>> >>         return genpd->cached_power_down_ok;
>> >>  }
>> >
>> >Another thing to consider for the above changes, is that the
>> >cpu_power_down_ok() calls into default_power_down_ok().
>> >
>> >Even if I am fine with the approach taken in $subject patch, I think
>> >it's important to try to keep the path a slim as possible as it's also
>> >executed in the CPU idlepath.
>> Wouldn't this be called only the last CPU is powering down and only if
>> the domain is ready to power down?
>>
>> >For example, $subject change means also
>> >that we end up calling ktime_get() twice in the same path, introducing
>> >unnecessary overhead. We can do better and avoid that by restructuring
>> >the code a bit, don't you think?
>> >
>> Hmmm, we could. I will submit a follow on patch to reorganize the code
>> so the ktime_get() would be called only once for either of the
>> power_down_ok callbacks.
>
>If possible, I would rather make it part of the series. Just fold in
>some "rework" patch before extending the governor, that should be
>possible I think.
>
Sure. Since this would involve changing default_power_down_ok(), I
thought a follow-on patch would be appropriate.

Thanks,
Lina
Ulf Hansson Nov. 13, 2020, 10:33 a.m. UTC | #5
On Wed, 11 Nov 2020 at 17:51, Lina Iyer <ilina@codeaurora.org> wrote:
>
> On Tue, Nov 10 2020 at 03:02 -0700, Ulf Hansson wrote:
> >On Mon, 9 Nov 2020 at 18:41, Lina Iyer <ilina@codeaurora.org> wrote:
> >>
> >> On Mon, Nov 09 2020 at 08:27 -0700, Ulf Hansson wrote:
> >> >On Fri, 6 Nov 2020 at 17:48, Lina Iyer <ilina@codeaurora.org> wrote:
> >> >>
> >> [...]
> >> >> +static void update_domain_next_wakeup(struct generic_pm_domain *genpd, ktime_t now)
> >> >> +{
> >> >> +       ktime_t domain_wakeup = KTIME_MAX;
> >> >> +       ktime_t next_wakeup;
> >> >> +       struct pm_domain_data *pdd;
> >> >> +       struct gpd_link *link;
> >> >> +
> >> >> +       /* Find the earliest wakeup for all devices in the domain */
> >> >> +       list_for_each_entry(pdd, &genpd->dev_list, list_node) {
> >> >> +               next_wakeup = to_gpd_data(pdd)->next_wakeup;
> >> >> +               if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now))
> >> >> +                       if (ktime_before(next_wakeup, domain_wakeup))
> >> >> +                               domain_wakeup = next_wakeup;
> >> >
> >> >If it turns out that one of the device's next_wakeup is before "now",
> >> >leading to ktime_before() above returns true - then I think you should
> >> >bail out, no?
> >> >
> >> >At least, we shouldn't just continue and ignore this case, right?
> >> >
> >> No, that could be a very common case. Drivers are not expected to clean
> >> up the next wakeup by setting it to KTIME_MAX. The best we can do is
> >> to make a choice with the valid information we have. This will also map
> >> to the current behavior. Say if all next wakeup information provided to
> >> the devices were in the past, we would be no worse (or better) than what
> >> we do without this change.
> >
> >Well, I don't quite agree (at least not yet), but let me elaborate, as
> >I think we can do better without having to add complexity.
> >
> >Normally, I don't think a driver needs to clean up its device's next
> >wakeup in between the remote wakeups, instead it should just set a new
> >value.
> >
> >That's because, even if the driver acts to a remote wakeup or deals
> >with a request entering a queue, the driver needs to runtime resume
> >its device during this period. This prevents genpd from power off the
> >PM domain, hence also the genpd governor from potentially looking at
> >"invalid" wakeup information for its attached devices.
> >
> Could you elaborate a bit? Why would a remote wakeup affect the next
> wakeup. I'm afraid that I'm not getting the situation correctly.

Let me try :-)

A remote wakeup is a wakeup irq that is triggered when the device is
in runtime suspended state.

I was expecting that you would be arming a remote wakeup for the
corresponding device that is attached to a genpd, when the use case
becomes enabled. Additionally, to allow the driver to consume the
wakeup irq, it needs to runtime resume its device (which means its PM
domain via genpd must be powered on as well, if it's not on already).

Therefore, during the period when the driver consumes the wakeup
irq, its device's PM domain remains powered on. When this is
completed, the driver allows its device to become runtime suspended
again. At some point before the device becomes runtime suspended, the
driver should set a new value of the "next wakeup" for its device.
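
A rough driver-side illustration of that flow (a hypothetical driver; it
assumes the patch 1 helper dev_pm_genpd_set_next_wakeup(dev, ktime_t) and
a threaded wakeup IRQ handler):

static irqreturn_t foo_wakeup_thread(int irq, void *data)
{
        struct foo_dev *foo = data;                 /* hypothetical driver data */

        /* Runtime resume keeps the PM domain powered while the event is handled */
        pm_runtime_get_sync(foo->dev);
        foo_handle_wakeup_event(foo);               /* hypothetical */

        /* Publish the next expected wakeup before allowing runtime suspend */
        dev_pm_genpd_set_next_wakeup(foo->dev,
                                     ktime_add_ms(ktime_get(), foo->wakeup_period_ms));

        pm_runtime_mark_last_busy(foo->dev);
        pm_runtime_put_autosuspend(foo->dev);

        return IRQ_HANDLED;
}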

>
> >Of course, I assume there are situations, where a driver actually
> >needs to do a clean up of its device's next wakeup, but that should be
> >less frequent and likely when a remote wakeup is disabled (for
> >whatever reason).
> >
> A common case would be that the driver does not know when the usecase is
> being turned off and therefore may not be able to set the next wakeup to
> max. If the stale value continues to exist then we may never power off
> the domain.

Right.

But, how do you know that the use case starts and what prevents us
from knowing that the use case has stopped?

Maybe if you add a user of the new APIs, this would help me to
understand better?

>
> >> >> +       /*
> >> >> +        * Find the next wakeup from devices that can determine their own wakeup
> >> >> +        * to find when the domain would wakeup and do it for every device down
> >> >> +        * the hierarchy. It is not worth while to sleep if the state's residency
> >> >> +        * cannot be met.
> >> >> +        */
> >> >> +       update_domain_next_wakeup(genpd, now);
> >> >> +       if (genpd->next_wakeup != KTIME_MAX) {
> >> >> +               /* Let's find out the deepest domain idle state, the devices prefer */
> >> >> +               while (state_idx >= 0) {
> >> >> +                       if (next_wakeup_allows_state(genpd, state_idx, now)) {
> >> >> +                               genpd->max_off_time_changed = true;
> >> >> +                               break;
> >> >> +                       }
> >> >> +                       state_idx--;
> >> >> +               }
> >> >> +
> >> >> +               if (state_idx < 0) {
> >> >> +                       state_idx = 0;
> >> >> +                       genpd->cached_power_down_ok = false;
> >> >> +                       goto done;
> >> >> +               }
> >> >> +       }
> >> >> +
> >> >
> >> >The above would introduce unnecessary overhead, as it may become
> >> >executed in cases when it's not needed.
> >> >
> >> >For example, there's no point doing the above, if the domain doesn't
> >> >specify residency values for its idle states.
> >> >
> >> We would still need to ensure that the next wakeup is after the
> >> power_off_latency, if specified.
> >
> >Good point! Although, I would rather avoid adding the overhead, unless
> >the residency is specified. Do you see a problem with this approach?
> >
> Hmmm, no strong objections. However, we still need to run through the
> states to make sure the residency is not set and have a variable track
> that.

Right.

The important part is that we can do that once and not for every call
to the governor.

> The devices wouldn't know that and would still continue to set the
> next wakeup, unless we find a way to let them know we are not using this
> feature for the domain.

Right.

To allow the driver to know, we could respond with an error code from
the new dev_pm_genpd_set_next_wakeup() API (from patch1), in
case the genpd+governor doesn't support it.

Would that be okay? Otherwise we will have to add a separate genpd
API, asking explicitly if the "next wakeup" feature is supported.
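
A sketch of what that could look like (the flag and the return values are
assumptions for illustration, not from the posted series; dev_to_genpd()
and dev_gpd_data() are the helpers available inside
drivers/base/power/domain.c):

int dev_pm_genpd_set_next_wakeup(struct device *dev, ktime_t next)
{
        struct generic_pm_domain *genpd = dev_to_genpd(dev);

        if (IS_ERR(genpd))
                return -ENODEV;

        /* Reject the hint if the governor cannot make use of it */
        if (!genpd->gov_uses_next_wakeup)       /* assumed flag, set at init time */
                return -EOPNOTSUPP;

        dev_gpd_data(dev)->next_wakeup = next;
        return 0;
}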

>
> >Another option is to add a new governor, but if possible, I would like
> >to avoid that.
> >
> Absolutely, we should avoid that.
>

[...]

Kind regards
Uffe
Lina Iyer Nov. 16, 2020, 3:57 p.m. UTC | #6
On Fri, Nov 13 2020 at 03:34 -0700, Ulf Hansson wrote:
>On Wed, 11 Nov 2020 at 17:51, Lina Iyer <ilina@codeaurora.org> wrote:
>>
>> On Tue, Nov 10 2020 at 03:02 -0700, Ulf Hansson wrote:
>> >On Mon, 9 Nov 2020 at 18:41, Lina Iyer <ilina@codeaurora.org> wrote:
>> >>
>> >> On Mon, Nov 09 2020 at 08:27 -0700, Ulf Hansson wrote:
>> >> >On Fri, 6 Nov 2020 at 17:48, Lina Iyer <ilina@codeaurora.org> wrote:
>> >> >>
>> >> [...]
>> >> >> +static void update_domain_next_wakeup(struct generic_pm_domain *genpd, ktime_t now)
>> >> >> +{
>> >> >> +       ktime_t domain_wakeup = KTIME_MAX;
>> >> >> +       ktime_t next_wakeup;
>> >> >> +       struct pm_domain_data *pdd;
>> >> >> +       struct gpd_link *link;
>> >> >> +
>> >> >> +       /* Find the earliest wakeup for all devices in the domain */
>> >> >> +       list_for_each_entry(pdd, &genpd->dev_list, list_node) {
>> >> >> +               next_wakeup = to_gpd_data(pdd)->next_wakeup;
>> >> >> +               if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now))
>> >> >> +                       if (ktime_before(next_wakeup, domain_wakeup))
>> >> >> +                               domain_wakeup = next_wakeup;
>> >> >
>> >> >If it turns out that one of the device's next_wakeup is before "now",
>> >> >leading to ktime_before() above returns true - then I think you should
>> >> >bail out, no?
>> >> >
>> >> >At least, we shouldn't just continue and ignore this case, right?
>> >> >
>> >> No, that could be a very common case. Drivers are not expected to clean
>> >> up the next wakeup by setting it to KTIME_MAX. The best we can do is
>> >> to make a choice with the valid information we have. This will also map
>> >> to the current behavior. Say if all next wakeup information provided to
>> >> the devices were in the past, we would be no worse (or better) than what
>> >> we do without this change.
>> >
>> >Well, I don't quite agree (at least not yet), but let me elaborate, as
>> >I think we can do better without having to add complexity.
>> >
>> >Normally, I don't think a driver needs to clean up its device's next
>> >wakeup in between the remote wakeups, instead it should just set a new
>> >value.
>> >
>> >That's because, even if the driver acts to a remote wakeup or deals
>> >with a request entering a queue, the driver needs to runtime resume
>> >its device during this period. This prevents genpd from power off the
>> >PM domain, hence also the genpd governor from potentially looking at
>> >"invalid" wakeup information for its attached devices.
>> >
>> Could you elaborate a bit? Why would a remote wakeup affect the next
>> wakeup. I'm afraid that I'm not getting the situation correctly.
>
>Let me try :-)
>
>A remote wakeup is a wakeup irq that is triggered when the device is
>in runtime suspended state.
>
>I was expecting that you would be arming a remote wakeup for the
>corresponding device that is attached to a genpd, when the use case
>becomes enabled. Additionally, to allow the driver to consume the
>wakeup irq, it needs to runtime resume its device (which means its PM
>domain via genpd must be powered on as well, if it's not on already).
>
>Therefore, during the period of when the driver consumes the wakeup
>irq, its device's PM domain remains powered on. When this is
>completed, the driver allows its device to become runtime suspended
>again. At some point before the device becomes runtime suspended, the
>driver should set a new value of the "next wakeup" for its device.
>
Okay, that is clear. Thanks :)

Yes, we would expect the device to set its next_wakeup before runtime
suspending, and if the next wakeup is in the past, we would probably not
have runtime suspended the device at all. That would keep the domain ON and
we would never reach this governor. And if we still do, then the device
has not set its next wakeup correctly.

What you are suggesting is that we should not power down the domain in
such a case. That would be a really hard problem to debug: a device
leaves a stale wakeup behind, we no longer power off the domain
because of it, and tracking that back to the device would be a monumental
effort. Ignoring the stale next wakeup, though, might involve a power/perf
penalty (no worse than today), but we would not have a difficult problem
to solve.

>>
>> >Of course, I assume there are situations, where a driver actually
>> >needs to do a clean up of its device's next wakeup, but that should be
>> >less frequent and likely when a remote wakeup is disabled (for
>> >whatever reason).
>> >
>> A common case would be that the driver does not know when the usecase is
>> being turned off and therefore may not be able to set the next wakeup to
>> max. If the stale value continues to exist then we may never power off
>> the domain.
>
>Right.
>
>But, how do you know that the use case starts and what prevents us
>from knowing that the use case has stopped?
>
Usually, the device is made aware of the use case when it starts, but
stopping the use case might be something the device may or may not be
aware of.

>Maybe if you add a user of the new APIs, this would help me to
>understand better?
>
I have been asking some folks, but let's see.

[...]

>> >> >For example, there's no point doing the above, if the domain doesn't
>> >> >specify residency values for its idle states.
>> >> >
>> >> We would still need to ensure that the next wakeup is after the
>> >> power_off_latency, if specified.
>> >
>> >Good point! Although, I would rather avoid adding the overhead, unless
>> >the residency is specified. Do you see a problem with this approach?
>> >
>> Hmmm, no strong objections. However, we still need to run through the
>> states to make sure the residency is not set and have a variable track
>> that.
>
>Right.
>
>The important part is that we can do that once and not for every call
>to the governor.
>
>> The devices wouldn't know that and would still continue to set the
>> next wakeup, unless we find a way to let them know we are not using this
>> feature for the domain.
>
>Right.
>
>To allow the driver to know, we could respond with an error code from
>the new dev_pm_genpd_set_performance_state() API (from patch1), in
>case the genpd+governor doesn't support it.
>
It would be unnecessary work, every time a next wakeup is set, to
iterate through the available states and figure out whether a residency
has been set or not. We could probably hold that result in a variable when
we set up the genpd states. Expect the next wakeup to be set from a
critical path or an interrupt handler, so we have to be quick.

>Would that be okay? Otherwise we will have to add a separate genpd
>API, asking explicitly if the "next wakeup" feature is supported.
>
Would like to avoid that as much as possible.

Thanks,
Lina
Ulf Hansson Nov. 19, 2020, 9:56 a.m. UTC | #7
On Mon, 16 Nov 2020 at 16:57, Lina Iyer <ilina@codeaurora.org> wrote:
>
> On Fri, Nov 13 2020 at 03:34 -0700, Ulf Hansson wrote:
> >On Wed, 11 Nov 2020 at 17:51, Lina Iyer <ilina@codeaurora.org> wrote:
> >>
> >> On Tue, Nov 10 2020 at 03:02 -0700, Ulf Hansson wrote:
> >> >On Mon, 9 Nov 2020 at 18:41, Lina Iyer <ilina@codeaurora.org> wrote:
> >> >>
> >> >> On Mon, Nov 09 2020 at 08:27 -0700, Ulf Hansson wrote:
> >> >> >On Fri, 6 Nov 2020 at 17:48, Lina Iyer <ilina@codeaurora.org> wrote:
> >> >> >>
> >> >> [...]
> >> >> >> +static void update_domain_next_wakeup(struct generic_pm_domain *genpd, ktime_t now)
> >> >> >> +{
> >> >> >> +       ktime_t domain_wakeup = KTIME_MAX;
> >> >> >> +       ktime_t next_wakeup;
> >> >> >> +       struct pm_domain_data *pdd;
> >> >> >> +       struct gpd_link *link;
> >> >> >> +
> >> >> >> +       /* Find the earliest wakeup for all devices in the domain */
> >> >> >> +       list_for_each_entry(pdd, &genpd->dev_list, list_node) {
> >> >> >> +               next_wakeup = to_gpd_data(pdd)->next_wakeup;
> >> >> >> +               if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now))
> >> >> >> +                       if (ktime_before(next_wakeup, domain_wakeup))
> >> >> >> +                               domain_wakeup = next_wakeup;
> >> >> >
> >> >> >If it turns out that one of the device's next_wakeup is before "now",
> >> >> >leading to ktime_before() above returns true - then I think you should
> >> >> >bail out, no?
> >> >> >
> >> >> >At least, we shouldn't just continue and ignore this case, right?
> >> >> >
> >> >> No, that could be a very common case. Drivers are not expected to clean
> >> >> up the next wakeup by setting it to KTIME_MAX. The best we can do is
> >> >> to make a choice with the valid information we have. This will also map
> >> >> to the current behavior. Say if all next wakeup information provided to
> >> >> the devices were in the past, we would be no worse (or better) than what
> >> >> we do without this change.
> >> >
> >> >Well, I don't quite agree (at least not yet), but let me elaborate, as
> >> >I think we can do better without having to add complexity.
> >> >
> >> >Normally, I don't think a driver needs to clean up its device's next
> >> >wakeup in between the remote wakeups, instead it should just set a new
> >> >value.
> >> >
> >> >That's because, even if the driver acts to a remote wakeup or deals
> >> >with a request entering a queue, the driver needs to runtime resume
> >> >its device during this period. This prevents genpd from power off the
> >> >PM domain, hence also the genpd governor from potentially looking at
> >> >"invalid" wakeup information for its attached devices.
> >> >
> >> Could you elaborate a bit? Why would a remote wakeup affect the next
> >> wakeup. I'm afraid that I'm not getting the situation correctly.
> >
> >Let me try :-)
> >
> >A remote wakeup is a wakeup irq that is triggered when the device is
> >in runtime suspended state.
> >
> >I was expecting that you would be arming a remote wakeup for the
> >corresponding device that is attached to a genpd, when the use case
> >becomes enabled. Additionally, to allow the driver to consume the
> >wakeup irq, it needs to runtime resume its device (which means its PM
> >domain via genpd must be powered on as well, if it's not on already).
> >
> >Therefore, during the period of when the driver consumes the wakeup
> >irq, its device's PM domain remains powered on. When this is
> >completed, the driver allows its device to become runtime suspended
> >again. At some point before the device becomes runtime suspended, the
> >driver should set a new value of the "next wakeup" for its device.
> >
> Okay, that is clear. Thanks :)
>
> Yes, we would expect the device to set up its next_wakeup before doing
> runtime suspend and if the next wakeup is in the past, we would possibly
> not have runtime suspended the device. That would keep the domain ON and
> we would not come to this governor at all. And if we still are doing it,
> then the device has not set the next wakeup correctly.

Correct.

>
> What you are suggesting is that, we should not power down the domain in
> such a case. This would be a really hard problem to debug when a device
> leaves a stale wakeup behind and we no longer power off the domain
> because of that. Tracking that back to the device will be a monumental
> effort. Ignoring the next wakeup though might involve a power/perf
> penalty (no worse than today), but we would not have a difficult problem
> to solve.

Hmm, you have a good point!

Additionally, I guess it should be a rather rare situation, as in
principle the wakeup irq should have been triggered already.

That said, I am okay to stick with your suggested approach.

Although, please add a comment in the code to make it clear that the
behaviour is deliberate. Perhaps we should also clarify the function
description of dev_pm_genpd_set_next_wakeup() (in patch1) to make the
behaviour clearer for the user.
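
For example, the device loop in update_domain_next_wakeup() could carry
something like this (wording is only a suggestion):

        /* Find the earliest wakeup for all devices in the domain */
        list_for_each_entry(pdd, &genpd->dev_list, list_node) {
                next_wakeup = to_gpd_data(pdd)->next_wakeup;

                /*
                 * A next_wakeup in the past is deliberately treated as stale
                 * information and ignored: it neither blocks power-off nor
                 * contributes to the domain's wakeup, which is no worse than
                 * the behaviour without any wakeup hint at all.
                 */
                if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now))
                        if (ktime_before(next_wakeup, domain_wakeup))
                                domain_wakeup = next_wakeup;
        }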

>
> >>
> >> >Of course, I assume there are situations, where a driver actually
> >> >needs to do a clean up of its device's next wakeup, but that should be
> >> >less frequent and likely when a remote wakeup is disabled (for
> >> >whatever reason).
> >> >
> >> A common case would be that the driver does not know when the usecase is
> >> being turned off and therefore may not be able to set the next wakeup to
> >> max. If the stale value continues to exist then we may never power off
> >> the domain.
> >
> >Right.
> >
> >But, how do you know that the use case starts and what prevents us
> >from knowing that the use case has stopped?
> >
> Usually, the device is made aware of the usecase when it starts, but
> stopping the usecase might be something the device may or may not be
> aware of.

Okay, I see.

I guess this means the remote wakeup stays armed, but there are no
longer any wakeups being triggered.

>
> >Maybe if you add a user of the new APIs, this would help me to
> >understand better?
> >
> I have been asking some folks, but let's see.
>
> [...]
>
> >> >> >For example, there's no point doing the above, if the domain doesn't
> >> >> >specify residency values for its idle states.
> >> >> >
> >> >> We would still need to ensure that the next wakeup is after the
> >> >> power_off_latency, if specified.
> >> >
> >> >Good point! Although, I would rather avoid adding the overhead, unless
> >> >the residency is specified. Do you see a problem with this approach?
> >> >
> >> Hmmm, no strong objections. However, we still need to run through the
> >> states to make sure the residency is not set and have a variable track
> >> that.
> >
> >Right.
> >
> >The important part is that we can do that once and not for every call
> >to the governor.
> >
> >> The devices wouldn't know that and would still continue to set the
> >> next wakeup, unless we find a way to let them know we are not using this
> >> feature for the domain.
> >
> >Right.
> >
> >To allow the driver to know, we could respond with an error code from
> >the new dev_pm_genpd_set_performance_state() API (from patch1), in
> >case the genpd+governor doesn't support it.
> >
> It would an unnecessary work everytime a next wakeup is being set to
> iterate through the available states and figure out if the residency has
> been set or not. We could probably hold that result in a variable when
> we setup the genpd states. Expect the next_wake to be set from a
> critical path or an interrupt handler, so we have to be quick.

Yes, that's the idea I had in mind.

Maybe it's not feasible, let's see. However, for sure I am looking at
decreasing overhead, not increasing it. :-)

>
> >Would that be okay? Otherwise we will have to add a separate genpd
> >API, asking explicitly if the "next wakeup" feature is supported.
> >
> Would like to avoid that as much as possible.

Okay, good.

Kind regards
Uffe
Lina Iyer Nov. 19, 2020, 3:47 p.m. UTC | #8
On Thu, Nov 19 2020 at 02:57 -0700, Ulf Hansson wrote:
>On Mon, 16 Nov 2020 at 16:57, Lina Iyer <ilina@codeaurora.org> wrote:
>>
>> On Fri, Nov 13 2020 at 03:34 -0700, Ulf Hansson wrote:
>> >On Wed, 11 Nov 2020 at 17:51, Lina Iyer <ilina@codeaurora.org> wrote:
>> >>
>> >> On Tue, Nov 10 2020 at 03:02 -0700, Ulf Hansson wrote:
>> >> >On Mon, 9 Nov 2020 at 18:41, Lina Iyer <ilina@codeaurora.org> wrote:
>> >> >>
>> >> >> On Mon, Nov 09 2020 at 08:27 -0700, Ulf Hansson wrote:
>> >> >> >On Fri, 6 Nov 2020 at 17:48, Lina Iyer <ilina@codeaurora.org> wrote:
>> >> >> >>
>> >> >> [...]

>>
>> What you are suggesting is that, we should not power down the domain in
>> such a case. This would be a really hard problem to debug when a device
>> leaves a stale wakeup behind and we no longer power off the domain
>> because of that. Tracking that back to the device will be a monumental
>> effort. Ignoring the next wakeup though might involve a power/perf
>> penalty (no worse than today), but we would not have a difficult problem
>> to solve.
>
>Hmm, you have a good point!
>
>Additionally, I guess it should be a rather seldom situation, as in
>principle the wakeup irq should have been triggered already.
>
>That said, I am okay to stick with your suggested approach.
>
>Although, please add a comment in the code, to make it clear that the
>behaviour is deliberate. Perhaps we should also clarify the function
>description of dev_pm_genpd_set_next_wakeup() (in patch1) to make the
>behaviour more clear for the user.
>
Sure, will update with comments.
>>

Let's revisit the patch again after I repost, to make sure this is what
we want, at least for now.

Thanks for your review, Ulf.

--Lina
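
For reference, the deliberate-ignore behaviour agreed above could be called
out roughly like this, layered on the device loop from
update_domain_next_wakeup() in this patch (only a sketch; the actual comment
wording in the reposted series may differ):

	list_for_each_entry(pdd, &genpd->dev_list, list_node) {
		next_wakeup = to_gpd_data(pdd)->next_wakeup;
		/*
		 * A wakeup that is already in the past is deliberately
		 * ignored: a stale next_wakeup left behind by a device
		 * must not pin the domain on forever. Worst case, the
		 * domain is handled as if no next wakeup had been set,
		 * which is no worse than the behaviour today.
		 */
		if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now))
			if (ktime_before(next_wakeup, domain_wakeup))
				domain_wakeup = next_wakeup;
	}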
Lina Iyer Nov. 19, 2020, 5:46 p.m. UTC | #9
On Thu, Nov 19 2020 at 02:57 -0700, Ulf Hansson wrote:
>On Mon, 16 Nov 2020 at 16:57, Lina Iyer <ilina@codeaurora.org> wrote:
>>
>> On Fri, Nov 13 2020 at 03:34 -0700, Ulf Hansson wrote:
>> >On Wed, 11 Nov 2020 at 17:51, Lina Iyer <ilina@codeaurora.org> wrote:
>> >>
>> >> On Tue, Nov 10 2020 at 03:02 -0700, Ulf Hansson wrote:
>> >> >On Mon, 9 Nov 2020 at 18:41, Lina Iyer <ilina@codeaurora.org> wrote:
>> >> >>
>> >> >> On Mon, Nov 09 2020 at 08:27 -0700, Ulf Hansson wrote:
>> >> >> >On Fri, 6 Nov 2020 at 17:48, Lina Iyer <ilina@codeaurora.org> wrote:
>> [...]
>>
>> >> >> >For example, there's no point doing the above, if the domain doesn't
>> >> >> >specify residency values for its idle states.
>> >> >> >
>> >> >> We would still need to ensure that the next wakeup is after the
>> >> >> power_off_latency, if specified.
>> >> >
>> >> >Good point! Although, I would rather avoid adding the overhead, unless
>> >> >the residency is specified. Do you see a problem with this approach?
>> >> >
>> >> Hmmm, no strong objections. However, we still need to run through the
>> >> states to check whether a residency is set and have a variable track
>> >> that.
>> >
>> >Right.
>> >
>> >The important part is that we can do that once and not for every call
>> >to the governor.
>> >
>> >> The devices wouldn't know that and would still continue to set the
>> >> next wakeup, unless we find a way to let them know we are not using this
>> >> feature for the domain.
>> >
>> >Right.
>> >
>> >To allow the driver to know, we could respond with an error code from
>> >the new dev_pm_genpd_set_next_wakeup() API (from patch1), in
>> >case the genpd+governor doesn't support it.
>> >
>> It would be unnecessary work, every time a next wakeup is set, to
>> iterate through the available states and figure out whether the residency
>> has been set. We could probably hold that result in a variable when
>> we set up the genpd states. Expect the next_wakeup to be set from a
>> critical path or an interrupt handler, so we have to be quick.
>
>Yes, that's the idea I had in mind.
>
>Maybe it's not feasible, let's see. However, I am definitely looking at
>decreasing overhead, not increasing it. :-)
>
Wondering what you think about a genpd flag for this purpose? The
flag could be set when the genpd is initialized with idle states that have
a residency specified. In the governor, we could skip this path
completely if the flag is not set.
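
A minimal sketch of what that could look like (the field name and its
placement are made up for illustration and are not part of this series):

	/* In struct generic_pm_domain, set once while initializing the states: */
	bool has_residency;	/* any idle state specifies a residency */

	for (i = 0; i < genpd->state_count; i++) {
		if (genpd->states[i].residency_ns > 0) {
			genpd->has_residency = true;
			break;
		}
	}

	/* In default_power_down_ok(), the next-wakeup path is then skipped
	 * entirely for domains that cannot benefit from it: */
	if (genpd->has_residency)
		update_domain_next_wakeup(genpd, now);

	/* dev_pm_genpd_set_next_wakeup() could bail out early (or return an
	 * error) for such domains as well, so drivers stop providing hints. */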

--Lina

>>
>> >Would that be okay? Otherwise we will have to add a separate genpd
>> >API, asking explicitly if the "next wakeup" feature is supported.
>> >
>> Would like to avoid that as much as possible.
>
>Okay, good.
>
>Kind regards
>Uffe
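
For completeness, a user of the patch 1 API would look roughly like the
sketch below; the driver context, the frame_period_us parameter and the
my_dev_* name are made up for illustration:

	#include <linux/pm_domain.h>
	#include <linux/pm_runtime.h>

	static void my_dev_idle(struct device *dev, u32 frame_period_us)
	{
		/*
		 * The device knows when its next interrupt will fire (here,
		 * the next frame), so hand that hint to genpd before dropping
		 * the runtime PM reference and letting the domain power off.
		 */
		ktime_t next = ktime_add_us(ktime_get(), frame_period_us);

		dev_pm_genpd_set_next_wakeup(dev, next);
		pm_runtime_put(dev);
	}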
diff mbox series

Patch

diff --git a/drivers/base/power/domain_governor.c b/drivers/base/power/domain_governor.c
index 490ed7deb99a..fee050c6ea6a 100644
--- a/drivers/base/power/domain_governor.c
+++ b/drivers/base/power/domain_governor.c
@@ -117,6 +117,47 @@  static bool default_suspend_ok(struct device *dev)
 	return td->cached_suspend_ok;
 }
 
+static void update_domain_next_wakeup(struct generic_pm_domain *genpd, ktime_t now)
+{
+	ktime_t domain_wakeup = KTIME_MAX;
+	ktime_t next_wakeup;
+	struct pm_domain_data *pdd;
+	struct gpd_link *link;
+
+	/* Find the earliest wakeup for all devices in the domain */
+	list_for_each_entry(pdd, &genpd->dev_list, list_node) {
+		next_wakeup = to_gpd_data(pdd)->next_wakeup;
+		if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now))
+			if (ktime_before(next_wakeup, domain_wakeup))
+				domain_wakeup = next_wakeup;
+	}
+
+	/* Then find the earliest wakeup from all the child domains */
+	list_for_each_entry(link, &genpd->parent_links, parent_node) {
+		next_wakeup = link->child->next_wakeup;
+		if (next_wakeup != KTIME_MAX && !ktime_before(next_wakeup, now))
+			if (ktime_before(next_wakeup, domain_wakeup))
+				domain_wakeup = next_wakeup;
+	}
+
+	genpd->next_wakeup = domain_wakeup;
+}
+
+static bool next_wakeup_allows_state(struct generic_pm_domain *genpd,
+				     unsigned int state, ktime_t now)
+{
+	ktime_t domain_wakeup = genpd->next_wakeup;
+	s64 idle_time_ns, min_sleep_ns;
+
+	min_sleep_ns = genpd->states[state].power_off_latency_ns +
+		       genpd->states[state].power_on_latency_ns +
+		       genpd->states[state].residency_ns;
+
+	idle_time_ns = ktime_to_ns(ktime_sub(domain_wakeup, now));
+
+	return idle_time_ns >= min_sleep_ns;
+}
+
 static bool __default_power_down_ok(struct dev_pm_domain *pd,
 				     unsigned int state)
 {
@@ -209,8 +250,34 @@  static bool __default_power_down_ok(struct dev_pm_domain *pd,
 static bool default_power_down_ok(struct dev_pm_domain *pd)
 {
 	struct generic_pm_domain *genpd = pd_to_genpd(pd);
+	int state_idx = genpd->state_count - 1;
+	ktime_t now = ktime_get();
 	struct gpd_link *link;
 
+	/*
+	 * Find the next wakeup from the devices that can determine their own
+	 * wakeup, and from every child domain down the hierarchy, to find when
+	 * the domain would wake up. It is not worthwhile to sleep if the state's
+	 * residency cannot be met.
+	 */
+	update_domain_next_wakeup(genpd, now);
+	if (genpd->next_wakeup != KTIME_MAX) {
+		/* Let's find the deepest domain idle state that the devices prefer */
+		while (state_idx >= 0) {
+			if (next_wakeup_allows_state(genpd, state_idx, now)) {
+				genpd->max_off_time_changed = true;
+				break;
+			}
+			state_idx--;
+		}
+
+		if (state_idx < 0) {
+			state_idx = 0;
+			genpd->cached_power_down_ok = false;
+			goto done;
+		}
+	}
+
 	if (!genpd->max_off_time_changed) {
 		genpd->state_idx = genpd->cached_power_down_state_idx;
 		return genpd->cached_power_down_ok;
@@ -228,17 +295,21 @@  static bool default_power_down_ok(struct dev_pm_domain *pd)
 	genpd->max_off_time_ns = -1;
 	genpd->max_off_time_changed = false;
 	genpd->cached_power_down_ok = true;
-	genpd->state_idx = genpd->state_count - 1;
 
-	/* Find a state to power down to, starting from the deepest. */
-	while (!__default_power_down_ok(pd, genpd->state_idx)) {
-		if (genpd->state_idx == 0) {
+	/*
+	 * Find a state to power down to, starting from the state
+	 * determined by the next wakeup.
+	 */
+	while (!__default_power_down_ok(pd, state_idx)) {
+		if (state_idx == 0) {
 			genpd->cached_power_down_ok = false;
 			break;
 		}
-		genpd->state_idx--;
+		state_idx--;
 	}
 
+done:
+	genpd->state_idx = state_idx;
 	genpd->cached_power_down_state_idx = genpd->state_idx;
 	return genpd->cached_power_down_ok;
 }
diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h
index d7875bfde7f4..49982cd58bfd 100644
--- a/include/linux/pm_domain.h
+++ b/include/linux/pm_domain.h
@@ -130,6 +130,7 @@  struct generic_pm_domain {
 				     unsigned int state);
 	struct gpd_dev_ops dev_ops;
 	s64 max_off_time_ns;	/* Maximum allowed "suspended" time. */
+	ktime_t next_wakeup;	/* Maintained by the domain governor */
 	bool max_off_time_changed;
 	bool cached_power_down_ok;
 	bool cached_power_down_state_idx;