
[3/3,v3] cpufreq: governor: Replace timers with utilization update callbacks

Message ID 1486401.1RcnnVKZNP@vostro.rjw.lan (mailing list archive)
State Superseded, archived
Delegated to: Rafael Wysocki

Commit Message

Rafael J. Wysocki Feb. 5, 2016, 1:28 a.m. UTC
From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

Instead of using a per-CPU deferrable timer for queuing up governor
work items, register a utilization update callback that will be
invoked from the scheduler on utilization changes.

The sampling rate is still the same as what was used for the
deferrable timers and the added irq_work overhead should be offset by
the eliminated timers overhead, so in theory the functional impact of
this patch should not be significant.

Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---

The v3 addresses some review comments from Viresh and a couple of issues found
by me.  Changes from the previous version:
- Synchronize gov_cancel_work() with the (new) irq_work properly.
- Add a comment about the (new) memory barrier.
- Move sample_delay_ns to "shared" (struct cpu_common_dbs_info) so it is the
  same for all policy CPUs (without this modification we may end up taking
  samples too often).
- Drop some more unused code (in dbs_work_handler()).

Thanks,
Rafael

---
 drivers/cpufreq/cpufreq_conservative.c |    6 -
 drivers/cpufreq/cpufreq_governor.c     |  157 ++++++++++++++-------------------
 drivers/cpufreq/cpufreq_governor.h     |   15 ++-
 drivers/cpufreq/cpufreq_ondemand.c     |   25 ++---
 4 files changed, 95 insertions(+), 108 deletions(-)



Comments

Viresh Kumar Feb. 5, 2016, 6:50 a.m. UTC | #1
Will suck some more blood, sorry about that :)

On 05-02-16, 02:28, Rafael J. Wysocki wrote:
> The v3 addresses some review comments from Viresh and a couple of issues found
> by me.  Changes from the previous version:
> - Synchronize gov_cancel_work() with the (new) irq_work properly.
> - Add a comment about the (new) memory barrier.
> - Move sample_delay_ns to "shared" (struct cpu_common_dbs_info) so it is the

sample_delay_ns was already there, you moved last_sample_time instead :)

> @@ -139,7 +141,11 @@ struct cpu_common_dbs_info {
>  	struct mutex timer_mutex;
>  
>  	ktime_t time_stamp;
> +	u64 last_sample_time;
> +	s64 sample_delay_ns;
>  	atomic_t skip_work;
> +	struct irq_work irq_work;

Just for my understanding, why can't we schedule a normal work directly? Is it
because we are in the scheduler's hot path and queue_work() is too slow?

> Index: linux-pm/drivers/cpufreq/cpufreq_governor.c
> +void gov_set_update_util(struct cpu_common_dbs_info *shared,
> +			 unsigned int delay_us)
>  {
> +	struct cpufreq_policy *policy = shared->policy;
>  	struct dbs_data *dbs_data = policy->governor_data;
> -	struct cpu_dbs_info *cdbs;
>  	int cpu;
>  
> +	shared->sample_delay_ns = delay_us * NSEC_PER_USEC;
> +	shared->time_stamp = ktime_get();
> +	shared->last_sample_time = 0;

Calling this routine from update_sampling_rate() is still wrong. Because that
will also make last_sample_time = 0, which means that we will schedule the
irq-work on the next util update.

We surely didn't want that to happen, did we?

>  	for_each_cpu(cpu, policy->cpus) {
> -		cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
> -		cdbs->timer.expires = jiffies + delay;
> -		add_timer_on(&cdbs->timer, cpu);
> +		struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
> +
> +		cpufreq_set_update_util_data(cpu, &cdbs->update_util);
>  	}
>  }
> -EXPORT_SYMBOL_GPL(gov_add_timers);
> +EXPORT_SYMBOL_GPL(gov_set_update_util);

>  void gov_cancel_work(struct cpu_common_dbs_info *shared)
>  {
> -	/* Tell dbs_timer_handler() to skip queuing up work items. */
> +	/* Tell dbs_update_util_handler() to skip queuing up work items. */
>  	atomic_inc(&shared->skip_work);
>  	/*
> -	 * If dbs_timer_handler() is already running, it may not notice the
> -	 * incremented skip_work, so wait for it to complete to prevent its work
> -	 * item from being queued up after the cancel_work_sync() below.
> -	 */
> -	gov_cancel_timers(shared->policy);
> -	/*
> -	 * In case dbs_timer_handler() managed to run and spawn a work item
> -	 * before the timers have been canceled, wait for that work item to
> -	 * complete and then cancel all of the timers set up by it.  If
> -	 * dbs_timer_handler() runs again at that point, it will see the
> -	 * positive value of skip_work and won't spawn any more work items.
> +	 * If dbs_update_util_handler() is already running, it may not notice
> +	 * the incremented skip_work, so wait for it to complete to prevent its
> +	 * work item from being queued up after the cancel_work_sync() below.
>  	 */
> +	gov_clear_update_util(shared->policy);
> +	wait_for_completion(&shared->irq_work_done);

I may be wrong, but isn't running irq_work_sync() enough here instead?

>  	cancel_work_sync(&shared->work);
> -	gov_cancel_timers(shared->policy);
>  	atomic_set(&shared->skip_work, 0);
>  }
>  EXPORT_SYMBOL_GPL(gov_cancel_work);

> Index: linux-pm/drivers/cpufreq/cpufreq_ondemand.c
> @@ -264,7 +260,7 @@ static void update_sampling_rate(struct
>  		struct od_cpu_dbs_info_s *dbs_info;
>  		struct cpu_dbs_info *cdbs;
>  		struct cpu_common_dbs_info *shared;
> -		unsigned long next_sampling, appointed_at;
> +		ktime_t next_sampling, appointed_at;
>  
>  		dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
>  		cdbs = &dbs_info->cdbs;
> @@ -292,16 +288,19 @@ static void update_sampling_rate(struct
>  			continue;
>  
>  		/*
> -		 * Checking this for any CPU should be fine, timers for all of
> -		 * them are scheduled together.
> +		 * Checking this for any CPU sharing the policy should be fine,
> +		 * they are all scheduled to sample at the same time.
>  		 */
> -		next_sampling = jiffies + usecs_to_jiffies(new_rate);
> -		appointed_at = dbs_info->cdbs.timer.expires;
> +		next_sampling = ktime_add_us(ktime_get(), new_rate);
>  
> -		if (time_before(next_sampling, appointed_at)) {
> -			gov_cancel_work(shared);
> -			gov_add_timers(policy, usecs_to_jiffies(new_rate));
> +		mutex_lock(&shared->timer_mutex);
> +		appointed_at = ktime_add_ns(shared->time_stamp,
> +					    shared->sample_delay_ns);
> +		mutex_unlock(&shared->timer_mutex);
>  
> +		if (ktime_before(next_sampling, appointed_at)) {
> +			gov_cancel_work(shared);
> +			gov_set_update_util(shared, new_rate);

So, I don't think we need to call these heavy routines at all here. Just use the
above timer_mutex to update time_stamp and sample_delay_ns.

On top of that, that particular change might turn out to be a really big
bonus for us.  Why would we be taking the od_dbs_cdata.mutex in this routine
anymore?  We aren't removing/adding timers anymore, just updating
sample_delay_ns, and there shouldn't be any races.  Of course, you need to
use the same timer_mutex in the util handler as well around sample_delay_ns,
I believe.

And that will also kill the circular-dependency lockdep warning we have been
chasing badly :)

Or am I being overexcited here? :(
Rafael J. Wysocki Feb. 5, 2016, 1:36 p.m. UTC | #2
On Fri, Feb 5, 2016 at 7:50 AM, Viresh Kumar <viresh.kumar@linaro.org> wrote:
> Will suck some more blood, sorry about that :)
>
> On 05-02-16, 02:28, Rafael J. Wysocki wrote:
>> The v3 addresses some review comments from Viresh and a couple of issues found
>> by me.  Changes from the previous version:
>> - Synchronize gov_cancel_work() with the (new) irq_work properly.
>> - Add a comment about the (new) memory barrier.
>> - Move sample_delay_ns to "shared" (struct cpu_common_dbs_info) so it is the
>
> sample_delay_ns was already there, you moved last_sample_time instead :)
>
>> @@ -139,7 +141,11 @@ struct cpu_common_dbs_info {
>>       struct mutex timer_mutex;
>>
>>       ktime_t time_stamp;
>> +     u64 last_sample_time;
>> +     s64 sample_delay_ns;
>>       atomic_t skip_work;
>> +     struct irq_work irq_work;
>
> Just for my understanding, why can't we schedule a normal work directly? Is it
> because we are in the scheduler's hot path and queue_work() is too slow?

No, that's not the reason.

That path can't call wake_up_process() as it may be holding the locks
this would have attempted to grab.

That said, it is hot too.  For example, ktime_get() may be too slow to
be called from it on some systems.
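
For context, this is roughly the dispatch chain the patch sets up (a
simplified sketch only; the real handlers also do the skip_work and
sample-delay bookkeeping shown in the patch below):

/* Scheduler context: rq lock held, so only raw-spinlock-safe work. */
static void dbs_update_util_handler(struct update_util_data *data, u64 time,
				    unsigned long util, unsigned long max)
{
	struct cpu_dbs_info *cdbs = container_of(data, struct cpu_dbs_info,
						 update_util);

	/*
	 * Raising an irq_work is safe from here; schedule_work() is not,
	 * because it may end up calling wake_up_process().
	 */
	irq_work_queue_on(&cdbs->shared->irq_work, smp_processor_id());
}

/* Hard-irq context: queuing normal work is allowed from here. */
static void dbs_irq_work(struct irq_work *irq_work)
{
	struct cpu_common_dbs_info *shared =
		container_of(irq_work, struct cpu_common_dbs_info, irq_work);

	schedule_work(&shared->work);	/* sampling runs in process context */
}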

>> Index: linux-pm/drivers/cpufreq/cpufreq_governor.c
>> +void gov_set_update_util(struct cpu_common_dbs_info *shared,
>> +                      unsigned int delay_us)
>>  {
>> +     struct cpufreq_policy *policy = shared->policy;
>>       struct dbs_data *dbs_data = policy->governor_data;
>> -     struct cpu_dbs_info *cdbs;
>>       int cpu;
>>
>> +     shared->sample_delay_ns = delay_us * NSEC_PER_USEC;
>> +     shared->time_stamp = ktime_get();
>> +     shared->last_sample_time = 0;
>
> Calling this routine from update_sampling_rate() is still wrong. Because that
> will also make last_sample_time = 0, which means that we will schedule the
> irq-work on the next util update.

That isn't a problem, though.

This is the case when the new rate is smaller than the old one and we
want it to take effect immediately.  Taking the next sample
immediately in that case is not going to hurt anyone.

And this observation actually leads to an interesting realization
about update_sampling_rate() (see below).

> We surely didn't want that to happen, did we?

No, it isn't. :-)

>
>>       for_each_cpu(cpu, policy->cpus) {
>> -             cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
>> -             cdbs->timer.expires = jiffies + delay;
>> -             add_timer_on(&cdbs->timer, cpu);
>> +             struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
>> +
>> +             cpufreq_set_update_util_data(cpu, &cdbs->update_util);
>>       }
>>  }
>> -EXPORT_SYMBOL_GPL(gov_add_timers);
>> +EXPORT_SYMBOL_GPL(gov_set_update_util);
>
>>  void gov_cancel_work(struct cpu_common_dbs_info *shared)
>>  {
>> -     /* Tell dbs_timer_handler() to skip queuing up work items. */
>> +     /* Tell dbs_update_util_handler() to skip queuing up work items. */
>>       atomic_inc(&shared->skip_work);
>>       /*
>> -      * If dbs_timer_handler() is already running, it may not notice the
>> -      * incremented skip_work, so wait for it to complete to prevent its work
>> -      * item from being queued up after the cancel_work_sync() below.
>> -      */
>> -     gov_cancel_timers(shared->policy);
>> -     /*
>> -      * In case dbs_timer_handler() managed to run and spawn a work item
>> -      * before the timers have been canceled, wait for that work item to
>> -      * complete and then cancel all of the timers set up by it.  If
>> -      * dbs_timer_handler() runs again at that point, it will see the
>> -      * positive value of skip_work and won't spawn any more work items.
>> +      * If dbs_update_util_handler() is already running, it may not notice
>> +      * the incremented skip_work, so wait for it to complete to prevent its
>> +      * work item from being queued up after the cancel_work_sync() below.
>>        */
>> +     gov_clear_update_util(shared->policy);
>> +     wait_for_completion(&shared->irq_work_done);
>
> I may be wrong, but isn't running irq_work_sync() enough here instead?

Yes, it is.

I assumed that it would only check if the irq_work is running at the
moment for some reason, but that's not the case.
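
So gov_cancel_work() can become something like this (just a sketch of that
change, not the code from the posted patch):

void gov_cancel_work(struct cpu_common_dbs_info *shared)
{
	/* Tell dbs_update_util_handler() to skip queuing up work items. */
	atomic_inc(&shared->skip_work);
	gov_clear_update_util(shared->policy);
	/*
	 * irq_work_sync() waits for a pending or running irq_work to
	 * complete, so the completion-based handshake becomes unnecessary.
	 */
	irq_work_sync(&shared->irq_work);
	cancel_work_sync(&shared->work);
	atomic_set(&shared->skip_work, 0);
}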

>>       cancel_work_sync(&shared->work);
>> -     gov_cancel_timers(shared->policy);
>>       atomic_set(&shared->skip_work, 0);
>>  }
>>  EXPORT_SYMBOL_GPL(gov_cancel_work);
>
>> Index: linux-pm/drivers/cpufreq/cpufreq_ondemand.c
>> @@ -264,7 +260,7 @@ static void update_sampling_rate(struct
>>               struct od_cpu_dbs_info_s *dbs_info;
>>               struct cpu_dbs_info *cdbs;
>>               struct cpu_common_dbs_info *shared;
>> -             unsigned long next_sampling, appointed_at;
>> +             ktime_t next_sampling, appointed_at;
>>
>>               dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
>>               cdbs = &dbs_info->cdbs;
>> @@ -292,16 +288,19 @@ static void update_sampling_rate(struct
>>                       continue;
>>
>>               /*
>> -              * Checking this for any CPU should be fine, timers for all of
>> -              * them are scheduled together.
>> +              * Checking this for any CPU sharing the policy should be fine,
>> +              * they are all scheduled to sample at the same time.
>>                */
>> -             next_sampling = jiffies + usecs_to_jiffies(new_rate);
>> -             appointed_at = dbs_info->cdbs.timer.expires;
>> +             next_sampling = ktime_add_us(ktime_get(), new_rate);
>>
>> -             if (time_before(next_sampling, appointed_at)) {
>> -                     gov_cancel_work(shared);
>> -                     gov_add_timers(policy, usecs_to_jiffies(new_rate));
>> +             mutex_lock(&shared->timer_mutex);
>> +             appointed_at = ktime_add_ns(shared->time_stamp,
>> +                                         shared->sample_delay_ns);
>> +             mutex_unlock(&shared->timer_mutex);
>>
>> +             if (ktime_before(next_sampling, appointed_at)) {
>> +                     gov_cancel_work(shared);
>> +                     gov_set_update_util(shared, new_rate);
>
> So, I don't think we need to call these heavy routines at all here. Just use the
> above timer_mutex to update time_stamp and sample_delay_ns.

Well, the concern was that sample_delay_ns might not be updated
atomically on 32-bit architectures and that might be a problem for
dbs_update_util_handler().  However, this really isn't a problem,
because dbs_update_util_handler() only decides whether or not to take
a sample *this* time.  If it sees a semi-updated value of
sample_delay_ns, that value will be either too small or too big, so it
will either skip the sample unnecessarily or take it immediately, and
neither of these is a real problem.  It doesn't hurt to take the sample
immediately at this point (as stated earlier) and if it is skipped, it
will be taken on the next attempt when the update has been completed
(which would have happened anyway had the update been atomic).
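
To illustrate, the decision in dbs_update_util_handler() boils down to this
check (a sketch; take_sample() is a hypothetical stand-in for the real
irq_work dispatch):

	u64 delta_ns = time - shared->last_sample_time;

	/*
	 * A torn read of sample_delay_ns can only make this comparison
	 * see a value that is too small (the sample is taken immediately,
	 * which is harmless) or too big (the sample is skipped and taken
	 * on a later update, once the store has completed).
	 */
	if ((s64)delta_ns >= shared->sample_delay_ns)
		take_sample();	/* hypothetical stand-in */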

> On top of that, that particular change might turn out to be a really big
> bonus for us.  Why would we be taking the od_dbs_cdata.mutex in this routine
> anymore?  We aren't removing/adding timers anymore, just updating
> sample_delay_ns, and there shouldn't be any races.

That's a very good point.

The only concern is that this function walks the entire collection of
cpu_dbs_infos and that's potentially racing with anything that updates
those.

> Of course, you need to use the same timer_mutex in the util handler
> as well around sample_delay_ns, I believe.

That can't take any mutexes.  It might only take a raw spinlock if
really needed.

> And that will also kill the circular-dependency lockdep warning we have been
> chasing badly :)
>
> Or am I being overexcited here? :(

Not really.  I think you're on the right track.

Before we drop the lock from here, though, we need to audit the code
for any possible races carefully.

Anyway, I'll send an update of the $subject patch later today when I
have a chance to run it through some tests.

Thanks,
Rafael
Viresh Kumar Feb. 5, 2016, 2:47 p.m. UTC | #3
On Fri, Feb 5, 2016 at 7:06 PM, Rafael J. Wysocki <rafael@kernel.org> wrote:

>>> Index: linux-pm/drivers/cpufreq/cpufreq_ondemand.c
>>> @@ -264,7 +260,7 @@ static void update_sampling_rate(struct
>>>               struct od_cpu_dbs_info_s *dbs_info;
>>>               struct cpu_dbs_info *cdbs;
>>>               struct cpu_common_dbs_info *shared;
>>> -             unsigned long next_sampling, appointed_at;
>>> +             ktime_t next_sampling, appointed_at;
>>>
>>>               dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
>>>               cdbs = &dbs_info->cdbs;
>>> @@ -292,16 +288,19 @@ static void update_sampling_rate(struct
>>>                       continue;
>>>
>>>               /*
>>> -              * Checking this for any CPU should be fine, timers for all of
>>> -              * them are scheduled together.
>>> +              * Checking this for any CPU sharing the policy should be fine,
>>> +              * they are all scheduled to sample at the same time.
>>>                */
>>> -             next_sampling = jiffies + usecs_to_jiffies(new_rate);
>>> -             appointed_at = dbs_info->cdbs.timer.expires;
>>> +             next_sampling = ktime_add_us(ktime_get(), new_rate);
>>>
>>> -             if (time_before(next_sampling, appointed_at)) {
>>> -                     gov_cancel_work(shared);
>>> -                     gov_add_timers(policy, usecs_to_jiffies(new_rate));
>>> +             mutex_lock(&shared->timer_mutex);
>>> +             appointed_at = ktime_add_ns(shared->time_stamp,
>>> +                                         shared->sample_delay_ns);
>>> +             mutex_unlock(&shared->timer_mutex);
>>>
>>> +             if (ktime_before(next_sampling, appointed_at)) {
>>> +                     gov_cancel_work(shared);
>>> +                     gov_set_update_util(shared, new_rate);
>>
>> So, I don't think we need to call these heavy routines at all here. Just use the
>> above timer_mutex to update time_stamp and sample_delay_ns.
>
> Well, the concern was that sample_delay_ns might not be updated
> atomically on 32-bit architectures and that might be a problem for
> dbs_update_util_handler().  However, this really isn't a problem,
> because dbs_update_util_handler() only decides whether or not to take
> a sample *this* time.  If it sees a semi-updated value of
> sample_delay_ns, that value will be either too small or too big, so it
> will either skip the sample unnecessarily or take it immediately, and
> neither of these is a real problem.  It doesn't hurt to take the sample
> immediately at this point (as stated earlier) and if it is skipped, it
> will be taken on the next attempt when the update has been completed
> (which would have happened anyway had the update been atomic).

Okay, how about this then.

We do some computations here and based on them, conditionally want to
update sample_delay_ns. Because there is no penalty now, in terms of
removing/adding timers/wq, etc, why shouldn't we simply update the
sample_delay_ns every time without any checks? That would mean that the
change of sampling rate is effective immediately, what can be better than that?

Also, we should do the same in the conservative governor's
update-sampling-rate path as well.

Just kill all this complex, unwanted code and make life simple.
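
Roughly like this, I mean (a hypothetical sketch of the simplified ondemand
update_sampling_rate(); the exact locking would still need auditing):

static void update_sampling_rate(struct dbs_data *dbs_data,
				 unsigned int new_rate)
{
	int cpu;

	for_each_online_cpu(cpu) {
		struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
		struct cpu_common_dbs_info *shared = dbs_info->cdbs.shared;

		/* Skip CPUs whose policy is not governed by this dbs_data. */
		if (!shared || shared->policy->governor_data != dbs_data)
			continue;

		/* Unconditional update: takes effect at the next util update. */
		mutex_lock(&shared->timer_mutex);
		shared->sample_delay_ns = new_rate * NSEC_PER_USEC;
		mutex_unlock(&shared->timer_mutex);
	}
}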

> The only concern is that this function walks the entire collection of
> cpu_dbs_infos and that's potentially racing with anything that updates
> those.

Yeah, but fixing this race should be easier than the other crazy things we are
looking to do with kobjects :)

> That can't take any mutexes.  It might only take a raw spinlock if
> really needed.

That's doable as well :)

> Before we drop the lock from here, though, we need to audit the code
> for any possible races carefully.

I did a bit of that this morning, and there weren't any serious issues
as far as I could see :)

--
viresh
Rafael J. Wysocki Feb. 5, 2016, 11:01 p.m. UTC | #4
On Friday, February 05, 2016 02:36:54 PM Rafael J. Wysocki wrote:
> On Fri, Feb 5, 2016 at 7:50 AM, Viresh Kumar <viresh.kumar@linaro.org> wrote:
> > Will suck some more blood, sorry about that :)
> >
> > On 05-02-16, 02:28, Rafael J. Wysocki wrote:
> >> The v3 addresses some review comments from Viresh and a couple of issues found
> >> by me.  Changes from the previous version:
> >> - Synchronize gov_cancel_work() with the (new) irq_work properly.
> >> - Add a comment about the (new) memory barrier.
> >> - Move sample_delay_ns to "shared" (struct cpu_common_dbs_info) so it is the
> >
> > sample_delay_ns was already there, you moved last_sample_time instead :)
> >
> >> @@ -139,7 +141,11 @@ struct cpu_common_dbs_info {
> >>       struct mutex timer_mutex;
> >>
> >>       ktime_t time_stamp;
> >> +     u64 last_sample_time;
> >> +     s64 sample_delay_ns;
> >>       atomic_t skip_work;
> >> +     struct irq_work irq_work;
> >
> > Just for my understanding, why can't we schedule a normal work directly? Is it
> > because we are in the scheduler's hot path and queue_work() is too slow?
> 
> No, that's not the reason.
> 
> That path can't call wake_up_process() as it may be holding the locks
> this would have attempted to grab.

My answer wasn't really to the point here.

Among other things, the scheduler path cannot use normal spinlocks.  It can
only use raw spinlocks and this means no work queuing from it.

Thanks,
Rafael

Rafael J. Wysocki Feb. 5, 2016, 11:10 p.m. UTC | #5
On Friday, February 05, 2016 08:17:56 PM Viresh Kumar wrote:
> On Fri, Feb 5, 2016 at 7:06 PM, Rafael J. Wysocki <rafael@kernel.org> wrote:
> 
> >>> Index: linux-pm/drivers/cpufreq/cpufreq_ondemand.c
> >>> @@ -264,7 +260,7 @@ static void update_sampling_rate(struct
> >>>               struct od_cpu_dbs_info_s *dbs_info;
> >>>               struct cpu_dbs_info *cdbs;
> >>>               struct cpu_common_dbs_info *shared;
> >>> -             unsigned long next_sampling, appointed_at;
> >>> +             ktime_t next_sampling, appointed_at;
> >>>
> >>>               dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
> >>>               cdbs = &dbs_info->cdbs;
> >>> @@ -292,16 +288,19 @@ static void update_sampling_rate(struct
> >>>                       continue;
> >>>
> >>>               /*
> >>> -              * Checking this for any CPU should be fine, timers for all of
> >>> -              * them are scheduled together.
> >>> +              * Checking this for any CPU sharing the policy should be fine,
> >>> +              * they are all scheduled to sample at the same time.
> >>>                */
> >>> -             next_sampling = jiffies + usecs_to_jiffies(new_rate);
> >>> -             appointed_at = dbs_info->cdbs.timer.expires;
> >>> +             next_sampling = ktime_add_us(ktime_get(), new_rate);
> >>>
> >>> -             if (time_before(next_sampling, appointed_at)) {
> >>> -                     gov_cancel_work(shared);
> >>> -                     gov_add_timers(policy, usecs_to_jiffies(new_rate));
> >>> +             mutex_lock(&shared->timer_mutex);
> >>> +             appointed_at = ktime_add_ns(shared->time_stamp,
> >>> +                                         shared->sample_delay_ns);
> >>> +             mutex_unlock(&shared->timer_mutex);
> >>>
> >>> +             if (ktime_before(next_sampling, appointed_at)) {
> >>> +                     gov_cancel_work(shared);
> >>> +                     gov_set_update_util(shared, new_rate);
> >>
> >> So, I don't think we need to call these heavy routines at all here. Just use the
> >> above timer_mutex to update time_stamp and sample_delay_ns.
> >
> > Well, the concern was that sample_delay_ns might not be updated
> > atomically on 32-bit architectures and that might be a problem for
> > dbs_update_util_handler().  However, this really isn't a problem,
> > because dbs_update_util_handler() only decides whether or not to take
> > a sample *this* time.  If it sees a semi-updated value of
> > sample_delay_ns, that value will be either too small or too big, so it
> > will either skip the sample unnecessarily or take it immediately, and
> > neither of these is a real problem.  It doesn't hurt to take the sample
> > immediately at this point (as stated earlier) and if it is skipped, it
> > will be taken on the next attempt when the update has been completed
> > (which would have happened anyway had the update been atomic).
> 
> Okay, how about this then.
> 
> We do some computations here and based on them, conditionally want to
> update sample_delay_ns. Because there is no penalty now, in terms of
> removing/adding timers/wq, etc, why shouldn't we simply update the
> > sample_delay_ns every time without any checks? That would mean that the
> change of sampling rate is effective immediately, what can be better than that?

Yes, we can do that.

There is a small concern about updating in parallel with dbs_work_handler()
in which case we may overwrite the (hopefully already correct) sample_delay_ns
value that it has just written, but then it will be corrected next time we
take a sample, so it shouldn't be a big deal.

OK, I'll update the patch to do that.

> > Also, we should do the same in the conservative governor's
> > update-sampling-rate path as well.

Let's just not change the whole world in one patch, OK?

> Just kill all this complex, unwanted code and make life simple.
> 
> > The only concern is that this function walks the entire collection of
> > cpu_dbs_infos and that's potentially racing with anything that updates
> > those.
> 
> Yeah, but fixing this race should be easier than the other crazy things we are
> looking to do with kobjects :)

Yes, I agree.

> > That can't take any mutexes.  It might only take a raw spinlock if
> > really needed.
> 
> That's doable as well :)
> 
> > Before we drop the lock from here, though, we need to audit the code
> > for any possible races carefully.
> 
> I did a bit of that this morning, and there weren't any serious issues
> as far as I could see :)

The case I'm mostly concerned about is when update_sampling_rate() looks
at a CPU with a policy completely unrelated to the dbs_data it was called
for.  In that case the "shared" object may, in theory, just go away from
under it at any time while it is looking at that object.

The existing code has this problem AFAICS, and the reason we don't see
any breakage from it right now is that the granularity of cdata->mutex
is really coarse.

Thanks,
Rafael

Viresh Kumar Feb. 7, 2016, 9:10 a.m. UTC | #6
On 06-02-16, 00:10, Rafael J. Wysocki wrote:
> On Friday, February 05, 2016 08:17:56 PM Viresh Kumar wrote:
> > Okay, how about this then.
> > 
> > We do some computations here and based on them, conditionally want to
> > update sample_delay_ns. Because there is no penalty now, in terms of
> > removing/adding timers/wq, etc, why shouldn't we simply update the
> > > sample_delay_ns every time without any checks? That would mean that the
> > change of sampling rate is effective immediately, what can be better than that?
> 
> Yes, we can do that.
> 
> There is a small concern about updating in parallel with dbs_work_handler()
> in which case we may overwrite the (hopefully already correct) sample_delay_ns
> value that it has just written, but then it will be corrected next time we
> take a sample, so it shouldn't be a big deal.
> 
> OK, I'll update the patch to do that.

Great.

> > Also, we should do the same in the conservative governor's
> > update-sampling-rate path as well.
> 
> Let's just not change the whole world in one patch, OK?

Yeah, I wasn't asking to do it in the same patch, just that we should
do that as well.

> > I did bit of that this morning, and there weren't any serious issues as
> > as far as I could see :)
> 
> The case I'm mostly concerned about is when update_sampling_rate() looks
> at a CPU with a policy completely unrelated to the dbs_data it was called
> for.  In that case the "shared" object may, in theory, just go away from
> under it at any time while it is looking at that object.

Right, one way (of course we should try to find something better) is to move
that update to a separate work item, just as I did in my patch.

But I am quite sure we can get that fixed.
Rafael J. Wysocki Feb. 7, 2016, 2:43 p.m. UTC | #7
On Sunday, February 07, 2016 02:40:40 PM Viresh Kumar wrote:
> On 06-02-16, 00:10, Rafael J. Wysocki wrote:
> > On Friday, February 05, 2016 08:17:56 PM Viresh Kumar wrote:
> > > Okay, how about this then.
> > > 
> > > We do some computations here and based on them, conditionally want to
> > > update sample_delay_ns. Because there is no penalty now, in terms of
> > > removing/adding timers/wq, etc, why shouldn't we simply update the
> > > sample_delay_ns everytime without any checks? That would mean that the
> > > change of sampling rate is effective immediately, what can be better than that?
> > 
> > Yes, we can do that.
> > 
> > There is a small concern about updating in parallel with dbs_work_handler()
> > in which case we may overwrite the (hopefully already correct) sample_delay_ns
> > value that it has just written, but then it will be corrected next time we
> > take a sample, so it shouldn't be a big deal.
> > 
> > OK, I'll update the patch to do that.
> 
> Great.
> 
> > > Also, we should do the same from update-sampling-rate of conservative
> > > governor as well.
> > 
> > Let's just not change the whole world in one patch, OK?
> 
> Yeah, I wasn't asking to do it in the same patch, just that we should
> do that as well.
> 
> > > I did a bit of that this morning, and there weren't any serious issues
> > > as far as I could see :)
> > 
> > The case I'm mostly concerned about is when update_sampling_rate() looks
> > at a CPU with a policy completely unrelated to the dbs_data it was called
> > for.  In that case the "shared" object may, in theory, just go away from
> > under it at any time while it is looking at that object.
> 
> Right, one way (of course we should try to find something better) is to move
> that update to a separate work item, just as I did in my patch.

No, it isn't.  Trying to do it asynchronously will only lead to more
concurrency-related issues.

> But I am quite sure we can get that fixed.

What we need to do is make it possible for update_sampling_rate()
to safely walk all of the cpu_dbs_infos and look at what their
policy_dbs fields point to.

After my cleanup patches it does that under dbs_data_mutex and that works,
because this mutex is also held around *any* updates of struct cpu_dbs_info
anywhere.

However, the cpu_dbs_infos themselves are actually static, so they can be
accessed at any time.  It looks like, then, we may just need to add a lock to
each of them to ensure that the policy_dbs thing won't go away suddenly and
we may not need dbs_data_mutex in there any more.
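
Something along these lines, perhaps (a hypothetical sketch of the idea, not
code from any posted patch; the field and function names are assumptions):

struct cpu_dbs_info {
	/* ... existing fields ... */
	struct mutex shared_lock;	/* stabilizes ->shared below */
	struct cpu_common_dbs_info *shared;
};

static void od_update_cpu_delay(struct dbs_data *dbs_data, int cpu,
				unsigned int new_rate)
{
	struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);

	mutex_lock(&cdbs->shared_lock);
	/* ->shared cannot go away while shared_lock is held. */
	if (cdbs->shared && cdbs->shared->policy->governor_data == dbs_data)
		cdbs->shared->sample_delay_ns = new_rate * NSEC_PER_USEC;
	mutex_unlock(&cdbs->shared_lock);
}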

Thanks,
Rafael


Patch

Index: linux-pm/drivers/cpufreq/cpufreq_governor.h
===================================================================
--- linux-pm.orig/drivers/cpufreq/cpufreq_governor.h
+++ linux-pm/drivers/cpufreq/cpufreq_governor.h
@@ -18,6 +18,8 @@ 
 #define _CPUFREQ_GOVERNOR_H
 
 #include <linux/atomic.h>
+#include <linux/irq_work.h>
+#include <linux/completion.h>
 #include <linux/cpufreq.h>
 #include <linux/kernel_stat.h>
 #include <linux/module.h>
@@ -139,7 +141,11 @@  struct cpu_common_dbs_info {
 	struct mutex timer_mutex;
 
 	ktime_t time_stamp;
+	u64 last_sample_time;
+	s64 sample_delay_ns;
 	atomic_t skip_work;
+	struct irq_work irq_work;
+	struct completion irq_work_done;
 	struct work_struct work;
 };
 
@@ -155,7 +161,7 @@  struct cpu_dbs_info {
 	 * wake-up from idle.
 	 */
 	unsigned int prev_load;
-	struct timer_list timer;
+	struct update_util_data update_util;
 	struct cpu_common_dbs_info *shared;
 };
 
@@ -212,8 +218,7 @@  struct common_dbs_data {
 
 	struct cpu_dbs_info *(*get_cpu_cdbs)(int cpu);
 	void *(*get_cpu_dbs_info_s)(int cpu);
-	unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy,
-				      bool modify_all);
+	unsigned int (*gov_dbs_timer)(struct cpufreq_policy *policy);
 	void (*gov_check_cpu)(int cpu, unsigned int load);
 	int (*init)(struct dbs_data *dbs_data, bool notify);
 	void (*exit)(struct dbs_data *dbs_data, bool notify);
@@ -270,8 +275,8 @@  static ssize_t show_sampling_rate_min_go
 }
 
 extern struct mutex cpufreq_governor_lock;
-
-void gov_add_timers(struct cpufreq_policy *policy, unsigned int delay);
+void gov_set_update_util(struct cpu_common_dbs_info *shared,
+			 unsigned int delay_us);
 void gov_cancel_work(struct cpu_common_dbs_info *shared);
 void dbs_check_cpu(struct dbs_data *dbs_data, int cpu);
 int cpufreq_governor_dbs(struct cpufreq_policy *policy,
Index: linux-pm/drivers/cpufreq/cpufreq_governor.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/cpufreq_governor.c
+++ linux-pm/drivers/cpufreq/cpufreq_governor.c
@@ -128,10 +128,10 @@  void dbs_check_cpu(struct dbs_data *dbs_
 		 * dropped down. So we perform the copy only once, upon the
 		 * first wake-up from idle.)
 		 *
-		 * Detecting this situation is easy: the governor's deferrable
-		 * timer would not have fired during CPU-idle periods. Hence
-		 * an unusually large 'wall_time' (as compared to the sampling
-		 * rate) indicates this scenario.
+		 * Detecting this situation is easy: the governor's utilization
+		 * update handler would not have run during CPU-idle periods.
+		 * Hence, an unusually large 'wall_time' (as compared to the
+		 * sampling rate) indicates this scenario.
 		 *
 		 * prev_load can be zero in two cases and we must recalculate it
 		 * for both cases:
@@ -161,129 +161,116 @@  void dbs_check_cpu(struct dbs_data *dbs_
 }
 EXPORT_SYMBOL_GPL(dbs_check_cpu);
 
-void gov_add_timers(struct cpufreq_policy *policy, unsigned int delay)
+void gov_set_update_util(struct cpu_common_dbs_info *shared,
+			 unsigned int delay_us)
 {
+	struct cpufreq_policy *policy = shared->policy;
 	struct dbs_data *dbs_data = policy->governor_data;
-	struct cpu_dbs_info *cdbs;
 	int cpu;
 
+	shared->sample_delay_ns = delay_us * NSEC_PER_USEC;
+	shared->time_stamp = ktime_get();
+	shared->last_sample_time = 0;
+
 	for_each_cpu(cpu, policy->cpus) {
-		cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
-		cdbs->timer.expires = jiffies + delay;
-		add_timer_on(&cdbs->timer, cpu);
+		struct cpu_dbs_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu);
+
+		cpufreq_set_update_util_data(cpu, &cdbs->update_util);
 	}
 }
-EXPORT_SYMBOL_GPL(gov_add_timers);
+EXPORT_SYMBOL_GPL(gov_set_update_util);
 
-static inline void gov_cancel_timers(struct cpufreq_policy *policy)
+static inline void gov_clear_update_util(struct cpufreq_policy *policy)
 {
-	struct dbs_data *dbs_data = policy->governor_data;
-	struct cpu_dbs_info *cdbs;
 	int i;
 
-	for_each_cpu(i, policy->cpus) {
-		cdbs = dbs_data->cdata->get_cpu_cdbs(i);
-		del_timer_sync(&cdbs->timer);
-	}
+	for_each_cpu(i, policy->cpus)
+		cpufreq_set_update_util_data(i, NULL);
+
+	synchronize_rcu();
 }
 
 void gov_cancel_work(struct cpu_common_dbs_info *shared)
 {
-	/* Tell dbs_timer_handler() to skip queuing up work items. */
+	/* Tell dbs_update_util_handler() to skip queuing up work items. */
 	atomic_inc(&shared->skip_work);
 	/*
-	 * If dbs_timer_handler() is already running, it may not notice the
-	 * incremented skip_work, so wait for it to complete to prevent its work
-	 * item from being queued up after the cancel_work_sync() below.
-	 */
-	gov_cancel_timers(shared->policy);
-	/*
-	 * In case dbs_timer_handler() managed to run and spawn a work item
-	 * before the timers have been canceled, wait for that work item to
-	 * complete and then cancel all of the timers set up by it.  If
-	 * dbs_timer_handler() runs again at that point, it will see the
-	 * positive value of skip_work and won't spawn any more work items.
+	 * If dbs_update_util_handler() is already running, it may not notice
+	 * the incremented skip_work, so wait for it to complete to prevent its
+	 * work item from being queued up after the cancel_work_sync() below.
 	 */
+	gov_clear_update_util(shared->policy);
+	wait_for_completion(&shared->irq_work_done);
 	cancel_work_sync(&shared->work);
-	gov_cancel_timers(shared->policy);
 	atomic_set(&shared->skip_work, 0);
 }
 EXPORT_SYMBOL_GPL(gov_cancel_work);
 
-/* Will return if we need to evaluate cpu load again or not */
-static bool need_load_eval(struct cpu_common_dbs_info *shared,
-			   unsigned int sampling_rate)
-{
-	if (policy_is_shared(shared->policy)) {
-		ktime_t time_now = ktime_get();
-		s64 delta_us = ktime_us_delta(time_now, shared->time_stamp);
-
-		/* Do nothing if we recently have sampled */
-		if (delta_us < (s64)(sampling_rate / 2))
-			return false;
-		else
-			shared->time_stamp = time_now;
-	}
-
-	return true;
-}
-
 static void dbs_work_handler(struct work_struct *work)
 {
 	struct cpu_common_dbs_info *shared = container_of(work, struct
 					cpu_common_dbs_info, work);
 	struct cpufreq_policy *policy;
 	struct dbs_data *dbs_data;
-	unsigned int sampling_rate, delay;
-	bool eval_load;
+	unsigned int delay;
 
 	policy = shared->policy;
 	dbs_data = policy->governor_data;
 
-	/* Kill all timers */
-	gov_cancel_timers(policy);
-
-	if (dbs_data->cdata->governor == GOV_CONSERVATIVE) {
-		struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
-
-		sampling_rate = cs_tuners->sampling_rate;
-	} else {
-		struct od_dbs_tuners *od_tuners = dbs_data->tuners;
-
-		sampling_rate = od_tuners->sampling_rate;
-	}
-
-	eval_load = need_load_eval(shared, sampling_rate);
-
 	/*
-	 * Make sure cpufreq_governor_limits() isn't evaluating load in
+	 * Make sure cpufreq_governor_limits() isn't evaluating load or the
+	 * ondemand governor isn't reading the time stamp and sampling rate in
 	 * parallel.
 	 */
 	mutex_lock(&shared->timer_mutex);
-	delay = dbs_data->cdata->gov_dbs_timer(policy, eval_load);
+	delay = dbs_data->cdata->gov_dbs_timer(policy);
+	shared->sample_delay_ns = jiffies_to_nsecs(delay);
+	shared->time_stamp = ktime_get();
 	mutex_unlock(&shared->timer_mutex);
 
+	/*
+	 * If the atomic operation below is reordered with respect to the
+	 * sample delay modification, the utilization update handler may end
+	 * up using a stale sample delay value.
+	 */
+	smp_mb__before_atomic();
 	atomic_dec(&shared->skip_work);
+}
 
-	gov_add_timers(policy, delay);
+static void dbs_irq_work(struct irq_work *irq_work)
+{
+	struct cpu_common_dbs_info *shared;
+
+	shared = container_of(irq_work, struct cpu_common_dbs_info, irq_work);
+	schedule_work(&shared->work);
+	complete(&shared->irq_work_done);
 }
 
-static void dbs_timer_handler(unsigned long data)
+static void dbs_update_util_handler(struct update_util_data *data, u64 time,
+				    unsigned long util, unsigned long max)
 {
-	struct cpu_dbs_info *cdbs = (struct cpu_dbs_info *)data;
+	struct cpu_dbs_info *cdbs = container_of(data, struct cpu_dbs_info, update_util);
 	struct cpu_common_dbs_info *shared = cdbs->shared;
 
 	/*
-	 * Timer handler may not be allowed to queue the work at the moment,
-	 * because:
-	 * - Another timer handler has done that
-	 * - We are stopping the governor
-	 * - Or we are updating the sampling rate of the ondemand governor
+	 * The work may not be allowed to be queued up right now.
+	 * Possible reasons:
+	 * - Work has already been queued up or is in progress.
+	 * - The governor is being stopped.
+	 * - It is too early (too little time from the previous sample).
 	 */
-	if (atomic_inc_return(&shared->skip_work) > 1)
-		atomic_dec(&shared->skip_work);
-	else
-		queue_work(system_wq, &shared->work);
+	if (atomic_inc_return(&shared->skip_work) == 1) {
+		u64 delta_ns;
+
+		delta_ns = time - shared->last_sample_time;
+		if ((s64)delta_ns >= shared->sample_delay_ns) {
+			shared->last_sample_time = time;
+			reinit_completion(&shared->irq_work_done);
+			irq_work_queue_on(&shared->irq_work, smp_processor_id());
+			return;
+		}
+	}
+	atomic_dec(&shared->skip_work);
 }
 
 static void set_sampling_rate(struct dbs_data *dbs_data,
@@ -467,9 +454,6 @@  static int cpufreq_governor_start(struct
 		io_busy = od_tuners->io_is_busy;
 	}
 
-	shared->policy = policy;
-	shared->time_stamp = ktime_get();
-
 	for_each_cpu(j, policy->cpus) {
 		struct cpu_dbs_info *j_cdbs = cdata->get_cpu_cdbs(j);
 		unsigned int prev_load;
@@ -485,10 +469,11 @@  static int cpufreq_governor_start(struct
 		if (ignore_nice)
 			j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
 
-		__setup_timer(&j_cdbs->timer, dbs_timer_handler,
-			      (unsigned long)j_cdbs,
-			      TIMER_DEFERRABLE | TIMER_IRQSAFE);
+		j_cdbs->update_util.func = dbs_update_util_handler;
 	}
+	shared->policy = policy;
+	init_irq_work(&shared->irq_work, dbs_irq_work);
+	init_completion(&shared->irq_work_done);
 
 	if (cdata->governor == GOV_CONSERVATIVE) {
 		struct cs_cpu_dbs_info_s *cs_dbs_info =
@@ -505,7 +490,7 @@  static int cpufreq_governor_start(struct
 		od_ops->powersave_bias_init_cpu(cpu);
 	}
 
-	gov_add_timers(policy, delay_for_sampling_rate(sampling_rate));
+	gov_set_update_util(shared, sampling_rate);
 	return 0;
 }
 
Index: linux-pm/drivers/cpufreq/cpufreq_ondemand.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/cpufreq_ondemand.c
+++ linux-pm/drivers/cpufreq/cpufreq_ondemand.c
@@ -191,7 +191,7 @@  static void od_check_cpu(int cpu, unsign
 	}
 }
 
-static unsigned int od_dbs_timer(struct cpufreq_policy *policy, bool modify_all)
+static unsigned int od_dbs_timer(struct cpufreq_policy *policy)
 {
 	struct dbs_data *dbs_data = policy->governor_data;
 	unsigned int cpu = policy->cpu;
@@ -200,9 +200,6 @@  static unsigned int od_dbs_timer(struct
 	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
 	int delay = 0, sample_type = dbs_info->sample_type;
 
-	if (!modify_all)
-		goto max_delay;
-
 	/* Common NORMAL_SAMPLE setup */
 	dbs_info->sample_type = OD_NORMAL_SAMPLE;
 	if (sample_type == OD_SUB_SAMPLE) {
@@ -218,7 +215,6 @@  static unsigned int od_dbs_timer(struct
 		}
 	}
 
-max_delay:
 	if (!delay)
 		delay = delay_for_sampling_rate(od_tuners->sampling_rate
 				* dbs_info->rate_mult);
@@ -264,7 +260,7 @@  static void update_sampling_rate(struct
 		struct od_cpu_dbs_info_s *dbs_info;
 		struct cpu_dbs_info *cdbs;
 		struct cpu_common_dbs_info *shared;
-		unsigned long next_sampling, appointed_at;
+		ktime_t next_sampling, appointed_at;
 
 		dbs_info = &per_cpu(od_cpu_dbs_info, cpu);
 		cdbs = &dbs_info->cdbs;
@@ -292,16 +288,19 @@  static void update_sampling_rate(struct
 			continue;
 
 		/*
-		 * Checking this for any CPU should be fine, timers for all of
-		 * them are scheduled together.
+		 * Checking this for any CPU sharing the policy should be fine,
+		 * they are all scheduled to sample at the same time.
 		 */
-		next_sampling = jiffies + usecs_to_jiffies(new_rate);
-		appointed_at = dbs_info->cdbs.timer.expires;
+		next_sampling = ktime_add_us(ktime_get(), new_rate);
 
-		if (time_before(next_sampling, appointed_at)) {
-			gov_cancel_work(shared);
-			gov_add_timers(policy, usecs_to_jiffies(new_rate));
+		mutex_lock(&shared->timer_mutex);
+		appointed_at = ktime_add_ns(shared->time_stamp,
+					    shared->sample_delay_ns);
+		mutex_unlock(&shared->timer_mutex);
 
+		if (ktime_before(next_sampling, appointed_at)) {
+			gov_cancel_work(shared);
+			gov_set_update_util(shared, new_rate);
 		}
 	}
 
Index: linux-pm/drivers/cpufreq/cpufreq_conservative.c
===================================================================
--- linux-pm.orig/drivers/cpufreq/cpufreq_conservative.c
+++ linux-pm/drivers/cpufreq/cpufreq_conservative.c
@@ -115,14 +115,12 @@  static void cs_check_cpu(int cpu, unsign
 	}
 }
 
-static unsigned int cs_dbs_timer(struct cpufreq_policy *policy, bool modify_all)
+static unsigned int cs_dbs_timer(struct cpufreq_policy *policy)
 {
 	struct dbs_data *dbs_data = policy->governor_data;
 	struct cs_dbs_tuners *cs_tuners = dbs_data->tuners;
 
-	if (modify_all)
-		dbs_check_cpu(dbs_data, policy->cpu);
-
+	dbs_check_cpu(dbs_data, policy->cpu);
 	return delay_for_sampling_rate(cs_tuners->sampling_rate);
 }