diff mbox

[V2,0/7] cpufreq: governors: Fix ABBA lockups

Message ID 20160204110907.GE3469@vireshk (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Viresh Kumar Feb. 4, 2016, 11:09 a.m. UTC
On 04-02-16, 00:50, Rafael J. Wysocki wrote:
> This is exactly right.  We've avoided one deadlock only to trip into
> another one.
> 
> This happens because update_sampling_rate() acquires
> od_dbs_cdata.mutex which is held around cpufreq_governor_exit() by
> cpufreq_governor_dbs().
> 
> Worse yet, a deadlock can still happen without (the new)
> dbs_data->mutex, just between s_active and od_dbs_cdata.mutex if
> update_sampling_rate() runs in parallel with
> cpufreq_governor_dbs()->cpufreq_governor_exit() and the latter wins
> the race.
> 
> It looks like we need to drop the governor mutex before putting the
> kobject in cpufreq_governor_exit().

I have tried to explore all possible ways of fixing this, and every
other way looked to be racy in some way.

Does anyone else have a better idea (untested):

-------------------------8<-------------------------

Subject: [PATCH] cpufreq: ondemand: Shoot update_sampling_rate with a separate
 work

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
---
 drivers/cpufreq/cpufreq_governor.h |  2 ++
 drivers/cpufreq/cpufreq_ondemand.c | 39 +++++++++++++++++++++++++++++---------
 2 files changed, 32 insertions(+), 9 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Saravana Kannan Feb. 4, 2016, 5:43 p.m. UTC | #1
On 02/04/2016 03:09 AM, Viresh Kumar wrote:
> On 04-02-16, 00:50, Rafael J. Wysocki wrote:
>> This is exactly right.  We've avoided one deadlock only to trip into
>> another one.
>>
>> This happens because update_sampling_rate() acquires
>> od_dbs_cdata.mutex which is held around cpufreq_governor_exit() by
>> cpufreq_governor_dbs().
>>
>> Worse yet, a deadlock can still happen without (the new)
>> dbs_data->mutex, just between s_active and od_dbs_cdata.mutex if
>> update_sampling_rate() runs in parallel with
>> cpufreq_governor_dbs()->cpufreq_governor_exit() and the latter wins
>> the race.
>>
>> It looks like we need to drop the governor mutex before putting the
>> kobject in cpufreq_governor_exit().
>
> I have tried to explore all possible ways of fixing this, and every
> other way looked to be racy in some way.
>
> Does anyone else have a better idea (untested):
>
> -------------------------8<-------------------------
>
> Subject: [PATCH] cpufreq: ondemand: Shoot update_sampling_rate with a separate
>   work
>
> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
> ---
>   drivers/cpufreq/cpufreq_governor.h |  2 ++
>   drivers/cpufreq/cpufreq_ondemand.c | 39 +++++++++++++++++++++++++++++---------
>   2 files changed, 32 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
> index 7bed63e14e7d..97e604356b20 100644
> --- a/drivers/cpufreq/cpufreq_governor.h
> +++ b/drivers/cpufreq/cpufreq_governor.h
> @@ -141,6 +141,8 @@ struct od_dbs_tuners {
>   	unsigned int powersave_bias;
>   	unsigned int io_is_busy;
>   	unsigned int min_sampling_rate;
> +	struct work_struct work;
> +	struct dbs_data *dbs_data;
>   };
>
>   struct cs_dbs_tuners {
> diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
> index 82ed490f7de0..93ad7a226aee 100644
> --- a/drivers/cpufreq/cpufreq_ondemand.c
> +++ b/drivers/cpufreq/cpufreq_ondemand.c
> @@ -242,20 +242,27 @@ static struct common_dbs_data od_dbs_cdata;
>    * reducing the sampling rate, we need to make the new value effective
>    * immediately.
>    */
> -static void update_sampling_rate(struct dbs_data *dbs_data,
> -		unsigned int new_rate)
> +static void update_sampling_rate(struct work_struct *work)
>   {
> -	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
> +	struct od_dbs_tuners *od_tuners = container_of(work, struct
> +						       od_dbs_tuners, work);
> +	unsigned int new_rate = od_tuners->sampling_rate;
> +	struct dbs_data *dbs_data = od_tuners->dbs_data;
>   	struct cpumask cpumask;
>   	int cpu;
>
> -	od_tuners->sampling_rate = new_rate = max(new_rate,
> -			od_tuners->min_sampling_rate);
> -
>   	/*
>   	 * Lock governor so that governor start/stop can't execute in parallel.
> +	 *
> +	 * We can't do a regular mutex_lock() here, as that may deadlock against
> +	 * another thread performing CPUFREQ_GOV_POLICY_EXIT event on the
> +	 * governor, which might have already taken od_dbs_cdata.mutex and is
> +	 * waiting for this work to finish.
>   	 */
> -	mutex_lock(&od_dbs_cdata.mutex);
> +	if (!mutex_trylock(&od_dbs_cdata.mutex)) {
> +		queue_work(system_wq, &od_tuners->work);
> +		return;
> +	}
>
>   	cpumask_copy(&cpumask, cpu_online_mask);
>
> @@ -311,13 +318,22 @@ static void update_sampling_rate(struct dbs_data *dbs_data,
>   static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf,
>   		size_t count)
>   {
> +	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
>   	unsigned int input;
>   	int ret;
>   	ret = sscanf(buf, "%u", &input);
>   	if (ret != 1)
>   		return -EINVAL;
>
> -	update_sampling_rate(dbs_data, input);
> +	od_tuners->sampling_rate = max(input, od_tuners->min_sampling_rate);
> +
> +	/*
> +	 * update_sampling_rate() requires to hold od_dbs_cdata.mutex, but we
> +	 * can't take that from this thread, otherwise it results in ABBA
> +	 * lockdep between s_active and od_dbs_cdata.mutex locks.
> +	 */
> +	queue_work(system_wq, &od_tuners->work);
> +
>   	return count;
>   }
>
> @@ -501,6 +517,8 @@ static int od_init(struct dbs_data *dbs_data, bool notify)
>   	tuners->ignore_nice_load = 0;
>   	tuners->powersave_bias = default_powersave_bias;
>   	tuners->io_is_busy = should_io_be_busy();
> +	INIT_WORK(&tuners->work, update_sampling_rate);
> +	tuners->dbs_data = dbs_data;
>
>   	dbs_data->tuners = tuners;
>   	return 0;
> @@ -508,7 +526,10 @@ static int od_init(struct dbs_data *dbs_data, bool notify)
>
>   static void od_exit(struct dbs_data *dbs_data, bool notify)
>   {
> -	kfree(dbs_data->tuners);
> +	struct od_dbs_tuners *tuners = dbs_data->tuners;
> +
> +	cancel_work_sync(&tuners->work);
> +	kfree(tuners);
>   }
>
>   define_get_cpu_dbs_routines(od_cpu_dbs_info);
>

No no no no! Let's not open up this can of worms of queuing up the work 
to handle a write to a sysfs file. It *MIGHT* work for this specific 
tunable (I haven't bothered to analyze), but this makes it impossible to 
return a useful/proper error value.

-Saravana
Saravana Kannan Feb. 4, 2016, 5:44 p.m. UTC | #2
On 02/04/2016 09:43 AM, Saravana Kannan wrote:
> On 02/04/2016 03:09 AM, Viresh Kumar wrote:
>> On 04-02-16, 00:50, Rafael J. Wysocki wrote:
>>> This is exactly right.  We've avoided one deadlock only to trip into
>>> another one.
>>>
>>> This happens because update_sampling_rate() acquires
>>> od_dbs_cdata.mutex which is held around cpufreq_governor_exit() by
>>> cpufreq_governor_dbs().
>>>
>>> Worse yet, a deadlock can still happen without (the new)
>>> dbs_data->mutex, just between s_active and od_dbs_cdata.mutex if
>>> update_sampling_rate() runs in parallel with
>>> cpufreq_governor_dbs()->cpufreq_governor_exit() and the latter wins
>>> the race.
>>>
>>> It looks like we need to drop the governor mutex before putting the
>>> kobject in cpufreq_governor_exit().
>>
>> I have tried to explore all possible ways of fixing this, and every
>> other way looked to be racy in some way.
>>
>> Does anyone else have a better idea (untested):
>>
>> -------------------------8<-------------------------
>>
>> Subject: [PATCH] cpufreq: ondemand: Shoot update_sampling_rate with a
>> separate
>>   work
>>
>> Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
>> ---
>>   drivers/cpufreq/cpufreq_governor.h |  2 ++
>>   drivers/cpufreq/cpufreq_ondemand.c | 39
>> +++++++++++++++++++++++++++++---------
>>   2 files changed, 32 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/cpufreq/cpufreq_governor.h
>> b/drivers/cpufreq/cpufreq_governor.h
>> index 7bed63e14e7d..97e604356b20 100644
>> --- a/drivers/cpufreq/cpufreq_governor.h
>> +++ b/drivers/cpufreq/cpufreq_governor.h
>> @@ -141,6 +141,8 @@ struct od_dbs_tuners {
>>       unsigned int powersave_bias;
>>       unsigned int io_is_busy;
>>       unsigned int min_sampling_rate;
>> +    struct work_struct work;
>> +    struct dbs_data *dbs_data;
>>   };
>>
>>   struct cs_dbs_tuners {
>> diff --git a/drivers/cpufreq/cpufreq_ondemand.c
>> b/drivers/cpufreq/cpufreq_ondemand.c
>> index 82ed490f7de0..93ad7a226aee 100644
>> --- a/drivers/cpufreq/cpufreq_ondemand.c
>> +++ b/drivers/cpufreq/cpufreq_ondemand.c
>> @@ -242,20 +242,27 @@ static struct common_dbs_data od_dbs_cdata;
>>    * reducing the sampling rate, we need to make the new value effective
>>    * immediately.
>>    */
>> -static void update_sampling_rate(struct dbs_data *dbs_data,
>> -        unsigned int new_rate)
>> +static void update_sampling_rate(struct work_struct *work)
>>   {
>> -    struct od_dbs_tuners *od_tuners = dbs_data->tuners;
>> +    struct od_dbs_tuners *od_tuners = container_of(work, struct
>> +                               od_dbs_tuners, work);
>> +    unsigned int new_rate = od_tuners->sampling_rate;
>> +    struct dbs_data *dbs_data = od_tuners->dbs_data;
>>       struct cpumask cpumask;
>>       int cpu;
>>
>> -    od_tuners->sampling_rate = new_rate = max(new_rate,
>> -            od_tuners->min_sampling_rate);
>> -
>>       /*
>>        * Lock governor so that governor start/stop can't execute in
>> parallel.
>> +     *
>> +     * We can't do a regular mutex_lock() here, as that may deadlock
>> against
>> +     * another thread performing CPUFREQ_GOV_POLICY_EXIT event on the
>> +     * governor, which might have already taken od_dbs_cdata.mutex
>> and is
>> +     * waiting for this work to finish.
>>        */
>> -    mutex_lock(&od_dbs_cdata.mutex);
>> +    if (!mutex_trylock(&od_dbs_cdata.mutex)) {
>> +        queue_work(system_wq, &od_tuners->work);
>> +        return;
>> +    }
>>
>>       cpumask_copy(&cpumask, cpu_online_mask);
>>
>> @@ -311,13 +318,22 @@ static void update_sampling_rate(struct dbs_data
>> *dbs_data,
>>   static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const
>> char *buf,
>>           size_t count)
>>   {
>> +    struct od_dbs_tuners *od_tuners = dbs_data->tuners;
>>       unsigned int input;
>>       int ret;
>>       ret = sscanf(buf, "%u", &input);
>>       if (ret != 1)
>>           return -EINVAL;
>>
>> -    update_sampling_rate(dbs_data, input);
>> +    od_tuners->sampling_rate = max(input, od_tuners->min_sampling_rate);
>> +
>> +    /*
>> +     * update_sampling_rate() requires to hold od_dbs_cdata.mutex,
>> but we
>> +     * can't take that from this thread, otherwise it results in ABBA
>> +     * lockdep between s_active and od_dbs_cdata.mutex locks.
>> +     */
>> +    queue_work(system_wq, &od_tuners->work);
>> +
>>       return count;
>>   }
>>
>> @@ -501,6 +517,8 @@ static int od_init(struct dbs_data *dbs_data, bool
>> notify)
>>       tuners->ignore_nice_load = 0;
>>       tuners->powersave_bias = default_powersave_bias;
>>       tuners->io_is_busy = should_io_be_busy();
>> +    INIT_WORK(&tuners->work, update_sampling_rate);
>> +    tuners->dbs_data = dbs_data;
>>
>>       dbs_data->tuners = tuners;
>>       return 0;
>> @@ -508,7 +526,10 @@ static int od_init(struct dbs_data *dbs_data,
>> bool notify)
>>
>>   static void od_exit(struct dbs_data *dbs_data, bool notify)
>>   {
>> -    kfree(dbs_data->tuners);
>> +    struct od_dbs_tuners *tuners = dbs_data->tuners;
>> +
>> +    cancel_work_sync(&tuners->work);
>> +    kfree(tuners);
>>   }
>>
>>   define_get_cpu_dbs_routines(od_cpu_dbs_info);
>>
>
> No no no no! Let's not open up this can of worms of queuing up the work
> to handle a write to a sysfs file. It *MIGHT* work for this specific
> tunable (I haven't bothered to analyze), but this makes it impossible to
> return a useful/proper error value.

Sent too soon. Not only that, but it can also cause the writes to the 
sysfs files to get processed in a different order and I don't know what 
other issues/races THAT will open up.

-Saravana
Rafael J. Wysocki Feb. 4, 2016, 6:18 p.m. UTC | #3
On Thu, Feb 4, 2016 at 6:44 PM, Saravana Kannan <skannan@codeaurora.org> wrote:
> On 02/04/2016 09:43 AM, Saravana Kannan wrote:
>>
>> On 02/04/2016 03:09 AM, Viresh Kumar wrote:
>>>
>>> On 04-02-16, 00:50, Rafael J. Wysocki wrote:
>>>>
>>>> This is exactly right.  We've avoided one deadlock only to trip into
>>>> another one.
>>>>
>>>> This happens because update_sampling_rate() acquires
>>>> od_dbs_cdata.mutex which is held around cpufreq_governor_exit() by
>>>> cpufreq_governor_dbs().
>>>>
>>>> Worse yet, a deadlock can still happen without (the new)
>>>> dbs_data->mutex, just between s_active and od_dbs_cdata.mutex if
>>>> update_sampling_rate() runs in parallel with
>>>> cpufreq_governor_dbs()->cpufreq_governor_exit() and the latter wins
>>>> the race.
>>>>
>>>> It looks like we need to drop the governor mutex before putting the
>>>> kobject in cpufreq_governor_exit().
>>>

[cut]

>>
>> No no no no! Let's not open up this can of worms of queuing up the work
>> to handle a write to a sysfs file. It *MIGHT* work for this specific
>> tunable (I haven't bothered to analyze), but this makes it impossible to
>> return a useful/proper error value.
>
>
> Sent too soon. Not only that, but it can also cause the writes to the sysfs
> files to get processed in a different order and I don't know what other
> issues/races THAT will open up.

Well, I don't like this too.

I actually do have an idea about how to fix these deadlocks, but it is
on top of my cleanup series.

I'll write more about it later today.

Thanks,
Rafael
--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar Feb. 5, 2016, 2:44 a.m. UTC | #4
On 04-02-16, 19:18, Rafael J. Wysocki wrote:
> On Thu, Feb 4, 2016 at 6:44 PM, Saravana Kannan <skannan@codeaurora.org> wrote:
> > On 02/04/2016 09:43 AM, Saravana Kannan wrote:

> >> No no no no! Let's not open up this can of worms of queuing up the work
> >> to handle a write to a sysfs file. It *MIGHT* work for this specific
> >> tunable (I haven't bothered to analyze), but this makes it impossible to
> >> return a useful/proper error value.
> >
> >
> > Sent too soon. Not only that, but it can also cause the writes to the sysfs
> > files to get processed in a different order and I don't know what other
> > issues/races THAT will open up.
> 
> Well, I don't like this too.

I expected similar responses only, so no surprises for me :)

Though there are few things I would like to tell here:
- There wouldn't be any race for updating the file, as that is done
  directly from store_sampling_rate(). It updates the *real* file we
  wanted to.

- What's offloaded to the work-handler is something very special about
  ondemand governor and sampling rate. The same is not done for
  conservative governor as well, don't know why though.

- After updating the sampling rate, we assess if we need to reschedule
  the timers/workqueue to a different time for better efficiency. I
  don't think there can be a race there and it can be safely done in a
  work..

> I actually do have an idea about how to fix these deadlocks, but it is
> on top of my cleanup series.

I would like to hear that, if you can, to save your time. I have tried
so many different ways of fixing this yesterday and found issue
everywhere :(
Rafael J. Wysocki Feb. 5, 2016, 3:54 a.m. UTC | #5
On Thursday, February 04, 2016 07:18:32 PM Rafael J. Wysocki wrote:
> On Thu, Feb 4, 2016 at 6:44 PM, Saravana Kannan <skannan@codeaurora.org> wrote:
> > On 02/04/2016 09:43 AM, Saravana Kannan wrote:
> >>
> >> On 02/04/2016 03:09 AM, Viresh Kumar wrote:
> >>>
> >>> On 04-02-16, 00:50, Rafael J. Wysocki wrote:
> >>>>
> >>>> This is exactly right.  We've avoided one deadlock only to trip into
> >>>> another one.
> >>>>
> >>>> This happens because update_sampling_rate() acquires
> >>>> od_dbs_cdata.mutex which is held around cpufreq_governor_exit() by
> >>>> cpufreq_governor_dbs().
> >>>>
> >>>> Worse yet, a deadlock can still happen without (the new)
> >>>> dbs_data->mutex, just between s_active and od_dbs_cdata.mutex if
> >>>> update_sampling_rate() runs in parallel with
> >>>> cpufreq_governor_dbs()->cpufreq_governor_exit() and the latter wins
> >>>> the race.
> >>>>
> >>>> It looks like we need to drop the governor mutex before putting the
> >>>> kobject in cpufreq_governor_exit().
> >>>
> 
> [cut]
> 
> >>
> >> No no no no! Let's not open up this can of worms of queuing up the work
> >> to handle a write to a sysfs file. It *MIGHT* work for this specific
> >> tunable (I haven't bothered to analyze), but this makes it impossible to
> >> return a useful/proper error value.
> >
> >
> > Sent too soon. Not only that, but it can also cause the writes to the sysfs
> > files to get processed in a different order and I don't know what other
> > issues/races THAT will open up.
> 
> Well, I don't like this too.
> 
> I actually do have an idea about how to fix these deadlocks, but it is
> on top of my cleanup series.
> 
> I'll write more about it later today.

Having actually posted that series again after cleaning it up I can say
what I'm thinking about hopefully without confusing anyone too much.  So
please bear in mind that I'm going to refer to this series below:

http://marc.info/?l=linux-pm&m=145463901630950&w=4

Also this is more of a brain dump rather than actual design description,
so there may be holes etc in it.  Please let me know if you can see any.

The problem at hand is that policy->rwsem needs to be held around *all*
operations in cpufreq_set_policy().  In particular, it cannot be dropped
around invocations of __cpufreq_governor() with the event arg equal to
_EXIT as that leads to interesting races.

Unfortunately, we know that holding policy->rwsem in those places leads
to a deadlock with governor sysfs attributes removal in cpufreq_governor_exit().

Viresh attempted to fix this by avoiding to acquire policy->rwsem for governor
attributes access (as holding it is not necessary for them in principle).  That
was a nice try, but it turned out to be insufficient because of another deadlock
scenario uncovered by it.  Namely, since the ondemand governor's update_sampling_rate()
acquires the governor mutex (called dbs_data_mutex after my patches mentioned
above), it may deadlock with exactly the same piece of code in cpufreq_governor_exit()
in almost exactly the same way.

To avoid that other deadlock, we'd either need to drop dbs_data_mutex from
update_sampling_rate(), or drop it for the removal of the governor sysfs
attributes in cpufreq_governor_exit().  I don't think the former is an option
at least at this point, so it looks like we pretty much have to do the latter.

With that in mind, I'd start with the changes made by Viresh (maybe without the
first patch which really isn't essential here).  That is, introduce a separate
kobject type for the governor attributes kobject and register that in
cpufreq_governor_init().  The show/store callbacks for that kobject type won't
acquire policy->rwsem so the first deadlock will be avoided.

But in addition to that, I'd drop dbs_data_mutex before the removal of governor
sysfs attributes.  That actually happens in two places, in cpufreq_governor_exit()
and in the error path of cpufreq_governor_init().

To that end, I'd move the locking from cpufreq_governor_dbs() to the functions
called by it.  That should be readily doable and they can do all of the
necessary checks themselves.  cpufreq_governor_dbs() would become a pure mux then,
but that's not such a big deal.

With that, cpufreq_governor_exit() may just drop the lock before it does the
final kobject_put().  The danger here is that the sysfs show/store callbacks of
the governor attributes kobject may see invalid dbs_data for a while, after the
lock has been dropped and before the kobject is deleted.  That may be addressed
by checking, for example, the presence of the dbs_data's "tuners" pointer in those
callbacks.  If it is NULL, they can simply return -EAGAIN or similar.

Now, that means, though, that they need to acquire the same lock as
cpufreq_governor_exit(), or they may see things go away while they are running.
The simplest approach here would be to take dbs_data_mutex in them too, although
that's a bit of a sledgehammer.  It might be better to have a per-policy lock
in struct policy_dbs_info for that, for example, but then the governor attribute
sysfs callbacks would need to get that object instead of dbs_data.

On the flip side, it might be possible to migrate update_sampling_rate() to
that lock too.  And maybe we can get rid of dbs_data_mutex even, who knows?

Thanks,
Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Viresh Kumar Feb. 5, 2016, 9:49 a.m. UTC | #6
On 05-02-16, 04:54, Rafael J. Wysocki wrote:
> Having actually posted that series again after cleaning it up I can say
> what I'm thinking about hopefully without confusing anyone too much.  So
> please bear in mind that I'm going to refer to this series below:
> 
> http://marc.info/?l=linux-pm&m=145463901630950&w=4
> 
> Also this is more of a brain dump rather than actual design description,
> so there may be holes etc in it.  Please let me know if you can see any.
> 
> The problem at hand is that policy->rwsem needs to be held around *all*
> operations in cpufreq_set_policy().  In particular, it cannot be dropped
> around invocations of __cpufreq_governor() with the event arg equal to
> _EXIT as that leads to interesting races.
> 
> Unfortunately, we know that holding policy->rwsem in those places leads
> to a deadlock with governor sysfs attributes removal in cpufreq_governor_exit().
> 
> Viresh attempted to fix this by avoiding to acquire policy->rwsem for governor
> attributes access (as holding it is not necessary for them in principle).  That
> was a nice try, but it turned out to be insufficient because of another deadlock
> scenario uncovered by it.

Not really.

The other deadlock wasn't uncovered by it, its just that Shilpa tested
directly after my patches and reported the issue. Later yesterday, she
was hitting the exactly same issue on pm/linux-next as well (i.e.
without my patches). And ofcourse Juri has also reported the same
issue on linux-next few days back.

> Namely, since the ondemand governor's update_sampling_rate()
> acquires the governor mutex (called dbs_data_mutex after my patches mentioned
> above), it may deadlock with exactly the same piece of code in cpufreq_governor_exit()
> in almost exactly the same way.

Right.

> To avoid that other deadlock, we'd either need to drop dbs_data_mutex from
> update_sampling_rate(),

And my so called 'ugly' 8th patch tried to do just that :)

But as I also mentioned in reply to the update-util patchset of yours,
its possible somewhat.

> or drop it for the removal of the governor sysfs
> attributes in cpufreq_governor_exit().  I don't think the former is an option
> at least at this point, so it looks like we pretty much have to do the latter.
> 
> With that in mind, I'd start with the changes made by Viresh (maybe without the
> first patch which really isn't essential here).

That was just to cleanup the macro mess a bit, nothing more. Over
that, I think the first 7 patches can be picked as it is without any
changes. Ofcourse they are required to be rebased over your 13
patches, if those are going in first :)

> That is, introduce a separate
> kobject type for the governor attributes kobject and register that in
> cpufreq_governor_init().  The show/store callbacks for that kobject type won't
> acquire policy->rwsem so the first deadlock will be avoided.
> 
> But in addition to that, I'd drop dbs_data_mutex before the removal of governor
> sysfs attributes.  That actually happens in two places, in cpufreq_governor_exit()
> and in the error path of cpufreq_governor_init().
> 
> To that end, I'd move the locking from cpufreq_governor_dbs() to the functions
> called by it.  That should be readily doable and they can do all of the
> necessary checks themselves.  cpufreq_governor_dbs() would become a pure mux then,
> but that's not such a big deal.
> 
> With that, cpufreq_governor_exit() may just drop the lock before it does the
> final kobject_put().  The danger here is that the sysfs show/store callbacks of
> the governor attributes kobject may see invalid dbs_data for a while, after the
> lock has been dropped and before the kobject is deleted.  That may be addressed
> by checking, for example, the presence of the dbs_data's "tuners" pointer in those
> callbacks.  If it is NULL, they can simply return -EAGAIN or similar.

So you mean something like this (consider only !governor_per_policy
case with ondemand governor for now):

exit()
{
       lock-dbs_data_mutex;
       ...
       dbs_data->tuners = NULL; //so that sysfs files can return early
       dbs_governor->gdbs_data = NULL; //For !governor_per_policy case
       unlock-dbs_data_mutex;

       /*
        * Problem: Who is stopping us to set ondemand as governor for
        * another policy, which can try create a kobject which will
        * try to create sysfs directory at the same path ?
        *
        * Though another field in dbs_governor can be used to fix this
        * I think, which needs to block the other INIT operation.
        */
        
       kobject_put(dbs_data->kobj); //This should wait for all sysfs operations to end.

       kfree(dbs_data);
}

And the sysfs operations show/store need to take dbs_data_mutex() for
their entire operations.

??
Saravana Kannan Feb. 6, 2016, 2:22 a.m. UTC | #7
On 02/04/2016 07:54 PM, Rafael J. Wysocki wrote:
> On Thursday, February 04, 2016 07:18:32 PM Rafael J. Wysocki wrote:
>> On Thu, Feb 4, 2016 at 6:44 PM, Saravana Kannan <skannan@codeaurora.org> wrote:
>>> On 02/04/2016 09:43 AM, Saravana Kannan wrote:
>>>>
>>>> On 02/04/2016 03:09 AM, Viresh Kumar wrote:
>>>>>
>>>>> On 04-02-16, 00:50, Rafael J. Wysocki wrote:
>>>>>>
>>>>>> This is exactly right.  We've avoided one deadlock only to trip into
>>>>>> another one.
>>>>>>
>>>>>> This happens because update_sampling_rate() acquires
>>>>>> od_dbs_cdata.mutex which is held around cpufreq_governor_exit() by
>>>>>> cpufreq_governor_dbs().
>>>>>>
>>>>>> Worse yet, a deadlock can still happen without (the new)
>>>>>> dbs_data->mutex, just between s_active and od_dbs_cdata.mutex if
>>>>>> update_sampling_rate() runs in parallel with
>>>>>> cpufreq_governor_dbs()->cpufreq_governor_exit() and the latter wins
>>>>>> the race.
>>>>>>
>>>>>> It looks like we need to drop the governor mutex before putting the
>>>>>> kobject in cpufreq_governor_exit().
>>>>>
>>
>> [cut]
>>
>>>>
>>>> No no no no! Let's not open up this can of worms of queuing up the work
>>>> to handle a write to a sysfs file. It *MIGHT* work for this specific
>>>> tunable (I haven't bothered to analyze), but this makes it impossible to
>>>> return a useful/proper error value.
>>>
>>>
>>> Sent too soon. Not only that, but it can also cause the writes to the sysfs
>>> files to get processed in a different order and I don't know what other
>>> issues/races THAT will open up.
>>
>> Well, I don't like this too.
>>
>> I actually do have an idea about how to fix these deadlocks, but it is
>> on top of my cleanup series.
>>
>> I'll write more about it later today.
>
> Having actually posted that series again after cleaning it up I can say
> what I'm thinking about hopefully without confusing anyone too much.  So
> please bear in mind that I'm going to refer to this series below:
>
> http://marc.info/?l=linux-pm&m=145463901630950&w=4
>
> Also this is more of a brain dump rather than actual design description,
> so there may be holes etc in it.  Please let me know if you can see any.
>
> The problem at hand is that policy->rwsem needs to be held around *all*
> operations in cpufreq_set_policy().  In particular, it cannot be dropped
> around invocations of __cpufreq_governor() with the event arg equal to
> _EXIT as that leads to interesting races.
>
> Unfortunately, we know that holding policy->rwsem in those places leads
> to a deadlock with governor sysfs attributes removal in cpufreq_governor_exit().
>
> Viresh attempted to fix this by avoiding to acquire policy->rwsem for governor
> attributes access (as holding it is not necessary for them in principle).  That
> was a nice try, but it turned out to be insufficient because of another deadlock
> scenario uncovered by it.  Namely, since the ondemand governor's update_sampling_rate()
> acquires the governor mutex (called dbs_data_mutex after my patches mentioned
> above), it may deadlock with exactly the same piece of code in cpufreq_governor_exit()
> in almost exactly the same way.
>
> To avoid that other deadlock, we'd either need to drop dbs_data_mutex from
> update_sampling_rate(), or drop it for the removal of the governor sysfs
> attributes in cpufreq_governor_exit().  I don't think the former is an option
> at least at this point, so it looks like we pretty much have to do the latter.
>
> With that in mind, I'd start with the changes made by Viresh (maybe without the
> first patch which really isn't essential here).  That is, introduce a separate
> kobject type for the governor attributes kobject and register that in
> cpufreq_governor_init().  The show/store callbacks for that kobject type won't
> acquire policy->rwsem so the first deadlock will be avoided.
>
> But in addition to that, I'd drop dbs_data_mutex before the removal of governor
> sysfs attributes.  That actually happens in two places, in cpufreq_governor_exit()
> and in the error path of cpufreq_governor_init().
>
> To that end, I'd move the locking from cpufreq_governor_dbs() to the functions
> called by it.  That should be readily doable and they can do all of the
> necessary checks themselves.  cpufreq_governor_dbs() would become a pure mux then,
> but that's not such a big deal.
>
> With that, cpufreq_governor_exit() may just drop the lock before it does the
> final kobject_put().  The danger here is that the sysfs show/store callbacks of
> the governor attributes kobject may see invalid dbs_data for a while, after the
> lock has been dropped and before the kobject is deleted.  That may be addressed
> by checking, for example, the presence of the dbs_data's "tuners" pointer in those
> callbacks.  If it is NULL, they can simply return -EAGAIN or similar.
>
> Now, that means, though, that they need to acquire the same lock as
> cpufreq_governor_exit(), or they may see things go away while they are running.
> The simplest approach here would be to take dbs_data_mutex in them too, although
> that's a bit of a sledgehammer.  It might be better to have a per-policy lock
> in struct policy_dbs_info for that, for example, but then the governor attribute
> sysfs callbacks would need to get that object instead of dbs_data.
>
> On the flip side, it might be possible to migrate update_sampling_rate() to
> that lock too.  And maybe we can get rid of dbs_data_mutex even, who knows?

I'm glad you've analyzed it this far. So, the rest of my comments will 
be easier to understand.

I'm going to go back to my point of NOT doing the sysfs add/remove 
inside the governor at all (that includes cpufreq_governor.c) and doing 
it in cpufreq.c. That suggestion was confusing to explain/understand 
before when we were using policy rwsem inside the show/store ops for the 
governor attributes. Now that has been removed, my suggestion would be 
even easier/cleaner to implement/understand and you don't have to worry 
about ANY races in the governor.

I'll just talk about the have_governor_per_policy() case. It can be 
easily extended to the global case.

In cpufreq_governor.c:
cpufreq_governor_init(...)
{
  ...
  /* NOT kobject_init_and_add */
  kobject_init();
  /* New field */
  policy->gov_kobj = &dbs_data->kobj);
  ...
}

In cpufreq.c:
__cpufreq_governor(...)
{

    if (event == POLICY_EXIT) {
       kobject_put(policy->gov_kobj);
    }
    ret = policy->governor->governor(policy, event);
    if (event == POLICY_INIT) {
       kobj_add(policy->gov_kobj, policy->kobj, policy->governor->name);
    }
}

This guarantees that there can be no races of the governor specific data 
structures going away while being accessed from sysfs because the first 
thing we do once we decide to "kill" a governor is to remove the sysfs 
files and the accesses to governor data (and flush out all on going 
accesses) and THEN ask the governor to exit.

Thoughts?

Thanks,
Saravana
Rafael J. Wysocki Feb. 8, 2016, 2:20 a.m. UTC | #8
On Friday, February 05, 2016 03:19:25 PM Viresh Kumar wrote:
> On 05-02-16, 04:54, Rafael J. Wysocki wrote:
> > Having actually posted that series again after cleaning it up I can say
> > what I'm thinking about hopefully without confusing anyone too much.  So
> > please bear in mind that I'm going to refer to this series below:
> > 
> > http://marc.info/?l=linux-pm&m=145463901630950&w=4
> > 
> > Also this is more of a brain dump rather than actual design description,
> > so there may be holes etc in it.  Please let me know if you can see any.
> > 
> > The problem at hand is that policy->rwsem needs to be held around *all*
> > operations in cpufreq_set_policy().  In particular, it cannot be dropped
> > around invocations of __cpufreq_governor() with the event arg equal to
> > _EXIT as that leads to interesting races.
> > 
> > Unfortunately, we know that holding policy->rwsem in those places leads
> > to a deadlock with governor sysfs attributes removal in cpufreq_governor_exit().
> > 
> > Viresh attempted to fix this by avoiding to acquire policy->rwsem for governor
> > attributes access (as holding it is not necessary for them in principle).  That
> > was a nice try, but it turned out to be insufficient because of another deadlock
> > scenario uncovered by it.
> 
> Not really.
> 
> The other deadlock wasn't uncovered by it, its just that Shilpa tested
> directly after my patches and reported the issue. Later yesterday, she
> was hitting the exactly same issue on pm/linux-next as well (i.e.
> without my patches). And ofcourse Juri has also reported the same
> issue on linux-next few days back.

OK, fair enough.

> > Namely, since the ondemand governor's update_sampling_rate()
> > acquires the governor mutex (called dbs_data_mutex after my patches mentioned
> > above), it may deadlock with exactly the same piece of code in cpufreq_governor_exit()
> > in almost exactly the same way.
> 
> Right.
> 
> > To avoid that other deadlock, we'd either need to drop dbs_data_mutex from
> > update_sampling_rate(),
> 
> And my so called 'ugly' 8th patch tried to do just that :)
> 
> But as I also mentioned in reply to the update-util patchset of yours,
> its possible somewhat.

Yes, it should be possible and not even too difficult.

> > or drop it for the removal of the governor sysfs
> > attributes in cpufreq_governor_exit().  I don't think the former is an option
> > at least at this point, so it looks like we pretty much have to do the latter.
> > 
> > With that in mind, I'd start with the changes made by Viresh (maybe without the
> > first patch which really isn't essential here).
> 
> That was just to cleanup the macro mess a bit, nothing more. Over
> that, I think the first 7 patches can be picked as it is without any
> changes. Ofcourse they are required to be rebased over your 13
> patches, if those are going in first :)

Yes, please rebase.

Also please skip the first one that was moving min_sampling_rate around,
at least for now.

As I said, we may be moving other attributes in the opposite direction,
so two sets of macros may be necessary anyway.

> > That is, introduce a separate
> > kobject type for the governor attributes kobject and register that in
> > cpufreq_governor_init().  The show/store callbacks for that kobject type won't
> > acquire policy->rwsem so the first deadlock will be avoided.
> > 
> > But in addition to that, I'd drop dbs_data_mutex before the removal of governor
> > sysfs attributes.  That actually happens in two places, in cpufreq_governor_exit()
> > and in the error path of cpufreq_governor_init().
> > 
> > To that end, I'd move the locking from cpufreq_governor_dbs() to the functions
> > called by it.  That should be readily doable and they can do all of the
> > necessary checks themselves.  cpufreq_governor_dbs() would become a pure mux then,
> > but that's not such a big deal.
> > 
> > With that, cpufreq_governor_exit() may just drop the lock before it does the
> > final kobject_put().  The danger here is that the sysfs show/store callbacks of
> > the governor attributes kobject may see invalid dbs_data for a while, after the
> > lock has been dropped and before the kobject is deleted.  That may be addressed
> > by checking, for example, the presence of the dbs_data's "tuners" pointer in those
> > callbacks.  If it is NULL, they can simply return -EAGAIN or similar.
> 
> So you mean something like this (consider only !governor_per_policy
> case with ondemand governor for now):
> 
> exit()
> {
>        lock-dbs_data_mutex;
>        ...
>        dbs_data->tuners = NULL; //so that sysfs files can return early
>        dbs_governor->gdbs_data = NULL; //For !governor_per_policy case
>        unlock-dbs_data_mutex;
> 
>        /*
>         * Problem: Who is stopping us to set ondemand as governor for
>         * another policy, which can try create a kobject which will
>         * try to create sysfs directory at the same path ?
>         *
>         * Though another field in dbs_governor can be used to fix this
>         * I think, which needs to block the other INIT operation.
>         */
>         
>        kobject_put(dbs_data->kobj); //This should wait for all sysfs operations to end.
> 
>        kfree(dbs_data);
> }
> 
> And the sysfs operations show/store need to take dbs_data_mutex() for
> their entire operations.
> 
> ??

Yes, roughly.

But it shouldn't be necessary after all, because dropping the mutex from
update_sampling_rate() looks easier than I thought previously.

Thanks,
Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rafael J. Wysocki Feb. 8, 2016, 2:28 a.m. UTC | #9
On Friday, February 05, 2016 06:22:35 PM Saravana Kannan wrote:
> On 02/04/2016 07:54 PM, Rafael J. Wysocki wrote:
> > On Thursday, February 04, 2016 07:18:32 PM Rafael J. Wysocki wrote:
> >> On Thu, Feb 4, 2016 at 6:44 PM, Saravana Kannan <skannan@codeaurora.org> wrote:
> >>> On 02/04/2016 09:43 AM, Saravana Kannan wrote:
> >>>>
> >>>> On 02/04/2016 03:09 AM, Viresh Kumar wrote:
> >>>>>
> >>>>> On 04-02-16, 00:50, Rafael J. Wysocki wrote:
> >>>>>>
> >>>>>> This is exactly right.  We've avoided one deadlock only to trip into
> >>>>>> another one.
> >>>>>>
> >>>>>> This happens because update_sampling_rate() acquires
> >>>>>> od_dbs_cdata.mutex which is held around cpufreq_governor_exit() by
> >>>>>> cpufreq_governor_dbs().
> >>>>>>
> >>>>>> Worse yet, a deadlock can still happen without (the new)
> >>>>>> dbs_data->mutex, just between s_active and od_dbs_cdata.mutex if
> >>>>>> update_sampling_rate() runs in parallel with
> >>>>>> cpufreq_governor_dbs()->cpufreq_governor_exit() and the latter wins
> >>>>>> the race.
> >>>>>>
> >>>>>> It looks like we need to drop the governor mutex before putting the
> >>>>>> kobject in cpufreq_governor_exit().
> >>>>>
> >>
> >> [cut]
> >>
> >>>>
> >>>> No no no no! Let's not open up this can of worms of queuing up the work
> >>>> to handle a write to a sysfs file. It *MIGHT* work for this specific
> >>>> tunable (I haven't bothered to analyze), but this makes it impossible to
> >>>> return a useful/proper error value.
> >>>
> >>>
> >>> Sent too soon. Not only that, but it can also cause the writes to the sysfs
> >>> files to get processed in a different order and I don't know what other
> >>> issues/races THAT will open up.
> >>
> >> Well, I don't like this too.
> >>
> >> I actually do have an idea about how to fix these deadlocks, but it is
> >> on top of my cleanup series.
> >>
> >> I'll write more about it later today.
> >
> > Having actually posted that series again after cleaning it up I can say
> > what I'm thinking about hopefully without confusing anyone too much.  So
> > please bear in mind that I'm going to refer to this series below:
> >
> > http://marc.info/?l=linux-pm&m=145463901630950&w=4
> >
> > Also this is more of a brain dump rather than actual design description,
> > so there may be holes etc in it.  Please let me know if you can see any.
> >
> > The problem at hand is that policy->rwsem needs to be held around *all*
> > operations in cpufreq_set_policy().  In particular, it cannot be dropped
> > around invocations of __cpufreq_governor() with the event arg equal to
> > _EXIT as that leads to interesting races.
> >
> > Unfortunately, we know that holding policy->rwsem in those places leads
> > to a deadlock with governor sysfs attributes removal in cpufreq_governor_exit().
> >
> > Viresh attempted to fix this by avoiding to acquire policy->rwsem for governor
> > attributes access (as holding it is not necessary for them in principle).  That
> > was a nice try, but it turned out to be insufficient because of another deadlock
> > scenario uncovered by it.  Namely, since the ondemand governor's update_sampling_rate()
> > acquires the governor mutex (called dbs_data_mutex after my patches mentioned
> > above), it may deadlock with exactly the same piece of code in cpufreq_governor_exit()
> > in almost exactly the same way.
> >
> > To avoid that other deadlock, we'd either need to drop dbs_data_mutex from
> > update_sampling_rate(), or drop it for the removal of the governor sysfs
> > attributes in cpufreq_governor_exit().  I don't think the former is an option
> > at least at this point, so it looks like we pretty much have to do the latter.
> >
> > With that in mind, I'd start with the changes made by Viresh (maybe without the
> > first patch which really isn't essential here).  That is, introduce a separate
> > kobject type for the governor attributes kobject and register that in
> > cpufreq_governor_init().  The show/store callbacks for that kobject type won't
> > acquire policy->rwsem so the first deadlock will be avoided.
> >
> > But in addition to that, I'd drop dbs_data_mutex before the removal of governor
> > sysfs attributes.  That actually happens in two places, in cpufreq_governor_exit()
> > and in the error path of cpufreq_governor_init().
> >
> > To that end, I'd move the locking from cpufreq_governor_dbs() to the functions
> > called by it.  That should be readily doable and they can do all of the
> > necessary checks themselves.  cpufreq_governor_dbs() would become a pure mux then,
> > but that's not such a big deal.
> >
> > With that, cpufreq_governor_exit() may just drop the lock before it does the
> > final kobject_put().  The danger here is that the sysfs show/store callbacks of
> > the governor attributes kobject may see invalid dbs_data for a while, after the
> > lock has been dropped and before the kobject is deleted.  That may be addressed
> > by checking, for example, the presence of the dbs_data's "tuners" pointer in those
> > callbacks.  If it is NULL, they can simply return -EAGAIN or similar.
> >
> > Now, that means, though, that they need to acquire the same lock as
> > cpufreq_governor_exit(), or they may see things go away while they are running.
> > The simplest approach here would be to take dbs_data_mutex in them too, although
> > that's a bit of a sledgehammer.  It might be better to have a per-policy lock
> > in struct policy_dbs_info for that, for example, but then the governor attribute
> > sysfs callbacks would need to get that object instead of dbs_data.
> >
> > On the flip side, it might be possible to migrate update_sampling_rate() to
> > that lock too.  And maybe we can get rid of dbs_data_mutex even, who knows?
> 
> I'm glad you've analyzed it this far. So, the rest of my comments will 
> be easier to understand.
> 
> I'm going to go back to my point of NOT doing the sysfs add/remove 
> inside the governor at all (that includes cpufreq_governor.c) and doing 
> it in cpufreq.c. That suggestion was confusing to explain/understand 
> before when we were using policy rwsem inside the show/store ops for the 
> governor attributes. Now that has been removed, my suggestion would be 
> even easier/cleaner to implement/understand and you don't have to worry 
> about ANY races in the governor.
> 
> I'll just talk about the have_governor_per_policy() case. It can be 
> easily extended to the global case.
> 
> In cpufreq_governor.c:
> cpufreq_governor_init(...)
> {
>   ...
>   /* NOT kobject_init_and_add */
>   kobject_init();
>   /* New field */
>   policy->gov_kobj = &dbs_data->kobj);
>   ...
> }
> 
> In cpufreq.c:
> __cpufreq_governor(...)
> {
> 
>     if (event == POLICY_EXIT) {
>        kobject_put(policy->gov_kobj);
>     }
>     ret = policy->governor->governor(policy, event);
>     if (event == POLICY_INIT) {
>        kobj_add(policy->gov_kobj, policy->kobj, policy->governor->name);
>     }
> }
> 
> This guarantees that there can be no races of the governor specific data 
> structures going away while being accessed from sysfs because the first 
> thing we do once we decide to "kill" a governor is to remove the sysfs 
> files and the accesses to governor data (and flush out all on going 
> accesses) and THEN ask the governor to exit.
> 
> Thoughts?

The core would then have to rely on the governor code to populate the gov_kobj
field correctly which doesn't look really straightforward to me.  It is better
if each code layer arranges the data structures it is going to use by itself.

Besides, ondemand and conservative are the only governors that use the governor
kobject at all, so I'm not sure if that really belongs to the core.

Thanks,
Rafael

--
To unsubscribe from this list: send the line "unsubscribe linux-pm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Saravana Kannan Feb. 9, 2016, 9:02 p.m. UTC | #10
On 02/07/2016 06:28 PM, Rafael J. Wysocki wrote:
> On Friday, February 05, 2016 06:22:35 PM Saravana Kannan wrote:
>> On 02/04/2016 07:54 PM, Rafael J. Wysocki wrote:
>>> On Thursday, February 04, 2016 07:18:32 PM Rafael J. Wysocki wrote:
>>>> On Thu, Feb 4, 2016 at 6:44 PM, Saravana Kannan <skannan@codeaurora.org> wrote:
>>>>> On 02/04/2016 09:43 AM, Saravana Kannan wrote:
>>>>>>
>>>>>> On 02/04/2016 03:09 AM, Viresh Kumar wrote:
>>>>>>>
>>>>>>> On 04-02-16, 00:50, Rafael J. Wysocki wrote:
>>>>>>>>
>>>>>>>> This is exactly right.  We've avoided one deadlock only to trip into
>>>>>>>> another one.
>>>>>>>>
>>>>>>>> This happens because update_sampling_rate() acquires
>>>>>>>> od_dbs_cdata.mutex which is held around cpufreq_governor_exit() by
>>>>>>>> cpufreq_governor_dbs().
>>>>>>>>
>>>>>>>> Worse yet, a deadlock can still happen without (the new)
>>>>>>>> dbs_data->mutex, just between s_active and od_dbs_cdata.mutex if
>>>>>>>> update_sampling_rate() runs in parallel with
>>>>>>>> cpufreq_governor_dbs()->cpufreq_governor_exit() and the latter wins
>>>>>>>> the race.
>>>>>>>>
>>>>>>>> It looks like we need to drop the governor mutex before putting the
>>>>>>>> kobject in cpufreq_governor_exit().
>>>>>>>
>>>>
>>>> [cut]
>>>>
>>>>>>
>>>>>> No no no no! Let's not open up this can of worms of queuing up the work
>>>>>> to handle a write to a sysfs file. It *MIGHT* work for this specific
>>>>>> tunable (I haven't bothered to analyze), but this makes it impossible to
>>>>>> return a useful/proper error value.
>>>>>
>>>>>
>>>>> Sent too soon. Not only that, but it can also cause the writes to the sysfs
>>>>> files to get processed in a different order and I don't know what other
>>>>> issues/races THAT will open up.
>>>>
>>>> Well, I don't like this too.
>>>>
>>>> I actually do have an idea about how to fix these deadlocks, but it is
>>>> on top of my cleanup series.
>>>>
>>>> I'll write more about it later today.
>>>
>>> Having actually posted that series again after cleaning it up I can say
>>> what I'm thinking about hopefully without confusing anyone too much.  So
>>> please bear in mind that I'm going to refer to this series below:
>>>
>>> http://marc.info/?l=linux-pm&m=145463901630950&w=4
>>>
>>> Also this is more of a brain dump rather than actual design description,
>>> so there may be holes etc in it.  Please let me know if you can see any.
>>>
>>> The problem at hand is that policy->rwsem needs to be held around *all*
>>> operations in cpufreq_set_policy().  In particular, it cannot be dropped
>>> around invocations of __cpufreq_governor() with the event arg equal to
>>> _EXIT as that leads to interesting races.
>>>
>>> Unfortunately, we know that holding policy->rwsem in those places leads
>>> to a deadlock with governor sysfs attributes removal in cpufreq_governor_exit().
>>>
>>> Viresh attempted to fix this by avoiding to acquire policy->rwsem for governor
>>> attributes access (as holding it is not necessary for them in principle).  That
>>> was a nice try, but it turned out to be insufficient because of another deadlock
>>> scenario uncovered by it.  Namely, since the ondemand governor's update_sampling_rate()
>>> acquires the governor mutex (called dbs_data_mutex after my patches mentioned
>>> above), it may deadlock with exactly the same piece of code in cpufreq_governor_exit()
>>> in almost exactly the same way.
>>>
>>> To avoid that other deadlock, we'd either need to drop dbs_data_mutex from
>>> update_sampling_rate(), or drop it for the removal of the governor sysfs
>>> attributes in cpufreq_governor_exit().  I don't think the former is an option
>>> at least at this point, so it looks like we pretty much have to do the latter.
>>>
>>> With that in mind, I'd start with the changes made by Viresh (maybe without the
>>> first patch which really isn't essential here).  That is, introduce a separate
>>> kobject type for the governor attributes kobject and register that in
>>> cpufreq_governor_init().  The show/store callbacks for that kobject type won't
>>> acquire policy->rwsem so the first deadlock will be avoided.
>>>
>>> But in addition to that, I'd drop dbs_data_mutex before the removal of governor
>>> sysfs attributes.  That actually happens in two places, in cpufreq_governor_exit()
>>> and in the error path of cpufreq_governor_init().
>>>
>>> To that end, I'd move the locking from cpufreq_governor_dbs() to the functions
>>> called by it.  That should be readily doable and they can do all of the
>>> necessary checks themselves.  cpufreq_governor_dbs() would become a pure mux then,
>>> but that's not such a big deal.
>>>
>>> With that, cpufreq_governor_exit() may just drop the lock before it does the
>>> final kobject_put().  The danger here is that the sysfs show/store callbacks of
>>> the governor attributes kobject may see invalid dbs_data for a while, after the
>>> lock has been dropped and before the kobject is deleted.  That may be addressed
>>> by checking, for example, the presence of the dbs_data's "tuners" pointer in those
>>> callbacks.  If it is NULL, they can simply return -EAGAIN or similar.
>>>
>>> Now, that means, though, that they need to acquire the same lock as
>>> cpufreq_governor_exit(), or they may see things go away while they are running.
>>> The simplest approach here would be to take dbs_data_mutex in them too, although
>>> that's a bit of a sledgehammer.  It might be better to have a per-policy lock
>>> in struct policy_dbs_info for that, for example, but then the governor attribute
>>> sysfs callbacks would need to get that object instead of dbs_data.
>>>
>>> On the flip side, it might be possible to migrate update_sampling_rate() to
>>> that lock too.  And maybe we can get rid of dbs_data_mutex even, who knows?
>>
>> I'm glad you've analyzed it this far. So, the rest of my comments will
>> be easier to understand.
>>
>> I'm going to go back to my point of NOT doing the sysfs add/remove
>> inside the governor at all (that includes cpufreq_governor.c) and doing
>> it in cpufreq.c. That suggestion was confusing to explain/understand
>> before when we were using policy rwsem inside the show/store ops for the
>> governor attributes. Now that has been removed, my suggestion would be
>> even easier/cleaner to implement/understand and you don't have to worry
>> about ANY races in the governor.
>>
>> I'll just talk about the have_governor_per_policy() case. It can be
>> easily extended to the global case.
>>
>> In cpufreq_governor.c:
>> cpufreq_governor_init(...)
>> {
>>    ...
>>    /* NOT kobject_init_and_add */
>>    kobject_init();
>>    /* New field */
>>    policy->gov_kobj = &dbs_data->kobj);
>>    ...
>> }
>>
>> In cpufreq.c:
>> __cpufreq_governor(...)
>> {
>>
>>      if (event == POLICY_EXIT) {
>>         kobject_put(policy->gov_kobj);
>>      }
>>      ret = policy->governor->governor(policy, event);
>>      if (event == POLICY_INIT) {
>>         kobj_add(policy->gov_kobj, policy->kobj, policy->governor->name);
>>      }
>> }
>>
>> This guarantees that there can be no races of the governor specific data
>> structures going away while being accessed from sysfs because the first
>> thing we do once we decide to "kill" a governor is to remove the sysfs
>> files and the accesses to governor data (and flush out all on going
>> accesses) and THEN ask the governor to exit.
>>
>> Thoughts?
>
> The core would then have to rely on the governor code to populate the gov_kobj
> field correctly which doesn't look really straightforward to me.  It is better
> if each code layer arranges the data structures it is going to use by itself.

The core depends a lot on the drivers and governors filling up some 
fields correctly. This isn't any worse than that. It just seems way more 
logical to me to remove the interface to changing governor attributes 
(the sysfs files) before we start "exiting" a governor. But it looks 
like there's a v3 series of patches from Viresh that people seem to 
agree is fixing the race in a different method -- I haven't had time to 
look at it. So, I'm not going to keep pushing my point about removing 
the sysfs files at the core level. I'll jump back to it if we later find 
another race with this v3 patch series :)

> Besides, ondemand and conservative are the only governors that use the governor
> kobject at all, so I'm not sure if that really belongs to the core.

Technically userspace should be using kobject and sysfs attributes for 
set speed, but for whatever reason (legacy/historical I assume) we let 
the core add/remove sysfs files for an op that's supported only by 
userspace governor.

-Saravana
diff mbox

Patch

diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h
index 7bed63e14e7d..97e604356b20 100644
--- a/drivers/cpufreq/cpufreq_governor.h
+++ b/drivers/cpufreq/cpufreq_governor.h
@@ -141,6 +141,8 @@  struct od_dbs_tuners {
 	unsigned int powersave_bias;
 	unsigned int io_is_busy;
 	unsigned int min_sampling_rate;
+	struct work_struct work;
+	struct dbs_data *dbs_data;
 };
 
 struct cs_dbs_tuners {
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 82ed490f7de0..93ad7a226aee 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -242,20 +242,27 @@  static struct common_dbs_data od_dbs_cdata;
  * reducing the sampling rate, we need to make the new value effective
  * immediately.
  */
-static void update_sampling_rate(struct dbs_data *dbs_data,
-		unsigned int new_rate)
+static void update_sampling_rate(struct work_struct *work)
 {
-	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
+	struct od_dbs_tuners *od_tuners = container_of(work, struct
+						       od_dbs_tuners, work);
+	unsigned int new_rate = od_tuners->sampling_rate;
+	struct dbs_data *dbs_data = od_tuners->dbs_data;
 	struct cpumask cpumask;
 	int cpu;
 
-	od_tuners->sampling_rate = new_rate = max(new_rate,
-			od_tuners->min_sampling_rate);
-
 	/*
 	 * Lock governor so that governor start/stop can't execute in parallel.
+	 *
+	 * We can't do a regular mutex_lock() here, as that may deadlock against
+	 * another thread performing CPUFREQ_GOV_POLICY_EXIT event on the
+	 * governor, which might have already taken od_dbs_cdata.mutex and is
+	 * waiting for this work to finish.
 	 */
-	mutex_lock(&od_dbs_cdata.mutex);
+	if (!mutex_trylock(&od_dbs_cdata.mutex)) {
+		queue_work(system_wq, &od_tuners->work);
+		return;
+	}
 
 	cpumask_copy(&cpumask, cpu_online_mask);
 
@@ -311,13 +318,22 @@  static void update_sampling_rate(struct dbs_data *dbs_data,
 static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf,
 		size_t count)
 {
+	struct od_dbs_tuners *od_tuners = dbs_data->tuners;
 	unsigned int input;
 	int ret;
 	ret = sscanf(buf, "%u", &input);
 	if (ret != 1)
 		return -EINVAL;
 
-	update_sampling_rate(dbs_data, input);
+	od_tuners->sampling_rate = max(input, od_tuners->min_sampling_rate);
+
+	/*
+	 * update_sampling_rate() requires to hold od_dbs_cdata.mutex, but we
+	 * can't take that from this thread, otherwise it results in ABBA
+	 * lockdep between s_active and od_dbs_cdata.mutex locks.
+	 */
+	queue_work(system_wq, &od_tuners->work);
+
 	return count;
 }
 
@@ -501,6 +517,8 @@  static int od_init(struct dbs_data *dbs_data, bool notify)
 	tuners->ignore_nice_load = 0;
 	tuners->powersave_bias = default_powersave_bias;
 	tuners->io_is_busy = should_io_be_busy();
+	INIT_WORK(&tuners->work, update_sampling_rate);
+	tuners->dbs_data = dbs_data;
 
 	dbs_data->tuners = tuners;
 	return 0;
@@ -508,7 +526,10 @@  static int od_init(struct dbs_data *dbs_data, bool notify)
 
 static void od_exit(struct dbs_data *dbs_data, bool notify)
 {
-	kfree(dbs_data->tuners);
+	struct od_dbs_tuners *tuners = dbs_data->tuners;
+
+	cancel_work_sync(&tuners->work);
+	kfree(tuners);
 }
 
 define_get_cpu_dbs_routines(od_cpu_dbs_info);