diff mbox

3.18: lockdep problems in cpufreq

Message ID 20141215230922.GL11285@n2100.arm.linux.org.uk (mailing list archive)
State New, archived
Headers show

Commit Message

Russell King - ARM Linux Dec. 15, 2014, 11:09 p.m. UTC
On Mon, Dec 15, 2014 at 10:41:02PM +0100, Rafael J. Wysocki wrote:
> On Monday, December 15, 2014 05:43:36 PM Russell King - ARM Linux wrote:
> > On Mon, Dec 15, 2014 at 08:24:05PM +0530, Yadwinder Singh Brar wrote:
> > > I agree with this approach, if its fine for others also, I can implement
> > > and post patch.
> > 
> > Yes, please do.
> 
> Indeed.  We have a regression here to fix.

Well, here's a patch which I'm running on top of 3.18 at the moment,
which is basically what I described in my email, and I'm running with it
and it is without any lockdep complaint.

8<===
From: Russell King <rmk+kernel@arm.linux.org.uk>
thermal: cpu_cooling: fix lockdep problems in cpu_cooling

A recent change to the cpu_cooling code introduced a AB-BA deadlock
scenario between the cpufreq_policy_notifier_list rwsem and the
cooling_cpufreq_lock.  This is caused by cooling_cpufreq_lock being held
before the registration/removal of the notifier block (an operation
which takes the rwsem), and the notifier code itself which takes the
locks in the reverse order.

Solve this by moving to finer grained locking - use one mutex to protect
the cpufreq_dev_list as a whole, and a separate lock to ensure correct
ordering of cpufreq notifier registration and removal.

I considered taking the cooling_list_lock within cooling_cpufreq_lock to
protect the registration sequence as a whole, but that adds a dependency
between these two locks which is best avoided (lest someone tries to
take those two new locks in the reverse order.)  In any case, it's safer
to have an empty cpufreq_dev_list than to have unnecessary dependencies
between locks.

Fixes: 2dcd851fe4b4 ("thermal: cpu_cooling: Update always cpufreq policy with thermal constraints")
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---

 drivers/thermal/cpu_cooling.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

Comments

Viresh Kumar Dec. 16, 2014, 3:41 a.m. UTC | #1
On 16 December 2014 at 04:39, Russell King - ARM Linux
<linux@arm.linux.org.uk> wrote:
> Well, here's a patch which I'm running on top of 3.18 at the moment,
> which is basically what I described in my email, and I'm running with it
> and it is without any lockdep complaint.

We need two separate patches now, one for 3.18 and other one for 3.19-rc.
3.19 has see lots of changes in this particular file and so we need to
change few things here.

> 8<===
> From: Russell King <rmk+kernel@arm.linux.org.uk>
> thermal: cpu_cooling: fix lockdep problems in cpu_cooling
>
> A recent change to the cpu_cooling code introduced a AB-BA deadlock
> scenario between the cpufreq_policy_notifier_list rwsem and the
> cooling_cpufreq_lock.  This is caused by cooling_cpufreq_lock being held
> before the registration/removal of the notifier block (an operation
> which takes the rwsem), and the notifier code itself which takes the
> locks in the reverse order.
>
> Solve this by moving to finer grained locking - use one mutex to protect
> the cpufreq_dev_list as a whole, and a separate lock to ensure correct
> ordering of cpufreq notifier registration and removal.
>
> I considered taking the cooling_list_lock within cooling_cpufreq_lock to
> protect the registration sequence as a whole, but that adds a dependency
> between these two locks which is best avoided (lest someone tries to
> take those two new locks in the reverse order.)  In any case, it's safer
> to have an empty cpufreq_dev_list than to have unnecessary dependencies
> between locks.
>
> Fixes: 2dcd851fe4b4 ("thermal: cpu_cooling: Update always cpufreq policy with thermal constraints")
> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
> ---
>
>  drivers/thermal/cpu_cooling.c | 16 ++++++++++++----
>  1 file changed, 12 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
> index ad09e51ffae4..9e42c6f30785 100644
> --- a/drivers/thermal/cpu_cooling.c
> +++ b/drivers/thermal/cpu_cooling.c
> @@ -57,6 +57,7 @@ static DEFINE_MUTEX(cooling_cpufreq_lock);
>
>  static unsigned int cpufreq_dev_count;
>
> +static DEFINE_MUTEX(cooling_list_lock);
>  static LIST_HEAD(cpufreq_dev_list);
>
>  /**
> @@ -317,7 +318,7 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb,
>         if (event != CPUFREQ_ADJUST)
>                 return 0;
>
> -       mutex_lock(&cooling_cpufreq_lock);
> +       mutex_lock(&cooling_list_lock);
>         list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
>                 if (!cpumask_test_cpu(policy->cpu,
>                                         &cpufreq_dev->allowed_cpus))
> @@ -333,7 +334,7 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb,
>                 if (policy->max != max_freq)
>                         cpufreq_verify_within_limits(policy, 0, max_freq);
>         }
> -       mutex_unlock(&cooling_cpufreq_lock);
> +       mutex_unlock(&cooling_list_lock);
>
>         return 0;
>  }
> @@ -482,6 +483,11 @@ __cpufreq_cooling_register(struct device_node *np,
>         }
>         cpufreq_dev->cool_dev = cool_dev;
>         cpufreq_dev->cpufreq_state = 0;
> +
> +       mutex_lock(&cooling_list_lock);
> +       list_add(&cpufreq_dev->node, &cpufreq_dev_list);
> +       mutex_unlock(&cooling_list_lock);
> +
>         mutex_lock(&cooling_cpufreq_lock);
>
>         /* Register the notifier for first cpufreq cooling device */
> @@ -489,7 +495,6 @@ __cpufreq_cooling_register(struct device_node *np,
>                 cpufreq_register_notifier(&thermal_cpufreq_notifier_block,
>                                           CPUFREQ_POLICY_NOTIFIER);
>         cpufreq_dev_count++;
> -       list_add(&cpufreq_dev->node, &cpufreq_dev_list);
>
>         mutex_unlock(&cooling_cpufreq_lock);
>
> @@ -553,7 +558,6 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
>
>         cpufreq_dev = cdev->devdata;
>         mutex_lock(&cooling_cpufreq_lock);
> -       list_del(&cpufreq_dev->node);
>         cpufreq_dev_count--;
>
>         /* Unregister the notifier for the last cpufreq cooling device */
> @@ -562,6 +566,10 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
>                                             CPUFREQ_POLICY_NOTIFIER);
>         mutex_unlock(&cooling_cpufreq_lock);
>
> +       mutex_lock(&cooling_list_lock);
> +       list_del(&cpufreq_dev->node);
> +       mutex_unlock(&cooling_list_lock);
> +
>         thermal_cooling_device_unregister(cpufreq_dev->cool_dev);
>         release_idr(&cpufreq_idr, cpufreq_dev->id);
>         kfree(cpufreq_dev);

For 3.18

Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Russell King - ARM Linux Jan. 6, 2015, 3:38 p.m. UTC | #2
On Tue, Dec 16, 2014 at 09:11:53AM +0530, Viresh Kumar wrote:
> On 16 December 2014 at 04:39, Russell King - ARM Linux
> <linux@arm.linux.org.uk> wrote:
> > Well, here's a patch which I'm running on top of 3.18 at the moment,
> > which is basically what I described in my email, and I'm running with it
> > and it is without any lockdep complaint.
> 
> We need two separate patches now, one for 3.18 and other one for 3.19-rc.
> 3.19 has see lots of changes in this particular file and so we need to
> change few things here.

I'm sorry, I don't have the bandwidth to create a patch for 3.19-rc
right now.
Russell King - ARM Linux May 18, 2015, 6:56 p.m. UTC | #3
On Tue, Dec 16, 2014 at 09:11:53AM +0530, Viresh Kumar wrote:
> On 16 December 2014 at 04:39, Russell King - ARM Linux
> <linux@arm.linux.org.uk> wrote:
> > Well, here's a patch which I'm running on top of 3.18 at the moment,
> > which is basically what I described in my email, and I'm running with it
> > and it is without any lockdep complaint.
> 
> We need two separate patches now, one for 3.18 and other one for 3.19-rc.
> 3.19 has see lots of changes in this particular file and so we need to
> change few things here.

What happened with this?  I'm still carrying the patch.

> > 8<===
> > From: Russell King <rmk+kernel@arm.linux.org.uk>
> > thermal: cpu_cooling: fix lockdep problems in cpu_cooling
> >
> > A recent change to the cpu_cooling code introduced a AB-BA deadlock
> > scenario between the cpufreq_policy_notifier_list rwsem and the
> > cooling_cpufreq_lock.  This is caused by cooling_cpufreq_lock being held
> > before the registration/removal of the notifier block (an operation
> > which takes the rwsem), and the notifier code itself which takes the
> > locks in the reverse order.
> >
> > Solve this by moving to finer grained locking - use one mutex to protect
> > the cpufreq_dev_list as a whole, and a separate lock to ensure correct
> > ordering of cpufreq notifier registration and removal.
> >
> > I considered taking the cooling_list_lock within cooling_cpufreq_lock to
> > protect the registration sequence as a whole, but that adds a dependency
> > between these two locks which is best avoided (lest someone tries to
> > take those two new locks in the reverse order.)  In any case, it's safer
> > to have an empty cpufreq_dev_list than to have unnecessary dependencies
> > between locks.
> >
> > Fixes: 2dcd851fe4b4 ("thermal: cpu_cooling: Update always cpufreq policy with thermal constraints")
> > Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
> > ---
> >
> >  drivers/thermal/cpu_cooling.c | 16 ++++++++++++----
> >  1 file changed, 12 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
> > index ad09e51ffae4..9e42c6f30785 100644
> > --- a/drivers/thermal/cpu_cooling.c
> > +++ b/drivers/thermal/cpu_cooling.c
> > @@ -57,6 +57,7 @@ static DEFINE_MUTEX(cooling_cpufreq_lock);
> >
> >  static unsigned int cpufreq_dev_count;
> >
> > +static DEFINE_MUTEX(cooling_list_lock);
> >  static LIST_HEAD(cpufreq_dev_list);
> >
> >  /**
> > @@ -317,7 +318,7 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb,
> >         if (event != CPUFREQ_ADJUST)
> >                 return 0;
> >
> > -       mutex_lock(&cooling_cpufreq_lock);
> > +       mutex_lock(&cooling_list_lock);
> >         list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
> >                 if (!cpumask_test_cpu(policy->cpu,
> >                                         &cpufreq_dev->allowed_cpus))
> > @@ -333,7 +334,7 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb,
> >                 if (policy->max != max_freq)
> >                         cpufreq_verify_within_limits(policy, 0, max_freq);
> >         }
> > -       mutex_unlock(&cooling_cpufreq_lock);
> > +       mutex_unlock(&cooling_list_lock);
> >
> >         return 0;
> >  }
> > @@ -482,6 +483,11 @@ __cpufreq_cooling_register(struct device_node *np,
> >         }
> >         cpufreq_dev->cool_dev = cool_dev;
> >         cpufreq_dev->cpufreq_state = 0;
> > +
> > +       mutex_lock(&cooling_list_lock);
> > +       list_add(&cpufreq_dev->node, &cpufreq_dev_list);
> > +       mutex_unlock(&cooling_list_lock);
> > +
> >         mutex_lock(&cooling_cpufreq_lock);
> >
> >         /* Register the notifier for first cpufreq cooling device */
> > @@ -489,7 +495,6 @@ __cpufreq_cooling_register(struct device_node *np,
> >                 cpufreq_register_notifier(&thermal_cpufreq_notifier_block,
> >                                           CPUFREQ_POLICY_NOTIFIER);
> >         cpufreq_dev_count++;
> > -       list_add(&cpufreq_dev->node, &cpufreq_dev_list);
> >
> >         mutex_unlock(&cooling_cpufreq_lock);
> >
> > @@ -553,7 +558,6 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
> >
> >         cpufreq_dev = cdev->devdata;
> >         mutex_lock(&cooling_cpufreq_lock);
> > -       list_del(&cpufreq_dev->node);
> >         cpufreq_dev_count--;
> >
> >         /* Unregister the notifier for the last cpufreq cooling device */
> > @@ -562,6 +566,10 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
> >                                             CPUFREQ_POLICY_NOTIFIER);
> >         mutex_unlock(&cooling_cpufreq_lock);
> >
> > +       mutex_lock(&cooling_list_lock);
> > +       list_del(&cpufreq_dev->node);
> > +       mutex_unlock(&cooling_list_lock);
> > +
> >         thermal_cooling_device_unregister(cpufreq_dev->cool_dev);
> >         release_idr(&cpufreq_idr, cpufreq_dev->id);
> >         kfree(cpufreq_dev);
> 
> For 3.18
> 
> Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Rafael J. Wysocki May 18, 2015, 10:05 p.m. UTC | #4
On Monday, May 18, 2015 07:56:45 PM Russell King - ARM Linux wrote:
> On Tue, Dec 16, 2014 at 09:11:53AM +0530, Viresh Kumar wrote:
> > On 16 December 2014 at 04:39, Russell King - ARM Linux
> > <linux@arm.linux.org.uk> wrote:
> > > Well, here's a patch which I'm running on top of 3.18 at the moment,
> > > which is basically what I described in my email, and I'm running with it
> > > and it is without any lockdep complaint.
> > 
> > We need two separate patches now, one for 3.18 and other one for 3.19-rc.
> > 3.19 has see lots of changes in this particular file and so we need to
> > change few things here.
> 
> What happened with this?  I'm still carrying the patch.

This should go in through the thermal tree.  Eduardo?

> 
> > > 8<===
> > > From: Russell King <rmk+kernel@arm.linux.org.uk>
> > > thermal: cpu_cooling: fix lockdep problems in cpu_cooling
> > >
> > > A recent change to the cpu_cooling code introduced a AB-BA deadlock
> > > scenario between the cpufreq_policy_notifier_list rwsem and the
> > > cooling_cpufreq_lock.  This is caused by cooling_cpufreq_lock being held
> > > before the registration/removal of the notifier block (an operation
> > > which takes the rwsem), and the notifier code itself which takes the
> > > locks in the reverse order.
> > >
> > > Solve this by moving to finer grained locking - use one mutex to protect
> > > the cpufreq_dev_list as a whole, and a separate lock to ensure correct
> > > ordering of cpufreq notifier registration and removal.
> > >
> > > I considered taking the cooling_list_lock within cooling_cpufreq_lock to
> > > protect the registration sequence as a whole, but that adds a dependency
> > > between these two locks which is best avoided (lest someone tries to
> > > take those two new locks in the reverse order.)  In any case, it's safer
> > > to have an empty cpufreq_dev_list than to have unnecessary dependencies
> > > between locks.
> > >
> > > Fixes: 2dcd851fe4b4 ("thermal: cpu_cooling: Update always cpufreq policy with thermal constraints")
> > > Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
> > > ---
> > >
> > >  drivers/thermal/cpu_cooling.c | 16 ++++++++++++----
> > >  1 file changed, 12 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
> > > index ad09e51ffae4..9e42c6f30785 100644
> > > --- a/drivers/thermal/cpu_cooling.c
> > > +++ b/drivers/thermal/cpu_cooling.c
> > > @@ -57,6 +57,7 @@ static DEFINE_MUTEX(cooling_cpufreq_lock);
> > >
> > >  static unsigned int cpufreq_dev_count;
> > >
> > > +static DEFINE_MUTEX(cooling_list_lock);
> > >  static LIST_HEAD(cpufreq_dev_list);
> > >
> > >  /**
> > > @@ -317,7 +318,7 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb,
> > >         if (event != CPUFREQ_ADJUST)
> > >                 return 0;
> > >
> > > -       mutex_lock(&cooling_cpufreq_lock);
> > > +       mutex_lock(&cooling_list_lock);
> > >         list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
> > >                 if (!cpumask_test_cpu(policy->cpu,
> > >                                         &cpufreq_dev->allowed_cpus))
> > > @@ -333,7 +334,7 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb,
> > >                 if (policy->max != max_freq)
> > >                         cpufreq_verify_within_limits(policy, 0, max_freq);
> > >         }
> > > -       mutex_unlock(&cooling_cpufreq_lock);
> > > +       mutex_unlock(&cooling_list_lock);
> > >
> > >         return 0;
> > >  }
> > > @@ -482,6 +483,11 @@ __cpufreq_cooling_register(struct device_node *np,
> > >         }
> > >         cpufreq_dev->cool_dev = cool_dev;
> > >         cpufreq_dev->cpufreq_state = 0;
> > > +
> > > +       mutex_lock(&cooling_list_lock);
> > > +       list_add(&cpufreq_dev->node, &cpufreq_dev_list);
> > > +       mutex_unlock(&cooling_list_lock);
> > > +
> > >         mutex_lock(&cooling_cpufreq_lock);
> > >
> > >         /* Register the notifier for first cpufreq cooling device */
> > > @@ -489,7 +495,6 @@ __cpufreq_cooling_register(struct device_node *np,
> > >                 cpufreq_register_notifier(&thermal_cpufreq_notifier_block,
> > >                                           CPUFREQ_POLICY_NOTIFIER);
> > >         cpufreq_dev_count++;
> > > -       list_add(&cpufreq_dev->node, &cpufreq_dev_list);
> > >
> > >         mutex_unlock(&cooling_cpufreq_lock);
> > >
> > > @@ -553,7 +558,6 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
> > >
> > >         cpufreq_dev = cdev->devdata;
> > >         mutex_lock(&cooling_cpufreq_lock);
> > > -       list_del(&cpufreq_dev->node);
> > >         cpufreq_dev_count--;
> > >
> > >         /* Unregister the notifier for the last cpufreq cooling device */
> > > @@ -562,6 +566,10 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
> > >                                             CPUFREQ_POLICY_NOTIFIER);
> > >         mutex_unlock(&cooling_cpufreq_lock);
> > >
> > > +       mutex_lock(&cooling_list_lock);
> > > +       list_del(&cpufreq_dev->node);
> > > +       mutex_unlock(&cooling_list_lock);
> > > +
> > >         thermal_cooling_device_unregister(cpufreq_dev->cool_dev);
> > >         release_idr(&cpufreq_idr, cpufreq_dev->id);
> > >         kfree(cpufreq_dev);
> > 
> > For 3.18
> > 
> > Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
> 
>
Russell King - ARM Linux Aug. 11, 2015, 5:03 p.m. UTC | #5
On Tue, May 19, 2015 at 12:05:55AM +0200, Rafael J. Wysocki wrote:
> On Monday, May 18, 2015 07:56:45 PM Russell King - ARM Linux wrote:
> > On Tue, Dec 16, 2014 at 09:11:53AM +0530, Viresh Kumar wrote:
> > > On 16 December 2014 at 04:39, Russell King - ARM Linux
> > > <linux@arm.linux.org.uk> wrote:
> > > > Well, here's a patch which I'm running on top of 3.18 at the moment,
> > > > which is basically what I described in my email, and I'm running with it
> > > > and it is without any lockdep complaint.
> > > 
> > > We need two separate patches now, one for 3.18 and other one for 3.19-rc.
> > > 3.19 has see lots of changes in this particular file and so we need to
> > > change few things here.
> > 
> > What happened with this?  I'm still carrying the patch.
> 
> This should go in through the thermal tree.  Eduardo?

Having waited a long time for any kind of response from Eduardo, I've
given up.  My conclusion is that Eduardo isn't interested in this.

I've re-checked, and the AB-BA deadlock is still there in the latest
code.  So, I've taken it upon myself to throw this into my for-next
branch to force the issue - not something I _want_ to do, but I'm doing
this out of frustration.  It's clear to me that "playing nice" by email
does _not_ work with some people.

I'm rather hoping that Stephen reports a merge conflict with linux-next
this evening to highlight this situation.  I've added additional commentry
to the commit message on the patch giving the reason why I've done this,
and the relevant message IDs showing the past history.

I've not decided whether I'm going to ask Linus to take this patch
directly or not, that rather depends whether there's any co-operation
from Eduardo on this.  I'd rather Eduardo took the patch.

The patch I have has had to be updated again for changes to the driver,
but I really don't see the point of re-posting it just for it to be
ignored yet again.

I'm really disappointed by this dysfunctional state of affairs, and
that what should be an urgent fix for an observable problem is still
not merged some nine months after it was first identified.
Viresh Kumar Aug. 12, 2015, 5:16 a.m. UTC | #6
On 11-08-15, 18:03, Russell King - ARM Linux wrote:
> On Tue, May 19, 2015 at 12:05:55AM +0200, Rafael J. Wysocki wrote:
> > On Monday, May 18, 2015 07:56:45 PM Russell King - ARM Linux wrote:
> > > On Tue, Dec 16, 2014 at 09:11:53AM +0530, Viresh Kumar wrote:
> > > > On 16 December 2014 at 04:39, Russell King - ARM Linux
> > > > <linux@arm.linux.org.uk> wrote:
> > > > > Well, here's a patch which I'm running on top of 3.18 at the moment,
> > > > > which is basically what I described in my email, and I'm running with it
> > > > > and it is without any lockdep complaint.
> > > > 
> > > > We need two separate patches now, one for 3.18 and other one for 3.19-rc.
> > > > 3.19 has see lots of changes in this particular file and so we need to
> > > > change few things here.

Was the patch for latest kernel ever circulated ? Probably that's why
Eduardo never got back to it.
Russell King - ARM Linux Aug. 12, 2015, 7:21 a.m. UTC | #7
On Wed, Aug 12, 2015 at 10:46:59AM +0530, Viresh Kumar wrote:
> On 11-08-15, 18:03, Russell King - ARM Linux wrote:
> > On Tue, May 19, 2015 at 12:05:55AM +0200, Rafael J. Wysocki wrote:
> > > On Monday, May 18, 2015 07:56:45 PM Russell King - ARM Linux wrote:
> > > > On Tue, Dec 16, 2014 at 09:11:53AM +0530, Viresh Kumar wrote:
> > > > > On 16 December 2014 at 04:39, Russell King - ARM Linux
> > > > > <linux@arm.linux.org.uk> wrote:
> > > > > > Well, here's a patch which I'm running on top of 3.18 at the moment,
> > > > > > which is basically what I described in my email, and I'm running with it
> > > > > > and it is without any lockdep complaint.
> > > > > 
> > > > > We need two separate patches now, one for 3.18 and other one for 3.19-rc.
> > > > > 3.19 has see lots of changes in this particular file and so we need to
> > > > > change few things here.
> 
> Was the patch for latest kernel ever circulated ? Probably that's why
> Eduardo never got back to it.

Maybe not - I often lose track of which patches are in what state -
tracking the state of 250+ patches over many months and in some cases
_years_ is not easy.

That's still no reason for Eduardo to _completely_ ignore this thread,
especially when Rafael explicitly tried to get comment from Eduardo.
Viresh Kumar Aug. 12, 2015, 7:35 a.m. UTC | #8
On 12-08-15, 08:21, Russell King - ARM Linux wrote:
> Maybe not - I often lose track of which patches are in what state -
> tracking the state of 250+ patches over many months and in some cases
> _years_ is not easy.

I agree..

> That's still no reason for Eduardo to _completely_ ignore this thread,
> especially when Rafael explicitly tried to get comment from Eduardo.

Oh, no. I am not trying to be anybody's advocate here. I was just
highlighting the point here, that I remembered.

Even I was quite busy then and couldn't get back to it.

Lets fix this now. I believe you have a patch for this based on latest
kernel? Mind pasting that here for quick review?
Russell King - ARM Linux Aug. 12, 2015, 7:49 a.m. UTC | #9
On Wed, Aug 12, 2015 at 01:05:30PM +0530, Viresh Kumar wrote:
> On 12-08-15, 08:21, Russell King - ARM Linux wrote:
> > Maybe not - I often lose track of which patches are in what state -
> > tracking the state of 250+ patches over many months and in some cases
> > _years_ is not easy.
> 
> I agree..
> 
> > That's still no reason for Eduardo to _completely_ ignore this thread,
> > especially when Rafael explicitly tried to get comment from Eduardo.
> 
> Oh, no. I am not trying to be anybody's advocate here. I was just
> highlighting the point here, that I remembered.
> 
> Even I was quite busy then and couldn't get back to it.
> 
> Lets fix this now. I believe you have a patch for this based on latest
> kernel? Mind pasting that here for quick review?

I don't think it's changed much since the original, but I've posted
what's in linux-next as of today, and it doesn't conflict with anything.
So there's no excuse about getting it merged.

The problem will be back-porting it to stable kernels, because I think
it's had to be updated at least a couple of times.  I don't have the old
versions anymore, so I'm just going to say "not my problem" - sorry.
Rafael J. Wysocki Aug. 13, 2015, 1:20 a.m. UTC | #10
On Tuesday, August 11, 2015 06:03:57 PM Russell King - ARM Linux wrote:
> On Tue, May 19, 2015 at 12:05:55AM +0200, Rafael J. Wysocki wrote:
> > On Monday, May 18, 2015 07:56:45 PM Russell King - ARM Linux wrote:
> > > On Tue, Dec 16, 2014 at 09:11:53AM +0530, Viresh Kumar wrote:
> > > > On 16 December 2014 at 04:39, Russell King - ARM Linux
> > > > <linux@arm.linux.org.uk> wrote:
> > > > > Well, here's a patch which I'm running on top of 3.18 at the moment,
> > > > > which is basically what I described in my email, and I'm running with it
> > > > > and it is without any lockdep complaint.
> > > > 
> > > > We need two separate patches now, one for 3.18 and other one for 3.19-rc.
> > > > 3.19 has see lots of changes in this particular file and so we need to
> > > > change few things here.
> > > 
> > > What happened with this?  I'm still carrying the patch.
> > 
> > This should go in through the thermal tree.  Eduardo?
> 
> Having waited a long time for any kind of response from Eduardo, I've
> given up.  My conclusion is that Eduardo isn't interested in this.
> 
> I've re-checked, and the AB-BA deadlock is still there in the latest
> code.  So, I've taken it upon myself to throw this into my for-next
> branch to force the issue - not something I _want_ to do, but I'm doing
> this out of frustration.  It's clear to me that "playing nice" by email
> does _not_ work with some people.
> 
> I'm rather hoping that Stephen reports a merge conflict with linux-next
> this evening to highlight this situation.  I've added additional commentry
> to the commit message on the patch giving the reason why I've done this,
> and the relevant message IDs showing the past history.
> 
> I've not decided whether I'm going to ask Linus to take this patch
> directly or not, that rather depends whether there's any co-operation
> from Eduardo on this.  I'd rather Eduardo took the patch.
> 
> The patch I have has had to be updated again for changes to the driver,
> but I really don't see the point of re-posting it just for it to be
> ignored yet again.
> 
> I'm really disappointed by this dysfunctional state of affairs, and
> that what should be an urgent fix for an observable problem is still
> not merged some nine months after it was first identified.

I guess it might help if you sent the updated patch in a new thread.
Russell King - ARM Linux Aug. 13, 2015, 8:17 a.m. UTC | #11
On Thu, Aug 13, 2015 at 03:20:35AM +0200, Rafael J. Wysocki wrote:
> On Tuesday, August 11, 2015 06:03:57 PM Russell King - ARM Linux wrote:
> > On Tue, May 19, 2015 at 12:05:55AM +0200, Rafael J. Wysocki wrote:
> > > On Monday, May 18, 2015 07:56:45 PM Russell King - ARM Linux wrote:
> > > > On Tue, Dec 16, 2014 at 09:11:53AM +0530, Viresh Kumar wrote:
> > > > > On 16 December 2014 at 04:39, Russell King - ARM Linux
> > > > > <linux@arm.linux.org.uk> wrote:
> > > > > > Well, here's a patch which I'm running on top of 3.18 at the moment,
> > > > > > which is basically what I described in my email, and I'm running with it
> > > > > > and it is without any lockdep complaint.
> > > > > 
> > > > > We need two separate patches now, one for 3.18 and other one for 3.19-rc.
> > > > > 3.19 has see lots of changes in this particular file and so we need to
> > > > > change few things here.
> > > > 
> > > > What happened with this?  I'm still carrying the patch.
> > > 
> > > This should go in through the thermal tree.  Eduardo?
> > 
> > Having waited a long time for any kind of response from Eduardo, I've
> > given up.  My conclusion is that Eduardo isn't interested in this.
> > 
> > I've re-checked, and the AB-BA deadlock is still there in the latest
> > code.  So, I've taken it upon myself to throw this into my for-next
> > branch to force the issue - not something I _want_ to do, but I'm doing
> > this out of frustration.  It's clear to me that "playing nice" by email
> > does _not_ work with some people.
> > 
> > I'm rather hoping that Stephen reports a merge conflict with linux-next
> > this evening to highlight this situation.  I've added additional commentry
> > to the commit message on the patch giving the reason why I've done this,
> > and the relevant message IDs showing the past history.
> > 
> > I've not decided whether I'm going to ask Linus to take this patch
> > directly or not, that rather depends whether there's any co-operation
> > from Eduardo on this.  I'd rather Eduardo took the patch.
> > 
> > The patch I have has had to be updated again for changes to the driver,
> > but I really don't see the point of re-posting it just for it to be
> > ignored yet again.
> > 
> > I'm really disappointed by this dysfunctional state of affairs, and
> > that what should be an urgent fix for an observable problem is still
> > not merged some nine months after it was first identified.
> 
> I guess it might help if you sent the updated patch in a new thread.

That I doubt.  Eduardo has not bothered to reply at _any_ time.  I
have to question whether there is anyone even reading that email
address, or whether it's a redirect to /dev/null.  All the evidence I
have right now is that this Eduardo is a ficticous character.

I would have at least expected some complaints when I said "I've put
it in linux-next" but... absolutely nothing.

So... my only conclusion is that you're all pulling my leg that there
_is_ this "Eduardo" maintainer who's supposed to be taking patches for
this stuff.

As I've said, I'm not bothering with this anymore, it's just far too
much effort to play these stupid games.  The deadlock can stay for all
I care.

Sorry.
Viresh Kumar Aug. 13, 2015, 8:22 a.m. UTC | #12
On 13-08-15, 09:17, Russell King - ARM Linux wrote:
> That I doubt.  Eduardo has not bothered to reply at _any_ time.  I
> have to question whether there is anyone even reading that email
> address, or whether it's a redirect to /dev/null.  All the evidence I
> have right now is that this Eduardo is a ficticous character.
> 
> I would have at least expected some complaints when I said "I've put
> it in linux-next" but... absolutely nothing.
> 
> So... my only conclusion is that you're all pulling my leg that there
> _is_ this "Eduardo" maintainer who's supposed to be taking patches for
> this stuff.

Haha, If that's the case then we are all in the same boat.
Who is Eduardo ? A bot ? :)

Anyway, AFAIK, he wasn't around for sometime and came back last week
only. Even I have expected him to reply to yesterday's discussions.

> As I've said, I'm not bothering with this anymore, it's just far too
> much effort to play these stupid games.  The deadlock can stay for all
> I care.

And finally I took the charge to carry it now. :)
Rafael J. Wysocki Aug. 18, 2015, 1:32 a.m. UTC | #13
On Thursday, August 13, 2015 09:17:44 AM Russell King - ARM Linux wrote:
> On Thu, Aug 13, 2015 at 03:20:35AM +0200, Rafael J. Wysocki wrote:
> > On Tuesday, August 11, 2015 06:03:57 PM Russell King - ARM Linux wrote:
> > > On Tue, May 19, 2015 at 12:05:55AM +0200, Rafael J. Wysocki wrote:
> > > > On Monday, May 18, 2015 07:56:45 PM Russell King - ARM Linux wrote:
> > > > > On Tue, Dec 16, 2014 at 09:11:53AM +0530, Viresh Kumar wrote:
> > > > > > On 16 December 2014 at 04:39, Russell King - ARM Linux
> > > > > > <linux@arm.linux.org.uk> wrote:
> > > > > > > Well, here's a patch which I'm running on top of 3.18 at the moment,
> > > > > > > which is basically what I described in my email, and I'm running with it
> > > > > > > and it is without any lockdep complaint.
> > > > > > 
> > > > > > We need two separate patches now, one for 3.18 and other one for 3.19-rc.
> > > > > > 3.19 has see lots of changes in this particular file and so we need to
> > > > > > change few things here.
> > > > > 
> > > > > What happened with this?  I'm still carrying the patch.
> > > > 
> > > > This should go in through the thermal tree.  Eduardo?
> > > 
> > > Having waited a long time for any kind of response from Eduardo, I've
> > > given up.  My conclusion is that Eduardo isn't interested in this.
> > > 
> > > I've re-checked, and the AB-BA deadlock is still there in the latest
> > > code.  So, I've taken it upon myself to throw this into my for-next
> > > branch to force the issue - not something I _want_ to do, but I'm doing
> > > this out of frustration.  It's clear to me that "playing nice" by email
> > > does _not_ work with some people.
> > > 
> > > I'm rather hoping that Stephen reports a merge conflict with linux-next
> > > this evening to highlight this situation.  I've added additional commentry
> > > to the commit message on the patch giving the reason why I've done this,
> > > and the relevant message IDs showing the past history.
> > > 
> > > I've not decided whether I'm going to ask Linus to take this patch
> > > directly or not, that rather depends whether there's any co-operation
> > > from Eduardo on this.  I'd rather Eduardo took the patch.
> > > 
> > > The patch I have has had to be updated again for changes to the driver,
> > > but I really don't see the point of re-posting it just for it to be
> > > ignored yet again.
> > > 
> > > I'm really disappointed by this dysfunctional state of affairs, and
> > > that what should be an urgent fix for an observable problem is still
> > > not merged some nine months after it was first identified.
> > 
> > I guess it might help if you sent the updated patch in a new thread.
> 
> That I doubt.  Eduardo has not bothered to reply at _any_ time.  I
> have to question whether there is anyone even reading that email
> address, or whether it's a redirect to /dev/null.  All the evidence I
> have right now is that this Eduardo is a ficticous character.
> 
> I would have at least expected some complaints when I said "I've put
> it in linux-next" but... absolutely nothing.
> 
> So... my only conclusion is that you're all pulling my leg that there
> _is_ this "Eduardo" maintainer who's supposed to be taking patches for
> this stuff.

Well, that's the situation as per MAINTAINERS today.  You seem to be concerned
that it may not reflect the reality, but in that case I can only recommend
sending a patch against MAINTAINERS to remove Eduardo from there.

> As I've said, I'm not bothering with this anymore, it's just far too
> much effort to play these stupid games.

I'm not sure what you mean by "these stupid games" here.

> The deadlock can stay for all I care.

Fair enough.

Thanks,
Rafael
Eduardo Valentin Aug. 18, 2015, 9:30 a.m. UTC | #14
Hi there,

On Mon, Aug 17, 2015 at 6:32 PM, Rafael J. Wysocki <rjw@rjwysocki.net> wrote:

<cut>

>
> Well, that's the situation as per MAINTAINERS today.  You seem to be concerned
> that it may not reflect the reality, but in that case I can only recommend
> sending a patch against MAINTAINERS to remove Eduardo from there.
>
>> As I've said, I'm not bothering with this anymore, it's just far too
>> much effort to play these stupid games.
>
> I'm not sure what you mean by "these stupid games" here.
>
>> The deadlock can stay for all I care.
>
> Fair enough.

I applied the patch. And it has generated the conflict in linux-next, as was mentioned in this thread. I already sent the pull request.

Yes, I missed the thread. Apologize for this and for the inconvenience I have caused. The deadlock will be taken care using Russel's patch.

Thanks Viresh for notifying me.


>
> Thanks,
> Rafael
>

BR,
diff mbox

Patch

diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
index ad09e51ffae4..9e42c6f30785 100644
--- a/drivers/thermal/cpu_cooling.c
+++ b/drivers/thermal/cpu_cooling.c
@@ -57,6 +57,7 @@  static DEFINE_MUTEX(cooling_cpufreq_lock);
 
 static unsigned int cpufreq_dev_count;
 
+static DEFINE_MUTEX(cooling_list_lock);
 static LIST_HEAD(cpufreq_dev_list);
 
 /**
@@ -317,7 +318,7 @@  static int cpufreq_thermal_notifier(struct notifier_block *nb,
 	if (event != CPUFREQ_ADJUST)
 		return 0;
 
-	mutex_lock(&cooling_cpufreq_lock);
+	mutex_lock(&cooling_list_lock);
 	list_for_each_entry(cpufreq_dev, &cpufreq_dev_list, node) {
 		if (!cpumask_test_cpu(policy->cpu,
 					&cpufreq_dev->allowed_cpus))
@@ -333,7 +334,7 @@  static int cpufreq_thermal_notifier(struct notifier_block *nb,
 		if (policy->max != max_freq)
 			cpufreq_verify_within_limits(policy, 0, max_freq);
 	}
-	mutex_unlock(&cooling_cpufreq_lock);
+	mutex_unlock(&cooling_list_lock);
 
 	return 0;
 }
@@ -482,6 +483,11 @@  __cpufreq_cooling_register(struct device_node *np,
 	}
 	cpufreq_dev->cool_dev = cool_dev;
 	cpufreq_dev->cpufreq_state = 0;
+
+	mutex_lock(&cooling_list_lock);
+	list_add(&cpufreq_dev->node, &cpufreq_dev_list);
+	mutex_unlock(&cooling_list_lock);
+
 	mutex_lock(&cooling_cpufreq_lock);
 
 	/* Register the notifier for first cpufreq cooling device */
@@ -489,7 +495,6 @@  __cpufreq_cooling_register(struct device_node *np,
 		cpufreq_register_notifier(&thermal_cpufreq_notifier_block,
 					  CPUFREQ_POLICY_NOTIFIER);
 	cpufreq_dev_count++;
-	list_add(&cpufreq_dev->node, &cpufreq_dev_list);
 
 	mutex_unlock(&cooling_cpufreq_lock);
 
@@ -553,7 +558,6 @@  void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 
 	cpufreq_dev = cdev->devdata;
 	mutex_lock(&cooling_cpufreq_lock);
-	list_del(&cpufreq_dev->node);
 	cpufreq_dev_count--;
 
 	/* Unregister the notifier for the last cpufreq cooling device */
@@ -562,6 +566,10 @@  void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 					    CPUFREQ_POLICY_NOTIFIER);
 	mutex_unlock(&cooling_cpufreq_lock);
 
+	mutex_lock(&cooling_list_lock);
+	list_del(&cpufreq_dev->node);
+	mutex_unlock(&cooling_list_lock);
+
 	thermal_cooling_device_unregister(cpufreq_dev->cool_dev);
 	release_idr(&cpufreq_idr, cpufreq_dev->id);
 	kfree(cpufreq_dev);