diff mbox

[v5,09/13] PCI: Introduce /sys/bus/pci/devices/.../remove

Message ID 20090320205636.12275.1825.stgit@bob.kio
State Accepted, archived
Headers show

Commit Message

Alexander Chiang March 20, 2009, 8:56 p.m. UTC
This patch adds an attribute named "remove" to a PCI device's sysfs
directory.  Writing a non-zero value to this attribute will remove the PCI
device and any children of it.

Trent Piepho wrote the original implementation and documentation.

Thanks to Vegard Nossum for testing under kmemcheck and finding locking
issues with the sysfs interface.

Cc: Trent Piepho <xyzzy@speakeasy.org>
Signed-off-by: Alex Chiang <achiang@hp.com>
---

 Documentation/ABI/testing/sysfs-bus-pci |    8 +++++++
 Documentation/filesystems/sysfs-pci.txt |   10 +++++++++
 drivers/pci/pci-sysfs.c                 |   36 +++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 0 deletions(-)


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Kenji Kaneshige March 23, 2009, 9:01 a.m. UTC | #1
Alex Chiang wrote:
> This patch adds an attribute named "remove" to a PCI device's sysfs
> directory.  Writing a non-zero value to this attribute will remove the PCI
> device and any children of it.
> 
> Trent Piepho wrote the original implementation and documentation.
> 
> Thanks to Vegard Nossum for testing under kmemcheck and finding locking
> issues with the sysfs interface.
> 
> Cc: Trent Piepho <xyzzy@speakeasy.org>
> Signed-off-by: Alex Chiang <achiang@hp.com>
> ---
> 
>  Documentation/ABI/testing/sysfs-bus-pci |    8 +++++++
>  Documentation/filesystems/sysfs-pci.txt |   10 +++++++++
>  drivers/pci/pci-sysfs.c                 |   36 +++++++++++++++++++++++++++++++
>  3 files changed, 54 insertions(+), 0 deletions(-)
> 
> diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
> index 1697a16..1350fa6 100644
> --- a/Documentation/ABI/testing/sysfs-bus-pci
> +++ b/Documentation/ABI/testing/sysfs-bus-pci
> @@ -66,6 +66,14 @@ Description:
>  		re-discover previously removed devices.
>  		Depends on CONFIG_HOTPLUG.
>  
> +What:		/sys/bus/pci/devices/.../remove
> +Date:		January 2009
> +Contact:	Linux PCI developers <linux-pci@vger.kernel.org>
> +Description:
> +		Writing a non-zero value to this attribute will
> +		hot-remove the PCI device and any of its children.
> +		Depends on CONFIG_HOTPLUG.
> +
>  What:		/sys/bus/pci/devices/.../vpd
>  Date:		February 2008
>  Contact:	Ben Hutchings <bhutchings@solarflare.com>
> diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt
> index 9f8740c..26e4b8b 100644
> --- a/Documentation/filesystems/sysfs-pci.txt
> +++ b/Documentation/filesystems/sysfs-pci.txt
> @@ -12,6 +12,7 @@ that support it.  For example, a given bus might look like this:
>       |   |-- enable
>       |   |-- irq
>       |   |-- local_cpus
> +     |   |-- remove
>       |   |-- resource
>       |   |-- resource0
>       |   |-- resource1
> @@ -36,6 +37,7 @@ files, each with their own function.
>         enable	           Whether the device is enabled (ascii, rw)
>         irq		   IRQ number (ascii, ro)
>         local_cpus	   nearby CPU mask (cpumask, ro)
> +       remove		   remove device from kernel's list (ascii, wo)
>         resource		   PCI resource host addresses (ascii, ro)
>         resource0..N	   PCI resource N, if present (binary, mmap)
>         resource0_wc..N_wc  PCI WC map resource N, if prefetchable (binary, mmap)
> @@ -46,6 +48,7 @@ files, each with their own function.
>  
>    ro - read only file
>    rw - file is readable and writable
> +  wo - write only file
>    mmap - file is mmapable
>    ascii - file contains ascii text
>    binary - file contains binary data
> @@ -73,6 +76,13 @@ that the device must be enabled for a rom read to return data succesfully.
>  In the event a driver is not bound to the device, it can be enabled using the
>  'enable' file, documented above.
>  
> +The 'remove' file is used to remove the PCI device, by writing a non-zero
> +integer to the file.  This does not involve any kind of hot-plug functionality,
> +e.g. powering off the device.  The device is removed from the kernel's list of
> +PCI devices, the sysfs directory for it is removed, and the device will be
> +removed from any drivers attached to it. Removal of PCI root buses is
> +disallowed.
> +
>  Accessing legacy resources through sysfs
>  ----------------------------------------
>  
> diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
> index be7468a..e16990e 100644
> --- a/drivers/pci/pci-sysfs.c
> +++ b/drivers/pci/pci-sysfs.c
> @@ -243,6 +243,39 @@ struct bus_attribute pci_bus_attrs[] = {
>  	__ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, bus_rescan_store),
>  	__ATTR_NULL
>  };
> +
> +static void remove_callback(struct device *dev)
> +{
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +
> +	mutex_lock(&pci_remove_rescan_mutex);
> +	pci_remove_bus_device(pdev);
> +	mutex_unlock(&pci_remove_rescan_mutex);
> +}
> +
> +static ssize_t
> +remove_store(struct device *dev, struct device_attribute *dummy,
> +	     const char *buf, size_t count)
> +{
> +	int ret = 0;
> +	unsigned long val;
> +	struct pci_dev *pdev = to_pci_dev(dev);
> +
> +	if (strict_strtoul(buf, 0, &val) < 0)
> +		return -EINVAL;
> +
> +	if (pci_is_root_bus(pdev->bus))
> +		return -EBUSY;
> +
> +	/* An attribute cannot be unregistered by one of its own methods,
> +	 * so we have to use this roundabout approach.
> +	 */
> +	if (val)
> +		ret = device_schedule_callback(dev, remove_callback);
> +	if (ret)
> +		count = ret;
> +	return count;
> +}
>  #endif
>  

I still have the following kernel error messages in testing with your
latest set of patches (Jesse's linux-next). The test case is removing
e1000e device or its parent bridge by "echo 1 > /sys/bus/pci/devices/
.../remove".

[  537.379995] =============================================
[  537.380124] [ INFO: possible recursive locking detected ]
[  537.380128] 2.6.29-rc8-kk #1
[  537.380128] ---------------------------------------------
[  537.380128] events/4/56 is trying to acquire lock:
[  537.380128]  (events){--..}, at: [<ffffffff80257fc0>] flush_workqueue+0x0/0xa0
[  537.380128]
[  537.380128] but task is already holding lock:
[  537.380128]  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
[  537.380128]
[  537.380128] other info that might help us debug this:
[  537.380128] 3 locks held by events/4/56:
[  537.380128]  #0:  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
[  537.380128]  #1:  (&ss->work){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
[  537.380128]  #2:  (pci_remove_rescan_mutex){--..}, at: [<ffffffff803c10d1>] remove_callback+0x21/0x40
[  537.380128]
[  537.380128] stack backtrace:
[  537.380128] Pid: 56, comm: events/4 Not tainted 2.6.29-rc8-kk #1
[  537.380128] Call Trace:
[  537.380128]  [<ffffffff8026dfcd>] validate_chain+0xb7d/0x1260
[  537.380128]  [<ffffffff8026eade>] __lock_acquire+0x42e/0xa40
[  537.380128]  [<ffffffff8026f148>] lock_acquire+0x58/0x80
[  537.380128]  [<ffffffff80257fc0>] ? flush_workqueue+0x0/0xa0
[  537.380128]  [<ffffffff8025800d>] flush_workqueue+0x4d/0xa0
[  537.380128]  [<ffffffff80257fc0>] ? flush_workqueue+0x0/0xa0
[  537.383380]  [<ffffffff80258070>] flush_scheduled_work+0x10/0x20
[  537.383380]  [<ffffffffa0144065>] e1000_remove+0x55/0xfe [e1000e]
[  537.383380]  [<ffffffff8033ee30>] ? sysfs_schedule_callback_work+0x0/0x50
[  537.383380]  [<ffffffff803bfeb2>] pci_device_remove+0x32/0x70
[  537.383380]  [<ffffffff80441da9>] __device_release_driver+0x59/0x90
[  537.383380]  [<ffffffff80441edb>] device_release_driver+0x2b/0x40
[  537.383380]  [<ffffffff804419d6>] bus_remove_device+0xa6/0x120
[  537.384382]  [<ffffffff8043e46b>] device_del+0x12b/0x190
[  537.384382]  [<ffffffff8043e4f6>] device_unregister+0x26/0x70
[  537.384382]  [<ffffffff803ba969>] pci_stop_dev+0x49/0x60
[  537.384382]  [<ffffffff803baab0>] pci_remove_bus_device+0x40/0xc0
[  537.384382]  [<ffffffff803c10d9>] remove_callback+0x29/0x40
[  537.384382]  [<ffffffff8033ee4f>] sysfs_schedule_callback_work+0x1f/0x50
[  537.384382]  [<ffffffff8025769a>] run_workqueue+0x15a/0x230
[  537.384382]  [<ffffffff80257648>] ? run_workqueue+0x108/0x230
[  537.384382]  [<ffffffff8025846f>] worker_thread+0x9f/0x100
[  537.384382]  [<ffffffff8025bce0>] ? autoremove_wake_function+0x0/0x40
[  537.384382]  [<ffffffff802583d0>] ? worker_thread+0x0/0x100
[  537.384382]  [<ffffffff8025b89d>] kthread+0x4d/0x80
[  537.384382]  [<ffffffff8020d4ba>] child_rip+0xa/0x20
[  537.386380]  [<ffffffff8020cebc>] ? restore_args+0x0/0x30
[  537.386380]  [<ffffffff8025b850>] ? kthread+0x0/0x80
[  537.386380]  [<ffffffff8020d4b0>] ? child_rip+0x0/0x20

I think the cause of this error message is flush_workqueue() from the
work of keventd. When removing device using "/sys/bus/pci/devices/.../
remove", pci_remove_bus_device() is executed by the keventd's work
through device_schedule_callback(), and it invokes e1000e's remove
callback. And then, e1000e's remove callback invokes flush_workqueue().
Actually, the kernel error messages are not displayed when I change the
e1000e driver to not call flush_workqueue(). In my understanding, calling
flush_workqueue() from within a work item must be avoided because it can
cause a deadlock. Please note that this is not a problem with the e1000e driver.
Drivers can use flush_workqueue(), of course.

BTW, I also have another worry about executing pci_remove_bus_device()
from keventd's work. pci_remove_bus_device() will take a long time,
especially when a bridge device near the root bus is specified.
A long delay in keventd's work will have bad effects on other work items
on the workqueue.

Thanks,
Kenji Kaneshige



>  struct device_attribute pci_dev_attrs[] = {
> @@ -263,6 +296,9 @@ struct device_attribute pci_dev_attrs[] = {
>  	__ATTR(broken_parity_status,(S_IRUGO|S_IWUSR),
>  		broken_parity_status_show,broken_parity_status_store),
>  	__ATTR(msi_bus, 0644, msi_bus_show, msi_bus_store),
> +#ifdef CONFIG_HOTPLUG
> +	__ATTR(remove, (S_IWUSR|S_IWGRP), NULL, remove_store),
> +#endif
>  	__ATTR_NULL,
>  };
>  
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Chiang March 24, 2009, 3:23 a.m. UTC | #2
Hi Ingo,

* Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>:
> Alex Chiang wrote:
>> This patch adds an attribute named "remove" to a PCI device's sysfs
>> directory.  Writing a non-zero value to this attribute will remove the PCI
>> device and any children of it.
>>
>> Trent Piepho wrote the original implementation and documentation.
>>
>> Thanks to Vegard Nossum for testing under kmemcheck and finding locking
>> issues with the sysfs interface.
>>
>> Cc: Trent Piepho <xyzzy@speakeasy.org>
>> Signed-off-by: Alex Chiang <achiang@hp.com>

[snip part of patch]

>>  diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
>> index be7468a..e16990e 100644
>> --- a/drivers/pci/pci-sysfs.c
>> +++ b/drivers/pci/pci-sysfs.c
>> @@ -243,6 +243,39 @@ struct bus_attribute pci_bus_attrs[] = {
>>  	__ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, bus_rescan_store),
>>  	__ATTR_NULL
>>  };
>> +
>> +static void remove_callback(struct device *dev)
>> +{
>> +	struct pci_dev *pdev = to_pci_dev(dev);
>> +
>> +	mutex_lock(&pci_remove_rescan_mutex);
>> +	pci_remove_bus_device(pdev);
>> +	mutex_unlock(&pci_remove_rescan_mutex);
>> +}
>> +
>> +static ssize_t
>> +remove_store(struct device *dev, struct device_attribute *dummy,
>> +	     const char *buf, size_t count)
>> +{
>> +	int ret = 0;
>> +	unsigned long val;
>> +	struct pci_dev *pdev = to_pci_dev(dev);
>> +
>> +	if (strict_strtoul(buf, 0, &val) < 0)
>> +		return -EINVAL;
>> +
>> +	if (pci_is_root_bus(pdev->bus))
>> +		return -EBUSY;
>> +
>> +	/* An attribute cannot be unregistered by one of its own methods,
>> +	 * so we have to use this roundabout approach.
>> +	 */
>> +	if (val)
>> +		ret = device_schedule_callback(dev, remove_callback);
>> +	if (ret)
>> +		count = ret;
>> +	return count;
>> +}
>>  #endif
>>  

Kenji Kaneshige reported the below lockdep problem when testing
my patch on one of his machines.

> I still have the following kernel error messages in testing with your
> latest set of patches (Jesse's linux-next). The test case is removing
> e1000e device or its parent bridge by "echo 1 > /sys/bus/pci/devices/
> .../remove".
>
> [  537.379995] =============================================
> [  537.380124] [ INFO: possible recursive locking detected ]
> [  537.380128] 2.6.29-rc8-kk #1
> [  537.380128] ---------------------------------------------
> [  537.380128] events/4/56 is trying to acquire lock:
> [  537.380128]  (events){--..}, at: [<ffffffff80257fc0>] flush_workqueue+0x0/0xa0
> [  537.380128]
> [  537.380128] but task is already holding lock:
> [  537.380128]  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> [  537.380128]
> [  537.380128] other info that might help us debug this:
> [  537.380128] 3 locks held by events/4/56:
> [  537.380128]  #0:  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> [  537.380128]  #1:  (&ss->work){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> [  537.380128]  #2:  (pci_remove_rescan_mutex){--..}, at: [<ffffffff803c10d1>] remove_callback+0x21/0x40
> [  537.380128]
> [  537.380128] stack backtrace:
> [  537.380128] Pid: 56, comm: events/4 Not tainted 2.6.29-rc8-kk #1
> [  537.380128] Call Trace:
> [  537.380128]  [<ffffffff8026dfcd>] validate_chain+0xb7d/0x1260
> [  537.380128]  [<ffffffff8026eade>] __lock_acquire+0x42e/0xa40
> [  537.380128]  [<ffffffff8026f148>] lock_acquire+0x58/0x80
> [  537.380128]  [<ffffffff80257fc0>] ? flush_workqueue+0x0/0xa0
> [  537.380128]  [<ffffffff8025800d>] flush_workqueue+0x4d/0xa0
> [  537.380128]  [<ffffffff80257fc0>] ? flush_workqueue+0x0/0xa0
> [  537.383380]  [<ffffffff80258070>] flush_scheduled_work+0x10/0x20
> [  537.383380]  [<ffffffffa0144065>] e1000_remove+0x55/0xfe [e1000e]
> [  537.383380]  [<ffffffff8033ee30>] ? sysfs_schedule_callback_work+0x0/0x50
> [  537.383380]  [<ffffffff803bfeb2>] pci_device_remove+0x32/0x70
> [  537.383380]  [<ffffffff80441da9>] __device_release_driver+0x59/0x90
> [  537.383380]  [<ffffffff80441edb>] device_release_driver+0x2b/0x40
> [  537.383380]  [<ffffffff804419d6>] bus_remove_device+0xa6/0x120
> [  537.384382]  [<ffffffff8043e46b>] device_del+0x12b/0x190
> [  537.384382]  [<ffffffff8043e4f6>] device_unregister+0x26/0x70
> [  537.384382]  [<ffffffff803ba969>] pci_stop_dev+0x49/0x60
> [  537.384382]  [<ffffffff803baab0>] pci_remove_bus_device+0x40/0xc0
> [  537.384382]  [<ffffffff803c10d9>] remove_callback+0x29/0x40
> [  537.384382]  [<ffffffff8033ee4f>] sysfs_schedule_callback_work+0x1f/0x50
> [  537.384382]  [<ffffffff8025769a>] run_workqueue+0x15a/0x230
> [  537.384382]  [<ffffffff80257648>] ? run_workqueue+0x108/0x230
> [  537.384382]  [<ffffffff8025846f>] worker_thread+0x9f/0x100
> [  537.384382]  [<ffffffff8025bce0>] ? autoremove_wake_function+0x0/0x40
> [  537.384382]  [<ffffffff802583d0>] ? worker_thread+0x0/0x100
> [  537.384382]  [<ffffffff8025b89d>] kthread+0x4d/0x80
> [  537.384382]  [<ffffffff8020d4ba>] child_rip+0xa/0x20
> [  537.386380]  [<ffffffff8020cebc>] ? restore_args+0x0/0x30
> [  537.386380]  [<ffffffff8025b850>] ? kthread+0x0/0x80
> [  537.386380]  [<ffffffff8020d4b0>] ? child_rip+0x0/0x20
>
> I think the cause of this error message is flush_workqueue()
> from the work of keventd. When removing device using
> "/sys/bus/pci/devices/.../ remove", pci_remove_bus_device() is
> executed by the keventd's work through
> device_schedule_callback(), and it invokes e1000e's remove
> callback. And then, e1000e's remove callback invokes
> flush_workqueue().  Actually, the kernel error messages are not
> displayed when I changed e1000e driver to not call
> flush_workqueue(). In my understanding, flush_workqueue() from
> the work must be avoided because it can cause a deadlock.
> Please note that this is not a problem of e1000e driver.
> Drivers can use flush_workqueue(), of course.

I agree with this analysis; the reason we're seeing this lockdep
warning is because the sysfs attribute scheduled a removal for
itself using device_schedule_callback(). This is necessary
because sysfs attributes can't remove themselves due to other
locking issues.

My question is -- is it a bug to call flush_workqueue during
run_workqueue?

Conceptually, I don't think it should be a bug; it should be a
nop, since run_workqueue _is_ flushing the work queue.

Thoughts?

> BTW, I also have another worry about executing pci_remove_bus_device()
> by the work of keventd. The pci_remove_bus_device() will take a long
> time  especially when the bridge device near the root bus is specified.
> The long delay of keventd's work will have bad effects to other works
> on the workqueue.

The real fix is to fix sysfs so that attributes can remove
themselves directly. I will work with Tejun Heo on getting this
working sooner rather than later. That will avoid the locking
issue you discovered above as well as the concern you point out
about putting long running tasks in the keventd work queue.

Thanks.

/ac

>
> Thanks,
> Kenji Kaneshige
>
>
>
>>  struct device_attribute pci_dev_attrs[] = {
>> @@ -263,6 +296,9 @@ struct device_attribute pci_dev_attrs[] = {
>>  	__ATTR(broken_parity_status,(S_IRUGO|S_IWUSR),
>>  		broken_parity_status_show,broken_parity_status_store),
>>  	__ATTR(msi_bus, 0644, msi_bus_show, msi_bus_store),
>> +#ifdef CONFIG_HOTPLUG
>> +	__ATTR(remove, (S_IWUSR|S_IWGRP), NULL, remove_store),
>> +#endif
>>  	__ATTR_NULL,
>>  };
>>  
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>>
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ingo Molnar March 24, 2009, 9:25 a.m. UTC | #3
( Cc:-ed a few more interested parties - the thread is about 
  workqueue dependency lockdep coverage. )

* Alex Chiang <achiang@hp.com> wrote:

> Hi Ingo,
> 
> * Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>:
> > Alex Chiang wrote:
> >> This patch adds an attribute named "remove" to a PCI device's sysfs
> >> directory.  Writing a non-zero value to this attribute will remove the PCI
> >> device and any children of it.
> >>
> >> Trent Piepho wrote the original implementation and documentation.
> >>
> >> Thanks to Vegard Nossum for testing under kmemcheck and finding locking
> >> issues with the sysfs interface.
> >>
> >> Cc: Trent Piepho <xyzzy@speakeasy.org>
> >> Signed-off-by: Alex Chiang <achiang@hp.com>
> 
> [snip part of patch]
> 
> >>  diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
> >> index be7468a..e16990e 100644
> >> --- a/drivers/pci/pci-sysfs.c
> >> +++ b/drivers/pci/pci-sysfs.c
> >> @@ -243,6 +243,39 @@ struct bus_attribute pci_bus_attrs[] = {
> >>  	__ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, bus_rescan_store),
> >>  	__ATTR_NULL
> >>  };
> >> +
> >> +static void remove_callback(struct device *dev)
> >> +{
> >> +	struct pci_dev *pdev = to_pci_dev(dev);
> >> +
> >> +	mutex_lock(&pci_remove_rescan_mutex);
> >> +	pci_remove_bus_device(pdev);
> >> +	mutex_unlock(&pci_remove_rescan_mutex);
> >> +}
> >> +
> >> +static ssize_t
> >> +remove_store(struct device *dev, struct device_attribute *dummy,
> >> +	     const char *buf, size_t count)
> >> +{
> >> +	int ret = 0;
> >> +	unsigned long val;
> >> +	struct pci_dev *pdev = to_pci_dev(dev);
> >> +
> >> +	if (strict_strtoul(buf, 0, &val) < 0)
> >> +		return -EINVAL;
> >> +
> >> +	if (pci_is_root_bus(pdev->bus))
> >> +		return -EBUSY;
> >> +
> >> +	/* An attribute cannot be unregistered by one of its own methods,
> >> +	 * so we have to use this roundabout approach.
> >> +	 */
> >> +	if (val)
> >> +		ret = device_schedule_callback(dev, remove_callback);
> >> +	if (ret)
> >> +		count = ret;
> >> +	return count;
> >> +}
> >>  #endif
> >>  
> 
> Kenji Kaneshige reported the below lockdep problem when testing
> my patch on one of his machines.
> 
> > I still have the following kernel error messages in testing with your
> > latest set of patches (Jesse's linux-next). The test case is removing
> > e1000e device or its parent bridge by "echo 1 > /sys/bus/pci/devices/
> > .../remove".
> >
> > [  537.379995] =============================================
> > [  537.380124] [ INFO: possible recursive locking detected ]
> > [  537.380128] 2.6.29-rc8-kk #1
> > [  537.380128] ---------------------------------------------
> > [  537.380128] events/4/56 is trying to acquire lock:
> > [  537.380128]  (events){--..}, at: [<ffffffff80257fc0>] flush_workqueue+0x0/0xa0
> > [  537.380128]
> > [  537.380128] but task is already holding lock:
> > [  537.380128]  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> > [  537.380128]
> > [  537.380128] other info that might help us debug this:
> > [  537.380128] 3 locks held by events/4/56:
> > [  537.380128]  #0:  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> > [  537.380128]  #1:  (&ss->work){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> > [  537.380128]  #2:  (pci_remove_rescan_mutex){--..}, at: [<ffffffff803c10d1>] remove_callback+0x21/0x40
> > [  537.380128]
> > [  537.380128] stack backtrace:
> > [  537.380128] Pid: 56, comm: events/4 Not tainted 2.6.29-rc8-kk #1
> > [  537.380128] Call Trace:
> > [  537.380128]  [<ffffffff8026dfcd>] validate_chain+0xb7d/0x1260
> > [  537.380128]  [<ffffffff8026eade>] __lock_acquire+0x42e/0xa40
> > [  537.380128]  [<ffffffff8026f148>] lock_acquire+0x58/0x80
> > [  537.380128]  [<ffffffff80257fc0>] ? flush_workqueue+0x0/0xa0
> > [  537.380128]  [<ffffffff8025800d>] flush_workqueue+0x4d/0xa0
> > [  537.380128]  [<ffffffff80257fc0>] ? flush_workqueue+0x0/0xa0
> > [  537.383380]  [<ffffffff80258070>] flush_scheduled_work+0x10/0x20
> > [  537.383380]  [<ffffffffa0144065>] e1000_remove+0x55/0xfe [e1000e]
> > [  537.383380]  [<ffffffff8033ee30>] ? sysfs_schedule_callback_work+0x0/0x50
> > [  537.383380]  [<ffffffff803bfeb2>] pci_device_remove+0x32/0x70
> > [  537.383380]  [<ffffffff80441da9>] __device_release_driver+0x59/0x90
> > [  537.383380]  [<ffffffff80441edb>] device_release_driver+0x2b/0x40
> > [  537.383380]  [<ffffffff804419d6>] bus_remove_device+0xa6/0x120
> > [  537.384382]  [<ffffffff8043e46b>] device_del+0x12b/0x190
> > [  537.384382]  [<ffffffff8043e4f6>] device_unregister+0x26/0x70
> > [  537.384382]  [<ffffffff803ba969>] pci_stop_dev+0x49/0x60
> > [  537.384382]  [<ffffffff803baab0>] pci_remove_bus_device+0x40/0xc0
> > [  537.384382]  [<ffffffff803c10d9>] remove_callback+0x29/0x40
> > [  537.384382]  [<ffffffff8033ee4f>] sysfs_schedule_callback_work+0x1f/0x50
> > [  537.384382]  [<ffffffff8025769a>] run_workqueue+0x15a/0x230
> > [  537.384382]  [<ffffffff80257648>] ? run_workqueue+0x108/0x230
> > [  537.384382]  [<ffffffff8025846f>] worker_thread+0x9f/0x100
> > [  537.384382]  [<ffffffff8025bce0>] ? autoremove_wake_function+0x0/0x40
> > [  537.384382]  [<ffffffff802583d0>] ? worker_thread+0x0/0x100
> > [  537.384382]  [<ffffffff8025b89d>] kthread+0x4d/0x80
> > [  537.384382]  [<ffffffff8020d4ba>] child_rip+0xa/0x20
> > [  537.386380]  [<ffffffff8020cebc>] ? restore_args+0x0/0x30
> > [  537.386380]  [<ffffffff8025b850>] ? kthread+0x0/0x80
> > [  537.386380]  [<ffffffff8020d4b0>] ? child_rip+0x0/0x20
> >
> > I think the cause of this error message is flush_workqueue()
> > from the work of keventd. When removing device using
> > "/sys/bus/pci/devices/.../ remove", pci_remove_bus_device() is
> > executed by the keventd's work through
> > device_schedule_callback(), and it invokes e1000e's remove
> > callback. And then, e1000e's remove callback invokes
> > flush_workqueue().  Actually, the kernel error messages are not
> > displayed when I changed e1000e driver to not call
> > flush_workqueue(). In my understanding, flush_workqueue() from
> > the work must be avoided because it can cause a deadlock.
> > Please note that this is not a problem of e1000e driver.
> > Drivers can use flush_workqueue(), of course.
> 
> I agree with this analysis; the reason we're seeing this lockdep
> warning is because the sysfs attributed scheduled a removal for
> itself using device_schedule_callback(). This is necessary
> because sysfs attributes can't remove themselves due to other
> locking issues.
> 
> My question is -- is it a bug to call flush_workqueue during 
> run_workqueue?

Yes, it generally is.

> Conceptually, I don't think it should be a bug; it should be a
> nop, since run_workqueue _is_ flushing the work queue.
> 
> Thoughts?

well ... but running a work item holds up further processing of the 
queue - and there lies the deadlock potential. (but ... i have not 
looked deeply, there's always the possibility of a false positive.)

	Ingo
> 
> > BTW, I also have another worry about executing pci_remove_bus_device()
> > by the work of keventd. The pci_remove_bus_device() will take a long
> > time  especially when the bridge device near the root bus is specified.
> > The long delay of keventd's work will have bad effects to other works
> > on the workqueue.
> 
> The real fix is to fix sysfs so that attributes can remove
> themselves directly. I will work with Tejun Heo on getting this
> working sooner rather than later. That will avoid the locking
> issue you discovered above as well as the concern you point out
> about putting long running tasks in the keventd work queue.


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Morton March 24, 2009, 10:46 a.m. UTC | #4
On Tue, 24 Mar 2009 10:25:25 +0100 Ingo Molnar <mingo@elte.hu> wrote:

> 
> ( Cc:-ed a few more interested parties - the thread is about 
>   workqueue dependency lockdep coverage. )
> 
> * Alex Chiang <achiang@hp.com> wrote:
> 
> > Hi Ingo,
> > 
> > * Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>:
> > > Alex Chiang wrote:
> > >> This patch adds an attribute named "remove" to a PCI device's sysfs
> > >> directory.  Writing a non-zero value to this attribute will remove the PCI
> > >> device and any children of it.
> > >>
> > >> Trent Piepho wrote the original implementation and documentation.
> > >>
> > >> Thanks to Vegard Nossum for testing under kmemcheck and finding locking
> > >> issues with the sysfs interface.
> > >>
> > >> Cc: Trent Piepho <xyzzy@speakeasy.org>
> > >> Signed-off-by: Alex Chiang <achiang@hp.com>
> > 
> > [snip part of patch]
> > 
> > >>  diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
> > >> index be7468a..e16990e 100644
> > >> --- a/drivers/pci/pci-sysfs.c
> > >> +++ b/drivers/pci/pci-sysfs.c
> > >> @@ -243,6 +243,39 @@ struct bus_attribute pci_bus_attrs[] = {
> > >>  	__ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, bus_rescan_store),
> > >>  	__ATTR_NULL
> > >>  };
> > >> +
> > >> +static void remove_callback(struct device *dev)
> > >> +{
> > >> +	struct pci_dev *pdev = to_pci_dev(dev);
> > >> +
> > >> +	mutex_lock(&pci_remove_rescan_mutex);
> > >> +	pci_remove_bus_device(pdev);
> > >> +	mutex_unlock(&pci_remove_rescan_mutex);
> > >> +}
> > >> +
> > >> +static ssize_t
> > >> +remove_store(struct device *dev, struct device_attribute *dummy,
> > >> +	     const char *buf, size_t count)
> > >> +{
> > >> +	int ret = 0;
> > >> +	unsigned long val;
> > >> +	struct pci_dev *pdev = to_pci_dev(dev);
> > >> +
> > >> +	if (strict_strtoul(buf, 0, &val) < 0)
> > >> +		return -EINVAL;
> > >> +
> > >> +	if (pci_is_root_bus(pdev->bus))
> > >> +		return -EBUSY;
> > >> +
> > >> +	/* An attribute cannot be unregistered by one of its own methods,
> > >> +	 * so we have to use this roundabout approach.
> > >> +	 */
> > >> +	if (val)
> > >> +		ret = device_schedule_callback(dev, remove_callback);
> > >> +	if (ret)
> > >> +		count = ret;
> > >> +	return count;
> > >> +}
> > >>  #endif
> > >>  
> > 
> > Kenji Kaneshige reported the below lockdep problem when testing
> > my patch on one of his machines.
> > 
> > > I still have the following kernel error messages in testing with your
> > > latest set of patches (Jesse's linux-next). The test case is removing
> > > e1000e device or its parent bridge by "echo 1 > /sys/bus/pci/devices/
> > > .../remove".
> > >
> > > [  537.379995] =============================================
> > > [  537.380124] [ INFO: possible recursive locking detected ]
> > > [  537.380128] 2.6.29-rc8-kk #1
> > > [  537.380128] ---------------------------------------------
> > > [  537.380128] events/4/56 is trying to acquire lock:
> > > [  537.380128]  (events){--..}, at: [<ffffffff80257fc0>] flush_workqueue+0x0/0xa0
> > > [  537.380128]
> > > [  537.380128] but task is already holding lock:
> > > [  537.380128]  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> > > [  537.380128]
> > > [  537.380128] other info that might help us debug this:
> > > [  537.380128] 3 locks held by events/4/56:
> > > [  537.380128]  #0:  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> > > [  537.380128]  #1:  (&ss->work){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> > > [  537.380128]  #2:  (pci_remove_rescan_mutex){--..}, at: [<ffffffff803c10d1>] remove_callback+0x21/0x40
> > > [  537.380128]
> > > [  537.380128] stack backtrace:
> > > [  537.380128] Pid: 56, comm: events/4 Not tainted 2.6.29-rc8-kk #1
> > > [  537.380128] Call Trace:
> > > [  537.380128]  [<ffffffff8026dfcd>] validate_chain+0xb7d/0x1260
> > > [  537.380128]  [<ffffffff8026eade>] __lock_acquire+0x42e/0xa40
> > > [  537.380128]  [<ffffffff8026f148>] lock_acquire+0x58/0x80
> > > [  537.380128]  [<ffffffff80257fc0>] ? flush_workqueue+0x0/0xa0
> > > [  537.380128]  [<ffffffff8025800d>] flush_workqueue+0x4d/0xa0
> > > [  537.380128]  [<ffffffff80257fc0>] ? flush_workqueue+0x0/0xa0
> > > [  537.383380]  [<ffffffff80258070>] flush_scheduled_work+0x10/0x20
> > > [  537.383380]  [<ffffffffa0144065>] e1000_remove+0x55/0xfe [e1000e]
> > > [  537.383380]  [<ffffffff8033ee30>] ? sysfs_schedule_callback_work+0x0/0x50
> > > [  537.383380]  [<ffffffff803bfeb2>] pci_device_remove+0x32/0x70
> > > [  537.383380]  [<ffffffff80441da9>] __device_release_driver+0x59/0x90
> > > [  537.383380]  [<ffffffff80441edb>] device_release_driver+0x2b/0x40
> > > [  537.383380]  [<ffffffff804419d6>] bus_remove_device+0xa6/0x120
> > > [  537.384382]  [<ffffffff8043e46b>] device_del+0x12b/0x190
> > > [  537.384382]  [<ffffffff8043e4f6>] device_unregister+0x26/0x70
> > > [  537.384382]  [<ffffffff803ba969>] pci_stop_dev+0x49/0x60
> > > [  537.384382]  [<ffffffff803baab0>] pci_remove_bus_device+0x40/0xc0
> > > [  537.384382]  [<ffffffff803c10d9>] remove_callback+0x29/0x40
> > > [  537.384382]  [<ffffffff8033ee4f>] sysfs_schedule_callback_work+0x1f/0x50
> > > [  537.384382]  [<ffffffff8025769a>] run_workqueue+0x15a/0x230
> > > [  537.384382]  [<ffffffff80257648>] ? run_workqueue+0x108/0x230
> > > [  537.384382]  [<ffffffff8025846f>] worker_thread+0x9f/0x100
> > > [  537.384382]  [<ffffffff8025bce0>] ? autoremove_wake_function+0x0/0x40
> > > [  537.384382]  [<ffffffff802583d0>] ? worker_thread+0x0/0x100
> > > [  537.384382]  [<ffffffff8025b89d>] kthread+0x4d/0x80
> > > [  537.384382]  [<ffffffff8020d4ba>] child_rip+0xa/0x20
> > > [  537.386380]  [<ffffffff8020cebc>] ? restore_args+0x0/0x30
> > > [  537.386380]  [<ffffffff8025b850>] ? kthread+0x0/0x80
> > > [  537.386380]  [<ffffffff8020d4b0>] ? child_rip+0x0/0x20
> > >
> > > I think the cause of this error message is flush_workqueue()
> > > from the work of keventd. When removing device using
> > > "/sys/bus/pci/devices/.../ remove", pci_remove_bus_device() is
> > > executed by the keventd's work through
> > > device_schedule_callback(), and it invokes e1000e's remove
> > > callback. And then, e1000e's remove callback invokes
> > > flush_workqueue().  Actually, the kernel error messages are not
> > > displayed when I changed e1000e driver to not call
> > > flush_workqueue(). In my understanding, flush_workqueue() from
> > > the work must be avoided because it can cause a deadlock.
> > > Please note that this is not a problem of e1000e driver.
> > > Drivers can use flush_workqueue(), of course.
> > 
> > I agree with this analysis; the reason we're seeing this lockdep
> > warning is because the sysfs attributed scheduled a removal for
> > itself using device_schedule_callback(). This is necessary
> > because sysfs attributes can't remove themselves due to other
> > locking issues.
> > 
> > My question is -- is it a bug to call flush_workqueue during 
> > run_workqueue?
> 
> Yes, it generally is.
> 
> > Conceptually, I don't think it should be a bug; it should be a
> > nop, since run_workqueue _is_ flushing the work queue.
> > 
> > Thoughts?
> 
> well ... but running a work item holds up further processing of the 
> queue - and there lies the deadlock potential. (but ... i have not 
> looked deeply, there's always the possibility of a false positive.)
> 

Thing is, we've always supported keventd-calls-flush_work().  That's what
"morton gets to eat his hat" in run_workqueue() is all about.

Now, -mm's workqueue-avoid-recursion-in-run_workqueue.patch changes all of
that.  And that patch recently triggered a warning due to some games which
USB was playing.  I was told this is because USB is being bad.

But I don't think we've seen a coherent description of what's actually
_wrong_ with the current code.  flush_cpu_workqueue() has been handling
this case for many years with no problems reported as far as I know.

So what has caused this sudden flurry of reports?  Did something change in
lockdep?  What is this

[  537.380128]  (events){--..}, at: [<ffffffff80257fc0>] flush_workqueue+0x0/0xa0
[  537.380128]
[  537.380128] but task is already holding lock:
[  537.380128]  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230

supposed to mean?  "events" isn't a lock - it's the name of a kernel
thread, isn't it?  If this is supposed to be deadlockable then how?

Because I don't immediately see what's wrong with e1000_remove() calling
flush_work().  It's undesirable, and we can perhaps improve it via some
means, but where is the bug?


> > 
> > > BTW, I also have another worry about executing pci_remove_bus_device()
> > > by the work of keventd. The pci_remove_bus_device() will take a long
> > > time  especially when the bridge device near the root bus is specified.
> > > The long delay of keventd's work will have bad effects to other works
> > > on the workqueue.
> > 
> > The real fix is to fix sysfs so that attributes can remove
> > themselves directly. I will work with Tejun Heo on getting this
> > working sooner rather than later. That will avoid the locking
> > issue you discovered above as well as the concern you point out
> > about putting long running tasks in the keventd work queue.
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra March 24, 2009, 11:17 a.m. UTC | #5
On Tue, 2009-03-24 at 03:46 -0700, Andrew Morton wrote:
> 
> Thing is, we've always supported kevetnd-calls-flush_work().  That's what
> "morton gets to eat his hat" in run_workqueue() is all about.

Supported as in not complained about it, but it's always presented a
deadlock scenario.

> Now, -mm's workqueue-avoid-recursion-in-run_workqueue.patch changes all of
> that.

See the discussions around that patch, Lai Jiangshan discovered that it
had more deadlock potential than we even suspected.

To quote:

---
On 02/06, Lai Jiangshan wrote:
>
> Oleg Nesterov wrote:
> > On 02/05, Lai Jiangshan wrote:
> >> DEADLOCK EXAMPLE for explain my above option:
> >>
> >> (work_func0() and work_func1() are work callback, and they
> >> calls flush_workqueue())
> >>
> >> CPU#0                                      CPU#1
> >> run_workqueue()                         run_workqueue()
> >>   work_func0()                            work_func1()
> >>     flush_workqueue()                       flush_workqueue()
> >>       flush_cpu_workqueue(0)                  .
> >>       flush_cpu_workqueue(cpu#1)              flush_cpu_workqueue(cpu#0)
> >>         waiting work_func1() in cpu#1           waiting work_func0 in cpu#0
> >>
> >> DEADLOCK!
> >
> > I am not sure. Note that when work_func0() calls run_workqueue(),
> > it will clear cwq->current_work, so another flush_ on CPU#1 will
> > not wait for work_func0, no?
>
> cwq->current_work is changed only when
> !list_empty(&cwq->worklist)
> in run_workqueue().
>
> so cwq->current_work may not be changed.

Ah, indeed.

Thanks for correcting me!
---

>   And that patch recently triggered a warning due to some games which
> USB was playing.  I was told this is because USB is being bad.
> 
> But I don't think we've seen a coherent description of what's actually
> _wrong_ with the current code.  flush_cpu_workqueue() has been handling
> this case for many years with no problems reported as far as I know.

Might be sheer luck, but afaik we did have some actual deadlocks due to
workqueue flushing -- a particular one I can remember was cpu-hotplug vs
cpufreq.

> So what has caused this sudden flurry of reports?  Did something change in
> lockdep?  What is this
> 
> [  537.380128]  (events){--..}, at: [<ffffffff80257fc0>] flush_workqueue+0x0/0xa0
> [  537.380128]
> [  537.380128] but task is already holding lock:
> [  537.380128]  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> 
> supposed to mean?  "events" isn't a lock - it's the name of a kernel
> thread, isn't it?

No, workqueue lockdep support has been in there for a while now. /me
pokes at git for a bit..

4e6045f134784f4b158b3c0f7a282b04bd816887 -- Oct 2007, ca. 2.6.24-rc1

What it does is give the workqueue and each worklet a lock-object. It
then validates that you only get:

 workqueue
   worklet

nestings, eg. calling flush_workqueue() from a worklet will generate

 workqueue    <-.
   worklet      |
     workqueue -'

recursion, IOW the above splat.

Another thing it does is connect the lockchains of workqueue callers
with those of the worklet. eg.

---
    code path 1:
      my_function() -> lock(L1); ...; flush_workqueue(); ...
    
    code path 2:
      run_workqueue() -> my_work() -> ...; lock(L1); ...
    
    you can get a deadlock when my_work() is queued or running
    but my_function() has acquired L1 already.
---

>   If this is supposed to be deadlockable then how?
> 
> Because I don't immediately see what's wrong with e1000_remove() calling
> flush_work().  It's undesirable, and we can perhaps improve it via some
> means, but where is the bug?

I hope the above answers why flushing a workqueue from within that same
workqueue is a very bad thing.

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Johannes Berg March 24, 2009, 12:32 p.m. UTC | #6
On Tue, 2009-03-24 at 03:46 -0700, Andrew Morton wrote:

> But I don't think we've seen a coherent description of what's actually
> _wrong_ with the current code.  flush_cpu_workqueue() has been handling
> this case for many years with no problems reported as far as I know.
> 
> So what has caused this sudden flurry of reports?  Did something change in
> lockdep?  What is this
> 
> [  537.380128]  (events){--..}, at: [<ffffffff80257fc0>] flush_workqueue+0x0/0xa0
> [  537.380128]
> [  537.380128] but task is already holding lock:
> [  537.380128]  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> 
> supposed to mean?  "events" isn't a lock - it's the name of a kernel
> thread, isn't it?  If this is supposed to be deadlockable then how?

events is indeed the schedule_work workqueue thread name -- I just used
that for lack of a better name.

> Because I don't immediately see what's wrong with e1000_remove() calling
> flush_work().  It's undesirable, and we can perhaps improve it via some
> means, but where is the bug?

There is no bug -- it's a false positive in a way. I've pointed this out
in the original thread, see
http://thread.gmane.org/gmane.linux.kernel/550877/focus=550932

johannes
Johannes Berg March 24, 2009, 1:21 p.m. UTC | #7
On Tue, 2009-03-24 at 12:17 +0100, Peter Zijlstra wrote:

> > But I don't think we've seen a coherent description of what's actually
> > _wrong_ with the current code.  flush_cpu_workqueue() has been handling
> > this case for many years with no problems reported as far as I know.
> 
> Might be sheer luck, but afaik we did have some actual deadlocks due to
> workqueue flushing -- a particular one I can remember was cpu-hotplug vs
> cpufreq.

Two cases are relevant here actually -- the recursion which hasn't ever
shown up before, and a number of possible deadlocks of e.g. some people
doing, effectively:

	rtnl_lock();
	flush_scheduled_work();
	rtnl_unlock();

vs. the linkwatch work that can, at this point in time, still be queued,
and needs the rtnl as well.


A little digging through git logs finds more references, e.g. commits
f90d4118bacef87894621a3e8aba853fa0c89abc and
fd781fa25c9e9c6fd1599df060b05e7c4ad724e5.

Some others were fixed that I remember, but apparently without putting
the lockdep report into the commit log.

johannes
Oleg Nesterov March 24, 2009, 4:12 p.m. UTC | #8
On 03/24, Ingo Molnar wrote:
> >
> > * Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>:
> >
> > Kenji Kaneshige reported the below lockdep problem when testing
> > my patch on one of his machines.
> >
> > > I still have the following kernel error messages in testing with your
> > > latest set of patches (Jesse's linux-next). The test case is removing
> > > e1000e device or its parent bridge by "echo 1 > /sys/bus/pci/devices/
> > > .../remove".
> > >
> > > [  537.379995] =============================================
> > > [  537.380124] [ INFO: possible recursive locking detected ]
> > > [  537.380128] 2.6.29-rc8-kk #1
> > > [  537.380128] ---------------------------------------------
> > > [  537.380128] events/4/56 is trying to acquire lock:
> > > [  537.380128]  (events){--..}, at: [<ffffffff80257fc0>] flush_workqueue+0x0/0xa0
> > > [  537.380128]
> > > [  537.380128] but task is already holding lock:
> > > [  537.380128]  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> > > [  537.380128]
> > > [  537.380128] other info that might help us debug this:
> > > [  537.380128] 3 locks held by events/4/56:
> > > [  537.380128]  #0:  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> > > [  537.380128]  #1:  (&ss->work){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> > > [  537.380128]  #2:  (pci_remove_rescan_mutex){--..}, at: [<ffffffff803c10d1>] remove_callback+0x21/0x40
> > > [  537.380128]
> > > [  537.380128] stack backtrace:
> > > [  537.380128] Pid: 56, comm: events/4 Not tainted 2.6.29-rc8-kk #1
> > > [  537.380128] Call Trace:
> > > [  537.380128]  [<ffffffff8026dfcd>] validate_chain+0xb7d/0x1260
> > > [  537.380128]  [<ffffffff8026eade>] __lock_acquire+0x42e/0xa40
> > > [  537.380128]  [<ffffffff8026f148>] lock_acquire+0x58/0x80
> > > [  537.380128]  [<ffffffff80257fc0>] ? flush_workqueue+0x0/0xa0
> > > [  537.380128]  [<ffffffff8025800d>] flush_workqueue+0x4d/0xa0
> > > [  537.380128]  [<ffffffff80257fc0>] ? flush_workqueue+0x0/0xa0
> > > [  537.383380]  [<ffffffff80258070>] flush_scheduled_work+0x10/0x20
> > > [  537.383380]  [<ffffffffa0144065>] e1000_remove+0x55/0xfe [e1000e]
> > > [  537.383380]  [<ffffffff8033ee30>] ? sysfs_schedule_callback_work+0x0/0x50
> > > [  537.383380]  [<ffffffff803bfeb2>] pci_device_remove+0x32/0x70
> > > [  537.383380]  [<ffffffff80441da9>] __device_release_driver+0x59/0x90
> > > [  537.383380]  [<ffffffff80441edb>] device_release_driver+0x2b/0x40
> > > [  537.383380]  [<ffffffff804419d6>] bus_remove_device+0xa6/0x120
> > > [  537.384382]  [<ffffffff8043e46b>] device_del+0x12b/0x190
> > > [  537.384382]  [<ffffffff8043e4f6>] device_unregister+0x26/0x70
> > > [  537.384382]  [<ffffffff803ba969>] pci_stop_dev+0x49/0x60
> > > [  537.384382]  [<ffffffff803baab0>] pci_remove_bus_device+0x40/0xc0
> > > [  537.384382]  [<ffffffff803c10d9>] remove_callback+0x29/0x40
> > > [  537.384382]  [<ffffffff8033ee4f>] sysfs_schedule_callback_work+0x1f/0x50
> > > [  537.384382]  [<ffffffff8025769a>] run_workqueue+0x15a/0x230
> > > [  537.384382]  [<ffffffff80257648>] ? run_workqueue+0x108/0x230
> > > [  537.384382]  [<ffffffff8025846f>] worker_thread+0x9f/0x100
> > > [  537.384382]  [<ffffffff8025bce0>] ? autoremove_wake_function+0x0/0x40
> > > [  537.384382]  [<ffffffff802583d0>] ? worker_thread+0x0/0x100
> > > [  537.384382]  [<ffffffff8025b89d>] kthread+0x4d/0x80
> > > [  537.384382]  [<ffffffff8020d4ba>] child_rip+0xa/0x20
> > > [  537.386380]  [<ffffffff8020cebc>] ? restore_args+0x0/0x30
> > > [  537.386380]  [<ffffffff8025b850>] ? kthread+0x0/0x80
> > > [  537.386380]  [<ffffffff8020d4b0>] ? child_rip+0x0/0x20
> > >
> > > I think the cause of this error message is flush_workqueue()
> > > from the work of keventd. When removing device using
> > > "/sys/bus/pci/devices/.../ remove", pci_remove_bus_device() is
> > > executed by the keventd's work through
> > > device_schedule_callback(), and it invokes e1000e's remove
> > > callback. And then, e1000e's remove callback invokes
> > > flush_workqueue().  Actually, the kernel error messages are not
> > > displayed when I changed e1000e driver to not call
> > > flush_workqueue(). In my understanding, flush_workqueue() from
> > > the work must be avoided because it can cause a deadlock.
> > > Please note that this is not a problem of e1000e driver.
> > > Drivers can use flush_workqueue(), of course.
> >
> > I agree with this analysis; the reason we're seeing this lockdep
> > warning is because the sysfs attributed scheduled a removal for
> > itself using device_schedule_callback(). This is necessary
> > because sysfs attributes can't remove themselves due to other
> > locking issues.
> >
> > My question is -- is it a bug to call flush_workqueue during
> > run_workqueue?
>
> Yes, it generally is.
>
> > Conceptually, I don't think it should be a bug; it should be a
> > nop, since run_workqueue _is_ flushing the work queue.

As it was already said, we can deadlock.

Can't e1000_remove() avoid flush_scheduled_work() ? (and it should
be always avoided when possible).

Of course, I don't understand this code. But afaics e1000_remove()
can just cancel its own works (in struct e1000_adapter), no?

cancel_work_sync(work) from run_workqueue() should be OK even if
this work is queued on the same wq. If it is queued on the same CPU
cancel_work_sync() won't block because we are ->current_work.


Btw. Again, I don't understand the code, but this looks suspicious:

	e1000_remove:

		set_bit(__E1000_DOWN, &adapter->state);
		del_timer_sync(&adapter->watchdog_timer);
		flush_scheduled_work();

What if e1000_watchdog_task() is running, has already checked
!test_bit(__E1000_DOWN, &adapter->state), but didn't call
mod_timer(&adapter->phy_info_timer) yet?

Oleg.

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Chiang March 24, 2009, 5:23 p.m. UTC | #9
* Johannes Berg <johannes@sipsolutions.net>:
> On Tue, 2009-03-24 at 03:46 -0700, Andrew Morton wrote:
> 
> > But I don't think we've seen a coherent description of what's actually
> > _wrong_ with the current code.  flush_cpu_workqueue() has been handling
> > this case for many years with no problems reported as far as I know.
> > 
> > So what has caused this sudden flurry of reports?  Did something change in
> > lockdep?  What is this
> > 
> > [  537.380128]  (events){--..}, at: [<ffffffff80257fc0>] flush_workqueue+0x0/0xa0
> > [  537.380128]
> > [  537.380128] but task is already holding lock:
> > [  537.380128]  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> > 
> > supposed to mean?  "events" isn't a lock - it's the name of a kernel
> > thread, isn't it?  If this is supposed to be deadlockable then how?
> 
> events is indeed the schedule_work workqueue thread name -- I just used
> that for lack of a better name.
> 
> > Because I don't immediately see what's wrong with e1000_remove() calling
> > flush_work().  It's undesirable, and we can perhaps improve it via some
> > means, but where is the bug?
> 
> There is no bug -- it's a false positive in a way. I've pointed this out
> in the original thread, see
> http://thread.gmane.org/gmane.linux.kernel/550877/focus=550932

I'm actually a bit confused now.

Peter explained why flushing a workqueue from the same queue is
bad, and in general I agree, but what do you mean by "false
positive"?

By the way, this scenario:

	code path 1:
	  my_function() -> lock(L1); ...; flush_workqueue(); ...

	code path 2:
	  run_workqueue() -> my_work() -> ...; lock(L1); ...

is _not_ what is happening here.

sysfs_schedule_callback() is an ugly piece of code that exists
because a sysfs attribute cannot remove itself without
deadlocking. So the callback mechanism was created to allow a
different kernel thread to remove the sysfs attribute and avoid
deadlock.

So what you really have going on is:

	sysfs callback -> add remove callback to global workqueue
	remove callback fires off (pci_remove_bus_device) and we do...
	    device_unregister
	    driver's ->remove method called
	    driver's ->remove method calls flush_scheduled_work

Yes, after reading the thread I agree that generically calling
flush_workqueue in the middle of run_workqueue is bad, but the
lockdep warning that Kenji showed us really won't deadlock.

This is because pci_remove_bus_device() will not acquire any lock
L1 that an individual device driver will attempt to acquire in
the remove path. If that were the case, we would deadlock every
time you rmmod'ed a device driver's module or every time you shut
your machine down.

I think from my end, there are 2 things I need to do:

	a) make sysfs_schedule_callback() use its own work queue
	   instead of global work queue, because too many drivers
	   call flush_scheduled_work in their remove path

	b) give sysfs attributes the ability to commit suicide

(a) is short term work, 2.6.30 timeframe, since it doesn't
involve any large conceptual changes.

(b) is picking up Tejun Heo's existing work, but that was a bit
controversial last time, and I'm not sure it will make it during
this merge window.

Question for the lockdep folks though -- given what I described,
do you agree that the warning we saw was a false positive? Or am
I off in left field?

Thanks.

/ac

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexander Chiang March 24, 2009, 5:32 p.m. UTC | #10
(Adding some e1000e folks to answer questions about their driver)

* Oleg Nesterov <oleg@redhat.com>:
> On 03/24, Ingo Molnar wrote:
> > >
> > > * Kenji Kaneshige <kaneshige.kenji@jp.fujitsu.com>:
> > >
> > > Kenji Kaneshige reported the below lockdep problem when testing
> > > my patch on one of his machines.
> > >
> > > > I still have the following kernel error messages in testing with your
> > > > latest set of patches (Jesse's linux-next). The test case is removing
> > > > e1000e device or its parent bridge by "echo 1 > /sys/bus/pci/devices/
> > > > .../remove".
> > > >
> > > > [  537.379995] =============================================
> > > > [  537.380124] [ INFO: possible recursive locking detected ]
> > > > [  537.380128] 2.6.29-rc8-kk #1
> > > > [  537.380128] ---------------------------------------------
> > > > [  537.380128] events/4/56 is trying to acquire lock:
> > > > [  537.380128]  (events){--..}, at: [<ffffffff80257fc0>] flush_workqueue+0x0/0xa0
> > > > [  537.380128]
> > > > [  537.380128] but task is already holding lock:
> > > > [  537.380128]  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> > > > [  537.380128]
> > > > [  537.380128] other info that might help us debug this:
> > > > [  537.380128] 3 locks held by events/4/56:
> > > > [  537.380128]  #0:  (events){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> > > > [  537.380128]  #1:  (&ss->work){--..}, at: [<ffffffff80257648>] run_workqueue+0x108/0x230
> > > > [  537.380128]  #2:  (pci_remove_rescan_mutex){--..}, at: [<ffffffff803c10d1>] remove_callback+0x21/0x40
> > > > [  537.380128]
> > > > [  537.380128] stack backtrace:
> > > > [  537.380128] Pid: 56, comm: events/4 Not tainted 2.6.29-rc8-kk #1
> > > > [  537.380128] Call Trace:
> > > > [  537.380128]  [<ffffffff8026dfcd>] validate_chain+0xb7d/0x1260
> > > > [  537.380128]  [<ffffffff8026eade>] __lock_acquire+0x42e/0xa40
> > > > [  537.380128]  [<ffffffff8026f148>] lock_acquire+0x58/0x80
> > > > [  537.380128]  [<ffffffff80257fc0>] ? flush_workqueue+0x0/0xa0
> > > > [  537.380128]  [<ffffffff8025800d>] flush_workqueue+0x4d/0xa0
> > > > [  537.380128]  [<ffffffff80257fc0>] ? flush_workqueue+0x0/0xa0
> > > > [  537.383380]  [<ffffffff80258070>] flush_scheduled_work+0x10/0x20
> > > > [  537.383380]  [<ffffffffa0144065>] e1000_remove+0x55/0xfe [e1000e]
> > > > [  537.383380]  [<ffffffff8033ee30>] ? sysfs_schedule_callback_work+0x0/0x50
> > > > [  537.383380]  [<ffffffff803bfeb2>] pci_device_remove+0x32/0x70
> > > > [  537.383380]  [<ffffffff80441da9>] __device_release_driver+0x59/0x90
> > > > [  537.383380]  [<ffffffff80441edb>] device_release_driver+0x2b/0x40
> > > > [  537.383380]  [<ffffffff804419d6>] bus_remove_device+0xa6/0x120
> > > > [  537.384382]  [<ffffffff8043e46b>] device_del+0x12b/0x190
> > > > [  537.384382]  [<ffffffff8043e4f6>] device_unregister+0x26/0x70
> > > > [  537.384382]  [<ffffffff803ba969>] pci_stop_dev+0x49/0x60
> > > > [  537.384382]  [<ffffffff803baab0>] pci_remove_bus_device+0x40/0xc0
> > > > [  537.384382]  [<ffffffff803c10d9>] remove_callback+0x29/0x40
> > > > [  537.384382]  [<ffffffff8033ee4f>] sysfs_schedule_callback_work+0x1f/0x50
> > > > [  537.384382]  [<ffffffff8025769a>] run_workqueue+0x15a/0x230
> > > > [  537.384382]  [<ffffffff80257648>] ? run_workqueue+0x108/0x230
> > > > [  537.384382]  [<ffffffff8025846f>] worker_thread+0x9f/0x100
> > > > [  537.384382]  [<ffffffff8025bce0>] ? autoremove_wake_function+0x0/0x40
> > > > [  537.384382]  [<ffffffff802583d0>] ? worker_thread+0x0/0x100
> > > > [  537.384382]  [<ffffffff8025b89d>] kthread+0x4d/0x80
> > > > [  537.384382]  [<ffffffff8020d4ba>] child_rip+0xa/0x20
> > > > [  537.386380]  [<ffffffff8020cebc>] ? restore_args+0x0/0x30
> > > > [  537.386380]  [<ffffffff8025b850>] ? kthread+0x0/0x80
> > > > [  537.386380]  [<ffffffff8020d4b0>] ? child_rip+0x0/0x20
> > > >
> > > > I think the cause of this error message is flush_workqueue()
> > > > from the work of keventd. When removing device using
> > > > "/sys/bus/pci/devices/.../ remove", pci_remove_bus_device() is
> > > > executed by the keventd's work through
> > > > device_schedule_callback(), and it invokes e1000e's remove
> > > > callback. And then, e1000e's remove callback invokes
> > > > flush_workqueue().  Actually, the kernel error messages are not
> > > > displayed when I changed e1000e driver to not call
> > > > flush_workqueue(). In my understanding, flush_workqueue() from
> > > > the work must be avoided because it can cause a deadlock.
> > > > Please note that this is not a problem of e1000e driver.
> > > > Drivers can use flush_workqueue(), of course.
> > >
> > > I agree with this analysis; the reason we're seeing this lockdep
> > > warning is because the sysfs attributed scheduled a removal for
> > > itself using device_schedule_callback(). This is necessary
> > > because sysfs attributes can't remove themselves due to other
> > > locking issues.
> > >
> > > My question is -- is it a bug to call flush_workqueue during
> > > run_workqueue?
> >
> > Yes, it generally is.
> >
> > > Conceptually, I don't think it should be a bug; it should be a
> > > nop, since run_workqueue _is_ flushing the work queue.
> 
> As it was already said, we can deadlock.
 
As answered in another email, I agree that generically, it can
deadlock. I think this particular warning is a false positive
though, because pci_remove_bus_device() definitely cannot acquire
any locks that lower level device drivers want to acquire.

If that were true, then hotplug would never work, and you would
also see hangs every time you shut your machine off.

I can't answer the e1000e questions below but...

> Can't e1000_remove() avoid flush_scheduled_work() ? (and it should
> be always avoided when possible).
> 
> Of course, I don't understand this code. But afaics e1000_remove()
> can just cancel its own works (in struct e1000_adapter), no?
> 
> cancel_work_sync(work) from run_workqueue() should be OK even if
> this work is queued on the same wq. If it is queued on the same CPU
> cancel_work_sync() won't block because we are ->current_work.
> 
> 
> Btw. Again, I don't understand the code, but this looks suspicious:
> 
> 	e1000_remove:
> 
> 		set_bit(__E1000_DOWN, &adapter->state);
> 		del_timer_sync(&adapter->watchdog_timer);
> 		flush_scheduled_work();
> 
> What if e1000_watchdog_task() is running, has already checked
> !test_bit(__E1000_DOWN, &adapter->state), but didn't call
> mod_timer(&adapter->phy_info_timer) yet?

Even if e1000e is doing something funny, the conclusion I'm
coming to is that the sysfs_schedule_callback implementation
needs to change, because there are simply too many drivers that
call flush_scheduled_work.

Thanks.

/ac

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Johannes Berg March 24, 2009, 8:22 p.m. UTC | #11
On Tue, 2009-03-24 at 11:23 -0600, Alex Chiang wrote:

> > There is no bug -- it's a false positive in a way. I've pointed this out
> > in the original thread, see
> > http://thread.gmane.org/gmane.linux.kernel/550877/focus=550932
> 
> I'm actually a bit confused now.

Sorry.

> Peter explained why flushing a workqueue from the same queue is
> bad, and in general I agree, but what do you mean by "false
> positive"?

Well, even though generally flushing it from within is bad, the actual
thing lockdep reports is bogus -- it's reporting a nested locking.

> By the way, this scenario:
> 
> 	code path 1:
> 	  my_function() -> lock(L1); ...; flush_workqueue(); ...
> 
> 	code path 2:
> 	  run_workqueue() -> my_work() -> ...; lock(L1); ...
> 
> is _not_ what is happening here.

Indeed.

> So what you really have going on is:
> 
> 	sysfs callback -> add remove callback to global workqueue
> 	remove callback fires off (pci_remove_bus_device) and we do...
> 	    device_unregister
> 	    driver's ->remove method called
> 	    driver's ->remove method calls flush_scheduled_work
> 
> Yes, after read the thread I agree that generically calling
> flush_workqueue in the middle of run_workqueue is bad, but the
> lockdep warning that Kenji showed us really won't deadlock.

Exactly that is what I meant by "false positive".

> This is because pci_remove_bus_device() will not acquire any lock
> L1 that an individual device driver will attempt to acquire in
> the remove path. If that were the case, we would deadlock every
> time you rmmod'ed a device driver's module or every time you shut
> your machine down.
> 
> I think from my end, there are 2 things I need to do:
> 
> 	a) make sysfs_schedule_callback() use its own work queue
> 	   instead of global work queue, because too many drivers
> 	   call flush_scheduled_work in their remove path
> 
> 	b) give sysfs attributes the ability to commit suicide
> 
> (a) is short term work, 2.6.30 timeframe, since it doesn't
> involve any large conceptual changes.
> 
> (b) is picking up Tejun Heo's existing work, but that was a bit
> controversial last time, and I'm not sure it will make it during
> this merge window.
> 
> Question for the lockdep folks though -- given what I described,
> do you agree that the warning we saw was a false positive? Or am
> I off in left field?

I think we're not sure yet -- it seems Lai Jiangshan described a
scenario in which flushing from within the work actually _can_ deadlock.

johannes
diff mbox

Patch

diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 1697a16..1350fa6 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -66,6 +66,14 @@  Description:
 		re-discover previously removed devices.
 		Depends on CONFIG_HOTPLUG.
 
+What:		/sys/bus/pci/devices/.../remove
+Date:		January 2009
+Contact:	Linux PCI developers <linux-pci@vger.kernel.org>
+Description:
+		Writing a non-zero value to this attribute will
+		hot-remove the PCI device and any of its children.
+		Depends on CONFIG_HOTPLUG.
+
 What:		/sys/bus/pci/devices/.../vpd
 Date:		February 2008
 Contact:	Ben Hutchings <bhutchings@solarflare.com>
diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt
index 9f8740c..26e4b8b 100644
--- a/Documentation/filesystems/sysfs-pci.txt
+++ b/Documentation/filesystems/sysfs-pci.txt
@@ -12,6 +12,7 @@  that support it.  For example, a given bus might look like this:
      |   |-- enable
      |   |-- irq
      |   |-- local_cpus
+     |   |-- remove
      |   |-- resource
      |   |-- resource0
      |   |-- resource1
@@ -36,6 +37,7 @@  files, each with their own function.
        enable	           Whether the device is enabled (ascii, rw)
        irq		   IRQ number (ascii, ro)
        local_cpus	   nearby CPU mask (cpumask, ro)
+       remove		   remove device from kernel's list (ascii, wo)
        resource		   PCI resource host addresses (ascii, ro)
        resource0..N	   PCI resource N, if present (binary, mmap)
        resource0_wc..N_wc  PCI WC map resource N, if prefetchable (binary, mmap)
@@ -46,6 +48,7 @@  files, each with their own function.
 
   ro - read only file
   rw - file is readable and writable
+  wo - write only file
   mmap - file is mmapable
   ascii - file contains ascii text
   binary - file contains binary data
@@ -73,6 +76,13 @@  that the device must be enabled for a rom read to return data succesfully.
 In the event a driver is not bound to the device, it can be enabled using the
 'enable' file, documented above.
 
+The 'remove' file is used to remove the PCI device, by writing a non-zero
+integer to the file.  This does not involve any kind of hot-plug functionality,
+e.g. powering off the device.  The device is removed from the kernel's list of
+PCI devices, the sysfs directory for it is removed, and the device will be
+removed from any drivers attached to it. Removal of PCI root buses is
+disallowed.
+
 Accessing legacy resources through sysfs
 ----------------------------------------
 
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index be7468a..e16990e 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -243,6 +243,39 @@  struct bus_attribute pci_bus_attrs[] = {
 	__ATTR(rescan, (S_IWUSR|S_IWGRP), NULL, bus_rescan_store),
 	__ATTR_NULL
 };
+
+static void remove_callback(struct device *dev)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	mutex_lock(&pci_remove_rescan_mutex);
+	pci_remove_bus_device(pdev);
+	mutex_unlock(&pci_remove_rescan_mutex);
+}
+
+static ssize_t
+remove_store(struct device *dev, struct device_attribute *dummy,
+	     const char *buf, size_t count)
+{
+	int ret = 0;
+	unsigned long val;
+	struct pci_dev *pdev = to_pci_dev(dev);
+
+	if (strict_strtoul(buf, 0, &val) < 0)
+		return -EINVAL;
+
+	if (pci_is_root_bus(pdev->bus))
+		return -EBUSY;
+
+	/* An attribute cannot be unregistered by one of its own methods,
+	 * so we have to use this roundabout approach.
+	 */
+	if (val)
+		ret = device_schedule_callback(dev, remove_callback);
+	if (ret)
+		count = ret;
+	return count;
+}
 #endif
 
 struct device_attribute pci_dev_attrs[] = {
@@ -263,6 +296,9 @@  struct device_attribute pci_dev_attrs[] = {
 	__ATTR(broken_parity_status,(S_IRUGO|S_IWUSR),
 		broken_parity_status_show,broken_parity_status_store),
 	__ATTR(msi_bus, 0644, msi_bus_show, msi_bus_store),
+#ifdef CONFIG_HOTPLUG
+	__ATTR(remove, (S_IWUSR|S_IWGRP), NULL, remove_store),
+#endif
 	__ATTR_NULL,
 };