diff mbox series

[RFC,1/6] Revert "PCI: Fix is_added/is_busmaster race condition"

Message ID 20180817044902.31420-2-benh@kernel.crashing.org (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show
Series pci: Rework is_added race fix and address bridge enable races | expand

Commit Message

Benjamin Herrenschmidt Aug. 17, 2018, 4:48 a.m. UTC
This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76.

The new pci state mutex provides a simpler way of addressing
this race and avoids the relative includes added to the powerpc
code.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/pci-common.c          |  4 +---
 arch/powerpc/platforms/powernv/pci-ioda.c |  3 +--
 arch/powerpc/platforms/pseries/setup.c    |  3 +--
 drivers/pci/bus.c                         |  6 +++---
 drivers/pci/hotplug/acpiphp_glue.c        |  2 +-
 drivers/pci/pci.h                         | 11 -----------
 drivers/pci/probe.c                       |  4 ++--
 drivers/pci/remove.c                      |  5 ++---
 include/linux/pci.h                       |  1 +
 9 files changed, 12 insertions(+), 27 deletions(-)

Comments

Benjamin Herrenschmidt Aug. 17, 2018, 4:57 a.m. UTC | #1
On Fri, 2018-08-17 at 14:48 +1000, Benjamin Herrenschmidt wrote:
> This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76.
> 
> The new pci state mutex provides a simpler way of addressing
> this race and avoids the relative includes added to the powerpc
> code.

Ignore the cset comment, my "fix" no longer relies on a state mutex,
I'll re-post with a better comment after discussions.

The actual fix is in patch 2:

[RFC PATCH 2/6] pci: Set pci_dev->is_added before calling device_add

(and yes that was supposed be device_attach ... ugh, not enough
caffeine today).

> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> ---
>  arch/powerpc/kernel/pci-common.c          |  4 +---
>  arch/powerpc/platforms/powernv/pci-ioda.c |  3 +--
>  arch/powerpc/platforms/pseries/setup.c    |  3 +--
>  drivers/pci/bus.c                         |  6 +++---
>  drivers/pci/hotplug/acpiphp_glue.c        |  2 +-
>  drivers/pci/pci.h                         | 11 -----------
>  drivers/pci/probe.c                       |  4 ++--
>  drivers/pci/remove.c                      |  5 ++---
>  include/linux/pci.h                       |  1 +
>  9 files changed, 12 insertions(+), 27 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
> index 471aac313b89..fe9733ffffaa 100644
> --- a/arch/powerpc/kernel/pci-common.c
> +++ b/arch/powerpc/kernel/pci-common.c
> @@ -42,8 +42,6 @@
>  #include <asm/ppc-pci.h>
>  #include <asm/eeh.h>
>  
> -#include "../../../drivers/pci/pci.h"
> -
>  /* hose_spinlock protects accesses to the the phb_bitmap. */
>  static DEFINE_SPINLOCK(hose_spinlock);
>  LIST_HEAD(hose_list);
> @@ -1016,7 +1014,7 @@ void pcibios_setup_bus_devices(struct pci_bus *bus)
>  		/* Cardbus can call us to add new devices to a bus, so ignore
>  		 * those who are already fully discovered
>  		 */
> -		if (pci_dev_is_added(dev))
> +		if (dev->is_added)
>  			continue;
>  
>  		pcibios_setup_device(dev);
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 70b2e1e0f23c..5bd0eb6681bc 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -46,7 +46,6 @@
>  
>  #include "powernv.h"
>  #include "pci.h"
> -#include "../../../../drivers/pci/pci.h"
>  
>  #define PNV_IODA1_M64_NUM	16	/* Number of M64 BARs	*/
>  #define PNV_IODA1_M64_SEGS	8	/* Segments per M64 BAR	*/
> @@ -3139,7 +3138,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
>  	struct pci_dn *pdn;
>  	int mul, total_vfs;
>  
> -	if (!pdev->is_physfn || pci_dev_is_added(pdev))
> +	if (!pdev->is_physfn || pdev->is_added)
>  		return;
>  
>  	pdn = pci_get_pdn(pdev);
> diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
> index 8a4868a3964b..139f0af6c3d9 100644
> --- a/arch/powerpc/platforms/pseries/setup.c
> +++ b/arch/powerpc/platforms/pseries/setup.c
> @@ -71,7 +71,6 @@
>  #include <asm/security_features.h>
>  
>  #include "pseries.h"
> -#include "../../../../drivers/pci/pci.h"
>  
>  int CMO_PrPSP = -1;
>  int CMO_SecPSP = -1;
> @@ -665,7 +664,7 @@ static void pseries_pci_fixup_iov_resources(struct pci_dev *pdev)
>  	const int *indexes;
>  	struct device_node *dn = pci_device_to_OF_node(pdev);
>  
> -	if (!pdev->is_physfn || pci_dev_is_added(pdev))
> +	if (!pdev->is_physfn || pdev->is_added)
>  		return;
>  	/*Firmware must support open sriov otherwise dont configure*/
>  	indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
> diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
> index 5cb40b2518f9..35b7fc87eac5 100644
> --- a/drivers/pci/bus.c
> +++ b/drivers/pci/bus.c
> @@ -330,7 +330,7 @@ void pci_bus_add_device(struct pci_dev *dev)
>  		return;
>  	}
>  
> -	pci_dev_assign_added(dev, true);
> +	dev->is_added = 1;
>  }
>  EXPORT_SYMBOL_GPL(pci_bus_add_device);
>  
> @@ -347,14 +347,14 @@ void pci_bus_add_devices(const struct pci_bus *bus)
>  
>  	list_for_each_entry(dev, &bus->devices, bus_list) {
>  		/* Skip already-added devices */
> -		if (pci_dev_is_added(dev))
> +		if (dev->is_added)
>  			continue;
>  		pci_bus_add_device(dev);
>  	}
>  
>  	list_for_each_entry(dev, &bus->devices, bus_list) {
>  		/* Skip if device attach failed */
> -		if (!pci_dev_is_added(dev))
> +		if (!dev->is_added)
>  			continue;
>  		child = dev->subordinate;
>  		if (child)
> diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
> index ef0b1b6ba86f..3a17b290df5d 100644
> --- a/drivers/pci/hotplug/acpiphp_glue.c
> +++ b/drivers/pci/hotplug/acpiphp_glue.c
> @@ -509,7 +509,7 @@ static void enable_slot(struct acpiphp_slot *slot)
>  
>  	list_for_each_entry(dev, &bus->devices, bus_list) {
>  		/* Assume that newly added devices are powered on already. */
> -		if (!pci_dev_is_added(dev))
> +		if (!dev->is_added)
>  			dev->current_state = PCI_D0;
>  	}
>  
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 6e0d1528d471..473aa10a5dbf 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -295,7 +295,6 @@ struct pci_sriov {
>  
>  /* pci_dev priv_flags */
>  #define PCI_DEV_DISCONNECTED 0
> -#define PCI_DEV_ADDED 1
>  
>  static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused)
>  {
> @@ -308,16 +307,6 @@ static inline bool pci_dev_is_disconnected(const struct pci_dev *dev)
>  	return test_bit(PCI_DEV_DISCONNECTED, &dev->priv_flags);
>  }
>  
> -static inline void pci_dev_assign_added(struct pci_dev *dev, bool added)
> -{
> -	assign_bit(PCI_DEV_ADDED, &dev->priv_flags, added);
> -}
> -
> -static inline bool pci_dev_is_added(const struct pci_dev *dev)
> -{
> -	return test_bit(PCI_DEV_ADDED, &dev->priv_flags);
> -}
> -
>  #ifdef CONFIG_PCIEAER
>  #include <linux/aer.h>
>  
> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> index ec784009a36b..440445ac7dfa 100644
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c
> @@ -2525,13 +2525,13 @@ int pci_scan_slot(struct pci_bus *bus, int devfn)
>  	dev = pci_scan_single_device(bus, devfn);
>  	if (!dev)
>  		return 0;
> -	if (!pci_dev_is_added(dev))
> +	if (!dev->is_added)
>  		nr++;
>  
>  	for (fn = next_fn(bus, dev, 0); fn > 0; fn = next_fn(bus, dev, fn)) {
>  		dev = pci_scan_single_device(bus, devfn + fn);
>  		if (dev) {
> -			if (!pci_dev_is_added(dev))
> +			if (!dev->is_added)
>  				nr++;
>  			dev->multifunction = 1;
>  		}
> diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
> index 461e7fd2756f..01ec7fcb5634 100644
> --- a/drivers/pci/remove.c
> +++ b/drivers/pci/remove.c
> @@ -18,12 +18,11 @@ static void pci_stop_dev(struct pci_dev *dev)
>  {
>  	pci_pme_active(dev, false);
>  
> -	if (pci_dev_is_added(dev)) {
> +	if (dev->is_added) {
>  		device_release_driver(&dev->dev);
>  		pci_proc_detach_device(dev);
>  		pci_remove_sysfs_dev_files(dev);
> -
> -		pci_dev_assign_added(dev, false);
> +		dev->is_added = 0;
>  	}
>  
>  	if (dev->bus->self)
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 9b87f1936906..9799109c5e25 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -373,6 +373,7 @@ struct pci_dev {
>  	unsigned int	transparent:1;		/* Subtractive decode bridge */
>  	unsigned int	multifunction:1;	/* Multi-function device */
>  
> +	unsigned int	is_added:1;
>  	unsigned int	is_busmaster:1;		/* Is busmaster */
>  	unsigned int	no_msi:1;		/* May not use MSI */
>  	unsigned int	no_64bit_msi:1; 	/* May only use 32-bit MSIs */
Bjorn Helgaas Aug. 17, 2018, 3:44 p.m. UTC | #2
On Fri, Aug 17, 2018 at 02:48:57PM +1000, Benjamin Herrenschmidt wrote:
> This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76.

Just to be clear, if I understand correctly, this is a pure revert of
44bda4b7d26e and as such it reintroduces the problem solved by that
commit.

If your solution turns out to be better, that's great, but it would be
nice to avoid the bisection hole of reintroducing the problem, then
fixing it again later.

> The new pci state mutex provides a simpler way of addressing
> this race and avoids the relative includes added to the powerpc
> code.
Benjamin Herrenschmidt Aug. 18, 2018, 3:24 a.m. UTC | #3
On Fri, 2018-08-17 at 10:44 -0500, Bjorn Helgaas wrote:
> On Fri, Aug 17, 2018 at 02:48:57PM +1000, Benjamin Herrenschmidt wrote:
> > This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76.
> 
> Just to be clear, if I understand correctly, this is a pure revert of
> 44bda4b7d26e and as such it reintroduces the problem solved by that
> commit.
> 
> If your solution turns out to be better, that's great, but it would be
> nice to avoid the bisection hole of reintroducing the problem, then
> fixing it again later.

There is no way to do that other than merging the revert and the fix
into one. That said, the race is sufficiently hard to hit that I
wouldn't worry too much about it.

> > The new pci state mutex provides a simpler way of addressing
> > this race and avoids the relative includes added to the powerpc
> > code.
Bjorn Helgaas Aug. 19, 2018, 2:24 a.m. UTC | #4
On Sat, Aug 18, 2018 at 01:24:51PM +1000, Benjamin Herrenschmidt wrote:
> On Fri, 2018-08-17 at 10:44 -0500, Bjorn Helgaas wrote:
> > On Fri, Aug 17, 2018 at 02:48:57PM +1000, Benjamin Herrenschmidt wrote:
> > > This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76.
> > 
> > Just to be clear, if I understand correctly, this is a pure revert of
> > 44bda4b7d26e and as such it reintroduces the problem solved by that
> > commit.
> > 
> > If your solution turns out to be better, that's great, but it would be
> > nice to avoid the bisection hole of reintroducing the problem, then
> > fixing it again later.
> 
> There is no way to do that other than merging the revert and the fix
> into one. That said, the race is sufficiently hard to hit that I
> wouldn't worry too much about it.

OK, then at least mention that in the changelog.

> > > The new pci state mutex provides a simpler way of addressing
> > > this race and avoids the relative includes added to the powerpc
> > > code.
>
Benjamin Herrenschmidt Aug. 20, 2018, 2:10 a.m. UTC | #5
On Sat, 2018-08-18 at 21:24 -0500, Bjorn Helgaas wrote:
> On Sat, Aug 18, 2018 at 01:24:51PM +1000, Benjamin Herrenschmidt wrote:
> > On Fri, 2018-08-17 at 10:44 -0500, Bjorn Helgaas wrote:
> > > On Fri, Aug 17, 2018 at 02:48:57PM +1000, Benjamin Herrenschmidt wrote:
> > > > This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76.
> > > 
> > > Just to be clear, if I understand correctly, this is a pure revert of
> > > 44bda4b7d26e and as such it reintroduces the problem solved by that
> > > commit.
> > > 
> > > If your solution turns out to be better, that's great, but it would be
> > > nice to avoid the bisection hole of reintroducing the problem, then
> > > fixing it again later.
> > 
> > There is no way to do that other than merging the revert and the fix
> > into one. That said, the race is sufficiently hard to hit that I
> > wouldn't worry too much about it.
> 
> OK, then at least mention that in the changelog.

Sure will do. This is just RFC at this stage :-)

As for the race with enable, what's your take on my approach ? The
basic premise is that we need some kind of mutex to make the updates to
enable_cnt and the actual config space changes atomic. While at it we
can fold pci_set_master vs. is_busmaster as well as those are racy too.

I chose to create a new mutex which we should be able to address other
similar races if we find them. The other solutions that I dismissed
were:

 - Using the device_lock. A explained previously, this is tricky, I
prefer not using this for anything other than locking against
concurrent add/remove. The main issue is that drivers will be sometimes
called in context where that's already held, so we can't take it inside
pci_enable_device() and I'd rather not add new constraints such as
"pci_enable_device() must be only called from probe() unless you also
take the device lock". It would be tricky to audit everybody...

 - Using a global mutex. We could move the bridge lock from AER to core
code for example, and use that. But it doesn't buy us much, and
slightly redecuces parallelism. It also makes it a little bit more
messy to walk up the bridge chain, we'd have to do a
pci_enable_device_unlocked or something, messy.

So are you ok with the approach ? Do you prefer one of the above
regardless ? Something else ?

Cheers,
Ben.
Hari Vyas Aug. 20, 2018, 6:25 a.m. UTC | #6
On Mon, Aug 20, 2018 at 7:40 AM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Sat, 2018-08-18 at 21:24 -0500, Bjorn Helgaas wrote:
>> On Sat, Aug 18, 2018 at 01:24:51PM +1000, Benjamin Herrenschmidt wrote:
>> > On Fri, 2018-08-17 at 10:44 -0500, Bjorn Helgaas wrote:
>> > > On Fri, Aug 17, 2018 at 02:48:57PM +1000, Benjamin Herrenschmidt wrote:
>> > > > This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76.
>> > >
>> > > Just to be clear, if I understand correctly, this is a pure revert of
>> > > 44bda4b7d26e and as such it reintroduces the problem solved by that
>> > > commit.
>> > >
>> > > If your solution turns out to be better, that's great, but it would be
>> > > nice to avoid the bisection hole of reintroducing the problem, then
>> > > fixing it again later.
>> >
>> > There is no way to do that other than merging the revert and the fix
>> > into one. That said, the race is sufficiently hard to hit that I
>> > wouldn't worry too much about it.
>>
>> OK, then at least mention that in the changelog.
>
> Sure will do. This is just RFC at this stage :-)
>
> As for the race with enable, what's your take on my approach ? The
> basic premise is that we need some kind of mutex to make the updates to
> enable_cnt and the actual config space changes atomic. While at it we
> can fold pci_set_master vs. is_busmaster as well as those are racy too.
>
> I chose to create a new mutex which we should be able to address other
> similar races if we find them. The other solutions that I dismissed
> were:
>
>  - Using the device_lock. A explained previously, this is tricky, I
> prefer not using this for anything other than locking against
> concurrent add/remove. The main issue is that drivers will be sometimes
> called in context where that's already held, so we can't take it inside
> pci_enable_device() and I'd rather not add new constraints such as
> "pci_enable_device() must be only called from probe() unless you also
> take the device lock". It would be tricky to audit everybody...
>
>  - Using a global mutex. We could move the bridge lock from AER to core
> code for example, and use that. But it doesn't buy us much, and
> slightly redecuces parallelism. It also makes it a little bit more
> messy to walk up the bridge chain, we'd have to do a
> pci_enable_device_unlocked or something, messy.
>
> So are you ok with the approach ? Do you prefer one of the above
> regardless ? Something else ?
>
> Cheers,
> Ben.
>
>
Some concern was raised about race situation so just to be more clear
about race condition.
Situation is described in Bug 200283 - PCI: Data corruption happening
due to a race condition.
Issue was discovered by our broadcom QA team.
Initially commit was to use one separate lock only for avoiding race
condition but after review, approach was slightly changed to move
is_added  to pci_dev private flags as it was completely
internal/private variable of pci core driver.
Powerpc legacy arch code was using is_added flag directly which looks
bit strange so  ../../ type of inclusion of pci.h was required. I know
it looks ugly but it is being used in some legacy code still.
Anyway, as stated earlier too, problem should be just solved in better way.

Regards,
hari
Lukas Wunner Aug. 20, 2018, 7:17 a.m. UTC | #7
On Mon, Aug 20, 2018 at 12:10:59PM +1000, Benjamin Herrenschmidt wrote:
> I chose to create a new mutex which we should be able to address other
> similar races if we find them. The other solutions that I dismissed
> were:
> 
>  - Using the device_lock. A explained previously, this is tricky, I
> prefer not using this for anything other than locking against
> concurrent add/remove. The main issue is that drivers will be sometimes
> called in context where that's already held, so we can't take it inside
> pci_enable_device() and I'd rather not add new constraints such as
> "pci_enable_device() must be only called from probe() unless you also
> take the device lock". It would be tricky to audit everybody...
> 
>  - Using a global mutex. We could move the bridge lock from AER to core
> code for example, and use that. But it doesn't buy us much, and
> slightly redecuces parallelism. It also makes it a little bit more
> messy to walk up the bridge chain, we'd have to do a
> pci_enable_device_unlocked or something, messy.

+1 from my side for adding a struct mutex to struct pci_dev to protect
state changes.

The device_lock() primarily protects binding / unbinding of the device
and pci_dev state may have to be changed while binding / unbinding.

A global lock invites deadlocks if multiple devices are added / removed
concurrently where one is a parent of the other.  (Think hot-removal of
multiple devices on a Thunderbolt daisy-chain.)

As said I'd also welcome folding PCI_DEV_DISCONNECTED into enum
pci_channel_state, either as an additional state or by using
pci_channel_io_perm_failure.

Thanks,

Lukas
Benjamin Herrenschmidt Aug. 20, 2018, 11:09 a.m. UTC | #8
On Mon, 2018-08-20 at 11:55 +0530, Hari Vyas wrote:
> On Mon, Aug 20, 2018 at 7:40 AM, Benjamin Herrenschmidt
> <benh@kernel.crashing.org> wrote:
> > On Sat, 2018-08-18 at 21:24 -0500, Bjorn Helgaas wrote:
> > > On Sat, Aug 18, 2018 at 01:24:51PM +1000, Benjamin Herrenschmidt wrote:
> > > > On Fri, 2018-08-17 at 10:44 -0500, Bjorn Helgaas wrote:
> > > > > On Fri, Aug 17, 2018 at 02:48:57PM +1000, Benjamin Herrenschmidt wrote:
> > > > > > This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76.
> > > > > 
> > > > > Just to be clear, if I understand correctly, this is a pure revert of
> > > > > 44bda4b7d26e and as such it reintroduces the problem solved by that
> > > > > commit.
> > > > > 
> > > > > If your solution turns out to be better, that's great, but it would be
> > > > > nice to avoid the bisection hole of reintroducing the problem, then
> > > > > fixing it again later.
> > > > 
> > > > There is no way to do that other than merging the revert and the fix
> > > > into one. That said, the race is sufficiently hard to hit that I
> > > > wouldn't worry too much about it.
> > > 
> > > OK, then at least mention that in the changelog.
> > 
> > Sure will do. This is just RFC at this stage :-)
> > 
> > As for the race with enable, what's your take on my approach ? The
> > basic premise is that we need some kind of mutex to make the updates to
> > enable_cnt and the actual config space changes atomic. While at it we
> > can fold pci_set_master vs. is_busmaster as well as those are racy too.
> > 
> > I chose to create a new mutex which we should be able to address other
> > similar races if we find them. The other solutions that I dismissed
> > were:
> > 
> >  - Using the device_lock. A explained previously, this is tricky, I
> > prefer not using this for anything other than locking against
> > concurrent add/remove. The main issue is that drivers will be sometimes
> > called in context where that's already held, so we can't take it inside
> > pci_enable_device() and I'd rather not add new constraints such as
> > "pci_enable_device() must be only called from probe() unless you also
> > take the device lock". It would be tricky to audit everybody...
> > 
> >  - Using a global mutex. We could move the bridge lock from AER to core
> > code for example, and use that. But it doesn't buy us much, and
> > slightly redecuces parallelism. It also makes it a little bit more
> > messy to walk up the bridge chain, we'd have to do a
> > pci_enable_device_unlocked or something, messy.
> > 
> > So are you ok with the approach ? Do you prefer one of the above
> > regardless ? Something else ?
> > 
> > Cheers,
> > Ben.
> > 
> > 
> 
> Some concern was raised about race situation so just to be more clear
> about race condition.

This is not what the above discussion is about.

The race with is is_added is due to the fact that the bit is set too
later as discussed previously and in my changelog.

The race I'm talking about here is the race related to multiple
subtrees trying to simultaneously enable a parent bridge.

> Situation is described in Bug 200283 - PCI: Data corruption happening
> due to a race condition.
> Issue was discovered by our broadcom QA team.
> Initially commit was to use one separate lock only for avoiding race
> condition but after review, approach was slightly changed to move
> is_added  to pci_dev private flags as it was completely
> internal/private variable of pci core driver.
> Powerpc legacy arch code was using is_added flag directly which looks
> bit strange so  ../../ type of inclusion of pci.h was required. I know
> it looks ugly but it is being used in some legacy code still.
> Anyway, as stated earlier too, problem should be just solved in better way.

The is_added race can be fixed with a 3 lines patch moving is_added up
to before device_attach() I believe. I yet have to find a scenario
where doing so would break something but it's possible that I missed a
case.

At this stage, I'm more intested however in us agreeing how to fix the
other race, the one with enabling. As I wrote above, I proposed an
approach based on a mutex in pci_dev, and this is what I would like
discussed.

This race is currently causing our large systems to randomly error out
at boot time when probing some PCIe devices. I'm putting a band-aid in
the powerpc code for now to pre-enable bridges at boot, but I'd rather
have the race fixed in the generic code.

Ben.
Benjamin Herrenschmidt Aug. 20, 2018, 11:12 a.m. UTC | #9
On Mon, 2018-08-20 at 09:17 +0200, Lukas Wunner wrote:
> On Mon, Aug 20, 2018 at 12:10:59PM +1000, Benjamin Herrenschmidt wrote:
> > I chose to create a new mutex which we should be able to address other
> > similar races if we find them. The other solutions that I dismissed
> > were:
> > 
> >  - Using the device_lock. A explained previously, this is tricky, I
> > prefer not using this for anything other than locking against
> > concurrent add/remove. The main issue is that drivers will be sometimes
> > called in context where that's already held, so we can't take it inside
> > pci_enable_device() and I'd rather not add new constraints such as
> > "pci_enable_device() must be only called from probe() unless you also
> > take the device lock". It would be tricky to audit everybody...
> > 
> >  - Using a global mutex. We could move the bridge lock from AER to core
> > code for example, and use that. But it doesn't buy us much, and
> > slightly redecuces parallelism. It also makes it a little bit more
> > messy to walk up the bridge chain, we'd have to do a
> > pci_enable_device_unlocked or something, messy.
> 
> +1 from my side for adding a struct mutex to struct pci_dev to protect
> state changes.

Ok thanks. This is what my patch proposes. We can use it later to
protect more things if we wish to do so.

> The device_lock() primarily protects binding / unbinding of the device
> and pci_dev state may have to be changed while binding / unbinding.

Yup, precisely.

> A global lock invites deadlocks if multiple devices are added / removed
> concurrently where one is a parent of the other.  (Think hot-removal of
> multiple devices on a Thunderbolt daisy-chain.)

Yes.

> As said I'd also welcome folding PCI_DEV_DISCONNECTED into enum
> pci_channel_state, either as an additional state or by using
> pci_channel_io_perm_failure.

Ok. I have that in my tentative series but I think for robustness, I
should make the error_state field atomically updated in order to ensure
that no transition out of "disconnected" can happen while racing with
concurrent error_state updates at interrupt time (at least with EEH, it
can be updated from any read{b,w,l,q}).

I'll do a bit more work on the patches this week as time permits and
send a non-RFC series.

Cheers,
Ben. 

> Thanks,
> 
> Lukas
Hari Vyas Aug. 20, 2018, 11:43 a.m. UTC | #10
On Mon, Aug 20, 2018 at 4:39 PM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Mon, 2018-08-20 at 11:55 +0530, Hari Vyas wrote:
>> On Mon, Aug 20, 2018 at 7:40 AM, Benjamin Herrenschmidt
>> <benh@kernel.crashing.org> wrote:
>> > On Sat, 2018-08-18 at 21:24 -0500, Bjorn Helgaas wrote:
>> > > On Sat, Aug 18, 2018 at 01:24:51PM +1000, Benjamin Herrenschmidt wrote:
>> > > > On Fri, 2018-08-17 at 10:44 -0500, Bjorn Helgaas wrote:
>> > > > > On Fri, Aug 17, 2018 at 02:48:57PM +1000, Benjamin Herrenschmidt wrote:
>> > > > > > This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76.
>> > > > >
>> > > > > Just to be clear, if I understand correctly, this is a pure revert of
>> > > > > 44bda4b7d26e and as such it reintroduces the problem solved by that
>> > > > > commit.
>> > > > >
>> > > > > If your solution turns out to be better, that's great, but it would be
>> > > > > nice to avoid the bisection hole of reintroducing the problem, then
>> > > > > fixing it again later.
>> > > >
>> > > > There is no way to do that other than merging the revert and the fix
>> > > > into one. That said, the race is sufficiently hard to hit that I
>> > > > wouldn't worry too much about it.
>> > >
>> > > OK, then at least mention that in the changelog.
>> >
>> > Sure will do. This is just RFC at this stage :-)
>> >
>> > As for the race with enable, what's your take on my approach ? The
>> > basic premise is that we need some kind of mutex to make the updates to
>> > enable_cnt and the actual config space changes atomic. While at it we
>> > can fold pci_set_master vs. is_busmaster as well as those are racy too.
>> >
>> > I chose to create a new mutex which we should be able to address other
>> > similar races if we find them. The other solutions that I dismissed
>> > were:
>> >
>> >  - Using the device_lock. A explained previously, this is tricky, I
>> > prefer not using this for anything other than locking against
>> > concurrent add/remove. The main issue is that drivers will be sometimes
>> > called in context where that's already held, so we can't take it inside
>> > pci_enable_device() and I'd rather not add new constraints such as
>> > "pci_enable_device() must be only called from probe() unless you also
>> > take the device lock". It would be tricky to audit everybody...
>> >
>> >  - Using a global mutex. We could move the bridge lock from AER to core
>> > code for example, and use that. But it doesn't buy us much, and
>> > slightly redecuces parallelism. It also makes it a little bit more
>> > messy to walk up the bridge chain, we'd have to do a
>> > pci_enable_device_unlocked or something, messy.
>> >
>> > So are you ok with the approach ? Do you prefer one of the above
>> > regardless ? Something else ?
>> >
>> > Cheers,
>> > Ben.
>> >
>> >
>>
>> Some concern was raised about race situation so just to be more clear
>> about race condition.
>
> This is not what the above discussion is about.
>
> The race with is is_added is due to the fact that the bit is set too
> later as discussed previously and in my changelog.
>
> The race I'm talking about here is the race related to multiple
> subtrees trying to simultaneously enable a parent bridge.
>
>> Situation is described in Bug 200283 - PCI: Data corruption happening
>> due to a race condition.
>> Issue was discovered by our broadcom QA team.
>> Initially commit was to use one separate lock only for avoiding race
>> condition but after review, approach was slightly changed to move
>> is_added  to pci_dev private flags as it was completely
>> internal/private variable of pci core driver.
>> Powerpc legacy arch code was using is_added flag directly which looks
>> bit strange so  ../../ type of inclusion of pci.h was required. I know
>> it looks ugly but it is being used in some legacy code still.
>> Anyway, as stated earlier too, problem should be just solved in better way.
>
> The is_added race can be fixed with a 3 lines patch moving is_added up
> to before device_attach() I believe. I yet have to find a scenario
> where doing so would break something but it's possible that I missed a
> case.
>
> At this stage, I'm more intested however in us agreeing how to fix the
> other race, the one with enabling. As I wrote above, I proposed an
> approach based on a mutex in pci_dev, and this is what I would like
> discussed.
>
> This race is currently causing our large systems to randomly error out
> at boot time when probing some PCIe devices. I'm putting a band-aid in
> the powerpc code for now to pre-enable bridges at boot, but I'd rather
> have the race fixed in the generic code.
>
> Ben.
>
>
I was initially using spin lock in
"PATCH v1] PCI: Data corruption happening due to race condition" and didn't
face  issue at-least in our environment for our race condition.
Anyway, we can leave this minor is_added issue time-being and concentrate on
current pci bridge concern. Could you re-share your latest patch in
your environment
and your first doubt where race situation could happen. May be forum
can suggest
some-thing good.


Regard,
hari.
diff mbox series

Patch

diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 471aac313b89..fe9733ffffaa 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -42,8 +42,6 @@ 
 #include <asm/ppc-pci.h>
 #include <asm/eeh.h>
 
-#include "../../../drivers/pci/pci.h"
-
 /* hose_spinlock protects accesses to the the phb_bitmap. */
 static DEFINE_SPINLOCK(hose_spinlock);
 LIST_HEAD(hose_list);
@@ -1016,7 +1014,7 @@  void pcibios_setup_bus_devices(struct pci_bus *bus)
 		/* Cardbus can call us to add new devices to a bus, so ignore
 		 * those who are already fully discovered
 		 */
-		if (pci_dev_is_added(dev))
+		if (dev->is_added)
 			continue;
 
 		pcibios_setup_device(dev);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 70b2e1e0f23c..5bd0eb6681bc 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -46,7 +46,6 @@ 
 
 #include "powernv.h"
 #include "pci.h"
-#include "../../../../drivers/pci/pci.h"
 
 #define PNV_IODA1_M64_NUM	16	/* Number of M64 BARs	*/
 #define PNV_IODA1_M64_SEGS	8	/* Segments per M64 BAR	*/
@@ -3139,7 +3138,7 @@  static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 	struct pci_dn *pdn;
 	int mul, total_vfs;
 
-	if (!pdev->is_physfn || pci_dev_is_added(pdev))
+	if (!pdev->is_physfn || pdev->is_added)
 		return;
 
 	pdn = pci_get_pdn(pdev);
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 8a4868a3964b..139f0af6c3d9 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -71,7 +71,6 @@ 
 #include <asm/security_features.h>
 
 #include "pseries.h"
-#include "../../../../drivers/pci/pci.h"
 
 int CMO_PrPSP = -1;
 int CMO_SecPSP = -1;
@@ -665,7 +664,7 @@  static void pseries_pci_fixup_iov_resources(struct pci_dev *pdev)
 	const int *indexes;
 	struct device_node *dn = pci_device_to_OF_node(pdev);
 
-	if (!pdev->is_physfn || pci_dev_is_added(pdev))
+	if (!pdev->is_physfn || pdev->is_added)
 		return;
 	/*Firmware must support open sriov otherwise dont configure*/
 	indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL);
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 5cb40b2518f9..35b7fc87eac5 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -330,7 +330,7 @@  void pci_bus_add_device(struct pci_dev *dev)
 		return;
 	}
 
-	pci_dev_assign_added(dev, true);
+	dev->is_added = 1;
 }
 EXPORT_SYMBOL_GPL(pci_bus_add_device);
 
@@ -347,14 +347,14 @@  void pci_bus_add_devices(const struct pci_bus *bus)
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		/* Skip already-added devices */
-		if (pci_dev_is_added(dev))
+		if (dev->is_added)
 			continue;
 		pci_bus_add_device(dev);
 	}
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		/* Skip if device attach failed */
-		if (!pci_dev_is_added(dev))
+		if (!dev->is_added)
 			continue;
 		child = dev->subordinate;
 		if (child)
diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index ef0b1b6ba86f..3a17b290df5d 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -509,7 +509,7 @@  static void enable_slot(struct acpiphp_slot *slot)
 
 	list_for_each_entry(dev, &bus->devices, bus_list) {
 		/* Assume that newly added devices are powered on already. */
-		if (!pci_dev_is_added(dev))
+		if (!dev->is_added)
 			dev->current_state = PCI_D0;
 	}
 
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 6e0d1528d471..473aa10a5dbf 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -295,7 +295,6 @@  struct pci_sriov {
 
 /* pci_dev priv_flags */
 #define PCI_DEV_DISCONNECTED 0
-#define PCI_DEV_ADDED 1
 
 static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused)
 {
@@ -308,16 +307,6 @@  static inline bool pci_dev_is_disconnected(const struct pci_dev *dev)
 	return test_bit(PCI_DEV_DISCONNECTED, &dev->priv_flags);
 }
 
-static inline void pci_dev_assign_added(struct pci_dev *dev, bool added)
-{
-	assign_bit(PCI_DEV_ADDED, &dev->priv_flags, added);
-}
-
-static inline bool pci_dev_is_added(const struct pci_dev *dev)
-{
-	return test_bit(PCI_DEV_ADDED, &dev->priv_flags);
-}
-
 #ifdef CONFIG_PCIEAER
 #include <linux/aer.h>
 
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index ec784009a36b..440445ac7dfa 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2525,13 +2525,13 @@  int pci_scan_slot(struct pci_bus *bus, int devfn)
 	dev = pci_scan_single_device(bus, devfn);
 	if (!dev)
 		return 0;
-	if (!pci_dev_is_added(dev))
+	if (!dev->is_added)
 		nr++;
 
 	for (fn = next_fn(bus, dev, 0); fn > 0; fn = next_fn(bus, dev, fn)) {
 		dev = pci_scan_single_device(bus, devfn + fn);
 		if (dev) {
-			if (!pci_dev_is_added(dev))
+			if (!dev->is_added)
 				nr++;
 			dev->multifunction = 1;
 		}
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index 461e7fd2756f..01ec7fcb5634 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -18,12 +18,11 @@  static void pci_stop_dev(struct pci_dev *dev)
 {
 	pci_pme_active(dev, false);
 
-	if (pci_dev_is_added(dev)) {
+	if (dev->is_added) {
 		device_release_driver(&dev->dev);
 		pci_proc_detach_device(dev);
 		pci_remove_sysfs_dev_files(dev);
-
-		pci_dev_assign_added(dev, false);
+		dev->is_added = 0;
 	}
 
 	if (dev->bus->self)
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 9b87f1936906..9799109c5e25 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -373,6 +373,7 @@  struct pci_dev {
 	unsigned int	transparent:1;		/* Subtractive decode bridge */
 	unsigned int	multifunction:1;	/* Multi-function device */
 
+	unsigned int	is_added:1;
 	unsigned int	is_busmaster:1;		/* Is busmaster */
 	unsigned int	no_msi:1;		/* May not use MSI */
 	unsigned int	no_64bit_msi:1; 	/* May only use 32-bit MSIs */