Message ID | 20180817044902.31420-2-benh@kernel.crashing.org (mailing list archive) |
---|---|
State | New, archived |
Delegated to: | Bjorn Helgaas |
Headers | show |
Series | pci: Rework is_added race fix and address bridge enable races | expand |
On Fri, 2018-08-17 at 14:48 +1000, Benjamin Herrenschmidt wrote: > This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76. > > The new pci state mutex provides a simpler way of addressing > this race and avoids the relative includes added to the powerpc > code. Ignore the cset comment, my "fix" no longer relies on a state mutex, I'll re-post with a better comment after discussions. The actual fix is in patch 2: [RFC PATCH 2/6] pci: Set pci_dev->is_added before calling device_add (and yes that was supposed be device_attach ... ugh, not enough caffeine today). > Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> > --- > arch/powerpc/kernel/pci-common.c | 4 +--- > arch/powerpc/platforms/powernv/pci-ioda.c | 3 +-- > arch/powerpc/platforms/pseries/setup.c | 3 +-- > drivers/pci/bus.c | 6 +++--- > drivers/pci/hotplug/acpiphp_glue.c | 2 +- > drivers/pci/pci.h | 11 ----------- > drivers/pci/probe.c | 4 ++-- > drivers/pci/remove.c | 5 ++--- > include/linux/pci.h | 1 + > 9 files changed, 12 insertions(+), 27 deletions(-) > > diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c > index 471aac313b89..fe9733ffffaa 100644 > --- a/arch/powerpc/kernel/pci-common.c > +++ b/arch/powerpc/kernel/pci-common.c > @@ -42,8 +42,6 @@ > #include <asm/ppc-pci.h> > #include <asm/eeh.h> > > -#include "../../../drivers/pci/pci.h" > - > /* hose_spinlock protects accesses to the the phb_bitmap. */ > static DEFINE_SPINLOCK(hose_spinlock); > LIST_HEAD(hose_list); > @@ -1016,7 +1014,7 @@ void pcibios_setup_bus_devices(struct pci_bus *bus) > /* Cardbus can call us to add new devices to a bus, so ignore > * those who are already fully discovered > */ > - if (pci_dev_is_added(dev)) > + if (dev->is_added) > continue; > > pcibios_setup_device(dev); > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c > index 70b2e1e0f23c..5bd0eb6681bc 100644 > --- a/arch/powerpc/platforms/powernv/pci-ioda.c > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c > @@ -46,7 +46,6 @@ > > #include "powernv.h" > #include "pci.h" > -#include "../../../../drivers/pci/pci.h" > > #define PNV_IODA1_M64_NUM 16 /* Number of M64 BARs */ > #define PNV_IODA1_M64_SEGS 8 /* Segments per M64 BAR */ > @@ -3139,7 +3138,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) > struct pci_dn *pdn; > int mul, total_vfs; > > - if (!pdev->is_physfn || pci_dev_is_added(pdev)) > + if (!pdev->is_physfn || pdev->is_added) > return; > > pdn = pci_get_pdn(pdev); > diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c > index 8a4868a3964b..139f0af6c3d9 100644 > --- a/arch/powerpc/platforms/pseries/setup.c > +++ b/arch/powerpc/platforms/pseries/setup.c > @@ -71,7 +71,6 @@ > #include <asm/security_features.h> > > #include "pseries.h" > -#include "../../../../drivers/pci/pci.h" > > int CMO_PrPSP = -1; > int CMO_SecPSP = -1; > @@ -665,7 +664,7 @@ static void pseries_pci_fixup_iov_resources(struct pci_dev *pdev) > const int *indexes; > struct device_node *dn = pci_device_to_OF_node(pdev); > > - if (!pdev->is_physfn || pci_dev_is_added(pdev)) > + if (!pdev->is_physfn || pdev->is_added) > return; > /*Firmware must support open sriov otherwise dont configure*/ > indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL); > diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c > index 5cb40b2518f9..35b7fc87eac5 100644 > --- a/drivers/pci/bus.c > +++ b/drivers/pci/bus.c > @@ -330,7 +330,7 @@ void pci_bus_add_device(struct pci_dev *dev) > return; > } > > - pci_dev_assign_added(dev, true); > + dev->is_added = 1; > } > EXPORT_SYMBOL_GPL(pci_bus_add_device); > > @@ -347,14 +347,14 @@ void pci_bus_add_devices(const struct pci_bus *bus) > > list_for_each_entry(dev, &bus->devices, bus_list) { > /* Skip already-added devices */ > - if (pci_dev_is_added(dev)) > + if (dev->is_added) > continue; > pci_bus_add_device(dev); > } > > list_for_each_entry(dev, &bus->devices, bus_list) { > /* Skip if device attach failed */ > - if (!pci_dev_is_added(dev)) > + if (!dev->is_added) > continue; > child = dev->subordinate; > if (child) > diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c > index ef0b1b6ba86f..3a17b290df5d 100644 > --- a/drivers/pci/hotplug/acpiphp_glue.c > +++ b/drivers/pci/hotplug/acpiphp_glue.c > @@ -509,7 +509,7 @@ static void enable_slot(struct acpiphp_slot *slot) > > list_for_each_entry(dev, &bus->devices, bus_list) { > /* Assume that newly added devices are powered on already. */ > - if (!pci_dev_is_added(dev)) > + if (!dev->is_added) > dev->current_state = PCI_D0; > } > > diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h > index 6e0d1528d471..473aa10a5dbf 100644 > --- a/drivers/pci/pci.h > +++ b/drivers/pci/pci.h > @@ -295,7 +295,6 @@ struct pci_sriov { > > /* pci_dev priv_flags */ > #define PCI_DEV_DISCONNECTED 0 > -#define PCI_DEV_ADDED 1 > > static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused) > { > @@ -308,16 +307,6 @@ static inline bool pci_dev_is_disconnected(const struct pci_dev *dev) > return test_bit(PCI_DEV_DISCONNECTED, &dev->priv_flags); > } > > -static inline void pci_dev_assign_added(struct pci_dev *dev, bool added) > -{ > - assign_bit(PCI_DEV_ADDED, &dev->priv_flags, added); > -} > - > -static inline bool pci_dev_is_added(const struct pci_dev *dev) > -{ > - return test_bit(PCI_DEV_ADDED, &dev->priv_flags); > -} > - > #ifdef CONFIG_PCIEAER > #include <linux/aer.h> > > diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c > index ec784009a36b..440445ac7dfa 100644 > --- a/drivers/pci/probe.c > +++ b/drivers/pci/probe.c > @@ -2525,13 +2525,13 @@ int pci_scan_slot(struct pci_bus *bus, int devfn) > dev = pci_scan_single_device(bus, devfn); > if (!dev) > return 0; > - if (!pci_dev_is_added(dev)) > + if (!dev->is_added) > nr++; > > for (fn = next_fn(bus, dev, 0); fn > 0; fn = next_fn(bus, dev, fn)) { > dev = pci_scan_single_device(bus, devfn + fn); > if (dev) { > - if (!pci_dev_is_added(dev)) > + if (!dev->is_added) > nr++; > dev->multifunction = 1; > } > diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c > index 461e7fd2756f..01ec7fcb5634 100644 > --- a/drivers/pci/remove.c > +++ b/drivers/pci/remove.c > @@ -18,12 +18,11 @@ static void pci_stop_dev(struct pci_dev *dev) > { > pci_pme_active(dev, false); > > - if (pci_dev_is_added(dev)) { > + if (dev->is_added) { > device_release_driver(&dev->dev); > pci_proc_detach_device(dev); > pci_remove_sysfs_dev_files(dev); > - > - pci_dev_assign_added(dev, false); > + dev->is_added = 0; > } > > if (dev->bus->self) > diff --git a/include/linux/pci.h b/include/linux/pci.h > index 9b87f1936906..9799109c5e25 100644 > --- a/include/linux/pci.h > +++ b/include/linux/pci.h > @@ -373,6 +373,7 @@ struct pci_dev { > unsigned int transparent:1; /* Subtractive decode bridge */ > unsigned int multifunction:1; /* Multi-function device */ > > + unsigned int is_added:1; > unsigned int is_busmaster:1; /* Is busmaster */ > unsigned int no_msi:1; /* May not use MSI */ > unsigned int no_64bit_msi:1; /* May only use 32-bit MSIs */
On Fri, Aug 17, 2018 at 02:48:57PM +1000, Benjamin Herrenschmidt wrote: > This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76. Just to be clear, if I understand correctly, this is a pure revert of 44bda4b7d26e and as such it reintroduces the problem solved by that commit. If your solution turns out to be better, that's great, but it would be nice to avoid the bisection hole of reintroducing the problem, then fixing it again later. > The new pci state mutex provides a simpler way of addressing > this race and avoids the relative includes added to the powerpc > code.
On Fri, 2018-08-17 at 10:44 -0500, Bjorn Helgaas wrote: > On Fri, Aug 17, 2018 at 02:48:57PM +1000, Benjamin Herrenschmidt wrote: > > This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76. > > Just to be clear, if I understand correctly, this is a pure revert of > 44bda4b7d26e and as such it reintroduces the problem solved by that > commit. > > If your solution turns out to be better, that's great, but it would be > nice to avoid the bisection hole of reintroducing the problem, then > fixing it again later. There is no way to do that other than merging the revert and the fix into one. That said, the race is sufficiently hard to hit that I wouldn't worry too much about it. > > The new pci state mutex provides a simpler way of addressing > > this race and avoids the relative includes added to the powerpc > > code.
On Sat, Aug 18, 2018 at 01:24:51PM +1000, Benjamin Herrenschmidt wrote: > On Fri, 2018-08-17 at 10:44 -0500, Bjorn Helgaas wrote: > > On Fri, Aug 17, 2018 at 02:48:57PM +1000, Benjamin Herrenschmidt wrote: > > > This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76. > > > > Just to be clear, if I understand correctly, this is a pure revert of > > 44bda4b7d26e and as such it reintroduces the problem solved by that > > commit. > > > > If your solution turns out to be better, that's great, but it would be > > nice to avoid the bisection hole of reintroducing the problem, then > > fixing it again later. > > There is no way to do that other than merging the revert and the fix > into one. That said, the race is sufficiently hard to hit that I > wouldn't worry too much about it. OK, then at least mention that in the changelog. > > > The new pci state mutex provides a simpler way of addressing > > > this race and avoids the relative includes added to the powerpc > > > code. >
On Sat, 2018-08-18 at 21:24 -0500, Bjorn Helgaas wrote: > On Sat, Aug 18, 2018 at 01:24:51PM +1000, Benjamin Herrenschmidt wrote: > > On Fri, 2018-08-17 at 10:44 -0500, Bjorn Helgaas wrote: > > > On Fri, Aug 17, 2018 at 02:48:57PM +1000, Benjamin Herrenschmidt wrote: > > > > This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76. > > > > > > Just to be clear, if I understand correctly, this is a pure revert of > > > 44bda4b7d26e and as such it reintroduces the problem solved by that > > > commit. > > > > > > If your solution turns out to be better, that's great, but it would be > > > nice to avoid the bisection hole of reintroducing the problem, then > > > fixing it again later. > > > > There is no way to do that other than merging the revert and the fix > > into one. That said, the race is sufficiently hard to hit that I > > wouldn't worry too much about it. > > OK, then at least mention that in the changelog. Sure will do. This is just RFC at this stage :-) As for the race with enable, what's your take on my approach ? The basic premise is that we need some kind of mutex to make the updates to enable_cnt and the actual config space changes atomic. While at it we can fold pci_set_master vs. is_busmaster as well as those are racy too. I chose to create a new mutex which we should be able to address other similar races if we find them. The other solutions that I dismissed were: - Using the device_lock. A explained previously, this is tricky, I prefer not using this for anything other than locking against concurrent add/remove. The main issue is that drivers will be sometimes called in context where that's already held, so we can't take it inside pci_enable_device() and I'd rather not add new constraints such as "pci_enable_device() must be only called from probe() unless you also take the device lock". It would be tricky to audit everybody... - Using a global mutex. We could move the bridge lock from AER to core code for example, and use that. But it doesn't buy us much, and slightly redecuces parallelism. It also makes it a little bit more messy to walk up the bridge chain, we'd have to do a pci_enable_device_unlocked or something, messy. So are you ok with the approach ? Do you prefer one of the above regardless ? Something else ? Cheers, Ben.
On Mon, Aug 20, 2018 at 7:40 AM, Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote: > On Sat, 2018-08-18 at 21:24 -0500, Bjorn Helgaas wrote: >> On Sat, Aug 18, 2018 at 01:24:51PM +1000, Benjamin Herrenschmidt wrote: >> > On Fri, 2018-08-17 at 10:44 -0500, Bjorn Helgaas wrote: >> > > On Fri, Aug 17, 2018 at 02:48:57PM +1000, Benjamin Herrenschmidt wrote: >> > > > This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76. >> > > >> > > Just to be clear, if I understand correctly, this is a pure revert of >> > > 44bda4b7d26e and as such it reintroduces the problem solved by that >> > > commit. >> > > >> > > If your solution turns out to be better, that's great, but it would be >> > > nice to avoid the bisection hole of reintroducing the problem, then >> > > fixing it again later. >> > >> > There is no way to do that other than merging the revert and the fix >> > into one. That said, the race is sufficiently hard to hit that I >> > wouldn't worry too much about it. >> >> OK, then at least mention that in the changelog. > > Sure will do. This is just RFC at this stage :-) > > As for the race with enable, what's your take on my approach ? The > basic premise is that we need some kind of mutex to make the updates to > enable_cnt and the actual config space changes atomic. While at it we > can fold pci_set_master vs. is_busmaster as well as those are racy too. > > I chose to create a new mutex which we should be able to address other > similar races if we find them. The other solutions that I dismissed > were: > > - Using the device_lock. A explained previously, this is tricky, I > prefer not using this for anything other than locking against > concurrent add/remove. The main issue is that drivers will be sometimes > called in context where that's already held, so we can't take it inside > pci_enable_device() and I'd rather not add new constraints such as > "pci_enable_device() must be only called from probe() unless you also > take the device lock". It would be tricky to audit everybody... > > - Using a global mutex. We could move the bridge lock from AER to core > code for example, and use that. But it doesn't buy us much, and > slightly redecuces parallelism. It also makes it a little bit more > messy to walk up the bridge chain, we'd have to do a > pci_enable_device_unlocked or something, messy. > > So are you ok with the approach ? Do you prefer one of the above > regardless ? Something else ? > > Cheers, > Ben. > > Some concern was raised about race situation so just to be more clear about race condition. Situation is described in Bug 200283 - PCI: Data corruption happening due to a race condition. Issue was discovered by our broadcom QA team. Initially commit was to use one separate lock only for avoiding race condition but after review, approach was slightly changed to move is_added to pci_dev private flags as it was completely internal/private variable of pci core driver. Powerpc legacy arch code was using is_added flag directly which looks bit strange so ../../ type of inclusion of pci.h was required. I know it looks ugly but it is being used in some legacy code still. Anyway, as stated earlier too, problem should be just solved in better way. Regards, hari
On Mon, Aug 20, 2018 at 12:10:59PM +1000, Benjamin Herrenschmidt wrote: > I chose to create a new mutex which we should be able to address other > similar races if we find them. The other solutions that I dismissed > were: > > - Using the device_lock. A explained previously, this is tricky, I > prefer not using this for anything other than locking against > concurrent add/remove. The main issue is that drivers will be sometimes > called in context where that's already held, so we can't take it inside > pci_enable_device() and I'd rather not add new constraints such as > "pci_enable_device() must be only called from probe() unless you also > take the device lock". It would be tricky to audit everybody... > > - Using a global mutex. We could move the bridge lock from AER to core > code for example, and use that. But it doesn't buy us much, and > slightly redecuces parallelism. It also makes it a little bit more > messy to walk up the bridge chain, we'd have to do a > pci_enable_device_unlocked or something, messy. +1 from my side for adding a struct mutex to struct pci_dev to protect state changes. The device_lock() primarily protects binding / unbinding of the device and pci_dev state may have to be changed while binding / unbinding. A global lock invites deadlocks if multiple devices are added / removed concurrently where one is a parent of the other. (Think hot-removal of multiple devices on a Thunderbolt daisy-chain.) As said I'd also welcome folding PCI_DEV_DISCONNECTED into enum pci_channel_state, either as an additional state or by using pci_channel_io_perm_failure. Thanks, Lukas
On Mon, 2018-08-20 at 11:55 +0530, Hari Vyas wrote: > On Mon, Aug 20, 2018 at 7:40 AM, Benjamin Herrenschmidt > <benh@kernel.crashing.org> wrote: > > On Sat, 2018-08-18 at 21:24 -0500, Bjorn Helgaas wrote: > > > On Sat, Aug 18, 2018 at 01:24:51PM +1000, Benjamin Herrenschmidt wrote: > > > > On Fri, 2018-08-17 at 10:44 -0500, Bjorn Helgaas wrote: > > > > > On Fri, Aug 17, 2018 at 02:48:57PM +1000, Benjamin Herrenschmidt wrote: > > > > > > This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76. > > > > > > > > > > Just to be clear, if I understand correctly, this is a pure revert of > > > > > 44bda4b7d26e and as such it reintroduces the problem solved by that > > > > > commit. > > > > > > > > > > If your solution turns out to be better, that's great, but it would be > > > > > nice to avoid the bisection hole of reintroducing the problem, then > > > > > fixing it again later. > > > > > > > > There is no way to do that other than merging the revert and the fix > > > > into one. That said, the race is sufficiently hard to hit that I > > > > wouldn't worry too much about it. > > > > > > OK, then at least mention that in the changelog. > > > > Sure will do. This is just RFC at this stage :-) > > > > As for the race with enable, what's your take on my approach ? The > > basic premise is that we need some kind of mutex to make the updates to > > enable_cnt and the actual config space changes atomic. While at it we > > can fold pci_set_master vs. is_busmaster as well as those are racy too. > > > > I chose to create a new mutex which we should be able to address other > > similar races if we find them. The other solutions that I dismissed > > were: > > > > - Using the device_lock. A explained previously, this is tricky, I > > prefer not using this for anything other than locking against > > concurrent add/remove. The main issue is that drivers will be sometimes > > called in context where that's already held, so we can't take it inside > > pci_enable_device() and I'd rather not add new constraints such as > > "pci_enable_device() must be only called from probe() unless you also > > take the device lock". It would be tricky to audit everybody... > > > > - Using a global mutex. We could move the bridge lock from AER to core > > code for example, and use that. But it doesn't buy us much, and > > slightly redecuces parallelism. It also makes it a little bit more > > messy to walk up the bridge chain, we'd have to do a > > pci_enable_device_unlocked or something, messy. > > > > So are you ok with the approach ? Do you prefer one of the above > > regardless ? Something else ? > > > > Cheers, > > Ben. > > > > > > Some concern was raised about race situation so just to be more clear > about race condition. This is not what the above discussion is about. The race with is is_added is due to the fact that the bit is set too later as discussed previously and in my changelog. The race I'm talking about here is the race related to multiple subtrees trying to simultaneously enable a parent bridge. > Situation is described in Bug 200283 - PCI: Data corruption happening > due to a race condition. > Issue was discovered by our broadcom QA team. > Initially commit was to use one separate lock only for avoiding race > condition but after review, approach was slightly changed to move > is_added to pci_dev private flags as it was completely > internal/private variable of pci core driver. > Powerpc legacy arch code was using is_added flag directly which looks > bit strange so ../../ type of inclusion of pci.h was required. I know > it looks ugly but it is being used in some legacy code still. > Anyway, as stated earlier too, problem should be just solved in better way. The is_added race can be fixed with a 3 lines patch moving is_added up to before device_attach() I believe. I yet have to find a scenario where doing so would break something but it's possible that I missed a case. At this stage, I'm more intested however in us agreeing how to fix the other race, the one with enabling. As I wrote above, I proposed an approach based on a mutex in pci_dev, and this is what I would like discussed. This race is currently causing our large systems to randomly error out at boot time when probing some PCIe devices. I'm putting a band-aid in the powerpc code for now to pre-enable bridges at boot, but I'd rather have the race fixed in the generic code. Ben.
On Mon, 2018-08-20 at 09:17 +0200, Lukas Wunner wrote: > On Mon, Aug 20, 2018 at 12:10:59PM +1000, Benjamin Herrenschmidt wrote: > > I chose to create a new mutex which we should be able to address other > > similar races if we find them. The other solutions that I dismissed > > were: > > > > - Using the device_lock. A explained previously, this is tricky, I > > prefer not using this for anything other than locking against > > concurrent add/remove. The main issue is that drivers will be sometimes > > called in context where that's already held, so we can't take it inside > > pci_enable_device() and I'd rather not add new constraints such as > > "pci_enable_device() must be only called from probe() unless you also > > take the device lock". It would be tricky to audit everybody... > > > > - Using a global mutex. We could move the bridge lock from AER to core > > code for example, and use that. But it doesn't buy us much, and > > slightly redecuces parallelism. It also makes it a little bit more > > messy to walk up the bridge chain, we'd have to do a > > pci_enable_device_unlocked or something, messy. > > +1 from my side for adding a struct mutex to struct pci_dev to protect > state changes. Ok thanks. This is what my patch proposes. We can use it later to protect more things if we wish to do so. > The device_lock() primarily protects binding / unbinding of the device > and pci_dev state may have to be changed while binding / unbinding. Yup, precisely. > A global lock invites deadlocks if multiple devices are added / removed > concurrently where one is a parent of the other. (Think hot-removal of > multiple devices on a Thunderbolt daisy-chain.) Yes. > As said I'd also welcome folding PCI_DEV_DISCONNECTED into enum > pci_channel_state, either as an additional state or by using > pci_channel_io_perm_failure. Ok. I have that in my tentative series but I think for robustness, I should make the error_state field atomically updated in order to ensure that no transition out of "disconnected" can happen while racing with concurrent error_state updates at interrupt time (at least with EEH, it can be updated from any read{b,w,l,q}). I'll do a bit more work on the patches this week as time permits and send a non-RFC series. Cheers, Ben. > Thanks, > > Lukas
On Mon, Aug 20, 2018 at 4:39 PM, Benjamin Herrenschmidt <benh@kernel.crashing.org> wrote: > On Mon, 2018-08-20 at 11:55 +0530, Hari Vyas wrote: >> On Mon, Aug 20, 2018 at 7:40 AM, Benjamin Herrenschmidt >> <benh@kernel.crashing.org> wrote: >> > On Sat, 2018-08-18 at 21:24 -0500, Bjorn Helgaas wrote: >> > > On Sat, Aug 18, 2018 at 01:24:51PM +1000, Benjamin Herrenschmidt wrote: >> > > > On Fri, 2018-08-17 at 10:44 -0500, Bjorn Helgaas wrote: >> > > > > On Fri, Aug 17, 2018 at 02:48:57PM +1000, Benjamin Herrenschmidt wrote: >> > > > > > This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76. >> > > > > >> > > > > Just to be clear, if I understand correctly, this is a pure revert of >> > > > > 44bda4b7d26e and as such it reintroduces the problem solved by that >> > > > > commit. >> > > > > >> > > > > If your solution turns out to be better, that's great, but it would be >> > > > > nice to avoid the bisection hole of reintroducing the problem, then >> > > > > fixing it again later. >> > > > >> > > > There is no way to do that other than merging the revert and the fix >> > > > into one. That said, the race is sufficiently hard to hit that I >> > > > wouldn't worry too much about it. >> > > >> > > OK, then at least mention that in the changelog. >> > >> > Sure will do. This is just RFC at this stage :-) >> > >> > As for the race with enable, what's your take on my approach ? The >> > basic premise is that we need some kind of mutex to make the updates to >> > enable_cnt and the actual config space changes atomic. While at it we >> > can fold pci_set_master vs. is_busmaster as well as those are racy too. >> > >> > I chose to create a new mutex which we should be able to address other >> > similar races if we find them. The other solutions that I dismissed >> > were: >> > >> > - Using the device_lock. A explained previously, this is tricky, I >> > prefer not using this for anything other than locking against >> > concurrent add/remove. The main issue is that drivers will be sometimes >> > called in context where that's already held, so we can't take it inside >> > pci_enable_device() and I'd rather not add new constraints such as >> > "pci_enable_device() must be only called from probe() unless you also >> > take the device lock". It would be tricky to audit everybody... >> > >> > - Using a global mutex. We could move the bridge lock from AER to core >> > code for example, and use that. But it doesn't buy us much, and >> > slightly redecuces parallelism. It also makes it a little bit more >> > messy to walk up the bridge chain, we'd have to do a >> > pci_enable_device_unlocked or something, messy. >> > >> > So are you ok with the approach ? Do you prefer one of the above >> > regardless ? Something else ? >> > >> > Cheers, >> > Ben. >> > >> > >> >> Some concern was raised about race situation so just to be more clear >> about race condition. > > This is not what the above discussion is about. > > The race with is is_added is due to the fact that the bit is set too > later as discussed previously and in my changelog. > > The race I'm talking about here is the race related to multiple > subtrees trying to simultaneously enable a parent bridge. > >> Situation is described in Bug 200283 - PCI: Data corruption happening >> due to a race condition. >> Issue was discovered by our broadcom QA team. >> Initially commit was to use one separate lock only for avoiding race >> condition but after review, approach was slightly changed to move >> is_added to pci_dev private flags as it was completely >> internal/private variable of pci core driver. >> Powerpc legacy arch code was using is_added flag directly which looks >> bit strange so ../../ type of inclusion of pci.h was required. I know >> it looks ugly but it is being used in some legacy code still. >> Anyway, as stated earlier too, problem should be just solved in better way. > > The is_added race can be fixed with a 3 lines patch moving is_added up > to before device_attach() I believe. I yet have to find a scenario > where doing so would break something but it's possible that I missed a > case. > > At this stage, I'm more intested however in us agreeing how to fix the > other race, the one with enabling. As I wrote above, I proposed an > approach based on a mutex in pci_dev, and this is what I would like > discussed. > > This race is currently causing our large systems to randomly error out > at boot time when probing some PCIe devices. I'm putting a band-aid in > the powerpc code for now to pre-enable bridges at boot, but I'd rather > have the race fixed in the generic code. > > Ben. > > I was initially using spin lock in "PATCH v1] PCI: Data corruption happening due to race condition" and didn't face issue at-least in our environment for our race condition. Anyway, we can leave this minor is_added issue time-being and concentrate on current pci bridge concern. Could you re-share your latest patch in your environment and your first doubt where race situation could happen. May be forum can suggest some-thing good. Regard, hari.
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 471aac313b89..fe9733ffffaa 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -42,8 +42,6 @@ #include <asm/ppc-pci.h> #include <asm/eeh.h> -#include "../../../drivers/pci/pci.h" - /* hose_spinlock protects accesses to the the phb_bitmap. */ static DEFINE_SPINLOCK(hose_spinlock); LIST_HEAD(hose_list); @@ -1016,7 +1014,7 @@ void pcibios_setup_bus_devices(struct pci_bus *bus) /* Cardbus can call us to add new devices to a bus, so ignore * those who are already fully discovered */ - if (pci_dev_is_added(dev)) + if (dev->is_added) continue; pcibios_setup_device(dev); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 70b2e1e0f23c..5bd0eb6681bc 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -46,7 +46,6 @@ #include "powernv.h" #include "pci.h" -#include "../../../../drivers/pci/pci.h" #define PNV_IODA1_M64_NUM 16 /* Number of M64 BARs */ #define PNV_IODA1_M64_SEGS 8 /* Segments per M64 BAR */ @@ -3139,7 +3138,7 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev) struct pci_dn *pdn; int mul, total_vfs; - if (!pdev->is_physfn || pci_dev_is_added(pdev)) + if (!pdev->is_physfn || pdev->is_added) return; pdn = pci_get_pdn(pdev); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 8a4868a3964b..139f0af6c3d9 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -71,7 +71,6 @@ #include <asm/security_features.h> #include "pseries.h" -#include "../../../../drivers/pci/pci.h" int CMO_PrPSP = -1; int CMO_SecPSP = -1; @@ -665,7 +664,7 @@ static void pseries_pci_fixup_iov_resources(struct pci_dev *pdev) const int *indexes; struct device_node *dn = pci_device_to_OF_node(pdev); - if (!pdev->is_physfn || pci_dev_is_added(pdev)) + if (!pdev->is_physfn || pdev->is_added) return; /*Firmware must support open sriov otherwise dont configure*/ indexes = of_get_property(dn, "ibm,open-sriov-vf-bar-info", NULL); diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c index 5cb40b2518f9..35b7fc87eac5 100644 --- a/drivers/pci/bus.c +++ b/drivers/pci/bus.c @@ -330,7 +330,7 @@ void pci_bus_add_device(struct pci_dev *dev) return; } - pci_dev_assign_added(dev, true); + dev->is_added = 1; } EXPORT_SYMBOL_GPL(pci_bus_add_device); @@ -347,14 +347,14 @@ void pci_bus_add_devices(const struct pci_bus *bus) list_for_each_entry(dev, &bus->devices, bus_list) { /* Skip already-added devices */ - if (pci_dev_is_added(dev)) + if (dev->is_added) continue; pci_bus_add_device(dev); } list_for_each_entry(dev, &bus->devices, bus_list) { /* Skip if device attach failed */ - if (!pci_dev_is_added(dev)) + if (!dev->is_added) continue; child = dev->subordinate; if (child) diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c index ef0b1b6ba86f..3a17b290df5d 100644 --- a/drivers/pci/hotplug/acpiphp_glue.c +++ b/drivers/pci/hotplug/acpiphp_glue.c @@ -509,7 +509,7 @@ static void enable_slot(struct acpiphp_slot *slot) list_for_each_entry(dev, &bus->devices, bus_list) { /* Assume that newly added devices are powered on already. */ - if (!pci_dev_is_added(dev)) + if (!dev->is_added) dev->current_state = PCI_D0; } diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h index 6e0d1528d471..473aa10a5dbf 100644 --- a/drivers/pci/pci.h +++ b/drivers/pci/pci.h @@ -295,7 +295,6 @@ struct pci_sriov { /* pci_dev priv_flags */ #define PCI_DEV_DISCONNECTED 0 -#define PCI_DEV_ADDED 1 static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused) { @@ -308,16 +307,6 @@ static inline bool pci_dev_is_disconnected(const struct pci_dev *dev) return test_bit(PCI_DEV_DISCONNECTED, &dev->priv_flags); } -static inline void pci_dev_assign_added(struct pci_dev *dev, bool added) -{ - assign_bit(PCI_DEV_ADDED, &dev->priv_flags, added); -} - -static inline bool pci_dev_is_added(const struct pci_dev *dev) -{ - return test_bit(PCI_DEV_ADDED, &dev->priv_flags); -} - #ifdef CONFIG_PCIEAER #include <linux/aer.h> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index ec784009a36b..440445ac7dfa 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -2525,13 +2525,13 @@ int pci_scan_slot(struct pci_bus *bus, int devfn) dev = pci_scan_single_device(bus, devfn); if (!dev) return 0; - if (!pci_dev_is_added(dev)) + if (!dev->is_added) nr++; for (fn = next_fn(bus, dev, 0); fn > 0; fn = next_fn(bus, dev, fn)) { dev = pci_scan_single_device(bus, devfn + fn); if (dev) { - if (!pci_dev_is_added(dev)) + if (!dev->is_added) nr++; dev->multifunction = 1; } diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c index 461e7fd2756f..01ec7fcb5634 100644 --- a/drivers/pci/remove.c +++ b/drivers/pci/remove.c @@ -18,12 +18,11 @@ static void pci_stop_dev(struct pci_dev *dev) { pci_pme_active(dev, false); - if (pci_dev_is_added(dev)) { + if (dev->is_added) { device_release_driver(&dev->dev); pci_proc_detach_device(dev); pci_remove_sysfs_dev_files(dev); - - pci_dev_assign_added(dev, false); + dev->is_added = 0; } if (dev->bus->self) diff --git a/include/linux/pci.h b/include/linux/pci.h index 9b87f1936906..9799109c5e25 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -373,6 +373,7 @@ struct pci_dev { unsigned int transparent:1; /* Subtractive decode bridge */ unsigned int multifunction:1; /* Multi-function device */ + unsigned int is_added:1; unsigned int is_busmaster:1; /* Is busmaster */ unsigned int no_msi:1; /* May not use MSI */ unsigned int no_64bit_msi:1; /* May only use 32-bit MSIs */
This reverts commit 44bda4b7d26e9fffed6d7152d98a2e9edaeb2a76. The new pci state mutex provides a simpler way of addressing this race and avoids the relative includes added to the powerpc code. Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> --- arch/powerpc/kernel/pci-common.c | 4 +--- arch/powerpc/platforms/powernv/pci-ioda.c | 3 +-- arch/powerpc/platforms/pseries/setup.c | 3 +-- drivers/pci/bus.c | 6 +++--- drivers/pci/hotplug/acpiphp_glue.c | 2 +- drivers/pci/pci.h | 11 ----------- drivers/pci/probe.c | 4 ++-- drivers/pci/remove.c | 5 ++--- include/linux/pci.h | 1 + 9 files changed, 12 insertions(+), 27 deletions(-)