diff mbox

pci: iov: use device lock to protect IOV sysfs accesses

Message ID 20170522225023.14010-1-jakub.kicinski@netronome.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show

Commit Message

Jakub Kicinski May 22, 2017, 10:50 p.m. UTC
PCI core sets the driver pointer before calling ->probe() and only
clears it after ->remove().  This means driver's ->sriov_configure()
callback will happily race with probe() and remove(), most likely
leading to BUGs, since drivers don't expect this.

We could reorder pointer assignments, or try detecting races in all
drivers, but it seems simpler and cleaner to just hold the device lock
instead of the special SR-IOV lock, since the device lock is already
supposed to synchronize the driver callbacks.

Remove the iov lock completely, since we remove the last user.

Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/pci/iov.c       | 4 ----
 drivers/pci/pci-sysfs.c | 5 ++---
 drivers/pci/pci.h       | 1 -
 3 files changed, 2 insertions(+), 8 deletions(-)

Comments

Christoph Hellwig May 23, 2017, 5:25 a.m. UTC | #1
On Mon, May 22, 2017 at 03:50:23PM -0700, Jakub Kicinski wrote:
> PCI core sets the driver pointer before calling ->probe() and only
> clears it after ->remove().  This means driver's ->sriov_configure()
> callback will happily race with probe() and remove(), most likely
> leading to BUGs, since drivers don't expect this.
> 
> We could reorder pointer assignments, or try detecting races in all
> drivers, but it seems simpler and cleaner to just hold the device lock
> instead of special SR-IOV lock, since that lock is already supposed
> to synchronize the driver callbacks.
> 
> Remove the iov lock completely, since we remove the last user.

Having just debugged a different method vs ->remove race I violently
agree.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Bjorn Helgaas May 26, 2017, 11:47 p.m. UTC | #2
Hi Jakub,

On Mon, May 22, 2017 at 03:50:23PM -0700, Jakub Kicinski wrote:
> PCI core sets the driver pointer before calling ->probe() and only
> clears it after ->remove().  This means driver's ->sriov_configure()
> callback will happily race with probe() and remove(), most likely
> leading to BUGs, since drivers don't expect this.

I guess you're referring to the pci_dev->driver pointer set by
local_pci_probe(), and this is important because sriov_numvfs_store()
checks that pointer, right?

I was trying to make sure there weren't other similar problems elsewhere.
But I don't see any other sysfs functions that use pci_dev->driver in that
way, so I think this is the only one.

I think this looks good.

    pci_bus_add_devices
      pci_bus_add_device
        pci_create_sysfs_dev_files
        device_attach
          __device_attach
            device_lock(dev)
            __device_attach_driver
              ...
                local_pci_probe
                  pci_dev->driver = pci_drv       <--- set
                  pci_drv->probe()
            device_unlock(dev)

    sriov_numvfs_store
  -   mutex_lock(&iov->dev->sriov->lock)
  +   device_lock(&pdev->dev)
      if (pdev->driver && pdev->driver->sriov_configure)  <--- test
        pdev->driver->sriov_configure
  -   mutex_unlock(&iov->dev->sriov->lock)
  +   device_unlock(&pdev->dev)

> We could reorder pointer assignments, or try detecting races in all
> drivers, but it seems simpler and cleaner to just hold the device lock
> instead of special SR-IOV lock, since that lock is already supposed
> to synchronize the driver callbacks.
> 
> Remove the iov lock completely, since we remove the last user.
> 
> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> ---
>  drivers/pci/iov.c       | 4 ----
>  drivers/pci/pci-sysfs.c | 5 ++---
>  drivers/pci/pci.h       | 1 -
>  3 files changed, 2 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index d9dc7363ac77..120485d6f352 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -461,8 +461,6 @@ static int sriov_init(struct pci_dev *dev, int pos)
>  	else
>  		iov->dev = dev;
>  
> -	mutex_init(&iov->lock);
> -
>  	dev->sriov = iov;
>  	dev->is_physfn = 1;
>  	rc = compute_max_vf_buses(dev);
> @@ -491,8 +489,6 @@ static void sriov_release(struct pci_dev *dev)
>  	if (dev != dev->sriov->dev)
>  		pci_dev_put(dev->sriov->dev);
>  
> -	mutex_destroy(&dev->sriov->lock);
> -
>  	kfree(dev->sriov);
>  	dev->sriov = NULL;
>  }
> diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
> index 31e99613a12e..7755559558df 100644
> --- a/drivers/pci/pci-sysfs.c
> +++ b/drivers/pci/pci-sysfs.c
> @@ -472,7 +472,6 @@ static ssize_t sriov_numvfs_store(struct device *dev,
>  				  const char *buf, size_t count)
>  {
>  	struct pci_dev *pdev = to_pci_dev(dev);
> -	struct pci_sriov *iov = pdev->sriov;
>  	int ret;
>  	u16 num_vfs;
>  
> @@ -483,7 +482,7 @@ static ssize_t sriov_numvfs_store(struct device *dev,
>  	if (num_vfs > pci_sriov_get_totalvfs(pdev))
>  		return -ERANGE;
>  
> -	mutex_lock(&iov->dev->sriov->lock);
> +	device_lock(&pdev->dev);
>  
>  	if (num_vfs == pdev->sriov->num_VFs)
>  		goto exit;
> @@ -518,7 +517,7 @@ static ssize_t sriov_numvfs_store(struct device *dev,
>  			 num_vfs, ret);
>  
>  exit:
> -	mutex_unlock(&iov->dev->sriov->lock);
> +	device_unlock(&pdev->dev);
>  
>  	if (ret < 0)
>  		return ret;
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index f8113e5b9812..93f4044b8f4b 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -272,7 +272,6 @@ struct pci_sriov {
>  	u16 driver_max_VFs;	/* max num VFs driver supports */
>  	struct pci_dev *dev;	/* lowest numbered PF */
>  	struct pci_dev *self;	/* this PF */
> -	struct mutex lock;	/* lock for setting sriov_numvfs in sysfs */
>  	resource_size_t barsz[PCI_SRIOV_NUM_BARS];	/* VF BAR size */
>  	bool drivers_autoprobe;	/* auto probing of VFs by driver */
>  };
> -- 
> 2.11.0
>
Jakub Kicinski May 26, 2017, 11:58 p.m. UTC | #3
On Fri, 26 May 2017 18:47:26 -0500, Bjorn Helgaas wrote:
> On Mon, May 22, 2017 at 03:50:23PM -0700, Jakub Kicinski wrote:
> > PCI core sets the driver pointer before calling ->probe() and only
> > clears it after ->remove().  This means driver's ->sriov_configure()
> > callback will happily race with probe() and remove(), most likely
> > leading to BUGs, since drivers don't expect this.  
> 
> I guess you're referring to the pci_dev->driver pointer set by
> local_pci_probe(), and this is important because sriov_numvfs_store()
> checks that pointer, right?

Yes, exactly.  I initially thought this was how the safety of the sriov
callback might have been ensured, but since local_pci_probe() sets the
driver pointer before calling ->probe(), it can't be.
Bjorn Helgaas May 30, 2017, 11:07 p.m. UTC | #4
On Fri, May 26, 2017 at 04:58:20PM -0700, Jakub Kicinski wrote:
> On Fri, 26 May 2017 18:47:26 -0500, Bjorn Helgaas wrote:
> > On Mon, May 22, 2017 at 03:50:23PM -0700, Jakub Kicinski wrote:
> > > PCI core sets the driver pointer before calling ->probe() and only
> > > clears it after ->remove().  This means driver's ->sriov_configure()
> > > callback will happily race with probe() and remove(), most likely
> > > leading to BUGs, since drivers don't expect this.  
> > 
> > I guess you're referring to the pci_dev->driver pointer set by
> > local_pci_probe(), and this is important because sriov_numvfs_store()
> > checks that pointer, right?
> 
> Yes, exactly.  I initially thought this is how the safety of sriov
> callback may have been ensured, but since the order of
> local_pci_probe() and the assignment is what it is, it can't.

Right.  I was hoping other subsystems would establish a convention
about whether we set the ->driver pointer before or after calling the
driver probe() method, but if there is one, I don't see it.
local_pci_probe() and really_probe() set ->driver first, but
pnp_device_probe() calls the probe() method first.

Can you expand on how you reproduce this problem?  The only real way I
see to call ->sriov_configure() is via the sysfs entry point, and I
would think user-space code would typically not touch that until after
it knows the driver has claimed a device.  But I can certainly imagine
targeted test code that could hit this problem.

Bjorn
Jakub Kicinski May 30, 2017, 11:34 p.m. UTC | #5
On Tue, 30 May 2017 18:07:18 -0500, Bjorn Helgaas wrote:
> On Fri, May 26, 2017 at 04:58:20PM -0700, Jakub Kicinski wrote:
> > On Fri, 26 May 2017 18:47:26 -0500, Bjorn Helgaas wrote:  
> > > On Mon, May 22, 2017 at 03:50:23PM -0700, Jakub Kicinski wrote:  
> > > > PCI core sets the driver pointer before calling ->probe() and only
> > > > clears it after ->remove().  This means driver's ->sriov_configure()
> > > > callback will happily race with probe() and remove(), most likely
> > > > leading to BUGs, since drivers don't expect this.    
> > > 
> > > I guess you're referring to the pci_dev->driver pointer set by
> > > local_pci_probe(), and this is important because sriov_numvfs_store()
> > > checks that pointer, right?  
> > 
> > Yes, exactly.  I initially thought this is how the safety of sriov
> > callback may have been ensured, but since the order of
> > local_pci_probe() and the assignment is what it is, it can't.  
> 
> Right.  I was hoping other subsystems would establish a convention
> about whether we set the ->driver pointer before or after calling the
> driver probe() method, but if there is one, I don't see it.
> local_pci_probe() and really_probe() set ->driver first, but
> pnp_device_probe() calls the probe() method first.

I didn't dig into reordering the pointer setting, to be honest.  I
thought establishing that driver callbacks should generally be invoked
with the device lock held, whenever possible, would be even better than
pointer setting conventions.

If we order the assignments better, wouldn't we still need appropriate
memory barriers to rely on the order? (:

> Can you expand on how you reproduce this problem?  The only real way I
> see to call ->sriov_configure() is via the sysfs entry point, and I
> would think user-space code would typically not touch that until after
> it knows the driver has claimed a device.  But I can certainly imagine
> targeted test code that could hit this problem.

Correct.  It's not something that users should be triggering often in
normal use.  I also found it by code inspection rather than by getting
an oops.

OTOH if the driver performs FW load or other time-consuming operations
in ->probe() the time window when this can be triggered may be counted
in seconds.
Jakub Kicinski June 14, 2017, 4:57 a.m. UTC | #6
On Tue, 30 May 2017 16:34:29 -0700, Jakub Kicinski wrote:
> On Tue, 30 May 2017 18:07:18 -0500, Bjorn Helgaas wrote:
> > On Fri, May 26, 2017 at 04:58:20PM -0700, Jakub Kicinski wrote:  
> > > On Fri, 26 May 2017 18:47:26 -0500, Bjorn Helgaas wrote:    
> > > > On Mon, May 22, 2017 at 03:50:23PM -0700, Jakub Kicinski wrote:    
> > > > > PCI core sets the driver pointer before calling ->probe() and only
> > > > > clears it after ->remove().  This means driver's ->sriov_configure()
> > > > > callback will happily race with probe() and remove(), most likely
> > > > > leading to BUGs, since drivers don't expect this.      
> > > > 
> > > > I guess you're referring to the pci_dev->driver pointer set by
> > > > local_pci_probe(), and this is important because sriov_numvfs_store()
> > > > checks that pointer, right?    
> > > 
> > > Yes, exactly.  I initially thought this is how the safety of sriov
> > > callback may have been ensured, but since the order of
> > > local_pci_probe() and the assignment is what it is, it can't.    
> > 
> > Right.  I was hoping other subsystems would establish a convention
> > about whether we set the ->driver pointer before or after calling the
> > driver probe() method, but if there is one, I don't see it.
> > local_pci_probe() and really_probe() set ->driver first, but
> > pnp_device_probe() calls the probe() method first.  
> 
> I didn't dig into reordering the pointer setting, to be honest.  I
> thought establishing that driver callbacks should generally hold device
> lock, whenever possible, would be even better than pointer setting
> conventions.
> 
> If we order the assignments better, wouldn't we still need appropriate
> memory barriers to rely on the order? (:
> 
> > Can you expand on how you reproduce this problem?  The only real way I
> > see to call ->sriov_configure() is via the sysfs entry point, and I
> > would think user-space code would typically not touch that until after
> > it knows the driver has claimed a device.  But I can certainly imagine
> > targeted test code that could hit this problem.  
> 
> Correct.  It's not something that users should be triggering often in
> normal use.  I also found it by code inspection rather than by getting
> an oops.
> 
> OTOH if the driver performs FW load or other time-consuming operations
> in ->probe() the time window when this can be triggered may be counted
> in seconds.

Hi Bjorn, 

is this patch still considered for 4.13, or should I change it somehow?
Bjorn Helgaas June 15, 2017, 2:47 a.m. UTC | #7
On Mon, May 22, 2017 at 03:50:23PM -0700, Jakub Kicinski wrote:
> PCI core sets the driver pointer before calling ->probe() and only
> clears it after ->remove().  This means driver's ->sriov_configure()
> callback will happily race with probe() and remove(), most likely
> leading to BUGs, since drivers don't expect this.
> 
> We could reorder pointer assignments, or try detecting races in all
> drivers, but it seems simpler and cleaner to just hold the device lock
> instead of special SR-IOV lock, since that lock is already supposed
> to synchronize the driver callbacks.
> 
> Remove the iov lock completely, since we remove the last user.
> 
> Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>

Applied with Christoph's reviewed-by to pci/virtualization for v4.13,
thanks!

> ---
>  drivers/pci/iov.c       | 4 ----
>  drivers/pci/pci-sysfs.c | 5 ++---
>  drivers/pci/pci.h       | 1 -
>  3 files changed, 2 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index d9dc7363ac77..120485d6f352 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -461,8 +461,6 @@ static int sriov_init(struct pci_dev *dev, int pos)
>  	else
>  		iov->dev = dev;
>  
> -	mutex_init(&iov->lock);
> -
>  	dev->sriov = iov;
>  	dev->is_physfn = 1;
>  	rc = compute_max_vf_buses(dev);
> @@ -491,8 +489,6 @@ static void sriov_release(struct pci_dev *dev)
>  	if (dev != dev->sriov->dev)
>  		pci_dev_put(dev->sriov->dev);
>  
> -	mutex_destroy(&dev->sriov->lock);
> -
>  	kfree(dev->sriov);
>  	dev->sriov = NULL;
>  }
> diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
> index 31e99613a12e..7755559558df 100644
> --- a/drivers/pci/pci-sysfs.c
> +++ b/drivers/pci/pci-sysfs.c
> @@ -472,7 +472,6 @@ static ssize_t sriov_numvfs_store(struct device *dev,
>  				  const char *buf, size_t count)
>  {
>  	struct pci_dev *pdev = to_pci_dev(dev);
> -	struct pci_sriov *iov = pdev->sriov;
>  	int ret;
>  	u16 num_vfs;
>  
> @@ -483,7 +482,7 @@ static ssize_t sriov_numvfs_store(struct device *dev,
>  	if (num_vfs > pci_sriov_get_totalvfs(pdev))
>  		return -ERANGE;
>  
> -	mutex_lock(&iov->dev->sriov->lock);
> +	device_lock(&pdev->dev);
>  
>  	if (num_vfs == pdev->sriov->num_VFs)
>  		goto exit;
> @@ -518,7 +517,7 @@ static ssize_t sriov_numvfs_store(struct device *dev,
>  			 num_vfs, ret);
>  
>  exit:
> -	mutex_unlock(&iov->dev->sriov->lock);
> +	device_unlock(&pdev->dev);
>  
>  	if (ret < 0)
>  		return ret;
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index f8113e5b9812..93f4044b8f4b 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -272,7 +272,6 @@ struct pci_sriov {
>  	u16 driver_max_VFs;	/* max num VFs driver supports */
>  	struct pci_dev *dev;	/* lowest numbered PF */
>  	struct pci_dev *self;	/* this PF */
> -	struct mutex lock;	/* lock for setting sriov_numvfs in sysfs */
>  	resource_size_t barsz[PCI_SRIOV_NUM_BARS];	/* VF BAR size */
>  	bool drivers_autoprobe;	/* auto probing of VFs by driver */
>  };
> -- 
> 2.11.0
>
Christoph Hellwig June 15, 2017, 8:30 a.m. UTC | #8
On Wed, Jun 14, 2017 at 09:47:26PM -0500, Bjorn Helgaas wrote:
> > Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
> 
> Applied with Christoph's reviewed-by to pci/virtualization for v4.13,
> thanks!

Btw, given how you wanted the comments on locking for the reset
methods, it might be worth commenting on the locking here as well.
diff mbox

Patch

diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index d9dc7363ac77..120485d6f352 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -461,8 +461,6 @@  static int sriov_init(struct pci_dev *dev, int pos)
 	else
 		iov->dev = dev;
 
-	mutex_init(&iov->lock);
-
 	dev->sriov = iov;
 	dev->is_physfn = 1;
 	rc = compute_max_vf_buses(dev);
@@ -491,8 +489,6 @@  static void sriov_release(struct pci_dev *dev)
 	if (dev != dev->sriov->dev)
 		pci_dev_put(dev->sriov->dev);
 
-	mutex_destroy(&dev->sriov->lock);
-
 	kfree(dev->sriov);
 	dev->sriov = NULL;
 }
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 31e99613a12e..7755559558df 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -472,7 +472,6 @@  static ssize_t sriov_numvfs_store(struct device *dev,
 				  const char *buf, size_t count)
 {
 	struct pci_dev *pdev = to_pci_dev(dev);
-	struct pci_sriov *iov = pdev->sriov;
 	int ret;
 	u16 num_vfs;
 
@@ -483,7 +482,7 @@  static ssize_t sriov_numvfs_store(struct device *dev,
 	if (num_vfs > pci_sriov_get_totalvfs(pdev))
 		return -ERANGE;
 
-	mutex_lock(&iov->dev->sriov->lock);
+	device_lock(&pdev->dev);
 
 	if (num_vfs == pdev->sriov->num_VFs)
 		goto exit;
@@ -518,7 +517,7 @@  static ssize_t sriov_numvfs_store(struct device *dev,
 			 num_vfs, ret);
 
 exit:
-	mutex_unlock(&iov->dev->sriov->lock);
+	device_unlock(&pdev->dev);
 
 	if (ret < 0)
 		return ret;
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index f8113e5b9812..93f4044b8f4b 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -272,7 +272,6 @@  struct pci_sriov {
 	u16 driver_max_VFs;	/* max num VFs driver supports */
 	struct pci_dev *dev;	/* lowest numbered PF */
 	struct pci_dev *self;	/* this PF */
-	struct mutex lock;	/* lock for setting sriov_numvfs in sysfs */
 	resource_size_t barsz[PCI_SRIOV_NUM_BARS];	/* VF BAR size */
 	bool drivers_autoprobe;	/* auto probing of VFs by driver */
 };