diff mbox series

PCI: hotplug: Allow marking devices as disconnected during bind/unbind

Message ID 3dc88ea82bdc0e37d9000e413d5ebce481cbd629.1674205689.git.lukas@wunner.de (mailing list archive)
State Accepted
Delegated to: Bjorn Helgaas
Headers show
Series PCI: hotplug: Allow marking devices as disconnected during bind/unbind | expand

Commit Message

Lukas Wunner Jan. 20, 2023, 9:19 a.m. UTC
On surprise removal, pciehp_unconfigure_device() and acpiphp's
trim_stale_devices() call pci_dev_set_disconnected() to mark removed
devices as permanently offline.  Thereby, the PCI core and drivers know
to skip device accesses.

However pci_dev_set_disconnected() takes the device_lock and thus waits
for a concurrent driver bind or unbind to complete.  As a result, the
driver's ->probe and ->remove hooks have no chance to learn that the
device is gone.

That doesn't make any sense, so drop the device_lock and instead use
atomic xchg() and cmpxchg() operations to update the device state.

As a byproduct, an AB-BA deadlock reported by Anatoli is fixed which
occurs on surprise removal with AER concurrently performing a bus reset.

AER bus reset:

  INFO: task irq/26-aerdrv:95 blocked for more than 120 seconds.
  Tainted: G        W          6.2.0-rc3-custom-norework-jan11+
  schedule
  rwsem_down_write_slowpath
  down_write_nested
  pciehp_reset_slot                      # acquires reset_lock
  pci_reset_hotplug_slot
  pci_slot_reset                         # acquires device_lock
  pci_bus_error_reset
  aer_root_reset
  pcie_do_recovery
  aer_process_err_devices
  aer_isr

pciehp surprise removal:

  INFO: task irq/26-pciehp:96 blocked for more than 120 seconds.
  Tainted: G        W          6.2.0-rc3-custom-norework-jan11+
  schedule_preempt_disabled
  __mutex_lock
  mutex_lock_nested
  pci_dev_set_disconnected               # acquires device_lock
  pci_walk_bus
  pciehp_unconfigure_device
  pciehp_disable_slot
  pciehp_handle_presence_or_link_change
  pciehp_ist                             # acquires reset_lock

Fixes: a6bd101b8f84 ("PCI: Unify device inaccessible")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=215590
Reported-by: Anatoli Antonovitch <anatoli.antonovitch@amd.com>
Signed-off-by: Lukas Wunner <lukas@wunner.de>
Cc: stable@vger.kernel.org # v4.20+
Cc: Keith Busch <kbusch@kernel.org>
---
 drivers/pci/pci.h | 43 +++++++++++++------------------------------
 1 file changed, 13 insertions(+), 30 deletions(-)

Comments

Christian König Jan. 20, 2023, 9:41 a.m. UTC | #1
Am 20.01.23 um 10:19 schrieb Lukas Wunner:
> On surprise removal, pciehp_unconfigure_device() and acpiphp's
> trim_stale_devices() call pci_dev_set_disconnected() to mark removed
> devices as permanently offline.  Thereby, the PCI core and drivers know
> to skip device accesses.
>
> However pci_dev_set_disconnected() takes the device_lock and thus waits
> for a concurrent driver bind or unbind to complete.  As a result, the
> driver's ->probe and ->remove hooks have no chance to learn that the
> device is gone.

Who is reading dev->error_state in this situation and especially do we 
have the necessary read barrier in place?

Alternatively if this is just opportunistically we should document that 
somehow.

> That doesn't make any sense, so drop the device_lock and instead use
> atomic xchg() and cmpxchg() operations to update the device state.

You use xchg() instead of WRITE_ONCE() for the memory barrier here?

> As a byproduct, an AB-BA deadlock reported by Anatoli is fixed which
> occurs on surprise removal with AER concurrently performing a bus reset.

Well this byproduct is probably the main fix in this patch. I'm 
wondering why lockdep didn't complained about that more drastically in 
our testing.

This approach indeed looks much cleaner.

Thanks,
Christian.

>
> AER bus reset:
>
>    INFO: task irq/26-aerdrv:95 blocked for more than 120 seconds.
>    Tainted: G        W          6.2.0-rc3-custom-norework-jan11+
>    schedule
>    rwsem_down_write_slowpath
>    down_write_nested
>    pciehp_reset_slot                      # acquires reset_lock
>    pci_reset_hotplug_slot
>    pci_slot_reset                         # acquires device_lock
>    pci_bus_error_reset
>    aer_root_reset
>    pcie_do_recovery
>    aer_process_err_devices
>    aer_isr
>
> pciehp surprise removal:
>
>    INFO: task irq/26-pciehp:96 blocked for more than 120 seconds.
>    Tainted: G        W          6.2.0-rc3-custom-norework-jan11+
>    schedule_preempt_disabled
>    __mutex_lock
>    mutex_lock_nested
>    pci_dev_set_disconnected               # acquires device_lock
>    pci_walk_bus
>    pciehp_unconfigure_device
>    pciehp_disable_slot
>    pciehp_handle_presence_or_link_change
>    pciehp_ist                             # acquires reset_lock
>
> Fixes: a6bd101b8f84 ("PCI: Unify device inaccessible")
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=215590
> Reported-by: Anatoli Antonovitch <anatoli.antonovitch@amd.com>
> Signed-off-by: Lukas Wunner <lukas@wunner.de>
> Cc: stable@vger.kernel.org # v4.20+
> Cc: Keith Busch <kbusch@kernel.org>
> ---
>   drivers/pci/pci.h | 43 +++++++++++++------------------------------
>   1 file changed, 13 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 9ed3b55..5d5a44a 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -310,53 +310,36 @@ struct pci_sriov {
>    * @dev: PCI device to set new error_state
>    * @new: the state we want dev to be in
>    *
> - * Must be called with device_lock held.
> + * If the device is experiencing perm_failure, it has to remain in that state.
> + * Any other transition is allowed.
>    *
>    * Returns true if state has been changed to the requested state.
>    */
>   static inline bool pci_dev_set_io_state(struct pci_dev *dev,
>   					pci_channel_state_t new)
>   {
> -	bool changed = false;
> +	pci_channel_state_t old;
>   
> -	device_lock_assert(&dev->dev);
>   	switch (new) {
>   	case pci_channel_io_perm_failure:
> -		switch (dev->error_state) {
> -		case pci_channel_io_frozen:
> -		case pci_channel_io_normal:
> -		case pci_channel_io_perm_failure:
> -			changed = true;
> -			break;
> -		}
> -		break;
> +		xchg(&dev->error_state, pci_channel_io_perm_failure);
> +		return true;
>   	case pci_channel_io_frozen:
> -		switch (dev->error_state) {
> -		case pci_channel_io_frozen:
> -		case pci_channel_io_normal:
> -			changed = true;
> -			break;
> -		}
> -		break;
> +		old = cmpxchg(&dev->error_state, pci_channel_io_normal,
> +			      pci_channel_io_frozen);
> +		return old != pci_channel_io_perm_failure;
>   	case pci_channel_io_normal:
> -		switch (dev->error_state) {
> -		case pci_channel_io_frozen:
> -		case pci_channel_io_normal:
> -			changed = true;
> -			break;
> -		}
> -		break;
> +		old = cmpxchg(&dev->error_state, pci_channel_io_frozen,
> +			      pci_channel_io_normal);
> +		return old != pci_channel_io_perm_failure;
> +	default:
> +		return false;
>   	}
> -	if (changed)
> -		dev->error_state = new;
> -	return changed;
>   }
>   
>   static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused)
>   {
> -	device_lock(&dev->dev);
>   	pci_dev_set_io_state(dev, pci_channel_io_perm_failure);
> -	device_unlock(&dev->dev);
>   
>   	return 0;
>   }
Lukas Wunner Jan. 20, 2023, 5:52 p.m. UTC | #2
On Fri, Jan 20, 2023 at 10:41:13AM +0100, Christian König wrote:
> Am 20.01.23 um 10:19 schrieb Lukas Wunner:
> > On surprise removal, pciehp_unconfigure_device() and acpiphp's
> > trim_stale_devices() call pci_dev_set_disconnected() to mark removed
> > devices as permanently offline.  Thereby, the PCI core and drivers know
> > to skip device accesses.
> > 
> > However pci_dev_set_disconnected() takes the device_lock and thus waits
> > for a concurrent driver bind or unbind to complete.  As a result, the
> > driver's ->probe and ->remove hooks have no chance to learn that the
> > device is gone.
> 
> Who is reading dev->error_state in this situation and especially do we have
> the necessary read barrier in place?
> 
> Alternatively if this is just opportunistically we should document that
> somehow.

dev->error_state is read by pci_dev_is_disconnected() and by various
drivers.  None of them has a specific read barrier AFAICS or is using
the device_lock to protect read access.

For pci_dev_is_disconnected(), the read is indeed opportunistic.
It's an optimization that prevents the kernel from performing
a lot of non-posted requests to a removed device and waiting
for them to time out.  Or worse, having them be received by a
replacement device.


> > That doesn't make any sense, so drop the device_lock and instead use
> > atomic xchg() and cmpxchg() operations to update the device state.
> 
> You use xchg() instead of WRITE_ONCE() for the memory barrier here?

xchg() implies a full memory barrier to ensure that the new state is
immediately visible to all CPUs.  WRITE_ONCE() would be weaker,
it's just a compiler barrier.

The desire to immediately make the state change visible is why we
absolutely do not want locking here.


> > As a byproduct, an AB-BA deadlock reported by Anatoli is fixed which
> > occurs on surprise removal with AER concurrently performing a bus reset.
> 
> Well this byproduct is probably the main fix in this patch. I'm wondering
> why lockdep didn't complained about that more drastically in our testing.

Well I guess I could rephrase the commit message if you feel I'm burying
the lede.

Thanks,

Lukas
Bjorn Helgaas Feb. 15, 2023, 9:13 p.m. UTC | #3
On Fri, Jan 20, 2023 at 10:19:02AM +0100, Lukas Wunner wrote:
> On surprise removal, pciehp_unconfigure_device() and acpiphp's
> trim_stale_devices() call pci_dev_set_disconnected() to mark removed
> devices as permanently offline.  Thereby, the PCI core and drivers know
> to skip device accesses.
> 
> However pci_dev_set_disconnected() takes the device_lock and thus waits
> for a concurrent driver bind or unbind to complete.  As a result, the
> driver's ->probe and ->remove hooks have no chance to learn that the
> device is gone.
> 
> That doesn't make any sense, so drop the device_lock and instead use
> atomic xchg() and cmpxchg() operations to update the device state.
> 
> As a byproduct, an AB-BA deadlock reported by Anatoli is fixed which
> occurs on surprise removal with AER concurrently performing a bus reset.
> 
> AER bus reset:
> 
>   INFO: task irq/26-aerdrv:95 blocked for more than 120 seconds.
>   Tainted: G        W          6.2.0-rc3-custom-norework-jan11+
>   schedule
>   rwsem_down_write_slowpath
>   down_write_nested
>   pciehp_reset_slot                      # acquires reset_lock
>   pci_reset_hotplug_slot
>   pci_slot_reset                         # acquires device_lock
>   pci_bus_error_reset
>   aer_root_reset
>   pcie_do_recovery
>   aer_process_err_devices
>   aer_isr
> 
> pciehp surprise removal:
> 
>   INFO: task irq/26-pciehp:96 blocked for more than 120 seconds.
>   Tainted: G        W          6.2.0-rc3-custom-norework-jan11+
>   schedule_preempt_disabled
>   __mutex_lock
>   mutex_lock_nested
>   pci_dev_set_disconnected               # acquires device_lock
>   pci_walk_bus
>   pciehp_unconfigure_device
>   pciehp_disable_slot
>   pciehp_handle_presence_or_link_change
>   pciehp_ist                             # acquires reset_lock
> 
> Fixes: a6bd101b8f84 ("PCI: Unify device inaccessible")
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=215590
> Reported-by: Anatoli Antonovitch <anatoli.antonovitch@amd.com>
> Signed-off-by: Lukas Wunner <lukas@wunner.de>
> Cc: stable@vger.kernel.org # v4.20+
> Cc: Keith Busch <kbusch@kernel.org>

Applied to pci/hotplug for v6.3, thanks!

The xchg()/cmpxchg() is a little subtle just because we don't see it
every day, but a really nice simplification and explanation of the
state change in addition to the locking improvement.  Thank you!

> ---
>  drivers/pci/pci.h | 43 +++++++++++++------------------------------
>  1 file changed, 13 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 9ed3b55..5d5a44a 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -310,53 +310,36 @@ struct pci_sriov {
>   * @dev: PCI device to set new error_state
>   * @new: the state we want dev to be in
>   *
> - * Must be called with device_lock held.
> + * If the device is experiencing perm_failure, it has to remain in that state.
> + * Any other transition is allowed.
>   *
>   * Returns true if state has been changed to the requested state.
>   */
>  static inline bool pci_dev_set_io_state(struct pci_dev *dev,
>  					pci_channel_state_t new)
>  {
> -	bool changed = false;
> +	pci_channel_state_t old;
>  
> -	device_lock_assert(&dev->dev);
>  	switch (new) {
>  	case pci_channel_io_perm_failure:
> -		switch (dev->error_state) {
> -		case pci_channel_io_frozen:
> -		case pci_channel_io_normal:
> -		case pci_channel_io_perm_failure:
> -			changed = true;
> -			break;
> -		}
> -		break;
> +		xchg(&dev->error_state, pci_channel_io_perm_failure);
> +		return true;
>  	case pci_channel_io_frozen:
> -		switch (dev->error_state) {
> -		case pci_channel_io_frozen:
> -		case pci_channel_io_normal:
> -			changed = true;
> -			break;
> -		}
> -		break;
> +		old = cmpxchg(&dev->error_state, pci_channel_io_normal,
> +			      pci_channel_io_frozen);
> +		return old != pci_channel_io_perm_failure;
>  	case pci_channel_io_normal:
> -		switch (dev->error_state) {
> -		case pci_channel_io_frozen:
> -		case pci_channel_io_normal:
> -			changed = true;
> -			break;
> -		}
> -		break;
> +		old = cmpxchg(&dev->error_state, pci_channel_io_frozen,
> +			      pci_channel_io_normal);
> +		return old != pci_channel_io_perm_failure;
> +	default:
> +		return false;
>  	}
> -	if (changed)
> -		dev->error_state = new;
> -	return changed;
>  }
>  
>  static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused)
>  {
> -	device_lock(&dev->dev);
>  	pci_dev_set_io_state(dev, pci_channel_io_perm_failure);
> -	device_unlock(&dev->dev);
>  
>  	return 0;
>  }
> -- 
> 2.39.1
>
diff mbox series

Patch

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 9ed3b55..5d5a44a 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -310,53 +310,36 @@  struct pci_sriov {
  * @dev: PCI device to set new error_state
  * @new: the state we want dev to be in
  *
- * Must be called with device_lock held.
+ * If the device is experiencing perm_failure, it has to remain in that state.
+ * Any other transition is allowed.
  *
  * Returns true if state has been changed to the requested state.
  */
 static inline bool pci_dev_set_io_state(struct pci_dev *dev,
 					pci_channel_state_t new)
 {
-	bool changed = false;
+	pci_channel_state_t old;
 
-	device_lock_assert(&dev->dev);
 	switch (new) {
 	case pci_channel_io_perm_failure:
-		switch (dev->error_state) {
-		case pci_channel_io_frozen:
-		case pci_channel_io_normal:
-		case pci_channel_io_perm_failure:
-			changed = true;
-			break;
-		}
-		break;
+		xchg(&dev->error_state, pci_channel_io_perm_failure);
+		return true;
 	case pci_channel_io_frozen:
-		switch (dev->error_state) {
-		case pci_channel_io_frozen:
-		case pci_channel_io_normal:
-			changed = true;
-			break;
-		}
-		break;
+		old = cmpxchg(&dev->error_state, pci_channel_io_normal,
+			      pci_channel_io_frozen);
+		return old != pci_channel_io_perm_failure;
 	case pci_channel_io_normal:
-		switch (dev->error_state) {
-		case pci_channel_io_frozen:
-		case pci_channel_io_normal:
-			changed = true;
-			break;
-		}
-		break;
+		old = cmpxchg(&dev->error_state, pci_channel_io_frozen,
+			      pci_channel_io_normal);
+		return old != pci_channel_io_perm_failure;
+	default:
+		return false;
 	}
-	if (changed)
-		dev->error_state = new;
-	return changed;
 }
 
 static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused)
 {
-	device_lock(&dev->dev);
 	pci_dev_set_io_state(dev, pci_channel_io_perm_failure);
-	device_unlock(&dev->dev);
 
 	return 0;
 }