diff mbox series

[RFC,7/8] pci: reference count subordinate

Message ID 20240722151936.1452299-8-kbusch@meta.com (mailing list archive)
State RFC
Delegated to: Bjorn Helgaas
Headers show
Series pci: rescan/remove locking rework | expand

Commit Message

Keith Busch July 22, 2024, 3:19 p.m. UTC
From: Keith Busch <kbusch@kernel.org>

The subordinate is accessed under the pci_bus_sem (or often times no
lock at at all), but unset under the rescan_remove_lock. Access the
subordinate buses with reference counts to guard against a concurrent
removal and use-after-free.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/pci/bus.c                | 18 +++++++++++++++---
 drivers/pci/hotplug/pciehp_pci.c | 12 ++++++++++--
 drivers/pci/pci.c                | 28 ++++++++++++++++++++++------
 drivers/pci/pci.h                |  1 +
 drivers/pci/pcie/aspm.c          |  7 +++++--
 drivers/pci/probe.c              |  7 +++++--
 drivers/pci/remove.c             | 18 +++++++++++++-----
 7 files changed, 71 insertions(+), 20 deletions(-)

Comments

Jonathan Cameron Aug. 15, 2024, 3:10 p.m. UTC | #1
On Mon, 22 Jul 2024 08:19:35 -0700
Keith Busch <kbusch@meta.com> wrote:

> From: Keith Busch <kbusch@kernel.org>
> 
> The subordinate is accessed under the pci_bus_sem (or often times no
> lock at at all), but unset under the rescan_remove_lock. Access the
> subordinate buses with reference counts to guard against a concurrent
> removal and use-after-free.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---

Hi Keith,

A few comments inline.

Jonathan



> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index e3a49f66982d5..0cd36b7772c8b 100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -3113,9 +3113,14 @@ void pci_bridge_d3_update(struct pci_dev *dev)
>  	 * so we need to go through all children to find out if one of them
>  	 * continues to block D3.
>  	 */
> -	if (d3cold_ok && !bridge->bridge_d3)
> -		pci_walk_bus(bridge->subordinate, pci_dev_check_d3cold,
> -			     &d3cold_ok);
> +	if (d3cold_ok && !bridge->bridge_d3) {
> +		struct pci_bus *bus = pci_dev_get_subordinate(bridge);
> +
> +		if (bus) {
> +			pci_walk_bus(bus, pci_dev_check_d3cold, &d3cold_ok);
> +			pci_bus_put(bus);
I'd be tempted to call pci_bus_put(bus) unconditionally but doesn't matter
a lot.
> +		}
> +	}
>  
>  	if (bridge->bridge_d3 != d3cold_ok) {
>  		bridge->bridge_d3 = d3cold_ok;
> @@ -4824,6 +4829,7 @@ static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
>  int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  {
>  	struct pci_dev *child __free(pci_dev_put) = NULL;

I would moan about constructors and desctructors together, but unrelated
to this patch and would actually break the change I suggest below given
the lifetime of child is longer than the loop where it's gotten.
So I won't moan about it :)

> +	struct pci_bus *bus;
>  	int delay;
>  
>  	if (pci_dev_is_disconnected(dev))
> @@ -4840,7 +4846,17 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  	 * board_added(). In case of ACPI hotplug the firmware is expected
>  	 * to configure the devices before OS is notified.
>  	 */
> -	if (!dev->subordinate || list_empty(&dev->subordinate->devices)) {
> +	bus = pci_dev_get_subordinate(dev);
> +	if (!bus) {
> +		up_read(&pci_bus_sem);
> +		return 0;
> +	}
> +
> +	child = pci_dev_get(list_first_entry_or_null(&bus->devices,
> +						     struct pci_dev,
> +						     bus_list));
> +	if (!child) {
> +		pci_bus_put(bus);
>  		up_read(&pci_bus_sem);
>  		return 0;
>  	}
> @@ -4848,12 +4864,12 @@ int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
>  	/* Take d3cold_delay requirements into account */
>  	delay = pci_bus_max_d3cold_delay(dev->subordinate);
>  	if (!delay) {
> +		pci_bus_put(bus);
>  		up_read(&pci_bus_sem);
>  		return 0;
>  	}
>  
> -	child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
> -					     struct pci_dev, bus_list));
> +	pci_bus_put(bus);
>  	up_read(&pci_bus_sem);

I'd like scoped_guard() {
	struct pci_bus *bus __free(pci_bus_put) = pci_dev_get_sub...
	here so that the manual cleanup can be avoided in the early return paths.

}
}
>  
>  	/*

...

> diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
> index cee2365e54b8b..3c0c83d35ab86 100644
> --- a/drivers/pci/pcie/aspm.c
> +++ b/drivers/pci/pcie/aspm.c
> @@ -1212,9 +1212,11 @@ static void pcie_update_aspm_capable(struct pcie_link_state *root)
>  		link->aspm_capable = link->aspm_support;
>  	}
>  	list_for_each_entry(link, &link_list, sibling) {
> +		struct pci_bus *linkbus;
>  		struct pci_dev *child;
> -		struct pci_bus *linkbus = link->pdev->subordinate;
> -		if (link->root != root)
> +
> +		linkbus = pci_dev_get_subordinate(link->pdev);
Maybe worth a 
DEFINE_FREE() for pci_bus_put to match the one for pci_dev_put?

> +		if (!linkbus || link->root != root)
>  			continue;
>  		list_for_each_entry(child, &linkbus->devices, bus_list) {
>  			if ((pci_pcie_type(child) != PCI_EXP_TYPE_ENDPOINT) &&
> @@ -1222,6 +1224,7 @@ static void pcie_update_aspm_capable(struct pcie_link_state *root)
>  				continue;
>  			pcie_aspm_check_latency(child);
>  		}
> +		pci_bus_put(linkbus);
>  	}
>  }
>  
> diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> index b14b9876c0303..53522685193da 100644
> --- a/drivers/pci/probe.c
> +++ b/drivers/pci/probe.c

...

> @@ -3380,7 +3383,7 @@ int pci_hp_add_bridge(struct pci_dev *dev)

As far as I can tell the return value of this function is never used.
So could just drop the code below. Maybe clean up this function
to return void or start handling the return value.

>  	/* Scan bridges that need to be reconfigured */
>  	pci_scan_bridge_extend(parent, dev, busnr, available_buses, 1);
>  
> -	if (!dev->subordinate)
> +	if (!READ_ONCE(dev->subordinate))
>  		return -1;
>  
>  	return 0;
diff mbox series

Patch

diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 638e79d10bfab..3085ecaa2ba40 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -382,9 +382,11 @@  void pci_bus_add_devices(const struct pci_bus *bus)
 		/* Skip if device attach failed */
 		if (!pci_dev_is_added(dev))
 			continue;
-		child = dev->subordinate;
+
+		child = pci_dev_get_subordinate(dev);
 		if (child)
 			pci_bus_add_devices(child);
+		pci_bus_put(child);
 	}
 }
 EXPORT_SYMBOL(pci_bus_add_devices);
@@ -396,11 +398,16 @@  static int __pci_walk_bus(struct pci_bus *top, int (*cb)(struct pci_dev *, void
 	int ret = 0;
 
 	list_for_each_entry(dev, &top->devices, bus_list) {
+		struct pci_bus *bus;
+
 		ret = cb(dev, userdata);
 		if (ret)
 			break;
-		if (dev->subordinate) {
-			ret = __pci_walk_bus(dev->subordinate, cb, userdata);
+
+		bus = pci_dev_get_subordinate(dev);
+		if (bus) {
+			ret = __pci_walk_bus(bus, cb, userdata);
+			pci_bus_put(bus);
 			if (ret)
 				break;
 		}
@@ -448,3 +455,8 @@  void pci_bus_put(struct pci_bus *bus)
 	if (bus)
 		put_device(&bus->dev);
 }
+
+struct pci_bus *pci_dev_get_subordinate(struct pci_dev *dev)
+{
+	return pci_bus_get(READ_ONCE(dev->subordinate));
+}
diff --git a/drivers/pci/hotplug/pciehp_pci.c b/drivers/pci/hotplug/pciehp_pci.c
index 65e50bee1a8c0..b15dcfd86c60a 100644
--- a/drivers/pci/hotplug/pciehp_pci.c
+++ b/drivers/pci/hotplug/pciehp_pci.c
@@ -33,9 +33,12 @@  int pciehp_configure_device(struct controller *ctrl)
 {
 	struct pci_dev *dev;
 	struct pci_dev *bridge = ctrl->pcie->port;
-	struct pci_bus *parent = bridge->subordinate;
+	struct pci_bus *parent = pci_dev_get_subordinate(bridge);
 	int num, ret = 0;
 
+	if (!parent)
+		return 0;
+
 	pci_lock_rescan_remove();
 
 	dev = pci_get_slot(parent, PCI_DEVFN(0, 0));
@@ -78,6 +81,7 @@  int pciehp_configure_device(struct controller *ctrl)
 
  out:
 	pci_unlock_rescan_remove();
+	pci_bus_put(parent);
 	return ret;
 }
 
@@ -95,9 +99,12 @@  int pciehp_configure_device(struct controller *ctrl)
 void pciehp_unconfigure_device(struct controller *ctrl, bool presence)
 {
 	struct pci_dev *dev, *temp;
-	struct pci_bus *parent = ctrl->pcie->port->subordinate;
+	struct pci_bus *parent = pci_dev_get_subordinate(ctrl->pcie->port);
 	u16 command;
 
+	if (!parent)
+		return;
+
 	ctrl_dbg(ctrl, "%s: domain:bus:dev = %04x:%02x:00\n",
 		 __func__, pci_domain_nr(parent), parent->number);
 
@@ -138,4 +145,5 @@  void pciehp_unconfigure_device(struct controller *ctrl, bool presence)
 	}
 
 	pci_unlock_rescan_remove();
+	pci_bus_put(parent);
 }
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e3a49f66982d5..0cd36b7772c8b 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -3113,9 +3113,14 @@  void pci_bridge_d3_update(struct pci_dev *dev)
 	 * so we need to go through all children to find out if one of them
 	 * continues to block D3.
 	 */
-	if (d3cold_ok && !bridge->bridge_d3)
-		pci_walk_bus(bridge->subordinate, pci_dev_check_d3cold,
-			     &d3cold_ok);
+	if (d3cold_ok && !bridge->bridge_d3) {
+		struct pci_bus *bus = pci_dev_get_subordinate(bridge);
+
+		if (bus) {
+			pci_walk_bus(bus, pci_dev_check_d3cold, &d3cold_ok);
+			pci_bus_put(bus);
+		}
+	}
 
 	if (bridge->bridge_d3 != d3cold_ok) {
 		bridge->bridge_d3 = d3cold_ok;
@@ -4824,6 +4829,7 @@  static int pci_bus_max_d3cold_delay(const struct pci_bus *bus)
 int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 {
 	struct pci_dev *child __free(pci_dev_put) = NULL;
+	struct pci_bus *bus;
 	int delay;
 
 	if (pci_dev_is_disconnected(dev))
@@ -4840,7 +4846,17 @@  int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 	 * board_added(). In case of ACPI hotplug the firmware is expected
 	 * to configure the devices before OS is notified.
 	 */
-	if (!dev->subordinate || list_empty(&dev->subordinate->devices)) {
+	bus = pci_dev_get_subordinate(dev);
+	if (!bus) {
+		up_read(&pci_bus_sem);
+		return 0;
+	}
+
+	child = pci_dev_get(list_first_entry_or_null(&bus->devices,
+						     struct pci_dev,
+						     bus_list));
+	if (!child) {
+		pci_bus_put(bus);
 		up_read(&pci_bus_sem);
 		return 0;
 	}
@@ -4848,12 +4864,12 @@  int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type)
 	/* Take d3cold_delay requirements into account */
 	delay = pci_bus_max_d3cold_delay(dev->subordinate);
 	if (!delay) {
+		pci_bus_put(bus);
 		up_read(&pci_bus_sem);
 		return 0;
 	}
 
-	child = pci_dev_get(list_first_entry(&dev->subordinate->devices,
-					     struct pci_dev, bus_list));
+	pci_bus_put(bus);
 	up_read(&pci_bus_sem);
 
 	/*
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 19cbf18743a96..83e449253066e 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -312,6 +312,7 @@  void pci_reassigndev_resource_alignment(struct pci_dev *dev);
 void pci_disable_bridge_window(struct pci_dev *dev);
 struct pci_bus *pci_bus_get(struct pci_bus *bus);
 void pci_bus_put(struct pci_bus *bus);
+struct pci_bus *pci_dev_get_subordinate(struct pci_dev *dev);
 
 /* PCIe link information from Link Capabilities 2 */
 #define PCIE_LNKCAP2_SLS2SPEED(lnkcap2) \
diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index cee2365e54b8b..3c0c83d35ab86 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -1212,9 +1212,11 @@  static void pcie_update_aspm_capable(struct pcie_link_state *root)
 		link->aspm_capable = link->aspm_support;
 	}
 	list_for_each_entry(link, &link_list, sibling) {
+		struct pci_bus *linkbus;
 		struct pci_dev *child;
-		struct pci_bus *linkbus = link->pdev->subordinate;
-		if (link->root != root)
+
+		linkbus = pci_dev_get_subordinate(link->pdev);
+		if (!linkbus || link->root != root)
 			continue;
 		list_for_each_entry(child, &linkbus->devices, bus_list) {
 			if ((pci_pcie_type(child) != PCI_EXP_TYPE_ENDPOINT) &&
@@ -1222,6 +1224,7 @@  static void pcie_update_aspm_capable(struct pcie_link_state *root)
 				continue;
 			pcie_aspm_check_latency(child);
 		}
+		pci_bus_put(linkbus);
 	}
 }
 
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index b14b9876c0303..53522685193da 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -1167,7 +1167,10 @@  static struct pci_bus *pci_alloc_child_bus(struct pci_bus *parent,
 		child->resource[i] = &bridge->resource[PCI_BRIDGE_RESOURCES+i];
 		child->resource[i]->name = child->name;
 	}
-	bridge->subordinate = child;
+
+	down_write(&pci_bus_sem);
+	WRITE_ONCE(bridge->subordinate, child);
+	up_write(&pci_bus_sem);
 
 add_dev:
 	pci_set_bus_msi_domain(child);
@@ -3380,7 +3383,7 @@  int pci_hp_add_bridge(struct pci_dev *dev)
 	/* Scan bridges that need to be reconfigured */
 	pci_scan_bridge_extend(parent, dev, busnr, available_buses, 1);
 
-	if (!dev->subordinate)
+	if (!READ_ONCE(dev->subordinate))
 		return -1;
 
 	return 0;
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index 288162a11ab19..646c97e41a323 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -77,10 +77,12 @@  EXPORT_SYMBOL(pci_remove_bus);
 
 static void pci_stop_bus_device(struct pci_dev *dev)
 {
-	struct pci_bus *bus = dev->subordinate;
+	struct pci_bus *bus = pci_dev_get_subordinate(dev);
 
-	if (bus)
+	if (bus) {
 		pci_stop_bus(bus);
+		pci_bus_put(bus);
+	}
 	pci_stop_dev(dev);
 }
 
@@ -100,12 +102,18 @@  static void pci_stop_bus(struct pci_bus *bus)
 
 static void pci_remove_bus_device(struct pci_dev *dev)
 {
-	struct pci_bus *bus = dev->subordinate;
+	struct pci_bus *bus;
 
+	down_write(&pci_bus_sem);
+	bus = pci_dev_get_subordinate(dev);
 	if (bus) {
+		WRITE_ONCE(dev->subordinate, NULL);
+		up_write(&pci_bus_sem);
+
 		pci_remove_bus(bus);
-		dev->subordinate = NULL;
-	}
+		pci_bus_put(bus);
+	} else
+		up_write(&pci_bus_sem);
 	pci_destroy_dev(dev);
 }