diff mbox series

[12/12] PCI/pciehp: Use device managed allocations

Message ID 20180918235848.26694-13-keith.busch@intel.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show
Series error handling and pciehp maintenance | expand

Commit Message

Keith Busch Sept. 18, 2018, 11:58 p.m. UTC
All of pciehp's resources are tied to the lifetime of the device it is
driving. This patch simplifies the resource tracking by using the device
managed resource allocations.

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/pci/hotplug/pciehp_core.c | 14 +++---------
 drivers/pci/hotplug/pciehp_hpc.c  | 48 +++++++++++----------------------------
 2 files changed, 16 insertions(+), 46 deletions(-)

Comments

Sinan Kaya Sept. 19, 2018, 3:11 p.m. UTC | #1
On 9/18/2018 7:58 PM, Keith Busch wrote:
> -	ctrl->notification_enabled = 1;
>   	return 0;
>   }
>   
>   void pcie_shutdown_notification(struct controller *ctrl)
>   {
> -	if (ctrl->notification_enabled) {
> -		pcie_disable_notification(ctrl);
> -		pciehp_free_irq(ctrl);
> -		ctrl->notification_enabled = 0;
> -	}
> +	pcie_disable_notification(ctrl);
> +	if (pciehp_poll_mode)
> +		kthread_stop(ctrl->poll_thread);
>   }
>   

I think this notification_enabled bit change needs to go to
another path. The rest of the change in this file is pretty mechanic changes.
Also, are you going to remove the notification_enabled member?
Keith Busch Sept. 19, 2018, 4:17 p.m. UTC | #2
On Wed, Sep 19, 2018 at 11:11:19AM -0400, Sinan Kaya wrote:
> On 9/18/2018 7:58 PM, Keith Busch wrote:
> > -	ctrl->notification_enabled = 1;
> >   	return 0;
> >   }
> >   void pcie_shutdown_notification(struct controller *ctrl)
> >   {
> > -	if (ctrl->notification_enabled) {
> > -		pcie_disable_notification(ctrl);
> > -		pciehp_free_irq(ctrl);
> > -		ctrl->notification_enabled = 0;
> > -	}
> > +	pcie_disable_notification(ctrl);
> > +	if (pciehp_poll_mode)
> > +		kthread_stop(ctrl->poll_thread);
> >   }
> 
> I think this notification_enabled bit change needs to go to
> another path. The rest of the change in this file is pretty mechanic changes.
> Also, are you going to remove the notification_enabled member?

Right, we don't need it, and I should have removed it. There is no path
that would call pcie_shutdown_notification if it hadn't been enabled.
Lukas Wunner Sept. 22, 2018, 6:10 p.m. UTC | #3
On Tue, Sep 18, 2018 at 05:58:48PM -0600, Keith Busch wrote:
> --- a/drivers/pci/hotplug/pciehp_core.c
> +++ b/drivers/pci/hotplug/pciehp_core.c
> @@ -58,16 +58,16 @@ static int init_slot(struct controller *ctrl)
>  	char name[SLOT_NAME_SIZE];
>  	int retval = -ENOMEM;
>  
> -	hotplug = kzalloc(sizeof(*hotplug), GFP_KERNEL);
> +	hotplug = devm_kzalloc(&ctrl->pcie->device, sizeof(*hotplug), GFP_KERNEL);
>  	if (!hotplug)
>  		goto out;
>  
> -	info = kzalloc(sizeof(*info), GFP_KERNEL);
> +	info = devm_kzalloc(&ctrl->pcie->device, sizeof(*info), GFP_KERNEL);
>  	if (!info)
>  		goto out;
>  
>  	/* Setup hotplug slot ops */
> -	ops = kzalloc(sizeof(*ops), GFP_KERNEL);
> +	ops = devm_kzalloc(&ctrl->pcie->device, sizeof(*ops), GFP_KERNEL);
>  	if (!ops)
>  		goto out;

The "hotplug" and "info" allocations are gone on the pci/hotplug branch,
so this needs a rebase.


> @@ -111,9 +106,6 @@ static void cleanup_slot(struct controller *ctrl)
>  	struct hotplug_slot *hotplug_slot = ctrl->slot->hotplug_slot;
>  
>  	pci_hp_destroy(hotplug_slot);
> -	kfree(hotplug_slot->ops);
> -	kfree(hotplug_slot->info);
> -	kfree(hotplug_slot);
>  }

Please replace all invocations of cleanup_slot() with a direct call to
pci_hp_destroy() since this is now becoming merely a wrapper function.


> --- a/drivers/pci/hotplug/pciehp_hpc.c
> +++ b/drivers/pci/hotplug/pciehp_hpc.c
> @@ -45,22 +45,15 @@ static inline int pciehp_request_irq(struct controller *ctrl)
>  	}
>  
>  	/* Installs the interrupt handler */
> -	retval = request_threaded_irq(irq, pciehp_isr, pciehp_ist,
> -				      IRQF_SHARED, MY_NAME, ctrl);
> +	retval = devm_request_threaded_irq(&ctrl->pcie->device, irq, pciehp_isr,
> +					   pciehp_ist, IRQF_SHARED, MY_NAME,
> +					   ctrl);

While using devm_kzalloc() for the "ops" and "ctrl" allocations is fine,
using devm_request_threaded_irq() is *not* because it allows a
use-after-free of the hotplug_slot_name():

With your patch the teardown order becomes:
  pci_hp_del(&ctrl->hotplug_slot);       # After this user space can no longer
                                         # issue enable/disable requests (but
				         # ->reset_slot() is still possible).
  pcie_disable_notification();           # After this, the IRQ thread is no
                                         # longer woken because of slot events.
  pci_hp_destroy(hotplug_slot);          # After this, hotplug_slot_name() must
                                         # no longer be called.
  cancel_delayed_work_sync(&slot->work); # After this, the IRQ thread is no
                                         # longer woken by the button_work.

What can happen here is the button_work gets scheduled before it is
canceled, it wakes up the IRQ thread, the IRQ thread brings up or
down the slot and prints messages to the log which include the
hotplug_slot_name(), leading to a use-after-free.

This cannot happen with what's in v4.19 or on the pci/hotplug branch
because the IRQ is released right after the call to pci_hp_del().
If the button_work happens to be scheduled afterwards, it will call
irq_wake_thread(ctrl->pcie->irq, ctrl).  Now if you look at the
implementation of irq_wake_thread(), it iterates over all the
actions of the IRQ (in case it's shared by multiple devices) and
finds the action which matches the ctrl pointer.  But because we've
already released the IRQ at that point, it won't find the IRQ action
and just return.  So irq_wake_thread() essentially becomes a no-op.

The teardown order is very meticulously designed, I went over it
dozens of times to ensure it's bullet-proof.  The IRQ really has to
be released at exactly the right time and using a device-managed IRQ
breaks the correct order I'm afraid.

While I had thought of using devm_kzalloc(), I also knew that Bjorn
wants to get rid of portdrv and I didn't want to make any changes that
might constrain the future design of portdrv.  Right now, portdrv
creates a sub-device for each service and service drivers bind to it.
An alternative design could either be a single driver that binds to
ports and handles all services at once (probably wouldn't be all that
different to the current situation), or no driver would be bound to
ports and the functionality of the port services would be handled
within the core.  In either case we have the pci_dev of the port to
attach the device-managed allocations to, so I suppose using
devm_kzalloc() here isn't constraining us all that much, hence
seems fine.

Thanks,

Lukas
Bjorn Helgaas Sept. 24, 2018, 11:43 p.m. UTC | #4
On Sat, Sep 22, 2018 at 08:10:57PM +0200, Lukas Wunner wrote:
> On Tue, Sep 18, 2018 at 05:58:48PM -0600, Keith Busch wrote:
> > --- a/drivers/pci/hotplug/pciehp_core.c
> > +++ b/drivers/pci/hotplug/pciehp_core.c

> >  	/* Installs the interrupt handler */
> > -	retval = request_threaded_irq(irq, pciehp_isr, pciehp_ist,
> > -				      IRQF_SHARED, MY_NAME, ctrl);
> > +	retval = devm_request_threaded_irq(&ctrl->pcie->device, irq, pciehp_isr,
> > +					   pciehp_ist, IRQF_SHARED, MY_NAME,
> > +					   ctrl);
> 
> While using devm_kzalloc() for the "ops" and "ctrl" allocations is fine,
> using devm_request_threaded_irq() is *not* because it allows a
> use-after-free of the hotplug_slot_name():
> 
> With your patch the teardown order becomes:
>   pci_hp_del(&ctrl->hotplug_slot);       # After this user space can no longer
>                                          # issue enable/disable requests (but
>                                          # ->reset_slot() is still possible).
>   pcie_disable_notification();           # After this, the IRQ thread is no
>                                          # longer woken because of slot events.
>   pci_hp_destroy(hotplug_slot);          # After this, hotplug_slot_name() must
>                                          # no longer be called.
>   cancel_delayed_work_sync(&slot->work); # After this, the IRQ thread is no
>                                          # longer woken by the button_work.
> 
> What can happen here is the button_work gets scheduled before it is
> canceled, it wakes up the IRQ thread, the IRQ thread brings up or
> down the slot and prints messages to the log which include the
> hotplug_slot_name(), leading to a use-after-free.
> 
> This cannot happen with what's in v4.19 or on the pci/hotplug branch
> because the IRQ is released right after the call to pci_hp_del().
> If the button_work happens to be scheduled afterwards, it will call
> irq_wake_thread(ctrl->pcie->irq, ctrl).  Now if you look at the
> implementation of irq_wake_thread(), it iterates over all the
> actions of the IRQ (in case it's shared by multiple devices) and
> finds the action which matches the ctrl pointer.  But because we've
> already released the IRQ at that point, it won't find the IRQ action
> and just return.  So irq_wake_thread() essentially becomes a no-op.
> 
> The teardown order is very meticulously designed, I went over it
> dozens of times to ensure it's bullet-proof.  The IRQ really has to
> be released at exactly the right time and using a device-managed IRQ
> breaks the correct order I'm afraid.

Thanks for the very detailed analysis!  

This seems like a very subtle issue that we're likely to trip over
again.  I assume most drivers can use device-managed IRQs safely, but
it's a problem here because of the extra complexity around
hotplug_slot?  Is there some way we can make it less subtle, e.g., by
moving the slot cleanup into the pci_destroy_dev() path or something?

Is there some value we get from having both struct hotplug_slot and
struct pci_slot?

> While I had thought of using devm_kzalloc(), I also knew that Bjorn
> wants to get rid of portdrv and I didn't want to make any changes that
> might constrain the future design of portdrv.  Right now, portdrv
> creates a sub-device for each service and service drivers bind to it.
> An alternative design could either be a single driver that binds to
> ports and handles all services at once (probably wouldn't be all that
> different to the current situation), or no driver would be bound to
> ports and the functionality of the port services would be handled
> within the core.  In either case we have the pci_dev of the port to
> attach the device-managed allocations to, so I suppose using
> devm_kzalloc() here isn't constraining us all that much, hence
> seems fine.

I'd *like* to get rid of portdrv, but I don't see a clear path to do
that yet, so it might be impractical.  One reason I don't like it is
that only one driver can bind to a device, so portdrv prevents us from
having drivers for device-specific bridge features like performance
monitors.  For that reason, I'd like to have the architected services
handled directly in the core.

Bjorn
Lukas Wunner Sept. 25, 2018, 7:13 a.m. UTC | #5
On Mon, Sep 24, 2018 at 06:43:07PM -0500, Bjorn Helgaas wrote:
> On Sat, Sep 22, 2018 at 08:10:57PM +0200, Lukas Wunner wrote:
> > This cannot happen with what's in v4.19 or on the pci/hotplug branch
> > because the IRQ is released right after the call to pci_hp_del().
> > If the button_work happens to be scheduled afterwards, it will call
> > irq_wake_thread(ctrl->pcie->irq, ctrl).  Now if you look at the
> > implementation of irq_wake_thread(), it iterates over all the
> > actions of the IRQ (in case it's shared by multiple devices) and
> > finds the action which matches the ctrl pointer.  But because we've
> > already released the IRQ at that point, it won't find the IRQ action
> > and just return.  So irq_wake_thread() essentially becomes a no-op.
> 
> This seems like a very subtle issue that we're likely to trip over
> again.  I assume most drivers can use device-managed IRQs safely, but
> it's a problem here because of the extra complexity around
> hotplug_slot?  Is there some way we can make it less subtle, e.g., by
> moving the slot cleanup into the pci_destroy_dev() path or something?

The extra complexity is caused by the various ways in which events can
be triggered (Slot Status register change, sysfs, button_work) plus the
deregistration of hotplug_slot.  Teardown paths are always hard, I don't
really see a better code solution, but I should add more code comments to
make it less subtle.


> Is there some value we get from having both struct hotplug_slot and
> struct pci_slot?

When Alex Chiang introduced struct pci_slot, he thought of migrating
struct hotplug_slot into it.  See the "migrate over time" code comment
in commit f46753c5e354.  (It got changed to "move here" by 0aa0f5d1084c.)

On the current pci/hotplug branch, I've embedded struct hotplug_slot
in each hotplug driver's struct slot.  But that doesn't preclude
merging struct hotplug_slot into struct pci_slot, the hotplug drivers
would just have to embed a struct pci_slot instead, or struct pci_slot
would be embedded in struct hotplug_slot.

TBH the rationale behind all the different structs isn't quite clear
to me, e.g. struct pci_slot is also used for non-hotplug slots it seems,
see 8344b568f5bd.

(Note, Alex is no longer reachable under his hp.com address, same for
his canonical.com address.)


> > While I had thought of using devm_kzalloc(), I also knew that Bjorn
> > wants to get rid of portdrv and I didn't want to make any changes that
> > might constrain the future design of portdrv.  Right now, portdrv
> > creates a sub-device for each service and service drivers bind to it.
> > An alternative design could either be a single driver that binds to
> > ports and handles all services at once (probably wouldn't be all that
> > different to the current situation), or no driver would be bound to
> > ports and the functionality of the port services would be handled
> > within the core.  In either case we have the pci_dev of the port to
> > attach the device-managed allocations to, so I suppose using
> > devm_kzalloc() here isn't constraining us all that much, hence
> > seems fine.
> 
> I'd *like* to get rid of portdrv, but I don't see a clear path to do
> that yet, so it might be impractical.  One reason I don't like it is
> that only one driver can bind to a device, so portdrv prevents us from
> having drivers for device-specific bridge features like performance
> monitors.  For that reason, I'd like to have the architected services
> handled directly in the core.

Hm, well in that case we can't use devm_*() at all, as Benjamin
Herrenschmidt has just correctly pointed out, devres_release_all()
is called both on driver unbind in __device_release_driver() as well
as on device destruction in device_release().

Thus, if the port services are handled in the core to allow a driver
to bind to the port and devm_kzalloc() is used by the port services,
those allocations will be freed when that driver unbinds, and any
further use of the allocations is a use-after-free.

Thanks,

Lukas
diff mbox series

Patch

diff --git a/drivers/pci/hotplug/pciehp_core.c b/drivers/pci/hotplug/pciehp_core.c
index 334044814dbe..c9ae89f25e8c 100644
--- a/drivers/pci/hotplug/pciehp_core.c
+++ b/drivers/pci/hotplug/pciehp_core.c
@@ -58,16 +58,16 @@  static int init_slot(struct controller *ctrl)
 	char name[SLOT_NAME_SIZE];
 	int retval = -ENOMEM;
 
-	hotplug = kzalloc(sizeof(*hotplug), GFP_KERNEL);
+	hotplug = devm_kzalloc(&ctrl->pcie->device, sizeof(*hotplug), GFP_KERNEL);
 	if (!hotplug)
 		goto out;
 
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	info = devm_kzalloc(&ctrl->pcie->device, sizeof(*info), GFP_KERNEL);
 	if (!info)
 		goto out;
 
 	/* Setup hotplug slot ops */
-	ops = kzalloc(sizeof(*ops), GFP_KERNEL);
+	ops = devm_kzalloc(&ctrl->pcie->device, sizeof(*ops), GFP_KERNEL);
 	if (!ops)
 		goto out;
 
@@ -98,11 +98,6 @@  static int init_slot(struct controller *ctrl)
 	if (retval)
 		ctrl_err(ctrl, "pci_hp_initialize failed: error %d\n", retval);
 out:
-	if (retval) {
-		kfree(ops);
-		kfree(info);
-		kfree(hotplug);
-	}
 	return retval;
 }
 
@@ -111,9 +106,6 @@  static void cleanup_slot(struct controller *ctrl)
 	struct hotplug_slot *hotplug_slot = ctrl->slot->hotplug_slot;
 
 	pci_hp_destroy(hotplug_slot);
-	kfree(hotplug_slot->ops);
-	kfree(hotplug_slot->info);
-	kfree(hotplug_slot);
 }
 
 /*
diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 13650f079188..72c22e9c0b63 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -45,22 +45,15 @@  static inline int pciehp_request_irq(struct controller *ctrl)
 	}
 
 	/* Installs the interrupt handler */
-	retval = request_threaded_irq(irq, pciehp_isr, pciehp_ist,
-				      IRQF_SHARED, MY_NAME, ctrl);
+	retval = devm_request_threaded_irq(&ctrl->pcie->device, irq, pciehp_isr,
+					   pciehp_ist, IRQF_SHARED, MY_NAME,
+					   ctrl);
 	if (retval)
 		ctrl_err(ctrl, "Cannot get irq %d for the hotplug controller\n",
 			 irq);
 	return retval;
 }
 
-static inline void pciehp_free_irq(struct controller *ctrl)
-{
-	if (pciehp_poll_mode)
-		kthread_stop(ctrl->poll_thread);
-	else
-		free_irq(ctrl->pcie->irq, ctrl);
-}
-
 static int pcie_poll_cmd(struct controller *ctrl, int timeout)
 {
 	struct pci_dev *pdev = ctrl_dev(ctrl);
@@ -780,17 +773,14 @@  int pcie_init_notification(struct controller *ctrl)
 	if (pciehp_request_irq(ctrl))
 		return -1;
 	pcie_enable_notification(ctrl);
-	ctrl->notification_enabled = 1;
 	return 0;
 }
 
 void pcie_shutdown_notification(struct controller *ctrl)
 {
-	if (ctrl->notification_enabled) {
-		pcie_disable_notification(ctrl);
-		pciehp_free_irq(ctrl);
-		ctrl->notification_enabled = 0;
-	}
+	pcie_disable_notification(ctrl);
+	if (pciehp_poll_mode)
+		kthread_stop(ctrl->poll_thread);
 }
 
 static int pcie_init_slot(struct controller *ctrl)
@@ -798,7 +788,7 @@  static int pcie_init_slot(struct controller *ctrl)
 	struct pci_bus *subordinate = ctrl_dev(ctrl)->subordinate;
 	struct slot *slot;
 
-	slot = kzalloc(sizeof(*slot), GFP_KERNEL);
+	slot = devm_kzalloc(&ctrl->pcie->device, sizeof(*slot), GFP_KERNEL);
 	if (!slot)
 		return -ENOMEM;
 
@@ -813,14 +803,6 @@  static int pcie_init_slot(struct controller *ctrl)
 	return 0;
 }
 
-static void pcie_cleanup_slot(struct controller *ctrl)
-{
-	struct slot *slot = ctrl->slot;
-
-	cancel_delayed_work_sync(&slot->work);
-	kfree(slot);
-}
-
 static inline void dbg_ctrl(struct controller *ctrl)
 {
 	struct pci_dev *pdev = ctrl->pcie->port;
@@ -845,9 +827,9 @@  struct controller *pcie_init(struct pcie_device *dev)
 	u8 occupied, poweron;
 	struct pci_dev *pdev = dev->port;
 
-	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
+	ctrl = devm_kzalloc(&dev->device, sizeof(*ctrl), GFP_KERNEL);
 	if (!ctrl)
-		goto abort;
+		return NULL;
 
 	ctrl->pcie = dev;
 	pcie_capability_read_dword(pdev, PCI_EXP_SLTCAP, &slot_cap);
@@ -893,7 +875,7 @@  struct controller *pcie_init(struct pcie_device *dev)
 		pdev->broken_cmd_compl ? " (with Cmd Compl erratum)" : "");
 
 	if (pcie_init_slot(ctrl))
-		goto abort_ctrl;
+		return NULL;
 
 	/*
 	 * If empty slot's power status is on, turn power off.  The IRQ isn't
@@ -909,17 +891,13 @@  struct controller *pcie_init(struct pcie_device *dev)
 	}
 
 	return ctrl;
-
-abort_ctrl:
-	kfree(ctrl);
-abort:
-	return NULL;
 }
 
 void pciehp_release_ctrl(struct controller *ctrl)
 {
-	pcie_cleanup_slot(ctrl);
-	kfree(ctrl);
+	struct slot *slot = ctrl->slot;
+
+	cancel_delayed_work_sync(&slot->work);
 }
 
 static void quirk_cmd_compl(struct pci_dev *pdev)