
[PATCHv4,08/12] PCI: ERR: Always use the first downstream port

Message ID 20180920162717.31066-9-keith.busch@intel.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Series: pci error handling fixes

Commit Message

Keith Busch Sept. 20, 2018, 4:27 p.m. UTC
The link reset always used the first bridge device, but AER broadcast
error handling may have reported an end device. This means the reset may
hit devices that were never notified of the impending error recovery.

This patch uses the first downstream port in the hierarchy considered
reliable. An error detected by a switch upstream port should mean it
occurred on its upstream link, so the patch selects the parent device
if the error is not a root or downstream port.

This allows two other clean-ups. First, error handling can only run
on bridges so this patch removes checks for end devices. Second, the
first accessible port does not inherit the channel error state since we
can access it, so the special cases for error detect and resume are no
longer necessary.

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
 drivers/pci/pcie/err.c | 85 +++++++++++++-------------------------------------
 1 file changed, 21 insertions(+), 64 deletions(-)

Comments

Bjorn Helgaas Sept. 26, 2018, 10:01 p.m. UTC | #1
On Thu, Sep 20, 2018 at 10:27:13AM -0600, Keith Busch wrote:
> The link reset always used the first bridge device, but AER broadcast
> error handling may have reported an end device. This means the reset may
> hit devices that were never notified of the impending error recovery.
> 
> This patch uses the first downstream port in the hierarchy considered
> reliable. An error detected by a switch upstream port should mean it
> occurred on its upstream link, so the patch selects the parent device
> if the error is not a root or downstream port.

I'm not really clear on what "Always use the first downstream port"
means.  Always use it for *what*?

I already applied this, but if we can improve the changelog, I'll
gladly update it.

> This allows two other clean-ups. First, error handling can only run
> on bridges so this patch removes checks for end devices. Second, the
> first accessible port does not inherit the channel error state since we
> can access it, so the special cases for error detect and resume are no
> longer necessary.
> 
> Signed-off-by: Keith Busch <keith.busch@intel.com>
> ---
>  drivers/pci/pcie/err.c | 85 +++++++++++++-------------------------------------
>  1 file changed, 21 insertions(+), 64 deletions(-)
> 
> diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
> index 644f3f725ef0..0fa5e1417a4a 100644
> --- a/drivers/pci/pcie/err.c
> +++ b/drivers/pci/pcie/err.c
> @@ -63,30 +63,12 @@ static int report_error_detected(struct pci_dev *dev, void *data)
>  	if (!dev->driver ||
>  		!dev->driver->err_handler ||
>  		!dev->driver->err_handler->error_detected) {
> -		if (result_data->state == pci_channel_io_frozen &&
> -			dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
> -			/*
> -			 * In case of fatal recovery, if one of down-
> -			 * stream device has no driver. We might be
> -			 * unable to recover because a later insmod
> -			 * of a driver for this device is unaware of
> -			 * its hw state.
> -			 */
> -			pci_printk(KERN_DEBUG, dev, "device has %s\n",
> -				   dev->driver ?
> -				   "no AER-aware driver" : "no driver");
> -		}
> -
>  		/*
> -		 * If there's any device in the subtree that does not
> -		 * have an error_detected callback, returning
> -		 * PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of
> -		 * the subsequent mmio_enabled/slot_reset/resume
> -		 * callbacks of "any" device in the subtree. All the
> -		 * devices in the subtree are left in the error state
> -		 * without recovery.
> +		 * If any device in the subtree does not have an error_detected
> +		 * callback, PCI_ERS_RESULT_NO_AER_DRIVER prevents subsequent
> +		 * error callbacks of "any" device in the subtree, and will
> +		 * exit in the disconnected error state.
>  		 */
> -
>  		if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
>  			vote = PCI_ERS_RESULT_NO_AER_DRIVER;
>  		else
> @@ -184,34 +166,23 @@ static pci_ers_result_t default_reset_link(struct pci_dev *dev)
>  
>  static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service)
>  {
> -	struct pci_dev *udev;
>  	pci_ers_result_t status;
>  	struct pcie_port_service_driver *driver = NULL;
>  
> -	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
> -		/* Reset this port for all subordinates */
> -		udev = dev;
> -	} else {
> -		/* Reset the upstream component (likely downstream port) */
> -		udev = dev->bus->self;
> -	}
> -
> -	/* Use the aer driver of the component firstly */
> -	driver = pcie_port_find_service(udev, service);
> -
> +	driver = pcie_port_find_service(dev, service);
>  	if (driver && driver->reset_link) {
> -		status = driver->reset_link(udev);
> -	} else if (udev->has_secondary_link) {
> -		status = default_reset_link(udev);
> +		status = driver->reset_link(dev);
> +	} else if (dev->has_secondary_link) {
> +		status = default_reset_link(dev);
>  	} else {
>  		pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n",
> -			pci_name(udev));
> +			pci_name(dev));
>  		return PCI_ERS_RESULT_DISCONNECT;
>  	}
>  
>  	if (status != PCI_ERS_RESULT_RECOVERED) {
>  		pci_printk(KERN_DEBUG, dev, "link reset at upstream device %s failed\n",
> -			pci_name(udev));
> +			pci_name(dev));
>  		return PCI_ERS_RESULT_DISCONNECT;
>  	}
>  
> @@ -243,31 +214,7 @@ static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
>  	else
>  		result_data.result = PCI_ERS_RESULT_RECOVERED;
>  
> -	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
> -		/*
> -		 * If the error is reported by a bridge, we think this error
> -		 * is related to the downstream link of the bridge, so we
> -		 * do error recovery on all subordinates of the bridge instead
> -		 * of the bridge and clear the error status of the bridge.
> -		 */
> -		if (cb == report_error_detected)
> -			dev->error_state = state;
> -		pci_walk_bus(dev->subordinate, cb, &result_data);
> -		if (cb == report_resume) {
> -			pci_aer_clear_device_status(dev);
> -			pci_cleanup_aer_uncorrect_error_status(dev);
> -			dev->error_state = pci_channel_io_normal;
> -		}
> -	} else {
> -		/*
> -		 * If the error is reported by an end point, we think this
> -		 * error is related to the upstream link of the end point.
> -		 * The error is non fatal so the bus is ok; just invoke
> -		 * the callback for the function that logged the error.
> -		 */
> -		cb(dev, &result_data);
> -	}
> -
> +	pci_walk_bus(dev->subordinate, cb, &result_data);
>  	return result_data.result;
>  }
>  
> @@ -276,6 +223,14 @@ void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state,
>  {
>  	pci_ers_result_t status;
>  
> +	/*
> +	 * Error recovery runs on all subordinates of the first downstream port.
> +	 * If the downstream port detected the error, it is cleared at the end.
> +	 */
> +	if (!(pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT ||
> +	      pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM))
> +		dev = dev->bus->self;
> +
>  	status = broadcast_error_message(dev,
>  			state,
>  			"error_detected",
> @@ -311,6 +266,8 @@ void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state,
>  				"resume",
>  				report_resume);
>  
> +	pci_aer_clear_device_status(dev);
> +	pci_cleanup_aer_uncorrect_error_status(dev);
>  	pci_info(dev, "AER: Device recovery successful\n");
>  	return;
>  
> -- 
> 2.14.4
>
Keith Busch Sept. 26, 2018, 10:19 p.m. UTC | #2
On Wed, Sep 26, 2018 at 05:01:16PM -0500, Bjorn Helgaas wrote:
> On Thu, Sep 20, 2018 at 10:27:13AM -0600, Keith Busch wrote:
> > The link reset always used the first bridge device, but AER broadcast
> > error handling may have reported an end device. This means the reset may
> > hit devices that were never notified of the impending error recovery.
> > 
> > This patch uses the first downstream port in the hierarchy considered
> > reliable. An error detected by a switch upstream port should mean it
> > occurred on its upstream link, so the patch selects the parent device
> > if the error is not a root or downstream port.
> 
> I'm not really clear on what "Always use the first downstream port"
> means.  Always use it for *what*?
> 
> I already applied this, but if we can improve the changelog, I'll
> gladly update it.

I'll see if I can better rephrase.

Error handling should notify all affected pci functions. If an end device
detects and emits ERR_FATAL, the old way would have only notified that
end-device driver, but other functions may be on or below the same bus.

Using the downstream port that connects to that bus where the error was
detected as the anchor point to broadcast error handling progression,
we can notify all functions so they have a chance to prepare for the
link reset.
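
For illustration, a rough C sketch of that flow (the helper name is made
up and this is not the patch text; it assumes the reporting device is an
end device or upstream port, so its parent is a downstream or root port):

static void broadcast_below_bridge(struct pci_dev *dev,
				   int (*cb)(struct pci_dev *, void *),
				   void *data)
{
	/* the DSP/RP above the reporting device sits on the healthy
	 * side of the bus where the error was detected */
	struct pci_dev *bridge = dev->bus->self;

	/* notify every function on that bus and everything below it */
	pci_walk_bus(bridge->subordinate, cb, data);
}
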
Bjorn Helgaas Sept. 27, 2018, 10:56 p.m. UTC | #3
On Wed, Sep 26, 2018 at 04:19:25PM -0600, Keith Busch wrote:
> On Wed, Sep 26, 2018 at 05:01:16PM -0500, Bjorn Helgaas wrote:
> > On Thu, Sep 20, 2018 at 10:27:13AM -0600, Keith Busch wrote:
> > > The link reset always used the first bridge device, but AER broadcast
> > > error handling may have reported an end device. This means the reset may
> > > hit devices that were never notified of the impending error recovery.
> > > 
> > > This patch uses the first downstream port in the hierarchy considered
> > > reliable. An error detected by a switch upstream port should mean it
> > > occurred on its upstream link, so the patch selects the parent device
> > > if the error is not a root or downstream port.
> > 
> > I'm not really clear on what "Always use the first downstream port"
> > means.  Always use it for *what*?

And I forgot to ask what "first downstream port" means.

> I'll see if I can better rephrase.
> 
> Error handling should notify all affected pci functions. If an end device
> detects and emits ERR_FATAL, the old way would have only notified that
> end-device driver, but other functions may be on or below the same bus.
> 
> Using the downstream port that connects to that bus where the error was
> detected as the anchor point to broadcast error handling progression,
> we can notify all functions so they have a chance to prepare for the
> link reset.

So do I understand correctly that if the ERR_FATAL source is:

  - a Switch Upstream Port, you assume the problem is with the Link
    upstream from the Port, and that Link may need to be reset, so you
    notify everything below that Link, including the Upstream Port,
    everything below it (the Downstream Ports and anything below
    them), and potentially even any peers of the Upstream Port (is it
    even possible for an Upstream Port to have peer multi-function
    devices?)

  - a Switch Downstream Port, you assume the Port (and the Link going
    downstream) may need to be reset, so you notify the Port and
    anything below it

  - an Endpoint, you assume the Link leading to the Endpoint may need
    to be reset, so you notify the Endpoint, any peer multi-function
    devices, any related SR-IOV devices, and any devices below a peer
    that happens to be a bridge

And this is different from before because it notifies more devices in
some cases?  There was a pci_walk_bus() in broadcast_error_message(),
so we should have notified several devices in *some* cases, at least.

Bjorn
Keith Busch Sept. 28, 2018, 3:42 p.m. UTC | #4
On Thu, Sep 27, 2018 at 05:56:25PM -0500, Bjorn Helgaas wrote:
> On Wed, Sep 26, 2018 at 04:19:25PM -0600, Keith Busch wrote:
> > On Wed, Sep 26, 2018 at 05:01:16PM -0500, Bjorn Helgaas wrote:
> > > On Thu, Sep 20, 2018 at 10:27:13AM -0600, Keith Busch wrote:
> > > > The link reset always used the first bridge device, but AER broadcast
> > > > error handling may have reported an end device. This means the reset may
> > > > hit devices that were never notified of the impending error recovery.
> > > > 
> > > > This patch uses the first downstream port in the hierarchy considered
> > > > reliable. An error detected by a switch upstream port should mean it
> > > > occurred on its upstream link, so the patch selects the parent device
> > > > if the error is not a root or downstream port.
> > > 
> > > I'm not really clear on what "Always use the first downstream port"
> > > means.  Always use it for *what*?
> 
> And I forgot to ask what "first downstream port" means.

The "first downstream port" was supposed to mean the first DSP we
find when walking toward the root from the pci_dev that reported
ERR_[NON]FATAL.

By "use", I mean "walking down the sub-tree for error handling".

After thinking more on this, that doesn't really capture the intent. A
better way might be:

  Run error handling starting from the downstream port of the bus
  reporting an error

I'm struggling to make that short enough for a changelog subject.

> > I'll see if I can better rephrase.
> > 
> > Error handling should notify all affected pci functions. If an end device
> > detects and emits ERR_FATAL, the old way would have only notified that
> > end-device driver, but other functions may be on or below the same bus.
> > 
> > Using the downstream port that connects to that bus where the error was
> > detected as the anchor point to broadcast error handling progression,
> > we can notify all functions so they have a chance to prepare for the
> > link reset.
> 
> So do I understand correctly that if the ERR_FATAL source is:
> 
>   - a Switch Upstream Port, you assume the problem is with the Link
>     upstream from the Port, and that Link may need to be reset, so you
>     notify everything below that Link, including the Upstream Port,
>     everything below it (the Downstream Ports and anything below
>     them), and potentially even any peers of the Upstream Port (is it
>     even possible for an Upstream Port to have peer multi-function
>     devices?)

Yep, the Microsemi Switchtec is one such real life example of an end
device function on the same bus as a USP.


>   - a Switch Downstream Port, you assume the Port (and the Link going
>     downstream) may need to be reset, so you notify the Port and
>     anything below it
> 
>   - an Endpoint, you assume the Link leading to the Endpoint may need
>     to be reset, so you notify the Endpoint, any peer multi-function
>     devices, any related SR-IOV devices, and any devices below a peer
>     that happens to be a bridge
> 
> And this is different from before because it notifies more devices in
> some cases?  There was a pci_walk_bus() in broadcast_error_message(),
> so we should have notified several devices in *some* cases, at least.

broadcast_error_message() had been using the pci_dev that detected the
error, and its pci_walk_bus() used dev->subordinate. If the pci_dev
that detected an error was an end device, we didn't walk the bus.
Bjorn Helgaas Sept. 28, 2018, 8:50 p.m. UTC | #5
On Fri, Sep 28, 2018 at 09:42:20AM -0600, Keith Busch wrote:
> On Thu, Sep 27, 2018 at 05:56:25PM -0500, Bjorn Helgaas wrote:
> > On Wed, Sep 26, 2018 at 04:19:25PM -0600, Keith Busch wrote:
> > > On Wed, Sep 26, 2018 at 05:01:16PM -0500, Bjorn Helgaas wrote:
> > > > On Thu, Sep 20, 2018 at 10:27:13AM -0600, Keith Busch wrote:
> > > > > The link reset always used the first bridge device, but AER broadcast
> > > > > error handling may have reported an end device. This means the reset may
> > > > > hit devices that were never notified of the impending error recovery.
> > > > > 
> > > > > This patch uses the first downstream port in the hierarchy considered
> > > > > reliable. An error detected by a switch upstream port should mean it
> > > > > occurred on its upstream link, so the patch selects the parent device
> > > > > if the error is not a root or downstream port.
> > > > 
> > > > I'm not really clear on what "Always use the first downstream port"
> > > > means.  Always use it for *what*?
> > 
> > And I forgot to ask what "first downstream port" means.
> 
> The "first downstream port" was supposed to mean the first DSP we
> find when walking toward the root from the pci_dev that reported
> ERR_[NON]FATAL.
> 
> By "use", I mean "walking down the sub-tree for error handling".
> 
> After thinking more on this, that doesn't really capture the intent. A
> better way might be:
> 
>   Run error handling starting from the downstream port of the bus
>   reporting an error

I think the light is beginning to dawn.  Does this make sense (as far
as it goes)?

  PCI/ERR: Run error recovery callbacks for all affected devices

  If an Endpoint reports an error with ERR_FATAL, we previously ran
  driver error recovery callbacks only for the Endpoint.  But if
  recovery requires that we reset the Endpoint, we may have to use a
  port farther upstream to initiate a Link reset, and that may affect
  components other than the Endpoint, e.g., multi-function peers and
  their children.  Drivers for those devices were never notified of
  the impending reset.

  Call driver error recovery callbacks for every device that will be
  reset.

Now help me understand this part:

> This allows two other clean-ups.  First, error handling can only run
> on bridges so this patch removes checks for endpoints.  

"error handling can only run on bridges"?  I *think* only Root Ports
and Root Complex Event Collectors can assert AER interrupts, so
aer_irq() is only run for them (actually I don't think we quite
support Event Collectors yet).  Is this what you're getting at?

Probably not, because the "dev" passed to pcie_do_recovery() isn't an
RP or RCEC.  But the e_info->dev[i] that aer_process_err_devices()
eventually passes in doesn't have to be a bridge at all, does it?

So I can't quite figure out the bridge connection here.

> Second, the first accessible port does not inherit the channel error
> state since we can access it, so the special cases for error detect
> and resume are no longer necessary.

When we call pcie_do_recovery(), I guess we specify a "dev" that
logged an AER event and a pci_channel_state.  It seems a little bit
weird to handle those separately, as opposed to incorporating into 
the pci_dev or something.

But if you're just saying that "if A is frozen because of an error,
that doesn't mean bridges upstream from A are frozen", that makes good
sense.

Bjorn
Keith Busch Sept. 28, 2018, 9:35 p.m. UTC | #6
On Fri, Sep 28, 2018 at 03:50:34PM -0500, Bjorn Helgaas wrote:
> On Fri, Sep 28, 2018 at 09:42:20AM -0600, Keith Busch wrote:
> > On Thu, Sep 27, 2018 at 05:56:25PM -0500, Bjorn Helgaas wrote:
> > > On Wed, Sep 26, 2018 at 04:19:25PM -0600, Keith Busch wrote:
> > The "first downstream port" was supposed to mean the first DSP we
> > find when walking toward the root from the pci_dev that reported
> > ERR_[NON]FATAL.
> > 
> > By "use", I mean "walking down the sub-tree for error handling".
> > 
> > After thinking more on this, that doesn't really capture the intent. A
> > better way might be:
> > 
> >   Run error handling starting from the downstream port of the bus
> >   reporting an error
> 
> I think the light is beginning to dawn.  Does this make sense (as far
> as it goes)?
> 
>   PCI/ERR: Run error recovery callbacks for all affected devices
> 
>   If an Endpoint reports an error with ERR_FATAL, we previously ran
>   driver error recovery callbacks only for the Endpoint.  But if
>   recovery requires that we reset the Endpoint, we may have to use a
>   port farther upstream to initiate a Link reset, and that may affect
>   components other than the Endpoint, e.g., multi-function peers and
>   their children.  Drivers for those devices were never notified of
>   the impending reset.
> 
>   Call driver error recovery callbacks for every device that will be
>   reset.

Yes!
 
> Now help me understand this part:
> 
> > This allows two other clean-ups.  First, error handling can only run
> > on bridges so this patch removes checks for endpoints.  
> 
> "error handling can only run on bridges"?  I *think* only Root Ports
> and Root Complex Event Collectors can assert AER interrupts, so
> aer_irq() is only run for them (actually I don't think we quite
> support Event Collectors yet).  Is this what you're getting at?

I mean the pci_dev sent to pcie_do_recovery(), which may be any device
in the topology including or below the root port that aer_irq() serviced.

> Probably not, because the "dev" passed to pcie_do_recovery() isn't an
> RP or RCEC.  But the e_info->dev[i] that aer_process_err_devices()
> eventually passes in doesn't have to be a bridge at all, does it?

Yes, e_info->dev[i] is sent to pcie_do_recovery(). That could be an RP,
but it may also be anything below it.

The assumption I'm making (which I think is a safe assumption with
general consensus) is that errors detected on an end point or an upstream
port happened because of something wrong with the link going upstream:
end devices have no other option, and I don't think it's possible for a
bus error to occur on "virtual" busses.

The patch then assumes we should avoid sending any traffic through that
link until error recovery completes.
 
> So I can't quite figure out the bridge connection here.

I should have included RP's, or more generically "type 1 header devices".

> > Second, the first accessible port does not inherit the channel error
> > state since we can access it, so the special cases for error detect
> > and resume are no longer necessary.
> 
> When we call pcie_do_recovery(), I guess we specify a "dev" that
> logged an AER event and a pci_channel_state.  It seems a little bit
> weird to handle those separately, as opposed to incorporating into 
> the pci_dev or something.
>
> But if you're just saying that "if A is frozen because of an error,
> that doesn't mean bridges upstream from A are frozen", that makes good
> sense.

Yes. Another way to think of it is in terms of busses instead of
devices. While devices report errors, the error is because of the
bus. Paths going through that bus are considered frozen, and everything
else is not. We can safely communicate with the RP or DSP that connects
to that bus because communicating with those does not go through the
frozen bus.
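
A minimal sketch of that selection (hypothetical helper name; the patch
open-codes the same check at the top of pcie_do_recovery()):

static struct pci_dev *recovery_anchor(struct pci_dev *dev)
{
	/* Root Ports and Downstream Ports sit on the healthy side of
	 * the bus they report errors for, so use them directly */
	if (pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT ||
	    pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM)
		return dev;

	/* end devices and upstream ports report errors on their
	 * upstream link, so anchor recovery at the port above them */
	return dev->bus->self;
}
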
Bjorn Helgaas Sept. 28, 2018, 11:28 p.m. UTC | #7
On Fri, Sep 28, 2018 at 03:35:23PM -0600, Keith Busch wrote:
> On Fri, Sep 28, 2018 at 03:50:34PM -0500, Bjorn Helgaas wrote:
> > On Fri, Sep 28, 2018 at 09:42:20AM -0600, Keith Busch wrote:
> > > On Thu, Sep 27, 2018 at 05:56:25PM -0500, Bjorn Helgaas wrote:
> > > > On Wed, Sep 26, 2018 at 04:19:25PM -0600, Keith Busch wrote:
> > > The "first downstream port" was supposed to mean the first DSP we
> > > find when walking toward the root from the pci_dev that reported
> > > ERR_[NON]FATAL.
> > > 
> > > By "use", I mean "walking down the sub-tree for error handling".
> > > 
> > > After thinking more on this, that doesn't really capture the intent. A
> > > better way might be:
> > > 
> > >   Run error handling starting from the downstream port of the bus
> > >   reporting an error
> > 
> > I think the light is beginning to dawn.  Does this make sense (as far
> > as it goes)?
> > 
> >   PCI/ERR: Run error recovery callbacks for all affected devices
> > 
> >   If an Endpoint reports an error with ERR_FATAL, we previously ran
> >   driver error recovery callbacks only for the Endpoint.  But if
> >   recovery requires that we reset the Endpoint, we may have to use a
> >   port farther upstream to initiate a Link reset, and that may affect
> >   components other than the Endpoint, e.g., multi-function peers and
> >   their children.  Drivers for those devices were never notified of
> >   the impending reset.
> > 
> >   Call driver error recovery callbacks for every device that will be
> >   reset.
> 
> Yes!
>  
> > Now help me understand this part:
> > 
> > > This allows two other clean-ups.  First, error handling can only run
> > > on bridges so this patch removes checks for endpoints.  
> > 
> > "error handling can only run on bridges"?  I *think* only Root Ports
> > and Root Complex Event Collectors can assert AER interrupts, so
> > aer_irq() is only run for them (actually I don't think we quite
> > support Event Collectors yet).  Is this what you're getting at?
> 
> I mean the pci_dev sent to pcie_do_recovery(), which may be any device
> in the topology including or below the root port that aer_irq() serviced.

Yep.

> > Probably not, because the "dev" passed to pcie_do_recovery() isn't an
> > RP or RCEC.  But the e_info->dev[i] that aer_process_err_devices()
> > eventually passes in doesn't have to be a bridge at all, does it?
> 
> Yes, e_info->dev[i] is sent to pcie_do_recovery(). That could be an RP,
> but it may also be anything below it.

Yep.

> The assumption I'm making (which I think is a safe assumption with
> general consensus) is that errors detected on an end point or an upstream
> port happened because of something wrong with the link going upstream:
> end devices have no other option, 

Is this really true?  It looks like "Internal Errors" (sec 6.2.9) may
be unrelated to a packet or event (though they are supposed to be
associated with a PCIe interface).

It says the only method of recovering is reset or hardware
replacement.  It doesn't specify, but it seems like a FLR or similar
reset might be appropriate and we may not have to reset the link.

Getting back to the changelog, "error handling can only run on
bridges" clearly doesn't refer to the driver callbacks (since those
only apply to endpoints).  Maybe "error handling" refers to the
reset_link(), which can only be done on a bridge?

That would make sense to me, although the current code may be
resetting more devices than necessary if Internal Errors can be
handled without a link reset.

Bjorn
Keith Busch Oct. 1, 2018, 3:14 p.m. UTC | #8
On Fri, Sep 28, 2018 at 06:28:02PM -0500, Bjorn Helgaas wrote:
> On Fri, Sep 28, 2018 at 03:35:23PM -0600, Keith Busch wrote:
> > The assumption I'm making (which I think is a safe assumption with
> > general consensus) is that errors detected on an end point or an upstream
> > port happened because of something wrong with the link going upstream:
> > end devices have no other option, 
> 
> Is this really true?  It looks like "Internal Errors" (sec 6.2.9) may
> be unrelated to a packet or event (though they are supposed to be
> associated with a PCIe interface).
> 
> It says the only method of recovering is reset or hardware
> replacement.  It doesn't specify, but it seems like a FLR or similar
> reset might be appropriate and we may not have to reset the link.

That is an interesting case we might want to handle better. I've a couple
concerns to consider for implementing:

We don't know an ERR_FATAL occurred for an internal reason until we read the
config register across the link, and the AER driver historically avoided
accessing potentially unhealthy links. I don't *think* it's harmful to
attempt reading the register, but we'd just need to check for an "all 1's"
completion before trusting the result.
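
Something along these lines, as a sketch only (the helper name is
invented and error handling is trimmed):

static bool aer_uncor_status_valid(struct pci_dev *dev, u32 *status)
{
	int aer = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR);

	if (!aer)
		return false;

	pci_read_config_dword(dev, aer + PCI_ERR_UNCOR_STATUS, status);

	/* an all 1's completion means the read never reached the device */
	return *status != 0xffffffff;
}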

The other issue with trying to use FLR is a device may not implement
it, so pci reset has fallback methods depending on the device's
capabilities. We can end up calling pci_parent_bus_reset(), which does the
same secondary bus reset that already happens as part of error recovery.
We'd just need to make sure affected devices and drivers have a chance
to be notified (which is this patch's intention).
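
For reference, a sketch of that fallback (hypothetical wrapper, not part
of this series): pci_reset_function() picks the first method the device
supports (device-specific reset, FLR, PM reset, slot reset, then the
parent bus reset) and saves/restores config state around it.

static void reset_one_function(struct pci_dev *dev)
{
	if (pci_reset_function(dev))
		pci_warn(dev, "no usable function reset, need link reset recovery\n");
}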
 
> Getting back to the changelog, "error handling can only run on
> bridges" clearly doesn't refer to the driver callbacks (since those
> only apply to endpoints).  Maybe "error handling" refers to the
> reset_link(), which can only be done on a bridge?

Yep, referring to how the link reset (reset_link) is only sent from bridges.
 
> That would make sense to me, although the current code may be
> resetting more devices than necessary if Internal Errors can be
> handled without a link reset.

That sounds good, I'll test some scenarios out here.
Bjorn Helgaas Oct. 2, 2018, 7:35 p.m. UTC | #9
On Mon, Oct 01, 2018 at 09:14:51AM -0600, Keith Busch wrote:
> On Fri, Sep 28, 2018 at 06:28:02PM -0500, Bjorn Helgaas wrote:
> > On Fri, Sep 28, 2018 at 03:35:23PM -0600, Keith Busch wrote:
> > > The assumption I'm making (which I think is a safe assumption with
> > > general consensus) is that errors detected on an end point or an upstream
> > > port happened because of something wrong with the link going upstream:
> > > end devices have no other option, 
> > 
> > Is this really true?  It looks like "Internal Errors" (sec 6.2.9) may
> > be unrelated to a packet or event (though they are supposed to be
> > associated with a PCIe interface).
> > 
> > It says the only method of recovering is reset or hardware
> > replacement.  It doesn't specify, but it seems like a FLR or similar
> > reset might be appropriate and we may not have to reset the link.
> 
> That is an interesting case we might want to handle better. I've a couple
> concerns to consider for implementing:
> 
> We don't know an ERR_FATAL occurred for an internal reason until we read the
> config register across the link, and the AER driver historically avoided
> accessing potentially unhealthy links. I don't *think* it's harmful to
> attempt reading the register, but we'd just need to check for an "all 1's"
> completion before trusting the result.
> 
> The other issue with trying to use FLR is a device may not implement
> it, so pci reset has fallback methods depending on the device's
> capabilities. We can end up calling pci_parent_bus_reset(), which does the
> same secondary bus reset that already happens as part of error recovery.
> We'd just need to make sure affected devices and drivers have a chance
> to be notified (which is this patch's intention).
>  
> > Getting back to the changelog, "error handling can only run on
> > bridges" clearly doesn't refer to the driver callbacks (since those
> > only apply to endpoints).  Maybe "error handling" refers to the
> > reset_link(), which can only be done on a bridge?
> 
> Yep, referring to how the link reset (reset_link) is only sent from bridges.
>  
> > That would make sense to me, although the current code may be
> > resetting more devices than necessary if Internal Errors can be
> > handled without a link reset.
> 
> That sounds good, I'll test some scenarios out here.

The main point here is that we call the driver callbacks for every
device that might be reset.  If that set of devices is larger than
strictly necessary, that's an opportunity for future optimization,
which we can defer for now.

Here's my proposal for the changelog.  Let me know what I screwed up.


commit 1f7d2967334433d885c0712b8ac3f073f20211ee
Author: Keith Busch <keith.busch@intel.com>
Date:   Thu Sep 20 10:27:13 2018 -0600

    PCI/ERR: Run error recovery callbacks for all affected devices
    
    If an Endpoint reported an error with ERR_FATAL, we previously ran driver
    error recovery callbacks only for the Endpoint's driver.  But if we reset a
    Link to recover from the error, all downstream components are affected,
    including the Endpoint, any multi-function peers, and children of those
    peers.
    
    Initiate the Link reset from the deepest Downstream Port that is
    reliable, and call the error recovery callbacks for all its children.
    
    If a Downstream Port (including a Root Port) reports an error, we assume
    the Port itself is reliable and we need to reset its downstream Link.  In
    all other cases (Switch Upstream Ports, Endpoints, Bridges, etc), we assume
    the Link leading to the component needs to be reset, so we initiate the
    reset at the parent Downstream Port.
    
    This allows two other clean-ups.  First, we currently only use a Link
    reset, which can only be initiated using a Downstream Port, so we can
    remove checks for Endpoints.  Second, the Downstream Port where we initiate
    the Link reset is reliable (unlike the device that reported the error), so
    the special cases for error detect and resume are no longer necessary.
    
    Signed-off-by: Keith Busch <keith.busch@intel.com>
    [bhelgaas: changelog]
    Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
    Reviewed-by: Sinan Kaya <okaya@kernel.org>
Keith Busch Oct. 2, 2018, 7:55 p.m. UTC | #10
On Tue, Oct 02, 2018 at 02:35:22PM -0500, Bjorn Helgaas wrote:
> Here's my proposal for the changelog.  Let me know what I screwed up.
> 
> commit 1f7d2967334433d885c0712b8ac3f073f20211ee
> Author: Keith Busch <keith.busch@intel.com>
> Date:   Thu Sep 20 10:27:13 2018 -0600
> 
>     PCI/ERR: Run error recovery callbacks for all affected devices
>     
>     If an Endpoint reported an error with ERR_FATAL, we previously ran driver
>     error recovery callbacks only for the Endpoint's driver.  But if we reset a
>     Link to recover from the error, all downstream components are affected,
>     including the Endpoint, any multi-function peers, and children of those
>     peers.
>     
>     Initiate the Link reset from the deepest Downstream Port that is
>     reliable, and call the error recovery callbacks for all its children.
>     
>     If a Downstream Port (including a Root Port) reports an error, we assume
>     the Port itself is reliable and we need to reset its downstream Link.  In
>     all other cases (Switch Upstream Ports, Endpoints, Bridges, etc), we assume
>     the Link leading to the component needs to be reset, so we initiate the
>     reset at the parent Downstream Port.
>     
>     This allows two other clean-ups.  First, we currently only use a Link
>     reset, which can only be initiated using a Downstream Port, so we can
>     remove checks for Endpoints.  Second, the Downstream Port where we initiate
>     the Link reset is reliable (unlike the device that reported the error), so
>     the special cases for error detect and resume are no longer necessary.

A downstream port may have been the device that reported the error, but
we still consider that to be accessible. Maybe "unlike its subordinate
bus".

Otherwise this sounds good to me.

Patch

diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index 644f3f725ef0..0fa5e1417a4a 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -63,30 +63,12 @@  static int report_error_detected(struct pci_dev *dev, void *data)
 	if (!dev->driver ||
 		!dev->driver->err_handler ||
 		!dev->driver->err_handler->error_detected) {
-		if (result_data->state == pci_channel_io_frozen &&
-			dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) {
-			/*
-			 * In case of fatal recovery, if one of down-
-			 * stream device has no driver. We might be
-			 * unable to recover because a later insmod
-			 * of a driver for this device is unaware of
-			 * its hw state.
-			 */
-			pci_printk(KERN_DEBUG, dev, "device has %s\n",
-				   dev->driver ?
-				   "no AER-aware driver" : "no driver");
-		}
-
 		/*
-		 * If there's any device in the subtree that does not
-		 * have an error_detected callback, returning
-		 * PCI_ERS_RESULT_NO_AER_DRIVER prevents calling of
-		 * the subsequent mmio_enabled/slot_reset/resume
-		 * callbacks of "any" device in the subtree. All the
-		 * devices in the subtree are left in the error state
-		 * without recovery.
+		 * If any device in the subtree does not have an error_detected
+		 * callback, PCI_ERS_RESULT_NO_AER_DRIVER prevents subsequent
+		 * error callbacks of "any" device in the subtree, and will
+		 * exit in the disconnected error state.
 		 */
-
 		if (dev->hdr_type != PCI_HEADER_TYPE_BRIDGE)
 			vote = PCI_ERS_RESULT_NO_AER_DRIVER;
 		else
@@ -184,34 +166,23 @@  static pci_ers_result_t default_reset_link(struct pci_dev *dev)
 
 static pci_ers_result_t reset_link(struct pci_dev *dev, u32 service)
 {
-	struct pci_dev *udev;
 	pci_ers_result_t status;
 	struct pcie_port_service_driver *driver = NULL;
 
-	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
-		/* Reset this port for all subordinates */
-		udev = dev;
-	} else {
-		/* Reset the upstream component (likely downstream port) */
-		udev = dev->bus->self;
-	}
-
-	/* Use the aer driver of the component firstly */
-	driver = pcie_port_find_service(udev, service);
-
+	driver = pcie_port_find_service(dev, service);
 	if (driver && driver->reset_link) {
-		status = driver->reset_link(udev);
-	} else if (udev->has_secondary_link) {
-		status = default_reset_link(udev);
+		status = driver->reset_link(dev);
+	} else if (dev->has_secondary_link) {
+		status = default_reset_link(dev);
 	} else {
 		pci_printk(KERN_DEBUG, dev, "no link-reset support at upstream device %s\n",
-			pci_name(udev));
+			pci_name(dev));
 		return PCI_ERS_RESULT_DISCONNECT;
 	}
 
 	if (status != PCI_ERS_RESULT_RECOVERED) {
 		pci_printk(KERN_DEBUG, dev, "link reset at upstream device %s failed\n",
-			pci_name(udev));
+			pci_name(dev));
 		return PCI_ERS_RESULT_DISCONNECT;
 	}
 
@@ -243,31 +214,7 @@  static pci_ers_result_t broadcast_error_message(struct pci_dev *dev,
 	else
 		result_data.result = PCI_ERS_RESULT_RECOVERED;
 
-	if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
-		/*
-		 * If the error is reported by a bridge, we think this error
-		 * is related to the downstream link of the bridge, so we
-		 * do error recovery on all subordinates of the bridge instead
-		 * of the bridge and clear the error status of the bridge.
-		 */
-		if (cb == report_error_detected)
-			dev->error_state = state;
-		pci_walk_bus(dev->subordinate, cb, &result_data);
-		if (cb == report_resume) {
-			pci_aer_clear_device_status(dev);
-			pci_cleanup_aer_uncorrect_error_status(dev);
-			dev->error_state = pci_channel_io_normal;
-		}
-	} else {
-		/*
-		 * If the error is reported by an end point, we think this
-		 * error is related to the upstream link of the end point.
-		 * The error is non fatal so the bus is ok; just invoke
-		 * the callback for the function that logged the error.
-		 */
-		cb(dev, &result_data);
-	}
-
+	pci_walk_bus(dev->subordinate, cb, &result_data);
 	return result_data.result;
 }
 
@@ -276,6 +223,14 @@  void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state,
 {
 	pci_ers_result_t status;
 
+	/*
+	 * Error recovery runs on all subordinates of the first downstream port.
+	 * If the downstream port detected the error, it is cleared at the end.
+	 */
+	if (!(pci_pcie_type(dev) == PCI_EXP_TYPE_ROOT_PORT ||
+	      pci_pcie_type(dev) == PCI_EXP_TYPE_DOWNSTREAM))
+		dev = dev->bus->self;
+
 	status = broadcast_error_message(dev,
 			state,
 			"error_detected",
@@ -311,6 +266,8 @@  void pcie_do_recovery(struct pci_dev *dev, enum pci_channel_state state,
 				"resume",
 				report_resume);
 
+	pci_aer_clear_device_status(dev);
+	pci_cleanup_aer_uncorrect_error_status(dev);
 	pci_info(dev, "AER: Device recovery successful\n");
 	return;