diff mbox series

usb: hub: add I/O error retry & reset routine

Message ID 20181115171418.23522-1-nsaenzjulienne@suse.de (mailing list archive)
State New, archived
Headers show
Series usb: hub: add I/O error retry & reset routine | expand

Commit Message

Nicolas Saenz Julienne Nov. 15, 2018, 5:14 p.m. UTC
An URB submission error in the HUB's endpoint completion function
renders the whole HUB device unresponsive. This patch introduces a
routine that retries the submission for 1s to then, as a last resort,
reset the whole device.

The implementation is based on usbhid/hid_core.c's, which implements the
same functionality. It was tested with the help of BCC's error injection
tool (inject.py).

Signed-off-by: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
---
 drivers/usb/core/hub.c | 43 +++++++++++++++++++++++++++++++++++++++++-
 drivers/usb/core/hub.h |  3 +++
 2 files changed, 45 insertions(+), 1 deletion(-)

Comments

Alan Stern Nov. 15, 2018, 7:24 p.m. UTC | #1
On Thu, 15 Nov 2018, Nicolas Saenz Julienne wrote:

> An URB submission error in the HUB's endpoint completion function
> renders the whole HUB device unresponsive. This patch introduces a
> routine that retries the submission for 1s to then, as a last resort,
> reset the whole device.
> 
> The implementation is based on usbhid/hid_core.c's, which implements the
> same functionality. It was tested with the help of BCC's error injection
> tool (inject.py).
> 
> Signed-off-by: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>

Why do you think this is needed?  Are you experiencing these 
sorts of URB submission errors?

Why do you handle only errors during submission but not during
completion?  And if you keep on getting errors during submission, why
would resetting the hub make any difference?

The patch doesn't delete the io_retry timer when the hub is removed.

Alan Stern


> ---
>  drivers/usb/core/hub.c | 43 +++++++++++++++++++++++++++++++++++++++++-
>  drivers/usb/core/hub.h |  3 +++
>  2 files changed, 45 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
> index d9bd7576786a..1447bdba59ec 100644
> --- a/drivers/usb/core/hub.c
> +++ b/drivers/usb/core/hub.c
> @@ -607,6 +607,44 @@ static int hub_port_status(struct usb_hub *hub, int port1,
>  				   status, change, NULL);
>  }
>  
> +static void hub_io_error(struct usb_hub *hub)
> +{
> +	/*
> +	 * If it has been a while since the last error, we'll assume
> +	 * this a brand new error and reset the retry timeout.
> +	 */
> +	if (time_after(jiffies, hub->stop_retry + HZ/2))
> +		hub->retry_delay = 0;
> +
> +	/* When an error occurs, retry at increasing intervals */
> +	if (hub->retry_delay == 0) {
> +		hub->retry_delay = 13;	/* Then 26, 52, 104, 104, ... */
> +		hub->stop_retry = jiffies + msecs_to_jiffies(1000);
> +	} else if (hub->retry_delay < 100)
> +		hub->retry_delay *= 2;
> +
> +	if (time_after(jiffies, hub->stop_retry)) {
> +		/* Retries failed, so do a port reset */
> +		usb_queue_reset_device(to_usb_interface(hub->intfdev));
> +		return;
> +	}
> +
> +	mod_timer(&hub->io_retry,
> +			jiffies + msecs_to_jiffies(hub->retry_delay));
> +}
> +
> +static void hub_retry_timeout(struct timer_list *t)
> +{
> +	struct usb_hub *hub = from_timer(hub, t, io_retry);
> +
> +	if (hub->disconnected || hub->quiescing)
> +		return;
> +
> +	dev_err(hub->intfdev, "retrying int urb\n");
> +	if (usb_submit_urb(hub->urb, GFP_ATOMIC))
> +		hub_io_error(hub);
> +}
> +
>  static void kick_hub_wq(struct usb_hub *hub)
>  {
>  	struct usb_interface *intf;
> @@ -713,8 +751,10 @@ static void hub_irq(struct urb *urb)
>  		return;
>  
>  	status = usb_submit_urb(hub->urb, GFP_ATOMIC);
> -	if (status != 0 && status != -ENODEV && status != -EPERM)
> +	if (status != 0 && status != -ENODEV && status != -EPERM) {
>  		dev_err(hub->intfdev, "resubmit --> %d\n", status);
> +		hub_io_error(hub);
> +	}
>  }
>  
>  /* USB 2.0 spec Section 11.24.2.3 */
> @@ -1800,6 +1840,7 @@ static int hub_probe(struct usb_interface *intf, const struct usb_device_id *id)
>  	INIT_DELAYED_WORK(&hub->leds, led_work);
>  	INIT_DELAYED_WORK(&hub->init_work, NULL);
>  	INIT_WORK(&hub->events, hub_event);
> +	timer_setup(&hub->io_retry, hub_retry_timeout, 0);
>  	usb_get_intf(intf);
>  	usb_get_dev(hdev);
>  
> diff --git a/drivers/usb/core/hub.h b/drivers/usb/core/hub.h
> index 4accfb63f7dc..7dbd19c2c8d9 100644
> --- a/drivers/usb/core/hub.h
> +++ b/drivers/usb/core/hub.h
> @@ -69,6 +69,9 @@ struct usb_hub {
>  	struct delayed_work	leds;
>  	struct delayed_work	init_work;
>  	struct work_struct      events;
> +	struct timer_list	io_retry;
> +	unsigned long		stop_retry;
> +	unsigned int		retry_delay;
>  	struct usb_port		**ports;
>  };
>  
>
Nicolas Saenz Julienne Nov. 16, 2018, 10:49 a.m. UTC | #2
Hi Alan,
thanks for the review.

On Thu, 2018-11-15 at 14:24 -0500, Alan Stern wrote:
> On Thu, 15 Nov 2018, Nicolas Saenz Julienne wrote:
> 
> > An URB submission error in the HUB's endpoint completion function
> > renders the whole HUB device unresponsive. This patch introduces a
> > routine that retries the submission for 1s to then, as a last
> resort,
> > reset the whole device.
> > 
> > The implementation is based on usbhid/hid_core.c's, which
> implements the
> > same functionality. It was tested with the help of BCC's error
> injection
> > tool (inject.py).
> > 
> > Signed-off-by: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
> 
> Why do you think this is needed?  Are you experiencing these 
> sorts of URB submission errors?

Sorry, I should have been more explicit on where I come from. I've been
playing around injecting atomic allocation errors on the USB stack. For
example any URB submission marked with GFP_ATOMIC that ends up into
xhci will allocate some memory with that flag.

Most subsystems, after facing a burst of memory allocation failures,
seem to recover well (usbnet/hid/uas/alsa/serial). But I found out that
it's not the case for USB hubs: In the event of detecting a new device
the hub will complete an int URB which was previously sent to it. The
event data is saved by the host and the URB is resubmitted for further
event passing. In the case that URB submission failed, we lose all
further events.

It is indeed pretty hard to find this issue in the wild, since you have
to time plugging or unplugging an USB device with the system running
out of memory. But I don't think it's unrealistic to think it might
happen.

As I comment in the patch description, I'm injecting the errors using
BCC and eBPF's function override capabilities.  

> 
> Why do you handle only errors during submission but not during
> completion?  And if you keep on getting errors during submission, why
> would resetting the hub make any difference?

Well, as far as I know, errors during completion are handled. The error
is marked in hub->error, which later-on, in hub-event(), triggers a
device reset. 

While implementing the solution I took into account the hub's
completion error processing behavior and HID's implementation of the
submission error handling (see hid_irq_in() in usbhid/hid-core.c). My
rationale was that since both HID and hub are USB devices with a
similar behavior there was no point in reinventing a mechanism. That
said I have no spec data to back the "1s retry window to then reset the
device".

One could argue that in the event of an error having a timer running
forever is not the best design. It has to stop sometime. If that's the
case, the HUB will be in a unknown state, i.e. a device might have
disappeared. Resetting the hub will at least unbind all the USB devices
attached to it and retry the enumeration. Regardless of the
enumeration's success we'll at least be in a "safe" state.

> 
> The patch doesn't delete the io_retry timer when the hub is removed.

Right, that was silly of me...

Kind Regards,
Nicolas

> 
> Alan Stern
> 
> 
> > ---
> >  drivers/usb/core/hub.c | 43
> +++++++++++++++++++++++++++++++++++++++++-
> >  drivers/usb/core/hub.h |  3 +++
> >  2 files changed, 45 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
> > index d9bd7576786a..1447bdba59ec 100644
> > --- a/drivers/usb/core/hub.c
> > +++ b/drivers/usb/core/hub.c
> > @@ -607,6 +607,44 @@ static int hub_port_status(struct usb_hub
> *hub, int port1,
> >                                  status, change, NULL);
> >  }
> >  
> > +static void hub_io_error(struct usb_hub *hub)
> > +{
> > +     /*
> > +      * If it has been a while since the last error, we'll assume
> > +      * this a brand new error and reset the retry timeout.
> > +      */
> > +     if (time_after(jiffies, hub->stop_retry + HZ/2))
> > +             hub->retry_delay = 0;
> > +
> > +     /* When an error occurs, retry at increasing intervals */
> > +     if (hub->retry_delay == 0) {
> > +             hub->retry_delay = 13;  /* Then 26, 52, 104, 104, ...
> */
> > +             hub->stop_retry = jiffies + msecs_to_jiffies(1000);
> > +     } else if (hub->retry_delay < 100)
> > +             hub->retry_delay *= 2;
> > +
> > +     if (time_after(jiffies, hub->stop_retry)) {
> > +             /* Retries failed, so do a port reset */
> > +             usb_queue_reset_device(to_usb_interface(hub-
> >intfdev));
> > +             return;
> > +     }
> > +
> > +     mod_timer(&hub->io_retry,
> > +                     jiffies + msecs_to_jiffies(hub-
> >retry_delay));
> > +}
> > +
> > +static void hub_retry_timeout(struct timer_list *t)
> > +{
> > +     struct usb_hub *hub = from_timer(hub, t, io_retry);
> > +
> > +     if (hub->disconnected || hub->quiescing)
> > +             return;
> > +
> > +     dev_err(hub->intfdev, "retrying int urb\n");
> > +     if (usb_submit_urb(hub->urb, GFP_ATOMIC))
> > +             hub_io_error(hub);
> > +}
> > +
> >  static void kick_hub_wq(struct usb_hub *hub)
> >  {
> >       struct usb_interface *intf;
> > @@ -713,8 +751,10 @@ static void hub_irq(struct urb *urb)
> >               return;
> >  
> >       status = usb_submit_urb(hub->urb, GFP_ATOMIC);
> > -     if (status != 0 && status != -ENODEV && status != -EPERM)
> > +     if (status != 0 && status != -ENODEV && status != -EPERM) {
> >               dev_err(hub->intfdev, "resubmit --> %d\n", status);
> > +             hub_io_error(hub);
> > +     }
> >  }
> >  
> >  /* USB 2.0 spec Section 11.24.2.3 */
> > @@ -1800,6 +1840,7 @@ static int hub_probe(struct usb_interface
> *intf, const struct usb_device_id *id)
> >       INIT_DELAYED_WORK(&hub->leds, led_work);
> >       INIT_DELAYED_WORK(&hub->init_work, NULL);
> >       INIT_WORK(&hub->events, hub_event);
> > +     timer_setup(&hub->io_retry, hub_retry_timeout, 0);
> >       usb_get_intf(intf);
> >       usb_get_dev(hdev);
> >  
> > diff --git a/drivers/usb/core/hub.h b/drivers/usb/core/hub.h
> > index 4accfb63f7dc..7dbd19c2c8d9 100644
> > --- a/drivers/usb/core/hub.h
> > +++ b/drivers/usb/core/hub.h
> > @@ -69,6 +69,9 @@ struct usb_hub {
> >       struct delayed_work     leds;
> >       struct delayed_work     init_work;
> >       struct work_struct      events;
> > +     struct timer_list       io_retry;
> > +     unsigned long           stop_retry;
> > +     unsigned int            retry_delay;
> >       struct usb_port         **ports;
> >  };
> >  
> > 
>
Alan Stern Nov. 16, 2018, 4:21 p.m. UTC | #3
On Fri, 16 Nov 2018, Nicolas Saenz Julienne wrote:

> Hi Alan,
> thanks for the review.
> 
> On Thu, 2018-11-15 at 14:24 -0500, Alan Stern wrote:
> > On Thu, 15 Nov 2018, Nicolas Saenz Julienne wrote:
> > 
> > > An URB submission error in the HUB's endpoint completion function
> > > renders the whole HUB device unresponsive. This patch introduces a
> > > routine that retries the submission for 1s to then, as a last
> > resort,
> > > reset the whole device.
> > > 
> > > The implementation is based on usbhid/hid_core.c's, which
> > implements the
> > > same functionality. It was tested with the help of BCC's error
> > injection
> > > tool (inject.py).
> > > 
> > > Signed-off-by: Nicolas Saenz Julienne <nsaenzjulienne@suse.de>
> > 
> > Why do you think this is needed?  Are you experiencing these 
> > sorts of URB submission errors?
> 
> Sorry, I should have been more explicit on where I come from. I've been
> playing around injecting atomic allocation errors on the USB stack. For
> example any URB submission marked with GFP_ATOMIC that ends up into
> xhci will allocate some memory with that flag.
> 
> Most subsystems, after facing a burst of memory allocation failures,
> seem to recover well (usbnet/hid/uas/alsa/serial). But I found out that
> it's not the case for USB hubs: In the event of detecting a new device
> the hub will complete an int URB which was previously sent to it. The
> event data is saved by the host and the URB is resubmitted for further
> event passing. In the case that URB submission failed, we lose all
> further events.

Ah, I see.

> It is indeed pretty hard to find this issue in the wild, since you have
> to time plugging or unplugging an USB device with the system running
> out of memory. But I don't think it's unrealistic to think it might
> happen.
> 
> As I comment in the patch description, I'm injecting the errors using
> BCC and eBPF's function override capabilities.  
> 
> > 
> > Why do you handle only errors during submission but not during
> > completion?  And if you keep on getting errors during submission, why
> > would resetting the hub make any difference?
> 
> Well, as far as I know, errors during completion are handled. The error
> is marked in hub->error, which later-on, in hub-event(), triggers a
> device reset. 
> 
> While implementing the solution I took into account the hub's
> completion error processing behavior and HID's implementation of the
> submission error handling (see hid_irq_in() in usbhid/hid-core.c). My
> rationale was that since both HID and hub are USB devices with a
> similar behavior there was no point in reinventing a mechanism. That
> said I have no spec data to back the "1s retry window to then reset the
> device".
> 
> One could argue that in the event of an error having a timer running
> forever is not the best design. It has to stop sometime. If that's the
> case, the HUB will be in a unknown state, i.e. a device might have
> disappeared. Resetting the hub will at least unbind all the USB devices
> attached to it and retry the enumeration. Regardless of the
> enumeration's success we'll at least be in a "safe" state.

Well, the timer will get deleted when the hub is unplugged.  If that
doesn't happen, we can assume that the hub has retained its state and
so a reset isn't necessary.

I would just keep retrying at, say, 1-second intervals.  Don't even 
bother with the exponential slowdown that the HID driver does.

> > The patch doesn't delete the io_retry timer when the hub is removed.
> 
> Right, that was silly of me...

You could even rename the timer to something like irq_urb_retry; I 
think that would be a more accurate description of what it does.

So fix those things up, simplify the retries, and expand the patch
description -- then resubmit.  :-)

Alan Stern
Oliver Neukum Nov. 19, 2018, 9:20 a.m. UTC | #4
On Do, 2018-11-15 at 18:14 +0100, Nicolas Saenz Julienne wrote:

Hi,

what Alan said, in addition you need to stop the error handling
when the device is suspended or reset.
 
> @@ -713,8 +751,10 @@ static void hub_irq(struct urb *urb)
>  		return;
>  
>  	status = usb_submit_urb(hub->urb, GFP_ATOMIC);
> -	if (status != 0 && status != -ENODEV && status != -EPERM)
> +	if (status != 0 && status != -ENODEV && status != -EPERM) {

This also needs to check for -ESHUTDOWN

	Regards
		Oliver
diff mbox series

Patch

diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index d9bd7576786a..1447bdba59ec 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -607,6 +607,44 @@  static int hub_port_status(struct usb_hub *hub, int port1,
 				   status, change, NULL);
 }
 
+static void hub_io_error(struct usb_hub *hub)
+{
+	/*
+	 * If it has been a while since the last error, we'll assume
+	 * this a brand new error and reset the retry timeout.
+	 */
+	if (time_after(jiffies, hub->stop_retry + HZ/2))
+		hub->retry_delay = 0;
+
+	/* When an error occurs, retry at increasing intervals */
+	if (hub->retry_delay == 0) {
+		hub->retry_delay = 13;	/* Then 26, 52, 104, 104, ... */
+		hub->stop_retry = jiffies + msecs_to_jiffies(1000);
+	} else if (hub->retry_delay < 100)
+		hub->retry_delay *= 2;
+
+	if (time_after(jiffies, hub->stop_retry)) {
+		/* Retries failed, so do a port reset */
+		usb_queue_reset_device(to_usb_interface(hub->intfdev));
+		return;
+	}
+
+	mod_timer(&hub->io_retry,
+			jiffies + msecs_to_jiffies(hub->retry_delay));
+}
+
+static void hub_retry_timeout(struct timer_list *t)
+{
+	struct usb_hub *hub = from_timer(hub, t, io_retry);
+
+	if (hub->disconnected || hub->quiescing)
+		return;
+
+	dev_err(hub->intfdev, "retrying int urb\n");
+	if (usb_submit_urb(hub->urb, GFP_ATOMIC))
+		hub_io_error(hub);
+}
+
 static void kick_hub_wq(struct usb_hub *hub)
 {
 	struct usb_interface *intf;
@@ -713,8 +751,10 @@  static void hub_irq(struct urb *urb)
 		return;
 
 	status = usb_submit_urb(hub->urb, GFP_ATOMIC);
-	if (status != 0 && status != -ENODEV && status != -EPERM)
+	if (status != 0 && status != -ENODEV && status != -EPERM) {
 		dev_err(hub->intfdev, "resubmit --> %d\n", status);
+		hub_io_error(hub);
+	}
 }
 
 /* USB 2.0 spec Section 11.24.2.3 */
@@ -1800,6 +1840,7 @@  static int hub_probe(struct usb_interface *intf, const struct usb_device_id *id)
 	INIT_DELAYED_WORK(&hub->leds, led_work);
 	INIT_DELAYED_WORK(&hub->init_work, NULL);
 	INIT_WORK(&hub->events, hub_event);
+	timer_setup(&hub->io_retry, hub_retry_timeout, 0);
 	usb_get_intf(intf);
 	usb_get_dev(hdev);
 
diff --git a/drivers/usb/core/hub.h b/drivers/usb/core/hub.h
index 4accfb63f7dc..7dbd19c2c8d9 100644
--- a/drivers/usb/core/hub.h
+++ b/drivers/usb/core/hub.h
@@ -69,6 +69,9 @@  struct usb_hub {
 	struct delayed_work	leds;
 	struct delayed_work	init_work;
 	struct work_struct      events;
+	struct timer_list	io_retry;
+	unsigned long		stop_retry;
+	unsigned int		retry_delay;
 	struct usb_port		**ports;
 };