diff mbox series

[v2] usb: core: stop USB enumeration if too many retries

Message ID 20220902091535.3572333-1-raychi@google.com (mailing list archive)
State Superseded
Headers show
Series [v2] usb: core: stop USB enumeration if too many retries | expand

Commit Message

Ray Chi Sept. 2, 2022, 9:15 a.m. UTC
If a broken accessory connected to a USB host, usbcore might
keep doing enumeration retries and it will take a long time to
cause system unstable.

This patch provides a quirk to specific USB ports of the hub to
stop USB enumeration if needed.

Signed-off-by: Ray Chi <raychi@google.com>
---
Changes since v1:
 - remove usb_hub_set_port_power()
 - add a variable ignore_connect into struct port_dev
 - modify hub_port_stop_enumerate() and set ignore_connect in
   this function
 - avoid calling hub_port_connect_change() in port_event()
---
 drivers/usb/core/hub.c | 40 ++++++++++++++++++++++++++++++++++++++++
 drivers/usb/core/hub.h |  2 ++
 include/linux/usb.h    |  3 +++
 3 files changed, 45 insertions(+)

Comments

Alan Stern Sept. 2, 2022, 2:49 p.m. UTC | #1
On Fri, Sep 02, 2022 at 05:15:35PM +0800, Ray Chi wrote:
> If a broken accessory connected to a USB host, usbcore might
> keep doing enumeration retries and it will take a long time to
> cause system unstable.
> 
> This patch provides a quirk to specific USB ports of the hub to
> stop USB enumeration if needed.

This seems very awkward.  Why not have a quirk that prevents USB 
enumeration completely, instead of after some number of retries?  After 
all, if the port is connected to a broken accessory, there's no reason 
to try enumerating it even once.

For that matter, have you tried using the existing "disabled" port 
attribute instead of adding a new quirk?  Does it already solve your 
problem?

> 
> Signed-off-by: Ray Chi <raychi@google.com>
> ---
> Changes since v1:
>  - remove usb_hub_set_port_power()
>  - add a variable ignore_connect into struct port_dev
>  - modify hub_port_stop_enumerate() and set ignore_connect in
>    this function
>  - avoid calling hub_port_connect_change() in port_event()
> ---
>  drivers/usb/core/hub.c | 40 ++++++++++++++++++++++++++++++++++++++++
>  drivers/usb/core/hub.h |  2 ++
>  include/linux/usb.h    |  3 +++
>  3 files changed, 45 insertions(+)
> 
> diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
> index 2633acde7ac1..7f34ee8bb81e 100644
> --- a/drivers/usb/core/hub.c
> +++ b/drivers/usb/core/hub.c
> @@ -3081,6 +3081,30 @@ static int hub_port_reset(struct usb_hub *hub, int port1,
>  	return status;
>  }
>  
> +/* Stop enumerate if the port met errors and quirk is set */
> +static bool hub_port_stop_enumerate(struct usb_hub *hub, int port1, int retries)
> +{
> +	struct usb_port *port_dev = hub->ports[port1 - 1];
> +
> +	if (port_dev->quirks & USB_PORT_QUIRK_STOP_ENUM) {
> +		if (port_dev->ignore_connect)
> +			return true;
> +
> +		if (retries < (PORT_INIT_TRIES - 1) / 2)
> +			return false;
> +
> +		/*
> +		 * Some USB hosts can't take a long time to keep doing enumeration
> +		 * retry. After doing half of the retries, we would turn off the port
> +		 * power to stop enumeration if the quirk is set.

What made you decide that half of the retries was the right place to 
stop?  Why not do all the retries?

> +		 */
> +		port_dev->ignore_connect = true;
> +	} else
> +		port_dev->ignore_connect = false;
> +
> +	return port_dev->ignore_connect;
> +}

If the quirk prevented enumeration completely then this function 
wouldn't be needed.

> +
>  /* Check if a port is power on */
>  int usb_port_is_power_on(struct usb_hub *hub, unsigned int portstatus)
>  {
> @@ -4855,6 +4879,11 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1,
>  					buf->bMaxPacketSize0;
>  			kfree(buf);
>  
> +			if (r < 0 && (port_dev->quirks & USB_PORT_QUIRK_STOP_ENUM)) {

How come this line tests the quirk but doesn't call 
hub_port_stop_enumerate()?

> +				retval = r;
> +				goto fail;
> +			}
> +
>  			retval = hub_port_reset(hub, port1, udev, delay, false);
>  			if (retval < 0)		/* error or disconnect */
>  				goto fail;
> @@ -5387,6 +5416,9 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus,
>  		if ((status == -ENOTCONN) || (status == -ENOTSUPP))
>  			break;
>  
> +		if (hub_port_stop_enumerate(hub, port1, i))
> +			break;
> +
>  		/* When halfway through our retry count, power-cycle the port */
>  		if (i == (PORT_INIT_TRIES - 1) / 2) {
>  			dev_info(&port_dev->dev, "attempt power cycle\n");
> @@ -5550,6 +5582,9 @@ static void port_event(struct usb_hub *hub, int port1)
>  	if (usb_hub_port_status(hub, port1, &portstatus, &portchange) < 0)
>  		return;
>  
> +	if (hub_port_stop_enumerate(hub, port1, 0))
> +		return;

This test is in the wrong place.  It should go right next to the check 
for pm_runtime_active(&port_dev->dev); even though the port isn't being 
used we still want to turn off the port-change bits in the port status.

> +
>  	if (portchange & USB_PORT_STAT_C_CONNECTION) {
>  		usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_CONNECTION);
>  		connect_change = 1;
> @@ -5934,6 +5969,9 @@ static int usb_reset_and_verify_device(struct usb_device *udev)
>  		ret = hub_port_init(parent_hub, udev, port1, i);
>  		if (ret >= 0 || ret == -ENOTCONN || ret == -ENODEV)
>  			break;
> +
> +		if (hub_port_stop_enumerate(parent_hub, port1, i))
> +			goto stop_enumerate;

Also this -- the purpose is to avoid calling hub_port_init() for ports 
with the quirk, so this test belongs before the call to hub_port_init(), 
not after.

>  	}
>  	mutex_unlock(hcd->address0_mutex);
>  
> @@ -6022,6 +6060,8 @@ static int usb_reset_and_verify_device(struct usb_device *udev)
>  	udev->bos = bos;
>  	return 0;
>  
> +stop_enumerate:
> +	mutex_unlock(hcd->address0_mutex);
>  re_enumerate:
>  	usb_release_bos_descriptor(udev);
>  	udev->bos = bos;
> diff --git a/drivers/usb/core/hub.h b/drivers/usb/core/hub.h
> index b2925856b4cb..f0aa718f4c7f 100644
> --- a/drivers/usb/core/hub.h
> +++ b/drivers/usb/core/hub.h
> @@ -90,6 +90,7 @@ struct usb_hub {
>   * @is_superspeed cache super-speed status
>   * @usb3_lpm_u1_permit: whether USB3 U1 LPM is permitted.
>   * @usb3_lpm_u2_permit: whether USB3 U2 LPM is permitted.
> + * @ignore_connect: ignore the connection or not
>   */
>  struct usb_port {
>  	struct usb_device *child;
> @@ -103,6 +104,7 @@ struct usb_port {
>  	u32 over_current_count;
>  	u8 portnum;
>  	u32 quirks;
> +	bool ignore_connect;

This should be a bitfield like the following entries.  It's okay to make 
it a bool rather than unsigned int.  But you may find that you don't 
need this field at all.

>  	unsigned int is_superspeed:1;
>  	unsigned int usb3_lpm_u1_permit:1;
>  	unsigned int usb3_lpm_u2_permit:1;
> diff --git a/include/linux/usb.h b/include/linux/usb.h
> index f7a9914fc97f..fc0fef58c706 100644
> --- a/include/linux/usb.h
> +++ b/include/linux/usb.h
> @@ -490,6 +490,9 @@ enum usb_port_connect_type {
>  /* Decrease TRSTRCY to 10ms during device enumeration. */
>  #define USB_PORT_QUIRK_FAST_ENUM	BIT(1)
>  
> +/* Stop the enumeration for the given port if there are too many failures*/
> +#define USB_PORT_QUIRK_STOP_ENUM	BIT(2)

When you define a new port quirk, you have to document it in the 
/sys/bus/usb/devices/.../<hub_interface>/port<X>/quirks section of 
Documentation/ABI/testing/sysfs-bus-usb.

Alan Stern

> +
>  /*
>   * USB 2.0 Link Power Management (LPM) parameters.
>   */
> -- 
> 2.37.2.789.g6183377224-goog
>
Ray Chi Sept. 2, 2022, 4:08 p.m. UTC | #2
On Fri, Sep 2, 2022 at 10:49 PM Alan Stern <stern@rowland.harvard.edu> wrote:
>
> On Fri, Sep 02, 2022 at 05:15:35PM +0800, Ray Chi wrote:
> > If a broken accessory connected to a USB host, usbcore might
> > keep doing enumeration retries and it will take a long time to
> > cause system unstable.
> >
> > This patch provides a quirk to specific USB ports of the hub to
> > stop USB enumeration if needed.
>
> This seems very awkward.  Why not have a quirk that prevents USB
> enumeration completely, instead of after some number of retries?  After
> all, if the port is connected to a broken accessory, there's no reason
> to try enumerating it even once.
>
> For that matter, have you tried using the existing "disabled" port
> attribute instead of adding a new quirk?  Does it already solve your
> problem?
>

Since we don't know if the connected accessory is normal or broken, doing port
initialization is necessary.

> >
> > Signed-off-by: Ray Chi <raychi@google.com>
> > ---
> > Changes since v1:
> >  - remove usb_hub_set_port_power()
> >  - add a variable ignore_connect into struct port_dev
> >  - modify hub_port_stop_enumerate() and set ignore_connect in
> >    this function
> >  - avoid calling hub_port_connect_change() in port_event()
> > ---
> >  drivers/usb/core/hub.c | 40 ++++++++++++++++++++++++++++++++++++++++
> >  drivers/usb/core/hub.h |  2 ++
> >  include/linux/usb.h    |  3 +++
> >  3 files changed, 45 insertions(+)
> >
> > diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
> > index 2633acde7ac1..7f34ee8bb81e 100644
> > --- a/drivers/usb/core/hub.c
> > +++ b/drivers/usb/core/hub.c
> > @@ -3081,6 +3081,30 @@ static int hub_port_reset(struct usb_hub *hub, int port1,
> >       return status;
> >  }
> >
> > +/* Stop enumerate if the port met errors and quirk is set */
> > +static bool hub_port_stop_enumerate(struct usb_hub *hub, int port1, int retries)
> > +{
> > +     struct usb_port *port_dev = hub->ports[port1 - 1];
> > +
> > +     if (port_dev->quirks & USB_PORT_QUIRK_STOP_ENUM) {
> > +             if (port_dev->ignore_connect)
> > +                     return true;
> > +
> > +             if (retries < (PORT_INIT_TRIES - 1) / 2)
> > +                     return false;
> > +
> > +             /*
> > +              * Some USB hosts can't take a long time to keep doing enumeration
> > +              * retry. After doing half of the retries, we would turn off the port
> > +              * power to stop enumeration if the quirk is set.
>
> What made you decide that half of the retries was the right place to
> stop?  Why not do all the retries?

Since some normal devices will be timeout in the first attempt, I set
the condition to half
of the retries. All the retries will take 12*timeout seconds. It is
too long so that a watchdog
timeout problem may happen.

>
> > +              */
> > +             port_dev->ignore_connect = true;
> > +     } else
> > +             port_dev->ignore_connect = false;
> > +
> > +     return port_dev->ignore_connect;
> > +}
>
> If the quirk prevented enumeration completely then this function
> wouldn't be needed.

The enumeration is still needed as above.

>
> > +
> >  /* Check if a port is power on */
> >  int usb_port_is_power_on(struct usb_hub *hub, unsigned int portstatus)
> >  {
> > @@ -4855,6 +4879,11 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1,
> >                                       buf->bMaxPacketSize0;
> >                       kfree(buf);
> >
> > +                     if (r < 0 && (port_dev->quirks & USB_PORT_QUIRK_STOP_ENUM)) {
>
> How come this line tests the quirk but doesn't call
> hub_port_stop_enumerate()?

Since the quirk is used to stop enumeration and reduce the total time.
If the port has the quirk, I think the port doesn't need to do
set_address after the port gets
failures in the new scheme. It will add 2 attempts * timeout (defined
in hc_driver) seconds.

>
> > +                             retval = r;
> > +                             goto fail;
> > +                     }
> > +
> >                       retval = hub_port_reset(hub, port1, udev, delay, false);
> >                       if (retval < 0)         /* error or disconnect */
> >                               goto fail;
> > @@ -5387,6 +5416,9 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus,
> >               if ((status == -ENOTCONN) || (status == -ENOTSUPP))
> >                       break;
> >
> > +             if (hub_port_stop_enumerate(hub, port1, i))
> > +                     break;
> > +
> >               /* When halfway through our retry count, power-cycle the port */
> >               if (i == (PORT_INIT_TRIES - 1) / 2) {
> >                       dev_info(&port_dev->dev, "attempt power cycle\n");
> > @@ -5550,6 +5582,9 @@ static void port_event(struct usb_hub *hub, int port1)
> >       if (usb_hub_port_status(hub, port1, &portstatus, &portchange) < 0)
> >               return;
> >
> > +     if (hub_port_stop_enumerate(hub, port1, 0))
> > +             return;
>
> This test is in the wrong place.  It should go right next to the check
> for pm_runtime_active(&port_dev->dev); even though the port isn't being
> used we still want to turn off the port-change bits in the port status.
>

I will modify it later.

> > +
> >       if (portchange & USB_PORT_STAT_C_CONNECTION) {
> >               usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_CONNECTION);
> >               connect_change = 1;
> > @@ -5934,6 +5969,9 @@ static int usb_reset_and_verify_device(struct usb_device *udev)
> >               ret = hub_port_init(parent_hub, udev, port1, i);
> >               if (ret >= 0 || ret == -ENOTCONN || ret == -ENODEV)
> >                       break;
> > +
> > +             if (hub_port_stop_enumerate(parent_hub, port1, i))
> > +                     goto stop_enumerate;
>
> Also this -- the purpose is to avoid calling hub_port_init() for ports
> with the quirk, so this test belongs before the call to hub_port_init(),
> not after.

Since hub_port_init() is needed to know if the connected accessory is
normal or not,
I put the hub_port_stop_enumerate() after hub_port_init().

>
> >       }
> >       mutex_unlock(hcd->address0_mutex);
> >
> > @@ -6022,6 +6060,8 @@ static int usb_reset_and_verify_device(struct usb_device *udev)
> >       udev->bos = bos;
> >       return 0;
> >
> > +stop_enumerate:
> > +     mutex_unlock(hcd->address0_mutex);
> >  re_enumerate:
> >       usb_release_bos_descriptor(udev);
> >       udev->bos = bos;
> > diff --git a/drivers/usb/core/hub.h b/drivers/usb/core/hub.h
> > index b2925856b4cb..f0aa718f4c7f 100644
> > --- a/drivers/usb/core/hub.h
> > +++ b/drivers/usb/core/hub.h
> > @@ -90,6 +90,7 @@ struct usb_hub {
> >   * @is_superspeed cache super-speed status
> >   * @usb3_lpm_u1_permit: whether USB3 U1 LPM is permitted.
> >   * @usb3_lpm_u2_permit: whether USB3 U2 LPM is permitted.
> > + * @ignore_connect: ignore the connection or not
> >   */
> >  struct usb_port {
> >       struct usb_device *child;
> > @@ -103,6 +104,7 @@ struct usb_port {
> >       u32 over_current_count;
> >       u8 portnum;
> >       u32 quirks;
> > +     bool ignore_connect;
>
> This should be a bitfield like the following entries.  It's okay to make
> it a bool rather than unsigned int.  But you may find that you don't
> need this field at all.
>
> >       unsigned int is_superspeed:1;
> >       unsigned int usb3_lpm_u1_permit:1;
> >       unsigned int usb3_lpm_u2_permit:1;
> > diff --git a/include/linux/usb.h b/include/linux/usb.h
> > index f7a9914fc97f..fc0fef58c706 100644
> > --- a/include/linux/usb.h
> > +++ b/include/linux/usb.h
> > @@ -490,6 +490,9 @@ enum usb_port_connect_type {
> >  /* Decrease TRSTRCY to 10ms during device enumeration. */
> >  #define USB_PORT_QUIRK_FAST_ENUM     BIT(1)
> >
> > +/* Stop the enumeration for the given port if there are too many failures*/
> > +#define USB_PORT_QUIRK_STOP_ENUM     BIT(2)
>
> When you define a new port quirk, you have to document it in the
> /sys/bus/usb/devices/.../<hub_interface>/port<X>/quirks section of
> Documentation/ABI/testing/sysfs-bus-usb.
>

I will document it later.

> Alan Stern
>
> > +
> >  /*
> >   * USB 2.0 Link Power Management (LPM) parameters.
> >   */
> > --
> > 2.37.2.789.g6183377224-goog
> >

Thanks,
Ray
Alan Stern Sept. 2, 2022, 5:07 p.m. UTC | #3
On Sat, Sep 03, 2022 at 12:08:04AM +0800, Ray Chi wrote:
> On Fri, Sep 2, 2022 at 10:49 PM Alan Stern <stern@rowland.harvard.edu> wrote:
> >
> > On Fri, Sep 02, 2022 at 05:15:35PM +0800, Ray Chi wrote:
> > > If a broken accessory connected to a USB host, usbcore might
> > > keep doing enumeration retries and it will take a long time to
> > > cause system unstable.
> > >
> > > This patch provides a quirk to specific USB ports of the hub to
> > > stop USB enumeration if needed.
> >
> > This seems very awkward.  Why not have a quirk that prevents USB
> > enumeration completely, instead of after some number of retries?  After
> > all, if the port is connected to a broken accessory, there's no reason
> > to try enumerating it even once.
> >
> > For that matter, have you tried using the existing "disabled" port
> > attribute instead of adding a new quirk?  Does it already solve your
> > problem?
> >
> 
> Since we don't know if the connected accessory is normal or broken, doing port
> initialization is necessary.

I don't understand.  If you don't know whether the accessory is broken, 
how do you know whether to set the quirk?

On the other hand, if you always set the quirk even before you know 
whether the accessory is broken, why make it a quirk at all?  Why not 
make it the normal behavior of the driver?

> > > +              * Some USB hosts can't take a long time to keep doing enumeration
> > > +              * retry. After doing half of the retries, we would turn off the port
> > > +              * power to stop enumeration if the quirk is set.
> >
> > What made you decide that half of the retries was the right place to
> > stop?  Why not do all the retries?
> 
> Since some normal devices will be timeout in the first attempt, I set
> the condition to half
> of the retries. All the retries will take 12*timeout seconds. It is
> too long so that a watchdog
> timeout problem may happen.

Why not set CONFIG_USB_FEW_INIT_RETRIES instead?

> > If the quirk prevented enumeration completely then this function
> > wouldn't be needed.
> 
> The enumeration is still needed as above.
> 
> >
> > > +
> > >  /* Check if a port is power on */
> > >  int usb_port_is_power_on(struct usb_hub *hub, unsigned int portstatus)
> > >  {
> > > @@ -4855,6 +4879,11 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1,
> > >                                       buf->bMaxPacketSize0;
> > >                       kfree(buf);
> > >
> > > +                     if (r < 0 && (port_dev->quirks & USB_PORT_QUIRK_STOP_ENUM)) {
> >
> > How come this line tests the quirk but doesn't call
> > hub_port_stop_enumerate()?
> 
> Since the quirk is used to stop enumeration and reduce the total time.
> If the port has the quirk, I think the port doesn't need to do
> set_address after the port gets
> failures in the new scheme. It will add 2 attempts * timeout (defined
> in hc_driver) seconds.

I still can't tell what you're trying to accomplish.  You need to do a 
much better job of explaining the point of this.  For instance, you 
might describe in detail a situation where the quirk is needed, 
explaining what sort of behavior of the system would lead you to set the 
quirk, and why.

Alan Stern
Ray Chi Sept. 5, 2022, 8:36 a.m. UTC | #4
On Sat, Sep 3, 2022 at 1:07 AM Alan Stern <stern@rowland.harvard.edu> wrote:
>
> On Sat, Sep 03, 2022 at 12:08:04AM +0800, Ray Chi wrote:
> > On Fri, Sep 2, 2022 at 10:49 PM Alan Stern <stern@rowland.harvard.edu> wrote:
> > >
> > > On Fri, Sep 02, 2022 at 05:15:35PM +0800, Ray Chi wrote:
> > > > If a broken accessory connected to a USB host, usbcore might
> > > > keep doing enumeration retries and it will take a long time to
> > > > cause system unstable.
> > > >
> > > > This patch provides a quirk to specific USB ports of the hub to
> > > > stop USB enumeration if needed.
> > >
> > > This seems very awkward.  Why not have a quirk that prevents USB
> > > enumeration completely, instead of after some number of retries?  After
> > > all, if the port is connected to a broken accessory, there's no reason
> > > to try enumerating it even once.
> > >
> > > For that matter, have you tried using the existing "disabled" port
> > > attribute instead of adding a new quirk?  Does it already solve your
> > > problem?
> > >
> >
> > Since we don't know if the connected accessory is normal or broken, doing port
> > initialization is necessary.
>
> I don't understand.  If you don't know whether the accessory is broken,
> how do you know whether to set the quirk?
>
> On the other hand, if you always set the quirk even before you know
> whether the accessory is broken, why make it a quirk at all?  Why not
> make it the normal behavior of the driver?
>

Since our device has a watchdog mechanism, when the device connects to
a broken accessory, the kernel panic will happen. This problem didn't happen
in all USB Hosts, so I want to use the quirk to fix this problem for those hosts
with a watchdog mechanism.

> > > > +              * Some USB hosts can't take a long time to keep doing enumeration
> > > > +              * retry. After doing half of the retries, we would turn off the port
> > > > +              * power to stop enumeration if the quirk is set.
> > >
> > > What made you decide that half of the retries was the right place to
> > > stop?  Why not do all the retries?
> >
> > Since some normal devices will be timeout in the first attempt, I set
> > the condition to half
> > of the retries. All the retries will take 12*timeout seconds. It is
> > too long so that a watchdog
> > timeout problem may happen.
>
> Why not set CONFIG_USB_FEW_INIT_RETRIES instead?
>

https://source.android.com/docs/core/architecture/kernel/android-common
According to Android Common Kernel, I can't only add this config to one project.
In addition, it can't stop enumeration so that the timeout problem
still happens.

> > > If the quirk prevented enumeration completely then this function
> > > wouldn't be needed.
> >
> > The enumeration is still needed as above.
> >
> > >
> > > > +
> > > >  /* Check if a port is power on */
> > > >  int usb_port_is_power_on(struct usb_hub *hub, unsigned int portstatus)
> > > >  {
> > > > @@ -4855,6 +4879,11 @@ hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1,
> > > >                                       buf->bMaxPacketSize0;
> > > >                       kfree(buf);
> > > >
> > > > +                     if (r < 0 && (port_dev->quirks & USB_PORT_QUIRK_STOP_ENUM)) {
> > >
> > > How come this line tests the quirk but doesn't call
> > > hub_port_stop_enumerate()?
> >
> > Since the quirk is used to stop enumeration and reduce the total time.
> > If the port has the quirk, I think the port doesn't need to do
> > set_address after the port gets
> > failures in the new scheme. It will add 2 attempts * timeout (defined
> > in hc_driver) seconds.
>
> I still can't tell what you're trying to accomplish.  You need to do a
> much better job of explaining the point of this.  For instance, you
> might describe in detail a situation where the quirk is needed,
> explaining what sort of behavior of the system would lead you to set the
> quirk, and why.
>

There is a kernel panic when the device connects to the broken accessory.
I tried to modify the initial_descriptor_timeout. When the accessory is not
working, the total time is 6.5s (get descriptor retry) + 5*2 seconds
(set address of xhci timeout).
The time is so long to cause kernel panic for the device. This is why I want to
stop enumeration instead reducing the retries or timeout.

[16433.648337] usb 2-1: reset full-speed USB device number 2 using
xhci-hcd-exynos
[16435.311614] usb 2-1: device descriptor read/64, error -110
[16437.103767] usb 2-1: device descriptor read/64, error -110
[16437.339768] usb 2-1: reset full-speed USB device number 2 using
xhci-hcd-exynos
[16439.023868] usb 2-1: device descriptor read/64, error -110
[16440.815597] usb 2-1: device descriptor read/64, error -110
[16441.051575] usb 2-1: reset full-speed USB device number 2 using
xhci-hcd-exynos
[16446.063656] xhci-hcd-exynos xhci-hcd-exynos.4.auto: Timeout while
waiting for setup device command
[16446.953961] Kernel panic - not syncing: PM suspend timeout
[16446.978122] Workqueue: events_unbound async_run_entry_fn.cfi_jt
[16446.978136] Call trace:
[16446.978150]  __switch_to+0x260/0x4dc
[16446.978165]  __schedule+0x6c4/0xabc
[16446.978181]  schedule+0x12c/0x24c
[16446.978195]  schedule_timeout+0x48/0x138
[16446.978210]  wait_for_common+0x148/0x310
[16446.978238]  xhci_setup_device+0x470/0xe30
[16446.978250]  xhci_address_device+0x18/0x28
[16446.978340]  hub_port_init+0x5a0/0xfec
[16446.978356]  usb_reset_and_verify_device+0x710/0xb8c
[16446.978370]  usb_port_resume+0x5a8/0x780
[16446.978393]  usb_generic_driver_resume+0x28/0x60
[16446.978413]  usb_resume_both+0x16c/0x474
[16446.978436]  usb_dev_resume+0x2c/0x84
[16446.978446]  dpm_run_callback+0x50/0x250
[16446.978459]  device_resume+0x250/0x2f8
[16446.978473]  async_resume+0x28/0x12c
[16446.978492]  async_run_entry_fn+0x6c/0x3dc
[16446.978516]  process_one_work+0x24c/0x5bc
[16446.978530]  worker_thread+0x3e8/0xa50
[16446.978547]  kthread+0x150/0x1b4
[16446.978563]  ret_from_fork+0x10/0x30

> Alan Stern

Thanks,
Ray
Alan Stern Sept. 5, 2022, 3:18 p.m. UTC | #5
On Mon, Sep 05, 2022 at 04:36:16PM +0800, Ray Chi wrote:
> On Sat, Sep 3, 2022 at 1:07 AM Alan Stern <stern@rowland.harvard.edu> wrote:
> > I don't understand.  If you don't know whether the accessory is broken,
> > how do you know whether to set the quirk?
> >
> > On the other hand, if you always set the quirk even before you know
> > whether the accessory is broken, why make it a quirk at all?  Why not
> > make it the normal behavior of the driver?
> >
> 
> Since our device has a watchdog mechanism, when the device connects to
> a broken accessory, the kernel panic will happen. This problem didn't happen
> in all USB Hosts, so I want to use the quirk to fix this problem for those hosts
> with a watchdog mechanism.

Okay.  So this shouldn't be a quirk; it should apply all the time to any 
hub where the host controller has this watchdog mechanism.

> > Why not set CONFIG_USB_FEW_INIT_RETRIES instead?
> >
> 
> https://source.android.com/docs/core/architecture/kernel/android-common
> According to Android Common Kernel, I can't only add this config to one project.
> In addition, it can't stop enumeration so that the timeout problem
> still happens.

This is the first time you have mentioned either the watchdog mechanism 
or the fact that this is intended for Android.  It would have been a lot 
better if both of these facts were included in the initial patch 
description.  You can't expect people to evaluate a new patch properly 
if they don't have a clear picture of what it was meant for.

> > might describe in detail a situation where the quirk is needed,
> > explaining what sort of behavior of the system would lead you to set the
> > quirk, and why.
> >
> 
> There is a kernel panic when the device connects to the broken accessory.
> I tried to modify the initial_descriptor_timeout. When the accessory is not
> working, the total time is 6.5s (get descriptor retry) + 5*2 seconds
> (set address of xhci timeout).
> The time is so long to cause kernel panic for the device. This is why I want to
> stop enumeration instead reducing the retries or timeout.

It sounds like what you need is a "quick initialization" option that 
will limit the timeout lengths and the numbers of retries, and will 
cause the system to ignore connections on a port once an initialization 
has failed.  There should also be a way to make the system stop ignoring 
a port, perhaps by writing to a sysfs file.

In addition, there should be an automatic algorithm to determine which 
hub ports this option will apply to.  I don't think you want it to be 
based on a quirk, because you shouldn't need to wait for a kernel panic 
before realizing that the quirk is needed -- that's why the algorithm 
has to be automatic.

Can you write a new patch that works more like this?

Alan Stern
Ray Chi Sept. 8, 2022, 11:11 a.m. UTC | #6
On Mon, Sep 5, 2022 at 11:18 PM Alan Stern <stern@rowland.harvard.edu> wrote:
>
> On Mon, Sep 05, 2022 at 04:36:16PM +0800, Ray Chi wrote:
> > On Sat, Sep 3, 2022 at 1:07 AM Alan Stern <stern@rowland.harvard.edu> wrote:
> > > I don't understand.  If you don't know whether the accessory is broken,
> > > how do you know whether to set the quirk?
> > >
> > > On the other hand, if you always set the quirk even before you know
> > > whether the accessory is broken, why make it a quirk at all?  Why not
> > > make it the normal behavior of the driver?
> > >
> >
> > Since our device has a watchdog mechanism, when the device connects to
> > a broken accessory, the kernel panic will happen. This problem didn't happen
> > in all USB Hosts, so I want to use the quirk to fix this problem for those hosts
> > with a watchdog mechanism.
>
> Okay.  So this shouldn't be a quirk; it should apply all the time to any
> hub where the host controller has this watchdog mechanism.
>
> > > Why not set CONFIG_USB_FEW_INIT_RETRIES instead?
> > >
> >
> > https://source.android.com/docs/core/architecture/kernel/android-common
> > According to Android Common Kernel, I can't only add this config to one project.
> > In addition, it can't stop enumeration so that the timeout problem
> > still happens.
>
> This is the first time you have mentioned either the watchdog mechanism
> or the fact that this is intended for Android.  It would have been a lot
> better if both of these facts were included in the initial patch
> description.  You can't expect people to evaluate a new patch properly
> if they don't have a clear picture of what it was meant for.
>
> > > might describe in detail a situation where the quirk is needed,
> > > explaining what sort of behavior of the system would lead you to set the
> > > quirk, and why.
> > >
> >
> > There is a kernel panic when the device connects to the broken accessory.
> > I tried to modify the initial_descriptor_timeout. When the accessory is not
> > working, the total time is 6.5s (get descriptor retry) + 5*2 seconds
> > (set address of xhci timeout).
> > The time is so long to cause kernel panic for the device. This is why I want to
> > stop enumeration instead reducing the retries or timeout.
>
> It sounds like what you need is a "quick initialization" option that
> will limit the timeout lengths and the numbers of retries, and will
> cause the system to ignore connections on a port once an initialization
> has failed.  There should also be a way to make the system stop ignoring
> a port, perhaps by writing to a sysfs file.
>

I will remove the quirk and use the sysfs file to do.

> In addition, there should be an automatic algorithm to determine which
> hub ports this option will apply to.  I don't think you want it to be
> based on a quirk, because you shouldn't need to wait for a kernel panic
> before realizing that the quirk is needed -- that's why the algorithm
> has to be automatic.
>
> Can you write a new patch that works more like this?
>

I had two ideas to determine whether the port should apply the option or not.
One is a timeout and the other one is using a retry count. If using
the retry count,
I think it is close to current retry definitions. Maybe we can modify
the design.
If I use the timeout, I need more time to think of a better way to do it for the
synchronization problem. Currently, they are rough ideas. I will
upload the commit
if I have a better solution to determine the behavior automatically.

I will upload a v3 patch using sysfs file to fix the current problem.

> Alan Stern

Thanks,
Ray
diff mbox series

Patch

diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index 2633acde7ac1..7f34ee8bb81e 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -3081,6 +3081,30 @@  static int hub_port_reset(struct usb_hub *hub, int port1,
 	return status;
 }
 
+/* Stop enumerate if the port met errors and quirk is set */
+static bool hub_port_stop_enumerate(struct usb_hub *hub, int port1, int retries)
+{
+	struct usb_port *port_dev = hub->ports[port1 - 1];
+
+	if (port_dev->quirks & USB_PORT_QUIRK_STOP_ENUM) {
+		if (port_dev->ignore_connect)
+			return true;
+
+		if (retries < (PORT_INIT_TRIES - 1) / 2)
+			return false;
+
+		/*
+		 * Some USB hosts can't take a long time to keep doing enumeration
+		 * retry. After doing half of the retries, we would turn off the port
+		 * power to stop enumeration if the quirk is set.
+		 */
+		port_dev->ignore_connect = true;
+	} else
+		port_dev->ignore_connect = false;
+
+	return port_dev->ignore_connect;
+}
+
 /* Check if a port is power on */
 int usb_port_is_power_on(struct usb_hub *hub, unsigned int portstatus)
 {
@@ -4855,6 +4879,11 @@  hub_port_init(struct usb_hub *hub, struct usb_device *udev, int port1,
 					buf->bMaxPacketSize0;
 			kfree(buf);
 
+			if (r < 0 && (port_dev->quirks & USB_PORT_QUIRK_STOP_ENUM)) {
+				retval = r;
+				goto fail;
+			}
+
 			retval = hub_port_reset(hub, port1, udev, delay, false);
 			if (retval < 0)		/* error or disconnect */
 				goto fail;
@@ -5387,6 +5416,9 @@  static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus,
 		if ((status == -ENOTCONN) || (status == -ENOTSUPP))
 			break;
 
+		if (hub_port_stop_enumerate(hub, port1, i))
+			break;
+
 		/* When halfway through our retry count, power-cycle the port */
 		if (i == (PORT_INIT_TRIES - 1) / 2) {
 			dev_info(&port_dev->dev, "attempt power cycle\n");
@@ -5550,6 +5582,9 @@  static void port_event(struct usb_hub *hub, int port1)
 	if (usb_hub_port_status(hub, port1, &portstatus, &portchange) < 0)
 		return;
 
+	if (hub_port_stop_enumerate(hub, port1, 0))
+		return;
+
 	if (portchange & USB_PORT_STAT_C_CONNECTION) {
 		usb_clear_port_feature(hdev, port1, USB_PORT_FEAT_C_CONNECTION);
 		connect_change = 1;
@@ -5934,6 +5969,9 @@  static int usb_reset_and_verify_device(struct usb_device *udev)
 		ret = hub_port_init(parent_hub, udev, port1, i);
 		if (ret >= 0 || ret == -ENOTCONN || ret == -ENODEV)
 			break;
+
+		if (hub_port_stop_enumerate(parent_hub, port1, i))
+			goto stop_enumerate;
 	}
 	mutex_unlock(hcd->address0_mutex);
 
@@ -6022,6 +6060,8 @@  static int usb_reset_and_verify_device(struct usb_device *udev)
 	udev->bos = bos;
 	return 0;
 
+stop_enumerate:
+	mutex_unlock(hcd->address0_mutex);
 re_enumerate:
 	usb_release_bos_descriptor(udev);
 	udev->bos = bos;
diff --git a/drivers/usb/core/hub.h b/drivers/usb/core/hub.h
index b2925856b4cb..f0aa718f4c7f 100644
--- a/drivers/usb/core/hub.h
+++ b/drivers/usb/core/hub.h
@@ -90,6 +90,7 @@  struct usb_hub {
  * @is_superspeed cache super-speed status
  * @usb3_lpm_u1_permit: whether USB3 U1 LPM is permitted.
  * @usb3_lpm_u2_permit: whether USB3 U2 LPM is permitted.
+ * @ignore_connect: ignore the connection or not
  */
 struct usb_port {
 	struct usb_device *child;
@@ -103,6 +104,7 @@  struct usb_port {
 	u32 over_current_count;
 	u8 portnum;
 	u32 quirks;
+	bool ignore_connect;
 	unsigned int is_superspeed:1;
 	unsigned int usb3_lpm_u1_permit:1;
 	unsigned int usb3_lpm_u2_permit:1;
diff --git a/include/linux/usb.h b/include/linux/usb.h
index f7a9914fc97f..fc0fef58c706 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -490,6 +490,9 @@  enum usb_port_connect_type {
 /* Decrease TRSTRCY to 10ms during device enumeration. */
 #define USB_PORT_QUIRK_FAST_ENUM	BIT(1)
 
+/* Stop the enumeration for the given port if there are too many failures*/
+#define USB_PORT_QUIRK_STOP_ENUM	BIT(2)
+
 /*
  * USB 2.0 Link Power Management (LPM) parameters.
  */