diff mbox series

[v2,2/2] Revert "usb: dwc3: Don't switch OTG -> peripheral if extcon is present"

Message ID 20220927155332.10762-3-andriy.shevchenko@linux.intel.com (mailing list archive)
State Accepted
Commit 7a84e7353e23202d4f82b05093af4db2b26e6768
Headers show
Series usb: dwc3: revert OTG changes for Intel Merrifield | expand

Commit Message

Andy Shevchenko Sept. 27, 2022, 3:53 p.m. UTC
This reverts commit 0f01017191384e3962fa31520a9fd9846c3d352f.

As pointed out by Ferry this breaks Dual Role support on
Intel Merrifield platforms.

Fixes: 0f0101719138 ("usb: dwc3: Don't switch OTG -> peripheral if extcon is present")
Reported-by: Ferry Toth <fntoth@gmail.com>
Cc: stable@vger.kernel.org
Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Tested-by: Ferry Toth <fntoth@gmail.com> # for Merrifield
---
 drivers/usb/dwc3/core.c | 55 +----------------------------------------
 drivers/usb/dwc3/drd.c  | 50 +++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 54 deletions(-)

Comments

Andrey Smirnov Sept. 29, 2022, 3:01 a.m. UTC | #1
On Tue, Sep 27, 2022 at 8:53 AM Andy Shevchenko
<andriy.shevchenko@linux.intel.com> wrote:
>
> This reverts commit 0f01017191384e3962fa31520a9fd9846c3d352f.
>
> As pointed out by Ferry this breaks Dual Role support on
> Intel Merrifield platforms.
>
> Fixes: 0f0101719138 ("usb: dwc3: Don't switch OTG -> peripheral if extcon is present")
> Reported-by: Ferry Toth <fntoth@gmail.com>
> Cc: stable@vger.kernel.org
> Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
> Tested-by: Ferry Toth <fntoth@gmail.com> # for Merrifield

Sven can you check that this also fixes the regression of your fix?

> ---
>  drivers/usb/dwc3/core.c | 55 +----------------------------------------
>  drivers/usb/dwc3/drd.c  | 50 +++++++++++++++++++++++++++++++++++++
>  2 files changed, 51 insertions(+), 54 deletions(-)
>
> diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c
> index c2b463469d51..219d797e2230 100644
> --- a/drivers/usb/dwc3/core.c
> +++ b/drivers/usb/dwc3/core.c
> @@ -23,7 +23,6 @@
>  #include <linux/delay.h>
>  #include <linux/dma-mapping.h>
>  #include <linux/of.h>
> -#include <linux/of_graph.h>
>  #include <linux/acpi.h>
>  #include <linux/pinctrl/consumer.h>
>  #include <linux/reset.h>
> @@ -86,7 +85,7 @@ static int dwc3_get_dr_mode(struct dwc3 *dwc)
>                  * mode. If the controller supports DRD but the dr_mode is not
>                  * specified or set to OTG, then set the mode to peripheral.
>                  */
> -               if (mode == USB_DR_MODE_OTG && !dwc->edev &&
> +               if (mode == USB_DR_MODE_OTG &&
>                     (!IS_ENABLED(CONFIG_USB_ROLE_SWITCH) ||
>                      !device_property_read_bool(dwc->dev, "usb-role-switch")) &&
>                     !DWC3_VER_IS_PRIOR(DWC3, 330A))
> @@ -1668,51 +1667,6 @@ static void dwc3_check_params(struct dwc3 *dwc)
>         }
>  }
>
> -static struct extcon_dev *dwc3_get_extcon(struct dwc3 *dwc)
> -{
> -       struct device *dev = dwc->dev;
> -       struct device_node *np_phy;
> -       struct extcon_dev *edev = NULL;
> -       const char *name;
> -
> -       if (device_property_read_bool(dev, "extcon"))
> -               return extcon_get_edev_by_phandle(dev, 0);
> -
> -       /*
> -        * Device tree platforms should get extcon via phandle.
> -        * On ACPI platforms, we get the name from a device property.
> -        * This device property is for kernel internal use only and
> -        * is expected to be set by the glue code.
> -        */
> -       if (device_property_read_string(dev, "linux,extcon-name", &name) == 0) {
> -               edev = extcon_get_extcon_dev(name);
> -               if (!edev)
> -                       return ERR_PTR(-EPROBE_DEFER);
> -
> -               return edev;
> -       }
> -
> -       /*
> -        * Try to get an extcon device from the USB PHY controller's "port"
> -        * node. Check if it has the "port" node first, to avoid printing the
> -        * error message from underlying code, as it's a valid case: extcon
> -        * device (and "port" node) may be missing in case of "usb-role-switch"
> -        * or OTG mode.
> -        */
> -       np_phy = of_parse_phandle(dev->of_node, "phys", 0);
> -       if (of_graph_is_present(np_phy)) {
> -               struct device_node *np_conn;
> -
> -               np_conn = of_graph_get_remote_node(np_phy, -1, -1);
> -               if (np_conn)
> -                       edev = extcon_find_edev_by_node(np_conn);
> -               of_node_put(np_conn);
> -       }
> -       of_node_put(np_phy);
> -
> -       return edev;
> -}
> -
>  static int dwc3_probe(struct platform_device *pdev)
>  {
>         struct device           *dev = &pdev->dev;
> @@ -1849,13 +1803,6 @@ static int dwc3_probe(struct platform_device *pdev)
>                 goto err2;
>         }
>
> -       dwc->edev = dwc3_get_extcon(dwc);
> -       if (IS_ERR(dwc->edev)) {
> -               ret = PTR_ERR(dwc->edev);
> -               dev_err_probe(dwc->dev, ret, "failed to get extcon\n");
> -               goto err3;
> -       }
> -
>         ret = dwc3_get_dr_mode(dwc);
>         if (ret)
>                 goto err3;
> diff --git a/drivers/usb/dwc3/drd.c b/drivers/usb/dwc3/drd.c
> index 039bf241769a..8cad9e7d3368 100644
> --- a/drivers/usb/dwc3/drd.c
> +++ b/drivers/usb/dwc3/drd.c
> @@ -8,6 +8,7 @@
>   */
>
>  #include <linux/extcon.h>
> +#include <linux/of_graph.h>
>  #include <linux/of_platform.h>
>  #include <linux/platform_device.h>
>  #include <linux/property.h>
> @@ -438,6 +439,51 @@ static int dwc3_drd_notifier(struct notifier_block *nb,
>         return NOTIFY_DONE;
>  }
>
> +static struct extcon_dev *dwc3_get_extcon(struct dwc3 *dwc)
> +{
> +       struct device *dev = dwc->dev;
> +       struct device_node *np_phy;
> +       struct extcon_dev *edev = NULL;
> +       const char *name;
> +
> +       if (device_property_read_bool(dev, "extcon"))
> +               return extcon_get_edev_by_phandle(dev, 0);
> +
> +       /*
> +        * Device tree platforms should get extcon via phandle.
> +        * On ACPI platforms, we get the name from a device property.
> +        * This device property is for kernel internal use only and
> +        * is expected to be set by the glue code.
> +        */
> +       if (device_property_read_string(dev, "linux,extcon-name", &name) == 0) {
> +               edev = extcon_get_extcon_dev(name);
> +               if (!edev)
> +                       return ERR_PTR(-EPROBE_DEFER);
> +
> +               return edev;
> +       }
> +
> +       /*
> +        * Try to get an extcon device from the USB PHY controller's "port"
> +        * node. Check if it has the "port" node first, to avoid printing the
> +        * error message from underlying code, as it's a valid case: extcon
> +        * device (and "port" node) may be missing in case of "usb-role-switch"
> +        * or OTG mode.
> +        */
> +       np_phy = of_parse_phandle(dev->of_node, "phys", 0);
> +       if (of_graph_is_present(np_phy)) {
> +               struct device_node *np_conn;
> +
> +               np_conn = of_graph_get_remote_node(np_phy, -1, -1);
> +               if (np_conn)
> +                       edev = extcon_find_edev_by_node(np_conn);
> +               of_node_put(np_conn);
> +       }
> +       of_node_put(np_phy);
> +
> +       return edev;
> +}
> +
>  #if IS_ENABLED(CONFIG_USB_ROLE_SWITCH)
>  #define ROLE_SWITCH 1
>  static int dwc3_usb_role_switch_set(struct usb_role_switch *sw,
> @@ -542,6 +588,10 @@ int dwc3_drd_init(struct dwc3 *dwc)
>             device_property_read_bool(dwc->dev, "usb-role-switch"))
>                 return dwc3_setup_role_switch(dwc);
>
> +       dwc->edev = dwc3_get_extcon(dwc);
> +       if (IS_ERR(dwc->edev))
> +               return PTR_ERR(dwc->edev);
> +
>         if (dwc->edev) {
>                 dwc->edev_nb.notifier_call = dwc3_drd_notifier;
>                 ret = extcon_register_notifier(dwc->edev, EXTCON_USB_HOST,
> --
> 2.35.1
>
Sven Peter Sept. 29, 2022, 8:47 a.m. UTC | #2
On Thu, Sep 29, 2022, at 05:01, Andrey Smirnov wrote:
> On Tue, Sep 27, 2022 at 8:53 AM Andy Shevchenko
> <andriy.shevchenko@linux.intel.com> wrote:
>>
>> This reverts commit 0f01017191384e3962fa31520a9fd9846c3d352f.
>>
>> As pointed out by Ferry this breaks Dual Role support on
>> Intel Merrifield platforms.
>>
>> Fixes: 0f0101719138 ("usb: dwc3: Don't switch OTG -> peripheral if extcon is present")
>> Reported-by: Ferry Toth <fntoth@gmail.com>
>> Cc: stable@vger.kernel.org
>> Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
>> Tested-by: Ferry Toth <fntoth@gmail.com> # for Merrifield
>
> Sven can you check that this also fixes the regression of your fix?

I'll only have time to test this on the weekend but the fix looks good to me.

Reviewed-by: Sven Peter <sven@svenpeter.dev>


Sven
Thinh Nguyen Oct. 3, 2022, 9:57 p.m. UTC | #3
Hi Andy,

On Tue, Sep 27, 2022, Andy Shevchenko wrote:
> This reverts commit 0f01017191384e3962fa31520a9fd9846c3d352f.
> 
> As pointed out by Ferry this breaks Dual Role support on
> Intel Merrifield platforms.

Can you provide more info than this (any debug info/description)? It
will be difficult to come back to fix with just this to go on. The
reverted patch was needed to fix a different issue.

Thanks,
Thinh

> 
> Fixes: 0f0101719138 ("usb: dwc3: Don't switch OTG -> peripheral if extcon is present")
> Reported-by: Ferry Toth <fntoth@gmail.com>
> Cc: stable@vger.kernel.org
> Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
> Tested-by: Ferry Toth <fntoth@gmail.com> # for Merrifield
> ---
>  drivers/usb/dwc3/core.c | 55 +----------------------------------------
>  drivers/usb/dwc3/drd.c  | 50 +++++++++++++++++++++++++++++++++++++
>  2 files changed, 51 insertions(+), 54 deletions(-)
Andy Shevchenko Oct. 4, 2022, 8:28 a.m. UTC | #4
On Mon, Oct 03, 2022 at 09:57:35PM +0000, Thinh Nguyen wrote:
> On Tue, Sep 27, 2022, Andy Shevchenko wrote:
> > This reverts commit 0f01017191384e3962fa31520a9fd9846c3d352f.
> > 
> > As pointed out by Ferry this breaks Dual Role support on
> > Intel Merrifield platforms.
> 
> Can you provide more info than this (any debug info/description)? It
> will be difficult to come back to fix with just this to go on. The
> reverted patch was needed to fix a different issue.

It's already applied, but Ferry probably can provide more info if you describe
step-by-step instructions. (I'm still unable to test this particular type of
features since remove access is always in host mode.)
Ferry Toth Oct. 4, 2022, 7:14 p.m. UTC | #5
Hi

Op 04-10-2022 om 10:28 schreef Andy Shevchenko:
> On Mon, Oct 03, 2022 at 09:57:35PM +0000, Thinh Nguyen wrote:
>> On Tue, Sep 27, 2022, Andy Shevchenko wrote:
>>> This reverts commit 0f01017191384e3962fa31520a9fd9846c3d352f.
>>>
>>> As pointed out by Ferry this breaks Dual Role support on
>>> Intel Merrifield platforms.
>> Can you provide more info than this (any debug info/description)? It
>> will be difficult to come back to fix with just this to go on. The
>> reverted patch was needed to fix a different issue.

On Merrifield we have a switch with extcon driver to switch between host 
and device mode. Now with commit 0f01017, device mode works. In host 
mode the root hub appears, but no devices appear. In the logs there are 
no messages from tusb1210, but there should be because lately there 
normally are (harmless) error messages. Nothing in the logs point in the 
direction of tusb1210 not being probed.

The discussion is here: https://lkml.org/lkml/2022/9/24/237

I tried moving some code as suggested without result: 
https://lkml.org/lkml/2022/9/24/434

And with success: https://lkml.org/lkml/2022/9/25/285

So, as Andrey Smirnov writes "I think we'd want to figure out why the 
ordering is important if we want to justify the above fix."

> It's already applied, but Ferry probably can provide more info if you describe
> step-by-step instructions. (I'm still unable to test this particular type of
> features since remove access is always in host mode.)
>
I'd be happy to test.
Thinh Nguyen Oct. 5, 2022, 2:12 a.m. UTC | #6
On Tue, Oct 04, 2022, Ferry Toth wrote:
> Hi
> 
> Op 04-10-2022 om 10:28 schreef Andy Shevchenko:
> > On Mon, Oct 03, 2022 at 09:57:35PM +0000, Thinh Nguyen wrote:
> > > On Tue, Sep 27, 2022, Andy Shevchenko wrote:
> > > > This reverts commit 0f01017191384e3962fa31520a9fd9846c3d352f.
> > > > 
> > > > As pointed out by Ferry this breaks Dual Role support on
> > > > Intel Merrifield platforms.
> > > Can you provide more info than this (any debug info/description)? It
> > > will be difficult to come back to fix with just this to go on. The
> > > reverted patch was needed to fix a different issue.
> 
> On Merrifield we have a switch with extcon driver to switch between host and
> device mode. Now with commit 0f01017, device mode works. In host mode the
> root hub appears, but no devices appear. In the logs there are no messages
> from tusb1210, but there should be because lately there normally are
> (harmless) error messages. Nothing in the logs point in the direction of
> tusb1210 not being probed.
> 
> The discussion is here: https://urldefense.com/v3/__https://lkml.org/lkml/2022/9/24/237__;!!A4F2R9G_pg!avfDjiGwi8xu0grHYrQQTZEEUnmaKu82xxdty0ZltxyU8BkoFD6AMq4a-7STYiKxNQpdYXgP1QG_IZbroEM$
> 
> I tried moving some code as suggested without result: https://urldefense.com/v3/__https://lkml.org/lkml/2022/9/24/434__;!!A4F2R9G_pg!avfDjiGwi8xu0grHYrQQTZEEUnmaKu82xxdty0ZltxyU8BkoFD6AMq4a-7STYiKxNQpdYXgP1QG_boaK8Qw$
> 
> And with success: https://urldefense.com/v3/__https://lkml.org/lkml/2022/9/25/285__;!!A4F2R9G_pg!avfDjiGwi8xu0grHYrQQTZEEUnmaKu82xxdty0ZltxyU8BkoFD6AMq4a-7STYiKxNQpdYXgP1QG_MbbbZII$
> 
> So, as Andrey Smirnov writes "I think we'd want to figure out why the
> ordering is important if we want to justify the above fix."
> 
> > It's already applied, but Ferry probably can provide more info if you describe
> > step-by-step instructions. (I'm still unable to test this particular type of
> > features since remove access is always in host mode.)
> > 
> I'd be happy to test.

Thanks!

Does the failure only happen the first time host is initialized? Or can
it recover after switching to device then back to host mode?

Probably the failure happens if some step(s) in dwc3_core_init() hasn't
completed.

tusb1210 is a phy driver right? The issue is probably because we didn't
initialize the phy yet. So, I suspect placing dwc3_get_extcon() after
initializing the phy will probably solve the dependency problem.

You can try something for yourself or I can provide something to test
later if you don't mind (maybe next week if it's ok).

Thanks,
Thinh
Andrey Smirnov Oct. 5, 2022, 2:39 a.m. UTC | #7
On Tue, Oct 4, 2022 at 7:12 PM Thinh Nguyen <Thinh.Nguyen@synopsys.com> wrote:
>
> On Tue, Oct 04, 2022, Ferry Toth wrote:
> > Hi
> >
> > Op 04-10-2022 om 10:28 schreef Andy Shevchenko:
> > > On Mon, Oct 03, 2022 at 09:57:35PM +0000, Thinh Nguyen wrote:
> > > > On Tue, Sep 27, 2022, Andy Shevchenko wrote:
> > > > > This reverts commit 0f01017191384e3962fa31520a9fd9846c3d352f.
> > > > >
> > > > > As pointed out by Ferry this breaks Dual Role support on
> > > > > Intel Merrifield platforms.
> > > > Can you provide more info than this (any debug info/description)? It
> > > > will be difficult to come back to fix with just this to go on. The
> > > > reverted patch was needed to fix a different issue.
> >
> > On Merrifield we have a switch with extcon driver to switch between host and
> > device mode. Now with commit 0f01017, device mode works. In host mode the
> > root hub appears, but no devices appear. In the logs there are no messages
> > from tusb1210, but there should be because lately there normally are
> > (harmless) error messages. Nothing in the logs point in the direction of
> > tusb1210 not being probed.
> >
> > The discussion is here: https://urldefense.com/v3/__https://lkml.org/lkml/2022/9/24/237__;!!A4F2R9G_pg!avfDjiGwi8xu0grHYrQQTZEEUnmaKu82xxdty0ZltxyU8BkoFD6AMq4a-7STYiKxNQpdYXgP1QG_IZbroEM$
> >
> > I tried moving some code as suggested without result: https://urldefense.com/v3/__https://lkml.org/lkml/2022/9/24/434__;!!A4F2R9G_pg!avfDjiGwi8xu0grHYrQQTZEEUnmaKu82xxdty0ZltxyU8BkoFD6AMq4a-7STYiKxNQpdYXgP1QG_boaK8Qw$
> >
> > And with success: https://urldefense.com/v3/__https://lkml.org/lkml/2022/9/25/285__;!!A4F2R9G_pg!avfDjiGwi8xu0grHYrQQTZEEUnmaKu82xxdty0ZltxyU8BkoFD6AMq4a-7STYiKxNQpdYXgP1QG_MbbbZII$
> >
> > So, as Andrey Smirnov writes "I think we'd want to figure out why the
> > ordering is important if we want to justify the above fix."
> >
> > > It's already applied, but Ferry probably can provide more info if you describe
> > > step-by-step instructions. (I'm still unable to test this particular type of
> > > features since remove access is always in host mode.)
> > >
> > I'd be happy to test.
>
> Thanks!
>
> Does the failure only happen the first time host is initialized? Or can
> it recover after switching to device then back to host mode?
>
> Probably the failure happens if some step(s) in dwc3_core_init() hasn't
> completed.
>
> tusb1210 is a phy driver right? The issue is probably because we didn't
> initialize the phy yet. So, I suspect placing dwc3_get_extcon() after
> initializing the phy will probably solve the dependency problem.
>
> You can try something for yourself or I can provide something to test
> later if you don't mind (maybe next week if it's ok).

FWIW, I just got the same HW Ferry has last week and am planning to
work on this over the weekend.
Andy Shevchenko Oct. 5, 2022, 8:44 a.m. UTC | #8
On Wed, Oct 05, 2022 at 10:10:42AM +0200, Ferry Toth wrote:
> On 05-10-2022 04:39, Andrey Smirnov wrote:
> > On Tue, Oct 4, 2022 at 7:12 PM Thinh Nguyen<Thinh.Nguyen@synopsys.com>  wrote:

...

> > FWIW, I just got the same HW Ferry has last week and am planning to
> > work on this over the weekend.
> I can help you setup, we have binary images available on github as well as
> Yocto recipies to build them.

Also you can build all components (U-Boot, kernel, Buildroot initrd) separately
as explained here:
https://edison-fw.github.io/edison-wiki/u-boot-update
https://edison-fw.github.io/edison-wiki/vanilla
https://github.com/andy-shev/buildroot/tree/intel/board/intel/common
Ferry Toth Oct. 5, 2022, 8:45 a.m. UTC | #9
Hi

On 05-10-2022 04:39, Andrey Smirnov wrote:
> On Tue, Oct 4, 2022 at 7:12 PM Thinh Nguyen<Thinh.Nguyen@synopsys.com>  wrote:
>> On Tue, Oct 04, 2022, Ferry Toth wrote:
>>> Hi
>>>
>>> Op 04-10-2022 om 10:28 schreef Andy Shevchenko:
>>>> On Mon, Oct 03, 2022 at 09:57:35PM +0000, Thinh Nguyen wrote:
>>>>> On Tue, Sep 27, 2022, Andy Shevchenko wrote:
>>>>>> This reverts commit 0f01017191384e3962fa31520a9fd9846c3d352f.
>>>>>>
>>>>>> As pointed out by Ferry this breaks Dual Role support on
>>>>>> Intel Merrifield platforms.
>>>>> Can you provide more info than this (any debug info/description)? It
>>>>> will be difficult to come back to fix with just this to go on. The
>>>>> reverted patch was needed to fix a different issue.
>>> On Merrifield we have a switch with extcon driver to switch between host and
>>> device mode. Now with commit 0f01017, device mode works. In host mode the
>>> root hub appears, but no devices appear. In the logs there are no messages
>>> from tusb1210, but there should be because lately there normally are
>>> (harmless) error messages. Nothing in the logs point in the direction of
>>> tusb1210 not being probed.
>>>
>>> The discussion is here:https://urldefense.com/v3/__https://lkml.org/lkml/2022/9/24/237__;!!A4F2R9G_pg!avfDjiGwi8xu0grHYrQQTZEEUnmaKu82xxdty0ZltxyU8BkoFD6AMq4a-7STYiKxNQpdYXgP1QG_IZbroEM$
>>>
>>> I tried moving some code as suggested without result:https://urldefense.com/v3/__https://lkml.org/lkml/2022/9/24/434__;!!A4F2R9G_pg!avfDjiGwi8xu0grHYrQQTZEEUnmaKu82xxdty0ZltxyU8BkoFD6AMq4a-7STYiKxNQpdYXgP1QG_boaK8Qw$
>>>
>>> And with success:https://urldefense.com/v3/__https://lkml.org/lkml/2022/9/25/285__;!!A4F2R9G_pg!avfDjiGwi8xu0grHYrQQTZEEUnmaKu82xxdty0ZltxyU8BkoFD6AMq4a-7STYiKxNQpdYXgP1QG_MbbbZII$
>>>
>>> So, as Andrey Smirnov writes "I think we'd want to figure out why the
>>> ordering is important if we want to justify the above fix."
>>>
>>>> It's already applied, but Ferry probably can provide more info if you describe
>>>> step-by-step instructions. (I'm still unable to test this particular type of
>>>> features since remove access is always in host mode.)
>>>>
>>> I'd be happy to test.
>> Thanks!
>>
>> Does the failure only happen the first time host is initialized? Or can
>> it recover after switching to device then back to host mode?
>>
>> Probably the failure happens if some step(s) in dwc3_core_init() hasn't
>> completed.
>>
>> tusb1210 is a phy driver right? The issue is probably because we didn't
>> initialize the phy yet. So, I suspect placing dwc3_get_extcon() after
>> initializing the phy will probably solve the dependency problem.
>>
>> You can try something for yourself or I can provide something to test
>> later if you don't mind (maybe next week if it's ok).
> FWIW, I just got the same HW Ferry has last week and am planning to
> work on this over the weekend.
I can help you setup, we have binary images available on github as well 
as Yocto recipies to build them.
Thinh Nguyen Oct. 6, 2022, 2 a.m. UTC | #10
On Tue, Oct 04, 2022, Andrey Smirnov wrote:
> On Tue, Oct 4, 2022 at 7:12 PM Thinh Nguyen <Thinh.Nguyen@synopsys.com> wrote:
> >
> > On Tue, Oct 04, 2022, Ferry Toth wrote:
> > > Hi
> > >
> > > Op 04-10-2022 om 10:28 schreef Andy Shevchenko:
> > > > On Mon, Oct 03, 2022 at 09:57:35PM +0000, Thinh Nguyen wrote:
> > > > > On Tue, Sep 27, 2022, Andy Shevchenko wrote:
> > > > > > This reverts commit 0f01017191384e3962fa31520a9fd9846c3d352f.
> > > > > >
> > > > > > As pointed out by Ferry this breaks Dual Role support on
> > > > > > Intel Merrifield platforms.
> > > > > Can you provide more info than this (any debug info/description)? It
> > > > > will be difficult to come back to fix with just this to go on. The
> > > > > reverted patch was needed to fix a different issue.
> > >
> > > On Merrifield we have a switch with extcon driver to switch between host and
> > > device mode. Now with commit 0f01017, device mode works. In host mode the
> > > root hub appears, but no devices appear. In the logs there are no messages
> > > from tusb1210, but there should be because lately there normally are
> > > (harmless) error messages. Nothing in the logs point in the direction of
> > > tusb1210 not being probed.
> > >
> > > The discussion is here: https://urldefense.com/v3/__https://lkml.org/lkml/2022/9/24/237__;!!A4F2R9G_pg!avfDjiGwi8xu0grHYrQQTZEEUnmaKu82xxdty0ZltxyU8BkoFD6AMq4a-7STYiKxNQpdYXgP1QG_IZbroEM$
> > >
> > > I tried moving some code as suggested without result: https://urldefense.com/v3/__https://lkml.org/lkml/2022/9/24/434__;!!A4F2R9G_pg!avfDjiGwi8xu0grHYrQQTZEEUnmaKu82xxdty0ZltxyU8BkoFD6AMq4a-7STYiKxNQpdYXgP1QG_boaK8Qw$
> > >
> > > And with success: https://urldefense.com/v3/__https://lkml.org/lkml/2022/9/25/285__;!!A4F2R9G_pg!avfDjiGwi8xu0grHYrQQTZEEUnmaKu82xxdty0ZltxyU8BkoFD6AMq4a-7STYiKxNQpdYXgP1QG_MbbbZII$
> > >
> > > So, as Andrey Smirnov writes "I think we'd want to figure out why the
> > > ordering is important if we want to justify the above fix."
> > >
> > > > It's already applied, but Ferry probably can provide more info if you describe
> > > > step-by-step instructions. (I'm still unable to test this particular type of
> > > > features since remove access is always in host mode.)
> > > >
> > > I'd be happy to test.
> >
> > Thanks!
> >
> > Does the failure only happen the first time host is initialized? Or can
> > it recover after switching to device then back to host mode?
> >
> > Probably the failure happens if some step(s) in dwc3_core_init() hasn't
> > completed.
> >
> > tusb1210 is a phy driver right? The issue is probably because we didn't
> > initialize the phy yet. So, I suspect placing dwc3_get_extcon() after
> > initializing the phy will probably solve the dependency problem.
> >
> > You can try something for yourself or I can provide something to test
> > later if you don't mind (maybe next week if it's ok).
> 
> FWIW, I just got the same HW Ferry has last week and am planning to
> work on this over the weekend.

That'd be great. Thanks!

Thinh
Thinh Nguyen Oct. 6, 2022, 2:12 a.m. UTC | #11
On Wed, Oct 05, 2022, Ferry Toth wrote:
> Hi,
> 
>     Thanks!
> 
>     Does the failure only happen the first time host is initialized? Or can
>     it recover after switching to device then back to host mode?
> 
> I can switch back and forth and device mode works each time, host mode remains
> dead.

Ok.

> 
>     Probably the failure happens if some step(s) in dwc3_core_init() hasn't
>     completed.
> 
>     tusb1210 is a phy driver right? The issue is probably because we didn't
>     initialize the phy yet. So, I suspect placing dwc3_get_extcon() after
>     initializing the phy will probably solve the dependency problem.
> 
>     You can try something for yourself or I can provide something to test
>     later if you don't mind (maybe next week if it's ok).
> 
> Yes, the code move I mentioned above "moves dwc3_get_extcon() until after
> dwc3_core_init() but just before dwc3_core_init_mode(). AFAIU initially
> dwc3_get_extcon() was called from within dwc3_core_init_mode() but only for
> case USB_DR_MODE_OTG. So with this change order of events is more or less
> unchanged" solves the issue.
> 

I saw the experiment you did from the link you provided. We want to also
confirm exactly which step in dwc3_core_init() was needed.

Thanks,
Thinh
Ferry Toth Oct. 6, 2022, 12:28 p.m. UTC | #12
Hi

On 06-10-2022 04:12, Thinh Nguyen wrote:
> On Wed, Oct 05, 2022, Ferry Toth wrote:
>> Hi,
>>
>>      Thanks!
>>
>>      Does the failure only happen the first time host is initialized? Or can
>>      it recover after switching to device then back to host mode?
>>
>> I can switch back and forth and device mode works each time, host mode remains
>> dead.
> Ok.
>
>>      Probably the failure happens if some step(s) in dwc3_core_init() hasn't
>>      completed.
>>
>>      tusb1210 is a phy driver right? The issue is probably because we didn't
>>      initialize the phy yet. So, I suspect placing dwc3_get_extcon() after
>>      initializing the phy will probably solve the dependency problem.
>>
>>      You can try something for yourself or I can provide something to test
>>      later if you don't mind (maybe next week if it's ok).
>>
>> Yes, the code move I mentioned above "moves dwc3_get_extcon() until after
>> dwc3_core_init() but just before dwc3_core_init_mode(). AFAIU initially
>> dwc3_get_extcon() was called from within dwc3_core_init_mode() but only for
>> case USB_DR_MODE_OTG. So with this change order of events is more or less
>> unchanged" solves the issue.
>>
> I saw the experiment you did from the link you provided. We want to also
> confirm exactly which step in dwc3_core_init() was needed.

Ok. I first tried the code move suggested by Andrey (didn't work). Then 
after reading the actual code I moved a bit further.

This move was on top of -rc6 without any reverts. I did not make 
additional changes to dwc3_core_init()

So current v6.0 has: dwc3_get_extcon - dwc3_get_dr_mode - ... - 
dwc3_core_init - .. - dwc3_core_init_mode (not working)

I changed to: dwc3_get_dr_mode - dwc3_get_extcon - .. - dwc3_core_init - 
.. - dwc3_core_init_mode (no change)

Then to: dwc3_get_dr_mode - .. - dwc3_core_init - .. - dwc3_get_extcon - 
dwc3_core_init_mode (works)

.. are what I believe for this issue irrelevant calls to 
dwc3_alloc_scratch_buffers, dwc3_check_params and dwc3_debugfs_init.

>
> Thanks,
> Thinh
Thinh Nguyen Oct. 7, 2022, 2:11 a.m. UTC | #13
On Thu, Oct 06, 2022, Ferry Toth wrote:
> Hi
> 
> On 06-10-2022 04:12, Thinh Nguyen wrote:
> > On Wed, Oct 05, 2022, Ferry Toth wrote:
> > > Hi,
> > > 
> > >      Thanks!
> > > 
> > >      Does the failure only happen the first time host is initialized? Or can
> > >      it recover after switching to device then back to host mode?
> > > 
> > > I can switch back and forth and device mode works each time, host mode remains
> > > dead.
> > Ok.
> > 
> > >      Probably the failure happens if some step(s) in dwc3_core_init() hasn't
> > >      completed.
> > > 
> > >      tusb1210 is a phy driver right? The issue is probably because we didn't
> > >      initialize the phy yet. So, I suspect placing dwc3_get_extcon() after
> > >      initializing the phy will probably solve the dependency problem.
> > > 
> > >      You can try something for yourself or I can provide something to test
> > >      later if you don't mind (maybe next week if it's ok).
> > > 
> > > Yes, the code move I mentioned above "moves dwc3_get_extcon() until after
> > > dwc3_core_init() but just before dwc3_core_init_mode(). AFAIU initially
> > > dwc3_get_extcon() was called from within dwc3_core_init_mode() but only for
> > > case USB_DR_MODE_OTG. So with this change order of events is more or less
> > > unchanged" solves the issue.
> > > 
> > I saw the experiment you did from the link you provided. We want to also
> > confirm exactly which step in dwc3_core_init() was needed.
> 
> Ok. I first tried the code move suggested by Andrey (didn't work). Then
> after reading the actual code I moved a bit further.
> 
> This move was on top of -rc6 without any reverts. I did not make additional
> changes to dwc3_core_init()
> 
> So current v6.0 has: dwc3_get_extcon - dwc3_get_dr_mode - ... -
> dwc3_core_init - .. - dwc3_core_init_mode (not working)
> 
> I changed to: dwc3_get_dr_mode - dwc3_get_extcon - .. - dwc3_core_init - ..
> - dwc3_core_init_mode (no change)
> 
> Then to: dwc3_get_dr_mode - .. - dwc3_core_init - .. - dwc3_get_extcon -
> dwc3_core_init_mode (works)
> 
> .. are what I believe for this issue irrelevant calls to
> dwc3_alloc_scratch_buffers, dwc3_check_params and dwc3_debugfs_init.
> 

Right. Thanks for narrowing it down. There are still many steps in
dwc3_core_init(). We have some suspicion, but we still haven't confirmed
the exact cause of the failure. We can write a proper patch once we know
the reason.

Thanks,
Thinh
Ferry Toth Oct. 7, 2022, 1:07 p.m. UTC | #14
On 07-10-2022 04:11, Thinh Nguyen wrote:
> On Thu, Oct 06, 2022, Ferry Toth wrote:
>> Hi
>>
>> On 06-10-2022 04:12, Thinh Nguyen wrote:
>>> On Wed, Oct 05, 2022, Ferry Toth wrote:
>>>> Hi,
>>>>
>>>>       Thanks!
>>>>
>>>>       Does the failure only happen the first time host is initialized? Or can
>>>>       it recover after switching to device then back to host mode?
>>>>
>>>> I can switch back and forth and device mode works each time, host mode remains
>>>> dead.
>>> Ok.
>>>
>>>>       Probably the failure happens if some step(s) in dwc3_core_init() hasn't
>>>>       completed.
>>>>
>>>>       tusb1210 is a phy driver right? The issue is probably because we didn't
>>>>       initialize the phy yet. So, I suspect placing dwc3_get_extcon() after
>>>>       initializing the phy will probably solve the dependency problem.
>>>>
>>>>       You can try something for yourself or I can provide something to test
>>>>       later if you don't mind (maybe next week if it's ok).
>>>>
>>>> Yes, the code move I mentioned above "moves dwc3_get_extcon() until after
>>>> dwc3_core_init() but just before dwc3_core_init_mode(). AFAIU initially
>>>> dwc3_get_extcon() was called from within dwc3_core_init_mode() but only for
>>>> case USB_DR_MODE_OTG. So with this change order of events is more or less
>>>> unchanged" solves the issue.
>>>>
>>> I saw the experiment you did from the link you provided. We want to also
>>> confirm exactly which step in dwc3_core_init() was needed.
>> Ok. I first tried the code move suggested by Andrey (didn't work). Then
>> after reading the actual code I moved a bit further.
>>
>> This move was on top of -rc6 without any reverts. I did not make additional
>> changes to dwc3_core_init()
>>
>> So current v6.0 has: dwc3_get_extcon - dwc3_get_dr_mode - ... -
>> dwc3_core_init - .. - dwc3_core_init_mode (not working)
>>
>> I changed to: dwc3_get_dr_mode - dwc3_get_extcon - .. - dwc3_core_init - ..
>> - dwc3_core_init_mode (no change)
>>
>> Then to: dwc3_get_dr_mode - .. - dwc3_core_init - .. - dwc3_get_extcon -
>> dwc3_core_init_mode (works)
>>
>> .. are what I believe for this issue irrelevant calls to
>> dwc3_alloc_scratch_buffers, dwc3_check_params and dwc3_debugfs_init.
>>
> Right. Thanks for narrowing it down. There are still many steps in
> dwc3_core_init(). We have some suspicion, but we still haven't confirmed
> the exact cause of the failure. We can write a proper patch once we know
> the reason.
If you would like me to test your suspicion, just tell me what to do :-)
>
> Thanks,
> Thinh
Andrey Smirnov Oct. 10, 2022, 5:02 a.m. UTC | #15
On Fri, Oct 7, 2022 at 6:07 AM Ferry Toth <fntoth@gmail.com> wrote:
>
>
> On 07-10-2022 04:11, Thinh Nguyen wrote:
> > On Thu, Oct 06, 2022, Ferry Toth wrote:
> >> Hi
> >>
> >> On 06-10-2022 04:12, Thinh Nguyen wrote:
> >>> On Wed, Oct 05, 2022, Ferry Toth wrote:
> >>>> Hi,
> >>>>
> >>>>       Thanks!
> >>>>
> >>>>       Does the failure only happen the first time host is initialized? Or can
> >>>>       it recover after switching to device then back to host mode?
> >>>>
> >>>> I can switch back and forth and device mode works each time, host mode remains
> >>>> dead.
> >>> Ok.
> >>>
> >>>>       Probably the failure happens if some step(s) in dwc3_core_init() hasn't
> >>>>       completed.
> >>>>
> >>>>       tusb1210 is a phy driver right? The issue is probably because we didn't
> >>>>       initialize the phy yet. So, I suspect placing dwc3_get_extcon() after
> >>>>       initializing the phy will probably solve the dependency problem.
> >>>>
> >>>>       You can try something for yourself or I can provide something to test
> >>>>       later if you don't mind (maybe next week if it's ok).
> >>>>
> >>>> Yes, the code move I mentioned above "moves dwc3_get_extcon() until after
> >>>> dwc3_core_init() but just before dwc3_core_init_mode(). AFAIU initially
> >>>> dwc3_get_extcon() was called from within dwc3_core_init_mode() but only for
> >>>> case USB_DR_MODE_OTG. So with this change order of events is more or less
> >>>> unchanged" solves the issue.
> >>>>
> >>> I saw the experiment you did from the link you provided. We want to also
> >>> confirm exactly which step in dwc3_core_init() was needed.
> >> Ok. I first tried the code move suggested by Andrey (didn't work). Then
> >> after reading the actual code I moved a bit further.
> >>
> >> This move was on top of -rc6 without any reverts. I did not make additional
> >> changes to dwc3_core_init()
> >>
> >> So current v6.0 has: dwc3_get_extcon - dwc3_get_dr_mode - ... -
> >> dwc3_core_init - .. - dwc3_core_init_mode (not working)
> >>
> >> I changed to: dwc3_get_dr_mode - dwc3_get_extcon - .. - dwc3_core_init - ..
> >> - dwc3_core_init_mode (no change)
> >>
> >> Then to: dwc3_get_dr_mode - .. - dwc3_core_init - .. - dwc3_get_extcon -
> >> dwc3_core_init_mode (works)
> >>
> >> .. are what I believe for this issue irrelevant calls to
> >> dwc3_alloc_scratch_buffers, dwc3_check_params and dwc3_debugfs_init.
> >>
> > Right. Thanks for narrowing it down. There are still many steps in
> > dwc3_core_init(). We have some suspicion, but we still haven't confirmed
> > the exact cause of the failure. We can write a proper patch once we know
> > the reason.
> If you would like me to test your suspicion, just tell me what to do :-)


OK, Ferry, I think I'm going to need clarification on specifics on
your test setup. Can you share your kernel config, maybe your
"/proc/config.gz", somewhere? When you say you are running vanilla
Linux, do you mean it or do you mean vanilla tree + some patch delta?

The reason I'm asking is because I'm having a hard time reproducing
the problem on my end. In fact, when I build v6.0
(4fe89d07dcc2804c8b562f6c7896a45643d34b2f) and then do a

git revert 8bd6b8c4b100 0f0101719138 (original revert proposed by Andy)

I get an infinite loop of reprobing that looks something like (some
debug tracing, function name + line number, included):

[    6.160732] tusb1210 dwc3.0.auto.ulpi: error -110 writing val 0x41
to reg 0x80
[    6.172299] XXXXXXXXXXX: dwc3_probe 1834
[    6.172426] XXXXXXXXXXX: dwc3_core_init_mode 1386
[    6.176391] XXXXXXXXXXX: dwc3_drd_init 593
[    6.181573] dwc3 dwc3.0.auto: Driver dwc3 requests probe deferral
[    6.191886] platform dwc3.0.auto: Added to deferred list
[    6.197249] platform dwc3.0.auto: Retrying from deferred list
[    6.203057] bus: 'platform': __driver_probe_device: matched device
dwc3.0.auto with driver dwc3
[    6.211783] bus: 'platform': really_probe: probing driver dwc3 with
device dwc3.0.auto
[    6.219935] XXXXXXXXXXX: dwc3_probe 1822
[    6.219952] XXXXXXXXXXX: dwc3_core_init 1092
[    6.223903] XXXXXXXXXXX: dwc3_core_init 1095
[    6.234839] bus: 'ulpi': __driver_probe_device: matched device
dwc3.0.auto.ulpi with driver tusb1210
[    6.248335] bus: 'ulpi': really_probe: probing driver tusb1210 with
device dwc3.0.auto.ulpi
[    6.257039] driver: 'tusb1210': driver_bound: bound to device
'dwc3.0.auto.ulpi'
[    6.264501] bus: 'ulpi': really_probe: bound device
dwc3.0.auto.ulpi to driver tusb1210
[    6.272553] debugfs: Directory 'dwc3.0.auto' with parent 'ulpi'
already present!
[    6.279978] XXXXXXXXXXX: dwc3_core_init 1099
[    6.279991] XXXXXXXXXXX: dwc3_core_init 1103
[    6.345769] tusb1210 dwc3.0.auto.ulpi: error -110 writing val 0x41
to reg 0x80
[    6.357316] XXXXXXXXXXX: dwc3_probe 1834
[    6.357447] XXXXXXXXXXX: dwc3_core_init_mode 1386
[    6.361402] XXXXXXXXXXX: dwc3_drd_init 593
[    6.366589] dwc3 dwc3.0.auto: Driver dwc3 requests probe deferral
[    6.376901] platform dwc3.0.auto: Added to deferred list

which renders the system completely unusable, but USB host is
definitely going to be broken too. Now, ironically, with my patch
in-place, an attempt to probe extcon that ends up deferring the probe
happens before the ULPI driver failure (which wasn't failing driver
probe prior to https://lore.kernel.org/all/20220213130524.18748-7-hdegoede@redhat.com/),
there no "driver binding" event that re-triggers deferred probe
causing the loop, so the system progresses to a point where extcon is
available and dwc3 driver eventually loads.

After that, and I don't know if I'm doing the same test, USB host
seems to work as expected. lsusb works, my USB stick enumerates as
expected. Switching the USB mux to micro-USB and back shuts the host
functionality down and brings it up as expected. Now I didn't try to
load any gadgets to make sure USB gadget works 100%, but since you
were saying it was USB host that was broken, I wasn't concerned with
that. Am I doing the right test?

For the reference what I test with is:
 - vanilla kernel, no patch delta (sans minor debug tracing) + initrd
built with Buildroot 2022.08.1
 - Initrd is using systemd (don't think that really matters, but who knows)
 - U-Boot 2022.04 (built with Buildroot as well)
 - kernel config is x86_64_defconfig + whatever I gathered from *.cfg
files in https://github.com/edison-fw/meta-intel-edison/tree/master/meta-intel-edison-bsp/recipes-kernel/linux/files
Andy Shevchenko Oct. 10, 2022, 7:10 a.m. UTC | #16
On Sun, Oct 09, 2022 at 10:02:26PM -0700, Andrey Smirnov wrote:
> On Fri, Oct 7, 2022 at 6:07 AM Ferry Toth <fntoth@gmail.com> wrote:

Thank you for the testing on your side!

...

> OK, Ferry, I think I'm going to need clarification on specifics on
> your test setup. Can you share your kernel config, maybe your
> "/proc/config.gz", somewhere? When you say you are running vanilla
> Linux, do you mean it or do you mean vanilla tree + some patch delta?
> 
> The reason I'm asking is because I'm having a hard time reproducing
> the problem on my end. In fact, when I build v6.0
> (4fe89d07dcc2804c8b562f6c7896a45643d34b2f) and then do a
> 
> git revert 8bd6b8c4b100 0f0101719138 (original revert proposed by Andy)
> 
> I get an infinite loop of reprobing that looks something like (some
> debug tracing, function name + line number, included):

Yes, this is (one of) known drawback(s) of deferred probe hack. I think
the kernel that Ferry runs has a patch that basically reverts one from
2014 [1] and allows to have extcon as a module. (1)

[1]: 58b116bce136 ("drivercore: deferral race condition fix")

> which renders the system completely unusable, but USB host is
> definitely going to be broken too. Now, ironically, with my patch
> in-place, an attempt to probe extcon that ends up deferring the probe
> happens before the ULPI driver failure (which wasn't failing driver
> probe prior to https://lore.kernel.org/all/20220213130524.18748-7-hdegoede@redhat.com/),
> there no "driver binding" event that re-triggers deferred probe
> causing the loop, so the system progresses to a point where extcon is
> available and dwc3 driver eventually loads.
> 
> After that, and I don't know if I'm doing the same test, USB host
> seems to work as expected. lsusb works, my USB stick enumerates as
> expected. Switching the USB mux to micro-USB and back shuts the host
> functionality down and brings it up as expected. Now I didn't try to
> load any gadgets to make sure USB gadget works 100%, but since you
> were saying it was USB host that was broken, I wasn't concerned with
> that. Am I doing the right test?

Hmm... What you described above sounds more like a yet another attempt to
workaround (1). _If_ this is the case, we probably can discuss how to fix
it in generic way (somewhere in dd.c, rather than in the certain driver).

That said, the real test case should be performed on top of clean kernel
before judging if it's good or bad.

> For the reference what I test with is:
>  - vanilla kernel, no patch delta (sans minor debug tracing) + initrd
> built with Buildroot 2022.08.1
>  - Initrd is using systemd (don't think that really matters, but who knows)
>  - U-Boot 2022.04 (built with Buildroot as well)
>  - kernel config is x86_64_defconfig + whatever I gathered from *.cfg
> files in https://github.com/edison-fw/meta-intel-edison/tree/master/meta-intel-edison-bsp/recipes-kernel/linux/files
Ferry Toth Oct. 10, 2022, 11:04 a.m. UTC | #17
Hi

On 10-10-2022 07:02, Andrey Smirnov wrote:
> On Fri, Oct 7, 2022 at 6:07 AM Ferry Toth <fntoth@gmail.com> wrote:
>>
>> On 07-10-2022 04:11, Thinh Nguyen wrote:
>>> On Thu, Oct 06, 2022, Ferry Toth wrote:
>>>> Hi
>>>>
>>>> On 06-10-2022 04:12, Thinh Nguyen wrote:
>>>>> On Wed, Oct 05, 2022, Ferry Toth wrote:
>>>>>> Hi,
>>>>>>
>>>>>>        Thanks!
>>>>>>
>>>>>>        Does the failure only happen the first time host is initialized? Or can
>>>>>>        it recover after switching to device then back to host mode?
>>>>>>
>>>>>> I can switch back and forth and device mode works each time, host mode remains
>>>>>> dead.
>>>>> Ok.
>>>>>
>>>>>>        Probably the failure happens if some step(s) in dwc3_core_init() hasn't
>>>>>>        completed.
>>>>>>
>>>>>>        tusb1210 is a phy driver right? The issue is probably because we didn't
>>>>>>        initialize the phy yet. So, I suspect placing dwc3_get_extcon() after
>>>>>>        initializing the phy will probably solve the dependency problem.
>>>>>>
>>>>>>        You can try something for yourself or I can provide something to test
>>>>>>        later if you don't mind (maybe next week if it's ok).
>>>>>>
>>>>>> Yes, the code move I mentioned above "moves dwc3_get_extcon() until after
>>>>>> dwc3_core_init() but just before dwc3_core_init_mode(). AFAIU initially
>>>>>> dwc3_get_extcon() was called from within dwc3_core_init_mode() but only for
>>>>>> case USB_DR_MODE_OTG. So with this change order of events is more or less
>>>>>> unchanged" solves the issue.
>>>>>>
>>>>> I saw the experiment you did from the link you provided. We want to also
>>>>> confirm exactly which step in dwc3_core_init() was needed.
>>>> Ok. I first tried the code move suggested by Andrey (didn't work). Then
>>>> after reading the actual code I moved a bit further.
>>>>
>>>> This move was on top of -rc6 without any reverts. I did not make additional
>>>> changes to dwc3_core_init()
>>>>
>>>> So current v6.0 has: dwc3_get_extcon - dwc3_get_dr_mode - ... -
>>>> dwc3_core_init - .. - dwc3_core_init_mode (not working)
>>>>
>>>> I changed to: dwc3_get_dr_mode - dwc3_get_extcon - .. - dwc3_core_init - ..
>>>> - dwc3_core_init_mode (no change)
>>>>
>>>> Then to: dwc3_get_dr_mode - .. - dwc3_core_init - .. - dwc3_get_extcon -
>>>> dwc3_core_init_mode (works)
>>>>
>>>> .. are what I believe for this issue irrelevant calls to
>>>> dwc3_alloc_scratch_buffers, dwc3_check_params and dwc3_debugfs_init.
>>>>
>>> Right. Thanks for narrowing it down. There are still many steps in
>>> dwc3_core_init(). We have some suspicion, but we still haven't confirmed
>>> the exact cause of the failure. We can write a proper patch once we know
>>> the reason.
>> If you would like me to test your suspicion, just tell me what to do :-)
>
> OK, Ferry, I think I'm going to need clarification on specifics on
> your test setup. Can you share your kernel config, maybe your
> "/proc/config.gz", somewhere? When you say you are running vanilla
> Linux, do you mean it or do you mean vanilla tree + some patch delta?

For v6.0 I can get the exacts tonight. But earlier I had this for v5.17:

https://github.com/htot/meta-intel-edison/blob/master/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_5.17.bb

There are 2 patches referred in #67 and #68. One is related to the 
infinite loop. The other is I believe also needed to get dwc3 to work.

All the kernel config are applied as .cfg.

Patches and cfs's here:

https://github.com/htot/meta-intel-edison/tree/master/meta-intel-edison-bsp/recipes-kernel/linux/files

> The reason I'm asking is because I'm having a hard time reproducing
> the problem on my end. In fact, when I build v6.0
> (4fe89d07dcc2804c8b562f6c7896a45643d34b2f) and then do a
>
> git revert 8bd6b8c4b100 0f0101719138 (original revert proposed by Andy)
>
> I get an infinite loop of reprobing that looks something like (some
> debug tracing, function name + line number, included):
>
> [    6.160732] tusb1210 dwc3.0.auto.ulpi: error -110 writing val 0x41
> to reg 0x80
> [    6.172299] XXXXXXXXXXX: dwc3_probe 1834
> [    6.172426] XXXXXXXXXXX: dwc3_core_init_mode 1386
> [    6.176391] XXXXXXXXXXX: dwc3_drd_init 593
> [    6.181573] dwc3 dwc3.0.auto: Driver dwc3 requests probe deferral
> [    6.191886] platform dwc3.0.auto: Added to deferred list
> [    6.197249] platform dwc3.0.auto: Retrying from deferred list
> [    6.203057] bus: 'platform': __driver_probe_device: matched device
> dwc3.0.auto with driver dwc3
> [    6.211783] bus: 'platform': really_probe: probing driver dwc3 with
> device dwc3.0.auto
> [    6.219935] XXXXXXXXXXX: dwc3_probe 1822
> [    6.219952] XXXXXXXXXXX: dwc3_core_init 1092
> [    6.223903] XXXXXXXXXXX: dwc3_core_init 1095
> [    6.234839] bus: 'ulpi': __driver_probe_device: matched device
> dwc3.0.auto.ulpi with driver tusb1210
> [    6.248335] bus: 'ulpi': really_probe: probing driver tusb1210 with
> device dwc3.0.auto.ulpi
> [    6.257039] driver: 'tusb1210': driver_bound: bound to device
> 'dwc3.0.auto.ulpi'
> [    6.264501] bus: 'ulpi': really_probe: bound device
> dwc3.0.auto.ulpi to driver tusb1210
> [    6.272553] debugfs: Directory 'dwc3.0.auto' with parent 'ulpi'
> already present!
> [    6.279978] XXXXXXXXXXX: dwc3_core_init 1099
> [    6.279991] XXXXXXXXXXX: dwc3_core_init 1103
> [    6.345769] tusb1210 dwc3.0.auto.ulpi: error -110 writing val 0x41
> to reg 0x80
> [    6.357316] XXXXXXXXXXX: dwc3_probe 1834
> [    6.357447] XXXXXXXXXXX: dwc3_core_init_mode 1386
> [    6.361402] XXXXXXXXXXX: dwc3_drd_init 593
> [    6.366589] dwc3 dwc3.0.auto: Driver dwc3 requests probe deferral
> [    6.376901] platform dwc3.0.auto: Added to deferred list
>
> which renders the system completely unusable, but USB host is
> definitely going to be broken too. Now, ironically, with my patch
> in-place, an attempt to probe extcon that ends up deferring the probe
> happens before the ULPI driver failure (which wasn't failing driver
> probe prior to https://lore.kernel.org/all/20220213130524.18748-7-hdegoede@redhat.com/),
> there no "driver binding" event that re-triggers deferred probe
> causing the loop, so the system progresses to a point where extcon is
> available and dwc3 driver eventually loads.
>
> After that, and I don't know if I'm doing the same test, USB host
> seems to work as expected. lsusb works, my USB stick enumerates as
> expected. Switching the USB mux to micro-USB and back shuts the host
> functionality down and brings it up as expected. Now I didn't try to
> load any gadgets to make sure USB gadget works 100%, but since you
> were saying it was USB host that was broken, I wasn't concerned with
> that. Am I doing the right test?
>
> For the reference what I test with is:
>   - vanilla kernel, no patch delta (sans minor debug tracing) + initrd
> built with Buildroot 2022.08.1
>   - Initrd is using systemd (don't think that really matters, but who knows)
>   - U-Boot 2022.04 (built with Buildroot as well)
>   - kernel config is x86_64_defconfig + whatever I gathered from *.cfg
> files in https://github.com/edison-fw/meta-intel-edison/tree/master/meta-intel-edison-bsp/recipes-kernel/linux/files
Ferry Toth Oct. 10, 2022, 8:52 p.m. UTC | #18
Hi

Op 10-10-2022 om 13:04 schreef Ferry Toth:
> Hi
>
> On 10-10-2022 07:02, Andrey Smirnov wrote:
>> On Fri, Oct 7, 2022 at 6:07 AM Ferry Toth <fntoth@gmail.com> wrote:
>>>
>>> On 07-10-2022 04:11, Thinh Nguyen wrote:
>>>> On Thu, Oct 06, 2022, Ferry Toth wrote:
>>>>> Hi
>>>>>
>>>>> On 06-10-2022 04:12, Thinh Nguyen wrote:
>>>>>> On Wed, Oct 05, 2022, Ferry Toth wrote:
>>>>>>> Hi,
>>>>>>>
>>>>>>>        Thanks!
>>>>>>>
>>>>>>>        Does the failure only happen the first time host is 
>>>>>>> initialized? Or can
>>>>>>>        it recover after switching to device then back to host mode?
>>>>>>>
>>>>>>> I can switch back and forth and device mode works each time, 
>>>>>>> host mode remains
>>>>>>> dead.
>>>>>> Ok.
>>>>>>
>>>>>>>        Probably the failure happens if some step(s) in 
>>>>>>> dwc3_core_init() hasn't
>>>>>>>        completed.
>>>>>>>
>>>>>>>        tusb1210 is a phy driver right? The issue is probably 
>>>>>>> because we didn't
>>>>>>>        initialize the phy yet. So, I suspect placing 
>>>>>>> dwc3_get_extcon() after
>>>>>>>        initializing the phy will probably solve the dependency 
>>>>>>> problem.
>>>>>>>
>>>>>>>        You can try something for yourself or I can provide 
>>>>>>> something to test
>>>>>>>        later if you don't mind (maybe next week if it's ok).
>>>>>>>
>>>>>>> Yes, the code move I mentioned above "moves dwc3_get_extcon() 
>>>>>>> until after
>>>>>>> dwc3_core_init() but just before dwc3_core_init_mode(). AFAIU 
>>>>>>> initially
>>>>>>> dwc3_get_extcon() was called from within dwc3_core_init_mode() 
>>>>>>> but only for
>>>>>>> case USB_DR_MODE_OTG. So with this change order of events is 
>>>>>>> more or less
>>>>>>> unchanged" solves the issue.
>>>>>>>
>>>>>> I saw the experiment you did from the link you provided. We want 
>>>>>> to also
>>>>>> confirm exactly which step in dwc3_core_init() was needed.
>>>>> Ok. I first tried the code move suggested by Andrey (didn't work). 
>>>>> Then
>>>>> after reading the actual code I moved a bit further.
>>>>>
>>>>> This move was on top of -rc6 without any reverts. I did not make 
>>>>> additional
>>>>> changes to dwc3_core_init()
>>>>>
>>>>> So current v6.0 has: dwc3_get_extcon - dwc3_get_dr_mode - ... -
>>>>> dwc3_core_init - .. - dwc3_core_init_mode (not working)
>>>>>
>>>>> I changed to: dwc3_get_dr_mode - dwc3_get_extcon - .. - 
>>>>> dwc3_core_init - ..
>>>>> - dwc3_core_init_mode (no change)
>>>>>
>>>>> Then to: dwc3_get_dr_mode - .. - dwc3_core_init - .. - 
>>>>> dwc3_get_extcon -
>>>>> dwc3_core_init_mode (works)
>>>>>
>>>>> .. are what I believe for this issue irrelevant calls to
>>>>> dwc3_alloc_scratch_buffers, dwc3_check_params and dwc3_debugfs_init.
>>>>>
>>>> Right. Thanks for narrowing it down. There are still many steps in
>>>> dwc3_core_init(). We have some suspicion, but we still haven't 
>>>> confirmed
>>>> the exact cause of the failure. We can write a proper patch once we 
>>>> know
>>>> the reason.
>>> If you would like me to test your suspicion, just tell me what to do 
>>> :-)
>>
>> OK, Ferry, I think I'm going to need clarification on specifics on
>> your test setup. Can you share your kernel config, maybe your
>> "/proc/config.gz", somewhere? When you say you are running vanilla
>> Linux, do you mean it or do you mean vanilla tree + some patch delta?
>
> For v6.0 I can get the exacts tonight. But earlier I had this for v5.17:
>
> https://github.com/htot/meta-intel-edison/blob/master/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_5.17.bb 
>
>
> There are 2 patches referred in #67 and #68. One is related to the 
> infinite loop. The other is I believe also needed to get dwc3 to work.
>
> All the kernel config are applied as .cfg.
>
> Patches and cfs's here:
>
> https://github.com/htot/meta-intel-edison/tree/master/meta-intel-edison-bsp/recipes-kernel/linux/files 
>

Updated Yocto recipe for v6.0 here:

https://github.com/htot/meta-intel-edison/blob/honister/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_6.0.bb

#75-#77 are the 2 reverts from Andy, + one SOF revert (not related to 
this thread).

Otherwise via the git route, https://github.com/andy-shev/linux should 
lead to the same, although you might want to drop "WIP: serial: 
8250_dma: use sgl on transmit "

>
>> The reason I'm asking is because I'm having a hard time reproducing
>> the problem on my end. In fact, when I build v6.0
>> (4fe89d07dcc2804c8b562f6c7896a45643d34b2f) and then do a
>>
>> git revert 8bd6b8c4b100 0f0101719138 (original revert proposed by Andy)
>>
>> I get an infinite loop of reprobing that looks something like (some
>> debug tracing, function name + line number, included):
>>
>> [    6.160732] tusb1210 dwc3.0.auto.ulpi: error -110 writing val 0x41
>> to reg 0x80
>> [    6.172299] XXXXXXXXXXX: dwc3_probe 1834
>> [    6.172426] XXXXXXXXXXX: dwc3_core_init_mode 1386
>> [    6.176391] XXXXXXXXXXX: dwc3_drd_init 593
>> [    6.181573] dwc3 dwc3.0.auto: Driver dwc3 requests probe deferral
>> [    6.191886] platform dwc3.0.auto: Added to deferred list
>> [    6.197249] platform dwc3.0.auto: Retrying from deferred list
>> [    6.203057] bus: 'platform': __driver_probe_device: matched device
>> dwc3.0.auto with driver dwc3
>> [    6.211783] bus: 'platform': really_probe: probing driver dwc3 with
>> device dwc3.0.auto
>> [    6.219935] XXXXXXXXXXX: dwc3_probe 1822
>> [    6.219952] XXXXXXXXXXX: dwc3_core_init 1092
>> [    6.223903] XXXXXXXXXXX: dwc3_core_init 1095
>> [    6.234839] bus: 'ulpi': __driver_probe_device: matched device
>> dwc3.0.auto.ulpi with driver tusb1210
>> [    6.248335] bus: 'ulpi': really_probe: probing driver tusb1210 with
>> device dwc3.0.auto.ulpi
>> [    6.257039] driver: 'tusb1210': driver_bound: bound to device
>> 'dwc3.0.auto.ulpi'
>> [    6.264501] bus: 'ulpi': really_probe: bound device
>> dwc3.0.auto.ulpi to driver tusb1210
>> [    6.272553] debugfs: Directory 'dwc3.0.auto' with parent 'ulpi'
>> already present!
>> [    6.279978] XXXXXXXXXXX: dwc3_core_init 1099
>> [    6.279991] XXXXXXXXXXX: dwc3_core_init 1103
>> [    6.345769] tusb1210 dwc3.0.auto.ulpi: error -110 writing val 0x41
>> to reg 0x80
>> [    6.357316] XXXXXXXXXXX: dwc3_probe 1834
>> [    6.357447] XXXXXXXXXXX: dwc3_core_init_mode 1386
>> [    6.361402] XXXXXXXXXXX: dwc3_drd_init 593
>> [    6.366589] dwc3 dwc3.0.auto: Driver dwc3 requests probe deferral
>> [    6.376901] platform dwc3.0.auto: Added to deferred list
>>
>> which renders the system completely unusable, but USB host is
>> definitely going to be broken too. Now, ironically, with my patch
>> in-place, an attempt to probe extcon that ends up deferring the probe
>> happens before the ULPI driver failure (which wasn't failing driver
>> probe prior to 
>> https://lore.kernel.org/all/20220213130524.18748-7-hdegoede@redhat.com/),
>> there no "driver binding" event that re-triggers deferred probe
>> causing the loop, so the system progresses to a point where extcon is
>> available and dwc3 driver eventually loads.
>>
>> After that, and I don't know if I'm doing the same test, USB host
>> seems to work as expected. lsusb works, my USB stick enumerates as
>> expected. Switching the USB mux to micro-USB and back shuts the host
>> functionality down and brings it up as expected. Now I didn't try to
>> load any gadgets to make sure USB gadget works 100%, but since you
>> were saying it was USB host that was broken, I wasn't concerned with
>> that. Am I doing the right test?
>>
>> For the reference what I test with is:
>>   - vanilla kernel, no patch delta (sans minor debug tracing) + initrd
>> built with Buildroot 2022.08.1
>>   - Initrd is using systemd (don't think that really matters, but who 
>> knows)
>>   - U-Boot 2022.04 (built with Buildroot as well)
>>   - kernel config is x86_64_defconfig + whatever I gathered from *.cfg
>> files in 
>> https://github.com/edison-fw/meta-intel-edison/tree/master/meta-intel-edison-bsp/recipes-kernel/linux/files
Andrey Smirnov Oct. 10, 2022, 9:35 p.m. UTC | #19
On Mon, Oct 10, 2022 at 1:52 PM Ferry Toth <fntoth@gmail.com> wrote:
>
> Hi
>
> Op 10-10-2022 om 13:04 schreef Ferry Toth:
> > Hi
> >
> > On 10-10-2022 07:02, Andrey Smirnov wrote:
> >> On Fri, Oct 7, 2022 at 6:07 AM Ferry Toth <fntoth@gmail.com> wrote:
> >>>
> >>> On 07-10-2022 04:11, Thinh Nguyen wrote:
> >>>> On Thu, Oct 06, 2022, Ferry Toth wrote:
> >>>>> Hi
> >>>>>
> >>>>> On 06-10-2022 04:12, Thinh Nguyen wrote:
> >>>>>> On Wed, Oct 05, 2022, Ferry Toth wrote:
> >>>>>>> Hi,
> >>>>>>>
> >>>>>>>        Thanks!
> >>>>>>>
> >>>>>>>        Does the failure only happen the first time host is
> >>>>>>> initialized? Or can
> >>>>>>>        it recover after switching to device then back to host mode?
> >>>>>>>
> >>>>>>> I can switch back and forth and device mode works each time,
> >>>>>>> host mode remains
> >>>>>>> dead.
> >>>>>> Ok.
> >>>>>>
> >>>>>>>        Probably the failure happens if some step(s) in
> >>>>>>> dwc3_core_init() hasn't
> >>>>>>>        completed.
> >>>>>>>
> >>>>>>>        tusb1210 is a phy driver right? The issue is probably
> >>>>>>> because we didn't
> >>>>>>>        initialize the phy yet. So, I suspect placing
> >>>>>>> dwc3_get_extcon() after
> >>>>>>>        initializing the phy will probably solve the dependency
> >>>>>>> problem.
> >>>>>>>
> >>>>>>>        You can try something for yourself or I can provide
> >>>>>>> something to test
> >>>>>>>        later if you don't mind (maybe next week if it's ok).
> >>>>>>>
> >>>>>>> Yes, the code move I mentioned above "moves dwc3_get_extcon()
> >>>>>>> until after
> >>>>>>> dwc3_core_init() but just before dwc3_core_init_mode(). AFAIU
> >>>>>>> initially
> >>>>>>> dwc3_get_extcon() was called from within dwc3_core_init_mode()
> >>>>>>> but only for
> >>>>>>> case USB_DR_MODE_OTG. So with this change order of events is
> >>>>>>> more or less
> >>>>>>> unchanged" solves the issue.
> >>>>>>>
> >>>>>> I saw the experiment you did from the link you provided. We want
> >>>>>> to also
> >>>>>> confirm exactly which step in dwc3_core_init() was needed.
> >>>>> Ok. I first tried the code move suggested by Andrey (didn't work).
> >>>>> Then
> >>>>> after reading the actual code I moved a bit further.
> >>>>>
> >>>>> This move was on top of -rc6 without any reverts. I did not make
> >>>>> additional
> >>>>> changes to dwc3_core_init()
> >>>>>
> >>>>> So current v6.0 has: dwc3_get_extcon - dwc3_get_dr_mode - ... -
> >>>>> dwc3_core_init - .. - dwc3_core_init_mode (not working)
> >>>>>
> >>>>> I changed to: dwc3_get_dr_mode - dwc3_get_extcon - .. -
> >>>>> dwc3_core_init - ..
> >>>>> - dwc3_core_init_mode (no change)
> >>>>>
> >>>>> Then to: dwc3_get_dr_mode - .. - dwc3_core_init - .. -
> >>>>> dwc3_get_extcon -
> >>>>> dwc3_core_init_mode (works)
> >>>>>
> >>>>> .. are what I believe for this issue irrelevant calls to
> >>>>> dwc3_alloc_scratch_buffers, dwc3_check_params and dwc3_debugfs_init.
> >>>>>
> >>>> Right. Thanks for narrowing it down. There are still many steps in
> >>>> dwc3_core_init(). We have some suspicion, but we still haven't
> >>>> confirmed
> >>>> the exact cause of the failure. We can write a proper patch once we
> >>>> know
> >>>> the reason.
> >>> If you would like me to test your suspicion, just tell me what to do
> >>> :-)
> >>
> >> OK, Ferry, I think I'm going to need clarification on specifics on
> >> your test setup. Can you share your kernel config, maybe your
> >> "/proc/config.gz", somewhere? When you say you are running vanilla
> >> Linux, do you mean it or do you mean vanilla tree + some patch delta?
> >
> > For v6.0 I can get the exacts tonight. But earlier I had this for v5.17:
> >
> > https://github.com/htot/meta-intel-edison/blob/master/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_5.17.bb
> >
> >
> > There are 2 patches referred in #67 and #68. One is related to the
> > infinite loop. The other is I believe also needed to get dwc3 to work.
> >
> > All the kernel config are applied as .cfg.
> >
> > Patches and cfs's here:
> >
> > https://github.com/htot/meta-intel-edison/tree/master/meta-intel-edison-bsp/recipes-kernel/linux/files
> >
>
> Updated Yocto recipe for v6.0 here:
>
> https://github.com/htot/meta-intel-edison/blob/honister/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_6.0.bb
>
> #75-#77 are the 2 reverts from Andy, + one SOF revert (not related to
> this thread).

Please drop all of this
https://github.com/htot/meta-intel-edison/blob/honister/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_6.0.bb#L69-L77
and re do the testing. Assuming things are still broken, that's how
you want to do the bisecting.
Andrey Smirnov Oct. 10, 2022, 9:40 p.m. UTC | #20
On Mon, Oct 10, 2022 at 12:13 AM Andy Shevchenko
<andriy.shevchenko@linux.intel.com> wrote:
>
> On Sun, Oct 09, 2022 at 10:02:26PM -0700, Andrey Smirnov wrote:
> > On Fri, Oct 7, 2022 at 6:07 AM Ferry Toth <fntoth@gmail.com> wrote:
>
> Thank you for the testing on your side!
>
> ...
>
> > OK, Ferry, I think I'm going to need clarification on specifics on
> > your test setup. Can you share your kernel config, maybe your
> > "/proc/config.gz", somewhere? When you say you are running vanilla
> > Linux, do you mean it or do you mean vanilla tree + some patch delta?
> >
> > The reason I'm asking is because I'm having a hard time reproducing
> > the problem on my end. In fact, when I build v6.0
> > (4fe89d07dcc2804c8b562f6c7896a45643d34b2f) and then do a
> >
> > git revert 8bd6b8c4b100 0f0101719138 (original revert proposed by Andy)
> >
> > I get an infinite loop of reprobing that looks something like (some
> > debug tracing, function name + line number, included):
>
> Yes, this is (one of) known drawback(s) of deferred probe hack. I think
> the kernel that Ferry runs has a patch that basically reverts one from
> 2014 [1] and allows to have extcon as a module. (1)
>
> [1]: 58b116bce136 ("drivercore: deferral race condition fix")
>
> > which renders the system completely unusable, but USB host is
> > definitely going to be broken too. Now, ironically, with my patch
> > in-place, an attempt to probe extcon that ends up deferring the probe
> > happens before the ULPI driver failure (which wasn't failing driver
> > probe prior to https://lore.kernel.org/all/20220213130524.18748-7-hdegoede@redhat.com/),
> > there no "driver binding" event that re-triggers deferred probe
> > causing the loop, so the system progresses to a point where extcon is
> > available and dwc3 driver eventually loads.
> >
> > After that, and I don't know if I'm doing the same test, USB host
> > seems to work as expected. lsusb works, my USB stick enumerates as
> > expected. Switching the USB mux to micro-USB and back shuts the host
> > functionality down and brings it up as expected. Now I didn't try to
> > load any gadgets to make sure USB gadget works 100%, but since you
> > were saying it was USB host that was broken, I wasn't concerned with
> > that. Am I doing the right test?
>
> Hmm... What you described above sounds more like a yet another attempt to
> workaround (1). _If_ this is the case, we probably can discuss how to fix
> it in generic way (somewhere in dd.c, rather than in the certain driver).
>

No, I'm not describing an attempt to fix anything. Just how vanilla
v6.0 (where my patch is not reverted) works and where my patch, fixing
a logical problem in which extcon was requested too late causing a
forced OTG -> "gadget only" switch, also changed the ordering enough
to accidentally avoid the loop.

> That said, the real test case should be performed on top of clean kernel
> before judging if it's good or bad.
>

Given your level of involvemnt with this particular platform and you
being the author of
https://github.com/edison-fw/meta-intel-edison/blob/master/meta-intel-edison-bsp/recipes-kernel/linux/files/0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch
I assumed/expected you to double check this before sending this revert
out. Please do so next time.
Andy Shevchenko Oct. 11, 2022, 9:21 a.m. UTC | #21
On Mon, Oct 10, 2022 at 02:40:30PM -0700, Andrey Smirnov wrote:
> On Mon, Oct 10, 2022 at 12:13 AM Andy Shevchenko
> <andriy.shevchenko@linux.intel.com> wrote:
> > On Sun, Oct 09, 2022 at 10:02:26PM -0700, Andrey Smirnov wrote:
> > > On Fri, Oct 7, 2022 at 6:07 AM Ferry Toth <fntoth@gmail.com> wrote:

...

> > > OK, Ferry, I think I'm going to need clarification on specifics on
> > > your test setup. Can you share your kernel config, maybe your
> > > "/proc/config.gz", somewhere? When you say you are running vanilla
> > > Linux, do you mean it or do you mean vanilla tree + some patch delta?
> > >
> > > The reason I'm asking is because I'm having a hard time reproducing
> > > the problem on my end. In fact, when I build v6.0
> > > (4fe89d07dcc2804c8b562f6c7896a45643d34b2f) and then do a
> > >
> > > git revert 8bd6b8c4b100 0f0101719138 (original revert proposed by Andy)
> > >
> > > I get an infinite loop of reprobing that looks something like (some
> > > debug tracing, function name + line number, included):
> >
> > Yes, this is (one of) known drawback(s) of deferred probe hack. I think
> > the kernel that Ferry runs has a patch that basically reverts one from
> > 2014 [1] and allows to have extcon as a module. (1)
> >
> > [1]: 58b116bce136 ("drivercore: deferral race condition fix")
> >
> > > which renders the system completely unusable, but USB host is
> > > definitely going to be broken too. Now, ironically, with my patch
> > > in-place, an attempt to probe extcon that ends up deferring the probe
> > > happens before the ULPI driver failure (which wasn't failing driver
> > > probe prior to https://lore.kernel.org/all/20220213130524.18748-7-hdegoede@redhat.com/),
> > > there no "driver binding" event that re-triggers deferred probe
> > > causing the loop, so the system progresses to a point where extcon is
> > > available and dwc3 driver eventually loads.
> > >
> > > After that, and I don't know if I'm doing the same test, USB host
> > > seems to work as expected. lsusb works, my USB stick enumerates as
> > > expected. Switching the USB mux to micro-USB and back shuts the host
> > > functionality down and brings it up as expected. Now I didn't try to
> > > load any gadgets to make sure USB gadget works 100%, but since you
> > > were saying it was USB host that was broken, I wasn't concerned with
> > > that. Am I doing the right test?
> >
> > Hmm... What you described above sounds more like a yet another attempt to
> > workaround (1). _If_ this is the case, we probably can discuss how to fix
> > it in generic way (somewhere in dd.c, rather than in the certain driver).
> 
> No, I'm not describing an attempt to fix anything. Just how vanilla
> v6.0 (where my patch is not reverted) works and where my patch, fixing
> a logical problem in which extcon was requested too late causing a
> forced OTG -> "gadget only" switch, also changed the ordering enough
> to accidentally avoid the loop.

You still refer to a fix, but my question was if it's the same problem or not.

> > That said, the real test case should be performed on top of clean kernel
> > before judging if it's good or bad.
> 
> Given your level of involvemnt with this particular platform and you
> being the author of
> https://github.com/edison-fw/meta-intel-edison/blob/master/meta-intel-edison-bsp/recipes-kernel/linux/files/0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch
> I assumed/expected you to double check this before sending this revert
> out. Please do so next time.

As I said I have not yet restored my testing environment for that platform and
I relied on the Ferry's report. Taking into account the history of breakages
that done for Intel Merrifield, in particular by not wide tested patches
against DWC3 driver, I immediately react with a revert. I agree that I had had
to think about that ordering issue and a patch on top of it first. I thought
that Yocto configuration that Ferry is using is clean of custom (controversial)
patches like that. Now, since you have this platform, you can also help with
testing the DWC3 on it.
Ferry Toth Oct. 11, 2022, 9:36 a.m. UTC | #22
Hi

On 11-10-2022 11:21, Andy Shevchenko wrote:
> On Mon, Oct 10, 2022 at 02:40:30PM -0700, Andrey Smirnov wrote:
>> On Mon, Oct 10, 2022 at 12:13 AM Andy Shevchenko
>> <andriy.shevchenko@linux.intel.com>  wrote:
>>> On Sun, Oct 09, 2022 at 10:02:26PM -0700, Andrey Smirnov wrote:
>>>> On Fri, Oct 7, 2022 at 6:07 AM Ferry Toth<fntoth@gmail.com>  wrote:
> ...
>
>>>> OK, Ferry, I think I'm going to need clarification on specifics on
>>>> your test setup. Can you share your kernel config, maybe your
>>>> "/proc/config.gz", somewhere? When you say you are running vanilla
>>>> Linux, do you mean it or do you mean vanilla tree + some patch delta?
>>>>
>>>> The reason I'm asking is because I'm having a hard time reproducing
>>>> the problem on my end. In fact, when I build v6.0
>>>> (4fe89d07dcc2804c8b562f6c7896a45643d34b2f) and then do a
>>>>
>>>> git revert 8bd6b8c4b100 0f0101719138 (original revert proposed by Andy)
>>>>
>>>> I get an infinite loop of reprobing that looks something like (some
>>>> debug tracing, function name + line number, included):
>>> Yes, this is (one of) known drawback(s) of deferred probe hack. I think
>>> the kernel that Ferry runs has a patch that basically reverts one from
>>> 2014 [1] and allows to have extcon as a module. (1)
>>>
>>> [1]: 58b116bce136 ("drivercore: deferral race condition fix")
>>>
>>>> which renders the system completely unusable, but USB host is
>>>> definitely going to be broken too. Now, ironically, with my patch
>>>> in-place, an attempt to probe extcon that ends up deferring the probe
>>>> happens before the ULPI driver failure (which wasn't failing driver
>>>> probe prior tohttps://lore.kernel.org/all/20220213130524.18748-7-hdegoede@redhat.com/),
>>>> there no "driver binding" event that re-triggers deferred probe
>>>> causing the loop, so the system progresses to a point where extcon is
>>>> available and dwc3 driver eventually loads.
>>>>
>>>> After that, and I don't know if I'm doing the same test, USB host
>>>> seems to work as expected. lsusb works, my USB stick enumerates as
>>>> expected. Switching the USB mux to micro-USB and back shuts the host
>>>> functionality down and brings it up as expected. Now I didn't try to
>>>> load any gadgets to make sure USB gadget works 100%, but since you
>>>> were saying it was USB host that was broken, I wasn't concerned with
>>>> that. Am I doing the right test?
>>> Hmm... What you described above sounds more like a yet another attempt to
>>> workaround (1). _If_ this is the case, we probably can discuss how to fix
>>> it in generic way (somewhere in dd.c, rather than in the certain driver).
>> No, I'm not describing an attempt to fix anything. Just how vanilla
>> v6.0 (where my patch is not reverted) works and where my patch, fixing
>> a logical problem in which extcon was requested too late causing a
>> forced OTG -> "gadget only" switch, also changed the ordering enough
>> to accidentally avoid the loop.
> You still refer to a fix, but my question was if it's the same problem or not.
>
>>> That said, the real test case should be performed on top of clean kernel
>>> before judging if it's good or bad.
>> Given your level of involvemnt with this particular platform and you
>> being the author of
>> https://github.com/edison-fw/meta-intel-edison/blob/master/meta-intel-edison-bsp/recipes-kernel/linux/files/0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch
>> I assumed/expected you to double check this before sending this revert
>> out. Please do so next time.
> As I said I have not yet restored my testing environment for that platform and
> I relied on the Ferry's report. Taking into account the history of breakages
> that done for Intel Merrifield, in particular by not wide tested patches
> against DWC3 driver, I immediately react with a revert. I agree that I had had
> to think about that ordering issue and a patch on top of it first. I thought
> that Yocto configuration that Ferry is using is clean of custom (controversial)
> patches like that. Now, since you have this platform, you can also help with
> testing the DWC3 on it.
>
It's my fault I'm afraid. My apologies. We have been needing the "Break 
infinite loop" patch at least since v5.10. Without IIRC we can not even 
boot normally or at least we have no dwc3. No way to bisect. In my mind 
I didn't  link that patch to this issue. Other patches were indeed 
removed during bisecting.

But it is interesting that we should be able to drop that patch for 
v6.0, even if that is a side effect. I will try to reproduce and report 
back here.
Ferry Toth Oct. 11, 2022, 6:38 p.m. UTC | #23
Hi,

Op 10-10-2022 om 23:35 schreef Andrey Smirnov:
> On Mon, Oct 10, 2022 at 1:52 PM Ferry Toth <fntoth@gmail.com> wrote:
>>
>> Hi
>>
>> Op 10-10-2022 om 13:04 schreef Ferry Toth:
>>> Hi
>>>
>>> On 10-10-2022 07:02, Andrey Smirnov wrote:
>>>> On Fri, Oct 7, 2022 at 6:07 AM Ferry Toth <fntoth@gmail.com> wrote:
>>>>>
>>>>> On 07-10-2022 04:11, Thinh Nguyen wrote:
>>>>>> On Thu, Oct 06, 2022, Ferry Toth wrote:
>>>>>>> Hi
>>>>>>>
>>>>>>> On 06-10-2022 04:12, Thinh Nguyen wrote:
>>>>>>>> On Wed, Oct 05, 2022, Ferry Toth wrote:
>>>>>>>>> Hi,
>>>>>>>>>
>>>>>>>>>         Thanks!
>>>>>>>>>
>>>>>>>>>         Does the failure only happen the first time host is
>>>>>>>>> initialized? Or can
>>>>>>>>>         it recover after switching to device then back to host mode?
>>>>>>>>>
>>>>>>>>> I can switch back and forth and device mode works each time,
>>>>>>>>> host mode remains
>>>>>>>>> dead.
>>>>>>>> Ok.
>>>>>>>>
>>>>>>>>>         Probably the failure happens if some step(s) in
>>>>>>>>> dwc3_core_init() hasn't
>>>>>>>>>         completed.
>>>>>>>>>
>>>>>>>>>         tusb1210 is a phy driver right? The issue is probably
>>>>>>>>> because we didn't
>>>>>>>>>         initialize the phy yet. So, I suspect placing
>>>>>>>>> dwc3_get_extcon() after
>>>>>>>>>         initializing the phy will probably solve the dependency
>>>>>>>>> problem.
>>>>>>>>>
>>>>>>>>>         You can try something for yourself or I can provide
>>>>>>>>> something to test
>>>>>>>>>         later if you don't mind (maybe next week if it's ok).
>>>>>>>>>
>>>>>>>>> Yes, the code move I mentioned above "moves dwc3_get_extcon()
>>>>>>>>> until after
>>>>>>>>> dwc3_core_init() but just before dwc3_core_init_mode(). AFAIU
>>>>>>>>> initially
>>>>>>>>> dwc3_get_extcon() was called from within dwc3_core_init_mode()
>>>>>>>>> but only for
>>>>>>>>> case USB_DR_MODE_OTG. So with this change order of events is
>>>>>>>>> more or less
>>>>>>>>> unchanged" solves the issue.
>>>>>>>>>
>>>>>>>> I saw the experiment you did from the link you provided. We want
>>>>>>>> to also
>>>>>>>> confirm exactly which step in dwc3_core_init() was needed.
>>>>>>> Ok. I first tried the code move suggested by Andrey (didn't work).
>>>>>>> Then
>>>>>>> after reading the actual code I moved a bit further.
>>>>>>>
>>>>>>> This move was on top of -rc6 without any reverts. I did not make
>>>>>>> additional
>>>>>>> changes to dwc3_core_init()
>>>>>>>
>>>>>>> So current v6.0 has: dwc3_get_extcon - dwc3_get_dr_mode - ... -
>>>>>>> dwc3_core_init - .. - dwc3_core_init_mode (not working)
>>>>>>>
>>>>>>> I changed to: dwc3_get_dr_mode - dwc3_get_extcon - .. -
>>>>>>> dwc3_core_init - ..
>>>>>>> - dwc3_core_init_mode (no change)
>>>>>>>
>>>>>>> Then to: dwc3_get_dr_mode - .. - dwc3_core_init - .. -
>>>>>>> dwc3_get_extcon -
>>>>>>> dwc3_core_init_mode (works)
>>>>>>>
>>>>>>> .. are what I believe for this issue irrelevant calls to
>>>>>>> dwc3_alloc_scratch_buffers, dwc3_check_params and dwc3_debugfs_init.
>>>>>>>
>>>>>> Right. Thanks for narrowing it down. There are still many steps in
>>>>>> dwc3_core_init(). We have some suspicion, but we still haven't
>>>>>> confirmed
>>>>>> the exact cause of the failure. We can write a proper patch once we
>>>>>> know
>>>>>> the reason.
>>>>> If you would like me to test your suspicion, just tell me what to do
>>>>> :-)
>>>>
>>>> OK, Ferry, I think I'm going to need clarification on specifics on
>>>> your test setup. Can you share your kernel config, maybe your
>>>> "/proc/config.gz", somewhere? When you say you are running vanilla
>>>> Linux, do you mean it or do you mean vanilla tree + some patch delta?
>>>
>>> For v6.0 I can get the exacts tonight. But earlier I had this for v5.17:
>>>
>>> https://github.com/htot/meta-intel-edison/blob/master/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_5.17.bb
>>>
>>>
>>> There are 2 patches referred in #67 and #68. One is related to the
>>> infinite loop. The other is I believe also needed to get dwc3 to work.
>>>
>>> All the kernel config are applied as .cfg.
>>>
>>> Patches and cfs's here:
>>>
>>> https://github.com/htot/meta-intel-edison/tree/master/meta-intel-edison-bsp/recipes-kernel/linux/files
>>>
>>
>> Updated Yocto recipe for v6.0 here:
>>
>> https://github.com/htot/meta-intel-edison/blob/honister/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_6.0.bb
>>
>> #75-#77 are the 2 reverts from Andy, + one SOF revert (not related to
>> this thread).
> 
> Please drop all of this
> https://github.com/htot/meta-intel-edison/blob/honister/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_6.0.bb#L69-L77
> and re do the testing. Assuming things are still broken, that's how
> you want to do the bisecting.

I removed 4 patches:
0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch
0044-REVERTME-usb-dwc3-gadget-skip-endpoints-ep-18-in-out.patch
0001-Revert-USB-fixup-for-merge-issue-with-usb-dwc3-Don-t.patch
0001-Revert-usb-dwc3-Don-t-switch-OTG-peripheral-if-extco.patch

and indeed as you expect kernel boots (no infinite loop). However dwc3 
host mode is not working as in your case, device mode works fine (Yocto 
configures a set of gadgets for me).

Just to be sure if I could have bisected without 0043a I added back the 
2 0001-Revert* and indeed I run into the infinite loop with the console 
spitting out continuous:
debugfs: Directory 'dwc3.0.auto' with parent 'ulpi' already present!
tusb1210 dwc3.0.auto.ulpi: error -110 writing val 0x41 to reg 0x80

so yes it seems either 0043b or your patch "usb: dwc3: Don't switch OTG 
-> peripheral if extcon is present" is needed to boot (break the 
infinite loop). But your patch is in my case not sufficient to make host 
mode work.

As I understand it depends a bit on the timing, I might have a different 
initrd (built by Yocto vs. Buildroot). F.i. I see I have 
extcon-intel-mrfld in initrd and dwc3 / phy-tusb1210 built-in.
Andrey Smirnov Oct. 11, 2022, 8:17 p.m. UTC | #24
On Tue, Oct 11, 2022 at 2:21 AM Andy Shevchenko
<andriy.shevchenko@linux.intel.com> wrote:
>
> On Mon, Oct 10, 2022 at 02:40:30PM -0700, Andrey Smirnov wrote:
> > On Mon, Oct 10, 2022 at 12:13 AM Andy Shevchenko
> > <andriy.shevchenko@linux.intel.com> wrote:
> > > On Sun, Oct 09, 2022 at 10:02:26PM -0700, Andrey Smirnov wrote:
> > > > On Fri, Oct 7, 2022 at 6:07 AM Ferry Toth <fntoth@gmail.com> wrote:
>
> ...
>
> > > > OK, Ferry, I think I'm going to need clarification on specifics on
> > > > your test setup. Can you share your kernel config, maybe your
> > > > "/proc/config.gz", somewhere? When you say you are running vanilla
> > > > Linux, do you mean it or do you mean vanilla tree + some patch delta?
> > > >
> > > > The reason I'm asking is because I'm having a hard time reproducing
> > > > the problem on my end. In fact, when I build v6.0
> > > > (4fe89d07dcc2804c8b562f6c7896a45643d34b2f) and then do a
> > > >
> > > > git revert 8bd6b8c4b100 0f0101719138 (original revert proposed by Andy)
> > > >
> > > > I get an infinite loop of reprobing that looks something like (some
> > > > debug tracing, function name + line number, included):
> > >
> > > Yes, this is (one of) known drawback(s) of deferred probe hack. I think
> > > the kernel that Ferry runs has a patch that basically reverts one from
> > > 2014 [1] and allows to have extcon as a module. (1)
> > >
> > > [1]: 58b116bce136 ("drivercore: deferral race condition fix")
> > >
> > > > which renders the system completely unusable, but USB host is
> > > > definitely going to be broken too. Now, ironically, with my patch
> > > > in-place, an attempt to probe extcon that ends up deferring the probe
> > > > happens before the ULPI driver failure (which wasn't failing driver
> > > > probe prior to https://lore.kernel.org/all/20220213130524.18748-7-hdegoede@redhat.com/),
> > > > there no "driver binding" event that re-triggers deferred probe
> > > > causing the loop, so the system progresses to a point where extcon is
> > > > available and dwc3 driver eventually loads.
> > > >
> > > > After that, and I don't know if I'm doing the same test, USB host
> > > > seems to work as expected. lsusb works, my USB stick enumerates as
> > > > expected. Switching the USB mux to micro-USB and back shuts the host
> > > > functionality down and brings it up as expected. Now I didn't try to
> > > > load any gadgets to make sure USB gadget works 100%, but since you
> > > > were saying it was USB host that was broken, I wasn't concerned with
> > > > that. Am I doing the right test?
> > >
> > > Hmm... What you described above sounds more like a yet another attempt to
> > > workaround (1). _If_ this is the case, we probably can discuss how to fix
> > > it in generic way (somewhere in dd.c, rather than in the certain driver).
> >
> > No, I'm not describing an attempt to fix anything. Just how vanilla
> > v6.0 (where my patch is not reverted) works and where my patch, fixing
> > a logical problem in which extcon was requested too late causing a
> > forced OTG -> "gadget only" switch, also changed the ordering enough
> > to accidentally avoid the loop.
>
> You still refer to a fix, but my question was if it's the same problem or not.
>

No, it's not the same problem.

> > > That said, the real test case should be performed on top of clean kernel
> > > before judging if it's good or bad.
> >
> > Given your level of involvemnt with this particular platform and you
> > being the author of
> > https://github.com/edison-fw/meta-intel-edison/blob/master/meta-intel-edison-bsp/recipes-kernel/linux/files/0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch
> > I assumed/expected you to double check this before sending this revert
> > out. Please do so next time.
>
> As I said I have not yet restored my testing environment for that platform and
> I relied on the Ferry's report. Taking into account the history of breakages
> that done for Intel Merrifield, in particular by not wide tested patches
> against DWC3 driver, I immediately react with a revert.

That's what I'm asking you not to do next time. If you don't have time
to restore your testing env or double check Ferry's work, please live
with a revert in your local tree until you do. My time is as valuable
as yours and this revert required much more investigation before it
was submitted. You lived with
https://github.com/edison-fw/meta-intel-edison/blob/master/meta-intel-edison-bsp/recipes-kernel/linux/files/0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch
since 5.10, which apparently was needed to either boot or have dwc3,
so I don't think there is any real urgency.
Andrey Smirnov Oct. 11, 2022, 8:50 p.m. UTC | #25
On Tue, Oct 11, 2022 at 11:54 AM Ferry Toth <fntoth@gmail.com> wrote:
>
> Hi,
>
> Op 10-10-2022 om 23:35 schreef Andrey Smirnov:
> > On Mon, Oct 10, 2022 at 1:52 PM Ferry Toth <fntoth@gmail.com> wrote:
> >>
> >> Hi
> >>
> >> Op 10-10-2022 om 13:04 schreef Ferry Toth:
> >>> Hi
> >>>
> >>> On 10-10-2022 07:02, Andrey Smirnov wrote:
> >>>> On Fri, Oct 7, 2022 at 6:07 AM Ferry Toth <fntoth@gmail.com> wrote:
> >>>>>
> >>>>> On 07-10-2022 04:11, Thinh Nguyen wrote:
> >>>>>> On Thu, Oct 06, 2022, Ferry Toth wrote:
> >>>>>>> Hi
> >>>>>>>
> >>>>>>> On 06-10-2022 04:12, Thinh Nguyen wrote:
> >>>>>>>> On Wed, Oct 05, 2022, Ferry Toth wrote:
> >>>>>>>>> Hi,
> >>>>>>>>>
> >>>>>>>>>         Thanks!
> >>>>>>>>>
> >>>>>>>>>         Does the failure only happen the first time host is
> >>>>>>>>> initialized? Or can
> >>>>>>>>>         it recover after switching to device then back to host mode?
> >>>>>>>>>
> >>>>>>>>> I can switch back and forth and device mode works each time,
> >>>>>>>>> host mode remains
> >>>>>>>>> dead.
> >>>>>>>> Ok.
> >>>>>>>>
> >>>>>>>>>         Probably the failure happens if some step(s) in
> >>>>>>>>> dwc3_core_init() hasn't
> >>>>>>>>>         completed.
> >>>>>>>>>
> >>>>>>>>>         tusb1210 is a phy driver right? The issue is probably
> >>>>>>>>> because we didn't
> >>>>>>>>>         initialize the phy yet. So, I suspect placing
> >>>>>>>>> dwc3_get_extcon() after
> >>>>>>>>>         initializing the phy will probably solve the dependency
> >>>>>>>>> problem.
> >>>>>>>>>
> >>>>>>>>>         You can try something for yourself or I can provide
> >>>>>>>>> something to test
> >>>>>>>>>         later if you don't mind (maybe next week if it's ok).
> >>>>>>>>>
> >>>>>>>>> Yes, the code move I mentioned above "moves dwc3_get_extcon()
> >>>>>>>>> until after
> >>>>>>>>> dwc3_core_init() but just before dwc3_core_init_mode(). AFAIU
> >>>>>>>>> initially
> >>>>>>>>> dwc3_get_extcon() was called from within dwc3_core_init_mode()
> >>>>>>>>> but only for
> >>>>>>>>> case USB_DR_MODE_OTG. So with this change order of events is
> >>>>>>>>> more or less
> >>>>>>>>> unchanged" solves the issue.
> >>>>>>>>>
> >>>>>>>> I saw the experiment you did from the link you provided. We want
> >>>>>>>> to also
> >>>>>>>> confirm exactly which step in dwc3_core_init() was needed.
> >>>>>>> Ok. I first tried the code move suggested by Andrey (didn't work).
> >>>>>>> Then
> >>>>>>> after reading the actual code I moved a bit further.
> >>>>>>>
> >>>>>>> This move was on top of -rc6 without any reverts. I did not make
> >>>>>>> additional
> >>>>>>> changes to dwc3_core_init()
> >>>>>>>
> >>>>>>> So current v6.0 has: dwc3_get_extcon - dwc3_get_dr_mode - ... -
> >>>>>>> dwc3_core_init - .. - dwc3_core_init_mode (not working)
> >>>>>>>
> >>>>>>> I changed to: dwc3_get_dr_mode - dwc3_get_extcon - .. -
> >>>>>>> dwc3_core_init - ..
> >>>>>>> - dwc3_core_init_mode (no change)
> >>>>>>>
> >>>>>>> Then to: dwc3_get_dr_mode - .. - dwc3_core_init - .. -
> >>>>>>> dwc3_get_extcon -
> >>>>>>> dwc3_core_init_mode (works)
> >>>>>>>
> >>>>>>> .. are what I believe for this issue irrelevant calls to
> >>>>>>> dwc3_alloc_scratch_buffers, dwc3_check_params and dwc3_debugfs_init.
> >>>>>>>
> >>>>>> Right. Thanks for narrowing it down. There are still many steps in
> >>>>>> dwc3_core_init(). We have some suspicion, but we still haven't
> >>>>>> confirmed
> >>>>>> the exact cause of the failure. We can write a proper patch once we
> >>>>>> know
> >>>>>> the reason.
> >>>>> If you would like me to test your suspicion, just tell me what to do
> >>>>> :-)
> >>>>
> >>>> OK, Ferry, I think I'm going to need clarification on specifics on
> >>>> your test setup. Can you share your kernel config, maybe your
> >>>> "/proc/config.gz", somewhere? When you say you are running vanilla
> >>>> Linux, do you mean it or do you mean vanilla tree + some patch delta?
> >>>
> >>> For v6.0 I can get the exacts tonight. But earlier I had this for v5.17:
> >>>
> >>> https://github.com/htot/meta-intel-edison/blob/master/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_5.17.bb
> >>>
> >>>
> >>> There are 2 patches referred in #67 and #68. One is related to the
> >>> infinite loop. The other is I believe also needed to get dwc3 to work.
> >>>
> >>> All the kernel config are applied as .cfg.
> >>>
> >>> Patches and cfs's here:
> >>>
> >>> https://github.com/htot/meta-intel-edison/tree/master/meta-intel-edison-bsp/recipes-kernel/linux/files
> >>>
> >>
> >> Updated Yocto recipe for v6.0 here:
> >>
> >> https://github.com/htot/meta-intel-edison/blob/honister/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_6.0.bb
> >>
> >> #75-#77 are the 2 reverts from Andy, + one SOF revert (not related to
> >> this thread).
> >
> > Please drop all of this
> > https://github.com/htot/meta-intel-edison/blob/honister/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_6.0.bb#L69-L77
> > and re do the testing. Assuming things are still broken, that's how
> > you want to do the bisecting.
>
> I removed 4 patches:
> 0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch
> 0044-REVERTME-usb-dwc3-gadget-skip-endpoints-ep-18-in-out.patch
> 0001-Revert-USB-fixup-for-merge-issue-with-usb-dwc3-Don-t.patch
> 0001-Revert-usb-dwc3-Don-t-switch-OTG-peripheral-if-extco.patch

Please remove all custom patches so we are on the same page. I don't
suspect the 8250 related changes to affect anything, but I also would
like to be testing the same thing. I'm testing vanilla v6.0

>
> and indeed as you expect kernel boots (no infinite loop). However dwc3
> host mode is not working as in your case, device mode works fine (Yocto
> configures a set of gadgets for me).

What do you do to test host mode working? lsusb? Something else?
Asking to make sure I'm doing something equivalent on my end.

>
> Just to be sure if I could have bisected without 0043a I added back the
> 2 0001-Revert* and indeed I run into the infinite loop with the console
> spitting out continuous:
> debugfs: Directory 'dwc3.0.auto' with parent 'ulpi' already present!
> tusb1210 dwc3.0.auto.ulpi: error -110 writing val 0x41 to reg 0x80
>
> so yes it seems either 0043b or your patch "usb: dwc3: Don't switch OTG
> -> peripheral if extcon is present" is needed to boot (break the
> infinite loop). But your patch is in my case not sufficient to make host
> mode work.
>

Next step would be to establish if USB is working before my patch. You
should be able to avoid the boot loop if you disable the
"phy-tusb1210" driver. The driver fails to probe anyway, so it's not
very likely to be crucial for functioning, so it should allow you to
try things with my patch reverted:

git revert 8bd6b8c4b100 0f0101719138

After that, if things start working, it'd make sense to re-do your
function re-arranging experiment to re-validate it.

> As I understand it depends a bit on the timing, I might have a different
> initrd (built by Yocto vs. Buildroot). F.i. I see I have
> extcon-intel-mrfld in initrd and dwc3 / phy-tusb1210 built-in.
>

You mentioned that your rootfs image does some gadget configuration
for you. Can this be disabled? If yes, it'd make sense to check if
this could be a variable explaining the difference.

What U-Boot version are you running? AFACT U-Boot will touch that
particular IP block, so this might be somewhat relevant.
Ferry Toth Oct. 12, 2022, 9:30 a.m. UTC | #26
Hi

On 11-10-2022 22:50, Andrey Smirnov wrote:
> On Tue, Oct 11, 2022 at 11:54 AM Ferry Toth <fntoth@gmail.com> wrote:
>> Hi,
>>
>> Op 10-10-2022 om 23:35 schreef Andrey Smirnov:
>>> On Mon, Oct 10, 2022 at 1:52 PM Ferry Toth <fntoth@gmail.com> wrote:
>>>> Hi
>>>>
>>>> Op 10-10-2022 om 13:04 schreef Ferry Toth:
>>>>> Hi
>>>>>
>>>>> On 10-10-2022 07:02, Andrey Smirnov wrote:
>>>>>> On Fri, Oct 7, 2022 at 6:07 AM Ferry Toth <fntoth@gmail.com> wrote:
>>>>>>> On 07-10-2022 04:11, Thinh Nguyen wrote:
>>>>>>>> On Thu, Oct 06, 2022, Ferry Toth wrote:
>>>>>>>>> Hi
>>>>>>>>>
>>>>>>>>> On 06-10-2022 04:12, Thinh Nguyen wrote:
>>>>>>>>>> On Wed, Oct 05, 2022, Ferry Toth wrote:
>>>>>>>>>>> Hi,
>>>>>>>>>>>
>>>>>>>>>>>          Thanks!
>>>>>>>>>>>
>>>>>>>>>>>          Does the failure only happen the first time host is
>>>>>>>>>>> initialized? Or can
>>>>>>>>>>>          it recover after switching to device then back to host mode?
>>>>>>>>>>>
>>>>>>>>>>> I can switch back and forth and device mode works each time,
>>>>>>>>>>> host mode remains
>>>>>>>>>>> dead.
>>>>>>>>>> Ok.
>>>>>>>>>>
>>>>>>>>>>>          Probably the failure happens if some step(s) in
>>>>>>>>>>> dwc3_core_init() hasn't
>>>>>>>>>>>          completed.
>>>>>>>>>>>
>>>>>>>>>>>          tusb1210 is a phy driver right? The issue is probably
>>>>>>>>>>> because we didn't
>>>>>>>>>>>          initialize the phy yet. So, I suspect placing
>>>>>>>>>>> dwc3_get_extcon() after
>>>>>>>>>>>          initializing the phy will probably solve the dependency
>>>>>>>>>>> problem.
>>>>>>>>>>>
>>>>>>>>>>>          You can try something for yourself or I can provide
>>>>>>>>>>> something to test
>>>>>>>>>>>          later if you don't mind (maybe next week if it's ok).
>>>>>>>>>>>
>>>>>>>>>>> Yes, the code move I mentioned above "moves dwc3_get_extcon()
>>>>>>>>>>> until after
>>>>>>>>>>> dwc3_core_init() but just before dwc3_core_init_mode(). AFAIU
>>>>>>>>>>> initially
>>>>>>>>>>> dwc3_get_extcon() was called from within dwc3_core_init_mode()
>>>>>>>>>>> but only for
>>>>>>>>>>> case USB_DR_MODE_OTG. So with this change order of events is
>>>>>>>>>>> more or less
>>>>>>>>>>> unchanged" solves the issue.
>>>>>>>>>>>
>>>>>>>>>> I saw the experiment you did from the link you provided. We want
>>>>>>>>>> to also
>>>>>>>>>> confirm exactly which step in dwc3_core_init() was needed.
>>>>>>>>> Ok. I first tried the code move suggested by Andrey (didn't work).
>>>>>>>>> Then
>>>>>>>>> after reading the actual code I moved a bit further.
>>>>>>>>>
>>>>>>>>> This move was on top of -rc6 without any reverts. I did not make
>>>>>>>>> additional
>>>>>>>>> changes to dwc3_core_init()
>>>>>>>>>
>>>>>>>>> So current v6.0 has: dwc3_get_extcon - dwc3_get_dr_mode - ... -
>>>>>>>>> dwc3_core_init - .. - dwc3_core_init_mode (not working)
>>>>>>>>>
>>>>>>>>> I changed to: dwc3_get_dr_mode - dwc3_get_extcon - .. -
>>>>>>>>> dwc3_core_init - ..
>>>>>>>>> - dwc3_core_init_mode (no change)
>>>>>>>>>
>>>>>>>>> Then to: dwc3_get_dr_mode - .. - dwc3_core_init - .. -
>>>>>>>>> dwc3_get_extcon -
>>>>>>>>> dwc3_core_init_mode (works)
>>>>>>>>>
>>>>>>>>> .. are what I believe for this issue irrelevant calls to
>>>>>>>>> dwc3_alloc_scratch_buffers, dwc3_check_params and dwc3_debugfs_init.
>>>>>>>>>
>>>>>>>> Right. Thanks for narrowing it down. There are still many steps in
>>>>>>>> dwc3_core_init(). We have some suspicion, but we still haven't
>>>>>>>> confirmed
>>>>>>>> the exact cause of the failure. We can write a proper patch once we
>>>>>>>> know
>>>>>>>> the reason.
>>>>>>> If you would like me to test your suspicion, just tell me what to do
>>>>>>> :-)
>>>>>> OK, Ferry, I think I'm going to need clarification on specifics on
>>>>>> your test setup. Can you share your kernel config, maybe your
>>>>>> "/proc/config.gz", somewhere? When you say you are running vanilla
>>>>>> Linux, do you mean it or do you mean vanilla tree + some patch delta?
>>>>> For v6.0 I can get the exacts tonight. But earlier I had this for v5.17:
>>>>>
>>>>> https://github.com/htot/meta-intel-edison/blob/master/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_5.17.bb
>>>>>
>>>>>
>>>>> There are 2 patches referred in #67 and #68. One is related to the
>>>>> infinite loop. The other is I believe also needed to get dwc3 to work.
>>>>>
>>>>> All the kernel config are applied as .cfg.
>>>>>
>>>>> Patches and cfs's here:
>>>>>
>>>>> https://github.com/htot/meta-intel-edison/tree/master/meta-intel-edison-bsp/recipes-kernel/linux/files
>>>>>
>>>> Updated Yocto recipe for v6.0 here:
>>>>
>>>> https://github.com/htot/meta-intel-edison/blob/honister/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_6.0.bb
>>>>
>>>> #75-#77 are the 2 reverts from Andy, + one SOF revert (not related to
>>>> this thread).
>>> Please drop all of this
>>> https://github.com/htot/meta-intel-edison/blob/honister/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_6.0.bb#L69-L77
>>> and re do the testing. Assuming things are still broken, that's how
>>> you want to do the bisecting.
>> I removed 4 patches:
>> 0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch
>> 0044-REVERTME-usb-dwc3-gadget-skip-endpoints-ep-18-in-out.patch
>> 0001-Revert-USB-fixup-for-merge-issue-with-usb-dwc3-Don-t.patch
>> 0001-Revert-usb-dwc3-Don-t-switch-OTG-peripheral-if-extco.patch
> Please remove all custom patches so we are on the same page. I don't
> suspect the 8250 related changes to affect anything, but I also would
> like to be testing the same thing. I'm testing vanilla v6.0
Alright, but don't expect any change. The 8250 patches are related to 
using DMA for the serial ports (except the console). It may affect 
bluetooth, the serial port on the arduino connector, but not the console.
>> and indeed as you expect kernel boots (no infinite loop). However dwc3
>> host mode is not working as in your case, device mode works fine (Yocto
>> configures a set of gadgets for me).
> What do you do to test host mode working? lsusb? Something else?
> Asking to make sure I'm doing something equivalent on my end.
>
I have a smsc95xx 4p usb hub with 1 eth port continuously plugged. It 
has leds on all ports so when it works it lights up like a Christmas tree.

But I also tried plugging a usb stick.

It maybe that lsusb is not enough. Iirc the root hub is there, but the 
tusb1210 not and then device plugs are not detected. So in my case none 
of the leds on the hub turn on.

>> Just to be sure if I could have bisected without 0043a I added back the
>> 2 0001-Revert* and indeed I run into the infinite loop with the console
>> spitting out continuous:
>> debugfs: Directory 'dwc3.0.auto' with parent 'ulpi' already present!
>> tusb1210 dwc3.0.auto.ulpi: error -110 writing val 0x41 to reg 0x80
>>
>> so yes it seems either 0043b or your patch "usb: dwc3: Don't switch OTG
>> -> peripheral if extcon is present" is needed to boot (break the
>> infinite loop). But your patch is in my case not sufficient to make host
>> mode work.
>>
> Next step would be to establish if USB is working before my patch. You
> should be able to avoid the boot loop if you disable the
> "phy-tusb1210" driver. The driver fails to probe anyway, so it's not
> very likely to be crucial for functioning, so it should allow you to
> try things with my patch reverted:

You lost me here. With "boot loop" you mean "probe loop" right? Why do 
you think the tusb1210 driver is not crucial?

See "phy: ti: tusb1210: Don't check for write errors when powering on"

It should not be failing to probe (and with Andy's "Break-infinite-loop" 
patch is doesn't) as without the tusb1210 usb host mode won't work as 
device plugs are not detected.

Earlier in this thread we had:

"The effect of the patch is that on Merrifield (I tested with Intel 
Edison Arduino board which has a HW switch to select between host and 
device mode) device mode works but in host mode USB is completely not 
working.

Currently on host mode - when working - superfluous error messages from 
tusb1210 appear. When host mode is not working there are no tusb1210 
messages in the logs / on the console at all. Seemingly tusb1210 is not 
probed, which points in the direction of a relation to extcon."

> git revert 8bd6b8c4b100 0f0101719138
>
> After that, if things start working, it'd make sense to re-do your
> function re-arranging experiment to re-validate it.
>
>> As I understand it depends a bit on the timing, I might have a different
>> initrd (built by Yocto vs. Buildroot). F.i. I see I have
>> extcon-intel-mrfld in initrd and dwc3 / phy-tusb1210 built-in.
>>
> You mentioned that your rootfs image does some gadget configuration
> for you. Can this be disabled? If yes, it'd make sense to check if
> this could be a variable explaining the difference.
This is done through configfs only when the switch is set to device mode.
> What U-Boot version are you running? AFACT U-Boot will touch that
> particular IP block, so this might be somewhat relevant.
IIRC if have v2022.04 but tested v2021.10 earlier (no difference).
Andy Shevchenko Oct. 12, 2022, 10:32 a.m. UTC | #27
On Tue, Oct 11, 2022 at 01:17:13PM -0700, Andrey Smirnov wrote:
> On Tue, Oct 11, 2022 at 2:21 AM Andy Shevchenko
> <andriy.shevchenko@linux.intel.com> wrote:
> > On Mon, Oct 10, 2022 at 02:40:30PM -0700, Andrey Smirnov wrote:
> > > On Mon, Oct 10, 2022 at 12:13 AM Andy Shevchenko
> > > <andriy.shevchenko@linux.intel.com> wrote:
> > > > On Sun, Oct 09, 2022 at 10:02:26PM -0700, Andrey Smirnov wrote:
> > > > > On Fri, Oct 7, 2022 at 6:07 AM Ferry Toth <fntoth@gmail.com> wrote:

...

> > > > > OK, Ferry, I think I'm going to need clarification on specifics on
> > > > > your test setup. Can you share your kernel config, maybe your
> > > > > "/proc/config.gz", somewhere? When you say you are running vanilla
> > > > > Linux, do you mean it or do you mean vanilla tree + some patch delta?
> > > > >
> > > > > The reason I'm asking is because I'm having a hard time reproducing
> > > > > the problem on my end. In fact, when I build v6.0
> > > > > (4fe89d07dcc2804c8b562f6c7896a45643d34b2f) and then do a
> > > > >
> > > > > git revert 8bd6b8c4b100 0f0101719138 (original revert proposed by Andy)
> > > > >
> > > > > I get an infinite loop of reprobing that looks something like (some
> > > > > debug tracing, function name + line number, included):
> > > >
> > > > Yes, this is (one of) known drawback(s) of deferred probe hack. I think
> > > > the kernel that Ferry runs has a patch that basically reverts one from
> > > > 2014 [1] and allows to have extcon as a module. (1)
> > > >
> > > > [1]: 58b116bce136 ("drivercore: deferral race condition fix")
> > > >
> > > > > which renders the system completely unusable, but USB host is
> > > > > definitely going to be broken too. Now, ironically, with my patch
> > > > > in-place, an attempt to probe extcon that ends up deferring the probe
> > > > > happens before the ULPI driver failure (which wasn't failing driver
> > > > > probe prior to https://lore.kernel.org/all/20220213130524.18748-7-hdegoede@redhat.com/),
> > > > > there no "driver binding" event that re-triggers deferred probe
> > > > > causing the loop, so the system progresses to a point where extcon is
> > > > > available and dwc3 driver eventually loads.
> > > > >
> > > > > After that, and I don't know if I'm doing the same test, USB host
> > > > > seems to work as expected. lsusb works, my USB stick enumerates as
> > > > > expected. Switching the USB mux to micro-USB and back shuts the host
> > > > > functionality down and brings it up as expected. Now I didn't try to
> > > > > load any gadgets to make sure USB gadget works 100%, but since you
> > > > > were saying it was USB host that was broken, I wasn't concerned with
> > > > > that. Am I doing the right test?
> > > >
> > > > Hmm... What you described above sounds more like a yet another attempt to
> > > > workaround (1). _If_ this is the case, we probably can discuss how to fix
> > > > it in generic way (somewhere in dd.c, rather than in the certain driver).
> > >
> > > No, I'm not describing an attempt to fix anything. Just how vanilla
> > > v6.0 (where my patch is not reverted) works and where my patch, fixing
> > > a logical problem in which extcon was requested too late causing a
> > > forced OTG -> "gadget only" switch, also changed the ordering enough
> > > to accidentally avoid the loop.
> >
> > You still refer to a fix, but my question was if it's the same problem or not.
> >
> 
> No, it's not the same problem.
> 
> > > > That said, the real test case should be performed on top of clean kernel
> > > > before judging if it's good or bad.
> > >
> > > Given your level of involvemnt with this particular platform and you
> > > being the author of
> > > https://github.com/edison-fw/meta-intel-edison/blob/master/meta-intel-edison-bsp/recipes-kernel/linux/files/0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch
> > > I assumed/expected you to double check this before sending this revert
> > > out. Please do so next time.
> >
> > As I said I have not yet restored my testing environment for that platform and
> > I relied on the Ferry's report. Taking into account the history of breakages
> > that done for Intel Merrifield, in particular by not wide tested patches
> > against DWC3 driver, I immediately react with a revert.
> 
> That's what I'm asking you not to do next time. If you don't have time
> to restore your testing env or double check Ferry's work, please live
> with a revert in your local tree until you do.

I trust Ferry's tests as mine and repeating again, we have a bad history
when people so value their time that breaks our platform, so please test
your changes in the future that it makes no regressions.

If you want to have a proof that your patches are broken, then I will
prioritize this. We now have a full release cycle time for that.

> My time is as valuable
> as yours and this revert required much more investigation before it
> was submitted. You lived with
> https://github.com/edison-fw/meta-intel-edison/blob/master/meta-intel-edison-bsp/recipes-kernel/linux/files/0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch
> since 5.10, which apparently was needed to either boot or have dwc3,
> so I don't think there is any real urgency.

It is in my tree only for the purpose of "don't forget that issue".
I think you can work around it by built-in extcon driver.
Ferry Toth Oct. 12, 2022, 8:34 p.m. UTC | #28
Hi

Op 12-10-2022 om 11:30 schreef Ferry Toth:
> Hi
> 
> On 11-10-2022 22:50, Andrey Smirnov wrote:
>> On Tue, Oct 11, 2022 at 11:54 AM Ferry Toth <fntoth@gmail.com> wrote:
>>> Hi,
>>>
>>> Op 10-10-2022 om 23:35 schreef Andrey Smirnov:
>>>> On Mon, Oct 10, 2022 at 1:52 PM Ferry Toth <fntoth@gmail.com> wrote:
>>>>> Hi
>>>>>
>>>>> Op 10-10-2022 om 13:04 schreef Ferry Toth:
>>>>>> Hi
>>>>>>
>>>>>> On 10-10-2022 07:02, Andrey Smirnov wrote:
>>>>>>> On Fri, Oct 7, 2022 at 6:07 AM Ferry Toth <fntoth@gmail.com> wrote:
>>>>>>>> On 07-10-2022 04:11, Thinh Nguyen wrote:
>>>>>>>>> On Thu, Oct 06, 2022, Ferry Toth wrote:
>>>>>>>>>> Hi
>>>>>>>>>>
>>>>>>>>>> On 06-10-2022 04:12, Thinh Nguyen wrote:
>>>>>>>>>>> On Wed, Oct 05, 2022, Ferry Toth wrote:
>>>>>>>>>>>> Hi,
>>>>>>>>>>>>
>>>>>>>>>>>>          Thanks!
>>>>>>>>>>>>
>>>>>>>>>>>>          Does the failure only happen the first time host is
>>>>>>>>>>>> initialized? Or can
>>>>>>>>>>>>          it recover after switching to device then back to 
>>>>>>>>>>>> host mode?
>>>>>>>>>>>>
>>>>>>>>>>>> I can switch back and forth and device mode works each time,
>>>>>>>>>>>> host mode remains
>>>>>>>>>>>> dead.
>>>>>>>>>>> Ok.
>>>>>>>>>>>
>>>>>>>>>>>>          Probably the failure happens if some step(s) in
>>>>>>>>>>>> dwc3_core_init() hasn't
>>>>>>>>>>>>          completed.
>>>>>>>>>>>>
>>>>>>>>>>>>          tusb1210 is a phy driver right? The issue is probably
>>>>>>>>>>>> because we didn't
>>>>>>>>>>>>          initialize the phy yet. So, I suspect placing
>>>>>>>>>>>> dwc3_get_extcon() after
>>>>>>>>>>>>          initializing the phy will probably solve the 
>>>>>>>>>>>> dependency
>>>>>>>>>>>> problem.
>>>>>>>>>>>>
>>>>>>>>>>>>          You can try something for yourself or I can provide
>>>>>>>>>>>> something to test
>>>>>>>>>>>>          later if you don't mind (maybe next week if it's ok).
>>>>>>>>>>>>
>>>>>>>>>>>> Yes, the code move I mentioned above "moves dwc3_get_extcon()
>>>>>>>>>>>> until after
>>>>>>>>>>>> dwc3_core_init() but just before dwc3_core_init_mode(). AFAIU
>>>>>>>>>>>> initially
>>>>>>>>>>>> dwc3_get_extcon() was called from within dwc3_core_init_mode()
>>>>>>>>>>>> but only for
>>>>>>>>>>>> case USB_DR_MODE_OTG. So with this change order of events is
>>>>>>>>>>>> more or less
>>>>>>>>>>>> unchanged" solves the issue.
>>>>>>>>>>>>
>>>>>>>>>>> I saw the experiment you did from the link you provided. We want
>>>>>>>>>>> to also
>>>>>>>>>>> confirm exactly which step in dwc3_core_init() was needed.
>>>>>>>>>> Ok. I first tried the code move suggested by Andrey (didn't 
>>>>>>>>>> work).
>>>>>>>>>> Then
>>>>>>>>>> after reading the actual code I moved a bit further.
>>>>>>>>>>
>>>>>>>>>> This move was on top of -rc6 without any reverts. I did not make
>>>>>>>>>> additional
>>>>>>>>>> changes to dwc3_core_init()
>>>>>>>>>>
>>>>>>>>>> So current v6.0 has: dwc3_get_extcon - dwc3_get_dr_mode - ... -
>>>>>>>>>> dwc3_core_init - .. - dwc3_core_init_mode (not working)
>>>>>>>>>>
>>>>>>>>>> I changed to: dwc3_get_dr_mode - dwc3_get_extcon - .. -
>>>>>>>>>> dwc3_core_init - ..
>>>>>>>>>> - dwc3_core_init_mode (no change)
>>>>>>>>>>
>>>>>>>>>> Then to: dwc3_get_dr_mode - .. - dwc3_core_init - .. -
>>>>>>>>>> dwc3_get_extcon -
>>>>>>>>>> dwc3_core_init_mode (works)
>>>>>>>>>>
>>>>>>>>>> .. are what I believe for this issue irrelevant calls to
>>>>>>>>>> dwc3_alloc_scratch_buffers, dwc3_check_params and 
>>>>>>>>>> dwc3_debugfs_init.
>>>>>>>>>>
>>>>>>>>> Right. Thanks for narrowing it down. There are still many steps in
>>>>>>>>> dwc3_core_init(). We have some suspicion, but we still haven't
>>>>>>>>> confirmed
>>>>>>>>> the exact cause of the failure. We can write a proper patch 
>>>>>>>>> once we
>>>>>>>>> know
>>>>>>>>> the reason.
>>>>>>>> If you would like me to test your suspicion, just tell me what 
>>>>>>>> to do
>>>>>>>> :-)
>>>>>>> OK, Ferry, I think I'm going to need clarification on specifics on
>>>>>>> your test setup. Can you share your kernel config, maybe your
>>>>>>> "/proc/config.gz", somewhere? When you say you are running vanilla
>>>>>>> Linux, do you mean it or do you mean vanilla tree + some patch 
>>>>>>> delta?
>>>>>> For v6.0 I can get the exacts tonight. But earlier I had this for 
>>>>>> v5.17:
>>>>>>
>>>>>> https://github.com/htot/meta-intel-edison/blob/master/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_5.17.bb
>>>>>>
>>>>>>
>>>>>> There are 2 patches referred in #67 and #68. One is related to the
>>>>>> infinite loop. The other is I believe also needed to get dwc3 to 
>>>>>> work.
>>>>>>
>>>>>> All the kernel config are applied as .cfg.
>>>>>>
>>>>>> Patches and cfs's here:
>>>>>>
>>>>>> https://github.com/htot/meta-intel-edison/tree/master/meta-intel-edison-bsp/recipes-kernel/linux/files
>>>>>>
>>>>> Updated Yocto recipe for v6.0 here:
>>>>>
>>>>> https://github.com/htot/meta-intel-edison/blob/honister/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_6.0.bb
>>>>>
>>>>> #75-#77 are the 2 reverts from Andy, + one SOF revert (not related to
>>>>> this thread).
>>>> Please drop all of this
>>>> https://github.com/htot/meta-intel-edison/blob/honister/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_6.0.bb#L69-L77
>>>> and re do the testing. Assuming things are still broken, that's how
>>>> you want to do the bisecting.
>>> I removed 4 patches:
>>> 0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch
>>> 0044-REVERTME-usb-dwc3-gadget-skip-endpoints-ep-18-in-out.patch
>>> 0001-Revert-USB-fixup-for-merge-issue-with-usb-dwc3-Don-t.patch
>>> 0001-Revert-usb-dwc3-Don-t-switch-OTG-peripheral-if-extco.patch
>> Please remove all custom patches so we are on the same page. I don't
>> suspect the 8250 related changes to affect anything, but I also would
>> like to be testing the same thing. I'm testing vanilla v6.0
> Alright, but don't expect any change. The 8250 patches are related to 
> using DMA for the serial ports (except the console). It may affect 
> bluetooth, the serial port on the arduino connector, but not the console.
>>> and indeed as you expect kernel boots (no infinite loop). However dwc3
>>> host mode is not working as in your case, device mode works fine (Yocto
>>> configures a set of gadgets for me).

With vanilla v6.0 there is no probe loop but still host mode does not work.



>> What do you do to test host mode working? lsusb? Something else?
>> Asking to make sure I'm doing something equivalent on my end.

root@yuna:~# lsusb
Bus 002 Device 001: ID 1d6b:0003 Linux Foundation 3.0 root hub
Bus 001 Device 001: ID 1d6b:0002 Linux Foundation 2.0 root hub

This is with smsc95xx plugged in - no leds on except power/

> I have a smsc95xx 4p usb hub with 1 eth port continuously plugged. It 
> has leds on all ports so when it works it lights up like a Christmas tree.
> 
> But I also tried plugging a usb stick.
> 
> It maybe that lsusb is not enough. Iirc the root hub is there, but the 
> tusb1210 not and then device plugs are not detected. So in my case none 
> of the leds on the hub turn on.

root@yuna:~# dmesg | grep -i tusb
root@yuna:~#

>>> Just to be sure if I could have bisected without 0043a I added back the
>>> 2 0001-Revert* and indeed I run into the infinite loop with the console
>>> spitting out continuous:
>>> debugfs: Directory 'dwc3.0.auto' with parent 'ulpi' already present!
>>> tusb1210 dwc3.0.auto.ulpi: error -110 writing val 0x41 to reg 0x80
>>>
>>> so yes it seems either 0043b or your patch "usb: dwc3: Don't switch OTG
>>> -> peripheral if extcon is present" is needed to boot (break the
>>> infinite loop). But your patch is in my case not sufficient to make host
>>> mode work.
>>>
>> Next step would be to establish if USB is working before my patch. You
>> should be able to avoid the boot loop if you disable the
>> "phy-tusb1210" driver. The driver fails to probe anyway, so it's not
>> very likely to be crucial for functioning, so it should allow you to
>> try things with my patch reverted:
> 
> You lost me here. With "boot loop" you mean "probe loop" right? Why do 
> you think the tusb1210 driver is not crucial?

Nevertheless tried: with tusb1210 disabled and your patch reverted
#SRC_URI:append = " file://0038-enable-PHY_TUSB1210.cfg"
SRC_URI:append = " 
file://0001-Revert-USB-fixup-for-merge-issue-with-usb-dwc3-Don-t.patch"
SRC_URI:append = " 
file://0001-Revert-usb-dwc3-Don-t-switch-OTG-peripheral-if-extco.patch"

there is indeed no probe loop as you expect, but host mode still does 
not work (device mode still works). We need the tusb1210 in host mode.

Earlier you asked for my config, here it is: 
https://drive.google.com/file/d/1aKJWMqiAXnReeLCvxshzjKwGxIWQ7eJk/view?usp=sharing

> See "phy: ti: tusb1210: Don't check for write errors when powering on"
> 
> It should not be failing to probe (and with Andy's "Break-infinite-loop" 
> patch is doesn't) as without the tusb1210 usb host mode won't work as 
> device plugs are not detected.
> 
> Earlier in this thread we had:
> 
> "The effect of the patch is that on Merrifield (I tested with Intel 
> Edison Arduino board which has a HW switch to select between host and 
> device mode) device mode works but in host mode USB is completely not 
> working.
> 
> Currently on host mode - when working - superfluous error messages from 
> tusb1210 appear. When host mode is not working there are no tusb1210 
> messages in the logs / on the console at all. Seemingly tusb1210 is not 
> probed, which points in the direction of a relation to extcon."
> 
>> git revert 8bd6b8c4b100 0f0101719138
>>
>> After that, if things start working, it'd make sense to re-do your
>> function re-arranging experiment to re-validate it.
>>
>>> As I understand it depends a bit on the timing, I might have a different
>>> initrd (built by Yocto vs. Buildroot). F.i. I see I have
>>> extcon-intel-mrfld in initrd and dwc3 / phy-tusb1210 built-in.
>>>
>> You mentioned that your rootfs image does some gadget configuration
>> for you. Can this be disabled? If yes, it'd make sense to check if
>> this could be a variable explaining the difference.

I notice when flipping switch to device mode, gadgets pop up. Then 
switching back to host, console (and dmesg) show:
root@yuna:~# dwc3 dwc3.0.auto: request 000000004e7f118e was not queued 
to ep5in
dwc3 dwc3.0.auto: request 000000003c6215ba was not queued to ep4out
dwc3 dwc3.0.auto: request 000000005270315b was not queued to ep4out
dwc3 dwc3.0.auto: request 000000001d456f53 was not queued to ep6in
dwc3 dwc3.0.auto: request 000000001f17ddc6 was not queued to ep6in

This is new and caused by dropping "REVERTME: usb: dwc3: gadget: skip 
endpoints ep[18]{in,out}". I think we need to keep this one.

> This is done through configfs only when the switch is set to device mode.
>> What U-Boot version are you running? AFACT U-Boot will touch that
>> particular IP block, so this might be somewhat relevant.
> IIRC if have v2022.04 but tested v2021.10 earlier (no difference).

I am indeed on v2022.04 with 1 patch on top "REVERTME: usb: dwc3: 
gadget: skip endpoints ep[18]{in,out}"
Andrey Smirnov Oct. 12, 2022, 9:43 p.m. UTC | #29
On Wed, Oct 12, 2022 at 2:30 AM Ferry Toth <fntoth@gmail.com> wrote:
>
> Hi
>
> On 11-10-2022 22:50, Andrey Smirnov wrote:
> > On Tue, Oct 11, 2022 at 11:54 AM Ferry Toth <fntoth@gmail.com> wrote:
> >> Hi,
> >>
> >> Op 10-10-2022 om 23:35 schreef Andrey Smirnov:
> >>> On Mon, Oct 10, 2022 at 1:52 PM Ferry Toth <fntoth@gmail.com> wrote:
> >>>> Hi
> >>>>
> >>>> Op 10-10-2022 om 13:04 schreef Ferry Toth:
> >>>>> Hi
> >>>>>
> >>>>> On 10-10-2022 07:02, Andrey Smirnov wrote:
> >>>>>> On Fri, Oct 7, 2022 at 6:07 AM Ferry Toth <fntoth@gmail.com> wrote:
> >>>>>>> On 07-10-2022 04:11, Thinh Nguyen wrote:
> >>>>>>>> On Thu, Oct 06, 2022, Ferry Toth wrote:
> >>>>>>>>> Hi
> >>>>>>>>>
> >>>>>>>>> On 06-10-2022 04:12, Thinh Nguyen wrote:
> >>>>>>>>>> On Wed, Oct 05, 2022, Ferry Toth wrote:
> >>>>>>>>>>> Hi,
> >>>>>>>>>>>
> >>>>>>>>>>>          Thanks!
> >>>>>>>>>>>
> >>>>>>>>>>>          Does the failure only happen the first time host is
> >>>>>>>>>>> initialized? Or can
> >>>>>>>>>>>          it recover after switching to device then back to host mode?
> >>>>>>>>>>>
> >>>>>>>>>>> I can switch back and forth and device mode works each time,
> >>>>>>>>>>> host mode remains
> >>>>>>>>>>> dead.
> >>>>>>>>>> Ok.
> >>>>>>>>>>
> >>>>>>>>>>>          Probably the failure happens if some step(s) in
> >>>>>>>>>>> dwc3_core_init() hasn't
> >>>>>>>>>>>          completed.
> >>>>>>>>>>>
> >>>>>>>>>>>          tusb1210 is a phy driver right? The issue is probably
> >>>>>>>>>>> because we didn't
> >>>>>>>>>>>          initialize the phy yet. So, I suspect placing
> >>>>>>>>>>> dwc3_get_extcon() after
> >>>>>>>>>>>          initializing the phy will probably solve the dependency
> >>>>>>>>>>> problem.
> >>>>>>>>>>>
> >>>>>>>>>>>          You can try something for yourself or I can provide
> >>>>>>>>>>> something to test
> >>>>>>>>>>>          later if you don't mind (maybe next week if it's ok).
> >>>>>>>>>>>
> >>>>>>>>>>> Yes, the code move I mentioned above "moves dwc3_get_extcon()
> >>>>>>>>>>> until after
> >>>>>>>>>>> dwc3_core_init() but just before dwc3_core_init_mode(). AFAIU
> >>>>>>>>>>> initially
> >>>>>>>>>>> dwc3_get_extcon() was called from within dwc3_core_init_mode()
> >>>>>>>>>>> but only for
> >>>>>>>>>>> case USB_DR_MODE_OTG. So with this change order of events is
> >>>>>>>>>>> more or less
> >>>>>>>>>>> unchanged" solves the issue.
> >>>>>>>>>>>
> >>>>>>>>>> I saw the experiment you did from the link you provided. We want
> >>>>>>>>>> to also
> >>>>>>>>>> confirm exactly which step in dwc3_core_init() was needed.
> >>>>>>>>> Ok. I first tried the code move suggested by Andrey (didn't work).
> >>>>>>>>> Then
> >>>>>>>>> after reading the actual code I moved a bit further.
> >>>>>>>>>
> >>>>>>>>> This move was on top of -rc6 without any reverts. I did not make
> >>>>>>>>> additional
> >>>>>>>>> changes to dwc3_core_init()
> >>>>>>>>>
> >>>>>>>>> So current v6.0 has: dwc3_get_extcon - dwc3_get_dr_mode - ... -
> >>>>>>>>> dwc3_core_init - .. - dwc3_core_init_mode (not working)
> >>>>>>>>>
> >>>>>>>>> I changed to: dwc3_get_dr_mode - dwc3_get_extcon - .. -
> >>>>>>>>> dwc3_core_init - ..
> >>>>>>>>> - dwc3_core_init_mode (no change)
> >>>>>>>>>
> >>>>>>>>> Then to: dwc3_get_dr_mode - .. - dwc3_core_init - .. -
> >>>>>>>>> dwc3_get_extcon -
> >>>>>>>>> dwc3_core_init_mode (works)
> >>>>>>>>>
> >>>>>>>>> .. are what I believe for this issue irrelevant calls to
> >>>>>>>>> dwc3_alloc_scratch_buffers, dwc3_check_params and dwc3_debugfs_init.
> >>>>>>>>>
> >>>>>>>> Right. Thanks for narrowing it down. There are still many steps in
> >>>>>>>> dwc3_core_init(). We have some suspicion, but we still haven't
> >>>>>>>> confirmed
> >>>>>>>> the exact cause of the failure. We can write a proper patch once we
> >>>>>>>> know
> >>>>>>>> the reason.
> >>>>>>> If you would like me to test your suspicion, just tell me what to do
> >>>>>>> :-)
> >>>>>> OK, Ferry, I think I'm going to need clarification on specifics on
> >>>>>> your test setup. Can you share your kernel config, maybe your
> >>>>>> "/proc/config.gz", somewhere? When you say you are running vanilla
> >>>>>> Linux, do you mean it or do you mean vanilla tree + some patch delta?
> >>>>> For v6.0 I can get the exacts tonight. But earlier I had this for v5.17:
> >>>>>
> >>>>> https://github.com/htot/meta-intel-edison/blob/master/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_5.17.bb
> >>>>>
> >>>>>
> >>>>> There are 2 patches referred in #67 and #68. One is related to the
> >>>>> infinite loop. The other is I believe also needed to get dwc3 to work.
> >>>>>
> >>>>> All the kernel config are applied as .cfg.
> >>>>>
> >>>>> Patches and cfs's here:
> >>>>>
> >>>>> https://github.com/htot/meta-intel-edison/tree/master/meta-intel-edison-bsp/recipes-kernel/linux/files
> >>>>>
> >>>> Updated Yocto recipe for v6.0 here:
> >>>>
> >>>> https://github.com/htot/meta-intel-edison/blob/honister/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_6.0.bb
> >>>>
> >>>> #75-#77 are the 2 reverts from Andy, + one SOF revert (not related to
> >>>> this thread).
> >>> Please drop all of this
> >>> https://github.com/htot/meta-intel-edison/blob/honister/meta-intel-edison-bsp/recipes-kernel/linux/linux-yocto_6.0.bb#L69-L77
> >>> and re do the testing. Assuming things are still broken, that's how
> >>> you want to do the bisecting.
> >> I removed 4 patches:
> >> 0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch
> >> 0044-REVERTME-usb-dwc3-gadget-skip-endpoints-ep-18-in-out.patch
> >> 0001-Revert-USB-fixup-for-merge-issue-with-usb-dwc3-Don-t.patch
> >> 0001-Revert-usb-dwc3-Don-t-switch-OTG-peripheral-if-extco.patch
> > Please remove all custom patches so we are on the same page. I don't
> > suspect the 8250 related changes to affect anything, but I also would
> > like to be testing the same thing. I'm testing vanilla v6.0
> Alright, but don't expect any change. The 8250 patches are related to
> using DMA for the serial ports (except the console). It may affect
> bluetooth, the serial port on the arduino connector, but not the console.

Yeah, I don't, but for the sake of thoroughness we may as well be
building the same thing.

> >> and indeed as you expect kernel boots (no infinite loop). However dwc3
> >> host mode is not working as in your case, device mode works fine (Yocto
> >> configures a set of gadgets for me).
> > What do you do to test host mode working? lsusb? Something else?
> > Asking to make sure I'm doing something equivalent on my end.
> >
> I have a smsc95xx 4p usb hub with 1 eth port continuously plugged. It
> has leds on all ports so when it works it lights up like a Christmas tree.
>
> But I also tried plugging a usb stick.
>
> It maybe that lsusb is not enough. Iirc the root hub is there, but the
> tusb1210 not and then device plugs are not detected. So in my case none
> of the leds on the hub turn on.
>

Yeah, by lsusb I mean, "plug in a device" and make sure it enumerated
and visible in "lsusb".

> >> Just to be sure if I could have bisected without 0043a I added back the
> >> 2 0001-Revert* and indeed I run into the infinite loop with the console
> >> spitting out continuous:
> >> debugfs: Directory 'dwc3.0.auto' with parent 'ulpi' already present!
> >> tusb1210 dwc3.0.auto.ulpi: error -110 writing val 0x41 to reg 0x80
> >>
> >> so yes it seems either 0043b or your patch "usb: dwc3: Don't switch OTG
> >> -> peripheral if extcon is present" is needed to boot (break the
> >> infinite loop). But your patch is in my case not sufficient to make host
> >> mode work.
> >>
> > Next step would be to establish if USB is working before my patch. You
> > should be able to avoid the boot loop if you disable the
> > "phy-tusb1210" driver. The driver fails to probe anyway, so it's not
> > very likely to be crucial for functioning, so it should allow you to
> > try things with my patch reverted:
>
> You lost me here. With "boot loop" you mean "probe loop" right?

Yep

> Why do
> you think the tusb1210 driver is not crucial?
>
> See "phy: ti: tusb1210: Don't check for write errors when powering on"
>

My end goal here is to find a way to test vanilla v6.0 with the two
patches reverted on your end. I thought that during my testing I saw
tusb1210 print those timeout messages during its probe and that
disabling the driver worked to break the loop, but I went back to
double check and it doesn't work so scratch that idea. Configuring
extcon as a built-in breaks host functionality with or without patches
on my end, so I'm not sure it could be a path.

I won't have time to try things with
0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch until
the weekend, meanwhile can you give this diff a try with vanilla (no
reverts) v6.0:

modified   drivers/phy/ti/phy-tusb1210.c
@@ -127,6 +127,7 @@ static int tusb1210_set_mode(struct phy *phy, enum
phy_mode mode, int submode)
  u8 reg;

  ret = tusb1210_ulpi_read(tusb, ULPI_OTG_CTRL, &reg);
+ WARN_ON(ret < 0);
  if (ret < 0)
  return ret;

@@ -152,7 +153,10 @@ static int tusb1210_set_mode(struct phy *phy,
enum phy_mode mode, int submode)
  }

  tusb->otg_ctrl = reg;
- return tusb1210_ulpi_write(tusb, ULPI_OTG_CTRL, reg);
+ ret = tusb1210_ulpi_write(tusb, ULPI_OTG_CTRL, reg);
+ WARN_ON(ret < 0);
+ return ret;
+
 }

 #ifdef CONFIG_POWER_SUPPLY

? I'm curious to see if there's masked errors on your end since dwc3
driver doesn't check for those.

> It should not be failing to probe (and with Andy's "Break-infinite-loop"
> patch is doesn't) as without the tusb1210 usb host mode won't work as
> device plugs are not detected.
>
> Earlier in this thread we had:
>
> "The effect of the patch is that on Merrifield (I tested with Intel
> Edison Arduino board which has a HW switch to select between host and
> device mode) device mode works but in host mode USB is completely not
> working.
>
> Currently on host mode - when working - superfluous error messages from
> tusb1210 appear. When host mode is not working there are no tusb1210
> messages in the logs / on the console at all. Seemingly tusb1210 is not
> probed, which points in the direction of a relation to extcon."
>
> > git revert 8bd6b8c4b100 0f0101719138
> >
> > After that, if things start working, it'd make sense to re-do your
> > function re-arranging experiment to re-validate it.
> >
> >> As I understand it depends a bit on the timing, I might have a different
> >> initrd (built by Yocto vs. Buildroot). F.i. I see I have
> >> extcon-intel-mrfld in initrd and dwc3 / phy-tusb1210 built-in.
> >>
> > You mentioned that your rootfs image does some gadget configuration
> > for you. Can this be disabled? If yes, it'd make sense to check if
> > this could be a variable explaining the difference.
> This is done through configfs only when the switch is set to device mode.

Sure, but can it be disabled? We are looking for unknown variables, so
excluding this would be a reasonable thing to do.

> > What U-Boot version are you running? AFACT U-Boot will touch that
> > particular IP block, so this might be somewhat relevant.
> IIRC if have v2022.04 but tested v2021.10 earlier (no difference).

OK, sounds like we can tick that off the list.
Andrey Smirnov Oct. 12, 2022, 10:13 p.m. UTC | #30
On Wed, Oct 12, 2022 at 3:32 AM Andy Shevchenko
<andriy.shevchenko@linux.intel.com> wrote:
>
> On Tue, Oct 11, 2022 at 01:17:13PM -0700, Andrey Smirnov wrote:
> > On Tue, Oct 11, 2022 at 2:21 AM Andy Shevchenko
> > <andriy.shevchenko@linux.intel.com> wrote:
> > > On Mon, Oct 10, 2022 at 02:40:30PM -0700, Andrey Smirnov wrote:
> > > > On Mon, Oct 10, 2022 at 12:13 AM Andy Shevchenko
> > > > <andriy.shevchenko@linux.intel.com> wrote:
> > > > > On Sun, Oct 09, 2022 at 10:02:26PM -0700, Andrey Smirnov wrote:
> > > > > > On Fri, Oct 7, 2022 at 6:07 AM Ferry Toth <fntoth@gmail.com> wrote:
>
> ...
>
> > > > > > OK, Ferry, I think I'm going to need clarification on specifics on
> > > > > > your test setup. Can you share your kernel config, maybe your
> > > > > > "/proc/config.gz", somewhere? When you say you are running vanilla
> > > > > > Linux, do you mean it or do you mean vanilla tree + some patch delta?
> > > > > >
> > > > > > The reason I'm asking is because I'm having a hard time reproducing
> > > > > > the problem on my end. In fact, when I build v6.0
> > > > > > (4fe89d07dcc2804c8b562f6c7896a45643d34b2f) and then do a
> > > > > >
> > > > > > git revert 8bd6b8c4b100 0f0101719138 (original revert proposed by Andy)
> > > > > >
> > > > > > I get an infinite loop of reprobing that looks something like (some
> > > > > > debug tracing, function name + line number, included):
> > > > >
> > > > > Yes, this is (one of) known drawback(s) of deferred probe hack. I think
> > > > > the kernel that Ferry runs has a patch that basically reverts one from
> > > > > 2014 [1] and allows to have extcon as a module. (1)
> > > > >
> > > > > [1]: 58b116bce136 ("drivercore: deferral race condition fix")
> > > > >
> > > > > > which renders the system completely unusable, but USB host is
> > > > > > definitely going to be broken too. Now, ironically, with my patch
> > > > > > in-place, an attempt to probe extcon that ends up deferring the probe
> > > > > > happens before the ULPI driver failure (which wasn't failing driver
> > > > > > probe prior to https://lore.kernel.org/all/20220213130524.18748-7-hdegoede@redhat.com/),
> > > > > > there no "driver binding" event that re-triggers deferred probe
> > > > > > causing the loop, so the system progresses to a point where extcon is
> > > > > > available and dwc3 driver eventually loads.
> > > > > >
> > > > > > After that, and I don't know if I'm doing the same test, USB host
> > > > > > seems to work as expected. lsusb works, my USB stick enumerates as
> > > > > > expected. Switching the USB mux to micro-USB and back shuts the host
> > > > > > functionality down and brings it up as expected. Now I didn't try to
> > > > > > load any gadgets to make sure USB gadget works 100%, but since you
> > > > > > were saying it was USB host that was broken, I wasn't concerned with
> > > > > > that. Am I doing the right test?
> > > > >
> > > > > Hmm... What you described above sounds more like a yet another attempt to
> > > > > workaround (1). _If_ this is the case, we probably can discuss how to fix
> > > > > it in generic way (somewhere in dd.c, rather than in the certain driver).
> > > >
> > > > No, I'm not describing an attempt to fix anything. Just how vanilla
> > > > v6.0 (where my patch is not reverted) works and where my patch, fixing
> > > > a logical problem in which extcon was requested too late causing a
> > > > forced OTG -> "gadget only" switch, also changed the ordering enough
> > > > to accidentally avoid the loop.
> > >
> > > You still refer to a fix, but my question was if it's the same problem or not.
> > >
> >
> > No, it's not the same problem.
> >
> > > > > That said, the real test case should be performed on top of clean kernel
> > > > > before judging if it's good or bad.
> > > >
> > > > Given your level of involvemnt with this particular platform and you
> > > > being the author of
> > > > https://github.com/edison-fw/meta-intel-edison/blob/master/meta-intel-edison-bsp/recipes-kernel/linux/files/0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch
> > > > I assumed/expected you to double check this before sending this revert
> > > > out. Please do so next time.
> > >
> > > As I said I have not yet restored my testing environment for that platform and
> > > I relied on the Ferry's report. Taking into account the history of breakages
> > > that done for Intel Merrifield, in particular by not wide tested patches
> > > against DWC3 driver, I immediately react with a revert.
> >
> > That's what I'm asking you not to do next time. If you don't have time
> > to restore your testing env or double check Ferry's work, please live
> > with a revert in your local tree until you do.
>
> I trust Ferry's tests as mine and repeating again, we have a bad history
> when people so value their time that breaks our platform,

This is not a good excuse to jump the gun and send a revert without
double checking. Some regressions will always be unavoidable.

> so please test
> your changes in the future that it makes no regressions.
>

This is, in a nutshell, asking me to prove a negative. That's not a
feasible request. To add insult to injury, you are talking about a
platform way past EOL that's out of stock in every major store and
it's by sheer luck that I was able to get the last kit on eBay.
Downstream will always be the ultimate test for regressions given the
sheer number of permutations. A CI/CD rig that would allow developers
to make a regression test run, would make this a much more reasonable
request, without it, end-user(s) is the only "test bed" there is.

> If you want to have a proof that your patches are broken, then I will
> prioritize this. We now have a full release cycle time for that.
>

You prioritizing this now saves me nothing, whereas you prioritizing
this before submitting reverts would've saved me time. That's the
point I'm trying to convey.

> > My time is as valuable
> > as yours and this revert required much more investigation before it
> > was submitted. You lived with
> > https://github.com/edison-fw/meta-intel-edison/blob/master/meta-intel-edison-bsp/recipes-kernel/linux/files/0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch
> > since 5.10, which apparently was needed to either boot or have dwc3,
> > so I don't think there is any real urgency.
>
> It is in my tree only for the purpose of "don't forget that issue".
> I think you can work around it by built-in extcon driver.
>
Ferry Toth Oct. 13, 2022, 7:35 p.m. UTC | #31
<SNIP>
> My end goal here is to find a way to test vanilla v6.0 with the two
> patches reverted on your end. I thought that during my testing I saw
> tusb1210 print those timeout messages during its probe and that
> disabling the driver worked to break the loop, but I went back to
> double check and it doesn't work so scratch that idea. Configuring
> extcon as a built-in breaks host functionality with or without patches
> on my end, so I'm not sure it could be a path.
>
> I won't have time to try things with
> 0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch until
> the weekend, meanwhile can you give this diff a try with vanilla (no
> reverts) v6.0:
>
> modified   drivers/phy/ti/phy-tusb1210.c
> @@ -127,6 +127,7 @@ static int tusb1210_set_mode(struct phy *phy, enum
> phy_mode mode, int submode)
>    u8 reg;
>
>    ret = tusb1210_ulpi_read(tusb, ULPI_OTG_CTRL, &reg);
> + WARN_ON(ret < 0);
>    if (ret < 0)
>    return ret;
>
> @@ -152,7 +153,10 @@ static int tusb1210_set_mode(struct phy *phy,
> enum phy_mode mode, int submode)
>    }
>
>    tusb->otg_ctrl = reg;
> - return tusb1210_ulpi_write(tusb, ULPI_OTG_CTRL, reg);
> + ret = tusb1210_ulpi_write(tusb, ULPI_OTG_CTRL, reg);
> + WARN_ON(ret < 0);
> + return ret;
> +
>   }
>
>   #ifdef CONFIG_POWER_SUPPLY
>
> ? I'm curious to see if there's masked errors on your end since dwc3
> driver doesn't check for those.
root@yuna:~# dmesg | grep -i -E 'warn|assert|error|tusb|dwc3'
8250_mid: probe of 0000:00:04.0 failed with error -16
platform regulatory.0: Direct firmware load for regulatory.db failed 
with error -2
brcmfmac mmc2:0001:1: Direct firmware load for 
brcm/brcmfmac43340-sdio.Intel Corporation-Merrifield.bin failed with 
error -2
sof-audio-pci-intel-tng 0000:00:0d.0: error: I/O region is too small.
sof-audio-pci-intel-tng 0000:00:0d.0: error: failed to probe DSP -19


>> This is done through configfs only when the switch is set to device mode.
> Sure, but can it be disabled? We are looking for unknown variables, so
> excluding this would be a reasonable thing to do.
It's not enabled until I flip the switch to device mode.
Andrey Smirnov Oct. 15, 2022, 7:54 p.m. UTC | #32
On Thu, Oct 13, 2022 at 12:35 PM Ferry Toth <fntoth@gmail.com> wrote:
>
> <SNIP>
> > My end goal here is to find a way to test vanilla v6.0 with the two
> > patches reverted on your end. I thought that during my testing I saw
> > tusb1210 print those timeout messages during its probe and that
> > disabling the driver worked to break the loop, but I went back to
> > double check and it doesn't work so scratch that idea. Configuring
> > extcon as a built-in breaks host functionality with or without patches
> > on my end, so I'm not sure it could be a path.
> >
> > I won't have time to try things with
> > 0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch until
> > the weekend, meanwhile can you give this diff a try with vanilla (no
> > reverts) v6.0:
> >

OK, got a chance to try things with that patch. Both v6.0 and v6.0
with my patches reverted work the same, my Kingston DataTraveller USB
stick enumerates and works as expected.

> > modified   drivers/phy/ti/phy-tusb1210.c
> > @@ -127,6 +127,7 @@ static int tusb1210_set_mode(struct phy *phy, enum
> > phy_mode mode, int submode)
> >    u8 reg;
> >
> >    ret = tusb1210_ulpi_read(tusb, ULPI_OTG_CTRL, &reg);
> > + WARN_ON(ret < 0);
> >    if (ret < 0)
> >    return ret;
> >
> > @@ -152,7 +153,10 @@ static int tusb1210_set_mode(struct phy *phy,
> > enum phy_mode mode, int submode)
> >    }
> >
> >    tusb->otg_ctrl = reg;
> > - return tusb1210_ulpi_write(tusb, ULPI_OTG_CTRL, reg);
> > + ret = tusb1210_ulpi_write(tusb, ULPI_OTG_CTRL, reg);
> > + WARN_ON(ret < 0);
> > + return ret;
> > +
> >   }
> >
> >   #ifdef CONFIG_POWER_SUPPLY
> >
> > ? I'm curious to see if there's masked errors on your end since dwc3
> > driver doesn't check for those.
> root@yuna:~# dmesg | grep -i -E 'warn|assert|error|tusb|dwc3'
> 8250_mid: probe of 0000:00:04.0 failed with error -16
> platform regulatory.0: Direct firmware load for regulatory.db failed
> with error -2
> brcmfmac mmc2:0001:1: Direct firmware load for
> brcm/brcmfmac43340-sdio.Intel Corporation-Merrifield.bin failed with
> error -2
> sof-audio-pci-intel-tng 0000:00:0d.0: error: I/O region is too small.
> sof-audio-pci-intel-tng 0000:00:0d.0: error: failed to probe DSP -19
>
>
> >> This is done through configfs only when the switch is set to device mode.
> > Sure, but can it be disabled? We are looking for unknown variables, so
> > excluding this would be a reasonable thing to do.
> It's not enabled until I flip the switch to device mode.

OK to cut this back and forth short, I think it'd be easier to just
ask you to run what I run. Here's vanilla v6.0 bzImage and initrd
(built with your config + CONFIG_PHY_TUSB1210=y) I tested with
https://drive.google.com/drive/folders/1H28AL1coPPZ2kLTYskDuDdWo-oE7DRPH?usp=sharing
let's see how it behaves on your setup. There's also the U-Boot binary
I use in that folder in case you want to give it a try.

Now on Merrifield dwc3_get_extcon() doesn't do anything but call
extcon_get_extcon_dev() which doesn't touch any hardware or interact
with other drivers, so assuming

> So current v6.0 has: dwc3_get_extcon - dwc3_get_dr_mode - ... -
> dwc3_core_init - .. - dwc3_core_init_mode (not working)
>
> I changed to: dwc3_get_dr_mode - dwc3_get_extcon - .. - dwc3_core_init -
> .. - dwc3_core_init_mode (no change)
>
> Then to: dwc3_get_dr_mode - .. - dwc3_core_init - .. - dwc3_get_extcon -
> dwc3_core_init_mode (works)

still holds(did you double check that with vanilla v6.0?) the only
difference that I can see is execution timings. It seems to me it's
either an extra delay added by execution of  extcon_get_extcon_dev()
(unlikely) or multiple partial probes that include dwc3_core_init()
that change things. You can try to check the latter by adding an
artificial probe deferral point after dwc3_core_init(). Something like
(didn't test this):

modified   drivers/usb/dwc3/core.c
@@ -1860,6 +1860,10 @@ static int dwc3_probe(struct platform_device *pdev)
  goto err3;

  ret = dwc3_core_init(dwc);
+ static int deferral_counter = 0;
+ if (deferral_counter++ < 9) /* I counted 9 deferrals in my testing */
+ ret = -EPROBE_DEFER;
+
  if (ret) {
  dev_err_probe(dev, ret, "failed to initialize core\n");
  goto err4;
Ferry Toth Oct. 16, 2022, 8:59 p.m. UTC | #33
Op 15-10-2022 om 21:54 schreef Andrey Smirnov:
> On Thu, Oct 13, 2022 at 12:35 PM Ferry Toth <fntoth@gmail.com> wrote:
>> <SNIP>
>>> My end goal here is to find a way to test vanilla v6.0 with the two
>>> patches reverted on your end. I thought that during my testing I saw
>>> tusb1210 print those timeout messages during its probe and that
>>> disabling the driver worked to break the loop, but I went back to
>>> double check and it doesn't work so scratch that idea. Configuring
>>> extcon as a built-in breaks host functionality with or without patches
>>> on my end, so I'm not sure it could be a path.
>>>
>>> I won't have time to try things with
>>> 0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch until
>>> the weekend, meanwhile can you give this diff a try with vanilla (no
>>> reverts) v6.0:
>>>
> OK, got a chance to try things with that patch. Both v6.0 and v6.0
> with my patches reverted work the same, my Kingston DataTraveller USB
> stick enumerates and works as expected.
>
Iow you don't need the patch at all to get usb to work. There has got to 
be a difference in our configs.

Did you have a chance to look at mine (here: 
https://drive.google.com/file/d/1aKJWMqiAXnReeLCvxshzjKwGxIWQ7eJk/view?usp=sharing)

Else, send me yours.

>>> modified   drivers/phy/ti/phy-tusb1210.c
>>> @@ -127,6 +127,7 @@ static int tusb1210_set_mode(struct phy *phy, enum
>>> phy_mode mode, int submode)
>>>     u8 reg;
>>>
>>>     ret = tusb1210_ulpi_read(tusb, ULPI_OTG_CTRL, &reg);
>>> + WARN_ON(ret < 0);
>>>     if (ret < 0)
>>>     return ret;
>>>
>>> @@ -152,7 +153,10 @@ static int tusb1210_set_mode(struct phy *phy,
>>> enum phy_mode mode, int submode)
>>>     }
>>>
>>>     tusb->otg_ctrl = reg;
>>> - return tusb1210_ulpi_write(tusb, ULPI_OTG_CTRL, reg);
>>> + ret = tusb1210_ulpi_write(tusb, ULPI_OTG_CTRL, reg);
>>> + WARN_ON(ret < 0);
>>> + return ret;
>>> +
>>>    }
>>>
>>>    #ifdef CONFIG_POWER_SUPPLY
>>>
>>> ? I'm curious to see if there's masked errors on your end since dwc3
>>> driver doesn't check for those.
>> root@yuna:~# dmesg | grep -i -E 'warn|assert|error|tusb|dwc3'
>> 8250_mid: probe of 0000:00:04.0 failed with error -16
>> platform regulatory.0: Direct firmware load for regulatory.db failed
>> with error -2
>> brcmfmac mmc2:0001:1: Direct firmware load for
>> brcm/brcmfmac43340-sdio.Intel Corporation-Merrifield.bin failed with
>> error -2
>> sof-audio-pci-intel-tng 0000:00:0d.0: error: I/O region is too small.
>> sof-audio-pci-intel-tng 0000:00:0d.0: error: failed to probe DSP -19
>>
>>
>>>> This is done through configfs only when the switch is set to device mode.
>>> Sure, but can it be disabled? We are looking for unknown variables, so
>>> excluding this would be a reasonable thing to do.
>> It's not enabled until I flip the switch to device mode.
> OK to cut this back and forth short, I think it'd be easier to just
> ask you to run what I run. Here's vanilla v6.0 bzImage and initrd
> (built with your config + CONFIG_PHY_TUSB1210=y) I tested with

What do you mean by this? My config is with

CONFIG_GENERIC_PHY=y
CONFIG_PHY_TUSB1210=y

> https://drive.google.com/drive/folders/1H28AL1coPPZ2kLTYskDuDdWo-oE7DRPH?usp=sharing
> let's see how it behaves on your setup. There's also the U-Boot binary

Ok, it's getting weirder and weirder. The following is with my U-Boot 
and your kernel/initrd

1) I placed them in /boot which is on my btrfs partition on the emmc (my 
U-Boot has btrfs enabled)

Linux kernel version 6.0.0-edison-acpi-standard 
(andreysm@neptunefw-builder) #8 SMP PREEMPT_DYNAMIC Sat Oct 15 18:47:19 
UTC 2022
Building boot_params at 0x00090000
Loading bzImage at address 100000 (12064480 bytes)
Initial RAM disk at linear address 0x06000000, size 25165824 bytes
Kernel command line: "quiet root=/dev/mmcblk0p8 
rootflags=subvol=@,compress=lzo rootfstype=btrfs console=ttyS2,115200n8 
earlyprintk=ttyS2,115200n8,keep loglevel=4 systemd.unit=multi-user.target"
Kernel loaded at 00100000, setup_base=00090000

Usb drive is not detected regardless booting with stick plugged or 
plugging later on.

# lsusb
Bus 001 Device 001: ID 1d6b:0002
Bus 002 Device 001: ID 1d6b:0003

No TUSB1210 probed

# dmesg | grep dwc3
#

2) I placed them in my vfat rescue partition

Linux kernel version 6.0.0-edison-acpi-standard 
(andreysm@neptunefw-builder) #8 SMP PREEMPT_DYNAMIC Sat Oct 15 18:47:19 
UTC 2022
Building boot_params at 0x00090000
Loading bzImage at address 100000 (12064480 bytes)
Initial RAM disk at linear address 0x06000000, size 25165824 bytes
Kernel command line: "debugshell=0 tty1 console=ttyS2,115200n8 
root=/dev/mmcblk0p7 rootfstype=vfat systemd.unit=multi-user.target"
Kernel loaded at 00100000, setup_base=00090000

Usb drive is detected.

# lsusb
Bus 001 Device 001: ID 1d6b:0002
Bus 001 Device 002: ID 125f:312b
Bus 002 Device 001: ID 1d6b:0003

TUSB1210 probed

# dmesg | grep dwc3
[    8.605845] tusb1210 dwc3.0.auto.ulpi: GPIO lookup for consumer reset
[    8.605876] tusb1210 dwc3.0.auto.ulpi: using ACPI for GPIO lookup
[    8.605927] tusb1210 dwc3.0.auto.ulpi: using lookup tables for GPIO 
lookup
[    8.605941] tusb1210 dwc3.0.auto.ulpi: No GPIO consumer reset found
[    8.605956] tusb1210 dwc3.0.auto.ulpi: GPIO lookup for consumer cs
[    8.605970] tusb1210 dwc3.0.auto.ulpi: using ACPI for GPIO lookup
[    8.606011] tusb1210 dwc3.0.auto.ulpi: using lookup tables for GPIO 
lookup
[    8.606024] tusb1210 dwc3.0.auto.ulpi: No GPIO consumer cs found
[    8.669317] tusb1210 dwc3.0.auto.ulpi: error -110 writing val 0x41 to 
reg 0x80

## note: options debugshell, root and rootfstype are normally handled by 
a script in my initrd, so I guess here noop.

> I use in that folder in case you want to give it a try.
>
> Now on Merrifield dwc3_get_extcon() doesn't do anything but call
> extcon_get_extcon_dev() which doesn't touch any hardware or interact
> with other drivers, so assuming
>
>> So current v6.0 has: dwc3_get_extcon - dwc3_get_dr_mode - ... -
>> dwc3_core_init - .. - dwc3_core_init_mode (not working)
>>
>> I changed to: dwc3_get_dr_mode - dwc3_get_extcon - .. - dwc3_core_init -
>> .. - dwc3_core_init_mode (no change)
>>
>> Then to: dwc3_get_dr_mode - .. - dwc3_core_init - .. - dwc3_get_extcon -
>> dwc3_core_init_mode (works)
> still holds(did you double check that with vanilla v6.0?) the only
I didn't check
> difference that I can see is execution timings. It seems to me it's
> either an extra delay added by execution of  extcon_get_extcon_dev()
> (unlikely) or multiple partial probes that include dwc3_core_init()
> that change things. You can try to check the latter by adding an
> artificial probe deferral point after dwc3_core_init(). Something like
> (didn't test this):
>
> modified   drivers/usb/dwc3/core.c
> @@ -1860,6 +1860,10 @@ static int dwc3_probe(struct platform_device *pdev)
>    goto err3;
>
>    ret = dwc3_core_init(dwc);
> + static int deferral_counter = 0;
> + if (deferral_counter++ < 9) /* I counted 9 deferrals in my testing */
> + ret = -EPROBE_DEFER;
> +
>    if (ret) {
>    dev_err_probe(dev, ret, "failed to initialize core\n");
>    goto err4;

Not sure how you wanted this tested. So I assume on vanilla booting from 
btrfs on eemc. It crashes but maybe the trace is usefull. After crash it 
continues but no USB appears at all.

[    4.185151] kobject_add_internal failed for dwc3.0.auto.ulpi with 
-EEXIST, don't try to register things with the same name in the same 
directory.
[    4.198625] dwc3 dwc3.0.auto: failed to register ULPI interface
[    4.213112] BUG: kernel NULL pointer dereference, address: 
0000000000000001

[    4.220260] #PF: supervisor read access in kernel mode
[    4.225523] #PF: error_code(0x0000) - not-present page
[    4.230785] PGD 0 P4D 0
[    4.233402] Oops: 0000 [#1] PREEMPT SMP PTI
[    4.237696] CPU: 0 PID: 8 Comm: kworker/u4:0 Not tainted 
6.0.0-edison-acpi-standard #1
[    4.245802] Hardware name: Intel Corporation Merrifield/BODEGA BAY, 
BIOS 542 2015.01.21:18.19.48
[    4.254791] Workqueue: events_unbound deferred_probe_work_func
[    4.260793] RIP: 0010:strlen+0x0/0x20
[    4.264559] Code: b6 07 38 d0 74 14 48 83 c7 01 84 c0 74 05 48 39 f7 
75 ec 31 c0 c3 cc cc cc cc 48 89 f8 c3 cc cc cc cc 0f 1f 84 00 00 00 00 
00 <80> 3f 00 74 14 48 89 f8 48 83 c0 01 80 38 00 75 f7 48 29 f8 c3 cc
[    4.283751] RSP: 0018:ffffa9520004bb10 EFLAGS: 00010202
[    4.289113] RAX: 0000000000000018 RBX: ffff9c5807d77c18 RCX: 
0000000000000000
[    4.296417] RDX: ffffa9520004bb88 RSI: 0000000000000cc0 RDI: 
0000000000000001
[    4.303719] RBP: 0000000000000001 R08: 0000000000000000 R09: 
ffffa9520004baf0
[    4.311022] R10: ffffffffffffffff R11: 000000000000000b R12: 
0000000000000cc0
[    4.318323] R13: ffff9c5807d77c18 R14: ffff9c5807d77c18 R15: 
000000000000a710
[    4.325625] FS:  0000000000000000(0000) GS:ffff9c583e200000(0000) 
knlGS:0000000000000000
[    4.333907] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    4.339793] CR2: 0000000000000001 CR3: 000000001540c000 CR4: 
00000000001006f0
[    4.347098] Call Trace:
[    4.349611]  <TASK>
[    4.351771]  kstrdup_const+0x2d/0x70
[    4.355453]  kobject_set_name_vargs+0x1e/0x90
[    4.359939]  dev_set_name+0x4e/0x70
[    4.363534]  device_add+0x5b/0x960
[    4.367036]  ? complete_all+0x1b/0x80
[    4.370808]  ulpi_register_interface+0x213/0x290
[    4.375551]  dwc3_ulpi_init+0x17/0x40
[    4.379314]  dwc3_core_init+0xc29/0x1b70
[    4.383350]  dwc3_probe+0x115a/0x1900
[    4.387122]  platform_probe+0x3a/0xa0
[    4.390892]  really_probe+0xdc/0x390
[    4.394567]  ? pm_runtime_barrier+0x4b/0x80
[    4.398867]  __driver_probe_device+0x73/0x170
[    4.403340]  driver_probe_device+0x19/0x90
[    4.407545]  __device_attach_driver+0x80/0x110
[    4.412105]  ? driver_allows_async_probing+0x60/0x60
[    4.417195]  ? driver_allows_async_probing+0x60/0x60
[    4.422286]  bus_for_each_drv+0x79/0xc0
[    4.426230]  __device_attach+0xb7/0x210
[    4.430169]  bus_probe_device+0x89/0xa0
[    4.434111]  deferred_probe_work_func+0x85/0xe0
[    4.438762]  process_one_work+0x1d7/0x3a0
[    4.440890] EXT4-fs (mmcblk1): recovery complete
[    4.442785]  worker_thread+0x48/0x3c0
[    4.442809]  ? rescuer_thread+0x380/0x380
[    4.442828]  kthread+0xe2/0x110
[    4.447489] EXT4-fs (mmcblk1): mounted filesystem with ordered data 
mode. Quota mode: none.
[    4.451086]  ? kthread_complete_and_exit+0x20/0x20
[    4.451107]  ret_from_fork+0x22/0x30
[    4.475811]  </TASK>
[    4.478052] Modules linked in: mmc_block extcon_intel_mrfld sdhci_pci 
cqhci sdhci led_class mmc_core intel_soc_pmic_mrfld btrfs libcrc32c xor 
zlib_deflate raid6_pq zstd_compress
[    4.494312] CR2: 0000000000000001
[    4.497717] ---[ end trace 0000000000000000 ]---
Steev Klimaszewski Oct. 17, 2022, 7:44 p.m. UTC | #34
This revert breaks USB on the Lenovo Yoga C630.

sudo cat /sys/kernel/debug/devices_deferred
a800000.usb     platform: supplier 88e3000.phy not ready
88e2000.phy
88e3000.phy
a600000.usb     platform: supplier 88e2000.phy not ready

Nothing seems to indicate there is any breakage in the dmesg output though, just
that deferred probe is pending.

qcom-qmp-usb-phy 88e9000.phy: Registered Qcom-QMP phy
qcom-qmp-usb-phy 88eb000.phy: Registered Qcom-QMP phy
platform a800000.usb: deferred probe pending
platform a600000.usb: deferred probe pending

-- steev
Andrey Smirnov Oct. 17, 2022, 9:20 p.m. UTC | #35
On Sun, Oct 16, 2022 at 1:59 PM Ferry Toth <fntoth@gmail.com> wrote:
>
>
> Op 15-10-2022 om 21:54 schreef Andrey Smirnov:
> > On Thu, Oct 13, 2022 at 12:35 PM Ferry Toth <fntoth@gmail.com> wrote:
> >> <SNIP>
> >>> My end goal here is to find a way to test vanilla v6.0 with the two
> >>> patches reverted on your end. I thought that during my testing I saw
> >>> tusb1210 print those timeout messages during its probe and that
> >>> disabling the driver worked to break the loop, but I went back to
> >>> double check and it doesn't work so scratch that idea. Configuring
> >>> extcon as a built-in breaks host functionality with or without patches
> >>> on my end, so I'm not sure it could be a path.
> >>>
> >>> I won't have time to try things with
> >>> 0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch until
> >>> the weekend, meanwhile can you give this diff a try with vanilla (no
> >>> reverts) v6.0:
> >>>
> > OK, got a chance to try things with that patch. Both v6.0 and v6.0
> > with my patches reverted work the same, my Kingston DataTraveller USB
> > stick enumerates and works as expected.
> >
> Iow you don't need the patch at all to get usb to work. There has got to
> be a difference in our configs.
>

My patch? Yeah, it should have zero effect on anything.
!DWC3_VER_IS_PRIOR(DWC3, 330A) is false for Merrifield, so the logical
change from my patch is a no-op. It's a pure coincidence that it
resolved the probe loop that
0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch is
for.

> Did you have a chance to look at mine (here:
> https://drive.google.com/file/d/1aKJWMqiAXnReeLCvxshzjKwGxIWQ7eJk/view?usp=sharing)
>
> Else, send me yours.
>

I've been using your config in all of the testing.

> >>> modified   drivers/phy/ti/phy-tusb1210.c
> >>> @@ -127,6 +127,7 @@ static int tusb1210_set_mode(struct phy *phy, enum
> >>> phy_mode mode, int submode)
> >>>     u8 reg;
> >>>
> >>>     ret = tusb1210_ulpi_read(tusb, ULPI_OTG_CTRL, &reg);
> >>> + WARN_ON(ret < 0);
> >>>     if (ret < 0)
> >>>     return ret;
> >>>
> >>> @@ -152,7 +153,10 @@ static int tusb1210_set_mode(struct phy *phy,
> >>> enum phy_mode mode, int submode)
> >>>     }
> >>>
> >>>     tusb->otg_ctrl = reg;
> >>> - return tusb1210_ulpi_write(tusb, ULPI_OTG_CTRL, reg);
> >>> + ret = tusb1210_ulpi_write(tusb, ULPI_OTG_CTRL, reg);
> >>> + WARN_ON(ret < 0);
> >>> + return ret;
> >>> +
> >>>    }
> >>>
> >>>    #ifdef CONFIG_POWER_SUPPLY
> >>>
> >>> ? I'm curious to see if there's masked errors on your end since dwc3
> >>> driver doesn't check for those.
> >> root@yuna:~# dmesg | grep -i -E 'warn|assert|error|tusb|dwc3'
> >> 8250_mid: probe of 0000:00:04.0 failed with error -16
> >> platform regulatory.0: Direct firmware load for regulatory.db failed
> >> with error -2
> >> brcmfmac mmc2:0001:1: Direct firmware load for
> >> brcm/brcmfmac43340-sdio.Intel Corporation-Merrifield.bin failed with
> >> error -2
> >> sof-audio-pci-intel-tng 0000:00:0d.0: error: I/O region is too small.
> >> sof-audio-pci-intel-tng 0000:00:0d.0: error: failed to probe DSP -19
> >>
> >>
> >>>> This is done through configfs only when the switch is set to device mode.
> >>> Sure, but can it be disabled? We are looking for unknown variables, so
> >>> excluding this would be a reasonable thing to do.
> >> It's not enabled until I flip the switch to device mode.
> > OK to cut this back and forth short, I think it'd be easier to just
> > ask you to run what I run. Here's vanilla v6.0 bzImage and initrd
> > (built with your config + CONFIG_PHY_TUSB1210=y) I tested with
>
> What do you mean by this? My config is with
>
> CONFIG_GENERIC_PHY=y
> CONFIG_PHY_TUSB1210=y
>

$ cat config-6.0.0-edison-acpi-standard | grep 1210
# CONFIG_PHY_TUSB1210 is not set
$ md5sum config-6.0.0-edison-acpi-standard
3c989c708302c1f9e73c6113e71aed9d  config-6.0.0-edison-acpi-standard

I had to manually enable it, that's what I meant by my comment.

> > https://drive.google.com/drive/folders/1H28AL1coPPZ2kLTYskDuDdWo-oE7DRPH?usp=sharing
> > let's see how it behaves on your setup. There's also the U-Boot binary
>
> Ok, it's getting weirder and weirder. The following is with my U-Boot
> and your kernel/initrd
>
> 1) I placed them in /boot which is on my btrfs partition on the emmc (my
> U-Boot has btrfs enabled)
>
> Linux kernel version 6.0.0-edison-acpi-standard
> (andreysm@neptunefw-builder) #8 SMP PREEMPT_DYNAMIC Sat Oct 15 18:47:19
> UTC 2022
> Building boot_params at 0x00090000
> Loading bzImage at address 100000 (12064480 bytes)
> Initial RAM disk at linear address 0x06000000, size 25165824 bytes
> Kernel command line: "quiet root=/dev/mmcblk0p8
> rootflags=subvol=@,compress=lzo rootfstype=btrfs console=ttyS2,115200n8
> earlyprintk=ttyS2,115200n8,keep loglevel=4 systemd.unit=multi-user.target"
> Kernel loaded at 00100000, setup_base=00090000
>

You shouldn't be using root from you storage since:
  a) the initrd I uploaded is self-containing, it doesn't need anything else
  b) your local data is another variable we don't want to introduce

just "rootfstype=ramfs" should be enough for this and

 root=/dev/mmcblk0p8 rootflags=subvol=@,compress=lzo rootfstype=btrfs

should be dropped.

> Usb drive is not detected regardless booting with stick plugged or
> plugging later on.
>
> # lsusb
> Bus 001 Device 001: ID 1d6b:0002
> Bus 002 Device 001: ID 1d6b:0003
>
> No TUSB1210 probed
>
> # dmesg | grep dwc3
> #
>
> 2) I placed them in my vfat rescue partition
>
> Linux kernel version 6.0.0-edison-acpi-standard
> (andreysm@neptunefw-builder) #8 SMP PREEMPT_DYNAMIC Sat Oct 15 18:47:19
> UTC 2022
> Building boot_params at 0x00090000
> Loading bzImage at address 100000 (12064480 bytes)
> Initial RAM disk at linear address 0x06000000, size 25165824 bytes
> Kernel command line: "debugshell=0 tty1 console=ttyS2,115200n8
> root=/dev/mmcblk0p7 rootfstype=vfat systemd.unit=multi-user.target"
> Kernel loaded at 00100000, setup_base=00090000
>
> Usb drive is detected.

Yep, that's exactly my point about extra variables. So it looks like
something in your root btrfs partition is triggering this issue. I
don't really know the contents of your root file system, so don't
really have any suggestions there. Maybe old kernel modules are
getting picked up? Or something else is interfering ¯\_(ツ)_/¯

>
> # lsusb
> Bus 001 Device 001: ID 1d6b:0002
> Bus 001 Device 002: ID 125f:312b
> Bus 002 Device 001: ID 1d6b:0003
>
> TUSB1210 probed
>
> # dmesg | grep dwc3
> [    8.605845] tusb1210 dwc3.0.auto.ulpi: GPIO lookup for consumer reset
> [    8.605876] tusb1210 dwc3.0.auto.ulpi: using ACPI for GPIO lookup
> [    8.605927] tusb1210 dwc3.0.auto.ulpi: using lookup tables for GPIO
> lookup
> [    8.605941] tusb1210 dwc3.0.auto.ulpi: No GPIO consumer reset found
> [    8.605956] tusb1210 dwc3.0.auto.ulpi: GPIO lookup for consumer cs
> [    8.605970] tusb1210 dwc3.0.auto.ulpi: using ACPI for GPIO lookup
> [    8.606011] tusb1210 dwc3.0.auto.ulpi: using lookup tables for GPIO
> lookup
> [    8.606024] tusb1210 dwc3.0.auto.ulpi: No GPIO consumer cs found
> [    8.669317] tusb1210 dwc3.0.auto.ulpi: error -110 writing val 0x41 to
> reg 0x80
>
> ## note: options debugshell, root and rootfstype are normally handled by
> a script in my initrd, so I guess here noop.
>
> > I use in that folder in case you want to give it a try.
> >
> > Now on Merrifield dwc3_get_extcon() doesn't do anything but call
> > extcon_get_extcon_dev() which doesn't touch any hardware or interact
> > with other drivers, so assuming
> >
> >> So current v6.0 has: dwc3_get_extcon - dwc3_get_dr_mode - ... -
> >> dwc3_core_init - .. - dwc3_core_init_mode (not working)
> >>
> >> I changed to: dwc3_get_dr_mode - dwc3_get_extcon - .. - dwc3_core_init -
> >> .. - dwc3_core_init_mode (no change)
> >>
> >> Then to: dwc3_get_dr_mode - .. - dwc3_core_init - .. - dwc3_get_extcon -
> >> dwc3_core_init_mode (works)
> > still holds(did you double check that with vanilla v6.0?) the only
> I didn't check
> > difference that I can see is execution timings. It seems to me it's
> > either an extra delay added by execution of  extcon_get_extcon_dev()
> > (unlikely) or multiple partial probes that include dwc3_core_init()
> > that change things. You can try to check the latter by adding an
> > artificial probe deferral point after dwc3_core_init(). Something like
> > (didn't test this):
> >
> > modified   drivers/usb/dwc3/core.c
> > @@ -1860,6 +1860,10 @@ static int dwc3_probe(struct platform_device *pdev)
> >    goto err3;
> >
> >    ret = dwc3_core_init(dwc);
> > + static int deferral_counter = 0;
> > + if (deferral_counter++ < 9) /* I counted 9 deferrals in my testing */
> > + ret = -EPROBE_DEFER;
> > +
> >    if (ret) {
> >    dev_err_probe(dev, ret, "failed to initialize core\n");
> >    goto err4;
>
> Not sure how you wanted this tested. So I assume on vanilla booting from
> btrfs on eemc. It crashes but maybe the trace is usefull. After crash it
> continues but no USB appears at all.
>

I think you'll have to experiment with that code placement to emulate
a deferred probe for the old location of "get extcon".  I'd focus on
figuring out the root filesystem variable first before trying to get
this to work.

To be explicit, at this point I don't think the revert is really
warranted. I'm also happy to reply/help you with suggestions, but you
are going to have to start driving this.
Ferry Toth Oct. 18, 2022, 8:47 p.m. UTC | #36
Hi,

Op 17-10-2022 om 23:20 schreef Andrey Smirnov:
> On Sun, Oct 16, 2022 at 1:59 PM Ferry Toth <fntoth@gmail.com> wrote:
>>
>> Op 15-10-2022 om 21:54 schreef Andrey Smirnov:
>>> On Thu, Oct 13, 2022 at 12:35 PM Ferry Toth <fntoth@gmail.com> wrote:
>>>> <SNIP>
>>>>> My end goal here is to find a way to test vanilla v6.0 with the two
>>>>> patches reverted on your end. I thought that during my testing I saw
>>>>> tusb1210 print those timeout messages during its probe and that
>>>>> disabling the driver worked to break the loop, but I went back to
>>>>> double check and it doesn't work so scratch that idea. Configuring
>>>>> extcon as a built-in breaks host functionality with or without patches
>>>>> on my end, so I'm not sure it could be a path.
>>>>>
>>>>> I won't have time to try things with
>>>>> 0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch until
>>>>> the weekend, meanwhile can you give this diff a try with vanilla (no
>>>>> reverts) v6.0:
>>>>>
>>> OK, got a chance to try things with that patch. Both v6.0 and v6.0
>>> with my patches reverted work the same, my Kingston DataTraveller USB
>>> stick enumerates and works as expected.
>>>
>> Iow you don't need the patch at all to get usb to work. There has got to
>> be a difference in our configs.
>>
> My patch? Yeah, it should have zero effect on anything.
> !DWC3_VER_IS_PRIOR(DWC3, 330A) is false for Merrifield, so the logical
> change from my patch is a no-op. It's a pure coincidence that it
> resolved the probe loop that
> 0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch is
> for.
>
>> Did you have a chance to look at mine (here:
>> https://drive.google.com/file/d/1aKJWMqiAXnReeLCvxshzjKwGxIWQ7eJk/view?usp=sharing)
>>
>> Else, send me yours.
>>
> I've been using your config in all of the testing.
>
>>>>> modified   drivers/phy/ti/phy-tusb1210.c
>>>>> @@ -127,6 +127,7 @@ static int tusb1210_set_mode(struct phy *phy, enum
>>>>> phy_mode mode, int submode)
>>>>>      u8 reg;
>>>>>
>>>>>      ret = tusb1210_ulpi_read(tusb, ULPI_OTG_CTRL, &reg);
>>>>> + WARN_ON(ret < 0);
>>>>>      if (ret < 0)
>>>>>      return ret;
>>>>>
>>>>> @@ -152,7 +153,10 @@ static int tusb1210_set_mode(struct phy *phy,
>>>>> enum phy_mode mode, int submode)
>>>>>      }
>>>>>
>>>>>      tusb->otg_ctrl = reg;
>>>>> - return tusb1210_ulpi_write(tusb, ULPI_OTG_CTRL, reg);
>>>>> + ret = tusb1210_ulpi_write(tusb, ULPI_OTG_CTRL, reg);
>>>>> + WARN_ON(ret < 0);
>>>>> + return ret;
>>>>> +
>>>>>     }
>>>>>
>>>>>     #ifdef CONFIG_POWER_SUPPLY
>>>>>
>>>>> ? I'm curious to see if there's masked errors on your end since dwc3
>>>>> driver doesn't check for those.
>>>> root@yuna:~# dmesg | grep -i -E 'warn|assert|error|tusb|dwc3'
>>>> 8250_mid: probe of 0000:00:04.0 failed with error -16
>>>> platform regulatory.0: Direct firmware load for regulatory.db failed
>>>> with error -2
>>>> brcmfmac mmc2:0001:1: Direct firmware load for
>>>> brcm/brcmfmac43340-sdio.Intel Corporation-Merrifield.bin failed with
>>>> error -2
>>>> sof-audio-pci-intel-tng 0000:00:0d.0: error: I/O region is too small.
>>>> sof-audio-pci-intel-tng 0000:00:0d.0: error: failed to probe DSP -19
>>>>
>>>>
>>>>>> This is done through configfs only when the switch is set to device mode.
>>>>> Sure, but can it be disabled? We are looking for unknown variables, so
>>>>> excluding this would be a reasonable thing to do.
>>>> It's not enabled until I flip the switch to device mode.
>>> OK to cut this back and forth short, I think it'd be easier to just
>>> ask you to run what I run. Here's vanilla v6.0 bzImage and initrd
>>> (built with your config + CONFIG_PHY_TUSB1210=y) I tested with
>> What do you mean by this? My config is with
>>
>> CONFIG_GENERIC_PHY=y
>> CONFIG_PHY_TUSB1210=y
>>
> $ cat config-6.0.0-edison-acpi-standard | grep 1210
> # CONFIG_PHY_TUSB1210 is not set
> $ md5sum config-6.0.0-edison-acpi-standard
> 3c989c708302c1f9e73c6113e71aed9d  config-6.0.0-edison-acpi-standard
>
> I had to manually enable it, that's what I meant by my comment.

Unbelievable, seems I uploaded the wrong config. I just double checked 
to see if any other differences:

scripts/diffconfig config-6.0.0-edison-acpi-standard-bad 
config-6.0.0-edison-acpi-standard-good
  GENERIC_PHY n -> y
  PHY_TUSB1210 n -> y

>
>>> https://drive.google.com/drive/folders/1H28AL1coPPZ2kLTYskDuDdWo-oE7DRPH?usp=sharing
>>> let's see how it behaves on your setup. There's also the U-Boot binary
>> Ok, it's getting weirder and weirder. The following is with my U-Boot
>> and your kernel/initrd
>>
>> 1) I placed them in /boot which is on my btrfs partition on the emmc (my
>> U-Boot has btrfs enabled)
>>
>> Linux kernel version 6.0.0-edison-acpi-standard
>> (andreysm@neptunefw-builder) #8 SMP PREEMPT_DYNAMIC Sat Oct 15 18:47:19
>> UTC 2022
>> Building boot_params at 0x00090000
>> Loading bzImage at address 100000 (12064480 bytes)
>> Initial RAM disk at linear address 0x06000000, size 25165824 bytes
>> Kernel command line: "quiet root=/dev/mmcblk0p8
>> rootflags=subvol=@,compress=lzo rootfstype=btrfs console=ttyS2,115200n8
>> earlyprintk=ttyS2,115200n8,keep loglevel=4 systemd.unit=multi-user.target"
>> Kernel loaded at 00100000, setup_base=00090000
>>
> You shouldn't be using root from you storage since:
>    a) the initrd I uploaded is self-containing, it doesn't need anything else

Yes I know. With the Yocto image we build our own that does switchroot.

Here I am inside your buildroot initrd, no fs from the emmc are mounted. 
According to dmesg btrfs module is loaded later then dwc3, and scans 
(finds) the btrfs partition in all cases without mounting.

>    b) your local data is another variable we don't want to introduce
>
> just "rootfstype=ramfs" should be enough for this and
>
>   root=/dev/mmcblk0p8 rootflags=subvol=@,compress=lzo rootfstype=btrfs
>
> should be dropped.

After some experimenting it appears "rootfstype=btrfs" causes the 
buildroot rootfs to fail probing tsub1210.

I think you should be able to reproduce this.

However, changing "rootfstype=ramfs" for my (yocto) image (which 
probably should be the right thing to do now I think about it) does not 
resolve the failing to probe tsub1210. Comparing the dmesg with the 
buildroot one shows that in my case a lot of stuff happens prior to dwc3:

raid6 does speed testing (this is used by btrfs)

btrfs is loaded

sdhci probed

acpi tables (for edison-arduino) loaded into configfs

external gpio muxes setup

finally xhci (tusb1210 is before this on the buildroot image)


>
>> Usb drive is not detected regardless booting with stick plugged or
>> plugging later on.
>>
>> # lsusb
>> Bus 001 Device 001: ID 1d6b:0002
>> Bus 002 Device 001: ID 1d6b:0003
>>
>> No TUSB1210 probed
>>
>> # dmesg | grep dwc3
>> #
>>
>> 2) I placed them in my vfat rescue partition
>>
>> Linux kernel version 6.0.0-edison-acpi-standard
>> (andreysm@neptunefw-builder) #8 SMP PREEMPT_DYNAMIC Sat Oct 15 18:47:19
>> UTC 2022
>> Building boot_params at 0x00090000
>> Loading bzImage at address 100000 (12064480 bytes)
>> Initial RAM disk at linear address 0x06000000, size 25165824 bytes
>> Kernel command line: "debugshell=0 tty1 console=ttyS2,115200n8
>> root=/dev/mmcblk0p7 rootfstype=vfat systemd.unit=multi-user.target"
>> Kernel loaded at 00100000, setup_base=00090000
>>
>> Usb drive is detected.
> Yep, that's exactly my point about extra variables. So it looks like
> something in your root btrfs partition is triggering this issue. I
> don't really know the contents of your root file system, so don't
> really have any suggestions there. Maybe old kernel modules are
> getting picked up? Or something else is interfering ¯\_(ツ)_/¯
>
>> # lsusb
>> Bus 001 Device 001: ID 1d6b:0002
>> Bus 001 Device 002: ID 125f:312b
>> Bus 002 Device 001: ID 1d6b:0003
>>
>> TUSB1210 probed
>>
>> # dmesg | grep dwc3
>> [    8.605845] tusb1210 dwc3.0.auto.ulpi: GPIO lookup for consumer reset
>> [    8.605876] tusb1210 dwc3.0.auto.ulpi: using ACPI for GPIO lookup
>> [    8.605927] tusb1210 dwc3.0.auto.ulpi: using lookup tables for GPIO
>> lookup
>> [    8.605941] tusb1210 dwc3.0.auto.ulpi: No GPIO consumer reset found
>> [    8.605956] tusb1210 dwc3.0.auto.ulpi: GPIO lookup for consumer cs
>> [    8.605970] tusb1210 dwc3.0.auto.ulpi: using ACPI for GPIO lookup
>> [    8.606011] tusb1210 dwc3.0.auto.ulpi: using lookup tables for GPIO
>> lookup
>> [    8.606024] tusb1210 dwc3.0.auto.ulpi: No GPIO consumer cs found
>> [    8.669317] tusb1210 dwc3.0.auto.ulpi: error -110 writing val 0x41 to
>> reg 0x80
>>
>> ## note: options debugshell, root and rootfstype are normally handled by
>> a script in my initrd, so I guess here noop.
>>
>>> I use in that folder in case you want to give it a try.
>>>
>>> Now on Merrifield dwc3_get_extcon() doesn't do anything but call
>>> extcon_get_extcon_dev() which doesn't touch any hardware or interact
>>> with other drivers, so assuming
>>>
>>>> So current v6.0 has: dwc3_get_extcon - dwc3_get_dr_mode - ... -
>>>> dwc3_core_init - .. - dwc3_core_init_mode (not working)
>>>>
>>>> I changed to: dwc3_get_dr_mode - dwc3_get_extcon - .. - dwc3_core_init -
>>>> .. - dwc3_core_init_mode (no change)
>>>>
>>>> Then to: dwc3_get_dr_mode - .. - dwc3_core_init - .. - dwc3_get_extcon -
>>>> dwc3_core_init_mode (works)
>>> still holds(did you double check that with vanilla v6.0?) the only
>> I didn't check
>>> difference that I can see is execution timings. It seems to me it's
>>> either an extra delay added by execution of  extcon_get_extcon_dev()
>>> (unlikely) or multiple partial probes that include dwc3_core_init()
>>> that change things. You can try to check the latter by adding an
>>> artificial probe deferral point after dwc3_core_init(). Something like
>>> (didn't test this):
>>>
>>> modified   drivers/usb/dwc3/core.c
>>> @@ -1860,6 +1860,10 @@ static int dwc3_probe(struct platform_device *pdev)
>>>     goto err3;
>>>
>>>     ret = dwc3_core_init(dwc);
>>> + static int deferral_counter = 0;
>>> + if (deferral_counter++ < 9) /* I counted 9 deferrals in my testing */
>>> + ret = -EPROBE_DEFER;
>>> +
>>>     if (ret) {
>>>     dev_err_probe(dev, ret, "failed to initialize core\n");
>>>     goto err4;
>> Not sure how you wanted this tested. So I assume on vanilla booting from
>> btrfs on eemc. It crashes but maybe the trace is usefull. After crash it
>> continues but no USB appears at all.
>>
> I think you'll have to experiment with that code placement to emulate
> a deferred probe for the old location of "get extcon".  I'd focus on
> figuring out the root filesystem variable first before trying to get
> this to work.

Yes, did that as described above. I think that "rootfstype=btrfs" causes 
some ordering issue, like as if xhci goes to soon. It goes before:

spi_master spi5: GPIO lookup for consumer cs

while tusb1210 when it does probe starts with:

tusb1210 dwc3.0.auto.ulpi: GPIO lookup for consumer reset

and xhci follow later.

> To be explicit, at this point I don't think the revert is really
> warranted. I'm also happy to reply/help you with suggestions, but you
> are going to have to start driving this.

I agree that reverting based on a "regression" can not be concluded here 
as dwc3 on merrifield never worked without an out-of-tree patch. And 
your patch makes that out-of-tree patch obsolete - that's a good thing.

But I do think your patch is exposing an older issue that makes dwc3 
sensitive to ordering. I would very much appreciate if you could try 
"rootfstype=btrfs" to reproduce. It think it would be a good thing to 
resolve it so that the effort here has not been for nothing.

My next step will be to move around the code placement as you suggest. 
(I can spend a few hours in the evenings only as this is not my day job, 
so explains if I'm a bit slow to respond here).
Ferry Toth Oct. 20, 2022, 7:55 p.m. UTC | #37
Op 18-10-2022 om 22:47 schreef Ferry Toth:
> Hi,
> 
> Op 17-10-2022 om 23:20 schreef Andrey Smirnov:
>> On Sun, Oct 16, 2022 at 1:59 PM Ferry Toth <fntoth@gmail.com> wrote:
>>>
>>> Op 15-10-2022 om 21:54 schreef Andrey Smirnov:
>>>> On Thu, Oct 13, 2022 at 12:35 PM Ferry Toth <fntoth@gmail.com> wrote:
>>>>> <SNIP>
>>>>>> My end goal here is to find a way to test vanilla v6.0 with the two
>>>>>> patches reverted on your end. I thought that during my testing I saw
>>>>>> tusb1210 print those timeout messages during its probe and that
>>>>>> disabling the driver worked to break the loop, but I went back to
>>>>>> double check and it doesn't work so scratch that idea. Configuring
>>>>>> extcon as a built-in breaks host functionality with or without 
>>>>>> patches
>>>>>> on my end, so I'm not sure it could be a path.
>>>>>>
>>>>>> I won't have time to try things with
>>>>>> 0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch 
>>>>>> until
>>>>>> the weekend, meanwhile can you give this diff a try with vanilla (no
>>>>>> reverts) v6.0:
>>>>>>
>>>> OK, got a chance to try things with that patch. Both v6.0 and v6.0
>>>> with my patches reverted work the same, my Kingston DataTraveller USB
>>>> stick enumerates and works as expected.
>>>>
>>> Iow you don't need the patch at all to get usb to work. There has got to
>>> be a difference in our configs.
>>>
>> My patch? Yeah, it should have zero effect on anything.
>> !DWC3_VER_IS_PRIOR(DWC3, 330A) is false for Merrifield, so the logical
>> change from my patch is a no-op. It's a pure coincidence that it
>> resolved the probe loop that
>> 0043b-TODO-driver-core-Break-infinite-loop-when-deferred-p.patch is
>> for.
>>
>>> Did you have a chance to look at mine (here:
>>> https://drive.google.com/file/d/1aKJWMqiAXnReeLCvxshzjKwGxIWQ7eJk/view?usp=sharing)
>>>
>>> Else, send me yours.
>>>
>> I've been using your config in all of the testing.
>>
>>>>>> modified   drivers/phy/ti/phy-tusb1210.c
>>>>>> @@ -127,6 +127,7 @@ static int tusb1210_set_mode(struct phy *phy, 
>>>>>> enum
>>>>>> phy_mode mode, int submode)
>>>>>>      u8 reg;
>>>>>>
>>>>>>      ret = tusb1210_ulpi_read(tusb, ULPI_OTG_CTRL, &reg);
>>>>>> + WARN_ON(ret < 0);
>>>>>>      if (ret < 0)
>>>>>>      return ret;
>>>>>>
>>>>>> @@ -152,7 +153,10 @@ static int tusb1210_set_mode(struct phy *phy,
>>>>>> enum phy_mode mode, int submode)
>>>>>>      }
>>>>>>
>>>>>>      tusb->otg_ctrl = reg;
>>>>>> - return tusb1210_ulpi_write(tusb, ULPI_OTG_CTRL, reg);
>>>>>> + ret = tusb1210_ulpi_write(tusb, ULPI_OTG_CTRL, reg);
>>>>>> + WARN_ON(ret < 0);
>>>>>> + return ret;
>>>>>> +
>>>>>>     }
>>>>>>
>>>>>>     #ifdef CONFIG_POWER_SUPPLY
>>>>>>
>>>>>> ? I'm curious to see if there's masked errors on your end since dwc3
>>>>>> driver doesn't check for those.
>>>>> root@yuna:~# dmesg | grep -i -E 'warn|assert|error|tusb|dwc3'
>>>>> 8250_mid: probe of 0000:00:04.0 failed with error -16
>>>>> platform regulatory.0: Direct firmware load for regulatory.db failed
>>>>> with error -2
>>>>> brcmfmac mmc2:0001:1: Direct firmware load for
>>>>> brcm/brcmfmac43340-sdio.Intel Corporation-Merrifield.bin failed with
>>>>> error -2
>>>>> sof-audio-pci-intel-tng 0000:00:0d.0: error: I/O region is too small.
>>>>> sof-audio-pci-intel-tng 0000:00:0d.0: error: failed to probe DSP -19
>>>>>
>>>>>
>>>>>>> This is done through configfs only when the switch is set to 
>>>>>>> device mode.
>>>>>> Sure, but can it be disabled? We are looking for unknown 
>>>>>> variables, so
>>>>>> excluding this would be a reasonable thing to do.
>>>>> It's not enabled until I flip the switch to device mode.
>>>> OK to cut this back and forth short, I think it'd be easier to just
>>>> ask you to run what I run. Here's vanilla v6.0 bzImage and initrd
>>>> (built with your config + CONFIG_PHY_TUSB1210=y) I tested with
>>> What do you mean by this? My config is with
>>>
>>> CONFIG_GENERIC_PHY=y
>>> CONFIG_PHY_TUSB1210=y
>>>
>> $ cat config-6.0.0-edison-acpi-standard | grep 1210
>> # CONFIG_PHY_TUSB1210 is not set
>> $ md5sum config-6.0.0-edison-acpi-standard
>> 3c989c708302c1f9e73c6113e71aed9d  config-6.0.0-edison-acpi-standard
>>
>> I had to manually enable it, that's what I meant by my comment.
> 
> Unbelievable, seems I uploaded the wrong config. I just double checked 
> to see if any other differences:
> 
> scripts/diffconfig config-6.0.0-edison-acpi-standard-bad 
> config-6.0.0-edison-acpi-standard-good
>   GENERIC_PHY n -> y
>   PHY_TUSB1210 n -> y
> 
>>
>>>> https://drive.google.com/drive/folders/1H28AL1coPPZ2kLTYskDuDdWo-oE7DRPH?usp=sharing
>>>> let's see how it behaves on your setup. There's also the U-Boot binary
>>> Ok, it's getting weirder and weirder. The following is with my U-Boot
>>> and your kernel/initrd
>>>
>>> 1) I placed them in /boot which is on my btrfs partition on the emmc (my
>>> U-Boot has btrfs enabled)
>>>
>>> Linux kernel version 6.0.0-edison-acpi-standard
>>> (andreysm@neptunefw-builder) #8 SMP PREEMPT_DYNAMIC Sat Oct 15 18:47:19
>>> UTC 2022
>>> Building boot_params at 0x00090000
>>> Loading bzImage at address 100000 (12064480 bytes)
>>> Initial RAM disk at linear address 0x06000000, size 25165824 bytes
>>> Kernel command line: "quiet root=/dev/mmcblk0p8
>>> rootflags=subvol=@,compress=lzo rootfstype=btrfs console=ttyS2,115200n8
>>> earlyprintk=ttyS2,115200n8,keep loglevel=4 
>>> systemd.unit=multi-user.target"
>>> Kernel loaded at 00100000, setup_base=00090000
>>>
>> You shouldn't be using root from you storage since:
>>    a) the initrd I uploaded is self-containing, it doesn't need 
>> anything else
> 
> Yes I know. With the Yocto image we build our own that does switchroot.
> 
> Here I am inside your buildroot initrd, no fs from the emmc are mounted. 
> According to dmesg btrfs module is loaded later then dwc3, and scans 
> (finds) the btrfs partition in all cases without mounting.
> 
>>    b) your local data is another variable we don't want to introduce
>>
>> just "rootfstype=ramfs" should be enough for this and
>>
>>   root=/dev/mmcblk0p8 rootflags=subvol=@,compress=lzo rootfstype=btrfs
>>
>> should be dropped.
> 
> After some experimenting it appears "rootfstype=btrfs" causes the 
> buildroot rootfs to fail probing tsub1210.
> 
> I think you should be able to reproduce this.
> 
> However, changing "rootfstype=ramfs" for my (yocto) image (which 
> probably should be the right thing to do now I think about it) does not 
> resolve the failing to probe tsub1210. Comparing the dmesg with the 
> buildroot one shows that in my case a lot of stuff happens prior to dwc3:
> 
> raid6 does speed testing (this is used by btrfs)
> 
> btrfs is loaded
> 
> sdhci probed
> 
> acpi tables (for edison-arduino) loaded into configfs
> 
> external gpio muxes setup
> 
> finally xhci (tusb1210 is before this on the buildroot image)
> 
> 
>>
>>> Usb drive is not detected regardless booting with stick plugged or
>>> plugging later on.
>>>
>>> # lsusb
>>> Bus 001 Device 001: ID 1d6b:0002
>>> Bus 002 Device 001: ID 1d6b:0003
>>>
>>> No TUSB1210 probed
>>>
>>> # dmesg | grep dwc3
>>> #
>>>
>>> 2) I placed them in my vfat rescue partition
>>>
>>> Linux kernel version 6.0.0-edison-acpi-standard
>>> (andreysm@neptunefw-builder) #8 SMP PREEMPT_DYNAMIC Sat Oct 15 18:47:19
>>> UTC 2022
>>> Building boot_params at 0x00090000
>>> Loading bzImage at address 100000 (12064480 bytes)
>>> Initial RAM disk at linear address 0x06000000, size 25165824 bytes
>>> Kernel command line: "debugshell=0 tty1 console=ttyS2,115200n8
>>> root=/dev/mmcblk0p7 rootfstype=vfat systemd.unit=multi-user.target"
>>> Kernel loaded at 00100000, setup_base=00090000
>>>
>>> Usb drive is detected.
>> Yep, that's exactly my point about extra variables. So it looks like
>> something in your root btrfs partition is triggering this issue. I
>> don't really know the contents of your root file system, so don't
>> really have any suggestions there. Maybe old kernel modules are
>> getting picked up? Or something else is interfering ¯\_(ツ)_/¯
>>
>>> # lsusb
>>> Bus 001 Device 001: ID 1d6b:0002
>>> Bus 001 Device 002: ID 125f:312b
>>> Bus 002 Device 001: ID 1d6b:0003
>>>
>>> TUSB1210 probed
>>>
>>> # dmesg | grep dwc3
>>> [    8.605845] tusb1210 dwc3.0.auto.ulpi: GPIO lookup for consumer reset
>>> [    8.605876] tusb1210 dwc3.0.auto.ulpi: using ACPI for GPIO lookup
>>> [    8.605927] tusb1210 dwc3.0.auto.ulpi: using lookup tables for GPIO
>>> lookup
>>> [    8.605941] tusb1210 dwc3.0.auto.ulpi: No GPIO consumer reset found
>>> [    8.605956] tusb1210 dwc3.0.auto.ulpi: GPIO lookup for consumer cs
>>> [    8.605970] tusb1210 dwc3.0.auto.ulpi: using ACPI for GPIO lookup
>>> [    8.606011] tusb1210 dwc3.0.auto.ulpi: using lookup tables for GPIO
>>> lookup
>>> [    8.606024] tusb1210 dwc3.0.auto.ulpi: No GPIO consumer cs found
>>> [    8.669317] tusb1210 dwc3.0.auto.ulpi: error -110 writing val 0x41 to
>>> reg 0x80
>>>
>>> ## note: options debugshell, root and rootfstype are normally handled by
>>> a script in my initrd, so I guess here noop.
>>>
>>>> I use in that folder in case you want to give it a try.
>>>>
>>>> Now on Merrifield dwc3_get_extcon() doesn't do anything but call
>>>> extcon_get_extcon_dev() which doesn't touch any hardware or interact
>>>> with other drivers, so assuming
>>>>
>>>>> So current v6.0 has: dwc3_get_extcon - dwc3_get_dr_mode - ... -
>>>>> dwc3_core_init - .. - dwc3_core_init_mode (not working)
>>>>>
>>>>> I changed to: dwc3_get_dr_mode - dwc3_get_extcon - .. - 
>>>>> dwc3_core_init -
>>>>> .. - dwc3_core_init_mode (no change)
>>>>>
>>>>> Then to: dwc3_get_dr_mode - .. - dwc3_core_init - .. - 
>>>>> dwc3_get_extcon -
>>>>> dwc3_core_init_mode (works)
>>>> still holds(did you double check that with vanilla v6.0?) the only
>>> I didn't check
>>>> difference that I can see is execution timings. It seems to me it's
>>>> either an extra delay added by execution of  extcon_get_extcon_dev()
>>>> (unlikely) or multiple partial probes that include dwc3_core_init()
>>>> that change things. You can try to check the latter by adding an
>>>> artificial probe deferral point after dwc3_core_init(). Something like
>>>> (didn't test this):
>>>>
>>>> modified   drivers/usb/dwc3/core.c
>>>> @@ -1860,6 +1860,10 @@ static int dwc3_probe(struct platform_device 
>>>> *pdev)
>>>>     goto err3;
>>>>
>>>>     ret = dwc3_core_init(dwc);
>>>> + static int deferral_counter = 0;
>>>> + if (deferral_counter++ < 9) /* I counted 9 deferrals in my testing */
>>>> + ret = -EPROBE_DEFER;
>>>> +
>>>>     if (ret) {
>>>>     dev_err_probe(dev, ret, "failed to initialize core\n");
>>>>     goto err4;
>>> Not sure how you wanted this tested. So I assume on vanilla booting from
>>> btrfs on eemc. It crashes but maybe the trace is usefull. After crash it
>>> continues but no USB appears at all.
>>>
>> I think you'll have to experiment with that code placement to emulate
>> a deferred probe for the old location of "get extcon".  I'd focus on
>> figuring out the root filesystem variable first before trying to get
>> this to work.
> 
> Yes, did that as described above. I think that "rootfstype=btrfs" causes 
> some ordering issue, like as if xhci goes to soon. It goes before:
> 
> spi_master spi5: GPIO lookup for consumer cs
> 
> while tusb1210 when it does probe starts with:
> 
> tusb1210 dwc3.0.auto.ulpi: GPIO lookup for consumer reset
> 
> and xhci follow later.
> 
>> To be explicit, at this point I don't think the revert is really
>> warranted. I'm also happy to reply/help you with suggestions, but you
>> are going to have to start driving this.
> 
> I agree that reverting based on a "regression" can not be concluded here 
> as dwc3 on merrifield never worked without an out-of-tree patch. And 
> your patch makes that out-of-tree patch obsolete - that's a good thing.
> 
> But I do think your patch is exposing an older issue that makes dwc3 
> sensitive to ordering. I would very much appreciate if you could try 
> "rootfstype=btrfs" to reproduce. It think it would be a good thing to 
> resolve it so that the effort here has not been for nothing.
> 
> My next step will be to move around the code placement as you suggest. 
> (I can spend a few hours in the evenings only as this is not my day job, 
> so explains if I'm a bit slow to respond here).
> 
Here the results of moving the code around the same as before:
---
  drivers/usb/dwc3/core.c | 14 +++++++-------
  1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c
index d0237b30c9be..990de5bf1983 100644
--- a/drivers/usb/dwc3/core.c
+++ b/drivers/usb/dwc3/core.c
@@ -1844,13 +1844,6 @@ static int dwc3_probe(struct platform_device *pdev)
  		goto err2;
  	}

-	dwc->edev = dwc3_get_extcon(dwc);
-	if (IS_ERR(dwc->edev)) {
-		ret = PTR_ERR(dwc->edev);
-		dev_err_probe(dwc->dev, ret, "failed to get extcon\n");
-		goto err3;
-	}
-
  	ret = dwc3_get_dr_mode(dwc);
  	if (ret)
  		goto err3;
@@ -1868,6 +1861,13 @@ static int dwc3_probe(struct platform_device *pdev)
  	dwc3_check_params(dwc);
  	dwc3_debugfs_init(dwc);

+	dwc->edev = dwc3_get_extcon(dwc);
+	if (IS_ERR(dwc->edev)) {
+		ret = PTR_ERR(dwc->edev);
+		dev_err_probe(dwc->dev, ret, "failed to get extcon\n");
+		goto err5;
+	}
+
  	ret = dwc3_core_init_mode(dwc);
  	if (ret)
  		goto err5;
diff mbox series

Patch

diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c
index c2b463469d51..219d797e2230 100644
--- a/drivers/usb/dwc3/core.c
+++ b/drivers/usb/dwc3/core.c
@@ -23,7 +23,6 @@ 
 #include <linux/delay.h>
 #include <linux/dma-mapping.h>
 #include <linux/of.h>
-#include <linux/of_graph.h>
 #include <linux/acpi.h>
 #include <linux/pinctrl/consumer.h>
 #include <linux/reset.h>
@@ -86,7 +85,7 @@  static int dwc3_get_dr_mode(struct dwc3 *dwc)
 		 * mode. If the controller supports DRD but the dr_mode is not
 		 * specified or set to OTG, then set the mode to peripheral.
 		 */
-		if (mode == USB_DR_MODE_OTG && !dwc->edev &&
+		if (mode == USB_DR_MODE_OTG &&
 		    (!IS_ENABLED(CONFIG_USB_ROLE_SWITCH) ||
 		     !device_property_read_bool(dwc->dev, "usb-role-switch")) &&
 		    !DWC3_VER_IS_PRIOR(DWC3, 330A))
@@ -1668,51 +1667,6 @@  static void dwc3_check_params(struct dwc3 *dwc)
 	}
 }
 
-static struct extcon_dev *dwc3_get_extcon(struct dwc3 *dwc)
-{
-	struct device *dev = dwc->dev;
-	struct device_node *np_phy;
-	struct extcon_dev *edev = NULL;
-	const char *name;
-
-	if (device_property_read_bool(dev, "extcon"))
-		return extcon_get_edev_by_phandle(dev, 0);
-
-	/*
-	 * Device tree platforms should get extcon via phandle.
-	 * On ACPI platforms, we get the name from a device property.
-	 * This device property is for kernel internal use only and
-	 * is expected to be set by the glue code.
-	 */
-	if (device_property_read_string(dev, "linux,extcon-name", &name) == 0) {
-		edev = extcon_get_extcon_dev(name);
-		if (!edev)
-			return ERR_PTR(-EPROBE_DEFER);
-
-		return edev;
-	}
-
-	/*
-	 * Try to get an extcon device from the USB PHY controller's "port"
-	 * node. Check if it has the "port" node first, to avoid printing the
-	 * error message from underlying code, as it's a valid case: extcon
-	 * device (and "port" node) may be missing in case of "usb-role-switch"
-	 * or OTG mode.
-	 */
-	np_phy = of_parse_phandle(dev->of_node, "phys", 0);
-	if (of_graph_is_present(np_phy)) {
-		struct device_node *np_conn;
-
-		np_conn = of_graph_get_remote_node(np_phy, -1, -1);
-		if (np_conn)
-			edev = extcon_find_edev_by_node(np_conn);
-		of_node_put(np_conn);
-	}
-	of_node_put(np_phy);
-
-	return edev;
-}
-
 static int dwc3_probe(struct platform_device *pdev)
 {
 	struct device		*dev = &pdev->dev;
@@ -1849,13 +1803,6 @@  static int dwc3_probe(struct platform_device *pdev)
 		goto err2;
 	}
 
-	dwc->edev = dwc3_get_extcon(dwc);
-	if (IS_ERR(dwc->edev)) {
-		ret = PTR_ERR(dwc->edev);
-		dev_err_probe(dwc->dev, ret, "failed to get extcon\n");
-		goto err3;
-	}
-
 	ret = dwc3_get_dr_mode(dwc);
 	if (ret)
 		goto err3;
diff --git a/drivers/usb/dwc3/drd.c b/drivers/usb/dwc3/drd.c
index 039bf241769a..8cad9e7d3368 100644
--- a/drivers/usb/dwc3/drd.c
+++ b/drivers/usb/dwc3/drd.c
@@ -8,6 +8,7 @@ 
  */
 
 #include <linux/extcon.h>
+#include <linux/of_graph.h>
 #include <linux/of_platform.h>
 #include <linux/platform_device.h>
 #include <linux/property.h>
@@ -438,6 +439,51 @@  static int dwc3_drd_notifier(struct notifier_block *nb,
 	return NOTIFY_DONE;
 }
 
+static struct extcon_dev *dwc3_get_extcon(struct dwc3 *dwc)
+{
+	struct device *dev = dwc->dev;
+	struct device_node *np_phy;
+	struct extcon_dev *edev = NULL;
+	const char *name;
+
+	if (device_property_read_bool(dev, "extcon"))
+		return extcon_get_edev_by_phandle(dev, 0);
+
+	/*
+	 * Device tree platforms should get extcon via phandle.
+	 * On ACPI platforms, we get the name from a device property.
+	 * This device property is for kernel internal use only and
+	 * is expected to be set by the glue code.
+	 */
+	if (device_property_read_string(dev, "linux,extcon-name", &name) == 0) {
+		edev = extcon_get_extcon_dev(name);
+		if (!edev)
+			return ERR_PTR(-EPROBE_DEFER);
+
+		return edev;
+	}
+
+	/*
+	 * Try to get an extcon device from the USB PHY controller's "port"
+	 * node. Check if it has the "port" node first, to avoid printing the
+	 * error message from underlying code, as it's a valid case: extcon
+	 * device (and "port" node) may be missing in case of "usb-role-switch"
+	 * or OTG mode.
+	 */
+	np_phy = of_parse_phandle(dev->of_node, "phys", 0);
+	if (of_graph_is_present(np_phy)) {
+		struct device_node *np_conn;
+
+		np_conn = of_graph_get_remote_node(np_phy, -1, -1);
+		if (np_conn)
+			edev = extcon_find_edev_by_node(np_conn);
+		of_node_put(np_conn);
+	}
+	of_node_put(np_phy);
+
+	return edev;
+}
+
 #if IS_ENABLED(CONFIG_USB_ROLE_SWITCH)
 #define ROLE_SWITCH 1
 static int dwc3_usb_role_switch_set(struct usb_role_switch *sw,
@@ -542,6 +588,10 @@  int dwc3_drd_init(struct dwc3 *dwc)
 	    device_property_read_bool(dwc->dev, "usb-role-switch"))
 		return dwc3_setup_role_switch(dwc);
 
+	dwc->edev = dwc3_get_extcon(dwc);
+	if (IS_ERR(dwc->edev))
+		return PTR_ERR(dwc->edev);
+
 	if (dwc->edev) {
 		dwc->edev_nb.notifier_call = dwc3_drd_notifier;
 		ret = extcon_register_notifier(dwc->edev, EXTCON_USB_HOST,