diff mbox series

USB: dwc2: write HCINT with INTMASK applied

Message ID 20231115144514.15248-1-oneukum@suse.com (mailing list archive)
State Accepted
Commit 0583bc776ca5b5a3f5752869fc31cf7322df2b35
Headers show
Series USB: dwc2: write HCINT with INTMASK applied | expand

Commit Message

Oliver Neukum Nov. 15, 2023, 2:45 p.m. UTC
dwc2_hc_n_intr() writes back INTMASK as read but evaluates it
with intmask applied. In stress testing this causes spurious
interrupts like this:

[Mon Aug 14 10:51:07 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 7 - ChHltd set, but reason is unknown
[Mon Aug 14 10:51:07 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
[Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 0 - ChHltd set, but reason is unknown
[Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
[Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 4 - ChHltd set, but reason is unknown
[Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
[Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_update_urb_state_abn(): trimming xfer length

Applying INTMASK prevents this. The issue exists in all versions of the
driver.

Signed-off-by: Oliver Neukum <oneukum@suse.com>
Tested-by: Ivan Ivanov <ivan.ivanov@suse.com>
Tested-by: Andrea della Porta <andrea.porta@suse.com>
---
 drivers/usb/dwc2/hcd_intr.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

Comments

Minas Harutyunyan Nov. 22, 2023, 12:35 p.m. UTC | #1
Hi Oliver,

On 11/15/23 18:45, Oliver Neukum wrote:
> dwc2_hc_n_intr() writes back INTMASK as read but evaluates it
> with intmask applied. In stress testing this causes spurious
> interrupts like this:
> 
> [Mon Aug 14 10:51:07 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 7 - ChHltd set, but reason is unknown
> [Mon Aug 14 10:51:07 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 0 - ChHltd set, but reason is unknown
> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 4 - ChHltd set, but reason is unknown
> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_update_urb_state_abn(): trimming xfer length
> 
> Applying INTMASK prevents this. The issue exists in all versions of the
> driver.

I'm Ok with this patch, just need some elaboration.
So, channel halted interrupt asserted due to some other interrupt (AHB 
Error, Excessive transaction errors, Babble, Stall) which was masked. 
Can you check which of masked interrupts is cause of channel halt interrupt?

Thanks,
Minas

> 
> Signed-off-by: Oliver Neukum <oneukum@suse.com>
> Tested-by: Ivan Ivanov <ivan.ivanov@suse.com>
> Tested-by: Andrea della Porta <andrea.porta@suse.com>
> ---
>   drivers/usb/dwc2/hcd_intr.c | 15 +++++++--------
>   1 file changed, 7 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/usb/dwc2/hcd_intr.c b/drivers/usb/dwc2/hcd_intr.c
> index 0144ca8350c3..5c7538d498dd 100644
> --- a/drivers/usb/dwc2/hcd_intr.c
> +++ b/drivers/usb/dwc2/hcd_intr.c
> @@ -2015,15 +2015,17 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
>   {
>   	struct dwc2_qtd *qtd;
>   	struct dwc2_host_chan *chan;
> -	u32 hcint, hcintmsk;
> +	u32 hcint, hcintraw, hcintmsk;
>   
>   	chan = hsotg->hc_ptr_array[chnum];
>   
> -	hcint = dwc2_readl(hsotg, HCINT(chnum));
> +	hcintraw = dwc2_readl(hsotg, HCINT(chnum));
>   	hcintmsk = dwc2_readl(hsotg, HCINTMSK(chnum));
> +	hcint = hcintraw & hcintmsk;
> +	dwc2_writel(hsotg, hcint, HCINT(chnum));
> +
>   	if (!chan) {
>   		dev_err(hsotg->dev, "## hc_ptr_array for channel is NULL ##\n");
> -		dwc2_writel(hsotg, hcint, HCINT(chnum));
>   		return;
>   	}
>   
> @@ -2032,11 +2034,9 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
>   			 chnum);
>   		dev_vdbg(hsotg->dev,
>   			 "  hcint 0x%08x, hcintmsk 0x%08x, hcint&hcintmsk 0x%08x\n",
> -			 hcint, hcintmsk, hcint & hcintmsk);
> +			 hcintraw, hcintmsk, hcint);
>   	}
>   
> -	dwc2_writel(hsotg, hcint, HCINT(chnum));
> -
>   	/*
>   	 * If we got an interrupt after someone called
>   	 * dwc2_hcd_endpoint_disable() we don't want to crash below
> @@ -2046,8 +2046,7 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
>   		return;
>   	}
>   
> -	chan->hcint = hcint;
> -	hcint &= hcintmsk;
> +	chan->hcint = hcintraw;
>   
>   	/*
>   	 * If the channel was halted due to a dequeue, the qtd list might
Andrea della Porta Nov. 27, 2023, 9:04 a.m. UTC | #2
On 12:35 Wed 22 Nov     , Minas Harutyunyan wrote:
> Hi Oliver,
> 
> On 11/15/23 18:45, Oliver Neukum wrote:
> > dwc2_hc_n_intr() writes back INTMASK as read but evaluates it
> > with intmask applied. In stress testing this causes spurious
> > interrupts like this:
> > 
> > [Mon Aug 14 10:51:07 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 7 - ChHltd set, but reason is unknown
> > [Mon Aug 14 10:51:07 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
> > [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 0 - ChHltd set, but reason is unknown
> > [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
> > [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 4 - ChHltd set, but reason is unknown
> > [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
> > [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_update_urb_state_abn(): trimming xfer length
> > 
> > Applying INTMASK prevents this. The issue exists in all versions of the
> > driver.
> 
> I'm Ok with this patch, just need some elaboration.
> So, channel halted interrupt asserted due to some other interrupt (AHB 
> Error, Excessive transaction errors, Babble, Stall) which was masked. 
> Can you check which of masked interrupts is cause of channel halt interrupt?
> 
> Thanks,
> Minas
>

Hi Minas,
here's the report from dmesg:

[209384.238724]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[209384.246725]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[209384.247634]   hcint 0x00000003, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[209384.254722]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[209384.262725]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[209384.270724]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[209384.278336]   hcint 0x00000092, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[209384.278384]   hcint 0x00000010, hcintmsk 0x00000436, hcint&hcintmsk 0x00000010
[209384.278720]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[209384.286723]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[209384.288486]   hcint 0x00000021, hcintmsk 0x00000426, hcint&hcintmsk 0x00000020
[209384.288511]   hcint 0x00000002, hcintmsk 0x00000406, hcint&hcintmsk 0x00000002
[209384.288528] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 1 - ChHltd set, but reason is unknown
[209384.288604]   hcint 0x00000010, hcintmsk 0x00000436, hcint&hcintmsk 0x00000010
[209384.294720]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[209384.302734]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[209384.310724]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[209384.318721]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[209384.321722]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[209384.326729]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002


Many thanks,
Andrea 


> > 
> > Signed-off-by: Oliver Neukum <oneukum@suse.com>
> > Tested-by: Ivan Ivanov <ivan.ivanov@suse.com>
> > Tested-by: Andrea della Porta <andrea.porta@suse.com>
> > ---
> >   drivers/usb/dwc2/hcd_intr.c | 15 +++++++--------
> >   1 file changed, 7 insertions(+), 8 deletions(-)
> > 
> > diff --git a/drivers/usb/dwc2/hcd_intr.c b/drivers/usb/dwc2/hcd_intr.c
> > index 0144ca8350c3..5c7538d498dd 100644
> > --- a/drivers/usb/dwc2/hcd_intr.c
> > +++ b/drivers/usb/dwc2/hcd_intr.c
> > @@ -2015,15 +2015,17 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
> >   {
> >   	struct dwc2_qtd *qtd;
> >   	struct dwc2_host_chan *chan;
> > -	u32 hcint, hcintmsk;
> > +	u32 hcint, hcintraw, hcintmsk;
> >   
> >   	chan = hsotg->hc_ptr_array[chnum];
> >   
> > -	hcint = dwc2_readl(hsotg, HCINT(chnum));
> > +	hcintraw = dwc2_readl(hsotg, HCINT(chnum));
> >   	hcintmsk = dwc2_readl(hsotg, HCINTMSK(chnum));
> > +	hcint = hcintraw & hcintmsk;
> > +	dwc2_writel(hsotg, hcint, HCINT(chnum));
> > +
> >   	if (!chan) {
> >   		dev_err(hsotg->dev, "## hc_ptr_array for channel is NULL ##\n");
> > -		dwc2_writel(hsotg, hcint, HCINT(chnum));
> >   		return;
> >   	}
> >   
> > @@ -2032,11 +2034,9 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
> >   			 chnum);
> >   		dev_vdbg(hsotg->dev,
> >   			 "  hcint 0x%08x, hcintmsk 0x%08x, hcint&hcintmsk 0x%08x\n",
> > -			 hcint, hcintmsk, hcint & hcintmsk);
> > +			 hcintraw, hcintmsk, hcint);
> >   	}
> >   
> > -	dwc2_writel(hsotg, hcint, HCINT(chnum));
> > -
> >   	/*
> >   	 * If we got an interrupt after someone called
> >   	 * dwc2_hcd_endpoint_disable() we don't want to crash below
> > @@ -2046,8 +2046,7 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
> >   		return;
> >   	}
> >   
> > -	chan->hcint = hcint;
> > -	hcint &= hcintmsk;
> > +	chan->hcint = hcintraw;
> >   
> >   	/*
> >   	 * If the channel was halted due to a dequeue, the qtd list might
Minas Harutyunyan Nov. 27, 2023, 10:46 a.m. UTC | #3
Hi Andrea,

On 11/27/23 13:04, Andrea della Porta wrote:
> On 12:35 Wed 22 Nov     , Minas Harutyunyan wrote:
>> Hi Oliver,
>>
>> On 11/15/23 18:45, Oliver Neukum wrote:
>>> dwc2_hc_n_intr() writes back INTMASK as read but evaluates it
>>> with intmask applied. In stress testing this causes spurious
>>> interrupts like this:
>>>
>>> [Mon Aug 14 10:51:07 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 7 - ChHltd set, but reason is unknown
>>> [Mon Aug 14 10:51:07 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
>>> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 0 - ChHltd set, but reason is unknown
>>> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
>>> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 4 - ChHltd set, but reason is unknown
>>> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
>>> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_update_urb_state_abn(): trimming xfer length
>>>
>>> Applying INTMASK prevents this. The issue exists in all versions of the
>>> driver.
>>
>> I'm Ok with this patch, just need some elaboration.
>> So, channel halted interrupt asserted due to some other interrupt (AHB
>> Error, Excessive transaction errors, Babble, Stall) which was masked.
>> Can you check which of masked interrupts is cause of channel halt interrupt?
>>
>> Thanks,
>> Minas
>>
> 
> Hi Minas,
> here's the report from dmesg:
> 
> [209384.238724]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [209384.246725]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [209384.247634]   hcint 0x00000003, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [209384.254722]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [209384.262725]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [209384.270724]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [209384.278336]   hcint 0x00000092, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [209384.278384]   hcint 0x00000010, hcintmsk 0x00000436, hcint&hcintmsk 0x00000010
> [209384.278720]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [209384.286723]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [209384.288486]   hcint 0x00000021, hcintmsk 0x00000426, hcint&hcintmsk 0x00000020

> [209384.288511]   hcint 0x00000002, hcintmsk 0x00000406, hcint&hcintmsk 0x00000002
According your patch terminology above 'hcint' is a 'hcintraw' and here 
asserted only channel halted interrupt. So, channel halted without any 
reason. Not clear how your patch fix this case?

Couple of questions related to your tests:
1. Which DMA mode used here - buffer or descriptor DMA?
2. Which type of transfers testing? Or you can add to this print HCCHARx 
also.
3. Which version of core you use (GSNPSID)?

Thanks,
Minas

> [209384.288528] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 1 - ChHltd set, but reason is unknown

> [209384.288604]   hcint 0x00000010, hcintmsk 0x00000436, hcint&hcintmsk 0x00000010
> [209384.294720]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [209384.302734]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [209384.310724]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [209384.318721]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [209384.321722]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [209384.326729]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> 
> 
> Many thanks,
> Andrea
> 
> 
>>>
>>> Signed-off-by: Oliver Neukum <oneukum@suse.com>
>>> Tested-by: Ivan Ivanov <ivan.ivanov@suse.com>
>>> Tested-by: Andrea della Porta <andrea.porta@suse.com>
>>> ---
>>>    drivers/usb/dwc2/hcd_intr.c | 15 +++++++--------
>>>    1 file changed, 7 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/drivers/usb/dwc2/hcd_intr.c b/drivers/usb/dwc2/hcd_intr.c
>>> index 0144ca8350c3..5c7538d498dd 100644
>>> --- a/drivers/usb/dwc2/hcd_intr.c
>>> +++ b/drivers/usb/dwc2/hcd_intr.c
>>> @@ -2015,15 +2015,17 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
>>>    {
>>>    	struct dwc2_qtd *qtd;
>>>    	struct dwc2_host_chan *chan;
>>> -	u32 hcint, hcintmsk;
>>> +	u32 hcint, hcintraw, hcintmsk;
>>>    
>>>    	chan = hsotg->hc_ptr_array[chnum];
>>>    
>>> -	hcint = dwc2_readl(hsotg, HCINT(chnum));
>>> +	hcintraw = dwc2_readl(hsotg, HCINT(chnum));
>>>    	hcintmsk = dwc2_readl(hsotg, HCINTMSK(chnum));
>>> +	hcint = hcintraw & hcintmsk;
>>> +	dwc2_writel(hsotg, hcint, HCINT(chnum));
>>> +
>>>    	if (!chan) {
>>>    		dev_err(hsotg->dev, "## hc_ptr_array for channel is NULL ##\n");
>>> -		dwc2_writel(hsotg, hcint, HCINT(chnum));
>>>    		return;
>>>    	}
>>>    
>>> @@ -2032,11 +2034,9 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
>>>    			 chnum);
>>>    		dev_vdbg(hsotg->dev,
>>>    			 "  hcint 0x%08x, hcintmsk 0x%08x, hcint&hcintmsk 0x%08x\n",
>>> -			 hcint, hcintmsk, hcint & hcintmsk);
>>> +			 hcintraw, hcintmsk, hcint);
>>>    	}
>>>    
>>> -	dwc2_writel(hsotg, hcint, HCINT(chnum));
>>> -
>>>    	/*
>>>    	 * If we got an interrupt after someone called
>>>    	 * dwc2_hcd_endpoint_disable() we don't want to crash below
>>> @@ -2046,8 +2046,7 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
>>>    		return;
>>>    	}
>>>    
>>> -	chan->hcint = hcint;
>>> -	hcint &= hcintmsk;
>>> +	chan->hcint = hcintraw;
>>>    
>>>    	/*
>>>    	 * If the channel was halted due to a dequeue, the qtd list might
Andrea della Porta Nov. 28, 2023, 9 a.m. UTC | #4
Hi Minas,

On 10:46 Mon 27 Nov     , Minas Harutyunyan wrote:
> Hi Andrea,
> 
> On 11/27/23 13:04, Andrea della Porta wrote:
> > On 12:35 Wed 22 Nov     , Minas Harutyunyan wrote:
> >> Hi Oliver,
> >>
> >> On 11/15/23 18:45, Oliver Neukum wrote:
> >>> dwc2_hc_n_intr() writes back INTMASK as read but evaluates it
> >>> with intmask applied. In stress testing this causes spurious
> >>> interrupts like this:
> >>>
> >>> [Mon Aug 14 10:51:07 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 7 - ChHltd set, but reason is unknown
> >>> [Mon Aug 14 10:51:07 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
> >>> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 0 - ChHltd set, but reason is unknown
> >>> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
> >>> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 4 - ChHltd set, but reason is unknown
> >>> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
> >>> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_update_urb_state_abn(): trimming xfer length
> >>>
> >>> Applying INTMASK prevents this. The issue exists in all versions of the
> >>> driver.
> >>
> >> I'm Ok with this patch, just need some elaboration.
> >> So, channel halted interrupt asserted due to some other interrupt (AHB
> >> Error, Excessive transaction errors, Babble, Stall) which was masked.
> >> Can you check which of masked interrupts is cause of channel halt interrupt?
> >>
> >> Thanks,
> >> Minas
> >>
> > 
> > Hi Minas,
> > here's the report from dmesg:
> > 
> > [209384.238724]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> > [209384.246725]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> > [209384.247634]   hcint 0x00000003, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> > [209384.254722]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> > [209384.262725]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> > [209384.270724]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> > [209384.278336]   hcint 0x00000092, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> > [209384.278384]   hcint 0x00000010, hcintmsk 0x00000436, hcint&hcintmsk 0x00000010
> > [209384.278720]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> > [209384.286723]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> > [209384.288486]   hcint 0x00000021, hcintmsk 0x00000426, hcint&hcintmsk 0x00000020
> 
> > [209384.288511]   hcint 0x00000002, hcintmsk 0x00000406, hcint&hcintmsk 0x00000002
> According your patch terminology above 'hcint' is a 'hcintraw' and here 
> asserted only channel halted interrupt. So, channel halted without any 
> reason. Not clear how your patch fix this case?
> 
> Couple of questions related to your tests:
> 1. Which DMA mode used here - buffer or descriptor DMA?
> 2. Which type of transfers testing? Or you can add to this print HCCHARx 
> also.
> 3. Which version of core you use (GSNPSID)?
> 
> Thanks,
> Minas
>

The augmented log is here below (hcchar[chnum] and others added):

[330438.623017] --Host Channel Interrupt--, Channel 4
[330438.623029]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[330438.623039]   hcchar[4] = 0x015c9810, chan->ep_type=3
[330438.626183] --Host Channel Interrupt--, Channel 5
[330438.626198]   hcint 0x00000003, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[330438.626208]   hcchar[5] = 0x01d83200, chan->ep_type=2
[330438.627659] --Host Channel Interrupt--, Channel 3
[330438.627674]   hcint 0x00000021, hcintmsk 0x00000426, hcint&hcintmsk 0x00000020
[330438.627686]   hcchar[3] = 0x01d8d200, chan->ep_type=2
[330438.627703] --Host Channel Interrupt--, Channel 3
[330438.627711]   hcint 0x00000002, hcintmsk 0x00000406, hcint&hcintmsk 0x00000002
[330438.627721]   hcchar[3] = 0x01d8d200, chan->ep_type=2
[330438.627740] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 3 - ChHltd set, but reason is unknown
[330438.627758] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
[330438.627798] --Host Channel Interrupt--, Channel 0
[330438.627807]   hcint 0x00000010, hcintmsk 0x00000436, hcint&hcintmsk 0x00000010
[330438.627818]   hcchar[0] = 0x81d8d200, chan->ep_type=2
[330438.631017] --Host Channel Interrupt--, Channel 1
[330438.631025]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[330438.631035]   hcchar[1] = 0x015c9810, chan->ep_type=3
[330438.639019] --Host Channel Interrupt--, Channel 7
[330438.639041]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[330438.639053]   hcchar[7] = 0x015c9810, chan->ep_type=3
[330438.647019] --Host Channel Interrupt--, Channel 6
[330438.647040]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[330438.647051]   hcchar[6] = 0x015c9810, chan->ep_type=3
[330438.649015] --Host Channel Interrupt--, Channel 4
[330438.649020]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002

we're using buffer DMA mode (dma_desc_enable=0), we are atrss testing via ping flooding
through an LTE modem attached to USB which should mostly do bulk dataflow (confirmed
by ep_type==2). From regdump, GSNPSID = 0x4f54280a.

Many thanks,
Andrea
 
> >>>
> >>> Signed-off-by: Oliver Neukum <oneukum@suse.com>
> >>> Tested-by: Ivan Ivanov <ivan.ivanov@suse.com>
> >>> Tested-by: Andrea della Porta <andrea.porta@suse.com>
> >>> ---
> >>>    drivers/usb/dwc2/hcd_intr.c | 15 +++++++--------
> >>>    1 file changed, 7 insertions(+), 8 deletions(-)
> >>>
> >>> diff --git a/drivers/usb/dwc2/hcd_intr.c b/drivers/usb/dwc2/hcd_intr.c
> >>> index 0144ca8350c3..5c7538d498dd 100644
> >>> --- a/drivers/usb/dwc2/hcd_intr.c
> >>> +++ b/drivers/usb/dwc2/hcd_intr.c
> >>> @@ -2015,15 +2015,17 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
> >>>    {
> >>>    	struct dwc2_qtd *qtd;
> >>>    	struct dwc2_host_chan *chan;
> >>> -	u32 hcint, hcintmsk;
> >>> +	u32 hcint, hcintraw, hcintmsk;
> >>>    
> >>>    	chan = hsotg->hc_ptr_array[chnum];
> >>>    
> >>> -	hcint = dwc2_readl(hsotg, HCINT(chnum));
> >>> +	hcintraw = dwc2_readl(hsotg, HCINT(chnum));
> >>>    	hcintmsk = dwc2_readl(hsotg, HCINTMSK(chnum));
> >>> +	hcint = hcintraw & hcintmsk;
> >>> +	dwc2_writel(hsotg, hcint, HCINT(chnum));
> >>> +
> >>>    	if (!chan) {
> >>>    		dev_err(hsotg->dev, "## hc_ptr_array for channel is NULL ##\n");
> >>> -		dwc2_writel(hsotg, hcint, HCINT(chnum));
> >>>    		return;
> >>>    	}
> >>>    
> >>> @@ -2032,11 +2034,9 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
> >>>    			 chnum);
> >>>    		dev_vdbg(hsotg->dev,
> >>>    			 "  hcint 0x%08x, hcintmsk 0x%08x, hcint&hcintmsk 0x%08x\n",
> >>> -			 hcint, hcintmsk, hcint & hcintmsk);
> >>> +			 hcintraw, hcintmsk, hcint);
> >>>    	}
> >>>    
> >>> -	dwc2_writel(hsotg, hcint, HCINT(chnum));
> >>> -
> >>>    	/*
> >>>    	 * If we got an interrupt after someone called
> >>>    	 * dwc2_hcd_endpoint_disable() we don't want to crash below
> >>> @@ -2046,8 +2046,7 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
> >>>    		return;
> >>>    	}
> >>>    
> >>> -	chan->hcint = hcint;
> >>> -	hcint &= hcintmsk;
> >>> +	chan->hcint = hcintraw;
> >>>    
> >>>    	/*
> >>>    	 * If the channel was halted due to a dequeue, the qtd list might
Minas Harutyunyan Nov. 28, 2023, 11:48 a.m. UTC | #5
Hi Andrea,

On 11/28/23 13:00, Andrea della Porta wrote:
> Hi Minas,
> 
> On 10:46 Mon 27 Nov     , Minas Harutyunyan wrote:
>> Hi Andrea,
>>
>> On 11/27/23 13:04, Andrea della Porta wrote:
>>> On 12:35 Wed 22 Nov     , Minas Harutyunyan wrote:
>>>> Hi Oliver,
>>>>
>>>> On 11/15/23 18:45, Oliver Neukum wrote:
>>>>> dwc2_hc_n_intr() writes back INTMASK as read but evaluates it
>>>>> with intmask applied. In stress testing this causes spurious
>>>>> interrupts like this:
>>>>>
>>>>> [Mon Aug 14 10:51:07 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 7 - ChHltd set, but reason is unknown
>>>>> [Mon Aug 14 10:51:07 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
>>>>> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 0 - ChHltd set, but reason is unknown
>>>>> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
>>>>> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 4 - ChHltd set, but reason is unknown
>>>>> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
>>>>> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_update_urb_state_abn(): trimming xfer length
>>>>>
>>>>> Applying INTMASK prevents this. The issue exists in all versions of the
>>>>> driver.
>>>>
>>>> I'm Ok with this patch, just need some elaboration.
>>>> So, channel halted interrupt asserted due to some other interrupt (AHB
>>>> Error, Excessive transaction errors, Babble, Stall) which was masked.
>>>> Can you check which of masked interrupts is cause of channel halt interrupt?
>>>>
>>>> Thanks,
>>>> Minas
>>>>
>>>
>>> Hi Minas,
>>> here's the report from dmesg:
>>>
>>> [209384.238724]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
>>> [209384.246725]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
>>> [209384.247634]   hcint 0x00000003, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
>>> [209384.254722]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
>>> [209384.262725]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
>>> [209384.270724]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
>>> [209384.278336]   hcint 0x00000092, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
>>> [209384.278384]   hcint 0x00000010, hcintmsk 0x00000436, hcint&hcintmsk 0x00000010
>>> [209384.278720]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
>>> [209384.286723]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
>>> [209384.288486]   hcint 0x00000021, hcintmsk 0x00000426, hcint&hcintmsk 0x00000020
>>
>>> [209384.288511]   hcint 0x00000002, hcintmsk 0x00000406, hcint&hcintmsk 0x00000002
>> According your patch terminology above 'hcint' is a 'hcintraw' and here
>> asserted only channel halted interrupt. So, channel halted without any
>> reason. Not clear how your patch fix this case?
>>
>> Couple of questions related to your tests:
>> 1. Which DMA mode used here - buffer or descriptor DMA?
>> 2. Which type of transfers testing? Or you can add to this print HCCHARx
>> also.
>> 3. Which version of core you use (GSNPSID)?
>>
>> Thanks,
>> Minas
>>
> 
> The augmented log is here below (hcchar[chnum] and others added):
> 
> [330438.623017] --Host Channel Interrupt--, Channel 4
> [330438.623029]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [330438.623039]   hcchar[4] = 0x015c9810, chan->ep_type=3
> [330438.626183] --Host Channel Interrupt--, Channel 5
> [330438.626198]   hcint 0x00000003, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [330438.626208]   hcchar[5] = 0x01d83200, chan->ep_type=2
> [330438.627659] --Host Channel Interrupt--, Channel 3
> [330438.627674]   hcint 0x00000021, hcintmsk 0x00000426, hcint&hcintmsk 0x00000020
> [330438.627686]   hcchar[3] = 0x01d8d200, chan->ep_type=2
> [330438.627703] --Host Channel Interrupt--, Channel 3
> [330438.627711]   hcint 0x00000002, hcintmsk 0x00000406, hcint&hcintmsk 0x00000002
> [330438.627721]   hcchar[3] = 0x01d8d200, chan->ep_type=2
> [330438.627740] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 3 - ChHltd set, but reason is unknown
> [330438.627758] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
> [330438.627798] --Host Channel Interrupt--, Channel 0
> [330438.627807]   hcint 0x00000010, hcintmsk 0x00000436, hcint&hcintmsk 0x00000010
> [330438.627818]   hcchar[0] = 0x81d8d200, chan->ep_type=2
> [330438.631017] --Host Channel Interrupt--, Channel 1
> [330438.631025]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [330438.631035]   hcchar[1] = 0x015c9810, chan->ep_type=3
> [330438.639019] --Host Channel Interrupt--, Channel 7
> [330438.639041]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [330438.639053]   hcchar[7] = 0x015c9810, chan->ep_type=3
> [330438.647019] --Host Channel Interrupt--, Channel 6
> [330438.647040]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [330438.647051]   hcchar[6] = 0x015c9810, chan->ep_type=3
> [330438.649015] --Host Channel Interrupt--, Channel 4
> [330438.649020]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> 
> we're using buffer DMA mode (dma_desc_enable=0), we are atrss testing via ping flooding
> through an LTE modem attached to USB which should mostly do bulk dataflow (confirmed
> by ep_type==2). From regdump, GSNPSID = 0x4f54280a.
> 

Host channel halt interrupt with "unkown reason" (no any other interrupt 
seen in hcintraw) mean that channel halted because application disable 
channel, i.e. on urb_dequeue.
Maybe "ChHltd set, but reason is unknown" message is not correct, but 
it's possible than channel halted interrupt asserted not because of any 
error condition.
Channel halted interrupt asserted by databook in follow cases:
"In non Scatter/Gather DMA mode, it indicates the transfer completed 
abnormally either because of any USB transaction error or in response to
disable request by the application or because of a completed transfer."

So, it's not spurious interrupt.
Does this "spurious" interrupt broke your tests? If not, then doesn't 
need to apply your patch. Moreover, I can't understand how your patch 
allow to avoid case with "ChHltd set, but reason is unknown" message.

Thanks,
Minas

> Many thanks,
> Andrea
>   
>>>>>
>>>>> Signed-off-by: Oliver Neukum <oneukum@suse.com>
>>>>> Tested-by: Ivan Ivanov <ivan.ivanov@suse.com>
>>>>> Tested-by: Andrea della Porta <andrea.porta@suse.com>
>>>>> ---
>>>>>     drivers/usb/dwc2/hcd_intr.c | 15 +++++++--------
>>>>>     1 file changed, 7 insertions(+), 8 deletions(-)
>>>>>
>>>>> diff --git a/drivers/usb/dwc2/hcd_intr.c b/drivers/usb/dwc2/hcd_intr.c
>>>>> index 0144ca8350c3..5c7538d498dd 100644
>>>>> --- a/drivers/usb/dwc2/hcd_intr.c
>>>>> +++ b/drivers/usb/dwc2/hcd_intr.c
>>>>> @@ -2015,15 +2015,17 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
>>>>>     {
>>>>>     	struct dwc2_qtd *qtd;
>>>>>     	struct dwc2_host_chan *chan;
>>>>> -	u32 hcint, hcintmsk;
>>>>> +	u32 hcint, hcintraw, hcintmsk;
>>>>>     
>>>>>     	chan = hsotg->hc_ptr_array[chnum];
>>>>>     
>>>>> -	hcint = dwc2_readl(hsotg, HCINT(chnum));
>>>>> +	hcintraw = dwc2_readl(hsotg, HCINT(chnum));
>>>>>     	hcintmsk = dwc2_readl(hsotg, HCINTMSK(chnum));
>>>>> +	hcint = hcintraw & hcintmsk;
>>>>> +	dwc2_writel(hsotg, hcint, HCINT(chnum));
>>>>> +
>>>>>     	if (!chan) {
>>>>>     		dev_err(hsotg->dev, "## hc_ptr_array for channel is NULL ##\n");
>>>>> -		dwc2_writel(hsotg, hcint, HCINT(chnum));
>>>>>     		return;
>>>>>     	}
>>>>>     
>>>>> @@ -2032,11 +2034,9 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
>>>>>     			 chnum);
>>>>>     		dev_vdbg(hsotg->dev,
>>>>>     			 "  hcint 0x%08x, hcintmsk 0x%08x, hcint&hcintmsk 0x%08x\n",
>>>>> -			 hcint, hcintmsk, hcint & hcintmsk);
>>>>> +			 hcintraw, hcintmsk, hcint);
>>>>>     	}
>>>>>     
>>>>> -	dwc2_writel(hsotg, hcint, HCINT(chnum));
>>>>> -
>>>>>     	/*
>>>>>     	 * If we got an interrupt after someone called
>>>>>     	 * dwc2_hcd_endpoint_disable() we don't want to crash below
>>>>> @@ -2046,8 +2046,7 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
>>>>>     		return;
>>>>>     	}
>>>>>     
>>>>> -	chan->hcint = hcint;
>>>>> -	hcint &= hcintmsk;
>>>>> +	chan->hcint = hcintraw;
>>>>>     
>>>>>     	/*
>>>>>     	 * If the channel was halted due to a dequeue, the qtd list might
Ivan Ivanov Nov. 28, 2023, 2:43 p.m. UTC | #6
Hi Minas,

On Tue, 2023-11-28 at 11:48 +0000, Minas Harutyunyan wrote:
> 
> Does this "spurious" interrupt broke your tests? 

It is not just some kind of synthetic test case that was broken.
but real world usage. You can find complains about this error on
various internet forums, just search for dwc2_hc_chhltd_intr_dma
and it is not so difficult to reproduce.

Without databook I am not sure we can create better fix, but if
you develop different solution I will gladly tested it.

Regards,
Ivan
Minas Harutyunyan Dec. 1, 2023, 10:26 a.m. UTC | #7
Hi Ivan,

On 11/28/23 18:43, Ivan Ivanov wrote:
> 
> Hi Minas,
> 
> On Tue, 2023-11-28 at 11:48 +0000, Minas Harutyunyan wrote:
>>
>> Does this "spurious" interrupt broke your tests?
> 
> It is not just some kind of synthetic test case that was broken.
> but real world usage. You can find complains about this error on
> various internet forums, just search for dwc2_hc_chhltd_intr_dma
> and it is not so difficult to reproduce.
> 
> Without databook I am not sure we can create better fix, but if
> you develop different solution I will gladly tested it.
> 
> Regards,
> Ivan
1. In addition to HCCHARx and ep_type printing please add printing of 
GRXFSTSR if EP is IN or GNPTXSTS if EP is OUT, and provide dmesg with 
error case.
2. In function dwc2_hc_chhltd_intr_dma() remove "goto error;" after 
printing "ChHltd set, but reason is unknown".

Questions.
1. Do you see in the dmesg "Transaction Error--" before "ChHltd set, but 
reason is unknown"?
2. Your driver built for Host mode only or for both Device/Host?
3. If you build driver with debug and verbose debug prints enabled, 
issue still seen?
4. Andrea wrote: "we are atrss testing via ping flooding
through an LTE modem attached". This about USB2.0 ping flood in USB 
trace or just "ping" user command?

Thanks,
Minas
Andrea della Porta Dec. 3, 2023, 6:52 a.m. UTC | #8
Hi Minas,

On 10:26 Fri 01 Dec     , Minas Harutyunyan wrote:
> Hi Ivan,
> 
> On 11/28/23 18:43, Ivan Ivanov wrote:
> > 
> > Hi Minas,
> > 
> > On Tue, 2023-11-28 at 11:48 +0000, Minas Harutyunyan wrote:
> >>
> >> Does this "spurious" interrupt broke your tests?
> > 
> > It is not just some kind of synthetic test case that was broken.
> > but real world usage. You can find complains about this error on
> > various internet forums, just search for dwc2_hc_chhltd_intr_dma
> > and it is not so difficult to reproduce.
> > 
> > Without databook I am not sure we can create better fix, but if
> > you develop different solution I will gladly tested it.
> > 
> > Regards,
> > Ivan
> 1. In addition to HCCHARx and ep_type printing please add printing of 
> GRXFSTSR if EP is IN or GNPTXSTS if EP is OUT, and provide dmesg with 
> error case.

Here's the log, before comenting the 'goto' out:


[684829.206854] --Host Channel Interrupt--, Channel 2
[684829.206866]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[684829.206875]   hcchar[2] = 0x015c9810, chan->ep_type=3
[684829.206883]   GRXSTSR = 0x000E0002
[684829.214851] --Host Channel Interrupt--, Channel 6
[684829.214864]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[684829.214876]   hcchar[6] = 0x015c9810, chan->ep_type=3
[684829.214886]   GRXSTSR = 0x000E0007
[684829.217853] --Host Channel Interrupt--, Channel 5
[684829.217869]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[684829.217881]   hcchar[5] = 0x009c8801, chan->ep_type=3
[684829.217891]   GRXSTSR = 0x000E0005
[684829.222647] --Host Channel Interrupt--, Channel 0
[684829.222659]   hcint 0x00000021, hcintmsk 0x00000426, hcint&hcintmsk 0x00000020
[684829.222671]   hcchar[0] = 0x01d8d200, chan->ep_type=2
[684829.222681]   GRXSTSR = 0x00070044
[684829.222696] --Host Channel Interrupt--, Channel 0
[684829.222704]   hcint 0x00000002, hcintmsk 0x00000406, hcint&hcintmsk 0x00000002
[684829.222714]   hcchar[0] = 0x01d8d200, chan->ep_type=2
[684829.222724]   GRXSTSR = 0x00070044
[684829.222740] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 0 - ChHltd set, but reason is unknown
[684829.222758] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04000009
[684829.222797] --Host Channel Interrupt--, Channel 7
[684829.222806]   hcint 0x00000010, hcintmsk 0x00000436, hcint&hcintmsk 0x00000010
[684829.222816]   hcchar[7] = 0x81d8d200, chan->ep_type=2
[684829.222827]   GRXSTSR = 0x00070044
[684829.222851] --Host Channel Interrupt--, Channel 4
[684829.222859]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[684829.222869]   hcchar[4] = 0x015c9810, chan->ep_type=3
[684829.222879]   GRXSTSR = 0x000F0004
[684829.230853] --Host Channel Interrupt--, Channel 3
[684829.230875]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[684829.230885]   hcchar[3] = 0x015c9810, chan->ep_type=3
[684829.230895]   GRXSTSR = 0x00050000
[684829.238852] --Host Channel Interrupt--, Channel 2
[684829.238870]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[684829.238881]   hcchar[2] = 0x015c9810, chan->ep_type=3
[684829.238890]   GRXSTSR = 0x00070000
[684829.246851] --Host Channel Interrupt--, Channel 6
[684829.246863]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[684829.246872]   hcchar[6] = 0x015c9810, chan->ep_type=3
[684829.246880]   GRXSTSR = 0x000F0000


> 2. In function dwc2_hc_chhltd_intr_dma() remove "goto error;" after 
> printing "ChHltd set, but reason is unknown".a

Interestingly enough, with goto commented we do not see any error written on dmesg
(neither 'ChHltd set, but reason is unknown' nor 'Transaction Error---'), but
the ping is not working anymore (i.e. ping -I wwan0 8.8.8.8 from cmdline) is
not acknowledging any reply. Other peripherals attached to USB as the ethernet
are still working as usual.

> 
> Questions.
> 1. Do you see in the dmesg "Transaction Error--" before "ChHltd set, but 
> reason is unknown"?

No

> 2. Your driver built for Host mode only or for both Device/Host?

The module has been compiled as dual role driver.

> 3. If you build driver with debug and verbose debug prints enabled, 
> issue still seen?

Yes, please see the log below for further detail:


[50732.286732] --Host Channel Interrupt--, Channel 7
[50732.286742]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[50732.286752]   hcchar[7] = 0x015c9810, chan->ep_type=3
[50732.286761]   GRXSTSR = 0x000E0003
[50732.294732] --Host Channel Interrupt--, Channel 3
[50732.294742]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[50732.294752]   hcchar[3] = 0x015c9810, chan->ep_type=3
[50732.294762]   GRXSTSR = 0x000E0004
[50732.302732] --Host Channel Interrupt--, Channel 2
[50732.302745]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[50732.302755]   hcchar[2] = 0x015c9810, chan->ep_type=3
[50732.302763]   GRXSTSR = 0x000E0000
[50732.310732] --Host Channel Interrupt--, Channel 5
[50732.310741]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[50732.310751]   hcchar[5] = 0x015c9810, chan->ep_type=3
[50732.310760]   GRXSTSR = 0x000E0006
[50732.318732] --Host Channel Interrupt--, Channel 6
[50732.318746]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[50732.318756]   hcchar[6] = 0x015c9810, chan->ep_type=3
[50732.318765]   GRXSTSR = 0x000E0002
[50732.325262] --Host Channel Interrupt--, Channel 1
[50732.325276]   hcint 0x00000003, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[50732.325286]   hcchar[1] = 0x01d83200, chan->ep_type=2
[50732.325295]   GNPTXSTS = 0x00000100
[50732.325309] dwc2 3f980000.usb: --Host Channel 1 Interrupt: Channel Halted--
[50732.325324] dwc2 3f980000.usb: --Host Channel 1 Interrupt: DMA Channel Halted--
[50732.325337] dwc2 3f980000.usb: --Host Channel 1 Interrupt: Transfer Complete--
[50732.325350] dwc2 3f980000.usb:   Bulk transfer complete
[50732.325362] dwc2 3f980000.usb: urb->actual_length=0 xfer_length=98
[50732.325375] dwc2 3f980000.usb: DWC_otg: dwc2_update_urb_state: OUT, channel 1
[50732.325389] dwc2 3f980000.usb:   chan->xfer_len 98
[50732.325400] dwc2 3f980000.usb:   hctsiz.xfersize 98
[50732.325411] dwc2 3f980000.usb:   urb->transfer_buffer_length 98
[50732.325423] dwc2 3f980000.usb:   urb->actual_length 98
[50732.325433] dwc2 3f980000.usb:   short_read 0, xfer_done 1
[50732.325452] dwc2 3f980000.usb: dwc2_complete_non_periodic_xfer()
[50732.325465] dwc2 3f980000.usb:   dwc2_release_channel: channel 1, halt_status 2
[50732.325478] dwc2 3f980000.usb:   dwc2_deactivate_qh(0000000071c758b5,00000000c2c3a3ef,1)
[50732.326735] --Host Channel Interrupt--, Channel 7
[50732.326757]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[50732.326769]   hcchar[7] = 0x015c9810, chan->ep_type=3
[50732.326779]   GRXSTSR = 0x000E0007
[50732.334735] --Host Channel Interrupt--, Channel 3
[50732.334756]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[50732.334768]   hcchar[3] = 0x015c9810, chan->ep_type=3
[50732.334778]   GRXSTSR = 0x000E0003
[50732.342736] --Host Channel Interrupt--, Channel 2
[50732.342759]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[50732.342771]   hcchar[2] = 0x015c9810, chan->ep_type=3
[50732.342782]   GRXSTSR = 0x000E0004
[50732.350732] --Host Channel Interrupt--, Channel 5
[50732.350745]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[50732.350751]   hcchar[5] = 0x015c9810, chan->ep_type=3
[50732.350757]   GRXSTSR = 0x000E0000
[50732.357386] --Host Channel Interrupt--, Channel 0
[50732.357396]   hcint 0x00000021, hcintmsk 0x00000426, hcint&hcintmsk 0x00000020
[50732.357402]   hcchar[0] = 0x01d8d200, chan->ep_type=2
[50732.357407]   GRXSTSR = 0x000E0003
[50732.357417] dwc2 3f980000.usb: --Host Channel 0 Interrupt: ACK Received--
[50732.357429] --Host Channel Interrupt--, Channel 0
[50732.357433]   hcint 0x00000002, hcintmsk 0x00000406, hcint&hcintmsk 0x00000002
[50732.357439]   hcchar[0] = 0x01d8d200, chan->ep_type=2
[50732.357444]   GRXSTSR = 0x000E0003
[50732.357450] dwc2 3f980000.usb: --Host Channel 0 Interrupt: Channel Halted--
[50732.357457] dwc2 3f980000.usb: --Host Channel 0 Interrupt: DMA Channel Halted--
[50732.357465] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 0 - ChHltd set, but reason is unknown
[50732.357473] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
[50732.357482] dwc2 3f980000.usb: dwc2_update_urb_state_abn(): trimming xfer length
[50732.357490] dwc2 3f980000.usb: DWC_otg: dwc2_update_urb_state_abn: IN, channel 0
[50732.357498] dwc2 3f980000.usb:   chan->start_pkt_count 1
[50732.357505] dwc2 3f980000.usb:   hctsiz.pktcnt 0
[50732.357512] dwc2 3f980000.usb:   chan->max_packet 512
[50732.357519] dwc2 3f980000.usb:   bytes_transferred 490
[50732.357525] dwc2 3f980000.usb:   urb->actual_length 1514
[50732.357532] dwc2 3f980000.usb:   urb->transfer_buffer_length 1514
[50732.357539] dwc2 3f980000.usb: dwc2_halt_channel()
[50732.357546] dwc2 3f980000.usb: DMA enabled
[50732.357552] dwc2 3f980000.usb:   dwc2_release_channel: channel 0, halt_status 7
[50732.357561] dwc2 3f980000.usb:   Complete URB with transaction error
[50732.357572] dwc2 3f980000.usb:   dwc2_deactivate_qh(0000000071c758b5,000000009ab78884,1)
[50732.358737] --Host Channel Interrupt--, Channel 1
[50732.358753]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[50732.358759]   hcchar[1] = 0x015c9810, chan->ep_type=3
[50732.358765]   GRXSTSR = 0x000E0004
[50732.366732] --Host Channel Interrupt--, Channel 7
[50732.366746]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[50732.366752]   hcchar[7] = 0x015c9810, chan->ep_type=3
[50732.366758]   GRXSTSR = 0x000E0000
[50732.371732] --Host Channel Interrupt--, Channel 3
[50732.371746]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[50732.371753]   hcchar[3] = 0x00dc8801, chan->ep_type=3
[50732.371758]   GRXSTSR = 0x000E0006
[50732.374731] --Host Channel Interrupt--, Channel 2
[50732.374740]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[50732.374746]   hcchar[2] = 0x015c9810, chan->ep_type=3
[50732.374752]   GRXSTSR = 0x00050047
[50732.382732] --Host Channel Interrupt--, Channel 5
[50732.382742]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[50732.382748]   hcchar[5] = 0x015c9810, chan->ep_type=3
[50732.382754]   GRXSTSR = 0x00000840
[50732.390730] --Host Channel Interrupt--, Channel 0
[50732.390737]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[50732.390742]   hcchar[0] = 0x015c9810, chan->ep_type=3
[50732.390748]   GRXSTSR = 0x00070047
[50732.398732] --Host Channel Interrupt--, Channel 1
[50732.398744]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[50732.398753]   hcchar[1] = 0x015c9810, chan->ep_type=3
[50732.398761]   GRXSTSR = 0x000F0007

> 4. Andrea wrote: "we are atrss testing via ping flooding
> through an LTE modem attached". This about USB2.0 ping flood in USB 
> trace or just "ping" user command?
>

I'm not sure to understand what you mean by 'USB2.0 ping flood in USB 
trace' but we are using ping command from shell via ssh, as the following:
ping -f -I wwan0 8.8.8.8
Please note that the ping flooding is not needed to reproduce the issue, just 
a one-per-second ping is sufficient. The flooding version makes the issue
happening more frequently so it's faster to gather the relevant logs, so the
higly sustained USB traffic is not necessarily the culprit.
Also note that this is not related to the specific modem driver iwe're using
since the same issue has been noted with other peripherals.
 
> Thanks,
> Minas

Many thanks,
Andrea
Minas Harutyunyan Dec. 14, 2023, 12:23 p.m. UTC | #9
Hi Andrea,

On 12/3/23 10:52, Andrea della Porta wrote:
> Hi Minas,
> 
> On 10:26 Fri 01 Dec     , Minas Harutyunyan wrote:
>> Hi Ivan,
>>
>> On 11/28/23 18:43, Ivan Ivanov wrote:
>>>
>>> Hi Minas,
>>>
>>> On Tue, 2023-11-28 at 11:48 +0000, Minas Harutyunyan wrote:
>>>>
>>>> Does this "spurious" interrupt broke your tests?
>>>
>>> It is not just some kind of synthetic test case that was broken.
>>> but real world usage. You can find complains about this error on
>>> various internet forums, just search for dwc2_hc_chhltd_intr_dma
>>> and it is not so difficult to reproduce.
>>>
>>> Without databook I am not sure we can create better fix, but if
>>> you develop different solution I will gladly tested it.
>>>
>>> Regards,
>>> Ivan
>> 1. In addition to HCCHARx and ep_type printing please add printing of
>> GRXFSTSR if EP is IN or GNPTXSTS if EP is OUT, and provide dmesg with
>> error case.
> 
> Here's the log, before comenting the 'goto' out:
> 
> 
> [684829.206854] --Host Channel Interrupt--, Channel 2
> [684829.206866]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [684829.206875]   hcchar[2] = 0x015c9810, chan->ep_type=3
> [684829.206883]   GRXSTSR = 0x000E0002
> [684829.214851] --Host Channel Interrupt--, Channel 6
> [684829.214864]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [684829.214876]   hcchar[6] = 0x015c9810, chan->ep_type=3
> [684829.214886]   GRXSTSR = 0x000E0007
> [684829.217853] --Host Channel Interrupt--, Channel 5
> [684829.217869]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [684829.217881]   hcchar[5] = 0x009c8801, chan->ep_type=3
> [684829.217891]   GRXSTSR = 0x000E0005
> [684829.222647] --Host Channel Interrupt--, Channel 0
> [684829.222659]   hcint 0x00000021, hcintmsk 0x00000426, hcint&hcintmsk 0x00000020
> [684829.222671]   hcchar[0] = 0x01d8d200, chan->ep_type=2
> [684829.222681]   GRXSTSR = 0x00070044
> [684829.222696] --Host Channel Interrupt--, Channel 0
> [684829.222704]   hcint 0x00000002, hcintmsk 0x00000406, hcint&hcintmsk 0x00000002
> [684829.222714]   hcchar[0] = 0x01d8d200, chan->ep_type=2
> [684829.222724]   GRXSTSR = 0x00070044
> [684829.222740] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 0 - ChHltd set, but reason is unknown
> [684829.222758] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04000009


Sorry for delayed response.
I guess the cause of issue is because of channel halted interrupt late 
for about ~40-50us. In above log, Channel 0 twice assert interrupt: 
first for ACK (XferComplete masked) and second for Channel_Halted. These 
all interrupts related to same BULK IN transfer. Ideally these 3 source 
of interrupt (ACK, XferCompl and ChHalt) should be asserted together.
To check it lets do follow:
1. Do not allow unmask ACK interrupt in function 
dwc2_hc_enable_dma_ints(). Just comment "hcintmsk |= HCINTMSK_ACK;"
2. remove comment for "goto error"
3. remove printing GRXSTSR and GNPTXSTS
4. build in non verbose debug mode
It's just temporary solution to check ACK influence on the issue.

Thanks,
Minas
Andrea della Porta Dec. 18, 2023, 2:36 p.m. UTC | #10
Hi Minas,

On 12:23 Thu 14 Dec     , Minas Harutyunyan wrote:
> Hi Andrea,
> 
> On 12/3/23 10:52, Andrea della Porta wrote:
> > Hi Minas,
> > 
> > On 10:26 Fri 01 Dec     , Minas Harutyunyan wrote:
> >> Hi Ivan,
> >>
> >> On 11/28/23 18:43, Ivan Ivanov wrote:
> >>>
> >>> Hi Minas,
> >>>
> >>> On Tue, 2023-11-28 at 11:48 +0000, Minas Harutyunyan wrote:
> >>>>
> >>>> Does this "spurious" interrupt broke your tests?
> >>>
> >>> It is not just some kind of synthetic test case that was broken.
> >>> but real world usage. You can find complains about this error on
> >>> various internet forums, just search for dwc2_hc_chhltd_intr_dma
> >>> and it is not so difficult to reproduce.
> >>>
> >>> Without databook I am not sure we can create better fix, but if
> >>> you develop different solution I will gladly tested it.
> >>>
> >>> Regards,
> >>> Ivan
> >> 1. In addition to HCCHARx and ep_type printing please add printing of
> >> GRXFSTSR if EP is IN or GNPTXSTS if EP is OUT, and provide dmesg with
> >> error case.
> > 
> > Here's the log, before comenting the 'goto' out:
> > 
> > 
> > [684829.206854] --Host Channel Interrupt--, Channel 2
> > [684829.206866]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> > [684829.206875]   hcchar[2] = 0x015c9810, chan->ep_type=3
> > [684829.206883]   GRXSTSR = 0x000E0002
> > [684829.214851] --Host Channel Interrupt--, Channel 6
> > [684829.214864]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> > [684829.214876]   hcchar[6] = 0x015c9810, chan->ep_type=3
> > [684829.214886]   GRXSTSR = 0x000E0007
> > [684829.217853] --Host Channel Interrupt--, Channel 5
> > [684829.217869]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> > [684829.217881]   hcchar[5] = 0x009c8801, chan->ep_type=3
> > [684829.217891]   GRXSTSR = 0x000E0005
> > [684829.222647] --Host Channel Interrupt--, Channel 0
> > [684829.222659]   hcint 0x00000021, hcintmsk 0x00000426, hcint&hcintmsk 0x00000020
> > [684829.222671]   hcchar[0] = 0x01d8d200, chan->ep_type=2
> > [684829.222681]   GRXSTSR = 0x00070044
> > [684829.222696] --Host Channel Interrupt--, Channel 0
> > [684829.222704]   hcint 0x00000002, hcintmsk 0x00000406, hcint&hcintmsk 0x00000002
> > [684829.222714]   hcchar[0] = 0x01d8d200, chan->ep_type=2
> > [684829.222724]   GRXSTSR = 0x00070044
> > [684829.222740] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 0 - ChHltd set, but reason is unknown
> > [684829.222758] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04000009
> 
> 
> Sorry for delayed response.
> I guess the cause of issue is because of channel halted interrupt late 
> for about ~40-50us. In above log, Channel 0 twice assert interrupt: 
> first for ACK (XferComplete masked) and second for Channel_Halted. These 
> all interrupts related to same BULK IN transfer. Ideally these 3 source 
> of interrupt (ACK, XferCompl and ChHalt) should be asserted together.
> To check it lets do follow:
> 1. Do not allow unmask ACK interrupt in function 
> dwc2_hc_enable_dma_ints(). Just comment "hcintmsk |= HCINTMSK_ACK;"
> 2. remove comment for "goto error"
> 3. remove printing GRXSTSR and GNPTXSTS
> 4. build in non verbose debug mode
> It's just temporary solution to check ACK influence on the issue.
> 
> Thanks,
> Minas

Testing the changes you suggested revealed that "ChHltd set, but reason is unknown" error
is not showing up anymore, but we now have some "Transaction error" as shown in the
following log:


[13941.590252]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.590263]  hcchar[5] = 0x00dc8801, chan->ep_type=3
[13941.592240] --Host Channel Interrupt--, Channel 4
[13941.592249]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.592258]  hcchar[4] = 0x015c9810, chan->ep_type=3
[13941.600243] --Host Channel Interrupt--, Channel 2
[13941.600263]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.600273]  hcchar[2] = 0x015c9810, chan->ep_type=3
[13941.605521] --Host Channel Interrupt--, Channel 3
[13941.605539]   hcint 0x00000003, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.605549]  hcchar[3] = 0x01d83200, chan->ep_type=2
[13941.608242] --Host Channel Interrupt--, Channel 6
[13941.608256]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.608266]  hcchar[6] = 0x015c9810, chan->ep_type=3
[13941.609685] --Host Channel Interrupt--, Channel 1
[13941.609696]   hcint 0x00000023, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.609706]  hcchar[1] = 0x01d8d200, chan->ep_type=2
[13941.616243] --Host Channel Interrupt--, Channel 5
[13941.616262]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.616272]  hcchar[5] = 0x015c9810, chan->ep_type=3
[13941.619514] --Host Channel Interrupt--, Channel 0
[13941.619527]   hcint 0x00000023, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.619538]  hcchar[0] = 0x01d8d200, chan->ep_type=2
[13941.620015] --Host Channel Interrupt--, Channel 2
[13941.620027]   hcint 0x00000003, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.620037]  hcchar[2] = 0x01d83200, chan->ep_type=2
[13941.624240] --Host Channel Interrupt--, Channel 3
[13941.624249]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.624259]  hcchar[3] = 0x015c9810, chan->ep_type=3
[13941.627347] --Host Channel Interrupt--, Channel 4
[13941.627363]   hcint 0x00000092, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.627373]  hcchar[4] = 0x01d8d200, chan->ep_type=2
[13941.627391] dwc2 3f980000.usb: --Host Channel 4 Interrupt: Transaction Error--
[13941.627432] --Host Channel Interrupt--, Channel 6
[13941.627440]   hcint 0x00000010, hcintmsk 0x00000416, hcint&hcintmsk 0x00000010
[13941.627450]  hcchar[6] = 0x81d8d200, chan->ep_type=2
[13941.632240] --Host Channel Interrupt--, Channel 1
[13941.632248]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.632257]  hcchar[1] = 0x015c9810, chan->ep_type=3
[13941.635528] --Host Channel Interrupt--, Channel 5
[13941.635543]   hcint 0x00000003, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.635553]  hcchar[5] = 0x01d83200, chan->ep_type=2
[13941.640244] --Host Channel Interrupt--, Channel 0
[13941.640263]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.640272]  hcchar[0] = 0x015c9810, chan->ep_type=3
[13941.642078] --Host Channel Interrupt--, Channel 6
[13941.642086]   hcint 0x00000023, hcintmsk 0x00000406, hcint&hcintmsk 0x00000002
[13941.642095]  hcchar[6] = 0x01d8d200, chan->ep_type=2
[13941.648243] --Host Channel Interrupt--, Channel 3
[13941.648262]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.648272]  hcchar[3] = 0x015c9810, chan->ep_type=3
[13941.652240] --Host Channel Interrupt--, Channel 4
[13941.652248]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.652257]  hcchar[4] = 0x009c8801, chan->ep_type=3
[13941.656241] --Host Channel Interrupt--, Channel 1
[13941.656256]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.656266]  hcchar[1] = 0x015c9810, chan->ep_type=3
[13941.664240] --Host Channel Interrupt--, Channel 5
[13941.664248]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
[13941.664257]  hcchar[5] = 0x015c9810, chan->ep_type=3


The ping flood is otherwise working, except for a minor percentage loss (~0.7%).
Many thanks,

Andrea
Minas Harutyunyan Dec. 19, 2023, 10:18 a.m. UTC | #11
On 11/15/23 18:45, Oliver Neukum wrote:
> dwc2_hc_n_intr() writes back INTMASK as read but evaluates it
> with intmask applied. In stress testing this causes spurious
> interrupts like this:
> 
> [Mon Aug 14 10:51:07 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 7 - ChHltd set, but reason is unknown
> [Mon Aug 14 10:51:07 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 0 - ChHltd set, but reason is unknown
> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 4 - ChHltd set, but reason is unknown
> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04600001
> [Mon Aug 14 10:51:08 2023] dwc2 3f980000.usb: dwc2_update_urb_state_abn(): trimming xfer length
> 
> Applying INTMASK prevents this. The issue exists in all versions of the
> driver.
> 
> Signed-off-by: Oliver Neukum <oneukum@suse.com>
> Tested-by: Ivan Ivanov <ivan.ivanov@suse.com>
> Tested-by: Andrea della Porta <andrea.porta@suse.com>

Acked-by: Minas Harutyunyan <hminas@synopsys.com>

> ---
>   drivers/usb/dwc2/hcd_intr.c | 15 +++++++--------
>   1 file changed, 7 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/usb/dwc2/hcd_intr.c b/drivers/usb/dwc2/hcd_intr.c
> index 0144ca8350c3..5c7538d498dd 100644
> --- a/drivers/usb/dwc2/hcd_intr.c
> +++ b/drivers/usb/dwc2/hcd_intr.c
> @@ -2015,15 +2015,17 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
>   {
>   	struct dwc2_qtd *qtd;
>   	struct dwc2_host_chan *chan;
> -	u32 hcint, hcintmsk;
> +	u32 hcint, hcintraw, hcintmsk;
>   
>   	chan = hsotg->hc_ptr_array[chnum];
>   
> -	hcint = dwc2_readl(hsotg, HCINT(chnum));
> +	hcintraw = dwc2_readl(hsotg, HCINT(chnum));
>   	hcintmsk = dwc2_readl(hsotg, HCINTMSK(chnum));
> +	hcint = hcintraw & hcintmsk;
> +	dwc2_writel(hsotg, hcint, HCINT(chnum));
> +
>   	if (!chan) {
>   		dev_err(hsotg->dev, "## hc_ptr_array for channel is NULL ##\n");
> -		dwc2_writel(hsotg, hcint, HCINT(chnum));
>   		return;
>   	}
>   
> @@ -2032,11 +2034,9 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
>   			 chnum);
>   		dev_vdbg(hsotg->dev,
>   			 "  hcint 0x%08x, hcintmsk 0x%08x, hcint&hcintmsk 0x%08x\n",
> -			 hcint, hcintmsk, hcint & hcintmsk);
> +			 hcintraw, hcintmsk, hcint);
>   	}
>   
> -	dwc2_writel(hsotg, hcint, HCINT(chnum));
> -
>   	/*
>   	 * If we got an interrupt after someone called
>   	 * dwc2_hcd_endpoint_disable() we don't want to crash below
> @@ -2046,8 +2046,7 @@ static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
>   		return;
>   	}
>   
> -	chan->hcint = hcint;
> -	hcint &= hcintmsk;
> +	chan->hcint = hcintraw;
>   
>   	/*
>   	 * If the channel was halted due to a dequeue, the qtd list might
Minas Harutyunyan Dec. 19, 2023, 10:19 a.m. UTC | #12
Hi Andrea,

On 12/18/23 18:36, Andrea della Porta wrote:
> Hi Minas,
> 
> On 12:23 Thu 14 Dec     , Minas Harutyunyan wrote:
>> Hi Andrea,
>>
>> On 12/3/23 10:52, Andrea della Porta wrote:
>>> Hi Minas,
>>>
>>> On 10:26 Fri 01 Dec     , Minas Harutyunyan wrote:
>>>> Hi Ivan,
>>>>
>>>> On 11/28/23 18:43, Ivan Ivanov wrote:
>>>>>
>>>>> Hi Minas,
>>>>>
>>>>> On Tue, 2023-11-28 at 11:48 +0000, Minas Harutyunyan wrote:
>>>>>>
>>>>>> Does this "spurious" interrupt broke your tests?
>>>>>
>>>>> It is not just some kind of synthetic test case that was broken.
>>>>> but real world usage. You can find complains about this error on
>>>>> various internet forums, just search for dwc2_hc_chhltd_intr_dma
>>>>> and it is not so difficult to reproduce.
>>>>>
>>>>> Without databook I am not sure we can create better fix, but if
>>>>> you develop different solution I will gladly tested it.
>>>>>
>>>>> Regards,
>>>>> Ivan
>>>> 1. In addition to HCCHARx and ep_type printing please add printing of
>>>> GRXFSTSR if EP is IN or GNPTXSTS if EP is OUT, and provide dmesg with
>>>> error case.
>>>
>>> Here's the log, before comenting the 'goto' out:
>>>
>>>
>>> [684829.206854] --Host Channel Interrupt--, Channel 2
>>> [684829.206866]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
>>> [684829.206875]   hcchar[2] = 0x015c9810, chan->ep_type=3
>>> [684829.206883]   GRXSTSR = 0x000E0002
>>> [684829.214851] --Host Channel Interrupt--, Channel 6
>>> [684829.214864]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
>>> [684829.214876]   hcchar[6] = 0x015c9810, chan->ep_type=3
>>> [684829.214886]   GRXSTSR = 0x000E0007
>>> [684829.217853] --Host Channel Interrupt--, Channel 5
>>> [684829.217869]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
>>> [684829.217881]   hcchar[5] = 0x009c8801, chan->ep_type=3
>>> [684829.217891]   GRXSTSR = 0x000E0005
>>> [684829.222647] --Host Channel Interrupt--, Channel 0
>>> [684829.222659]   hcint 0x00000021, hcintmsk 0x00000426, hcint&hcintmsk 0x00000020
>>> [684829.222671]   hcchar[0] = 0x01d8d200, chan->ep_type=2
>>> [684829.222681]   GRXSTSR = 0x00070044
>>> [684829.222696] --Host Channel Interrupt--, Channel 0
>>> [684829.222704]   hcint 0x00000002, hcintmsk 0x00000406, hcint&hcintmsk 0x00000002
>>> [684829.222714]   hcchar[0] = 0x01d8d200, chan->ep_type=2
>>> [684829.222724]   GRXSTSR = 0x00070044
>>> [684829.222740] dwc2 3f980000.usb: dwc2_hc_chhltd_intr_dma: Channel 0 - ChHltd set, but reason is unknown
>>> [684829.222758] dwc2 3f980000.usb: hcint 0x00000002, intsts 0x04000009
>>
>>
>> Sorry for delayed response.
>> I guess the cause of issue is because of channel halted interrupt late
>> for about ~40-50us. In above log, Channel 0 twice assert interrupt:
>> first for ACK (XferComplete masked) and second for Channel_Halted. These
>> all interrupts related to same BULK IN transfer. Ideally these 3 source
>> of interrupt (ACK, XferCompl and ChHalt) should be asserted together.
>> To check it lets do follow:
>> 1. Do not allow unmask ACK interrupt in function
>> dwc2_hc_enable_dma_ints(). Just comment "hcintmsk |= HCINTMSK_ACK;"
>> 2. remove comment for "goto error"
>> 3. remove printing GRXSTSR and GNPTXSTS
>> 4. build in non verbose debug mode
>> It's just temporary solution to check ACK influence on the issue.
>>
>> Thanks,
>> Minas
> 
> Testing the changes you suggested revealed that "ChHltd set, but reason is unknown" error
> is not showing up anymore, but we now have some "Transaction error" as shown in the
> following log:
> 
> 
> [13941.590252]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.590263]  hcchar[5] = 0x00dc8801, chan->ep_type=3
> [13941.592240] --Host Channel Interrupt--, Channel 4
> [13941.592249]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.592258]  hcchar[4] = 0x015c9810, chan->ep_type=3
> [13941.600243] --Host Channel Interrupt--, Channel 2
> [13941.600263]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.600273]  hcchar[2] = 0x015c9810, chan->ep_type=3
> [13941.605521] --Host Channel Interrupt--, Channel 3
> [13941.605539]   hcint 0x00000003, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.605549]  hcchar[3] = 0x01d83200, chan->ep_type=2
> [13941.608242] --Host Channel Interrupt--, Channel 6
> [13941.608256]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.608266]  hcchar[6] = 0x015c9810, chan->ep_type=3
> [13941.609685] --Host Channel Interrupt--, Channel 1
> [13941.609696]   hcint 0x00000023, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.609706]  hcchar[1] = 0x01d8d200, chan->ep_type=2
> [13941.616243] --Host Channel Interrupt--, Channel 5
> [13941.616262]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.616272]  hcchar[5] = 0x015c9810, chan->ep_type=3
> [13941.619514] --Host Channel Interrupt--, Channel 0
> [13941.619527]   hcint 0x00000023, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.619538]  hcchar[0] = 0x01d8d200, chan->ep_type=2
> [13941.620015] --Host Channel Interrupt--, Channel 2
> [13941.620027]   hcint 0x00000003, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.620037]  hcchar[2] = 0x01d83200, chan->ep_type=2
> [13941.624240] --Host Channel Interrupt--, Channel 3
> [13941.624249]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.624259]  hcchar[3] = 0x015c9810, chan->ep_type=3
> [13941.627347] --Host Channel Interrupt--, Channel 4
> [13941.627363]   hcint 0x00000092, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.627373]  hcchar[4] = 0x01d8d200, chan->ep_type=2
> [13941.627391] dwc2 3f980000.usb: --Host Channel 4 Interrupt: Transaction Error--
> [13941.627432] --Host Channel Interrupt--, Channel 6
> [13941.627440]   hcint 0x00000010, hcintmsk 0x00000416, hcint&hcintmsk 0x00000010
> [13941.627450]  hcchar[6] = 0x81d8d200, chan->ep_type=2
> [13941.632240] --Host Channel Interrupt--, Channel 1
> [13941.632248]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.632257]  hcchar[1] = 0x015c9810, chan->ep_type=3
> [13941.635528] --Host Channel Interrupt--, Channel 5
> [13941.635543]   hcint 0x00000003, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.635553]  hcchar[5] = 0x01d83200, chan->ep_type=2
> [13941.640244] --Host Channel Interrupt--, Channel 0
> [13941.640263]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.640272]  hcchar[0] = 0x015c9810, chan->ep_type=3
> [13941.642078] --Host Channel Interrupt--, Channel 6
> [13941.642086]   hcint 0x00000023, hcintmsk 0x00000406, hcint&hcintmsk 0x00000002
> [13941.642095]  hcchar[6] = 0x01d8d200, chan->ep_type=2
> [13941.648243] --Host Channel Interrupt--, Channel 3
> [13941.648262]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.648272]  hcchar[3] = 0x015c9810, chan->ep_type=3
> [13941.652240] --Host Channel Interrupt--, Channel 4
> [13941.652248]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.652257]  hcchar[4] = 0x009c8801, chan->ep_type=3
> [13941.656241] --Host Channel Interrupt--, Channel 1
> [13941.656256]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.656266]  hcchar[1] = 0x015c9810, chan->ep_type=3
> [13941.664240] --Host Channel Interrupt--, Channel 5
> [13941.664248]   hcint 0x00000012, hcintmsk 0x00000006, hcint&hcintmsk 0x00000002
> [13941.664257]  hcchar[5] = 0x015c9810, chan->ep_type=3
> 
> 
> The ping flood is otherwise working, except for a minor percentage loss (~0.7%).
> Many thanks,
> 
> Andrea

Thanks for testing.

Minas
diff mbox series

Patch

diff --git a/drivers/usb/dwc2/hcd_intr.c b/drivers/usb/dwc2/hcd_intr.c
index 0144ca8350c3..5c7538d498dd 100644
--- a/drivers/usb/dwc2/hcd_intr.c
+++ b/drivers/usb/dwc2/hcd_intr.c
@@ -2015,15 +2015,17 @@  static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
 {
 	struct dwc2_qtd *qtd;
 	struct dwc2_host_chan *chan;
-	u32 hcint, hcintmsk;
+	u32 hcint, hcintraw, hcintmsk;
 
 	chan = hsotg->hc_ptr_array[chnum];
 
-	hcint = dwc2_readl(hsotg, HCINT(chnum));
+	hcintraw = dwc2_readl(hsotg, HCINT(chnum));
 	hcintmsk = dwc2_readl(hsotg, HCINTMSK(chnum));
+	hcint = hcintraw & hcintmsk;
+	dwc2_writel(hsotg, hcint, HCINT(chnum));
+
 	if (!chan) {
 		dev_err(hsotg->dev, "## hc_ptr_array for channel is NULL ##\n");
-		dwc2_writel(hsotg, hcint, HCINT(chnum));
 		return;
 	}
 
@@ -2032,11 +2034,9 @@  static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
 			 chnum);
 		dev_vdbg(hsotg->dev,
 			 "  hcint 0x%08x, hcintmsk 0x%08x, hcint&hcintmsk 0x%08x\n",
-			 hcint, hcintmsk, hcint & hcintmsk);
+			 hcintraw, hcintmsk, hcint);
 	}
 
-	dwc2_writel(hsotg, hcint, HCINT(chnum));
-
 	/*
 	 * If we got an interrupt after someone called
 	 * dwc2_hcd_endpoint_disable() we don't want to crash below
@@ -2046,8 +2046,7 @@  static void dwc2_hc_n_intr(struct dwc2_hsotg *hsotg, int chnum)
 		return;
 	}
 
-	chan->hcint = hcint;
-	hcint &= hcintmsk;
+	chan->hcint = hcintraw;
 
 	/*
 	 * If the channel was halted due to a dequeue, the qtd list might