diff mbox series

[v2,2/2] can: m_can: fix missed interrupts with m_can_pci

Message ID 861164dfe6d95fd69ab2f82528306db6be94351a.1726745009.git.matthias.schiffer@ew.tq-group.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [v2,1/2] can: m_can: set init flag earlier in probe | expand

Checks

Context Check Description
netdev/tree_selection success Series ignored based on subject

Commit Message

Matthias Schiffer Sept. 19, 2024, 11:27 a.m. UTC
The interrupt line of PCI devices is interpreted as edge-triggered,
however the interrupt signal of the m_can controller integrated in Intel
Elkhart Lake CPUs appears to be generated level-triggered.

Consider the following sequence of events:

- IR register is read, interrupt X is set
- A new interrupt Y is triggered in the m_can controller
- IR register is written to acknowledge interrupt X. Y remains set in IR

As at no point in this sequence no interrupt flag is set in IR, the
m_can interrupt line will never become deasserted, and no edge will ever
be observed to trigger another run of the ISR. This was observed to
result in the TX queue of the EHL m_can to get stuck under high load,
because frames were queued to the hardware in m_can_start_xmit(), but
m_can_finish_tx() was never run to account for their successful
transmission.

To fix the issue, repeatedly read and acknowledge interrupts at the
start of the ISR until no interrupt flags are set, so the next incoming
interrupt will also result in an edge on the interrupt line.

Fixes: cab7ffc0324f ("can: m_can: add PCI glue driver for Intel Elkhart Lake")
Signed-off-by: Matthias Schiffer <matthias.schiffer@ew.tq-group.com>
---

v2: introduce flag is_edge_triggered, so we can avoid the loop on !m_can_pci

 drivers/net/can/m_can/m_can.c     | 21 ++++++++++++++++-----
 drivers/net/can/m_can/m_can.h     |  1 +
 drivers/net/can/m_can/m_can_pci.c |  1 +
 3 files changed, 18 insertions(+), 5 deletions(-)

Comments

Markus Schneider-Pargmann Sept. 23, 2024, 8:03 a.m. UTC | #1
Hi Matthias,

On Thu, Sep 19, 2024 at 01:27:28PM GMT, Matthias Schiffer wrote:
> The interrupt line of PCI devices is interpreted as edge-triggered,
> however the interrupt signal of the m_can controller integrated in Intel

I have a similar patch though for a different setup (I didn't send it
yet). I have a tcan chip wired to a pin that is only capable of edge
interrupts.

> Elkhart Lake CPUs appears to be generated level-triggered.
> 
> Consider the following sequence of events:
> 
> - IR register is read, interrupt X is set
> - A new interrupt Y is triggered in the m_can controller
> - IR register is written to acknowledge interrupt X. Y remains set in IR
> 
> As at no point in this sequence no interrupt flag is set in IR, the
> m_can interrupt line will never become deasserted, and no edge will ever
> be observed to trigger another run of the ISR. This was observed to
> result in the TX queue of the EHL m_can to get stuck under high load,
> because frames were queued to the hardware in m_can_start_xmit(), but
> m_can_finish_tx() was never run to account for their successful
> transmission.
> 
> To fix the issue, repeatedly read and acknowledge interrupts at the
> start of the ISR until no interrupt flags are set, so the next incoming
> interrupt will also result in an edge on the interrupt line.
> 
> Fixes: cab7ffc0324f ("can: m_can: add PCI glue driver for Intel Elkhart Lake")
> Signed-off-by: Matthias Schiffer <matthias.schiffer@ew.tq-group.com>
> ---
> 
> v2: introduce flag is_edge_triggered, so we can avoid the loop on !m_can_pci
> 
>  drivers/net/can/m_can/m_can.c     | 21 ++++++++++++++++-----
>  drivers/net/can/m_can/m_can.h     |  1 +
>  drivers/net/can/m_can/m_can_pci.c |  1 +
>  3 files changed, 18 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
> index 47481afb9add3..2e182c3c98fed 100644
> --- a/drivers/net/can/m_can/m_can.c
> +++ b/drivers/net/can/m_can/m_can.c
> @@ -1207,20 +1207,31 @@ static void m_can_coalescing_update(struct m_can_classdev *cdev, u32 ir)
>  static int m_can_interrupt_handler(struct m_can_classdev *cdev)
>  {
>  	struct net_device *dev = cdev->net;
> -	u32 ir;
> +	u32 ir = 0, ir_read;
>  	int ret;
>  
>  	if (pm_runtime_suspended(cdev->dev))
>  		return IRQ_NONE;
>  
> -	ir = m_can_read(cdev, M_CAN_IR);
> +	/* For m_can_pci, the interrupt line is interpreted as edge-triggered,
> +	 * but the m_can controller generates them as level-triggered. We must
> +	 * observe that IR is 0 at least once to be sure that the next
> +	 * interrupt will generate an edge.
> +	 */

Could you please remove this hardware specific comment? As mentioned
above this will be independent of any specific hardware.

> +	while ((ir_read = m_can_read(cdev, M_CAN_IR)) != 0) {
> +		ir |= ir_read;
> +
> +		/* ACK all irqs */
> +		m_can_write(cdev, M_CAN_IR, ir);
> +
> +		if (!cdev->is_edge_triggered)
> +			break;
> +	}
> +
>  	m_can_coalescing_update(cdev, ir);
>  	if (!ir)
>  		return IRQ_NONE;
>  
> -	/* ACK all irqs */
> -	m_can_write(cdev, M_CAN_IR, ir);
> -
>  	if (cdev->ops->clear_interrupts)
>  		cdev->ops->clear_interrupts(cdev);
>  
> diff --git a/drivers/net/can/m_can/m_can.h b/drivers/net/can/m_can/m_can.h
> index 92b2bd8628e6b..8c17eb94d2f98 100644
> --- a/drivers/net/can/m_can/m_can.h
> +++ b/drivers/net/can/m_can/m_can.h
> @@ -99,6 +99,7 @@ struct m_can_classdev {
>  	int pm_clock_support;
>  	int pm_wake_source;
>  	int is_peripheral;
> +	bool is_edge_triggered;

To avoid confusion could you rename it to irq_edge_triggered or
something similar, to make clear that it is not about the chip but the
way the interrupt line is connected?

Also I am not sure it is possible, but could you use
irq_get_trigger_type() to see if it is a level or edge based interrupt?
Then we wouldn't need this additional parameter at all and could just
detect it in m_can.c.

Best
Markus

>  
>  	// Cached M_CAN_IE register content
>  	u32 active_interrupts;
> diff --git a/drivers/net/can/m_can/m_can_pci.c b/drivers/net/can/m_can/m_can_pci.c
> index d72fe771dfc7a..f98527981402a 100644
> --- a/drivers/net/can/m_can/m_can_pci.c
> +++ b/drivers/net/can/m_can/m_can_pci.c
> @@ -127,6 +127,7 @@ static int m_can_pci_probe(struct pci_dev *pci, const struct pci_device_id *id)
>  	mcan_class->pm_clock_support = 1;
>  	mcan_class->pm_wake_source = 0;
>  	mcan_class->can.clock.freq = id->driver_data;
> +	mcan_class->is_edge_triggered = true;
>  	mcan_class->ops = &m_can_pci_ops;
>  
>  	pci_set_drvdata(pci, mcan_class);
> -- 
> TQ-Systems GmbH | Mühlstraße 2, Gut Delling | 82229 Seefeld, Germany
> Amtsgericht München, HRB 105018
> Geschäftsführer: Detlef Schneider, Rüdiger Stahl, Stefan Schneider
> https://www.tq-group.com/
>
Matthias Schiffer Sept. 23, 2024, 9:32 a.m. UTC | #2
On Mon, 2024-09-23 at 10:03 +0200, Markus Schneider-Pargmann wrote:
> Hi Matthias,
> 
> On Thu, Sep 19, 2024 at 01:27:28PM GMT, Matthias Schiffer wrote:
> > The interrupt line of PCI devices is interpreted as edge-triggered,
> > however the interrupt signal of the m_can controller integrated in Intel
> 
> I have a similar patch though for a different setup (I didn't send it
> yet). I have a tcan chip wired to a pin that is only capable of edge
> interrupts.

Should I also change the Fixes tag to something else then?

> 
> > Elkhart Lake CPUs appears to be generated level-triggered.
> > 
> > Consider the following sequence of events:
> > 
> > - IR register is read, interrupt X is set
> > - A new interrupt Y is triggered in the m_can controller
> > - IR register is written to acknowledge interrupt X. Y remains set in IR
> > 
> > As at no point in this sequence no interrupt flag is set in IR, the
> > m_can interrupt line will never become deasserted, and no edge will ever
> > be observed to trigger another run of the ISR. This was observed to
> > result in the TX queue of the EHL m_can to get stuck under high load,
> > because frames were queued to the hardware in m_can_start_xmit(), but
> > m_can_finish_tx() was never run to account for their successful
> > transmission.
> > 
> > To fix the issue, repeatedly read and acknowledge interrupts at the
> > start of the ISR until no interrupt flags are set, so the next incoming
> > interrupt will also result in an edge on the interrupt line.
> > 
> > Fixes: cab7ffc0324f ("can: m_can: add PCI glue driver for Intel Elkhart Lake")
> > Signed-off-by: Matthias Schiffer <matthias.schiffer@ew.tq-group.com>
> > ---
> > 
> > v2: introduce flag is_edge_triggered, so we can avoid the loop on !m_can_pci
> > 
> >  drivers/net/can/m_can/m_can.c     | 21 ++++++++++++++++-----
> >  drivers/net/can/m_can/m_can.h     |  1 +
> >  drivers/net/can/m_can/m_can_pci.c |  1 +
> >  3 files changed, 18 insertions(+), 5 deletions(-)
> > 
> > diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
> > index 47481afb9add3..2e182c3c98fed 100644
> > --- a/drivers/net/can/m_can/m_can.c
> > +++ b/drivers/net/can/m_can/m_can.c
> > @@ -1207,20 +1207,31 @@ static void m_can_coalescing_update(struct m_can_classdev *cdev, u32 ir)
> >  static int m_can_interrupt_handler(struct m_can_classdev *cdev)
> >  {
> >  	struct net_device *dev = cdev->net;
> > -	u32 ir;
> > +	u32 ir = 0, ir_read;
> >  	int ret;
> >  
> >  	if (pm_runtime_suspended(cdev->dev))
> >  		return IRQ_NONE;
> >  
> > -	ir = m_can_read(cdev, M_CAN_IR);
> > +	/* For m_can_pci, the interrupt line is interpreted as edge-triggered,
> > +	 * but the m_can controller generates them as level-triggered. We must
> > +	 * observe that IR is 0 at least once to be sure that the next
> > +	 * interrupt will generate an edge.
> > +	 */
> 
> Could you please remove this hardware specific comment? As mentioned
> above this will be independent of any specific hardware.

Ok.


> 
> > +	while ((ir_read = m_can_read(cdev, M_CAN_IR)) != 0) {
> > +		ir |= ir_read;
> > +
> > +		/* ACK all irqs */
> > +		m_can_write(cdev, M_CAN_IR, ir);
> > +
> > +		if (!cdev->is_edge_triggered)
> > +			break;
> > +	}
> > +
> >  	m_can_coalescing_update(cdev, ir);
> >  	if (!ir)
> >  		return IRQ_NONE;
> >  
> > -	/* ACK all irqs */
> > -	m_can_write(cdev, M_CAN_IR, ir);
> > -
> >  	if (cdev->ops->clear_interrupts)
> >  		cdev->ops->clear_interrupts(cdev);
> >  
> > diff --git a/drivers/net/can/m_can/m_can.h b/drivers/net/can/m_can/m_can.h
> > index 92b2bd8628e6b..8c17eb94d2f98 100644
> > --- a/drivers/net/can/m_can/m_can.h
> > +++ b/drivers/net/can/m_can/m_can.h
> > @@ -99,6 +99,7 @@ struct m_can_classdev {
> >  	int pm_clock_support;
> >  	int pm_wake_source;
> >  	int is_peripheral;
> > +	bool is_edge_triggered;
> 
> To avoid confusion could you rename it to irq_edge_triggered or
> something similar, to make clear that it is not about the chip but the
> way the interrupt line is connected?

Will do.

> 
> Also I am not sure it is possible, but could you use
> irq_get_trigger_type() to see if it is a level or edge based interrupt?
> Then we wouldn't need this additional parameter at all and could just
> detect it in m_can.c.

Unfortunately that doesn't seem to work. irq_get_trigger_type() only returns a meaningful value
after the IRQ has been requested. I thought about requesting the IRQ with IRQF_NO_AUTOEN and then
filling in the irq_edge_triggered field before enabling the IRQ, but IRQF_NO_AUTOEN is incompatible
with IRQF_SHARED.

Of course there are ways around this - checking irq_get_trigger_type() from the ISR itself, or
adding more locking, but neither seems quite worthwhile to me. Would you agree with this?

Maybe there is some other way to find out the trigger type that would be set when the IRQ is
requested? I don't know what that would be however - so I'd just keep setting the flag statically
for m_can_pci and leave a dynamic solution for future improvement.

Matthias



> 
> Best
> Markus
> 
> >  
> >  	// Cached M_CAN_IE register content
> >  	u32 active_interrupts;
> > diff --git a/drivers/net/can/m_can/m_can_pci.c b/drivers/net/can/m_can/m_can_pci.c
> > index d72fe771dfc7a..f98527981402a 100644
> > --- a/drivers/net/can/m_can/m_can_pci.c
> > +++ b/drivers/net/can/m_can/m_can_pci.c
> > @@ -127,6 +127,7 @@ static int m_can_pci_probe(struct pci_dev *pci, const struct pci_device_id *id)
> >  	mcan_class->pm_clock_support = 1;
> >  	mcan_class->pm_wake_source = 0;
> >  	mcan_class->can.clock.freq = id->driver_data;
> > +	mcan_class->is_edge_triggered = true;
> >  	mcan_class->ops = &m_can_pci_ops;
> >  
> >  	pci_set_drvdata(pci, mcan_class);
> > -- 
> > TQ-Systems GmbH | Mühlstraße 2, Gut Delling | 82229 Seefeld, Germany
> > Amtsgericht München, HRB 105018
> > Geschäftsführer: Detlef Schneider, Rüdiger Stahl, Stefan Schneider
> > https://www.tq-group.com/
> >
Markus Schneider-Pargmann Sept. 23, 2024, 10:07 a.m. UTC | #3
On Mon, Sep 23, 2024 at 11:32:49AM GMT, Matthias Schiffer wrote:
> On Mon, 2024-09-23 at 10:03 +0200, Markus Schneider-Pargmann wrote:
> > Hi Matthias,
> > 
> > On Thu, Sep 19, 2024 at 01:27:28PM GMT, Matthias Schiffer wrote:
> > > The interrupt line of PCI devices is interpreted as edge-triggered,
> > > however the interrupt signal of the m_can controller integrated in Intel
> > 
> > I have a similar patch though for a different setup (I didn't send it
> > yet). I have a tcan chip wired to a pin that is only capable of edge
> > interrupts.
> 
> Should I also change the Fixes tag to something else then?

No, my use-case wasn't explicitly supported before, so I don't consider
it to be a fix but a new feature supporting edge triggered tcan.

> 
> > 
> > > Elkhart Lake CPUs appears to be generated level-triggered.
> > > 
> > > Consider the following sequence of events:
> > > 
> > > - IR register is read, interrupt X is set
> > > - A new interrupt Y is triggered in the m_can controller
> > > - IR register is written to acknowledge interrupt X. Y remains set in IR
> > > 
> > > As at no point in this sequence no interrupt flag is set in IR, the
> > > m_can interrupt line will never become deasserted, and no edge will ever
> > > be observed to trigger another run of the ISR. This was observed to
> > > result in the TX queue of the EHL m_can to get stuck under high load,
> > > because frames were queued to the hardware in m_can_start_xmit(), but
> > > m_can_finish_tx() was never run to account for their successful
> > > transmission.
> > > 
> > > To fix the issue, repeatedly read and acknowledge interrupts at the
> > > start of the ISR until no interrupt flags are set, so the next incoming
> > > interrupt will also result in an edge on the interrupt line.
> > > 
> > > Fixes: cab7ffc0324f ("can: m_can: add PCI glue driver for Intel Elkhart Lake")
> > > Signed-off-by: Matthias Schiffer <matthias.schiffer@ew.tq-group.com>
> > > ---
> > > 
> > > v2: introduce flag is_edge_triggered, so we can avoid the loop on !m_can_pci
> > > 
> > >  drivers/net/can/m_can/m_can.c     | 21 ++++++++++++++++-----
> > >  drivers/net/can/m_can/m_can.h     |  1 +
> > >  drivers/net/can/m_can/m_can_pci.c |  1 +
> > >  3 files changed, 18 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
> > > index 47481afb9add3..2e182c3c98fed 100644
> > > --- a/drivers/net/can/m_can/m_can.c
> > > +++ b/drivers/net/can/m_can/m_can.c
> > > @@ -1207,20 +1207,31 @@ static void m_can_coalescing_update(struct m_can_classdev *cdev, u32 ir)
> > >  static int m_can_interrupt_handler(struct m_can_classdev *cdev)
> > >  {
> > >  	struct net_device *dev = cdev->net;
> > > -	u32 ir;
> > > +	u32 ir = 0, ir_read;
> > >  	int ret;
> > >  
> > >  	if (pm_runtime_suspended(cdev->dev))
> > >  		return IRQ_NONE;
> > >  
> > > -	ir = m_can_read(cdev, M_CAN_IR);
> > > +	/* For m_can_pci, the interrupt line is interpreted as edge-triggered,
> > > +	 * but the m_can controller generates them as level-triggered. We must
> > > +	 * observe that IR is 0 at least once to be sure that the next
> > > +	 * interrupt will generate an edge.
> > > +	 */
> > 
> > Could you please remove this hardware specific comment? As mentioned
> > above this will be independent of any specific hardware.
> 
> Ok.
> 
> 
> > 
> > > +	while ((ir_read = m_can_read(cdev, M_CAN_IR)) != 0) {
> > > +		ir |= ir_read;
> > > +
> > > +		/* ACK all irqs */
> > > +		m_can_write(cdev, M_CAN_IR, ir);
> > > +
> > > +		if (!cdev->is_edge_triggered)
> > > +			break;
> > > +	}
> > > +
> > >  	m_can_coalescing_update(cdev, ir);
> > >  	if (!ir)
> > >  		return IRQ_NONE;
> > >  
> > > -	/* ACK all irqs */
> > > -	m_can_write(cdev, M_CAN_IR, ir);
> > > -
> > >  	if (cdev->ops->clear_interrupts)
> > >  		cdev->ops->clear_interrupts(cdev);
> > >  
> > > diff --git a/drivers/net/can/m_can/m_can.h b/drivers/net/can/m_can/m_can.h
> > > index 92b2bd8628e6b..8c17eb94d2f98 100644
> > > --- a/drivers/net/can/m_can/m_can.h
> > > +++ b/drivers/net/can/m_can/m_can.h
> > > @@ -99,6 +99,7 @@ struct m_can_classdev {
> > >  	int pm_clock_support;
> > >  	int pm_wake_source;
> > >  	int is_peripheral;
> > > +	bool is_edge_triggered;
> > 
> > To avoid confusion could you rename it to irq_edge_triggered or
> > something similar, to make clear that it is not about the chip but the
> > way the interrupt line is connected?
> 
> Will do.
> 
> > 
> > Also I am not sure it is possible, but could you use
> > irq_get_trigger_type() to see if it is a level or edge based interrupt?
> > Then we wouldn't need this additional parameter at all and could just
> > detect it in m_can.c.
> 
> Unfortunately that doesn't seem to work. irq_get_trigger_type() only returns a meaningful value
> after the IRQ has been requested. I thought about requesting the IRQ with IRQF_NO_AUTOEN and then
> filling in the irq_edge_triggered field before enabling the IRQ, but IRQF_NO_AUTOEN is incompatible
> with IRQF_SHARED.

The mentioned function works for me on ARM and DT even before
irq_request_threaded_irq() was called.

Also I am probably missing something here. Afer requesting the irq, the
interrupts are not enabled yet right? So can't you just request it and
check the triggertype immediately afterwards?

> 
> Of course there are ways around this - checking irq_get_trigger_type() from the ISR itself, or
> adding more locking, but neither seems quite worthwhile to me. Would you agree with this?
> 
> Maybe there is some other way to find out the trigger type that would be set when the IRQ is
> requested? I don't know what that would be however - so I'd just keep setting the flag statically
> for m_can_pci and leave a dynamic solution for future improvement.

No if it doesn't work easily the parameter is probably the best option.

Best
Markus

> 
> Matthias
> 
> 
> 
> > 
> > Best
> > Markus
> > 
> > >  
> > >  	// Cached M_CAN_IE register content
> > >  	u32 active_interrupts;
> > > diff --git a/drivers/net/can/m_can/m_can_pci.c b/drivers/net/can/m_can/m_can_pci.c
> > > index d72fe771dfc7a..f98527981402a 100644
> > > --- a/drivers/net/can/m_can/m_can_pci.c
> > > +++ b/drivers/net/can/m_can/m_can_pci.c
> > > @@ -127,6 +127,7 @@ static int m_can_pci_probe(struct pci_dev *pci, const struct pci_device_id *id)
> > >  	mcan_class->pm_clock_support = 1;
> > >  	mcan_class->pm_wake_source = 0;
> > >  	mcan_class->can.clock.freq = id->driver_data;
> > > +	mcan_class->is_edge_triggered = true;
> > >  	mcan_class->ops = &m_can_pci_ops;
> > >  
> > >  	pci_set_drvdata(pci, mcan_class);
> > > -- 
> > > TQ-Systems GmbH | Mühlstraße 2, Gut Delling | 82229 Seefeld, Germany
> > > Amtsgericht München, HRB 105018
> > > Geschäftsführer: Detlef Schneider, Rüdiger Stahl, Stefan Schneider
> > > https://www.tq-group.com/
> > > 
> 
> -- 
> TQ-Systems GmbH | Mühlstraße 2, Gut Delling | 82229 Seefeld, Germany
> Amtsgericht München, HRB 105018
> Geschäftsführer: Detlef Schneider, Rüdiger Stahl, Stefan Schneider
> https://www.tq-group.com/
Marc Kleine-Budde Sept. 23, 2024, 10:17 a.m. UTC | #4
On 19.09.2024 13:27:28, Matthias Schiffer wrote:
> The interrupt line of PCI devices is interpreted as edge-triggered,
> however the interrupt signal of the m_can controller integrated in Intel
> Elkhart Lake CPUs appears to be generated level-triggered.
> 
> Consider the following sequence of events:
> 
> - IR register is read, interrupt X is set
> - A new interrupt Y is triggered in the m_can controller
> - IR register is written to acknowledge interrupt X. Y remains set in IR
> 
> As at no point in this sequence no interrupt flag is set in IR, the
> m_can interrupt line will never become deasserted, and no edge will ever
> be observed to trigger another run of the ISR. This was observed to
> result in the TX queue of the EHL m_can to get stuck under high load,
> because frames were queued to the hardware in m_can_start_xmit(), but
> m_can_finish_tx() was never run to account for their successful
> transmission.
> 
> To fix the issue, repeatedly read and acknowledge interrupts at the
> start of the ISR until no interrupt flags are set, so the next incoming
> interrupt will also result in an edge on the interrupt line.
> 
> Fixes: cab7ffc0324f ("can: m_can: add PCI glue driver for Intel Elkhart Lake")
> Signed-off-by: Matthias Schiffer <matthias.schiffer@ew.tq-group.com>

My coworker Lucas pointed me to:

| https://wiki.linuxfoundation.org/networking/napi#non-level_sensitive_irqs

On the other hand, I would also like to convert the !peripteral part of
the driver to rx-offload. However, I am still looking for potential
customers for this task. I have talked to some TI and ST people at LPC,
maybe they are interested.

I think let's first fix edge sensitive IRQs, then rework the driver to
rx-offload.

regards,
Marc
Marc Kleine-Budde Sept. 23, 2024, 10:30 a.m. UTC | #5
On 23.09.2024 10:03:24, Markus Schneider-Pargmann wrote:
> Hi Matthias,
> 
> On Thu, Sep 19, 2024 at 01:27:28PM GMT, Matthias Schiffer wrote:
> > The interrupt line of PCI devices is interpreted as edge-triggered,
> > however the interrupt signal of the m_can controller integrated in Intel
> 
> I have a similar patch though for a different setup (I didn't send it
> yet). I have a tcan chip wired to a pin that is only capable of edge
> interrupts.

Can you post your patch for completeness?

regards,
Marc
Marc Kleine-Budde Sept. 23, 2024, 10:53 a.m. UTC | #6
On 23.09.2024 12:07:33, Markus Schneider-Pargmann wrote:
> > > > diff --git a/drivers/net/can/m_can/m_can.h b/drivers/net/can/m_can/m_can.h
> > > > index 92b2bd8628e6b..8c17eb94d2f98 100644
> > > > --- a/drivers/net/can/m_can/m_can.h
> > > > +++ b/drivers/net/can/m_can/m_can.h
> > > > @@ -99,6 +99,7 @@ struct m_can_classdev {
> > > >  	int pm_clock_support;
> > > >  	int pm_wake_source;
> > > >  	int is_peripheral;
> > > > +	bool is_edge_triggered;
> > > 
> > > To avoid confusion could you rename it to irq_edge_triggered or
> > > something similar, to make clear that it is not about the chip but the
> > > way the interrupt line is connected?
> > 
> > Will do.
> > 
> > > 
> > > Also I am not sure it is possible, but could you use
> > > irq_get_trigger_type() to see if it is a level or edge based interrupt?
> > > Then we wouldn't need this additional parameter at all and could just
> > > detect it in m_can.c.
> > 
> > Unfortunately that doesn't seem to work. irq_get_trigger_type() only returns a meaningful value
> > after the IRQ has been requested. I thought about requesting the IRQ with IRQF_NO_AUTOEN and then
> > filling in the irq_edge_triggered field before enabling the IRQ, but IRQF_NO_AUTOEN is incompatible
> > with IRQF_SHARED.
> 
> The mentioned function works for me on ARM and DT even before
> irq_request_threaded_irq() was called.
> 
> Also I am probably missing something here. Afer requesting the irq, the
> interrupts are not enabled yet right? So can't you just request it and
> check the triggertype immediately afterwards?

You have to distinguish between the IRQ controller and the IRQ source in
your device. You must be able to handle IRQs right after request_irq().
If you IP core is in reset or has all IRQs masked, this will probably
not happen, but with shared IRQ one of the other IRQ sources on the IRQ
can trigger and your IRQ handler will be called. See
CONFIG_DEBUG_SHIRQ_FIXME and CONFIG_DEBUG_SHIRQ_FIXME.

> > Of course there are ways around this - checking irq_get_trigger_type() from the ISR itself, or
> > adding more locking, but neither seems quite worthwhile to me. Would you agree with this?
> > 
> > Maybe there is some other way to find out the trigger type that would be set when the IRQ is
> > requested? I don't know what that would be however - so I'd just keep setting the flag statically
> > for m_can_pci and leave a dynamic solution for future improvement.
> 
> No if it doesn't work easily the parameter is probably the best option.

regards,
Marc
Matthias Schiffer Sept. 23, 2024, 11:06 a.m. UTC | #7
On Mon, 2024-09-23 at 12:17 +0200, Marc Kleine-Budde wrote:
> On 19.09.2024 13:27:28, Matthias Schiffer wrote:
> > The interrupt line of PCI devices is interpreted as edge-triggered,
> > however the interrupt signal of the m_can controller integrated in Intel
> > Elkhart Lake CPUs appears to be generated level-triggered.
> > 
> > Consider the following sequence of events:
> > 
> > - IR register is read, interrupt X is set
> > - A new interrupt Y is triggered in the m_can controller
> > - IR register is written to acknowledge interrupt X. Y remains set in IR
> > 
> > As at no point in this sequence no interrupt flag is set in IR, the
> > m_can interrupt line will never become deasserted, and no edge will ever
> > be observed to trigger another run of the ISR. This was observed to
> > result in the TX queue of the EHL m_can to get stuck under high load,
> > because frames were queued to the hardware in m_can_start_xmit(), but
> > m_can_finish_tx() was never run to account for their successful
> > transmission.
> > 
> > To fix the issue, repeatedly read and acknowledge interrupts at the
> > start of the ISR until no interrupt flags are set, so the next incoming
> > interrupt will also result in an edge on the interrupt line.
> > 
> > Fixes: cab7ffc0324f ("can: m_can: add PCI glue driver for Intel Elkhart Lake")
> > Signed-off-by: Matthias Schiffer <matthias.schiffer@ew.tq-group.com>
> 
> My coworker Lucas pointed me to:
> 
> > https://wiki.linuxfoundation.org/networking/napi#non-level_sensitive_irqs
> 

Thanks. I don't think this is directly applicable here - in our case the lost TX complete interrupts
were the issue, not (only) the RX interrupts.

Matthias


> On the other hand, I would also like to convert the !peripteral part of
> the driver to rx-offload. However, I am still looking for potential
> customers for this task. I have talked to some TI and ST people at LPC,
> maybe they are interested.
> 
> I think let's first fix edge sensitive IRQs, then rework the driver to
> rx-offload.
> 
> regards,
> Marc
> 
>
diff mbox series

Patch

diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c
index 47481afb9add3..2e182c3c98fed 100644
--- a/drivers/net/can/m_can/m_can.c
+++ b/drivers/net/can/m_can/m_can.c
@@ -1207,20 +1207,31 @@  static void m_can_coalescing_update(struct m_can_classdev *cdev, u32 ir)
 static int m_can_interrupt_handler(struct m_can_classdev *cdev)
 {
 	struct net_device *dev = cdev->net;
-	u32 ir;
+	u32 ir = 0, ir_read;
 	int ret;
 
 	if (pm_runtime_suspended(cdev->dev))
 		return IRQ_NONE;
 
-	ir = m_can_read(cdev, M_CAN_IR);
+	/* For m_can_pci, the interrupt line is interpreted as edge-triggered,
+	 * but the m_can controller generates them as level-triggered. We must
+	 * observe that IR is 0 at least once to be sure that the next
+	 * interrupt will generate an edge.
+	 */
+	while ((ir_read = m_can_read(cdev, M_CAN_IR)) != 0) {
+		ir |= ir_read;
+
+		/* ACK all irqs */
+		m_can_write(cdev, M_CAN_IR, ir);
+
+		if (!cdev->is_edge_triggered)
+			break;
+	}
+
 	m_can_coalescing_update(cdev, ir);
 	if (!ir)
 		return IRQ_NONE;
 
-	/* ACK all irqs */
-	m_can_write(cdev, M_CAN_IR, ir);
-
 	if (cdev->ops->clear_interrupts)
 		cdev->ops->clear_interrupts(cdev);
 
diff --git a/drivers/net/can/m_can/m_can.h b/drivers/net/can/m_can/m_can.h
index 92b2bd8628e6b..8c17eb94d2f98 100644
--- a/drivers/net/can/m_can/m_can.h
+++ b/drivers/net/can/m_can/m_can.h
@@ -99,6 +99,7 @@  struct m_can_classdev {
 	int pm_clock_support;
 	int pm_wake_source;
 	int is_peripheral;
+	bool is_edge_triggered;
 
 	// Cached M_CAN_IE register content
 	u32 active_interrupts;
diff --git a/drivers/net/can/m_can/m_can_pci.c b/drivers/net/can/m_can/m_can_pci.c
index d72fe771dfc7a..f98527981402a 100644
--- a/drivers/net/can/m_can/m_can_pci.c
+++ b/drivers/net/can/m_can/m_can_pci.c
@@ -127,6 +127,7 @@  static int m_can_pci_probe(struct pci_dev *pci, const struct pci_device_id *id)
 	mcan_class->pm_clock_support = 1;
 	mcan_class->pm_wake_source = 0;
 	mcan_class->can.clock.freq = id->driver_data;
+	mcan_class->is_edge_triggered = true;
 	mcan_class->ops = &m_can_pci_ops;
 
 	pci_set_drvdata(pci, mcan_class);