diff mbox

[v2,2/3] spi: pxa2xx: Prepare for edge-triggered interrupts

Message ID 7b15a0910a3ad861fd32161c72559bafa7b71e29.1484592296.git.jan.kiszka@siemens.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jan Kiszka Jan. 16, 2017, 6:44 p.m. UTC
When using the a device with edge-triggered interrupts, such as MSIs,
the interrupt handler has to ensure that there is a point in time during
its execution where all interrupts sources are silent so that a new
event can trigger a new interrupt again.

This is achieved here by looping over SSSR evaluation. We need to take
into account that SSCR1 may be changed by the transfer handler, thus we
need to redo the mask calculation, at least regarding the volatile
interrupt enable bit (TIE).

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
---
 drivers/spi/spi-pxa2xx.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

Comments

Andy Shevchenko Jan. 16, 2017, 7:07 p.m. UTC | #1
On Mon, 2017-01-16 at 19:44 +0100, Jan Kiszka wrote:
> When using the a device with edge-triggered interrupts, such as MSIs,
> the interrupt handler has to ensure that there is a point in time
> during
> its execution where all interrupts sources are silent so that a new
> event can trigger a new interrupt again.
> 
> This is achieved here by looping over SSSR evaluation. We need to take
> into account that SSCR1 may be changed by the transfer handler, thus
> we
> need to redo the mask calculation, at least regarding the volatile
> interrupt enable bit (TIE).
> 

So, more comments/questions below.

>  
>  	sccr1_reg = pxa2xx_spi_read(drv_data, SSCR1);
>  
> -	/* Ignore possible writes if we don't need to write */
> -	if (!(sccr1_reg & SSCR1_TIE))
> -		mask &= ~SSSR_TFS;
> -
>  	/* Ignore RX timeout interrupt if it is disabled */
>  	if (!(sccr1_reg & SSCR1_TINTE))
>  		mask &= ~SSSR_TINT;
>  
> -	if (!(status & mask))
> -		return IRQ_NONE;
> +	while (1) {

Can we switch to do-while and move previous block here? Btw, can TINTE
bit be set again during a loop?

> +		/* Ignore possible writes if we don't need to write
> */
> +		if (!(sccr1_reg & SSCR1_TIE))
> +			mask &= ~SSSR_TFS;
>  
> -	if (!drv_data->master->cur_msg) {
> -		handle_bad_msg(drv_data);
> -		/* Never fail */
> -		return IRQ_HANDLED;
> -	}
> +		if (!(status & mask))
> +			return ret;
> +
> +		if (!drv_data->master->cur_msg) {
> +			handle_bad_msg(drv_data);
> +			/* Never fail */
> +			return IRQ_HANDLED;
> +		}
> +

> +		ret |= drv_data->transfer_handler(drv_data);

So, we might call handler several times. This needs to be commented in
the code why you do so.

>  
> -	return drv_data->transfer_handler(drv_data);
> +		status = pxa2xx_spi_read(drv_data, SSSR);

Would it be possible to get all 1:s from the register
(something/autosuspend just powered off it by timeout?) ?

> +		sccr1_reg = pxa2xx_spi_read(drv_data, SSCR1);
> +	}
>  }
>  
>  /*
Jan Kiszka Jan. 16, 2017, 7:46 p.m. UTC | #2
On 2017-01-16 20:07, Andy Shevchenko wrote:
> On Mon, 2017-01-16 at 19:44 +0100, Jan Kiszka wrote:
>> When using the a device with edge-triggered interrupts, such as MSIs,
>> the interrupt handler has to ensure that there is a point in time
>> during
>> its execution where all interrupts sources are silent so that a new
>> event can trigger a new interrupt again.
>>
>> This is achieved here by looping over SSSR evaluation. We need to take
>> into account that SSCR1 may be changed by the transfer handler, thus
>> we
>> need to redo the mask calculation, at least regarding the volatile
>> interrupt enable bit (TIE).
>>
> 
> So, more comments/questions below.
> 
>>  
>>  	sccr1_reg = pxa2xx_spi_read(drv_data, SSCR1);
>>  
>> -	/* Ignore possible writes if we don't need to write */
>> -	if (!(sccr1_reg & SSCR1_TIE))
>> -		mask &= ~SSSR_TFS;
>> -
>>  	/* Ignore RX timeout interrupt if it is disabled */
>>  	if (!(sccr1_reg & SSCR1_TINTE))
>>  		mask &= ~SSSR_TINT;
>>  
>> -	if (!(status & mask))
>> -		return IRQ_NONE;
>> +	while (1) {
> 
> Can we switch to do-while and move previous block here?

Don't see how this would help (without duplicating more code).

> Btw, can TINTE
> bit be set again during a loop?

Nope, it's statically set, at least so far.

What we could do is simply restarting ssp_int

> 
>> +		/* Ignore possible writes if we don't need to write
>> */
>> +		if (!(sccr1_reg & SSCR1_TIE))
>> +			mask &= ~SSSR_TFS;
>>  
>> -	if (!drv_data->master->cur_msg) {
>> -		handle_bad_msg(drv_data);
>> -		/* Never fail */
>> -		return IRQ_HANDLED;
>> -	}
>> +		if (!(status & mask))
>> +			return ret;
>> +
>> +		if (!drv_data->master->cur_msg) {
>> +			handle_bad_msg(drv_data);
>> +			/* Never fail */
>> +			return IRQ_HANDLED;
>> +		}
>> +
> 
>> +		ret |= drv_data->transfer_handler(drv_data);
> 
> So, we might call handler several times. This needs to be commented in
> the code why you do so.

I can move the commit log into the code.

> 
>>  
>> -	return drv_data->transfer_handler(drv_data);
>> +		status = pxa2xx_spi_read(drv_data, SSSR);
> 
> Would it be possible to get all 1:s from the register
> (something/autosuspend just powered off it by timeout?) ?
> 

Not sure if that can happen, but I guess it would be simpler and more
readable to simply do this instead:

	while (1) {
		/*
		 * If the device is not yet in RPM suspended state and we get an
		 * interrupt that is meant for another device, check if status
		 * bits are all set to one. That means that the device is
		 * already powered off.
		 */
		status = pxa2xx_spi_read(drv_data, SSSR);
		if (status == ~0)
			return ret;

		sccr1_reg = pxa2xx_spi_read(drv_data, SSCR1);

		/* Ignore RX timeout interrupt if it is disabled */
		if (!(sccr1_reg & SSCR1_TINTE))
			mask &= ~SSSR_TINT;

		/* Ignore possible writes if we don't need to write */
		if (!(sccr1_reg & SSCR1_TIE))
			mask &= ~SSSR_TFS;

		if (!(status & mask))
			return ret;

		if (!drv_data->master->cur_msg) {
			handle_bad_msg(drv_data);
			/* Never fail */
			return IRQ_HANDLED;
		}

		ret |= drv_data->transfer_handler(drv_data);
	}


i.e. preserve the current structure, just add the loop.

Jan
Robert Jarzmik Jan. 17, 2017, 7:54 a.m. UTC | #3
Jan Kiszka <jan.kiszka@siemens.com> writes:

> When using the a device with edge-triggered interrupts, such as MSIs,
> the interrupt handler has to ensure that there is a point in time during
> its execution where all interrupts sources are silent so that a new
> event can trigger a new interrupt again.
>
> This is achieved here by looping over SSSR evaluation. We need to take
> into account that SSCR1 may be changed by the transfer handler, thus we
> need to redo the mask calculation, at least regarding the volatile
> interrupt enable bit (TIE).
>
> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Hi Jan,

> +	while (1) {
This bit worries me a bit, as this can be either :
 - hogging the SoC's CPU, endlessly running
 - or even worse, blocking the CPU for ever

The question behind is, should this be done in a top-half, or moved to a irq
thread ?

> +		/* Ignore possible writes if we don't need to write */
> +		if (!(sccr1_reg & SSCR1_TIE))
> +			mask &= ~SSSR_TFS;
>  
> -	if (!drv_data->master->cur_msg) {
> -		handle_bad_msg(drv_data);
> -		/* Never fail */
> -		return IRQ_HANDLED;
> -	}
> +		if (!(status & mask))
> +			return ret;
> +
> +		if (!drv_data->master->cur_msg) {
> +			handle_bad_msg(drv_data);
> +			/* Never fail */
> +			return IRQ_HANDLED;
> +		}
> +
> +		ret |= drv_data->transfer_handler(drv_data);
Mmm that looks weird to me, oring a irqreturn.

Imagine that on first iteration the handler returns IRQ_NONE, and on second
IRQ_HANDLED. This makes ret IRQ_HANDLED. Yet after the first iteration the
handler should have exited, especially if the interrupt is shared with another
driver.

Another thing which is along what Andy already said : it would be better
practice to have this loop in the form :
do {
...
} while (exit_condition_not_met);

Just for maintainability, it's better, and it concentrates the test on the
"exit_condition_not_met" in one place, which will enable us to review better the
algorithm.

Cheers.
Robert Jarzmik Jan. 17, 2017, 7:58 a.m. UTC | #4
Jan Kiszka <jan.kiszka@siemens.com> writes:

> When using the a device with edge-triggered interrupts, such as MSIs,
> the interrupt handler has to ensure that there is a point in time during
> its execution where all interrupts sources are silent so that a new
> event can trigger a new interrupt again.
>
> This is achieved here by looping over SSSR evaluation. We need to take
> into account that SSCR1 may be changed by the transfer handler, thus we
> need to redo the mask calculation, at least regarding the volatile
> interrupt enable bit (TIE).

I'd like moreover to add a question here.

In pxa architecture, SPI interrupts are already edge-triggered, and it's working
well. The interrupt source disabling is not disabled, but the interrupt
controller doesn't trigger an interrupt anymore (as it is masked), yet it marks
it as pending if an interrupt arrives while the interrupt handler is running.

All of this is handled by the interrupt core. My question is why for Intel MSI's
is it necessary to make a change in the driver instead or relying on the
interrupt core as for the pxa ?

Cheers.

--
Robert
--
To unsubscribe from this list: send the line "unsubscribe linux-spi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jan Kiszka Jan. 17, 2017, 8:05 a.m. UTC | #5
On 2017-01-17 08:54, Robert Jarzmik wrote:
> Jan Kiszka <jan.kiszka@siemens.com> writes:
> 
>> When using the a device with edge-triggered interrupts, such as MSIs,
>> the interrupt handler has to ensure that there is a point in time during
>> its execution where all interrupts sources are silent so that a new
>> event can trigger a new interrupt again.
>>
>> This is achieved here by looping over SSSR evaluation. We need to take
>> into account that SSCR1 may be changed by the transfer handler, thus we
>> need to redo the mask calculation, at least regarding the volatile
>> interrupt enable bit (TIE).
>>
>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
> Hi Jan,
> 
>> +	while (1) {
> This bit worries me a bit, as this can be either :
>  - hogging the SoC's CPU, endlessly running
>  - or even worse, blocking the CPU for ever
> 
> The question behind is, should this be done in a top-half, or moved to a irq
> thread ?

Every device with a broken interrupt source can hog CPUs, nothing
special with this one. If you don't close the loop in the handler
itself, you close it over the hardware retriggering the interrupt over
and over again.

So, I don't see a point in offloading to a thread. The normal case is
some TX done (FIFO available) event followed by an RX event, then the
transfer is complete, isn't it?

> 
>> +		/* Ignore possible writes if we don't need to write */
>> +		if (!(sccr1_reg & SSCR1_TIE))
>> +			mask &= ~SSSR_TFS;
>>  
>> -	if (!drv_data->master->cur_msg) {
>> -		handle_bad_msg(drv_data);
>> -		/* Never fail */
>> -		return IRQ_HANDLED;
>> -	}
>> +		if (!(status & mask))
>> +			return ret;
>> +
>> +		if (!drv_data->master->cur_msg) {
>> +			handle_bad_msg(drv_data);
>> +			/* Never fail */
>> +			return IRQ_HANDLED;
>> +		}
>> +
>> +		ret |= drv_data->transfer_handler(drv_data);
> Mmm that looks weird to me, oring a irqreturn.

Not really an uncommon pattern, though.

> 
> Imagine that on first iteration the handler returns IRQ_NONE, and on second
> IRQ_HANDLED. This makes ret IRQ_HANDLED. Yet after the first iteration the
> handler should have exited, especially if the interrupt is shared with another
> driver.

That would be a bug in transfer_handler, because we don't enter it
without a reason (status != 0).

> 
> Another thing which is along what Andy already said : it would be better
> practice to have this loop in the form :
> do {
> ...
> } while (exit_condition_not_met);

This implies code duplication in order to calculate the condition
(mask...). I can do this if desired, I wouldn't do this to my own code,
though.

Jan

> 
> Just for maintainability, it's better, and it concentrates the test on the
> "exit_condition_not_met" in one place, which will enable us to review better the
> algorithm.
> 
> Cheers.
>
Jan Kiszka Jan. 17, 2017, 8:10 a.m. UTC | #6
On 2017-01-17 08:58, Robert Jarzmik wrote:
> Jan Kiszka <jan.kiszka@siemens.com> writes:
> 
>> When using the a device with edge-triggered interrupts, such as MSIs,
>> the interrupt handler has to ensure that there is a point in time during
>> its execution where all interrupts sources are silent so that a new
>> event can trigger a new interrupt again.
>>
>> This is achieved here by looping over SSSR evaluation. We need to take
>> into account that SSCR1 may be changed by the transfer handler, thus we
>> need to redo the mask calculation, at least regarding the volatile
>> interrupt enable bit (TIE).
> 
> I'd like moreover to add a question here.
> 
> In pxa architecture, SPI interrupts are already edge-triggered, and it's working
> well. The interrupt source disabling is not disabled, but the interrupt
> controller doesn't trigger an interrupt anymore (as it is masked), yet it marks
> it as pending if an interrupt arrives while the interrupt handler is running.
> 
> All of this is handled by the interrupt core. My question is why for Intel MSI's
> is it necessary to make a change in the driver instead or relying on the
> interrupt core as for the pxa ?

If someone was using this driver with edge-triggered interrupt sources
so far, it was probably slower hardware and some luck (I've seen this
when driving fast-clocked devices vs. slower ones - only the latter
exposed the bug). Or that hardware did some temporary masking at
interrupt controller level while the handler was running. But that is
also not by design. It's the driver's task to ensure that all interrupt
sources are addressed once when returning from an edge-triggered
handler, and that is missing in this one.

Jan
Jarkko Nikula Jan. 17, 2017, 1:11 p.m. UTC | #7
On 01/17/2017 10:10 AM, Jan Kiszka wrote:
> On 2017-01-17 08:58, Robert Jarzmik wrote:
>> Jan Kiszka <jan.kiszka@siemens.com> writes:
>>
>>> When using the a device with edge-triggered interrupts, such as MSIs,
>>> the interrupt handler has to ensure that there is a point in time during
>>> its execution where all interrupts sources are silent so that a new
>>> event can trigger a new interrupt again.
>>>
>>> This is achieved here by looping over SSSR evaluation. We need to take
>>> into account that SSCR1 may be changed by the transfer handler, thus we
>>> need to redo the mask calculation, at least regarding the volatile
>>> interrupt enable bit (TIE).
>>
>> I'd like moreover to add a question here.
>>
>> In pxa architecture, SPI interrupts are already edge-triggered, and it's working
>> well. The interrupt source disabling is not disabled, but the interrupt
>> controller doesn't trigger an interrupt anymore (as it is masked), yet it marks
>> it as pending if an interrupt arrives while the interrupt handler is running.
>>
>> All of this is handled by the interrupt core. My question is why for Intel MSI's
>> is it necessary to make a change in the driver instead or relying on the
>> interrupt core as for the pxa ?
>
> If someone was using this driver with edge-triggered interrupt sources
> so far, it was probably slower hardware and some luck (I've seen this
> when driving fast-clocked devices vs. slower ones - only the latter
> exposed the bug). Or that hardware did some temporary masking at
> interrupt controller level while the handler was running. But that is
> also not by design. It's the driver's task to ensure that all interrupt
> sources are addressed once when returning from an edge-triggered
> handler, and that is missing in this one.
>
Are you seeing actual problem here or adding loop just in case? Is it 
really so that PCI bridge doesn't generate another MSI interrupt if SPI 
controller has interrupt pending when handler returns? I don't know but 
I would expect irq line between SPI controller and PCI bridge is still 
level sensitive even PCI bridge issues MSIs to the CPU.
Jan Kiszka Jan. 17, 2017, 2:43 p.m. UTC | #8
On 2017-01-17 14:11, Jarkko Nikula wrote:
> On 01/17/2017 10:10 AM, Jan Kiszka wrote:
>> On 2017-01-17 08:58, Robert Jarzmik wrote:
>>> Jan Kiszka <jan.kiszka@siemens.com> writes:
>>>
>>>> When using the a device with edge-triggered interrupts, such as MSIs,
>>>> the interrupt handler has to ensure that there is a point in time
>>>> during
>>>> its execution where all interrupts sources are silent so that a new
>>>> event can trigger a new interrupt again.
>>>>
>>>> This is achieved here by looping over SSSR evaluation. We need to take
>>>> into account that SSCR1 may be changed by the transfer handler, thus we
>>>> need to redo the mask calculation, at least regarding the volatile
>>>> interrupt enable bit (TIE).
>>>
>>> I'd like moreover to add a question here.
>>>
>>> In pxa architecture, SPI interrupts are already edge-triggered, and
>>> it's working
>>> well. The interrupt source disabling is not disabled, but the interrupt
>>> controller doesn't trigger an interrupt anymore (as it is masked),
>>> yet it marks
>>> it as pending if an interrupt arrives while the interrupt handler is
>>> running.
>>>
>>> All of this is handled by the interrupt core. My question is why for
>>> Intel MSI's
>>> is it necessary to make a change in the driver instead or relying on the
>>> interrupt core as for the pxa ?
>>
>> If someone was using this driver with edge-triggered interrupt sources
>> so far, it was probably slower hardware and some luck (I've seen this
>> when driving fast-clocked devices vs. slower ones - only the latter
>> exposed the bug). Or that hardware did some temporary masking at
>> interrupt controller level while the handler was running. But that is
>> also not by design. It's the driver's task to ensure that all interrupt
>> sources are addressed once when returning from an edge-triggered
>> handler, and that is missing in this one.
>>
> Are you seeing actual problem here or adding loop just in case? Is it
> really so that PCI bridge doesn't generate another MSI interrupt if SPI
> controller has interrupt pending when handler returns? I don't know but
> I would expect irq line between SPI controller and PCI bridge is still
> level sensitive even PCI bridge issues MSIs to the CPU.
> 

Yes, I'm seeing real problems, e.g. on the Galileo board when running
against a slower SPI device and using MSI: An interrupt is raised
because the TX queue was flushed towards the device (threshold
underrun). While handling that interrupt, the device starts to respond
and an RX event occurs as well. This raises the related interrupt reason
before the TX source was satisfied. Therefore, the interrupt output of
the SPI master will never go down, and there will be no additional edge
generated. Using level-interrupts, this is no problem, but with
edge-triggered we get stuck.

Jan
Jan Kiszka Jan. 17, 2017, 6:52 p.m. UTC | #9
On 2017-01-16 20:46, Jan Kiszka wrote:
> On 2017-01-16 20:07, Andy Shevchenko wrote:
>> On Mon, 2017-01-16 at 19:44 +0100, Jan Kiszka wrote:
>>> When using the a device with edge-triggered interrupts, such as MSIs,
>>> the interrupt handler has to ensure that there is a point in time
>>> during
>>> its execution where all interrupts sources are silent so that a new
>>> event can trigger a new interrupt again.
>>>
>>> This is achieved here by looping over SSSR evaluation. We need to take
>>> into account that SSCR1 may be changed by the transfer handler, thus
>>> we
>>> need to redo the mask calculation, at least regarding the volatile
>>> interrupt enable bit (TIE).
>>>
>>
>> So, more comments/questions below.
>>
>>>  
>>>  	sccr1_reg = pxa2xx_spi_read(drv_data, SSCR1);
>>>  
>>> -	/* Ignore possible writes if we don't need to write */
>>> -	if (!(sccr1_reg & SSCR1_TIE))
>>> -		mask &= ~SSSR_TFS;
>>> -
>>>  	/* Ignore RX timeout interrupt if it is disabled */
>>>  	if (!(sccr1_reg & SSCR1_TINTE))
>>>  		mask &= ~SSSR_TINT;
>>>  
>>> -	if (!(status & mask))
>>> -		return IRQ_NONE;
>>> +	while (1) {
>>
>> Can we switch to do-while and move previous block here?
> 
> Don't see how this would help (without duplicating more code).
> 
>> Btw, can TINTE
>> bit be set again during a loop?
> 
> Nope, it's statically set, at least so far.
> 
> What we could do is simply restarting ssp_int
> 
>>
>>> +		/* Ignore possible writes if we don't need to write
>>> */
>>> +		if (!(sccr1_reg & SSCR1_TIE))
>>> +			mask &= ~SSSR_TFS;
>>>  
>>> -	if (!drv_data->master->cur_msg) {
>>> -		handle_bad_msg(drv_data);
>>> -		/* Never fail */
>>> -		return IRQ_HANDLED;
>>> -	}
>>> +		if (!(status & mask))
>>> +			return ret;
>>> +
>>> +		if (!drv_data->master->cur_msg) {
>>> +			handle_bad_msg(drv_data);
>>> +			/* Never fail */
>>> +			return IRQ_HANDLED;
>>> +		}
>>> +
>>
>>> +		ret |= drv_data->transfer_handler(drv_data);
>>
>> So, we might call handler several times. This needs to be commented in
>> the code why you do so.
> 
> I can move the commit log into the code.
> 
>>
>>>  
>>> -	return drv_data->transfer_handler(drv_data);
>>> +		status = pxa2xx_spi_read(drv_data, SSSR);
>>
>> Would it be possible to get all 1:s from the register
>> (something/autosuspend just powered off it by timeout?) ?
>>
> 
> Not sure if that can happen, but I guess it would be simpler and more
> readable to simply do this instead:
> 
> 	while (1) {
> 		/*
> 		 * If the device is not yet in RPM suspended state and we get an
> 		 * interrupt that is meant for another device, check if status
> 		 * bits are all set to one. That means that the device is
> 		 * already powered off.
> 		 */
> 		status = pxa2xx_spi_read(drv_data, SSSR);
> 		if (status == ~0)
> 			return ret;
> 
> 		sccr1_reg = pxa2xx_spi_read(drv_data, SSCR1);
> 
> 		/* Ignore RX timeout interrupt if it is disabled */
> 		if (!(sccr1_reg & SSCR1_TINTE))
> 			mask &= ~SSSR_TINT;
> 
> 		/* Ignore possible writes if we don't need to write */
> 		if (!(sccr1_reg & SSCR1_TIE))
> 			mask &= ~SSSR_TFS;
> 
> 		if (!(status & mask))
> 			return ret;
> 
> 		if (!drv_data->master->cur_msg) {
> 			handle_bad_msg(drv_data);
> 			/* Never fail */
> 			return IRQ_HANDLED;
> 		}
> 
> 		ret |= drv_data->transfer_handler(drv_data);
> 	}
> 
> 
> i.e. preserve the current structure, just add the loop.
> 

OK, please let me know if you want a v3 of this patch with the structure
above. If there are further concerns/questions, just let me know as
well, but I'd like to close this topic if possible.

Thanks,
Jan
Robert Jarzmik Jan. 18, 2017, 8:21 a.m. UTC | #10
Jan Kiszka <jan.kiszka@siemens.com> writes:

> On 2017-01-17 08:54, Robert Jarzmik wrote:
>> Jan Kiszka <jan.kiszka@siemens.com> writes:
>> 
>>> When using the a device with edge-triggered interrupts, such as MSIs,
>>> the interrupt handler has to ensure that there is a point in time during
>>> its execution where all interrupts sources are silent so that a new
>>> event can trigger a new interrupt again.
>>>
>>> This is achieved here by looping over SSSR evaluation. We need to take
>>> into account that SSCR1 may be changed by the transfer handler, thus we
>>> need to redo the mask calculation, at least regarding the volatile
>>> interrupt enable bit (TIE).
>>>
>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>> Hi Jan,
>> 
>>> +	while (1) {
>> This bit worries me a bit, as this can be either :
>>  - hogging the SoC's CPU, endlessly running
>>  - or even worse, blocking the CPU for ever
>> 
>> The question behind is, should this be done in a top-half, or moved to a irq
>> thread ?
>
> Every device with a broken interrupt source can hog CPUs, nothing
> special with this one. If you don't close the loop in the handler
> itself, you close it over the hardware retriggering the interrupt over
> and over again.
I'm not speaking of a broken interrupt source, I'm speaking of a broken code,
such as in the handler, or broken status readback, or lack of understanding on
the status register which may imply the while(1) to loop forever.

> So, I don't see a point in offloading to a thread. The normal case is
> some TX done (FIFO available) event followed by an RX event, then the
> transfer is complete, isn't it?
The point is if you stay forever in the while(1) loop, you can at least have a
print a backtrace (LOCKUP_DETECTOR).

>> Imagine that on first iteration the handler returns IRQ_NONE, and on second
>> IRQ_HANDLED. This makes ret IRQ_HANDLED. Yet after the first iteration the
>> handler should have exited, especially if the interrupt is shared with another
>> driver.
>
> That would be a bug in transfer_handler, because we don't enter it
> without a reason (status != 0).
Sure, but can you be sure that all the people modifying the code after you will
see that also ? The other way will _force_ them to see it.

>> Another thing which is along what Andy already said : it would be better
>> practice to have this loop in the form :
>> do {
>> ...
>> } while (exit_condition_not_met);
>
> This implies code duplication in order to calculate the condition
> (mask...). I can do this if desired, I wouldn't do this to my own code,
> though.
Okay, that's acceptable.
Why not have something like this :

sccr1_reg = pxa2xx_spi_read(drv_data, SSCR1);
if (!(sccr1_reg & SSCR1_TIE))
	mask &= ~SSSR_TFS;

/* Ignore RX timeout interrupt if it is disabled */
if (!(sccr1_reg & SSCR1_TINTE))
	mask &= ~SSSR_TINT;

status = pxa2xx_spi_read(drv_data, SSR);
while (status & mask) {
   ... handlers etc ...
   status = pxa2xx_spi_read(drv_data, SSR);
};

There is a duplication of the status read, but that looks acceptable, and the
mask calculation is moved out of the loop (this should be checked more
thoroughly as it looked to me only probe() would change these values, yet I
might be wrong).

Cheers.
Jan Kiszka Jan. 18, 2017, 9:33 a.m. UTC | #11
On 2017-01-18 09:21, Robert Jarzmik wrote:
> Jan Kiszka <jan.kiszka@siemens.com> writes:
> 
>> On 2017-01-17 08:54, Robert Jarzmik wrote:
>>> Jan Kiszka <jan.kiszka@siemens.com> writes:
>>>
>>>> When using the a device with edge-triggered interrupts, such as MSIs,
>>>> the interrupt handler has to ensure that there is a point in time during
>>>> its execution where all interrupts sources are silent so that a new
>>>> event can trigger a new interrupt again.
>>>>
>>>> This is achieved here by looping over SSSR evaluation. We need to take
>>>> into account that SSCR1 may be changed by the transfer handler, thus we
>>>> need to redo the mask calculation, at least regarding the volatile
>>>> interrupt enable bit (TIE).
>>>>
>>>> Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
>>> Hi Jan,
>>>
>>>> +	while (1) {
>>> This bit worries me a bit, as this can be either :
>>>  - hogging the SoC's CPU, endlessly running
>>>  - or even worse, blocking the CPU for ever
>>>
>>> The question behind is, should this be done in a top-half, or moved to a irq
>>> thread ?
>>
>> Every device with a broken interrupt source can hog CPUs, nothing
>> special with this one. If you don't close the loop in the handler
>> itself, you close it over the hardware retriggering the interrupt over
>> and over again.
> I'm not speaking of a broken interrupt source, I'm speaking of a broken code,
> such as in the handler, or broken status readback, or lack of understanding on
> the status register which may imply the while(1) to loop forever.
> 
>> So, I don't see a point in offloading to a thread. The normal case is
>> some TX done (FIFO available) event followed by an RX event, then the
>> transfer is complete, isn't it?
> The point is if you stay forever in the while(1) loop, you can at least have a
> print a backtrace (LOCKUP_DETECTOR).

I won't consider "debugability" as a good reason to move interrupt
handlers into threads. There should be real workload that requires
offloading or specific prioritization.

> 
>>> Imagine that on first iteration the handler returns IRQ_NONE, and on second
>>> IRQ_HANDLED. This makes ret IRQ_HANDLED. Yet after the first iteration the
>>> handler should have exited, especially if the interrupt is shared with another
>>> driver.
>>
>> That would be a bug in transfer_handler, because we don't enter it
>> without a reason (status != 0).
> Sure, but can you be sure that all the people modifying the code after you will
> see that also ? The other way will _force_ them to see it.
> 
>>> Another thing which is along what Andy already said : it would be better
>>> practice to have this loop in the form :
>>> do {
>>> ...
>>> } while (exit_condition_not_met);
>>
>> This implies code duplication in order to calculate the condition
>> (mask...). I can do this if desired, I wouldn't do this to my own code,
>> though.
> Okay, that's acceptable.
> Why not have something like this :
> 
> sccr1_reg = pxa2xx_spi_read(drv_data, SSCR1);
> if (!(sccr1_reg & SSCR1_TIE))
> 	mask &= ~SSSR_TFS;
> 
> /* Ignore RX timeout interrupt if it is disabled */
> if (!(sccr1_reg & SSCR1_TINTE))
> 	mask &= ~SSSR_TINT;
> 
> status = pxa2xx_spi_read(drv_data, SSR);
> while (status & mask) {
>    ... handlers etc ...
>    status = pxa2xx_spi_read(drv_data, SSR);
> };
> 
> There is a duplication of the status read, but that looks acceptable, and the
> mask calculation is moved out of the loop (this should be checked more
> thoroughly as it looked to me only probe() would change these values, yet I
> might be wrong).

Unfortunately, mask can change if SSCR1_TIE is cleared. So this is not
correct.

What would be an alternative to looping is masking (would be required
for threaded irq anyway - but then we won't need to loop in the first
place): disable all irq sources, check the status bits once, re-enable
according to a potentially updated set, leave the handler and let the
hardware call us again.

Jan
Mark Brown Jan. 18, 2017, 12:46 p.m. UTC | #12
On Wed, Jan 18, 2017 at 10:33:07AM +0100, Jan Kiszka wrote:
> On 2017-01-18 09:21, Robert Jarzmik wrote:

> >>>> +	while (1) {

> >>> This bit worries me a bit, as this can be either :
> >>>  - hogging the SoC's CPU, endlessly running
> >>>  - or even worse, blocking the CPU for ever

> >>> The question behind is, should this be done in a top-half, or moved to a irq
> >>> thread ?

> >> Every device with a broken interrupt source can hog CPUs, nothing
> >> special with this one. If you don't close the loop in the handler
> >> itself, you close it over the hardware retriggering the interrupt over
> >> and over again.

> > I'm not speaking of a broken interrupt source, I'm speaking of a broken code,
> > such as in the handler, or broken status readback, or lack of understanding on
> > the status register which may imply the while(1) to loop forever.

> >> So, I don't see a point in offloading to a thread. The normal case is
> >> some TX done (FIFO available) event followed by an RX event, then the
> >> transfer is complete, isn't it?

> > The point is if you stay forever in the while(1) loop, you can at least have a
> > print a backtrace (LOCKUP_DETECTOR).

> I won't consider "debugability" as a good reason to move interrupt
> handlers into threads. There should be real workload that requires
> offloading or specific prioritization.

It's failure mitigation - you're translating a hard lockup into
something that will potentially allow the system to soldier on which is
likely to be less severe for the user as well as making things easier to
figure out.  If we're doing something like this I'd at least have a
limit on how long we allow the interrupt to scream.
diff mbox

Patch

diff --git a/drivers/spi/spi-pxa2xx.c b/drivers/spi/spi-pxa2xx.c
index 0d10090..ac49b80 100644
--- a/drivers/spi/spi-pxa2xx.c
+++ b/drivers/spi/spi-pxa2xx.c
@@ -751,6 +751,7 @@  static irqreturn_t ssp_int(int irq, void *dev_id)
 	struct driver_data *drv_data = dev_id;
 	u32 sccr1_reg;
 	u32 mask = drv_data->mask_sr;
+	irqreturn_t ret = IRQ_NONE;
 	u32 status;
 
 	/*
@@ -774,24 +775,29 @@  static irqreturn_t ssp_int(int irq, void *dev_id)
 
 	sccr1_reg = pxa2xx_spi_read(drv_data, SSCR1);
 
-	/* Ignore possible writes if we don't need to write */
-	if (!(sccr1_reg & SSCR1_TIE))
-		mask &= ~SSSR_TFS;
-
 	/* Ignore RX timeout interrupt if it is disabled */
 	if (!(sccr1_reg & SSCR1_TINTE))
 		mask &= ~SSSR_TINT;
 
-	if (!(status & mask))
-		return IRQ_NONE;
+	while (1) {
+		/* Ignore possible writes if we don't need to write */
+		if (!(sccr1_reg & SSCR1_TIE))
+			mask &= ~SSSR_TFS;
 
-	if (!drv_data->master->cur_msg) {
-		handle_bad_msg(drv_data);
-		/* Never fail */
-		return IRQ_HANDLED;
-	}
+		if (!(status & mask))
+			return ret;
+
+		if (!drv_data->master->cur_msg) {
+			handle_bad_msg(drv_data);
+			/* Never fail */
+			return IRQ_HANDLED;
+		}
+
+		ret |= drv_data->transfer_handler(drv_data);
 
-	return drv_data->transfer_handler(drv_data);
+		status = pxa2xx_spi_read(drv_data, SSSR);
+		sccr1_reg = pxa2xx_spi_read(drv_data, SSCR1);
+	}
 }
 
 /*