diff mbox

[RFC:,2/2] iio: adc: exynos_adc: Handle timeout and race conditions

Message ID 1363364801-23684-1-git-send-email-ch.naveen@samsung.com (mailing list archive)
State New, archived
Headers show

Commit Message

Naveen Krishna Chatradhi March 15, 2013, 4:26 p.m. UTC
This patch does the following
1. Handle the return values of wait_for_completion_interruptible_timeout
2. Add spin locks to avoid race conditions during ISR.

Signed-off-by: Naveen Krishna Chatradhi <ch.naveen@samsung.com>
Cc: Doug Anderson <dianders@chromium.org>
Cc: Lars-Peter Clausen <lars@metafoo.de>
---
Discussion thread for this patch can be found at
http://www.gossamer-threads.com/lists/linux/kernel/1693284?page=last

I've not seen any reference to spin lock usage in IIO.
Kindly, suggest me if there is a better way to avoid the race.

Thanks,
Naveen
 drivers/iio/adc/exynos_adc.c |   14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

Comments

Lars-Peter Clausen March 15, 2013, 9:53 p.m. UTC | #1
On 03/15/2013 05:26 PM, Naveen Krishna Chatradhi wrote:
> This patch does the following
> 1. Handle the return values of wait_for_completion_interruptible_timeout
> 2. Add spin locks to avoid race conditions during ISR.
> 
> Signed-off-by: Naveen Krishna Chatradhi <ch.naveen@samsung.com>
> Cc: Doug Anderson <dianders@chromium.org>
> Cc: Lars-Peter Clausen <lars@metafoo.de>
> ---
> Discussion thread for this patch can be found at
> http://www.gossamer-threads.com/lists/linux/kernel/1693284?page=last
> 
> I've not seen any reference to spin lock usage in IIO.
> Kindly, suggest me if there is a better way to avoid the race.
> 
> Thanks,
> Naveen
>  drivers/iio/adc/exynos_adc.c |   14 ++++++++++++--
>  1 file changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/iio/adc/exynos_adc.c b/drivers/iio/adc/exynos_adc.c
> index ed6fdd7..4de28ae 100644
> --- a/drivers/iio/adc/exynos_adc.c
> +++ b/drivers/iio/adc/exynos_adc.c
> @@ -91,6 +91,7 @@ struct exynos_adc {
>  
>  	struct completion	completion;
>  
> +	spinlock_t		reg_lock;
>  	u32			value;
>  	unsigned int            version;
>  };
> @@ -117,7 +118,7 @@ static int exynos_read_raw(struct iio_dev *indio_dev,
>  				long mask)
>  {
>  	struct exynos_adc *info = iio_priv(indio_dev);
> -	unsigned long timeout;
> +	long timeout;
>  	u32 con1, con2;
>  
>  	if (mask != IIO_CHAN_INFO_RAW)
> @@ -143,15 +144,19 @@ static int exynos_read_raw(struct iio_dev *indio_dev,
>  				ADC_V1_CON(info->regs));
>  	}
>  
> +	INIT_COMPLETION(info->completion);
> +

This needs to happen before you start the transfer.


>  	timeout = wait_for_completion_interruptible_timeout
>  			(&info->completion, EXYNOS_ADC_TIMEOUT);
> +
>  	*val = info->value;
>  
>  	mutex_unlock(&indio_dev->mlock);
>  
>  	if (timeout == 0)
>  		return -ETIMEDOUT;
> -
> +	else if (timeout < 0)
> +		return timeout;
>  	return IIO_VAL_INT;
>  }
>  
> @@ -159,6 +164,8 @@ static irqreturn_t exynos_adc_isr(int irq, void *dev_id)
>  {
>  	struct exynos_adc *info = (struct exynos_adc *)dev_id;
>  
> +	spin_lock(&info->reg_lock);
> +
>  	/* Read value */
>  	info->value = readl(ADC_V1_DATX(info->regs)) &
>  						ADC_DATX_MASK;
> @@ -170,6 +177,8 @@ static irqreturn_t exynos_adc_isr(int irq, void *dev_id)
>  
>  	complete(&info->completion);
>  
> +	spin_unlock(&info->reg_lock);
> +

What exactly is the spinlock protecting against here? Concurrent runs of
exynos_adc_isr? This is probably not issue in the first place.

What you want to protect against is that completion is completed between the
call to INIT_COMPLETION() and the start of a new conversion. So the sections
that need to be under the spinlock are the complete call here and the point
from INIT_COMPLETION until the transfer is started in exynos_read_raw(). Make
sure to use spin_lock_irq there.

>  	return IRQ_HANDLED;
>  }
>  
> @@ -327,6 +336,7 @@ static int exynos_adc_probe(struct platform_device *pdev)
>  	else
>  		indio_dev->num_channels = MAX_ADC_V2_CHANNELS;
>  
> +	spin_lock_init(&info->reg_lock);
>  	ret = iio_device_register(indio_dev);
>  	if (ret)
>  		goto err_irq;

--
To unsubscribe from this list: send the line "unsubscribe linux-samsung-soc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Anderson March 16, 2013, 12:37 a.m. UTC | #2
On Fri, Mar 15, 2013 at 2:53 PM, Lars-Peter Clausen <lars@metafoo.de> wrote:
> What exactly is the spinlock protecting against here? Concurrent runs of
> exynos_adc_isr? This is probably not issue in the first place.
>
> What you want to protect against is that completion is completed between the
> call to INIT_COMPLETION() and the start of a new conversion. So the sections
> that need to be under the spinlock are the complete call here and the point
> from INIT_COMPLETION until the transfer is started in exynos_read_raw(). Make
> sure to use spin_lock_irq there.

...and at that point I _think_ you won't also need the mutex.

A reasonable way to test to see if you've got this all correct would be to:

* Start two processes that are reading from different ADCs that will
report very different values (maybe add a device tree node for adc1 or
adc7 and use those since they're not really connected to
thermistors?).

* Have your two processes read as fast as they can.  This could just
be "while true; do cat /sys/class/hwmon/hwmon0/device/temp1_input;
done"

* Decrease your timeout and maybe(?) sprinkle some random udelays in
the irq handler so that the timeouts happen sometimes but not others.

* Periodically cancel one of the readers with Ctrl-C

If all is working well then you should always get back the right value
from the right reader (and get no crashes).

-Doug
--
To unsubscribe from this list: send the line "unsubscribe linux-samsung-soc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lars-Peter Clausen March 16, 2013, 2:41 p.m. UTC | #3
On 03/16/2013 01:37 AM, Doug Anderson wrote:
> On Fri, Mar 15, 2013 at 2:53 PM, Lars-Peter Clausen <lars@metafoo.de> wrote:
>> What exactly is the spinlock protecting against here? Concurrent runs of
>> exynos_adc_isr? This is probably not issue in the first place.
>>
>> What you want to protect against is that completion is completed between the
>> call to INIT_COMPLETION() and the start of a new conversion. So the sections
>> that need to be under the spinlock are the complete call here and the point
>> from INIT_COMPLETION until the transfer is started in exynos_read_raw(). Make
>> sure to use spin_lock_irq there.
> 
> ...and at that point I _think_ you won't also need the mutex.
> 
> A reasonable way to test to see if you've got this all correct would be to:
> 
> * Start two processes that are reading from different ADCs that will
> report very different values (maybe add a device tree node for adc1 or
> adc7 and use those since they're not really connected to
> thermistors?).
> 
> * Have your two processes read as fast as they can.  This could just
> be "while true; do cat /sys/class/hwmon/hwmon0/device/temp1_input;
> done"
> 
> * Decrease your timeout and maybe(?) sprinkle some random udelays in
> the irq handler so that the timeouts happen sometimes but not others.
> 
> * Periodically cancel one of the readers with Ctrl-C
> 
> If all is working well then you should always get back the right value
> from the right reader (and get no crashes).
> 


I think you still need the mutex for serialization, otherwise the requests
would just cancel each other out. Btw. what happens if you start a conversion
while another is still in progress? Is it possible to abort a conversion?

- Lars
--
To unsubscribe from this list: send the line "unsubscribe linux-samsung-soc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Anderson April 3, 2013, 5:06 p.m. UTC | #4
Lars,

On Sat, Mar 16, 2013 at 7:41 AM, Lars-Peter Clausen <lars@metafoo.de> wrote:
> I think you still need the mutex for serialization, otherwise the requests
> would just cancel each other out. Btw. what happens if you start a conversion
> while another is still in progress? Is it possible to abort a conversion?

I was thinking that the spinlock would just replace the mutex for the
purposes of serialization.

I stepped back a bit, though, and I'm wondering if we're over-thinking
things.  The timeout case should certainly be handled properly (thanks
for pointing it out), but getting a timeout is really not expected and
adding a lot of extra overhead to handle it elegantly seems a bit
much?

Specifically, the mutex means that we have one user of the ADC at a
time, and ADC conversion has nothing variable about it.  The user
manual that I have access to talks about 12-bit conversion happening
in 1 microsecond with a 5MHz input clock or 5 microseconds with a 1MHz
input clock.  Even if someone has clocks configured very differently,
it would be hard to imagine a conversion actually taking a full
second.

...so that means that if the timeout actually fires then something
else fairly drastic has gone wrong.  It's _very_ unlikely that the IRQ
will still go off for this conversion sometime in the future.

To me, total modifications to what's landed already ought to be:

* Change timeout to long (from unsigned long)

* Make sure we return errors (negative results) from
wait_for_completion_interruptible_timeout() properly.

* If we get back a value of 0 from
wait_for_completion_interruptible_timeout() then we should print a
warning and attempt machinations to reset the ADC.  Without ever
seeing real-world situtations that would cause a real timeout these
machinations would be a bit of a guess (is resetting the adc useful
when it's more likely that someone accidentally messed with the clock
tree or power gated the ADC?)...  ...or perhaps a warning and a TODO
in the code would be enough?


Thoughts?

-Doug
--
To unsubscribe from this list: send the line "unsubscribe linux-samsung-soc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lars-Peter Clausen April 5, 2013, 8:53 a.m. UTC | #5
On 04/03/2013 07:06 PM, Doug Anderson wrote:
> Lars,
> 
> On Sat, Mar 16, 2013 at 7:41 AM, Lars-Peter Clausen <lars@metafoo.de> wrote:
>> I think you still need the mutex for serialization, otherwise the requests
>> would just cancel each other out. Btw. what happens if you start a conversion
>> while another is still in progress? Is it possible to abort a conversion?
> 
> I was thinking that the spinlock would just replace the mutex for the
> purposes of serialization.
> 

Since we sleep inside the protected section we need to use a mutex.

> I stepped back a bit, though, and I'm wondering if we're over-thinking
> things.  The timeout case should certainly be handled properly (thanks
> for pointing it out), but getting a timeout is really not expected and
> adding a lot of extra overhead to handle it elegantly seems a bit
> much?
> 
> Specifically, the mutex means that we have one user of the ADC at a
> time, and ADC conversion has nothing variable about it.  The user
> manual that I have access to talks about 12-bit conversion happening
> in 1 microsecond with a 5MHz input clock or 5 microseconds with a 1MHz
> input clock.  Even if someone has clocks configured very differently,
> it would be hard to imagine a conversion actually taking a full
> second.
> 
> ...so that means that if the timeout actually fires then something
> else fairly drastic has gone wrong.  It's _very_ unlikely that the IRQ
> will still go off for this conversion sometime in the future.
> 

It's not the timeout case I'm worried about, but the case where the transfer
is interrupted by the user. Even though it is rather unlikely for the
problem to occur we should still try to avoid it, this is one of these
annoying heisenbugs that happen once in a while and nobody is able to
reproduce them.

> To me, total modifications to what's landed already ought to be:
> 
> * Change timeout to long (from unsigned long)
> 
> * Make sure we return errors (negative results) from
> wait_for_completion_interruptible_timeout() properly.
> 
> * If we get back a value of 0 from
> wait_for_completion_interruptible_timeout() then we should print a
> warning and attempt machinations to reset the ADC.  Without ever
> seeing real-world situtations that would cause a real timeout these
> machinations would be a bit of a guess (is resetting the adc useful
> when it's more likely that someone accidentally messed with the clock
> tree or power gated the ADC?)...  ...or perhaps a warning and a TODO
> in the code would be enough?
> 
> 
> Thoughts?

I think most of this is already implemented and Naveen sent a patch to reset
the controller in case of a timeout, which is a good idea and works fine,
but you still should handle the case where the user aborted the transfer.
Just resetting the core should work as well in that case.

- Lars
--
To unsubscribe from this list: send the line "unsubscribe linux-samsung-soc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Anderson April 5, 2013, 2:56 p.m. UTC | #6
Lars,

On Fri, Apr 5, 2013 at 1:53 AM, Lars-Peter Clausen <lars@metafoo.de> wrote:
> Since we sleep inside the protected section we need to use a mutex.

Ah, good point.

> It's not the timeout case I'm worried about, but the case where the transfer
> is interrupted by the user. Even though it is rather unlikely for the
> problem to occur we should still try to avoid it, this is one of these
> annoying heisenbugs that happen once in a while and nobody is able to
> reproduce them.

Yes, of course.  Then we can also get extra confidence that the reset
logic works well by stressing out this case...  :)

This makes me think, though.  Given how fast we expect the ADC
transaction to finish, would there be any benefit to making the wait
non-interruptible and then shortening the timeout a whole lot.  If we
shortened to 1ms then we're really not "non-interruptible" for very
long and there's less chance of subtle bugs in the way that reset
works.

-Doug
--
To unsubscribe from this list: send the line "unsubscribe linux-samsung-soc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lars-Peter Clausen April 5, 2013, 4:38 p.m. UTC | #7
On 04/05/2013 04:56 PM, Doug Anderson wrote:
> Lars,
> 
> On Fri, Apr 5, 2013 at 1:53 AM, Lars-Peter Clausen <lars@metafoo.de> wrote:
>> Since we sleep inside the protected section we need to use a mutex.
> 
> Ah, good point.
> 
>> It's not the timeout case I'm worried about, but the case where the transfer
>> is interrupted by the user. Even though it is rather unlikely for the
>> problem to occur we should still try to avoid it, this is one of these
>> annoying heisenbugs that happen once in a while and nobody is able to
>> reproduce them.
> 
> Yes, of course.  Then we can also get extra confidence that the reset
> logic works well by stressing out this case...  :)
> 
> This makes me think, though.  Given how fast we expect the ADC
> transaction to finish, would there be any benefit to making the wait
> non-interruptible and then shortening the timeout a whole lot.  If we
> shortened to 1ms then we're really not "non-interruptible" for very
> long and there's less chance of subtle bugs in the way that reset
> works.

Yes, that could also work.

- Lars
--
To unsubscribe from this list: send the line "unsubscribe linux-samsung-soc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/iio/adc/exynos_adc.c b/drivers/iio/adc/exynos_adc.c
index ed6fdd7..4de28ae 100644
--- a/drivers/iio/adc/exynos_adc.c
+++ b/drivers/iio/adc/exynos_adc.c
@@ -91,6 +91,7 @@  struct exynos_adc {
 
 	struct completion	completion;
 
+	spinlock_t		reg_lock;
 	u32			value;
 	unsigned int            version;
 };
@@ -117,7 +118,7 @@  static int exynos_read_raw(struct iio_dev *indio_dev,
 				long mask)
 {
 	struct exynos_adc *info = iio_priv(indio_dev);
-	unsigned long timeout;
+	long timeout;
 	u32 con1, con2;
 
 	if (mask != IIO_CHAN_INFO_RAW)
@@ -143,15 +144,19 @@  static int exynos_read_raw(struct iio_dev *indio_dev,
 				ADC_V1_CON(info->regs));
 	}
 
+	INIT_COMPLETION(info->completion);
+
 	timeout = wait_for_completion_interruptible_timeout
 			(&info->completion, EXYNOS_ADC_TIMEOUT);
+
 	*val = info->value;
 
 	mutex_unlock(&indio_dev->mlock);
 
 	if (timeout == 0)
 		return -ETIMEDOUT;
-
+	else if (timeout < 0)
+		return timeout;
 	return IIO_VAL_INT;
 }
 
@@ -159,6 +164,8 @@  static irqreturn_t exynos_adc_isr(int irq, void *dev_id)
 {
 	struct exynos_adc *info = (struct exynos_adc *)dev_id;
 
+	spin_lock(&info->reg_lock);
+
 	/* Read value */
 	info->value = readl(ADC_V1_DATX(info->regs)) &
 						ADC_DATX_MASK;
@@ -170,6 +177,8 @@  static irqreturn_t exynos_adc_isr(int irq, void *dev_id)
 
 	complete(&info->completion);
 
+	spin_unlock(&info->reg_lock);
+
 	return IRQ_HANDLED;
 }
 
@@ -327,6 +336,7 @@  static int exynos_adc_probe(struct platform_device *pdev)
 	else
 		indio_dev->num_channels = MAX_ADC_V2_CHANNELS;
 
+	spin_lock_init(&info->reg_lock);
 	ret = iio_device_register(indio_dev);
 	if (ret)
 		goto err_irq;