diff mbox series

[5/5] crypto: stm32/crc: protect from concurrent accesses

Message ID 20200512141113.18972-6-nicolas.toromanoff@st.com (mailing list archive)
State Mainlined
Commit 7795c0baf5ac25e104fec8677ad134066a8fb8d3
Headers show
Series STM32 CRC update | expand

Commit Message

Nicolas Toromanoff May 12, 2020, 2:11 p.m. UTC
Protect STM32 CRC device from concurrent accesses.

As we create a spinlocked section that increase with buffer size,
we provide a module parameter to release the pressure by splitting
critical section in chunks.

Size of each chunk is defined in burst_size module parameter.
By default burst_size=0, i.e. don't split incoming buffer.

Signed-off-by: Nicolas Toromanoff <nicolas.toromanoff@st.com>
---
 drivers/crypto/stm32/stm32-crc32.c | 47 ++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)

Comments

Ard Biesheuvel May 22, 2020, 4:11 p.m. UTC | #1
On Tue, 12 May 2020 at 16:13, Nicolas Toromanoff
<nicolas.toromanoff@st.com> wrote:
>
> Protect STM32 CRC device from concurrent accesses.
>
> As we create a spinlocked section that increase with buffer size,
> we provide a module parameter to release the pressure by splitting
> critical section in chunks.
>
> Size of each chunk is defined in burst_size module parameter.
> By default burst_size=0, i.e. don't split incoming buffer.
>
> Signed-off-by: Nicolas Toromanoff <nicolas.toromanoff@st.com>

Would you mind explaining the usage model here? It looks like you are
sharing a CRC hardware accelerator with a synchronous interface
between different users by using spinlocks? You are aware that this
will tie up the waiting CPUs completely during this time, right? So it
would be much better to use a mutex here. Or perhaps it would make
more sense to fall back to a s/w based CRC routine if the h/w is tied
up working for another task?

Using spinlocks for this is really not acceptable.



> ---
>  drivers/crypto/stm32/stm32-crc32.c | 47 ++++++++++++++++++++++++++++--
>  1 file changed, 45 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/crypto/stm32/stm32-crc32.c b/drivers/crypto/stm32/stm32-crc32.c
> index 413415c216ef..3ba41148c2a4 100644
> --- a/drivers/crypto/stm32/stm32-crc32.c
> +++ b/drivers/crypto/stm32/stm32-crc32.c
> @@ -35,11 +35,16 @@
>
>  #define CRC_AUTOSUSPEND_DELAY  50
>
> +static unsigned int burst_size;
> +module_param(burst_size, uint, 0644);
> +MODULE_PARM_DESC(burst_size, "Select burst byte size (0 unlimited)");
> +
>  struct stm32_crc {
>         struct list_head list;
>         struct device    *dev;
>         void __iomem     *regs;
>         struct clk       *clk;
> +       spinlock_t       lock;
>  };
>
>  struct stm32_crc_list {
> @@ -109,6 +114,7 @@ static int stm32_crc_init(struct shash_desc *desc)
>         struct stm32_crc_desc_ctx *ctx = shash_desc_ctx(desc);
>         struct stm32_crc_ctx *mctx = crypto_shash_ctx(desc->tfm);
>         struct stm32_crc *crc;
> +       unsigned long flags;
>
>         crc = stm32_crc_get_next_crc();
>         if (!crc)
> @@ -116,6 +122,8 @@ static int stm32_crc_init(struct shash_desc *desc)
>
>         pm_runtime_get_sync(crc->dev);
>
> +       spin_lock_irqsave(&crc->lock, flags);
> +
>         /* Reset, set key, poly and configure in bit reverse mode */
>         writel_relaxed(bitrev32(mctx->key), crc->regs + CRC_INIT);
>         writel_relaxed(bitrev32(mctx->poly), crc->regs + CRC_POL);
> @@ -125,18 +133,21 @@ static int stm32_crc_init(struct shash_desc *desc)
>         /* Store partial result */
>         ctx->partial = readl_relaxed(crc->regs + CRC_DR);
>
> +       spin_unlock_irqrestore(&crc->lock, flags);
> +
>         pm_runtime_mark_last_busy(crc->dev);
>         pm_runtime_put_autosuspend(crc->dev);
>
>         return 0;
>  }
>
> -static int stm32_crc_update(struct shash_desc *desc, const u8 *d8,
> -                           unsigned int length)
> +static int burst_update(struct shash_desc *desc, const u8 *d8,
> +                       size_t length)
>  {
>         struct stm32_crc_desc_ctx *ctx = shash_desc_ctx(desc);
>         struct stm32_crc_ctx *mctx = crypto_shash_ctx(desc->tfm);
>         struct stm32_crc *crc;
> +       unsigned long flags;
>
>         crc = stm32_crc_get_next_crc();
>         if (!crc)
> @@ -144,6 +155,8 @@ static int stm32_crc_update(struct shash_desc *desc, const u8 *d8,
>
>         pm_runtime_get_sync(crc->dev);
>
> +       spin_lock_irqsave(&crc->lock, flags);
> +
>         /*
>          * Restore previously calculated CRC for this context as init value
>          * Restore polynomial configuration
> @@ -182,12 +195,40 @@ static int stm32_crc_update(struct shash_desc *desc, const u8 *d8,
>         /* Store partial result */
>         ctx->partial = readl_relaxed(crc->regs + CRC_DR);
>
> +       spin_unlock_irqrestore(&crc->lock, flags);
> +
>         pm_runtime_mark_last_busy(crc->dev);
>         pm_runtime_put_autosuspend(crc->dev);
>
>         return 0;
>  }
>
> +static int stm32_crc_update(struct shash_desc *desc, const u8 *d8,
> +                           unsigned int length)
> +{
> +       const unsigned int burst_sz = burst_size;
> +       unsigned int rem_sz;
> +       const u8 *cur;
> +       size_t size;
> +       int ret;
> +
> +       if (!burst_sz)
> +               return burst_update(desc, d8, length);
> +
> +       /* Digest first bytes not 32bit aligned at first pass in the loop */
> +       size = min(length,
> +                  burst_sz + (unsigned int)d8 - ALIGN_DOWN((unsigned int)d8,
> +                                                           sizeof(u32)));
> +       for (rem_sz = length, cur = d8; rem_sz;
> +            rem_sz -= size, cur += size, size = min(rem_sz, burst_sz)) {
> +               ret = burst_update(desc, cur, size);
> +               if (ret)
> +                       return ret;
> +       }
> +
> +       return 0;
> +}
> +
>  static int stm32_crc_final(struct shash_desc *desc, u8 *out)
>  {
>         struct stm32_crc_desc_ctx *ctx = shash_desc_ctx(desc);
> @@ -300,6 +341,8 @@ static int stm32_crc_probe(struct platform_device *pdev)
>         pm_runtime_irq_safe(dev);
>         pm_runtime_enable(dev);
>
> +       spin_lock_init(&crc->lock);
> +
>         platform_set_drvdata(pdev, crc);
>
>         spin_lock(&crc_list.lock);
> --
> 2.17.1
>
Nicolas Toromanoff May 25, 2020, 7:24 a.m. UTC | #2
Hello,

> -----Original Message-----
> From: Ard Biesheuvel <ardb@kernel.org>
> Sent: Friday, May 22, 2020 6:12 PM> 
> On Tue, 12 May 2020 at 16:13, Nicolas Toromanoff
> <nicolas.toromanoff@st.com> wrote:
> >
> > Protect STM32 CRC device from concurrent accesses.
> >
> > As we create a spinlocked section that increase with buffer size, we
> > provide a module parameter to release the pressure by splitting
> > critical section in chunks.
> >
> > Size of each chunk is defined in burst_size module parameter.
> > By default burst_size=0, i.e. don't split incoming buffer.
> >
> > Signed-off-by: Nicolas Toromanoff <nicolas.toromanoff@st.com>
> 
> Would you mind explaining the usage model here? It looks like you are sharing a
> CRC hardware accelerator with a synchronous interface between different users
> by using spinlocks? You are aware that this will tie up the waiting CPUs
> completely during this time, right? So it would be much better to use a mutex
> here. Or perhaps it would make more sense to fall back to a s/w based CRC
> routine if the h/w is tied up working for another task?

I know mutex are more acceptable here, but shash _update() and _init() may be call 
from any context, and so I cannot take a mutex.
And to protect my concurrent HW access I only though about spinlock. Due to possible
constraint on CPUs, I add a burst_size option to force slitting long buffer into smaller one,
and so decrease time we take the lock.
But I didn't though to fallback to software CRC.

I'll do a patch on top.
In in the burst_update() function I'll use a spin_trylock_irqsave() and use software CRC32 if HW is already in use.

Thanks and regards,
Nicolas.

> Using spinlocks for this is really not acceptable.
> 
> 
> 
> > ---
> >  drivers/crypto/stm32/stm32-crc32.c | 47
> > ++++++++++++++++++++++++++++--
> >  1 file changed, 45 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/crypto/stm32/stm32-crc32.c
> > b/drivers/crypto/stm32/stm32-crc32.c
> > index 413415c216ef..3ba41148c2a4 100644
> > --- a/drivers/crypto/stm32/stm32-crc32.c
> > +++ b/drivers/crypto/stm32/stm32-crc32.c
> > @@ -35,11 +35,16 @@
> >
> >  #define CRC_AUTOSUSPEND_DELAY  50
> >
> > +static unsigned int burst_size;
> > +module_param(burst_size, uint, 0644); MODULE_PARM_DESC(burst_size,
> > +"Select burst byte size (0 unlimited)");
> > +
> >  struct stm32_crc {
> >         struct list_head list;
> >         struct device    *dev;
> >         void __iomem     *regs;
> >         struct clk       *clk;
> > +       spinlock_t       lock;
> >  };
> >
> >  struct stm32_crc_list {
> > @@ -109,6 +114,7 @@ static int stm32_crc_init(struct shash_desc *desc)
> >         struct stm32_crc_desc_ctx *ctx = shash_desc_ctx(desc);
> >         struct stm32_crc_ctx *mctx = crypto_shash_ctx(desc->tfm);
> >         struct stm32_crc *crc;
> > +       unsigned long flags;
> >
> >         crc = stm32_crc_get_next_crc();
> >         if (!crc)
> > @@ -116,6 +122,8 @@ static int stm32_crc_init(struct shash_desc *desc)
> >
> >         pm_runtime_get_sync(crc->dev);
> >
> > +       spin_lock_irqsave(&crc->lock, flags);
> > +
> >         /* Reset, set key, poly and configure in bit reverse mode */
> >         writel_relaxed(bitrev32(mctx->key), crc->regs + CRC_INIT);
> >         writel_relaxed(bitrev32(mctx->poly), crc->regs + CRC_POL); @@
> > -125,18 +133,21 @@ static int stm32_crc_init(struct shash_desc *desc)
> >         /* Store partial result */
> >         ctx->partial = readl_relaxed(crc->regs + CRC_DR);
> >
> > +       spin_unlock_irqrestore(&crc->lock, flags);
> > +
> >         pm_runtime_mark_last_busy(crc->dev);
> >         pm_runtime_put_autosuspend(crc->dev);
> >
> >         return 0;
> >  }
> >
> > -static int stm32_crc_update(struct shash_desc *desc, const u8 *d8,
> > -                           unsigned int length)
> > +static int burst_update(struct shash_desc *desc, const u8 *d8,
> > +                       size_t length)
> >  {
> >         struct stm32_crc_desc_ctx *ctx = shash_desc_ctx(desc);
> >         struct stm32_crc_ctx *mctx = crypto_shash_ctx(desc->tfm);
> >         struct stm32_crc *crc;
> > +       unsigned long flags;
> >
> >         crc = stm32_crc_get_next_crc();
> >         if (!crc)
> > @@ -144,6 +155,8 @@ static int stm32_crc_update(struct shash_desc
> > *desc, const u8 *d8,
> >
> >         pm_runtime_get_sync(crc->dev);
> >
> > +       spin_lock_irqsave(&crc->lock, flags);
> > +
> >         /*
> >          * Restore previously calculated CRC for this context as init value
> >          * Restore polynomial configuration @@ -182,12 +195,40 @@
> > static int stm32_crc_update(struct shash_desc *desc, const u8 *d8,
> >         /* Store partial result */
> >         ctx->partial = readl_relaxed(crc->regs + CRC_DR);
> >
> > +       spin_unlock_irqrestore(&crc->lock, flags);
> > +
> >         pm_runtime_mark_last_busy(crc->dev);
> >         pm_runtime_put_autosuspend(crc->dev);
> >
> >         return 0;
> >  }
> >
> > +static int stm32_crc_update(struct shash_desc *desc, const u8 *d8,
> > +                           unsigned int length) {
> > +       const unsigned int burst_sz = burst_size;
> > +       unsigned int rem_sz;
> > +       const u8 *cur;
> > +       size_t size;
> > +       int ret;
> > +
> > +       if (!burst_sz)
> > +               return burst_update(desc, d8, length);
> > +
> > +       /* Digest first bytes not 32bit aligned at first pass in the loop */
> > +       size = min(length,
> > +                  burst_sz + (unsigned int)d8 - ALIGN_DOWN((unsigned int)d8,
> > +                                                           sizeof(u32)));
> > +       for (rem_sz = length, cur = d8; rem_sz;
> > +            rem_sz -= size, cur += size, size = min(rem_sz, burst_sz)) {
> > +               ret = burst_update(desc, cur, size);
> > +               if (ret)
> > +                       return ret;
> > +       }
> > +
> > +       return 0;
> > +}
> > +
> >  static int stm32_crc_final(struct shash_desc *desc, u8 *out)  {
> >         struct stm32_crc_desc_ctx *ctx = shash_desc_ctx(desc); @@
> > -300,6 +341,8 @@ static int stm32_crc_probe(struct platform_device *pdev)
> >         pm_runtime_irq_safe(dev);
> >         pm_runtime_enable(dev);
> >
> > +       spin_lock_init(&crc->lock);
> > +
> >         platform_set_drvdata(pdev, crc);
> >
> >         spin_lock(&crc_list.lock);
> > --
> > 2.17.1
> >
Ard Biesheuvel May 25, 2020, 7:46 a.m. UTC | #3
On Mon, 25 May 2020 at 09:24, Nicolas TOROMANOFF
<nicolas.toromanoff@st.com> wrote:
>
> Hello,
>
> > -----Original Message-----
> > From: Ard Biesheuvel <ardb@kernel.org>
> > Sent: Friday, May 22, 2020 6:12 PM>
> > On Tue, 12 May 2020 at 16:13, Nicolas Toromanoff
> > <nicolas.toromanoff@st.com> wrote:
> > >
> > > Protect STM32 CRC device from concurrent accesses.
> > >
> > > As we create a spinlocked section that increase with buffer size, we
> > > provide a module parameter to release the pressure by splitting
> > > critical section in chunks.
> > >
> > > Size of each chunk is defined in burst_size module parameter.
> > > By default burst_size=0, i.e. don't split incoming buffer.
> > >
> > > Signed-off-by: Nicolas Toromanoff <nicolas.toromanoff@st.com>
> >
> > Would you mind explaining the usage model here? It looks like you are sharing a
> > CRC hardware accelerator with a synchronous interface between different users
> > by using spinlocks? You are aware that this will tie up the waiting CPUs
> > completely during this time, right? So it would be much better to use a mutex
> > here. Or perhaps it would make more sense to fall back to a s/w based CRC
> > routine if the h/w is tied up working for another task?
>
> I know mutex are more acceptable here, but shash _update() and _init() may be call
> from any context, and so I cannot take a mutex.
> And to protect my concurrent HW access I only though about spinlock. Due to possible
> constraint on CPUs, I add a burst_size option to force slitting long buffer into smaller one,
> and so decrease time we take the lock.
> But I didn't though to fallback to software CRC.
>
> I'll do a patch on top.
> In in the burst_update() function I'll use a spin_trylock_irqsave() and use software CRC32 if HW is already in use.
>

Right. I didn't even notice that you were keeping interrupts disabled
the whole time when using the h/w block. That means that any serious
use of this h/w block will make IRQ latency go through the roof.

I recommend that you go back to the drawing board on this driver,
rather than papering over the issues with a spin_trylock(). Perhaps it
would be better to model it as a ahash (even though the h/w block
itself is synchronous) and use a kthread to feed in the data.
Nicolas Toromanoff May 25, 2020, 9:01 a.m. UTC | #4
> -----Original Message-----
> From: Ard Biesheuvel <ardb@kernel.org>
> Sent: Monday, May 25, 2020 9:46 AM
> To: Nicolas TOROMANOFF <nicolas.toromanoff@st.com>
> Subject: Re: [PATCH 5/5] crypto: stm32/crc: protect from concurrent accesses
> 
> On Mon, 25 May 2020 at 09:24, Nicolas TOROMANOFF
> <nicolas.toromanoff@st.com> wrote:
> >
> > Hello,
> >
> > > -----Original Message-----
> > > From: Ard Biesheuvel <ardb@kernel.org>
> > > Sent: Friday, May 22, 2020 6:12 PM>
> > > On Tue, 12 May 2020 at 16:13, Nicolas Toromanoff
> > > <nicolas.toromanoff@st.com> wrote:
> > > >
> > > > Protect STM32 CRC device from concurrent accesses.
> > > >
> > > > As we create a spinlocked section that increase with buffer size,
> > > > we provide a module parameter to release the pressure by splitting
> > > > critical section in chunks.
> > > >
> > > > Size of each chunk is defined in burst_size module parameter.
> > > > By default burst_size=0, i.e. don't split incoming buffer.
> > > >
> > > > Signed-off-by: Nicolas Toromanoff <nicolas.toromanoff@st.com>
> > >
> > > Would you mind explaining the usage model here? It looks like you
> > > are sharing a CRC hardware accelerator with a synchronous interface
> > > between different users by using spinlocks? You are aware that this
> > > will tie up the waiting CPUs completely during this time, right? So
> > > it would be much better to use a mutex here. Or perhaps it would
> > > make more sense to fall back to a s/w based CRC routine if the h/w is tied up
> working for another task?
> >
> > I know mutex are more acceptable here, but shash _update() and _init()
> > may be call from any context, and so I cannot take a mutex.
> > And to protect my concurrent HW access I only though about spinlock.
> > Due to possible constraint on CPUs, I add a burst_size option to force
> > slitting long buffer into smaller one, and so decrease time we take the lock.
> > But I didn't though to fallback to software CRC.
> >
> > I'll do a patch on top.
> > In in the burst_update() function I'll use a spin_trylock_irqsave() and use
> software CRC32 if HW is already in use.
> >
> 
> Right. I didn't even notice that you were keeping interrupts disabled the whole
> time when using the h/w block. That means that any serious use of this h/w
> block will make IRQ latency go through the roof.
> 
> I recommend that you go back to the drawing board on this driver, rather than
> papering over the issues with a spin_trylock(). Perhaps it would be better to
> model it as a ahash (even though the h/w block itself is synchronous) and use a
> kthread to feed in the data.

I thought when I updated the driver to move to a ahash interface, but the main usage
of crc32 is the ext4 fs, that calls the shash API.
Commit 877b5691f27a ("crypto: shash - remove shash_desc::flags") removed possibility
to sleep in shash callback. (before this commit and with MAY_SLEEP option set, using 
a mutex may have been fine).

By now the solution I see is to use the spin_trylock_irqsave(), fallback to software crc *AND* capping burst_size
to ensure the locked section stay reasonable.

Does this seems acceptable ?

Nicolas.
Ard Biesheuvel May 25, 2020, 9:07 a.m. UTC | #5
(+ Eric)

On Mon, 25 May 2020 at 11:01, Nicolas TOROMANOFF
<nicolas.toromanoff@st.com> wrote:
>
> > -----Original Message-----
> > From: Ard Biesheuvel <ardb@kernel.org>
> > Sent: Monday, May 25, 2020 9:46 AM
> > To: Nicolas TOROMANOFF <nicolas.toromanoff@st.com>
> > Subject: Re: [PATCH 5/5] crypto: stm32/crc: protect from concurrent accesses
> >
> > On Mon, 25 May 2020 at 09:24, Nicolas TOROMANOFF
> > <nicolas.toromanoff@st.com> wrote:
> > >
> > > Hello,
> > >
> > > > -----Original Message-----
> > > > From: Ard Biesheuvel <ardb@kernel.org>
> > > > Sent: Friday, May 22, 2020 6:12 PM>
> > > > On Tue, 12 May 2020 at 16:13, Nicolas Toromanoff
> > > > <nicolas.toromanoff@st.com> wrote:
> > > > >
> > > > > Protect STM32 CRC device from concurrent accesses.
> > > > >
> > > > > As we create a spinlocked section that increase with buffer size,
> > > > > we provide a module parameter to release the pressure by splitting
> > > > > critical section in chunks.
> > > > >
> > > > > Size of each chunk is defined in burst_size module parameter.
> > > > > By default burst_size=0, i.e. don't split incoming buffer.
> > > > >
> > > > > Signed-off-by: Nicolas Toromanoff <nicolas.toromanoff@st.com>
> > > >
> > > > Would you mind explaining the usage model here? It looks like you
> > > > are sharing a CRC hardware accelerator with a synchronous interface
> > > > between different users by using spinlocks? You are aware that this
> > > > will tie up the waiting CPUs completely during this time, right? So
> > > > it would be much better to use a mutex here. Or perhaps it would
> > > > make more sense to fall back to a s/w based CRC routine if the h/w is tied up
> > working for another task?
> > >
> > > I know mutex are more acceptable here, but shash _update() and _init()
> > > may be call from any context, and so I cannot take a mutex.
> > > And to protect my concurrent HW access I only though about spinlock.
> > > Due to possible constraint on CPUs, I add a burst_size option to force
> > > slitting long buffer into smaller one, and so decrease time we take the lock.
> > > But I didn't though to fallback to software CRC.
> > >
> > > I'll do a patch on top.
> > > In in the burst_update() function I'll use a spin_trylock_irqsave() and use
> > software CRC32 if HW is already in use.
> > >
> >
> > Right. I didn't even notice that you were keeping interrupts disabled the whole
> > time when using the h/w block. That means that any serious use of this h/w
> > block will make IRQ latency go through the roof.
> >
> > I recommend that you go back to the drawing board on this driver, rather than
> > papering over the issues with a spin_trylock(). Perhaps it would be better to
> > model it as a ahash (even though the h/w block itself is synchronous) and use a
> > kthread to feed in the data.
>
> I thought when I updated the driver to move to a ahash interface, but the main usage
> of crc32 is the ext4 fs, that calls the shash API.
> Commit 877b5691f27a ("crypto: shash - remove shash_desc::flags") removed possibility
> to sleep in shash callback. (before this commit and with MAY_SLEEP option set, using
> a mutex may have been fine).
>

According to that commit's log, sleeping is never fine for shash(),
since it uses kmap_atomic() when iterating over the scatterlist.

> By now the solution I see is to use the spin_trylock_irqsave(), fallback to software crc *AND* capping burst_size
> to ensure the locked section stay reasonable.
>
> Does this seems acceptable ?
>

If the reason for disabling interrupts is to avoid deadlocks, wouldn't
the switch to trylock() with a software fallback allow us to keep
interrupts enabled?
Nicolas Toromanoff May 25, 2020, 11:49 a.m. UTC | #6
> -----Original Message-----
> From: Ard Biesheuvel <ardb@kernel.org>
> Sent: Monday, May 25, 2020 11:07 AM
> To: Nicolas TOROMANOFF <nicolas.toromanoff@st.com>; Eric Biggers
> <ebiggers@kernel.org>
> On Mon, 25 May 2020 at 11:01, Nicolas TOROMANOFF
> <nicolas.toromanoff@st.com> wrote:
> >
> > > -----Original Message-----
> > > From: Ard Biesheuvel <ardb@kernel.org>
> > > Sent: Monday, May 25, 2020 9:46 AM
> > > To: Nicolas TOROMANOFF <nicolas.toromanoff@st.com>
> > > Subject: Re: [PATCH 5/5] crypto: stm32/crc: protect from concurrent
> > > accesses
> > >
> > > On Mon, 25 May 2020 at 09:24, Nicolas TOROMANOFF
> > > <nicolas.toromanoff@st.com> wrote:
> > > >
> > > > Hello,
> > > >
> > > > > -----Original Message-----
> > > > > From: Ard Biesheuvel <ardb@kernel.org>
> > > > > Sent: Friday, May 22, 2020 6:12 PM> On Tue, 12 May 2020 at
> > > > > 16:13, Nicolas Toromanoff <nicolas.toromanoff@st.com> wrote:
> > > > > >
> > > > > > Protect STM32 CRC device from concurrent accesses.
> > > > > >
> > > > > > As we create a spinlocked section that increase with buffer
> > > > > > size, we provide a module parameter to release the pressure by
> > > > > > splitting critical section in chunks.
> > > > > >
> > > > > > Size of each chunk is defined in burst_size module parameter.
> > > > > > By default burst_size=0, i.e. don't split incoming buffer.
> > > > > >
> > > > > > Signed-off-by: Nicolas Toromanoff <nicolas.toromanoff@st.com>
> > > > >
> > > > > Would you mind explaining the usage model here? It looks like
> > > > > you are sharing a CRC hardware accelerator with a synchronous
> > > > > interface between different users by using spinlocks? You are
> > > > > aware that this will tie up the waiting CPUs completely during
> > > > > this time, right? So it would be much better to use a mutex
> > > > > here. Or perhaps it would make more sense to fall back to a s/w
> > > > > based CRC routine if the h/w is tied up
> > > working for another task?
> > > >
> > > > I know mutex are more acceptable here, but shash _update() and
> > > > _init() may be call from any context, and so I cannot take a mutex.
> > > > And to protect my concurrent HW access I only though about spinlock.
> > > > Due to possible constraint on CPUs, I add a burst_size option to
> > > > force slitting long buffer into smaller one, and so decrease time we take
> the lock.
> > > > But I didn't though to fallback to software CRC.
> > > >
> > > > I'll do a patch on top.
> > > > In in the burst_update() function I'll use a
> > > > spin_trylock_irqsave() and use
> > > software CRC32 if HW is already in use.
> > > >
> > >
> > > Right. I didn't even notice that you were keeping interrupts
> > > disabled the whole time when using the h/w block. That means that
> > > any serious use of this h/w block will make IRQ latency go through the roof.
> > >
> > > I recommend that you go back to the drawing board on this driver,
> > > rather than papering over the issues with a spin_trylock(). Perhaps
> > > it would be better to model it as a ahash (even though the h/w block
> > > itself is synchronous) and use a kthread to feed in the data.
> >
> > I thought when I updated the driver to move to a ahash interface, but
> > the main usage of crc32 is the ext4 fs, that calls the shash API.
> > Commit 877b5691f27a ("crypto: shash - remove shash_desc::flags")
> > removed possibility to sleep in shash callback. (before this commit
> > and with MAY_SLEEP option set, using a mutex may have been fine).
> >
> 
> According to that commit's log, sleeping is never fine for shash(), since it uses
> kmap_atomic() when iterating over the scatterlist.

Today, we could avoid using kmap_atomic() in shash_ashash_*() APIs (the
ones that Walks through the scatterlist) by using the
crypto_ahash_walk_first() function to initialize the shash_ahash walker
(note that this function is never call in current kernel source [to remove ?]).
Then shash_ahash_*() functions will call ahash_*() function that use kmap()
if (walk->flags & CRYPTO_ALG_ASYNC) [flag set by crypto_ahash_walk_first()]
The last kmap_atomic() will be in the shash_ahash_digest() call in the
optimize branch (that should be replaced by the no atomic one)

I didn't investigate more this way, because I didn't check the drawback of
using kmap() instead of kmap_atomic(), I wanted to avoid modifying behavior
of other drivers and using a function never use elsewhere in kernel scarred
me ;-).
If these updates correct visible bugs in the ahash usage of the stm32-crc32
code [no more "sleep while atomic" traces even with mutex in tests], 
Documentation states that shash API could be called from any context,
I cannot add mutex in them.

> > By now the solution I see is to use the spin_trylock_irqsave(),
> > fallback to software crc *AND* capping burst_size to ensure the locked
> section stay reasonable.
> >
> > Does this seems acceptable ?
> >
> 
> If the reason for disabling interrupts is to avoid deadlocks, wouldn't the switch
> to trylock() with a software fallback allow us to keep interrupts enabled?

Right, with the trylock, I don't see why we may need to mask interrupts.
Ard Biesheuvel May 25, 2020, 12:01 p.m. UTC | #7
On Mon, 25 May 2020 at 13:49, Nicolas TOROMANOFF
<nicolas.toromanoff@st.com> wrote:
>
> > -----Original Message-----
> > From: Ard Biesheuvel <ardb@kernel.org>
> > Sent: Monday, May 25, 2020 11:07 AM
> > To: Nicolas TOROMANOFF <nicolas.toromanoff@st.com>; Eric Biggers
> > <ebiggers@kernel.org>
> > On Mon, 25 May 2020 at 11:01, Nicolas TOROMANOFF
> > <nicolas.toromanoff@st.com> wrote:
> > >
> > > > -----Original Message-----
> > > > From: Ard Biesheuvel <ardb@kernel.org>
> > > > Sent: Monday, May 25, 2020 9:46 AM
> > > > To: Nicolas TOROMANOFF <nicolas.toromanoff@st.com>
> > > > Subject: Re: [PATCH 5/5] crypto: stm32/crc: protect from concurrent
> > > > accesses
> > > >
> > > > On Mon, 25 May 2020 at 09:24, Nicolas TOROMANOFF
> > > > <nicolas.toromanoff@st.com> wrote:
> > > > >
> > > > > Hello,
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Ard Biesheuvel <ardb@kernel.org>
> > > > > > Sent: Friday, May 22, 2020 6:12 PM> On Tue, 12 May 2020 at
> > > > > > 16:13, Nicolas Toromanoff <nicolas.toromanoff@st.com> wrote:
> > > > > > >
> > > > > > > Protect STM32 CRC device from concurrent accesses.
> > > > > > >
> > > > > > > As we create a spinlocked section that increase with buffer
> > > > > > > size, we provide a module parameter to release the pressure by
> > > > > > > splitting critical section in chunks.
> > > > > > >
> > > > > > > Size of each chunk is defined in burst_size module parameter.
> > > > > > > By default burst_size=0, i.e. don't split incoming buffer.
> > > > > > >
> > > > > > > Signed-off-by: Nicolas Toromanoff <nicolas.toromanoff@st.com>
> > > > > >
> > > > > > Would you mind explaining the usage model here? It looks like
> > > > > > you are sharing a CRC hardware accelerator with a synchronous
> > > > > > interface between different users by using spinlocks? You are
> > > > > > aware that this will tie up the waiting CPUs completely during
> > > > > > this time, right? So it would be much better to use a mutex
> > > > > > here. Or perhaps it would make more sense to fall back to a s/w
> > > > > > based CRC routine if the h/w is tied up
> > > > working for another task?
> > > > >
> > > > > I know mutex are more acceptable here, but shash _update() and
> > > > > _init() may be call from any context, and so I cannot take a mutex.
> > > > > And to protect my concurrent HW access I only though about spinlock.
> > > > > Due to possible constraint on CPUs, I add a burst_size option to
> > > > > force slitting long buffer into smaller one, and so decrease time we take
> > the lock.
> > > > > But I didn't though to fallback to software CRC.
> > > > >
> > > > > I'll do a patch on top.
> > > > > In in the burst_update() function I'll use a
> > > > > spin_trylock_irqsave() and use
> > > > software CRC32 if HW is already in use.
> > > > >
> > > >
> > > > Right. I didn't even notice that you were keeping interrupts
> > > > disabled the whole time when using the h/w block. That means that
> > > > any serious use of this h/w block will make IRQ latency go through the roof.
> > > >
> > > > I recommend that you go back to the drawing board on this driver,
> > > > rather than papering over the issues with a spin_trylock(). Perhaps
> > > > it would be better to model it as a ahash (even though the h/w block
> > > > itself is synchronous) and use a kthread to feed in the data.
> > >
> > > I thought when I updated the driver to move to a ahash interface, but
> > > the main usage of crc32 is the ext4 fs, that calls the shash API.
> > > Commit 877b5691f27a ("crypto: shash - remove shash_desc::flags")
> > > removed possibility to sleep in shash callback. (before this commit
> > > and with MAY_SLEEP option set, using a mutex may have been fine).
> > >
> >
> > According to that commit's log, sleeping is never fine for shash(), since it uses
> > kmap_atomic() when iterating over the scatterlist.
>
> Today, we could avoid using kmap_atomic() in shash_ashash_*() APIs (the
> ones that Walks through the scatterlist) by using the
> crypto_ahash_walk_first() function to initialize the shash_ahash walker
> (note that this function is never call in current kernel source [to remove ?]).
> Then shash_ahash_*() functions will call ahash_*() function that use kmap()
> if (walk->flags & CRYPTO_ALG_ASYNC) [flag set by crypto_ahash_walk_first()]
> The last kmap_atomic() will be in the shash_ahash_digest() call in the
> optimize branch (that should be replaced by the no atomic one)
>
> I didn't investigate more this way, because I didn't check the drawback of
> using kmap() instead of kmap_atomic(), I wanted to avoid modifying behavior
> of other drivers and using a function never use elsewhere in kernel scarred
> me ;-).
> If these updates correct visible bugs in the ahash usage of the stm32-crc32
> code [no more "sleep while atomic" traces even with mutex in tests],
> Documentation states that shash API could be called from any context,
> I cannot add mutex in them.
>
> > > By now the solution I see is to use the spin_trylock_irqsave(),
> > > fallback to software crc *AND* capping burst_size to ensure the locked
> > section stay reasonable.
> > >
> > > Does this seems acceptable ?
> > >
> >
> > If the reason for disabling interrupts is to avoid deadlocks, wouldn't the switch
> > to trylock() with a software fallback allow us to keep interrupts enabled?
>
> Right, with the trylock, I don't see why we may need to mask interrupts.
>
>

OK, then the fix should be easy.
diff mbox series

Patch

diff --git a/drivers/crypto/stm32/stm32-crc32.c b/drivers/crypto/stm32/stm32-crc32.c
index 413415c216ef..3ba41148c2a4 100644
--- a/drivers/crypto/stm32/stm32-crc32.c
+++ b/drivers/crypto/stm32/stm32-crc32.c
@@ -35,11 +35,16 @@ 
 
 #define CRC_AUTOSUSPEND_DELAY	50
 
+static unsigned int burst_size;
+module_param(burst_size, uint, 0644);
+MODULE_PARM_DESC(burst_size, "Select burst byte size (0 unlimited)");
+
 struct stm32_crc {
 	struct list_head list;
 	struct device    *dev;
 	void __iomem     *regs;
 	struct clk       *clk;
+	spinlock_t       lock;
 };
 
 struct stm32_crc_list {
@@ -109,6 +114,7 @@  static int stm32_crc_init(struct shash_desc *desc)
 	struct stm32_crc_desc_ctx *ctx = shash_desc_ctx(desc);
 	struct stm32_crc_ctx *mctx = crypto_shash_ctx(desc->tfm);
 	struct stm32_crc *crc;
+	unsigned long flags;
 
 	crc = stm32_crc_get_next_crc();
 	if (!crc)
@@ -116,6 +122,8 @@  static int stm32_crc_init(struct shash_desc *desc)
 
 	pm_runtime_get_sync(crc->dev);
 
+	spin_lock_irqsave(&crc->lock, flags);
+
 	/* Reset, set key, poly and configure in bit reverse mode */
 	writel_relaxed(bitrev32(mctx->key), crc->regs + CRC_INIT);
 	writel_relaxed(bitrev32(mctx->poly), crc->regs + CRC_POL);
@@ -125,18 +133,21 @@  static int stm32_crc_init(struct shash_desc *desc)
 	/* Store partial result */
 	ctx->partial = readl_relaxed(crc->regs + CRC_DR);
 
+	spin_unlock_irqrestore(&crc->lock, flags);
+
 	pm_runtime_mark_last_busy(crc->dev);
 	pm_runtime_put_autosuspend(crc->dev);
 
 	return 0;
 }
 
-static int stm32_crc_update(struct shash_desc *desc, const u8 *d8,
-			    unsigned int length)
+static int burst_update(struct shash_desc *desc, const u8 *d8,
+			size_t length)
 {
 	struct stm32_crc_desc_ctx *ctx = shash_desc_ctx(desc);
 	struct stm32_crc_ctx *mctx = crypto_shash_ctx(desc->tfm);
 	struct stm32_crc *crc;
+	unsigned long flags;
 
 	crc = stm32_crc_get_next_crc();
 	if (!crc)
@@ -144,6 +155,8 @@  static int stm32_crc_update(struct shash_desc *desc, const u8 *d8,
 
 	pm_runtime_get_sync(crc->dev);
 
+	spin_lock_irqsave(&crc->lock, flags);
+
 	/*
 	 * Restore previously calculated CRC for this context as init value
 	 * Restore polynomial configuration
@@ -182,12 +195,40 @@  static int stm32_crc_update(struct shash_desc *desc, const u8 *d8,
 	/* Store partial result */
 	ctx->partial = readl_relaxed(crc->regs + CRC_DR);
 
+	spin_unlock_irqrestore(&crc->lock, flags);
+
 	pm_runtime_mark_last_busy(crc->dev);
 	pm_runtime_put_autosuspend(crc->dev);
 
 	return 0;
 }
 
+static int stm32_crc_update(struct shash_desc *desc, const u8 *d8,
+			    unsigned int length)
+{
+	const unsigned int burst_sz = burst_size;
+	unsigned int rem_sz;
+	const u8 *cur;
+	size_t size;
+	int ret;
+
+	if (!burst_sz)
+		return burst_update(desc, d8, length);
+
+	/* Digest first bytes not 32bit aligned at first pass in the loop */
+	size = min(length,
+		   burst_sz + (unsigned int)d8 - ALIGN_DOWN((unsigned int)d8,
+							    sizeof(u32)));
+	for (rem_sz = length, cur = d8; rem_sz;
+	     rem_sz -= size, cur += size, size = min(rem_sz, burst_sz)) {
+		ret = burst_update(desc, cur, size);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 static int stm32_crc_final(struct shash_desc *desc, u8 *out)
 {
 	struct stm32_crc_desc_ctx *ctx = shash_desc_ctx(desc);
@@ -300,6 +341,8 @@  static int stm32_crc_probe(struct platform_device *pdev)
 	pm_runtime_irq_safe(dev);
 	pm_runtime_enable(dev);
 
+	spin_lock_init(&crc->lock);
+
 	platform_set_drvdata(pdev, crc);
 
 	spin_lock(&crc_list.lock);