Patchwork [2/3] crypto: exynos - Improve performance of PRNG

login
register
mail settings
Submitter Łukasz Stelmach
Date Dec. 5, 2017, 12:35 p.m.
Message ID <20171205123558.31087-3-l.stelmach@samsung.com>
Download mbox | patch
Permalink /patch/10092807/
State Superseded
Delegated to: Herbert Xu
Headers show

Comments

Łukasz Stelmach - Dec. 5, 2017, 12:35 p.m.
Use memcpy_fromio() instead of custom exynos_rng_copy_random() function
to retrieve generated numbers from the registers of PRNG.

Remove unnecessary invocation of cpu_relax().

Signed-off-by: Łukasz Stelmach <l.stelmach@samsung.com>
---
 drivers/crypto/exynos-rng.c | 36 +++++-------------------------------
 1 file changed, 5 insertions(+), 31 deletions(-)
Krzysztof Kozlowski - Dec. 5, 2017, 1:49 p.m.
On Tue, Dec 5, 2017 at 1:35 PM, Łukasz Stelmach <l.stelmach@samsung.com> wrote:
> Use memcpy_fromio() instead of custom exynos_rng_copy_random() function
> to retrieve generated numbers from the registers of PRNG.
>
> Remove unnecessary invocation of cpu_relax().
>
> Signed-off-by: Łukasz Stelmach <l.stelmach@samsung.com>
> ---
>  drivers/crypto/exynos-rng.c | 36 +++++-------------------------------
>  1 file changed, 5 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/crypto/exynos-rng.c b/drivers/crypto/exynos-rng.c
> index 894ef93ef5ec..002e9d2a83cc 100644
> --- a/drivers/crypto/exynos-rng.c
> +++ b/drivers/crypto/exynos-rng.c
> @@ -130,34 +130,6 @@ static int exynos_rng_set_seed(struct exynos_rng_dev *rng,
>  }
>
>  /*
> - * Read from output registers and put the data under 'dst' array,
> - * up to dlen bytes.
> - *
> - * Returns number of bytes actually stored in 'dst' (dlen
> - * or EXYNOS_RNG_SEED_SIZE).
> - */
> -static unsigned int exynos_rng_copy_random(struct exynos_rng_dev *rng,
> -                                          u8 *dst, unsigned int dlen)
> -{
> -       unsigned int cnt = 0;
> -       int i, j;
> -       u32 val;
> -
> -       for (j = 0; j < EXYNOS_RNG_SEED_REGS; j++) {
> -               val = exynos_rng_readl(rng, EXYNOS_RNG_OUT(j));
> -
> -               for (i = 0; i < 4; i++) {
> -                       dst[cnt] = val & 0xff;
> -                       val >>= 8;
> -                       if (++cnt >= dlen)
> -                               return cnt;
> -               }
> -       }
> -
> -       return cnt;
> -}
> -
> -/*
>   * Start the engine and poll for finish.  Then read from output registers
>   * filling the 'dst' buffer up to 'dlen' bytes or up to size of generated
>   * random data (EXYNOS_RNG_SEED_SIZE).
> @@ -171,6 +143,8 @@ static int exynos_rng_get_random(struct exynos_rng_dev *rng,
>  {
>         int retry = EXYNOS_RNG_WAIT_RETRIES;
>
> +       *read = min_t(size_t, dlen, EXYNOS_RNG_SEED_SIZE);
> +

Do not set *read on error path. Only on success. Although now it will
not matter but that is the expected behavior - if possible, do not
affect state outside of a block in case of error.

>         if (rng->type == EXYNOS_PRNG_TYPE4) {
>                 exynos_rng_writel(rng, EXYNOS_RNG_CONTROL_START,
>                                   EXYNOS_RNG_CONTROL);
> @@ -180,8 +154,8 @@ static int exynos_rng_get_random(struct exynos_rng_dev *rng,
>         }
>
>         while (!(exynos_rng_readl(rng,
> -                       EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) && --retry)
> -               cpu_relax();
> +                       EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) &&
> +              --retry);

It looks like unrelated change so split it into separate commit with
explanation why you are changing the common busy-loop pattern.
exynos_rng_readl() uses relaxed versions of readl() so I would expect
here cpu_relax().

Best regards,
Krzysztof
Stephan Mueller - Dec. 5, 2017, 1:54 p.m.
Am Dienstag, 5. Dezember 2017, 13:35:57 CET schrieb Łukasz Stelmach:

Hi Łukasz,

> Use memcpy_fromio() instead of custom exynos_rng_copy_random() function
> to retrieve generated numbers from the registers of PRNG.
> 
> Remove unnecessary invocation of cpu_relax().
> 
> Signed-off-by: Łukasz Stelmach <l.stelmach@samsung.com>
> ---
>  drivers/crypto/exynos-rng.c | 36 +++++-------------------------------
>  1 file changed, 5 insertions(+), 31 deletions(-)
> 
> diff --git a/drivers/crypto/exynos-rng.c b/drivers/crypto/exynos-rng.c
> index 894ef93ef5ec..002e9d2a83cc 100644
> --- a/drivers/crypto/exynos-rng.c
> +++ b/drivers/crypto/exynos-rng.c
> @@ -130,34 +130,6 @@ static int exynos_rng_set_seed(struct exynos_rng_dev
> *rng, }
> 
>  /*
> - * Read from output registers and put the data under 'dst' array,
> - * up to dlen bytes.
> - *
> - * Returns number of bytes actually stored in 'dst' (dlen
> - * or EXYNOS_RNG_SEED_SIZE).
> - */
> -static unsigned int exynos_rng_copy_random(struct exynos_rng_dev *rng,
> -					   u8 *dst, unsigned int dlen)
> -{
> -	unsigned int cnt = 0;
> -	int i, j;
> -	u32 val;
> -
> -	for (j = 0; j < EXYNOS_RNG_SEED_REGS; j++) {
> -		val = exynos_rng_readl(rng, EXYNOS_RNG_OUT(j));
> -
> -		for (i = 0; i < 4; i++) {
> -			dst[cnt] = val & 0xff;
> -			val >>= 8;
> -			if (++cnt >= dlen)
> -				return cnt;
> -		}
> -	}
> -
> -	return cnt;
> -}
> -
> -/*
>   * Start the engine and poll for finish.  Then read from output registers
>   * filling the 'dst' buffer up to 'dlen' bytes or up to size of generated
>   * random data (EXYNOS_RNG_SEED_SIZE).
> @@ -171,6 +143,8 @@ static int exynos_rng_get_random(struct exynos_rng_dev
> *rng, {
>  	int retry = EXYNOS_RNG_WAIT_RETRIES;
> 
> +	*read = min_t(size_t, dlen, EXYNOS_RNG_SEED_SIZE);
> +
>  	if (rng->type == EXYNOS_PRNG_TYPE4) {
>  		exynos_rng_writel(rng, EXYNOS_RNG_CONTROL_START,
>  				  EXYNOS_RNG_CONTROL);
> @@ -180,8 +154,8 @@ static int exynos_rng_get_random(struct exynos_rng_dev
> *rng, }
> 
>  	while (!(exynos_rng_readl(rng,
> -			EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) && --retry)
> -		cpu_relax();
> +			EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) &&
> +	       --retry);

Is this related to the patch?
> 
>  	if (!retry)
>  		return -ETIMEDOUT;
> @@ -189,7 +163,7 @@ static int exynos_rng_get_random(struct exynos_rng_dev
> *rng, /* Clear status bit */
>  	exynos_rng_writel(rng, EXYNOS_RNG_STATUS_RNG_DONE,
>  			  EXYNOS_RNG_STATUS);
> -	*read = exynos_rng_copy_random(rng, dst, dlen);
> +	memcpy_fromio(dst, rng->mem + EXYNOS_RNG_OUT_BASE, *read);
> 
>  	return 0;
>  }



Ciao
Stephan
Łukasz Stelmach - Dec. 5, 2017, 4:43 p.m.
It was <2017-12-05 wto 14:54>, when Stephan Mueller wrote:
> Am Dienstag, 5. Dezember 2017, 13:35:57 CET schrieb Łukasz Stelmach:
>
> Hi Łukasz,
>
>> Use memcpy_fromio() instead of custom exynos_rng_copy_random() function
>> to retrieve generated numbers from the registers of PRNG.
>> 
>> Remove unnecessary invocation of cpu_relax().
>> 
>> Signed-off-by: Łukasz Stelmach <l.stelmach@samsung.com>
>> ---
>>  drivers/crypto/exynos-rng.c | 36 +++++-------------------------------
>>  1 file changed, 5 insertions(+), 31 deletions(-)
>> 
>> diff --git a/drivers/crypto/exynos-rng.c b/drivers/crypto/exynos-rng.c
>> index 894ef93ef5ec..002e9d2a83cc 100644
>> --- a/drivers/crypto/exynos-rng.c
>> +++ b/drivers/crypto/exynos-rng.c

[...]

>> @@ -171,6 +143,8 @@ static int exynos_rng_get_random(struct exynos_rng_dev
>> *rng, {
>>  	int retry = EXYNOS_RNG_WAIT_RETRIES;
>> 
>> +	*read = min_t(size_t, dlen, EXYNOS_RNG_SEED_SIZE);
>> +
>>  	if (rng->type == EXYNOS_PRNG_TYPE4) {
>>  		exynos_rng_writel(rng, EXYNOS_RNG_CONTROL_START,
>>  				  EXYNOS_RNG_CONTROL);
>> @@ -180,8 +154,8 @@ static int exynos_rng_get_random(struct exynos_rng_dev
>> *rng, }
>> 
>>  	while (!(exynos_rng_readl(rng,
>> -			EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) && --retry)
>> -		cpu_relax();
>> +			EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) &&
>> +	       --retry);
SM>
SM> Is this related to the patch?

KK> It looks like unrelated change so split it into separate commit with
KK> explanation why you are changing the common busy-loop pattern.
KK> exynos_rng_readl() uses relaxed versions of readl() so I would expect
KK> here cpu_relax().

Yes. As far as I can tell this gives the major part of the performance
improvement brought by this patch.

The busy loop is not very busy. Every time I checked the loop (w/o
cpu_relax()) was executed twice (retry was 98) and the operation was
reliable. I don't see why do we need a memory barrier here. On the other
hand, I am not sure the whole exynos_rng_get_random() shouldn't be ran
under a mutex or a spinlock (I don't see anything like this in the upper
layers of the crypto framework).

The *_relaxed() I/O operations do not enforce memory 

Thank you for asking the questions. I will put the above explanations in
the commit message.

>> 
>>  	if (!retry)
>>  		return -ETIMEDOUT;
>> @@ -189,7 +163,7 @@ static int exynos_rng_get_random(struct exynos_rng_dev
>> *rng, /* Clear status bit */
>>  	exynos_rng_writel(rng, EXYNOS_RNG_STATUS_RNG_DONE,
>>  			  EXYNOS_RNG_STATUS);
>> -	*read = exynos_rng_copy_random(rng, dst, dlen);
>> +	memcpy_fromio(dst, rng->mem + EXYNOS_RNG_OUT_BASE, *read);
>> 
>>  	return 0;
>>  }

Kind regards,
Krzysztof Kozlowski - Dec. 5, 2017, 5:53 p.m.
On Tue, Dec 05, 2017 at 05:43:10PM +0100, Łukasz Stelmach wrote:
> It was <2017-12-05 wto 14:54>, when Stephan Mueller wrote:
> > Am Dienstag, 5. Dezember 2017, 13:35:57 CET schrieb Łukasz Stelmach:
> >
> > Hi Łukasz,
> >
> >> Use memcpy_fromio() instead of custom exynos_rng_copy_random() function
> >> to retrieve generated numbers from the registers of PRNG.
> >> 
> >> Remove unnecessary invocation of cpu_relax().
> >> 
> >> Signed-off-by: Łukasz Stelmach <l.stelmach@samsung.com>
> >> ---
> >>  drivers/crypto/exynos-rng.c | 36 +++++-------------------------------
> >>  1 file changed, 5 insertions(+), 31 deletions(-)
> >> 
> >> diff --git a/drivers/crypto/exynos-rng.c b/drivers/crypto/exynos-rng.c
> >> index 894ef93ef5ec..002e9d2a83cc 100644
> >> --- a/drivers/crypto/exynos-rng.c
> >> +++ b/drivers/crypto/exynos-rng.c
> 
> [...]
> 
> >> @@ -171,6 +143,8 @@ static int exynos_rng_get_random(struct exynos_rng_dev
> >> *rng, {
> >>  	int retry = EXYNOS_RNG_WAIT_RETRIES;
> >> 
> >> +	*read = min_t(size_t, dlen, EXYNOS_RNG_SEED_SIZE);
> >> +
> >>  	if (rng->type == EXYNOS_PRNG_TYPE4) {
> >>  		exynos_rng_writel(rng, EXYNOS_RNG_CONTROL_START,
> >>  				  EXYNOS_RNG_CONTROL);
> >> @@ -180,8 +154,8 @@ static int exynos_rng_get_random(struct exynos_rng_dev
> >> *rng, }
> >> 
> >>  	while (!(exynos_rng_readl(rng,
> >> -			EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) && --retry)
> >> -		cpu_relax();
> >> +			EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) &&
> >> +	       --retry);
> SM>
> SM> Is this related to the patch?
> 
> KK> It looks like unrelated change so split it into separate commit with
> KK> explanation why you are changing the common busy-loop pattern.
> KK> exynos_rng_readl() uses relaxed versions of readl() so I would expect
> KK> here cpu_relax().
> 
> Yes. As far as I can tell this gives the major part of the performance
> improvement brought by this patch.

In that case definitely split and explain... what and why you want to
achieve here.

> 
> The busy loop is not very busy. Every time I checked the loop (w/o
> cpu_relax()) was executed twice (retry was 98) and the operation was
> reliable. I don't see why do we need a memory barrier here. On the other
> hand, I am not sure the whole exynos_rng_get_random() shouldn't be ran
> under a mutex or a spinlock (I don't see anything like this in the upper
> layers of the crypto framework).
> 
> The *_relaxed() I/O operations do not enforce memory

The cpu_relax() is a common pattern for busy-loop. If you want to break
this pattern - please explain why only this part of kernel should not
follow it (and rest of kernel should).

The other part - this code is already using relaxed versions which might
get you into difficult to debug issues. You mentioned that loop works
reliable after removing the cpu_relax... yeah, it might for 99.999% but
that's not the argument. I remember few emails from Arnd Bergmann
mentioning explicitly to avoid using relaxed versions "just because",
unless it is necessary or really understood.

The code first writes to control register, then checks for status so you
should have these operations strictly ordered. Therefore I think
cpu_relax() should not be removed.

Best regards,
Krzysztof
Krzysztof Kozlowski - Dec. 5, 2017, 6:06 p.m.
On Tue, Dec 5, 2017 at 6:53 PM, Krzysztof Kozlowski <krzk@kernel.org> wrote:
> On Tue, Dec 05, 2017 at 05:43:10PM +0100, Łukasz Stelmach wrote:
>> It was <2017-12-05 wto 14:54>, when Stephan Mueller wrote:
>> > Am Dienstag, 5. Dezember 2017, 13:35:57 CET schrieb Łukasz Stelmach:
>> >
>> > Hi Łukasz,
>> >
>> >> Use memcpy_fromio() instead of custom exynos_rng_copy_random() function
>> >> to retrieve generated numbers from the registers of PRNG.
>> >>
>> >> Remove unnecessary invocation of cpu_relax().
>> >>
>> >> Signed-off-by: Łukasz Stelmach <l.stelmach@samsung.com>
>> >> ---
>> >>  drivers/crypto/exynos-rng.c | 36 +++++-------------------------------
>> >>  1 file changed, 5 insertions(+), 31 deletions(-)
>> >>
>> >> diff --git a/drivers/crypto/exynos-rng.c b/drivers/crypto/exynos-rng.c
>> >> index 894ef93ef5ec..002e9d2a83cc 100644
>> >> --- a/drivers/crypto/exynos-rng.c
>> >> +++ b/drivers/crypto/exynos-rng.c
>>
>> [...]
>>
>> >> @@ -171,6 +143,8 @@ static int exynos_rng_get_random(struct exynos_rng_dev
>> >> *rng, {
>> >>    int retry = EXYNOS_RNG_WAIT_RETRIES;
>> >>
>> >> +  *read = min_t(size_t, dlen, EXYNOS_RNG_SEED_SIZE);
>> >> +
>> >>    if (rng->type == EXYNOS_PRNG_TYPE4) {
>> >>            exynos_rng_writel(rng, EXYNOS_RNG_CONTROL_START,
>> >>                              EXYNOS_RNG_CONTROL);
>> >> @@ -180,8 +154,8 @@ static int exynos_rng_get_random(struct exynos_rng_dev
>> >> *rng, }
>> >>
>> >>    while (!(exynos_rng_readl(rng,
>> >> -                  EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) && --retry)
>> >> -          cpu_relax();
>> >> +                  EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) &&
>> >> +         --retry);
>> SM>
>> SM> Is this related to the patch?
>>
>> KK> It looks like unrelated change so split it into separate commit with
>> KK> explanation why you are changing the common busy-loop pattern.
>> KK> exynos_rng_readl() uses relaxed versions of readl() so I would expect
>> KK> here cpu_relax().
>>
>> Yes. As far as I can tell this gives the major part of the performance
>> improvement brought by this patch.
>
> In that case definitely split and explain... what and why you want to
> achieve here.
>
>>
>> The busy loop is not very busy. Every time I checked the loop (w/o
>> cpu_relax()) was executed twice (retry was 98) and the operation was
>> reliable. I don't see why do we need a memory barrier here. On the other
>> hand, I am not sure the whole exynos_rng_get_random() shouldn't be ran
>> under a mutex or a spinlock (I don't see anything like this in the upper
>> layers of the crypto framework).
>>
>> The *_relaxed() I/O operations do not enforce memory
>
> The cpu_relax() is a common pattern for busy-loop. If you want to break
> this pattern - please explain why only this part of kernel should not
> follow it (and rest of kernel should).
>
> The other part - this code is already using relaxed versions which might
> get you into difficult to debug issues. You mentioned that loop works
> reliable after removing the cpu_relax... yeah, it might for 99.999% but
> that's not the argument. I remember few emails from Arnd Bergmann
> mentioning explicitly to avoid using relaxed versions "just because",
> unless it is necessary or really understood.
>
> The code first writes to control register, then checks for status so you
> should have these operations strictly ordered. Therefore I think
> cpu_relax() should not be removed.

... or just convert it to readl_poll_timeout() because it makes code
more readable, takes care of timeout and you do not have care about
specific implementation (whether there should or should not be
cpu_relax).

Best regards,
Krzysztof
Łukasz Stelmach - Dec. 6, 2017, 11:32 a.m.
It was <2017-12-05 wto 19:06>, when Krzysztof Kozlowski wrote:
> On Tue, Dec 5, 2017 at 6:53 PM, Krzysztof Kozlowski <krzk@kernel.org> wrote:
>> On Tue, Dec 05, 2017 at 05:43:10PM +0100, Łukasz Stelmach wrote:
>>> It was <2017-12-05 wto 14:54>, when Stephan Mueller wrote:
>>>> Am Dienstag, 5. Dezember 2017, 13:35:57 CET schrieb Łukasz Stelmach:
>>>>> Use memcpy_fromio() instead of custom exynos_rng_copy_random() function
>>>>> to retrieve generated numbers from the registers of PRNG.
>>>>>
>>>>> Remove unnecessary invocation of cpu_relax().
>>>>>
>>>>> Signed-off-by: Łukasz Stelmach <l.stelmach@samsung.com>
>>>>> ---
>>>>>  drivers/crypto/exynos-rng.c | 36 +++++-------------------------------
>>>>>  1 file changed, 5 insertions(+), 31 deletions(-)
>>>>>
>>>>> diff --git a/drivers/crypto/exynos-rng.c b/drivers/crypto/exynos-rng.c
>>>>> index 894ef93ef5ec..002e9d2a83cc 100644
>>>>> --- a/drivers/crypto/exynos-rng.c
>>>>> +++ b/drivers/crypto/exynos-rng.c
>>>
>>> [...]
>>>
>>>>> @@ -171,6 +143,8 @@ static int exynos_rng_get_random(struct exynos_rng_dev
>>>>> *rng, {
>>>>>    int retry = EXYNOS_RNG_WAIT_RETRIES;
>>>>>
>>>>> +  *read = min_t(size_t, dlen, EXYNOS_RNG_SEED_SIZE);
>>>>> +
>>>>>    if (rng->type == EXYNOS_PRNG_TYPE4) {
>>>>>            exynos_rng_writel(rng, EXYNOS_RNG_CONTROL_START,
>>>>>                              EXYNOS_RNG_CONTROL);
>>>>> @@ -180,8 +154,8 @@ static int exynos_rng_get_random(struct exynos_rng_dev
>>>>> *rng, }
>>>>>
>>>>>    while (!(exynos_rng_readl(rng,
>>>>> -                  EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) && --retry)
>>>>> -          cpu_relax();
>>>>> +                  EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) &&
>>>>> +         --retry);

[...]

>>> The busy loop is not very busy. Every time I checked the loop (w/o
>>> cpu_relax()) was executed twice (retry was 98) and the operation was
>>> reliable. I don't see why do we need a memory barrier here. On the other
>>> hand, I am not sure the whole exynos_rng_get_random() shouldn't be ran
>>> under a mutex or a spinlock (I don't see anything like this in the upper
>>> layers of the crypto framework).
>>>
>>> The *_relaxed() I/O operations do not enforce memory
>>
>> The cpu_relax() is a common pattern for busy-loop. If you want to break
>> this pattern - please explain why only this part of kernel should not
>> follow it (and rest of kernel should).
>>
>> The other part - this code is already using relaxed versions which might
>> get you into difficult to debug issues. You mentioned that loop works
>> reliable after removing the cpu_relax... yeah, it might for 99.999% but
>> that's not the argument. I remember few emails from Arnd Bergmann
>> mentioning explicitly to avoid using relaxed versions "just because",
>> unless it is necessary or really understood.
>>
>> The code first writes to control register, then checks for status so you
>> should have these operations strictly ordered. Therefore I think
>> cpu_relax() should not be removed.
>
> ... or just convert it to readl_poll_timeout() because it makes code
> more readable, takes care of timeout and you do not have care about
> specific implementation (whether there should or should not be
> cpu_relax).

OK. This appears to perform reasonably.

	do {
		cpu_relax();
	} while (!(exynos_rng_readl(rng, EXYNOS_RNG_STATUS) &
		   EXYNOS_RNG_STATUS_RNG_DONE) && --retry);
Krzysztof Kozlowski - Dec. 6, 2017, 11:37 a.m.
On Wed, Dec 6, 2017 at 12:32 PM, Łukasz Stelmach <l.stelmach@samsung.com> wrote:
> It was <2017-12-05 wto 19:06>, when Krzysztof Kozlowski wrote:
>> On Tue, Dec 5, 2017 at 6:53 PM, Krzysztof Kozlowski <krzk@kernel.org> wrote:
>>> On Tue, Dec 05, 2017 at 05:43:10PM +0100, Łukasz Stelmach wrote:
>>>> It was <2017-12-05 wto 14:54>, when Stephan Mueller wrote:
>>>>> Am Dienstag, 5. Dezember 2017, 13:35:57 CET schrieb Łukasz Stelmach:
>>>>>> Use memcpy_fromio() instead of custom exynos_rng_copy_random() function
>>>>>> to retrieve generated numbers from the registers of PRNG.
>>>>>>
>>>>>> Remove unnecessary invocation of cpu_relax().
>>>>>>
>>>>>> Signed-off-by: Łukasz Stelmach <l.stelmach@samsung.com>
>>>>>> ---
>>>>>>  drivers/crypto/exynos-rng.c | 36 +++++-------------------------------
>>>>>>  1 file changed, 5 insertions(+), 31 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/crypto/exynos-rng.c b/drivers/crypto/exynos-rng.c
>>>>>> index 894ef93ef5ec..002e9d2a83cc 100644
>>>>>> --- a/drivers/crypto/exynos-rng.c
>>>>>> +++ b/drivers/crypto/exynos-rng.c
>>>>
>>>> [...]
>>>>
>>>>>> @@ -171,6 +143,8 @@ static int exynos_rng_get_random(struct exynos_rng_dev
>>>>>> *rng, {
>>>>>>    int retry = EXYNOS_RNG_WAIT_RETRIES;
>>>>>>
>>>>>> +  *read = min_t(size_t, dlen, EXYNOS_RNG_SEED_SIZE);
>>>>>> +
>>>>>>    if (rng->type == EXYNOS_PRNG_TYPE4) {
>>>>>>            exynos_rng_writel(rng, EXYNOS_RNG_CONTROL_START,
>>>>>>                              EXYNOS_RNG_CONTROL);
>>>>>> @@ -180,8 +154,8 @@ static int exynos_rng_get_random(struct exynos_rng_dev
>>>>>> *rng, }
>>>>>>
>>>>>>    while (!(exynos_rng_readl(rng,
>>>>>> -                  EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) && --retry)
>>>>>> -          cpu_relax();
>>>>>> +                  EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) &&
>>>>>> +         --retry);
>
> [...]
>
>>>> The busy loop is not very busy. Every time I checked the loop (w/o
>>>> cpu_relax()) was executed twice (retry was 98) and the operation was
>>>> reliable. I don't see why do we need a memory barrier here. On the other
>>>> hand, I am not sure the whole exynos_rng_get_random() shouldn't be ran
>>>> under a mutex or a spinlock (I don't see anything like this in the upper
>>>> layers of the crypto framework).
>>>>
>>>> The *_relaxed() I/O operations do not enforce memory
>>>
>>> The cpu_relax() is a common pattern for busy-loop. If you want to break
>>> this pattern - please explain why only this part of kernel should not
>>> follow it (and rest of kernel should).
>>>
>>> The other part - this code is already using relaxed versions which might
>>> get you into difficult to debug issues. You mentioned that loop works
>>> reliable after removing the cpu_relax... yeah, it might for 99.999% but
>>> that's not the argument. I remember few emails from Arnd Bergmann
>>> mentioning explicitly to avoid using relaxed versions "just because",
>>> unless it is necessary or really understood.
>>>
>>> The code first writes to control register, then checks for status so you
>>> should have these operations strictly ordered. Therefore I think
>>> cpu_relax() should not be removed.
>>
>> ... or just convert it to readl_poll_timeout() because it makes code
>> more readable, takes care of timeout and you do not have care about
>> specific implementation (whether there should or should not be
>> cpu_relax).
>
> OK. This appears to perform reasonably.
>
>         do {
>                 cpu_relax();
>         } while (!(exynos_rng_readl(rng, EXYNOS_RNG_STATUS) &
>                    EXYNOS_RNG_STATUS_RNG_DONE) && --retry);

You mean that:
  while (readl_relaxed()) cpu_relax();
is slower than
  do cpu_relax() while (readl_relaxed())
?

Hmm... Interesting. I would be happy to learn more about it why it
behaves so differently. Maybe the execution of cpu_relax() before
readl_relaxed() reduces the amount of loops to actually one read?

Indeed some parts of kernel code for ARM prefers this approach,
although still the most popular pattern is existing one (while()
cpu_relax).

Best regards,
Krzysztof
Łukasz Stelmach - Dec. 6, 2017, 1:06 p.m.
It was <2017-12-06 śro 12:37>, when Krzysztof Kozlowski wrote:
> On Wed, Dec 6, 2017 at 12:32 PM, Łukasz Stelmach <l.stelmach@samsung.com> wrote:
>> It was <2017-12-05 wto 19:06>, when Krzysztof Kozlowski wrote:
>>> On Tue, Dec 5, 2017 at 6:53 PM, Krzysztof Kozlowski <krzk@kernel.org> wrote:
>>>> On Tue, Dec 05, 2017 at 05:43:10PM +0100, Łukasz Stelmach wrote:
>>>>> It was <2017-12-05 wto 14:54>, when Stephan Mueller wrote:
>>>>>> Am Dienstag, 5. Dezember 2017, 13:35:57 CET schrieb Łukasz Stelmach:
>>>>>>> Use memcpy_fromio() instead of custom exynos_rng_copy_random() function
>>>>>>> to retrieve generated numbers from the registers of PRNG.
>>>>>>>
>>>>>>> Remove unnecessary invocation of cpu_relax().
>>>>>>>
>>>>>>> Signed-off-by: Łukasz Stelmach <l.stelmach@samsung.com>
>>>>>>> ---
>>>>>>>  drivers/crypto/exynos-rng.c | 36 +++++-------------------------------
>>>>>>>  1 file changed, 5 insertions(+), 31 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/crypto/exynos-rng.c b/drivers/crypto/exynos-rng.c
>>>>>>> index 894ef93ef5ec..002e9d2a83cc 100644
>>>>>>> --- a/drivers/crypto/exynos-rng.c
>>>>>>> +++ b/drivers/crypto/exynos-rng.c
>>>>>
>>>>> [...]
>>>>>
>>>>>>> @@ -171,6 +143,8 @@ static int exynos_rng_get_random(struct exynos_rng_dev
>>>>>>> *rng, {
>>>>>>>    int retry = EXYNOS_RNG_WAIT_RETRIES;
>>>>>>>
>>>>>>> +  *read = min_t(size_t, dlen, EXYNOS_RNG_SEED_SIZE);
>>>>>>> +
>>>>>>>    if (rng->type == EXYNOS_PRNG_TYPE4) {
>>>>>>>            exynos_rng_writel(rng, EXYNOS_RNG_CONTROL_START,
>>>>>>>                              EXYNOS_RNG_CONTROL);
>>>>>>> @@ -180,8 +154,8 @@ static int exynos_rng_get_random(struct exynos_rng_dev
>>>>>>> *rng, }
>>>>>>>
>>>>>>>    while (!(exynos_rng_readl(rng,
>>>>>>> -                  EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) && --retry)
>>>>>>> -          cpu_relax();
>>>>>>> +                  EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) &&
>>>>>>> +         --retry);
>>
>> [...]
>>
>>>>> The busy loop is not very busy. Every time I checked the loop (w/o
>>>>> cpu_relax()) was executed twice (retry was 98) and the operation was
>>>>> reliable. I don't see why do we need a memory barrier here. On the other
>>>>> hand, I am not sure the whole exynos_rng_get_random() shouldn't be ran
>>>>> under a mutex or a spinlock (I don't see anything like this in the upper
>>>>> layers of the crypto framework).
>>>>>
>>>>> The *_relaxed() I/O operations do not enforce memory
>>>>
>>>> The cpu_relax() is a common pattern for busy-loop. If you want to break
>>>> this pattern - please explain why only this part of kernel should not
>>>> follow it (and rest of kernel should).
>>>>
>>>> The other part - this code is already using relaxed versions which might
>>>> get you into difficult to debug issues. You mentioned that loop works
>>>> reliable after removing the cpu_relax... yeah, it might for 99.999% but
>>>> that's not the argument. I remember few emails from Arnd Bergmann
>>>> mentioning explicitly to avoid using relaxed versions "just because",
>>>> unless it is necessary or really understood.
>>>>
>>>> The code first writes to control register, then checks for status so you
>>>> should have these operations strictly ordered. Therefore I think
>>>> cpu_relax() should not be removed.
>>>
>>> ... or just convert it to readl_poll_timeout() because it makes code
>>> more readable, takes care of timeout and you do not have care about
>>> specific implementation (whether there should or should not be
>>> cpu_relax).
>>
>> OK. This appears to perform reasonably.
>>
>>         do {
>>                 cpu_relax();
>>         } while (!(exynos_rng_readl(rng, EXYNOS_RNG_STATUS) &
>>                    EXYNOS_RNG_STATUS_RNG_DONE) && --retry);
>
> You mean that:
>   while (readl_relaxed()) cpu_relax();
> is slower than
>   do cpu_relax() while (readl_relaxed())
> ?
>
> Hmm... Interesting. I would be happy to learn more about it why it
> behaves so differently. Maybe the execution of cpu_relax() before
> readl_relaxed() reduces the amount of loops to actually one read?
>
> Indeed some parts of kernel code for ARM prefers this approach,
> although still the most popular pattern is existing one (while()
> cpu_relax).

Without cpu_relax() retry is decremented twice. So there are three
reads. It appears that a single call cpu_relax() gives the hardware
enough time to be ready when the first read occurs (retry is 99 after
the loop).

Patch

diff --git a/drivers/crypto/exynos-rng.c b/drivers/crypto/exynos-rng.c
index 894ef93ef5ec..002e9d2a83cc 100644
--- a/drivers/crypto/exynos-rng.c
+++ b/drivers/crypto/exynos-rng.c
@@ -130,34 +130,6 @@  static int exynos_rng_set_seed(struct exynos_rng_dev *rng,
 }
 
 /*
- * Read from output registers and put the data under 'dst' array,
- * up to dlen bytes.
- *
- * Returns number of bytes actually stored in 'dst' (dlen
- * or EXYNOS_RNG_SEED_SIZE).
- */
-static unsigned int exynos_rng_copy_random(struct exynos_rng_dev *rng,
-					   u8 *dst, unsigned int dlen)
-{
-	unsigned int cnt = 0;
-	int i, j;
-	u32 val;
-
-	for (j = 0; j < EXYNOS_RNG_SEED_REGS; j++) {
-		val = exynos_rng_readl(rng, EXYNOS_RNG_OUT(j));
-
-		for (i = 0; i < 4; i++) {
-			dst[cnt] = val & 0xff;
-			val >>= 8;
-			if (++cnt >= dlen)
-				return cnt;
-		}
-	}
-
-	return cnt;
-}
-
-/*
  * Start the engine and poll for finish.  Then read from output registers
  * filling the 'dst' buffer up to 'dlen' bytes or up to size of generated
  * random data (EXYNOS_RNG_SEED_SIZE).
@@ -171,6 +143,8 @@  static int exynos_rng_get_random(struct exynos_rng_dev *rng,
 {
 	int retry = EXYNOS_RNG_WAIT_RETRIES;
 
+	*read = min_t(size_t, dlen, EXYNOS_RNG_SEED_SIZE);
+
 	if (rng->type == EXYNOS_PRNG_TYPE4) {
 		exynos_rng_writel(rng, EXYNOS_RNG_CONTROL_START,
 				  EXYNOS_RNG_CONTROL);
@@ -180,8 +154,8 @@  static int exynos_rng_get_random(struct exynos_rng_dev *rng,
 	}
 
 	while (!(exynos_rng_readl(rng,
-			EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) && --retry)
-		cpu_relax();
+			EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) &&
+	       --retry);
 
 	if (!retry)
 		return -ETIMEDOUT;
@@ -189,7 +163,7 @@  static int exynos_rng_get_random(struct exynos_rng_dev *rng,
 	/* Clear status bit */
 	exynos_rng_writel(rng, EXYNOS_RNG_STATUS_RNG_DONE,
 			  EXYNOS_RNG_STATUS);
-	*read = exynos_rng_copy_random(rng, dst, dlen);
+	memcpy_fromio(dst, rng->mem + EXYNOS_RNG_OUT_BASE, *read);
 
 	return 0;
 }