diff mbox

[v2,2/4] crypto: exynos - Improve performance of PRNG

Message ID 20171211140623.7673-3-l.stelmach@samsung.com (mailing list archive)
State Not Applicable
Headers show

Commit Message

Lukasz Stelmach Dec. 11, 2017, 2:06 p.m. UTC
Cc: Marek Szyprowski <m.szyprowski@samsung.com>, Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>

Use memcpy_fromio() instead of custom exynos_rng_copy_random() function
to retrieve generated numbers from the registers of PRNG.

Rearrange the loop around cpu_relax(). In a loop with while() at the
beginning and the cpu_relax() removed the retry variable is decremented
twice (down to 98). This means, the status register is read three
times before the hardware is ready (which is very quick). Apparently,
cpu_relax() requires noticeable amount of time to execute, so it seems
better to call it for the first time before checking the status of
PRNG. The do {} while () loop decrements the retry variable only once,
which means, the time required for cpu_relax() is long enough to for
the PRNG to provide results. On the other hand, performance in this
arrangement isn't much worse than in a loop without cpu_relax().

Signed-off-by: Łukasz Stelmach <l.stelmach@samsung.com>
---
 drivers/crypto/exynos-rng.c | 36 +++++-------------------------------
 1 file changed, 5 insertions(+), 31 deletions(-)

Comments

Krzysztof Kozlowski Dec. 11, 2017, 2:54 p.m. UTC | #1
On Mon, Dec 11, 2017 at 3:06 PM, Łukasz Stelmach <l.stelmach@samsung.com> wrote:
> Cc: Marek Szyprowski <m.szyprowski@samsung.com>, Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>

This should not appear here.

>
> Use memcpy_fromio() instead of custom exynos_rng_copy_random() function
> to retrieve generated numbers from the registers of PRNG.
>
> Rearrange the loop around cpu_relax(). In a loop with while() at the
> beginning and the cpu_relax() removed the retry variable is decremented
> twice (down to 98).

I had troubles with understanding this sentence... and then I figured
out that you are referring to some case without cpu_relax(). I do not
see how it is relevant to this case. Compare the new code with old,
not with some imaginary case without barriers (thus maybe reordered?).

Your solution is strictly performance oriented so it would be nice to
see here the exact difference in numbers justifying the change. But
only the change for while() -> do-while(), not mixed with
memcpy_fromio.

> This means, the status register is read three
> times before the hardware is ready (which is very quick). Apparently,
> cpu_relax() requires noticeable amount of time to execute, so it seems
> better to call it for the first time before checking the status of
> PRNG. The do {} while () loop decrements the retry variable only once,
> which means, the time required for cpu_relax() is long enough to for
> the PRNG to provide results.

So basically you want to say that you removed one call to exynos_rng_read()?

> On the other hand, performance in this
> arrangement isn't much worse than in a loop without cpu_relax().

I think it is not relevant as cpu_relax() removal is not part of
current nor the new code.

Best regards,
Krzysztof

>
> Signed-off-by: Łukasz Stelmach <l.stelmach@samsung.com>
> ---
>  drivers/crypto/exynos-rng.c | 36 +++++-------------------------------
>  1 file changed, 5 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/crypto/exynos-rng.c b/drivers/crypto/exynos-rng.c
> index 2f554b82f49f..7d8f658480d3 100644
> --- a/drivers/crypto/exynos-rng.c
> +++ b/drivers/crypto/exynos-rng.c
> @@ -131,34 +131,6 @@ static int exynos_rng_set_seed(struct exynos_rng_dev *rng,
>  }
>
>  /*
> - * Read from output registers and put the data under 'dst' array,
> - * up to dlen bytes.
> - *
> - * Returns number of bytes actually stored in 'dst' (dlen
> - * or EXYNOS_RNG_SEED_SIZE).
> - */
> -static unsigned int exynos_rng_copy_random(struct exynos_rng_dev *rng,
> -                                          u8 *dst, unsigned int dlen)
> -{
> -       unsigned int cnt = 0;
> -       int i, j;
> -       u32 val;
> -
> -       for (j = 0; j < EXYNOS_RNG_SEED_REGS; j++) {
> -               val = exynos_rng_readl(rng, EXYNOS_RNG_OUT(j));
> -
> -               for (i = 0; i < 4; i++) {
> -                       dst[cnt] = val & 0xff;
> -                       val >>= 8;
> -                       if (++cnt >= dlen)
> -                               return cnt;
> -               }
> -       }
> -
> -       return cnt;
> -}
> -
> -/*
>   * Start the engine and poll for finish.  Then read from output registers
>   * filling the 'dst' buffer up to 'dlen' bytes or up to size of generated
>   * random data (EXYNOS_RNG_SEED_SIZE).
> @@ -180,9 +152,10 @@ static int exynos_rng_get_random(struct exynos_rng_dev *rng,
>                                   EXYNOS_RNG_SEED_CONF);
>         }
>
> -       while (!(exynos_rng_readl(rng,
> -                       EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) && --retry)
> +       do {
>                 cpu_relax();
> +       } while (!(exynos_rng_readl(rng, EXYNOS_RNG_STATUS) &
> +                  EXYNOS_RNG_STATUS_RNG_DONE) && --retry);
>
>         if (!retry)
>                 return -ETIMEDOUT;
> @@ -190,7 +163,8 @@ static int exynos_rng_get_random(struct exynos_rng_dev *rng,
>         /* Clear status bit */
>         exynos_rng_writel(rng, EXYNOS_RNG_STATUS_RNG_DONE,
>                           EXYNOS_RNG_STATUS);
> -       *read = exynos_rng_copy_random(rng, dst, dlen);
> +       *read = min_t(size_t, dlen, EXYNOS_RNG_SEED_SIZE);
> +       memcpy_fromio(dst, rng->mem + EXYNOS_RNG_OUT_BASE, *read);
>
>         return 0;
>  }
> --
> 2.11.0
>
--
To unsubscribe from this list: send the line "unsubscribe linux-samsung-soc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Lukasz Stelmach Dec. 12, 2017, 2:49 p.m. UTC | #2
It was <2017-12-11 pon 15:54>, when Krzysztof Kozlowski wrote:
> On Mon, Dec 11, 2017 at 3:06 PM, Łukasz Stelmach <l.stelmach@samsung.com> wrote:
>> Cc: Marek Szyprowski <m.szyprowski@samsung.com>, Bartlomiej
>> Zolnierkiewicz <b.zolnierkie@samsung.com>
>
> This should not appear here.

A glitch in a scripted invocation of git-format-patch, fixed.

>> Use memcpy_fromio() instead of custom exynos_rng_copy_random() function
>> to retrieve generated numbers from the registers of PRNG.
>>
>> Rearrange the loop around cpu_relax(). In a loop with while() at the
>> beginning and the cpu_relax() removed the retry variable is decremented
>> twice (down to 98).
>
> I had troubles with understanding this sentence... and then I figured
> out that you are referring to some case without cpu_relax(). I do not
> see how it is relevant to this case. Compare the new code with old,
> not with some imaginary case without barriers (thus maybe reordered?).
>
> Your solution is strictly performance oriented so it would be nice to
> see here the exact difference in numbers justifying the change. But
> only the change for while() -> do-while(), not mixed with
> memcpy_fromio.

Apparently, after trhough tests, I must admit, the way the status
register is being polled is insignificant for the performance. I will
remove from the patch any changes in the loop.

It is the way, the random bytes are copied from the regiesteres, that
makes the difference (5.9 MB/s vs 7.1 MB/s)

Thank you very much for your assistance in reaching this conclusion.
diff mbox

Patch

diff --git a/drivers/crypto/exynos-rng.c b/drivers/crypto/exynos-rng.c
index 2f554b82f49f..7d8f658480d3 100644
--- a/drivers/crypto/exynos-rng.c
+++ b/drivers/crypto/exynos-rng.c
@@ -131,34 +131,6 @@  static int exynos_rng_set_seed(struct exynos_rng_dev *rng,
 }
 
 /*
- * Read from output registers and put the data under 'dst' array,
- * up to dlen bytes.
- *
- * Returns number of bytes actually stored in 'dst' (dlen
- * or EXYNOS_RNG_SEED_SIZE).
- */
-static unsigned int exynos_rng_copy_random(struct exynos_rng_dev *rng,
-					   u8 *dst, unsigned int dlen)
-{
-	unsigned int cnt = 0;
-	int i, j;
-	u32 val;
-
-	for (j = 0; j < EXYNOS_RNG_SEED_REGS; j++) {
-		val = exynos_rng_readl(rng, EXYNOS_RNG_OUT(j));
-
-		for (i = 0; i < 4; i++) {
-			dst[cnt] = val & 0xff;
-			val >>= 8;
-			if (++cnt >= dlen)
-				return cnt;
-		}
-	}
-
-	return cnt;
-}
-
-/*
  * Start the engine and poll for finish.  Then read from output registers
  * filling the 'dst' buffer up to 'dlen' bytes or up to size of generated
  * random data (EXYNOS_RNG_SEED_SIZE).
@@ -180,9 +152,10 @@  static int exynos_rng_get_random(struct exynos_rng_dev *rng,
 				  EXYNOS_RNG_SEED_CONF);
 	}
 
-	while (!(exynos_rng_readl(rng,
-			EXYNOS_RNG_STATUS) & EXYNOS_RNG_STATUS_RNG_DONE) && --retry)
+	do {
 		cpu_relax();
+	} while (!(exynos_rng_readl(rng, EXYNOS_RNG_STATUS) &
+		   EXYNOS_RNG_STATUS_RNG_DONE) && --retry);
 
 	if (!retry)
 		return -ETIMEDOUT;
@@ -190,7 +163,8 @@  static int exynos_rng_get_random(struct exynos_rng_dev *rng,
 	/* Clear status bit */
 	exynos_rng_writel(rng, EXYNOS_RNG_STATUS_RNG_DONE,
 			  EXYNOS_RNG_STATUS);
-	*read = exynos_rng_copy_random(rng, dst, dlen);
+	*read = min_t(size_t, dlen, EXYNOS_RNG_SEED_SIZE);
+	memcpy_fromio(dst, rng->mem + EXYNOS_RNG_OUT_BASE, *read);
 
 	return 0;
 }