Message ID | 1464771389-10640-3-git-send-email-t-kristo@ti.com (mailing list archive) |
---|---|
State | New, archived |
On 06/01/2016 11:56 AM, Tero Kristo wrote:
> From: Lokesh Vutla <lokeshvutla@ti.com>
>
> Calling the runtime PM API for every block causes a serious performance hit
> to crypto operations that are done on a long buffer.
> As crypto is performed on a page boundary, encrypting large buffers can
> cause a series of crypto operations divided by page, and the runtime PM API
> is called that many times as well.
>
> We call pm_runtime_get_sync() only at the beginning of the session
> (cra_init) and pm_runtime_put() at the end. This results in up to a 50%
> speedup. It does not make the driver keep the system awake, as runtime
> get/put is only called during a crypto session, which usually completes
> quickly.
>
> Signed-off-by: Lokesh Vutla <lokeshvutla@ti.com>
> Signed-off-by: Tero Kristo <t-kristo@ti.com>
> ---
>  drivers/crypto/omap-sham.c | 27 +++++++++++++++++----------
>  1 file changed, 17 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/crypto/omap-sham.c b/drivers/crypto/omap-sham.c
> index 6eefaa2..bd0258f 100644
> --- a/drivers/crypto/omap-sham.c
> +++ b/drivers/crypto/omap-sham.c
> @@ -360,14 +360,6 @@ static void omap_sham_copy_ready_hash(struct ahash_request *req)
>
>  static int omap_sham_hw_init(struct omap_sham_dev *dd)
>  {
> -        int err;
> -
> -        err = pm_runtime_get_sync(dd->dev);
> -        if (err < 0) {
> -                dev_err(dd->dev, "failed to get sync: %d\n", err);
> -                return err;
> -        }
> -
>          if (!test_bit(FLAGS_INIT, &dd->flags)) {
>                  set_bit(FLAGS_INIT, &dd->flags);
>                  dd->err = 0;
> @@ -999,8 +991,6 @@ static void omap_sham_finish_req(struct ahash_request *req, int err)
>          dd->flags &= ~(BIT(FLAGS_BUSY) | BIT(FLAGS_FINAL) | BIT(FLAGS_CPU) |
>                          BIT(FLAGS_DMA_READY) | BIT(FLAGS_OUTPUT_READY));
>
> -        pm_runtime_put(dd->dev);
> -
>          if (req->base.complete)
>                  req->base.complete(&req->base, err);
>
> @@ -1239,6 +1229,7 @@ static int omap_sham_cra_init_alg(struct crypto_tfm *tfm, const char *alg_base)
>  {
>          struct omap_sham_ctx *tctx = crypto_tfm_ctx(tfm);
>          const char *alg_name = crypto_tfm_alg_name(tfm);
> +        struct omap_sham_dev *dd;
>
>          /* Allocate a fallback and abort if it failed. */
>          tctx->fallback = crypto_alloc_shash(alg_name, 0,
> @@ -1266,6 +1257,13 @@ static int omap_sham_cra_init_alg(struct crypto_tfm *tfm, const char *alg_base)
>
>          }
>
> +        spin_lock_bh(&sham.lock);
> +        list_for_each_entry(dd, &sham.dev_list, list) {
> +                break;
> +        }
> +        spin_unlock_bh(&sham.lock);
> +
> +        pm_runtime_get_sync(dd->dev);
>          return 0;
>  }
>
> @@ -1307,6 +1305,7 @@ static int omap_sham_cra_sha512_init(struct crypto_tfm *tfm)
>  static void omap_sham_cra_exit(struct crypto_tfm *tfm)
>  {
>          struct omap_sham_ctx *tctx = crypto_tfm_ctx(tfm);
> +        struct omap_sham_dev *dd;
>
>          crypto_free_shash(tctx->fallback);
>          tctx->fallback = NULL;
> @@ -1315,6 +1314,14 @@ static void omap_sham_cra_exit(struct crypto_tfm *tfm)
>                  struct omap_sham_hmac_ctx *bctx = tctx->base;
>                  crypto_free_shash(bctx->shash);
>          }
> +
> +        spin_lock_bh(&sham.lock);
> +        list_for_each_entry(dd, &sham.dev_list, list) {
> +                break;
> +        }
> +        spin_unlock_bh(&sham.lock);
> +
> +        pm_runtime_get_sync(dd->dev);

Maybe this should be pm_runtime_put()?

>  }
>
>  static struct ahash_alg algs_sha1_md5[] = {
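Grygorii's point at the end is that the new tail of omap_sham_cra_exit() takes another runtime PM reference instead of dropping the one taken in cra_init(), so the device usage count can never fall back to zero and the IP would never be allowed to suspend again. Presumably the intended shape is something like the sketch below; this is a guess at the intent, not a posted fix, and the fallback/HMAC teardown above the lookup is elided:

```c
static void omap_sham_cra_exit(struct crypto_tfm *tfm)
{
        struct omap_sham_dev *dd;

        /* ... free the fallback and HMAC shash exactly as in the posted hunk ... */

        /* Same single-device lookup that cra_init() uses. */
        spin_lock_bh(&sham.lock);
        list_for_each_entry(dd, &sham.dev_list, list)
                break;
        spin_unlock_bh(&sham.lock);

        /* Drop, not take: balance the pm_runtime_get_sync() from cra_init(). */
        pm_runtime_put(dd->dev);
}
```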
On 06/01/2016 04:53 AM, Grygorii Strashko wrote:
> On 06/01/2016 11:56 AM, Tero Kristo wrote:
>> From: Lokesh Vutla <lokeshvutla@ti.com>
>>
>> Calling the runtime PM API for every block causes a serious performance hit
>> to crypto operations that are done on a long buffer.
>> [...]
>>
>> @@ -360,14 +360,6 @@ static void omap_sham_copy_ready_hash(struct ahash_request *req)
>>
>>  static int omap_sham_hw_init(struct omap_sham_dev *dd)
>>  {
>> -        int err;
>> -
>> -        err = pm_runtime_get_sync(dd->dev);
>> -        if (err < 0) {
>> -                dev_err(dd->dev, "failed to get sync: %d\n", err);
>> -                return err;
>> -        }
>> -

Would it be worth investigating a pm_runtime autosuspend approach rather than
knocking runtime PM out here completely? I am not clear whether the overhead
comes from the pm_runtime calls themselves or from actually idling the IP, but
if it is the idling of the IP that causes the slowdown, then with a large
enough autosuspend_delay we would not actually sleep between blocks, yet after
a long enough idle period we would still suspend.

Regards,
Dave

> [...]
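For context, the autosuspend approach Dave describes is configured once at probe time; after that, "put" calls become lazy and the device only really suspends after a quiet period. A minimal sketch, in which the helper name and the 2000 ms delay are illustrative assumptions rather than anything taken from this patch:

```c
#include <linux/device.h>
#include <linux/pm_runtime.h>

/* Illustrative delay only; a real value would come from benchmarking. */
#define SHAM_AUTOSUSPEND_DELAY_MS	2000

/*
 * Called once from probe: requests may still drop their reference after
 * every block, but the IP only suspends once the delay expires with no
 * new activity, so back-to-back blocks never pay the resume cost.
 */
static void sham_runtime_pm_setup(struct device *dev)
{
        pm_runtime_use_autosuspend(dev);
        pm_runtime_set_autosuspend_delay(dev, SHAM_AUTOSUSPEND_DELAY_MS);
        pm_runtime_enable(dev);
}
```

With that in place, a pm_runtime_get_sync() on an already-active device reduces to a cheap usage-count update, which is exactly the distinction Dave is drawing between the cost of the pm_runtime calls themselves and the cost of idling the IP.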
On Wed, Jun 01, 2016 at 06:03:52PM -0500, Dave Gerlach wrote:
> On 06/01/2016 04:53 AM, Grygorii Strashko wrote:
>> On 06/01/2016 11:56 AM, Tero Kristo wrote:
>>> From: Lokesh Vutla <lokeshvutla@ti.com>
>>> [...]
>
> Would it be worth investigating a pm_runtime autosuspend approach rather
> than knocking runtime PM out here completely? I am not clear whether the
> overhead comes from the pm_runtime calls themselves or from actually idling
> the IP, but if it is the idling of the IP that causes the slowdown, then
> with a large enough autosuspend_delay we would not actually sleep between
> blocks, yet after a long enough idle period we would still suspend.

Indeed, I think this patch is bogus. cra_init is associated with the tfm
object, which is usually long-lived, so doing power management there makes no
sense.

Cheers,
On 07/06/16 13:08, Herbert Xu wrote:
> On Wed, Jun 01, 2016 at 06:03:52PM -0500, Dave Gerlach wrote:
>> Would it be worth investigating a pm_runtime autosuspend approach rather
>> than knocking runtime PM out here completely?
>> [...]
>
> Indeed, I think this patch is bogus. cra_init is associated with the tfm
> object, which is usually long-lived, so doing power management there makes
> no sense.
>
> Cheers,

I can investigate this further, but I believe this patch by itself gave a
noticeable performance boost.

This is an optimization anyway, and not critical for functionality.

-Tero
On 06/07/2016 02:52 PM, Tero Kristo wrote:
> On 07/06/16 13:08, Herbert Xu wrote:
>> Indeed, I think this patch is bogus. cra_init is associated with the tfm
>> object, which is usually long-lived, so doing power management there makes
>> no sense.
>
> I can investigate this further, but I believe this patch by itself gave a
> noticeable performance boost.
>
> This is an optimization anyway, and not critical for functionality.

It is only non-critical if the code below does not introduce races:

+        spin_lock_bh(&sham.lock);
+        list_for_each_entry(dd, &sham.dev_list, list) {
+                break;
+        }
+        spin_unlock_bh(&sham.lock);
+
+        pm_runtime_get_sync(dd->dev);

Is it guaranteed that dd will always be alive at this point?
On 07/06/16 15:24, Grygorii Strashko wrote:
> On 06/07/2016 02:52 PM, Tero Kristo wrote:
>> [...]
>>
>> This is an optimization anyway, and not critical for functionality.
>
> It is only non-critical if the code below does not introduce races:

I don't get your point here. This patch is an optimization, and the driver
works fine without it.

> +        spin_lock_bh(&sham.lock);
> +        list_for_each_entry(dd, &sham.dev_list, list) {
> +                break;
> +        }
> +        spin_unlock_bh(&sham.lock);
> +
> +        pm_runtime_get_sync(dd->dev);
>
> Is it guaranteed that dd will always be alive at this point?

Typically yes, but I think there might be a race condition here if the driver
is removed during operation.

Anyway, I'll drop this patch and change the optimization to use autosuspend,
as Dave suggested; that gives almost the same performance boost as this one
(I lose a couple of percent of overall performance, but I can live with that).

-Tero
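For reference, a rough sketch of how the request path could look with autosuspend, assuming the per-request get/put that this patch removed is restored and only the put side becomes lazy. The helper below is hypothetical, and this is not the actual follow-up patch:

```c
static int omap_sham_hw_init(struct omap_sham_dev *dd)
{
        int err;

        /* Restored from the code this patch removed: power up per request. */
        err = pm_runtime_get_sync(dd->dev);
        if (err < 0) {
                dev_err(dd->dev, "failed to get sync: %d\n", err);
                return err;
        }

        if (!test_bit(FLAGS_INIT, &dd->flags)) {
                set_bit(FLAGS_INIT, &dd->flags);
                dd->err = 0;
        }

        /* ... rest of hw_init as in the driver ... */
        return 0;
}

/* Hypothetical helper, called from omap_sham_finish_req() in place of the
 * plain pm_runtime_put(dd->dev) that this patch deleted. */
static void omap_sham_runtime_put_lazy(struct omap_sham_dev *dd)
{
        /* The IP stays powered across back-to-back blocks and only idles once
         * the autosuspend delay expires with no further requests. */
        pm_runtime_mark_last_busy(dd->dev);
        pm_runtime_put_autosuspend(dd->dev);
}
```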
diff --git a/drivers/crypto/omap-sham.c b/drivers/crypto/omap-sham.c
index 6eefaa2..bd0258f 100644
--- a/drivers/crypto/omap-sham.c
+++ b/drivers/crypto/omap-sham.c
@@ -360,14 +360,6 @@ static void omap_sham_copy_ready_hash(struct ahash_request *req)
 
 static int omap_sham_hw_init(struct omap_sham_dev *dd)
 {
-        int err;
-
-        err = pm_runtime_get_sync(dd->dev);
-        if (err < 0) {
-                dev_err(dd->dev, "failed to get sync: %d\n", err);
-                return err;
-        }
-
         if (!test_bit(FLAGS_INIT, &dd->flags)) {
                 set_bit(FLAGS_INIT, &dd->flags);
                 dd->err = 0;
@@ -999,8 +991,6 @@ static void omap_sham_finish_req(struct ahash_request *req, int err)
         dd->flags &= ~(BIT(FLAGS_BUSY) | BIT(FLAGS_FINAL) | BIT(FLAGS_CPU) |
                         BIT(FLAGS_DMA_READY) | BIT(FLAGS_OUTPUT_READY));
 
-        pm_runtime_put(dd->dev);
-
         if (req->base.complete)
                 req->base.complete(&req->base, err);
 
@@ -1239,6 +1229,7 @@ static int omap_sham_cra_init_alg(struct crypto_tfm *tfm, const char *alg_base)
 {
         struct omap_sham_ctx *tctx = crypto_tfm_ctx(tfm);
         const char *alg_name = crypto_tfm_alg_name(tfm);
+        struct omap_sham_dev *dd;
 
         /* Allocate a fallback and abort if it failed. */
         tctx->fallback = crypto_alloc_shash(alg_name, 0,
@@ -1266,6 +1257,13 @@ static int omap_sham_cra_init_alg(struct crypto_tfm *tfm, const char *alg_base)
 
         }
 
+        spin_lock_bh(&sham.lock);
+        list_for_each_entry(dd, &sham.dev_list, list) {
+                break;
+        }
+        spin_unlock_bh(&sham.lock);
+
+        pm_runtime_get_sync(dd->dev);
         return 0;
 }
 
@@ -1307,6 +1305,7 @@ static int omap_sham_cra_sha512_init(struct crypto_tfm *tfm)
 static void omap_sham_cra_exit(struct crypto_tfm *tfm)
 {
         struct omap_sham_ctx *tctx = crypto_tfm_ctx(tfm);
+        struct omap_sham_dev *dd;
 
         crypto_free_shash(tctx->fallback);
         tctx->fallback = NULL;
@@ -1315,6 +1314,14 @@ static void omap_sham_cra_exit(struct crypto_tfm *tfm)
                 struct omap_sham_hmac_ctx *bctx = tctx->base;
                 crypto_free_shash(bctx->shash);
         }
+
+        spin_lock_bh(&sham.lock);
+        list_for_each_entry(dd, &sham.dev_list, list) {
+                break;
+        }
+        spin_unlock_bh(&sham.lock);
+
+        pm_runtime_get_sync(dd->dev);
 }
 
 static struct ahash_alg algs_sha1_md5[] = {