mmc: mxs-mmc: add support for pre_req and post_req

Message ID 1303058010-30256-1-git-send-email-shawn.guo@linaro.org (mailing list archive)
State New, archived

Commit Message

Shawn Guo April 17, 2011, 4:33 p.m. UTC
pre_req() runs dma_map_sg() and post_req() runs dma_unmap_sg().
If pre_req() is not called before mxs_mmc_request(), request()
will prepare the cache just as it did before.
Using pre_req() and post_req() is optional.

Signed-off-by: Shawn Guo <shawn.guo@linaro.org>
---
 drivers/mmc/host/mxs-mmc.c |   75 ++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 72 insertions(+), 3 deletions(-)

Comments

Shawn Guo April 17, 2011, 4:48 p.m. UTC | #1
On Mon, Apr 18, 2011 at 12:33:30AM +0800, Shawn Guo wrote:
> pre_req() runs dma_map_sg() post_req() runs dma_unmap_sg.
> If not calling pre_req() before mxs_mmc_request(), request()
> will prepare the cache just like it did it before.
> It is optional to use pre_req() and post_req().
> 
> Signed-off-by: Shawn Guo <shawn.guo@linaro.org>
> ---
>  drivers/mmc/host/mxs-mmc.c |   75 ++++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 72 insertions(+), 3 deletions(-)
> 

Here are the results of mmc_test cases 37 ~ 40, which are designed to show
the performance improvement introduced by the non-blocking changes.

Honestly, the improvement is not that impressive.  I am not sure whether
the patch adding pre_req and post_req support to mxs-mmc was produced
correctly, so please help review it ...
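(For concreteness, from the logs below: the largest write transfer takes
10.72 s blocking versus 10.55 s non-blocking, roughly a 1.6% gain, and the
largest read takes 3.97 s versus 3.82 s, roughly 3.6%.)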

mmc0: Test case 37. Write performance with blocking req 4k to 4MB...
mmc0: Transfer of 32768 x 8 sectors (32768 x 4 KiB) took 76.370031249 seconds (1757 kB/s, 1716 KiB/s, 429.06 IOPS)
mmc0: Transfer of 16384 x 16 sectors (16384 x 8 KiB) took 34.951875000 seconds (3840 kB/s, 3750 KiB/s, 468.75 IOPS)
mmc0: Transfer of 8192 x 32 sectors (8192 x 16 KiB) took 19.097406250 seconds (7028 kB/s, 6863 KiB/s, 428.95 IOPS)
mmc0: Transfer of 4096 x 64 sectors (4096 x 32 KiB) took 14.393937500 seconds (9324 kB/s, 9106 KiB/s, 284.56 IOPS)
mmc0: Transfer of 2048 x 128 sectors (2048 x 64 KiB) took 12.519875000 seconds (10720 kB/s, 10469 KiB/s, 163.57 IOPS)
mmc0: Transfer of 1024 x 256 sectors (1024 x 128 KiB) took 11.535156250 seconds (11635 kB/s, 11362 KiB/s, 88.77 IOPS)
mmc0: Transfer of 512 x 512 sectors (512 x 256 KiB) took 11.165375000 seconds (12020 kB/s, 11739 KiB/s, 45.85 IOPS)
mmc0: Transfer of 256 x 1024 sectors (256 x 512 KiB) took 10.922375001 seconds (12288 kB/s, 12000 KiB/s, 23.43 IOPS)
mmc0: Transfer of 128 x 2048 sectors (128 x 1024 KiB) took 10.791811701 seconds (12436 kB/s, 12145 KiB/s, 11.86 IOPS)
mmc0: Transfer of 39 x 6630 sectors (39 x 3315 KiB) took 10.723858316 seconds (12345 kB/s, 12055 KiB/s, 3.63 IOPS)
mmc0: Result: OK
mmc0: Test case 38. Write performance with none blocking req 4k to 4MB...
mmc0: Transfer of 32768 x 8 sectors (32768 x 4 KiB) took 75.940425898 seconds (1767 kB/s, 1725 KiB/s, 431.49 IOPS)
mmc0: Transfer of 16384 x 16 sectors (16384 x 8 KiB) took 34.650031250 seconds (3873 kB/s, 3782 KiB/s, 472.84 IOPS)
mmc0: Transfer of 8192 x 32 sectors (8192 x 16 KiB) took 18.854781250 seconds (7118 kB/s, 6951 KiB/s, 434.47 IOPS)
mmc0: Transfer of 4096 x 64 sectors (4096 x 32 KiB) took 14.183781250 seconds (9462 kB/s, 9240 KiB/s, 288.78 IOPS)
mmc0: Transfer of 2048 x 128 sectors (2048 x 64 KiB) took 12.349375000 seconds (10868 kB/s, 10613 KiB/s, 165.83 IOPS)
mmc0: Transfer of 1024 x 256 sectors (1024 x 128 KiB) took 11.373031250 seconds (11801 kB/s, 11524 KiB/s, 90.03 IOPS)
mmc0: Transfer of 512 x 512 sectors (512 x 256 KiB) took 10.991343750 seconds (12211 kB/s, 11925 KiB/s, 46.58 IOPS)
mmc0: Transfer of 256 x 1024 sectors (256 x 512 KiB) took 10.759218749 seconds (12474 kB/s, 12182 KiB/s, 23.79 IOPS)
mmc0: Transfer of 128 x 2048 sectors (128 x 1024 KiB) took 10.628342707 seconds (12628 kB/s, 12332 KiB/s, 12.04 IOPS)
mmc0: Transfer of 39 x 6630 sectors (39 x 3315 KiB) took 10.547394289 seconds (12551 kB/s, 12257 KiB/s, 3.69 IOPS)
mmc0: Result: OK
mmc0: Test case 39. Read performance with blocking req 4k to 4MB...
mmc0: Transfer of 32768 x 8 sectors (32768 x 4 KiB) took 23.516885650 seconds (5707 kB/s, 5573 KiB/s, 1393.38 IOPS)
mmc0: Transfer of 16384 x 16 sectors (16384 x 8 KiB) took 13.651000000 seconds (9832 kB/s, 9601 KiB/s, 1200.20 IOPS)
mmc0: Transfer of 8192 x 32 sectors (8192 x 16 KiB) took 9.048625000 seconds (14832 kB/s, 14485 KiB/s, 905.33 IOPS)
mmc0: Transfer of 4096 x 64 sectors (4096 x 32 KiB) took 6.589500000 seconds (20368 kB/s, 19891 KiB/s, 621.59 IOPS)
mmc0: Transfer of 2048 x 128 sectors (2048 x 64 KiB) took 5.292437500 seconds (25360 kB/s, 24765 KiB/s, 386.96 IOPS)
mmc0: Transfer of 1024 x 256 sectors (1024 x 128 KiB) took 4.646156250 seconds (28887 kB/s, 28210 KiB/s, 220.39 IOPS)
mmc0: Transfer of 512 x 512 sectors (512 x 256 KiB) took 4.319437500 seconds (31072 kB/s, 30344 KiB/s, 118.53 IOPS)
mmc0: Transfer of 256 x 1024 sectors (256 x 512 KiB) took 4.158187500 seconds (32277 kB/s, 31521 KiB/s, 61.56 IOPS)
mmc0: Transfer of 128 x 2048 sectors (128 x 1024 KiB) took 4.076250000 seconds (32926 kB/s, 32155 KiB/s, 31.40 IOPS)
mmc0: Transfer of 39 x 6630 sectors (39 x 3315 KiB) took 3.966816444 seconds (33373 kB/s, 32591 KiB/s, 9.83 IOPS)
mmc0: Result: OK
mmc0: Test case 40. Read performance with none blocking req 4k to 4MB...
mmc0: Transfer of 32768 x 8 sectors (32768 x 4 KiB) took 23.251465475 seconds (5772 kB/s, 5637 KiB/s, 1409.28 IOPS)
mmc0: Transfer of 16384 x 16 sectors (16384 x 8 KiB) took 13.411468750 seconds (10007 kB/s, 9773 KiB/s, 1221.64 IOPS)
mmc0: Transfer of 8192 x 32 sectors (8192 x 16 KiB) took 8.822875000 seconds (15212 kB/s, 14855 KiB/s, 928.49 IOPS)
mmc0: Transfer of 4096 x 64 sectors (4096 x 32 KiB) took 6.413406250 seconds (20927 kB/s, 20437 KiB/s, 638.66 IOPS)
mmc0: Transfer of 2048 x 128 sectors (2048 x 64 KiB) took 5.127875000 seconds (26174 kB/s, 25560 KiB/s, 399.38 IOPS)
mmc0: Transfer of 1024 x 256 sectors (1024 x 128 KiB) took 4.486593750 seconds (29915 kB/s, 29214 KiB/s, 228.23 IOPS)
mmc0: Transfer of 512 x 512 sectors (512 x 256 KiB) took 4.178312500 seconds (32122 kB/s, 31369 KiB/s, 122.53 IOPS)
mmc0: Transfer of 256 x 1024 sectors (256 x 512 KiB) took 4.010281250 seconds (33468 kB/s, 32683 KiB/s, 63.83 IOPS)
mmc0: Transfer of 128 x 2048 sectors (128 x 1024 KiB) took 3.927437500 seconds (34174 kB/s, 33373 KiB/s, 32.59 IOPS)
mmc0: Transfer of 39 x 6630 sectors (39 x 3315 KiB) took 3.823653744 seconds (34623 kB/s, 33811 KiB/s, 10.19 IOPS)
mmc0: Result: OK
mmc0: Tests completed.
Per Forlin April 20, 2011, 7:58 a.m. UTC | #2
On 17 April 2011 18:33, Shawn Guo <shawn.guo@linaro.org> wrote:
> pre_req() runs dma_map_sg() post_req() runs dma_unmap_sg.
> If not calling pre_req() before mxs_mmc_request(), request()
> will prepare the cache just like it did it before.
> It is optional to use pre_req() and post_req().
>
> Signed-off-by: Shawn Guo <shawn.guo@linaro.org>
> ---
>  drivers/mmc/host/mxs-mmc.c |   75 ++++++++++++++++++++++++++++++++++++++++++--
>  1 files changed, 72 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/mmc/host/mxs-mmc.c b/drivers/mmc/host/mxs-mmc.c
> index 99d39a6..63c2ae2 100644
> --- a/drivers/mmc/host/mxs-mmc.c
> +++ b/drivers/mmc/host/mxs-mmc.c
> @@ -137,6 +137,10 @@
>
>  #define SSP_PIO_NUM    3
>
> +struct mxs_mmc_next {
> +       s32 cookie;
> +};
> +
>  struct mxs_mmc_host {
>        struct mmc_host                 *mmc;
>        struct mmc_request              *mrq;
> @@ -154,6 +158,7 @@ struct mxs_mmc_host {
>        struct mxs_dma_data             dma_data;
>        unsigned int                    dma_dir;
>        u32                             ssp_pio_words[SSP_PIO_NUM];
> +       struct mxs_mmc_next             next_data;
>
>        unsigned int                    version;
>        unsigned char                   bus_width;
> @@ -302,6 +307,31 @@ static irqreturn_t mxs_mmc_irq_handler(int irq, void *dev_id)
>        return IRQ_HANDLED;
>  }
>
> +static int mxs_mmc_prep_dma_data(struct mxs_mmc_host *host,
> +                               struct mmc_data *data,
> +                               struct mxs_mmc_next *next)
> +{
> +       if (!next && data->host_cookie &&
> +           data->host_cookie != host->next_data.cookie) {
> +               printk(KERN_WARNING "[%s] invalid cookie: data->host_cookie %d"
> +                      " host->next_data.cookie %d\n",
> +                      __func__, data->host_cookie, host->next_data.cookie);
> +               data->host_cookie = 0;
> +       }
> +
> +       /* Check if next job is already prepared */
> +       if (next || (!next && data->host_cookie != host->next_data.cookie))
> +               if (dma_map_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
> +                              (data->flags & MMC_DATA_WRITE) ?
> +                              DMA_TO_DEVICE : DMA_FROM_DEVICE) == 0)
> +                       return -EINVAL;
> +
> +       if (next)
> +               data->host_cookie = ++next->cookie < 0 ? 1 : next->cookie;
> +
> +       return 0;
> +}
> +
>  static struct dma_async_tx_descriptor *mxs_mmc_prep_dma(
>        struct mxs_mmc_host *host, unsigned int append)
>  {
> @@ -312,8 +342,8 @@ static struct dma_async_tx_descriptor *mxs_mmc_prep_dma(
>
>        if (data) {
>                /* data */
> -               dma_map_sg(mmc_dev(host->mmc), data->sg,
> -                          data->sg_len, host->dma_dir);
> +               if (mxs_mmc_prep_dma_data(host, data, NULL))
> +                       return NULL;
>                sgl = data->sg;
>                sg_len = data->sg_len;
>        } else {
> @@ -328,9 +358,11 @@ static struct dma_async_tx_descriptor *mxs_mmc_prep_dma(
>                desc->callback = mxs_mmc_dma_irq_callback;
>                desc->callback_param = host;
>        } else {
> -               if (data)
> +               if (data) {
>                        dma_unmap_sg(mmc_dev(host->mmc), data->sg,
>                                     data->sg_len, host->dma_dir);
> +                       data->host_cookie = 0;
> +               }
When is dma_unmap_sg called? If host_cookie is set dma_unmap() should
only be called from post_req.
My guess is
+ if (data && !data->host_cookie) {
It looks like only dma_map is run in parallel with transfer but not
dma_unmap. This may explain the numbers.
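A minimal sketch of the guard being suggested here (illustrative only,
paraphrasing the hunk above rather than quoting the actual v2 code): in the
error path, only fall back to unmapping when the request was never prepared
through pre_req(), so that post_req() remains the sole owner of the unmap
for prepared requests.

	} else {
		/*
		 * No DMA descriptor was obtained.  Only unmap here when
		 * post_req() will not do it later, i.e. when this request
		 * was not prepared via pre_req() (no host_cookie set).
		 */
		if (data && !data->host_cookie)
			dma_unmap_sg(mmc_dev(host->mmc), data->sg,
				     data->sg_len, host->dma_dir);
	}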
Per Forlin April 20, 2011, 8:01 a.m. UTC | #3
On 17 April 2011 18:48, Shawn Guo <shawn.guo@freescale.com> wrote:
> On Mon, Apr 18, 2011 at 12:33:30AM +0800, Shawn Guo wrote:
>> pre_req() runs dma_map_sg() post_req() runs dma_unmap_sg.
>> If not calling pre_req() before mxs_mmc_request(), request()
>> will prepare the cache just like it did it before.
>> It is optional to use pre_req() and post_req().
>>
>> Signed-off-by: Shawn Guo <shawn.guo@linaro.org>
>> ---
>>  drivers/mmc/host/mxs-mmc.c |   75 ++++++++++++++++++++++++++++++++++++++++++--
>>  1 files changed, 72 insertions(+), 3 deletions(-)
>>
>
> Here is the result of mmc_test case 37 ~ 40, which are designed to see
> the performance improvement introduced by non-blocking changes.
>
> Honestly, the improvement is not so impressive.  Not sure if the patch
> for mxs-mmc pre_req and post_req support was correctly produced.  So
> please help review ...
My guess is that dma_unmap is not run in parallel with transfer.
Please look at my patch reply.

>
> --
> Regards,
> Shawn
Regards,
Per
Shawn Guo April 20, 2011, 8:17 a.m. UTC | #4
On Wed, Apr 20, 2011 at 09:58:48AM +0200, Per Forlin wrote:
> >  static struct dma_async_tx_descriptor *mxs_mmc_prep_dma(
> >        struct mxs_mmc_host *host, unsigned int append)
> >  {
> > @@ -312,8 +342,8 @@ static struct dma_async_tx_descriptor *mxs_mmc_prep_dma(
> >
> >        if (data) {
> >                /* data */
> > -               dma_map_sg(mmc_dev(host->mmc), data->sg,
> > -                          data->sg_len, host->dma_dir);
> > +               if (mxs_mmc_prep_dma_data(host, data, NULL))
> > +                       return NULL;
> >                sgl = data->sg;
> >                sg_len = data->sg_len;
> >        } else {
> > @@ -328,9 +358,11 @@ static struct dma_async_tx_descriptor *mxs_mmc_prep_dma(
> >                desc->callback = mxs_mmc_dma_irq_callback;
> >                desc->callback_param = host;
> >        } else {
> > -               if (data)
> > +               if (data) {
> >                        dma_unmap_sg(mmc_dev(host->mmc), data->sg,
> >                                     data->sg_len, host->dma_dir);
> > +                       data->host_cookie = 0;
> > +               }
> When is dma_unmap_sg called? If host_cookie is set dma_unmap() should
> only be called from post_req.
> My guess is
> + if (data && !data->host_cookie) {
> It looks like only dma_map is run in parallel with transfer but not
> dma_unmap. This may explain the numbers.

Good catch.  I forgot to patch mxs_mmc_request_done(), where dma_unmap_sg
is called.  I will correct it and retest ...
Shawn Guo April 20, 2011, 2:01 p.m. UTC | #5
On Wed, Apr 20, 2011 at 10:01:22AM +0200, Per Forlin wrote:
> On 17 April 2011 18:48, Shawn Guo <shawn.guo@freescale.com> wrote:
> > On Mon, Apr 18, 2011 at 12:33:30AM +0800, Shawn Guo wrote:
> >> pre_req() runs dma_map_sg() post_req() runs dma_unmap_sg.
> >> If not calling pre_req() before mxs_mmc_request(), request()
> >> will prepare the cache just like it did it before.
> >> It is optional to use pre_req() and post_req().
> >>
> >> Signed-off-by: Shawn Guo <shawn.guo@linaro.org>
> >> ---
> >>  drivers/mmc/host/mxs-mmc.c |   75 ++++++++++++++++++++++++++++++++++++++++++--
> >>  1 files changed, 72 insertions(+), 3 deletions(-)
> >>
> >
> > Here is the result of mmc_test case 37 ~ 40, which are designed to see
> > the performance improvement introduced by non-blocking changes.
> >
> > Honestly, the improvement is not so impressive.  Not sure if the patch
> > for mxs-mmc pre_req and post_req support was correctly produced.  So
> > please help review ...
> My guess is that dma_unmap is not run in parallel with transfer.
> Please look at my patch reply.
> 
Got it fixed in v2, posted just now.  Please take another look.
Unfortunately, I do not see a noticeable difference from v1 in terms
of the mmc_test results.

Do you have omap_hsmmc and mmci mmc_test result data to share?
Per Forlin April 20, 2011, 3:22 p.m. UTC | #6
On 20 April 2011 16:01, Shawn Guo <shawn.guo@freescale.com> wrote:
> On Wed, Apr 20, 2011 at 10:01:22AM +0200, Per Forlin wrote:
>> On 17 April 2011 18:48, Shawn Guo <shawn.guo@freescale.com> wrote:
>> > On Mon, Apr 18, 2011 at 12:33:30AM +0800, Shawn Guo wrote:
>> >> pre_req() runs dma_map_sg() post_req() runs dma_unmap_sg.
>> >> If not calling pre_req() before mxs_mmc_request(), request()
>> >> will prepare the cache just like it did it before.
>> >> It is optional to use pre_req() and post_req().
>> >>
>> >> Signed-off-by: Shawn Guo <shawn.guo@linaro.org>
>> >> ---
>> >>  drivers/mmc/host/mxs-mmc.c |   75 ++++++++++++++++++++++++++++++++++++++++++--
>> >>  1 files changed, 72 insertions(+), 3 deletions(-)
>> >>
>> >
>> > Here is the result of mmc_test case 37 ~ 40, which are designed to see
>> > the performance improvement introduced by non-blocking changes.
>> >
>> > Honestly, the improvement is not so impressive.  Not sure if the patch
>> > for mxs-mmc pre_req and post_req support was correctly produced.  So
>> > please help review ...
>> My guess is that dma_unmap is not run in parallel with transfer.
>> Please look at my patch reply.
>>
> Got it fixed in v2 posted just now.  Please take another look.
> Unfortunately, I do not see noticeable difference than v1 in terms
> of mmc_test result.
>
> Do you have omap_hsmmc and mmci mmc_test result data to share?
>
I keep them here:
https://wiki.linaro.org/WorkingGroups/Kernel/Specs/StoragePerfMMC-async-req

> --
> Regards,
> Shawn
>
>
/Per
Per Forlin April 20, 2011, 3:30 p.m. UTC | #7
On 20 April 2011 16:01, Shawn Guo <shawn.guo@freescale.com> wrote:
> On Wed, Apr 20, 2011 at 10:01:22AM +0200, Per Forlin wrote:
>> On 17 April 2011 18:48, Shawn Guo <shawn.guo@freescale.com> wrote:
>> > On Mon, Apr 18, 2011 at 12:33:30AM +0800, Shawn Guo wrote:
>> >> pre_req() runs dma_map_sg() post_req() runs dma_unmap_sg.
>> >> If not calling pre_req() before mxs_mmc_request(), request()
>> >> will prepare the cache just like it did it before.
>> >> It is optional to use pre_req() and post_req().
>> >>
>> >> Signed-off-by: Shawn Guo <shawn.guo@linaro.org>
>> >> ---
>> >>  drivers/mmc/host/mxs-mmc.c |   75 ++++++++++++++++++++++++++++++++++++++++++--
>> >>  1 files changed, 72 insertions(+), 3 deletions(-)
>> >>
>> >
>> > Here is the result of mmc_test case 37 ~ 40, which are designed to see
>> > the performance improvement introduced by non-blocking changes.
>> >
>> > Honestly, the improvement is not so impressive.  Not sure if the patch
>> > for mxs-mmc pre_req and post_req support was correctly produced.  So
>> > please help review ...
>> My guess is that dma_unmap is not run in parallel with transfer.
>> Please look at my patch reply.
>>
> Got it fixed in v2 posted just now.  Please take another look.
> Unfortunately, I do not see noticeable difference than v1 in terms
> of mmc_test result.
>
Remove dma_map and dma_unmap from your host driver and run the tests
(obviously nonblocking and blocking will have the same results). If
there is still no performance gain the cache penalty is very small on
your platform and therefore nonblocking doesn't improve things much.
Please let me know the result.

BR,
Per
Shawn Guo April 21, 2011, 6:25 a.m. UTC | #8
On Wed, Apr 20, 2011 at 05:22:34PM +0200, Per Forlin wrote:
[...]
> > Do you have omap_hsmmc and mmci mmc_test result data to share?
> >
> I keep the here:
> https://wiki.linaro.org/WorkingGroups/Kernel/Specs/StoragePerfMMC-async-req
> 
Actually, I have seen this page before.  I was wondering if you have
mmc_test raw data (test log of cases #37 ~ #40) to share.
Shawn Guo April 21, 2011, 6:29 a.m. UTC | #9
On Wed, Apr 20, 2011 at 05:30:22PM +0200, Per Forlin wrote:
[...]
> Remove dma_map and dma_unmap from your host driver and run the tests
> (obviously nonblocking and blocking will have the same results). If
> there is still no performance gain the cache penalty is very small on
> your platform and therefore nonblocking doesn't improve things much.
> Please let me know the result.
> 
Sorry, I do not understand.  What is the point of running the test when
the driver is broken?  Removing dma_map_sg and dma_unmap_sg leaves the
mxs-mmc host driver broken.
Per Forlin April 21, 2011, 8:46 a.m. UTC | #10
On 21 April 2011 08:29, Shawn Guo <shawn.guo@freescale.com> wrote:
> On Wed, Apr 20, 2011 at 05:30:22PM +0200, Per Forlin wrote:
> [...]
>> Remove dma_map and dma_unmap from your host driver and run the tests
>> (obviously nonblocking and blocking will have the same results). If
>> there is still no performance gain the cache penalty is very small on
>> your platform and therefore nonblocking doesn't improve things much.
>> Please let me know the result.
>>
> Sorry, I could not understand.  What's the point to run the test when
> the driver is even broken.  The removal of  dma_map_sg and
> dma_unmap_sg makes mxs-mmc host driver broken.
The point is only to measure the cost of handling dma_map_sg and
dma_unmap_sg; this is the maximum time mmc nonblocking can save.
The nonblocking mmc_test should save the total time of dma_map_sg and
dma_unmap_sg, if the pre_req and post_req hooks are implemented
correctly.
Running without dma_map_sg and dma_unmap_sg will confirm if the
pre_req and post_req hooks are implemented correctly.

>
> --
> Regards,
> Shawn
>
>
Regards,
Per
Per Forlin April 21, 2011, 8:52 a.m. UTC | #11
On 21 April 2011 08:25, Shawn Guo <shawn.guo@freescale.com> wrote:
> On Wed, Apr 20, 2011 at 05:22:34PM +0200, Per Forlin wrote:
> [...]
>> > Do you have omap_hsmmc and mmci mmc_test result data to share?
>> >
>> I keep the here:
>> https://wiki.linaro.org/WorkingGroups/Kernel/Specs/StoragePerfMMC-async-req
>>
> Actually, I have seen this page before.  I was wondering if you have
> mmc_test raw data (test log of cases #37 ~ #40) to share.
My test case numbers are 39 - 42, but they are the same tests.

PANDA - omap_hsmmc:
mmc0: Test case 39. Multiple write performance with sync req 4k to 4MB...
 mmc0: Transfer of 32768 x 8 sectors (32768 x 4 KiB) took 18.987487792
seconds (7068 kB/s, 6903 KiB/s)
 mmc0: Transfer of 16384 x 16 sectors (16384 x 8 KiB) took
11.947631837 seconds (11233 kB/s, 10970 KiB/s)
 mmc0: Transfer of 8192 x 32 sectors (8192 x 16 KiB) took 9.472808839
seconds (14168 kB/s, 13836 KiB/s)
 mmc0: Transfer of 4096 x 64 sectors (4096 x 32 KiB) took 8.260650636
seconds (16247 kB/s, 15867 KiB/s)
 mmc0: Transfer of 2048 x 128 sectors (2048 x 64 KiB) took 7.622741699
seconds (17607 kB/s, 17194 KiB/s)
 mmc0: Transfer of 1024 x 256 sectors (1024 x 128 KiB) took
7.323181152 seconds (18327 kB/s, 17898 KiB/s)
 mmc0: Transfer of 512 x 512 sectors (512 x 256 KiB) took 6.744140626
seconds (19901 kB/s, 19434 KiB/s)
 mmc0: Transfer of 256 x 1024 sectors (256 x 512 KiB) took 6.679138184
seconds (20095 kB/s, 19624 KiB/s)
 mmc0: Transfer of 128 x 2048 sectors (128 x 1024 KiB) took
6.625518800 seconds (20257 kB/s, 19782 KiB/s)
 mmc0: Transfer of 32 x 8192 sectors (32 x 4096 KiB) took 6.620574950
seconds (20272 kB/s, 19797 KiB/s)
 mmc0: Result: OK
 mmc0: Tests completed.
 mmc0: Starting tests of card mmc0:80ca...
 mmc0: Test case 40. Multiple write performance with async req 4k to 4MB...
 mmc0: Transfer of 32768 x 8 sectors (32768 x 4 KiB) took 18.647644043
seconds (7197 kB/s, 7028 KiB/s)
 mmc0: Transfer of 16384 x 16 sectors (16384 x 8 KiB) took
11.624816894 seconds (11545 kB/s, 11275 KiB/s)
 mmc0: Transfer of 8192 x 32 sectors (8192 x 16 KiB) took 9.170440675
seconds (14635 kB/s, 14292 KiB/s)
 mmc0: Transfer of 4096 x 64 sectors (4096 x 32 KiB) took 7.965667725
seconds (16849 kB/s, 16454 KiB/s)
 mmc0: Transfer of 2048 x 128 sectors (2048 x 64 KiB) took 7.318023681
seconds (18340 kB/s, 17910 KiB/s)
 mmc0: Transfer of 1024 x 256 sectors (1024 x 128 KiB) took
7.040740969 seconds (19063 kB/s, 18616 KiB/s)
 mmc0: Transfer of 512 x 512 sectors (512 x 256 KiB) took 6.444641113
seconds (20826 kB/s, 20338 KiB/s)
 mmc0: Transfer of 256 x 1024 sectors (256 x 512 KiB) took 6.380249023
seconds (21036 kB/s, 20543 KiB/s)
 mmc0: Transfer of 128 x 2048 sectors (128 x 1024 KiB) took
6.333343506 seconds (21192 kB/s, 20695 KiB/s)
 mmc0: Transfer of 32 x 8192 sectors (32 x 4096 KiB) took 6.328002930
seconds (21210 kB/s, 20713 KiB/s)
 mmc0: Result: OK
 mmc0: Tests completed.
 mmc0: Starting tests of card mmc0:80ca...
 mmc0: Test case 41. Multiple read performance with sync req 4k to 4MB...
 mmc0: Transfer of 32768 x 8 sectors (32768 x 4 KiB) took 20.567749024
seconds (6525 kB/s, 6372 KiB/s)
 mmc0: Transfer of 16384 x 16 sectors (16384 x 8 KiB) took
12.770507813 seconds (10509 kB/s, 10263 KiB/s)
 mmc0: Transfer of 8192 x 32 sectors (8192 x 16 KiB) took 10.003143311
seconds (13417 kB/s, 13103 KiB/s)
 mmc0: Transfer of 4096 x 64 sectors (4096 x 32 KiB) took 7.775665284
seconds (17261 kB/s, 16856 KiB/s)
 mmc0: Transfer of 2048 x 128 sectors (2048 x 64 KiB) took 6.960906982
seconds (19281 kB/s, 18829 KiB/s)
 mmc0: Transfer of 1024 x 256 sectors (1024 x 128 KiB) took
6.661651612 seconds (20147 kB/s, 19675 KiB/s)
 mmc0: Transfer of 512 x 512 sectors (512 x 256 KiB) took 6.510711672
seconds (20614 kB/s, 20131 KiB/s)
 mmc0: Transfer of 256 x 1024 sectors (256 x 512 KiB) took 6.434722900
seconds (20858 kB/s, 20369 KiB/s)
 mmc0: Transfer of 128 x 2048 sectors (128 x 1024 KiB) took
6.396850587 seconds (20981 kB/s, 20490 KiB/s)
 mmc0: Transfer of 32 x 8192 sectors (32 x 4096 KiB) took 6.368560791
seconds (21075 kB/s, 20581 KiB/s)
 mmc0: Result: OK
 mmc0: Tests completed.
 mmc0: Starting tests of card mmc0:80ca...
 mmc0: Test case 42. Multiple read performance with async req 4k to 4MB...
 mmc0: Transfer of 32768 x 8 sectors (32768 x 4 KiB) took 20.650848389
seconds (6499 kB/s, 6347 KiB/s)
 mmc0: Transfer of 16384 x 16 sectors (16384 x 8 KiB) took
12.796203613 seconds (10488 kB/s, 10243 KiB/s)
 mmc0: Transfer of 8192 x 32 sectors (8192 x 16 KiB) took 10.019592286
seconds (13395 kB/s, 13081 KiB/s)
 mmc0: Transfer of 4096 x 64 sectors (4096 x 32 KiB) took 7.603942870
seconds (17651 kB/s, 17237 KiB/s)
 mmc0: Transfer of 2048 x 128 sectors (2048 x 64 KiB) took 6.751251223
seconds (19880 kB/s, 19414 KiB/s)
 mmc0: Transfer of 1024 x 256 sectors (1024 x 128 KiB) took
6.252929687 seconds (21464 kB/s, 20961 KiB/s)
 mmc0: Transfer of 512 x 512 sectors (512 x 256 KiB) took 5.956726076
seconds (22532 kB/s, 22004 KiB/s)
 mmc0: Transfer of 256 x 1024 sectors (256 x 512 KiB) took 5.863586425
seconds (22890 kB/s, 22353 KiB/s)
 mmc0: Transfer of 128 x 2048 sectors (128 x 1024 KiB) took
5.818939210 seconds (23065 kB/s, 22525 KiB/s)
 mmc0: Transfer of 32 x 8192 sectors (32 x 4096 KiB) took 5.797515869
seconds (23150 kB/s, 22608 KiB/s)
 mmc0: Result: OK
 mmc0: Tests completed.


U5500 - mmci
mmc0: Test case 38. Multiple write performance with sync req 4k to 4MB...
mmc0: Transfer of 65536 x 8 sectors (65536 x 4 KiB) took 94.638215679
seconds (2836 kB/s, 2769 KiB/s)
mmc0: Transfer of 32768 x 16 sectors (32768 x 8 KiB) took 42.323242363
seconds (6342 kB/s, 6193 KiB/s)
mmc0: Transfer of 16384 x 32 sectors (16384 x 16 KiB) took
26.845500458 seconds (9999 kB/s, 9764 KiB/s)
mmc0: Transfer of 8192 x 64 sectors (8192 x 32 KiB) took 19.154612107
seconds (14014 kB/s, 13685 KiB/s)
mmc0: Transfer of 4096 x 128 sectors (4096 x 64 KiB) took 15.318183875
seconds (17523 kB/s, 17113 KiB/s)
mmc0: Transfer of 2048 x 256 sectors (2048 x 128 KiB) took
13.373840666 seconds (20071 kB/s, 19601 KiB/s)
mmc0: Transfer of 1024 x 512 sectors (1024 x 256 KiB) took
12.335949719 seconds (21760 kB/s, 21250 KiB/s)
mmc0: Transfer of 512 x 1024 sectors (512 x 512 KiB) took 11.886310118
seconds (22583 kB/s, 22054 KiB/s)
mmc0: Transfer of 256 x 2048 sectors (256 x 1024 KiB) took
11.703806675 seconds (22935 kB/s, 22398 KiB/s)
mmc0: Transfer of 64 x 8192 sectors (64 x 4096 KiB) took 11.632018242
seconds (23077 kB/s, 22536 KiB/s)
mmc0: Result: OK
mmc0: Tests completed.
mmc0: Starting tests of card mmc0:0001...
mmc0: Test case 39. Multiple write performance with async req 4k to 4MB...
mmc0: Transfer of 65536 x 8 sectors (65536 x 4 KiB) took 94.567817239
seconds (2838 kB/s, 2772 KiB/s)
mmc0: Transfer of 32768 x 16 sectors (32768 x 8 KiB) took 42.043969582
seconds (6384 kB/s, 6234 KiB/s)
mmc0: Transfer of 16384 x 32 sectors (16384 x 16 KiB) took
26.483716374 seconds (10135 kB/s, 9898 KiB/s)
mmc0: Transfer of 8192 x 64 sectors (8192 x 32 KiB) took 18.694318999
seconds (14359 kB/s, 14022 KiB/s)
mmc0: Transfer of 4096 x 128 sectors (4096 x 64 KiB) took 14.832822022
seconds (18097 kB/s, 17673 KiB/s)
mmc0: Transfer of 2048 x 256 sectors (2048 x 128 KiB) took
12.792243128 seconds (20984 kB/s, 20492 KiB/s)
mmc0: Transfer of 1024 x 512 sectors (1024 x 256 KiB) took
11.793732364 seconds (22760 kB/s, 22227 KiB/s)
mmc0: Transfer of 512 x 1024 sectors (512 x 512 KiB) took 11.299139279
seconds (23757 kB/s, 23200 KiB/s)
mmc0: Transfer of 256 x 2048 sectors (256 x 1024 KiB) took
11.160603255 seconds (24052 kB/s, 23488 KiB/s)
mmc0: Transfer of 64 x 8192 sectors (64 x 4096 KiB) took 11.036530115
seconds (24322 kB/s, 23752 KiB/s)
mmc0: Result: OK
mmc0: Tests completed.
mmc0: Starting tests of card mmc0:0001...
mmc0: Test case 40. Multiple read performance with sync req 4k to 4MB...
mmc0: Transfer of 65536 x 8 sectors (65536 x 4 KiB) took 24.830932935
seconds (10810 kB/s, 10557 KiB/s)
mmc0: Transfer of 32768 x 16 sectors (32768 x 8 KiB) took 15.896942145
seconds (16885 kB/s, 16490 KiB/s)
mmc0: Transfer of 16384 x 32 sectors (16384 x 16 KiB) took
11.452273667 seconds (23439 kB/s, 22890 KiB/s)
mmc0: Transfer of 8192 x 64 sectors (8192 x 32 KiB) took 9.247484468
seconds (29027 kB/s, 28347 KiB/s)
mmc0: Transfer of 4096 x 128 sectors (4096 x 64 KiB) took 8.117996552
seconds (33066 kB/s, 32291 KiB/s)
mmc0: Transfer of 2048 x 256 sectors (2048 x 128 KiB) took 7.672241687
seconds (34987 kB/s, 34167 KiB/s)
mmc0: Transfer of 1024 x 512 sectors (1024 x 256 KiB) took 7.588627916
seconds (35373 kB/s, 34544 KiB/s)
mmc0: Transfer of 512 x 1024 sectors (512 x 512 KiB) took 7.545239751
seconds (35576 kB/s, 34742 KiB/s)
mmc0: Transfer of 256 x 2048 sectors (256 x 1024 KiB) took 7.558409240
seconds (35514 kB/s, 34682 KiB/s)
mmc0: Transfer of 64 x 8192 sectors (64 x 4096 KiB) took 7.567300503
seconds (35473 kB/s, 34641 KiB/s)
mmc0: Result: OK
mmc0: Tests completed.
mmc0: Starting tests of card mmc0:0001...
mmc0: Test case 41. Multiple read performance with async req 4k to 4MB...
mmc0: Transfer of 65536 x 8 sectors (65536 x 4 KiB) took 24.077512698
seconds (11148 kB/s, 10887 KiB/s)
mmc0: Transfer of 32768 x 16 sectors (32768 x 8 KiB) took 15.280992666
seconds (17566 kB/s, 17154 KiB/s)
mmc0: Transfer of 16384 x 32 sectors (16384 x 16 KiB) took
10.892908603 seconds (24643 kB/s, 24065 KiB/s)
mmc0: Transfer of 8192 x 64 sectors (8192 x 32 KiB) took 8.698435941
seconds (30860 kB/s, 30136 KiB/s)
mmc0: Transfer of 4096 x 128 sectors (4096 x 64 KiB) took 7.617525358
seconds (35239 kB/s, 34413 KiB/s)
mmc0: Transfer of 2048 x 256 sectors (2048 x 128 KiB) took 7.058797630
seconds (38028 kB/s, 37137 KiB/s)
mmc0: Transfer of 1024 x 512 sectors (1024 x 256 KiB) took 6.779599610
seconds (39594 kB/s, 38666 KiB/s)
mmc0: Transfer of 512 x 1024 sectors (512 x 512 KiB) took 6.639430826
seconds (40430 kB/s, 39482 KiB/s)
mmc0: Transfer of 256 x 2048 sectors (256 x 1024 KiB) took 6.588963683
seconds (40740 kB/s, 39785 KiB/s)
mmc0: Transfer of 64 x 8192 sectors (64 x 4096 KiB) took 6.561933369
seconds (40907 kB/s, 39949 KiB/s)



>
> --
> Regards,
> Shawn
>
>
Regards,
Per
Shawn Guo April 21, 2011, 9:11 a.m. UTC | #12
On Thu, Apr 21, 2011 at 10:46:18AM +0200, Per Forlin wrote:
> On 21 April 2011 08:29, Shawn Guo <shawn.guo@freescale.com> wrote:
> > On Wed, Apr 20, 2011 at 05:30:22PM +0200, Per Forlin wrote:
> > [...]
> >> Remove dma_map and dma_unmap from your host driver and run the tests
> >> (obviously nonblocking and blocking will have the same results). If
> >> there is still no performance gain the cache penalty is very small on
> >> your platform and therefore nonblocking doesn't improve things much.
> >> Please let me know the result.
> >>
> > Sorry, I could not understand.  What's the point to run the test when
> > the driver is even broken.  The removal of  dma_map_sg and
> > dma_unmap_sg makes mxs-mmc host driver broken.
> The point is only to get a measurement of the cost of handling
> dma_map_sg and dma_unmap_sg, this is the maximum time mmc nonblocking
> can save.
> The nonblocking mmc_test should save the total time of dma_map_sg and
> dma_unmap_sg, if the pre_req and post_req hooks are implemented
> correctly.
> Running without dma_map_sg and dma_unmap_sg will confirm if the
> pre_req and post_req hooks are implemented correctly.
> 
With dma_map_sg and dma_unmap_sg removed, the mmc_test gave very low
numbers, though the blocking and non-blocking numbers are the same.  Is
this an indication that the pre_req and post_req hooks are not implemented
correctly?  If so, can you please help catch the mistakes?

Thanks.

mmc0: Test case 39. Read performance with blocking req 4k to 4MB...
mmc0: Transfer of 32768 x 8 sectors (32768 x 4 KiB) took 56.875013015 seconds (2359 kB/s, 2304 KiB/s, 576.14 IOPS)
mmc0: Transfer of 16384 x 16 sectors (16384 x 8 KiB) took 47.407562500 seconds (2831 kB/s, 2764 KiB/s, 345.59 IOPS)
mmc0: Transfer of 8192 x 32 sectors (8192 x 16 KiB) took 42.708718750 seconds (3142 kB/s, 3068 KiB/s, 191.81 IOPS)
mmc0: Transfer of 4096 x 64 sectors (4096 x 32 KiB) took 40.227125000 seconds (3336 kB/s, 3258 KiB/s, 101.82 IOPS)
mmc0: Transfer of 2048 x 128 sectors (2048 x 64 KiB) took 38.915750000 seconds (3448 kB/s, 3368 KiB/s, 52.62 IOPS)
mmc0: Transfer of 1024 x 256 sectors (1024 x 128 KiB) took 38.249562498 seconds (3509 kB/s, 3426 KiB/s, 26.77 IOPS)
mmc0: Transfer of 512 x 512 sectors (512 x 256 KiB) took 37.912342548 seconds (3540 kB/s, 3457 KiB/s, 13.50 IOPS)
mmc0: Transfer of 256 x 1024 sectors (256 x 512 KiB) took 37.743876391 seconds (3556 kB/s, 3472 KiB/s, 6.78 IOPS)
mmc0: Transfer of 128 x 2048 sectors (128 x 1024 KiB) took 37.658104019 seconds (3564 kB/s, 3480 KiB/s, 3.39 IOPS)
mmc0: Transfer of 39 x 6630 sectors (39 x 3315 KiB) took 37.086429038 seconds (3569 kB/s, 3486 KiB/s, 1.05 IOPS)
mmc0: Result: OK
mmc0: Test case 40. Read performance with none blocking req 4k to 4MB...
mmc0: Transfer of 32768 x 8 sectors (32768 x 4 KiB) took 56.732932555 seconds (2365 kB/s, 2310 KiB/s, 577.58 IOPS)
mmc0: Transfer of 16384 x 16 sectors (16384 x 8 KiB) took 47.342812500 seconds (2835 kB/s, 2768 KiB/s, 346.07 IOPS)
mmc0: Transfer of 8192 x 32 sectors (8192 x 16 KiB) took 42.673906250 seconds (3145 kB/s, 3071 KiB/s, 191.96 IOPS)
mmc0: Transfer of 4096 x 64 sectors (4096 x 32 KiB) took 40.208218750 seconds (3338 kB/s, 3259 KiB/s, 101.86 IOPS)
mmc0: Transfer of 2048 x 128 sectors (2048 x 64 KiB) took 38.906750000 seconds (3449 kB/s, 3368 KiB/s, 52.63 IOPS)
mmc0: Transfer of 1024 x 256 sectors (1024 x 128 KiB) took 38.244749999 seconds (3509 kB/s, 3427 KiB/s, 26.77 IOPS)
mmc0: Transfer of 512 x 512 sectors (512 x 256 KiB) took 37.909719946 seconds (3540 kB/s, 3457 KiB/s, 13.50 IOPS)
mmc0: Transfer of 256 x 1024 sectors (256 x 512 KiB) took 37.741834105 seconds (3556 kB/s, 3472 KiB/s, 6.78 IOPS)
mmc0: Transfer of 128 x 2048 sectors (128 x 1024 KiB) took 37.657555456 seconds (3564 kB/s, 3480 KiB/s, 3.39 IOPS)
mmc0: Transfer of 39 x 6630 sectors (39 x 3315 KiB) took 37.086351431 seconds (3569 kB/s, 3486 KiB/s, 1.05 IOPS)
mmc0: Result: OK
mmc0: Tests completed.
Per Forlin April 21, 2011, 9:47 a.m. UTC | #13
On 21 April 2011 11:11, Shawn Guo <shawn.guo@freescale.com> wrote:
> On Thu, Apr 21, 2011 at 10:46:18AM +0200, Per Forlin wrote:
>> On 21 April 2011 08:29, Shawn Guo <shawn.guo@freescale.com> wrote:
>> > On Wed, Apr 20, 2011 at 05:30:22PM +0200, Per Forlin wrote:
>> > [...]
>> >> Remove dma_map and dma_unmap from your host driver and run the tests
>> >> (obviously nonblocking and blocking will have the same results). If
>> >> there is still no performance gain the cache penalty is very small on
>> >> your platform and therefore nonblocking doesn't improve things much.
>> >> Please let me know the result.
>> >>
>> > Sorry, I could not understand.  What's the point to run the test when
>> > the driver is even broken.  The removal of  dma_map_sg and
>> > dma_unmap_sg makes mxs-mmc host driver broken.
>> The point is only to get a measurement of the cost of handling
>> dma_map_sg and dma_unmap_sg, this is the maximum time mmc nonblocking
>> can save.
>> The nonblocking mmc_test should save the total time of dma_map_sg and
>> dma_unmap_sg, if the pre_req and post_req hooks are implemented
>> correctly.
>> Running without dma_map_sg and dma_unmap_sg will confirm if the
>> pre_req and post_req hooks are implemented correctly.
>>
> With dma_map_sg and dma_unmap_sg removed, the mmc_test gave very low
> numbers, though blocking and non-blocking numbers are same.  Is it
> an indication that pre_req and post_req hooks are not implemented
> correctly?
I think you could get the same numbers for the nonblocking with
dma_map and dma_unmap in place.

>  If yes, can you please help to catch the mistakes?
I will take a look.

> --
> Regards,
> Shawn
>
>
Regards,
Per
Per Forlin April 21, 2011, 10:15 a.m. UTC | #14
On 21 April 2011 11:47, Per Forlin <per.forlin@linaro.org> wrote:
> On 21 April 2011 11:11, Shawn Guo <shawn.guo@freescale.com> wrote:
>> On Thu, Apr 21, 2011 at 10:46:18AM +0200, Per Forlin wrote:
>>> On 21 April 2011 08:29, Shawn Guo <shawn.guo@freescale.com> wrote:
>>> > On Wed, Apr 20, 2011 at 05:30:22PM +0200, Per Forlin wrote:
>>> > [...]
>>> >> Remove dma_map and dma_unmap from your host driver and run the tests
>>> >> (obviously nonblocking and blocking will have the same results). If
>>> >> there is still no performance gain the cache penalty is very small on
>>> >> your platform and therefore nonblocking doesn't improve things much.
>>> >> Please let me know the result.
>>> >>
>>> > Sorry, I could not understand.  What's the point to run the test when
>>> > the driver is even broken.  The removal of  dma_map_sg and
>>> > dma_unmap_sg makes mxs-mmc host driver broken.
>>> The point is only to get a measurement of the cost of handling
>>> dma_map_sg and dma_unmap_sg, this is the maximum time mmc nonblocking
>>> can save.
>>> The nonblocking mmc_test should save the total time of dma_map_sg and
>>> dma_unmap_sg, if the pre_req and post_req hooks are implemented
>>> correctly.
>>> Running without dma_map_sg and dma_unmap_sg will confirm if the
>>> pre_req and post_req hooks are implemented correctly.
>>>
>> With dma_map_sg and dma_unmap_sg removed, the mmc_test gave very low
>> numbers, though blocking and non-blocking numbers are same.  Is it
>> an indication that pre_req and post_req hooks are not implemented
>> correctly?
> I think you could get the same numbers for the nonblocking with
> dma_map and dma_unmap in place.
>
>>  If yes, can you please help to catch the mistakes?
> I will take a look.
>
I found something:
static void mxs_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq)
{
	struct mxs_mmc_host *host = mmc_priv(mmc);

	WARN_ON(host->mrq != NULL);
	host->mrq = mrq;
	mxs_mmc_start_cmd(host, mrq->cmd);
}

This is the execution flow:
pre_req()
mxs_mmc_request()
post_req()
wait_for_completion()
pre_req()

mxs_mmc_request() returns before the prepared value is used.
post_req() will run dma_unmap and set the cookie to 0, which means in
your case dma_unmap_sg will be called twice.
You need to store away the prepared data in mxs_mmc_request().
Look at my patch for mmci, specifically the mmci_get_next_data()
function; it deals with this issue.

I didn't spot this issue when I first looked at the patch, since no
changes are made in the request function.
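
For readers without Per's mmci series at hand, the idea reads roughly as
follows.  This is an illustrative sketch only: the helper name
mxs_mmc_get_next_data() and the decision to track nothing but a cookie
mirror the patch in this thread, not the code that was eventually merged.

	/*
	 * Sketch: let the request path "claim" whatever pre_req() prepared,
	 * so the driver and post_req() agree on who owns the DMA mapping.
	 * In this cookie-only design there is no descriptor to hand over;
	 * the point is to validate the cookie once, up front, before the
	 * request is started.
	 */
	static void mxs_mmc_get_next_data(struct mxs_mmc_host *host,
					  struct mmc_data *data)
	{
		struct mxs_mmc_next *next = &host->next_data;

		if (data->host_cookie && data->host_cookie != next->cookie) {
			dev_warn(mmc_dev(host->mmc),
				 "%s: invalid cookie %d, expected %d\n",
				 __func__, data->host_cookie, next->cookie);
			/* fall back to mapping inside the request path */
			data->host_cookie = 0;
		}
	}

	static void mxs_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq)
	{
		struct mxs_mmc_host *host = mmc_priv(mmc);

		WARN_ON(host->mrq != NULL);
		host->mrq = mrq;

		if (mrq->data)
			mxs_mmc_get_next_data(host, mrq->data);

		mxs_mmc_start_cmd(host, mrq->cmd);
	}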

>> --
>> Regards,
>> Shawn
Regards,
Per
Per Forlin April 28, 2011, 7:52 a.m. UTC | #15
On 21 April 2011 11:47, Per Forlin <per.forlin@linaro.org> wrote:
> On 21 April 2011 11:11, Shawn Guo <shawn.guo@freescale.com> wrote:
>> On Thu, Apr 21, 2011 at 10:46:18AM +0200, Per Forlin wrote:
>>> On 21 April 2011 08:29, Shawn Guo <shawn.guo@freescale.com> wrote:
>>> > On Wed, Apr 20, 2011 at 05:30:22PM +0200, Per Forlin wrote:
>>> > [...]
>>> >> Remove dma_map and dma_unmap from your host driver and run the tests
>>> >> (obviously nonblocking and blocking will have the same results). If
>>> >> there is still no performance gain the cache penalty is very small on
>>> >> your platform and therefore nonblocking doesn't improve things much.
>>> >> Please let me know the result.
>>> >>
>>> > Sorry, I could not understand.  What's the point to run the test when
>>> > the driver is even broken.  The removal of  dma_map_sg and
>>> > dma_unmap_sg makes mxs-mmc host driver broken.
>>> The point is only to get a measurement of the cost of handling
>>> dma_map_sg and dma_unmap_sg, this is the maximum time mmc nonblocking
>>> can save.
>>> The nonblocking mmc_test should save the total time of dma_map_sg and
>>> dma_unmap_sg, if the pre_req and post_req hooks are implemented
>>> correctly.
>>> Running without dma_map_sg and dma_unmap_sg will confirm if the
>>> pre_req and post_req hooks are implemented correctly.
>>>
>> With dma_map_sg and dma_unmap_sg removed, the mmc_test gave very low
>> numbers, though blocking and non-blocking numbers are same.  Is it
>> an indication that pre_req and post_req hooks are not implemented
>> correctly?
> I think you could get the same numbers for the nonblocking with
> dma_map and dma_unmap in place.
>
I wanted to test the performance without the cache penalty, but removing
dma_map_sg may not work since it also produces the physically mapped sg
list.  This is not as simple as I first thought.  Make a copy of
dma_map_sg (call it dma_map_sg_no_cache) and modify it so that it does not
clean/invalidate the cache, then replace dma_map_sg with
dma_map_sg_no_cache in the mxs-mmc driver.
Removing dma_unmap should be OK for this test case.
Do you still get very low numbers?
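
A rough sketch of the measurement-only helper Per describes (the name comes
from his mail; building DMA addresses straight from sg_phys() is an
assumption that only holds on a non-IOMMU platform like this one, and all
cache maintenance is skipped on purpose, so this is not a general-purpose
replacement for dma_map_sg()):

	static int dma_map_sg_no_cache(struct device *dev,
				       struct scatterlist *sgl, int nents,
				       enum dma_data_direction dir)
	{
		struct scatterlist *sg;
		int i;

		/*
		 * Produce DMA addresses for the sg list without any cache
		 * clean/invalidate, so the cache-maintenance cost disappears
		 * from the numbers.  dev and dir are unused here.
		 */
		for_each_sg(sgl, sg, nents, i) {
			sg->dma_address = sg_phys(sg);
			sg_dma_len(sg) = sg->length;
		}

		return nents;
	}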

>>  If yes, can you please help to catch the mistakes?
> I will take a look.
>
>> --
>> Regards,
>> Shawn
>>
>>
> Regards,
> Per
>
Russell King - ARM Linux April 28, 2011, 10:10 a.m. UTC | #16
On Thu, Apr 28, 2011 at 09:52:17AM +0200, Per Forlin wrote:
> I wanted to test the performance without cache penalty but removing
> dma_map_sg may not work since it produces the physical mapped sg list.
> This is not as simple as I first thought. Make a copy for dma_map_sg
> (call it dma_map_sg_no_cache) and modify it to not clean/inc the
> cache. Replace dma_map_sg with dma_map_sg_no_cache in the mxs-mmc
> driver.
> Removing dma_unmap should be ok for this test case.
> Do you still get very low numbers?

We can live in the hope that this is gathering evidence to illustrate
why having DMA-incoherent caches is bad news for performance, and that
one day ARM Ltd will give us proper DMA cache coherency for all devices.

Patch

diff --git a/drivers/mmc/host/mxs-mmc.c b/drivers/mmc/host/mxs-mmc.c
index 99d39a6..63c2ae2 100644
--- a/drivers/mmc/host/mxs-mmc.c
+++ b/drivers/mmc/host/mxs-mmc.c
@@ -137,6 +137,10 @@ 
 
 #define SSP_PIO_NUM	3
 
+struct mxs_mmc_next {
+	s32 cookie;
+};
+
 struct mxs_mmc_host {
 	struct mmc_host			*mmc;
 	struct mmc_request		*mrq;
@@ -154,6 +158,7 @@  struct mxs_mmc_host {
 	struct mxs_dma_data		dma_data;
 	unsigned int			dma_dir;
 	u32				ssp_pio_words[SSP_PIO_NUM];
+	struct mxs_mmc_next		next_data;
 
 	unsigned int			version;
 	unsigned char			bus_width;
@@ -302,6 +307,31 @@  static irqreturn_t mxs_mmc_irq_handler(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
+static int mxs_mmc_prep_dma_data(struct mxs_mmc_host *host,
+				struct mmc_data *data,
+				struct mxs_mmc_next *next)
+{
+	if (!next && data->host_cookie &&
+	    data->host_cookie != host->next_data.cookie) {
+		printk(KERN_WARNING "[%s] invalid cookie: data->host_cookie %d"
+		       " host->next_data.cookie %d\n",
+		       __func__, data->host_cookie, host->next_data.cookie);
+		data->host_cookie = 0;
+	}
+
+	/* Check if next job is already prepared */
+	if (next || (!next && data->host_cookie != host->next_data.cookie))
+		if (dma_map_sg(mmc_dev(host->mmc), data->sg, data->sg_len,
+			       (data->flags & MMC_DATA_WRITE) ?
+			       DMA_TO_DEVICE : DMA_FROM_DEVICE) == 0)
+			return -EINVAL;
+
+	if (next)
+		data->host_cookie = ++next->cookie < 0 ? 1 : next->cookie;
+
+	return 0;
+}
+
 static struct dma_async_tx_descriptor *mxs_mmc_prep_dma(
 	struct mxs_mmc_host *host, unsigned int append)
 {
@@ -312,8 +342,8 @@  static struct dma_async_tx_descriptor *mxs_mmc_prep_dma(
 
 	if (data) {
 		/* data */
-		dma_map_sg(mmc_dev(host->mmc), data->sg,
-			   data->sg_len, host->dma_dir);
+		if (mxs_mmc_prep_dma_data(host, data, NULL))
+			return NULL;
 		sgl = data->sg;
 		sg_len = data->sg_len;
 	} else {
@@ -328,9 +358,11 @@  static struct dma_async_tx_descriptor *mxs_mmc_prep_dma(
 		desc->callback = mxs_mmc_dma_irq_callback;
 		desc->callback_param = host;
 	} else {
-		if (data)
+		if (data) {
 			dma_unmap_sg(mmc_dev(host->mmc), data->sg,
 				     data->sg_len, host->dma_dir);
+			data->host_cookie = 0;
+		}
 	}
 
 	return desc;
@@ -553,6 +585,40 @@  static void mxs_mmc_start_cmd(struct mxs_mmc_host *host,
 	}
 }
 
+static void mxs_mmc_pre_req(struct mmc_host *mmc, struct mmc_request *mrq,
+			    bool is_first_req)
+{
+	struct mxs_mmc_host *host = mmc_priv(mmc);
+	struct mmc_data *data = mrq->data;
+
+	if (!data)
+		return;
+
+	if (data->host_cookie) {
+		data->host_cookie = 0;
+		return;
+	}
+
+	if (mxs_mmc_prep_dma_data(host, data, &host->next_data))
+		data->host_cookie = 0;
+}
+
+static void mxs_mmc_post_req(struct mmc_host *mmc, struct mmc_request *mrq,
+			     int err)
+{
+	struct mxs_mmc_host *host = mmc_priv(mmc);
+	struct mmc_data *data = mrq->data;
+
+	if (!data)
+		return;
+
+	if (data->host_cookie) {
+		dma_unmap_sg(mmc_dev(host->mmc), data->sg,
+			     data->sg_len, host->dma_dir);
+		data->host_cookie = 0;
+	}
+}
+
 static void mxs_mmc_request(struct mmc_host *mmc, struct mmc_request *mrq)
 {
 	struct mxs_mmc_host *host = mmc_priv(mmc);
@@ -644,6 +710,8 @@  static void mxs_mmc_enable_sdio_irq(struct mmc_host *mmc, int enable)
 }
 
 static const struct mmc_host_ops mxs_mmc_ops = {
+	.pre_req = mxs_mmc_pre_req,
+	.post_req = mxs_mmc_post_req,
 	.request = mxs_mmc_request,
 	.get_ro = mxs_mmc_get_ro,
 	.get_cd = mxs_mmc_get_cd,
@@ -708,6 +776,7 @@  static int mxs_mmc_probe(struct platform_device *pdev)
 	host->dma_res = dmares;
 	host->irq = irq_err;
 	host->sdio_irq_en = 0;
+	host->next_data.cookie = 1;
 
 	host->clk = clk_get(&pdev->dev, NULL);
 	if (IS_ERR(host->clk)) {