diff mbox

[V4,09/11] mmc: block: Add CQE support

Message ID AM4PR0401MB23240F1551DA4B2F0BB36B51908B0@AM4PR0401MB2324.eurprd04.prod.outlook.com (mailing list archive)
State New, archived
Headers show

Commit Message

Bough Chen Aug. 9, 2017, 7:57 a.m. UTC
> -----Original Message-----
> From: linux-mmc-owner@vger.kernel.org [mailto:linux-mmc-
> owner@vger.kernel.org] On Behalf Of Adrian Hunter
> Sent: Wednesday, August 09, 2017 1:58 PM
> To: Shawn Lin <shawn.lin@rock-chips.com>; Bough Chen
> <haibo.chen@nxp.com>
> Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-
> mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>; Mateusz
> Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov
> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung <jh80.chung@samsung.com>;
> Dong Aisheng <dongas86@gmail.com>; Das Asutosh
> <asutoshd@codeaurora.org>; Zhangfei Gao <zhangfei.gao@gmail.com>;
> Dorfman Konstantin <kdorfman@codeaurora.org>; Sahitya Tummala
> <stummala@codeaurora.org>; Harjani Ritesh <riteshh@codeaurora.org>; Venu
> Byravarasu <vbyravarasu@nvidia.com>; Linus Walleij <linus.walleij@linaro.org>
> Subject: Re: [PATCH V4 09/11] mmc: block: Add CQE support
> 
> On 09/08/17 03:55, Shawn Lin wrote:
> > Hi,
> >
> > On 2017/8/8 20:07, Bough Chen wrote:
> >>> -----Original Message-----
> >>> From: Adrian Hunter [mailto:adrian.hunter@intel.com]
> >>> Sent: Friday, July 21, 2017 5:50 PM
> >>> To: Ulf Hansson <ulf.hansson@linaro.org>
> >>> Cc: linux-mmc <linux-mmc@vger.kernel.org>; Bough Chen
> >>> <haibo.chen@nxp.com>; Alex Lemberg <alex.lemberg@sandisk.com>;
> >>> Mateusz Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov
> >>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung
> >>> <jh80.chung@samsung.com>; Dong Aisheng <dongas86@gmail.com>; Das
> >>> Asutosh <asutoshd@codeaurora.org>; Zhangfei Gao
> >>> <zhangfei.gao@gmail.com>; Dorfman Konstantin
> >>> <kdorfman@codeaurora.org>; David Griego <david.griego@linaro.org>;
> >>> Sahitya Tummala <stummala@codeaurora.org>; Harjani Ritesh
> >>> <riteshh@codeaurora.org>; Venu Byravarasu <vbyravarasu@nvidia.com>;
> >>> Linus Walleij <linus.walleij@linaro.org>; Shawn Lin
> >>> <shawn.lin@rock-chips.com>
> >>> Subject: [PATCH V4 09/11] mmc: block: Add CQE support
> >>>
> >>> Add CQE support to the block driver, including:
> >>>     - optionally using DCMD for flush requests
> >>>     - manually issuing discard requests
> >>>     - issuing read / write requests to the CQE
> >>>     - supporting block-layer timeouts
> >>>     - handling recovery
> >>>     - supporting re-tuning
> >>>
> >>> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
> >>> ---
> >>>   drivers/mmc/core/block.c | 195
> ++++++++++++++++++++++++++++++++-
> >>>   drivers/mmc/core/block.h |   7 ++
> >>>   drivers/mmc/core/queue.c | 273
> >>> ++++++++++++++++++++++++++++++++++++++++++++++-
> >>>   drivers/mmc/core/queue.h |  42 +++++++-
> >>>   4 files changed, 510 insertions(+), 7 deletions(-)
> >>>
> >>> diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
> >>> index
> >>> 915290c74363..2d25115637b7 100644
> >>> --- a/drivers/mmc/core/block.c
> >>> +++ b/drivers/mmc/core/block.c
> >>> @@ -109,6 +109,7 @@ struct mmc_blk_data {
> >>>   #define MMC_BLK_WRITE        BIT(1)
> >>>   #define MMC_BLK_DISCARD        BIT(2)
> >>>   #define MMC_BLK_SECDISCARD    BIT(3)
> >>> +#define MMC_BLK_CQE_RECOVERY    BIT(4)
> >>>
> >>>       /*
> >>>        * Only set in main mmc_blk_data associated @@ -1612,6
> >>> +1613,198 @@ static void mmc_blk_data_prep(struct mmc_queue *mq,
> >>> struct mmc_queue_req *mqrq,
> >>>           *do_data_tag_p = do_data_tag;
> >>>   }
> >>>
> >>> +#define MMC_CQE_RETRIES 2
> >
> >
> >>> +        blk_queue_rq_timed_out(mq->queue, mmc_cqe_timed_out);
> >>> +        blk_queue_rq_timeout(mq->queue, 60 * HZ);
> >>
> >
> > ------8<-------
> >
> >> Hi Adrian,
> >>
> >> These days I'm doing CMDQ stress test, and find one issue.
> >> On our i.MX8QXP-ARM2 board, the RAM is 3GB. eMMC is 32GB.
> >> I use command 'free -m' get the total memory is 2800M, and the free
> >> memory is 2500M.
> >>
> >> I use 'mkfs.ext4' to format ext4 file system on the eMMC under
> >> HS400ES CMDQ mode, works fine.
> >>
> >> When I use the following command to stress test CMDQ, it works fine.
> >> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 2048 -r 1024
> >>
> >> But when I change to use a large file size to do the same stress
> >> test, using
> >> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 4096 -r 2048
> >> or
> >> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 5600
> >>
> >> I get the following dump message.  According to the log,
> >> mmc_cqe_timed_out() was trigged.
> >> Seems mmc was blocked in somewhere.
> >> Then I try to debug this issue, and open MMC_DEBUG in config, do the
> >> same test, print the detail Command sending information on the
> >> console, but finally can't reproduce.
> 
> mmc_cqe_timed_out() is a 60 second timeout provided by the block layer.
> Refer "blk_queue_rq_timeout(mq->queue, 60 * HZ)" in mmc_init_queue().
> 60s is quite a long time so I would first want to determine if the task was really
> queued that long.  I would instrument some code into cqhci_request() to
> record the start time on struct mmc_request, and then print the time taken
> when there is a problem.
> 

Hi Adrian, 

According to your suggestion, I add the following code to print the time.
When issue happens, seems the request really pending for over 60s!

done
Writing intelligently...[  689.209548] mmc0: cqhci: timeout for tag 9
[  689.213658] the mrq all use 62123742 us
[  689.217487] mmc0: cqhci: ============ CQHCI REGISTER DUMP ===========
[  689.223927] mmc0: cqhci: Caps:      0x0000310a | Version:  0x00000510
[  689.230363] mmc0: cqhci: Config:    0x00001001 | Control:  0x00000000
[  689.236800] mmc0: cqhci: Int stat:  0x00000000 | Int enab: 0x00000006
[  689.243238] mmc0: cqhci: Int sig:   0x00000006 | Int Coal: 0x00000000
[  689.249675] mmc0: cqhci: TDL base:  0x90079000 | TDL up32: 0x00000000
[  689.256113] mmc0: cqhci: Doorbell:  0x1fffffff | TCN:      0x00000000
[  689.262550] mmc0: cqhci: Dev queue: 0x1fffefff | Dev Pend: 0x1fff7fff
[  689.268988] mmc0: cqhci: Task clr:  0x00000000 | SSC1:     0x00011000
[  689.275425] mmc0: cqhci: SSC2:      0x00000001 | DCMD rsp: 0x00000800
[  689.281862] mmc0: cqhci: RED mask:  0xfdf9a080 | TERRI:    0x00000000
[  689.288300] mmc0: cqhci: Resp idx:  0x0000002f | Resp arg: 0x00000900
[  689.294737] mmc0: sdhci: ============ SDHCI REGISTER DUMP ===========
[  689.301176] mmc0: sdhci: Sys addr:  0xb602f000 | Version:  0x00000002
[  689.307612] mmc0: sdhci: Blk size:  0x00000200 | Blk cnt:  0x00000400
[  689.314050] mmc0: sdhci: Argument:  0x000f0400 | Trn mode: 0x00000023
[  689.320487] mmc0: sdhci: Present:   0x01fd858f | Host ctl: 0x00000030
[  689.326925] mmc0: sdhci: Power:     0x00000002 | Blk gap:  0x00000080
[  689.333362] mmc0: sdhci: Wake-up:   0x00000008 | Clock:    0x0000000f
[  689.339800] mmc0: sdhci: Timeout:   0x0000008f | Int stat: 0x00000000
[  689.346237] mmc0: sdhci: Int enab:  0x107f4000 | Sig enab: 0x107f4000
[  689.352674] mmc0: sdhci: AC12 err:  0x00000000 | Slot int: 0x00000502
[  689.359113] mmc0: sdhci: Caps:      0x07eb0000 | Caps_1:   0x8000b407
[  689.365549] mmc0: sdhci: Cmd:       0x00002c1a | Max curr: 0x00ffffff
[  689.371987] mmc0: sdhci: Resp[0]:   0x00000900 | Resp[1]:  0xffffffff
[  689.378424] mmc0: sdhci: Resp[2]:   0x328f5903 | Resp[3]:  0x00d02700
[  689.384861] mmc0: sdhci: Host ctl2: 0x00000008
[  689.389302] mmc0: sdhci: ADMA Err:  0x00000009 | ADMA Ptr: 0x9009a400
[  689.395737] mmc0: sdhci: ============================================
[  689.402212] mmc0: running CQE recovery



> >>
> >> Shawn,
> >> Can you have a try on your side?
> >>
> >
> > I think bonnie++ is almost the same disk test tool as iozone or fio. I
> > didn't saw this when testing CMDQ by fio, but I will try bonnie++ late
> > this week.
> >
> >
> >>
> >> [  738.385610] mmc0: cqhci: timeout for tag 1 [  738.389719] mmc0:
> >> cqhci: ============ CQHCI REGISTER DUMP ===========
> >> [  738.396164] mmc0: cqhci: Caps:      0x0000310a | Version:  0x00000510
> >> [  738.402601] mmc0: cqhci: Config:    0x00001001 | Control:  0x00000000
> >> [  738.409038] mmc0: cqhci: Int stat:  0x00000000 | Int enab: 0x00000006
> >> [  738.415475] mmc0: cqhci: Int sig:   0x00000006 | Int Coal: 0x00000000
> >> [  738.421913] mmc0: cqhci: TDL base:  0x9007a000 | TDL up32: 0x00000000
> >> [  738.428350] mmc0: cqhci: Doorbell:  0x1fffffff | TCN:      0x00000000
> >> [  738.434788] mmc0: cqhci: Dev queue: 0x1f7fffff | Dev Pend: 0x1fffefff
> >> [  738.441226] mmc0: cqhci: Task clr:  0x00000000 | SSC1:     0x00011000
> >> [  738.447663] mmc0: cqhci: SSC2:      0x00000001 | DCMD rsp: 0x00000800
> >> [  738.454100] mmc0: cqhci: RED mask:  0xfdf9a080 | TERRI:    0x00000000
> >> [  738.460538] mmc0: cqhci: Resp idx:  0x0000002f | Resp arg:
> >> 0x00000900 [  738.466975] mmc0: sdhci: ============ SDHCI REGISTER
> >> DUMP =========== [  738.473414] mmc0: sdhci: Sys addr:  0xb6512000 |
> >> Version:  0x00000002 [  738.479850] mmc0: sdhci: Blk size:
> >> 0x00000200 | Blk cnt:  0x00000400 [  738.486288] mmc0: sdhci: Argument:
> 0x000c0400 | Trn mode: 0x00000023
> >> [  738.492725] mmc0: sdhci: Present:   0x01fd858f | Host ctl: 0x00000030
> >> [  738.499162] mmc0: sdhci: Power:     0x00000002 | Blk gap:  0x00000080
> >> [  738.505600] mmc0: sdhci: Wake-up:   0x00000008 | Clock:    0x0000000f
> >> [  738.512037] mmc0: sdhci: Timeout:   0x0000008f | Int stat: 0x00000000
> >> [  738.518475] mmc0: sdhci: Int enab:  0x107f4000 | Sig enab:
> >> 0x107f4000 [  738.524912] mmc0: sdhci: AC12 err:  0x00000000 | Slot int:
> 0x00000502
> >> [  738.531350] mmc0: sdhci: Caps:      0x07eb0000 | Caps_1:   0x8000b407
> >> [  738.537787] mmc0: sdhci: Cmd:       0x00002c1a | Max curr: 0x00ffffff
> >> [  738.544225] mmc0: sdhci: Resp[0]:   0x00000900 | Resp[1]:  0xffffffff
> >> [  738.550662] mmc0: sdhci: Resp[2]:   0x328f5903 | Resp[3]:  0x00d02700
> >> [  738.557099] mmc0: sdhci: Host ctl2: 0x00000008 [  738.561540]
> >> mmc0: sdhci: ADMA Err:  0x00000009 | ADMA Ptr: 0x90098400 [
> >> 738.567975] mmc0: sdhci:
> ============================================
> >> [  738.574449] mmc0: running CQE recovery [  738.593643] mmc0:
> >> Unexpected interrupt 0x00004000.
> >> [  738.598436] mmc0: sdhci: ============ SDHCI REGISTER DUMP
> >> =========== [  738.604881] mmc0: sdhci: Sys addr:  0x00000000 |
> >> Version:  0x00000002 [  738.611318] mmc0: sdhci: Blk size:
> >> 0x00000200 | Blk cnt:  0x00000400 [  738.617756] mmc0: sdhci: Argument:
> 0x01af6800 | Trn mode: 0x00000023
> >> [  738.624193] mmc0: sdhci: Present:   0x01fd8009 | Host ctl: 0x00000031
> >> [  738.630630] mmc0: sdhci: Power:     0x00000002 | Blk gap:  0x00000080
> >> [  738.637068] mmc0: sdhci: Wake-up:   0x00000008 | Clock:    0x0000000f
> >> [  738.643505] mmc0: sdhci: Timeout:   0x0000008f | Int stat: 0x00004000
> >> [  738.649943] mmc0: sdhci: Int enab:  0x007f1003 | Sig enab:
> >> 0x007f1003 [  738.656380] mmc0: sdhci: AC12 err:  0x00000000 | Slot int:
> 0x00000502
> >> [  738.662818] mmc0: sdhci: Caps:      0x07eb0000 | Caps_1:   0x8000b407
> >> [  738.669255] mmc0: sdhci: Cmd:       0x00002d12 | Max curr: 0x00ffffff
> >> [  738.675693] mmc0: sdhci: Resp[0]:   0x00000c00 | Resp[1]:  0xffffffff
> >> [  738.682130] mmc0: sdhci: Resp[2]:   0x328f5903 | Resp[3]:  0x00d02700
> >> [  738.688566] mmc0: sdhci: Host ctl2: 0x00000008 [  738.693008]
> >> mmc0: sdhci: ADMA Err:  0x00000000 | ADMA Ptr: 0x00000000 [
> >> 738.699443] mmc0: sdhci:
> ============================================
> >> [  738.715999] mmc0: Controller never released inhibit bit(s).
> >> [  738.721573] mmc0: sdhci: ============ SDHCI REGISTER DUMP
> >> =========== [  738.728018] mmc0: sdhci: Sys addr:  0x00000000 |
> >> Version:  0x00000002 [  738.734455] mmc0: sdhci: Blk size:
> >> 0x00000200 | Blk cnt:  0x00000400 [  738.740892] mmc0: sdhci: Argument:
> 0x01af6800 | Trn mode: 0x00000023
> >> [  738.747330] mmc0: sdhci: Present:   0x01fd8009 | Host ctl: 0x00000031
> >> [  738.753767] mmc0: sdhci: Power:     0x00000002 | Blk gap:  0x00000080
> >> [  738.760204] mmc0: sdhci: Wake-up:   0x00000008 | Clock:    0x0000000f
> >> [  738.766642] mmc0: sdhci: Timeout:   0x0000008f | Int stat: 0x00004000
> >> [  738.773079] mmc0: sdhci: Int enab:  0x007f1003 | Sig enab:
> >> 0x007f1003 [  738.779517] mmc0: sdhci: AC12 err:  0x00000000 | Slot int:
> 0x00000502
> >> [  738.785955] mmc0: sdhci: Caps:      0x07eb0000 | Caps_1:   0x8000b407
> >> [  738.792392] mmc0: sdhci: Cmd:       0x00002d12 | Max curr: 0x00ffffff
> >> [  738.798829] mmc0: sdhci: Resp[0]:   0x00000c00 | Resp[1]:  0xffffffff
> >> [  738.805266] mmc0: sdhci: Resp[2]:   0x328f5903 | Resp[3]:  0x00d02700
> >> [  738.811703] mmc0: sdhci: Host ctl2: 0x00000008 [  738.816144]
> >> mmc0: sdhci: ADMA Err:  0x00000000 | ADMA Ptr: 0x00000000 [
> >> 738.822579] mmc0: sdhci:
> ============================================
> >> [  748.881580] mmc0: Timeout waiting for hardware interrupt.
> >> ......
> >>
> >>
> >>> +
> >>> +        host->cqe_recovery_notifier = mmc_cqe_recovery_notifier;
> >>> +    }
> >>> +
> >>>       blk_queue_prep_rq(mq->queue, mmc_prep_request);
> >>>       queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue);
> >>>       queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, mq-
> >>>> queue); @@ -280,9 +543,9 @@ int mmc_init_queue(struct mmc_queue
> >>>> *mq,
> >>> struct mmc_card *card,
> >>>
> >>>       sema_init(&mq->thread_sem, 1);
> >>>
> >>> -    mq->thread = kthread_run(mmc_queue_thread, mq, "mmcqd/%d%s",
> >>> -        host->index, subname ? subname : "");
> >>> -
> >>> +    mq->thread = kthread_run(use_cqe ? mmc_cqe_thread :
> >>> mmc_queue_thread,
> >>> +                 mq, "mmcqd/%d%s", host->index,
> >>> +                 subname ? subname : "");
> >>>       if (IS_ERR(mq->thread)) {
> >>>           ret = PTR_ERR(mq->thread);
> >>>           goto cleanup_queue;
> >>> diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h
> >>> index
> >>> 361b46408e0f..8e9273d977c0 100644
> >>> --- a/drivers/mmc/core/queue.h
> >>> +++ b/drivers/mmc/core/queue.h
> >>> @@ -7,6 +7,20 @@
> >>>   #include <linux/mmc/core.h>
> >>>   #include <linux/mmc/host.h>
> >>>
> >>> +enum mmc_issued {
> >>> +    MMC_REQ_STARTED,
> >>> +    MMC_REQ_BUSY,
> >>> +    MMC_REQ_FAILED_TO_START,
> >>> +    MMC_REQ_FINISHED,
> >>> +};
> >>> +
> >>> +enum mmc_issue_type {
> >>> +    MMC_ISSUE_SYNC,
> >>> +    MMC_ISSUE_DCMD,
> >>> +    MMC_ISSUE_ASYNC,
> >>> +    MMC_ISSUE_MAX,
> >>> +};
> >>> +
> >>>   static inline struct mmc_queue_req *req_to_mmc_queue_req(struct
> >>> request
> >>> *rq)  {
> >>>       return blk_mq_rq_to_pdu(rq);
> >>> @@ -53,6 +67,7 @@ struct mmc_queue_req {
> >>>       int            drv_op_result;
> >>>       struct mmc_blk_ioc_data    **idata;
> >>>       unsigned int        ioc_count;
> >>> +    int            retries;
> >>>   };
> >>>
> >>>   struct mmc_queue {
> >>> @@ -70,10 +85,17 @@ struct mmc_queue {
> >>>        * associated mmc_queue_req data.
> >>>        */
> >>>       int            qcnt;
> >>> +    /* Following are defined for a Command Queue Engine */
> >>> +    int            cqe_in_flight[MMC_ISSUE_MAX];
> >>> +    unsigned int        cqe_busy;
> >>> +    bool            cqe_recovery_needed;
> >>> +    bool            cqe_in_recovery;
> >>> +#define MMC_CQE_DCMD_BUSY    BIT(0)
> >>> +#define MMC_CQE_QUEUE_FULL    BIT(1)
> >>>   };
> >>>
> >>>   extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *,
> >>> spinlock_t *,
> >>> -              const char *);
> >>> +              const char *, int);
> >>>   extern void mmc_cleanup_queue(struct mmc_queue *);  extern void
> >>> mmc_queue_suspend(struct mmc_queue *);  extern void
> >>> mmc_queue_resume(struct mmc_queue *); @@ -85,4 +107,22 @@
> extern
> >>> unsigned int mmc_queue_map_sg(struct mmc_queue *,
> >>>
> >>>   extern int mmc_access_rpmb(struct mmc_queue *);
> >>>
> >>> +void mmc_cqe_kick_queue(struct mmc_queue *mq);
> >>> +
> >>> +enum mmc_issue_type mmc_cqe_issue_type(struct mmc_host *host,
> >>> +                       struct request *req);
> >>> +
> >>> +static inline int mmc_cqe_tot_in_flight(struct mmc_queue *mq) {
> >>> +    return mq->cqe_in_flight[MMC_ISSUE_SYNC] +
> >>> +           mq->cqe_in_flight[MMC_ISSUE_DCMD] +
> >>> +           mq->cqe_in_flight[MMC_ISSUE_ASYNC];
> >>> +}
> >>> +
> >>> +static inline int mmc_cqe_qcnt(struct mmc_queue *mq) {
> >>> +    return mq->cqe_in_flight[MMC_ISSUE_DCMD] +
> >>> +           mq->cqe_in_flight[MMC_ISSUE_ASYNC];
> >>> +}
> >>> +
> >>>   #endif
> >>> --
> >>> 1.9.1
> >>
> >>
> >>
> >>
> >
> >
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-mmc" in the
> body of a message to majordomo@vger.kernel.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html

Comments

Adrian Hunter Aug. 9, 2017, 8:16 a.m. UTC | #1
On 09/08/17 10:57, Bough Chen wrote:
>> -----Original Message-----
>> From: linux-mmc-owner@vger.kernel.org [mailto:linux-mmc-
>> owner@vger.kernel.org] On Behalf Of Adrian Hunter
>> Sent: Wednesday, August 09, 2017 1:58 PM
>> To: Shawn Lin <shawn.lin@rock-chips.com>; Bough Chen
>> <haibo.chen@nxp.com>
>> Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-
>> mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>; Mateusz
>> Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov
>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung <jh80.chung@samsung.com>;
>> Dong Aisheng <dongas86@gmail.com>; Das Asutosh
>> <asutoshd@codeaurora.org>; Zhangfei Gao <zhangfei.gao@gmail.com>;
>> Dorfman Konstantin <kdorfman@codeaurora.org>; Sahitya Tummala
>> <stummala@codeaurora.org>; Harjani Ritesh <riteshh@codeaurora.org>; Venu
>> Byravarasu <vbyravarasu@nvidia.com>; Linus Walleij <linus.walleij@linaro.org>
>> Subject: Re: [PATCH V4 09/11] mmc: block: Add CQE support
>>
>> On 09/08/17 03:55, Shawn Lin wrote:
>>> Hi,
>>>
>>> On 2017/8/8 20:07, Bough Chen wrote:
>>>>> -----Original Message-----
>>>>> From: Adrian Hunter [mailto:adrian.hunter@intel.com]
>>>>> Sent: Friday, July 21, 2017 5:50 PM
>>>>> To: Ulf Hansson <ulf.hansson@linaro.org>
>>>>> Cc: linux-mmc <linux-mmc@vger.kernel.org>; Bough Chen
>>>>> <haibo.chen@nxp.com>; Alex Lemberg <alex.lemberg@sandisk.com>;
>>>>> Mateusz Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov
>>>>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung
>>>>> <jh80.chung@samsung.com>; Dong Aisheng <dongas86@gmail.com>; Das
>>>>> Asutosh <asutoshd@codeaurora.org>; Zhangfei Gao
>>>>> <zhangfei.gao@gmail.com>; Dorfman Konstantin
>>>>> <kdorfman@codeaurora.org>; David Griego <david.griego@linaro.org>;
>>>>> Sahitya Tummala <stummala@codeaurora.org>; Harjani Ritesh
>>>>> <riteshh@codeaurora.org>; Venu Byravarasu <vbyravarasu@nvidia.com>;
>>>>> Linus Walleij <linus.walleij@linaro.org>; Shawn Lin
>>>>> <shawn.lin@rock-chips.com>
>>>>> Subject: [PATCH V4 09/11] mmc: block: Add CQE support
>>>>>
>>>>> Add CQE support to the block driver, including:
>>>>>     - optionally using DCMD for flush requests
>>>>>     - manually issuing discard requests
>>>>>     - issuing read / write requests to the CQE
>>>>>     - supporting block-layer timeouts
>>>>>     - handling recovery
>>>>>     - supporting re-tuning
>>>>>
>>>>> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
>>>>> ---
>>>>>   drivers/mmc/core/block.c | 195
>> ++++++++++++++++++++++++++++++++-
>>>>>   drivers/mmc/core/block.h |   7 ++
>>>>>   drivers/mmc/core/queue.c | 273
>>>>> ++++++++++++++++++++++++++++++++++++++++++++++-
>>>>>   drivers/mmc/core/queue.h |  42 +++++++-
>>>>>   4 files changed, 510 insertions(+), 7 deletions(-)
>>>>>
>>>>> diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
>>>>> index
>>>>> 915290c74363..2d25115637b7 100644
>>>>> --- a/drivers/mmc/core/block.c
>>>>> +++ b/drivers/mmc/core/block.c
>>>>> @@ -109,6 +109,7 @@ struct mmc_blk_data {
>>>>>   #define MMC_BLK_WRITE        BIT(1)
>>>>>   #define MMC_BLK_DISCARD        BIT(2)
>>>>>   #define MMC_BLK_SECDISCARD    BIT(3)
>>>>> +#define MMC_BLK_CQE_RECOVERY    BIT(4)
>>>>>
>>>>>       /*
>>>>>        * Only set in main mmc_blk_data associated @@ -1612,6
>>>>> +1613,198 @@ static void mmc_blk_data_prep(struct mmc_queue *mq,
>>>>> struct mmc_queue_req *mqrq,
>>>>>           *do_data_tag_p = do_data_tag;
>>>>>   }
>>>>>
>>>>> +#define MMC_CQE_RETRIES 2
>>>
>>>
>>>>> +        blk_queue_rq_timed_out(mq->queue, mmc_cqe_timed_out);
>>>>> +        blk_queue_rq_timeout(mq->queue, 60 * HZ);
>>>>
>>>
>>> ------8<-------
>>>
>>>> Hi Adrian,
>>>>
>>>> These days I'm doing CMDQ stress test, and find one issue.
>>>> On our i.MX8QXP-ARM2 board, the RAM is 3GB. eMMC is 32GB.
>>>> I use command 'free -m' get the total memory is 2800M, and the free
>>>> memory is 2500M.
>>>>
>>>> I use 'mkfs.ext4' to format ext4 file system on the eMMC under
>>>> HS400ES CMDQ mode, works fine.
>>>>
>>>> When I use the following command to stress test CMDQ, it works fine.
>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 2048 -r 1024
>>>>
>>>> But when I change to use a large file size to do the same stress
>>>> test, using
>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 4096 -r 2048
>>>> or
>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 5600
>>>>
>>>> I get the following dump message.  According to the log,
>>>> mmc_cqe_timed_out() was trigged.
>>>> Seems mmc was blocked in somewhere.
>>>> Then I try to debug this issue, and open MMC_DEBUG in config, do the
>>>> same test, print the detail Command sending information on the
>>>> console, but finally can't reproduce.
>>
>> mmc_cqe_timed_out() is a 60 second timeout provided by the block layer.
>> Refer "blk_queue_rq_timeout(mq->queue, 60 * HZ)" in mmc_init_queue().
>> 60s is quite a long time so I would first want to determine if the task was really
>> queued that long.  I would instrument some code into cqhci_request() to
>> record the start time on struct mmc_request, and then print the time taken
>> when there is a problem.
>>
> 
> Hi Adrian, 
> 
> According to your suggestion, I add the following code to print the time.
> When issue happens, seems the request really pending for over 60s!
> 
> done
> Writing intelligently...[  689.209548] mmc0: cqhci: timeout for tag 9
> [  689.213658] the mrq all use 62123742 us
> [  689.217487] mmc0: cqhci: ============ CQHCI REGISTER DUMP ===========
> [  689.223927] mmc0: cqhci: Caps:      0x0000310a | Version:  0x00000510
> [  689.230363] mmc0: cqhci: Config:    0x00001001 | Control:  0x00000000
> [  689.236800] mmc0: cqhci: Int stat:  0x00000000 | Int enab: 0x00000006
> [  689.243238] mmc0: cqhci: Int sig:   0x00000006 | Int Coal: 0x00000000
> [  689.249675] mmc0: cqhci: TDL base:  0x90079000 | TDL up32: 0x00000000
> [  689.256113] mmc0: cqhci: Doorbell:  0x1fffffff | TCN:      0x00000000
> [  689.262550] mmc0: cqhci: Dev queue: 0x1fffefff | Dev Pend: 0x1fff7fff
> [  689.268988] mmc0: cqhci: Task clr:  0x00000000 | SSC1:     0x00011000
> [  689.275425] mmc0: cqhci: SSC2:      0x00000001 | DCMD rsp: 0x00000800
> [  689.281862] mmc0: cqhci: RED mask:  0xfdf9a080 | TERRI:    0x00000000
> [  689.288300] mmc0: cqhci: Resp idx:  0x0000002f | Resp arg: 0x00000900
> [  689.294737] mmc0: sdhci: ============ SDHCI REGISTER DUMP ===========
> [  689.301176] mmc0: sdhci: Sys addr:  0xb602f000 | Version:  0x00000002
> [  689.307612] mmc0: sdhci: Blk size:  0x00000200 | Blk cnt:  0x00000400
> [  689.314050] mmc0: sdhci: Argument:  0x000f0400 | Trn mode: 0x00000023
> [  689.320487] mmc0: sdhci: Present:   0x01fd858f | Host ctl: 0x00000030
> [  689.326925] mmc0: sdhci: Power:     0x00000002 | Blk gap:  0x00000080
> [  689.333362] mmc0: sdhci: Wake-up:   0x00000008 | Clock:    0x0000000f
> [  689.339800] mmc0: sdhci: Timeout:   0x0000008f | Int stat: 0x00000000
> [  689.346237] mmc0: sdhci: Int enab:  0x107f4000 | Sig enab: 0x107f4000
> [  689.352674] mmc0: sdhci: AC12 err:  0x00000000 | Slot int: 0x00000502
> [  689.359113] mmc0: sdhci: Caps:      0x07eb0000 | Caps_1:   0x8000b407
> [  689.365549] mmc0: sdhci: Cmd:       0x00002c1a | Max curr: 0x00ffffff
> [  689.371987] mmc0: sdhci: Resp[0]:   0x00000900 | Resp[1]:  0xffffffff
> [  689.378424] mmc0: sdhci: Resp[2]:   0x328f5903 | Resp[3]:  0x00d02700
> [  689.384861] mmc0: sdhci: Host ctl2: 0x00000008
> [  689.389302] mmc0: sdhci: ADMA Err:  0x00000009 | ADMA Ptr: 0x9009a400
> [  689.395737] mmc0: sdhci: ============================================
> [  689.402212] mmc0: running CQE recovery

Tag 9 has been queued (bit set in Dev Pend) which means it is up to the eMMC
to select it for execution.  You should dump the times for the other mrq's
to see how long they have been waiting and try to determine if anything is
being processed.

If the eMMC is just taking a really long time to process tasks we could
extend the timeout, but it is hard to see how that is acceptable to a final
product.  At this point it looks like the eMMC may have a flaw in the way it
selects tasks for execution.
--
To unsubscribe from this list: send the line "unsubscribe linux-mmc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Adrian Hunter Aug. 9, 2017, 8:30 a.m. UTC | #2
On 09/08/17 11:16, Adrian Hunter wrote:
> On 09/08/17 10:57, Bough Chen wrote:
>>> -----Original Message-----
>>> From: linux-mmc-owner@vger.kernel.org [mailto:linux-mmc-
>>> owner@vger.kernel.org] On Behalf Of Adrian Hunter
>>> Sent: Wednesday, August 09, 2017 1:58 PM
>>> To: Shawn Lin <shawn.lin@rock-chips.com>; Bough Chen
>>> <haibo.chen@nxp.com>
>>> Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-
>>> mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>; Mateusz
>>> Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov
>>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung <jh80.chung@samsung.com>;
>>> Dong Aisheng <dongas86@gmail.com>; Das Asutosh
>>> <asutoshd@codeaurora.org>; Zhangfei Gao <zhangfei.gao@gmail.com>;
>>> Dorfman Konstantin <kdorfman@codeaurora.org>; Sahitya Tummala
>>> <stummala@codeaurora.org>; Harjani Ritesh <riteshh@codeaurora.org>; Venu
>>> Byravarasu <vbyravarasu@nvidia.com>; Linus Walleij <linus.walleij@linaro.org>
>>> Subject: Re: [PATCH V4 09/11] mmc: block: Add CQE support
>>>
>>> On 09/08/17 03:55, Shawn Lin wrote:
>>>> Hi,
>>>>
>>>> On 2017/8/8 20:07, Bough Chen wrote:
>>>>>> -----Original Message-----
>>>>>> From: Adrian Hunter [mailto:adrian.hunter@intel.com]
>>>>>> Sent: Friday, July 21, 2017 5:50 PM
>>>>>> To: Ulf Hansson <ulf.hansson@linaro.org>
>>>>>> Cc: linux-mmc <linux-mmc@vger.kernel.org>; Bough Chen
>>>>>> <haibo.chen@nxp.com>; Alex Lemberg <alex.lemberg@sandisk.com>;
>>>>>> Mateusz Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov
>>>>>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung
>>>>>> <jh80.chung@samsung.com>; Dong Aisheng <dongas86@gmail.com>; Das
>>>>>> Asutosh <asutoshd@codeaurora.org>; Zhangfei Gao
>>>>>> <zhangfei.gao@gmail.com>; Dorfman Konstantin
>>>>>> <kdorfman@codeaurora.org>; David Griego <david.griego@linaro.org>;
>>>>>> Sahitya Tummala <stummala@codeaurora.org>; Harjani Ritesh
>>>>>> <riteshh@codeaurora.org>; Venu Byravarasu <vbyravarasu@nvidia.com>;
>>>>>> Linus Walleij <linus.walleij@linaro.org>; Shawn Lin
>>>>>> <shawn.lin@rock-chips.com>
>>>>>> Subject: [PATCH V4 09/11] mmc: block: Add CQE support
>>>>>>
>>>>>> Add CQE support to the block driver, including:
>>>>>>     - optionally using DCMD for flush requests
>>>>>>     - manually issuing discard requests
>>>>>>     - issuing read / write requests to the CQE
>>>>>>     - supporting block-layer timeouts
>>>>>>     - handling recovery
>>>>>>     - supporting re-tuning
>>>>>>
>>>>>> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
>>>>>> ---
>>>>>>   drivers/mmc/core/block.c | 195
>>> ++++++++++++++++++++++++++++++++-
>>>>>>   drivers/mmc/core/block.h |   7 ++
>>>>>>   drivers/mmc/core/queue.c | 273
>>>>>> ++++++++++++++++++++++++++++++++++++++++++++++-
>>>>>>   drivers/mmc/core/queue.h |  42 +++++++-
>>>>>>   4 files changed, 510 insertions(+), 7 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
>>>>>> index
>>>>>> 915290c74363..2d25115637b7 100644
>>>>>> --- a/drivers/mmc/core/block.c
>>>>>> +++ b/drivers/mmc/core/block.c
>>>>>> @@ -109,6 +109,7 @@ struct mmc_blk_data {
>>>>>>   #define MMC_BLK_WRITE        BIT(1)
>>>>>>   #define MMC_BLK_DISCARD        BIT(2)
>>>>>>   #define MMC_BLK_SECDISCARD    BIT(3)
>>>>>> +#define MMC_BLK_CQE_RECOVERY    BIT(4)
>>>>>>
>>>>>>       /*
>>>>>>        * Only set in main mmc_blk_data associated @@ -1612,6
>>>>>> +1613,198 @@ static void mmc_blk_data_prep(struct mmc_queue *mq,
>>>>>> struct mmc_queue_req *mqrq,
>>>>>>           *do_data_tag_p = do_data_tag;
>>>>>>   }
>>>>>>
>>>>>> +#define MMC_CQE_RETRIES 2
>>>>
>>>>
>>>>>> +        blk_queue_rq_timed_out(mq->queue, mmc_cqe_timed_out);
>>>>>> +        blk_queue_rq_timeout(mq->queue, 60 * HZ);
>>>>>
>>>>
>>>> ------8<-------
>>>>
>>>>> Hi Adrian,
>>>>>
>>>>> These days I'm doing CMDQ stress test, and find one issue.
>>>>> On our i.MX8QXP-ARM2 board, the RAM is 3GB. eMMC is 32GB.
>>>>> I use command 'free -m' get the total memory is 2800M, and the free
>>>>> memory is 2500M.
>>>>>
>>>>> I use 'mkfs.ext4' to format ext4 file system on the eMMC under
>>>>> HS400ES CMDQ mode, works fine.
>>>>>
>>>>> When I use the following command to stress test CMDQ, it works fine.
>>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 2048 -r 1024
>>>>>
>>>>> But when I change to use a large file size to do the same stress
>>>>> test, using
>>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 4096 -r 2048
>>>>> or
>>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 5600
>>>>>
>>>>> I get the following dump message.  According to the log,
>>>>> mmc_cqe_timed_out() was trigged.
>>>>> Seems mmc was blocked in somewhere.
>>>>> Then I try to debug this issue, and open MMC_DEBUG in config, do the
>>>>> same test, print the detail Command sending information on the
>>>>> console, but finally can't reproduce.
>>>
>>> mmc_cqe_timed_out() is a 60 second timeout provided by the block layer.
>>> Refer "blk_queue_rq_timeout(mq->queue, 60 * HZ)" in mmc_init_queue().
>>> 60s is quite a long time so I would first want to determine if the task was really
>>> queued that long.  I would instrument some code into cqhci_request() to
>>> record the start time on struct mmc_request, and then print the time taken
>>> when there is a problem.
>>>
>>
>> Hi Adrian, 
>>
>> According to your suggestion, I add the following code to print the time.
>> When issue happens, seems the request really pending for over 60s!
>>
>> done
>> Writing intelligently...[  689.209548] mmc0: cqhci: timeout for tag 9
>> [  689.213658] the mrq all use 62123742 us
>> [  689.217487] mmc0: cqhci: ============ CQHCI REGISTER DUMP ===========
>> [  689.223927] mmc0: cqhci: Caps:      0x0000310a | Version:  0x00000510
>> [  689.230363] mmc0: cqhci: Config:    0x00001001 | Control:  0x00000000
>> [  689.236800] mmc0: cqhci: Int stat:  0x00000000 | Int enab: 0x00000006
>> [  689.243238] mmc0: cqhci: Int sig:   0x00000006 | Int Coal: 0x00000000
>> [  689.249675] mmc0: cqhci: TDL base:  0x90079000 | TDL up32: 0x00000000
>> [  689.256113] mmc0: cqhci: Doorbell:  0x1fffffff | TCN:      0x00000000
>> [  689.262550] mmc0: cqhci: Dev queue: 0x1fffefff | Dev Pend: 0x1fff7fff
>> [  689.268988] mmc0: cqhci: Task clr:  0x00000000 | SSC1:     0x00011000
>> [  689.275425] mmc0: cqhci: SSC2:      0x00000001 | DCMD rsp: 0x00000800
>> [  689.281862] mmc0: cqhci: RED mask:  0xfdf9a080 | TERRI:    0x00000000
>> [  689.288300] mmc0: cqhci: Resp idx:  0x0000002f | Resp arg: 0x00000900
>> [  689.294737] mmc0: sdhci: ============ SDHCI REGISTER DUMP ===========
>> [  689.301176] mmc0: sdhci: Sys addr:  0xb602f000 | Version:  0x00000002
>> [  689.307612] mmc0: sdhci: Blk size:  0x00000200 | Blk cnt:  0x00000400
>> [  689.314050] mmc0: sdhci: Argument:  0x000f0400 | Trn mode: 0x00000023
>> [  689.320487] mmc0: sdhci: Present:   0x01fd858f | Host ctl: 0x00000030
>> [  689.326925] mmc0: sdhci: Power:     0x00000002 | Blk gap:  0x00000080
>> [  689.333362] mmc0: sdhci: Wake-up:   0x00000008 | Clock:    0x0000000f
>> [  689.339800] mmc0: sdhci: Timeout:   0x0000008f | Int stat: 0x00000000
>> [  689.346237] mmc0: sdhci: Int enab:  0x107f4000 | Sig enab: 0x107f4000
>> [  689.352674] mmc0: sdhci: AC12 err:  0x00000000 | Slot int: 0x00000502
>> [  689.359113] mmc0: sdhci: Caps:      0x07eb0000 | Caps_1:   0x8000b407
>> [  689.365549] mmc0: sdhci: Cmd:       0x00002c1a | Max curr: 0x00ffffff
>> [  689.371987] mmc0: sdhci: Resp[0]:   0x00000900 | Resp[1]:  0xffffffff
>> [  689.378424] mmc0: sdhci: Resp[2]:   0x328f5903 | Resp[3]:  0x00d02700
>> [  689.384861] mmc0: sdhci: Host ctl2: 0x00000008
>> [  689.389302] mmc0: sdhci: ADMA Err:  0x00000009 | ADMA Ptr: 0x9009a400
>> [  689.395737] mmc0: sdhci: ============================================
>> [  689.402212] mmc0: running CQE recovery
> 
> Tag 9 has been queued (bit set in Dev Pend) which means it is up to the eMMC
> to select it for execution.  You should dump the times for the other mrq's
> to see how long they have been waiting and try to determine if anything is
> being processed.
> 
> If the eMMC is just taking a really long time to process tasks we could
> extend the timeout, but it is hard to see how that is acceptable to a final
> product.  At this point it looks like the eMMC may have a flaw in the way it
> selects tasks for execution.

No, that is wrong sorry, the task is in the QSR (Dev queue) so it is the CQE
that has not selected it.
--
To unsubscribe from this list: send the line "unsubscribe linux-mmc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bough Chen Aug. 9, 2017, 9:41 a.m. UTC | #3
> -----Original Message-----

> From: linux-mmc-owner@vger.kernel.org [mailto:linux-mmc-

> owner@vger.kernel.org] On Behalf Of Adrian Hunter

> Sent: Wednesday, August 09, 2017 4:31 PM

> To: Bough Chen <haibo.chen@nxp.com>; Shawn Lin <shawn.lin@rock-

> chips.com>

> Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-

> mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>; Mateusz

> Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov

> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung <jh80.chung@samsung.com>;

> Dong Aisheng <dongas86@gmail.com>; Das Asutosh

> <asutoshd@codeaurora.org>; Zhangfei Gao <zhangfei.gao@gmail.com>;

> Dorfman Konstantin <kdorfman@codeaurora.org>; Sahitya Tummala

> <stummala@codeaurora.org>; Harjani Ritesh <riteshh@codeaurora.org>; Venu

> Byravarasu <vbyravarasu@nvidia.com>; Linus Walleij <linus.walleij@linaro.org>

> Subject: Re: [PATCH V4 09/11] mmc: block: Add CQE support

> 

> On 09/08/17 11:16, Adrian Hunter wrote:

> > On 09/08/17 10:57, Bough Chen wrote:

> >>> -----Original Message-----

> >>> From: linux-mmc-owner@vger.kernel.org [mailto:linux-mmc-

> >>> owner@vger.kernel.org] On Behalf Of Adrian Hunter

> >>> Sent: Wednesday, August 09, 2017 1:58 PM

> >>> To: Shawn Lin <shawn.lin@rock-chips.com>; Bough Chen

> >>> <haibo.chen@nxp.com>

> >>> Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-

> >>> mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>;

> >>> Mateusz Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov

> >>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung

> >>> <jh80.chung@samsung.com>; Dong Aisheng <dongas86@gmail.com>; Das

> >>> Asutosh <asutoshd@codeaurora.org>; Zhangfei Gao

> >>> <zhangfei.gao@gmail.com>; Dorfman Konstantin

> >>> <kdorfman@codeaurora.org>; Sahitya Tummala

> >>> <stummala@codeaurora.org>; Harjani Ritesh <riteshh@codeaurora.org>;

> >>> Venu Byravarasu <vbyravarasu@nvidia.com>; Linus Walleij

> >>> <linus.walleij@linaro.org>

> >>> Subject: Re: [PATCH V4 09/11] mmc: block: Add CQE support

> >>>

> >>> On 09/08/17 03:55, Shawn Lin wrote:

> >>>> Hi,

> >>>>

> >>>> On 2017/8/8 20:07, Bough Chen wrote:

> >>>>>> -----Original Message-----

> >>>>>> From: Adrian Hunter [mailto:adrian.hunter@intel.com]

> >>>>>> Sent: Friday, July 21, 2017 5:50 PM

> >>>>>> To: Ulf Hansson <ulf.hansson@linaro.org>

> >>>>>> Cc: linux-mmc <linux-mmc@vger.kernel.org>; Bough Chen

> >>>>>> <haibo.chen@nxp.com>; Alex Lemberg <alex.lemberg@sandisk.com>;

> >>>>>> Mateusz Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov

> >>>>>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung

> >>>>>> <jh80.chung@samsung.com>; Dong Aisheng <dongas86@gmail.com>;

> Das

> >>>>>> Asutosh <asutoshd@codeaurora.org>; Zhangfei Gao

> >>>>>> <zhangfei.gao@gmail.com>; Dorfman Konstantin

> >>>>>> <kdorfman@codeaurora.org>; David Griego

> >>>>>> <david.griego@linaro.org>; Sahitya Tummala

> >>>>>> <stummala@codeaurora.org>; Harjani Ritesh

> >>>>>> <riteshh@codeaurora.org>; Venu Byravarasu

> >>>>>> <vbyravarasu@nvidia.com>; Linus Walleij

> >>>>>> <linus.walleij@linaro.org>; Shawn Lin <shawn.lin@rock-chips.com>

> >>>>>> Subject: [PATCH V4 09/11] mmc: block: Add CQE support

> >>>>>>

> >>>>>> Add CQE support to the block driver, including:

> >>>>>>     - optionally using DCMD for flush requests

> >>>>>>     - manually issuing discard requests

> >>>>>>     - issuing read / write requests to the CQE

> >>>>>>     - supporting block-layer timeouts

> >>>>>>     - handling recovery

> >>>>>>     - supporting re-tuning

> >>>>>>

> >>>>>> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>

> >>>>>> ---

> >>>>>>   drivers/mmc/core/block.c | 195

> >>> ++++++++++++++++++++++++++++++++-

> >>>>>>   drivers/mmc/core/block.h |   7 ++

> >>>>>>   drivers/mmc/core/queue.c | 273

> >>>>>> ++++++++++++++++++++++++++++++++++++++++++++++-

> >>>>>>   drivers/mmc/core/queue.h |  42 +++++++-

> >>>>>>   4 files changed, 510 insertions(+), 7 deletions(-)

> >>>>>>

> >>>>>> diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c

> >>>>>> index

> >>>>>> 915290c74363..2d25115637b7 100644

> >>>>>> --- a/drivers/mmc/core/block.c

> >>>>>> +++ b/drivers/mmc/core/block.c

> >>>>>> @@ -109,6 +109,7 @@ struct mmc_blk_data {

> >>>>>>   #define MMC_BLK_WRITE        BIT(1)

> >>>>>>   #define MMC_BLK_DISCARD        BIT(2)

> >>>>>>   #define MMC_BLK_SECDISCARD    BIT(3)

> >>>>>> +#define MMC_BLK_CQE_RECOVERY    BIT(4)

> >>>>>>

> >>>>>>       /*

> >>>>>>        * Only set in main mmc_blk_data associated @@ -1612,6

> >>>>>> +1613,198 @@ static void mmc_blk_data_prep(struct mmc_queue

> *mq,

> >>>>>> struct mmc_queue_req *mqrq,

> >>>>>>           *do_data_tag_p = do_data_tag;

> >>>>>>   }

> >>>>>>

> >>>>>> +#define MMC_CQE_RETRIES 2

> >>>>

> >>>>

> >>>>>> +        blk_queue_rq_timed_out(mq->queue, mmc_cqe_timed_out);

> >>>>>> +        blk_queue_rq_timeout(mq->queue, 60 * HZ);

> >>>>>

> >>>>

> >>>> ------8<-------

> >>>>

> >>>>> Hi Adrian,

> >>>>>

> >>>>> These days I'm doing CMDQ stress test, and find one issue.

> >>>>> On our i.MX8QXP-ARM2 board, the RAM is 3GB. eMMC is 32GB.

> >>>>> I use command 'free -m' get the total memory is 2800M, and the

> >>>>> free memory is 2500M.

> >>>>>

> >>>>> I use 'mkfs.ext4' to format ext4 file system on the eMMC under

> >>>>> HS400ES CMDQ mode, works fine.

> >>>>>

> >>>>> When I use the following command to stress test CMDQ, it works fine.

> >>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 2048 -r 1024

> >>>>>

> >>>>> But when I change to use a large file size to do the same stress

> >>>>> test, using

> >>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 4096 -r 2048

> >>>>> or

> >>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 5600

> >>>>>

> >>>>> I get the following dump message.  According to the log,

> >>>>> mmc_cqe_timed_out() was trigged.

> >>>>> Seems mmc was blocked in somewhere.

> >>>>> Then I try to debug this issue, and open MMC_DEBUG in config, do

> >>>>> the same test, print the detail Command sending information on the

> >>>>> console, but finally can't reproduce.

> >>>

> >>> mmc_cqe_timed_out() is a 60 second timeout provided by the block layer.

> >>> Refer "blk_queue_rq_timeout(mq->queue, 60 * HZ)" in

> mmc_init_queue().

> >>> 60s is quite a long time so I would first want to determine if the

> >>> task was really queued that long.  I would instrument some code into

> >>> cqhci_request() to record the start time on struct mmc_request, and

> >>> then print the time taken when there is a problem.

> >>>

> >>

> >> Hi Adrian,

> >>

> >> According to your suggestion, I add the following code to print the time.

> >> When issue happens, seems the request really pending for over 60s!

> >>

> >> done

> >> Writing intelligently...[  689.209548] mmc0: cqhci: timeout for tag 9

> >> [  689.213658] the mrq all use 62123742 us [  689.217487] mmc0:

> >> cqhci: ============ CQHCI REGISTER DUMP ===========

> >> [  689.223927] mmc0: cqhci: Caps:      0x0000310a | Version:  0x00000510

> >> [  689.230363] mmc0: cqhci: Config:    0x00001001 | Control:  0x00000000

> >> [  689.236800] mmc0: cqhci: Int stat:  0x00000000 | Int enab: 0x00000006

> >> [  689.243238] mmc0: cqhci: Int sig:   0x00000006 | Int Coal: 0x00000000

> >> [  689.249675] mmc0: cqhci: TDL base:  0x90079000 | TDL up32: 0x00000000

> >> [  689.256113] mmc0: cqhci: Doorbell:  0x1fffffff | TCN:      0x00000000

> >> [  689.262550] mmc0: cqhci: Dev queue: 0x1fffefff | Dev Pend: 0x1fff7fff

> >> [  689.268988] mmc0: cqhci: Task clr:  0x00000000 | SSC1:     0x00011000

> >> [  689.275425] mmc0: cqhci: SSC2:      0x00000001 | DCMD rsp: 0x00000800

> >> [  689.281862] mmc0: cqhci: RED mask:  0xfdf9a080 | TERRI:    0x00000000

> >> [  689.288300] mmc0: cqhci: Resp idx:  0x0000002f | Resp arg:

> >> 0x00000900 [  689.294737] mmc0: sdhci: ============ SDHCI REGISTER

> >> DUMP =========== [  689.301176] mmc0: sdhci: Sys addr:  0xb602f000 |

> >> Version:  0x00000002 [  689.307612] mmc0: sdhci: Blk size:

> >> 0x00000200 | Blk cnt:  0x00000400 [  689.314050] mmc0: sdhci: Argument:

> 0x000f0400 | Trn mode: 0x00000023

> >> [  689.320487] mmc0: sdhci: Present:   0x01fd858f | Host ctl: 0x00000030

> >> [  689.326925] mmc0: sdhci: Power:     0x00000002 | Blk gap:  0x00000080

> >> [  689.333362] mmc0: sdhci: Wake-up:   0x00000008 | Clock:    0x0000000f

> >> [  689.339800] mmc0: sdhci: Timeout:   0x0000008f | Int stat: 0x00000000

> >> [  689.346237] mmc0: sdhci: Int enab:  0x107f4000 | Sig enab:

> >> 0x107f4000 [  689.352674] mmc0: sdhci: AC12 err:  0x00000000 | Slot int:

> 0x00000502

> >> [  689.359113] mmc0: sdhci: Caps:      0x07eb0000 | Caps_1:   0x8000b407

> >> [  689.365549] mmc0: sdhci: Cmd:       0x00002c1a | Max curr: 0x00ffffff

> >> [  689.371987] mmc0: sdhci: Resp[0]:   0x00000900 | Resp[1]:  0xffffffff

> >> [  689.378424] mmc0: sdhci: Resp[2]:   0x328f5903 | Resp[3]:  0x00d02700

> >> [  689.384861] mmc0: sdhci: Host ctl2: 0x00000008 [  689.389302]

> >> mmc0: sdhci: ADMA Err:  0x00000009 | ADMA Ptr: 0x9009a400 [

> >> 689.395737] mmc0: sdhci:

> ============================================

> >> [  689.402212] mmc0: running CQE recovery

> >

> > Tag 9 has been queued (bit set in Dev Pend) which means it is up to

> > the eMMC to select it for execution.  You should dump the times for

> > the other mrq's to see how long they have been waiting and try to

> > determine if anything is being processed.

> >

> > If the eMMC is just taking a really long time to process tasks we

> > could extend the timeout, but it is hard to see how that is acceptable

> > to a final product.  At this point it looks like the eMMC may have a

> > flaw in the way it selects tasks for execution.

> 

> No, that is wrong sorry, the task is in the QSR (Dev queue) so it is the CQE that

> has not selected it.


The timeout tag is 9, for Dev queue: 0x1fffefff, bit 9 is 1, means task 9 already queue in eMMC. 
For Dev Pend: 0x1fff7fff, the bit 9 is also 1,  which means CQE already send CMD44 and CMD45, but still
not send CMD46/47.  Seems our CQE pending tag 9 for over 60s! I will check with our IC guys to confirm
the hardware mechanism.


> --

> To unsubscribe from this list: send the line "unsubscribe linux-mmc" in the

> body of a message to majordomo@vger.kernel.org More majordomo info at

> http://vger.kernel.org/majordomo-info.html
Bough Chen Aug. 9, 2017, 10:35 a.m. UTC | #4
> -----Original Message-----

> From: linux-mmc-owner@vger.kernel.org [mailto:linux-mmc-

> owner@vger.kernel.org] On Behalf Of Bough Chen

> Sent: Wednesday, August 09, 2017 5:42 PM

> To: Adrian Hunter <adrian.hunter@intel.com>; Shawn Lin <shawn.lin@rock-

> chips.com>

> Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-

> mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>; Mateusz

> Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov

> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung <jh80.chung@samsung.com>;

> Dong Aisheng <dongas86@gmail.com>; Das Asutosh

> <asutoshd@codeaurora.org>; Zhangfei Gao <zhangfei.gao@gmail.com>;

> Dorfman Konstantin <kdorfman@codeaurora.org>; Sahitya Tummala

> <stummala@codeaurora.org>; Harjani Ritesh <riteshh@codeaurora.org>; Venu

> Byravarasu <vbyravarasu@nvidia.com>; Linus Walleij <linus.walleij@linaro.org>

> Subject: RE: [PATCH V4 09/11] mmc: block: Add CQE support

> 

> > -----Original Message-----

> > From: linux-mmc-owner@vger.kernel.org [mailto:linux-mmc-

> > owner@vger.kernel.org] On Behalf Of Adrian Hunter

> > Sent: Wednesday, August 09, 2017 4:31 PM

> > To: Bough Chen <haibo.chen@nxp.com>; Shawn Lin <shawn.lin@rock-

> > chips.com>

> > Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-

> > mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>;

> Mateusz

> > Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov

> > <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung <jh80.chung@samsung.com>;

> > Dong Aisheng <dongas86@gmail.com>; Das Asutosh

> > <asutoshd@codeaurora.org>; Zhangfei Gao <zhangfei.gao@gmail.com>;

> > Dorfman Konstantin <kdorfman@codeaurora.org>; Sahitya Tummala

> > <stummala@codeaurora.org>; Harjani Ritesh <riteshh@codeaurora.org>;

> > Venu Byravarasu <vbyravarasu@nvidia.com>; Linus Walleij

> > <linus.walleij@linaro.org>

> > Subject: Re: [PATCH V4 09/11] mmc: block: Add CQE support

> >

> > On 09/08/17 11:16, Adrian Hunter wrote:

> > > On 09/08/17 10:57, Bough Chen wrote:

> > >>> -----Original Message-----

> > >>> From: linux-mmc-owner@vger.kernel.org [mailto:linux-mmc-

> > >>> owner@vger.kernel.org] On Behalf Of Adrian Hunter

> > >>> Sent: Wednesday, August 09, 2017 1:58 PM

> > >>> To: Shawn Lin <shawn.lin@rock-chips.com>; Bough Chen

> > >>> <haibo.chen@nxp.com>

> > >>> Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-

> > >>> mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>;

> > >>> Mateusz Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov

> > >>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung

> > >>> <jh80.chung@samsung.com>; Dong Aisheng <dongas86@gmail.com>;

> Das

> > >>> Asutosh <asutoshd@codeaurora.org>; Zhangfei Gao

> > >>> <zhangfei.gao@gmail.com>; Dorfman Konstantin

> > >>> <kdorfman@codeaurora.org>; Sahitya Tummala

> > >>> <stummala@codeaurora.org>; Harjani Ritesh

> > >>> <riteshh@codeaurora.org>; Venu Byravarasu

> > >>> <vbyravarasu@nvidia.com>; Linus Walleij <linus.walleij@linaro.org>

> > >>> Subject: Re: [PATCH V4 09/11] mmc: block: Add CQE support

> > >>>

> > >>> On 09/08/17 03:55, Shawn Lin wrote:

> > >>>> Hi,

> > >>>>

> > >>>> On 2017/8/8 20:07, Bough Chen wrote:

> > >>>>>> -----Original Message-----

> > >>>>>> From: Adrian Hunter [mailto:adrian.hunter@intel.com]

> > >>>>>> Sent: Friday, July 21, 2017 5:50 PM

> > >>>>>> To: Ulf Hansson <ulf.hansson@linaro.org>

> > >>>>>> Cc: linux-mmc <linux-mmc@vger.kernel.org>; Bough Chen

> > >>>>>> <haibo.chen@nxp.com>; Alex Lemberg

> <alex.lemberg@sandisk.com>;

> > >>>>>> Mateusz Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov

> > >>>>>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung

> > >>>>>> <jh80.chung@samsung.com>; Dong Aisheng

> <dongas86@gmail.com>;

> > Das

> > >>>>>> Asutosh <asutoshd@codeaurora.org>; Zhangfei Gao

> > >>>>>> <zhangfei.gao@gmail.com>; Dorfman Konstantin

> > >>>>>> <kdorfman@codeaurora.org>; David Griego

> > >>>>>> <david.griego@linaro.org>; Sahitya Tummala

> > >>>>>> <stummala@codeaurora.org>; Harjani Ritesh

> > >>>>>> <riteshh@codeaurora.org>; Venu Byravarasu

> > >>>>>> <vbyravarasu@nvidia.com>; Linus Walleij

> > >>>>>> <linus.walleij@linaro.org>; Shawn Lin

> > >>>>>> <shawn.lin@rock-chips.com>

> > >>>>>> Subject: [PATCH V4 09/11] mmc: block: Add CQE support

> > >>>>>>

> > >>>>>> Add CQE support to the block driver, including:

> > >>>>>>     - optionally using DCMD for flush requests

> > >>>>>>     - manually issuing discard requests

> > >>>>>>     - issuing read / write requests to the CQE

> > >>>>>>     - supporting block-layer timeouts

> > >>>>>>     - handling recovery

> > >>>>>>     - supporting re-tuning

> > >>>>>>

> > >>>>>> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>

> > >>>>>> ---

> > >>>>>>   drivers/mmc/core/block.c | 195

> > >>> ++++++++++++++++++++++++++++++++-

> > >>>>>>   drivers/mmc/core/block.h |   7 ++

> > >>>>>>   drivers/mmc/core/queue.c | 273

> > >>>>>> ++++++++++++++++++++++++++++++++++++++++++++++-

> > >>>>>>   drivers/mmc/core/queue.h |  42 +++++++-

> > >>>>>>   4 files changed, 510 insertions(+), 7 deletions(-)

> > >>>>>>

> > >>>>>> diff --git a/drivers/mmc/core/block.c

> > >>>>>> b/drivers/mmc/core/block.c index

> > >>>>>> 915290c74363..2d25115637b7 100644

> > >>>>>> --- a/drivers/mmc/core/block.c

> > >>>>>> +++ b/drivers/mmc/core/block.c

> > >>>>>> @@ -109,6 +109,7 @@ struct mmc_blk_data {

> > >>>>>>   #define MMC_BLK_WRITE        BIT(1)

> > >>>>>>   #define MMC_BLK_DISCARD        BIT(2)

> > >>>>>>   #define MMC_BLK_SECDISCARD    BIT(3)

> > >>>>>> +#define MMC_BLK_CQE_RECOVERY    BIT(4)

> > >>>>>>

> > >>>>>>       /*

> > >>>>>>        * Only set in main mmc_blk_data associated @@ -1612,6

> > >>>>>> +1613,198 @@ static void mmc_blk_data_prep(struct mmc_queue

> > *mq,

> > >>>>>> struct mmc_queue_req *mqrq,

> > >>>>>>           *do_data_tag_p = do_data_tag;

> > >>>>>>   }

> > >>>>>>

> > >>>>>> +#define MMC_CQE_RETRIES 2

> > >>>>

> > >>>>

> > >>>>>> +        blk_queue_rq_timed_out(mq->queue, mmc_cqe_timed_out);

> > >>>>>> +        blk_queue_rq_timeout(mq->queue, 60 * HZ);

> > >>>>>

> > >>>>

> > >>>> ------8<-------

> > >>>>

> > >>>>> Hi Adrian,

> > >>>>>

> > >>>>> These days I'm doing CMDQ stress test, and find one issue.

> > >>>>> On our i.MX8QXP-ARM2 board, the RAM is 3GB. eMMC is 32GB.

> > >>>>> I use command 'free -m' get the total memory is 2800M, and the

> > >>>>> free memory is 2500M.

> > >>>>>

> > >>>>> I use 'mkfs.ext4' to format ext4 file system on the eMMC under

> > >>>>> HS400ES CMDQ mode, works fine.

> > >>>>>

> > >>>>> When I use the following command to stress test CMDQ, it works fine.

> > >>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 2048 -r 1024

> > >>>>>

> > >>>>> But when I change to use a large file size to do the same stress

> > >>>>> test, using

> > >>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 4096 -r 2048

> > >>>>> or

> > >>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 5600

> > >>>>>

> > >>>>> I get the following dump message.  According to the log,

> > >>>>> mmc_cqe_timed_out() was trigged.

> > >>>>> Seems mmc was blocked in somewhere.

> > >>>>> Then I try to debug this issue, and open MMC_DEBUG in config, do

> > >>>>> the same test, print the detail Command sending information on

> > >>>>> the console, but finally can't reproduce.

> > >>>

> > >>> mmc_cqe_timed_out() is a 60 second timeout provided by the block

> layer.

> > >>> Refer "blk_queue_rq_timeout(mq->queue, 60 * HZ)" in

> > mmc_init_queue().

> > >>> 60s is quite a long time so I would first want to determine if the

> > >>> task was really queued that long.  I would instrument some code

> > >>> into

> > >>> cqhci_request() to record the start time on struct mmc_request,

> > >>> and then print the time taken when there is a problem.

> > >>>

> > >>

> > >> Hi Adrian,

> > >>

> > >> According to your suggestion, I add the following code to print the time.

> > >> When issue happens, seems the request really pending for over 60s!

> > >>

> > >> done

> > >> Writing intelligently...[  689.209548] mmc0: cqhci: timeout for tag

> > >> 9 [  689.213658] the mrq all use 62123742 us [  689.217487] mmc0:

> > >> cqhci: ============ CQHCI REGISTER DUMP ===========

> > >> [  689.223927] mmc0: cqhci: Caps:      0x0000310a | Version:  0x00000510

> > >> [  689.230363] mmc0: cqhci: Config:    0x00001001 | Control:  0x00000000

> > >> [  689.236800] mmc0: cqhci: Int stat:  0x00000000 | Int enab: 0x00000006

> > >> [  689.243238] mmc0: cqhci: Int sig:   0x00000006 | Int Coal: 0x00000000

> > >> [  689.249675] mmc0: cqhci: TDL base:  0x90079000 | TDL up32: 0x00000000

> > >> [  689.256113] mmc0: cqhci: Doorbell:  0x1fffffff | TCN:      0x00000000

> > >> [  689.262550] mmc0: cqhci: Dev queue: 0x1fffefff | Dev Pend: 0x1fff7fff

> > >> [  689.268988] mmc0: cqhci: Task clr:  0x00000000 | SSC1:     0x00011000

> > >> [  689.275425] mmc0: cqhci: SSC2:      0x00000001 | DCMD rsp: 0x00000800

> > >> [  689.281862] mmc0: cqhci: RED mask:  0xfdf9a080 | TERRI:    0x00000000

> > >> [  689.288300] mmc0: cqhci: Resp idx:  0x0000002f | Resp arg:

> > >> 0x00000900 [  689.294737] mmc0: sdhci: ============ SDHCI REGISTER

> > >> DUMP =========== [  689.301176] mmc0: sdhci: Sys addr:  0xb602f000

> > >> |

> > >> Version:  0x00000002 [  689.307612] mmc0: sdhci: Blk size:

> > >> 0x00000200 | Blk cnt:  0x00000400 [  689.314050] mmc0: sdhci: Argument:

> > 0x000f0400 | Trn mode: 0x00000023

> > >> [  689.320487] mmc0: sdhci: Present:   0x01fd858f | Host ctl: 0x00000030

> > >> [  689.326925] mmc0: sdhci: Power:     0x00000002 | Blk gap:  0x00000080

> > >> [  689.333362] mmc0: sdhci: Wake-up:   0x00000008 | Clock:    0x0000000f

> > >> [  689.339800] mmc0: sdhci: Timeout:   0x0000008f | Int stat: 0x00000000

> > >> [  689.346237] mmc0: sdhci: Int enab:  0x107f4000 | Sig enab:

> > >> 0x107f4000 [  689.352674] mmc0: sdhci: AC12 err:  0x00000000 | Slot int:

> > 0x00000502

> > >> [  689.359113] mmc0: sdhci: Caps:      0x07eb0000 | Caps_1:   0x8000b407

> > >> [  689.365549] mmc0: sdhci: Cmd:       0x00002c1a | Max curr: 0x00ffffff

> > >> [  689.371987] mmc0: sdhci: Resp[0]:   0x00000900 | Resp[1]:  0xffffffff

> > >> [  689.378424] mmc0: sdhci: Resp[2]:   0x328f5903 | Resp[3]:  0x00d02700

> > >> [  689.384861] mmc0: sdhci: Host ctl2: 0x00000008 [  689.389302]

> > >> mmc0: sdhci: ADMA Err:  0x00000009 | ADMA Ptr: 0x9009a400 [

> > >> 689.395737] mmc0: sdhci:

> > ============================================

> > >> [  689.402212] mmc0: running CQE recovery

> > >

> > > Tag 9 has been queued (bit set in Dev Pend) which means it is up to

> > > the eMMC to select it for execution.  You should dump the times for

> > > the other mrq's to see how long they have been waiting and try to

> > > determine if anything is being processed.

> > >

> > > If the eMMC is just taking a really long time to process tasks we

> > > could extend the timeout, but it is hard to see how that is

> > > acceptable to a final product.  At this point it looks like the eMMC

> > > may have a flaw in the way it selects tasks for execution.

> >

> > No, that is wrong sorry, the task is in the QSR (Dev queue) so it is

> > the CQE that has not selected it.

> 

> The timeout tag is 9, for Dev queue: 0x1fffefff, bit 9 is 1, means task 9 already

> queue in eMMC.

> For Dev Pend: 0x1fff7fff, the bit 9 is also 1,  which means CQE already send

> CMD44 and CMD45, but still not send CMD46/47.  Seems our CQE pending tag 9

> for over 60s! I will check with our IC guys to confirm the hardware mechanism.

> 


For the eMMC chip, the sequential wirte speed test by 'dd' is around 100MB/s.
If each tag try to write 1GB data, which meas each tag needs 10s to complete, once
The number of pending tags exceed 6, 60s timeout will be trigged.

I'm not sure how bonnie++ works, I will try to dump the CMDQ script list to verify whether this is the root cause.
   
> 

> > --

> > To unsubscribe from this list: send the line "unsubscribe linux-mmc"

> > in the body of a message to majordomo@vger.kernel.org More majordomo

> > info at http://vger.kernel.org/majordomo-info.html

>   칻 & ~ &   +-  ݶ  w  ˛   m b  f ȧ   ܨ}   Ơz &j:+v        zZ+  +zf   h   ~    i   z  w   ?

> & )ߢf
Adrian Hunter Aug. 9, 2017, 12:45 p.m. UTC | #5
On 08/09/2017 01:35 PM, Bough Chen wrote:
>> -----Original Message-----
>> From: linux-mmc-owner@vger.kernel.org [mailto:linux-mmc-
>> owner@vger.kernel.org] On Behalf Of Bough Chen
>> Sent: Wednesday, August 09, 2017 5:42 PM
>> To: Adrian Hunter <adrian.hunter@intel.com>; Shawn Lin <shawn.lin@rock-
>> chips.com>
>> Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-
>> mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>; Mateusz
>> Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov
>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung <jh80.chung@samsung.com>;
>> Dong Aisheng <dongas86@gmail.com>; Das Asutosh
>> <asutoshd@codeaurora.org>; Zhangfei Gao <zhangfei.gao@gmail.com>;
>> Dorfman Konstantin <kdorfman@codeaurora.org>; Sahitya Tummala
>> <stummala@codeaurora.org>; Harjani Ritesh <riteshh@codeaurora.org>; Venu
>> Byravarasu <vbyravarasu@nvidia.com>; Linus Walleij <linus.walleij@linaro.org>
>> Subject: RE: [PATCH V4 09/11] mmc: block: Add CQE support
>>
>>> -----Original Message-----
>>> From: linux-mmc-owner@vger.kernel.org [mailto:linux-mmc-
>>> owner@vger.kernel.org] On Behalf Of Adrian Hunter
>>> Sent: Wednesday, August 09, 2017 4:31 PM
>>> To: Bough Chen <haibo.chen@nxp.com>; Shawn Lin <shawn.lin@rock-
>>> chips.com>
>>> Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-
>>> mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>;
>> Mateusz
>>> Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov
>>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung <jh80.chung@samsung.com>;
>>> Dong Aisheng <dongas86@gmail.com>; Das Asutosh
>>> <asutoshd@codeaurora.org>; Zhangfei Gao <zhangfei.gao@gmail.com>;
>>> Dorfman Konstantin <kdorfman@codeaurora.org>; Sahitya Tummala
>>> <stummala@codeaurora.org>; Harjani Ritesh <riteshh@codeaurora.org>;
>>> Venu Byravarasu <vbyravarasu@nvidia.com>; Linus Walleij
>>> <linus.walleij@linaro.org>
>>> Subject: Re: [PATCH V4 09/11] mmc: block: Add CQE support
>>>
>>> On 09/08/17 11:16, Adrian Hunter wrote:
>>>> On 09/08/17 10:57, Bough Chen wrote:
>>>>>> -----Original Message-----
>>>>>> From: linux-mmc-owner@vger.kernel.org [mailto:linux-mmc-
>>>>>> owner@vger.kernel.org] On Behalf Of Adrian Hunter
>>>>>> Sent: Wednesday, August 09, 2017 1:58 PM
>>>>>> To: Shawn Lin <shawn.lin@rock-chips.com>; Bough Chen
>>>>>> <haibo.chen@nxp.com>
>>>>>> Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-
>>>>>> mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>;
>>>>>> Mateusz Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov
>>>>>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung
>>>>>> <jh80.chung@samsung.com>; Dong Aisheng <dongas86@gmail.com>;
>> Das
>>>>>> Asutosh <asutoshd@codeaurora.org>; Zhangfei Gao
>>>>>> <zhangfei.gao@gmail.com>; Dorfman Konstantin
>>>>>> <kdorfman@codeaurora.org>; Sahitya Tummala
>>>>>> <stummala@codeaurora.org>; Harjani Ritesh
>>>>>> <riteshh@codeaurora.org>; Venu Byravarasu
>>>>>> <vbyravarasu@nvidia.com>; Linus Walleij <linus.walleij@linaro.org>
>>>>>> Subject: Re: [PATCH V4 09/11] mmc: block: Add CQE support
>>>>>>
>>>>>> On 09/08/17 03:55, Shawn Lin wrote:
>>>>>>> Hi,
>>>>>>>
>>>>>>> On 2017/8/8 20:07, Bough Chen wrote:
>>>>>>>>> -----Original Message-----
>>>>>>>>> From: Adrian Hunter [mailto:adrian.hunter@intel.com]
>>>>>>>>> Sent: Friday, July 21, 2017 5:50 PM
>>>>>>>>> To: Ulf Hansson <ulf.hansson@linaro.org>
>>>>>>>>> Cc: linux-mmc <linux-mmc@vger.kernel.org>; Bough Chen
>>>>>>>>> <haibo.chen@nxp.com>; Alex Lemberg
>> <alex.lemberg@sandisk.com>;
>>>>>>>>> Mateusz Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov
>>>>>>>>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung
>>>>>>>>> <jh80.chung@samsung.com>; Dong Aisheng
>> <dongas86@gmail.com>;
>>> Das
>>>>>>>>> Asutosh <asutoshd@codeaurora.org>; Zhangfei Gao
>>>>>>>>> <zhangfei.gao@gmail.com>; Dorfman Konstantin
>>>>>>>>> <kdorfman@codeaurora.org>; David Griego
>>>>>>>>> <david.griego@linaro.org>; Sahitya Tummala
>>>>>>>>> <stummala@codeaurora.org>; Harjani Ritesh
>>>>>>>>> <riteshh@codeaurora.org>; Venu Byravarasu
>>>>>>>>> <vbyravarasu@nvidia.com>; Linus Walleij
>>>>>>>>> <linus.walleij@linaro.org>; Shawn Lin
>>>>>>>>> <shawn.lin@rock-chips.com>
>>>>>>>>> Subject: [PATCH V4 09/11] mmc: block: Add CQE support
>>>>>>>>>
>>>>>>>>> Add CQE support to the block driver, including:
>>>>>>>>>     - optionally using DCMD for flush requests
>>>>>>>>>     - manually issuing discard requests
>>>>>>>>>     - issuing read / write requests to the CQE
>>>>>>>>>     - supporting block-layer timeouts
>>>>>>>>>     - handling recovery
>>>>>>>>>     - supporting re-tuning
>>>>>>>>>
>>>>>>>>> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
>>>>>>>>> ---
>>>>>>>>>   drivers/mmc/core/block.c | 195
>>>>>> ++++++++++++++++++++++++++++++++-
>>>>>>>>>   drivers/mmc/core/block.h |   7 ++
>>>>>>>>>   drivers/mmc/core/queue.c | 273
>>>>>>>>> ++++++++++++++++++++++++++++++++++++++++++++++-
>>>>>>>>>   drivers/mmc/core/queue.h |  42 +++++++-
>>>>>>>>>   4 files changed, 510 insertions(+), 7 deletions(-)
>>>>>>>>>
>>>>>>>>> diff --git a/drivers/mmc/core/block.c
>>>>>>>>> b/drivers/mmc/core/block.c index
>>>>>>>>> 915290c74363..2d25115637b7 100644
>>>>>>>>> --- a/drivers/mmc/core/block.c
>>>>>>>>> +++ b/drivers/mmc/core/block.c
>>>>>>>>> @@ -109,6 +109,7 @@ struct mmc_blk_data {
>>>>>>>>>   #define MMC_BLK_WRITE        BIT(1)
>>>>>>>>>   #define MMC_BLK_DISCARD        BIT(2)
>>>>>>>>>   #define MMC_BLK_SECDISCARD    BIT(3)
>>>>>>>>> +#define MMC_BLK_CQE_RECOVERY    BIT(4)
>>>>>>>>>
>>>>>>>>>       /*
>>>>>>>>>        * Only set in main mmc_blk_data associated @@ -1612,6
>>>>>>>>> +1613,198 @@ static void mmc_blk_data_prep(struct mmc_queue
>>> *mq,
>>>>>>>>> struct mmc_queue_req *mqrq,
>>>>>>>>>           *do_data_tag_p = do_data_tag;
>>>>>>>>>   }
>>>>>>>>>
>>>>>>>>> +#define MMC_CQE_RETRIES 2
>>>>>>>
>>>>>>>
>>>>>>>>> +        blk_queue_rq_timed_out(mq->queue, mmc_cqe_timed_out);
>>>>>>>>> +        blk_queue_rq_timeout(mq->queue, 60 * HZ);
>>>>>>>>
>>>>>>>
>>>>>>> ------8<-------
>>>>>>>
>>>>>>>> Hi Adrian,
>>>>>>>>
>>>>>>>> These days I'm doing CMDQ stress test, and find one issue.
>>>>>>>> On our i.MX8QXP-ARM2 board, the RAM is 3GB. eMMC is 32GB.
>>>>>>>> I use command 'free -m' get the total memory is 2800M, and the
>>>>>>>> free memory is 2500M.
>>>>>>>>
>>>>>>>> I use 'mkfs.ext4' to format ext4 file system on the eMMC under
>>>>>>>> HS400ES CMDQ mode, works fine.
>>>>>>>>
>>>>>>>> When I use the following command to stress test CMDQ, it works fine.
>>>>>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 2048 -r 1024
>>>>>>>>
>>>>>>>> But when I change to use a large file size to do the same stress
>>>>>>>> test, using
>>>>>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 4096 -r 2048
>>>>>>>> or
>>>>>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 5600
>>>>>>>>
>>>>>>>> I get the following dump message.  According to the log,
>>>>>>>> mmc_cqe_timed_out() was trigged.
>>>>>>>> Seems mmc was blocked in somewhere.
>>>>>>>> Then I try to debug this issue, and open MMC_DEBUG in config, do
>>>>>>>> the same test, print the detail Command sending information on
>>>>>>>> the console, but finally can't reproduce.
>>>>>>
>>>>>> mmc_cqe_timed_out() is a 60 second timeout provided by the block
>> layer.
>>>>>> Refer "blk_queue_rq_timeout(mq->queue, 60 * HZ)" in
>>> mmc_init_queue().
>>>>>> 60s is quite a long time so I would first want to determine if the
>>>>>> task was really queued that long.  I would instrument some code
>>>>>> into
>>>>>> cqhci_request() to record the start time on struct mmc_request,
>>>>>> and then print the time taken when there is a problem.
>>>>>>
>>>>>
>>>>> Hi Adrian,
>>>>>
>>>>> According to your suggestion, I add the following code to print the time.
>>>>> When issue happens, seems the request really pending for over 60s!
>>>>>
>>>>> done
>>>>> Writing intelligently...[  689.209548] mmc0: cqhci: timeout for tag
>>>>> 9 [  689.213658] the mrq all use 62123742 us [  689.217487] mmc0:
>>>>> cqhci: ============ CQHCI REGISTER DUMP ===========
>>>>> [  689.223927] mmc0: cqhci: Caps:      0x0000310a | Version:  0x00000510
>>>>> [  689.230363] mmc0: cqhci: Config:    0x00001001 | Control:  0x00000000
>>>>> [  689.236800] mmc0: cqhci: Int stat:  0x00000000 | Int enab: 0x00000006
>>>>> [  689.243238] mmc0: cqhci: Int sig:   0x00000006 | Int Coal: 0x00000000
>>>>> [  689.249675] mmc0: cqhci: TDL base:  0x90079000 | TDL up32: 0x00000000
>>>>> [  689.256113] mmc0: cqhci: Doorbell:  0x1fffffff | TCN:      0x00000000
>>>>> [  689.262550] mmc0: cqhci: Dev queue: 0x1fffefff | Dev Pend: 0x1fff7fff
>>>>> [  689.268988] mmc0: cqhci: Task clr:  0x00000000 | SSC1:     0x00011000
>>>>> [  689.275425] mmc0: cqhci: SSC2:      0x00000001 | DCMD rsp: 0x00000800
>>>>> [  689.281862] mmc0: cqhci: RED mask:  0xfdf9a080 | TERRI:    0x00000000
>>>>> [  689.288300] mmc0: cqhci: Resp idx:  0x0000002f | Resp arg:
>>>>> 0x00000900 [  689.294737] mmc0: sdhci: ============ SDHCI REGISTER
>>>>> DUMP =========== [  689.301176] mmc0: sdhci: Sys addr:  0xb602f000
>>>>> |
>>>>> Version:  0x00000002 [  689.307612] mmc0: sdhci: Blk size:
>>>>> 0x00000200 | Blk cnt:  0x00000400 [  689.314050] mmc0: sdhci: Argument:
>>> 0x000f0400 | Trn mode: 0x00000023
>>>>> [  689.320487] mmc0: sdhci: Present:   0x01fd858f | Host ctl: 0x00000030
>>>>> [  689.326925] mmc0: sdhci: Power:     0x00000002 | Blk gap:  0x00000080
>>>>> [  689.333362] mmc0: sdhci: Wake-up:   0x00000008 | Clock:    0x0000000f
>>>>> [  689.339800] mmc0: sdhci: Timeout:   0x0000008f | Int stat: 0x00000000
>>>>> [  689.346237] mmc0: sdhci: Int enab:  0x107f4000 | Sig enab:
>>>>> 0x107f4000 [  689.352674] mmc0: sdhci: AC12 err:  0x00000000 | Slot int:
>>> 0x00000502
>>>>> [  689.359113] mmc0: sdhci: Caps:      0x07eb0000 | Caps_1:   0x8000b407
>>>>> [  689.365549] mmc0: sdhci: Cmd:       0x00002c1a | Max curr: 0x00ffffff
>>>>> [  689.371987] mmc0: sdhci: Resp[0]:   0x00000900 | Resp[1]:  0xffffffff
>>>>> [  689.378424] mmc0: sdhci: Resp[2]:   0x328f5903 | Resp[3]:  0x00d02700
>>>>> [  689.384861] mmc0: sdhci: Host ctl2: 0x00000008 [  689.389302]
>>>>> mmc0: sdhci: ADMA Err:  0x00000009 | ADMA Ptr: 0x9009a400 [
>>>>> 689.395737] mmc0: sdhci:
>>> ============================================
>>>>> [  689.402212] mmc0: running CQE recovery
>>>>
>>>> Tag 9 has been queued (bit set in Dev Pend) which means it is up to
>>>> the eMMC to select it for execution.  You should dump the times for
>>>> the other mrq's to see how long they have been waiting and try to
>>>> determine if anything is being processed.
>>>>
>>>> If the eMMC is just taking a really long time to process tasks we
>>>> could extend the timeout, but it is hard to see how that is
>>>> acceptable to a final product.  At this point it looks like the eMMC
>>>> may have a flaw in the way it selects tasks for execution.
>>>
>>> No, that is wrong sorry, the task is in the QSR (Dev queue) so it is
>>> the CQE that has not selected it.
>>
>> The timeout tag is 9, for Dev queue: 0x1fffefff, bit 9 is 1, means task 9 already
>> queue in eMMC.
>> For Dev Pend: 0x1fff7fff, the bit 9 is also 1,  which means CQE already send
>> CMD44 and CMD45, but still not send CMD46/47.  Seems our CQE pending tag 9
>> for over 60s! I will check with our IC guys to confirm the hardware mechanism.
>>
> 
> For the eMMC chip, the sequential wirte speed test by 'dd' is around 100MB/s.
> If each tag try to write 1GB data, which meas each tag needs 10s to complete, once
> The number of pending tags exceed 6, 60s timeout will be trigged.

The request size is limited by the block layer due to host controller
parameters.  In the case of SDHCI to 512KiB.  So each tag is at most 512KiB.
--
To unsubscribe from this list: send the line "unsubscribe linux-mmc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Adrian Hunter Aug. 10, 2017, 10:19 a.m. UTC | #6
On 09/08/17 15:45, Adrian Hunter wrote:
> On 08/09/2017 01:35 PM, Bough Chen wrote:
>>> -----Original Message-----
>>> From: linux-mmc-owner@vger.kernel.org [mailto:linux-mmc-
>>> owner@vger.kernel.org] On Behalf Of Bough Chen
>>> Sent: Wednesday, August 09, 2017 5:42 PM
>>> To: Adrian Hunter <adrian.hunter@intel.com>; Shawn Lin <shawn.lin@rock-
>>> chips.com>
>>> Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-
>>> mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>; Mateusz
>>> Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov
>>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung <jh80.chung@samsung.com>;
>>> Dong Aisheng <dongas86@gmail.com>; Das Asutosh
>>> <asutoshd@codeaurora.org>; Zhangfei Gao <zhangfei.gao@gmail.com>;
>>> Dorfman Konstantin <kdorfman@codeaurora.org>; Sahitya Tummala
>>> <stummala@codeaurora.org>; Harjani Ritesh <riteshh@codeaurora.org>; Venu
>>> Byravarasu <vbyravarasu@nvidia.com>; Linus Walleij <linus.walleij@linaro.org>
>>> Subject: RE: [PATCH V4 09/11] mmc: block: Add CQE support
>>>
>>>> -----Original Message-----
>>>> From: linux-mmc-owner@vger.kernel.org [mailto:linux-mmc-
>>>> owner@vger.kernel.org] On Behalf Of Adrian Hunter
>>>> Sent: Wednesday, August 09, 2017 4:31 PM
>>>> To: Bough Chen <haibo.chen@nxp.com>; Shawn Lin <shawn.lin@rock-
>>>> chips.com>
>>>> Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-
>>>> mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>;
>>> Mateusz
>>>> Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov
>>>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung <jh80.chung@samsung.com>;
>>>> Dong Aisheng <dongas86@gmail.com>; Das Asutosh
>>>> <asutoshd@codeaurora.org>; Zhangfei Gao <zhangfei.gao@gmail.com>;
>>>> Dorfman Konstantin <kdorfman@codeaurora.org>; Sahitya Tummala
>>>> <stummala@codeaurora.org>; Harjani Ritesh <riteshh@codeaurora.org>;
>>>> Venu Byravarasu <vbyravarasu@nvidia.com>; Linus Walleij
>>>> <linus.walleij@linaro.org>
>>>> Subject: Re: [PATCH V4 09/11] mmc: block: Add CQE support
>>>>
>>>> On 09/08/17 11:16, Adrian Hunter wrote:
>>>>> On 09/08/17 10:57, Bough Chen wrote:
>>>>>>> -----Original Message-----
>>>>>>> From: linux-mmc-owner@vger.kernel.org [mailto:linux-mmc-
>>>>>>> owner@vger.kernel.org] On Behalf Of Adrian Hunter
>>>>>>> Sent: Wednesday, August 09, 2017 1:58 PM
>>>>>>> To: Shawn Lin <shawn.lin@rock-chips.com>; Bough Chen
>>>>>>> <haibo.chen@nxp.com>
>>>>>>> Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-
>>>>>>> mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>;
>>>>>>> Mateusz Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov
>>>>>>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung
>>>>>>> <jh80.chung@samsung.com>; Dong Aisheng <dongas86@gmail.com>;
>>> Das
>>>>>>> Asutosh <asutoshd@codeaurora.org>; Zhangfei Gao
>>>>>>> <zhangfei.gao@gmail.com>; Dorfman Konstantin
>>>>>>> <kdorfman@codeaurora.org>; Sahitya Tummala
>>>>>>> <stummala@codeaurora.org>; Harjani Ritesh
>>>>>>> <riteshh@codeaurora.org>; Venu Byravarasu
>>>>>>> <vbyravarasu@nvidia.com>; Linus Walleij <linus.walleij@linaro.org>
>>>>>>> Subject: Re: [PATCH V4 09/11] mmc: block: Add CQE support
>>>>>>>
>>>>>>> On 09/08/17 03:55, Shawn Lin wrote:
>>>>>>>> Hi,
>>>>>>>>
>>>>>>>> On 2017/8/8 20:07, Bough Chen wrote:
>>>>>>>>>> -----Original Message-----
>>>>>>>>>> From: Adrian Hunter [mailto:adrian.hunter@intel.com]
>>>>>>>>>> Sent: Friday, July 21, 2017 5:50 PM
>>>>>>>>>> To: Ulf Hansson <ulf.hansson@linaro.org>
>>>>>>>>>> Cc: linux-mmc <linux-mmc@vger.kernel.org>; Bough Chen
>>>>>>>>>> <haibo.chen@nxp.com>; Alex Lemberg
>>> <alex.lemberg@sandisk.com>;
>>>>>>>>>> Mateusz Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov
>>>>>>>>>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung
>>>>>>>>>> <jh80.chung@samsung.com>; Dong Aisheng
>>> <dongas86@gmail.com>;
>>>> Das
>>>>>>>>>> Asutosh <asutoshd@codeaurora.org>; Zhangfei Gao
>>>>>>>>>> <zhangfei.gao@gmail.com>; Dorfman Konstantin
>>>>>>>>>> <kdorfman@codeaurora.org>; David Griego
>>>>>>>>>> <david.griego@linaro.org>; Sahitya Tummala
>>>>>>>>>> <stummala@codeaurora.org>; Harjani Ritesh
>>>>>>>>>> <riteshh@codeaurora.org>; Venu Byravarasu
>>>>>>>>>> <vbyravarasu@nvidia.com>; Linus Walleij
>>>>>>>>>> <linus.walleij@linaro.org>; Shawn Lin
>>>>>>>>>> <shawn.lin@rock-chips.com>
>>>>>>>>>> Subject: [PATCH V4 09/11] mmc: block: Add CQE support
>>>>>>>>>>
>>>>>>>>>> Add CQE support to the block driver, including:
>>>>>>>>>>     - optionally using DCMD for flush requests
>>>>>>>>>>     - manually issuing discard requests
>>>>>>>>>>     - issuing read / write requests to the CQE
>>>>>>>>>>     - supporting block-layer timeouts
>>>>>>>>>>     - handling recovery
>>>>>>>>>>     - supporting re-tuning
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
>>>>>>>>>> ---
>>>>>>>>>>   drivers/mmc/core/block.c | 195
>>>>>>> ++++++++++++++++++++++++++++++++-
>>>>>>>>>>   drivers/mmc/core/block.h |   7 ++
>>>>>>>>>>   drivers/mmc/core/queue.c | 273
>>>>>>>>>> ++++++++++++++++++++++++++++++++++++++++++++++-
>>>>>>>>>>   drivers/mmc/core/queue.h |  42 +++++++-
>>>>>>>>>>   4 files changed, 510 insertions(+), 7 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/drivers/mmc/core/block.c
>>>>>>>>>> b/drivers/mmc/core/block.c index
>>>>>>>>>> 915290c74363..2d25115637b7 100644
>>>>>>>>>> --- a/drivers/mmc/core/block.c
>>>>>>>>>> +++ b/drivers/mmc/core/block.c
>>>>>>>>>> @@ -109,6 +109,7 @@ struct mmc_blk_data {
>>>>>>>>>>   #define MMC_BLK_WRITE        BIT(1)
>>>>>>>>>>   #define MMC_BLK_DISCARD        BIT(2)
>>>>>>>>>>   #define MMC_BLK_SECDISCARD    BIT(3)
>>>>>>>>>> +#define MMC_BLK_CQE_RECOVERY    BIT(4)
>>>>>>>>>>
>>>>>>>>>>       /*
>>>>>>>>>>        * Only set in main mmc_blk_data associated @@ -1612,6
>>>>>>>>>> +1613,198 @@ static void mmc_blk_data_prep(struct mmc_queue
>>>> *mq,
>>>>>>>>>> struct mmc_queue_req *mqrq,
>>>>>>>>>>           *do_data_tag_p = do_data_tag;
>>>>>>>>>>   }
>>>>>>>>>>
>>>>>>>>>> +#define MMC_CQE_RETRIES 2
>>>>>>>>
>>>>>>>>
>>>>>>>>>> +        blk_queue_rq_timed_out(mq->queue, mmc_cqe_timed_out);
>>>>>>>>>> +        blk_queue_rq_timeout(mq->queue, 60 * HZ);
>>>>>>>>>
>>>>>>>>
>>>>>>>> ------8<-------
>>>>>>>>
>>>>>>>>> Hi Adrian,
>>>>>>>>>
>>>>>>>>> These days I'm doing CMDQ stress test, and find one issue.
>>>>>>>>> On our i.MX8QXP-ARM2 board, the RAM is 3GB. eMMC is 32GB.
>>>>>>>>> I use command 'free -m' get the total memory is 2800M, and the
>>>>>>>>> free memory is 2500M.
>>>>>>>>>
>>>>>>>>> I use 'mkfs.ext4' to format ext4 file system on the eMMC under
>>>>>>>>> HS400ES CMDQ mode, works fine.
>>>>>>>>>
>>>>>>>>> When I use the following command to stress test CMDQ, it works fine.
>>>>>>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 2048 -r 1024
>>>>>>>>>
>>>>>>>>> But when I change to use a large file size to do the same stress
>>>>>>>>> test, using
>>>>>>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 4096 -r 2048
>>>>>>>>> or
>>>>>>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 5600
>>>>>>>>>
>>>>>>>>> I get the following dump message.  According to the log,
>>>>>>>>> mmc_cqe_timed_out() was trigged.
>>>>>>>>> Seems mmc was blocked in somewhere.
>>>>>>>>> Then I try to debug this issue, and open MMC_DEBUG in config, do
>>>>>>>>> the same test, print the detail Command sending information on
>>>>>>>>> the console, but finally can't reproduce.
>>>>>>>
>>>>>>> mmc_cqe_timed_out() is a 60 second timeout provided by the block
>>> layer.
>>>>>>> Refer "blk_queue_rq_timeout(mq->queue, 60 * HZ)" in
>>>> mmc_init_queue().
>>>>>>> 60s is quite a long time so I would first want to determine if the
>>>>>>> task was really queued that long.  I would instrument some code
>>>>>>> into
>>>>>>> cqhci_request() to record the start time on struct mmc_request,
>>>>>>> and then print the time taken when there is a problem.
>>>>>>>
>>>>>>
>>>>>> Hi Adrian,
>>>>>>
>>>>>> According to your suggestion, I add the following code to print the time.
>>>>>> When issue happens, seems the request really pending for over 60s!
>>>>>>
>>>>>> done
>>>>>> Writing intelligently...[  689.209548] mmc0: cqhci: timeout for tag
>>>>>> 9 [  689.213658] the mrq all use 62123742 us [  689.217487] mmc0:
>>>>>> cqhci: ============ CQHCI REGISTER DUMP ===========
>>>>>> [  689.223927] mmc0: cqhci: Caps:      0x0000310a | Version:  0x00000510
>>>>>> [  689.230363] mmc0: cqhci: Config:    0x00001001 | Control:  0x00000000
>>>>>> [  689.236800] mmc0: cqhci: Int stat:  0x00000000 | Int enab: 0x00000006
>>>>>> [  689.243238] mmc0: cqhci: Int sig:   0x00000006 | Int Coal: 0x00000000
>>>>>> [  689.249675] mmc0: cqhci: TDL base:  0x90079000 | TDL up32: 0x00000000
>>>>>> [  689.256113] mmc0: cqhci: Doorbell:  0x1fffffff | TCN:      0x00000000
>>>>>> [  689.262550] mmc0: cqhci: Dev queue: 0x1fffefff | Dev Pend: 0x1fff7fff
>>>>>> [  689.268988] mmc0: cqhci: Task clr:  0x00000000 | SSC1:     0x00011000
>>>>>> [  689.275425] mmc0: cqhci: SSC2:      0x00000001 | DCMD rsp: 0x00000800
>>>>>> [  689.281862] mmc0: cqhci: RED mask:  0xfdf9a080 | TERRI:    0x00000000
>>>>>> [  689.288300] mmc0: cqhci: Resp idx:  0x0000002f | Resp arg:
>>>>>> 0x00000900 [  689.294737] mmc0: sdhci: ============ SDHCI REGISTER
>>>>>> DUMP =========== [  689.301176] mmc0: sdhci: Sys addr:  0xb602f000
>>>>>> |
>>>>>> Version:  0x00000002 [  689.307612] mmc0: sdhci: Blk size:
>>>>>> 0x00000200 | Blk cnt:  0x00000400 [  689.314050] mmc0: sdhci: Argument:
>>>> 0x000f0400 | Trn mode: 0x00000023
>>>>>> [  689.320487] mmc0: sdhci: Present:   0x01fd858f | Host ctl: 0x00000030
>>>>>> [  689.326925] mmc0: sdhci: Power:     0x00000002 | Blk gap:  0x00000080
>>>>>> [  689.333362] mmc0: sdhci: Wake-up:   0x00000008 | Clock:    0x0000000f
>>>>>> [  689.339800] mmc0: sdhci: Timeout:   0x0000008f | Int stat: 0x00000000
>>>>>> [  689.346237] mmc0: sdhci: Int enab:  0x107f4000 | Sig enab:
>>>>>> 0x107f4000 [  689.352674] mmc0: sdhci: AC12 err:  0x00000000 | Slot int:
>>>> 0x00000502
>>>>>> [  689.359113] mmc0: sdhci: Caps:      0x07eb0000 | Caps_1:   0x8000b407
>>>>>> [  689.365549] mmc0: sdhci: Cmd:       0x00002c1a | Max curr: 0x00ffffff
>>>>>> [  689.371987] mmc0: sdhci: Resp[0]:   0x00000900 | Resp[1]:  0xffffffff
>>>>>> [  689.378424] mmc0: sdhci: Resp[2]:   0x328f5903 | Resp[3]:  0x00d02700
>>>>>> [  689.384861] mmc0: sdhci: Host ctl2: 0x00000008 [  689.389302]
>>>>>> mmc0: sdhci: ADMA Err:  0x00000009 | ADMA Ptr: 0x9009a400 [
>>>>>> 689.395737] mmc0: sdhci:
>>>> ============================================
>>>>>> [  689.402212] mmc0: running CQE recovery
>>>>>
>>>>> Tag 9 has been queued (bit set in Dev Pend) which means it is up to
>>>>> the eMMC to select it for execution.  You should dump the times for
>>>>> the other mrq's to see how long they have been waiting and try to
>>>>> determine if anything is being processed.
>>>>>
>>>>> If the eMMC is just taking a really long time to process tasks we
>>>>> could extend the timeout, but it is hard to see how that is
>>>>> acceptable to a final product.  At this point it looks like the eMMC
>>>>> may have a flaw in the way it selects tasks for execution.
>>>>
>>>> No, that is wrong sorry, the task is in the QSR (Dev queue) so it is
>>>> the CQE that has not selected it.
>>>
>>> The timeout tag is 9, for Dev queue: 0x1fffefff, bit 9 is 1, means task 9 already
>>> queue in eMMC.
>>> For Dev Pend: 0x1fff7fff, the bit 9 is also 1,  which means CQE already send
>>> CMD44 and CMD45, but still not send CMD46/47.  Seems our CQE pending tag 9
>>> for over 60s! I will check with our IC guys to confirm the hardware mechanism.
>>>
>>
>> For the eMMC chip, the sequential wirte speed test by 'dd' is around 100MB/s.
>> If each tag try to write 1GB data, which meas each tag needs 10s to complete, once
>> The number of pending tags exceed 6, 60s timeout will be trigged.
> 
> The request size is limited by the block layer due to host controller
> parameters.  In the case of SDHCI to 512KiB.  So each tag is at most 512KiB.
> 

I just found a bug in 32-bit DMA.  Are you using 32-bit DMA?  That could
also be causing your problem.  I will send a new version of the patches with
a fix, probably later today.

--
To unsubscribe from this list: send the line "unsubscribe linux-mmc" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bough Chen Aug. 10, 2017, 10:38 a.m. UTC | #7
> -----Original Message-----

> From: Adrian Hunter [mailto:adrian.hunter@intel.com]

> Sent: Thursday, August 10, 2017 6:19 PM

> To: Bough Chen <haibo.chen@nxp.com>; Shawn Lin <shawn.lin@rock-

> chips.com>

> Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-

> mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>; Mateusz

> Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov

> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung <jh80.chung@samsung.com>;

> Dong Aisheng <dongas86@gmail.com>; Das Asutosh

> <asutoshd@codeaurora.org>; Zhangfei Gao <zhangfei.gao@gmail.com>;

> Dorfman Konstantin <kdorfman@codeaurora.org>; Sahitya Tummala

> <stummala@codeaurora.org>; Harjani Ritesh <riteshh@codeaurora.org>; Venu

> Byravarasu <vbyravarasu@nvidia.com>; Linus Walleij <linus.walleij@linaro.org>

> Subject: Re: [PATCH V4 09/11] mmc: block: Add CQE support

> 

> On 09/08/17 15:45, Adrian Hunter wrote:

> > On 08/09/2017 01:35 PM, Bough Chen wrote:

> >>> -----Original Message-----

> >>> From: linux-mmc-owner@vger.kernel.org [mailto:linux-mmc-

> >>> owner@vger.kernel.org] On Behalf Of Bough Chen

> >>> Sent: Wednesday, August 09, 2017 5:42 PM

> >>> To: Adrian Hunter <adrian.hunter@intel.com>; Shawn Lin

> >>> <shawn.lin@rock- chips.com>

> >>> Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-

> >>> mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>;

> >>> Mateusz Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov

> >>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung

> >>> <jh80.chung@samsung.com>; Dong Aisheng <dongas86@gmail.com>; Das

> >>> Asutosh <asutoshd@codeaurora.org>; Zhangfei Gao

> >>> <zhangfei.gao@gmail.com>; Dorfman Konstantin

> >>> <kdorfman@codeaurora.org>; Sahitya Tummala

> >>> <stummala@codeaurora.org>; Harjani Ritesh <riteshh@codeaurora.org>;

> >>> Venu Byravarasu <vbyravarasu@nvidia.com>; Linus Walleij

> >>> <linus.walleij@linaro.org>

> >>> Subject: RE: [PATCH V4 09/11] mmc: block: Add CQE support

> >>>

> >>>> -----Original Message-----

> >>>> From: linux-mmc-owner@vger.kernel.org [mailto:linux-mmc-

> >>>> owner@vger.kernel.org] On Behalf Of Adrian Hunter

> >>>> Sent: Wednesday, August 09, 2017 4:31 PM

> >>>> To: Bough Chen <haibo.chen@nxp.com>; Shawn Lin <shawn.lin@rock-

> >>>> chips.com>

> >>>> Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-

> >>>> mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>;

> >>> Mateusz

> >>>> Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov

> >>>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung

> >>>> <jh80.chung@samsung.com>; Dong Aisheng <dongas86@gmail.com>;

> Das

> >>>> Asutosh <asutoshd@codeaurora.org>; Zhangfei Gao

> >>>> <zhangfei.gao@gmail.com>; Dorfman Konstantin

> >>>> <kdorfman@codeaurora.org>; Sahitya Tummala

> >>>> <stummala@codeaurora.org>; Harjani Ritesh <riteshh@codeaurora.org>;

> >>>> Venu Byravarasu <vbyravarasu@nvidia.com>; Linus Walleij

> >>>> <linus.walleij@linaro.org>

> >>>> Subject: Re: [PATCH V4 09/11] mmc: block: Add CQE support

> >>>>

> >>>> On 09/08/17 11:16, Adrian Hunter wrote:

> >>>>> On 09/08/17 10:57, Bough Chen wrote:

> >>>>>>> -----Original Message-----

> >>>>>>> From: linux-mmc-owner@vger.kernel.org [mailto:linux-mmc-

> >>>>>>> owner@vger.kernel.org] On Behalf Of Adrian Hunter

> >>>>>>> Sent: Wednesday, August 09, 2017 1:58 PM

> >>>>>>> To: Shawn Lin <shawn.lin@rock-chips.com>; Bough Chen

> >>>>>>> <haibo.chen@nxp.com>

> >>>>>>> Cc: Ulf Hansson <ulf.hansson@linaro.org>; linux-mmc <linux-

> >>>>>>> mmc@vger.kernel.org>; Alex Lemberg <alex.lemberg@sandisk.com>;

> >>>>>>> Mateusz Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov

> >>>>>>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung

> >>>>>>> <jh80.chung@samsung.com>; Dong Aisheng <dongas86@gmail.com>;

> >>> Das

> >>>>>>> Asutosh <asutoshd@codeaurora.org>; Zhangfei Gao

> >>>>>>> <zhangfei.gao@gmail.com>; Dorfman Konstantin

> >>>>>>> <kdorfman@codeaurora.org>; Sahitya Tummala

> >>>>>>> <stummala@codeaurora.org>; Harjani Ritesh

> >>>>>>> <riteshh@codeaurora.org>; Venu Byravarasu

> >>>>>>> <vbyravarasu@nvidia.com>; Linus Walleij

> >>>>>>> <linus.walleij@linaro.org>

> >>>>>>> Subject: Re: [PATCH V4 09/11] mmc: block: Add CQE support

> >>>>>>>

> >>>>>>> On 09/08/17 03:55, Shawn Lin wrote:

> >>>>>>>> Hi,

> >>>>>>>>

> >>>>>>>> On 2017/8/8 20:07, Bough Chen wrote:

> >>>>>>>>>> -----Original Message-----

> >>>>>>>>>> From: Adrian Hunter [mailto:adrian.hunter@intel.com]

> >>>>>>>>>> Sent: Friday, July 21, 2017 5:50 PM

> >>>>>>>>>> To: Ulf Hansson <ulf.hansson@linaro.org>

> >>>>>>>>>> Cc: linux-mmc <linux-mmc@vger.kernel.org>; Bough Chen

> >>>>>>>>>> <haibo.chen@nxp.com>; Alex Lemberg

> >>> <alex.lemberg@sandisk.com>;

> >>>>>>>>>> Mateusz Nowak <mateusz.nowak@intel.com>; Yuliy Izrailov

> >>>>>>>>>> <Yuliy.Izrailov@sandisk.com>; Jaehoon Chung

> >>>>>>>>>> <jh80.chung@samsung.com>; Dong Aisheng

> >>> <dongas86@gmail.com>;

> >>>> Das

> >>>>>>>>>> Asutosh <asutoshd@codeaurora.org>; Zhangfei Gao

> >>>>>>>>>> <zhangfei.gao@gmail.com>; Dorfman Konstantin

> >>>>>>>>>> <kdorfman@codeaurora.org>; David Griego

> >>>>>>>>>> <david.griego@linaro.org>; Sahitya Tummala

> >>>>>>>>>> <stummala@codeaurora.org>; Harjani Ritesh

> >>>>>>>>>> <riteshh@codeaurora.org>; Venu Byravarasu

> >>>>>>>>>> <vbyravarasu@nvidia.com>; Linus Walleij

> >>>>>>>>>> <linus.walleij@linaro.org>; Shawn Lin

> >>>>>>>>>> <shawn.lin@rock-chips.com>

> >>>>>>>>>> Subject: [PATCH V4 09/11] mmc: block: Add CQE support

> >>>>>>>>>>

> >>>>>>>>>> Add CQE support to the block driver, including:

> >>>>>>>>>>     - optionally using DCMD for flush requests

> >>>>>>>>>>     - manually issuing discard requests

> >>>>>>>>>>     - issuing read / write requests to the CQE

> >>>>>>>>>>     - supporting block-layer timeouts

> >>>>>>>>>>     - handling recovery

> >>>>>>>>>>     - supporting re-tuning

> >>>>>>>>>>

> >>>>>>>>>> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>

> >>>>>>>>>> ---

> >>>>>>>>>>   drivers/mmc/core/block.c | 195

> >>>>>>> ++++++++++++++++++++++++++++++++-

> >>>>>>>>>>   drivers/mmc/core/block.h |   7 ++

> >>>>>>>>>>   drivers/mmc/core/queue.c | 273

> >>>>>>>>>> ++++++++++++++++++++++++++++++++++++++++++++++-

> >>>>>>>>>>   drivers/mmc/core/queue.h |  42 +++++++-

> >>>>>>>>>>   4 files changed, 510 insertions(+), 7 deletions(-)

> >>>>>>>>>>

> >>>>>>>>>> diff --git a/drivers/mmc/core/block.c

> >>>>>>>>>> b/drivers/mmc/core/block.c index

> >>>>>>>>>> 915290c74363..2d25115637b7 100644

> >>>>>>>>>> --- a/drivers/mmc/core/block.c

> >>>>>>>>>> +++ b/drivers/mmc/core/block.c

> >>>>>>>>>> @@ -109,6 +109,7 @@ struct mmc_blk_data {

> >>>>>>>>>>   #define MMC_BLK_WRITE        BIT(1)

> >>>>>>>>>>   #define MMC_BLK_DISCARD        BIT(2)

> >>>>>>>>>>   #define MMC_BLK_SECDISCARD    BIT(3)

> >>>>>>>>>> +#define MMC_BLK_CQE_RECOVERY    BIT(4)

> >>>>>>>>>>

> >>>>>>>>>>       /*

> >>>>>>>>>>        * Only set in main mmc_blk_data associated @@ -1612,6

> >>>>>>>>>> +1613,198 @@ static void mmc_blk_data_prep(struct

> mmc_queue

> >>>> *mq,

> >>>>>>>>>> struct mmc_queue_req *mqrq,

> >>>>>>>>>>           *do_data_tag_p = do_data_tag;

> >>>>>>>>>>   }

> >>>>>>>>>>

> >>>>>>>>>> +#define MMC_CQE_RETRIES 2

> >>>>>>>>

> >>>>>>>>

> >>>>>>>>>> +        blk_queue_rq_timed_out(mq->queue,

> mmc_cqe_timed_out);

> >>>>>>>>>> +        blk_queue_rq_timeout(mq->queue, 60 * HZ);

> >>>>>>>>>

> >>>>>>>>

> >>>>>>>> ------8<-------

> >>>>>>>>

> >>>>>>>>> Hi Adrian,

> >>>>>>>>>

> >>>>>>>>> These days I'm doing CMDQ stress test, and find one issue.

> >>>>>>>>> On our i.MX8QXP-ARM2 board, the RAM is 3GB. eMMC is 32GB.

> >>>>>>>>> I use command 'free -m' get the total memory is 2800M, and the

> >>>>>>>>> free memory is 2500M.

> >>>>>>>>>

> >>>>>>>>> I use 'mkfs.ext4' to format ext4 file system on the eMMC under

> >>>>>>>>> HS400ES CMDQ mode, works fine.

> >>>>>>>>>

> >>>>>>>>> When I use the following command to stress test CMDQ, it works

> fine.

> >>>>>>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 2048 -r 1024

> >>>>>>>>>

> >>>>>>>>> But when I change to use a large file size to do the same

> >>>>>>>>> stress test, using

> >>>>>>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 4096 -r 2048

> >>>>>>>>> or

> >>>>>>>>> bonnie++ -d /run/media/mmcblk0p1/ -u 0:0 -s 5600

> >>>>>>>>>

> >>>>>>>>> I get the following dump message.  According to the log,

> >>>>>>>>> mmc_cqe_timed_out() was trigged.

> >>>>>>>>> Seems mmc was blocked in somewhere.

> >>>>>>>>> Then I try to debug this issue, and open MMC_DEBUG in config,

> >>>>>>>>> do the same test, print the detail Command sending information

> >>>>>>>>> on the console, but finally can't reproduce.

> >>>>>>>

> >>>>>>> mmc_cqe_timed_out() is a 60 second timeout provided by the block

> >>> layer.

> >>>>>>> Refer "blk_queue_rq_timeout(mq->queue, 60 * HZ)" in

> >>>> mmc_init_queue().

> >>>>>>> 60s is quite a long time so I would first want to determine if

> >>>>>>> the task was really queued that long.  I would instrument some

> >>>>>>> code into

> >>>>>>> cqhci_request() to record the start time on struct mmc_request,

> >>>>>>> and then print the time taken when there is a problem.

> >>>>>>>

> >>>>>>

> >>>>>> Hi Adrian,

> >>>>>>

> >>>>>> According to your suggestion, I add the following code to print the time.

> >>>>>> When issue happens, seems the request really pending for over 60s!

> >>>>>>

> >>>>>> done

> >>>>>> Writing intelligently...[  689.209548] mmc0: cqhci: timeout for

> >>>>>> tag

> >>>>>> 9 [  689.213658] the mrq all use 62123742 us [  689.217487] mmc0:

> >>>>>> cqhci: ============ CQHCI REGISTER DUMP ===========

> >>>>>> [  689.223927] mmc0: cqhci: Caps:      0x0000310a | Version:  0x00000510

> >>>>>> [  689.230363] mmc0: cqhci: Config:    0x00001001 | Control:  0x00000000

> >>>>>> [  689.236800] mmc0: cqhci: Int stat:  0x00000000 | Int enab: 0x00000006

> >>>>>> [  689.243238] mmc0: cqhci: Int sig:   0x00000006 | Int Coal: 0x00000000

> >>>>>> [  689.249675] mmc0: cqhci: TDL base:  0x90079000 | TDL up32:

> 0x00000000

> >>>>>> [  689.256113] mmc0: cqhci: Doorbell:  0x1fffffff | TCN:      0x00000000

> >>>>>> [  689.262550] mmc0: cqhci: Dev queue: 0x1fffefff | Dev Pend:

> 0x1fff7fff

> >>>>>> [  689.268988] mmc0: cqhci: Task clr:  0x00000000 | SSC1:     0x00011000

> >>>>>> [  689.275425] mmc0: cqhci: SSC2:      0x00000001 | DCMD rsp:

> 0x00000800

> >>>>>> [  689.281862] mmc0: cqhci: RED mask:  0xfdf9a080 | TERRI:

> 0x00000000

> >>>>>> [  689.288300] mmc0: cqhci: Resp idx:  0x0000002f | Resp arg:

> >>>>>> 0x00000900 [  689.294737] mmc0: sdhci: ============ SDHCI

> >>>>>> REGISTER DUMP =========== [  689.301176] mmc0: sdhci: Sys addr:

> >>>>>> 0xb602f000

> >>>>>> |

> >>>>>> Version:  0x00000002 [  689.307612] mmc0: sdhci: Blk size:

> >>>>>> 0x00000200 | Blk cnt:  0x00000400 [  689.314050] mmc0: sdhci:

> Argument:

> >>>> 0x000f0400 | Trn mode: 0x00000023

> >>>>>> [  689.320487] mmc0: sdhci: Present:   0x01fd858f | Host ctl: 0x00000030

> >>>>>> [  689.326925] mmc0: sdhci: Power:     0x00000002 | Blk gap:

> 0x00000080

> >>>>>> [  689.333362] mmc0: sdhci: Wake-up:   0x00000008 | Clock:

> 0x0000000f

> >>>>>> [  689.339800] mmc0: sdhci: Timeout:   0x0000008f | Int stat:

> 0x00000000

> >>>>>> [  689.346237] mmc0: sdhci: Int enab:  0x107f4000 | Sig enab:

> >>>>>> 0x107f4000 [  689.352674] mmc0: sdhci: AC12 err:  0x00000000 | Slot int:

> >>>> 0x00000502

> >>>>>> [  689.359113] mmc0: sdhci: Caps:      0x07eb0000 | Caps_1:   0x8000b407

> >>>>>> [  689.365549] mmc0: sdhci: Cmd:       0x00002c1a | Max curr: 0x00ffffff

> >>>>>> [  689.371987] mmc0: sdhci: Resp[0]:   0x00000900 | Resp[1]:  0xffffffff

> >>>>>> [  689.378424] mmc0: sdhci: Resp[2]:   0x328f5903 | Resp[3]:

> 0x00d02700

> >>>>>> [  689.384861] mmc0: sdhci: Host ctl2: 0x00000008 [  689.389302]

> >>>>>> mmc0: sdhci: ADMA Err:  0x00000009 | ADMA Ptr: 0x9009a400 [

> >>>>>> 689.395737] mmc0: sdhci:

> >>>> ============================================

> >>>>>> [  689.402212] mmc0: running CQE recovery

> >>>>>

> >>>>> Tag 9 has been queued (bit set in Dev Pend) which means it is up

> >>>>> to the eMMC to select it for execution.  You should dump the times

> >>>>> for the other mrq's to see how long they have been waiting and try

> >>>>> to determine if anything is being processed.

> >>>>>

> >>>>> If the eMMC is just taking a really long time to process tasks we

> >>>>> could extend the timeout, but it is hard to see how that is

> >>>>> acceptable to a final product.  At this point it looks like the

> >>>>> eMMC may have a flaw in the way it selects tasks for execution.

> >>>>

> >>>> No, that is wrong sorry, the task is in the QSR (Dev queue) so it

> >>>> is the CQE that has not selected it.

> >>>

> >>> The timeout tag is 9, for Dev queue: 0x1fffefff, bit 9 is 1, means

> >>> task 9 already queue in eMMC.

> >>> For Dev Pend: 0x1fff7fff, the bit 9 is also 1,  which means CQE

> >>> already send

> >>> CMD44 and CMD45, but still not send CMD46/47.  Seems our CQE pending

> >>> tag 9 for over 60s! I will check with our IC guys to confirm the hardware

> mechanism.

> >>>

> >>

> >> For the eMMC chip, the sequential wirte speed test by 'dd' is around

> 100MB/s.

> >> If each tag try to write 1GB data, which meas each tag needs 10s to

> >> complete, once The number of pending tags exceed 6, 60s timeout will be

> trigged.

> >

> > The request size is limited by the block layer due to host controller

> > parameters.  In the case of SDHCI to 512KiB.  So each tag is at most 512KiB.

> >

> 

> I just found a bug in 32-bit DMA.  Are you using 32-bit DMA?  That could also be

> causing your problem.  I will send a new version of the patches with a fix,

> probably later today.


Yes, I'm using 32-bit ADMA.
diff mbox

Patch

diff --git a/drivers/mmc/host/cqhci.c b/drivers/mmc/host/cqhci.c
index 1b56d03..7359895 100644
--- a/drivers/mmc/host/cqhci.c
+++ b/drivers/mmc/host/cqhci.c
@@ -556,6 +556,7 @@  static int cqhci_request(struct mmc_host *mmc, struct mmc_request *mrq)
        u64 *task_desc = NULL;
        int tag = cqhci_tag(mrq);
        struct cqhci_host *cq_host = mmc->cqe_private;
+       struct timeval *start_time = &mrq->start_time;
        unsigned long flags;

        if (!cq_host->enabled) {
@@ -605,6 +606,8 @@  static int cqhci_request(struct mmc_host *mmc, struct mmc_request *mrq)

        cq_host->qcnt += 1;

+       do_gettimeofday(start_time);
+
        cqhci_writel(cq_host, 1 << tag, CQHCI_TDBR);
        if (!(cqhci_readl(cq_host, CQHCI_TDBR) & (1 << tag)))
                pr_debug("%s: cqhci: doorbell not set for tag %d\n",
@@ -822,6 +825,8 @@  static bool cqhci_timeout(struct mmc_host *mmc, struct mmc_request *mrq,
        struct cqhci_host *cq_host = mmc->cqe_private;
        int tag = cqhci_tag(mrq);
        struct cqhci_slot *slot = &cq_host->slot[tag];
+       struct timeval *end_time = &mrq->end_time;
+       struct timeval *start_time = &mrq->start_time;
        unsigned long flags;
        bool timed_out;

@@ -835,8 +840,11 @@  static bool cqhci_timeout(struct mmc_host *mmc, struct mmc_request *mrq,
        spin_unlock_irqrestore(&cq_host->lock, flags);

        if (timed_out) {
+               do_gettimeofday(end_time);
+
                pr_err("%s: cqhci: timeout for tag %d\n",
                       mmc_hostname(mmc), tag);
+               pr_err("the mrq all use %ld us\n", (end_time->tv_sec - start_time->tv_sec) * 1000000 + end_time->tv_usec - start_time->tv_usec);
                cqhci_dumpregs(cq_host);
        }

diff --git a/include/linux/mmc/core.h b/include/linux/mmc/core.h
index 80f36b7..15093aa 100644
--- a/include/linux/mmc/core.h
+++ b/include/linux/mmc/core.h
@@ -10,6 +10,7 @@ 

 #include <linux/completion.h>
 #include <linux/types.h>
+#include <linux/time.h>

 struct mmc_data;
 struct mmc_request;
@@ -157,6 +158,8 @@  struct mmc_request {
        struct completion       cmd_completion;
        void                    (*done)(struct mmc_request *);/* completion function */
        struct mmc_host         *host;
+       struct timeval          start_time;
+       struct timeval          end_time;

        /* Allow other commands during this ongoing data transfer or busy wait */
        bool                    cap_cmd_during_tfr;