diff mbox

[V13,03/10] mmc: block: Add blk-mq support

Message ID 1509715220-31885-4-git-send-email-adrian.hunter@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Adrian Hunter Nov. 3, 2017, 1:20 p.m. UTC
Define and use a blk-mq queue. Discards and flushes are processed
synchronously, but reads and writes asynchronously. In order to support
slow DMA unmapping, DMA unmapping is not done until after the next request
is started. That means the request is not completed until then. If there is
no next request then the completion is done by queued work.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
---
 drivers/mmc/core/block.c | 457 ++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/mmc/core/block.h |   9 +
 drivers/mmc/core/queue.c | 273 +++++++++++++++++++++++++---
 drivers/mmc/core/queue.h |  32 ++++
 4 files changed, 740 insertions(+), 31 deletions(-)

Comments

Linus Walleij Nov. 8, 2017, 8:54 a.m. UTC | #1
On Fri, Nov 3, 2017 at 2:20 PM, Adrian Hunter <adrian.hunter@intel.com> wrote:

> Define and use a blk-mq queue. Discards and flushes are processed
> synchronously, but reads and writes asynchronously. In order to support
> slow DMA unmapping, DMA unmapping is not done until after the next request
> is started. That means the request is not completed until then. If there is
> no next request then the completion is done by queued work.
>
> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>

> -       blk_end_request_all(req, ret ? BLK_STS_IOERR : BLK_STS_OK);
> +       if (req->mq_ctx)
> +               blk_mq_end_request(req, ret ? BLK_STS_IOERR : BLK_STS_OK);
> +       else
> +               blk_end_request_all(req, ret ? BLK_STS_IOERR : BLK_STS_OK);

I think this quite obvious code duplication is unfortunate.

What my patches do is get rid of the old block layer in order
to be able to focus on the new stuff using just MQ.

One reason is that the code is hairy already as it is, by just
supporting MQ the above is still just one line of code, the same
goes for the other instances below.

At least you could do what I did and break out a helper like
this:

/*
 * This reports status back to the block layer for a finished request.
 */
static void mmc_blk_complete(struct mmc_queue_req *mq_rq,
                            blk_status_t status)
{
       struct request *req = mmc_queue_req_to_req(mq_rq);

       if (req->mq_ctx) {
          blk_mq_end_request(req, status);
       } else
          blk_end_request_all(req, status);
}

> +/* Single sector read during recovery */
> +static void mmc_blk_ss_read(struct mmc_queue *mq, struct request *req)
> +{
> +       struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
> +       blk_status_t status;
> +
> +       while (1) {
> +               mmc_blk_rw_rq_prep(mqrq, mq->card, 1, mq);
> +
> +               mmc_wait_for_req(mq->card->host, &mqrq->brq.mrq);
> +
> +               /*
> +                * Not expecting command errors, so just give up in that case.
> +                * If there are retries remaining, the request will get
> +                * requeued.
> +                */
> +               if (mqrq->brq.cmd.error)
> +                       return;
> +
> +               if (blk_rq_bytes(req) <= 512)
> +                       break;
> +
> +               status = mqrq->brq.data.error ? BLK_STS_IOERR : BLK_STS_OK;
> +
> +               blk_update_request(req, status, 512);
> +       }
> +
> +       mqrq->retries = MMC_NO_RETRIES;
> +}
> +
> +static void mmc_blk_rw_recovery(struct mmc_queue *mq, struct request *req)
> +{
> +       int type = rq_data_dir(req) == READ ? MMC_BLK_READ : MMC_BLK_WRITE;
> +       struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
> +       struct mmc_blk_request *brq = &mqrq->brq;
> +       struct mmc_blk_data *md = mq->blkdata;
> +       struct mmc_card *card = mq->card;
> +       static enum mmc_blk_status status;
> +
> +       brq->retune_retry_done = mqrq->retries;
> +
> +       status = __mmc_blk_err_check(card, mqrq);
> +
> +       mmc_retune_release(card->host);
> +
> +       /*
> +        * Requests are completed by mmc_blk_mq_complete_rq() which sets simple
> +        * policy:
> +        * 1. A request that has transferred at least some data is considered
> +        * successful and will be requeued if there is remaining data to
> +        * transfer.
> +        * 2. Otherwise the number of retries is incremented and the request
> +        * will be requeued if there are remaining retries.
> +        * 3. Otherwise the request will be errored out.
> +        * That means mmc_blk_mq_complete_rq() is controlled by bytes_xfered and
> +        * mqrq->retries. So there are only 4 possible actions here:
> +        *      1. do not accept the bytes_xfered value i.e. set it to zero
> +        *      2. change mqrq->retries to determine the number of retries
> +        *      3. try to reset the card
> +        *      4. read one sector at a time
> +        */
> +       switch (status) {
> +       case MMC_BLK_SUCCESS:
> +       case MMC_BLK_PARTIAL:
> +               /* Reset success, and accept bytes_xfered */
> +               mmc_blk_reset_success(md, type);
> +               break;
> +       case MMC_BLK_CMD_ERR:
> +               /*
> +                * For SD cards, get bytes written, but do not accept
> +                * bytes_xfered if that fails. For MMC cards accept
> +                * bytes_xfered. Then try to reset. If reset fails then
> +                * error out the remaining request, otherwise retry
> +                * once (N.B mmc_blk_reset() will not succeed twice in a
> +                * row).
> +                */
> +               if (mmc_card_sd(card)) {
> +                       u32 blocks;
> +                       int err;
> +
> +                       err = mmc_sd_num_wr_blocks(card, &blocks);
> +                       if (err)
> +                               brq->data.bytes_xfered = 0;
> +                       else
> +                               brq->data.bytes_xfered = blocks << 9;
> +               }
> +               if (mmc_blk_reset(md, card->host, type))
> +                       mqrq->retries = MMC_NO_RETRIES;
> +               else
> +                       mqrq->retries = MMC_MAX_RETRIES - 1;
> +               break;
> +       case MMC_BLK_RETRY:
> +               /*
> +                * Do not accept bytes_xfered, but retry up to 5 times,
> +                * otherwise same as abort.
> +                */
> +               brq->data.bytes_xfered = 0;
> +               if (mqrq->retries < MMC_MAX_RETRIES)
> +                       break;
> +               /* Fall through */
> +       case MMC_BLK_ABORT:
> +               /*
> +                * Do not accept bytes_xfered, but try to reset. If
> +                * reset succeeds, try once more, otherwise error out
> +                * the request.
> +                */
> +               brq->data.bytes_xfered = 0;
> +               if (mmc_blk_reset(md, card->host, type))
> +                       mqrq->retries = MMC_NO_RETRIES;
> +               else
> +                       mqrq->retries = MMC_MAX_RETRIES - 1;
> +               break;
> +       case MMC_BLK_DATA_ERR: {
> +               int err;
> +
> +               /*
> +                * Do not accept bytes_xfered, but try to reset. If
> +                * reset succeeds, try once more. If reset fails with
> +                * ENODEV which means the partition is wrong, then error
> +                * out the request. Otherwise attempt to read one sector
> +                * at a time.
> +                */
> +               brq->data.bytes_xfered = 0;
> +               err = mmc_blk_reset(md, card->host, type);
> +               if (!err) {
> +                       mqrq->retries = MMC_MAX_RETRIES - 1;
> +                       break;
> +               }
> +               if (err == -ENODEV) {
> +                       mqrq->retries = MMC_NO_RETRIES;
> +                       break;
> +               }
> +               /* Fall through */
> +       }
> +       case MMC_BLK_ECC_ERR:
> +               /*
> +                * Do not accept bytes_xfered. If reading more than one
> +                * sector, try reading one sector at a time.
> +                */
> +               brq->data.bytes_xfered = 0;
> +               /* FIXME: Missing single sector read for large sector size */
> +               if (brq->data.blocks > 1 && !mmc_large_sector(card)) {
> +                       /* Redo read one sector at a time */
> +                       pr_warn("%s: retrying using single block read\n",
> +                               req->rq_disk->disk_name);
> +                       mmc_blk_ss_read(mq, req);
> +               } else {
> +                       mqrq->retries = MMC_NO_RETRIES;
> +               }
> +               break;
> +       case MMC_BLK_NOMEDIUM:
> +               /* Do not accept bytes_xfered. Error out the request */
> +               brq->data.bytes_xfered = 0;
> +               mqrq->retries = MMC_NO_RETRIES;
> +               break;
> +       default:
> +               /* Do not accept bytes_xfered. Error out the request */
> +               brq->data.bytes_xfered = 0;
> +               mqrq->retries = MMC_NO_RETRIES;
> +               pr_err("%s: Unhandled return value (%d)",
> +                      req->rq_disk->disk_name, status);
> +               break;
> +       }
> +}
> +
> +static void mmc_blk_mq_complete_rq(struct mmc_queue *mq, struct request *req)
> +{
> +       struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
> +       unsigned int nr_bytes = mqrq->brq.data.bytes_xfered;
> +
> +       if (nr_bytes) {
> +               if (blk_update_request(req, BLK_STS_OK, nr_bytes))
> +                       blk_mq_requeue_request(req, true);
> +               else
> +                       __blk_mq_end_request(req, BLK_STS_OK);
> +       } else if (mqrq->retries++ < MMC_MAX_RETRIES) {
> +               blk_mq_requeue_request(req, true);
> +       } else {
> +               if (mmc_card_removed(mq->card))
> +                       req->rq_flags |= RQF_QUIET;
> +               blk_mq_end_request(req, BLK_STS_IOERR);
> +       }
> +}

This retry and error handling using requeue is very elegant.
I really like this.

If we could also go for MQ-only, only this nice code
remains in the tree.

The problem: you have just reimplemented the whole error
handling we had in the old block layer and now we have to
maintain two copies and keep them in sync.

This is not OK IMO, we will inevitably screw it up, so we
need to get *one* error path.

> +static bool mmc_blk_urgent_bkops_needed(struct mmc_queue *mq,
> +                                       struct mmc_queue_req *mqrq)
> +{
> +       return mmc_card_mmc(mq->card) &&
> +              (mqrq->brq.cmd.resp[0] & R1_EXCEPTION_EVENT ||
> +               mqrq->brq.stop.resp[0] & R1_EXCEPTION_EVENT);
> +}
> +
> +static void mmc_blk_urgent_bkops(struct mmc_queue *mq,
> +                                struct mmc_queue_req *mqrq)
> +{
> +       if (mmc_blk_urgent_bkops_needed(mq, mqrq))
> +               mmc_start_bkops(mq->card, true);
> +}
> +
> +void mmc_blk_mq_complete(struct request *req)
> +{
> +       struct mmc_queue *mq = req->q->queuedata;
> +
> +       mmc_blk_mq_complete_rq(mq, req);
> +}

So this is called from the struct blk_mq_ops .complete()
callback. And this calls blk_mq_end_request().

So the semantic order needs to be complete -> end.

I see this pattern in newer MQ code, I got it wrong in
my patch set so I try to fix it up.

> +static void mmc_blk_mq_poll_completion(struct mmc_queue *mq,
> +                                      struct request *req)
> +{
> +       struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
> +
> +       mmc_blk_rw_recovery(mq, req);
> +
> +       mmc_blk_urgent_bkops(mq, mqrq);
> +}

This looks nice.

> +static void mmc_blk_mq_acct_req_done(struct mmc_queue *mq, struct request *req)

What does "acct" mean in the above function name?
Accounting? Actual? I'm lost.

> +{
> +       struct request_queue *q = req->q;
> +       unsigned long flags;
> +       bool put_card;
> +
> +       spin_lock_irqsave(q->queue_lock, flags);
> +
> +       mq->in_flight[mmc_issue_type(mq, req)] -= 1;
> +
> +       put_card = mmc_tot_in_flight(mq) == 0;

This in_flight[] business seems a bit kludgy, but I
don't really understand it fully. Magic numbers like
-1 to mark that something is not going on etc, not
super-elegant.

I believe it is necessary for CQE though as you need
to keep track of outstanding requests?

> +
> +       spin_unlock_irqrestore(q->queue_lock, flags);
> +
> +       if (put_card)
> +               mmc_put_card(mq->card, &mq->ctx);

I think you should try not to sprinkle mmc_put_card() inside
the different functions but instead you can put this in the
.complete callback I guess mmc_blk_mq_complete() in your
patch set.

Also you do not need to avoid calling it several times with
that put_card variable. It's fully reentrant thanks to your
own code in the lock and all calls come from the same block
layer process if you call it in .complete() I think?

> +static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req)
> +{
> +       struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
> +       struct mmc_request *mrq = &mqrq->brq.mrq;
> +       struct mmc_host *host = mq->card->host;
> +
> +       if (host->ops->post_req)
> +               host->ops->post_req(host, mrq, 0);
> +
> +       blk_mq_complete_request(req);
> +
> +       mmc_blk_mq_acct_req_done(mq, req);
> +}

Now the problem Ulf has pointed out starts to creep out in the
patch: a lot of code duplication on the MQ path compared to
the ordinary block layer path.

My approach was structured partly to avoid this: first refactor
the old path, then switch to (only) MQ to avoid code duplication.

> +static void mmc_blk_mq_complete_prev_req(struct mmc_queue *mq,
> +                                        struct request **prev_req)
> +{
> +       mutex_lock(&mq->complete_lock);
> +
> +       if (!mq->complete_req)
> +               goto out_unlock;
> +
> +       mmc_blk_mq_poll_completion(mq, mq->complete_req);
> +
> +       if (prev_req)
> +               *prev_req = mq->complete_req;
> +       else
> +               mmc_blk_mq_post_req(mq, mq->complete_req);
> +
> +       mq->complete_req = NULL;
> +
> +out_unlock:
> +       mutex_unlock(&mq->complete_lock);
> +}

This looks a bit like it is reimplementing the kernel
completion abstraction using a mutex and a variable named
.complete_req?

We were using a completion in the old block layer so
why did you not use it for MQ?

> +static void mmc_blk_mq_req_done(struct mmc_request *mrq)
> +{
> +       struct mmc_queue_req *mqrq = container_of(mrq, struct mmc_queue_req,
> +                                                 brq.mrq);
> +       struct request *req = mmc_queue_req_to_req(mqrq);
> +       struct request_queue *q = req->q;
> +       struct mmc_queue *mq = q->queuedata;
> +       unsigned long flags;
> +       bool waiting;
> +
> +       spin_lock_irqsave(q->queue_lock, flags);
> +       mq->complete_req = req;
> +       mq->rw_wait = false;
> +       waiting = mq->waiting;
> +       spin_unlock_irqrestore(q->queue_lock, flags);
> +
> +       if (waiting)
> +               wake_up(&mq->wait);

I would contest using a waitqueue for this. The name even says
"complete_req" so why is a completion not the right thing to
hang on rather than a waitqueue?

The completion already contains a waitqueue, so I think you're
just essentially reimplementing it.

Just complete(&mq->mq_req_complete) or something should do
the trick.

> +       else
> +               kblockd_schedule_work(&mq->complete_work);

I did not use the kblockd workqueue for this, out of fear
that it would interfere and disturb the block layer work items.
My intuitive idea was that the MMC layer needed its own
worker (like in the past it used a thread) in order to avoid
congestion in the block layer queue leading to unnecessary
delays.

On the other hand, this likely avoids a context switch if there
is no congestion on the queue.

I am uncertain when it is advisable to use the block layer
queue for subsystems like MMC/SD.

Would be nice to see some direction from the block layer
folks here, it is indeed exposed to us...

My performance tests show no problems with this approach
though.

> +static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err)
> +{
> +       struct request_queue *q = mq->queue;
> +       unsigned long flags;
> +       bool done;
> +
> +       spin_lock_irqsave(q->queue_lock, flags);
> +       done = !mq->rw_wait;
> +       mq->waiting = !done;
> +       spin_unlock_irqrestore(q->queue_lock, flags);

This makes it look like a reimplementation of completion_done()
so I think you should use the completion abstraction again. The
struct completion even contains a variable named "done".

> +static int mmc_blk_mq_issue_rw_rq(struct mmc_queue *mq,
> +                                 struct request *req)
> +{
> +       struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
> +       struct mmc_host *host = mq->card->host;
> +       struct request *prev_req = NULL;
> +       int err = 0;
> +
> +       mmc_blk_rw_rq_prep(mqrq, mq->card, 0, mq);
> +
> +       mqrq->brq.mrq.done = mmc_blk_mq_req_done;
> +
> +       if (host->ops->pre_req)
> +               host->ops->pre_req(host, &mqrq->brq.mrq);
> +
> +       err = mmc_blk_rw_wait(mq, &prev_req);
> +       if (err)
> +               goto out_post_req;
> +
> +       mq->rw_wait = true;
> +
> +       err = mmc_start_request(host, &mqrq->brq.mrq);
> +
> +       if (prev_req)
> +               mmc_blk_mq_post_req(mq, prev_req);
> +
> +       if (err)
> +               mq->rw_wait = false;
> +
> +out_post_req:
> +       if (err && host->ops->post_req)
> +               host->ops->post_req(host, &mqrq->brq.mrq, err);
> +
> +       return err;
> +}

This is pretty straight-forward (pending the comments above).
Again it has the downside of duplicating the same code for the
old block layer instead of refactoring.

> +enum mmc_issued mmc_blk_mq_issue_rq(struct mmc_queue *mq, struct request *req)
> +{
> +       struct mmc_blk_data *md = mq->blkdata;
> +       struct mmc_card *card = md->queue.card;
> +       struct mmc_host *host = card->host;
> +       int ret;
> +
> +       ret = mmc_blk_part_switch(card, md->part_type);
> +       if (ret)
> +               return MMC_REQ_FAILED_TO_START;
> +
> +       switch (mmc_issue_type(mq, req)) {
> +       case MMC_ISSUE_SYNC:
> +               ret = mmc_blk_wait_for_idle(mq, host);
> +               if (ret)
> +                       return MMC_REQ_BUSY;
> +               switch (req_op(req)) {
> +               case REQ_OP_DRV_IN:
> +               case REQ_OP_DRV_OUT:
> +                       mmc_blk_issue_drv_op(mq, req);
> +                       break;
> +               case REQ_OP_DISCARD:
> +                       mmc_blk_issue_discard_rq(mq, req);
> +                       break;
> +               case REQ_OP_SECURE_ERASE:
> +                       mmc_blk_issue_secdiscard_rq(mq, req);
> +                       break;
> +               case REQ_OP_FLUSH:
> +                       mmc_blk_issue_flush(mq, req);
> +                       break;
> +               default:
> +                       WARN_ON_ONCE(1);
> +                       return MMC_REQ_FAILED_TO_START;
> +               }
> +               return MMC_REQ_FINISHED;
> +       case MMC_ISSUE_ASYNC:
> +               switch (req_op(req)) {
> +               case REQ_OP_READ:
> +               case REQ_OP_WRITE:
> +                       ret = mmc_blk_mq_issue_rw_rq(mq, req);
> +                       break;
> +               default:
> +                       WARN_ON_ONCE(1);
> +                       ret = -EINVAL;
> +               }
> +               if (!ret)
> +                       return MMC_REQ_STARTED;
> +               return ret == -EBUSY ? MMC_REQ_BUSY : MMC_REQ_FAILED_TO_START;
> +       default:
> +               WARN_ON_ONCE(1);
> +               return MMC_REQ_FAILED_TO_START;
> +       }
> +}

Again looks fine, again duplicates code. In this case I don't even
see why the MQ code needs its own copy of the issue function.

> +enum mmc_issue_type mmc_issue_type(struct mmc_queue *mq, struct request *req)
> +{
> +       if (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_WRITE)
> +               return MMC_ISSUE_ASYNC;
> +
> +       return MMC_ISSUE_SYNC;
> +}

Distinguishing between SYNC and ASYNC operations and using
that as abstraction is nice.

But you only do this in the new MQ code.

Instead, make this a separate patch and first refactor the old
code to use this distinction between SYNC and ASYNC.

Unfortunately I think Ulf's earlier criticism that you're rewriting
the world instead of refactoring what we have still stands on many
accounts here.

It makes it even harder to understand your persistence in keeping
the old block layer around. If you're introducing new concepts and
cleaner code in the MQ path and kind of discarding the old
block layer path, why keep it around at all?

I would have a much easier time accepting this patch if it
deleted as much as it was adding, i.e. introduce all this new
nice MQ code, but also tossing out the old block layer and error
handling code. Even if it is a massive rewrite, at least there
is just one body of code to maintain going forward.

That said, I would strongly prefer a refactoring of the old block
layer leading up to transitioning to MQ. But I am indeed biased
since I took that approach myself.

> +static enum blk_eh_timer_return mmc_mq_timed_out(struct request *req,
> +                                                bool reserved)
> +{
> +       return BLK_EH_RESET_TIMER;
> +}

This timeout looks like something I need to pick up in my patch
set as well. It seems good for stability to support this. But what happened
here? Did you experience a bunch of timeouts during development,
or let's say how was this engineered, I guess it is for the case when
something randomly locks up for a long time and we don't really know
what has happened, like a watchdog?

> +static int mmc_init_request(struct request_queue *q, struct request *req,
> +                           gfp_t gfp)
> +{
> +       return __mmc_init_request(q->queuedata, req, gfp);
> +}
> +
(...)
> +static int mmc_mq_init_request(struct blk_mq_tag_set *set, struct request *req,
> +                              unsigned int hctx_idx, unsigned int numa_node)
> +{
> +       return __mmc_init_request(set->driver_data, req, GFP_KERNEL);
> +}
> +
> +static void mmc_mq_exit_request(struct blk_mq_tag_set *set, struct request *req,
> +                               unsigned int hctx_idx)
> +{
> +       struct mmc_queue *mq = set->driver_data;
> +
> +       mmc_exit_request(mq->queue, req);
> +}

Here is more code duplication just to keep both the old block layer
and MQ around. Including introducing another inner __foo function
which I have something strongly against personally (I might be
crazily picky, because I see many people do this).

> +static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
> +                                   const struct blk_mq_queue_data *bd)
> +{
> +       struct request *req = bd->rq;
> +       struct request_queue *q = req->q;
> +       struct mmc_queue *mq = q->queuedata;
> +       struct mmc_card *card = mq->card;
> +       enum mmc_issue_type issue_type;
> +       enum mmc_issued issued;
> +       bool get_card;
> +       int ret;
> +
> +       if (mmc_card_removed(mq->card)) {
> +               req->rq_flags |= RQF_QUIET;
> +               return BLK_STS_IOERR;
> +       }
> +
> +       issue_type = mmc_issue_type(mq, req);
> +
> +       spin_lock_irq(q->queue_lock);
> +
> +       switch (issue_type) {
> +       case MMC_ISSUE_ASYNC:
> +               break;
> +       default:
> +               /*
> +                * Timeouts are handled by mmc core, so set a large value to
> +                * avoid races.
> +                */
> +               req->timeout = 600 * HZ;
> +               break;
> +       }

These timeouts again, does this mean we have competing timeout
code in the block layer and MMC?

This mentions timeouts in the MMC core, but they are actually
coming from the *MMC* core, when below you set:
blk_queue_rq_timeout(mq->queue, 60 * HZ);?

Isn't the actual case that the per-queue timeout is set up to
occur before the per-request timeout, and that you are hacking
around the block layer core having two different timeouts?

It's a bit confusing so I'd really like to know what's going on...

> +       mq->in_flight[issue_type] += 1;
> +       get_card = mmc_tot_in_flight(mq) == 1;

Parenthesis around the logical expression preferred I guess
get_card = (mmc_tot_in_flight(mq) == 1);
(Isn't checkpatch complaining about this?)

Then:
(...)
> +       if (get_card)
> +               mmc_get_card(card, &mq->ctx);

I simply took the card on every request. Since the context is the
same for all block layer business and the lock is now fully
reentrant this if (get_card) is not necessary. Just take it for
every request and release it in the .complete() callback.

> +#define MMC_QUEUE_DEPTH 64
> +
> +static int mmc_mq_init(struct mmc_queue *mq, struct mmc_card *card,
> +                        spinlock_t *lock)
> +{
> +       int q_depth;
> +       int ret;
> +
> +       q_depth = MMC_QUEUE_DEPTH;
> +
> +       ret = mmc_mq_init_queue(mq, q_depth, &mmc_mq_ops, lock);

Apart from using a define, then assigning the define to a
variable and then passing that variable instead of just
passing the define: why 64? Is that the depth of the CQE
queue? In that case we need an if (cqe) and set it down
to 2 for non-CQE.

> +       if (ret)
> +               return ret;
> +
> +       blk_queue_rq_timeout(mq->queue, 60 * HZ);

And requests timeout after 1 minute I take it.

I suspect both of these have some relation to CQE, so that is where
you find these long execution times etc?

> +static void mmc_mq_queue_suspend(struct mmc_queue *mq)
> +{
> +       blk_mq_quiesce_queue(mq->queue);
> +
> +       /*
> +        * The host remains claimed while there are outstanding requests, so
> +        * simply claiming and releasing here ensures there are none.
> +        */
> +       mmc_claim_host(mq->card->host);
> +       mmc_release_host(mq->card->host);

I think just blk_mq_quiesce_queue() should be fine as and
should make sure all requests have called .complete() and there
I think you should also release the host lock.

If the MQ code is not doing this, we need to fix MQ to
do the right thing (or add a new callback such as
blk_mq_make_sure_queue_empty()) so at the very
least put a big fat FIXME or REVISIT comment on the above.

> +static void mmc_mq_queue_resume(struct mmc_queue *mq)
> +{
> +       blk_mq_unquiesce_queue(mq->queue);
> +}
> +
> +static void __mmc_queue_suspend(struct mmc_queue *mq)
> +{
> +       struct request_queue *q = mq->queue;
> +       unsigned long flags;
> +
> +       if (!mq->suspended) {
> +               mq->suspended |= true;
> +
> +               spin_lock_irqsave(q->queue_lock, flags);
> +               blk_stop_queue(q);
> +               spin_unlock_irqrestore(q->queue_lock, flags);
> +
> +               down(&mq->thread_sem);
> +       }
> +}
> +
> +static void __mmc_queue_resume(struct mmc_queue *mq)
> +{
> +       struct request_queue *q = mq->queue;
> +       unsigned long flags;
> +
> +       if (mq->suspended) {
> +               mq->suspended = false;
> +
> +               up(&mq->thread_sem);
> +
> +               spin_lock_irqsave(q->queue_lock, flags);
> +               blk_start_queue(q);
> +               spin_unlock_irqrestore(q->queue_lock, flags);
> +       }
> +}

One of the good reasons to delete the old block layer is to get
rid of this horrible semaphore construction. So I see it as necessary
to be able to focus development efforts on code that actually has
a future.

> +       if (q->mq_ops)
> +               mmc_mq_queue_suspend(mq);
> +       else
> +               __mmc_queue_suspend(mq);

And then there is the code duplication again.

>         int                     qcnt;
> +
> +       int                     in_flight[MMC_ISSUE_MAX];

So this is a [2] containing a counter for the number of
synchronous and asynchronous requests in flight at any
time.

But are there really synchronous and asynchronous requests
going on at the same time?

Maybe on the error path I guess.

I avoided this completely but I guess it may be necessary with
CQE, such that in_flight[0,1] is way more than 1 or 2 at times
when there are commands queued?

> +       bool                    rw_wait;
> +       bool                    waiting;
> +       wait_queue_head_t       wait;

As mentioned I think this is a reimplementation of
the completion abstraction.

Yours,
Linus Walleij
Adrian Hunter Nov. 9, 2017, 10:42 a.m. UTC | #2
On 08/11/17 10:54, Linus Walleij wrote:
> On Fri, Nov 3, 2017 at 2:20 PM, Adrian Hunter <adrian.hunter@intel.com> wrote:
> 
>> Define and use a blk-mq queue. Discards and flushes are processed
>> synchronously, but reads and writes asynchronously. In order to support
>> slow DMA unmapping, DMA unmapping is not done until after the next request
>> is started. That means the request is not completed until then. If there is
>> no next request then the completion is done by queued work.
>>
>> Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
> 
>> -       blk_end_request_all(req, ret ? BLK_STS_IOERR : BLK_STS_OK);
>> +       if (req->mq_ctx)
>> +               blk_mq_end_request(req, ret ? BLK_STS_IOERR : BLK_STS_OK);
>> +       else
>> +               blk_end_request_all(req, ret ? BLK_STS_IOERR : BLK_STS_OK);
> 
> I think this quite obvious code duplication is unfortunate.
> 
> What my patches do is get rid of the old block layer in order
> to be able to focus on the new stuff using just MQ.
> 
> One reason is that the code is hairy already as it is, by just
> supporting MQ the above is still just one line of code, the same
> goes for the other instances below.
> 
> At least you could do what I did and break out a helper like
> this:
> 
> /*
>  * This reports status back to the block layer for a finished request.
>  */
> static void mmc_blk_complete(struct mmc_queue_req *mq_rq,
>                             blk_status_t status)
> {
>        struct request *req = mmc_queue_req_to_req(mq_rq);
> 
>        if (req->mq_ctx) {
>           blk_mq_end_request(req, status);
>        } else
>           blk_end_request_all(req, status);
> }

You are quibbling.  It makes next to no difference especially as it all goes
away when the legacy code is deleted.  I will change it in the next
revision, but what a waste of everyone's time.  Please try to focus on
things that matter.

>> +/* Single sector read during recovery */
>> +static void mmc_blk_ss_read(struct mmc_queue *mq, struct request *req)
>> +{
>> +       struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
>> +       blk_status_t status;
>> +
>> +       while (1) {
>> +               mmc_blk_rw_rq_prep(mqrq, mq->card, 1, mq);
>> +
>> +               mmc_wait_for_req(mq->card->host, &mqrq->brq.mrq);
>> +
>> +               /*
>> +                * Not expecting command errors, so just give up in that case.
>> +                * If there are retries remaining, the request will get
>> +                * requeued.
>> +                */
>> +               if (mqrq->brq.cmd.error)
>> +                       return;
>> +
>> +               if (blk_rq_bytes(req) <= 512)
>> +                       break;
>> +
>> +               status = mqrq->brq.data.error ? BLK_STS_IOERR : BLK_STS_OK;
>> +
>> +               blk_update_request(req, status, 512);
>> +       }
>> +
>> +       mqrq->retries = MMC_NO_RETRIES;
>> +}
>> +
>> +static void mmc_blk_rw_recovery(struct mmc_queue *mq, struct request *req)
>> +{
>> +       int type = rq_data_dir(req) == READ ? MMC_BLK_READ : MMC_BLK_WRITE;
>> +       struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
>> +       struct mmc_blk_request *brq = &mqrq->brq;
>> +       struct mmc_blk_data *md = mq->blkdata;
>> +       struct mmc_card *card = mq->card;
>> +       static enum mmc_blk_status status;
>> +
>> +       brq->retune_retry_done = mqrq->retries;
>> +
>> +       status = __mmc_blk_err_check(card, mqrq);
>> +
>> +       mmc_retune_release(card->host);
>> +
>> +       /*
>> +        * Requests are completed by mmc_blk_mq_complete_rq() which sets simple
>> +        * policy:
>> +        * 1. A request that has transferred at least some data is considered
>> +        * successful and will be requeued if there is remaining data to
>> +        * transfer.
>> +        * 2. Otherwise the number of retries is incremented and the request
>> +        * will be requeued if there are remaining retries.
>> +        * 3. Otherwise the request will be errored out.
>> +        * That means mmc_blk_mq_complete_rq() is controlled by bytes_xfered and
>> +        * mqrq->retries. So there are only 4 possible actions here:
>> +        *      1. do not accept the bytes_xfered value i.e. set it to zero
>> +        *      2. change mqrq->retries to determine the number of retries
>> +        *      3. try to reset the card
>> +        *      4. read one sector at a time
>> +        */
>> +       switch (status) {
>> +       case MMC_BLK_SUCCESS:
>> +       case MMC_BLK_PARTIAL:
>> +               /* Reset success, and accept bytes_xfered */
>> +               mmc_blk_reset_success(md, type);
>> +               break;
>> +       case MMC_BLK_CMD_ERR:
>> +               /*
>> +                * For SD cards, get bytes written, but do not accept
>> +                * bytes_xfered if that fails. For MMC cards accept
>> +                * bytes_xfered. Then try to reset. If reset fails then
>> +                * error out the remaining request, otherwise retry
>> +                * once (N.B mmc_blk_reset() will not succeed twice in a
>> +                * row).
>> +                */
>> +               if (mmc_card_sd(card)) {
>> +                       u32 blocks;
>> +                       int err;
>> +
>> +                       err = mmc_sd_num_wr_blocks(card, &blocks);
>> +                       if (err)
>> +                               brq->data.bytes_xfered = 0;
>> +                       else
>> +                               brq->data.bytes_xfered = blocks << 9;
>> +               }
>> +               if (mmc_blk_reset(md, card->host, type))
>> +                       mqrq->retries = MMC_NO_RETRIES;
>> +               else
>> +                       mqrq->retries = MMC_MAX_RETRIES - 1;
>> +               break;
>> +       case MMC_BLK_RETRY:
>> +               /*
>> +                * Do not accept bytes_xfered, but retry up to 5 times,
>> +                * otherwise same as abort.
>> +                */
>> +               brq->data.bytes_xfered = 0;
>> +               if (mqrq->retries < MMC_MAX_RETRIES)
>> +                       break;
>> +               /* Fall through */
>> +       case MMC_BLK_ABORT:
>> +               /*
>> +                * Do not accept bytes_xfered, but try to reset. If
>> +                * reset succeeds, try once more, otherwise error out
>> +                * the request.
>> +                */
>> +               brq->data.bytes_xfered = 0;
>> +               if (mmc_blk_reset(md, card->host, type))
>> +                       mqrq->retries = MMC_NO_RETRIES;
>> +               else
>> +                       mqrq->retries = MMC_MAX_RETRIES - 1;
>> +               break;
>> +       case MMC_BLK_DATA_ERR: {
>> +               int err;
>> +
>> +               /*
>> +                * Do not accept bytes_xfered, but try to reset. If
>> +                * reset succeeds, try once more. If reset fails with
>> +                * ENODEV which means the partition is wrong, then error
>> +                * out the request. Otherwise attempt to read one sector
>> +                * at a time.
>> +                */
>> +               brq->data.bytes_xfered = 0;
>> +               err = mmc_blk_reset(md, card->host, type);
>> +               if (!err) {
>> +                       mqrq->retries = MMC_MAX_RETRIES - 1;
>> +                       break;
>> +               }
>> +               if (err == -ENODEV) {
>> +                       mqrq->retries = MMC_NO_RETRIES;
>> +                       break;
>> +               }
>> +               /* Fall through */
>> +       }
>> +       case MMC_BLK_ECC_ERR:
>> +               /*
>> +                * Do not accept bytes_xfered. If reading more than one
>> +                * sector, try reading one sector at a time.
>> +                */
>> +               brq->data.bytes_xfered = 0;
>> +               /* FIXME: Missing single sector read for large sector size */
>> +               if (brq->data.blocks > 1 && !mmc_large_sector(card)) {
>> +                       /* Redo read one sector at a time */
>> +                       pr_warn("%s: retrying using single block read\n",
>> +                               req->rq_disk->disk_name);
>> +                       mmc_blk_ss_read(mq, req);
>> +               } else {
>> +                       mqrq->retries = MMC_NO_RETRIES;
>> +               }
>> +               break;
>> +       case MMC_BLK_NOMEDIUM:
>> +               /* Do not accept bytes_xfered. Error out the request */
>> +               brq->data.bytes_xfered = 0;
>> +               mqrq->retries = MMC_NO_RETRIES;
>> +               break;
>> +       default:
>> +               /* Do not accept bytes_xfered. Error out the request */
>> +               brq->data.bytes_xfered = 0;
>> +               mqrq->retries = MMC_NO_RETRIES;
>> +               pr_err("%s: Unhandled return value (%d)",
>> +                      req->rq_disk->disk_name, status);
>> +               break;
>> +       }
>> +}
>> +
>> +static void mmc_blk_mq_complete_rq(struct mmc_queue *mq, struct request *req)
>> +{
>> +       struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
>> +       unsigned int nr_bytes = mqrq->brq.data.bytes_xfered;
>> +
>> +       if (nr_bytes) {
>> +               if (blk_update_request(req, BLK_STS_OK, nr_bytes))
>> +                       blk_mq_requeue_request(req, true);
>> +               else
>> +                       __blk_mq_end_request(req, BLK_STS_OK);
>> +       } else if (mqrq->retries++ < MMC_MAX_RETRIES) {
>> +               blk_mq_requeue_request(req, true);
>> +       } else {
>> +               if (mmc_card_removed(mq->card))
>> +                       req->rq_flags |= RQF_QUIET;
>> +               blk_mq_end_request(req, BLK_STS_IOERR);
>> +       }
>> +}
> 
> This retry and error handling using requeue is very elegant.
> I really like this.
> 
> If we could also go for MQ-only, only this nice code
> remains in the tree.

No one has ever suggested that the legacy API will remain.  Once blk-mq is
ready the old code gets deleted.

> 
> The problem: you have just reimplemented the whole error
> handling we had in the old block layer and now we have to
> maintain two copies and keep them in sync.
> 
> This is not OK IMO, we will inevitable screw it up, so we
> need to get *one* error path.

Wow, you really didn't read the code at all.  As I have repeatedly pointed
out, the new code is all new.  There is no overlap and there nothing to keep
in sync.  It may not look like it in this patch, but that is only because of
the ridiculous idea of splitting up the patch.

> 
>> +static bool mmc_blk_urgent_bkops_needed(struct mmc_queue *mq,
>> +                                       struct mmc_queue_req *mqrq)
>> +{
>> +       return mmc_card_mmc(mq->card) &&
>> +              (mqrq->brq.cmd.resp[0] & R1_EXCEPTION_EVENT ||
>> +               mqrq->brq.stop.resp[0] & R1_EXCEPTION_EVENT);
>> +}
>> +
>> +static void mmc_blk_urgent_bkops(struct mmc_queue *mq,
>> +                                struct mmc_queue_req *mqrq)
>> +{
>> +       if (mmc_blk_urgent_bkops_needed(mq, mqrq))
>> +               mmc_start_bkops(mq->card, true);
>> +}
>> +
>> +void mmc_blk_mq_complete(struct request *req)
>> +{
>> +       struct mmc_queue *mq = req->q->queuedata;
>> +
>> +       mmc_blk_mq_complete_rq(mq, req);
>> +}
> 
> So this is called from the struct blk_mq_ops .complete()
> callback. And this calls blk_mq_end_request().
> 
> So the semantic order needs to be complete -> end.
> 
> I see this pattern in newer MQ code, I got it wrong in
> my patch set so I try to fix it up.
> 
>> +static void mmc_blk_mq_poll_completion(struct mmc_queue *mq,
>> +                                      struct request *req)
>> +{
>> +       struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
>> +
>> +       mmc_blk_rw_recovery(mq, req);
>> +
>> +       mmc_blk_urgent_bkops(mq, mqrq);
>> +}
> 
> This looks nice.
> 
>> +static void mmc_blk_mq_acct_req_done(struct mmc_queue *mq, struct request *req)
> 
> What does "acct" mean in the above function name?
> Accounting? Actual? I'm lost.

Does "actual" have two "c"'s?  You are just making things up.  Of course it
is "account".  It is counting the number of requests in flight - which is
pretty obvious from the code.  We use that to support important features
like CQE re-tuning and avoiding getting / putting the card all the time.

> 
>> +{
>> +       struct request_queue *q = req->q;
>> +       unsigned long flags;
>> +       bool put_card;
>> +
>> +       spin_lock_irqsave(q->queue_lock, flags);
>> +
>> +       mq->in_flight[mmc_issue_type(mq, req)] -= 1;
>> +
>> +       put_card = mmc_tot_in_flight(mq) == 0;
> 
> This in_flight[] business seems a bit kludgy, but I
> don't really understand it fully. Magic numbers like
> -1 to mark that something is not going on etc, not
> super-elegant.

You are misreading.  It subtracts 1 from the number of requests in flight.

> 
> I believe it is necessary for CQE though as you need
> to keep track of outstanding requests?

We have always avoided getting / putting the card when there is another
request in flight.  Yes it is also used for CQE.

> 
>> +
>> +       spin_unlock_irqrestore(q->queue_lock, flags);
>> +
>> +       if (put_card)
>> +               mmc_put_card(mq->card, &mq->ctx);
> 
> I think you should try not to sprinkle mmc_put_card() inside
> the different functions but instead you can put this in the
> .complete callback I guess mmc_blk_mq_complete() in your
> patch set.

This is the *only* block.c function in the blk-mq code that calls
mmc_put_card().  The queue also does it for requests that didn't even
start.  That is it.

> 
> Also you do not need to avoid calling it several times with
> that put_card variable. It's fully reentrant thanks to your
> own code in the lock and all calls come from the same block
> layer process if you call it in .complete() I think?

We have always avoided unnecessary gets / puts.  Since the result is better,
why on earth take it out?

> 
>> +static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req)
>> +{
>> +       struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
>> +       struct mmc_request *mrq = &mqrq->brq.mrq;
>> +       struct mmc_host *host = mq->card->host;
>> +
>> +       if (host->ops->post_req)
>> +               host->ops->post_req(host, mrq, 0);
>> +
>> +       blk_mq_complete_request(req);
>> +
>> +       mmc_blk_mq_acct_req_done(mq, req);
>> +}
> 
> Now the problem Ulf has pointed out starts to creep out in the
> patch: a lot of code duplication on the MQ path compared to
> the ordinary block layer path.
> 
> My approach was structured partly to avoid this: first refactor
> the old path, then switch to (only) MQ to avoid code duplication.

The old code is rubbish.  There is nothing of value there.  You have to make
the case why you are wasting everyone's time churning crappy code.  The idea
that nice new code is wrong because it doesn't churn the old rubbish code,
is ridiculous.

> 
>> +static void mmc_blk_mq_complete_prev_req(struct mmc_queue *mq,
>> +                                        struct request **prev_req)
>> +{
>> +       mutex_lock(&mq->complete_lock);
>> +
>> +       if (!mq->complete_req)
>> +               goto out_unlock;
>> +
>> +       mmc_blk_mq_poll_completion(mq, mq->complete_req);
>> +
>> +       if (prev_req)
>> +               *prev_req = mq->complete_req;
>> +       else
>> +               mmc_blk_mq_post_req(mq, mq->complete_req);
>> +
>> +       mq->complete_req = NULL;
>> +
>> +out_unlock:
>> +       mutex_unlock(&mq->complete_lock);
>> +}
> 
> This looks a bit like it is reimplementing the kernel
> completion abstraction using a mutex and a variable named
> .complete_req?
> 
> We were using a completion in the old block layer so
> why did you not use it for MQ?

Doesn't seem like you pay much attention to this stuff.  The previous
request has to be completed even if there is no next request.  That means
scheduling work, which then races with the dispatch path.  So mutual exclusion
is necessary.

> 
>> +static void mmc_blk_mq_req_done(struct mmc_request *mrq)
>> +{
>> +       struct mmc_queue_req *mqrq = container_of(mrq, struct mmc_queue_req,
>> +                                                 brq.mrq);
>> +       struct request *req = mmc_queue_req_to_req(mqrq);
>> +       struct request_queue *q = req->q;
>> +       struct mmc_queue *mq = q->queuedata;
>> +       unsigned long flags;
>> +       bool waiting;
>> +
>> +       spin_lock_irqsave(q->queue_lock, flags);
>> +       mq->complete_req = req;
>> +       mq->rw_wait = false;
>> +       waiting = mq->waiting;
>> +       spin_unlock_irqrestore(q->queue_lock, flags);
>> +
>> +       if (waiting)
>> +               wake_up(&mq->wait);
> 
> I would contest using a waitqueue for this. The name even says
> "complete_req" so why is a completion not the right thing to
> hang on rather than a waitqueue?

I explained that above.

> 
> The completion already contains a waitqueue, so I think you're
> just essentially reimplementing it.
> 
> Just complete(&mq->mq_req_complete) or something should do
> the trick.

Nope.

> 
>> +       else
>> +               kblockd_schedule_work(&mq->complete_work);
> 
> I did not use the kblockd workqueue for this, out of fear
> that it would interfere and disturb the block layer work items.
> My intuitive idea was that the MMC layer needed its own
> worker (like in the past it used a thread) in order to avoid
> congestion in the block layer queue leading to unnecessary
> delays.

The complete work races with the dispatch of the next request, so putting
them in the same workqueue makes sense. i.e. the one that gets processed
first would anyway delay the one that gets processed second.

> 
> On the other hand, this likely avoids a context switch if there
> is no congestion on the queue.
> 
> I am uncertain when it is advisible to use the block layer
> queue for subsystems like MMC/SD.
> 
> Would be nice to see some direction from the block layer
> folks here, it is indeed exposed to us...
> 
> My performance tests show no problems with this approach
> though.

As I already wrote, the CPU-bound block layer dispatch work queue has
negative consequences for mmc performance.  So there are 2 aspects to that:
	1. Is the choice of CPU right to start with?  I suspect it is better for
the dispatch to run on the same CPU as the interrupt.
	2. Does the dispatch work need to be able to be migrated to a different
CPU? i.e. unbound work queue.  That helped in my tests, but it could just be
a side-effect of 1.

Of course we can't start looking at these real issues, while you are

> 
>> +static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err)
>> +{
>> +       struct request_queue *q = mq->queue;
>> +       unsigned long flags;
>> +       bool done;
>> +
>> +       spin_lock_irqsave(q->queue_lock, flags);
>> +       done = !mq->rw_wait;
>> +       mq->waiting = !done;
>> +       spin_unlock_irqrestore(q->queue_lock, flags);
> 
> This makes it look like a reimplementation of completion_done()
> so I think you should use the completion abstraction again. The
> struct completion even contains a variable named "done".

This just serves as an example of why splitting up the patch was such a bad
idea.  For direct completion, the wait can result in recovery being needed
for the previous request, so the current request gets requeued.

> 
>> +static int mmc_blk_mq_issue_rw_rq(struct mmc_queue *mq,
>> +                                 struct request *req)
>> +{
>> +       struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
>> +       struct mmc_host *host = mq->card->host;
>> +       struct request *prev_req = NULL;
>> +       int err = 0;
>> +
>> +       mmc_blk_rw_rq_prep(mqrq, mq->card, 0, mq);
>> +
>> +       mqrq->brq.mrq.done = mmc_blk_mq_req_done;
>> +
>> +       if (host->ops->pre_req)
>> +               host->ops->pre_req(host, &mqrq->brq.mrq);
>> +
>> +       err = mmc_blk_rw_wait(mq, &prev_req);
>> +       if (err)
>> +               goto out_post_req;
>> +
>> +       mq->rw_wait = true;
>> +
>> +       err = mmc_start_request(host, &mqrq->brq.mrq);
>> +
>> +       if (prev_req)
>> +               mmc_blk_mq_post_req(mq, prev_req);
>> +
>> +       if (err)
>> +               mq->rw_wait = false;
>> +
>> +out_post_req:
>> +       if (err && host->ops->post_req)
>> +               host->ops->post_req(host, &mqrq->brq.mrq, err);
>> +
>> +       return err;
>> +}
> 
> This is pretty straight-forward (pending the comments above).
> Again it has the downside of duplicating the same code for the
> old block layer instead of refactoring.

No, the old code is rubbish.  Churning it is a waste of time.

> 
>> +enum mmc_issued mmc_blk_mq_issue_rq(struct mmc_queue *mq, struct request *req)
>> +{
>> +       struct mmc_blk_data *md = mq->blkdata;
>> +       struct mmc_card *card = md->queue.card;
>> +       struct mmc_host *host = card->host;
>> +       int ret;
>> +
>> +       ret = mmc_blk_part_switch(card, md->part_type);
>> +       if (ret)
>> +               return MMC_REQ_FAILED_TO_START;
>> +
>> +       switch (mmc_issue_type(mq, req)) {
>> +       case MMC_ISSUE_SYNC:
>> +               ret = mmc_blk_wait_for_idle(mq, host);
>> +               if (ret)
>> +                       return MMC_REQ_BUSY;
>> +               switch (req_op(req)) {
>> +               case REQ_OP_DRV_IN:
>> +               case REQ_OP_DRV_OUT:
>> +                       mmc_blk_issue_drv_op(mq, req);
>> +                       break;
>> +               case REQ_OP_DISCARD:
>> +                       mmc_blk_issue_discard_rq(mq, req);
>> +                       break;
>> +               case REQ_OP_SECURE_ERASE:
>> +                       mmc_blk_issue_secdiscard_rq(mq, req);
>> +                       break;
>> +               case REQ_OP_FLUSH:
>> +                       mmc_blk_issue_flush(mq, req);
>> +                       break;
>> +               default:
>> +                       WARN_ON_ONCE(1);
>> +                       return MMC_REQ_FAILED_TO_START;
>> +               }
>> +               return MMC_REQ_FINISHED;
>> +       case MMC_ISSUE_ASYNC:
>> +               switch (req_op(req)) {
>> +               case REQ_OP_READ:
>> +               case REQ_OP_WRITE:
>> +                       ret = mmc_blk_mq_issue_rw_rq(mq, req);
>> +                       break;
>> +               default:
>> +                       WARN_ON_ONCE(1);
>> +                       ret = -EINVAL;
>> +               }
>> +               if (!ret)
>> +                       return MMC_REQ_STARTED;
>> +               return ret == -EBUSY ? MMC_REQ_BUSY : MMC_REQ_FAILED_TO_START;
>> +       default:
>> +               WARN_ON_ONCE(1);
>> +               return MMC_REQ_FAILED_TO_START;
>> +       }
>> +}
> 
> Again looks fine, again duplicates code. In this case I don't even
> see why the MQ code needs its own copy of the issue funtion.

Because it has to support CQE.  This attitude against CQE is very disappointing!

> 
>> +enum mmc_issue_type mmc_issue_type(struct mmc_queue *mq, struct request *req)
>> +{
>> +       if (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_WRITE)
>> +               return MMC_ISSUE_ASYNC;
>> +
>> +       return MMC_ISSUE_SYNC;
>> +}
> 
> Distinguishing between SYNC and ASYNC operations and using
> that as abstraction is nice.
> 
> But you only do this in the new MQ code.
> 
> Instead, make this a separate patch and first refactor the old
> code to use this distinction between SYNC and ASYNC.

That is a non-starter.  The old code is rubbish.  Point to something
work-saving.  There isn't anything.

> 
> Unfortunately I think Ulf's earlier criticism that you're rewriting
> the world instead of refactoring what we have still stands on many
> accounts here.

Nope.  It is just an excuse to delay the patches.  You guys are playing
games and it is embarrassing for Linux.  What is actually wrong with this
technically?  It is not good because it doesn't churn the old code?  That is
ridiculous.

> 
> It makes it even harder to understand your persistance in keeping
> the old block layer around. If you're introducing new concepts and
> cleaner code in the MQ path and kind of discarding the old
> block layer path, why keep it around at all?

Wow, you really like making things up.  Never have I suggested keeping the
old code.  It is rubbish.  As soon as blk-mq is ready and tested, delete
the old crap.

I was expecting CQE to be applied 6 months ago, supporting the legacy blk
layer until blk-mq was ready.  But you never delivered on blk-mq, which is
why I had to do it.  And now you are making up excuses about why we can't
move forward.

> 
> I would have a much easier time accepting this patch if it
> deleted as much as it was adding, i.e. introduce all this new
> nice MQ code, but also tossing out the old block layer and error
> handling code. Even if it is a massive rewrite, at least there
> is just one body of code to maintain going forward.

How can you possibly call a few hundred lines massive?  The kernel has
millions of lines.  Your sense of scale is out of whack.

> 
> That said, I would strongly prefer a refactoring of the old block
> layer leading up to transitioning to MQ. But I am indeed biased
> since I took that approach myself.

Well stop it.  We have nice working code.  Get it applied and tested, and
then we can delete the old crap.

> 
>> +static enum blk_eh_timer_return mmc_mq_timed_out(struct request *req,
>> +                                                bool reserved)
>> +{
>> +       return BLK_EH_RESET_TIMER;
>> +}
> 
> This timeout looks like something I need to pick up in my patch
> set as well. It seems good for stability to support this. But what happened
> here? Did you experience a bunch of timeouts during development,
> or let's say how was this engineered, I guess it is for the case when
> something randomly locks up for a long time and we don't really know
> what has happened, like a watchdog?

We presently don't have the host APIs to support external timeouts.  CQE
uses them though.

> 
>> +static int mmc_init_request(struct request_queue *q, struct request *req,
>> +                           gfp_t gfp)
>> +{
>> +       return __mmc_init_request(q->queuedata, req, gfp);
>> +}
>> +
> (...)
>> +static int mmc_mq_init_request(struct blk_mq_tag_set *set, struct request *req,
>> +                              unsigned int hctx_idx, unsigned int numa_node)
>> +{
>> +       return __mmc_init_request(set->driver_data, req, GFP_KERNEL);
>> +}
>> +
>> +static void mmc_mq_exit_request(struct blk_mq_tag_set *set, struct request *req,
>> +                               unsigned int hctx_idx)
>> +{
>> +       struct mmc_queue *mq = set->driver_data;
>> +
>> +       mmc_exit_request(mq->queue, req);
>> +}
> 
> Here is more code duplication just to keep both the old block layer
> and MQ around. Including introducing another inner __foo function
> which I have something strongly against personally (I might be
> crazily picky, because I see many people do this).

In this case, it is not code duplication; it is re-using the same code but
called from the blk-mq API.

> 
>> +static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
>> +                                   const struct blk_mq_queue_data *bd)
>> +{
>> +       struct request *req = bd->rq;
>> +       struct request_queue *q = req->q;
>> +       struct mmc_queue *mq = q->queuedata;
>> +       struct mmc_card *card = mq->card;
>> +       enum mmc_issue_type issue_type;
>> +       enum mmc_issued issued;
>> +       bool get_card;
>> +       int ret;
>> +
>> +       if (mmc_card_removed(mq->card)) {
>> +               req->rq_flags |= RQF_QUIET;
>> +               return BLK_STS_IOERR;
>> +       }
>> +
>> +       issue_type = mmc_issue_type(mq, req);
>> +
>> +       spin_lock_irq(q->queue_lock);
>> +
>> +       switch (issue_type) {
>> +       case MMC_ISSUE_ASYNC:
>> +               break;
>> +       default:
>> +               /*
>> +                * Timeouts are handled by mmc core, so set a large value to
>> +                * avoid races.
>> +                */
>> +               req->timeout = 600 * HZ;
>> +               break;
>> +       }
> 
> These timeouts again, does this mean we have competing timeout
> code in the block layer and MMC?

Yes - the host controller provides hardware timeout interrupts in most
cases.  The core provides software timeouts in other cases.

> 
> This mentions timeouts in the MMC core, but they are actually
> coming from the *MMC* core, when below you set:
> blk_queue_rq_timeout(mq->queue, 60 * HZ);?
> 
> Isn't the actual case that the per-queue timeout is set up to
> occur before the per-request timeout, and that you are hacking
> around the block layer core having two different timeouts?

There is no per-queue timeout.  The request timeout has a default value
given by the queue.  It can be changed for different requests.

> 
> It's a bit confusing so I'd really like to know what's going on...

I don't expect to have to teach you the block layer.

> 
>> +       mq->in_flight[issue_type] += 1;
>> +       get_card = mmc_tot_in_flight(mq) == 1;
> 
> Parenthesis around the logical expression preferred I guess
> get_card = (mmc_tot_in_flight(mq) == 1);
> (Isn't checkpatch complaining about this?)

Nope

> 
> Then:
> (...)
>> +       if (get_card)
>> +               mmc_get_card(card, &mq->ctx);
> 
> I simply took the card on every request. Since the context is the
> same for all block layer business and the lock is now fully
> reentrant this if (get_card) is not necessary. Just take it for
> every request and release it in the .complete() callback.

As I have written elsewhere, we have always avoided getting / putting
unnecessarily.  It is better that way, so no point in taking it out.

> 
>> +#define MMC_QUEUE_DEPTH 64
>> +
>> +static int mmc_mq_init(struct mmc_queue *mq, struct mmc_card *card,
>> +                        spinlock_t *lock)
>> +{
>> +       int q_depth;
>> +       int ret;
>> +
>> +       q_depth = MMC_QUEUE_DEPTH;
>> +
>> +       ret = mmc_mq_init_queue(mq, q_depth, &mmc_mq_ops, lock);
> 
> Apart from using a define, then assigning the define to a
> variable and then passing that variable instead of just
> passing the define: why 64? Is that the depth of the CQE
> queue? In that case we need an if (cqe) and set it down
> to 2 for non-CQE.

Are you ever going to learn about the block layer?  The number of requests
is default 128 for the legacy block layer.  For blk-mq it is queue depth
times 2.  So 64 gives the same number of requests as before.

> 
>> +       if (ret)
>> +               return ret;
>> +
>> +       blk_queue_rq_timeout(mq->queue, 60 * HZ);
> 
> And requests timeout after 1 minute I take it.
> 
> I suspect both of these have some relation to CQE, so that is where
> you find these long execution times etc?

For legacy mmc, the core takes care of timeouts.  For CQE we expect reliable
devices and I would interpret a timeout as meaning the device is broken.
However it is sensible to have anyway.  For CQE, a request might have to
wait for the entire rest of the queue to be processed first, or maybe the
request somehow gets stuck and there are other requests constantly
overtaking it.  The max queue depth is 32 so 60 seconds seems ok.

> 
>> +static void mmc_mq_queue_suspend(struct mmc_queue *mq)
>> +{
>> +       blk_mq_quiesce_queue(mq->queue);
>> +
>> +       /*
>> +        * The host remains claimed while there are outstanding requests, so
>> +        * simply claiming and releasing here ensures there are none.
>> +        */
>> +       mmc_claim_host(mq->card->host);
>> +       mmc_release_host(mq->card->host);
> 
> I think just blk_mq_quiesce_queue() should be fine as and
> should make sure all requests have called .complete() and there
> I think you should also release the host lock.
> 
> If the MQ code is not doing this, we need to fix MQ to
> do the right thing (or add a new callback such as
> blk_mq_make_sure_queue_empty()) so at the very
> least put a big fat FIXME or REVISIT comment on the above.

blk_mq_quiesce_queue() prevents dispatches not completions.  So we wait for
outstanding requests.

> 
>> +static void mmc_mq_queue_resume(struct mmc_queue *mq)
>> +{
>> +       blk_mq_unquiesce_queue(mq->queue);
>> +}
>> +
>> +static void __mmc_queue_suspend(struct mmc_queue *mq)
>> +{
>> +       struct request_queue *q = mq->queue;
>> +       unsigned long flags;
>> +
>> +       if (!mq->suspended) {
>> +               mq->suspended |= true;
>> +
>> +               spin_lock_irqsave(q->queue_lock, flags);
>> +               blk_stop_queue(q);
>> +               spin_unlock_irqrestore(q->queue_lock, flags);
>> +
>> +               down(&mq->thread_sem);
>> +       }
>> +}
>> +
>> +static void __mmc_queue_resume(struct mmc_queue *mq)
>> +{
>> +       struct request_queue *q = mq->queue;
>> +       unsigned long flags;
>> +
>> +       if (mq->suspended) {
>> +               mq->suspended = false;
>> +
>> +               up(&mq->thread_sem);
>> +
>> +               spin_lock_irqsave(q->queue_lock, flags);
>> +               blk_start_queue(q);
>> +               spin_unlock_irqrestore(q->queue_lock, flags);
>> +       }
>> +}
> 
> One of the good reasons to delete the old block layer is to get
> rid of this horrible semaphore construction. So I see it as necessary
> to be able to focus development efforts on code that actually has
> a future.

The old crap will get deleted when blk-mq is ready.

> 
>> +       if (q->mq_ops)
>> +               mmc_mq_queue_suspend(mq);
>> +       else
>> +               __mmc_queue_suspend(mq);
> 
> And then there is the code duplication again.

The code is not duplicated; the blk-mq code is completely different.  The old
crap will get deleted when blk-mq is ready.

> 
>>         int                     qcnt;
>> +
>> +       int                     in_flight[MMC_ISSUE_MAX];
> 
> So this is a [2] containing a counter for the number of
> synchronous and asynchronous requests in flight at any
> time.
> 
> But are there really synchronous and asynchronous requests
> going on at the same time?
> 
> Maybe on the error path I guess.
> 
> I avoided this completely but I guess it may be necessary with
> CQE, such that in_flight[0,1] is way more than 1 or 2 at times
> when there are commands queued?

CQE needs to count DCMD separately from read / writes.  Counting by issue
type is a simple way to do that.

I already pointed out that the code makes more sense together than split up.

> 
>> +       bool                    rw_wait;
>> +       bool                    waiting;
>> +       wait_queue_head_t       wait;
> 
> As mentioned I think this is a reimplementation of
> the completion abstraction.

I pointed out why that wouldn't work.  Another case of why the code makes
more sense together than split up.
Linus Walleij Nov. 9, 2017, 3:52 p.m. UTC | #3
On Thu, Nov 9, 2017 at 11:42 AM, Adrian Hunter <adrian.hunter@intel.com> wrote:
> On 08/11/17 10:54, Linus Walleij wrote:
>> On Fri, Nov 3, 2017 at 2:20 PM, Adrian Hunter <adrian.hunter@intel.com> wrote:

>> At least you could do what I did and break out a helper like
>> this:
>>
>> /*
>>  * This reports status back to the block layer for a finished request.
>>  */
>> static void mmc_blk_complete(struct mmc_queue_req *mq_rq,
>>                             blk_status_t status)
>> {
>>        struct request *req = mmc_queue_req_to_req(mq_rq);
>>
>>        if (req->mq_ctx) {
>>           blk_mq_end_request(req, status);
>>        } else
>>           blk_end_request_all(req, status);
>> }
>
> You are quibbling.

Hm wiktionary "quibble" points to:
https://en.wikipedia.org/wiki/Trivial_objections

Sorry for not being a native speaker in English, I think that
maybe you were again trying to be snarky to a reviewer but
I honestly don't know.

> It makes next to no difference especially as it all goes
> away when the legacy code is deleted.

Which is something your patch set doesn't do. Nor did you write
anywhere that deleting the legacy code was your intention.
But I'm happy to hear it.

>  I will change it in the next
> revision, but what a waste of everyone's time.
>  Please try to focus on
> things that matter.

As Jean-Paul Sartre said "hell is other people".
Can you try to be friendlier anyway?

>> This retry and error handling using requeue is very elegant.
>> I really like this.
>>
>> If we could also go for MQ-only, only this nice code
>> remains in the tree.
>
> No one has ever suggested that the legacy API will remain.  Once blk-mq is
> ready the old code gets deleted.

The block layer maintainers most definately think MQ is ready
but you seem to disagree. Why?

In the recent LWN article from Kernel Recipes:
https://lwn.net/Articles/735275/
the only obstacle I can see is a mention that SCSI was not
switched over by default because of some problems with
slow rotating media. "This change had to be backed out
because of some performance and scalability issues that
arose for rotating storage."

Christoph mentions that he's gonna try again for v4.14.
But it's true, I do not see that happening in linux-next yet.

But that is specifically rotating media, which MMC/SD is not.
And UBI is using it since ages without complaints. And
that is a sign of something.

Have you seen any problems when deploying it on MMC/SD,
really? If there are problems I bet the block layer people want
us to find them, diagnose them and ultimately fix them rather
than wait for them to do it. I haven't seen any performance or
scalability issues. At one point I had processes running
on two eMMC and one SD-card and apart from maxing out
the CPUs and DMA backplane as could be expected all was
fine.

>> The problem: you have just reimplemented the whole error
>> handling we had in the old block layer and now we have to
>> maintain two copies and keep them in sync.
>>
>> This is not OK IMO, we will inevitable screw it up, so we
>> need to get *one* error path.
>
> Wow, you really didn't read the code at all.

Just quit this snarky attitude.

> As I have repeatedly pointed
> out, the new code is all new.  There is no overlap and there nothing to keep
> in sync.

The problem is that you have not clearly communicated your
vision to delete the old code. The best way to communicate that
would be to include a patch actually deleting it.

>  It may not look like it in this patch, but that is only because of
> the ridiculous idea of splitting up the patch.

Naming other people's review feedback as "ridiculous" is not very
helpful. I think the patch set became much easier to review
like this so I am happy with this format.

>>> +static void mmc_blk_mq_acct_req_done(struct mmc_queue *mq, struct request *req)
>>
>> What does "acct" mean in the above function name?
>> Accounting? Actual? I'm lost.
>
> Does "actual" have two "c"'s.  You are just making things up.

Please stop your snarky and belittling attitude.

I am not a native English speaker and I am not making up
my review comments in order to annoy you.

Consider Hanlon's razor:
https://en.wikipedia.org/wiki/Hanlon%27s_razor
"Never attribute to malice that which is adequately explained by
stupidity".

It's bad enough that you repeatedly point out how stupid
you think I am, I am sorry about that, in my opinion sometimes
other people are really stupid but more often than not the problem
is really on your side, like being impatient and unwilling to educate
others about how your code actually works because it seems
so evident to yourself that you think it should be evident to everyone
else as well. I don't know what to say about that, it seems a bit
unempathetic.

But you even go and think I am making stuff up in purpose.
That's pretty offensive.

> Of course it is "account".

This is maybe obvious for you but it was not to me.

>  It is counting the number of requests in flight - which is
> pretty obvious from the code.  We use that to support important features
> like CQE re-tuning and avoiding getting / putting the card all the time.

What is so hard with getting this point across without insults?

>>> +{
>>> +       struct request_queue *q = req->q;
>>> +       unsigned long flags;
>>> +       bool put_card;
>>> +
>>> +       spin_lock_irqsave(q->queue_lock, flags);
>>> +
>>> +       mq->in_flight[mmc_issue_type(mq, req)] -= 1;
>>> +
>>> +       put_card = mmc_tot_in_flight(mq) == 0;
>>
>> This in_flight[] business seems a bit kludgy, but I
>> don't really understand it fully. Magic numbers like
>> -1 to mark that something is not going on etc, not
>> super-elegant.
>
> You are misreading.  It subtracts 1 from the number of requests in flight.

Right! Sorry I misread it.

>> I believe it is necessary for CQE though as you need
>> to keep track of outstanding requests?
>
> We have always avoided getting / putting the card when there is another
> request in flight.  Yes it is also used for CQE.

Yeah I took the approach to get/put the card around each
request instead. It might make things a bit simpler.

>>> +       spin_unlock_irqrestore(q->queue_lock, flags);
>>> +
>>> +       if (put_card)
>>> +               mmc_put_card(mq->card, &mq->ctx);
>>
>> I think you should try not to sprinkle mmc_put_card() inside
>> the different functions but instead you can put this in the
>> .complete callback I guess mmc_blk_mq_complete() in your
>> patch set.
>
> This is the *only* block.c function in the blk-mq code that calls
> mmc_put_card().  The queue also has does it for requests that didn't even
> start.  That is it.

Yeah that's correct. If we also delete the old code it will not be
too bad.

>> Also you do not need to avoid calling it several times with
>> that put_card variable. It's fully reentrant thanks to your
>> own code in the lock and all calls come from the same block
>> layer process if you call it in .complete() I think?
>
> We have always avoided unnecessary gets / puts.  Since the result is better,
> why on earth take it out?

It's not a showstopper.

But what I think is nice in doing it around
each request is that since mmc_put_card() calls mmc_release_host()
contains this:

if (--host->claim_cnt) { (...)

So there is a counter inside the claim/release host functions
and now there is another counter keeping track of the in-flight
requests as well and it gives me the feeling we are maintaining
two counters when we only need one.

But maybe it is actually the host->claim_cnt that is overengineered
and should just be a bool, because with this construction
that you only call down and claim the host once, the
host->claim_cnt will only be 0 or 1, right?

>> Now the problem Ulf has pointed out starts to creep out in the
>> patch: a lot of code duplication on the MQ path compared to
>> the ordinary block layer path.
>>
>> My approach was structured partly to avoid this: first refactor
>> the old path, then switch to (only) MQ to avoid code duplication.
>
> The old code is rubbish.  There is nothing of value there.  You have to make
> the case why you are wasting everyone's time churning crappy code.

I kind of agree that the old code is rubbish, actually.

I guess I could turn that around and ask: if it is so crappy, why
is your patch set not deleting it?

And I guess the only reasonable answer to that would be what
you were saying above, something along the lines of "MQ is not
ready yet". But it can be argued that MQ is ready.

>  The idea
> that nice new code is wrong because it doesn't churn the old rubbish code,
> is ridiculous.

As I have said in the review comments I like your new
code, especially the new error path.

I took the approach of refactoring because I was afraid I would
break something I guess. But rewriting from scratch is also an
option.

But I think I would prefer that if a big slew of new code is
introduced it needs to be put to wide use and any bugs
smoked out during the -rc phase, and we are now
hiding it behind a Kconfig option so it's unlikely to see testing
until that option is turned on, and that is not good.

>> This looks a bit like it is reimplementing the kernel
>> completion abstraction using a mutex and a variable named
>> .complete_req?
>>
>> We were using a completion in the old block layer so
>> why did you not use it for MQ?
>
> Doesn't seem like you pay much attention to this stuff.

You really have to stop your snarky and belittling attitude
to your fellow kernel developers.

From 6.Followthrough.rst again:

----------8<-------------------8<-------------------8<-----------

Andrew Morton has suggested that every review comment which does not result
in a code change should result in an additional code comment instead; that
can help future reviewers avoid the questions which came up the first time
around.

----------8<-------------------8<-------------------8<-----------

At least take a moment to consider the option that maybe so few people
are reviewing your code because this is complicated stuff and really
hard to grasp, maybe the problem isn't on my side, neither on yours,
it may be that the subject matter is the problem.

>  The previous
> request has to be completed even if there is no next request.  That means
> scheduling work, that then races with the dispatch path.  So mutual exclusion
> is necessary.

I would say a synchronization primitive is needed, indeed.
I have a different approach to this, which uses completion
as the synchronization primitive and I think that would be possible
to use also here.

I have this code:

        /*
         * Here we postprocess the request differently depending on if
         * we go on the success path or error path. The success path will
         * immediately let new requests hit the host, whereas the error
         * path will hold off new requests until we have retried and
         * succeeded or failed the current asynchronous request.
         */
        if (status == MMC_BLK_SUCCESS) {
                /*
                 * This immediately opens the gate for the next request
                 * to start on the host while we perform post-processing
                 * and report back to the block layer.
                 */
                host->areq = NULL;
                complete(&areq->complete);
                mmc_post_req(host, areq->mrq, 0);
                if (areq->report_done_status)
                        areq->report_done_status(areq, MMC_BLK_SUCCESS);
        } else {
                /*
                 * Post-process this request. Then, if
                 * another request was already prepared, back that out
                 * so we can handle the errors without anything prepared
                 * on the host.
                 */
                if (host->areq_pending)
                        mmc_post_req(host, host->areq_pending->mrq, -EINVAL);
                /*
                 * Call back with error status, this will trigger retry
                 * etc if needed
                 */
                if (areq->report_done_status) {
                        if (areq->report_done_status(areq, status)) {
                                /*
                                 * This happens when we finally give up after
                                 * a few retries or on unrecoverable errors.
                                 */
                                mmc_post_req(host, areq->mrq, 0);
                                host->areq = NULL;
                                /* Re-prepare the next request */
                                if (host->areq_pending)
                                        mmc_pre_req(host,
host->areq_pending->mrq);
                                complete(&areq->complete);
                        }
                }
        }

As you can see it is a bit elaborate about some
of this stuff and quite a lot of comments were added to make
clear what is going on. This is in my head so that is why I ask:
completion worked fine as a synchronization primitive here
so it is maybe a good alternative in your (similar) code
as well?

>> I would contest using a waitqueue for this. The name even says
>> "complete_req" so why is a completion not the right thing to
>> hang on rather than a waitqueue?
>
> I explained that above.

And I explained what I meant.

>> The completion already contains a waitqueue, so I think you're
>> just essentially reimplementing it.
>>
>> Just complete(&mq->mq_req_complete) or something should do
>> the trick.
>
> Nope.

That's a very terse answer to an honest review comment.

>>> +       else
>>> +               kblockd_schedule_work(&mq->complete_work);
>>
>> I did not use the kblockd workqueue for this, out of fear
>> that it would interfere and disturb the block layer work items.
>> My intuitive idea was that the MMC layer needed its own
>> worker (like in the past it used a thread) in order to avoid
>> congestion in the block layer queue leading to unnecessary
>> delays.
>
> The complete work races with the dispatch of the next request, so putting
> them in the same workqueue makes sense. i.e. the one that gets processed
> first would anyway delay the one that gets processed second.

So what we want to attain is that the next dispatch happens as soon
as possible after completing the previous request. They only race
as far as that they go through post/preprocessing before getting to
the synchronization primitive though?

>> On the other hand, this likely avoids a context switch if there
>> is no congestion on the queue.
>>
>> I am uncertain when it is advisible to use the block layer
>> queue for subsystems like MMC/SD.
>>
>> Would be nice to see some direction from the block layer
>> folks here, it is indeed exposed to us...
>>
>> My performance tests show no problems with this approach
>> though.
>
> As I already wrote, the CPU-bound block layer dispatch work queue has
> negative consequences for mmc performance.  So there are 2 aspects to that:
>         1. Is the choice of CPU right to start with?  I suspect it is better for
> the dispatch to run on the same CPU as the interrupt.
>         2. Does the dispatch work need to be able to be migrated to a different
> CPU? i.e. unbound work queue.  That helped in my tests, but it could just be
> a side-effect of 1.

Hm! What you are saying sounds correct, and we really need to
consider the multi-CPU aspects of this, maybe not now. I am
happy as long as we have equal performance as before and
maintainable code.

> Of course we can't start looking at these real issues, while you are

You seem to have dropped the mic there, but it looks like
what you were going to say was not nice anyway so I guess
it was for better.

>>> +static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err)
>>> +{
>>> +       struct request_queue *q = mq->queue;
>>> +       unsigned long flags;
>>> +       bool done;
>>> +
>>> +       spin_lock_irqsave(q->queue_lock, flags);
>>> +       done = !mq->rw_wait;
>>> +       mq->waiting = !done;
>>> +       spin_unlock_irqrestore(q->queue_lock, flags);
>>
>> This makes it look like a reimplementation of completion_done()
>> so I think you should use the completion abstraction again. The
>> struct completion even contains a variable named "done".
>
> This just serves as an example of why splitting up the patch was such a bad
> idea.  For direct completion, the wait can result in recovery being needed
> for the previous request, so the current request gets requeued.

I guess this falls back to the previous comment on the
synchronization primitives.

>> This is pretty straight-forward (pending the comments above).
>> Again it has the downside of duplicating the same code for the
>> old block layer instead of refactoring.
>
> No, the old code is rubbish.  Churning it is a waste of time.

I think we have addressed this recurring theme.

>> Again looks fine, again duplicates code. In this case I don't even
>> see why the MQ code needs its own copy of the issue funtion.
>
> Because it has to support CQE.  This attitude against CQE is very disappointing!

Please understand that there is no conspiracy against CQE out
there.

I admit I personally think getting MQ in place and making the code
long-term maintainable is more important than supporting
CQE. But that does not mean I think CQE is unimportant.

Or rather: just because someone is FOR something else,
doesn't mean they are AGAINST this.

I am pretty sure both sets of functionality can be attained in the next
merge window, but it requires good faith.

>>> +enum mmc_issue_type mmc_issue_type(struct mmc_queue *mq, struct request *req)
>>> +{
>>> +       if (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_WRITE)
>>> +               return MMC_ISSUE_ASYNC;
>>> +
>>> +       return MMC_ISSUE_SYNC;
>>> +}
>>
>> Distinguishing between SYNC and ASYNC operations and using
>> that as abstraction is nice.
>>
>> But you only do this in the new MQ code.
>>
>> Instead, make this a separate patch and first refactor the old
>> code to use this distinction between SYNC and ASYNC.
>
> That is a non-starter.  The old code is rubbish.  Point to something work
> saving.  There isn't anything.

I think we have addressed this.

>> Unfortunately I think Ulf's earlier criticism that you're rewriting
>> the world instead of refactoring what we have still stands on many
>> accounts here.
>
> Nope.  It is just an excuse to delay the patches.  You guys are playing
> games and it is embarrassing for Linux.  What is actually wrong with this
> technically?  It is not good because it doesn't churn the old code?  That is
> ridiculous.

It's not good because it does not make MQ mandatory and does
not delete the interfaces to the old block layer.

>> It makes it even harder to understand your persistance in keeping
>> the old block layer around. If you're introducing new concepts and
>> cleaner code in the MQ path and kind of discarding the old
>> block layer path, why keep it around at all?
>
> Wow, you really like making things up.  Never have I suggested keeping the
> old code.  It is rubbish.

So why are you not just deleting it?

>  As soon as blk-mq is ready and tested, delete
> the old crap.

I disagree. No "ready and tested" is needed, the code is ready,
and I have tested it. It performs. Delete the old code now.

> I was expecting CQE to be applied 6 months ago, supporting the legacy blk
> layer until blk-mq was ready.  But you never delivered on blk-mq, which is
> why I had to do it.  And now you are making up excuses about why we can't
> move forward.

Don't be so conspiracist. I think I have answered this several
times over.

Migrating to MQ and getting rid of the old block layer interface is
paramount in my view. That is the essence of all my review feedback.
The other comments are minor, and even if you don't take my
comments into consideration it is stuff I can work on fixing after
these patches are merged.

If you just make a series also deleting the old block layer
I will test it so it doesn't break anything and then you can
probably expect a big Acked-by on the whole shebang.

>> I would have a much easier time accepting this patch if it
>> deleted as much as it was adding, i.e. introduce all this new
>> nice MQ code, but also tossing out the old block layer and error
>> handling code. Even if it is a massive rewrite, at least there
>> is just one body of code to maintain going forward.
>
> How can you possibly call a few hundred lines massive.  The kernel has
> millions of lines.  Your sense of scale is out of whack.

Arguably you are speaking against yourself, since the old code
was described by yourself as "rubbish", and it is very terse and
unintuitive to read even a few lines of rubbish.

I had to create a (quite popular) Googledoc breaking down the
MMC stack in words before I could even start hacking at it.
So it is not massive in the sense of number of lines but in the
sense of intelligibility, it's so terse that it feels like eating a
too big piece of mudcake or something.

Just delete the rubbish and I'm happy.

>> That said, I would strongly prefer a refactoring of the old block
>> layer leading up to transitioning to MQ. But I am indeed biased
>> since I took that approach myself.
>
> Well stop it.  We have nice working code.  Get it applied and tested, and
> then we can delete the old crap.

Just get the old code deleted so there is just one thing to
test and not a matrix of old and new code paths.

>> This timeout looks like something I need to pick up in my patch
>> set as well. It seems good for stability to support this. But what happened
>> here? Did you experience a bunch of timeouts during development,
>> or let's say how was this engineered, I guess it is for the case when
>> something randomly locks up for a long time and we don't really know
>> what has happened, like a watchdog?
>
> We presently don't have the host APIs to support external timeouts.  CQE
> uses them though.

OK

>>> +static int mmc_init_request(struct request_queue *q, struct request *req,
>>> +                           gfp_t gfp)
>>> +{
>>> +       return __mmc_init_request(q->queuedata, req, gfp);
>>> +}
>>> +
>> (...)
>>> +static int mmc_mq_init_request(struct blk_mq_tag_set *set, struct request *req,
>>> +                              unsigned int hctx_idx, unsigned int numa_node)
>>> +{
>>> +       return __mmc_init_request(set->driver_data, req, GFP_KERNEL);
>>> +}
>>> +
>>> +static void mmc_mq_exit_request(struct blk_mq_tag_set *set, struct request *req,
>>> +                               unsigned int hctx_idx)
>>> +{
>>> +       struct mmc_queue *mq = set->driver_data;
>>> +
>>> +       mmc_exit_request(mq->queue, req);
>>> +}
>>
>> Here is more code duplication just to keep both the old block layer
>> and MQ around. Including introducing another inner __foo function
>> which I have something strongly against personally (I might be
>> crazily picky, because I see many people do this).
>
> In this case, it is not code duplication it is re-using the same code but
> called from the blk-mq API.

Right, OK.

>>> +               /*
>>> +                * Timeouts are handled by mmc core, so set a large value to
>>> +                * avoid races.
>>> +                */
>>> +               req->timeout = 600 * HZ;
>>> +               break;
>>> +       }
>>
>> These timeouts again, does this mean we have competing timeout
>> code in the block layer and MMC?
>
> Yes - the host controller provides hardware timeout interrupts in most
> cases.  The core provides software timeouts in other cases.

OK maybe we can extend the comment about to explain this
situation so it is clear what is racing with what.

>> This mentions timeouts in the MMC core, but they are actually
>> coming from the *MMC* core, when below you set:
>> blk_queue_rq_timeout(mq->queue, 60 * HZ);?
>>
>> Isn't the actual case that the per-queue timeout is set up to
>> occur before the per-request timeout, and that you are hacking
>> around the block layer core having two different timeouts?
>
> There is no per-queue timeout.  The request timeout has a default value
> given by the queue.  It can be changed for different requests.

Aha.

>> It's a bit confusing so I'd really like to know what's going on...
>
> I don't expect to have to teach you the block layer.

But I will tell you to stop snarking around every time you
write things like that.


>>> +       mq->in_flight[issue_type] += 1;
>>> +       get_card = mmc_tot_in_flight(mq) == 1;
>>
>> Parenthesis around the logical expression preferred I guess
>> get_card = (mmc_tot_in_flight(mq) == 1);
>> (Isn't checkpatch complaining about this?)
>
> Nope

Too bad. OK I think it's nice with a parenthesis anyway.
Aids perception or something.

>> Then:
>> (...)
>>> +       if (get_card)
>>> +               mmc_get_card(card, &mq->ctx);
>>
>> I simply took the card on every request. Since the context is the
>> same for all block layer business and the lock is now fully
>> reentrant this if (get_card) is not necessary. Just take it for
>> every request and release it in the .complete() callback.
>
> As I have written elsewhere, we have always avoided getting / putting
> unnecessarily.  It is better that way, so no point in taking it out.

I explained about about the double counters.

>>> +#define MMC_QUEUE_DEPTH 64
>>> +
>>> +static int mmc_mq_init(struct mmc_queue *mq, struct mmc_card *card,
>>> +                        spinlock_t *lock)
>>> +{
>>> +       int q_depth;
>>> +       int ret;
>>> +
>>> +       q_depth = MMC_QUEUE_DEPTH;
>>> +
>>> +       ret = mmc_mq_init_queue(mq, q_depth, &mmc_mq_ops, lock);
>>
>> Apart from using a define, then assigning the define to a
>> variable and then passing that variable instead of just
>> passing the define: why 64? Is that the depth of the CQE
>> queue? In that case we need an if (cqe) and set it down
>> to 2 for non-CQE.
>
> Are you ever going to learn about the block layer.

Are you ever going to treat your fellow community peers as
equals?

It is actually your job as a patch submitter to teach others about
the details of what you are doing. If you think all your fellow programmers
suck then elevate them to your own level instead of pushing them down.

> The number of requests
> is default 128 for the legacy block layer.

I had no clue.

> For blk-mq it is queue depth
> times 2.  So 64 gives the same number of requests as before.

I see. But by just setting it to 2 as I do in my patch set, it still
performs, I guess just with less buffers allocated.

This makes some kind of sense though and I guess it explains
why we got out of memory with the bounce buffer thing, as that
then resulted in 128*64KB of allocated memory for the request
queue. I thought it would be ... a few requests.

>>> +       if (ret)
>>> +               return ret;
>>> +
>>> +       blk_queue_rq_timeout(mq->queue, 60 * HZ);
>>
>> And requests timeout after 1 minute I take it.
>>
>> I suspect both of these have some relation to CQE, so that is where
>> you find these long execution times etc?
>
> For legacy mmc, the core takes care of timeouts.  For CQE we expect reliable
> devices and I would interpret a timeout as meaning the device is broken.

OK.

> However it is sensible to have anyway.  For CQE, a request might have to
> wait for the entire rest of the queue to be processed first, or maybe the
> request somehow gets stuck and there are other requests constantly
> overtaking it.  The max queue depth is 32 so 60 seconds seems ok.

OK we will get to see as we see more and more devices of this type.
It's fine.

>>> +static void mmc_mq_queue_suspend(struct mmc_queue *mq)
>>> +{
>>> +       blk_mq_quiesce_queue(mq->queue);
>>> +
>>> +       /*
>>> +        * The host remains claimed while there are outstanding requests, so
>>> +        * simply claiming and releasing here ensures there are none.
>>> +        */
>>> +       mmc_claim_host(mq->card->host);
>>> +       mmc_release_host(mq->card->host);
>>
>> I think just blk_mq_quiesce_queue() should be fine and
>> should make sure all requests have called .complete() and there
>> I think you should also release the host lock.
>>
>> If the MQ code is not doing this, we need to fix MQ to
>> do the right thing (or add a new callback such as
>> blk_mq_make_sure_queue_empty()) so at the very
>> least put a big fat FIXME or REVISIT comment on the above.
>
> blk_mq_quiesce_queue() prevents dispatches not completions.  So we wait for
> outstanding requests.

I guess the quiesce call is not really intended for PM suspend/resume
paths. Maybe we need to add a blk_mq_flush_queue() call then.
(No showstopper, this is OK for now.)

>> One of the good reasons to delete the old block layer is to get
>> rid of this horrible semaphore construction. So I see it as necessary
>> to be able to focus development efforts on code that actually has
>> a future.
>
> The old crap will get deleted when blk-mq is ready.

Which IMO is now.

>>> +
>>> +       int                     in_flight[MMC_ISSUE_MAX];
>>
>> So this is a [2] containing a counter for the number of
>> synchronous and asynchronous requests in flight at any
>> time.
>>
>> But are there really synchronous and asynchronous requests
>> going on at the same time?
>>
>> Maybe on the error path I guess.
>>
>> I avoided this completely but I guess it may be necessary with
>> CQE, such that in_flight[0,1] is way more than 1 or 2 at times
>> when there are commands queued?
>
> CQE needs to count DCMD separately from read / writes.  Counting by issue
> type is a simple way to do that.

OK

>>> +       bool                    rw_wait;
>>> +       bool                    waiting;
>>> +       wait_queue_head_t       wait;
>>
>> As mentioned I think this is a reimplementation of
>> the completion abstraction.
>
> I pointed out why that wouldn't work.  Another case of why the code makes
> more sense together than split up.

I'm not entirely convinced about that but we'll see. It is a detail.

Yours,
Linus Walleij
Linus Walleij Nov. 10, 2017, 10:19 a.m. UTC | #4
Following up on this:

On Thu, Nov 9, 2017 at 4:52 PM, Linus Walleij <linus.walleij@linaro.org> wrote:

>> No one has ever suggested that the legacy API will remain.  Once blk-mq is
>> ready the old code gets deleted.
>
> The block layer maintainers most definately think MQ is ready
> but you seem to disagree. Why?
>
> In the recent LWN article from Kernel Recipes:
> https://lwn.net/Articles/735275/
> the only obstacle I can see is a mention that SCSI was not
> switched over by default because of some problems with
> slow rotating media. "This change had to be backed out
> because of some performance and scalability issues that
> arose for rotating storage."
>
> Christoph mentions that he's gonna try again for v4.14.
> But it's true, I do not see that happening in linux-next yet.

Neil Brown's article on LWN points to Ming Lei's patch
set:
https://www.mail-archive.com/linux-kernel@vger.kernel.org/msg1513023.html

As addressing the issue that held back blk-MQ from
being default for SCSI, and it is reportedly to be
queued for v4.15. Since this MMC/SD rework is
also targeted for v4.15 I think we can assume it is
pretty much ready for everything, and delete the
non-MQ block path.

I just haven't seen any of this problem in my tests
with MMC/SD so I do not think it would be affected,
but it anyway seems to be fixed.

OK maybe I am especially optimistic. But it's not just
based on that.

Yours,
Linus Walleij
Adrian Hunter Nov. 14, 2017, 1:10 p.m. UTC | #5
On 09/11/17 17:52, Linus Walleij wrote:
> On Thu, Nov 9, 2017 at 11:42 AM, Adrian Hunter <adrian.hunter@intel.com> wrote:
>> On 08/11/17 10:54, Linus Walleij wrote:
>>> On Fri, Nov 3, 2017 at 2:20 PM, Adrian Hunter <adrian.hunter@intel.com> wrote:
> 
>>> At least you could do what I did and break out a helper like
>>> this:
>>>
>>> /*
>>>  * This reports status back to the block layer for a finished request.
>>>  */
>>> static void mmc_blk_complete(struct mmc_queue_req *mq_rq,
>>>                             blk_status_t status)
>>> {
>>>        struct request *req = mmc_queue_req_to_req(mq_rq);
>>>
>>>        if (req->mq_ctx) {
>>>           blk_mq_end_request(req, status);
>>>        } else
>>>           blk_end_request_all(req, status);
>>> }
>>
>> You are quibbling.
> 
> Hm wiktionary "quibble" points to:
> https://en.wikipedia.org/wiki/Trivial_objections
> 
> Sorry for not being a native speaker in English, I think that
> maybe you were again trying to be snarky to a reviewer but
> I honestly don't know.
> 
>> It makes next to no difference especially as it all goes
>> away when the legacy code is deleted.
> 
> Which is something your patch set doesn't do. Nor did you write
> anywhere that deleting the legacy code was your intention.
> But I'm happy to hear it.
> 
>>  I will change it in the next
>> revision, but what a waste of everyone's time.
>>  Please try to focus on
>> things that matter.
> 
> As Jean-Paul Sartre said "hell is other people".
> Can you try to be friendlier anyway?
> 
>>> This retry and error handling using requeue is very elegant.
>>> I really like this.
>>>
>>> If we could also go for MQ-only, only this nice code
>>> remains in the tree.
>>
>> No one has ever suggested that the legacy API will remain.  Once blk-mq is
>> ready the old code gets deleted.
> 
> The block layer maintainers most definately think MQ is ready
> but you seem to disagree. Why?

I meant when the mmc conversion to blk-mq is ready.  We don't need to
consider other sub-systems, just whether it is working for mmc.  For
example, the issue with the block layer workqueues not performing as well as
a thread.  That only came to light through testing mmc.

> 
> In the recent LWN article from kernel recipes:
> https://lwn.net/Articles/735275/
> the only obstacle I can see is a mention that SCSI was not
> switched over by default because of some problems with
> slow rotating media. "This change had to be backed out
> because of some performance and scalability issues that
> arose for rotating storage."
> 
> Christoph mentions that he's gonna try again for v4.14.
> But it's true, I do not see that happening in linux-next yet.
> 
> But that is specifically rotating media, which MMC/SD is not.
> And UBI is using it since ages without complaints. And
> that is a sign of something.
> 
> Have you seen any problems when deploying it on MMC/SD,
> really? If there are problems I bet the block layer people want
> us to find them, diagnose them and ultimately fix them rather
> than wait for them to do it. I haven't seen any performance or
> scalability issues. At one point I had processes running
> on two eMMC and one SD-card and apart from maxing out
> the CPUs and DMA backplace as could be expected all was
> fine.
> 
>>> The problem: you have just reimplemented the whole error
>>> handling we had in the old block layer and now we have to
>>> maintain two copies and keep them in sync.
>>>
>>> This is not OK IMO, we will inevitable screw it up, so we
>>> need to get *one* error path.
>>
>> Wow, you really didn't read the code at all.
> 
> Just quit this snarky attitude.
> 
>> As I have repeatedly pointed
>> out, the new code is all new.  There is no overlap and there nothing to keep
>> in sync.
> 
> The problem is that you have not clearly communicated your
> vision to delete the old code. The best way to communicate that
> would be to include a patch actually deleting it.
> 
>>  It may not look like it in this patch, but that is only because of
>> the ridiculous idea of splitting up the patch.
> 
> Naming other people's review feedback as "ridiculous" is not very
> helpful. I think the patch set became much easier to review
> like this so I am happy with this format.
> 
>>>> +static void mmc_blk_mq_acct_req_done(struct mmc_queue *mq, struct request *req)
>>>
>>> What does "acct" mean in the above function name?
>>> Accounting? Actual? I'm lost.
>>
>> Does "actual" have two "c"'s.  You are just making things up.
> 
> Please stop your snarky and belittling attitude.
> 
> I am not a native English speaker and I am not making up
> my review comments in order to annoy you.
> 
> Consider Hanlon's razor:
> https://en.wikipedia.org/wiki/Hanlon%27s_razor
> "Never attribute to malice that which is adequately explained by
> stupidity".
> 
> It's bad enough that you repeatedly point out how stupid
> you think I am

I have never called you names.  On the contrary, it is because I don't think
you are stupid that I get upset when you seem to spend so little time and
effort to understand the code.  Equally, you seem to make comments that just
assume the code is wrong without due consideration.

>                 I am sorry about that, in my opinion sometimes
> other people are really stupid but more often than not the problem
> is really on your side, like being impatient and unwilling to educate
> others about how your code actually works becuase it seems
> so evident to yourself that you think it should be evident to everyone
> else as well. I don't know what to say about that, it seems a bit
> unempathetic.
> 
> But you even go and think I am making stuff up in purpose.
> That's pretty offensive.
> 
>> Of course it is "account".
> 
> This is maybe obvious for you but it was not to me.
> 
>>  It is counting the number of requests in flight - which is
>> pretty obvious from the code.  We use that to support important features
>> like CQE re-tuning and avoiding getting / putting the card all the time.
> 
> What is so hard with getting this point across without insults?
> 
>>>> +{
>>>> +       struct request_queue *q = req->q;
>>>> +       unsigned long flags;
>>>> +       bool put_card;
>>>> +
>>>> +       spin_lock_irqsave(q->queue_lock, flags);
>>>> +
>>>> +       mq->in_flight[mmc_issue_type(mq, req)] -= 1;
>>>> +
>>>> +       put_card = mmc_tot_in_flight(mq) == 0;
>>>
>>> This in_flight[] business seems a bit kludgy, but I
>>> don't really understand it fully. Magic numbers like
>>> -1 to mark that something is not going on etc, not
>>> super-elegant.
>>
>> You are misreading.  It subtracts 1 from the number of requests in flight.
> 
> Right! Sorry I misread it.
> 
>>> I believe it is necessary for CQE though as you need
>>> to keep track of outstanding requests?
>>
>> We have always avoided getting / putting the card when there is another
>> request in flight.  Yes it is also used for CQE.
> 
> Yeah I took the approach to get/put the card around each
> request instead. It might make things a bit simpler.
> 
>>>> +       spin_unlock_irqrestore(q->queue_lock, flags);
>>>> +
>>>> +       if (put_card)
>>>> +               mmc_put_card(mq->card, &mq->ctx);
>>>
>>> I think you should try not to sprinkle mmc_put_card() inside
>>> the different functions but instead you can put this in the
>>> .complete callback I guess mmc_blk_mq_complete() in your
>>> patch set.
>>
>> This is the *only* block.c function in the blk-mq code that calls
>> mmc_put_card().  The queue also has does it for requests that didn't even
>> start.  That is it.
> 
> Yeah that's correct. If we also delete the old code it will not be
> too bad.
> 
>>> Also you do not need to avoid calling it several times with
>>> that put_card variable. It's fully reentrant thanks to your
>>> own code in the lock and all calls come from the same block
>>> layer process if you call it in .complete() I think?
>>
>> We have always avoided unnecessary gets / puts.  Since the result is better,
>> why on earth take it out?
> 
> It's not a showstopper.
> 
> But what I think is nice in doing it around
> each request is that since mmc_put_card() calls mmc_release_host()
> contains this:
> 
> if (--host->claim_cnt) { (...)
> 
> So there is a counter inside the claim/release host functions
> and now there is another counter keeping track if the in-flight
> requests as well and it gives me the feeling we are maintaining
> two counters when we only need one.

The gets / puts also get runtime pm for the card.  It is a lot of messing
around for the sake of a quick check for the number of requests inflight -
which is needed anyway for CQE.

> 
> But maybe it is actually the host->claim_cnt that is overengineered
> and should just be a bool, becuase with this construction
> that you only call down and claim the host once, the
> host->claim_cnt will only be 0 or 1, right?

The claim_cnt was introduced for nested claiming.

> 
>>> Now the problem Ulf has pointed out starts to creep out in the
>>> patch: a lot of code duplication on the MQ path compared to
>>> the ordinary block layer path.
>>>
>>> My approach was structured partly to avoid this: first refactor
>>> the old path, then switch to (only) MQ to avoid code duplication.
>>
>> The old code is rubbish.  There is nothing of value there.  You have to make
>> the case why you are wasting everyone's time churning crappy code.
> 
> I kind of agree that the old code is rubbish, actually.
> 
> I guess I could turn that around and ask: if it is so crappy, why
> is your patch set not deleting it?

To allow people time to test.

> 
> And I guess the only reasonable answer to that would be what
> you were saying above, something along the lines of "MQ is not
> ready yet". But it can be argued that MQ is ready.
> 
>>  The idea
>> that nice new code is wrong because it doesn't churn the old rubbish code,
>> is ridiculous.
> 
> As I have said in the review comments I like your new
> code, especially the new error path.
> 
> I took the approach of refactoring because I was afraid I would
> break something I guess. But rewriting from scratch is also an
> option.
> 
> But I think I would prefer that if a big slew of new code is
> introduced it needs to be put to wide use and any bugs
> smoked out during the -rc phase, and we are now
> hiding it behind a Kconfig option so it's unlikely to see testing
> until that option is turned on, and that is not good.

And if you find a big problem in rc7, then what do you do?  At least with
the config option, the revert is trivial.

> 
>>> This looks a bit like it is reimplementing the kernel
>>> completion abstraction using a mutex and a variable named
>>> .complete_req?
>>>
>>> We were using a completion in the old block layer so
>>> why did you not use it for MQ?
>>
>> Doesn't seem like you pay much attention to this stuff.
> 
> You really have to stop your snarky and belittling attitude
> to your fellow kernel developers.
> 
>>From 6.Followthrough.rst again:
> 
> ----------8<-------------------8<-------------------8<-----------
> 
> Andrew Morton has suggested that every review comment which does not result
> in a code change should result in an additional code comment instead; that
> can help future reviewers avoid the questions which came up the first time
> around.
> 
> ----------8<-------------------8<-------------------8<-----------
> 
> At least take a moment to consider the option that maybe so few people
> are reviwing your code because this is complicated stuff and really
> hard to grasp, maybe the problem isn't on my side, neither on yours,
> it may be that the subject matter is the problem.

I would expect a review would take a number of days, perhaps longer.
However, it seems people are not willing to consider anything that takes
more than an hour or two.

> 
>>  The previous
>> request has to be completed even if there is no next request.  That means
>> schduling work, that then races with the dispatch path.  So mutual exclusion
>> is necessary.
> 
> I would say a synchronization primitive is needed, indeed.
> I have a different approach to this, which uses completion
> as the synchronization primitive and I think that would be possible
> to use also here.

Because they are both just wake-ups.  When waiting for multiple conditions,
wait_event() is simpler.  You are not considering the possibility for having
only 1 task switch between requests.

> 
> I have this code:

Which runs as a work item, so there is already a task switch to get here.
i.e. there are 2 task switches between each request, instead of 1.

> 
>         /*
>          * Here we postprocess the request differently depending on if
>          * we go on the success path or error path. The success path will
>          * immediately let new requests hit the host, whereas the error
>          * path will hold off new requests until we have retried and
>          * succeeded or failed the current asynchronous request.
>          */
>         if (status == MMC_BLK_SUCCESS) {
>                 /*
>                  * This immediately opens the gate for the next request
>                  * to start on the host while we perform post-processing
>                  * and report back to the block layer.
>                  */
>                 host->areq = NULL;
>                 complete(&areq->complete);

So that is the second task switch.

>                 mmc_post_req(host, areq->mrq, 0);
>                 if (areq->report_done_status)
>                         areq->report_done_status(areq, MMC_BLK_SUCCESS);
>         } else {
>                 /*
>                  * Post-process this request. Then, if
>                  * another request was already prepared, back that out
>                  * so we can handle the errors without anything prepared
>                  * on the host.
>                  */
>                 if (host->areq_pending)
>                         mmc_post_req(host, host->areq_pending->mrq, -EINVAL);
>                 /*
>                  * Call back with error status, this will trigger retry
>                  * etc if needed
>                  */
>                 if (areq->report_done_status) {
>                         if (areq->report_done_status(areq, status)) {
>                                 /*
>                                  * This happens when we finally give up after
>                                  * a few retries or on unrecoverable errors.
>                                  */
>                                 mmc_post_req(host, areq->mrq, 0);
>                                 host->areq = NULL;
>                                 /* Re-prepare the next request */
>                                 if (host->areq_pending)
>                                         mmc_pre_req(host,
> host->areq_pending->mrq);
>                                 complete(&areq->complete);
>                         }
>                 }
>         }
> 
> As you can see it is a bit elaborate about some
> of this stuff and quite a lot of comments were added to make
> clear what is going on. This is in my head so that is why I ask:
> completion worked fine as a synchronization primitive here
> so it is maybe a good alternative in your (similar) code
> as well?

In the case there is another request already waiting, then instead of
scheduling work to complete the request, the dispatch is woken to complete
the previous and start the next.  That needs different synchronization.

> 
>>> I would contest using a waitqueue for this. The name even says
>>> "complete_req" so why is a completion not the right thing to
>>> hang on rather than a waitqueue?
>>
>> I explained that above.
> 
> And I explained what I meant.
> 
>>> The completion already contains a waitqueue, so I think you're
>>> just essentially reimplementing it.
>>>
>>> Just complete(&mq->mq_req_complete) or something should do
>>> the trick.
>>
>> Nope.
> 
> That's a very terse answer to an honest review comment.
> 
>>>> +       else
>>>> +               kblockd_schedule_work(&mq->complete_work);
>>>
>>> I did not use the kblockd workqueue for this, out of fear
>>> that it would interfere and disturb the block layer work items.
>>> My intuitive idea was that the MMC layer needed its own
>>> worker (like in the past it used a thread) in order to avoid
>>> congestion in the block layer queue leading to unnecessary
>>> delays.
>>
>> The complete work races with the dispatch of the next request, so putting
>> them in the same workqueue makes sense. i.e. the one that gets processed
>> first would anyway delay the one that gets processed second.
> 
> So what we want to attain is that the next dispatch happens as soon
> as possible after completing the previous request. They only race
> as far as that they go through post/preprocessing before getting to
> the synchronization primitive though?

They only race when the next request is not already waiting.  Otherwise the
work is never scheduled.

> 
>>> On the other hand, this likely avoids a context switch if there
>>> is no congestion on the queue.
>>>
>>> I am uncertain when it is advisible to use the block layer
>>> queue for subsystems like MMC/SD.
>>>
>>> Would be nice to see some direction from the block layer
>>> folks here, it is indeed exposed to us...
>>>
>>> My performance tests show no problems with this approach
>>> though.
>>
>> As I already wrote, the CPU-bound block layer dispatch work queue has
>> negative consequences for mmc performance.  So there are 2 aspects to that:
>>         1. Is the choice of CPU right to start with?  I suspect it is better for
>> the dispatch to run on the same CPU as the interrupt.
>>         2. Does the dispatch work need to be able to be migrated to a different
>> CPU? i.e. unbound work queue.  That helped in my tests, but it could just be
>> a side-effect of 1.
> 
> Hm! What you are saying sounds correct, and we really need to
> consider the multi-CPU aspects of this, maybe not now. I am
> happy as long as we have equal performance as before and
> maintainable code.

Well I saw 3% drop in sequential read performance, improving to 1% when an
unbound workqueue was used.

> 
>> Of course we can't start looking at these real issues, while you are
> 
> You seem to have dropped the mic there, but it looks like
> what you were going to say was not nice anyway so I guess
> it was for better.
> 
>>>> +static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err)
>>>> +{
>>>> +       struct request_queue *q = mq->queue;
>>>> +       unsigned long flags;
>>>> +       bool done;
>>>> +
>>>> +       spin_lock_irqsave(q->queue_lock, flags);
>>>> +       done = !mq->rw_wait;
>>>> +       mq->waiting = !done;
>>>> +       spin_unlock_irqrestore(q->queue_lock, flags);
>>>
>>> This makes it look like a reimplementation of completion_done()
>>> so I think you should use the completion abstraction again. The
>>> struct completion even contains a variable named "done".
>>
>> This just serves as an example of why splitting up the patch was such a bad
>> idea.  For direct completion, the wait can result in recovery being needed
>> for the previous request, so the current request gets requeued.
> 
> I guess this falls back to the previous comment on the
> synchronization primitives.
> 
>>> This is pretty straight-forward (pending the comments above).
>>> Again it has the downside of duplicating the same code for the
>>> old block layer instead of refactoring.
>>
>> No, the old code is rubbish.  Churning it is a waste of time.
> 
> I think we have addressed this recurring theme.
> 
>>> Again looks fine, again duplicates code. In this case I don't even
>>> see why the MQ code needs its own copy of the issue funtion.
>>
>> Because it has to support CQE.  This attitude against CQE is very disappointing!
> 
> Please understand that there is no conspiracy against CQE out
> there.
> 
> I admit I personally think getting MQ in place and making the code
> long-term maintainable is more important than supporting
> CQE. But that does not mean I think CQE is unimportant.
> 
> Or rather: just because someone is FOR something else,
> doesn't mean they are AGAINST this.

But that is not what happened.  You blocked CQE even though it was ready.

> 
> I am pretty sure both sets of functionality can be attained in the next
> merge window, but it requires good faith.
> 
>>>> +enum mmc_issue_type mmc_issue_type(struct mmc_queue *mq, struct request *req)
>>>> +{
>>>> +       if (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_WRITE)
>>>> +               return MMC_ISSUE_ASYNC;
>>>> +
>>>> +       return MMC_ISSUE_SYNC;
>>>> +}
>>>
>>> Distinguishing between SYNC and ASYNC operations and using
>>> that as abstraction is nice.
>>>
>>> But you only do this in the new MQ code.
>>>
>>> Instead, make this a separate patch and first refactor the old
>>> code to use this distinction between SYNC and ASYNC.
>>
>> That is a non-starter.  The old code is rubbish.  Point to something work
>> saving.  There isn't anything.
> 
> I think we have addressed this.
> 
>>> Unfortunately I think Ulf's earlier criticism that you're rewriting
>>> the world instead of refactoring what we have still stands on many
>>> accounts here.
>>
>> Nope.  It is just an excuse to delay the patches.  You guys are playing
>> games and it is embarassing for linux.  What is actually wrong with this
>> technically?  It is not good because it doesn't churn the old code?  That is
>> ridiculous.
> 
> It's not good because it does not make MQ mandatory and does
> not delete the interfaces to the old block layer.

That is a separate issue.

> 
>>> It makes it even harder to understand your persistance in keeping
>>> the old block layer around. If you're introducing new concepts and
>>> cleaner code in the MQ path and kind of discarding the old
>>> block layer path, why keep it around at all?
>>
>> Wow, you really like making things up.  Never have I suggested keeping the
>> old code.  It is rubbish.
> 
> So why are you not just deleting it?

Because it needs more testing first.  If the testing goes well, then switch
over the default.  If, after 1 or more release cycles, there are no
problems, delete the old code.

> 
>>  As soon and blk-mq is ready and tested, delete
>> the old crap.
> 
> I disagree. No "ready and tested" is needed, the code is ready,
> and I have tested it. It performs. Delete the old code now.

Not true.  My testing showed the block layer workqueue wasn't performing as
well as a thread.

> 
>> I was expecting CQE to be applied 6 months ago, supporting the legacy blk
>> layer until blk-mq was ready.  But you never delivered on blk-mq, which is
>> why I had to do it.  And now you are making up excuses about why we can't
>> move forward.
> 
> Don't be so conspiracist. I think I have answered this several
> times over.
> 
> Migrating to MQ and getting rid of the old block layer interface is
> paramount in my view. That is the essence of all my review feedback.
> The other comments are minor, and even if you don't take my
> comments into consideration it is stuff I can work on fixing after
> these patches are merged.
> 
> If you just make a series also deleting the old block layer
> I will test it so it doesn't break anything and then you can
> probably expect a big Acked-by on the whole shebang.

I will add patches for that - let's see what happens.

> 
>>> I would have a much easier time accepting this patch if it
>>> deleted as much as it was adding, i.e. introduce all this new
>>> nice MQ code, but also tossing out the old block layer and error
>>> handling code. Even if it is a massive rewrite, at least there
>>> is just one body of code to maintain going forward.
>>
>> How can you possibly call a few hundred lines massive.  The kernel has
>> millions of lines.  Your sense of scale is out of whack.
> 
> Arguably you are speaking against yourself, since the old code
> was described by yourself as "rubbish", and it is very terse and
> unintuitive to read even a few lines of rubbish.
> 
> I had to create a (quite popular) Googledoc breaking down the
> MMC stack in words before I could even start hacking at it.
> So it is not massive in the sense of number of lines but in the
> sense of intelligibility, it's so terse that it feels like eating a
> too big piece of mudcake or something.
> 
> Just delete the rubbish and I'm happy.
> 
>>> That said, I would strongly prefer a refactoring of the old block
>>> layer leading up to transitioning to MQ. But I am indeed biased
>>> since I took that approach myself.
>>
>> Well stop it.  We have nice working code.  Get it applied and tested, and
>> then we can delete the old crap.
> 
> Just get the old code deleted so there is just one thing to
> test and not a matrix of old and new code paths.
> 
>>> This timeout looks like something I need to pick up in my patch
>>> set as well. It seems good for stability to support this. But what happened
>>> here? Did you experience a bunch of timeouts during development,
>>> or let's say how was this engineered, I guess it is for the case when
>>> something randomly locks up for a long time and we don't really know
>>> what has happened, like a watchdog?
>>
>> We presently don't have the host APIs to support external timeouts.  CQE
>> uses them though.
> 
> OK
> 
>>>> +static int mmc_init_request(struct request_queue *q, struct request *req,
>>>> +                           gfp_t gfp)
>>>> +{
>>>> +       return __mmc_init_request(q->queuedata, req, gfp);
>>>> +}
>>>> +
>>> (...)
>>>> +static int mmc_mq_init_request(struct blk_mq_tag_set *set, struct request *req,
>>>> +                              unsigned int hctx_idx, unsigned int numa_node)
>>>> +{
>>>> +       return __mmc_init_request(set->driver_data, req, GFP_KERNEL);
>>>> +}
>>>> +
>>>> +static void mmc_mq_exit_request(struct blk_mq_tag_set *set, struct request *req,
>>>> +                               unsigned int hctx_idx)
>>>> +{
>>>> +       struct mmc_queue *mq = set->driver_data;
>>>> +
>>>> +       mmc_exit_request(mq->queue, req);
>>>> +}
>>>
>>> Here is more code duplication just to keep both the old block layer
>>> and MQ around. Including introducing another inner __foo function
>>> which I have something strongly against personally (I might be
>>> crazily picky, because I see many people do this).
>>
>> In this case, it is not code duplication it is re-using the same code but
>> called from the blk-mq API.
> 
> Right, OK.
> 
>>>> +               /*
>>>> +                * Timeouts are handled by mmc core, so set a large value to
>>>> +                * avoid races.
>>>> +                */
>>>> +               req->timeout = 600 * HZ;
>>>> +               break;
>>>> +       }
>>>
>>> These timeouts again, does this mean we have competing timeout
>>> code in the block layer and MMC?
>>
>> Yes - the host controller provides hardware timeout interrupts in most
>> cases.  The core provides software timeouts in other cases.
> 
> OK maybe we can extend the comment about to explain this
> situation so it is clear what is racing with what.

Ok

> 
>>> This mentions timeouts in the MMC core, but they are actually
>>> coming from the *MMC* core, when below you set:
>>> blk_queue_rq_timeout(mq->queue, 60 * HZ);?
>>>
>>> Isn't the actual case that the per-queue timeout is set up to
>>> occur before the per-request timeout, and that you are hacking
>>> around the block layer core having two different timeouts?
>>
>> There is no per-queue timeout.  The request timeout has a default value
>> given by the queue.  It can be changed for different requests.
> 
> Aha.
> 
>>> It's a bit confusing so I'd really like to know what's going on...
>>
>> I don't expect to have to teach you the block layer.
> 
> But I will tell you to stop snarking around every time you
> write things like that.
> 
> 
>>>> +       mq->in_flight[issue_type] += 1;
>>>> +       get_card = mmc_tot_in_flight(mq) == 1;
>>>
>>> Parenthesis around the logical expression preferred I guess
>>> get_card = (mmc_tot_in_flight(mq) == 1);
>>> (Isn't checkpatch complaining about this?)
>>
>> Nope
> 
> Too bad. OK I think it's nice with a paranthesis anyway.
> Aids perception or something.

Ok

> 
>>> Then:
>>> (...)
>>>> +       if (get_card)
>>>> +               mmc_get_card(card, &mq->ctx);
>>>
>>> I simply took the card on every request. Since the context is the
>>> same for all block layer business and the lock is now fully
>>> reentrant this if (get_card) is not necessary. Just take it for
>>> every request and release it in the .complete() callback.
>>
>> As I have written elsewhere, we have always avoided getting / putting
>> unnecessarily.  It is better that way, so no point in taking it out.
> 
> I explained about about the double counters.
> 
>>>> +#define MMC_QUEUE_DEPTH 64
>>>> +
>>>> +static int mmc_mq_init(struct mmc_queue *mq, struct mmc_card *card,
>>>> +                        spinlock_t *lock)
>>>> +{
>>>> +       int q_depth;
>>>> +       int ret;
>>>> +
>>>> +       q_depth = MMC_QUEUE_DEPTH;
>>>> +
>>>> +       ret = mmc_mq_init_queue(mq, q_depth, &mmc_mq_ops, lock);
>>>
>>> Apart from using a define, then assigning the define to a
>>> variable and then passing that variable instead of just
>>> passing the define: why 64? Is that the depth of the CQE
>>> queue? In that case we need an if (cqe) and set it down
>>> to 2 for non-CQE.
>>
>> Are you ever going to learn about the block layer.
> 
> Are you ever going to treat your fellow community peers as
> equals?
> 
> It is actually your job as a patch submitter to teach others about
> the details of what you are doing. If you think all your fellow programmers
> suck then elevate them to your own level instead of pushing them down.
> 
>> The number of requests
>> is default 128 for the legacy block layer.
> 
> I had no clue.
> 
>> For blk-mq it is queue depth
>> times 2.  So 64 gives the same number of requests as before.
> 
> I see. But by just setting it to 2 as I do in my patch set, it still
> performs, I guess just with less buffers allocated.
> 
> This makes some kind of sense though and I guess it explains
> why we got out of memory with the bounce buffer thing, as that
> then resulted in 128*64KB of allocated memory for the request
> queue. I thought it would be ... a few requests.
> 
>>>> +       if (ret)
>>>> +               return ret;
>>>> +
>>>> +       blk_queue_rq_timeout(mq->queue, 60 * HZ);
>>>
>>> And requests timeout after 1 minute I take it.
>>>
>>> I suspect both of these have some relation to CQE, so that is where
>>> you find these long execution times etc?
>>
>> For legacy mmc, the core takes care of timeouts.  For CQE we expect reliable
>> devices and I would interpret a timeout as meaning the device is broken.
> 
> OK.
> 
>> However it is sensible to have anyway.  For CQE, a request might have to
>> wait for the entire rest of the queue to be processed first, or maybe the
>> request somehow gets stuck and there are other requests constantly
>> overtaking it.  The max queue depth is 32 so 60 seconds seems ok.
> 
> OK we will get to see as we see more and more devices of this type.
> It's fine.
> 
>>>> +static void mmc_mq_queue_suspend(struct mmc_queue *mq)
>>>> +{
>>>> +       blk_mq_quiesce_queue(mq->queue);
>>>> +
>>>> +       /*
>>>> +        * The host remains claimed while there are outstanding requests, so
>>>> +        * simply claiming and releasing here ensures there are none.
>>>> +        */
>>>> +       mmc_claim_host(mq->card->host);
>>>> +       mmc_release_host(mq->card->host);
>>>
>>> I think just blk_mq_quiesce_queue() should be fine as and
>>> should make sure all requests have called .complete() and there
>>> I think you should also release the host lock.
>>>
>>> If the MQ code is not doing this, we need to fix MQ to
>>> do the right thing (or add a new callback such as
>>> blk_mq_make_sure_queue_empty()) so at the very
>>> least put a big fat FIXME or REVISIT comment on the above.
>>
>> blk_mq_quiesce_queue() prevents dispatches not completions.  So we wait for
>> outstanding requests.
> 
> I guess the quiesce call not really intended for PM suspend/resume
> paths. Maybe we need to add a blk_mq_flush_queue() call then.
> (No showstopper, this is OK for now.)
> 
>>> One of the good reasons to delete the old block layer is to get
>>> rid of this horrible semaphore construction. So I see it as necessary
>>> to be able to focus development efforts on code that actually has
>>> a future.
>>
>> The old crap will get deleted when blk-mq is ready.
> 
> Which IMO is now.
> 
>>>> +
>>>> +       int                     in_flight[MMC_ISSUE_MAX];
>>>
>>> So this is a [2] containing a counter for the number of
>>> synchronous and asynchronous requests in flight at any
>>> time.
>>>
>>> But are there really synchronous and asynchronous requests
>>> going on at the same time?
>>>
>>> Maybe on the error path I guess.
>>>
>>> I avoided this completely but I guess it may be necessary with
>>> CQE, such that in_flight[0,1] is way more than 1 or 2 at times
>>> when there are commands queued?
>>
>> CQE needs to count DCMD separately from read / writes.  Counting by issue
>> type is a simple way to do that.
> 
> OK
> 
>>>> +       bool                    rw_wait;
>>>> +       bool                    waiting;
>>>> +       wait_queue_head_t       wait;
>>>
>>> As mentioned I think this is a reimplementation of
>>> the completion abstraction.
>>
>> I pointed out why that wouldn't work.  Another case of why the code makes
>> more sense together than split up.
> 
> I'm not entirely convinced about that but we'll see. It is a detail.
Linus Walleij Nov. 14, 2017, 2:50 p.m. UTC | #6
Hi Adrian!

Thanks for the helpful and productive tone in the recent mail,
it is very helpful and improves my work environment.

>> The block layer maintainers most definately think MQ is ready
>> but you seem to disagree. Why?
>
> I meant when the mmc conversion to blk-mq is ready.  We don't need to
> consider other sub-systems, just whether it is working for mmc.  For
> example, the issue with the block layer workqueues not performing as well as
> a thread.  That only came to light through testing mmc.

OK I see your point.

>> It's bad enough that you repeatedly point out how stupid
>> you think I am
>
> I have never called you names.  On the contrary, it is because I don't think
> you are stupid that I get upset when you seem spend so little time and
> effort to understand the code.  Equally, you seem to make comments that just
> assume the code is wrong without due consideration.

Please work on your patience. I think so few people are reviewing the
code because it is massive and complicated and sits in a complicated
place. One can not be blamed for trying, right?

I have spent considerable time with your code, more than with my own.
Mostly testing it, and that is why I can say it does not regress performance
on my end.

I also ran several rounds of fault injection, which it handled without
problems.

Then I tried to eject the SD card when running DD from the card and
then it crashed.

But that is an extreme test case, so overall it is very robust code.
With the exception of card removal during I/O, feel free to add
my Tested-by: Linus Walleij <linus.walleij@linaro.org> on this series.

>> But what I think is nice in doing it around
>> each request is that since mmc_put_card() calls mmc_release_host()
>> contains this:
>>
>> if (--host->claim_cnt) { (...)
>>
>> So there is a counter inside the claim/release host functions
>> and now there is another counter keeping track if the in-flight
>> requests as well and it gives me the feeling we are maintaining
>> two counters when we only need one.
>
> The gets / puts also get runtime pm for the card.  It is a lot a messing
> around for the sake of a quick check for the number of requests inflight -
> which is needed anyway for CQE.

OK I buy that.

>> But maybe it is actually the host->claim_cnt that is overengineered
>> and should just be a bool, becuase with this construction
>> that you only call down and claim the host once, the
>> host->claim_cnt will only be 0 or 1, right?
>
> The claim_cnt was introduced for nested claiming.

Yeah that sounds familiar. I'm not really sure it happens
anymore after this but I guess I can test that with some
practical experiements, let's leave it like this.

>> I guess I could turn that around and ask: if it is so crappy, why
>> is your patch set not deleting it?
>
> To allow people time to test.

OK I guess I'm more forging ahead with such things. But we could
at least enable it by default so whoever checks out and builds
and tests linux-next with their MMC/SD controllers will then be
testing this for us in the next kernel cycle.

>> But I think I would prefer that if a big slew of new code is
>> introduced it needs to be put to wide use and any bugs
>> smoked out during the -rc phase, and we are now
>> hiding it behind a Kconfig option so it's unlikely to see testing
>> until that option is turned on, and that is not good.
>
> And if you find a big problem in rc7, then what do you do?  At least with
> the config option, the revert is trivial.

I see your point. I guess it is up to Ulf how he feels about
trust wrt the code. If a problem appears at -rc7 it's indeed
nice if we can leave the code in-tree and work on it.

>> At least take a moment to consider the option that maybe so few people
>> are reviwing your code because this is complicated stuff and really
>> hard to grasp, maybe the problem isn't on my side, neither on yours,
>> it may be that the subject matter is the problem.
>
> I would expect a review would take a number of days, perhaps longer.
> However, it seems people are not willing to consider anything that takes
> more than an hour or two.

Your criticism is something like "do it properly or not at all"
and I understand that feeling. I deserve some of the criticism
and I can accept it much easier when put with these words.

It would be nice if the community would step in here and I have
seen so many people talk about this code that I really think they
should invest in proper review. More people than me and Ulf
need to help out with review and test.

>> I would say a synchronization primitive is needed, indeed.
>> I have a different approach to this, which uses completion
>> as the synchronization primitive and I think that would be possible
>> to use also here.
>
> Because they are both just wake-ups.  When waiting for multiple conditions,
> wait_event() is simpler.  You are not considering the possibility for having
> only 1 task switch between requests.

That is indeed a nice feature.

>>
>> I have this code:
>
> Which runs as a work item, so there is already a task switch to get here.
> i.e. there are 2 task switches between each request, instead of 1.

Yes. This is a limitation in my patch set.

>>         /*
>>          * Here we postprocess the request differently depending on if
>>          * we go on the success path or error path. The success path will
>>          * immediately let new requests hit the host, whereas the error
>>          * path will hold off new requests until we have retried and
>>          * succeeded or failed the current asynchronous request.
>>          */
>>         if (status == MMC_BLK_SUCCESS) {
>>                 /*
>>                  * This immediately opens the gate for the next request
>>                  * to start on the host while we perform post-processing
>>                  * and report back to the block layer.
>>                  */
>>                 host->areq = NULL;
>>                 complete(&areq->complete);
>
> So that is the second task switch.

Yes.

Interestingly, it does not affect performance in my tests.
I thought it would but it doesn't.

>> As you can see it is a bit elaborate about some
>> of this stuff and quite a lot of comments were added to make
>> clear what is going on. This is in my head so that is why I ask:
>> completion worked fine as a synchronization primitive here
>> so it is maybe a good alternative in your (similar) code
>> as well?
>
> In the case there is another request already waiting, then instead of
> scheduling work to complete the request, the dispatch is woken to complete
> the previous and start the next.  That needs different synchronization.

OK.

>> So what we want to attain is that the next dispatch happens as soon
>> as possible after completing the previous request. They only race
>> as far as that they go through post/preprocessing before getting to
>> the synchronization primitive though?
>
> They only race when the next request is not already waiting.  Otherwise the
> work is never scheduled.

OK I see better which race you were referring to and it makes
a lot of sense. This should work optimally with the kblockd queue
then. (I should have used that in my patch set as well.)

>>> As I already wrote, the CPU-bound block layer dispatch work queue has
>>> negative consequeuences for mmc performance.  So there are 2 aspects to that:
>>>         1. Is the choice of CPU right to start with?  I suspect it is better for
>>> the dispatch to run on the same CPU as the interrupt.
>>>         2. Does the dispatch work need to be able to be migrated to a different
>>> CPU? i.e. unbound work queue.  That helped in my tests, but it could just be
>>> a side-effect of 1.
>>
>> Hm! What you are saying sounds correct, and we really need to
>> consider the multi-CPU aspects of this, maybe not now. I am
>> happy as long as we have equal performance as before and
>> maintainable code.
>
> Well I saw 3% drop in sequential read performance, improving to 1% when an
> unbound workqueue was used.

I think that is acceptable, albeit I cannot see it on my host.
I wonder if people disagree strongly when I say "acceptable..."

In my tests any throughput loss is in the error margin. It may be
because of fuzz in this hardware or because of differences between
ARM and x86, caches or whatever.

>> Or rather: just because someone is FOR something else,
>> doesn't mean they are AGAINST this.
>
> But that is not what happened.  You blocked CQE even though it was ready.

That is an unfair accusation. I alone, not even being a
maintainer of MMC can't block your patch even if something is
seriously wrong with it.

The whole core of your conflict with me and the MMC maintainer is that
you think CQE is more important to get in than to get MMC converted
to MQ. You thought it was more important to support this new feature
than to evolve the MMC subsystem to support MQ. Please understand that.

When you are putting one thing over another and refusing to discuss
you are not a team player and you're not playing well with others.

With the recent patch set you have come a long way towards acceptance
from my side, because you do MQ and MQ first. As I have repeatedly
said, make it the default and kill the old code, then you're home in
my book.

>> It's not good because it does not make MQ mandatory and does
>> not delete the interfaces to the old block layer.
>
> That is a separate issue.

Not at all. This is the core of this entire conflict, my insistence on
transitioning to MQ above all. We need to turn this into a win-win
situation where we get both MQ and CQE in the same go in the same
merge window IMO.

>> So why are you not just deleting it?
>
> Because it needs more testing first.  If the testing goes well, then switch
> over the default.  If, after 1 or more release cycles, there are no
> problems, delete the old code.

OK I understand the conservative stance.

But it's just not my preferred stance :)

I feel a strong pressure from the block maintainers to move forward
with MQ and now it is happening. I'm very happy with that.

>> I disagree. No "ready and tested" is needed, the code is ready,
>> and I have tested it. It performs. Delete the old code now.
>
> Not true.  My testing showed the block layer workqueue wasn't performing as
> well as a thread.

Yeah good point. It's weird that my testing doesn't show anything
of the kind.

We can probably agree on one thing though: the MQ code should be
default-enabled on CQE-capable devices.

>> If you just make a series also deleting the old block layer
>> I will test it so it doesn't break anything and then you can
>> probably expect a big Acked-by on the whole shebang.
>
> I will add patches for that - let's see what happens.

Yay!

Yours,
Linus Walleij
Ulf Hansson Nov. 15, 2017, 10:55 a.m. UTC | #7
Linus, Adrian,

Apologize for sidetracking the discussion, just wanted to add some
minor comments.

[...]

>
>>> But what I think is nice in doing it around
>>> each request is that since mmc_put_card() calls mmc_release_host()
>>> contains this:
>>>
>>> if (--host->claim_cnt) { (...)
>>>
>>> So there is a counter inside the claim/release host functions
>>> and now there is another counter keeping track if the in-flight
>>> requests as well and it gives me the feeling we are maintaining
>>> two counters when we only need one.
>>
>> The gets / puts also get runtime pm for the card.  It is a lot a messing
>> around for the sake of a quick check for the number of requests inflight -
>> which is needed anyway for CQE.

Actually the get / puts for runtime PM of the card is already done
taking the host->claim_cnt into account.

In other words, the additional in-flight counter does not provide an
additional improvement in this regards.

For that reason, perhaps the in-flight counter should be added in the
CQE patch instead, because it seems like it is there it really
belongs?

[...]

>
> OK I guess I'm more forging ahead with such things. But we could
> at least enable it by default so whoever checks out and builds
> and tests linux-next with their MMC/SD controllers will then be
> testing this for us in the next kernel cycle.
>
>>> But I think I would prefer that if a big slew of new code is
>>> introduced it needs to be put to wide use and any bugs
>>> smoked out during the -rc phase, and we are now
>>> hiding it behind a Kconfig option so it's unlikely to see testing
>>> until that option is turned on, and that is not good.
>>
>> And if you find a big problem in rc7, then what do you do?  At least with
>> the config option, the revert is trivial.
>
> I see your point. I guess it is up to Ulf how he feels about
> trust wrt the code. If a problem appears at -rc7 it's indeed
> nice if we can leave the code in-tree and work on it.

Well, it's not an easy decision, simply because it moves the code into
an even worse situation maintenance-wise. So, if you guys just
suddenly have to move on to do something else, then it becomes my
problem to work out. :-)

However, as I trust both of you, and that you seems to agree on the
path forward, I am fine keeping the old code for while.

Although, please make sure the Kconfig option starts out by being
enabled by default, so we can get some test coverage of the mq path.

Of course, then we need to work on the card removal problem asap, and
hopefully we manage to fix them. If not, or other strange errors
happens, we would need to change the default value to 'n' of the
Kconfig.

[...]

>>> Hm! What you are saying sounds correct, and we really need to
>>> consider the multi-CPU aspects of this, maybe not now. I am
>>> happy as long as we have equal performance as before and
>>> maintainable code.
>>
>> Well I saw 3% drop in sequential read performance, improving to 1% when an
>> unbound workqueue was used.

Can you please make this improvement as a standalone patch on top of
the mq patch?

I think that would be good, because it points out some generic
problems with mq, which we then, at least short term, tries to address
locally in MMC.

[...]

>
>>> If you just make a series also deleteing the old block layer
>>> I will test it so it doesn't break anything and then you can
>>> probably expect a big Acked-by on the whole shebang.
>>
>> I will add patches for that - let's see what happens.

Yes, please. However, I will as stated, be withholding that change for
a while, until we fixed any showstoppers in the mq path.

[...]

Kind regards
Uffe
Adrian Hunter Nov. 15, 2017, 1:07 p.m. UTC | #8
On 15/11/17 12:55, Ulf Hansson wrote:
> Linus, Adrian,
> 
> Apologize for sidetracking the discussion, just wanted to add some
> minor comments.
> 
> [...]
> 
>>
>>>> But what I think is nice in doing it around
>>>> each request is that since mmc_put_card() calls mmc_release_host()
>>>> contains this:
>>>>
>>>> if (--host->claim_cnt) { (...)
>>>>
>>>> So there is a counter inside the claim/release host functions
>>>> and now there is another counter keeping track if the in-flight
>>>> requests as well and it gives me the feeling we are maintaining
>>>> two counters when we only need one.
>>>
>>> The gets / puts also get runtime pm for the card.  It is a lot a messing
>>> around for the sake of a quick check for the number of requests inflight -
>>> which is needed anyway for CQE.
> 
> Actually the get / puts for runtime PM of the card is already done
> taking the host->claim_cnt into account.

We do pm_runtime_get_sync(&card->dev) always i.e.

void mmc_get_card(struct mmc_card *card, struct mmc_ctx *ctx)
{
	pm_runtime_get_sync(&card->dev);
	__mmc_claim_host(card->host, ctx, NULL);
}

> 
> In other words, the additional in-flight counter does not provide an
> additional improvement in this regards.
> 
> For that reason, perhaps the in-flight counter should be added in the
> CQE patch instead, because it seems like it is there it really
> belongs?
> 
> [...]
> 
>>
>> OK I guess I'm more forging ahead with such things. But we could
>> at least enable it by default so whoever checks out and builds
>> and tests linux-next with their MMC/SD controllers will then be
>> testing this for us in the next kernel cycle.
>>
>>>> But I think I would prefer that if a big slew of new code is
>>>> introduced it needs to be put to wide use and any bugs
>>>> smoked out during the -rc phase, and we are now
>>>> hiding it behind a Kconfig option so it's unlikely to see testing
>>>> until that option is turned on, and that is not good.
>>>
>>> And if you find a big problem in rc7, then what do you do?  At least with
>>> the config option, the revert is trivial.
>>
>> I see your point. I guess it is up to Ulf how he feels about
>> trust wrt the code. If a problem appears at -rc7 it's indeed
>> nice if we can leave the code in-tree and work on it.
> 
> Well, it's not an easy decision, simply because it moves the code in
> an even worse situation maintenance wise. So, if you guys just
> suddenly have to move on to do something else, then it becomes my
> problem to work out. :-)
> 
> However, as I trust both of you, and that you seems to agree on the
> path forward, I am fine keeping the old code for while.
> 
> Although, please make sure the Kconfig option starts out by being
> enabled by default, so we can get some test coverage of the mq path.

Ok

> 
> Of course, then we need to work on the card removal problem asap, and
> hopefully we manage to fix them. If not, or other strange errors
> happens, we would need to change the default value to 'n' of the
> Kconfig.
> 
> [...]
> 
>>>> Hm! What you are saying sounds correct, and we really need to
>>>> consider the multi-CPU aspects of this, maybe not now. I am
>>>> happy as long as we have equal performance as before and
>>>> maintainable code.
>>>
>>> Well I saw 3% drop in sequential read performance, improving to 1% when an
>>> unbound workqueue was used.
> 
> Can you please make this improvement as a standalone patch on top of
> the mq patch?

Unfortunately it was just a hack to the blk-mq core - the block layer does
not have an unbound workqueue.  I have not had time to consider a proper
fix.  It will have to wait, but I agree 3% is not enough to delay going forward.

> 
> I think that would be good, because it points out some generic
> problems with mq, which we then, at least short term, tries to address
> locally in MMC.
> 
> [...]
> 
>>
>>>> If you just make a series also deleteing the old block layer
>>>> I will test it so it doesn't break anything and then you can
>>>> probably expect a big Acked-by on the whole shebang.
>>>
>>> I will add patches for that - let's see what happens.
> 
> Yes, please. However, I will as stated, be withholding that change for
> a while, until we fixed any showstoppers in the mq path.
Ulf Hansson Nov. 16, 2017, 7:19 a.m. UTC | #9
On 15 November 2017 at 14:07, Adrian Hunter <adrian.hunter@intel.com> wrote:
> On 15/11/17 12:55, Ulf Hansson wrote:
>> Linus, Adrian,
>>
>> Apologize for sidetracking the discussion, just wanted to add some
>> minor comments.
>>
>> [...]
>>
>>>
>>>>> But what I think is nice in doing it around
>>>>> each request is that since mmc_put_card() calls mmc_release_host()
>>>>> contains this:
>>>>>
>>>>> if (--host->claim_cnt) { (...)
>>>>>
>>>>> So there is a counter inside the claim/release host functions
>>>>> and now there is another counter keeping track if the in-flight
>>>>> requests as well and it gives me the feeling we are maintaining
>>>>> two counters when we only need one.
>>>>
>>>> The gets / puts also get runtime pm for the card.  It is a lot a messing
>>>> around for the sake of a quick check for the number of requests inflight -
>>>> which is needed anyway for CQE.
>>
>> Actually the get / puts for runtime PM of the card is already done
>> taking the host->claim_cnt into account.
>
> We do pm_runtime_get_sync(&card->dev) always i.e.
>
> void mmc_get_card(struct mmc_card *card, struct mmc_ctx *ctx)
> {
>         pm_runtime_get_sync(&card->dev);
>         __mmc_claim_host(card->host, ctx, NULL);
> }

You are absolutely correct, so let's leave the inflight counter in.
Apologize for the noise!

I was thinking about the runtime PM of the host dev, which is managed
by mmc_claim|release host().

[...]

>>>> Well I saw 3% drop in sequential read performance, improving to 1% when an
>>>> unbound workqueue was used.
>>
>> Can you please make this improvement as a standalone patch on top of
>> the mq patch?
>
> Unfortunately it was just a hack to the blk-mq core - the block layer does
> not have an unbound workqueue.  I have not had time to consider a proper
> fix.  It will have to wait, but I agree 3% is not enough to delay going forward.

I see, thanks!

[...]

Kind regards
Uffe
diff mbox

Patch

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index ad72fa19f082..e2838ff4738e 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -1264,7 +1264,10 @@  static void mmc_blk_issue_drv_op(struct mmc_queue *mq, struct request *req)
 		break;
 	}
 	mq_rq->drv_op_result = ret;
-	blk_end_request_all(req, ret ? BLK_STS_IOERR : BLK_STS_OK);
+	if (req->mq_ctx)
+		blk_mq_end_request(req, ret ? BLK_STS_IOERR : BLK_STS_OK);
+	else
+		blk_end_request_all(req, ret ? BLK_STS_IOERR : BLK_STS_OK);
 }
 
 static void mmc_blk_issue_discard_rq(struct mmc_queue *mq, struct request *req)
@@ -1307,7 +1310,10 @@  static void mmc_blk_issue_discard_rq(struct mmc_queue *mq, struct request *req)
 	else
 		mmc_blk_reset_success(md, type);
 fail:
-	blk_end_request(req, status, blk_rq_bytes(req));
+	if (req->mq_ctx)
+		blk_mq_end_request(req, status);
+	else
+		blk_end_request(req, status, blk_rq_bytes(req));
 }
 
 static void mmc_blk_issue_secdiscard_rq(struct mmc_queue *mq,
@@ -1377,7 +1383,10 @@  static void mmc_blk_issue_secdiscard_rq(struct mmc_queue *mq,
 	if (!err)
 		mmc_blk_reset_success(md, type);
 out:
-	blk_end_request(req, status, blk_rq_bytes(req));
+	if (req->mq_ctx)
+		blk_mq_end_request(req, status);
+	else
+		blk_end_request(req, status, blk_rq_bytes(req));
 }
 
 static void mmc_blk_issue_flush(struct mmc_queue *mq, struct request *req)
@@ -1387,7 +1396,10 @@  static void mmc_blk_issue_flush(struct mmc_queue *mq, struct request *req)
 	int ret = 0;
 
 	ret = mmc_flush_cache(card);
-	blk_end_request_all(req, ret ? BLK_STS_IOERR : BLK_STS_OK);
+	if (req->mq_ctx)
+		blk_mq_end_request(req, ret ? BLK_STS_IOERR : BLK_STS_OK);
+	else
+		blk_end_request_all(req, ret ? BLK_STS_IOERR : BLK_STS_OK);
 }
 
 /*
@@ -1464,11 +1476,9 @@  static void mmc_blk_eval_resp_error(struct mmc_blk_request *brq)
 	}
 }
 
-static enum mmc_blk_status mmc_blk_err_check(struct mmc_card *card,
-					     struct mmc_async_req *areq)
+static enum mmc_blk_status __mmc_blk_err_check(struct mmc_card *card,
+					       struct mmc_queue_req *mq_mrq)
 {
-	struct mmc_queue_req *mq_mrq = container_of(areq, struct mmc_queue_req,
-						    areq);
 	struct mmc_blk_request *brq = &mq_mrq->brq;
 	struct request *req = mmc_queue_req_to_req(mq_mrq);
 	int need_retune = card->host->need_retune;
@@ -1574,6 +1584,15 @@  static enum mmc_blk_status mmc_blk_err_check(struct mmc_card *card,
 	return MMC_BLK_SUCCESS;
 }
 
+static enum mmc_blk_status mmc_blk_err_check(struct mmc_card *card,
+					     struct mmc_async_req *areq)
+{
+	struct mmc_queue_req *mq_mrq = container_of(areq, struct mmc_queue_req,
+						    areq);
+
+	return __mmc_blk_err_check(card, mq_mrq);
+}
+
 static void mmc_blk_data_prep(struct mmc_queue *mq, struct mmc_queue_req *mqrq,
 			      int disable_multi, bool *do_rel_wr_p,
 			      bool *do_data_tag_p)
@@ -1766,6 +1785,428 @@  static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq,
 	mqrq->areq.err_check = mmc_blk_err_check;
 }
 
+#define MMC_MAX_RETRIES		5
+#define MMC_NO_RETRIES		(MMC_MAX_RETRIES + 1)
+
+/* Single sector read during recovery */
+static void mmc_blk_ss_read(struct mmc_queue *mq, struct request *req)
+{
+	struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
+	blk_status_t status;
+
+	while (1) {
+		mmc_blk_rw_rq_prep(mqrq, mq->card, 1, mq);
+
+		mmc_wait_for_req(mq->card->host, &mqrq->brq.mrq);
+
+		/*
+		 * Not expecting command errors, so just give up in that case.
+		 * If there are retries remaining, the request will get
+		 * requeued.
+		 */
+		if (mqrq->brq.cmd.error)
+			return;
+
+		if (blk_rq_bytes(req) <= 512)
+			break;
+
+		status = mqrq->brq.data.error ? BLK_STS_IOERR : BLK_STS_OK;
+
+		blk_update_request(req, status, 512);
+	}
+
+	mqrq->retries = MMC_NO_RETRIES;
+}
+
+static void mmc_blk_rw_recovery(struct mmc_queue *mq, struct request *req)
+{
+	int type = rq_data_dir(req) == READ ? MMC_BLK_READ : MMC_BLK_WRITE;
+	struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
+	struct mmc_blk_request *brq = &mqrq->brq;
+	struct mmc_blk_data *md = mq->blkdata;
+	struct mmc_card *card = mq->card;
+	static enum mmc_blk_status status;
+
+	brq->retune_retry_done = mqrq->retries;
+
+	status = __mmc_blk_err_check(card, mqrq);
+
+	mmc_retune_release(card->host);
+
+	/*
+	 * Requests are completed by mmc_blk_mq_complete_rq() which sets simple
+	 * policy:
+	 * 1. A request that has transferred at least some data is considered
+	 * successful and will be requeued if there is remaining data to
+	 * transfer.
+	 * 2. Otherwise the number of retries is incremented and the request
+	 * will be requeued if there are remaining retries.
+	 * 3. Otherwise the request will be errored out.
+	 * That means mmc_blk_mq_complete_rq() is controlled by bytes_xfered and
+	 * mqrq->retries. So there are only 4 possible actions here:
+	 *	1. do not accept the bytes_xfered value i.e. set it to zero
+	 *	2. change mqrq->retries to determine the number of retries
+	 *	3. try to reset the card
+	 *	4. read one sector at a time
+	 */
+	switch (status) {
+	case MMC_BLK_SUCCESS:
+	case MMC_BLK_PARTIAL:
+		/* Reset success, and accept bytes_xfered */
+		mmc_blk_reset_success(md, type);
+		break;
+	case MMC_BLK_CMD_ERR:
+		/*
+		 * For SD cards, get bytes written, but do not accept
+		 * bytes_xfered if that fails. For MMC cards accept
+		 * bytes_xfered. Then try to reset. If reset fails then
+		 * error out the remaining request, otherwise retry
+		 * once (N.B mmc_blk_reset() will not succeed twice in a
+		 * row).
+		 */
+		if (mmc_card_sd(card)) {
+			u32 blocks;
+			int err;
+
+			err = mmc_sd_num_wr_blocks(card, &blocks);
+			if (err)
+				brq->data.bytes_xfered = 0;
+			else
+				brq->data.bytes_xfered = blocks << 9;
+		}
+		if (mmc_blk_reset(md, card->host, type))
+			mqrq->retries = MMC_NO_RETRIES;
+		else
+			mqrq->retries = MMC_MAX_RETRIES - 1;
+		break;
+	case MMC_BLK_RETRY:
+		/*
+		 * Do not accept bytes_xfered, but retry up to 5 times,
+		 * otherwise same as abort.
+		 */
+		brq->data.bytes_xfered = 0;
+		if (mqrq->retries < MMC_MAX_RETRIES)
+			break;
+		/* Fall through */
+	case MMC_BLK_ABORT:
+		/*
+		 * Do not accept bytes_xfered, but try to reset. If
+		 * reset succeeds, try once more, otherwise error out
+		 * the request.
+		 */
+		brq->data.bytes_xfered = 0;
+		if (mmc_blk_reset(md, card->host, type))
+			mqrq->retries = MMC_NO_RETRIES;
+		else
+			mqrq->retries = MMC_MAX_RETRIES - 1;
+		break;
+	case MMC_BLK_DATA_ERR: {
+		int err;
+
+		/*
+		 * Do not accept bytes_xfered, but try to reset. If
+		 * reset succeeds, try once more. If reset fails with
+		 * ENODEV which means the partition is wrong, then error
+		 * out the request. Otherwise attempt to read one sector
+		 * at a time.
+		 */
+		brq->data.bytes_xfered = 0;
+		err = mmc_blk_reset(md, card->host, type);
+		if (!err) {
+			mqrq->retries = MMC_MAX_RETRIES - 1;
+			break;
+		}
+		if (err == -ENODEV) {
+			mqrq->retries = MMC_NO_RETRIES;
+			break;
+		}
+		/* Fall through */
+	}
+	case MMC_BLK_ECC_ERR:
+		/*
+		 * Do not accept bytes_xfered. If reading more than one
+		 * sector, try reading one sector at a time.
+		 */
+		brq->data.bytes_xfered = 0;
+		/* FIXME: Missing single sector read for large sector size */
+		if (brq->data.blocks > 1 && !mmc_large_sector(card)) {
+			/* Redo read one sector at a time */
+			pr_warn("%s: retrying using single block read\n",
+				req->rq_disk->disk_name);
+			mmc_blk_ss_read(mq, req);
+		} else {
+			mqrq->retries = MMC_NO_RETRIES;
+		}
+		break;
+	case MMC_BLK_NOMEDIUM:
+		/* Do not accept bytes_xfered. Error out the request */
+		brq->data.bytes_xfered = 0;
+		mqrq->retries = MMC_NO_RETRIES;
+		break;
+	default:
+		/* Do not accept bytes_xfered. Error out the request */
+		brq->data.bytes_xfered = 0;
+		mqrq->retries = MMC_NO_RETRIES;
+		pr_err("%s: Unhandled return value (%d)",
+		       req->rq_disk->disk_name, status);
+		break;
+	}
+}
+
+static void mmc_blk_mq_complete_rq(struct mmc_queue *mq, struct request *req)
+{
+	struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
+	unsigned int nr_bytes = mqrq->brq.data.bytes_xfered;
+
+	if (nr_bytes) {
+		if (blk_update_request(req, BLK_STS_OK, nr_bytes))
+			blk_mq_requeue_request(req, true);
+		else
+			__blk_mq_end_request(req, BLK_STS_OK);
+	} else if (mqrq->retries++ < MMC_MAX_RETRIES) {
+		blk_mq_requeue_request(req, true);
+	} else {
+		if (mmc_card_removed(mq->card))
+			req->rq_flags |= RQF_QUIET;
+		blk_mq_end_request(req, BLK_STS_IOERR);
+	}
+}
+
+static bool mmc_blk_urgent_bkops_needed(struct mmc_queue *mq,
+					struct mmc_queue_req *mqrq)
+{
+	return mmc_card_mmc(mq->card) &&
+	       (mqrq->brq.cmd.resp[0] & R1_EXCEPTION_EVENT ||
+		mqrq->brq.stop.resp[0] & R1_EXCEPTION_EVENT);
+}
+
+static void mmc_blk_urgent_bkops(struct mmc_queue *mq,
+				 struct mmc_queue_req *mqrq)
+{
+	if (mmc_blk_urgent_bkops_needed(mq, mqrq))
+		mmc_start_bkops(mq->card, true);
+}
+
+void mmc_blk_mq_complete(struct request *req)
+{
+	struct mmc_queue *mq = req->q->queuedata;
+
+	mmc_blk_mq_complete_rq(mq, req);
+}
+
+static void mmc_blk_mq_poll_completion(struct mmc_queue *mq,
+				       struct request *req)
+{
+	struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
+
+	mmc_blk_rw_recovery(mq, req);
+
+	mmc_blk_urgent_bkops(mq, mqrq);
+}
+
+static void mmc_blk_mq_acct_req_done(struct mmc_queue *mq, struct request *req)
+{
+	struct request_queue *q = req->q;
+	unsigned long flags;
+	bool put_card;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+
+	mq->in_flight[mmc_issue_type(mq, req)] -= 1;
+
+	put_card = mmc_tot_in_flight(mq) == 0;
+
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	if (put_card)
+		mmc_put_card(mq->card, &mq->ctx);
+}
+
+static void mmc_blk_mq_post_req(struct mmc_queue *mq, struct request *req)
+{
+	struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
+	struct mmc_request *mrq = &mqrq->brq.mrq;
+	struct mmc_host *host = mq->card->host;
+
+	if (host->ops->post_req)
+		host->ops->post_req(host, mrq, 0);
+
+	blk_mq_complete_request(req);
+
+	mmc_blk_mq_acct_req_done(mq, req);
+}
+
+static void mmc_blk_mq_complete_prev_req(struct mmc_queue *mq,
+					 struct request **prev_req)
+{
+	mutex_lock(&mq->complete_lock);
+
+	if (!mq->complete_req)
+		goto out_unlock;
+
+	mmc_blk_mq_poll_completion(mq, mq->complete_req);
+
+	if (prev_req)
+		*prev_req = mq->complete_req;
+	else
+		mmc_blk_mq_post_req(mq, mq->complete_req);
+
+	mq->complete_req = NULL;
+
+out_unlock:
+	mutex_unlock(&mq->complete_lock);
+}
+
+void mmc_blk_mq_complete_work(struct work_struct *work)
+{
+	struct mmc_queue *mq = container_of(work, struct mmc_queue,
+					    complete_work);
+
+	mmc_blk_mq_complete_prev_req(mq, NULL);
+}
+
+static void mmc_blk_mq_req_done(struct mmc_request *mrq)
+{
+	struct mmc_queue_req *mqrq = container_of(mrq, struct mmc_queue_req,
+						  brq.mrq);
+	struct request *req = mmc_queue_req_to_req(mqrq);
+	struct request_queue *q = req->q;
+	struct mmc_queue *mq = q->queuedata;
+	unsigned long flags;
+	bool waiting;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	mq->complete_req = req;
+	mq->rw_wait = false;
+	waiting = mq->waiting;
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	if (waiting)
+		wake_up(&mq->wait);
+	else
+		kblockd_schedule_work(&mq->complete_work);
+}
+
+static bool mmc_blk_rw_wait_cond(struct mmc_queue *mq, int *err)
+{
+	struct request_queue *q = mq->queue;
+	unsigned long flags;
+	bool done;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	done = !mq->rw_wait;
+	mq->waiting = !done;
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return done;
+}
+
+static int mmc_blk_rw_wait(struct mmc_queue *mq, struct request **prev_req)
+{
+	int err = 0;
+
+	wait_event(mq->wait, mmc_blk_rw_wait_cond(mq, &err));
+
+	mmc_blk_mq_complete_prev_req(mq, prev_req);
+
+	return err;
+}
+
+static int mmc_blk_mq_issue_rw_rq(struct mmc_queue *mq,
+				  struct request *req)
+{
+	struct mmc_queue_req *mqrq = req_to_mmc_queue_req(req);
+	struct mmc_host *host = mq->card->host;
+	struct request *prev_req = NULL;
+	int err = 0;
+
+	mmc_blk_rw_rq_prep(mqrq, mq->card, 0, mq);
+
+	mqrq->brq.mrq.done = mmc_blk_mq_req_done;
+
+	if (host->ops->pre_req)
+		host->ops->pre_req(host, &mqrq->brq.mrq);
+
+	err = mmc_blk_rw_wait(mq, &prev_req);
+	if (err)
+		goto out_post_req;
+
+	mq->rw_wait = true;
+
+	err = mmc_start_request(host, &mqrq->brq.mrq);
+
+	if (prev_req)
+		mmc_blk_mq_post_req(mq, prev_req);
+
+	if (err)
+		mq->rw_wait = false;
+
+out_post_req:
+	if (err && host->ops->post_req)
+		host->ops->post_req(host, &mqrq->brq.mrq, err);
+
+	return err;
+}
+
+static int mmc_blk_wait_for_idle(struct mmc_queue *mq, struct mmc_host *host)
+{
+	return mmc_blk_rw_wait(mq, NULL);
+}
+
+enum mmc_issued mmc_blk_mq_issue_rq(struct mmc_queue *mq, struct request *req)
+{
+	struct mmc_blk_data *md = mq->blkdata;
+	struct mmc_card *card = md->queue.card;
+	struct mmc_host *host = card->host;
+	int ret;
+
+	ret = mmc_blk_part_switch(card, md->part_type);
+	if (ret)
+		return MMC_REQ_FAILED_TO_START;
+
+	switch (mmc_issue_type(mq, req)) {
+	case MMC_ISSUE_SYNC:
+		ret = mmc_blk_wait_for_idle(mq, host);
+		if (ret)
+			return MMC_REQ_BUSY;
+		switch (req_op(req)) {
+		case REQ_OP_DRV_IN:
+		case REQ_OP_DRV_OUT:
+			mmc_blk_issue_drv_op(mq, req);
+			break;
+		case REQ_OP_DISCARD:
+			mmc_blk_issue_discard_rq(mq, req);
+			break;
+		case REQ_OP_SECURE_ERASE:
+			mmc_blk_issue_secdiscard_rq(mq, req);
+			break;
+		case REQ_OP_FLUSH:
+			mmc_blk_issue_flush(mq, req);
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			return MMC_REQ_FAILED_TO_START;
+		}
+		return MMC_REQ_FINISHED;
+	case MMC_ISSUE_ASYNC:
+		switch (req_op(req)) {
+		case REQ_OP_READ:
+		case REQ_OP_WRITE:
+			ret = mmc_blk_mq_issue_rw_rq(mq, req);
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			ret = -EINVAL;
+		}
+		if (!ret)
+			return MMC_REQ_STARTED;
+		return ret == -EBUSY ? MMC_REQ_BUSY : MMC_REQ_FAILED_TO_START;
+	default:
+		WARN_ON_ONCE(1);
+		return MMC_REQ_FAILED_TO_START;
+	}
+}
+
 static bool mmc_blk_rw_cmd_err(struct mmc_blk_data *md, struct mmc_card *card,
 			       struct mmc_blk_request *brq, struct request *req,
 			       bool old_req_pending)
diff --git a/drivers/mmc/core/block.h b/drivers/mmc/core/block.h
index 860ca7c8df86..c62e3f3d3a3a 100644
--- a/drivers/mmc/core/block.h
+++ b/drivers/mmc/core/block.h
@@ -6,4 +6,13 @@ 
 
 void mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req);
 
+enum mmc_issued;
+
+enum mmc_issued mmc_blk_mq_issue_rq(struct mmc_queue *mq, struct request *req);
+void mmc_blk_mq_complete(struct request *req);
+
+struct work_struct;
+
+void mmc_blk_mq_complete_work(struct work_struct *work);
+
 #endif
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index 4f33d277b125..a9c2351a9b29 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -22,6 +22,7 @@ 
 #include "block.h"
 #include "core.h"
 #include "card.h"
+#include "host.h"
 
 /*
  * Prepare a MMC request. This just filters out odd stuff.
@@ -34,10 +35,25 @@  static int mmc_prep_request(struct request_queue *q, struct request *req)
 		return BLKPREP_KILL;
 
 	req->rq_flags |= RQF_DONTPREP;
+	req_to_mmc_queue_req(req)->retries = 0;
 
 	return BLKPREP_OK;
 }
 
+enum mmc_issue_type mmc_issue_type(struct mmc_queue *mq, struct request *req)
+{
+	if (req_op(req) == REQ_OP_READ || req_op(req) == REQ_OP_WRITE)
+		return MMC_ISSUE_ASYNC;
+
+	return MMC_ISSUE_SYNC;
+}
+
+static enum blk_eh_timer_return mmc_mq_timed_out(struct request *req,
+						 bool reserved)
+{
+	return BLK_EH_RESET_TIMER;
+}
+
 static int mmc_queue_thread(void *d)
 {
 	struct mmc_queue *mq = d;
@@ -154,11 +170,10 @@  static void mmc_queue_setup_discard(struct request_queue *q,
  * @req: the request
  * @gfp: memory allocation policy
  */
-static int mmc_init_request(struct request_queue *q, struct request *req,
-			    gfp_t gfp)
+static int __mmc_init_request(struct mmc_queue *mq, struct request *req,
+			      gfp_t gfp)
 {
 	struct mmc_queue_req *mq_rq = req_to_mmc_queue_req(req);
-	struct mmc_queue *mq = q->queuedata;
 	struct mmc_card *card = mq->card;
 	struct mmc_host *host = card->host;
 
@@ -169,6 +184,12 @@  static int mmc_init_request(struct request_queue *q, struct request *req,
 	return 0;
 }
 
+static int mmc_init_request(struct request_queue *q, struct request *req,
+			    gfp_t gfp)
+{
+	return __mmc_init_request(q->queuedata, req, gfp);
+}
+
 static void mmc_exit_request(struct request_queue *q, struct request *req)
 {
 	struct mmc_queue_req *mq_rq = req_to_mmc_queue_req(req);
@@ -177,6 +198,105 @@  static void mmc_exit_request(struct request_queue *q, struct request *req)
 	mq_rq->sg = NULL;
 }
 
+static int mmc_mq_init_request(struct blk_mq_tag_set *set, struct request *req,
+			       unsigned int hctx_idx, unsigned int numa_node)
+{
+	return __mmc_init_request(set->driver_data, req, GFP_KERNEL);
+}
+
+static void mmc_mq_exit_request(struct blk_mq_tag_set *set, struct request *req,
+				unsigned int hctx_idx)
+{
+	struct mmc_queue *mq = set->driver_data;
+
+	mmc_exit_request(mq->queue, req);
+}
+
+static blk_status_t mmc_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
+				    const struct blk_mq_queue_data *bd)
+{
+	struct request *req = bd->rq;
+	struct request_queue *q = req->q;
+	struct mmc_queue *mq = q->queuedata;
+	struct mmc_card *card = mq->card;
+	enum mmc_issue_type issue_type;
+	enum mmc_issued issued;
+	bool get_card;
+	int ret;
+
+	if (mmc_card_removed(mq->card)) {
+		req->rq_flags |= RQF_QUIET;
+		return BLK_STS_IOERR;
+	}
+
+	issue_type = mmc_issue_type(mq, req);
+
+	spin_lock_irq(q->queue_lock);
+
+	switch (issue_type) {
+	case MMC_ISSUE_ASYNC:
+		break;
+	default:
+		/*
+		 * Timeouts are handled by mmc core, so set a large value to
+		 * avoid races.
+		 */
+		req->timeout = 600 * HZ;
+		break;
+	}
+
+	mq->in_flight[issue_type] += 1;
+	get_card = mmc_tot_in_flight(mq) == 1;
+
+	spin_unlock_irq(q->queue_lock);
+
+	if (!(req->rq_flags & RQF_DONTPREP)) {
+		req_to_mmc_queue_req(req)->retries = 0;
+		req->rq_flags |= RQF_DONTPREP;
+	}
+
+	if (get_card)
+		mmc_get_card(card, &mq->ctx);
+
+	blk_mq_start_request(req);
+
+	issued = mmc_blk_mq_issue_rq(mq, req);
+
+	switch (issued) {
+	case MMC_REQ_BUSY:
+		ret = BLK_STS_RESOURCE;
+		break;
+	case MMC_REQ_FAILED_TO_START:
+		ret = BLK_STS_IOERR;
+		break;
+	default:
+		ret = BLK_STS_OK;
+		break;
+	}
+
+	if (issued != MMC_REQ_STARTED) {
+		bool put_card = false;
+
+		spin_lock_irq(q->queue_lock);
+		mq->in_flight[issue_type] -= 1;
+		if (mmc_tot_in_flight(mq) == 0)
+			put_card = true;
+		spin_unlock_irq(q->queue_lock);
+		if (put_card)
+			mmc_put_card(card, &mq->ctx);
+	}
+
+	return ret;
+}
+
+static const struct blk_mq_ops mmc_mq_ops = {
+	.queue_rq	= mmc_mq_queue_rq,
+	.init_request	= mmc_mq_init_request,
+	.exit_request	= mmc_mq_exit_request,
+	.complete	= mmc_blk_mq_complete,
+	.timeout	= mmc_mq_timed_out,
+};
+
 static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card)
 {
 	struct mmc_host *host = card->host;
@@ -198,6 +318,69 @@  static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card)
 
 	/* Initialize thread_sem even if it is not used */
 	sema_init(&mq->thread_sem, 1);
+
+	INIT_WORK(&mq->complete_work, mmc_blk_mq_complete_work);
+
+	mutex_init(&mq->complete_lock);
+
+	init_waitqueue_head(&mq->wait);
+}
+
+static int mmc_mq_init_queue(struct mmc_queue *mq, int q_depth,
+			     const struct blk_mq_ops *mq_ops, spinlock_t *lock)
+{
+	int ret;
+
+	memset(&mq->tag_set, 0, sizeof(mq->tag_set));
+	mq->tag_set.ops = mq_ops;
+	mq->tag_set.queue_depth = q_depth;
+	mq->tag_set.numa_node = NUMA_NO_NODE;
+	mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE |
+			    BLK_MQ_F_BLOCKING;
+	mq->tag_set.nr_hw_queues = 1;
+	mq->tag_set.cmd_size = sizeof(struct mmc_queue_req);
+	mq->tag_set.driver_data = mq;
+
+	ret = blk_mq_alloc_tag_set(&mq->tag_set);
+	if (ret)
+		return ret;
+
+	mq->queue = blk_mq_init_queue(&mq->tag_set);
+	if (IS_ERR(mq->queue)) {
+		ret = PTR_ERR(mq->queue);
+		goto free_tag_set;
+	}
+
+	mq->queue->queue_lock = lock;
+	mq->queue->queuedata = mq;
+
+	return 0;
+
+free_tag_set:
+	blk_mq_free_tag_set(&mq->tag_set);
+
+	return ret;
+}
+
+#define MMC_QUEUE_DEPTH 64
+
+static int mmc_mq_init(struct mmc_queue *mq, struct mmc_card *card,
+			 spinlock_t *lock)
+{
+	int q_depth;
+	int ret;
+
+	q_depth = MMC_QUEUE_DEPTH;
+
+	ret = mmc_mq_init_queue(mq, q_depth, &mmc_mq_ops, lock);
+	if (ret)
+		return ret;
+
+	blk_queue_rq_timeout(mq->queue, 60 * HZ);
+
+	mmc_setup_queue(mq, card);
+
+	return 0;
 }
 
 /**
@@ -216,6 +399,10 @@  int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
 	int ret = -ENOMEM;
 
 	mq->card = card;
+
+	if (mmc_host_use_blk_mq(host))
+		return mmc_mq_init(mq, card, lock);
+
 	mq->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!mq->queue)
 		return -ENOMEM;
@@ -251,11 +438,63 @@  int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
 	return ret;
 }
 
+static void mmc_mq_queue_suspend(struct mmc_queue *mq)
+{
+	blk_mq_quiesce_queue(mq->queue);
+
+	/*
+	 * The host remains claimed while there are outstanding requests, so
+	 * simply claiming and releasing here ensures there are none.
+	 */
+	mmc_claim_host(mq->card->host);
+	mmc_release_host(mq->card->host);
+}
+
+static void mmc_mq_queue_resume(struct mmc_queue *mq)
+{
+	blk_mq_unquiesce_queue(mq->queue);
+}
+
+static void __mmc_queue_suspend(struct mmc_queue *mq)
+{
+	struct request_queue *q = mq->queue;
+	unsigned long flags;
+
+	if (!mq->suspended) {
+		mq->suspended |= true;
+
+		spin_lock_irqsave(q->queue_lock, flags);
+		blk_stop_queue(q);
+		spin_unlock_irqrestore(q->queue_lock, flags);
+
+		down(&mq->thread_sem);
+	}
+}
+
+static void __mmc_queue_resume(struct mmc_queue *mq)
+{
+	struct request_queue *q = mq->queue;
+	unsigned long flags;
+
+	if (mq->suspended) {
+		mq->suspended = false;
+
+		up(&mq->thread_sem);
+
+		spin_lock_irqsave(q->queue_lock, flags);
+		blk_start_queue(q);
+		spin_unlock_irqrestore(q->queue_lock, flags);
+	}
+}
+
 void mmc_cleanup_queue(struct mmc_queue *mq)
 {
 	struct request_queue *q = mq->queue;
 	unsigned long flags;
 
+	if (q->mq_ops)
+		return;
+
 	/* Make sure the queue isn't suspended, as that will deadlock */
 	mmc_queue_resume(mq);
 
@@ -283,17 +522,11 @@  void mmc_cleanup_queue(struct mmc_queue *mq)
 void mmc_queue_suspend(struct mmc_queue *mq)
 {
 	struct request_queue *q = mq->queue;
-	unsigned long flags;
-
-	if (!mq->suspended) {
-		mq->suspended |= true;
-
-		spin_lock_irqsave(q->queue_lock, flags);
-		blk_stop_queue(q);
-		spin_unlock_irqrestore(q->queue_lock, flags);
 
-		down(&mq->thread_sem);
-	}
+	if (q->mq_ops)
+		mmc_mq_queue_suspend(mq);
+	else
+		__mmc_queue_suspend(mq);
 }
 
 /**
@@ -303,17 +536,11 @@  void mmc_queue_suspend(struct mmc_queue *mq)
 void mmc_queue_resume(struct mmc_queue *mq)
 {
 	struct request_queue *q = mq->queue;
-	unsigned long flags;
 
-	if (mq->suspended) {
-		mq->suspended = false;
-
-		up(&mq->thread_sem);
-
-		spin_lock_irqsave(q->queue_lock, flags);
-		blk_start_queue(q);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-	}
+	if (q->mq_ops)
+		mmc_mq_queue_resume(mq);
+	else
+		__mmc_queue_resume(mq);
 }
 
 /*
diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h
index 68f68ecd94ea..a5424ee100ef 100644
--- a/drivers/mmc/core/queue.h
+++ b/drivers/mmc/core/queue.h
@@ -7,6 +7,19 @@ 
 #include <linux/mmc/core.h>
 #include <linux/mmc/host.h>
 
+enum mmc_issued {
+	MMC_REQ_STARTED,
+	MMC_REQ_BUSY,
+	MMC_REQ_FAILED_TO_START,
+	MMC_REQ_FINISHED,
+};
+
+enum mmc_issue_type {
+	MMC_ISSUE_SYNC,
+	MMC_ISSUE_ASYNC,
+	MMC_ISSUE_MAX,
+};
+
 static inline struct mmc_queue_req *req_to_mmc_queue_req(struct request *rq)
 {
 	return blk_mq_rq_to_pdu(rq);
@@ -56,12 +69,15 @@  struct mmc_queue_req {
 	int			drv_op_result;
 	void			*drv_op_data;
 	unsigned int		ioc_count;
+	int			retries;
 };
 
 struct mmc_queue {
 	struct mmc_card		*card;
 	struct task_struct	*thread;
 	struct semaphore	thread_sem;
+	struct mmc_ctx		ctx;
+	struct blk_mq_tag_set	tag_set;
 	bool			suspended;
 	bool			asleep;
 	struct mmc_blk_data	*blkdata;
@@ -73,6 +89,14 @@  struct mmc_queue {
 	 * associated mmc_queue_req data.
 	 */
 	int			qcnt;
+
+	int			in_flight[MMC_ISSUE_MAX];
+	bool			rw_wait;
+	bool			waiting;
+	wait_queue_head_t	wait;
+	struct request		*complete_req;
+	struct mutex		complete_lock;
+	struct work_struct	complete_work;
 };
 
 extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *,
@@ -83,4 +107,12 @@  extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *,
 extern unsigned int mmc_queue_map_sg(struct mmc_queue *,
 				     struct mmc_queue_req *);
 
+enum mmc_issue_type mmc_issue_type(struct mmc_queue *mq, struct request *req);
+
+static inline int mmc_tot_in_flight(struct mmc_queue *mq)
+{
+	return mq->in_flight[MMC_ISSUE_SYNC] +
+	       mq->in_flight[MMC_ISSUE_ASYNC];
+}
+
 #endif