diff mbox

[v2] RFD: switch MMC/SD to use blk-mq multiqueueing

Message ID 1482242508-26131-1-git-send-email-linus.walleij@linaro.org (mailing list archive)
State New, archived
Headers show

Commit Message

Linus Walleij Dec. 20, 2016, 2:01 p.m. UTC
HACK ALERT: DO NOT MERGE THIS! IT IS A FYI PATCH FOR DISCUSSION
ONLY.

This hack switches the MMC/SD subsystem from using the legacy blk
layer to using blk-mq. It does this by registering one single
hardware queue, since MMC/SD has only one command pipe. I kill
off the worker thread altogether and let the MQ core logic fire
sleepable requests directly into the MMC core.

We emulate the 2 elements deep pipeline by specifying queue depth
2, which is an elaborate lie that makes the block layer issue
another request while a previous request is in transit. It't not
neat but it works.

As the pipeline needs to be flushed by pushing in a NULL request
after the last block layer request I added a delayed work with a
timeout of zero. This will fire as soon as the block layer stops
pushing in requests: as long as there are new requests the MQ
block layer will just repeatedly cancel this pipeline flush work
and push new requests into the pipeline, but once the requests
stop coming the NULL request will be flushed into the pipeline.

It's not pretty but it works... Look at the following performance
statistics:

BEFORE this patch:

time dd if=/dev/mmcblk0 of=/dev/null bs=1M count=1024
1024+0 records in
1024+0 records out
1073741824 bytes (1.0GB) copied, 45.145874 seconds, 22.7MB/s
real    0m 45.15s
user    0m 0.02s
sys     0m 7.51s

mount /dev/mmcblk0p1 /mnt/
cd /mnt/
time find . > /dev/null
real    0m 3.70s
user    0m 0.29s
sys     0m 1.63s

AFTER this patch:

time dd if=/dev/mmcblk0 of=/dev/null bs=1M count=1024
1024+0 records in
1024+0 records out
1073741824 bytes (1.0GB) copied, 45.285431 seconds, 22.6MB/s
real    0m 45.29s
user    0m 0.02s
sys     0m 6.58s

mount /dev/mmcblk0p1 /mnt/
cd /mnt/
time find . > /dev/null
real    0m 4.37s
user    0m 0.27s
sys     0m 1.65s

The results are consistent.

As you can see, for a straight dd-like task, we get more or less the
same nice parallelism as for the old framework. I have confirmed
through debugprints that indeed this is because the two-stage pipeline
is full at all times.

However, for spurious reads in the find command, we already see a big
performance regression.

This is because there are many small operations requireing a flush of
the pipeline, which used to happen immediately with the old block
layer interface code that used to pull a few NULL requests off the
queue and feed them into the pipeline immediately after the last
request, but happens after the delayed work is executed in this
new framework. The delayed work is never quick enough to terminate
all these small operations even if we schedule it immediately after
the last request.

AFAICT the only way forward to provide proper performance with MQ
for MMC/SD is to get the requests to complete out-of-sync, i.e. when
the driver calls back to MMC/SD core to notify that a request is
complete, it should not notify any main thread with a completion
as is done right now, but instead directly call blk_end_request_all()
and only schedule some extra communication with the card if necessary
for example to handle an error condition.

This rework needs a bigger rewrite so we can get rid of the paradigm
of the block layer "driving" the requests throgh the pipeline.

Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
---
 drivers/mmc/core/block.c |  43 +++----
 drivers/mmc/core/queue.c | 308 ++++++++++++++++++++++++++---------------------
 drivers/mmc/core/queue.h |  13 +-
 3 files changed, 203 insertions(+), 161 deletions(-)

Comments

Bartlomiej Zolnierkiewicz Dec. 20, 2016, 4:21 p.m. UTC | #1
Hi,

On Tuesday, December 20, 2016 03:01:48 PM Linus Walleij wrote:
> HACK ALERT: DO NOT MERGE THIS! IT IS A FYI PATCH FOR DISCUSSION
> ONLY.
> 
> This hack switches the MMC/SD subsystem from using the legacy blk
> layer to using blk-mq. It does this by registering one single
> hardware queue, since MMC/SD has only one command pipe. I kill
> off the worker thread altogether and let the MQ core logic fire
> sleepable requests directly into the MMC core.
> 
> We emulate the 2 elements deep pipeline by specifying queue depth
> 2, which is an elaborate lie that makes the block layer issue
> another request while a previous request is in transit. It't not
> neat but it works.
> 
> As the pipeline needs to be flushed by pushing in a NULL request
> after the last block layer request I added a delayed work with a
> timeout of zero. This will fire as soon as the block layer stops
> pushing in requests: as long as there are new requests the MQ
> block layer will just repeatedly cancel this pipeline flush work
> and push new requests into the pipeline, but once the requests
> stop coming the NULL request will be flushed into the pipeline.
> 
> It's not pretty but it works... Look at the following performance
> statistics:
> 
> BEFORE this patch:
> 
> time dd if=/dev/mmcblk0 of=/dev/null bs=1M count=1024
> 1024+0 records in
> 1024+0 records out
> 1073741824 bytes (1.0GB) copied, 45.145874 seconds, 22.7MB/s
> real    0m 45.15s
> user    0m 0.02s
> sys     0m 7.51s
> 
> mount /dev/mmcblk0p1 /mnt/
> cd /mnt/
> time find . > /dev/null
> real    0m 3.70s
> user    0m 0.29s
> sys     0m 1.63s
> 
> AFTER this patch:
> 
> time dd if=/dev/mmcblk0 of=/dev/null bs=1M count=1024
> 1024+0 records in
> 1024+0 records out
> 1073741824 bytes (1.0GB) copied, 45.285431 seconds, 22.6MB/s
> real    0m 45.29s
> user    0m 0.02s
> sys     0m 6.58s
> 
> mount /dev/mmcblk0p1 /mnt/
> cd /mnt/
> time find . > /dev/null
> real    0m 4.37s
> user    0m 0.27s
> sys     0m 1.65s
> 
> The results are consistent.
> 
> As you can see, for a straight dd-like task, we get more or less the
> same nice parallelism as for the old framework. I have confirmed
> through debugprints that indeed this is because the two-stage pipeline
> is full at all times.
> 
> However, for spurious reads in the find command, we already see a big
> performance regression.
> 
> This is because there are many small operations requireing a flush of
> the pipeline, which used to happen immediately with the old block
> layer interface code that used to pull a few NULL requests off the
> queue and feed them into the pipeline immediately after the last
> request, but happens after the delayed work is executed in this
> new framework. The delayed work is never quick enough to terminate
> all these small operations even if we schedule it immediately after
> the last request.
> 
> AFAICT the only way forward to provide proper performance with MQ
> for MMC/SD is to get the requests to complete out-of-sync, i.e. when
> the driver calls back to MMC/SD core to notify that a request is
> complete, it should not notify any main thread with a completion
> as is done right now, but instead directly call blk_end_request_all()
> and only schedule some extra communication with the card if necessary
> for example to handle an error condition.

To do this I think that SCSI subsystem-like error handling should be
added to MMC core (in my uber-ugly blk-mq patches I just commented
most of error handling out and added direct request completion call).

> This rework needs a bigger rewrite so we can get rid of the paradigm
> of the block layer "driving" the requests throgh the pipeline.

IMHO for now blk-mq should be added as an add-on to MMC core with
leaving the old code in-place (again SCSI layer serves as inspiration
for that).

It should be easier to test and merge blk-mq support without causing
regressions this way (i.e. blk-mq handling of schedulers is only now
being worked on).

You may also consider re-basing your work on Adrian's swcmdq patches
(up to "mmc: core: Do not prepare a new request twice" patch) as they
cleanup MMC core queuing code significantly (some of the patches have
been merged upstream recently):

http://git.infradead.org/users/ahunter/linux-sdhci.git/shortlog/refs/heads/swcmdq

Best regards,
--
Bartlomiej Zolnierkiewicz
Samsung R&D Institute Poland
Samsung Electronics

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ritesh Harjani Dec. 21, 2016, 5:22 p.m. UTC | #2
Hi,

I may have some silly queries here. Please bear with my little 
understanding on blk-mq.

On 12/20/2016 7:31 PM, Linus Walleij wrote:
> HACK ALERT: DO NOT MERGE THIS! IT IS A FYI PATCH FOR DISCUSSION
> ONLY.
>
> This hack switches the MMC/SD subsystem from using the legacy blk
> layer to using blk-mq. It does this by registering one single
> hardware queue, since MMC/SD has only one command pipe. I kill
Could you please confirm on this- does even the HW/SW CMDQ in emmc would 
use only 1 hardware queue with (say ~31) as queue depth, of that HW 
queue? Is this understanding correct?

Or will it be possible to have more than 1 HW Queue with lesser queue 
depth per HW queue?


> off the worker thread altogether and let the MQ core logic fire
> sleepable requests directly into the MMC core.
>
> We emulate the 2 elements deep pipeline by specifying queue depth
> 2, which is an elaborate lie that makes the block layer issue
> another request while a previous request is in transit. It't not
> neat but it works.
>
> As the pipeline needs to be flushed by pushing in a NULL request
> after the last block layer request I added a delayed work with a
> timeout of zero. This will fire as soon as the block layer stops
> pushing in requests: as long as there are new requests the MQ
> block layer will just repeatedly cancel this pipeline flush work
> and push new requests into the pipeline, but once the requests
> stop coming the NULL request will be flushed into the pipeline.
>
> It's not pretty but it works... Look at the following performance
> statistics:
I understand that the block drivers are moving to blk-mq framework.
But keeping that reason apart, do we also anticipate any theoretical 
performance gains in moving mmc driver to blk-mq framework  - for both 
in case of legacy emmc, and SW/HW CMDQ in emmc ? And by how much?

It would be even better to know if adding of scheduler to blk-mq will 
make any difference in perf gains or not in this case?

Do we any rough estimate or study on that?
This is only out of curiosity and for information purpose.

Regards
Ritesh

>
> BEFORE this patch:
>
> time dd if=/dev/mmcblk0 of=/dev/null bs=1M count=1024
> 1024+0 records in
> 1024+0 records out
> 1073741824 bytes (1.0GB) copied, 45.145874 seconds, 22.7MB/s
> real    0m 45.15s
> user    0m 0.02s
> sys     0m 7.51s
>
> mount /dev/mmcblk0p1 /mnt/
> cd /mnt/
> time find . > /dev/null
> real    0m 3.70s
> user    0m 0.29s
> sys     0m 1.63s
>
> AFTER this patch:
>
> time dd if=/dev/mmcblk0 of=/dev/null bs=1M count=1024
> 1024+0 records in
> 1024+0 records out
> 1073741824 bytes (1.0GB) copied, 45.285431 seconds, 22.6MB/s
> real    0m 45.29s
> user    0m 0.02s
> sys     0m 6.58s
>
> mount /dev/mmcblk0p1 /mnt/
> cd /mnt/
> time find . > /dev/null
> real    0m 4.37s
> user    0m 0.27s
> sys     0m 1.65s
>
> The results are consistent.
>
> As you can see, for a straight dd-like task, we get more or less the
> same nice parallelism as for the old framework. I have confirmed
> through debugprints that indeed this is because the two-stage pipeline
> is full at all times.
>
> However, for spurious reads in the find command, we already see a big
> performance regression.
>
> This is because there are many small operations requireing a flush of
> the pipeline, which used to happen immediately with the old block
> layer interface code that used to pull a few NULL requests off the
> queue and feed them into the pipeline immediately after the last
> request, but happens after the delayed work is executed in this
> new framework. The delayed work is never quick enough to terminate
> all these small operations even if we schedule it immediately after
> the last request.
>
> AFAICT the only way forward to provide proper performance with MQ
> for MMC/SD is to get the requests to complete out-of-sync, i.e. when
> the driver calls back to MMC/SD core to notify that a request is
> complete, it should not notify any main thread with a completion
> as is done right now, but instead directly call blk_end_request_all()
> and only schedule some extra communication with the card if necessary
> for example to handle an error condition.
>
> This rework needs a bigger rewrite so we can get rid of the paradigm
> of the block layer "driving" the requests throgh the pipeline.
>
> Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Linus Walleij Dec. 27, 2016, 12:21 p.m. UTC | #3
On Wed, Dec 21, 2016 at 6:22 PM, Ritesh Harjani <riteshh@codeaurora.org> wrote:

> I may have some silly queries here. Please bear with my little understanding
> on blk-mq.

It's OK we need to build consensus.

> On 12/20/2016 7:31 PM, Linus Walleij wrote:

>> This hack switches the MMC/SD subsystem from using the legacy blk
>> layer to using blk-mq. It does this by registering one single
>> hardware queue, since MMC/SD has only one command pipe. I kill
>
> Could you please confirm on this- does even the HW/SW CMDQ in emmc would use
> only 1 hardware queue with (say ~31) as queue depth, of that HW queue? Is
> this understanding correct?

Yes as far as I can tell.

But you may have to tell me, because I'm not an expert in CMDQ.

Multiple queues are for when you can issue different request truly parallel
without taking any previous and later request into account. CMDQ on
MMC seems to require rollback etc if any of the issued requests after
a certain request fail, and then it is essentially one queue, like a pipeline,
and if one request fails all requests after that request needs to be backed
out, correct?

> Or will it be possible to have more than 1 HW Queue with lesser queue depth
> per HW queue?

Depends on the above.

Each queue must have its own error handling, and work isolated from
the other queues to be considered a real hardware queue.

If the requests have dependencies, like referring each other, or
as pointed out, needing to be cancelled if there is an error on a totally
different request, it is just a deep pipeline, single hardware queue.

> I understand that the block drivers are moving to blk-mq framework.
> But keeping that reason apart, do we also anticipate any theoretical
> performance gains in moving mmc driver to blk-mq framework  - for both in
> case of legacy emmc, and SW/HW CMDQ in emmc ? And by how much?

On the contrary we expect a performance regression as mq has no
scheduling. MQ is created for the usecase where you have multiple
hardware queues and they are so hungry for work that you have a problem
feeding them all. Needless to say, on eMMC/SD we don't have that problem
right now atleast.

> It would be even better to know if adding of scheduler to blk-mq will make
> any difference in perf gains or not in this case?

The tentative plan as I see it is to shunt in BFQ as the default scheduler
for MQ in the single-hw-queue case. The old block layer schedulers getting
deprecated in the process. But this is really up to the block layer developers.

> Do we any rough estimate or study on that?
> This is only out of curiosity and for information purpose.

No it is a venture into the unknown to go where no man has gone before.

I just have a good feeling about this and confidence that it will work out.

So I am doing RFD patches like this one to see if I'm right.

Yours,
Linus Walleij
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ritesh Harjani Dec. 28, 2016, 4:21 a.m. UTC | #4
Hi Linus,

Thanks for getting back on this.

On 12/27/2016 5:51 PM, Linus Walleij wrote:
> On Wed, Dec 21, 2016 at 6:22 PM, Ritesh Harjani <riteshh@codeaurora.org> wrote:
>
>> I may have some silly queries here. Please bear with my little understanding
>> on blk-mq.
>
> It's OK we need to build consensus.
>
>> On 12/20/2016 7:31 PM, Linus Walleij wrote:
>
>>> This hack switches the MMC/SD subsystem from using the legacy blk
>>> layer to using blk-mq. It does this by registering one single
>>> hardware queue, since MMC/SD has only one command pipe. I kill
>>
>> Could you please confirm on this- does even the HW/SW CMDQ in emmc would use
>> only 1 hardware queue with (say ~31) as queue depth, of that HW queue? Is
>> this understanding correct?
>
> Yes as far as I can tell.
Ok.

>
> But you may have to tell me, because I'm not an expert in CMDQ.
>
> Multiple queues are for when you can issue different request truly parallel
> without taking any previous and later request into account. CMDQ on
> MMC seems to require rollback etc if any of the issued requests after
> a certain request fail, and then it is essentially one queue, like a pipeline,
> and if one request fails all requests after that request needs to be backed
> out, correct?
This depends. There is a command CMD48 which can knock off the error'd 
out task from the queue. But in case if the reset of controller and card 
is required(for some error) then yes, all requests in queue needs to be 
re-queued back and removed from the card queue.

Also there are certain CMDs which can be *only* issued while the queue 
of emmc is empty. Otherwise they will be treated as illegal commands.
So yes, there is a dependency in previously issued requests - which 
means (as you said) that for blk-mq we should consider in HW/SW CMDQ as 
1 HW queue with certain qdepth advertised by card.


>
>> Or will it be possible to have more than 1 HW Queue with lesser queue depth
>> per HW queue?
>
> Depends on the above.
>
> Each queue must have its own error handling, and work isolated from
> the other queues to be considered a real hardware queue.
>
> If the requests have dependencies, like referring each other, or
> as pointed out, needing to be cancelled if there is an error on a totally
> different request, it is just a deep pipeline, single hardware queue.
Yes, thanks for explaining. Agree.

>
>> I understand that the block drivers are moving to blk-mq framework.
>> But keeping that reason apart, do we also anticipate any theoretical
>> performance gains in moving mmc driver to blk-mq framework  - for both in
>> case of legacy emmc, and SW/HW CMDQ in emmc ? And by how much?
>
> On the contrary we expect a performance regression as mq has no
> scheduling. MQ is created for the usecase where you have multiple
> hardware queues and they are so hungry for work that you have a problem
> feeding them all. Needless to say, on eMMC/SD we don't have that problem
> right now atleast.
Ok. Could you please elaborate how a regression?

 From very little understanding on blk-mq, I see that it does have 
plugging mechanism in place. So merging wont be a problem for most of 
the use cases right?
So are you referring to priority given to sync requests v/s async or 
idle type requests, which were taken care in CFQ ?
For my own understanding, I would like to understand under what 
scenarios or why we should expect a performance regression in blk-mq for 
mmc driver without blk-mq scheduling?


>
>> It would be even better to know if adding of scheduler to blk-mq will make
>> any difference in perf gains or not in this case?
>
> The tentative plan as I see it is to shunt in BFQ as the default scheduler
> for MQ in the single-hw-queue case. The old block layer schedulers getting
> deprecated in the process. But this is really up to the block layer developers.
Ok. Yes, I do see blk-mq scheduling patches on the mailing lists.

>
>> Do we any rough estimate or study on that?
>> This is only out of curiosity and for information purpose.
>
> No it is a venture into the unknown to go where no man has gone before.
>
> I just have a good feeling about this and confidence that it will work out.
>
> So I am doing RFD patches like this one to see if I'm right.
Thanks!!


Regards
Ritesh

>
> Yours,
> Linus Walleij
>
Christoph Hellwig Dec. 28, 2016, 8:55 a.m. UTC | #5
On Tue, Dec 27, 2016 at 01:21:28PM +0100, Linus Walleij wrote:
> > Could you please confirm on this- does even the HW/SW CMDQ in emmc would use
> > only 1 hardware queue with (say ~31) as queue depth, of that HW queue? Is
> > this understanding correct?
> 
> Yes as far as I can tell.
> 
> But you may have to tell me, because I'm not an expert in CMDQ.
> 
> Multiple queues are for when you can issue different request truly parallel
> without taking any previous and later request into account. CMDQ on
> MMC seems to require rollback etc if any of the issued requests after
> a certain request fail, and then it is essentially one queue, like a pipeline,
> and if one request fails all requests after that request needs to be backed
> out, correct?

I'm not an expert on the MMC CMDQ feature, but I know blk-mq pretty
well, so based on that and the decription of the series CMDQ finally
allows MMC to implement one single queue almost properly, but certainly
not multiple queues.

In block storage terms a queue is something where the OS can send
multiple commands to the device and let the device operate on them in
parallel (or at least pseudo-parallel).  Even with the MMC CMDQ feature
is seems after queueing up command the host still needs to control
execution ordering by talking to the hardware again for each command,
which isn't up to par with what other standards have been doing for the
last decades.  It's certainly not support for multiple queues, which
is defined as hardware features where the host allows multiple issuers
to use queue entirely independently of each other.

> Each queue must have its own error handling, and work isolated from
> the other queues to be considered a real hardware queue.

Note that error handling actually is one of the areas where queues
don't actually operate entirely independently once you escalate high
enough (device / controller reset).  For modern-ish protocols like
SCSI or NVMe the first level of error handling (abort) actually doesn't
involve the queue at all at the protocol, just the failed command.
But once that fails we quickly escalate to a reset, which involves
more than the queue.

> If the requests have dependencies, like referring each other, or
> as pointed out, needing to be cancelled if there is an error on a totally
> different request, it is just a deep pipeline, single hardware queue.

Strictly speaking it's not even a a real hardware queue in that case,
but with enough workarounds in the driver we can probably make it look
like a queue at least.  Certainly not multiple queues, though.

> On the contrary we expect a performance regression as mq has no
> scheduling. MQ is created for the usecase where you have multiple
> hardware queues and they are so hungry for work that you have a problem
> feeding them all. Needless to say, on eMMC/SD we don't have that problem
> right now atleast.

That's not entirely correct.  blk-mq is designed to replace the legacy
request code eventually.  The focus is on not wasting CPU cycles, and
to support multiple queues (but not require them).  Sequential workloads
should always be as fast as the legacy path and use less CPU cycles,
for random workloads we might have to wait for I/O scheduler support,
which is under way now:

    http://git.kernel.dk/cgit/linux-block/log/?h=blk-mq-sched

All that assumes a properly converted driver, which as seen by your 
experiments isn't easy for MMC as it's a very convoluted beast thanks
the hardware interface which isn't up to the standards we expect from
block storage protocols.
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Linus Walleij Dec. 28, 2016, 11:59 p.m. UTC | #6
On Wed, Dec 28, 2016 at 9:55 AM, Christoph Hellwig <hch@lst.de> wrote:
> On Tue, Dec 27, 2016 at 01:21:28PM +0100, Linus Walleij wrote:

>> On the contrary we expect a performance regression as mq has no
>> scheduling. MQ is created for the usecase where you have multiple
>> hardware queues and they are so hungry for work that you have a problem
>> feeding them all. Needless to say, on eMMC/SD we don't have that problem
>> right now atleast.
>
> That's not entirely correct.  blk-mq is designed to replace the legacy
> request code eventually.  The focus is on not wasting CPU cycles, and
> to support multiple queues (but not require them).

OK! Performance is paramount, so this indeed confirms that we need
to re-engineer the MMC/SD stack to not rely on this kthread to "drive"
transactions, instead we need to complete them quickly from the driver
callbacks and let MQ drive.

A problem here is that issueing the requests are in blocking context
while completion is in IRQ context (for most drivers) so we need to
look into this.

>  Sequential workloads
> should always be as fast as the legacy path and use less CPU cycles,

That seems more or less confirmed by my dd-test in the commit
message. sys time is really small with the simple time+dd tests.

> for random workloads we might have to wait for I/O scheduler support,
> which is under way now:
>
>     http://git.kernel.dk/cgit/linux-block/log/?h=blk-mq-sched

Awesome.

> All that assumes a properly converted driver, which as seen by your
> experiments isn't easy for MMC as it's a very convoluted beast thanks
> the hardware interface which isn't up to the standards we expect from
> block storage protocols.

I think we can hash it out, we just need to rewrite the MMC/SD
core request handling a bit.

Yours,
Linus Walleij
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Arnd Bergmann Jan. 2, 2017, 9:40 a.m. UTC | #7
On Thursday, December 29, 2016 12:59:51 AM CET Linus Walleij wrote:
> On Wed, Dec 28, 2016 at 9:55 AM, Christoph Hellwig <hch@lst.de> wrote:
> > On Tue, Dec 27, 2016 at 01:21:28PM +0100, Linus Walleij wrote:
> 
> >> On the contrary we expect a performance regression as mq has no
> >> scheduling. MQ is created for the usecase where you have multiple
> >> hardware queues and they are so hungry for work that you have a problem
> >> feeding them all. Needless to say, on eMMC/SD we don't have that problem
> >> right now atleast.
> >
> > That's not entirely correct.  blk-mq is designed to replace the legacy
> > request code eventually.  The focus is on not wasting CPU cycles, and
> > to support multiple queues (but not require them).
> 
> OK! Performance is paramount, so this indeed confirms that we need
> to re-engineer the MMC/SD stack to not rely on this kthread to "drive"
> transactions, instead we need to complete them quickly from the driver
> callbacks and let MQ drive.
> 
> A problem here is that issueing the requests are in blocking context
> while completion is in IRQ context (for most drivers) so we need to
> look into this.

I think whether issuing an mmc request requires blocking should ideally
be up to the host device driver, at least I'd hope that we can end up
with something like this driving latency to the absolute minimum:

a) with MMC CMDQ support:
  - report actual queue depth
  - have blk-mq push requests directly into the device through
    the mmc-block driver and the host driver
  - if a host driver needs to block, make it use a private workqueue

b) without MMC CMDQ support:
  - report queue depth of '2'
  - first request gets handled as above
  - if one request is pending, prepare the second request and
    add a pointer to the mmc host structure (not that different
    from what we do today)
  - when the host driver completes a request, have it immediately
    issue the next one from the interrupt handler. In case we need
    to sleep here, use a threaded IRQ, or a workqueue. This should
    avoid the need for the NULL requests

c) possible optimization for command packing without CMDQ:
   - similar to b)
   - report a longer queue (e.g. 8, maybe user selectable, to
     balance throughput against latency)
   - any request of the same type (read or write) as the one that
     is currently added to the host as the 'next' one can be
     added to that request so they get issued together
   - if the types are different, report the queue to be busy

	Arnd
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Hannes Reinecke Jan. 2, 2017, 11:06 a.m. UTC | #8
On 01/02/2017 10:40 AM, Arnd Bergmann wrote:
> On Thursday, December 29, 2016 12:59:51 AM CET Linus Walleij wrote:
>> On Wed, Dec 28, 2016 at 9:55 AM, Christoph Hellwig <hch@lst.de> wrote:
>>> On Tue, Dec 27, 2016 at 01:21:28PM +0100, Linus Walleij wrote:
>>
>>>> On the contrary we expect a performance regression as mq has no
>>>> scheduling. MQ is created for the usecase where you have multiple
>>>> hardware queues and they are so hungry for work that you have a problem
>>>> feeding them all. Needless to say, on eMMC/SD we don't have that problem
>>>> right now atleast.
>>>
>>> That's not entirely correct.  blk-mq is designed to replace the legacy
>>> request code eventually.  The focus is on not wasting CPU cycles, and
>>> to support multiple queues (but not require them).
>>
>> OK! Performance is paramount, so this indeed confirms that we need
>> to re-engineer the MMC/SD stack to not rely on this kthread to "drive"
>> transactions, instead we need to complete them quickly from the driver
>> callbacks and let MQ drive.
>>
>> A problem here is that issueing the requests are in blocking context
>> while completion is in IRQ context (for most drivers) so we need to
>> look into this.
> 
> I think whether issuing an mmc request requires blocking should ideally
> be up to the host device driver, at least I'd hope that we can end up
> with something like this driving latency to the absolute minimum:
> 
> a) with MMC CMDQ support:
>   - report actual queue depth
>   - have blk-mq push requests directly into the device through
>     the mmc-block driver and the host driver
>   - if a host driver needs to block, make it use a private workqueue
> 
> b) without MMC CMDQ support:
>   - report queue depth of '2'
>   - first request gets handled as above
>   - if one request is pending, prepare the second request and
>     add a pointer to the mmc host structure (not that different
>     from what we do today)
>   - when the host driver completes a request, have it immediately
>     issue the next one from the interrupt handler. In case we need
>     to sleep here, use a threaded IRQ, or a workqueue. This should
>     avoid the need for the NULL requests
> 
> c) possible optimization for command packing without CMDQ:
>    - similar to b)
>    - report a longer queue (e.g. 8, maybe user selectable, to
>      balance throughput against latency)
>    - any request of the same type (read or write) as the one that
>      is currently added to the host as the 'next' one can be
>      added to that request so they get issued together
>    - if the types are different, report the queue to be busy
> 
Hmm. But that would amount to implement yet another queuing mechanism
within the driver/mmc subsystem, wouldn't it?

Which is, incidentally, the same method the S/390 DASD driver uses
nowadays; report an arbitrary queue depth to the block layer and queue
all requests internally to better saturate the device.

However I'd really like to get rid of this, and tweak the block layer to
handle these cases.

One idea I had was to use a 'prep' function for mq; it would be executed
once the request is added to the queue.
Key point here is that 'prep' and 'queue_rq' would be two distinct
steps; the driver could do all required setup functionality during
'prep', and then do the actual submission via 'queue_rq'.
That would allow to better distribute the load for timing-sensitive
devices, and with a bit of luck remove the need for a separate queuing
inside the driver.

In any case, it looks like a proper subject for LSF/MM...
Linus? Arnd? Are you up for it?

Cheers,

Hannes
Bart Van Assche Jan. 2, 2017, 11:55 a.m. UTC | #9
On Thu, 2016-12-29 at 00:59 +0100, Linus Walleij wrote:
> A problem here is that issueing the requests are in blocking context

> while completion is in IRQ context (for most drivers) so we need to

> look into this.


Hello Linus,

Although I'm not sure whether I understood you correctly: are you familiar
with the request queue flag BLK_MQ_F_BLOCKING, a flag that was introduced
for the nbd driver?

Bart.
Bart Van Assche Jan. 2, 2017, 11:58 a.m. UTC | #10
On Mon, 2017-01-02 at 12:06 +0100, Hannes Reinecke wrote:
> Hmm. But that would amount to implement yet another queuing mechanism

> within the driver/mmc subsystem, wouldn't it?

> 

> Which is, incidentally, the same method the S/390 DASD driver uses

> nowadays; report an arbitrary queue depth to the block layer and queue

> all requests internally to better saturate the device.

> 

> However I'd really like to get rid of this, and tweak the block layer to

> handle these cases.


Hello Hannes,

Such functionality will be added to the blk-mq core as part of the
initiative to add blk-mq I/O scheduling support. One implementation has been
proposed by Jens (http://git.kernel.dk/cgit/linux-block/log/?h=blk-mq-sched)
and another implementation has been proposed by myself (https://lkml.org/lkm
l/2016/12/22/322).

Bart.
Adrian Hunter Jan. 2, 2017, 1:56 p.m. UTC | #11
On 28/12/16 10:55, Christoph Hellwig wrote:
> On Tue, Dec 27, 2016 at 01:21:28PM +0100, Linus Walleij wrote:
>>> Could you please confirm on this- does even the HW/SW CMDQ in emmc would use
>>> only 1 hardware queue with (say ~31) as queue depth, of that HW queue? Is
>>> this understanding correct?
>>
>> Yes as far as I can tell.
>>
>> But you may have to tell me, because I'm not an expert in CMDQ.
>>
>> Multiple queues are for when you can issue different request truly parallel
>> without taking any previous and later request into account. CMDQ on
>> MMC seems to require rollback etc if any of the issued requests after
>> a certain request fail, and then it is essentially one queue, like a pipeline,
>> and if one request fails all requests after that request needs to be backed
>> out, correct?
> 
> I'm not an expert on the MMC CMDQ feature, but I know blk-mq pretty
> well, so based on that and the decription of the series CMDQ finally
> allows MMC to implement one single queue almost properly, but certainly
> not multiple queues.
> 
> In block storage terms a queue is something where the OS can send
> multiple commands to the device and let the device operate on them in
> parallel (or at least pseudo-parallel).  Even with the MMC CMDQ feature
> is seems after queueing up command the host still needs to control
> execution ordering by talking to the hardware again for each command,
> which isn't up to par with what other standards have been doing for the
> last decades.  It's certainly not support for multiple queues, which
> is defined as hardware features where the host allows multiple issuers
> to use queue entirely independently of each other.

eMMC can only do one thing at a time.  However when the host controller has
a command queue engine (CQE), then the engine takes care of executing tasks
in the queue, so the driver only needs to queue transfers and handle
completions.

Unfortunately there is a complication.  EMMC can be blocked for a number of
reasons:

First, eMMC has a number of internal partitions that are represented as
disks with their own (block-layer) queues.  Because eMMC can only do one
thing at a time, the queues must be synchronized with each other, and there
is a switch command that must be issued to switch the eMMC between internal
partitions.  Currently that synchronization is provided by "claiming" the
host controller.  A queue (mmc_queue_thread) is blocked until it claims the
host.

Secondly, MMC block driver has an IOCTL interface that bypasses the queue,
and consequently also needs to be synchronized - again currently by claiming
the host.  There is also a proposed RPMB interface that works similarly.

Thirdly, discards are not (properly) supported by the CQE or (at all by)
eMMC CMDQ.  A discard (or erase) requires 3 commands to be sent in
succession.  Even if the CQE supports direct commands (DCMD) as described by
the Command Queue Host Controller (CQHCI) specification, only one DCMD can
be issued at a time.  That means no new reads/writes can be issued while the
first 2 commands are being issued, and no new discards can be issued until
the previous one completes.  However it is possible there are CQE that don't
support DCMD anyway - which would just mean the queue is halted/blocked
until the discard completes.

Fourthly, eMMC/SD make use of runtime PM for both the host controller and
the card.  The card might require a full reinitialization sequence to be
resumed.

Fifthly, although it is a minor point, the CQHCI spec specifies that the CQE
must be halted to handle any error, and indeed the CQE must be halted to
send commands to perform recovery anyway.

Sixthly, there are debugfs files that issue commands and are also
synchronized by claiming the host.

The problem with blocking is that it leaves up to queue_depth requests
sitting in a (local or dispatch) list unavailable for merging or
prioritizing.  That won't matter if the queue_depth is 2, but it might if
the queue_depth is the eMMC CMDQ maximum of 32.

If we assume it doesn't matter to have up to queue_depth requests
unavailable for merging or prioritizing, then a switch to blk-mq can be done
most simply by having ->queue_rq() put requests on a list, and have the
existing mmc_queue_thread() process them the same way it does now.  One
disadvantage of that approach is that the mmc_queue_thread() would not get
woken until ->queue_rq() is called which is done by scheduled work for async
requests.

If it does matter that all requests are still available for merging or
prioritizing when the queue is blocked, then AFAICT mmc will need more
support from blk-mq.

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Arnd Bergmann Jan. 2, 2017, 5:08 p.m. UTC | #12
On Monday, January 2, 2017 12:06:04 PM CET Hannes Reinecke wrote:
> On 01/02/2017 10:40 AM, Arnd Bergmann wrote:
> > b) without MMC CMDQ support:
> >   - report queue depth of '2'
> >   - first request gets handled as above
> >   - if one request is pending, prepare the second request and
> >     add a pointer to the mmc host structure (not that different
> >     from what we do today)
> >   - when the host driver completes a request, have it immediately
> >     issue the next one from the interrupt handler. In case we need
> >     to sleep here, use a threaded IRQ, or a workqueue. This should
> >     avoid the need for the NULL requests
> > 
> > c) possible optimization for command packing without CMDQ:
> >    - similar to b)
> >    - report a longer queue (e.g. 8, maybe user selectable, to
> >      balance throughput against latency)
> >    - any request of the same type (read or write) as the one that
> >      is currently added to the host as the 'next' one can be
> >      added to that request so they get issued together
> >    - if the types are different, report the queue to be busy
> > 
> Hmm. But that would amount to implement yet another queuing mechanism
> within the driver/mmc subsystem, wouldn't it?
> 
> Which is, incidentally, the same method the S/390 DASD driver uses
> nowadays; report an arbitrary queue depth to the block layer and queue
> all requests internally to better saturate the device.
> 
> However I'd really like to get rid of this, and tweak the block layer to
> handle these cases.
> 
> One idea I had was to use a 'prep' function for mq; it would be executed
> once the request is added to the queue.
> Key point here is that 'prep' and 'queue_rq' would be two distinct
> steps; the driver could do all required setup functionality during
> 'prep', and then do the actual submission via 'queue_rq'.

Right, I had the same idea and I think even talked to you about that.
I think this would address case 'b)' above perfectly, but I don't
see how we can fit case 'c)' in that scheme.

Maybe if we could teach blk_mq that a device might be able to not
just merge consecutive requests but also a limited set of
non-consecutive requests in a way that MMC needs them? Unfortunately,
the 'packed command' is rather MMC specific, and it might even
be worthwhile to do some optimization regarding what commands
get packed (e.g. only writes, only small requests, or only up to
a total size).

While there are some parallels to DASD, but it's not completely
the same, even if we ignore the command packing.

- MMC wants the 'prepare' stage to reduce the latency from
  dma_map_sg() calls. This is actually more related to the
  CPU/SoC architecture and really important on most ARM systems
  as they are not cache coherent, but it's not just specific
  to MMC other than MMC performance on low-end ARM being
  really important to a lot of people since it's used in all
  Android phones.
  If we could find a way to get blk_mq to call blk_rq_map_sg()
  for us, we wouldn't need a ->prepare() step or a fake queue
  for case 'b)', and we could make use of that in other drivers
  too.

- DASD in contrast wants to build the channel programs while
  other I/O is running, and this is very specific to that
  driver. There are probably some other things it does in its
  own queue implementation that are also driver specific.

> That would allow to better distribute the load for timing-sensitive
> devices, and with a bit of luck remove the need for a separate queuing
> inside the driver.
> 
> In any case, it looks like a proper subject for LSF/MM...
> Linus? Arnd? Are you up for it?

I'm probably not going to work on it myself, but I think it would
be great for Linus and/or Ulf to bring this up at LSF/MM.

	Arnd
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Valente Jan. 3, 2017, 7:50 a.m. UTC | #13
> Il giorno 02 gen 2017, alle ore 18:08, Arnd Bergmann <arnd@arndb.de> ha scritto:
> 
> On Monday, January 2, 2017 12:06:04 PM CET Hannes Reinecke wrote:
>> On 01/02/2017 10:40 AM, Arnd Bergmann wrote:
>>> b) without MMC CMDQ support:
>>>  - report queue depth of '2'
>>>  - first request gets handled as above
>>>  - if one request is pending, prepare the second request and
>>>    add a pointer to the mmc host structure (not that different
>>>    from what we do today)
>>>  - when the host driver completes a request, have it immediately
>>>    issue the next one from the interrupt handler. In case we need
>>>    to sleep here, use a threaded IRQ, or a workqueue. This should
>>>    avoid the need for the NULL requests
>>> 
>>> c) possible optimization for command packing without CMDQ:
>>>   - similar to b)
>>>   - report a longer queue (e.g. 8, maybe user selectable, to
>>>     balance throughput against latency)
>>>   - any request of the same type (read or write) as the one that
>>>     is currently added to the host as the 'next' one can be
>>>     added to that request so they get issued together
>>>   - if the types are different, report the queue to be busy
>>> 
>> Hmm. But that would amount to implement yet another queuing mechanism
>> within the driver/mmc subsystem, wouldn't it?
>> 
>> Which is, incidentally, the same method the S/390 DASD driver uses
>> nowadays; report an arbitrary queue depth to the block layer and queue
>> all requests internally to better saturate the device.
>> 
>> However I'd really like to get rid of this, and tweak the block layer to
>> handle these cases.
>> 
>> One idea I had was to use a 'prep' function for mq; it would be executed
>> once the request is added to the queue.
>> Key point here is that 'prep' and 'queue_rq' would be two distinct
>> steps; the driver could do all required setup functionality during
>> 'prep', and then do the actual submission via 'queue_rq'.
> 
> Right, I had the same idea and I think even talked to you about that.
> I think this would address case 'b)' above perfectly, but I don't
> see how we can fit case 'c)' in that scheme.
> 
> Maybe if we could teach blk_mq that a device might be able to not
> just merge consecutive requests but also a limited set of
> non-consecutive requests in a way that MMC needs them? Unfortunately,
> the 'packed command' is rather MMC specific, and it might even
> be worthwhile to do some optimization regarding what commands
> get packed (e.g. only writes, only small requests, or only up to
> a total size).
> 
> While there are some parallels to DASD, but it's not completely
> the same, even if we ignore the command packing.
> 
> - MMC wants the 'prepare' stage to reduce the latency from
>  dma_map_sg() calls. This is actually more related to the
>  CPU/SoC architecture and really important on most ARM systems
>  as they are not cache coherent, but it's not just specific
>  to MMC other than MMC performance on low-end ARM being
>  really important to a lot of people since it's used in all
>  Android phones.
>  If we could find a way to get blk_mq to call blk_rq_map_sg()
>  for us, we wouldn't need a ->prepare() step or a fake queue
>  for case 'b)', and we could make use of that in other drivers
>  too.
> 
> - DASD in contrast wants to build the channel programs while
>  other I/O is running, and this is very specific to that
>  driver. There are probably some other things it does in its
>  own queue implementation that are also driver specific.
> 

Sorry for the noise, but, assuming that an ignorant point of view
might have some value, exactly because it ignores details, here is my
point of view.  IMO the cleanest design would be the one where blk-mq
does only the job it has been designed for, i.e., pushes requests into
queues, and the driver takes care of the idiosyncrasies of the
device.  Concretely, the driver could
1) advertise a long queue (e.g., 32) to be constantly fed with a
large window of requests;
2) not pass requests immediately to the device, but keep them as long
as needed, before finally handing them to the device.

The driver could then use the window of requests, internally queued, to
perform exactly the operations that it now performs with the
collaboration of blk, such as command packing.  blk-mq would be
unchanged.  If I'm not mistaken, this would match, at least in part,
what some of you already proposed in more detail.

I apologize if I'm talking complete nonsense.

Paolo

>> That would allow to better distribute the load for timing-sensitive
>> devices, and with a bit of luck remove the need for a separate queuing
>> inside the driver.
>> 
>> In any case, it looks like a proper subject for LSF/MM...
>> Linus? Arnd? Are you up for it?
> 
> I'm probably not going to work on it myself, but I think it would
> be great for Linus and/or Ulf to bring this up at LSF/MM.
> 
> 	Arnd

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Linus Walleij Jan. 3, 2017, 12:57 p.m. UTC | #14
On Mon, Jan 2, 2017 at 10:40 AM, Arnd Bergmann <arnd@arndb.de> wrote:

> b) without MMC CMDQ support:
>   - report queue depth of '2'
>   - first request gets handled as above
>   - if one request is pending, prepare the second request and
>     add a pointer to the mmc host structure (not that different
>     from what we do today)
>   - when the host driver completes a request, have it immediately
>     issue the next one from the interrupt handler. In case we need
>     to sleep here, use a threaded IRQ, or a workqueue. This should
>     avoid the need for the NULL requests

This part we can do already today with the old block layer and I think
we (heh, I guess me) should do that as the first step.

After this migrating to blk-mq becomes much easier.

Yours,
Linus Walleij
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Linus Walleij Jan. 3, 2017, 12:59 p.m. UTC | #15
On Mon, Jan 2, 2017 at 12:55 PM, Bart Van Assche
<Bart.VanAssche@sandisk.com> wrote:
> On Thu, 2016-12-29 at 00:59 +0100, Linus Walleij wrote:
>> A problem here is that issueing the requests are in blocking context
>> while completion is in IRQ context (for most drivers) so we need to
>> look into this.
>
> Hello Linus,
>
> Although I'm not sure whether I understood you correctly: are you familiar
> with the request queue flag BLK_MQ_F_BLOCKING, a flag that was introduced
> for the nbd driver?

BLK_MQ_F_BLOCKING is what the patch set is currently using...

The problem I have is that request need to be *issued* in blocking
context and *completed* in fastpath/IRQ context.

Yours,
Linus Walleij
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Arnd Bergmann Jan. 3, 2017, 11:06 p.m. UTC | #16
On Tuesday, January 3, 2017 8:50:11 AM CET Paolo Valente wrote:
> 
> Sorry for the noise, but, assuming that an ignorant point of view
> might have some value, exactly because it ignores details, here is my
> point of view.  IMO the cleanest design would be the one where blk-mq
> does only the job it has been designed for, i.e., pushes requests into
> queues, and the driver takes care of the idiosyncrasies of the
> device.  Concretely, the driver could
> 1) advertise a long queue (e.g., 32) to be constantly fed with a
> large window of requests;
> 2) not pass requests immediately to the device, but keep them as long
> as needed, before finally handing them to the device.
> 
> The driver could then use the window of requests, internally queued, to
> perform exactly the operations that it now performs with the
> collaboration of blk, such as command packing.  blk-mq would be
> unchanged.  If I'm not mistaken, this would match, at least in part,
> what some of you already proposed in more detail.

I think we have to be careful not to trade latency in one place
for a bigger latency in another place.

Right now, the handoff between blk and mmc introduces an inherent
latency in some cases, and blk-mq was designed to avoid that by
allowing minimizing the number of cpu cycles between the file
system and the hardware, as well as minimizing the depth of the
queue on the software side.

Having the driver do its own queuing (as dasd and mmc both do
today) is worse for both of the above, since you need extra
context switches for a single I/O with an empty queue, while
a full queue can add a lot of latency that is out of reach of
the I/O scheduler: a high-priority request can be moved ahead
of the block queue, but cannot bypass anything that is already
in a driver-internal queue.

The tradeoff in both cases is that we can prepare (build a
dasd channel program, or have dma_map_sg manage cache and
iommu) a new request while waiting for the previous one,
reducing the latency between two requests being worked on
by the hardware on devices without hardware queuing.

We clearly want to have both benefits as much as possible,
and having a ->prepare() callback is probably better here
than a longer private queue in the device driver.

With the packed commands on MMC, we have a degenerated queue,
as we would sometimes submit multiple blk requests together
as one MMC command. Here, advertizing a longer queue as I
described earlier may be the best option.

	Arnd
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c
index bab3f07b1117..308ab7838f0d 100644
--- a/drivers/mmc/core/block.c
+++ b/drivers/mmc/core/block.c
@@ -28,6 +28,7 @@ 
 #include <linux/hdreg.h>
 #include <linux/kdev_t.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/mutex.h>
 #include <linux/scatterlist.h>
 #include <linux/string_helpers.h>
@@ -90,7 +91,6 @@  static DEFINE_SPINLOCK(mmc_blk_lock);
  * There is one mmc_blk_data per slot.
  */
 struct mmc_blk_data {
-	spinlock_t	lock;
 	struct device	*parent;
 	struct gendisk	*disk;
 	struct mmc_queue queue;
@@ -1181,7 +1181,8 @@  static int mmc_blk_issue_discard_rq(struct mmc_queue *mq, struct request *req)
 		goto retry;
 	if (!err)
 		mmc_blk_reset_success(md, type);
-	blk_end_request(req, err, blk_rq_bytes(req));
+
+	blk_mq_complete_request(req, err);
 
 	return err ? 0 : 1;
 }
@@ -1248,7 +1249,7 @@  static int mmc_blk_issue_secdiscard_rq(struct mmc_queue *mq,
 	if (!err)
 		mmc_blk_reset_success(md, type);
 out:
-	blk_end_request(req, err, blk_rq_bytes(req));
+	blk_mq_complete_request(req, err);
 
 	return err ? 0 : 1;
 }
@@ -1263,7 +1264,8 @@  static int mmc_blk_issue_flush(struct mmc_queue *mq, struct request *req)
 	if (ret)
 		ret = -EIO;
 
-	blk_end_request_all(req, ret);
+	/* FIXME: was using blk_end_request_all() to flush */
+	blk_mq_complete_request(req, ret);
 
 	return ret ? 0 : 1;
 }
@@ -1585,10 +1587,12 @@  static int mmc_blk_cmd_err(struct mmc_blk_data *md, struct mmc_card *card,
 
 		blocks = mmc_sd_num_wr_blocks(card);
 		if (blocks != (u32)-1) {
-			ret = blk_end_request(req, 0, blocks << 9);
+			blk_mq_complete_request(req, 0);
+			ret = 0;
 		}
 	} else {
-		ret = blk_end_request(req, 0, brq->data.bytes_xfered);
+		blk_mq_complete_request(req, 0);
+		ret = 0;
 	}
 	return ret;
 }
@@ -1648,9 +1652,8 @@  static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc)
 			 */
 			mmc_blk_reset_success(md, type);
 
-			ret = blk_end_request(req, 0,
-					brq->data.bytes_xfered);
-
+			blk_mq_complete_request(req, 0);
+			ret = 0;
 			/*
 			 * If the blk_end_request function returns non-zero even
 			 * though all data has been transferred and no errors
@@ -1703,8 +1706,7 @@  static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc)
 			 * time, so we only reach here after trying to
 			 * read a single sector.
 			 */
-			ret = blk_end_request(req, -EIO,
-						brq->data.blksz);
+			blk_mq_complete_request(req, -EIO);
 			if (!ret)
 				goto start_new_req;
 			break;
@@ -1733,16 +1735,16 @@  static int mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *rqc)
 
  cmd_abort:
 	if (mmc_card_removed(card))
-		req->rq_flags |= RQF_QUIET;
-	while (ret)
-		ret = blk_end_request(req, -EIO,
-				blk_rq_cur_bytes(req));
+		req->cmd_flags |= RQF_QUIET;
+	blk_mq_complete_request(req, -EIO);
+	ret = 0;
 
  start_new_req:
 	if (rqc) {
 		if (mmc_card_removed(card)) {
 			rqc->rq_flags |= RQF_QUIET;
-			blk_end_request_all(rqc, -EIO);
+			/* FIXME: was blk_end_request_all() */
+			blk_mq_complete_request(rqc, -EIO);
 		} else {
 			mmc_blk_rw_rq_prep(mq->mqrq_cur, card, 0, mq);
 			mmc_start_req(card->host,
@@ -1766,9 +1768,9 @@  int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 
 	ret = mmc_blk_part_switch(card, md);
 	if (ret) {
-		if (req) {
-			blk_end_request_all(req, -EIO);
-		}
+		if (req)
+			/* FIXME: was blk_end_request_all() to flush */
+			blk_mq_complete_request(req, -EIO);
 		ret = 0;
 		goto out;
 	}
@@ -1859,11 +1861,10 @@  static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card,
 		goto err_kfree;
 	}
 
-	spin_lock_init(&md->lock);
 	INIT_LIST_HEAD(&md->part);
 	md->usage = 1;
 
-	ret = mmc_init_queue(&md->queue, card, &md->lock, subname);
+	ret = mmc_init_queue(&md->queue, card, subname);
 	if (ret)
 		goto err_putdisk;
 
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index a6496d8027bc..6914d0bff959 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -10,11 +10,12 @@ 
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/freezer.h>
-#include <linux/kthread.h>
 #include <linux/scatterlist.h>
 #include <linux/dma-mapping.h>
-
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
 #include <linux/mmc/card.h>
 #include <linux/mmc/host.h>
 
@@ -23,6 +24,12 @@ 
 
 #define MMC_QUEUE_BOUNCESZ	65536
 
+/* Per-request struct */
+struct mmc_block_cmd {
+	struct request *rq;
+	struct device *dev;
+};
+
 /*
  * Prepare a MMC request. This just filters out odd stuff.
  */
@@ -47,107 +54,6 @@  static int mmc_prep_request(struct request_queue *q, struct request *req)
 	return BLKPREP_OK;
 }
 
-static int mmc_queue_thread(void *d)
-{
-	struct mmc_queue *mq = d;
-	struct request_queue *q = mq->queue;
-	struct mmc_context_info *cntx = &mq->card->host->context_info;
-
-	current->flags |= PF_MEMALLOC;
-
-	down(&mq->thread_sem);
-	do {
-		struct request *req = NULL;
-
-		spin_lock_irq(q->queue_lock);
-		set_current_state(TASK_INTERRUPTIBLE);
-		req = blk_fetch_request(q);
-		mq->asleep = false;
-		cntx->is_waiting_last_req = false;
-		cntx->is_new_req = false;
-		if (!req) {
-			/*
-			 * Dispatch queue is empty so set flags for
-			 * mmc_request_fn() to wake us up.
-			 */
-			if (mq->mqrq_prev->req)
-				cntx->is_waiting_last_req = true;
-			else
-				mq->asleep = true;
-		}
-		mq->mqrq_cur->req = req;
-		spin_unlock_irq(q->queue_lock);
-
-		if (req || mq->mqrq_prev->req) {
-			bool req_is_special = mmc_req_is_special(req);
-
-			set_current_state(TASK_RUNNING);
-			mmc_blk_issue_rq(mq, req);
-			cond_resched();
-			if (mq->flags & MMC_QUEUE_NEW_REQUEST) {
-				mq->flags &= ~MMC_QUEUE_NEW_REQUEST;
-				continue; /* fetch again */
-			}
-
-			/*
-			 * Current request becomes previous request
-			 * and vice versa.
-			 * In case of special requests, current request
-			 * has been finished. Do not assign it to previous
-			 * request.
-			 */
-			if (req_is_special)
-				mq->mqrq_cur->req = NULL;
-
-			mq->mqrq_prev->brq.mrq.data = NULL;
-			mq->mqrq_prev->req = NULL;
-			swap(mq->mqrq_prev, mq->mqrq_cur);
-		} else {
-			if (kthread_should_stop()) {
-				set_current_state(TASK_RUNNING);
-				break;
-			}
-			up(&mq->thread_sem);
-			schedule();
-			down(&mq->thread_sem);
-		}
-	} while (1);
-	up(&mq->thread_sem);
-
-	return 0;
-}
-
-/*
- * Generic MMC request handler.  This is called for any queue on a
- * particular host.  When the host is not busy, we look for a request
- * on any queue on this host, and attempt to issue it.  This may
- * not be the queue we were asked to process.
- */
-static void mmc_request_fn(struct request_queue *q)
-{
-	struct mmc_queue *mq = q->queuedata;
-	struct request *req;
-	struct mmc_context_info *cntx;
-
-	if (!mq) {
-		while ((req = blk_fetch_request(q)) != NULL) {
-			req->rq_flags |= RQF_QUIET;
-			__blk_end_request_all(req, -EIO);
-		}
-		return;
-	}
-
-	cntx = &mq->card->host->context_info;
-
-	if (cntx->is_waiting_last_req) {
-		cntx->is_new_req = true;
-		wake_up_interruptible(&cntx->wait);
-	}
-
-	if (mq->asleep)
-		wake_up_process(mq->thread);
-}
-
 static struct scatterlist *mmc_alloc_sg(int sg_len, int *err)
 {
 	struct scatterlist *sg;
@@ -240,6 +146,127 @@  static int mmc_queue_alloc_sgs(struct mmc_queue *mq, int max_segs)
 	return 0;
 }
 
+static void mmc_queue_timeout(struct work_struct *work)
+{
+	struct mmc_queue *mq =
+		container_of(work, struct mmc_queue, work.work);
+
+	/*
+	 * After a timeout, if the block layer is not pushing
+	 * in more requests, flush the pipeline by issueing a
+	 * NULL request.
+	 */
+	mutex_lock(&mq->lock);
+
+	mq->mqrq_cur->req = NULL;
+	mmc_blk_issue_rq(mq, NULL);
+
+	mq->mqrq_prev->brq.mrq.data = NULL;
+	mq->mqrq_prev->req = NULL;
+	swap(mq->mqrq_prev, mq->mqrq_cur);
+
+	mutex_unlock(&mq->lock);
+}
+
+static int mmc_queue_rq(struct blk_mq_hw_ctx *hctx,
+		const struct blk_mq_queue_data *bd)
+{
+	struct mmc_block_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+	struct mmc_queue *mq = cmd->rq->q->queuedata;
+	struct request *req = bd->rq;
+	struct mmc_context_info *cntx;
+	bool req_is_special = mmc_req_is_special(req);
+
+	/* start this request */
+	blk_mq_start_request(req);
+
+	/*
+	 * If we are waiting for a flush timeout: cancel it. This new request
+	 * will flush the async pipeline.
+	 */
+	cancel_delayed_work_sync(&mq->work);
+
+	mutex_lock(&mq->lock);
+
+	/* Submit the request */
+	mq->mqrq_cur->req = req;
+	mmc_blk_issue_rq(mq, req);
+
+	if (mq->flags & MMC_QUEUE_NEW_REQUEST)
+		mq->flags &= ~MMC_QUEUE_NEW_REQUEST;
+
+	/*
+	 * Current request becomes previous request
+	 * and vice versa.
+	 *
+	 * In case of special requests, current request
+	 * has been finished. Do not assign it to previous
+	 * request.
+	 */
+	if (req_is_special)
+		mq->mqrq_cur->req = NULL;
+
+	mq->mqrq_prev->brq.mrq.data = NULL;
+	mq->mqrq_prev->req = NULL;
+	swap(mq->mqrq_prev, mq->mqrq_cur);
+
+	cntx = &mq->card->host->context_info;
+	if (cntx->is_waiting_last_req) {
+		cntx->is_new_req = true;
+		wake_up_interruptible(&cntx->wait);
+	}
+
+	mutex_unlock(&mq->lock);
+
+	/*
+	 * Bump the flush timer to expire per immediately.
+	 *
+	 * What happens is that if there is more work to be pushed in from
+	 * the block layer, then that happens before the delayed work has
+	 * a chance to run and this work gets cancelled quite immediately.
+	 * Else a short lapse passes and the work gets scheduled and will
+	 * then flush the async pipeline.
+	 */
+	schedule_delayed_work(&mq->work, 0);
+
+	return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static int mmc_init_request(void *data, struct request *rq,
+		unsigned int hctx_idx, unsigned int request_idx,
+		unsigned int numa_node)
+{
+	struct mmc_block_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	struct mmc_queue *mq = data;
+	struct mmc_card *card = mq->card;
+	struct mmc_host *host = card->host;
+
+	cmd->rq = rq;
+	cmd->dev = host->parent;
+
+	return 0;
+}
+
+static void mmc_exit_request(void *data, struct request *rq,
+			     unsigned int hctx_idx, unsigned int request_idx)
+{
+	struct mmc_block_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	dev_info(cmd->dev, "%s: hctx_idx = %u, request_idx = %u\n",
+		 __func__, hctx_idx, request_idx);
+}
+
+static struct blk_mq_ops mmc_mq_ops = {
+	.queue_rq	= mmc_queue_rq,
+	.init_request	= mmc_init_request,
+	.exit_request	= mmc_exit_request,
+	/*
+	 * .exit_request() will only be invoked if we explcitly call
+	 * blk_mq_end_request() on all requests. Why would we do that,
+	 * we will just call blk_mq_complete_request().
+	 */
+};
+
 static void mmc_queue_req_free_bufs(struct mmc_queue_req *mqrq)
 {
 	kfree(mqrq->bounce_sg);
@@ -270,7 +297,7 @@  static void mmc_queue_reqs_free_bufs(struct mmc_queue *mq)
  * Initialise a MMC card request queue.
  */
 int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
-		   spinlock_t *lock, const char *subname)
+		   const char *subname)
 {
 	struct mmc_host *host = card->host;
 	u64 limit = BLK_BOUNCE_HIGH;
@@ -281,10 +308,38 @@  int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
 		limit = (u64)dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT;
 
 	mq->card = card;
-	mq->queue = blk_init_queue(mmc_request_fn, lock);
-	if (!mq->queue)
+	mq->tag_set.ops = &mmc_mq_ops;
+	/* The MMC/SD protocols have only one command pipe */
+	mq->tag_set.nr_hw_queues = 1;
+	/* Set this to 2 to simulate async requests */
+	mq->tag_set.queue_depth = 2;
+	/*
+	 * The extra data allocated per block request.
+	 */
+	mq->tag_set.cmd_size = sizeof(struct mmc_block_cmd);
+	mq->tag_set.numa_node = NUMA_NO_NODE;
+	/* We use blocking requests */
+	mq->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
+	// BLK_MQ_F_SG_MERGE?
+	mq->tag_set.driver_data = mq;
+
+	ret = blk_mq_alloc_tag_set(&mq->tag_set);
+	if (ret) {
+		dev_err(card->host->parent, "failed to allocate MQ tag set\n");
 		return -ENOMEM;
+	}
+
+	mq->queue = blk_mq_init_queue(&mq->tag_set);
+	if (!mq->queue) {
+		dev_err(card->host->parent, "failed to initialize block MQ\n");
+		goto cleanup_free_tag_set;
+	}
+
+	blk_queue_max_segments(mq->queue, host->max_segs);
 
+	/* Set up the async request timeout timer */
+	mutex_init(&mq->lock);
+	INIT_DELAYED_WORK(&mq->work, mmc_queue_timeout);
 	mq->qdepth = 2;
 	mq->mqrq = kcalloc(mq->qdepth, sizeof(struct mmc_queue_req),
 			   GFP_KERNEL);
@@ -340,16 +395,6 @@  int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
 			goto cleanup_queue;
 	}
 
-	sema_init(&mq->thread_sem, 1);
-
-	mq->thread = kthread_run(mmc_queue_thread, mq, "mmcqd/%d%s",
-		host->index, subname ? subname : "");
-
-	if (IS_ERR(mq->thread)) {
-		ret = PTR_ERR(mq->thread);
-		goto cleanup_queue;
-	}
-
 	return 0;
 
  cleanup_queue:
@@ -358,30 +403,34 @@  int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card,
 	mq->mqrq = NULL;
 blk_cleanup:
 	blk_cleanup_queue(mq->queue);
+
+cleanup_free_tag_set:
+	blk_mq_free_tag_set(&mq->tag_set);
+
 	return ret;
 }
 
 void mmc_cleanup_queue(struct mmc_queue *mq)
 {
 	struct request_queue *q = mq->queue;
-	unsigned long flags;
 
 	/* Make sure the queue isn't suspended, as that will deadlock */
 	mmc_queue_resume(mq);
 
-	/* Then terminate our worker thread */
-	kthread_stop(mq->thread);
-
 	/* Empty the queue */
-	spin_lock_irqsave(q->queue_lock, flags);
 	q->queuedata = NULL;
 	blk_start_queue(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	/* Sync and kill the timer */
+	cancel_delayed_work_sync(&mq->work);
 
 	mmc_queue_reqs_free_bufs(mq);
 	kfree(mq->mqrq);
 	mq->mqrq = NULL;
 
+	blk_cleanup_queue(mq->queue);
+	blk_mq_free_tag_set(&mq->tag_set);
+
 	mq->card = NULL;
 }
 EXPORT_SYMBOL(mmc_cleanup_queue);
@@ -390,23 +439,18 @@  EXPORT_SYMBOL(mmc_cleanup_queue);
  * mmc_queue_suspend - suspend a MMC request queue
  * @mq: MMC queue to suspend
  *
- * Stop the block request queue, and wait for our thread to
- * complete any outstanding requests.  This ensures that we
+ * Stop the block request queue. This ensures that we
  * won't suspend while a request is being processed.
  */
 void mmc_queue_suspend(struct mmc_queue *mq)
 {
 	struct request_queue *q = mq->queue;
-	unsigned long flags;
 
+	/* FIXME: deal with worker queue */
 	if (!(mq->flags & MMC_QUEUE_SUSPENDED)) {
 		mq->flags |= MMC_QUEUE_SUSPENDED;
 
-		spin_lock_irqsave(q->queue_lock, flags);
 		blk_stop_queue(q);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-
-		down(&mq->thread_sem);
 	}
 }
 
@@ -417,16 +461,12 @@  void mmc_queue_suspend(struct mmc_queue *mq)
 void mmc_queue_resume(struct mmc_queue *mq)
 {
 	struct request_queue *q = mq->queue;
-	unsigned long flags;
 
+	/* FIXME: deal with worker queue */
 	if (mq->flags & MMC_QUEUE_SUSPENDED) {
 		mq->flags &= ~MMC_QUEUE_SUSPENDED;
 
-		up(&mq->thread_sem);
-
-		spin_lock_irqsave(q->queue_lock, flags);
 		blk_start_queue(q);
-		spin_unlock_irqrestore(q->queue_lock, flags);
 	}
 }
 
diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h
index dac8c3d010dd..b9b97bc16f8e 100644
--- a/drivers/mmc/core/queue.h
+++ b/drivers/mmc/core/queue.h
@@ -1,6 +1,9 @@ 
 #ifndef MMC_QUEUE_H
 #define MMC_QUEUE_H
 
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+
 static inline bool mmc_req_is_special(struct request *req)
 {
 	return req &&
@@ -10,7 +13,6 @@  static inline bool mmc_req_is_special(struct request *req)
 }
 
 struct request;
-struct task_struct;
 struct mmc_blk_data;
 
 struct mmc_blk_request {
@@ -34,22 +36,21 @@  struct mmc_queue_req {
 
 struct mmc_queue {
 	struct mmc_card		*card;
-	struct task_struct	*thread;
-	struct semaphore	thread_sem;
 	unsigned int		flags;
 #define MMC_QUEUE_SUSPENDED	(1 << 0)
 #define MMC_QUEUE_NEW_REQUEST	(1 << 1)
-	bool			asleep;
 	struct mmc_blk_data	*blkdata;
 	struct request_queue	*queue;
+	struct blk_mq_tag_set	tag_set;
 	struct mmc_queue_req	*mqrq;
 	struct mmc_queue_req	*mqrq_cur;
 	struct mmc_queue_req	*mqrq_prev;
 	int			qdepth;
+	struct mutex		lock;
+	struct delayed_work	work;
 };
 
-extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *,
-			  const char *);
+extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, const char *);
 extern void mmc_cleanup_queue(struct mmc_queue *);
 extern void mmc_queue_suspend(struct mmc_queue *);
 extern void mmc_queue_resume(struct mmc_queue *);