diff mbox

[5/7] blk-mq-sched: add framework for MQ capable IO schedulers

Message ID 1481228005-9245-6-git-send-email-axboe@fb.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jens Axboe Dec. 8, 2016, 8:13 p.m. UTC
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/Makefile           |   2 +-
 block/blk-core.c         |   9 +-
 block/blk-exec.c         |   3 +-
 block/blk-flush.c        |   7 +-
 block/blk-merge.c        |   3 +
 block/blk-mq-sched.c     | 246 +++++++++++++++++++++++++++++++++++++++++++++++
 block/blk-mq-sched.h     | 187 +++++++++++++++++++++++++++++++++++
 block/blk-mq-tag.c       |   1 +
 block/blk-mq.c           | 150 +++++++++++++++--------------
 block/blk-mq.h           |  34 +++----
 block/elevator.c         | 181 ++++++++++++++++++++++++++--------
 include/linux/blk-mq.h   |   2 +-
 include/linux/elevator.h |  29 +++++-
 13 files changed, 713 insertions(+), 141 deletions(-)
 create mode 100644 block/blk-mq-sched.c
 create mode 100644 block/blk-mq-sched.h

Comments

Bart Van Assche Dec. 13, 2016, 1:56 p.m. UTC | #1
On 12/08/2016 09:13 PM, Jens Axboe wrote:
> +/*
> + * Empty set
> + */
> +static struct blk_mq_ops mq_sched_tag_ops = {
> +	.queue_rq	= NULL,
> +};

Hello Jens,

Would "static struct blk_mq_ops mq_sched_tag_ops;" have been sufficient? 
Can this data structure be declared 'const' if the blk_mq_ops pointers 
in struct blk_mq_tag_set and struct request_queue are also declared const?

> +struct request *blk_mq_sched_alloc_shadow_request(struct request_queue *q,
> +						  struct blk_mq_alloc_data *data,
> +						  struct blk_mq_tags *tags,
> +						  atomic_t *wait_index)
> +{

Using the word "shadow" in the function name suggests to me that there 
is a shadow request for every request and a request for every shadow 
request. However, my understanding from the code is that there can be 
requests without shadow requests (for e.g. a flush) and shadow requests 
without requests. Shouldn't the name of this function reflect that, e.g. 
by using "sched" or "elv" in the function name instead of "shadow"?

> +struct request *
> +blk_mq_sched_request_from_shadow(struct blk_mq_hw_ctx *hctx,
> +				 struct request *(*get_sched_rq)(struct blk_mq_hw_ctx *))

This function dequeues a request from the I/O scheduler queue, allocates 
a request, copies the relevant request structure members into that 
request and makes the request refer to the shadow request. Isn't the 
request dispatching more important than associating the request with the 
shadow request? If so, how about making the function name reflect that?

> +{
> +	struct blk_mq_alloc_data data;
> +	struct request *sched_rq, *rq;
> +
> +	data.q = hctx->queue;
> +	data.flags = BLK_MQ_REQ_NOWAIT;
> +	data.ctx = blk_mq_get_ctx(hctx->queue);
> +	data.hctx = hctx;
> +
> +	rq = __blk_mq_alloc_request(&data, 0);
> +	blk_mq_put_ctx(data.ctx);
> +
> +	if (!rq) {
> +		blk_mq_stop_hw_queue(hctx);
> +		return NULL;
> +	}
> +
> +	sched_rq = get_sched_rq(hctx);
> +
> +	if (!sched_rq) {
> +		blk_queue_enter_live(hctx->queue);
> +		__blk_mq_free_request(hctx, data.ctx, rq);
> +		return NULL;
> +	}

The mq deadline scheduler calls this function with get_sched_rq == 
__dd_dispatch_request. If __blk_mq_alloc_request() fails, shouldn't the 
request that was removed from the scheduler queue be pushed back onto 
that queue? Additionally, are you sure it's necessary to call 
blk_queue_enter_live() from the error path?

Bart.
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bart Van Assche Dec. 13, 2016, 2:29 p.m. UTC | #2
On 12/08/2016 09:13 PM, Jens Axboe wrote:
> +static inline void blk_mq_sched_put_request(struct request *rq)
> +{
> +	struct request_queue *q = rq->q;
> +	struct elevator_queue *e = q->elevator;
> +
> +	if (e && e->type->mq_ops.put_request)
> +		e->type->mq_ops.put_request(rq);
> +	else
> +		blk_mq_free_request(rq);
> +}

blk_mq_free_request() always triggers a call of blk_queue_exit(). 
dd_put_request() only triggers a call of blk_queue_exit() if it is not a 
shadow request. Is that on purpose?

> +static inline struct request *
> +blk_mq_sched_get_request(struct request_queue *q, unsigned int op,
> +			 struct blk_mq_alloc_data *data)
> +{
> +	struct elevator_queue *e = q->elevator;
> +	struct blk_mq_hw_ctx *hctx;
> +	struct blk_mq_ctx *ctx;
> +	struct request *rq;
> +
> +	blk_queue_enter_live(q);
> +	ctx = blk_mq_get_ctx(q);
> +	hctx = blk_mq_map_queue(q, ctx->cpu);
> +
> +	blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
> +
> +	if (e && e->type->mq_ops.get_request)
> +		rq = e->type->mq_ops.get_request(q, op, data);
> +	else
> +		rq = __blk_mq_alloc_request(data, op);
> +
> +	if (rq)
> +		data->hctx->queued++;
> +
> +	return rq;
> +
> +}

Some but not all callers of blk_mq_sched_get_request() call 
blk_queue_exit() if this function returns NULL. Please consider to move 
the blk_queue_exit() call from the blk_mq_alloc_request() error path 
into this function. I think that will make it a lot easier to verify 
whether or not the blk_queue_enter() / blk_queue_exit() calls are 
balanced properly.

Additionally, since blk_queue_enter() / blk_queue_exit() calls by 
blk_mq_sched_get_request() and blk_mq_sched_put_request() must be 
balanced and since the latter function only calls blk_queue_exit() for 
non-shadow requests, shouldn't blk_mq_sched_get_request() call 
blk_queue_enter_live() only if __blk_mq_alloc_request() is called?

Thanks,

Bart.
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jens Axboe Dec. 13, 2016, 3:14 p.m. UTC | #3
On 12/13/2016 06:56 AM, Bart Van Assche wrote:
> On 12/08/2016 09:13 PM, Jens Axboe wrote:
>> +/*
>> + * Empty set
>> + */
>> +static struct blk_mq_ops mq_sched_tag_ops = {
>> +	.queue_rq	= NULL,
>> +};
> 
> Hello Jens,
> 
> Would "static struct blk_mq_ops mq_sched_tag_ops;" have been sufficient? 
> Can this data structure be declared 'const' if the blk_mq_ops pointers 
> in struct blk_mq_tag_set and struct request_queue are also declared const?

Yes, the static should be enough to ensure that it's all zeroes. I did
have this as const, but then realized I'd have to change a few other
places too. I'll make that change, hopefully it'll just work out.

>> +struct request *blk_mq_sched_alloc_shadow_request(struct request_queue *q,
>> +						  struct blk_mq_alloc_data *data,
>> +						  struct blk_mq_tags *tags,
>> +						  atomic_t *wait_index)
>> +{
> 
> Using the word "shadow" in the function name suggests to me that there 
> is a shadow request for every request and a request for every shadow 
> request. However, my understanding from the code is that there can be 
> requests without shadow requests (for e.g. a flush) and shadow requests 
> without requests. Shouldn't the name of this function reflect that, e.g. 
> by using "sched" or "elv" in the function name instead of "shadow"?

Shadow might not be the best name. Most do have shadows though, it's
only the rare exception like the flush, that you mention. I'll see if I
can come up with a better name.

>> +struct request *
>> +blk_mq_sched_request_from_shadow(struct blk_mq_hw_ctx *hctx,
>> +				 struct request *(*get_sched_rq)(struct blk_mq_hw_ctx *))
> 
> This function dequeues a request from the I/O scheduler queue, allocates 
> a request, copies the relevant request structure members into that 
> request and makes the request refer to the shadow request. Isn't the 
> request dispatching more important than associating the request with the 
> shadow request? If so, how about making the function name reflect that?

Sure, I can update the naming. Will need to anyway, if we get rid of the
shadow naming.

>> +{
>> +	struct blk_mq_alloc_data data;
>> +	struct request *sched_rq, *rq;
>> +
>> +	data.q = hctx->queue;
>> +	data.flags = BLK_MQ_REQ_NOWAIT;
>> +	data.ctx = blk_mq_get_ctx(hctx->queue);
>> +	data.hctx = hctx;
>> +
>> +	rq = __blk_mq_alloc_request(&data, 0);
>> +	blk_mq_put_ctx(data.ctx);
>> +
>> +	if (!rq) {
>> +		blk_mq_stop_hw_queue(hctx);
>> +		return NULL;
>> +	}
>> +
>> +	sched_rq = get_sched_rq(hctx);
>> +
>> +	if (!sched_rq) {
>> +		blk_queue_enter_live(hctx->queue);
>> +		__blk_mq_free_request(hctx, data.ctx, rq);
>> +		return NULL;
>> +	}
> 
> The mq deadline scheduler calls this function with get_sched_rq == 
> __dd_dispatch_request. If __blk_mq_alloc_request() fails, shouldn't the 
> request that was removed from the scheduler queue be pushed back onto 
> that queue? Additionally, are you sure it's necessary to call 
> blk_queue_enter_live() from the error path?

If __blk_mq_alloc_request() fails, we haven't pulled a request from the
scheduler yet. The extra ref is needed because __blk_mq_alloc_request()
doesn't get a ref on the request, however __blk_mq_free_request() does
put one.
Jens Axboe Dec. 13, 2016, 3:20 p.m. UTC | #4
On Tue, Dec 13 2016, Bart Van Assche wrote:
> On 12/08/2016 09:13 PM, Jens Axboe wrote:
> >+static inline void blk_mq_sched_put_request(struct request *rq)
> >+{
> >+	struct request_queue *q = rq->q;
> >+	struct elevator_queue *e = q->elevator;
> >+
> >+	if (e && e->type->mq_ops.put_request)
> >+		e->type->mq_ops.put_request(rq);
> >+	else
> >+		blk_mq_free_request(rq);
> >+}
> 
> blk_mq_free_request() always triggers a call of blk_queue_exit().
> dd_put_request() only triggers a call of blk_queue_exit() if it is not a
> shadow request. Is that on purpose?

If the scheduler doesn't define get/put requests, then the lifetime
follows the normal setup. If we do define them, then dd_put_request()
only wants to put the request if it's one where we did setup a shadow.

> >+static inline struct request *
> >+blk_mq_sched_get_request(struct request_queue *q, unsigned int op,
> >+			 struct blk_mq_alloc_data *data)
> >+{
> >+	struct elevator_queue *e = q->elevator;
> >+	struct blk_mq_hw_ctx *hctx;
> >+	struct blk_mq_ctx *ctx;
> >+	struct request *rq;
> >+
> >+	blk_queue_enter_live(q);
> >+	ctx = blk_mq_get_ctx(q);
> >+	hctx = blk_mq_map_queue(q, ctx->cpu);
> >+
> >+	blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
> >+
> >+	if (e && e->type->mq_ops.get_request)
> >+		rq = e->type->mq_ops.get_request(q, op, data);
> >+	else
> >+		rq = __blk_mq_alloc_request(data, op);
> >+
> >+	if (rq)
> >+		data->hctx->queued++;
> >+
> >+	return rq;
> >+
> >+}
> 
> Some but not all callers of blk_mq_sched_get_request() call blk_queue_exit()
> if this function returns NULL. Please consider to move the blk_queue_exit()
> call from the blk_mq_alloc_request() error path into this function. I think
> that will make it a lot easier to verify whether or not the
> blk_queue_enter() / blk_queue_exit() calls are balanced properly.

Agree, I'll make the change, it'll be easier to read then.

> Additionally, since blk_queue_enter() / blk_queue_exit() calls by
> blk_mq_sched_get_request() and blk_mq_sched_put_request() must be balanced
> and since the latter function only calls blk_queue_exit() for non-shadow
> requests, shouldn't blk_mq_sched_get_request() call blk_queue_enter_live()
> only if __blk_mq_alloc_request() is called?

I'll double check that part, there might be a bug or at least a chance
to clean this up a bit. I did verify most of this at some point, and
tested it with the scheduler switching. That part falls apart pretty
quickly, if the references aren't matched exactly.
Bart Van Assche Dec. 14, 2016, 10:31 a.m. UTC | #5
On 12/13/2016 04:14 PM, Jens Axboe wrote:
> On 12/13/2016 06:56 AM, Bart Van Assche wrote:
>> On 12/08/2016 09:13 PM, Jens Axboe wrote:
>>> +struct request *blk_mq_sched_alloc_shadow_request(struct request_queue *q,
>>> +						  struct blk_mq_alloc_data *data,
>>> +						  struct blk_mq_tags *tags,
>>> +						  atomic_t *wait_index)
>>> +{
>>
>> Using the word "shadow" in the function name suggests to me that there 
>> is a shadow request for every request and a request for every shadow 
>> request. However, my understanding from the code is that there can be 
>> requests without shadow requests (for e.g. a flush) and shadow requests 
>> without requests. Shouldn't the name of this function reflect that, e.g. 
>> by using "sched" or "elv" in the function name instead of "shadow"?
> 
> Shadow might not be the best name. Most do have shadows though, it's
> only the rare exception like the flush, that you mention. I'll see if I
> can come up with a better name.

Hello Jens,

One aspect of this patch series that might turn out to be a maintenance
burden is the copying between original and shadow requests. It is easy
to overlook that rq_copy() has to be updated if a field would ever be
added to struct request. Additionally, having to allocate two requests
structures per I/O instead of one will have a runtime overhead. Do you
think the following approach would work?
- Instead of using two request structures per I/O, only use a single
  request structure.
- Instead of storing one tag in the request structure, store two tags
  in that structure. One tag comes from the I/O scheduler tag set
  (size: nr_requests) and the other from the tag set associated with
  the block driver (size: HBA queue depth).
- Only add a request to the hctx dispatch list after a block driver tag
  has been assigned. This means that an I/O scheduler must keep a
  request structure on a list it manages itself as long as no block
  driver tag has been assigned.
- sysfs_list_show() is modified such that it shows both tags.

Thanks,

Bart.
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jens Axboe Dec. 14, 2016, 3:05 p.m. UTC | #6
On 12/14/2016 03:31 AM, Bart Van Assche wrote:
> On 12/13/2016 04:14 PM, Jens Axboe wrote:
>> On 12/13/2016 06:56 AM, Bart Van Assche wrote:
>>> On 12/08/2016 09:13 PM, Jens Axboe wrote:
>>>> +struct request *blk_mq_sched_alloc_shadow_request(struct request_queue *q,
>>>> +						  struct blk_mq_alloc_data *data,
>>>> +						  struct blk_mq_tags *tags,
>>>> +						  atomic_t *wait_index)
>>>> +{
>>>
>>> Using the word "shadow" in the function name suggests to me that there 
>>> is a shadow request for every request and a request for every shadow 
>>> request. However, my understanding from the code is that there can be 
>>> requests without shadow requests (for e.g. a flush) and shadow requests 
>>> without requests. Shouldn't the name of this function reflect that, e.g. 
>>> by using "sched" or "elv" in the function name instead of "shadow"?
>>
>> Shadow might not be the best name. Most do have shadows though, it's
>> only the rare exception like the flush, that you mention. I'll see if I
>> can come up with a better name.
> 
> Hello Jens,
> 
> One aspect of this patch series that might turn out to be a maintenance
> burden is the copying between original and shadow requests. It is easy
> to overlook that rq_copy() has to be updated if a field would ever be
> added to struct request. Additionally, having to allocate two requests
> structures per I/O instead of one will have a runtime overhead. Do you
> think the following approach would work?
> - Instead of using two request structures per I/O, only use a single
>   request structure.
> - Instead of storing one tag in the request structure, store two tags
>   in that structure. One tag comes from the I/O scheduler tag set
>   (size: nr_requests) and the other from the tag set associated with
>   the block driver (size: HBA queue depth).
> - Only add a request to the hctx dispatch list after a block driver tag
>   has been assigned. This means that an I/O scheduler must keep a
>   request structure on a list it manages itself as long as no block
>   driver tag has been assigned.
> - sysfs_list_show() is modified such that it shows both tags.

I have considered doing exactly that, decided to go down the other path.
I may still revisit, it's not that I'm a huge fan of the shadow requests
and the necessary copying. We don't update the request that often, so I
don't think it's going to be a big maintenance burden. But it'd be hard
to claim that it's super pretty...

I'll play with the idea.
diff mbox

Patch

diff --git a/block/Makefile b/block/Makefile
index a827f988c4e6..2eee9e1bb6db 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -6,7 +6,7 @@  obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
 			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
-			blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \
+			blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
 			genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
 			badblocks.o partitions/
 
diff --git a/block/blk-core.c b/block/blk-core.c
index 4b7ec5958055..3f83414d6986 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,7 @@ 
 
 #include "blk.h"
 #include "blk-mq.h"
+#include "blk-mq-sched.h"
 #include "blk-wbt.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
@@ -1428,7 +1429,7 @@  void __blk_put_request(struct request_queue *q, struct request *req)
 		return;
 
 	if (q->mq_ops) {
-		blk_mq_free_request(req);
+		blk_mq_sched_put_request(req);
 		return;
 	}
 
@@ -1464,7 +1465,7 @@  void blk_put_request(struct request *req)
 	struct request_queue *q = req->q;
 
 	if (q->mq_ops)
-		blk_mq_free_request(req);
+		blk_mq_sched_put_request(req);
 	else {
 		unsigned long flags;
 
@@ -1528,6 +1529,7 @@  bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
 	blk_account_io_start(req, false);
 	return true;
 }
+EXPORT_SYMBOL_GPL(bio_attempt_back_merge);
 
 bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
 			     struct bio *bio)
@@ -1552,6 +1554,7 @@  bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
 	blk_account_io_start(req, false);
 	return true;
 }
+EXPORT_SYMBOL_GPL(bio_attempt_front_merge);
 
 /**
  * blk_attempt_plug_merge - try to merge with %current's plugged list
@@ -2173,7 +2176,7 @@  int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 	if (q->mq_ops) {
 		if (blk_queue_io_stat(q))
 			blk_account_io_start(rq, true);
-		blk_mq_insert_request(rq, false, true, false);
+		blk_mq_sched_insert_request(rq, false, true, false);
 		return 0;
 	}
 
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 3ecb00a6cf45..86656fdfa637 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -9,6 +9,7 @@ 
 #include <linux/sched/sysctl.h>
 
 #include "blk.h"
+#include "blk-mq-sched.h"
 
 /*
  * for max sense size
@@ -65,7 +66,7 @@  void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 	 * be reused after dying flag is set
 	 */
 	if (q->mq_ops) {
-		blk_mq_insert_request(rq, at_head, true, false);
+		blk_mq_sched_insert_request(rq, at_head, true, false);
 		return;
 	}
 
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 27a42dab5a36..63b91697d167 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -74,6 +74,7 @@ 
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
 
 /* FLUSH/FUA sequences */
 enum {
@@ -425,9 +426,9 @@  void blk_insert_flush(struct request *rq)
 	 */
 	if ((policy & REQ_FSEQ_DATA) &&
 	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
-		if (q->mq_ops) {
-			blk_mq_insert_request(rq, false, true, false);
-		} else
+		if (q->mq_ops)
+			blk_mq_sched_insert_request(rq, false, true, false);
+		else
 			list_add_tail(&rq->queuelist, &q->queue_head);
 		return;
 	}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 1002afdfee99..01247812e13f 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -766,6 +766,7 @@  int attempt_back_merge(struct request_queue *q, struct request *rq)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(attempt_back_merge);
 
 int attempt_front_merge(struct request_queue *q, struct request *rq)
 {
@@ -776,6 +777,7 @@  int attempt_front_merge(struct request_queue *q, struct request *rq)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(attempt_front_merge);
 
 int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
 			  struct request *next)
@@ -825,3 +827,4 @@  int blk_try_merge(struct request *rq, struct bio *bio)
 		return ELEVATOR_FRONT_MERGE;
 	return ELEVATOR_NO_MERGE;
 }
+EXPORT_SYMBOL_GPL(blk_try_merge);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
new file mode 100644
index 000000000000..9213366e67d1
--- /dev/null
+++ b/block/blk-mq-sched.c
@@ -0,0 +1,246 @@ 
+#include <linux/kernel.h>
+#include <linux/module.h>
+
+#include <linux/blk-mq.h>
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-sched.h"
+#include "blk-mq-tag.h"
+#include "blk-wbt.h"
+
+/*
+ * Empty set
+ */
+static struct blk_mq_ops mq_sched_tag_ops = {
+	.queue_rq	= NULL,
+};
+
+void blk_mq_sched_free_requests(struct blk_mq_tags *tags)
+{
+	blk_mq_free_rq_map(NULL, tags, 0);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_free_requests);
+
+struct blk_mq_tags *blk_mq_sched_alloc_requests(unsigned int depth,
+						unsigned int numa_node)
+{
+	struct blk_mq_tag_set set = {
+		.ops		= &mq_sched_tag_ops,
+		.nr_hw_queues	= 1,
+		.queue_depth	= depth,
+		.numa_node	= numa_node,
+	};
+
+	return blk_mq_init_rq_map(&set, 0);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_alloc_requests);
+
+void blk_mq_sched_free_hctx_data(struct request_queue *q,
+				 void (*exit)(struct blk_mq_hw_ctx *))
+{
+	struct blk_mq_hw_ctx *hctx;
+	int i;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		if (exit)
+			exit(hctx);
+		kfree(hctx->sched_data);
+		hctx->sched_data = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
+
+int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
+				void (*init)(struct blk_mq_hw_ctx *))
+{
+	struct blk_mq_hw_ctx *hctx;
+	int i;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
+		if (!hctx->sched_data)
+			goto error;
+
+		if (init)
+			init(hctx);
+	}
+
+	return 0;
+error:
+	blk_mq_sched_free_hctx_data(q, NULL);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);
+
+struct request *blk_mq_sched_alloc_shadow_request(struct request_queue *q,
+						  struct blk_mq_alloc_data *data,
+						  struct blk_mq_tags *tags,
+						  atomic_t *wait_index)
+{
+	struct sbq_wait_state *ws;
+	DEFINE_WAIT(wait);
+	struct request *rq;
+	int tag;
+
+	tag = __sbitmap_queue_get(&tags->bitmap_tags);
+	if (tag != -1)
+		goto done;
+
+	if (data->flags & BLK_MQ_REQ_NOWAIT)
+		return NULL;
+
+	ws = sbq_wait_ptr(&tags->bitmap_tags, wait_index);
+	do {
+		prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+		tag = __sbitmap_queue_get(&tags->bitmap_tags);
+		if (tag != -1)
+			break;
+
+		blk_mq_run_hw_queue(data->hctx, false);
+
+		tag = __sbitmap_queue_get(&tags->bitmap_tags);
+		if (tag != -1)
+			break;
+
+		blk_mq_put_ctx(data->ctx);
+		io_schedule();
+
+		data->ctx = blk_mq_get_ctx(data->q);
+		data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu);
+		finish_wait(&ws->wait, &wait);
+		ws = sbq_wait_ptr(&tags->bitmap_tags, wait_index);
+	} while (1);
+
+	finish_wait(&ws->wait, &wait);
+done:
+	rq = tags->rqs[tag];
+	rq->tag = tag;
+	rq->rq_flags |= RQF_ALLOCED;
+	return rq;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_alloc_shadow_request);
+
+void blk_mq_sched_free_shadow_request(struct blk_mq_tags *tags,
+				      struct request *rq)
+{
+	WARN_ON_ONCE(!(rq->rq_flags & RQF_ALLOCED));
+	sbitmap_queue_clear(&tags->bitmap_tags, rq->tag, rq->mq_ctx->cpu);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_free_shadow_request);
+
+static void rq_copy(struct request *rq, struct request *src)
+{
+#define FIELD_COPY(dst, src, name)	((dst)->name = (src)->name)
+	FIELD_COPY(rq, src, cpu);
+	FIELD_COPY(rq, src, cmd_type);
+	FIELD_COPY(rq, src, cmd_flags);
+	rq->rq_flags |= (src->rq_flags & (RQF_PREEMPT | RQF_QUIET | RQF_PM | RQF_DONTPREP));
+	rq->rq_flags &= ~RQF_IO_STAT;
+	FIELD_COPY(rq, src, __data_len);
+	FIELD_COPY(rq, src, __sector);
+	FIELD_COPY(rq, src, bio);
+	FIELD_COPY(rq, src, biotail);
+	FIELD_COPY(rq, src, rq_disk);
+	FIELD_COPY(rq, src, part);
+	FIELD_COPY(rq, src, nr_phys_segments);
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+	FIELD_COPY(rq, src, nr_integrity_segments);
+#endif
+	FIELD_COPY(rq, src, ioprio);
+	FIELD_COPY(rq, src, timeout);
+
+	if (src->cmd_type == REQ_TYPE_BLOCK_PC) {
+		FIELD_COPY(rq, src, cmd);
+		FIELD_COPY(rq, src, cmd_len);
+		FIELD_COPY(rq, src, extra_len);
+		FIELD_COPY(rq, src, sense_len);
+		FIELD_COPY(rq, src, resid_len);
+		FIELD_COPY(rq, src, sense);
+		FIELD_COPY(rq, src, retries);
+	}
+
+	src->bio = src->biotail = NULL;
+}
+
+static void sched_rq_end_io(struct request *rq, int error)
+{
+	struct request *sched_rq = rq->end_io_data;
+
+	FIELD_COPY(sched_rq, rq, resid_len);
+	FIELD_COPY(sched_rq, rq, extra_len);
+	FIELD_COPY(sched_rq, rq, sense_len);
+	FIELD_COPY(sched_rq, rq, errors);
+	FIELD_COPY(sched_rq, rq, retries);
+
+	blk_account_io_completion(sched_rq, blk_rq_bytes(sched_rq));
+	blk_account_io_done(sched_rq);
+
+	wbt_done(sched_rq->q->rq_wb, &sched_rq->issue_stat);
+
+	if (sched_rq->end_io)
+		sched_rq->end_io(sched_rq, error);
+
+	blk_mq_free_request(rq);
+}
+
+struct request *
+blk_mq_sched_request_from_shadow(struct blk_mq_hw_ctx *hctx,
+				 struct request *(*get_sched_rq)(struct blk_mq_hw_ctx *))
+{
+	struct blk_mq_alloc_data data;
+	struct request *sched_rq, *rq;
+
+	data.q = hctx->queue;
+	data.flags = BLK_MQ_REQ_NOWAIT;
+	data.ctx = blk_mq_get_ctx(hctx->queue);
+	data.hctx = hctx;
+
+	rq = __blk_mq_alloc_request(&data, 0);
+	blk_mq_put_ctx(data.ctx);
+
+	if (!rq) {
+		blk_mq_stop_hw_queue(hctx);
+		return NULL;
+	}
+
+	sched_rq = get_sched_rq(hctx);
+
+	if (!sched_rq) {
+		blk_queue_enter_live(hctx->queue);
+		__blk_mq_free_request(hctx, data.ctx, rq);
+		return NULL;
+	}
+
+	WARN_ON_ONCE(!(sched_rq->rq_flags & RQF_ALLOCED));
+	rq_copy(rq, sched_rq);
+	rq->end_io = sched_rq_end_io;
+	rq->end_io_data = sched_rq;
+
+	return rq;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_request_from_shadow);
+
+void __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+{
+	struct elevator_queue *e = hctx->queue->elevator;
+	struct request *rq;
+	LIST_HEAD(rq_list);
+
+	if (unlikely(blk_mq_hctx_stopped(hctx)))
+		return;
+
+	hctx->run++;
+
+	if (!list_empty(&hctx->dispatch)) {
+		spin_lock(&hctx->lock);
+		if (!list_empty(&hctx->dispatch))
+			list_splice_init(&hctx->dispatch, &rq_list);
+		spin_unlock(&hctx->lock);
+	}
+
+	while ((rq = e->type->mq_ops.dispatch_request(hctx)) != NULL)
+		list_add_tail(&rq->queuelist, &rq_list);
+
+	blk_mq_dispatch_rq_list(hctx, &rq_list);
+}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
new file mode 100644
index 000000000000..609c80506cfc
--- /dev/null
+++ b/block/blk-mq-sched.h
@@ -0,0 +1,187 @@ 
+#ifndef BLK_MQ_SCHED_H
+#define BLK_MQ_SCHED_H
+
+#include "blk-mq.h"
+
+struct blk_mq_hw_ctx;
+struct blk_mq_ctx;
+struct request_queue;
+
+struct blk_mq_tags *blk_mq_sched_alloc_requests(unsigned int depth, unsigned int numa_node);
+void blk_mq_sched_free_requests(struct blk_mq_tags *tags);
+
+int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
+				void (*init)(struct blk_mq_hw_ctx *));
+void blk_mq_sched_free_hctx_data(struct request_queue *q,
+				 void (*exit)(struct blk_mq_hw_ctx *));
+
+void blk_mq_sched_free_shadow_request(struct blk_mq_tags *tags,
+				      struct request *rq);
+struct request *blk_mq_sched_alloc_shadow_request(struct request_queue *q,
+						  struct blk_mq_alloc_data *data,
+						  struct blk_mq_tags *tags,
+						  atomic_t *wait_index);
+struct request *
+blk_mq_sched_request_from_shadow(struct blk_mq_hw_ctx *hctx,
+				 struct request *(*get_sched_rq)(struct blk_mq_hw_ctx *));
+
+
+struct blk_mq_alloc_data {
+	/* input parameter */
+	struct request_queue *q;
+	unsigned int flags;
+
+	/* input & output parameter */
+	struct blk_mq_ctx *ctx;
+	struct blk_mq_hw_ctx *hctx;
+};
+
+static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
+		struct request_queue *q, unsigned int flags,
+		struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx)
+{
+	data->q = q;
+	data->flags = flags;
+	data->ctx = ctx;
+	data->hctx = hctx;
+}
+
+static inline bool
+blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (blk_queue_nomerges(q) || !bio_mergeable(bio))
+		return false;
+
+	if (e) {
+		struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
+		struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+
+		blk_mq_put_ctx(ctx);
+		return e->type->mq_ops.bio_merge(hctx, bio);
+	}
+
+	return false;
+}
+
+static inline void blk_mq_sched_put_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	struct elevator_queue *e = q->elevator;
+
+	if (e && e->type->mq_ops.put_request)
+		e->type->mq_ops.put_request(rq);
+	else
+		blk_mq_free_request(rq);
+}
+
+static inline struct request *
+blk_mq_sched_get_request(struct request_queue *q, unsigned int op,
+			 struct blk_mq_alloc_data *data)
+{
+	struct elevator_queue *e = q->elevator;
+	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_ctx *ctx;
+	struct request *rq;
+
+	blk_queue_enter_live(q);
+	ctx = blk_mq_get_ctx(q);
+	hctx = blk_mq_map_queue(q, ctx->cpu);
+
+	blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
+
+	if (e && e->type->mq_ops.get_request)
+		rq = e->type->mq_ops.get_request(q, op, data);
+	else
+		rq = __blk_mq_alloc_request(data, op);
+
+	if (rq)
+		data->hctx->queued++;
+
+	return rq;
+
+}
+
+static inline void
+blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue,
+			    bool async)
+{
+	struct request_queue *q = rq->q;
+	struct elevator_queue *e = q->elevator;
+	struct blk_mq_ctx *ctx = rq->mq_ctx;
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+
+	if (e)
+		e->type->mq_ops.insert_request(hctx, rq, at_head);
+	else {
+		spin_lock(&ctx->lock);
+		__blk_mq_insert_request(hctx, rq, at_head);
+		spin_unlock(&ctx->lock);
+	}
+
+	if (run_queue)
+		blk_mq_run_hw_queue(hctx, async);
+}
+
+static inline bool
+blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
+			 struct bio *bio)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e && e->type->mq_ops.allow_merge)
+		return e->type->mq_ops.allow_merge(q, rq, bio);
+
+	return true;
+}
+
+static inline void
+blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+	struct elevator_queue *e = hctx->queue->elevator;
+
+	if (e && e->type->mq_ops.completed_request)
+		e->type->mq_ops.completed_request(hctx, rq);
+}
+
+static inline void blk_mq_sched_started_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	struct elevator_queue *e = q->elevator;
+
+	if (e && e->type->mq_ops.started_request)
+		e->type->mq_ops.started_request(rq);
+}
+
+static inline void blk_mq_sched_requeue_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	struct elevator_queue *e = q->elevator;
+
+	if (e && e->type->mq_ops.requeue_request)
+		e->type->mq_ops.requeue_request(rq);
+}
+
+static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
+{
+	struct elevator_queue *e = hctx->queue->elevator;
+
+	if (e && e->type->mq_ops.has_work)
+		return e->type->mq_ops.has_work(hctx);
+
+	return false;
+}
+
+void __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
+
+static inline void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+{
+	if (hctx->queue->elevator)
+		__blk_mq_sched_dispatch_requests(hctx);
+	else
+		blk_mq_process_sw_list(hctx);
+}
+
+
+#endif
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index dcf5ce3ba4bf..bbd494e23d57 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -12,6 +12,7 @@ 
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
 
 bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index abbf7cca4d0d..019de6f0fd06 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -32,6 +32,7 @@ 
 #include "blk-mq-tag.h"
 #include "blk-stat.h"
 #include "blk-wbt.h"
+#include "blk-mq-sched.h"
 
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
@@ -41,7 +42,8 @@  static LIST_HEAD(all_q_list);
  */
 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
-	return sbitmap_any_bit_set(&hctx->ctx_map);
+	return sbitmap_any_bit_set(&hctx->ctx_map) ||
+		blk_mq_sched_has_work(hctx);
 }
 
 /*
@@ -167,8 +169,8 @@  bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 }
 EXPORT_SYMBOL(blk_mq_can_queue);
 
-static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
-			       struct request *rq, unsigned int op)
+void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
+			struct request *rq, unsigned int op)
 {
 	INIT_LIST_HEAD(&rq->queuelist);
 	/* csd/requeue_work/fifo_time is initialized before use */
@@ -213,9 +215,10 @@  static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 
 	ctx->rq_dispatched[op_is_sync(op)]++;
 }
+EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init);
 
-static struct request *
-__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op)
+struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
+				       unsigned int op)
 {
 	struct request *rq;
 	unsigned int tag;
@@ -236,25 +239,23 @@  __blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op)
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(__blk_mq_alloc_request);
 
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 		unsigned int flags)
 {
-	struct blk_mq_ctx *ctx;
-	struct blk_mq_hw_ctx *hctx;
-	struct request *rq;
 	struct blk_mq_alloc_data alloc_data;
+	struct request *rq;
 	int ret;
 
 	ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
 	if (ret)
 		return ERR_PTR(ret);
 
-	ctx = blk_mq_get_ctx(q);
-	hctx = blk_mq_map_queue(q, ctx->cpu);
-	blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
-	rq = __blk_mq_alloc_request(&alloc_data, rw);
-	blk_mq_put_ctx(ctx);
+	rq = blk_mq_sched_get_request(q, rw, &alloc_data);
+
+	blk_mq_put_ctx(alloc_data.ctx);
+	blk_queue_exit(q);
 
 	if (!rq) {
 		blk_queue_exit(q);
@@ -319,12 +320,14 @@  struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
 }
 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 
-static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
-				  struct blk_mq_ctx *ctx, struct request *rq)
+void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+			   struct request *rq)
 {
 	const int tag = rq->tag;
 	struct request_queue *q = rq->q;
 
+	blk_mq_sched_completed_request(hctx, rq);
+
 	if (rq->rq_flags & RQF_MQ_INFLIGHT)
 		atomic_dec(&hctx->nr_active);
 
@@ -467,6 +470,8 @@  void blk_mq_start_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 
+	blk_mq_sched_started_request(rq);
+
 	trace_block_rq_issue(q, rq);
 
 	rq->resid_len = blk_rq_bytes(rq);
@@ -515,6 +520,7 @@  static void __blk_mq_requeue_request(struct request *rq)
 
 	trace_block_rq_requeue(q, rq);
 	wbt_requeue(q->rq_wb, &rq->issue_stat);
+	blk_mq_sched_requeue_request(rq);
 
 	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
 		if (q->dma_drain_size && blk_rq_bytes(rq))
@@ -549,13 +555,13 @@  static void blk_mq_requeue_work(struct work_struct *work)
 
 		rq->rq_flags &= ~RQF_SOFTBARRIER;
 		list_del_init(&rq->queuelist);
-		blk_mq_insert_request(rq, true, false, false);
+		blk_mq_sched_insert_request(rq, true, false, false);
 	}
 
 	while (!list_empty(&rq_list)) {
 		rq = list_entry(rq_list.next, struct request, queuelist);
 		list_del_init(&rq->queuelist);
-		blk_mq_insert_request(rq, false, false, false);
+		blk_mq_sched_insert_request(rq, false, false, false);
 	}
 
 	blk_mq_run_hw_queues(q, false);
@@ -761,8 +767,16 @@  static bool blk_mq_attempt_merge(struct request_queue *q,
 
 		if (!blk_rq_merge_ok(rq, bio))
 			continue;
+		if (!blk_mq_sched_allow_merge(q, rq, bio))
+			break;
 
 		el_ret = blk_try_merge(rq, bio);
+		if (el_ret == ELEVATOR_NO_MERGE)
+			continue;
+
+		if (!blk_mq_sched_allow_merge(q, rq, bio))
+			break;
+
 		if (el_ret == ELEVATOR_BACK_MERGE) {
 			if (bio_attempt_back_merge(q, rq, bio)) {
 				ctx->rq_merged++;
@@ -909,7 +923,7 @@  bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
  * of IO. In particular, we'd like FIFO behaviour on handling existing
  * items on the hctx->dispatch list. Ignore that for now.
  */
-static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
+void blk_mq_process_sw_list(struct blk_mq_hw_ctx *hctx)
 {
 	LIST_HEAD(rq_list);
 	LIST_HEAD(driver_list);
@@ -947,11 +961,11 @@  static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
 	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
 		rcu_read_lock();
-		blk_mq_process_rq_list(hctx);
+		blk_mq_sched_dispatch_requests(hctx);
 		rcu_read_unlock();
 	} else {
 		srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
-		blk_mq_process_rq_list(hctx);
+		blk_mq_sched_dispatch_requests(hctx);
 		srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
 	}
 }
@@ -1081,6 +1095,7 @@  void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
 	blk_mq_run_hw_queue(hctx, async);
 }
+EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
 
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
 {
@@ -1135,8 +1150,8 @@  static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
 		list_add_tail(&rq->queuelist, &ctx->rq_list);
 }
 
-static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
-				    struct request *rq, bool at_head)
+void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+			     bool at_head)
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 
@@ -1144,21 +1159,6 @@  static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
 	blk_mq_hctx_mark_pending(hctx, ctx);
 }
 
-void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
-			   bool async)
-{
-	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct request_queue *q = rq->q;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-
-	spin_lock(&ctx->lock);
-	__blk_mq_insert_request(hctx, rq, at_head);
-	spin_unlock(&ctx->lock);
-
-	if (run_queue)
-		blk_mq_run_hw_queue(hctx, async);
-}
-
 static void blk_mq_insert_requests(struct request_queue *q,
 				     struct blk_mq_ctx *ctx,
 				     struct list_head *list,
@@ -1174,17 +1174,14 @@  static void blk_mq_insert_requests(struct request_queue *q,
 	 * preemption doesn't flush plug list, so it's possible ctx->cpu is
 	 * offline now
 	 */
-	spin_lock(&ctx->lock);
 	while (!list_empty(list)) {
 		struct request *rq;
 
 		rq = list_first_entry(list, struct request, queuelist);
 		BUG_ON(rq->mq_ctx != ctx);
 		list_del_init(&rq->queuelist);
-		__blk_mq_insert_req_list(hctx, rq, false);
+		blk_mq_sched_insert_request(rq, false, false, false);
 	}
-	blk_mq_hctx_mark_pending(hctx, ctx);
-	spin_unlock(&ctx->lock);
 
 	blk_mq_run_hw_queue(hctx, from_schedule);
 }
@@ -1285,41 +1282,27 @@  static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
 	}
 }
 
-static struct request *blk_mq_map_request(struct request_queue *q,
-					  struct bio *bio,
-					  struct blk_mq_alloc_data *data)
-{
-	struct blk_mq_hw_ctx *hctx;
-	struct blk_mq_ctx *ctx;
-	struct request *rq;
-
-	blk_queue_enter_live(q);
-	ctx = blk_mq_get_ctx(q);
-	hctx = blk_mq_map_queue(q, ctx->cpu);
-
-	trace_block_getrq(q, bio, bio->bi_opf);
-	blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
-	rq = __blk_mq_alloc_request(data, bio->bi_opf);
-
-	data->hctx->queued++;
-	return rq;
-}
-
 static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 {
-	int ret;
 	struct request_queue *q = rq->q;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
 	struct blk_mq_queue_data bd = {
 		.rq = rq,
 		.list = NULL,
 		.last = 1
 	};
-	blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
+	struct blk_mq_hw_ctx *hctx;
+	blk_qc_t new_cookie;
+	int ret;
+
+	if (q->elevator)
+		goto insert;
 
+	hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
 	if (blk_mq_hctx_stopped(hctx))
 		goto insert;
 
+	new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
+
 	/*
 	 * For OK queue, we are done. For error, kill it. Any other
 	 * error (busy), just add it to our list as we previously
@@ -1341,7 +1324,7 @@  static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 	}
 
 insert:
-	blk_mq_insert_request(rq, false, true, true);
+	blk_mq_sched_insert_request(rq, false, true, true);
 }
 
 /*
@@ -1374,9 +1357,14 @@  static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
 		return BLK_QC_T_NONE;
 
+	if (blk_mq_sched_bio_merge(q, bio))
+		return BLK_QC_T_NONE;
+
 	wb_acct = wbt_wait(q->rq_wb, bio, NULL);
 
-	rq = blk_mq_map_request(q, bio, &data);
+	trace_block_getrq(q, bio, bio->bi_opf);
+
+	rq = blk_mq_sched_get_request(q, bio->bi_opf, &data);
 	if (unlikely(!rq)) {
 		__wbt_done(q->rq_wb, wb_acct);
 		return BLK_QC_T_NONE;
@@ -1438,6 +1426,12 @@  static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		goto done;
 	}
 
+	if (q->elevator) {
+		blk_mq_put_ctx(data.ctx);
+		blk_mq_bio_to_request(rq, bio);
+		blk_mq_sched_insert_request(rq, false, true, true);
+		goto done;
+	}
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
 		/*
 		 * For a SYNC request, send it to the hardware immediately. For
@@ -1483,9 +1477,14 @@  static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	} else
 		request_count = blk_plug_queued_count(q);
 
+	if (blk_mq_sched_bio_merge(q, bio))
+		return BLK_QC_T_NONE;
+
 	wb_acct = wbt_wait(q->rq_wb, bio, NULL);
 
-	rq = blk_mq_map_request(q, bio, &data);
+	trace_block_getrq(q, bio, bio->bi_opf);
+
+	rq = blk_mq_sched_get_request(q, bio->bi_opf, &data);
 	if (unlikely(!rq)) {
 		__wbt_done(q->rq_wb, wb_acct);
 		return BLK_QC_T_NONE;
@@ -1535,6 +1534,12 @@  static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		return cookie;
 	}
 
+	if (q->elevator) {
+		blk_mq_put_ctx(data.ctx);
+		blk_mq_bio_to_request(rq, bio);
+		blk_mq_sched_insert_request(rq, false, true, true);
+		goto done;
+	}
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
 		/*
 		 * For a SYNC request, send it to the hardware immediately. For
@@ -1547,15 +1552,16 @@  static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	}
 
 	blk_mq_put_ctx(data.ctx);
+done:
 	return cookie;
 }
 
-static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
-		struct blk_mq_tags *tags, unsigned int hctx_idx)
+void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+			unsigned int hctx_idx)
 {
 	struct page *page;
 
-	if (tags->rqs && set->ops->exit_request) {
+	if (tags->rqs && set && set->ops->exit_request) {
 		int i;
 
 		for (i = 0; i < tags->nr_tags; i++) {
@@ -1588,8 +1594,8 @@  static size_t order_to_size(unsigned int order)
 	return (size_t)PAGE_SIZE << order;
 }
 
-static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
-		unsigned int hctx_idx)
+struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
+				       unsigned int hctx_idx)
 {
 	struct blk_mq_tags *tags;
 	unsigned int i, j, entries_per_page, max_order = 4;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 3a54dd32a6fc..ddce89bb0461 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -84,26 +84,6 @@  static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
 	put_cpu();
 }
 
-struct blk_mq_alloc_data {
-	/* input parameter */
-	struct request_queue *q;
-	unsigned int flags;
-
-	/* input & output parameter */
-	struct blk_mq_ctx *ctx;
-	struct blk_mq_hw_ctx *hctx;
-};
-
-static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
-		struct request_queue *q, unsigned int flags,
-		struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx)
-{
-	data->q = q;
-	data->flags = flags;
-	data->ctx = ctx;
-	data->hctx = hctx;
-}
-
 static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
 {
 	return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
@@ -114,4 +94,18 @@  static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
 	return hctx->nr_ctx && hctx->tags;
 }
 
+void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+			unsigned int hctx_idx);
+struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
+				       unsigned int hctx_idx);
+void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
+			struct request *rq, unsigned int op);
+void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+			   struct request *rq);
+struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
+				       unsigned int op);
+void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+			     bool at_head);
+void blk_mq_process_sw_list(struct blk_mq_hw_ctx *hctx);
+
 #endif
diff --git a/block/elevator.c b/block/elevator.c
index 40f0c04e5ad3..f1191b3b0ff3 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -40,6 +40,7 @@ 
 #include <trace/events/block.h>
 
 #include "blk.h"
+#include "blk-mq-sched.h"
 
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
@@ -58,7 +59,9 @@  static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_allow_bio_merge_fn)
+	if (e->uses_mq && e->type->mq_ops.allow_merge)
+		return e->type->mq_ops.allow_merge(q, rq, bio);
+	else if (!e->uses_mq && e->type->ops.elevator_allow_bio_merge_fn)
 		return e->type->ops.elevator_allow_bio_merge_fn(q, rq, bio);
 
 	return 1;
@@ -163,6 +166,7 @@  struct elevator_queue *elevator_alloc(struct request_queue *q,
 	kobject_init(&eq->kobj, &elv_ktype);
 	mutex_init(&eq->sysfs_lock);
 	hash_init(eq->hash);
+	eq->uses_mq = e->uses_mq;
 
 	return eq;
 }
@@ -224,7 +228,10 @@  int elevator_init(struct request_queue *q, char *name)
 		}
 	}
 
-	err = e->ops.elevator_init_fn(q, e);
+	if (e->uses_mq)
+		err = e->mq_ops.init_sched(q, e);
+	else
+		err = e->ops.elevator_init_fn(q, e);
 	if (err)
 		elevator_put(e);
 	return err;
@@ -234,7 +241,9 @@  EXPORT_SYMBOL(elevator_init);
 void elevator_exit(struct elevator_queue *e)
 {
 	mutex_lock(&e->sysfs_lock);
-	if (e->type->ops.elevator_exit_fn)
+	if (e->uses_mq && e->type->mq_ops.exit_sched)
+		e->type->mq_ops.exit_sched(e);
+	else if (!e->uses_mq && e->type->ops.elevator_exit_fn)
 		e->type->ops.elevator_exit_fn(e);
 	mutex_unlock(&e->sysfs_lock);
 
@@ -253,6 +262,7 @@  void elv_rqhash_del(struct request_queue *q, struct request *rq)
 	if (ELV_ON_HASH(rq))
 		__elv_rqhash_del(rq);
 }
+EXPORT_SYMBOL_GPL(elv_rqhash_del);
 
 void elv_rqhash_add(struct request_queue *q, struct request *rq)
 {
@@ -262,12 +272,14 @@  void elv_rqhash_add(struct request_queue *q, struct request *rq)
 	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
 	rq->rq_flags |= RQF_HASHED;
 }
+EXPORT_SYMBOL_GPL(elv_rqhash_add);
 
 void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
 {
 	__elv_rqhash_del(rq);
 	elv_rqhash_add(q, rq);
 }
+EXPORT_SYMBOL_GPL(elv_rqhash_reposition);
 
 struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
 {
@@ -289,6 +301,7 @@  struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(elv_rqhash_find);
 
 /*
  * RB-tree support functions for inserting/lookup/removal of requests
@@ -411,6 +424,9 @@  int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 	struct request *__rq;
 	int ret;
 
+	if (WARN_ON_ONCE(e->uses_mq))
+		return ELEVATOR_NO_MERGE;
+
 	/*
 	 * Levels of merges:
 	 * 	nomerges:  No merges at all attempted
@@ -462,6 +478,9 @@  static bool elv_attempt_insert_merge(struct request_queue *q,
 	struct request *__rq;
 	bool ret;
 
+	if (WARN_ON_ONCE(q->elevator && q->elevator->uses_mq))
+		return false;
+
 	if (blk_queue_nomerges(q))
 		return false;
 
@@ -495,7 +514,7 @@  void elv_merged_request(struct request_queue *q, struct request *rq, int type)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_merged_fn)
+	if (!e->uses_mq && e->type->ops.elevator_merged_fn)
 		e->type->ops.elevator_merged_fn(q, rq, type);
 
 	if (type == ELEVATOR_BACK_MERGE)
@@ -508,10 +527,15 @@  void elv_merge_requests(struct request_queue *q, struct request *rq,
 			     struct request *next)
 {
 	struct elevator_queue *e = q->elevator;
-	const int next_sorted = next->rq_flags & RQF_SORTED;
-
-	if (next_sorted && e->type->ops.elevator_merge_req_fn)
-		e->type->ops.elevator_merge_req_fn(q, rq, next);
+	bool next_sorted = false;
+
+	if (e->uses_mq && e->type->mq_ops.requests_merged)
+		e->type->mq_ops.requests_merged(q, rq, next);
+	else if (e->type->ops.elevator_merge_req_fn) {
+		next_sorted = next->rq_flags & RQF_SORTED;
+		if (next_sorted)
+			e->type->ops.elevator_merge_req_fn(q, rq, next);
+	}
 
 	elv_rqhash_reposition(q, rq);
 
@@ -528,6 +552,9 @@  void elv_bio_merged(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
+	if (WARN_ON_ONCE(e->uses_mq))
+		return;
+
 	if (e->type->ops.elevator_bio_merged_fn)
 		e->type->ops.elevator_bio_merged_fn(q, rq, bio);
 }
@@ -682,8 +709,11 @@  struct request *elv_latter_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_latter_req_fn)
+	if (e->uses_mq && e->type->mq_ops.next_request)
+		return e->type->mq_ops.next_request(q, rq);
+	else if (!e->uses_mq && e->type->ops.elevator_latter_req_fn)
 		return e->type->ops.elevator_latter_req_fn(q, rq);
+
 	return NULL;
 }
 
@@ -691,7 +721,9 @@  struct request *elv_former_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_former_req_fn)
+	if (e->uses_mq && e->type->mq_ops.former_request)
+		return e->type->mq_ops.former_request(q, rq);
+	if (!e->uses_mq && e->type->ops.elevator_former_req_fn)
 		return e->type->ops.elevator_former_req_fn(q, rq);
 	return NULL;
 }
@@ -701,6 +733,9 @@  int elv_set_request(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
+	if (WARN_ON_ONCE(e->uses_mq))
+		return 0;
+
 	if (e->type->ops.elevator_set_req_fn)
 		return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask);
 	return 0;
@@ -710,6 +745,9 @@  void elv_put_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
+	if (WARN_ON_ONCE(e->uses_mq))
+		return;
+
 	if (e->type->ops.elevator_put_req_fn)
 		e->type->ops.elevator_put_req_fn(rq);
 }
@@ -718,6 +756,9 @@  int elv_may_queue(struct request_queue *q, unsigned int op)
 {
 	struct elevator_queue *e = q->elevator;
 
+	if (WARN_ON_ONCE(e->uses_mq))
+		return 0;
+
 	if (e->type->ops.elevator_may_queue_fn)
 		return e->type->ops.elevator_may_queue_fn(q, op);
 
@@ -728,6 +769,9 @@  void elv_completed_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
+	if (WARN_ON_ONCE(e->uses_mq))
+		return;
+
 	/*
 	 * request is released from the driver, io must be done
 	 */
@@ -803,7 +847,7 @@  int elv_register_queue(struct request_queue *q)
 		}
 		kobject_uevent(&e->kobj, KOBJ_ADD);
 		e->registered = 1;
-		if (e->type->ops.elevator_registered_fn)
+		if (!e->uses_mq && e->type->ops.elevator_registered_fn)
 			e->type->ops.elevator_registered_fn(q);
 	}
 	return error;
@@ -891,9 +935,14 @@  EXPORT_SYMBOL_GPL(elv_unregister);
 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 {
 	struct elevator_queue *old = q->elevator;
-	bool registered = old->registered;
+	bool old_registered = false;
 	int err;
 
+	if (q->mq_ops) {
+		blk_mq_freeze_queue(q);
+		blk_mq_quiesce_queue(q);
+	}
+
 	/*
 	 * Turn on BYPASS and drain all requests w/ elevator private data.
 	 * Block layer doesn't call into a quiesced elevator - all requests
@@ -901,32 +950,54 @@  static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	 * using INSERT_BACK.  All requests have SOFTBARRIER set and no
 	 * merge happens either.
 	 */
-	blk_queue_bypass_start(q);
+	if (old) {
+		old_registered = old->registered;
+
+		if (!q->mq_ops)
+			blk_queue_bypass_start(q);
 
-	/* unregister and clear all auxiliary data of the old elevator */
-	if (registered)
-		elv_unregister_queue(q);
+		/* unregister and clear all auxiliary data of the old elevator */
+		if (old_registered)
+			elv_unregister_queue(q);
 
-	spin_lock_irq(q->queue_lock);
-	ioc_clear_queue(q);
-	spin_unlock_irq(q->queue_lock);
+		if (q->queue_lock) {
+			spin_lock_irq(q->queue_lock);
+			ioc_clear_queue(q);
+			spin_unlock_irq(q->queue_lock);
+		}
+	}
 
 	/* allocate, init and register new elevator */
-	err = new_e->ops.elevator_init_fn(q, new_e);
-	if (err)
-		goto fail_init;
+	if (new_e) {
+		if (new_e->uses_mq)
+			err = new_e->mq_ops.init_sched(q, new_e);
+		else
+			err = new_e->ops.elevator_init_fn(q, new_e);
+		if (err)
+			goto fail_init;
 
-	if (registered) {
 		err = elv_register_queue(q);
 		if (err)
 			goto fail_register;
-	}
+	} else
+		q->elevator = NULL;
 
 	/* done, kill the old one and finish */
-	elevator_exit(old);
-	blk_queue_bypass_end(q);
+	if (old) {
+		elevator_exit(old);
+		if (!q->mq_ops)
+			blk_queue_bypass_end(q);
+	}
 
-	blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+	if (q->mq_ops) {
+		blk_mq_unfreeze_queue(q);
+		blk_mq_start_stopped_hw_queues(q, true);
+	}
+
+	if (new_e)
+		blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+	else
+		blk_add_trace_msg(q, "elv switch: none");
 
 	return 0;
 
@@ -934,9 +1005,16 @@  static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	elevator_exit(q->elevator);
 fail_init:
 	/* switch failed, restore and re-register old elevator */
-	q->elevator = old;
-	elv_register_queue(q);
-	blk_queue_bypass_end(q);
+	if (old) {
+		q->elevator = old;
+		elv_register_queue(q);
+		if (!q->mq_ops)
+			blk_queue_bypass_end(q);
+	}
+	if (q->mq_ops) {
+		blk_mq_unfreeze_queue(q);
+		blk_mq_start_stopped_hw_queues(q, true);
+	}
 
 	return err;
 }
@@ -949,8 +1027,11 @@  static int __elevator_change(struct request_queue *q, const char *name)
 	char elevator_name[ELV_NAME_MAX];
 	struct elevator_type *e;
 
-	if (!q->elevator)
-		return -ENXIO;
+	/*
+	 * Special case for mq, turn off scheduling
+	 */
+	if (q->mq_ops && !strncmp(name, "none", 4))
+		return elevator_switch(q, NULL);
 
 	strlcpy(elevator_name, name, sizeof(elevator_name));
 	e = elevator_get(strstrip(elevator_name), true);
@@ -959,11 +1040,23 @@  static int __elevator_change(struct request_queue *q, const char *name)
 		return -EINVAL;
 	}
 
-	if (!strcmp(elevator_name, q->elevator->type->elevator_name)) {
+	if (q->elevator &&
+	    !strcmp(elevator_name, q->elevator->type->elevator_name)) {
 		elevator_put(e);
 		return 0;
 	}
 
+	if (!e->uses_mq && q->mq_ops) {
+		printk(KERN_ERR "blk-mq-sched: elv %s does not support mq\n", elevator_name);
+		elevator_put(e);
+		return -EINVAL;
+	}
+	if (e->uses_mq && !q->mq_ops) {
+		printk(KERN_ERR "blk-mq-sched: elv %s is for mq\n", elevator_name);
+		elevator_put(e);
+		return -EINVAL;
+	}
+
 	return elevator_switch(q, e);
 }
 
@@ -985,7 +1078,7 @@  ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 {
 	int ret;
 
-	if (!q->elevator)
+	if (!q->mq_ops || q->request_fn)
 		return count;
 
 	ret = __elevator_change(q, name);
@@ -999,24 +1092,34 @@  ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 ssize_t elv_iosched_show(struct request_queue *q, char *name)
 {
 	struct elevator_queue *e = q->elevator;
-	struct elevator_type *elv;
+	struct elevator_type *elv = NULL;
 	struct elevator_type *__e;
 	int len = 0;
 
-	if (!q->elevator || !blk_queue_stackable(q))
+	if (!blk_queue_stackable(q))
 		return sprintf(name, "none\n");
 
-	elv = e->type;
+	if (!q->elevator)
+		len += sprintf(name+len, "[none] ");
+	else
+		elv = e->type;
 
 	spin_lock(&elv_list_lock);
 	list_for_each_entry(__e, &elv_list, list) {
-		if (!strcmp(elv->elevator_name, __e->elevator_name))
+		if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) {
 			len += sprintf(name+len, "[%s] ", elv->elevator_name);
-		else
+			continue;
+		}
+		if (__e->uses_mq && q->mq_ops)
+			len += sprintf(name+len, "%s ", __e->elevator_name);
+		else if (!__e->uses_mq && !q->mq_ops)
 			len += sprintf(name+len, "%s ", __e->elevator_name);
 	}
 	spin_unlock(&elv_list_lock);
 
+	if (q->mq_ops && q->elevator)
+		len += sprintf(name+len, "none");
+
 	len += sprintf(len+name, "\n");
 	return len;
 }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 87e404aae267..c86b314dde97 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -22,6 +22,7 @@  struct blk_mq_hw_ctx {
 
 	unsigned long		flags;		/* BLK_MQ_F_* flags */
 
+	void			*sched_data;
 	struct request_queue	*queue;
 	struct blk_flush_queue	*fq;
 
@@ -179,7 +180,6 @@  void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
 
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
-void blk_mq_insert_request(struct request *, bool, bool, bool);
 void blk_mq_free_request(struct request *rq);
 void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index b276e9ef0e0b..5d013f2b9071 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -77,6 +77,28 @@  struct elevator_ops
 	elevator_registered_fn *elevator_registered_fn;
 };
 
+struct blk_mq_alloc_data;
+struct blk_mq_hw_ctx;
+
+struct elevator_mq_ops {
+	int (*init_sched)(struct request_queue *, struct elevator_type *);
+	void (*exit_sched)(struct elevator_queue *);
+
+	bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
+	bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *);
+	void (*requests_merged)(struct request_queue *, struct request *, struct request *);
+	struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *);
+	void (*put_request)(struct request *);
+	void (*insert_request)(struct blk_mq_hw_ctx *, struct request *, bool);
+	struct request *(*dispatch_request)(struct blk_mq_hw_ctx *);
+	bool (*has_work)(struct blk_mq_hw_ctx *);
+	void (*completed_request)(struct blk_mq_hw_ctx *, struct request *);
+	void (*started_request)(struct request *);
+	void (*requeue_request)(struct request *);
+	struct request *(*former_request)(struct request_queue *, struct request *);
+	struct request *(*next_request)(struct request_queue *, struct request *);
+};
+
 #define ELV_NAME_MAX	(16)
 
 struct elv_fs_entry {
@@ -94,12 +116,16 @@  struct elevator_type
 	struct kmem_cache *icq_cache;
 
 	/* fields provided by elevator implementation */
-	struct elevator_ops ops;
+	union {
+		struct elevator_ops ops;
+		struct elevator_mq_ops mq_ops;
+	};
 	size_t icq_size;	/* see iocontext.h */
 	size_t icq_align;	/* ditto */
 	struct elv_fs_entry *elevator_attrs;
 	char elevator_name[ELV_NAME_MAX];
 	struct module *elevator_owner;
+	bool uses_mq;
 
 	/* managed by elevator core */
 	char icq_cache_name[ELV_NAME_MAX + 5];	/* elvname + "_io_cq" */
@@ -123,6 +149,7 @@  struct elevator_queue
 	struct kobject kobj;
 	struct mutex sysfs_lock;
 	unsigned int registered:1;
+	unsigned int uses_mq:1;
 	DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
 };