diff mbox

[7/8] mq-deadline: add blk-mq adaptation of the deadline IO scheduler

Message ID 1481933536-12844-8-git-send-email-axboe@fb.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jens Axboe Dec. 17, 2016, 12:12 a.m. UTC
This is basically identical to deadline-iosched, except it registers
as a MQ capable scheduler. This is still a single queue design.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/Kconfig.iosched |   6 +
 block/Makefile        |   1 +
 block/mq-deadline.c   | 649 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 656 insertions(+)
 create mode 100644 block/mq-deadline.c

Comments

Paolo Valente Dec. 20, 2016, 9:34 a.m. UTC | #1
> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe <axboe@fb.com> ha scritto:
> 
> This is basically identical to deadline-iosched, except it registers
> as a MQ capable scheduler. This is still a single queue design.
> 
> Signed-off-by: Jens Axboe <axboe@fb.com>
> ...
> +
> +static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
> +{
> +	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
> +
> +	return !list_empty_careful(&dd->dispatch) ||
> +		!list_empty_careful(&dd->fifo_list[0]) ||
> +		!list_empty_careful(&dd->fifo_list[1]);

Just a request for clarification: if I'm not mistaken,
list_empty_careful can be used safely only if the only possible other
concurrent access is a delete.  Or am I missing something?

If the above constraint does hold, then how are we guaranteed that it
is met?  My doubt arises from, e.g., the possible concurrent list_add
from dd_insert_request.

Thanks,
Paolo

> +}
> +
> +/*
> + * sysfs parts below
> + */
> +static ssize_t
> +deadline_var_show(int var, char *page)
> +{
> +	return sprintf(page, "%d\n", var);
> +}
> +
> +static ssize_t
> +deadline_var_store(int *var, const char *page, size_t count)
> +{
> +	char *p = (char *) page;
> +
> +	*var = simple_strtol(p, &p, 10);
> +	return count;
> +}
> +
> +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
> +static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
> +{									\
> +	struct deadline_data *dd = e->elevator_data;			\
> +	int __data = __VAR;						\
> +	if (__CONV)							\
> +		__data = jiffies_to_msecs(__data);			\
> +	return deadline_var_show(__data, (page));			\
> +}
> +SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
> +SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
> +SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
> +SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
> +SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
> +#undef SHOW_FUNCTION
> +
> +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
> +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
> +{									\
> +	struct deadline_data *dd = e->elevator_data;			\
> +	int __data;							\
> +	int ret = deadline_var_store(&__data, (page), count);		\
> +	if (__data < (MIN))						\
> +		__data = (MIN);						\
> +	else if (__data > (MAX))					\
> +		__data = (MAX);						\
> +	if (__CONV)							\
> +		*(__PTR) = msecs_to_jiffies(__data);			\
> +	else								\
> +		*(__PTR) = __data;					\
> +	return ret;							\
> +}
> +STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
> +STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
> +STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
> +STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
> +STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
> +#undef STORE_FUNCTION
> +
> +#define DD_ATTR(name) \
> +	__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
> +				      deadline_##name##_store)
> +
> +static struct elv_fs_entry deadline_attrs[] = {
> +	DD_ATTR(read_expire),
> +	DD_ATTR(write_expire),
> +	DD_ATTR(writes_starved),
> +	DD_ATTR(front_merges),
> +	DD_ATTR(fifo_batch),
> +	__ATTR_NULL
> +};
> +
> +static struct elevator_type mq_deadline = {
> +	.ops.mq = {
> +		.get_request		= dd_get_request,
> +		.put_request		= dd_put_request,
> +		.insert_requests	= dd_insert_requests,
> +		.dispatch_requests	= dd_dispatch_requests,
> +		.completed_request	= dd_completed_request,
> +		.next_request		= elv_rb_latter_request,
> +		.former_request		= elv_rb_former_request,
> +		.bio_merge		= dd_bio_merge,
> +		.request_merge		= dd_request_merge,
> +		.requests_merged	= dd_merged_requests,
> +		.request_merged		= dd_request_merged,
> +		.has_work		= dd_has_work,
> +		.init_sched		= dd_init_queue,
> +		.exit_sched		= dd_exit_queue,
> +	},
> +
> +	.uses_mq	= true,
> +	.elevator_attrs = deadline_attrs,
> +	.elevator_name = "mq-deadline",
> +	.elevator_owner = THIS_MODULE,
> +};
> +
> +static int __init deadline_init(void)
> +{
> +	if (!queue_depth) {
> +		pr_err("mq-deadline: queue depth must be > 0\n");
> +		return -EINVAL;
> +	}
> +	return elv_register(&mq_deadline);
> +}
> +
> +static void __exit deadline_exit(void)
> +{
> +	elv_unregister(&mq_deadline);
> +}
> +
> +module_init(deadline_init);
> +module_exit(deadline_exit);
> +
> +MODULE_AUTHOR("Jens Axboe");
> +MODULE_LICENSE("GPL");
> +MODULE_DESCRIPTION("MQ deadline IO scheduler");
> -- 
> 2.7.4
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jens Axboe Dec. 20, 2016, 3:46 p.m. UTC | #2
On 12/20/2016 02:34 AM, Paolo Valente wrote:
> 
>> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe <axboe@fb.com> ha scritto:
>>
>> This is basically identical to deadline-iosched, except it registers
>> as a MQ capable scheduler. This is still a single queue design.
>>
>> Signed-off-by: Jens Axboe <axboe@fb.com>
>> ...
>> +
>> +static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
>> +{
>> +	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
>> +
>> +	return !list_empty_careful(&dd->dispatch) ||
>> +		!list_empty_careful(&dd->fifo_list[0]) ||
>> +		!list_empty_careful(&dd->fifo_list[1]);
> 
> Just a request for clarification: if I'm not mistaken,
> list_empty_careful can be used safely only if the only possible other
> concurrent access is a delete.  Or am I missing something?

We can "solve" that with memory barriers. For now, it's safe to ignore
on your end.
Bart Van Assche Dec. 21, 2016, 11:59 a.m. UTC | #3
On 12/17/2016 01:12 AM, Jens Axboe wrote:
> +static bool dd_put_request(struct request *rq)

> +{

> +	/*

> +	 * If it's a real request, we just have to

free it. For a shadow
> +	 * request, we should only free it if we haven't started it. A

> +	 * started request is mapped to a real one, and the

real one will
> +	 * free it. We can get here with request merges, since we then

> +	 * free the request before we start/issue it.

> +	 */

> +	

if (!blk_mq_sched_rq_is_shadow(rq))
> +		return false;

> +

> +	if (!(rq->rq_flags & RQF_STARTED)) {

> +		struct request_queue *q

= rq->q;
> +		struct deadline_data *dd = q->elevator->elevator_data;

> +

> +		/*

> +		 * IO completion would normally do

this, but if we merge
> +		 * and free before we issue the request, drop both the

> +		 * tag and queue ref

> +		 */

> +	

	blk_mq_sched_free_shadow_request(dd->tags, rq);
> +		blk_queue_exit(q);

> +	}

> +

> +	return true;

> +}


Hello Jens,

Since this patch is the first patch that introduces a call to blk_queue_exit()
from a module other than the block layer core, shouldn't this patch export the
blk_queue_exit() function? An attempt to build mq-deadline as a module resulted
in the following:

ERROR: "blk_queue_exit" [block/mq-deadline.ko] undefined!
make[1]: *** [scripts/Makefile.modpost:91: __modpost] Error 1
make: *** [Makefile:1198: modules] Error 2
Execution failed: make all

Bart.
Jens Axboe Dec. 21, 2016, 2:22 p.m. UTC | #4
On 12/21/2016 04:59 AM, Bart Van Assche wrote:
> Since this patch is the first patch that introduces a call to
> blk_queue_exit() from a module other than the block layer core,
> shouldn't this patch export the blk_queue_exit() function? An attempt
> to build mq-deadline as a module resulted in the following:
> 
> ERROR: "blk_queue_exit" [block/mq-deadline.ko] undefined!
> make[1]: *** [scripts/Makefile.modpost:91: __modpost] Error 1
> make: *** [Makefile:1198: modules] Error 2
> Execution failed: make all

Yes, it should. I'll make the export for now, I want to move that check
and free/drop into the generic code so that the schedulers don't have to
worry about it.
Paolo Valente Dec. 22, 2016, 4:07 p.m. UTC | #5
> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe <axboe@fb.com> ha scritto:
> 
> This is basically identical to deadline-iosched, except it registers
> as a MQ capable scheduler. This is still a single queue design.
> 
> Signed-off-by: Jens Axboe <axboe@fb.com>
> ---

...

> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> new file mode 100644
> index 000000000000..3cb9de21ab21
> --- /dev/null
> +++ b/block/mq-deadline.c
> ...
> +/*
> + * remove rq from rbtree and fifo.
> + */
> +static void deadline_remove_request(struct request_queue *q, struct request *rq)
> +{
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +
> +	list_del_init(&rq->queuelist);
> +
> +	/*
> +	 * We might not be on the rbtree, if we are doing an insert merge
> +	 */
> +	if (!RB_EMPTY_NODE(&rq->rb_node))
> +		deadline_del_rq_rb(dd, rq);
> +

I've been scratching my head on the last three instructions, but at no
avail.  If I understand correctly, the
list_del_init(&rq->queue list);
removes rq from the fifo list.  But, if so, I don't understand how it
could be possible that rq has not been added to the rb_tree too.

Another interpretation that I tried is that the above three lines
handle correctly the following case where rq has not been inserted at
all into deadline fifo queue and rb tree: when dd_insert_request was
executed for rq, blk_mq_sched_try_insert_merge succeeded.  Yet, the
list_del_init(&rq->queue list);
does not seem to make sense.

Could you please shed some light on this for me?

Thanks,
Paolo

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Valente Dec. 22, 2016, 4:49 p.m. UTC | #6
> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe <axboe@fb.com> ha scritto:
> 
> This is basically identical to deadline-iosched, except it registers
> as a MQ capable scheduler. This is still a single queue design.
> 

One last question (for today ...):in mq-deadline there are no
"schedule dispatch" or "unplug work" functions.  In blk, CFQ and BFQ
do these schedules/unplugs in a lot of cases.  What's the right
replacement?  Just doing nothing?

Thanks,
Paolo

> Signed-off-by: Jens Axboe <axboe@fb.com>
> ---
> block/Kconfig.iosched |   6 +
> block/Makefile        |   1 +
> block/mq-deadline.c   | 649 ++++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 656 insertions(+)
> create mode 100644 block/mq-deadline.c
> 
> diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
> index 421bef9c4c48..490ef2850fae 100644
> --- a/block/Kconfig.iosched
> +++ b/block/Kconfig.iosched
> @@ -32,6 +32,12 @@ config IOSCHED_CFQ
> 
> 	  This is the default I/O scheduler.
> 
> +config MQ_IOSCHED_DEADLINE
> +	tristate "MQ deadline I/O scheduler"
> +	default y
> +	---help---
> +	  MQ version of the deadline IO scheduler.
> +
> config CFQ_GROUP_IOSCHED
> 	bool "CFQ Group Scheduling support"
> 	depends on IOSCHED_CFQ && BLK_CGROUP
> diff --git a/block/Makefile b/block/Makefile
> index 2eee9e1bb6db..3ee0abd7205a 100644
> --- a/block/Makefile
> +++ b/block/Makefile
> @@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
> obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
> obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
> obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
> +obj-$(CONFIG_MQ_IOSCHED_DEADLINE)	+= mq-deadline.o
> 
> obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
> obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> new file mode 100644
> index 000000000000..3cb9de21ab21
> --- /dev/null
> +++ b/block/mq-deadline.c
> @@ -0,0 +1,649 @@
> +/*
> + *  MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
> + *  for the blk-mq scheduling framework
> + *
> + *  Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
> + */
> +#include <linux/kernel.h>
> +#include <linux/fs.h>
> +#include <linux/blkdev.h>
> +#include <linux/blk-mq.h>
> +#include <linux/elevator.h>
> +#include <linux/bio.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/init.h>
> +#include <linux/compiler.h>
> +#include <linux/rbtree.h>
> +#include <linux/sbitmap.h>
> +
> +#include "blk.h"
> +#include "blk-mq.h"
> +#include "blk-mq-tag.h"
> +#include "blk-mq-sched.h"
> +
> +static unsigned int queue_depth = 256;
> +module_param(queue_depth, uint, 0644);
> +MODULE_PARM_DESC(queue_depth, "Use this value as the scheduler queue depth");
> +
> +/*
> + * See Documentation/block/deadline-iosched.txt
> + */
> +static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
> +static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
> +static const int writes_starved = 2;    /* max times reads can starve a write */
> +static const int fifo_batch = 16;       /* # of sequential requests treated as one
> +				     by the above parameters. For throughput. */
> +
> +struct deadline_data {
> +	/*
> +	 * run time data
> +	 */
> +
> +	/*
> +	 * requests (deadline_rq s) are present on both sort_list and fifo_list
> +	 */
> +	struct rb_root sort_list[2];
> +	struct list_head fifo_list[2];
> +
> +	/*
> +	 * next in sort order. read, write or both are NULL
> +	 */
> +	struct request *next_rq[2];
> +	unsigned int batching;		/* number of sequential requests made */
> +	unsigned int starved;		/* times reads have starved writes */
> +
> +	/*
> +	 * settings that change how the i/o scheduler behaves
> +	 */
> +	int fifo_expire[2];
> +	int fifo_batch;
> +	int writes_starved;
> +	int front_merges;
> +
> +	spinlock_t lock;
> +	struct list_head dispatch;
> +	struct blk_mq_tags *tags;
> +	atomic_t wait_index;
> +};
> +
> +static inline struct rb_root *
> +deadline_rb_root(struct deadline_data *dd, struct request *rq)
> +{
> +	return &dd->sort_list[rq_data_dir(rq)];
> +}
> +
> +/*
> + * get the request after `rq' in sector-sorted order
> + */
> +static inline struct request *
> +deadline_latter_request(struct request *rq)
> +{
> +	struct rb_node *node = rb_next(&rq->rb_node);
> +
> +	if (node)
> +		return rb_entry_rq(node);
> +
> +	return NULL;
> +}
> +
> +static void
> +deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
> +{
> +	struct rb_root *root = deadline_rb_root(dd, rq);
> +
> +	elv_rb_add(root, rq);
> +}
> +
> +static inline void
> +deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
> +{
> +	const int data_dir = rq_data_dir(rq);
> +
> +	if (dd->next_rq[data_dir] == rq)
> +		dd->next_rq[data_dir] = deadline_latter_request(rq);
> +
> +	elv_rb_del(deadline_rb_root(dd, rq), rq);
> +}
> +
> +/*
> + * remove rq from rbtree and fifo.
> + */
> +static void deadline_remove_request(struct request_queue *q, struct request *rq)
> +{
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +
> +	list_del_init(&rq->queuelist);
> +
> +	/*
> +	 * We might not be on the rbtree, if we are doing an insert merge
> +	 */
> +	if (!RB_EMPTY_NODE(&rq->rb_node))
> +		deadline_del_rq_rb(dd, rq);
> +
> +	elv_rqhash_del(q, rq);
> +	if (q->last_merge == rq)
> +		q->last_merge = NULL;
> +}
> +
> +static void dd_request_merged(struct request_queue *q, struct request *req,
> +			      int type)
> +{
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +
> +	/*
> +	 * if the merge was a front merge, we need to reposition request
> +	 */
> +	if (type == ELEVATOR_FRONT_MERGE) {
> +		elv_rb_del(deadline_rb_root(dd, req), req);
> +		deadline_add_rq_rb(dd, req);
> +	}
> +}
> +
> +static void dd_merged_requests(struct request_queue *q, struct request *req,
> +			       struct request *next)
> +{
> +	/*
> +	 * if next expires before rq, assign its expire time to rq
> +	 * and move into next position (next will be deleted) in fifo
> +	 */
> +	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
> +		if (time_before((unsigned long)next->fifo_time,
> +				(unsigned long)req->fifo_time)) {
> +			list_move(&req->queuelist, &next->queuelist);
> +			req->fifo_time = next->fifo_time;
> +		}
> +	}
> +
> +	/*
> +	 * kill knowledge of next, this one is a goner
> +	 */
> +	deadline_remove_request(q, next);
> +}
> +
> +/*
> + * move an entry to dispatch queue
> + */
> +static void
> +deadline_move_request(struct deadline_data *dd, struct request *rq)
> +{
> +	const int data_dir = rq_data_dir(rq);
> +
> +	dd->next_rq[READ] = NULL;
> +	dd->next_rq[WRITE] = NULL;
> +	dd->next_rq[data_dir] = deadline_latter_request(rq);
> +
> +	/*
> +	 * take it off the sort and fifo list
> +	 */
> +	deadline_remove_request(rq->q, rq);
> +}
> +
> +/*
> + * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
> + * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
> + */
> +static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
> +{
> +	struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
> +
> +	/*
> +	 * rq is expired!
> +	 */
> +	if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
> +		return 1;
> +
> +	return 0;
> +}
> +
> +/*
> + * deadline_dispatch_requests selects the best request according to
> + * read/write expire, fifo_batch, etc
> + */
> +static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
> +{
> +	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
> +	struct request *rq;
> +	bool reads, writes;
> +	int data_dir;
> +
> +	spin_lock(&dd->lock);
> +
> +	if (!list_empty(&dd->dispatch)) {
> +		rq = list_first_entry(&dd->dispatch, struct request, queuelist);
> +		list_del_init(&rq->queuelist);
> +		goto done;
> +	}
> +
> +	reads = !list_empty(&dd->fifo_list[READ]);
> +	writes = !list_empty(&dd->fifo_list[WRITE]);
> +
> +	/*
> +	 * batches are currently reads XOR writes
> +	 */
> +	if (dd->next_rq[WRITE])
> +		rq = dd->next_rq[WRITE];
> +	else
> +		rq = dd->next_rq[READ];
> +
> +	if (rq && dd->batching < dd->fifo_batch)
> +		/* we have a next request are still entitled to batch */
> +		goto dispatch_request;
> +
> +	/*
> +	 * at this point we are not running a batch. select the appropriate
> +	 * data direction (read / write)
> +	 */
> +
> +	if (reads) {
> +		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
> +
> +		if (writes && (dd->starved++ >= dd->writes_starved))
> +			goto dispatch_writes;
> +
> +		data_dir = READ;
> +
> +		goto dispatch_find_request;
> +	}
> +
> +	/*
> +	 * there are either no reads or writes have been starved
> +	 */
> +
> +	if (writes) {
> +dispatch_writes:
> +		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
> +
> +		dd->starved = 0;
> +
> +		data_dir = WRITE;
> +
> +		goto dispatch_find_request;
> +	}
> +
> +	spin_unlock(&dd->lock);
> +	return NULL;
> +
> +dispatch_find_request:
> +	/*
> +	 * we are not running a batch, find best request for selected data_dir
> +	 */
> +	if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
> +		/*
> +		 * A deadline has expired, the last request was in the other
> +		 * direction, or we have run out of higher-sectored requests.
> +		 * Start again from the request with the earliest expiry time.
> +		 */
> +		rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
> +	} else {
> +		/*
> +		 * The last req was the same dir and we have a next request in
> +		 * sort order. No expired requests so continue on from here.
> +		 */
> +		rq = dd->next_rq[data_dir];
> +	}
> +
> +	dd->batching = 0;
> +
> +dispatch_request:
> +	/*
> +	 * rq is the selected appropriate request.
> +	 */
> +	dd->batching++;
> +	deadline_move_request(dd, rq);
> +done:
> +	rq->rq_flags |= RQF_STARTED;
> +	spin_unlock(&dd->lock);
> +	return rq;
> +}
> +
> +static void dd_dispatch_requests(struct blk_mq_hw_ctx *hctx,
> +				 struct list_head *rq_list)
> +{
> +	blk_mq_sched_dispatch_shadow_requests(hctx, rq_list, __dd_dispatch_request);
> +}
> +
> +static void dd_exit_queue(struct elevator_queue *e)
> +{
> +	struct deadline_data *dd = e->elevator_data;
> +
> +	BUG_ON(!list_empty(&dd->fifo_list[READ]));
> +	BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
> +
> +	blk_mq_sched_free_requests(dd->tags);
> +	kfree(dd);
> +}
> +
> +/*
> + * initialize elevator private data (deadline_data).
> + */
> +static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
> +{
> +	struct deadline_data *dd;
> +	struct elevator_queue *eq;
> +
> +	eq = elevator_alloc(q, e);
> +	if (!eq)
> +		return -ENOMEM;
> +
> +	dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
> +	if (!dd) {
> +		kobject_put(&eq->kobj);
> +		return -ENOMEM;
> +	}
> +	eq->elevator_data = dd;
> +
> +	dd->tags = blk_mq_sched_alloc_requests(queue_depth, q->node);
> +	if (!dd->tags) {
> +		kfree(dd);
> +		kobject_put(&eq->kobj);
> +		return -ENOMEM;
> +	}
> +
> +	INIT_LIST_HEAD(&dd->fifo_list[READ]);
> +	INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
> +	dd->sort_list[READ] = RB_ROOT;
> +	dd->sort_list[WRITE] = RB_ROOT;
> +	dd->fifo_expire[READ] = read_expire;
> +	dd->fifo_expire[WRITE] = write_expire;
> +	dd->writes_starved = writes_starved;
> +	dd->front_merges = 1;
> +	dd->fifo_batch = fifo_batch;
> +	spin_lock_init(&dd->lock);
> +	INIT_LIST_HEAD(&dd->dispatch);
> +	atomic_set(&dd->wait_index, 0);
> +
> +	q->elevator = eq;
> +	return 0;
> +}
> +
> +static int dd_request_merge(struct request_queue *q, struct request **rq,
> +			    struct bio *bio)
> +{
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +	sector_t sector = bio_end_sector(bio);
> +	struct request *__rq;
> +
> +	if (!dd->front_merges)
> +		return ELEVATOR_NO_MERGE;
> +
> +	__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
> +	if (__rq) {
> +		BUG_ON(sector != blk_rq_pos(__rq));
> +
> +		if (elv_bio_merge_ok(__rq, bio)) {
> +			*rq = __rq;
> +			return ELEVATOR_FRONT_MERGE;
> +		}
> +	}
> +
> +	return ELEVATOR_NO_MERGE;
> +}
> +
> +static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
> +{
> +	struct request_queue *q = hctx->queue;
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +	int ret;
> +
> +	spin_lock(&dd->lock);
> +	ret = blk_mq_sched_try_merge(q, bio);
> +	spin_unlock(&dd->lock);
> +
> +	return ret;
> +}
> +
> +/*
> + * add rq to rbtree and fifo
> + */
> +static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
> +			      bool at_head)
> +{
> +	struct request_queue *q = hctx->queue;
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +	const int data_dir = rq_data_dir(rq);
> +
> +	if (blk_mq_sched_try_insert_merge(q, rq))
> +		return;
> +
> +	blk_mq_sched_request_inserted(rq);
> +
> +	/*
> +	 * If we're trying to insert a real request, just send it directly
> +	 * to the hardware dispatch list. This only happens for a requeue,
> +	 * or FUA/FLUSH requests.
> +	 */
> +	if (!blk_mq_sched_rq_is_shadow(rq)) {
> +		spin_lock(&hctx->lock);
> +		list_add_tail(&rq->queuelist, &hctx->dispatch);
> +		spin_unlock(&hctx->lock);
> +		return;
> +	}
> +
> +	if (at_head || rq->cmd_type != REQ_TYPE_FS) {
> +		if (at_head)
> +			list_add(&rq->queuelist, &dd->dispatch);
> +		else
> +			list_add_tail(&rq->queuelist, &dd->dispatch);
> +	} else {
> +		deadline_add_rq_rb(dd, rq);
> +
> +		if (rq_mergeable(rq)) {
> +			elv_rqhash_add(q, rq);
> +			if (!q->last_merge)
> +				q->last_merge = rq;
> +		}
> +
> +		/*
> +		 * set expire time and add to fifo list
> +		 */
> +		rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
> +		list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
> +	}
> +}
> +
> +static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
> +			       struct list_head *list, bool at_head)
> +{
> +	struct request_queue *q = hctx->queue;
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +
> +	spin_lock(&dd->lock);
> +	while (!list_empty(list)) {
> +		struct request *rq;
> +
> +		rq = list_first_entry(list, struct request, queuelist);
> +		list_del_init(&rq->queuelist);
> +		dd_insert_request(hctx, rq, at_head);
> +	}
> +	spin_unlock(&dd->lock);
> +}
> +
> +static struct request *dd_get_request(struct request_queue *q, unsigned int op,
> +				      struct blk_mq_alloc_data *data)
> +{
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +	struct request *rq;
> +
> +	/*
> +	 * The flush machinery intercepts before we insert the request. As
> +	 * a work-around, just hand it back a real request.
> +	 */
> +	if (unlikely(op & (REQ_PREFLUSH | REQ_FUA)))
> +		rq = __blk_mq_alloc_request(data, op);
> +	else {
> +		rq = blk_mq_sched_alloc_shadow_request(q, data, dd->tags, &dd->wait_index);
> +		if (rq)
> +			blk_mq_rq_ctx_init(q, data->ctx, rq, op);
> +	}
> +
> +	return rq;
> +}
> +
> +static bool dd_put_request(struct request *rq)
> +{
> +	/*
> +	 * If it's a real request, we just have to free it. For a shadow
> +	 * request, we should only free it if we haven't started it. A
> +	 * started request is mapped to a real one, and the real one will
> +	 * free it. We can get here with request merges, since we then
> +	 * free the request before we start/issue it.
> +	 */
> +	if (!blk_mq_sched_rq_is_shadow(rq))
> +		return false;
> +
> +	if (!(rq->rq_flags & RQF_STARTED)) {
> +		struct request_queue *q = rq->q;
> +		struct deadline_data *dd = q->elevator->elevator_data;
> +
> +		/*
> +		 * IO completion would normally do this, but if we merge
> +		 * and free before we issue the request, drop both the
> +		 * tag and queue ref
> +		 */
> +		blk_mq_sched_free_shadow_request(dd->tags, rq);
> +		blk_queue_exit(q);
> +	}
> +
> +	return true;
> +}
> +
> +static void dd_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
> +{
> +	struct request *sched_rq = rq->end_io_data;
> +
> +	/*
> +	 * sched_rq can be NULL, if we haven't setup the shadow yet
> +	 * because we failed getting one.
> +	 */
> +	if (sched_rq) {
> +		struct deadline_data *dd = hctx->queue->elevator->elevator_data;
> +
> +		blk_mq_sched_free_shadow_request(dd->tags, sched_rq);
> +		blk_mq_start_stopped_hw_queue(hctx, true);
> +	}
> +}
> +
> +static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
> +{
> +	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
> +
> +	return !list_empty_careful(&dd->dispatch) ||
> +		!list_empty_careful(&dd->fifo_list[0]) ||
> +		!list_empty_careful(&dd->fifo_list[1]);
> +}
> +
> +/*
> + * sysfs parts below
> + */
> +static ssize_t
> +deadline_var_show(int var, char *page)
> +{
> +	return sprintf(page, "%d\n", var);
> +}
> +
> +static ssize_t
> +deadline_var_store(int *var, const char *page, size_t count)
> +{
> +	char *p = (char *) page;
> +
> +	*var = simple_strtol(p, &p, 10);
> +	return count;
> +}
> +
> +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
> +static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
> +{									\
> +	struct deadline_data *dd = e->elevator_data;			\
> +	int __data = __VAR;						\
> +	if (__CONV)							\
> +		__data = jiffies_to_msecs(__data);			\
> +	return deadline_var_show(__data, (page));			\
> +}
> +SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
> +SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
> +SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
> +SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
> +SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
> +#undef SHOW_FUNCTION
> +
> +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
> +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
> +{									\
> +	struct deadline_data *dd = e->elevator_data;			\
> +	int __data;							\
> +	int ret = deadline_var_store(&__data, (page), count);		\
> +	if (__data < (MIN))						\
> +		__data = (MIN);						\
> +	else if (__data > (MAX))					\
> +		__data = (MAX);						\
> +	if (__CONV)							\
> +		*(__PTR) = msecs_to_jiffies(__data);			\
> +	else								\
> +		*(__PTR) = __data;					\
> +	return ret;							\
> +}
> +STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
> +STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
> +STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
> +STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
> +STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
> +#undef STORE_FUNCTION
> +
> +#define DD_ATTR(name) \
> +	__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
> +				      deadline_##name##_store)
> +
> +static struct elv_fs_entry deadline_attrs[] = {
> +	DD_ATTR(read_expire),
> +	DD_ATTR(write_expire),
> +	DD_ATTR(writes_starved),
> +	DD_ATTR(front_merges),
> +	DD_ATTR(fifo_batch),
> +	__ATTR_NULL
> +};
> +
> +static struct elevator_type mq_deadline = {
> +	.ops.mq = {
> +		.get_request		= dd_get_request,
> +		.put_request		= dd_put_request,
> +		.insert_requests	= dd_insert_requests,
> +		.dispatch_requests	= dd_dispatch_requests,
> +		.completed_request	= dd_completed_request,
> +		.next_request		= elv_rb_latter_request,
> +		.former_request		= elv_rb_former_request,
> +		.bio_merge		= dd_bio_merge,
> +		.request_merge		= dd_request_merge,
> +		.requests_merged	= dd_merged_requests,
> +		.request_merged		= dd_request_merged,
> +		.has_work		= dd_has_work,
> +		.init_sched		= dd_init_queue,
> +		.exit_sched		= dd_exit_queue,
> +	},
> +
> +	.uses_mq	= true,
> +	.elevator_attrs = deadline_attrs,
> +	.elevator_name = "mq-deadline",
> +	.elevator_owner = THIS_MODULE,
> +};
> +
> +static int __init deadline_init(void)
> +{
> +	if (!queue_depth) {
> +		pr_err("mq-deadline: queue depth must be > 0\n");
> +		return -EINVAL;
> +	}
> +	return elv_register(&mq_deadline);
> +}
> +
> +static void __exit deadline_exit(void)
> +{
> +	elv_unregister(&mq_deadline);
> +}
> +
> +module_init(deadline_init);
> +module_exit(deadline_exit);
> +
> +MODULE_AUTHOR("Jens Axboe");
> +MODULE_LICENSE("GPL");
> +MODULE_DESCRIPTION("MQ deadline IO scheduler");
> -- 
> 2.7.4
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jens Axboe Jan. 17, 2017, 2:47 a.m. UTC | #7
On 12/22/2016 09:07 AM, Paolo Valente wrote:
> 
>> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe <axboe@fb.com> ha scritto:
>>
>> This is basically identical to deadline-iosched, except it registers
>> as a MQ capable scheduler. This is still a single queue design.
>>
>> Signed-off-by: Jens Axboe <axboe@fb.com>
>> ---
> 
> ...
> 
>> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
>> new file mode 100644
>> index 000000000000..3cb9de21ab21
>> --- /dev/null
>> +++ b/block/mq-deadline.c
>> ...
>> +/*
>> + * remove rq from rbtree and fifo.
>> + */
>> +static void deadline_remove_request(struct request_queue *q, struct request *rq)
>> +{
>> +	struct deadline_data *dd = q->elevator->elevator_data;
>> +
>> +	list_del_init(&rq->queuelist);
>> +
>> +	/*
>> +	 * We might not be on the rbtree, if we are doing an insert merge
>> +	 */
>> +	if (!RB_EMPTY_NODE(&rq->rb_node))
>> +		deadline_del_rq_rb(dd, rq);
>> +
> 
> I've been scratching my head on the last three instructions, but at no
> avail.  If I understand correctly, the
> list_del_init(&rq->queue list);
> removes rq from the fifo list.  But, if so, I don't understand how it
> could be possible that rq has not been added to the rb_tree too.
>
> Another interpretation that I tried is that the above three lines
> handle correctly the following case where rq has not been inserted at
> all into deadline fifo queue and rb tree: when dd_insert_request was
> executed for rq, blk_mq_sched_try_insert_merge succeeded.  Yet, the
> list_del_init(&rq->queue list);
> does not seem to make sense.
> 
> Could you please shed some light on this for me?

I think you are correct, we don't need to touch ->queuelist for the case
where RB_EMPTY_NODE() is true. Minor detail, the list is already empty,
so it does no harm.
Jens Axboe Jan. 17, 2017, 2:47 a.m. UTC | #8
On 12/22/2016 09:49 AM, Paolo Valente wrote:
> 
>> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe <axboe@fb.com> ha scritto:
>>
>> This is basically identical to deadline-iosched, except it registers
>> as a MQ capable scheduler. This is still a single queue design.
>>
> 
> One last question (for today ...):in mq-deadline there are no
> "schedule dispatch" or "unplug work" functions.  In blk, CFQ and BFQ
> do these schedules/unplugs in a lot of cases.  What's the right
> replacement?  Just doing nothing?

You just use blk_mq_run_hw_queue() or variants thereof to kick off queue
runs.
Paolo Valente Jan. 20, 2017, 11:07 a.m. UTC | #9
> Il giorno 17 gen 2017, alle ore 03:47, Jens Axboe <axboe@fb.com> ha scritto:
> 
> On 12/22/2016 09:49 AM, Paolo Valente wrote:
>> 
>>> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe <axboe@fb.com> ha scritto:
>>> 
>>> This is basically identical to deadline-iosched, except it registers
>>> as a MQ capable scheduler. This is still a single queue design.
>>> 
>> 
>> One last question (for today ...):in mq-deadline there are no
>> "schedule dispatch" or "unplug work" functions.  In blk, CFQ and BFQ
>> do these schedules/unplugs in a lot of cases.  What's the right
>> replacement?  Just doing nothing?
> 
> You just use blk_mq_run_hw_queue() or variants thereof to kick off queue
> runs.
> 

Hi Jens,
I'm working on this right now.  I have a pair of quick questions about
performance.

In the blk version of bfq, if the in-service bfq_queue happen to have
no more budget when the bfq dispatch function is invoked, then bfq:
returns no request (NULL), immediately expires the in-service
bfq_queue, and schedules a new dispatch.  The third step is taken so
that, if other bfq_queues have requests, then a new in-service
bfq_queue will be selected on the upcoming new dispatch, and a new
request will be provided right away.

My questions are: is this dispatch-schedule step still needed with
blk-mq, to avoid a stall?  If it is not needed to avoid a stall, would
it still be needed to boost throughput, because it would force an
immediate, next dispatch?

BTW, bfq-mq survived its first request completion.  I will provide you
with a link to a github branch as soon as bfq-mq seems able to stand
up with a minimal workload.

Thanks,
Paolo

> -- 
> Jens Axboe
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Valente Jan. 20, 2017, 1:14 p.m. UTC | #10
> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe <axboe@fb.com> ha scritto:
> 
> This is basically identical to deadline-iosched, except it registers
> as a MQ capable scheduler. This is still a single queue design.
> 

Jens,
no spin_lock_irq* in the code.  So, also request dispatches are
guaranteed to never be executed in IRQ context?  I'm asking this
question to understand whether I'm missing something that, even in
BFQ, would somehow allow me to not disable irqs in critical sections,
even if there is the slice_idle-expiration handler.  Be patient with
my ignorance.

Thanks,
Paolo

> Signed-off-by: Jens Axboe <axboe@fb.com>
> ---
> block/Kconfig.iosched |   6 +
> block/Makefile        |   1 +
> block/mq-deadline.c   | 649 ++++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 656 insertions(+)
> create mode 100644 block/mq-deadline.c
> 
> diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
> index 421bef9c4c48..490ef2850fae 100644
> --- a/block/Kconfig.iosched
> +++ b/block/Kconfig.iosched
> @@ -32,6 +32,12 @@ config IOSCHED_CFQ
> 
> 	  This is the default I/O scheduler.
> 
> +config MQ_IOSCHED_DEADLINE
> +	tristate "MQ deadline I/O scheduler"
> +	default y
> +	---help---
> +	  MQ version of the deadline IO scheduler.
> +
> config CFQ_GROUP_IOSCHED
> 	bool "CFQ Group Scheduling support"
> 	depends on IOSCHED_CFQ && BLK_CGROUP
> diff --git a/block/Makefile b/block/Makefile
> index 2eee9e1bb6db..3ee0abd7205a 100644
> --- a/block/Makefile
> +++ b/block/Makefile
> @@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
> obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
> obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
> obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
> +obj-$(CONFIG_MQ_IOSCHED_DEADLINE)	+= mq-deadline.o
> 
> obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
> obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> new file mode 100644
> index 000000000000..3cb9de21ab21
> --- /dev/null
> +++ b/block/mq-deadline.c
> @@ -0,0 +1,649 @@
> +/*
> + *  MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
> + *  for the blk-mq scheduling framework
> + *
> + *  Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
> + */
> +#include <linux/kernel.h>
> +#include <linux/fs.h>
> +#include <linux/blkdev.h>
> +#include <linux/blk-mq.h>
> +#include <linux/elevator.h>
> +#include <linux/bio.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/init.h>
> +#include <linux/compiler.h>
> +#include <linux/rbtree.h>
> +#include <linux/sbitmap.h>
> +
> +#include "blk.h"
> +#include "blk-mq.h"
> +#include "blk-mq-tag.h"
> +#include "blk-mq-sched.h"
> +
> +static unsigned int queue_depth = 256;
> +module_param(queue_depth, uint, 0644);
> +MODULE_PARM_DESC(queue_depth, "Use this value as the scheduler queue depth");
> +
> +/*
> + * See Documentation/block/deadline-iosched.txt
> + */
> +static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
> +static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
> +static const int writes_starved = 2;    /* max times reads can starve a write */
> +static const int fifo_batch = 16;       /* # of sequential requests treated as one
> +				     by the above parameters. For throughput. */
> +
> +struct deadline_data {
> +	/*
> +	 * run time data
> +	 */
> +
> +	/*
> +	 * requests (deadline_rq s) are present on both sort_list and fifo_list
> +	 */
> +	struct rb_root sort_list[2];
> +	struct list_head fifo_list[2];
> +
> +	/*
> +	 * next in sort order. read, write or both are NULL
> +	 */
> +	struct request *next_rq[2];
> +	unsigned int batching;		/* number of sequential requests made */
> +	unsigned int starved;		/* times reads have starved writes */
> +
> +	/*
> +	 * settings that change how the i/o scheduler behaves
> +	 */
> +	int fifo_expire[2];
> +	int fifo_batch;
> +	int writes_starved;
> +	int front_merges;
> +
> +	spinlock_t lock;
> +	struct list_head dispatch;
> +	struct blk_mq_tags *tags;
> +	atomic_t wait_index;
> +};
> +
> +static inline struct rb_root *
> +deadline_rb_root(struct deadline_data *dd, struct request *rq)
> +{
> +	return &dd->sort_list[rq_data_dir(rq)];
> +}
> +
> +/*
> + * get the request after `rq' in sector-sorted order
> + */
> +static inline struct request *
> +deadline_latter_request(struct request *rq)
> +{
> +	struct rb_node *node = rb_next(&rq->rb_node);
> +
> +	if (node)
> +		return rb_entry_rq(node);
> +
> +	return NULL;
> +}
> +
> +static void
> +deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
> +{
> +	struct rb_root *root = deadline_rb_root(dd, rq);
> +
> +	elv_rb_add(root, rq);
> +}
> +
> +static inline void
> +deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
> +{
> +	const int data_dir = rq_data_dir(rq);
> +
> +	if (dd->next_rq[data_dir] == rq)
> +		dd->next_rq[data_dir] = deadline_latter_request(rq);
> +
> +	elv_rb_del(deadline_rb_root(dd, rq), rq);
> +}
> +
> +/*
> + * remove rq from rbtree and fifo.
> + */
> +static void deadline_remove_request(struct request_queue *q, struct request *rq)
> +{
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +
> +	list_del_init(&rq->queuelist);
> +
> +	/*
> +	 * We might not be on the rbtree, if we are doing an insert merge
> +	 */
> +	if (!RB_EMPTY_NODE(&rq->rb_node))
> +		deadline_del_rq_rb(dd, rq);
> +
> +	elv_rqhash_del(q, rq);
> +	if (q->last_merge == rq)
> +		q->last_merge = NULL;
> +}
> +
> +static void dd_request_merged(struct request_queue *q, struct request *req,
> +			      int type)
> +{
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +
> +	/*
> +	 * if the merge was a front merge, we need to reposition request
> +	 */
> +	if (type == ELEVATOR_FRONT_MERGE) {
> +		elv_rb_del(deadline_rb_root(dd, req), req);
> +		deadline_add_rq_rb(dd, req);
> +	}
> +}
> +
> +static void dd_merged_requests(struct request_queue *q, struct request *req,
> +			       struct request *next)
> +{
> +	/*
> +	 * if next expires before rq, assign its expire time to rq
> +	 * and move into next position (next will be deleted) in fifo
> +	 */
> +	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
> +		if (time_before((unsigned long)next->fifo_time,
> +				(unsigned long)req->fifo_time)) {
> +			list_move(&req->queuelist, &next->queuelist);
> +			req->fifo_time = next->fifo_time;
> +		}
> +	}
> +
> +	/*
> +	 * kill knowledge of next, this one is a goner
> +	 */
> +	deadline_remove_request(q, next);
> +}
> +
> +/*
> + * move an entry to dispatch queue
> + */
> +static void
> +deadline_move_request(struct deadline_data *dd, struct request *rq)
> +{
> +	const int data_dir = rq_data_dir(rq);
> +
> +	dd->next_rq[READ] = NULL;
> +	dd->next_rq[WRITE] = NULL;
> +	dd->next_rq[data_dir] = deadline_latter_request(rq);
> +
> +	/*
> +	 * take it off the sort and fifo list
> +	 */
> +	deadline_remove_request(rq->q, rq);
> +}
> +
> +/*
> + * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
> + * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
> + */
> +static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
> +{
> +	struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
> +
> +	/*
> +	 * rq is expired!
> +	 */
> +	if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
> +		return 1;
> +
> +	return 0;
> +}
> +
> +/*
> + * deadline_dispatch_requests selects the best request according to
> + * read/write expire, fifo_batch, etc
> + */
> +static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
> +{
> +	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
> +	struct request *rq;
> +	bool reads, writes;
> +	int data_dir;
> +
> +	spin_lock(&dd->lock);
> +
> +	if (!list_empty(&dd->dispatch)) {
> +		rq = list_first_entry(&dd->dispatch, struct request, queuelist);
> +		list_del_init(&rq->queuelist);
> +		goto done;
> +	}
> +
> +	reads = !list_empty(&dd->fifo_list[READ]);
> +	writes = !list_empty(&dd->fifo_list[WRITE]);
> +
> +	/*
> +	 * batches are currently reads XOR writes
> +	 */
> +	if (dd->next_rq[WRITE])
> +		rq = dd->next_rq[WRITE];
> +	else
> +		rq = dd->next_rq[READ];
> +
> +	if (rq && dd->batching < dd->fifo_batch)
> +		/* we have a next request are still entitled to batch */
> +		goto dispatch_request;
> +
> +	/*
> +	 * at this point we are not running a batch. select the appropriate
> +	 * data direction (read / write)
> +	 */
> +
> +	if (reads) {
> +		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
> +
> +		if (writes && (dd->starved++ >= dd->writes_starved))
> +			goto dispatch_writes;
> +
> +		data_dir = READ;
> +
> +		goto dispatch_find_request;
> +	}
> +
> +	/*
> +	 * there are either no reads or writes have been starved
> +	 */
> +
> +	if (writes) {
> +dispatch_writes:
> +		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
> +
> +		dd->starved = 0;
> +
> +		data_dir = WRITE;
> +
> +		goto dispatch_find_request;
> +	}
> +
> +	spin_unlock(&dd->lock);
> +	return NULL;
> +
> +dispatch_find_request:
> +	/*
> +	 * we are not running a batch, find best request for selected data_dir
> +	 */
> +	if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
> +		/*
> +		 * A deadline has expired, the last request was in the other
> +		 * direction, or we have run out of higher-sectored requests.
> +		 * Start again from the request with the earliest expiry time.
> +		 */
> +		rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
> +	} else {
> +		/*
> +		 * The last req was the same dir and we have a next request in
> +		 * sort order. No expired requests so continue on from here.
> +		 */
> +		rq = dd->next_rq[data_dir];
> +	}
> +
> +	dd->batching = 0;
> +
> +dispatch_request:
> +	/*
> +	 * rq is the selected appropriate request.
> +	 */
> +	dd->batching++;
> +	deadline_move_request(dd, rq);
> +done:
> +	rq->rq_flags |= RQF_STARTED;
> +	spin_unlock(&dd->lock);
> +	return rq;
> +}
> +
> +static void dd_dispatch_requests(struct blk_mq_hw_ctx *hctx,
> +				 struct list_head *rq_list)
> +{
> +	blk_mq_sched_dispatch_shadow_requests(hctx, rq_list, __dd_dispatch_request);
> +}
> +
> +static void dd_exit_queue(struct elevator_queue *e)
> +{
> +	struct deadline_data *dd = e->elevator_data;
> +
> +	BUG_ON(!list_empty(&dd->fifo_list[READ]));
> +	BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
> +
> +	blk_mq_sched_free_requests(dd->tags);
> +	kfree(dd);
> +}
> +
> +/*
> + * initialize elevator private data (deadline_data).
> + */
> +static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
> +{
> +	struct deadline_data *dd;
> +	struct elevator_queue *eq;
> +
> +	eq = elevator_alloc(q, e);
> +	if (!eq)
> +		return -ENOMEM;
> +
> +	dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
> +	if (!dd) {
> +		kobject_put(&eq->kobj);
> +		return -ENOMEM;
> +	}
> +	eq->elevator_data = dd;
> +
> +	dd->tags = blk_mq_sched_alloc_requests(queue_depth, q->node);
> +	if (!dd->tags) {
> +		kfree(dd);
> +		kobject_put(&eq->kobj);
> +		return -ENOMEM;
> +	}
> +
> +	INIT_LIST_HEAD(&dd->fifo_list[READ]);
> +	INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
> +	dd->sort_list[READ] = RB_ROOT;
> +	dd->sort_list[WRITE] = RB_ROOT;
> +	dd->fifo_expire[READ] = read_expire;
> +	dd->fifo_expire[WRITE] = write_expire;
> +	dd->writes_starved = writes_starved;
> +	dd->front_merges = 1;
> +	dd->fifo_batch = fifo_batch;
> +	spin_lock_init(&dd->lock);
> +	INIT_LIST_HEAD(&dd->dispatch);
> +	atomic_set(&dd->wait_index, 0);
> +
> +	q->elevator = eq;
> +	return 0;
> +}
> +
> +static int dd_request_merge(struct request_queue *q, struct request **rq,
> +			    struct bio *bio)
> +{
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +	sector_t sector = bio_end_sector(bio);
> +	struct request *__rq;
> +
> +	if (!dd->front_merges)
> +		return ELEVATOR_NO_MERGE;
> +
> +	__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
> +	if (__rq) {
> +		BUG_ON(sector != blk_rq_pos(__rq));
> +
> +		if (elv_bio_merge_ok(__rq, bio)) {
> +			*rq = __rq;
> +			return ELEVATOR_FRONT_MERGE;
> +		}
> +	}
> +
> +	return ELEVATOR_NO_MERGE;
> +}
> +
> +static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
> +{
> +	struct request_queue *q = hctx->queue;
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +	int ret;
> +
> +	spin_lock(&dd->lock);
> +	ret = blk_mq_sched_try_merge(q, bio);
> +	spin_unlock(&dd->lock);
> +
> +	return ret;
> +}
> +
> +/*
> + * add rq to rbtree and fifo
> + */
> +static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
> +			      bool at_head)
> +{
> +	struct request_queue *q = hctx->queue;
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +	const int data_dir = rq_data_dir(rq);
> +
> +	if (blk_mq_sched_try_insert_merge(q, rq))
> +		return;
> +
> +	blk_mq_sched_request_inserted(rq);
> +
> +	/*
> +	 * If we're trying to insert a real request, just send it directly
> +	 * to the hardware dispatch list. This only happens for a requeue,
> +	 * or FUA/FLUSH requests.
> +	 */
> +	if (!blk_mq_sched_rq_is_shadow(rq)) {
> +		spin_lock(&hctx->lock);
> +		list_add_tail(&rq->queuelist, &hctx->dispatch);
> +		spin_unlock(&hctx->lock);
> +		return;
> +	}
> +
> +	if (at_head || rq->cmd_type != REQ_TYPE_FS) {
> +		if (at_head)
> +			list_add(&rq->queuelist, &dd->dispatch);
> +		else
> +			list_add_tail(&rq->queuelist, &dd->dispatch);
> +	} else {
> +		deadline_add_rq_rb(dd, rq);
> +
> +		if (rq_mergeable(rq)) {
> +			elv_rqhash_add(q, rq);
> +			if (!q->last_merge)
> +				q->last_merge = rq;
> +		}
> +
> +		/*
> +		 * set expire time and add to fifo list
> +		 */
> +		rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
> +		list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
> +	}
> +}
> +
> +static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
> +			       struct list_head *list, bool at_head)
> +{
> +	struct request_queue *q = hctx->queue;
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +
> +	spin_lock(&dd->lock);
> +	while (!list_empty(list)) {
> +		struct request *rq;
> +
> +		rq = list_first_entry(list, struct request, queuelist);
> +		list_del_init(&rq->queuelist);
> +		dd_insert_request(hctx, rq, at_head);
> +	}
> +	spin_unlock(&dd->lock);
> +}
> +
> +static struct request *dd_get_request(struct request_queue *q, unsigned int op,
> +				      struct blk_mq_alloc_data *data)
> +{
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +	struct request *rq;
> +
> +	/*
> +	 * The flush machinery intercepts before we insert the request. As
> +	 * a work-around, just hand it back a real request.
> +	 */
> +	if (unlikely(op & (REQ_PREFLUSH | REQ_FUA)))
> +		rq = __blk_mq_alloc_request(data, op);
> +	else {
> +		rq = blk_mq_sched_alloc_shadow_request(q, data, dd->tags, &dd->wait_index);
> +		if (rq)
> +			blk_mq_rq_ctx_init(q, data->ctx, rq, op);
> +	}
> +
> +	return rq;
> +}
> +
> +static bool dd_put_request(struct request *rq)
> +{
> +	/*
> +	 * If it's a real request, we just have to free it. For a shadow
> +	 * request, we should only free it if we haven't started it. A
> +	 * started request is mapped to a real one, and the real one will
> +	 * free it. We can get here with request merges, since we then
> +	 * free the request before we start/issue it.
> +	 */
> +	if (!blk_mq_sched_rq_is_shadow(rq))
> +		return false;
> +
> +	if (!(rq->rq_flags & RQF_STARTED)) {
> +		struct request_queue *q = rq->q;
> +		struct deadline_data *dd = q->elevator->elevator_data;
> +
> +		/*
> +		 * IO completion would normally do this, but if we merge
> +		 * and free before we issue the request, drop both the
> +		 * tag and queue ref
> +		 */
> +		blk_mq_sched_free_shadow_request(dd->tags, rq);
> +		blk_queue_exit(q);
> +	}
> +
> +	return true;
> +}
> +
> +static void dd_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
> +{
> +	struct request *sched_rq = rq->end_io_data;
> +
> +	/*
> +	 * sched_rq can be NULL, if we haven't setup the shadow yet
> +	 * because we failed getting one.
> +	 */
> +	if (sched_rq) {
> +		struct deadline_data *dd = hctx->queue->elevator->elevator_data;
> +
> +		blk_mq_sched_free_shadow_request(dd->tags, sched_rq);
> +		blk_mq_start_stopped_hw_queue(hctx, true);
> +	}
> +}
> +
> +static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
> +{
> +	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
> +
> +	return !list_empty_careful(&dd->dispatch) ||
> +		!list_empty_careful(&dd->fifo_list[0]) ||
> +		!list_empty_careful(&dd->fifo_list[1]);
> +}
> +
> +/*
> + * sysfs parts below
> + */
> +static ssize_t
> +deadline_var_show(int var, char *page)
> +{
> +	return sprintf(page, "%d\n", var);
> +}
> +
> +static ssize_t
> +deadline_var_store(int *var, const char *page, size_t count)
> +{
> +	char *p = (char *) page;
> +
> +	*var = simple_strtol(p, &p, 10);
> +	return count;
> +}
> +
> +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
> +static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
> +{									\
> +	struct deadline_data *dd = e->elevator_data;			\
> +	int __data = __VAR;						\
> +	if (__CONV)							\
> +		__data = jiffies_to_msecs(__data);			\
> +	return deadline_var_show(__data, (page));			\
> +}
> +SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
> +SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
> +SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
> +SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
> +SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
> +#undef SHOW_FUNCTION
> +
> +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
> +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
> +{									\
> +	struct deadline_data *dd = e->elevator_data;			\
> +	int __data;							\
> +	int ret = deadline_var_store(&__data, (page), count);		\
> +	if (__data < (MIN))						\
> +		__data = (MIN);						\
> +	else if (__data > (MAX))					\
> +		__data = (MAX);						\
> +	if (__CONV)							\
> +		*(__PTR) = msecs_to_jiffies(__data);			\
> +	else								\
> +		*(__PTR) = __data;					\
> +	return ret;							\
> +}
> +STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
> +STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
> +STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
> +STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
> +STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
> +#undef STORE_FUNCTION
> +
> +#define DD_ATTR(name) \
> +	__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
> +				      deadline_##name##_store)
> +
> +static struct elv_fs_entry deadline_attrs[] = {
> +	DD_ATTR(read_expire),
> +	DD_ATTR(write_expire),
> +	DD_ATTR(writes_starved),
> +	DD_ATTR(front_merges),
> +	DD_ATTR(fifo_batch),
> +	__ATTR_NULL
> +};
> +
> +static struct elevator_type mq_deadline = {
> +	.ops.mq = {
> +		.get_request		= dd_get_request,
> +		.put_request		= dd_put_request,
> +		.insert_requests	= dd_insert_requests,
> +		.dispatch_requests	= dd_dispatch_requests,
> +		.completed_request	= dd_completed_request,
> +		.next_request		= elv_rb_latter_request,
> +		.former_request		= elv_rb_former_request,
> +		.bio_merge		= dd_bio_merge,
> +		.request_merge		= dd_request_merge,
> +		.requests_merged	= dd_merged_requests,
> +		.request_merged		= dd_request_merged,
> +		.has_work		= dd_has_work,
> +		.init_sched		= dd_init_queue,
> +		.exit_sched		= dd_exit_queue,
> +	},
> +
> +	.uses_mq	= true,
> +	.elevator_attrs = deadline_attrs,
> +	.elevator_name = "mq-deadline",
> +	.elevator_owner = THIS_MODULE,
> +};
> +
> +static int __init deadline_init(void)
> +{
> +	if (!queue_depth) {
> +		pr_err("mq-deadline: queue depth must be > 0\n");
> +		return -EINVAL;
> +	}
> +	return elv_register(&mq_deadline);
> +}
> +
> +static void __exit deadline_exit(void)
> +{
> +	elv_unregister(&mq_deadline);
> +}
> +
> +module_init(deadline_init);
> +module_exit(deadline_exit);
> +
> +MODULE_AUTHOR("Jens Axboe");
> +MODULE_LICENSE("GPL");
> +MODULE_DESCRIPTION("MQ deadline IO scheduler");
> -- 
> 2.7.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-block" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Valente Jan. 20, 2017, 1:18 p.m. UTC | #11
> Il giorno 20 gen 2017, alle ore 14:14, Paolo Valente <paolo.valente@linaro.org> ha scritto:
> 
>> 
>> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe <axboe@fb.com> ha scritto:
>> 
>> This is basically identical to deadline-iosched, except it registers
>> as a MQ capable scheduler. This is still a single queue design.
>> 
> 
> Jens,
> no spin_lock_irq* in the code.  So, also request dispatches are
> guaranteed to never be executed in IRQ context?

Or maybe the opposite? That is, all scheduler functions invoked in IRQ context?

Thanks,
Paolo

>  I'm asking this
> question to understand whether I'm missing something that, even in
> BFQ, would somehow allow me to not disable irqs in critical sections,
> even if there is the slice_idle-expiration handler.  Be patient with
> my ignorance.
> 
> Thanks,
> Paolo
> 
>> Signed-off-by: Jens Axboe <axboe@fb.com>
>> ---
>> block/Kconfig.iosched |   6 +
>> block/Makefile        |   1 +
>> block/mq-deadline.c   | 649 ++++++++++++++++++++++++++++++++++++++++++++++++++
>> 3 files changed, 656 insertions(+)
>> create mode 100644 block/mq-deadline.c
>> 
>> diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
>> index 421bef9c4c48..490ef2850fae 100644
>> --- a/block/Kconfig.iosched
>> +++ b/block/Kconfig.iosched
>> @@ -32,6 +32,12 @@ config IOSCHED_CFQ
>> 
>> 	  This is the default I/O scheduler.
>> 
>> +config MQ_IOSCHED_DEADLINE
>> +	tristate "MQ deadline I/O scheduler"
>> +	default y
>> +	---help---
>> +	  MQ version of the deadline IO scheduler.
>> +
>> config CFQ_GROUP_IOSCHED
>> 	bool "CFQ Group Scheduling support"
>> 	depends on IOSCHED_CFQ && BLK_CGROUP
>> diff --git a/block/Makefile b/block/Makefile
>> index 2eee9e1bb6db..3ee0abd7205a 100644
>> --- a/block/Makefile
>> +++ b/block/Makefile
>> @@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
>> obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
>> obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
>> obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
>> +obj-$(CONFIG_MQ_IOSCHED_DEADLINE)	+= mq-deadline.o
>> 
>> obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
>> obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
>> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
>> new file mode 100644
>> index 000000000000..3cb9de21ab21
>> --- /dev/null
>> +++ b/block/mq-deadline.c
>> @@ -0,0 +1,649 @@
>> +/*
>> + *  MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
>> + *  for the blk-mq scheduling framework
>> + *
>> + *  Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
>> + */
>> +#include <linux/kernel.h>
>> +#include <linux/fs.h>
>> +#include <linux/blkdev.h>
>> +#include <linux/blk-mq.h>
>> +#include <linux/elevator.h>
>> +#include <linux/bio.h>
>> +#include <linux/module.h>
>> +#include <linux/slab.h>
>> +#include <linux/init.h>
>> +#include <linux/compiler.h>
>> +#include <linux/rbtree.h>
>> +#include <linux/sbitmap.h>
>> +
>> +#include "blk.h"
>> +#include "blk-mq.h"
>> +#include "blk-mq-tag.h"
>> +#include "blk-mq-sched.h"
>> +
>> +static unsigned int queue_depth = 256;
>> +module_param(queue_depth, uint, 0644);
>> +MODULE_PARM_DESC(queue_depth, "Use this value as the scheduler queue depth");
>> +
>> +/*
>> + * See Documentation/block/deadline-iosched.txt
>> + */
>> +static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
>> +static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
>> +static const int writes_starved = 2;    /* max times reads can starve a write */
>> +static const int fifo_batch = 16;       /* # of sequential requests treated as one
>> +				     by the above parameters. For throughput. */
>> +
>> +struct deadline_data {
>> +	/*
>> +	 * run time data
>> +	 */
>> +
>> +	/*
>> +	 * requests (deadline_rq s) are present on both sort_list and fifo_list
>> +	 */
>> +	struct rb_root sort_list[2];
>> +	struct list_head fifo_list[2];
>> +
>> +	/*
>> +	 * next in sort order. read, write or both are NULL
>> +	 */
>> +	struct request *next_rq[2];
>> +	unsigned int batching;		/* number of sequential requests made */
>> +	unsigned int starved;		/* times reads have starved writes */
>> +
>> +	/*
>> +	 * settings that change how the i/o scheduler behaves
>> +	 */
>> +	int fifo_expire[2];
>> +	int fifo_batch;
>> +	int writes_starved;
>> +	int front_merges;
>> +
>> +	spinlock_t lock;
>> +	struct list_head dispatch;
>> +	struct blk_mq_tags *tags;
>> +	atomic_t wait_index;
>> +};
>> +
>> +static inline struct rb_root *
>> +deadline_rb_root(struct deadline_data *dd, struct request *rq)
>> +{
>> +	return &dd->sort_list[rq_data_dir(rq)];
>> +}
>> +
>> +/*
>> + * get the request after `rq' in sector-sorted order
>> + */
>> +static inline struct request *
>> +deadline_latter_request(struct request *rq)
>> +{
>> +	struct rb_node *node = rb_next(&rq->rb_node);
>> +
>> +	if (node)
>> +		return rb_entry_rq(node);
>> +
>> +	return NULL;
>> +}
>> +
>> +static void
>> +deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
>> +{
>> +	struct rb_root *root = deadline_rb_root(dd, rq);
>> +
>> +	elv_rb_add(root, rq);
>> +}
>> +
>> +static inline void
>> +deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
>> +{
>> +	const int data_dir = rq_data_dir(rq);
>> +
>> +	if (dd->next_rq[data_dir] == rq)
>> +		dd->next_rq[data_dir] = deadline_latter_request(rq);
>> +
>> +	elv_rb_del(deadline_rb_root(dd, rq), rq);
>> +}
>> +
>> +/*
>> + * remove rq from rbtree and fifo.
>> + */
>> +static void deadline_remove_request(struct request_queue *q, struct request *rq)
>> +{
>> +	struct deadline_data *dd = q->elevator->elevator_data;
>> +
>> +	list_del_init(&rq->queuelist);
>> +
>> +	/*
>> +	 * We might not be on the rbtree, if we are doing an insert merge
>> +	 */
>> +	if (!RB_EMPTY_NODE(&rq->rb_node))
>> +		deadline_del_rq_rb(dd, rq);
>> +
>> +	elv_rqhash_del(q, rq);
>> +	if (q->last_merge == rq)
>> +		q->last_merge = NULL;
>> +}
>> +
>> +static void dd_request_merged(struct request_queue *q, struct request *req,
>> +			      int type)
>> +{
>> +	struct deadline_data *dd = q->elevator->elevator_data;
>> +
>> +	/*
>> +	 * if the merge was a front merge, we need to reposition request
>> +	 */
>> +	if (type == ELEVATOR_FRONT_MERGE) {
>> +		elv_rb_del(deadline_rb_root(dd, req), req);
>> +		deadline_add_rq_rb(dd, req);
>> +	}
>> +}
>> +
>> +static void dd_merged_requests(struct request_queue *q, struct request *req,
>> +			       struct request *next)
>> +{
>> +	/*
>> +	 * if next expires before rq, assign its expire time to rq
>> +	 * and move into next position (next will be deleted) in fifo
>> +	 */
>> +	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
>> +		if (time_before((unsigned long)next->fifo_time,
>> +				(unsigned long)req->fifo_time)) {
>> +			list_move(&req->queuelist, &next->queuelist);
>> +			req->fifo_time = next->fifo_time;
>> +		}
>> +	}
>> +
>> +	/*
>> +	 * kill knowledge of next, this one is a goner
>> +	 */
>> +	deadline_remove_request(q, next);
>> +}
>> +
>> +/*
>> + * move an entry to dispatch queue
>> + */
>> +static void
>> +deadline_move_request(struct deadline_data *dd, struct request *rq)
>> +{
>> +	const int data_dir = rq_data_dir(rq);
>> +
>> +	dd->next_rq[READ] = NULL;
>> +	dd->next_rq[WRITE] = NULL;
>> +	dd->next_rq[data_dir] = deadline_latter_request(rq);
>> +
>> +	/*
>> +	 * take it off the sort and fifo list
>> +	 */
>> +	deadline_remove_request(rq->q, rq);
>> +}
>> +
>> +/*
>> + * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
>> + * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
>> + */
>> +static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
>> +{
>> +	struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
>> +
>> +	/*
>> +	 * rq is expired!
>> +	 */
>> +	if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
>> +		return 1;
>> +
>> +	return 0;
>> +}
>> +
>> +/*
>> + * deadline_dispatch_requests selects the best request according to
>> + * read/write expire, fifo_batch, etc
>> + */
>> +static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
>> +{
>> +	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
>> +	struct request *rq;
>> +	bool reads, writes;
>> +	int data_dir;
>> +
>> +	spin_lock(&dd->lock);
>> +
>> +	if (!list_empty(&dd->dispatch)) {
>> +		rq = list_first_entry(&dd->dispatch, struct request, queuelist);
>> +		list_del_init(&rq->queuelist);
>> +		goto done;
>> +	}
>> +
>> +	reads = !list_empty(&dd->fifo_list[READ]);
>> +	writes = !list_empty(&dd->fifo_list[WRITE]);
>> +
>> +	/*
>> +	 * batches are currently reads XOR writes
>> +	 */
>> +	if (dd->next_rq[WRITE])
>> +		rq = dd->next_rq[WRITE];
>> +	else
>> +		rq = dd->next_rq[READ];
>> +
>> +	if (rq && dd->batching < dd->fifo_batch)
>> +		/* we have a next request are still entitled to batch */
>> +		goto dispatch_request;
>> +
>> +	/*
>> +	 * at this point we are not running a batch. select the appropriate
>> +	 * data direction (read / write)
>> +	 */
>> +
>> +	if (reads) {
>> +		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
>> +
>> +		if (writes && (dd->starved++ >= dd->writes_starved))
>> +			goto dispatch_writes;
>> +
>> +		data_dir = READ;
>> +
>> +		goto dispatch_find_request;
>> +	}
>> +
>> +	/*
>> +	 * there are either no reads or writes have been starved
>> +	 */
>> +
>> +	if (writes) {
>> +dispatch_writes:
>> +		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
>> +
>> +		dd->starved = 0;
>> +
>> +		data_dir = WRITE;
>> +
>> +		goto dispatch_find_request;
>> +	}
>> +
>> +	spin_unlock(&dd->lock);
>> +	return NULL;
>> +
>> +dispatch_find_request:
>> +	/*
>> +	 * we are not running a batch, find best request for selected data_dir
>> +	 */
>> +	if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
>> +		/*
>> +		 * A deadline has expired, the last request was in the other
>> +		 * direction, or we have run out of higher-sectored requests.
>> +		 * Start again from the request with the earliest expiry time.
>> +		 */
>> +		rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
>> +	} else {
>> +		/*
>> +		 * The last req was the same dir and we have a next request in
>> +		 * sort order. No expired requests so continue on from here.
>> +		 */
>> +		rq = dd->next_rq[data_dir];
>> +	}
>> +
>> +	dd->batching = 0;
>> +
>> +dispatch_request:
>> +	/*
>> +	 * rq is the selected appropriate request.
>> +	 */
>> +	dd->batching++;
>> +	deadline_move_request(dd, rq);
>> +done:
>> +	rq->rq_flags |= RQF_STARTED;
>> +	spin_unlock(&dd->lock);
>> +	return rq;
>> +}
>> +
>> +static void dd_dispatch_requests(struct blk_mq_hw_ctx *hctx,
>> +				 struct list_head *rq_list)
>> +{
>> +	blk_mq_sched_dispatch_shadow_requests(hctx, rq_list, __dd_dispatch_request);
>> +}
>> +
>> +static void dd_exit_queue(struct elevator_queue *e)
>> +{
>> +	struct deadline_data *dd = e->elevator_data;
>> +
>> +	BUG_ON(!list_empty(&dd->fifo_list[READ]));
>> +	BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
>> +
>> +	blk_mq_sched_free_requests(dd->tags);
>> +	kfree(dd);
>> +}
>> +
>> +/*
>> + * initialize elevator private data (deadline_data).
>> + */
>> +static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
>> +{
>> +	struct deadline_data *dd;
>> +	struct elevator_queue *eq;
>> +
>> +	eq = elevator_alloc(q, e);
>> +	if (!eq)
>> +		return -ENOMEM;
>> +
>> +	dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
>> +	if (!dd) {
>> +		kobject_put(&eq->kobj);
>> +		return -ENOMEM;
>> +	}
>> +	eq->elevator_data = dd;
>> +
>> +	dd->tags = blk_mq_sched_alloc_requests(queue_depth, q->node);
>> +	if (!dd->tags) {
>> +		kfree(dd);
>> +		kobject_put(&eq->kobj);
>> +		return -ENOMEM;
>> +	}
>> +
>> +	INIT_LIST_HEAD(&dd->fifo_list[READ]);
>> +	INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
>> +	dd->sort_list[READ] = RB_ROOT;
>> +	dd->sort_list[WRITE] = RB_ROOT;
>> +	dd->fifo_expire[READ] = read_expire;
>> +	dd->fifo_expire[WRITE] = write_expire;
>> +	dd->writes_starved = writes_starved;
>> +	dd->front_merges = 1;
>> +	dd->fifo_batch = fifo_batch;
>> +	spin_lock_init(&dd->lock);
>> +	INIT_LIST_HEAD(&dd->dispatch);
>> +	atomic_set(&dd->wait_index, 0);
>> +
>> +	q->elevator = eq;
>> +	return 0;
>> +}
>> +
>> +static int dd_request_merge(struct request_queue *q, struct request **rq,
>> +			    struct bio *bio)
>> +{
>> +	struct deadline_data *dd = q->elevator->elevator_data;
>> +	sector_t sector = bio_end_sector(bio);
>> +	struct request *__rq;
>> +
>> +	if (!dd->front_merges)
>> +		return ELEVATOR_NO_MERGE;
>> +
>> +	__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
>> +	if (__rq) {
>> +		BUG_ON(sector != blk_rq_pos(__rq));
>> +
>> +		if (elv_bio_merge_ok(__rq, bio)) {
>> +			*rq = __rq;
>> +			return ELEVATOR_FRONT_MERGE;
>> +		}
>> +	}
>> +
>> +	return ELEVATOR_NO_MERGE;
>> +}
>> +
>> +static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
>> +{
>> +	struct request_queue *q = hctx->queue;
>> +	struct deadline_data *dd = q->elevator->elevator_data;
>> +	int ret;
>> +
>> +	spin_lock(&dd->lock);
>> +	ret = blk_mq_sched_try_merge(q, bio);
>> +	spin_unlock(&dd->lock);
>> +
>> +	return ret;
>> +}
>> +
>> +/*
>> + * add rq to rbtree and fifo
>> + */
>> +static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
>> +			      bool at_head)
>> +{
>> +	struct request_queue *q = hctx->queue;
>> +	struct deadline_data *dd = q->elevator->elevator_data;
>> +	const int data_dir = rq_data_dir(rq);
>> +
>> +	if (blk_mq_sched_try_insert_merge(q, rq))
>> +		return;
>> +
>> +	blk_mq_sched_request_inserted(rq);
>> +
>> +	/*
>> +	 * If we're trying to insert a real request, just send it directly
>> +	 * to the hardware dispatch list. This only happens for a requeue,
>> +	 * or FUA/FLUSH requests.
>> +	 */
>> +	if (!blk_mq_sched_rq_is_shadow(rq)) {
>> +		spin_lock(&hctx->lock);
>> +		list_add_tail(&rq->queuelist, &hctx->dispatch);
>> +		spin_unlock(&hctx->lock);
>> +		return;
>> +	}
>> +
>> +	if (at_head || rq->cmd_type != REQ_TYPE_FS) {
>> +		if (at_head)
>> +			list_add(&rq->queuelist, &dd->dispatch);
>> +		else
>> +			list_add_tail(&rq->queuelist, &dd->dispatch);
>> +	} else {
>> +		deadline_add_rq_rb(dd, rq);
>> +
>> +		if (rq_mergeable(rq)) {
>> +			elv_rqhash_add(q, rq);
>> +			if (!q->last_merge)
>> +				q->last_merge = rq;
>> +		}
>> +
>> +		/*
>> +		 * set expire time and add to fifo list
>> +		 */
>> +		rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
>> +		list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
>> +	}
>> +}
>> +
>> +static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
>> +			       struct list_head *list, bool at_head)
>> +{
>> +	struct request_queue *q = hctx->queue;
>> +	struct deadline_data *dd = q->elevator->elevator_data;
>> +
>> +	spin_lock(&dd->lock);
>> +	while (!list_empty(list)) {
>> +		struct request *rq;
>> +
>> +		rq = list_first_entry(list, struct request, queuelist);
>> +		list_del_init(&rq->queuelist);
>> +		dd_insert_request(hctx, rq, at_head);
>> +	}
>> +	spin_unlock(&dd->lock);
>> +}
>> +
>> +static struct request *dd_get_request(struct request_queue *q, unsigned int op,
>> +				      struct blk_mq_alloc_data *data)
>> +{
>> +	struct deadline_data *dd = q->elevator->elevator_data;
>> +	struct request *rq;
>> +
>> +	/*
>> +	 * The flush machinery intercepts before we insert the request. As
>> +	 * a work-around, just hand it back a real request.
>> +	 */
>> +	if (unlikely(op & (REQ_PREFLUSH | REQ_FUA)))
>> +		rq = __blk_mq_alloc_request(data, op);
>> +	else {
>> +		rq = blk_mq_sched_alloc_shadow_request(q, data, dd->tags, &dd->wait_index);
>> +		if (rq)
>> +			blk_mq_rq_ctx_init(q, data->ctx, rq, op);
>> +	}
>> +
>> +	return rq;
>> +}
>> +
>> +static bool dd_put_request(struct request *rq)
>> +{
>> +	/*
>> +	 * If it's a real request, we just have to free it. For a shadow
>> +	 * request, we should only free it if we haven't started it. A
>> +	 * started request is mapped to a real one, and the real one will
>> +	 * free it. We can get here with request merges, since we then
>> +	 * free the request before we start/issue it.
>> +	 */
>> +	if (!blk_mq_sched_rq_is_shadow(rq))
>> +		return false;
>> +
>> +	if (!(rq->rq_flags & RQF_STARTED)) {
>> +		struct request_queue *q = rq->q;
>> +		struct deadline_data *dd = q->elevator->elevator_data;
>> +
>> +		/*
>> +		 * IO completion would normally do this, but if we merge
>> +		 * and free before we issue the request, drop both the
>> +		 * tag and queue ref
>> +		 */
>> +		blk_mq_sched_free_shadow_request(dd->tags, rq);
>> +		blk_queue_exit(q);
>> +	}
>> +
>> +	return true;
>> +}
>> +
>> +static void dd_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
>> +{
>> +	struct request *sched_rq = rq->end_io_data;
>> +
>> +	/*
>> +	 * sched_rq can be NULL, if we haven't setup the shadow yet
>> +	 * because we failed getting one.
>> +	 */
>> +	if (sched_rq) {
>> +		struct deadline_data *dd = hctx->queue->elevator->elevator_data;
>> +
>> +		blk_mq_sched_free_shadow_request(dd->tags, sched_rq);
>> +		blk_mq_start_stopped_hw_queue(hctx, true);
>> +	}
>> +}
>> +
>> +static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
>> +{
>> +	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
>> +
>> +	return !list_empty_careful(&dd->dispatch) ||
>> +		!list_empty_careful(&dd->fifo_list[0]) ||
>> +		!list_empty_careful(&dd->fifo_list[1]);
>> +}
>> +
>> +/*
>> + * sysfs parts below
>> + */
>> +static ssize_t
>> +deadline_var_show(int var, char *page)
>> +{
>> +	return sprintf(page, "%d\n", var);
>> +}
>> +
>> +static ssize_t
>> +deadline_var_store(int *var, const char *page, size_t count)
>> +{
>> +	char *p = (char *) page;
>> +
>> +	*var = simple_strtol(p, &p, 10);
>> +	return count;
>> +}
>> +
>> +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
>> +static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
>> +{									\
>> +	struct deadline_data *dd = e->elevator_data;			\
>> +	int __data = __VAR;						\
>> +	if (__CONV)							\
>> +		__data = jiffies_to_msecs(__data);			\
>> +	return deadline_var_show(__data, (page));			\
>> +}
>> +SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
>> +SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
>> +SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
>> +SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
>> +SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
>> +#undef SHOW_FUNCTION
>> +
>> +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
>> +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
>> +{									\
>> +	struct deadline_data *dd = e->elevator_data;			\
>> +	int __data;							\
>> +	int ret = deadline_var_store(&__data, (page), count);		\
>> +	if (__data < (MIN))						\
>> +		__data = (MIN);						\
>> +	else if (__data > (MAX))					\
>> +		__data = (MAX);						\
>> +	if (__CONV)							\
>> +		*(__PTR) = msecs_to_jiffies(__data);			\
>> +	else								\
>> +		*(__PTR) = __data;					\
>> +	return ret;							\
>> +}
>> +STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
>> +STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
>> +STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
>> +STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
>> +STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
>> +#undef STORE_FUNCTION
>> +
>> +#define DD_ATTR(name) \
>> +	__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
>> +				      deadline_##name##_store)
>> +
>> +static struct elv_fs_entry deadline_attrs[] = {
>> +	DD_ATTR(read_expire),
>> +	DD_ATTR(write_expire),
>> +	DD_ATTR(writes_starved),
>> +	DD_ATTR(front_merges),
>> +	DD_ATTR(fifo_batch),
>> +	__ATTR_NULL
>> +};
>> +
>> +static struct elevator_type mq_deadline = {
>> +	.ops.mq = {
>> +		.get_request		= dd_get_request,
>> +		.put_request		= dd_put_request,
>> +		.insert_requests	= dd_insert_requests,
>> +		.dispatch_requests	= dd_dispatch_requests,
>> +		.completed_request	= dd_completed_request,
>> +		.next_request		= elv_rb_latter_request,
>> +		.former_request		= elv_rb_former_request,
>> +		.bio_merge		= dd_bio_merge,
>> +		.request_merge		= dd_request_merge,
>> +		.requests_merged	= dd_merged_requests,
>> +		.request_merged		= dd_request_merged,
>> +		.has_work		= dd_has_work,
>> +		.init_sched		= dd_init_queue,
>> +		.exit_sched		= dd_exit_queue,
>> +	},
>> +
>> +	.uses_mq	= true,
>> +	.elevator_attrs = deadline_attrs,
>> +	.elevator_name = "mq-deadline",
>> +	.elevator_owner = THIS_MODULE,
>> +};
>> +
>> +static int __init deadline_init(void)
>> +{
>> +	if (!queue_depth) {
>> +		pr_err("mq-deadline: queue depth must be > 0\n");
>> +		return -EINVAL;
>> +	}
>> +	return elv_register(&mq_deadline);
>> +}
>> +
>> +static void __exit deadline_exit(void)
>> +{
>> +	elv_unregister(&mq_deadline);
>> +}
>> +
>> +module_init(deadline_init);
>> +module_exit(deadline_exit);
>> +
>> +MODULE_AUTHOR("Jens Axboe");
>> +MODULE_LICENSE("GPL");
>> +MODULE_DESCRIPTION("MQ deadline IO scheduler");
>> -- 
>> 2.7.4
>> 
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-block" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-block" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jens Axboe Jan. 20, 2017, 2:26 p.m. UTC | #12
On Fri, Jan 20 2017, Paolo Valente wrote:
> 
> > Il giorno 17 gen 2017, alle ore 03:47, Jens Axboe <axboe@fb.com> ha scritto:
> > 
> > On 12/22/2016 09:49 AM, Paolo Valente wrote:
> >> 
> >>> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe <axboe@fb.com> ha scritto:
> >>> 
> >>> This is basically identical to deadline-iosched, except it registers
> >>> as a MQ capable scheduler. This is still a single queue design.
> >>> 
> >> 
> >> One last question (for today ...):in mq-deadline there are no
> >> "schedule dispatch" or "unplug work" functions.  In blk, CFQ and BFQ
> >> do these schedules/unplugs in a lot of cases.  What's the right
> >> replacement?  Just doing nothing?
> > 
> > You just use blk_mq_run_hw_queue() or variants thereof to kick off queue
> > runs.
> > 
> 
> Hi Jens,
> I'm working on this right now.  I have a pair of quick questions about
> performance.
> 
> In the blk version of bfq, if the in-service bfq_queue happen to have
> no more budget when the bfq dispatch function is invoked, then bfq:
> returns no request (NULL), immediately expires the in-service
> bfq_queue, and schedules a new dispatch.  The third step is taken so
> that, if other bfq_queues have requests, then a new in-service
> bfq_queue will be selected on the upcoming new dispatch, and a new
> request will be provided right away.
> 
> My questions are: is this dispatch-schedule step still needed with
> blk-mq, to avoid a stall?  If it is not needed to avoid a stall, would
> it still be needed to boost throughput, because it would force an
> immediate, next dispatch?

Generally that step is only needed if you don't dispatch a request for
that invocation, yet you have requests to dispatch. For that case, you
must ensure that the queues are run at some point in the future. So I'm
inclined to answer yes to your question, though it depends on exactly
how it happens. If you have the queue run in the code, comment it like
that, and we can always revisit.

> BTW, bfq-mq survived its first request completion.  I will provide you
> with a link to a github branch as soon as bfq-mq seems able to stand
> up with a minimal workload.

Congratulations, that's a nice milestone!
Jens Axboe Jan. 20, 2017, 2:28 p.m. UTC | #13
On Fri, Jan 20 2017, Paolo Valente wrote:
> 
> > Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe <axboe@fb.com> ha scritto:
> > 
> > This is basically identical to deadline-iosched, except it registers
> > as a MQ capable scheduler. This is still a single queue design.
> > 
> 
> Jens,
> no spin_lock_irq* in the code.  So, also request dispatches are
> guaranteed to never be executed in IRQ context?  I'm asking this
> question to understand whether I'm missing something that, even in
> BFQ, would somehow allow me to not disable irqs in critical sections,
> even if there is the slice_idle-expiration handler.  Be patient with
> my ignorance.

Yes, dispatches will never happen from IRQ context. blk-mq was designed
so we didn't have to use irq disabling locks.

That said, certain parts of the API can be called from IRQ context.
put_request and the completion parts, for instance. But blk-mq doesn't
need to grab any locks there, and neither does mq-deadline. This might
be different from bfq. lockdep can be a big help there.
Jens Axboe Jan. 20, 2017, 2:28 p.m. UTC | #14
On Fri, Jan 20 2017, Paolo Valente wrote:
> 
> > Il giorno 20 gen 2017, alle ore 14:14, Paolo Valente <paolo.valente@linaro.org> ha scritto:
> > 
> >> 
> >> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe <axboe@fb.com> ha scritto:
> >> 
> >> This is basically identical to deadline-iosched, except it registers
> >> as a MQ capable scheduler. This is still a single queue design.
> >> 
> > 
> > Jens,
> > no spin_lock_irq* in the code.  So, also request dispatches are
> > guaranteed to never be executed in IRQ context?
> 
> Or maybe the opposite? That is, all scheduler functions invoked in IRQ context?

Nope
Paolo Valente Feb. 1, 2017, 11:11 a.m. UTC | #15
> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe <axboe@fb.com> ha scritto:
> 
> This is basically identical to deadline-iosched, except it registers
> as a MQ capable scheduler. This is still a single queue design.
> 
> Signed-off-by: Jens Axboe <axboe@fb.com>
> ---
> block/Kconfig.iosched |   6 +
> block/Makefile        |   1 +
> block/mq-deadline.c   | 649 ++++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 656 insertions(+)
> create mode 100644 block/mq-deadline.c
> 
> diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
> index 421bef9c4c48..490ef2850fae 100644
> --- a/block/Kconfig.iosched
> +++ b/block/Kconfig.iosched
> @@ -32,6 +32,12 @@ config IOSCHED_CFQ
> 
> 	  This is the default I/O scheduler.
> 
> +config MQ_IOSCHED_DEADLINE
> +	tristate "MQ deadline I/O scheduler"
> +	default y
> +	---help---
> +	  MQ version of the deadline IO scheduler.
> +
> config CFQ_GROUP_IOSCHED
> 	bool "CFQ Group Scheduling support"
> 	depends on IOSCHED_CFQ && BLK_CGROUP
> diff --git a/block/Makefile b/block/Makefile
> index 2eee9e1bb6db..3ee0abd7205a 100644
> --- a/block/Makefile
> +++ b/block/Makefile
> @@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
> obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
> obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
> obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
> +obj-$(CONFIG_MQ_IOSCHED_DEADLINE)	+= mq-deadline.o
> 
> obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
> obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> new file mode 100644
> index 000000000000..3cb9de21ab21
> --- /dev/null
> +++ b/block/mq-deadline.c
> @@ -0,0 +1,649 @@
> +/*
> + *  MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
> + *  for the blk-mq scheduling framework
> + *
> + *  Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
> + */
> +#include <linux/kernel.h>
> +#include <linux/fs.h>
> +#include <linux/blkdev.h>
> +#include <linux/blk-mq.h>
> +#include <linux/elevator.h>
> +#include <linux/bio.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/init.h>
> +#include <linux/compiler.h>
> +#include <linux/rbtree.h>
> +#include <linux/sbitmap.h>
> +
> +#include "blk.h"
> +#include "blk-mq.h"
> +#include "blk-mq-tag.h"
> +#include "blk-mq-sched.h"
> +
> +static unsigned int queue_depth = 256;
> +module_param(queue_depth, uint, 0644);
> +MODULE_PARM_DESC(queue_depth, "Use this value as the scheduler queue depth");
> +
> +/*
> + * See Documentation/block/deadline-iosched.txt
> + */
> +static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
> +static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
> +static const int writes_starved = 2;    /* max times reads can starve a write */
> +static const int fifo_batch = 16;       /* # of sequential requests treated as one
> +				     by the above parameters. For throughput. */
> +
> +struct deadline_data {
> +	/*
> +	 * run time data
> +	 */
> +
> +	/*
> +	 * requests (deadline_rq s) are present on both sort_list and fifo_list
> +	 */
> +	struct rb_root sort_list[2];
> +	struct list_head fifo_list[2];
> +
> +	/*
> +	 * next in sort order. read, write or both are NULL
> +	 */
> +	struct request *next_rq[2];
> +	unsigned int batching;		/* number of sequential requests made */
> +	unsigned int starved;		/* times reads have starved writes */
> +
> +	/*
> +	 * settings that change how the i/o scheduler behaves
> +	 */
> +	int fifo_expire[2];
> +	int fifo_batch;
> +	int writes_starved;
> +	int front_merges;
> +
> +	spinlock_t lock;
> +	struct list_head dispatch;
> +	struct blk_mq_tags *tags;
> +	atomic_t wait_index;
> +};
> +
> +static inline struct rb_root *
> +deadline_rb_root(struct deadline_data *dd, struct request *rq)
> +{
> +	return &dd->sort_list[rq_data_dir(rq)];
> +}
> +
> +/*
> + * get the request after `rq' in sector-sorted order
> + */
> +static inline struct request *
> +deadline_latter_request(struct request *rq)
> +{
> +	struct rb_node *node = rb_next(&rq->rb_node);
> +
> +	if (node)
> +		return rb_entry_rq(node);
> +
> +	return NULL;
> +}
> +
> +static void
> +deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
> +{
> +	struct rb_root *root = deadline_rb_root(dd, rq);
> +
> +	elv_rb_add(root, rq);
> +}
> +
> +static inline void
> +deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
> +{
> +	const int data_dir = rq_data_dir(rq);
> +
> +	if (dd->next_rq[data_dir] == rq)
> +		dd->next_rq[data_dir] = deadline_latter_request(rq);
> +
> +	elv_rb_del(deadline_rb_root(dd, rq), rq);
> +}
> +
> +/*
> + * remove rq from rbtree and fifo.
> + */
> +static void deadline_remove_request(struct request_queue *q, struct request *rq)
> +{
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +
> +	list_del_init(&rq->queuelist);
> +
> +	/*
> +	 * We might not be on the rbtree, if we are doing an insert merge
> +	 */
> +	if (!RB_EMPTY_NODE(&rq->rb_node))
> +		deadline_del_rq_rb(dd, rq);
> +
> +	elv_rqhash_del(q, rq);
> +	if (q->last_merge == rq)
> +		q->last_merge = NULL;
> +}
> +
> +static void dd_request_merged(struct request_queue *q, struct request *req,
> +			      int type)
> +{
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +
> +	/*
> +	 * if the merge was a front merge, we need to reposition request
> +	 */
> +	if (type == ELEVATOR_FRONT_MERGE) {
> +		elv_rb_del(deadline_rb_root(dd, req), req);
> +		deadline_add_rq_rb(dd, req);
> +	}
> +}
> +
> +static void dd_merged_requests(struct request_queue *q, struct request *req,
> +			       struct request *next)
> +{
> +	/*
> +	 * if next expires before rq, assign its expire time to rq
> +	 * and move into next position (next will be deleted) in fifo
> +	 */
> +	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
> +		if (time_before((unsigned long)next->fifo_time,
> +				(unsigned long)req->fifo_time)) {
> +			list_move(&req->queuelist, &next->queuelist);
> +			req->fifo_time = next->fifo_time;
> +		}
> +	}
> +
> +	/*
> +	 * kill knowledge of next, this one is a goner
> +	 */
> +	deadline_remove_request(q, next);
> +}
> +
> +/*
> + * move an entry to dispatch queue
> + */
> +static void
> +deadline_move_request(struct deadline_data *dd, struct request *rq)
> +{
> +	const int data_dir = rq_data_dir(rq);
> +
> +	dd->next_rq[READ] = NULL;
> +	dd->next_rq[WRITE] = NULL;
> +	dd->next_rq[data_dir] = deadline_latter_request(rq);
> +
> +	/*
> +	 * take it off the sort and fifo list
> +	 */
> +	deadline_remove_request(rq->q, rq);
> +}
> +
> +/*
> + * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
> + * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
> + */
> +static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
> +{
> +	struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
> +
> +	/*
> +	 * rq is expired!
> +	 */
> +	if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
> +		return 1;
> +
> +	return 0;
> +}
> +
> +/*
> + * deadline_dispatch_requests selects the best request according to
> + * read/write expire, fifo_batch, etc
> + */
> +static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
> +{
> +	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
> +	struct request *rq;
> +	bool reads, writes;
> +	int data_dir;
> +
> +	spin_lock(&dd->lock);
> +
> +	if (!list_empty(&dd->dispatch)) {
> +		rq = list_first_entry(&dd->dispatch, struct request, queuelist);
> +		list_del_init(&rq->queuelist);
> +		goto done;
> +	}
> +
> +	reads = !list_empty(&dd->fifo_list[READ]);
> +	writes = !list_empty(&dd->fifo_list[WRITE]);
> +
> +	/*
> +	 * batches are currently reads XOR writes
> +	 */
> +	if (dd->next_rq[WRITE])
> +		rq = dd->next_rq[WRITE];
> +	else
> +		rq = dd->next_rq[READ];
> +
> +	if (rq && dd->batching < dd->fifo_batch)
> +		/* we have a next request are still entitled to batch */
> +		goto dispatch_request;
> +
> +	/*
> +	 * at this point we are not running a batch. select the appropriate
> +	 * data direction (read / write)
> +	 */
> +
> +	if (reads) {
> +		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
> +
> +		if (writes && (dd->starved++ >= dd->writes_starved))
> +			goto dispatch_writes;
> +
> +		data_dir = READ;
> +
> +		goto dispatch_find_request;
> +	}
> +
> +	/*
> +	 * there are either no reads or writes have been starved
> +	 */
> +
> +	if (writes) {
> +dispatch_writes:
> +		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
> +
> +		dd->starved = 0;
> +
> +		data_dir = WRITE;
> +
> +		goto dispatch_find_request;
> +	}
> +
> +	spin_unlock(&dd->lock);
> +	return NULL;
> +
> +dispatch_find_request:
> +	/*
> +	 * we are not running a batch, find best request for selected data_dir
> +	 */
> +	if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
> +		/*
> +		 * A deadline has expired, the last request was in the other
> +		 * direction, or we have run out of higher-sectored requests.
> +		 * Start again from the request with the earliest expiry time.
> +		 */
> +		rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
> +	} else {
> +		/*
> +		 * The last req was the same dir and we have a next request in
> +		 * sort order. No expired requests so continue on from here.
> +		 */
> +		rq = dd->next_rq[data_dir];
> +	}
> +
> +	dd->batching = 0;
> +
> +dispatch_request:
> +	/*
> +	 * rq is the selected appropriate request.
> +	 */
> +	dd->batching++;
> +	deadline_move_request(dd, rq);
> +done:
> +	rq->rq_flags |= RQF_STARTED;
> +	spin_unlock(&dd->lock);
> +	return rq;
> +}
> +
> +static void dd_dispatch_requests(struct blk_mq_hw_ctx *hctx,
> +				 struct list_head *rq_list)
> +{
> +	blk_mq_sched_dispatch_shadow_requests(hctx, rq_list, __dd_dispatch_request);
> +}
> +
> +static void dd_exit_queue(struct elevator_queue *e)
> +{
> +	struct deadline_data *dd = e->elevator_data;
> +
> +	BUG_ON(!list_empty(&dd->fifo_list[READ]));
> +	BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
> +
> +	blk_mq_sched_free_requests(dd->tags);
> +	kfree(dd);
> +}
> +
> +/*
> + * initialize elevator private data (deadline_data).
> + */
> +static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
> +{
> +	struct deadline_data *dd;
> +	struct elevator_queue *eq;
> +
> +	eq = elevator_alloc(q, e);
> +	if (!eq)
> +		return -ENOMEM;
> +
> +	dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
> +	if (!dd) {
> +		kobject_put(&eq->kobj);
> +		return -ENOMEM;
> +	}
> +	eq->elevator_data = dd;
> +
> +	dd->tags = blk_mq_sched_alloc_requests(queue_depth, q->node);
> +	if (!dd->tags) {
> +		kfree(dd);
> +		kobject_put(&eq->kobj);
> +		return -ENOMEM;
> +	}
> +
> +	INIT_LIST_HEAD(&dd->fifo_list[READ]);
> +	INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
> +	dd->sort_list[READ] = RB_ROOT;
> +	dd->sort_list[WRITE] = RB_ROOT;
> +	dd->fifo_expire[READ] = read_expire;
> +	dd->fifo_expire[WRITE] = write_expire;
> +	dd->writes_starved = writes_starved;
> +	dd->front_merges = 1;
> +	dd->fifo_batch = fifo_batch;
> +	spin_lock_init(&dd->lock);
> +	INIT_LIST_HEAD(&dd->dispatch);
> +	atomic_set(&dd->wait_index, 0);
> +
> +	q->elevator = eq;
> +	return 0;
> +}
> +
> +static int dd_request_merge(struct request_queue *q, struct request **rq,
> +			    struct bio *bio)
> +{
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +	sector_t sector = bio_end_sector(bio);
> +	struct request *__rq;
> +
> +	if (!dd->front_merges)
> +		return ELEVATOR_NO_MERGE;
> +
> +	__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
> +	if (__rq) {
> +		BUG_ON(sector != blk_rq_pos(__rq));
> +
> +		if (elv_bio_merge_ok(__rq, bio)) {
> +			*rq = __rq;
> +			return ELEVATOR_FRONT_MERGE;
> +		}
> +	}
> +
> +	return ELEVATOR_NO_MERGE;
> +}
> +
> +static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
> +{
> +	struct request_queue *q = hctx->queue;
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +	int ret;
> +
> +	spin_lock(&dd->lock);
> +	ret = blk_mq_sched_try_merge(q, bio);
> +	spin_unlock(&dd->lock);
> +

Hi Jens,
first, good news, bfq is passing my first sanity checks.  Still, I
need a little more help for the following issue.  There is a case that
would be impossible to handle without modifying code outside bfq.  But
so far such a case never occurred, and I hope that it can never occur.
I'll try to briefly list all relevant details on this concern of mine,
so that you can quickly confirm my hope, or highlight where or what I
am missing.

First, as done above for mq-deadline, invoking blk_mq_sched_try_merge
with the scheduler lock held is of course necessary (for example, to
protect q->last_merge).  This may lead to put_rq_private invoked
with the lock held, in case of successful merge.

As a consequence, put_rq_private may be invoked:
(1) in IRQ context, no scheduler lock held, because of a completion:
can be handled by deferring work and lock grabbing, because the
completed request is not queued in the scheduler any more;
(2) in process context, scheduler lock held, because of the above
successful merge: must be handled immediately, for consistency,
because the request is still queued in the scheduler;
(3) in process context, no scheduler lock held, for some other reason:
some path apparently may lead to this case, although I've never seen
it to happen.  Immediate handling, and hence locking, may be needed,
depending on whether the request is still queued in the scheduler.

So, my main question is: is case (3) actually impossible?  Should it
be possible, I guess we would have a problem, because of the
different lock state with respect to (2).

Finally, I hope that it is certainly impossible to have a case (4): in
IRQ context, no lock held, but with the request in the scheduler.

Thanks,
Paolo

> +	return ret;
> +}
> +
> +/*
> + * add rq to rbtree and fifo
> + */
> +static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
> +			      bool at_head)
> +{
> +	struct request_queue *q = hctx->queue;
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +	const int data_dir = rq_data_dir(rq);
> +
> +	if (blk_mq_sched_try_insert_merge(q, rq))
> +		return;
> +
> +	blk_mq_sched_request_inserted(rq);
> +
> +	/*
> +	 * If we're trying to insert a real request, just send it directly
> +	 * to the hardware dispatch list. This only happens for a requeue,
> +	 * or FUA/FLUSH requests.
> +	 */
> +	if (!blk_mq_sched_rq_is_shadow(rq)) {
> +		spin_lock(&hctx->lock);
> +		list_add_tail(&rq->queuelist, &hctx->dispatch);
> +		spin_unlock(&hctx->lock);
> +		return;
> +	}
> +
> +	if (at_head || rq->cmd_type != REQ_TYPE_FS) {
> +		if (at_head)
> +			list_add(&rq->queuelist, &dd->dispatch);
> +		else
> +			list_add_tail(&rq->queuelist, &dd->dispatch);
> +	} else {
> +		deadline_add_rq_rb(dd, rq);
> +
> +		if (rq_mergeable(rq)) {
> +			elv_rqhash_add(q, rq);
> +			if (!q->last_merge)
> +				q->last_merge = rq;
> +		}
> +
> +		/*
> +		 * set expire time and add to fifo list
> +		 */
> +		rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
> +		list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
> +	}
> +}
> +
> +static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
> +			       struct list_head *list, bool at_head)
> +{
> +	struct request_queue *q = hctx->queue;
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +
> +	spin_lock(&dd->lock);
> +	while (!list_empty(list)) {
> +		struct request *rq;
> +
> +		rq = list_first_entry(list, struct request, queuelist);
> +		list_del_init(&rq->queuelist);
> +		dd_insert_request(hctx, rq, at_head);
> +	}
> +	spin_unlock(&dd->lock);
> +}
> +
> +static struct request *dd_get_request(struct request_queue *q, unsigned int op,
> +				      struct blk_mq_alloc_data *data)
> +{
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +	struct request *rq;
> +
> +	/*
> +	 * The flush machinery intercepts before we insert the request. As
> +	 * a work-around, just hand it back a real request.
> +	 */
> +	if (unlikely(op & (REQ_PREFLUSH | REQ_FUA)))
> +		rq = __blk_mq_alloc_request(data, op);
> +	else {
> +		rq = blk_mq_sched_alloc_shadow_request(q, data, dd->tags, &dd->wait_index);
> +		if (rq)
> +			blk_mq_rq_ctx_init(q, data->ctx, rq, op);
> +	}
> +
> +	return rq;
> +}
> +
> +static bool dd_put_request(struct request *rq)
> +{
> +	/*
> +	 * If it's a real request, we just have to free it. For a shadow
> +	 * request, we should only free it if we haven't started it. A
> +	 * started request is mapped to a real one, and the real one will
> +	 * free it. We can get here with request merges, since we then
> +	 * free the request before we start/issue it.
> +	 */
> +	if (!blk_mq_sched_rq_is_shadow(rq))
> +		return false;
> +
> +	if (!(rq->rq_flags & RQF_STARTED)) {
> +		struct request_queue *q = rq->q;
> +		struct deadline_data *dd = q->elevator->elevator_data;
> +
> +		/*
> +		 * IO completion would normally do this, but if we merge
> +		 * and free before we issue the request, drop both the
> +		 * tag and queue ref
> +		 */
> +		blk_mq_sched_free_shadow_request(dd->tags, rq);
> +		blk_queue_exit(q);
> +	}
> +
> +	return true;
> +}
> +
> +static void dd_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
> +{
> +	struct request *sched_rq = rq->end_io_data;
> +
> +	/*
> +	 * sched_rq can be NULL, if we haven't setup the shadow yet
> +	 * because we failed getting one.
> +	 */
> +	if (sched_rq) {
> +		struct deadline_data *dd = hctx->queue->elevator->elevator_data;
> +
> +		blk_mq_sched_free_shadow_request(dd->tags, sched_rq);
> +		blk_mq_start_stopped_hw_queue(hctx, true);
> +	}
> +}
> +
> +static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
> +{
> +	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
> +
> +	return !list_empty_careful(&dd->dispatch) ||
> +		!list_empty_careful(&dd->fifo_list[0]) ||
> +		!list_empty_careful(&dd->fifo_list[1]);
> +}
> +
> +/*
> + * sysfs parts below
> + */
> +static ssize_t
> +deadline_var_show(int var, char *page)
> +{
> +	return sprintf(page, "%d\n", var);
> +}
> +
> +static ssize_t
> +deadline_var_store(int *var, const char *page, size_t count)
> +{
> +	char *p = (char *) page;
> +
> +	*var = simple_strtol(p, &p, 10);
> +	return count;
> +}
> +
> +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
> +static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
> +{									\
> +	struct deadline_data *dd = e->elevator_data;			\
> +	int __data = __VAR;						\
> +	if (__CONV)							\
> +		__data = jiffies_to_msecs(__data);			\
> +	return deadline_var_show(__data, (page));			\
> +}
> +SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
> +SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
> +SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
> +SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
> +SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
> +#undef SHOW_FUNCTION
> +
> +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
> +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
> +{									\
> +	struct deadline_data *dd = e->elevator_data;			\
> +	int __data;							\
> +	int ret = deadline_var_store(&__data, (page), count);		\
> +	if (__data < (MIN))						\
> +		__data = (MIN);						\
> +	else if (__data > (MAX))					\
> +		__data = (MAX);						\
> +	if (__CONV)							\
> +		*(__PTR) = msecs_to_jiffies(__data);			\
> +	else								\
> +		*(__PTR) = __data;					\
> +	return ret;							\
> +}
> +STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
> +STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
> +STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
> +STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
> +STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
> +#undef STORE_FUNCTION
> +
> +#define DD_ATTR(name) \
> +	__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
> +				      deadline_##name##_store)
> +
> +static struct elv_fs_entry deadline_attrs[] = {
> +	DD_ATTR(read_expire),
> +	DD_ATTR(write_expire),
> +	DD_ATTR(writes_starved),
> +	DD_ATTR(front_merges),
> +	DD_ATTR(fifo_batch),
> +	__ATTR_NULL
> +};
> +
> +static struct elevator_type mq_deadline = {
> +	.ops.mq = {
> +		.get_request		= dd_get_request,
> +		.put_request		= dd_put_request,
> +		.insert_requests	= dd_insert_requests,
> +		.dispatch_requests	= dd_dispatch_requests,
> +		.completed_request	= dd_completed_request,
> +		.next_request		= elv_rb_latter_request,
> +		.former_request		= elv_rb_former_request,
> +		.bio_merge		= dd_bio_merge,
> +		.request_merge		= dd_request_merge,
> +		.requests_merged	= dd_merged_requests,
> +		.request_merged		= dd_request_merged,
> +		.has_work		= dd_has_work,
> +		.init_sched		= dd_init_queue,
> +		.exit_sched		= dd_exit_queue,
> +	},
> +
> +	.uses_mq	= true,
> +	.elevator_attrs = deadline_attrs,
> +	.elevator_name = "mq-deadline",
> +	.elevator_owner = THIS_MODULE,
> +};
> +
> +static int __init deadline_init(void)
> +{
> +	if (!queue_depth) {
> +		pr_err("mq-deadline: queue depth must be > 0\n");
> +		return -EINVAL;
> +	}
> +	return elv_register(&mq_deadline);
> +}
> +
> +static void __exit deadline_exit(void)
> +{
> +	elv_unregister(&mq_deadline);
> +}
> +
> +module_init(deadline_init);
> +module_exit(deadline_exit);
> +
> +MODULE_AUTHOR("Jens Axboe");
> +MODULE_LICENSE("GPL");
> +MODULE_DESCRIPTION("MQ deadline IO scheduler");
> -- 
> 2.7.4
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Valente Feb. 1, 2017, 11:56 a.m. UTC | #16
> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe <axboe@fb.com> ha scritto:
> 
> This is basically identical to deadline-iosched, except it registers
> as a MQ capable scheduler. This is still a single queue design.
> 
> Signed-off-by: Jens Axboe <axboe@fb.com>
> ---
> block/Kconfig.iosched |   6 +
> block/Makefile        |   1 +
> block/mq-deadline.c   | 649 ++++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 656 insertions(+)
> create mode 100644 block/mq-deadline.c
> 
> diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
> index 421bef9c4c48..490ef2850fae 100644
> --- a/block/Kconfig.iosched
> +++ b/block/Kconfig.iosched
> @@ -32,6 +32,12 @@ config IOSCHED_CFQ
> 
> 	  This is the default I/O scheduler.
> 
> +config MQ_IOSCHED_DEADLINE
> +	tristate "MQ deadline I/O scheduler"
> +	default y
> +	---help---
> +	  MQ version of the deadline IO scheduler.
> +
> config CFQ_GROUP_IOSCHED
> 	bool "CFQ Group Scheduling support"
> 	depends on IOSCHED_CFQ && BLK_CGROUP
> diff --git a/block/Makefile b/block/Makefile
> index 2eee9e1bb6db..3ee0abd7205a 100644
> --- a/block/Makefile
> +++ b/block/Makefile
> @@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
> obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
> obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
> obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
> +obj-$(CONFIG_MQ_IOSCHED_DEADLINE)	+= mq-deadline.o
> 
> obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
> obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
> diff --git a/block/mq-deadline.c b/block/mq-deadline.c
> new file mode 100644
> index 000000000000..3cb9de21ab21
> --- /dev/null
> +++ b/block/mq-deadline.c
> @@ -0,0 +1,649 @@
> +/*
> + *  MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
> + *  for the blk-mq scheduling framework
> + *
> + *  Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
> + */
> +#include <linux/kernel.h>
> +#include <linux/fs.h>
> +#include <linux/blkdev.h>
> +#include <linux/blk-mq.h>
> +#include <linux/elevator.h>
> +#include <linux/bio.h>
> +#include <linux/module.h>
> +#include <linux/slab.h>
> +#include <linux/init.h>
> +#include <linux/compiler.h>
> +#include <linux/rbtree.h>
> +#include <linux/sbitmap.h>
> +
> +#include "blk.h"
> +#include "blk-mq.h"
> +#include "blk-mq-tag.h"
> +#include "blk-mq-sched.h"
> +
> +static unsigned int queue_depth = 256;
> +module_param(queue_depth, uint, 0644);
> +MODULE_PARM_DESC(queue_depth, "Use this value as the scheduler queue depth");
> +
> +/*
> + * See Documentation/block/deadline-iosched.txt
> + */
> +static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
> +static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
> +static const int writes_starved = 2;    /* max times reads can starve a write */
> +static const int fifo_batch = 16;       /* # of sequential requests treated as one
> +				     by the above parameters. For throughput. */
> +
> +struct deadline_data {
> +	/*
> +	 * run time data
> +	 */
> +
> +	/*
> +	 * requests (deadline_rq s) are present on both sort_list and fifo_list
> +	 */
> +	struct rb_root sort_list[2];
> +	struct list_head fifo_list[2];
> +
> +	/*
> +	 * next in sort order. read, write or both are NULL
> +	 */
> +	struct request *next_rq[2];
> +	unsigned int batching;		/* number of sequential requests made */
> +	unsigned int starved;		/* times reads have starved writes */
> +
> +	/*
> +	 * settings that change how the i/o scheduler behaves
> +	 */
> +	int fifo_expire[2];
> +	int fifo_batch;
> +	int writes_starved;
> +	int front_merges;
> +
> +	spinlock_t lock;
> +	struct list_head dispatch;
> +	struct blk_mq_tags *tags;
> +	atomic_t wait_index;
> +};
> +
> +static inline struct rb_root *
> +deadline_rb_root(struct deadline_data *dd, struct request *rq)
> +{
> +	return &dd->sort_list[rq_data_dir(rq)];
> +}
> +
> +/*
> + * get the request after `rq' in sector-sorted order
> + */
> +static inline struct request *
> +deadline_latter_request(struct request *rq)
> +{
> +	struct rb_node *node = rb_next(&rq->rb_node);
> +
> +	if (node)
> +		return rb_entry_rq(node);
> +
> +	return NULL;
> +}
> +
> +static void
> +deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
> +{
> +	struct rb_root *root = deadline_rb_root(dd, rq);
> +
> +	elv_rb_add(root, rq);
> +}
> +
> +static inline void
> +deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
> +{
> +	const int data_dir = rq_data_dir(rq);
> +
> +	if (dd->next_rq[data_dir] == rq)
> +		dd->next_rq[data_dir] = deadline_latter_request(rq);
> +
> +	elv_rb_del(deadline_rb_root(dd, rq), rq);
> +}
> +
> +/*
> + * remove rq from rbtree and fifo.
> + */
> +static void deadline_remove_request(struct request_queue *q, struct request *rq)
> +{
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +
> +	list_del_init(&rq->queuelist);
> +
> +	/*
> +	 * We might not be on the rbtree, if we are doing an insert merge
> +	 */
> +	if (!RB_EMPTY_NODE(&rq->rb_node))
> +		deadline_del_rq_rb(dd, rq);
> +
> +	elv_rqhash_del(q, rq);
> +	if (q->last_merge == rq)
> +		q->last_merge = NULL;
> +}
> +
> +static void dd_request_merged(struct request_queue *q, struct request *req,
> +			      int type)
> +{
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +
> +	/*
> +	 * if the merge was a front merge, we need to reposition request
> +	 */
> +	if (type == ELEVATOR_FRONT_MERGE) {
> +		elv_rb_del(deadline_rb_root(dd, req), req);
> +		deadline_add_rq_rb(dd, req);
> +	}
> +}
> +
> +static void dd_merged_requests(struct request_queue *q, struct request *req,
> +			       struct request *next)
> +{
> +	/*
> +	 * if next expires before rq, assign its expire time to rq
> +	 * and move into next position (next will be deleted) in fifo
> +	 */
> +	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
> +		if (time_before((unsigned long)next->fifo_time,
> +				(unsigned long)req->fifo_time)) {
> +			list_move(&req->queuelist, &next->queuelist);
> +			req->fifo_time = next->fifo_time;
> +		}
> +	}
> +
> +	/*
> +	 * kill knowledge of next, this one is a goner
> +	 */
> +	deadline_remove_request(q, next);
> +}
> +
> +/*
> + * move an entry to dispatch queue
> + */
> +static void
> +deadline_move_request(struct deadline_data *dd, struct request *rq)
> +{
> +	const int data_dir = rq_data_dir(rq);
> +
> +	dd->next_rq[READ] = NULL;
> +	dd->next_rq[WRITE] = NULL;
> +	dd->next_rq[data_dir] = deadline_latter_request(rq);
> +
> +	/*
> +	 * take it off the sort and fifo list
> +	 */
> +	deadline_remove_request(rq->q, rq);
> +}
> +
> +/*
> + * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
> + * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
> + */
> +static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
> +{
> +	struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
> +
> +	/*
> +	 * rq is expired!
> +	 */
> +	if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
> +		return 1;
> +
> +	return 0;
> +}
> +
> +/*
> + * deadline_dispatch_requests selects the best request according to
> + * read/write expire, fifo_batch, etc
> + */
> +static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
> +{
> +	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
> +	struct request *rq;
> +	bool reads, writes;
> +	int data_dir;
> +
> +	spin_lock(&dd->lock);
> +
> +	if (!list_empty(&dd->dispatch)) {
> +		rq = list_first_entry(&dd->dispatch, struct request, queuelist);
> +		list_del_init(&rq->queuelist);
> +		goto done;
> +	}
> +
> +	reads = !list_empty(&dd->fifo_list[READ]);
> +	writes = !list_empty(&dd->fifo_list[WRITE]);
> +
> +	/*
> +	 * batches are currently reads XOR writes
> +	 */
> +	if (dd->next_rq[WRITE])
> +		rq = dd->next_rq[WRITE];
> +	else
> +		rq = dd->next_rq[READ];
> +
> +	if (rq && dd->batching < dd->fifo_batch)
> +		/* we have a next request are still entitled to batch */
> +		goto dispatch_request;
> +
> +	/*
> +	 * at this point we are not running a batch. select the appropriate
> +	 * data direction (read / write)
> +	 */
> +
> +	if (reads) {
> +		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
> +
> +		if (writes && (dd->starved++ >= dd->writes_starved))
> +			goto dispatch_writes;
> +
> +		data_dir = READ;
> +
> +		goto dispatch_find_request;
> +	}
> +
> +	/*
> +	 * there are either no reads or writes have been starved
> +	 */
> +
> +	if (writes) {
> +dispatch_writes:
> +		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
> +
> +		dd->starved = 0;
> +
> +		data_dir = WRITE;
> +
> +		goto dispatch_find_request;
> +	}
> +
> +	spin_unlock(&dd->lock);
> +	return NULL;
> +
> +dispatch_find_request:
> +	/*
> +	 * we are not running a batch, find best request for selected data_dir
> +	 */
> +	if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
> +		/*
> +		 * A deadline has expired, the last request was in the other
> +		 * direction, or we have run out of higher-sectored requests.
> +		 * Start again from the request with the earliest expiry time.
> +		 */
> +		rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
> +	} else {
> +		/*
> +		 * The last req was the same dir and we have a next request in
> +		 * sort order. No expired requests so continue on from here.
> +		 */
> +		rq = dd->next_rq[data_dir];
> +	}
> +
> +	dd->batching = 0;
> +
> +dispatch_request:
> +	/*
> +	 * rq is the selected appropriate request.
> +	 */
> +	dd->batching++;
> +	deadline_move_request(dd, rq);
> +done:
> +	rq->rq_flags |= RQF_STARTED;
> +	spin_unlock(&dd->lock);
> +	return rq;
> +}
> +
> +static void dd_dispatch_requests(struct blk_mq_hw_ctx *hctx,
> +				 struct list_head *rq_list)
> +{
> +	blk_mq_sched_dispatch_shadow_requests(hctx, rq_list, __dd_dispatch_request);
> +}
> +
> +static void dd_exit_queue(struct elevator_queue *e)
> +{
> +	struct deadline_data *dd = e->elevator_data;
> +
> +	BUG_ON(!list_empty(&dd->fifo_list[READ]));
> +	BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
> +
> +	blk_mq_sched_free_requests(dd->tags);
> +	kfree(dd);
> +}
> +
> +/*
> + * initialize elevator private data (deadline_data).
> + */
> +static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
> +{
> +	struct deadline_data *dd;
> +	struct elevator_queue *eq;
> +
> +	eq = elevator_alloc(q, e);
> +	if (!eq)
> +		return -ENOMEM;
> +
> +	dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
> +	if (!dd) {
> +		kobject_put(&eq->kobj);
> +		return -ENOMEM;
> +	}
> +	eq->elevator_data = dd;
> +
> +	dd->tags = blk_mq_sched_alloc_requests(queue_depth, q->node);
> +	if (!dd->tags) {
> +		kfree(dd);
> +		kobject_put(&eq->kobj);
> +		return -ENOMEM;
> +	}
> +
> +	INIT_LIST_HEAD(&dd->fifo_list[READ]);
> +	INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
> +	dd->sort_list[READ] = RB_ROOT;
> +	dd->sort_list[WRITE] = RB_ROOT;
> +	dd->fifo_expire[READ] = read_expire;
> +	dd->fifo_expire[WRITE] = write_expire;
> +	dd->writes_starved = writes_starved;
> +	dd->front_merges = 1;
> +	dd->fifo_batch = fifo_batch;
> +	spin_lock_init(&dd->lock);
> +	INIT_LIST_HEAD(&dd->dispatch);
> +	atomic_set(&dd->wait_index, 0);
> +
> +	q->elevator = eq;
> +	return 0;
> +}
> +
> +static int dd_request_merge(struct request_queue *q, struct request **rq,
> +			    struct bio *bio)
> +{
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +	sector_t sector = bio_end_sector(bio);
> +	struct request *__rq;
> +
> +	if (!dd->front_merges)
> +		return ELEVATOR_NO_MERGE;
> +
> +	__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
> +	if (__rq) {
> +		BUG_ON(sector != blk_rq_pos(__rq));
> +
> +		if (elv_bio_merge_ok(__rq, bio)) {
> +			*rq = __rq;
> +			return ELEVATOR_FRONT_MERGE;
> +		}
> +	}
> +
> +	return ELEVATOR_NO_MERGE;
> +}
> +
> +static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
> +{
> +	struct request_queue *q = hctx->queue;
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +	int ret;
> +
> +	spin_lock(&dd->lock);
> +	ret = blk_mq_sched_try_merge(q, bio);
> +	spin_unlock(&dd->lock);
> +
> +	return ret;
> +}
> +
> +/*
> + * add rq to rbtree and fifo
> + */
> +static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
> +			      bool at_head)
> +{
> +	struct request_queue *q = hctx->queue;
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +	const int data_dir = rq_data_dir(rq);
> +
> +	if (blk_mq_sched_try_insert_merge(q, rq))
> +		return;
> +

A related doubt: shouldn't blk_mq_sched_try_insert_merge be invoked with the scheduler lock held too, as blk_mq_sched_try_merge, to protect (at least) q->last_merge?

In bfq this function is invoked with the lock held.

Thanks,
Paolo

> +	blk_mq_sched_request_inserted(rq);
> +
> +	/*
> +	 * If we're trying to insert a real request, just send it directly
> +	 * to the hardware dispatch list. This only happens for a requeue,
> +	 * or FUA/FLUSH requests.
> +	 */
> +	if (!blk_mq_sched_rq_is_shadow(rq)) {
> +		spin_lock(&hctx->lock);
> +		list_add_tail(&rq->queuelist, &hctx->dispatch);
> +		spin_unlock(&hctx->lock);
> +		return;
> +	}
> +
> +	if (at_head || rq->cmd_type != REQ_TYPE_FS) {
> +		if (at_head)
> +			list_add(&rq->queuelist, &dd->dispatch);
> +		else
> +			list_add_tail(&rq->queuelist, &dd->dispatch);
> +	} else {
> +		deadline_add_rq_rb(dd, rq);
> +
> +		if (rq_mergeable(rq)) {
> +			elv_rqhash_add(q, rq);
> +			if (!q->last_merge)
> +				q->last_merge = rq;
> +		}
> +
> +		/*
> +		 * set expire time and add to fifo list
> +		 */
> +		rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
> +		list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
> +	}
> +}
> +
> +static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
> +			       struct list_head *list, bool at_head)
> +{
> +	struct request_queue *q = hctx->queue;
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +
> +	spin_lock(&dd->lock);
> +	while (!list_empty(list)) {
> +		struct request *rq;
> +
> +		rq = list_first_entry(list, struct request, queuelist);
> +		list_del_init(&rq->queuelist);
> +		dd_insert_request(hctx, rq, at_head);
> +	}
> +	spin_unlock(&dd->lock);
> +}
> +
> +static struct request *dd_get_request(struct request_queue *q, unsigned int op,
> +				      struct blk_mq_alloc_data *data)
> +{
> +	struct deadline_data *dd = q->elevator->elevator_data;
> +	struct request *rq;
> +
> +	/*
> +	 * The flush machinery intercepts before we insert the request. As
> +	 * a work-around, just hand it back a real request.
> +	 */
> +	if (unlikely(op & (REQ_PREFLUSH | REQ_FUA)))
> +		rq = __blk_mq_alloc_request(data, op);
> +	else {
> +		rq = blk_mq_sched_alloc_shadow_request(q, data, dd->tags, &dd->wait_index);
> +		if (rq)
> +			blk_mq_rq_ctx_init(q, data->ctx, rq, op);
> +	}
> +
> +	return rq;
> +}
> +
> +static bool dd_put_request(struct request *rq)
> +{
> +	/*
> +	 * If it's a real request, we just have to free it. For a shadow
> +	 * request, we should only free it if we haven't started it. A
> +	 * started request is mapped to a real one, and the real one will
> +	 * free it. We can get here with request merges, since we then
> +	 * free the request before we start/issue it.
> +	 */
> +	if (!blk_mq_sched_rq_is_shadow(rq))
> +		return false;
> +
> +	if (!(rq->rq_flags & RQF_STARTED)) {
> +		struct request_queue *q = rq->q;
> +		struct deadline_data *dd = q->elevator->elevator_data;
> +
> +		/*
> +		 * IO completion would normally do this, but if we merge
> +		 * and free before we issue the request, drop both the
> +		 * tag and queue ref
> +		 */
> +		blk_mq_sched_free_shadow_request(dd->tags, rq);
> +		blk_queue_exit(q);
> +	}
> +
> +	return true;
> +}
> +
> +static void dd_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
> +{
> +	struct request *sched_rq = rq->end_io_data;
> +
> +	/*
> +	 * sched_rq can be NULL, if we haven't setup the shadow yet
> +	 * because we failed getting one.
> +	 */
> +	if (sched_rq) {
> +		struct deadline_data *dd = hctx->queue->elevator->elevator_data;
> +
> +		blk_mq_sched_free_shadow_request(dd->tags, sched_rq);
> +		blk_mq_start_stopped_hw_queue(hctx, true);
> +	}
> +}
> +
> +static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
> +{
> +	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
> +
> +	return !list_empty_careful(&dd->dispatch) ||
> +		!list_empty_careful(&dd->fifo_list[0]) ||
> +		!list_empty_careful(&dd->fifo_list[1]);
> +}
> +
> +/*
> + * sysfs parts below
> + */
> +static ssize_t
> +deadline_var_show(int var, char *page)
> +{
> +	return sprintf(page, "%d\n", var);
> +}
> +
> +static ssize_t
> +deadline_var_store(int *var, const char *page, size_t count)
> +{
> +	char *p = (char *) page;
> +
> +	*var = simple_strtol(p, &p, 10);
> +	return count;
> +}
> +
> +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
> +static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
> +{									\
> +	struct deadline_data *dd = e->elevator_data;			\
> +	int __data = __VAR;						\
> +	if (__CONV)							\
> +		__data = jiffies_to_msecs(__data);			\
> +	return deadline_var_show(__data, (page));			\
> +}
> +SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
> +SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
> +SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
> +SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
> +SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
> +#undef SHOW_FUNCTION
> +
> +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
> +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
> +{									\
> +	struct deadline_data *dd = e->elevator_data;			\
> +	int __data;							\
> +	int ret = deadline_var_store(&__data, (page), count);		\
> +	if (__data < (MIN))						\
> +		__data = (MIN);						\
> +	else if (__data > (MAX))					\
> +		__data = (MAX);						\
> +	if (__CONV)							\
> +		*(__PTR) = msecs_to_jiffies(__data);			\
> +	else								\
> +		*(__PTR) = __data;					\
> +	return ret;							\
> +}
> +STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
> +STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
> +STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
> +STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
> +STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
> +#undef STORE_FUNCTION
> +
> +#define DD_ATTR(name) \
> +	__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
> +				      deadline_##name##_store)
> +
> +static struct elv_fs_entry deadline_attrs[] = {
> +	DD_ATTR(read_expire),
> +	DD_ATTR(write_expire),
> +	DD_ATTR(writes_starved),
> +	DD_ATTR(front_merges),
> +	DD_ATTR(fifo_batch),
> +	__ATTR_NULL
> +};
> +
> +static struct elevator_type mq_deadline = {
> +	.ops.mq = {
> +		.get_request		= dd_get_request,
> +		.put_request		= dd_put_request,
> +		.insert_requests	= dd_insert_requests,
> +		.dispatch_requests	= dd_dispatch_requests,
> +		.completed_request	= dd_completed_request,
> +		.next_request		= elv_rb_latter_request,
> +		.former_request		= elv_rb_former_request,
> +		.bio_merge		= dd_bio_merge,
> +		.request_merge		= dd_request_merge,
> +		.requests_merged	= dd_merged_requests,
> +		.request_merged		= dd_request_merged,
> +		.has_work		= dd_has_work,
> +		.init_sched		= dd_init_queue,
> +		.exit_sched		= dd_exit_queue,
> +	},
> +
> +	.uses_mq	= true,
> +	.elevator_attrs = deadline_attrs,
> +	.elevator_name = "mq-deadline",
> +	.elevator_owner = THIS_MODULE,
> +};
> +
> +static int __init deadline_init(void)
> +{
> +	if (!queue_depth) {
> +		pr_err("mq-deadline: queue depth must be > 0\n");
> +		return -EINVAL;
> +	}
> +	return elv_register(&mq_deadline);
> +}
> +
> +static void __exit deadline_exit(void)
> +{
> +	elv_unregister(&mq_deadline);
> +}
> +
> +module_init(deadline_init);
> +module_exit(deadline_exit);
> +
> +MODULE_AUTHOR("Jens Axboe");
> +MODULE_LICENSE("GPL");
> +MODULE_DESCRIPTION("MQ deadline IO scheduler");
> -- 
> 2.7.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-block" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jens Axboe Feb. 2, 2017, 5:20 a.m. UTC | #17
On 02/01/2017 04:56 AM, Paolo Valente wrote:
>> +/*
>> + * add rq to rbtree and fifo
>> + */
>> +static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
>> +			      bool at_head)
>> +{
>> +	struct request_queue *q = hctx->queue;
>> +	struct deadline_data *dd = q->elevator->elevator_data;
>> +	const int data_dir = rq_data_dir(rq);
>> +
>> +	if (blk_mq_sched_try_insert_merge(q, rq))
>> +		return;
>> +
> 
> A related doubt: shouldn't blk_mq_sched_try_insert_merge be invoked
> with the scheduler lock held too, as blk_mq_sched_try_merge, to
> protect (at least) q->last_merge?
>
> In bfq this function is invoked with the lock held.

It doesn't matter which lock you use, as long as:

1) You use the same one consistently
2) It has the same scope as the queue lock (the one you call the
   scheduler lock)

mq-deadline sets up a per-queue structure, deadline_data, and it has a
lock embedded in that structure. This is what mq-deadline uses to
serialize access to its data structures, as well as those in the queue
(like last_merge).
Paolo Valente Feb. 16, 2017, 10:46 a.m. UTC | #18
> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe <axboe@fb.com> ha scritto:
> 
> This is basically identical to deadline-iosched, except it registers
> as a MQ capable scheduler. This is still a single queue design.
> 
> Signed-off-by: Jens Axboe <axboe@fb.com>
...
> +
> +static void dd_merged_requests(struct request_queue *q, struct request *req,
> +			       struct request *next)
> +{
> +	/*
> +	 * if next expires before rq, assign its expire time to rq
> +	 * and move into next position (next will be deleted) in fifo
> +	 */
> +	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
> +		if (time_before((unsigned long)next->fifo_time,
> +				(unsigned long)req->fifo_time)) {
> +			list_move(&req->queuelist, &next->queuelist);
> +			req->fifo_time = next->fifo_time;
> +		}
> +	}
> +

Jens,
while trying to imagine the possible causes of Bart's hang with
bfq-mq, I've bumped into the following doubt: in the above function
(in my case, in bfq-mq-'s equivalent of the above function), are
we sure that neither req or next could EVER be in dd->dispatch instead
of dd->fifo_list?  I've tried to verify it, but, although I think it has never
happened in my tests, I was not able to make sure that no unlucky
combination may ever happen (considering also the use of
blk_rq_is_passthrough too, to decide where to put a new request).

I'm making a blunder, right?

Thanks,
Paolo
Jens Axboe Feb. 16, 2017, 3:35 p.m. UTC | #19
On 02/16/2017 03:46 AM, Paolo Valente wrote:
> 
>> Il giorno 17 dic 2016, alle ore 01:12, Jens Axboe <axboe@fb.com> ha scritto:
>>
>> This is basically identical to deadline-iosched, except it registers
>> as a MQ capable scheduler. This is still a single queue design.
>>
>> Signed-off-by: Jens Axboe <axboe@fb.com>
> ...
>> +
>> +static void dd_merged_requests(struct request_queue *q, struct request *req,
>> +			       struct request *next)
>> +{
>> +	/*
>> +	 * if next expires before rq, assign its expire time to rq
>> +	 * and move into next position (next will be deleted) in fifo
>> +	 */
>> +	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
>> +		if (time_before((unsigned long)next->fifo_time,
>> +				(unsigned long)req->fifo_time)) {
>> +			list_move(&req->queuelist, &next->queuelist);
>> +			req->fifo_time = next->fifo_time;
>> +		}
>> +	}
>> +
> 
> Jens,
> while trying to imagine the possible causes of Bart's hang with
> bfq-mq, I've bumped into the following doubt: in the above function
> (in my case, in bfq-mq-'s equivalent of the above function), are
> we sure that neither req or next could EVER be in dd->dispatch instead
> of dd->fifo_list?  I've tried to verify it, but, although I think it has never
> happened in my tests, I was not able to make sure that no unlucky
> combination may ever happen (considering also the use of
> blk_rq_is_passthrough too, to decide where to put a new request).
> 
> I'm making a blunder, right?

If a request goes into dd->dispatch, it's going to be found for merging.
Hence we can never call the above on the request.
diff mbox

Patch

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 421bef9c4c48..490ef2850fae 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -32,6 +32,12 @@  config IOSCHED_CFQ
 
 	  This is the default I/O scheduler.
 
+config MQ_IOSCHED_DEADLINE
+	tristate "MQ deadline I/O scheduler"
+	default y
+	---help---
+	  MQ version of the deadline IO scheduler.
+
 config CFQ_GROUP_IOSCHED
 	bool "CFQ Group Scheduling support"
 	depends on IOSCHED_CFQ && BLK_CGROUP
diff --git a/block/Makefile b/block/Makefile
index 2eee9e1bb6db..3ee0abd7205a 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -18,6 +18,7 @@  obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
+obj-$(CONFIG_MQ_IOSCHED_DEADLINE)	+= mq-deadline.o
 
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
new file mode 100644
index 000000000000..3cb9de21ab21
--- /dev/null
+++ b/block/mq-deadline.c
@@ -0,0 +1,649 @@ 
+/*
+ *  MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
+ *  for the blk-mq scheduling framework
+ *
+ *  Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/elevator.h>
+#include <linux/bio.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/rbtree.h>
+#include <linux/sbitmap.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
+
+static unsigned int queue_depth = 256;
+module_param(queue_depth, uint, 0644);
+MODULE_PARM_DESC(queue_depth, "Use this value as the scheduler queue depth");
+
+/*
+ * See Documentation/block/deadline-iosched.txt
+ */
+static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
+static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
+static const int writes_starved = 2;    /* max times reads can starve a write */
+static const int fifo_batch = 16;       /* # of sequential requests treated as one
+				     by the above parameters. For throughput. */
+
+struct deadline_data {
+	/*
+	 * run time data
+	 */
+
+	/*
+	 * requests (deadline_rq s) are present on both sort_list and fifo_list
+	 */
+	struct rb_root sort_list[2];
+	struct list_head fifo_list[2];
+
+	/*
+	 * next in sort order. read, write or both are NULL
+	 */
+	struct request *next_rq[2];
+	unsigned int batching;		/* number of sequential requests made */
+	unsigned int starved;		/* times reads have starved writes */
+
+	/*
+	 * settings that change how the i/o scheduler behaves
+	 */
+	int fifo_expire[2];
+	int fifo_batch;
+	int writes_starved;
+	int front_merges;
+
+	spinlock_t lock;
+	struct list_head dispatch;
+	struct blk_mq_tags *tags;
+	atomic_t wait_index;
+};
+
+static inline struct rb_root *
+deadline_rb_root(struct deadline_data *dd, struct request *rq)
+{
+	return &dd->sort_list[rq_data_dir(rq)];
+}
+
+/*
+ * get the request after `rq' in sector-sorted order
+ */
+static inline struct request *
+deadline_latter_request(struct request *rq)
+{
+	struct rb_node *node = rb_next(&rq->rb_node);
+
+	if (node)
+		return rb_entry_rq(node);
+
+	return NULL;
+}
+
+static void
+deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
+{
+	struct rb_root *root = deadline_rb_root(dd, rq);
+
+	elv_rb_add(root, rq);
+}
+
+static inline void
+deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
+{
+	const int data_dir = rq_data_dir(rq);
+
+	if (dd->next_rq[data_dir] == rq)
+		dd->next_rq[data_dir] = deadline_latter_request(rq);
+
+	elv_rb_del(deadline_rb_root(dd, rq), rq);
+}
+
+/*
+ * remove rq from rbtree and fifo.
+ */
+static void deadline_remove_request(struct request_queue *q, struct request *rq)
+{
+	struct deadline_data *dd = q->elevator->elevator_data;
+
+	list_del_init(&rq->queuelist);
+
+	/*
+	 * We might not be on the rbtree, if we are doing an insert merge
+	 */
+	if (!RB_EMPTY_NODE(&rq->rb_node))
+		deadline_del_rq_rb(dd, rq);
+
+	elv_rqhash_del(q, rq);
+	if (q->last_merge == rq)
+		q->last_merge = NULL;
+}
+
+static void dd_request_merged(struct request_queue *q, struct request *req,
+			      int type)
+{
+	struct deadline_data *dd = q->elevator->elevator_data;
+
+	/*
+	 * if the merge was a front merge, we need to reposition request
+	 */
+	if (type == ELEVATOR_FRONT_MERGE) {
+		elv_rb_del(deadline_rb_root(dd, req), req);
+		deadline_add_rq_rb(dd, req);
+	}
+}
+
+static void dd_merged_requests(struct request_queue *q, struct request *req,
+			       struct request *next)
+{
+	/*
+	 * if next expires before rq, assign its expire time to rq
+	 * and move into next position (next will be deleted) in fifo
+	 */
+	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
+		if (time_before((unsigned long)next->fifo_time,
+				(unsigned long)req->fifo_time)) {
+			list_move(&req->queuelist, &next->queuelist);
+			req->fifo_time = next->fifo_time;
+		}
+	}
+
+	/*
+	 * kill knowledge of next, this one is a goner
+	 */
+	deadline_remove_request(q, next);
+}
+
+/*
+ * move an entry to dispatch queue
+ */
+static void
+deadline_move_request(struct deadline_data *dd, struct request *rq)
+{
+	const int data_dir = rq_data_dir(rq);
+
+	dd->next_rq[READ] = NULL;
+	dd->next_rq[WRITE] = NULL;
+	dd->next_rq[data_dir] = deadline_latter_request(rq);
+
+	/*
+	 * take it off the sort and fifo list
+	 */
+	deadline_remove_request(rq->q, rq);
+}
+
+/*
+ * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
+ * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
+ */
+static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
+{
+	struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
+
+	/*
+	 * rq is expired!
+	 */
+	if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * deadline_dispatch_requests selects the best request according to
+ * read/write expire, fifo_batch, etc
+ */
+static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+	struct request *rq;
+	bool reads, writes;
+	int data_dir;
+
+	spin_lock(&dd->lock);
+
+	if (!list_empty(&dd->dispatch)) {
+		rq = list_first_entry(&dd->dispatch, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		goto done;
+	}
+
+	reads = !list_empty(&dd->fifo_list[READ]);
+	writes = !list_empty(&dd->fifo_list[WRITE]);
+
+	/*
+	 * batches are currently reads XOR writes
+	 */
+	if (dd->next_rq[WRITE])
+		rq = dd->next_rq[WRITE];
+	else
+		rq = dd->next_rq[READ];
+
+	if (rq && dd->batching < dd->fifo_batch)
+		/* we have a next request are still entitled to batch */
+		goto dispatch_request;
+
+	/*
+	 * at this point we are not running a batch. select the appropriate
+	 * data direction (read / write)
+	 */
+
+	if (reads) {
+		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
+
+		if (writes && (dd->starved++ >= dd->writes_starved))
+			goto dispatch_writes;
+
+		data_dir = READ;
+
+		goto dispatch_find_request;
+	}
+
+	/*
+	 * there are either no reads or writes have been starved
+	 */
+
+	if (writes) {
+dispatch_writes:
+		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
+
+		dd->starved = 0;
+
+		data_dir = WRITE;
+
+		goto dispatch_find_request;
+	}
+
+	spin_unlock(&dd->lock);
+	return NULL;
+
+dispatch_find_request:
+	/*
+	 * we are not running a batch, find best request for selected data_dir
+	 */
+	if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+		/*
+		 * A deadline has expired, the last request was in the other
+		 * direction, or we have run out of higher-sectored requests.
+		 * Start again from the request with the earliest expiry time.
+		 */
+		rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+	} else {
+		/*
+		 * The last req was the same dir and we have a next request in
+		 * sort order. No expired requests so continue on from here.
+		 */
+		rq = dd->next_rq[data_dir];
+	}
+
+	dd->batching = 0;
+
+dispatch_request:
+	/*
+	 * rq is the selected appropriate request.
+	 */
+	dd->batching++;
+	deadline_move_request(dd, rq);
+done:
+	rq->rq_flags |= RQF_STARTED;
+	spin_unlock(&dd->lock);
+	return rq;
+}
+
+static void dd_dispatch_requests(struct blk_mq_hw_ctx *hctx,
+				 struct list_head *rq_list)
+{
+	blk_mq_sched_dispatch_shadow_requests(hctx, rq_list, __dd_dispatch_request);
+}
+
+static void dd_exit_queue(struct elevator_queue *e)
+{
+	struct deadline_data *dd = e->elevator_data;
+
+	BUG_ON(!list_empty(&dd->fifo_list[READ]));
+	BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
+
+	blk_mq_sched_free_requests(dd->tags);
+	kfree(dd);
+}
+
+/*
+ * initialize elevator private data (deadline_data).
+ */
+static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
+{
+	struct deadline_data *dd;
+	struct elevator_queue *eq;
+
+	eq = elevator_alloc(q, e);
+	if (!eq)
+		return -ENOMEM;
+
+	dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
+	if (!dd) {
+		kobject_put(&eq->kobj);
+		return -ENOMEM;
+	}
+	eq->elevator_data = dd;
+
+	dd->tags = blk_mq_sched_alloc_requests(queue_depth, q->node);
+	if (!dd->tags) {
+		kfree(dd);
+		kobject_put(&eq->kobj);
+		return -ENOMEM;
+	}
+
+	INIT_LIST_HEAD(&dd->fifo_list[READ]);
+	INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
+	dd->sort_list[READ] = RB_ROOT;
+	dd->sort_list[WRITE] = RB_ROOT;
+	dd->fifo_expire[READ] = read_expire;
+	dd->fifo_expire[WRITE] = write_expire;
+	dd->writes_starved = writes_starved;
+	dd->front_merges = 1;
+	dd->fifo_batch = fifo_batch;
+	spin_lock_init(&dd->lock);
+	INIT_LIST_HEAD(&dd->dispatch);
+	atomic_set(&dd->wait_index, 0);
+
+	q->elevator = eq;
+	return 0;
+}
+
+static int dd_request_merge(struct request_queue *q, struct request **rq,
+			    struct bio *bio)
+{
+	struct deadline_data *dd = q->elevator->elevator_data;
+	sector_t sector = bio_end_sector(bio);
+	struct request *__rq;
+
+	if (!dd->front_merges)
+		return ELEVATOR_NO_MERGE;
+
+	__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
+	if (__rq) {
+		BUG_ON(sector != blk_rq_pos(__rq));
+
+		if (elv_bio_merge_ok(__rq, bio)) {
+			*rq = __rq;
+			return ELEVATOR_FRONT_MERGE;
+		}
+	}
+
+	return ELEVATOR_NO_MERGE;
+}
+
+static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+{
+	struct request_queue *q = hctx->queue;
+	struct deadline_data *dd = q->elevator->elevator_data;
+	int ret;
+
+	spin_lock(&dd->lock);
+	ret = blk_mq_sched_try_merge(q, bio);
+	spin_unlock(&dd->lock);
+
+	return ret;
+}
+
+/*
+ * add rq to rbtree and fifo
+ */
+static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+			      bool at_head)
+{
+	struct request_queue *q = hctx->queue;
+	struct deadline_data *dd = q->elevator->elevator_data;
+	const int data_dir = rq_data_dir(rq);
+
+	if (blk_mq_sched_try_insert_merge(q, rq))
+		return;
+
+	blk_mq_sched_request_inserted(rq);
+
+	/*
+	 * If we're trying to insert a real request, just send it directly
+	 * to the hardware dispatch list. This only happens for a requeue,
+	 * or FUA/FLUSH requests.
+	 */
+	if (!blk_mq_sched_rq_is_shadow(rq)) {
+		spin_lock(&hctx->lock);
+		list_add_tail(&rq->queuelist, &hctx->dispatch);
+		spin_unlock(&hctx->lock);
+		return;
+	}
+
+	if (at_head || rq->cmd_type != REQ_TYPE_FS) {
+		if (at_head)
+			list_add(&rq->queuelist, &dd->dispatch);
+		else
+			list_add_tail(&rq->queuelist, &dd->dispatch);
+	} else {
+		deadline_add_rq_rb(dd, rq);
+
+		if (rq_mergeable(rq)) {
+			elv_rqhash_add(q, rq);
+			if (!q->last_merge)
+				q->last_merge = rq;
+		}
+
+		/*
+		 * set expire time and add to fifo list
+		 */
+		rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
+		list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
+	}
+}
+
+static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
+			       struct list_head *list, bool at_head)
+{
+	struct request_queue *q = hctx->queue;
+	struct deadline_data *dd = q->elevator->elevator_data;
+
+	spin_lock(&dd->lock);
+	while (!list_empty(list)) {
+		struct request *rq;
+
+		rq = list_first_entry(list, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		dd_insert_request(hctx, rq, at_head);
+	}
+	spin_unlock(&dd->lock);
+}
+
+static struct request *dd_get_request(struct request_queue *q, unsigned int op,
+				      struct blk_mq_alloc_data *data)
+{
+	struct deadline_data *dd = q->elevator->elevator_data;
+	struct request *rq;
+
+	/*
+	 * The flush machinery intercepts before we insert the request. As
+	 * a work-around, just hand it back a real request.
+	 */
+	if (unlikely(op & (REQ_PREFLUSH | REQ_FUA)))
+		rq = __blk_mq_alloc_request(data, op);
+	else {
+		rq = blk_mq_sched_alloc_shadow_request(q, data, dd->tags, &dd->wait_index);
+		if (rq)
+			blk_mq_rq_ctx_init(q, data->ctx, rq, op);
+	}
+
+	return rq;
+}
+
+static bool dd_put_request(struct request *rq)
+{
+	/*
+	 * If it's a real request, we just have to free it. For a shadow
+	 * request, we should only free it if we haven't started it. A
+	 * started request is mapped to a real one, and the real one will
+	 * free it. We can get here with request merges, since we then
+	 * free the request before we start/issue it.
+	 */
+	if (!blk_mq_sched_rq_is_shadow(rq))
+		return false;
+
+	if (!(rq->rq_flags & RQF_STARTED)) {
+		struct request_queue *q = rq->q;
+		struct deadline_data *dd = q->elevator->elevator_data;
+
+		/*
+		 * IO completion would normally do this, but if we merge
+		 * and free before we issue the request, drop both the
+		 * tag and queue ref
+		 */
+		blk_mq_sched_free_shadow_request(dd->tags, rq);
+		blk_queue_exit(q);
+	}
+
+	return true;
+}
+
+static void dd_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+	struct request *sched_rq = rq->end_io_data;
+
+	/*
+	 * sched_rq can be NULL, if we haven't setup the shadow yet
+	 * because we failed getting one.
+	 */
+	if (sched_rq) {
+		struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+
+		blk_mq_sched_free_shadow_request(dd->tags, sched_rq);
+		blk_mq_start_stopped_hw_queue(hctx, true);
+	}
+}
+
+static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
+{
+	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+
+	return !list_empty_careful(&dd->dispatch) ||
+		!list_empty_careful(&dd->fifo_list[0]) ||
+		!list_empty_careful(&dd->fifo_list[1]);
+}
+
+/*
+ * sysfs parts below
+ */
+static ssize_t
+deadline_var_show(int var, char *page)
+{
+	return sprintf(page, "%d\n", var);
+}
+
+static ssize_t
+deadline_var_store(int *var, const char *page, size_t count)
+{
+	char *p = (char *) page;
+
+	*var = simple_strtol(p, &p, 10);
+	return count;
+}
+
+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
+{									\
+	struct deadline_data *dd = e->elevator_data;			\
+	int __data = __VAR;						\
+	if (__CONV)							\
+		__data = jiffies_to_msecs(__data);			\
+	return deadline_var_show(__data, (page));			\
+}
+SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
+SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
+SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
+SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
+SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
+#undef SHOW_FUNCTION
+
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
+{									\
+	struct deadline_data *dd = e->elevator_data;			\
+	int __data;							\
+	int ret = deadline_var_store(&__data, (page), count);		\
+	if (__data < (MIN))						\
+		__data = (MIN);						\
+	else if (__data > (MAX))					\
+		__data = (MAX);						\
+	if (__CONV)							\
+		*(__PTR) = msecs_to_jiffies(__data);			\
+	else								\
+		*(__PTR) = __data;					\
+	return ret;							\
+}
+STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
+STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
+STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
+#undef STORE_FUNCTION
+
+#define DD_ATTR(name) \
+	__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
+				      deadline_##name##_store)
+
+static struct elv_fs_entry deadline_attrs[] = {
+	DD_ATTR(read_expire),
+	DD_ATTR(write_expire),
+	DD_ATTR(writes_starved),
+	DD_ATTR(front_merges),
+	DD_ATTR(fifo_batch),
+	__ATTR_NULL
+};
+
+static struct elevator_type mq_deadline = {
+	.ops.mq = {
+		.get_request		= dd_get_request,
+		.put_request		= dd_put_request,
+		.insert_requests	= dd_insert_requests,
+		.dispatch_requests	= dd_dispatch_requests,
+		.completed_request	= dd_completed_request,
+		.next_request		= elv_rb_latter_request,
+		.former_request		= elv_rb_former_request,
+		.bio_merge		= dd_bio_merge,
+		.request_merge		= dd_request_merge,
+		.requests_merged	= dd_merged_requests,
+		.request_merged		= dd_request_merged,
+		.has_work		= dd_has_work,
+		.init_sched		= dd_init_queue,
+		.exit_sched		= dd_exit_queue,
+	},
+
+	.uses_mq	= true,
+	.elevator_attrs = deadline_attrs,
+	.elevator_name = "mq-deadline",
+	.elevator_owner = THIS_MODULE,
+};
+
+static int __init deadline_init(void)
+{
+	if (!queue_depth) {
+		pr_err("mq-deadline: queue depth must be > 0\n");
+		return -EINVAL;
+	}
+	return elv_register(&mq_deadline);
+}
+
+static void __exit deadline_exit(void)
+{
+	elv_unregister(&mq_deadline);
+}
+
+module_init(deadline_init);
+module_exit(deadline_exit);
+
+MODULE_AUTHOR("Jens Axboe");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MQ deadline IO scheduler");