@@ -2,6 +2,19 @@ if BLOCK
menu "IO Schedulers"
+config ELV_FAIR_QUEUING
+ bool "Elevator Fair Queuing Support"
+ default n
+ ---help---
+ Traditionally only cfq had notion of multiple queues and it did
+ fair queuing at its own. With the cgroups and need of controlling
+ IO, now even the simple io schedulers like noop, deadline, as will
+ have one queue per cgroup and will need hierarchical fair queuing.
+ Instead of every io scheduler implementing its own fair queuing
+ logic, this option enables fair queuing in elevator layer so that
+ other ioschedulers can make use of it.
+ If unsure, say N.
+
config IOSCHED_NOOP
bool
default y
@@ -15,3 +15,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o
obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o
+obj-$(CONFIG_ELV_FAIR_QUEUING) += elevator-fq.o
@@ -276,6 +276,26 @@ static struct queue_sysfs_entry queue_iostats_entry = {
.store = queue_iostats_store,
};
+#ifdef CONFIG_ELV_FAIR_QUEUING
+static struct queue_sysfs_entry queue_slice_idle_entry = {
+ .attr = {.name = "slice_idle", .mode = S_IRUGO | S_IWUSR },
+ .show = elv_slice_idle_show,
+ .store = elv_slice_idle_store,
+};
+
+static struct queue_sysfs_entry queue_slice_sync_entry = {
+ .attr = {.name = "slice_sync", .mode = S_IRUGO | S_IWUSR },
+ .show = elv_slice_sync_show,
+ .store = elv_slice_sync_store,
+};
+
+static struct queue_sysfs_entry queue_slice_async_entry = {
+ .attr = {.name = "slice_async", .mode = S_IRUGO | S_IWUSR },
+ .show = elv_slice_async_show,
+ .store = elv_slice_async_store,
+};
+#endif
+
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
@@ -287,6 +307,11 @@ static struct attribute *default_attrs[] = {
&queue_nomerges_entry.attr,
&queue_rq_affinity_entry.attr,
&queue_iostats_entry.attr,
+#ifdef CONFIG_ELV_FAIR_QUEUING
+ &queue_slice_idle_entry.attr,
+ &queue_slice_sync_entry.attr,
+ &queue_slice_async_entry.attr,
+#endif
NULL,
};
new file mode 100644
@@ -0,0 +1,2078 @@
+/*
+ * BFQ: Hierarchical B-WF2Q+ scheduler.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ * Paolo Valente <paolo.valente@unimore.it>
+ */
+
+#include <linux/blkdev.h>
+#include "elevator-fq.h"
+#include <linux/blktrace_api.h>
+
+/* Values taken from cfq */
+const int elv_slice_sync = HZ / 10;
+int elv_slice_async = HZ / 25;
+const int elv_slice_async_rq = 2;
+int elv_slice_idle = HZ / 125;
+static struct kmem_cache *elv_ioq_pool;
+
+#define ELV_SLICE_SCALE (5)
+#define ELV_HW_QUEUE_MIN (5)
+#define IO_SERVICE_TREE_INIT ((struct io_service_tree) \
+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
+
+static inline struct io_queue *elv_close_cooperator(struct request_queue *q,
+ struct io_queue *ioq, int probe);
+struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd,
+ int extract);
+
+static inline int elv_prio_slice(struct elv_fq_data *efqd, int sync,
+ unsigned short prio)
+{
+ const int base_slice = efqd->elv_slice[sync];
+
+ WARN_ON(prio >= IOPRIO_BE_NR);
+
+ return base_slice + (base_slice/ELV_SLICE_SCALE * (4 - prio));
+}
+
+static inline int
+elv_prio_to_slice(struct elv_fq_data *efqd, struct io_queue *ioq)
+{
+ return elv_prio_slice(efqd, elv_ioq_sync(ioq), ioq->entity.ioprio);
+}
+
+/* Mainly the BFQ scheduling code Follows */
+
+/*
+ * Shift for timestamp calculations. This actually limits the maximum
+ * service allowed in one timestamp delta (small shift values increase it),
+ * the maximum total weight that can be used for the queues in the system
+ * (big shift values increase it), and the period of virtual time wraparounds.
+ */
+#define WFQ_SERVICE_SHIFT 22
+
+/**
+ * bfq_gt - compare two timestamps.
+ * @a: first ts.
+ * @b: second ts.
+ *
+ * Return @a > @b, dealing with wrapping correctly.
+ */
+static inline int bfq_gt(bfq_timestamp_t a, bfq_timestamp_t b)
+{
+ return (s64)(a - b) > 0;
+}
+
+/**
+ * bfq_delta - map service into the virtual time domain.
+ * @service: amount of service.
+ * @weight: scale factor.
+ */
+static inline bfq_timestamp_t bfq_delta(bfq_service_t service,
+ bfq_weight_t weight)
+{
+ bfq_timestamp_t d = (bfq_timestamp_t)service << WFQ_SERVICE_SHIFT;
+
+ do_div(d, weight);
+ return d;
+}
+
+/**
+ * bfq_calc_finish - assign the finish time to an entity.
+ * @entity: the entity to act upon.
+ * @service: the service to be charged to the entity.
+ */
+static inline void bfq_calc_finish(struct io_entity *entity,
+ bfq_service_t service)
+{
+ BUG_ON(entity->weight == 0);
+
+ entity->finish = entity->start + bfq_delta(service, entity->weight);
+}
+
+static inline struct io_queue *io_entity_to_ioq(struct io_entity *entity)
+{
+ struct io_queue *ioq = NULL;
+
+ BUG_ON(entity == NULL);
+ if (entity->my_sched_data == NULL)
+ ioq = container_of(entity, struct io_queue, entity);
+ return ioq;
+}
+
+/**
+ * bfq_entity_of - get an entity from a node.
+ * @node: the node field of the entity.
+ *
+ * Convert a node pointer to the relative entity. This is used only
+ * to simplify the logic of some functions and not as the generic
+ * conversion mechanism because, e.g., in the tree walking functions,
+ * the check for a %NULL value would be redundant.
+ */
+static inline struct io_entity *bfq_entity_of(struct rb_node *node)
+{
+ struct io_entity *entity = NULL;
+
+ if (node != NULL)
+ entity = rb_entry(node, struct io_entity, rb_node);
+
+ return entity;
+}
+
+/**
+ * bfq_extract - remove an entity from a tree.
+ * @root: the tree root.
+ * @entity: the entity to remove.
+ */
+static inline void bfq_extract(struct rb_root *root, struct io_entity *entity)
+{
+ BUG_ON(entity->tree != root);
+
+ entity->tree = NULL;
+ rb_erase(&entity->rb_node, root);
+}
+
+/**
+ * bfq_idle_extract - extract an entity from the idle tree.
+ * @st: the service tree of the owning @entity.
+ * @entity: the entity being removed.
+ */
+static void bfq_idle_extract(struct io_service_tree *st,
+ struct io_entity *entity)
+{
+ struct rb_node *next;
+ struct io_queue *ioq = io_entity_to_ioq(entity);
+
+ BUG_ON(entity->tree != &st->idle);
+
+ if (entity == st->first_idle) {
+ next = rb_next(&entity->rb_node);
+ st->first_idle = bfq_entity_of(next);
+ }
+
+ if (entity == st->last_idle) {
+ next = rb_prev(&entity->rb_node);
+ st->last_idle = bfq_entity_of(next);
+ }
+
+ bfq_extract(&st->idle, entity);
+
+ /* Delete queue from idle list */
+ if (ioq)
+ list_del(&ioq->queue_list);
+}
+
+/**
+ * bfq_insert - generic tree insertion.
+ * @root: tree root.
+ * @entity: entity to insert.
+ *
+ * This is used for the idle and the active tree, since they are both
+ * ordered by finish time.
+ */
+static void bfq_insert(struct rb_root *root, struct io_entity *entity)
+{
+ struct io_entity *entry;
+ struct rb_node **node = &root->rb_node;
+ struct rb_node *parent = NULL;
+
+ BUG_ON(entity->tree != NULL);
+
+ while (*node != NULL) {
+ parent = *node;
+ entry = rb_entry(parent, struct io_entity, rb_node);
+
+ if (bfq_gt(entry->finish, entity->finish))
+ node = &parent->rb_left;
+ else
+ node = &parent->rb_right;
+ }
+
+ rb_link_node(&entity->rb_node, parent, node);
+ rb_insert_color(&entity->rb_node, root);
+
+ entity->tree = root;
+}
+
+/**
+ * bfq_update_min - update the min_start field of a entity.
+ * @entity: the entity to update.
+ * @node: one of its children.
+ *
+ * This function is called when @entity may store an invalid value for
+ * min_start due to updates to the active tree. The function assumes
+ * that the subtree rooted at @node (which may be its left or its right
+ * child) has a valid min_start value.
+ */
+static inline void bfq_update_min(struct io_entity *entity,
+ struct rb_node *node)
+{
+ struct io_entity *child;
+
+ if (node != NULL) {
+ child = rb_entry(node, struct io_entity, rb_node);
+ if (bfq_gt(entity->min_start, child->min_start))
+ entity->min_start = child->min_start;
+ }
+}
+
+/**
+ * bfq_update_active_node - recalculate min_start.
+ * @node: the node to update.
+ *
+ * @node may have changed position or one of its children may have moved,
+ * this function updates its min_start value. The left and right subtrees
+ * are assumed to hold a correct min_start value.
+ */
+static inline void bfq_update_active_node(struct rb_node *node)
+{
+ struct io_entity *entity = rb_entry(node, struct io_entity, rb_node);
+
+ entity->min_start = entity->start;
+ bfq_update_min(entity, node->rb_right);
+ bfq_update_min(entity, node->rb_left);
+}
+
+/**
+ * bfq_update_active_tree - update min_start for the whole active tree.
+ * @node: the starting node.
+ *
+ * @node must be the deepest modified node after an update. This function
+ * updates its min_start using the values held by its children, assuming
+ * that they did not change, and then updates all the nodes that may have
+ * changed in the path to the root. The only nodes that may have changed
+ * are the ones in the path or their siblings.
+ */
+static void bfq_update_active_tree(struct rb_node *node)
+{
+ struct rb_node *parent;
+
+up:
+ bfq_update_active_node(node);
+
+ parent = rb_parent(node);
+ if (parent == NULL)
+ return;
+
+ if (node == parent->rb_left && parent->rb_right != NULL)
+ bfq_update_active_node(parent->rb_right);
+ else if (parent->rb_left != NULL)
+ bfq_update_active_node(parent->rb_left);
+
+ node = parent;
+ goto up;
+}
+
+/**
+ * bfq_active_insert - insert an entity in the active tree of its group/device.
+ * @st: the service tree of the entity.
+ * @entity: the entity being inserted.
+ *
+ * The active tree is ordered by finish time, but an extra key is kept
+ * per each node, containing the minimum value for the start times of
+ * its children (and the node itself), so it's possible to search for
+ * the eligible node with the lowest finish time in logarithmic time.
+ */
+static void bfq_active_insert(struct io_service_tree *st,
+ struct io_entity *entity)
+{
+ struct rb_node *node = &entity->rb_node;
+
+ bfq_insert(&st->active, entity);
+
+ if (node->rb_left != NULL)
+ node = node->rb_left;
+ else if (node->rb_right != NULL)
+ node = node->rb_right;
+
+ bfq_update_active_tree(node);
+}
+
+/**
+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
+ * @ioprio: the ioprio value to convert.
+ */
+static bfq_weight_t bfq_ioprio_to_weight(int ioprio)
+{
+ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
+ return IOPRIO_BE_NR - ioprio;
+}
+
+void bfq_get_entity(struct io_entity *entity)
+{
+ struct io_queue *ioq = io_entity_to_ioq(entity);
+
+ if (ioq)
+ elv_get_ioq(ioq);
+}
+
+void bfq_init_entity(struct io_entity *entity, struct io_group *iog)
+{
+ entity->ioprio = entity->new_ioprio;
+ entity->ioprio_class = entity->new_ioprio_class;
+ entity->sched_data = &iog->sched_data;
+}
+
+/**
+ * bfq_find_deepest - find the deepest node that an extraction can modify.
+ * @node: the node being removed.
+ *
+ * Do the first step of an extraction in an rb tree, looking for the
+ * node that will replace @node, and returning the deepest node that
+ * the following modifications to the tree can touch. If @node is the
+ * last node in the tree return %NULL.
+ */
+static struct rb_node *bfq_find_deepest(struct rb_node *node)
+{
+ struct rb_node *deepest;
+
+ if (node->rb_right == NULL && node->rb_left == NULL)
+ deepest = rb_parent(node);
+ else if (node->rb_right == NULL)
+ deepest = node->rb_left;
+ else if (node->rb_left == NULL)
+ deepest = node->rb_right;
+ else {
+ deepest = rb_next(node);
+ if (deepest->rb_right != NULL)
+ deepest = deepest->rb_right;
+ else if (rb_parent(deepest) != node)
+ deepest = rb_parent(deepest);
+ }
+
+ return deepest;
+}
+
+/**
+ * bfq_active_extract - remove an entity from the active tree.
+ * @st: the service_tree containing the tree.
+ * @entity: the entity being removed.
+ */
+static void bfq_active_extract(struct io_service_tree *st,
+ struct io_entity *entity)
+{
+ struct rb_node *node;
+
+ node = bfq_find_deepest(&entity->rb_node);
+ bfq_extract(&st->active, entity);
+
+ if (node != NULL)
+ bfq_update_active_tree(node);
+}
+
+/**
+ * bfq_idle_insert - insert an entity into the idle tree.
+ * @st: the service tree containing the tree.
+ * @entity: the entity to insert.
+ */
+static void bfq_idle_insert(struct io_service_tree *st,
+ struct io_entity *entity)
+{
+ struct io_entity *first_idle = st->first_idle;
+ struct io_entity *last_idle = st->last_idle;
+ struct io_queue *ioq = io_entity_to_ioq(entity);
+
+ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish))
+ st->first_idle = entity;
+ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish))
+ st->last_idle = entity;
+
+ bfq_insert(&st->idle, entity);
+
+ /* Add this queue to idle list */
+ if (ioq)
+ list_add(&ioq->queue_list, &ioq->efqd->idle_list);
+}
+
+/**
+ * bfq_forget_entity - remove an entity from the wfq trees.
+ * @st: the service tree.
+ * @entity: the entity being removed.
+ *
+ * Update the device status and forget everything about @entity, putting
+ * the device reference to it, if it is a queue. Entities belonging to
+ * groups are not refcounted.
+ */
+static void bfq_forget_entity(struct io_service_tree *st,
+ struct io_entity *entity)
+{
+ struct io_queue *ioq = NULL;
+
+ BUG_ON(!entity->on_st);
+ entity->on_st = 0;
+ st->wsum -= entity->weight;
+ ioq = io_entity_to_ioq(entity);
+ if (!ioq)
+ return;
+ elv_put_ioq(ioq);
+}
+
+/**
+ * bfq_put_idle_entity - release the idle tree ref of an entity.
+ * @st: service tree for the entity.
+ * @entity: the entity being released.
+ */
+void bfq_put_idle_entity(struct io_service_tree *st,
+ struct io_entity *entity)
+{
+ bfq_idle_extract(st, entity);
+ bfq_forget_entity(st, entity);
+}
+
+/**
+ * bfq_forget_idle - update the idle tree if necessary.
+ * @st: the service tree to act upon.
+ *
+ * To preserve the global O(log N) complexity we only remove one entry here;
+ * as the idle tree will not grow indefinitely this can be done safely.
+ */
+void bfq_forget_idle(struct io_service_tree *st)
+{
+ struct io_entity *first_idle = st->first_idle;
+ struct io_entity *last_idle = st->last_idle;
+
+ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL &&
+ !bfq_gt(last_idle->finish, st->vtime)) {
+ /*
+ * Active tree is empty. Pull back vtime to finish time of
+ * last idle entity on idle tree.
+ * Rational seems to be that it reduces the possibility of
+ * vtime wraparound (bfq_gt(V-F) < 0).
+ */
+ st->vtime = last_idle->finish;
+ }
+
+ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime))
+ bfq_put_idle_entity(st, first_idle);
+}
+
+
+static struct io_service_tree *
+__bfq_entity_update_prio(struct io_service_tree *old_st,
+ struct io_entity *entity)
+{
+ struct io_service_tree *new_st = old_st;
+ struct io_queue *ioq = io_entity_to_ioq(entity);
+
+ if (entity->ioprio_changed) {
+ entity->ioprio = entity->new_ioprio;
+ entity->ioprio_class = entity->new_ioprio_class;
+ entity->ioprio_changed = 0;
+
+ /*
+ * Also update the scaled budget for ioq. Group will get the
+ * updated budget once ioq is selected to run next.
+ */
+ if (ioq) {
+ struct elv_fq_data *efqd = ioq->efqd;
+ entity->budget = elv_prio_to_slice(efqd, ioq);
+ }
+
+ old_st->wsum -= entity->weight;
+ entity->weight = bfq_ioprio_to_weight(entity->ioprio);
+
+ /*
+ * NOTE: here we may be changing the weight too early,
+ * this will cause unfairness. The correct approach
+ * would have required additional complexity to defer
+ * weight changes to the proper time instants (i.e.,
+ * when entity->finish <= old_st->vtime).
+ */
+ new_st = io_entity_service_tree(entity);
+ new_st->wsum += entity->weight;
+
+ if (new_st != old_st)
+ entity->start = new_st->vtime;
+ }
+
+ return new_st;
+}
+
+/**
+ * __bfq_activate_entity - activate an entity.
+ * @entity: the entity being activated.
+ *
+ * Called whenever an entity is activated, i.e., it is not active and one
+ * of its children receives a new request, or has to be reactivated due to
+ * budget exhaustion. It uses the current budget of the entity (and the
+ * service received if @entity is active) of the queue to calculate its
+ * timestamps.
+ */
+static void __bfq_activate_entity(struct io_entity *entity, int add_front)
+{
+ struct io_sched_data *sd = entity->sched_data;
+ struct io_service_tree *st = io_entity_service_tree(entity);
+
+ if (entity == sd->active_entity) {
+ BUG_ON(entity->tree != NULL);
+ /*
+ * If we are requeueing the current entity we have
+ * to take care of not charging to it service it has
+ * not received.
+ */
+ bfq_calc_finish(entity, entity->service);
+ entity->start = entity->finish;
+ sd->active_entity = NULL;
+ } else if (entity->tree == &st->active) {
+ /*
+ * Requeueing an entity due to a change of some
+ * next_active entity below it. We reuse the old
+ * start time.
+ */
+ bfq_active_extract(st, entity);
+ } else if (entity->tree == &st->idle) {
+ /*
+ * Must be on the idle tree, bfq_idle_extract() will
+ * check for that.
+ */
+ bfq_idle_extract(st, entity);
+ entity->start = bfq_gt(st->vtime, entity->finish) ?
+ st->vtime : entity->finish;
+ } else {
+ /*
+ * The finish time of the entity may be invalid, and
+ * it is in the past for sure, otherwise the queue
+ * would have been on the idle tree.
+ */
+ entity->start = st->vtime;
+ st->wsum += entity->weight;
+ bfq_get_entity(entity);
+
+ BUG_ON(entity->on_st);
+ entity->on_st = 1;
+ }
+
+ st = __bfq_entity_update_prio(st, entity);
+ /*
+ * This is to emulate cfq like functionality where preemption can
+ * happen with-in same class, like sync queue preempting async queue
+ * May be this is not a very good idea from fairness point of view
+ * as preempting queue gains share. Keeping it for now.
+ */
+ if (add_front) {
+ struct io_entity *next_entity;
+
+ /*
+ * Determine the entity which will be dispatched next
+ * Use sd->next_active once hierarchical patch is applied
+ */
+ next_entity = bfq_lookup_next_entity(sd, 0);
+
+ if (next_entity && next_entity != entity) {
+ struct io_service_tree *new_st;
+ bfq_timestamp_t delta;
+
+ new_st = io_entity_service_tree(next_entity);
+
+ /*
+ * At this point, both entities should belong to
+ * same service tree as cross service tree preemption
+ * is automatically taken care by algorithm
+ */
+ BUG_ON(new_st != st);
+ entity->finish = next_entity->finish - 1;
+ delta = bfq_delta(entity->budget, entity->weight);
+ entity->start = entity->finish - delta;
+ if (bfq_gt(entity->start, st->vtime))
+ entity->start = st->vtime;
+ }
+ } else {
+ bfq_calc_finish(entity, entity->budget);
+ }
+ bfq_active_insert(st, entity);
+}
+
+/**
+ * bfq_activate_entity - activate an entity.
+ * @entity: the entity to activate.
+ */
+void bfq_activate_entity(struct io_entity *entity, int add_front)
+{
+ __bfq_activate_entity(entity, add_front);
+}
+
+/**
+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
+ * @entity: the entity to deactivate.
+ * @requeue: if false, the entity will not be put into the idle tree.
+ *
+ * Deactivate an entity, independently from its previous state. If the
+ * entity was not on a service tree just return, otherwise if it is on
+ * any scheduler tree, extract it from that tree, and if necessary
+ * and if the caller did not specify @requeue, put it on the idle tree.
+ *
+ */
+int __bfq_deactivate_entity(struct io_entity *entity, int requeue)
+{
+ struct io_sched_data *sd = entity->sched_data;
+ struct io_service_tree *st = io_entity_service_tree(entity);
+ int was_active = entity == sd->active_entity;
+ int ret = 0;
+
+ if (!entity->on_st)
+ return 0;
+
+ BUG_ON(was_active && entity->tree != NULL);
+
+ if (was_active) {
+ bfq_calc_finish(entity, entity->service);
+ sd->active_entity = NULL;
+ } else if (entity->tree == &st->active)
+ bfq_active_extract(st, entity);
+ else if (entity->tree == &st->idle)
+ bfq_idle_extract(st, entity);
+ else if (entity->tree != NULL)
+ BUG();
+
+ if (!requeue || !bfq_gt(entity->finish, st->vtime))
+ bfq_forget_entity(st, entity);
+ else
+ bfq_idle_insert(st, entity);
+
+ BUG_ON(sd->active_entity == entity);
+
+ return ret;
+}
+
+/**
+ * bfq_deactivate_entity - deactivate an entity.
+ * @entity: the entity to deactivate.
+ * @requeue: true if the entity can be put on the idle tree
+ */
+void bfq_deactivate_entity(struct io_entity *entity, int requeue)
+{
+ __bfq_deactivate_entity(entity, requeue);
+}
+
+/**
+ * bfq_update_vtime - update vtime if necessary.
+ * @st: the service tree to act upon.
+ *
+ * If necessary update the service tree vtime to have at least one
+ * eligible entity, skipping to its start time. Assumes that the
+ * active tree of the device is not empty.
+ *
+ * NOTE: this hierarchical implementation updates vtimes quite often,
+ * we may end up with reactivated tasks getting timestamps after a
+ * vtime skip done because we needed a ->first_active entity on some
+ * intermediate node.
+ */
+static void bfq_update_vtime(struct io_service_tree *st)
+{
+ struct io_entity *entry;
+ struct rb_node *node = st->active.rb_node;
+
+ entry = rb_entry(node, struct io_entity, rb_node);
+ if (bfq_gt(entry->min_start, st->vtime)) {
+ st->vtime = entry->min_start;
+ bfq_forget_idle(st);
+ }
+}
+
+/**
+ * bfq_first_active - find the eligible entity with the smallest finish time
+ * @st: the service tree to select from.
+ *
+ * This function searches the first schedulable entity, starting from the
+ * root of the tree and going on the left every time on this side there is
+ * a subtree with at least one eligible (start <= vtime) entity. The path
+ * on the right is followed only if a) the left subtree contains no eligible
+ * entities and b) no eligible entity has been found yet.
+ */
+static struct io_entity *bfq_first_active_entity(struct io_service_tree *st)
+{
+ struct io_entity *entry, *first = NULL;
+ struct rb_node *node = st->active.rb_node;
+
+ while (node != NULL) {
+ entry = rb_entry(node, struct io_entity, rb_node);
+left:
+ if (!bfq_gt(entry->start, st->vtime))
+ first = entry;
+
+ BUG_ON(bfq_gt(entry->min_start, st->vtime));
+
+ if (node->rb_left != NULL) {
+ entry = rb_entry(node->rb_left,
+ struct io_entity, rb_node);
+ if (!bfq_gt(entry->min_start, st->vtime)) {
+ node = node->rb_left;
+ goto left;
+ }
+ }
+ if (first != NULL)
+ break;
+ node = node->rb_right;
+ }
+
+ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active));
+ return first;
+}
+
+/**
+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
+ * @st: the service tree.
+ *
+ * Update the virtual time in @st and return the first eligible entity
+ * it contains.
+ */
+static struct io_entity *__bfq_lookup_next_entity(struct io_service_tree *st)
+{
+ struct io_entity *entity;
+
+ if (RB_EMPTY_ROOT(&st->active))
+ return NULL;
+
+ bfq_update_vtime(st);
+ entity = bfq_first_active_entity(st);
+ BUG_ON(bfq_gt(entity->start, st->vtime));
+
+ return entity;
+}
+
+/**
+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
+ * @sd: the sched_data.
+ * @extract: if true the returned entity will be also extracted from @sd.
+ *
+ * NOTE: since we cache the next_active entity at each level of the
+ * hierarchy, the complexity of the lookup can be decreased with
+ * absolutely no effort just returning the cached next_active value;
+ * we prefer to do full lookups to test the consistency of * the data
+ * structures.
+ */
+struct io_entity *bfq_lookup_next_entity(struct io_sched_data *sd,
+ int extract)
+{
+ struct io_service_tree *st = sd->service_tree;
+ struct io_entity *entity;
+ int i;
+
+ /*
+ * One can check for which will be next selected entity without
+ * expiring the current one.
+ */
+ BUG_ON(extract && sd->active_entity != NULL);
+
+ for (i = 0; i < IO_IOPRIO_CLASSES; i++, st++) {
+ entity = __bfq_lookup_next_entity(st);
+ if (entity != NULL) {
+ if (extract) {
+ bfq_active_extract(st, entity);
+ sd->active_entity = entity;
+ }
+ break;
+ }
+ }
+
+ return entity;
+}
+
+void entity_served(struct io_entity *entity, bfq_service_t served)
+{
+ struct io_service_tree *st;
+
+ st = io_entity_service_tree(entity);
+ entity->service += served;
+ BUG_ON(st->wsum == 0);
+ st->vtime += bfq_delta(served, st->wsum);
+ bfq_forget_idle(st);
+}
+
+/* Elevator fair queuing function */
+struct io_queue *rq_ioq(struct request *rq)
+{
+ return rq->ioq;
+}
+
+static inline struct io_queue *elv_active_ioq(struct elevator_queue *e)
+{
+ return e->efqd.active_queue;
+}
+
+void *elv_active_sched_queue(struct elevator_queue *e)
+{
+ return ioq_sched_queue(elv_active_ioq(e));
+}
+EXPORT_SYMBOL(elv_active_sched_queue);
+
+int elv_nr_busy_ioq(struct elevator_queue *e)
+{
+ return e->efqd.busy_queues;
+}
+EXPORT_SYMBOL(elv_nr_busy_ioq);
+
+int elv_nr_busy_rt_ioq(struct elevator_queue *e)
+{
+ return e->efqd.busy_rt_queues;
+}
+EXPORT_SYMBOL(elv_nr_busy_rt_ioq);
+
+int elv_hw_tag(struct elevator_queue *e)
+{
+ return e->efqd.hw_tag;
+}
+EXPORT_SYMBOL(elv_hw_tag);
+
+/* Helper functions for operating on elevator idle slice timer */
+int elv_mod_idle_slice_timer(struct elevator_queue *eq, unsigned long expires)
+{
+ struct elv_fq_data *efqd = &eq->efqd;
+
+ return mod_timer(&efqd->idle_slice_timer, expires);
+}
+EXPORT_SYMBOL(elv_mod_idle_slice_timer);
+
+int elv_del_idle_slice_timer(struct elevator_queue *eq)
+{
+ struct elv_fq_data *efqd = &eq->efqd;
+
+ return del_timer(&efqd->idle_slice_timer);
+}
+EXPORT_SYMBOL(elv_del_idle_slice_timer);
+
+unsigned int elv_get_slice_idle(struct elevator_queue *eq)
+{
+ return eq->efqd.elv_slice_idle;
+}
+EXPORT_SYMBOL(elv_get_slice_idle);
+
+void elv_ioq_served(struct io_queue *ioq, bfq_service_t served)
+{
+ entity_served(&ioq->entity, served);
+}
+
+/* Tells whether ioq is queued in root group or not */
+static inline int is_root_group_ioq(struct request_queue *q,
+ struct io_queue *ioq)
+{
+ struct elv_fq_data *efqd = &q->elevator->efqd;
+
+ return (ioq->entity.sched_data == &efqd->root_group->sched_data);
+}
+
+/* Functions to show and store elv_idle_slice value through sysfs */
+ssize_t elv_slice_idle_show(struct request_queue *q, char *name)
+{
+ struct elv_fq_data *efqd;
+ unsigned int data;
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ efqd = &q->elevator->efqd;
+ data = jiffies_to_msecs(efqd->elv_slice_idle);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ return sprintf(name, "%d\n", data);
+}
+
+ssize_t elv_slice_idle_store(struct request_queue *q, const char *name,
+ size_t count)
+{
+ struct elv_fq_data *efqd;
+ unsigned int data;
+ unsigned long flags;
+
+ char *p = (char *)name;
+
+ data = simple_strtoul(p, &p, 10);
+
+ if (data < 0)
+ data = 0;
+ else if (data > INT_MAX)
+ data = INT_MAX;
+
+ data = msecs_to_jiffies(data);
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ efqd = &q->elevator->efqd;
+ efqd->elv_slice_idle = data;
+ spin_unlock_irqrestore(q->queue_lock, flags);
+
+ return count;
+}
+
+/* Functions to show and store elv_slice_sync value through sysfs */
+ssize_t elv_slice_sync_show(struct request_queue *q, char *name)
+{
+ struct elv_fq_data *efqd;
+ unsigned int data;
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ efqd = &q->elevator->efqd;
+ data = efqd->elv_slice[1];
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ return sprintf(name, "%d\n", data);
+}
+
+ssize_t elv_slice_sync_store(struct request_queue *q, const char *name,
+ size_t count)
+{
+ struct elv_fq_data *efqd;
+ unsigned int data;
+ unsigned long flags;
+
+ char *p = (char *)name;
+
+ data = simple_strtoul(p, &p, 10);
+
+ if (data < 0)
+ data = 0;
+ /* 100ms is the limit for now*/
+ else if (data > 100)
+ data = 100;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ efqd = &q->elevator->efqd;
+ efqd->elv_slice[1] = data;
+ spin_unlock_irqrestore(q->queue_lock, flags);
+
+ return count;
+}
+
+/* Functions to show and store elv_slice_async value through sysfs */
+ssize_t elv_slice_async_show(struct request_queue *q, char *name)
+{
+ struct elv_fq_data *efqd;
+ unsigned int data;
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ efqd = &q->elevator->efqd;
+ data = efqd->elv_slice[0];
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ return sprintf(name, "%d\n", data);
+}
+
+ssize_t elv_slice_async_store(struct request_queue *q, const char *name,
+ size_t count)
+{
+ struct elv_fq_data *efqd;
+ unsigned int data;
+ unsigned long flags;
+
+ char *p = (char *)name;
+
+ data = simple_strtoul(p, &p, 10);
+
+ if (data < 0)
+ data = 0;
+ /* 100ms is the limit for now*/
+ else if (data > 100)
+ data = 100;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ efqd = &q->elevator->efqd;
+ efqd->elv_slice[0] = data;
+ spin_unlock_irqrestore(q->queue_lock, flags);
+
+ return count;
+}
+
+void elv_schedule_dispatch(struct request_queue *q)
+{
+ struct elv_fq_data *efqd = &q->elevator->efqd;
+
+ if (elv_nr_busy_ioq(q->elevator)) {
+ elv_log(efqd, "schedule dispatch");
+ kblockd_schedule_work(efqd->queue, &efqd->unplug_work);
+ }
+}
+EXPORT_SYMBOL(elv_schedule_dispatch);
+
+void elv_kick_queue(struct work_struct *work)
+{
+ struct elv_fq_data *efqd =
+ container_of(work, struct elv_fq_data, unplug_work);
+ struct request_queue *q = efqd->queue;
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ blk_start_queueing(q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+void elv_shutdown_timer_wq(struct elevator_queue *e)
+{
+ del_timer_sync(&e->efqd.idle_slice_timer);
+ cancel_work_sync(&e->efqd.unplug_work);
+}
+EXPORT_SYMBOL(elv_shutdown_timer_wq);
+
+void elv_ioq_set_prio_slice(struct request_queue *q, struct io_queue *ioq)
+{
+ struct elv_fq_data *efqd = &q->elevator->efqd;
+
+ ioq->slice_end = jiffies + ioq->entity.budget;
+ elv_log_ioq(efqd, ioq, "set_slice=%lu", ioq->entity.budget);
+}
+
+static void elv_ioq_update_io_thinktime(struct io_queue *ioq)
+{
+ struct elv_fq_data *efqd = ioq->efqd;
+ unsigned long elapsed = jiffies - ioq->last_end_request;
+ unsigned long ttime = min(elapsed, 2UL * efqd->elv_slice_idle);
+
+ ioq->ttime_samples = (7*ioq->ttime_samples + 256) / 8;
+ ioq->ttime_total = (7*ioq->ttime_total + 256*ttime) / 8;
+ ioq->ttime_mean = (ioq->ttime_total + 128) / ioq->ttime_samples;
+}
+
+/*
+ * Disable idle window if the process thinks too long.
+ * This idle flag can also be updated by io scheduler.
+ */
+static void elv_ioq_update_idle_window(struct elevator_queue *eq,
+ struct io_queue *ioq, struct request *rq)
+{
+ int old_idle, enable_idle;
+ struct elv_fq_data *efqd = ioq->efqd;
+
+ /*
+ * Don't idle for async or idle io prio class
+ */
+ if (!elv_ioq_sync(ioq) || elv_ioq_class_idle(ioq))
+ return;
+
+ enable_idle = old_idle = elv_ioq_idle_window(ioq);
+
+ if (!efqd->elv_slice_idle)
+ enable_idle = 0;
+ else if (ioq_sample_valid(ioq->ttime_samples)) {
+ if (ioq->ttime_mean > efqd->elv_slice_idle)
+ enable_idle = 0;
+ else
+ enable_idle = 1;
+ }
+
+ /*
+ * From think time perspective idle should be enabled. Check with
+ * io scheduler if it wants to disable idling based on additional
+ * considrations like seek pattern.
+ */
+ if (enable_idle) {
+ if (eq->ops->elevator_update_idle_window_fn)
+ enable_idle = eq->ops->elevator_update_idle_window_fn(
+ eq, ioq->sched_queue, rq);
+ if (!enable_idle)
+ elv_log_ioq(efqd, ioq, "iosched disabled idle");
+ }
+
+ if (old_idle != enable_idle) {
+ elv_log_ioq(efqd, ioq, "idle=%d", enable_idle);
+ if (enable_idle)
+ elv_mark_ioq_idle_window(ioq);
+ else
+ elv_clear_ioq_idle_window(ioq);
+ }
+}
+
+struct io_queue *elv_alloc_ioq(struct request_queue *q, gfp_t gfp_mask)
+{
+ struct io_queue *ioq = NULL;
+
+ ioq = kmem_cache_alloc_node(elv_ioq_pool, gfp_mask, q->node);
+ return ioq;
+}
+EXPORT_SYMBOL(elv_alloc_ioq);
+
+void elv_free_ioq(struct io_queue *ioq)
+{
+ kmem_cache_free(elv_ioq_pool, ioq);
+}
+EXPORT_SYMBOL(elv_free_ioq);
+
+int elv_init_ioq(struct elevator_queue *eq, struct io_queue *ioq,
+ void *sched_queue, int ioprio_class, int ioprio,
+ int is_sync)
+{
+ struct elv_fq_data *efqd = &eq->efqd;
+ struct io_group *iog = io_lookup_io_group_current(efqd->queue);
+
+ RB_CLEAR_NODE(&ioq->entity.rb_node);
+ atomic_set(&ioq->ref, 0);
+ ioq->efqd = efqd;
+ elv_ioq_set_ioprio_class(ioq, ioprio_class);
+ elv_ioq_set_ioprio(ioq, ioprio);
+ ioq->pid = current->pid;
+ ioq->sched_queue = sched_queue;
+ if (is_sync && !elv_ioq_class_idle(ioq))
+ elv_mark_ioq_idle_window(ioq);
+ bfq_init_entity(&ioq->entity, iog);
+ ioq->entity.budget = elv_prio_to_slice(efqd, ioq);
+ return 0;
+}
+EXPORT_SYMBOL(elv_init_ioq);
+
+void elv_put_ioq(struct io_queue *ioq)
+{
+ struct elv_fq_data *efqd = ioq->efqd;
+ struct elevator_queue *e = container_of(efqd, struct elevator_queue,
+ efqd);
+
+ BUG_ON(atomic_read(&ioq->ref) <= 0);
+ if (!atomic_dec_and_test(&ioq->ref))
+ return;
+ BUG_ON(ioq->nr_queued);
+ BUG_ON(ioq->entity.tree != NULL);
+ BUG_ON(elv_ioq_busy(ioq));
+ BUG_ON(efqd->active_queue == ioq);
+
+ /* Can be called by outgoing elevator. Don't use q */
+ BUG_ON(!e->ops->elevator_free_sched_queue_fn);
+
+ e->ops->elevator_free_sched_queue_fn(e, ioq->sched_queue);
+ elv_log_ioq(efqd, ioq, "put_queue");
+ elv_free_ioq(ioq);
+}
+EXPORT_SYMBOL(elv_put_ioq);
+
+void elv_release_ioq(struct elevator_queue *e, struct io_queue **ioq_ptr)
+{
+ struct io_queue *ioq = *ioq_ptr;
+
+ if (ioq != NULL) {
+ /* Drop the reference taken by the io group */
+ elv_put_ioq(ioq);
+ *ioq_ptr = NULL;
+ }
+}
+
+/*
+ * Normally next io queue to be served is selected from the service tree.
+ * This function allows one to choose a specific io queue to run next
+ * out of order. This is primarily to accomodate the close_cooperator
+ * feature of cfq.
+ *
+ * Currently it is done only for root level as to begin with supporting
+ * close cooperator feature only for root group to make sure default
+ * cfq behavior in flat hierarchy is not changed.
+ */
+void elv_set_next_ioq(struct request_queue *q, struct io_queue *ioq)
+{
+ struct elv_fq_data *efqd = &q->elevator->efqd;
+ struct io_entity *entity = &ioq->entity;
+ struct io_sched_data *sd = &efqd->root_group->sched_data;
+ struct io_service_tree *st = io_entity_service_tree(entity);
+
+ BUG_ON(efqd->active_queue != NULL || sd->active_entity != NULL);
+ BUG_ON(!efqd->busy_queues);
+ BUG_ON(sd != entity->sched_data);
+ BUG_ON(!st);
+
+ bfq_update_vtime(st);
+ bfq_active_extract(st, entity);
+ sd->active_entity = entity;
+ entity->service = 0;
+ elv_log_ioq(efqd, ioq, "set_next_ioq");
+}
+
+/* Get next queue for service. */
+struct io_queue *elv_get_next_ioq(struct request_queue *q, int extract)
+{
+ struct elv_fq_data *efqd = &q->elevator->efqd;
+ struct io_entity *entity = NULL;
+ struct io_queue *ioq = NULL;
+ struct io_sched_data *sd;
+
+ /*
+ * one can check for which queue will be selected next while having
+ * one queue active. preempt logic uses it.
+ */
+ BUG_ON(extract && efqd->active_queue != NULL);
+
+ if (!efqd->busy_queues)
+ return NULL;
+
+ sd = &efqd->root_group->sched_data;
+ if (extract)
+ entity = bfq_lookup_next_entity(sd, 1);
+ else
+ entity = bfq_lookup_next_entity(sd, 0);
+
+ BUG_ON(!entity);
+ if (extract)
+ entity->service = 0;
+ ioq = io_entity_to_ioq(entity);
+
+ return ioq;
+}
+
+/*
+ * coop tells that io scheduler selected a queue for us and we did not
+ * select the next queue based on fairness.
+ */
+static void __elv_set_active_ioq(struct elv_fq_data *efqd, struct io_queue *ioq,
+ int coop)
+{
+ struct request_queue *q = efqd->queue;
+
+ if (ioq) {
+ elv_log_ioq(efqd, ioq, "set_active, busy=%d",
+ efqd->busy_queues);
+ ioq->slice_end = 0;
+
+ elv_clear_ioq_wait_request(ioq);
+ elv_clear_ioq_must_dispatch(ioq);
+ elv_mark_ioq_slice_new(ioq);
+
+ del_timer(&efqd->idle_slice_timer);
+ }
+
+ efqd->active_queue = ioq;
+
+ /* Let iosched know if it wants to take some action */
+ if (ioq) {
+ if (q->elevator->ops->elevator_active_ioq_set_fn)
+ q->elevator->ops->elevator_active_ioq_set_fn(q,
+ ioq->sched_queue, coop);
+ }
+}
+
+/* Get and set a new active queue for service. */
+struct io_queue *elv_set_active_ioq(struct request_queue *q,
+ struct io_queue *ioq)
+{
+ struct elv_fq_data *efqd = &q->elevator->efqd;
+ int coop = 0;
+
+ if (!ioq)
+ ioq = elv_get_next_ioq(q, 1);
+ else {
+ elv_set_next_ioq(q, ioq);
+ /*
+ * io scheduler selected the next queue for us. Pass this
+ * this info back to io scheudler. cfq currently uses it
+ * to reset coop flag on the queue.
+ */
+ coop = 1;
+ }
+ __elv_set_active_ioq(efqd, ioq, coop);
+ return ioq;
+}
+
+void elv_reset_active_ioq(struct elv_fq_data *efqd)
+{
+ struct request_queue *q = efqd->queue;
+ struct io_queue *ioq = elv_active_ioq(efqd->queue->elevator);
+
+ if (q->elevator->ops->elevator_active_ioq_reset_fn)
+ q->elevator->ops->elevator_active_ioq_reset_fn(q,
+ ioq->sched_queue);
+ efqd->active_queue = NULL;
+ del_timer(&efqd->idle_slice_timer);
+}
+
+void elv_activate_ioq(struct io_queue *ioq, int add_front)
+{
+ bfq_activate_entity(&ioq->entity, add_front);
+}
+
+void elv_deactivate_ioq(struct elv_fq_data *efqd, struct io_queue *ioq,
+ int requeue)
+{
+ if (ioq == efqd->active_queue)
+ elv_reset_active_ioq(efqd);
+
+ bfq_deactivate_entity(&ioq->entity, requeue);
+}
+
+/* Called when an inactive queue receives a new request. */
+void elv_add_ioq_busy(struct elv_fq_data *efqd, struct io_queue *ioq)
+{
+ BUG_ON(elv_ioq_busy(ioq));
+ BUG_ON(ioq == efqd->active_queue);
+ elv_log_ioq(efqd, ioq, "add to busy");
+ elv_activate_ioq(ioq, 0);
+ elv_mark_ioq_busy(ioq);
+ efqd->busy_queues++;
+ if (elv_ioq_class_rt(ioq))
+ efqd->busy_rt_queues++;
+}
+
+void elv_del_ioq_busy(struct elevator_queue *e, struct io_queue *ioq,
+ int requeue)
+{
+ struct elv_fq_data *efqd = &e->efqd;
+
+ BUG_ON(!elv_ioq_busy(ioq));
+ BUG_ON(ioq->nr_queued);
+ elv_log_ioq(efqd, ioq, "del from busy");
+ elv_clear_ioq_busy(ioq);
+ BUG_ON(efqd->busy_queues == 0);
+ efqd->busy_queues--;
+ if (elv_ioq_class_rt(ioq))
+ efqd->busy_rt_queues--;
+
+ elv_deactivate_ioq(efqd, ioq, requeue);
+}
+
+/*
+ * Do the accounting. Determine how much service (in terms of time slices)
+ * current queue used and adjust the start, finish time of queue and vtime
+ * of the tree accordingly.
+ *
+ * Determining the service used in terms of time is tricky in certain
+ * situations. Especially when underlying device supports command queuing
+ * and requests from multiple queues can be there at same time, then it
+ * is not clear which queue consumed how much of disk time.
+ *
+ * To mitigate this problem, cfq starts the time slice of the queue only
+ * after first request from the queue has completed. This does not work
+ * very well if we expire the queue before we wait for first and more
+ * request to finish from the queue. For seeky queues, we will expire the
+ * queue after dispatching few requests without waiting and start dispatching
+ * from next queue.
+ *
+ * Not sure how to determine the time consumed by queue in such scenarios.
+ * Currently as a crude approximation, we are charging 25% of time slice
+ * for such cases. A better mechanism is needed for accurate accounting.
+ */
+void __elv_ioq_slice_expired(struct request_queue *q, struct io_queue *ioq)
+{
+ struct elv_fq_data *efqd = &q->elevator->efqd;
+ struct io_entity *entity = &ioq->entity;
+ long slice_unused = 0, slice_used = 0, slice_overshoot = 0;
+
+ assert_spin_locked(q->queue_lock);
+ elv_log_ioq(efqd, ioq, "slice expired");
+
+ if (elv_ioq_wait_request(ioq))
+ del_timer(&efqd->idle_slice_timer);
+
+ elv_clear_ioq_wait_request(ioq);
+
+ /*
+ * if ioq->slice_end = 0, that means a queue was expired before first
+ * reuqest from the queue got completed. Of course we are not planning
+ * to idle on the queue otherwise we would not have expired it.
+ *
+ * Charge for the 25% slice in such cases. This is not the best thing
+ * to do but at the same time not very sure what's the next best
+ * thing to do.
+ *
+ * This arises from that fact that we don't have the notion of
+ * one queue being operational at one time. io scheduler can dispatch
+ * requests from multiple queues in one dispatch round. Ideally for
+ * more accurate accounting of exact disk time used by disk, one
+ * should dispatch requests from only one queue and wait for all
+ * the requests to finish. But this will reduce throughput.
+ */
+ if (!ioq->slice_end)
+ slice_used = entity->budget/4;
+ else {
+ if (time_after(ioq->slice_end, jiffies)) {
+ slice_unused = ioq->slice_end - jiffies;
+ if (slice_unused == entity->budget) {
+ /*
+ * queue got expired immediately after
+ * completing first request. Charge 25% of
+ * slice.
+ */
+ slice_used = entity->budget/4;
+ } else
+ slice_used = entity->budget - slice_unused;
+ } else {
+ slice_overshoot = jiffies - ioq->slice_end;
+ slice_used = entity->budget + slice_overshoot;
+ }
+ }
+
+ elv_log_ioq(efqd, ioq, "sl_end=%lx, jiffies=%lx", ioq->slice_end,
+ jiffies);
+ elv_log_ioq(efqd, ioq, "sl_used=%ld, budget=%ld overshoot=%ld",
+ slice_used, entity->budget, slice_overshoot);
+ elv_ioq_served(ioq, slice_used);
+
+ BUG_ON(ioq != efqd->active_queue);
+ elv_reset_active_ioq(efqd);
+
+ if (!ioq->nr_queued)
+ elv_del_ioq_busy(q->elevator, ioq, 1);
+ else
+ elv_activate_ioq(ioq, 0);
+}
+EXPORT_SYMBOL(__elv_ioq_slice_expired);
+
+/*
+ * Expire the ioq.
+ */
+void elv_ioq_slice_expired(struct request_queue *q)
+{
+ struct io_queue *ioq = elv_active_ioq(q->elevator);
+
+ if (ioq)
+ __elv_ioq_slice_expired(q, ioq);
+}
+
+/*
+ * Check if new_cfqq should preempt the currently active queue. Return 0 for
+ * no or if we aren't sure, a 1 will cause a preemption attempt.
+ */
+int elv_should_preempt(struct request_queue *q, struct io_queue *new_ioq,
+ struct request *rq)
+{
+ struct io_queue *ioq;
+ struct elevator_queue *eq = q->elevator;
+
+ ioq = elv_active_ioq(eq);
+
+ if (!ioq)
+ return 0;
+
+ if (elv_ioq_slice_used(ioq))
+ return 1;
+
+ if (elv_ioq_class_idle(new_ioq))
+ return 0;
+
+ if (elv_ioq_class_idle(ioq))
+ return 1;
+
+ /*
+ * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
+ */
+ if (elv_ioq_class_rt(new_ioq) && !elv_ioq_class_rt(ioq))
+ return 1;
+
+ /*
+ * Check with io scheduler if it has additional criterion based on
+ * which it wants to preempt existing queue.
+ */
+ if (eq->ops->elevator_should_preempt_fn)
+ return eq->ops->elevator_should_preempt_fn(q, new_ioq, rq);
+
+ return 0;
+}
+
+static void elv_preempt_queue(struct request_queue *q, struct io_queue *ioq)
+{
+ elv_log_ioq(&q->elevator->efqd, ioq, "preempt");
+ elv_ioq_slice_expired(q);
+
+ /*
+ * Put the new queue at the front of the of the current list,
+ * so we know that it will be selected next.
+ */
+
+ elv_activate_ioq(ioq, 1);
+ elv_ioq_set_slice_end(ioq, 0);
+ elv_mark_ioq_slice_new(ioq);
+}
+
+void elv_ioq_request_add(struct request_queue *q, struct request *rq)
+{
+ struct elv_fq_data *efqd = &q->elevator->efqd;
+ struct io_queue *ioq = rq->ioq;
+
+ if (!elv_iosched_fair_queuing_enabled(q->elevator))
+ return;
+
+ BUG_ON(!efqd);
+ BUG_ON(!ioq);
+ efqd->rq_queued++;
+ ioq->nr_queued++;
+
+ if (!elv_ioq_busy(ioq))
+ elv_add_ioq_busy(efqd, ioq);
+
+ elv_ioq_update_io_thinktime(ioq);
+ elv_ioq_update_idle_window(q->elevator, ioq, rq);
+
+ if (ioq == elv_active_ioq(q->elevator)) {
+ /*
+ * Remember that we saw a request from this process, but
+ * don't start queuing just yet. Otherwise we risk seeing lots
+ * of tiny requests, because we disrupt the normal plugging
+ * and merging. If the request is already larger than a single
+ * page, let it rip immediately. For that case we assume that
+ * merging is already done. Ditto for a busy system that
+ * has other work pending, don't risk delaying until the
+ * idle timer unplug to continue working.
+ */
+ if (elv_ioq_wait_request(ioq)) {
+ if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
+ efqd->busy_queues > 1) {
+ del_timer(&efqd->idle_slice_timer);
+ blk_start_queueing(q);
+ }
+ elv_mark_ioq_must_dispatch(ioq);
+ }
+ } else if (elv_should_preempt(q, ioq, rq)) {
+ /*
+ * not the active queue - expire current slice if it is
+ * idle and has expired it's mean thinktime or this new queue
+ * has some old slice time left and is of higher priority or
+ * this new queue is RT and the current one is BE
+ */
+ elv_preempt_queue(q, ioq);
+ blk_start_queueing(q);
+ }
+}
+
+void elv_idle_slice_timer(unsigned long data)
+{
+ struct elv_fq_data *efqd = (struct elv_fq_data *)data;
+ struct io_queue *ioq;
+ unsigned long flags;
+ struct request_queue *q = efqd->queue;
+
+ elv_log(efqd, "idle timer fired");
+
+ spin_lock_irqsave(q->queue_lock, flags);
+
+ ioq = efqd->active_queue;
+
+ if (ioq) {
+
+ /*
+ * We saw a request before the queue expired, let it through
+ */
+ if (elv_ioq_must_dispatch(ioq))
+ goto out_kick;
+
+ /*
+ * expired
+ */
+ if (elv_ioq_slice_used(ioq))
+ goto expire;
+
+ /*
+ * only expire and reinvoke request handler, if there are
+ * other queues with pending requests
+ */
+ if (!elv_nr_busy_ioq(q->elevator))
+ goto out_cont;
+
+ /*
+ * not expired and it has a request pending, let it dispatch
+ */
+ if (ioq->nr_queued)
+ goto out_kick;
+ }
+expire:
+ elv_ioq_slice_expired(q);
+out_kick:
+ elv_schedule_dispatch(q);
+out_cont:
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+void elv_ioq_arm_slice_timer(struct request_queue *q)
+{
+ struct elv_fq_data *efqd = &q->elevator->efqd;
+ struct io_queue *ioq = elv_active_ioq(q->elevator);
+ unsigned long sl;
+
+ BUG_ON(!ioq);
+
+ /*
+ * SSD device without seek penalty, disable idling. But only do so
+ * for devices that support queuing, otherwise we still have a problem
+ * with sync vs async workloads.
+ */
+ if (blk_queue_nonrot(q) && efqd->hw_tag)
+ return;
+
+ /*
+ * still requests with the driver, don't idle
+ */
+ if (efqd->rq_in_driver)
+ return;
+
+ /*
+ * idle is disabled, either manually or by past process history
+ */
+ if (!efqd->elv_slice_idle || !elv_ioq_idle_window(ioq))
+ return;
+
+ /*
+ * may be iosched got its own idling logic. In that case io
+ * schduler will take care of arming the timer, if need be.
+ */
+ if (q->elevator->ops->elevator_arm_slice_timer_fn) {
+ q->elevator->ops->elevator_arm_slice_timer_fn(q,
+ ioq->sched_queue);
+ } else {
+ elv_mark_ioq_wait_request(ioq);
+ sl = efqd->elv_slice_idle;
+ mod_timer(&efqd->idle_slice_timer, jiffies + sl);
+ elv_log_ioq(efqd, ioq, "arm idle: %lu", sl);
+ }
+}
+
+void elv_free_idle_ioq_list(struct elevator_queue *e)
+{
+ struct io_queue *ioq, *n;
+ struct elv_fq_data *efqd = &e->efqd;
+
+ list_for_each_entry_safe(ioq, n, &efqd->idle_list, queue_list)
+ elv_deactivate_ioq(efqd, ioq, 0);
+}
+
+/* Common layer function to select the next queue to dispatch from */
+void *elv_fq_select_ioq(struct request_queue *q, int force)
+{
+ struct elv_fq_data *efqd = &q->elevator->efqd;
+ struct io_queue *new_ioq = NULL, *ioq = elv_active_ioq(q->elevator);
+
+ if (!elv_nr_busy_ioq(q->elevator))
+ return NULL;
+
+ if (ioq == NULL)
+ goto new_queue;
+
+ /*
+ * Force dispatch. Continue to dispatch from current queue as long
+ * as it has requests.
+ */
+ if (unlikely(force)) {
+ if (ioq->nr_queued)
+ goto keep_queue;
+ else
+ goto expire;
+ }
+
+ /*
+ * The active queue has run out of time, expire it and select new.
+ */
+ if (elv_ioq_slice_used(ioq) && !elv_ioq_must_dispatch(ioq))
+ goto expire;
+
+ /*
+ * If we have a RT cfqq waiting, then we pre-empt the current non-rt
+ * cfqq.
+ */
+ if (!elv_ioq_class_rt(ioq) && efqd->busy_rt_queues) {
+ /*
+ * We simulate this as cfqq timed out so that it gets to bank
+ * the remaining of its time slice.
+ */
+ elv_log_ioq(efqd, ioq, "preempt");
+ goto expire;
+ }
+
+ /*
+ * The active queue has requests and isn't expired, allow it to
+ * dispatch.
+ */
+
+ if (ioq->nr_queued)
+ goto keep_queue;
+
+ /*
+ * If another queue has a request waiting within our mean seek
+ * distance, let it run. The expire code will check for close
+ * cooperators and put the close queue at the front of the service
+ * tree.
+ */
+ new_ioq = elv_close_cooperator(q, ioq, 0);
+ if (new_ioq)
+ goto expire;
+
+ /*
+ * No requests pending. If the active queue still has requests in
+ * flight or is idling for a new request, allow either of these
+ * conditions to happen (or time out) before selecting a new queue.
+ */
+
+ if (timer_pending(&efqd->idle_slice_timer) ||
+ (elv_ioq_nr_dispatched(ioq) && elv_ioq_idle_window(ioq))) {
+ ioq = NULL;
+ goto keep_queue;
+ }
+
+expire:
+ elv_ioq_slice_expired(q);
+new_queue:
+ ioq = elv_set_active_ioq(q, new_ioq);
+keep_queue:
+ return ioq;
+}
+
+/* A request got removed from io_queue. Do the accounting */
+void elv_ioq_request_removed(struct elevator_queue *e, struct request *rq)
+{
+ struct io_queue *ioq;
+ struct elv_fq_data *efqd;
+
+ if (!elv_iosched_fair_queuing_enabled(e))
+ return;
+
+ ioq = rq->ioq;
+ BUG_ON(!ioq);
+ ioq->nr_queued--;
+
+ efqd = ioq->efqd;
+ BUG_ON(!efqd);
+ efqd->rq_queued--;
+
+ if (elv_ioq_busy(ioq) && (elv_active_ioq(e) != ioq) && !ioq->nr_queued)
+ elv_del_ioq_busy(e, ioq, 1);
+}
+
+/* A request got dispatched. Do the accounting. */
+void elv_fq_dispatched_request(struct elevator_queue *e, struct request *rq)
+{
+ struct io_queue *ioq = rq->ioq;
+
+ if (!elv_iosched_fair_queuing_enabled(e))
+ return;
+
+ BUG_ON(!ioq);
+ elv_ioq_request_dispatched(ioq);
+ elv_ioq_request_removed(e, rq);
+ elv_clear_ioq_must_dispatch(ioq);
+}
+
+void elv_fq_activate_rq(struct request_queue *q, struct request *rq)
+{
+ struct elv_fq_data *efqd = &q->elevator->efqd;
+
+ if (!elv_iosched_fair_queuing_enabled(q->elevator))
+ return;
+
+ efqd->rq_in_driver++;
+ elv_log_ioq(efqd, rq_ioq(rq), "activate rq, drv=%d",
+ efqd->rq_in_driver);
+}
+
+void elv_fq_deactivate_rq(struct request_queue *q, struct request *rq)
+{
+ struct elv_fq_data *efqd = &q->elevator->efqd;
+
+ if (!elv_iosched_fair_queuing_enabled(q->elevator))
+ return;
+
+ WARN_ON(!efqd->rq_in_driver);
+ efqd->rq_in_driver--;
+ elv_log_ioq(efqd, rq_ioq(rq), "deactivate rq, drv=%d",
+ efqd->rq_in_driver);
+}
+
+/*
+ * Update hw_tag based on peak queue depth over 50 samples under
+ * sufficient load.
+ */
+static void elv_update_hw_tag(struct elv_fq_data *efqd)
+{
+ if (efqd->rq_in_driver > efqd->rq_in_driver_peak)
+ efqd->rq_in_driver_peak = efqd->rq_in_driver;
+
+ if (efqd->rq_queued <= ELV_HW_QUEUE_MIN &&
+ efqd->rq_in_driver <= ELV_HW_QUEUE_MIN)
+ return;
+
+ if (efqd->hw_tag_samples++ < 50)
+ return;
+
+ if (efqd->rq_in_driver_peak >= ELV_HW_QUEUE_MIN)
+ efqd->hw_tag = 1;
+ else
+ efqd->hw_tag = 0;
+
+ efqd->hw_tag_samples = 0;
+ efqd->rq_in_driver_peak = 0;
+}
+
+/*
+ * If ioscheduler has functionality of keeping track of close cooperator, check
+ * with it if it has got a closely co-operating queue.
+ */
+static inline struct io_queue *elv_close_cooperator(struct request_queue *q,
+ struct io_queue *ioq, int probe)
+{
+ struct elevator_queue *e = q->elevator;
+ struct io_queue *new_ioq = NULL;
+
+ /*
+ * Currently this feature is supported only for flat hierarchy or
+ * root group queues so that default cfq behavior is not changed.
+ */
+ if (!is_root_group_ioq(q, ioq))
+ return NULL;
+
+ if (q->elevator->ops->elevator_close_cooperator_fn)
+ new_ioq = e->ops->elevator_close_cooperator_fn(q,
+ ioq->sched_queue, probe);
+
+ /* Only select co-operating queue if it belongs to root group */
+ if (new_ioq && !is_root_group_ioq(q, new_ioq))
+ return NULL;
+
+ return new_ioq;
+}
+
+/* A request got completed from io_queue. Do the accounting. */
+void elv_ioq_completed_request(struct request_queue *q, struct request *rq)
+{
+ const int sync = rq_is_sync(rq);
+ struct io_queue *ioq;
+ struct elv_fq_data *efqd = &q->elevator->efqd;
+
+ if (!elv_iosched_fair_queuing_enabled(q->elevator))
+ return;
+
+ ioq = rq->ioq;
+
+ elv_log_ioq(efqd, ioq, "complete");
+
+ elv_update_hw_tag(efqd);
+
+ WARN_ON(!efqd->rq_in_driver);
+ WARN_ON(!ioq->dispatched);
+ efqd->rq_in_driver--;
+ ioq->dispatched--;
+
+ if (sync)
+ ioq->last_end_request = jiffies;
+
+ /*
+ * If this is the active queue, check if it needs to be expired,
+ * or if we want to idle in case it has no pending requests.
+ */
+
+ if (elv_active_ioq(q->elevator) == ioq) {
+ if (elv_ioq_slice_new(ioq)) {
+ elv_ioq_set_prio_slice(q, ioq);
+ elv_clear_ioq_slice_new(ioq);
+ }
+ /*
+ * If there are no requests waiting in this queue, and
+ * there are other queues ready to issue requests, AND
+ * those other queues are issuing requests within our
+ * mean seek distance, give them a chance to run instead
+ * of idling.
+ */
+ if (elv_ioq_slice_used(ioq) || elv_ioq_class_idle(ioq))
+ elv_ioq_slice_expired(q);
+ else if (!ioq->nr_queued && !elv_close_cooperator(q, ioq, 1)
+ && sync && !rq_noidle(rq))
+ elv_ioq_arm_slice_timer(q);
+ }
+
+ if (!efqd->rq_in_driver)
+ elv_schedule_dispatch(q);
+}
+
+struct io_group *io_lookup_io_group_current(struct request_queue *q)
+{
+ struct elv_fq_data *efqd = &q->elevator->efqd;
+
+ return efqd->root_group;
+}
+EXPORT_SYMBOL(io_lookup_io_group_current);
+
+void *io_group_async_queue_prio(struct io_group *iog, int ioprio_class,
+ int ioprio)
+{
+ struct io_queue *ioq = NULL;
+
+ switch (ioprio_class) {
+ case IOPRIO_CLASS_RT:
+ ioq = iog->async_queue[0][ioprio];
+ break;
+ case IOPRIO_CLASS_BE:
+ ioq = iog->async_queue[1][ioprio];
+ break;
+ case IOPRIO_CLASS_IDLE:
+ ioq = iog->async_idle_queue;
+ break;
+ default:
+ BUG();
+ }
+
+ if (ioq)
+ return ioq->sched_queue;
+ return NULL;
+}
+EXPORT_SYMBOL(io_group_async_queue_prio);
+
+void io_group_set_async_queue(struct io_group *iog, int ioprio_class,
+ int ioprio, struct io_queue *ioq)
+{
+ switch (ioprio_class) {
+ case IOPRIO_CLASS_RT:
+ iog->async_queue[0][ioprio] = ioq;
+ break;
+ case IOPRIO_CLASS_BE:
+ iog->async_queue[1][ioprio] = ioq;
+ break;
+ case IOPRIO_CLASS_IDLE:
+ iog->async_idle_queue = ioq;
+ break;
+ default:
+ BUG();
+ }
+
+ /*
+ * Take the group reference and pin the queue. Group exit will
+ * clean it up
+ */
+ elv_get_ioq(ioq);
+}
+EXPORT_SYMBOL(io_group_set_async_queue);
+
+/*
+ * Release all the io group references to its async queues.
+ */
+void io_put_io_group_queues(struct elevator_queue *e, struct io_group *iog)
+{
+ int i, j;
+
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < IOPRIO_BE_NR; j++)
+ elv_release_ioq(e, &iog->async_queue[i][j]);
+
+ /* Free up async idle queue */
+ elv_release_ioq(e, &iog->async_idle_queue);
+}
+
+struct io_group *io_alloc_root_group(struct request_queue *q,
+ struct elevator_queue *e, void *key)
+{
+ struct io_group *iog;
+ int i;
+
+ iog = kmalloc_node(sizeof(*iog), GFP_KERNEL | __GFP_ZERO, q->node);
+ if (iog == NULL)
+ return NULL;
+
+ for (i = 0; i < IO_IOPRIO_CLASSES; i++)
+ iog->sched_data.service_tree[i] = IO_SERVICE_TREE_INIT;
+
+ return iog;
+}
+
+void io_free_root_group(struct elevator_queue *e)
+{
+ struct io_group *iog = e->efqd.root_group;
+ io_put_io_group_queues(e, iog);
+ kfree(iog);
+}
+
+static void elv_slab_kill(void)
+{
+ /*
+ * Caller already ensured that pending RCU callbacks are completed,
+ * so we should have no busy allocations at this point.
+ */
+ if (elv_ioq_pool)
+ kmem_cache_destroy(elv_ioq_pool);
+}
+
+static int __init elv_slab_setup(void)
+{
+ elv_ioq_pool = KMEM_CACHE(io_queue, 0);
+ if (!elv_ioq_pool)
+ goto fail;
+
+ return 0;
+fail:
+ elv_slab_kill();
+ return -ENOMEM;
+}
+
+/* Initialize fair queueing data associated with elevator */
+int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e)
+{
+ struct io_group *iog;
+ struct elv_fq_data *efqd = &e->efqd;
+
+ if (!elv_iosched_fair_queuing_enabled(e))
+ return 0;
+
+ iog = io_alloc_root_group(q, e, efqd);
+ if (iog == NULL)
+ return 1;
+
+ efqd->root_group = iog;
+ efqd->queue = q;
+
+ init_timer(&efqd->idle_slice_timer);
+ efqd->idle_slice_timer.function = elv_idle_slice_timer;
+ efqd->idle_slice_timer.data = (unsigned long) efqd;
+
+ INIT_WORK(&efqd->unplug_work, elv_kick_queue);
+
+ INIT_LIST_HEAD(&efqd->idle_list);
+
+ efqd->elv_slice[0] = elv_slice_async;
+ efqd->elv_slice[1] = elv_slice_sync;
+ efqd->elv_slice_idle = elv_slice_idle;
+ efqd->hw_tag = 1;
+
+ return 0;
+}
+
+/*
+ * elv_exit_fq_data is called before we call elevator_exit_fn. Before
+ * we ask elevator to cleanup its queues, we do the cleanup here so
+ * that all the group and idle tree references to ioq are dropped. Later
+ * during elevator cleanup, ioc reference will be dropped which will lead
+ * to removal of ioscheduler queue as well as associated ioq object.
+ */
+void elv_exit_fq_data(struct elevator_queue *e)
+{
+ struct elv_fq_data *efqd = &e->efqd;
+ struct request_queue *q = efqd->queue;
+
+ if (!elv_iosched_fair_queuing_enabled(e))
+ return;
+
+ elv_shutdown_timer_wq(e);
+
+ spin_lock_irq(q->queue_lock);
+ /* This should drop all the idle tree references of ioq */
+ elv_free_idle_ioq_list(e);
+ spin_unlock_irq(q->queue_lock);
+
+ elv_shutdown_timer_wq(e);
+
+ BUG_ON(timer_pending(&efqd->idle_slice_timer));
+ io_free_root_group(e);
+}
+
+/*
+ * This is called after the io scheduler has cleaned up its data structres.
+ * I don't think that this function is required. Right now just keeping it
+ * because cfq cleans up timer and work queue again after freeing up
+ * io contexts. To me io scheduler has already been drained out, and all
+ * the active queue have already been expired so time and work queue should
+ * not been activated during cleanup process.
+ *
+ * Keeping it here for the time being. Will get rid of it later.
+ */
+void elv_exit_fq_data_post(struct elevator_queue *e)
+{
+ struct elv_fq_data *efqd = &e->efqd;
+
+ if (!elv_iosched_fair_queuing_enabled(e))
+ return;
+
+ elv_shutdown_timer_wq(e);
+ BUG_ON(timer_pending(&efqd->idle_slice_timer));
+}
+
+
+static int __init elv_fq_init(void)
+{
+ if (elv_slab_setup())
+ return -ENOMEM;
+
+ /* could be 0 on HZ < 1000 setups */
+
+ if (!elv_slice_async)
+ elv_slice_async = 1;
+
+ if (!elv_slice_idle)
+ elv_slice_idle = 1;
+
+ return 0;
+}
+
+module_init(elv_fq_init);
new file mode 100644
@@ -0,0 +1,488 @@
+/*
+ * BFQ: data structures and common functions prototypes.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ * Paolo Valente <paolo.valente@unimore.it>
+ */
+
+#include <linux/blkdev.h>
+
+#ifndef _BFQ_SCHED_H
+#define _BFQ_SCHED_H
+
+#define IO_IOPRIO_CLASSES 3
+
+typedef u64 bfq_timestamp_t;
+typedef unsigned long bfq_weight_t;
+typedef unsigned long bfq_service_t;
+struct io_entity;
+struct io_queue;
+
+#ifdef CONFIG_ELV_FAIR_QUEUING
+
+/**
+ * struct bfq_service_tree - per ioprio_class service tree.
+ * @active: tree for active entities (i.e., those backlogged).
+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i).
+ * @first_idle: idle entity with minimum F_i.
+ * @last_idle: idle entity with maximum F_i.
+ * @vtime: scheduler virtual time.
+ * @wsum: scheduler weight sum; active and idle entities contribute to it.
+ *
+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each
+ * ioprio_class has its own independent scheduler, and so its own
+ * bfq_service_tree. All the fields are protected by the queue lock
+ * of the containing efqd.
+ */
+struct io_service_tree {
+ struct rb_root active;
+ struct rb_root idle;
+
+ struct io_entity *first_idle;
+ struct io_entity *last_idle;
+
+ bfq_timestamp_t vtime;
+ bfq_weight_t wsum;
+};
+
+/**
+ * struct bfq_sched_data - multi-class scheduler.
+ * @active_entity: entity under service.
+ * @next_active: head-of-the-line entity in the scheduler.
+ * @service_tree: array of service trees, one per ioprio_class.
+ *
+ * bfq_sched_data is the basic scheduler queue. It supports three
+ * ioprio_classes, and can be used either as a toplevel queue or as
+ * an intermediate queue on a hierarchical setup.
+ * @next_active points to the active entity of the sched_data service
+ * trees that will be scheduled next.
+ *
+ * The supported ioprio_classes are the same as in CFQ, in descending
+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
+ * Requests from higher priority queues are served before all the
+ * requests from lower priority queues; among requests of the same
+ * queue requests are served according to B-WF2Q+.
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct io_sched_data {
+ struct io_entity *active_entity;
+ struct io_service_tree service_tree[IO_IOPRIO_CLASSES];
+};
+
+/**
+ * struct bfq_entity - schedulable entity.
+ * @rb_node: service_tree member.
+ * @on_st: flag, true if the entity is on a tree (either the active or
+ * the idle one of its service_tree).
+ * @finish: B-WF2Q+ finish timestamp (aka F_i).
+ * @start: B-WF2Q+ start timestamp (aka S_i).
+ * @tree: tree the entity is enqueued into; %NULL if not on a tree.
+ * @min_start: minimum start time of the (active) subtree rooted at
+ * this entity; used for O(log N) lookups into active trees.
+ * @service: service received during the last round of service.
+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight.
+ * @weight: weight of the queue, calculated as IOPRIO_BE_NR - @ioprio.
+ * @parent: parent entity, for hierarchical scheduling.
+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the
+ * associated scheduler queue, %NULL on leaf nodes.
+ * @sched_data: the scheduler queue this entity belongs to.
+ * @ioprio: the ioprio in use.
+ * @new_ioprio: when an ioprio change is requested, the new ioprio value
+ * @ioprio_class: the ioprio_class in use.
+ * @new_ioprio_class: when an ioprio_class change is requested, the new
+ * ioprio_class value.
+ * @ioprio_changed: flag, true when the user requested an ioprio or
+ * ioprio_class change.
+ *
+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each
+ * entity belongs to the sched_data of the parent group in the cgroup
+ * hierarchy. Non-leaf entities have also their own sched_data, stored
+ * in @my_sched_data.
+ *
+ * Each entity stores independently its priority values; this would allow
+ * different weights on different devices, but this functionality is not
+ * exported to userspace by now. Priorities are updated lazily, first
+ * storing the new values into the new_* fields, then setting the
+ * @ioprio_changed flag. As soon as there is a transition in the entity
+ * state that allows the priority update to take place the effective and
+ * the requested priority values are synchronized.
+ *
+ * The weight value is calculated from the ioprio to export the same
+ * interface as CFQ. When dealing with ``well-behaved'' queues (i.e.,
+ * queues that do not spend too much time to consume their budget and
+ * have true sequential behavior, and when there are no external factors
+ * breaking anticipation) the relative weights at each level of the
+ * cgroups hierarchy should be guaranteed.
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct io_entity {
+ struct rb_node rb_node;
+
+ int on_st;
+
+ bfq_timestamp_t finish;
+ bfq_timestamp_t start;
+
+ struct rb_root *tree;
+
+ bfq_timestamp_t min_start;
+
+ bfq_service_t service, budget;
+ bfq_weight_t weight;
+
+ struct io_entity *parent;
+
+ struct io_sched_data *my_sched_data;
+ struct io_sched_data *sched_data;
+
+ unsigned short ioprio, new_ioprio;
+ unsigned short ioprio_class, new_ioprio_class;
+
+ int ioprio_changed;
+};
+
+/*
+ * A common structure embedded by every io scheduler into their respective
+ * queue structure.
+ */
+struct io_queue {
+ struct io_entity entity;
+ atomic_t ref;
+ unsigned int flags;
+
+ /* Pointer to generic elevator data structure */
+ struct elv_fq_data *efqd;
+ struct list_head queue_list;
+ pid_t pid;
+
+ /* Number of requests queued on this io queue */
+ unsigned long nr_queued;
+
+ /* Requests dispatched from this queue */
+ int dispatched;
+
+ /* Keep a track of think time of processes in this queue */
+ unsigned long last_end_request;
+ unsigned long ttime_total;
+ unsigned long ttime_samples;
+ unsigned long ttime_mean;
+
+ unsigned long slice_end;
+
+ /* Pointer to io scheduler's queue */
+ void *sched_queue;
+};
+
+struct io_group {
+ struct io_sched_data sched_data;
+
+ /* async_queue and idle_queue are used only for cfq */
+ struct io_queue *async_queue[2][IOPRIO_BE_NR];
+ struct io_queue *async_idle_queue;
+};
+
+struct elv_fq_data {
+ struct io_group *root_group;
+
+ /* List of io queues on idle tree. */
+ struct list_head idle_list;
+
+ struct request_queue *queue;
+ unsigned int busy_queues;
+ /*
+ * Used to track any pending rt requests so we can pre-empt current
+ * non-RT cfqq in service when this value is non-zero.
+ */
+ unsigned int busy_rt_queues;
+
+ /* Number of requests queued */
+ int rq_queued;
+
+ /* Pointer to the ioscheduler queue being served */
+ void *active_queue;
+
+ int rq_in_driver;
+ int hw_tag;
+ int hw_tag_samples;
+ int rq_in_driver_peak;
+
+ /*
+ * elevator fair queuing layer has the capability to provide idling
+ * for ensuring fairness for processes doing dependent reads.
+ * This might be needed to ensure fairness among two processes doing
+ * synchronous reads in two different cgroups. noop and deadline don't
+ * have any notion of anticipation/idling. As of now, these are the
+ * users of this functionality.
+ */
+ unsigned int elv_slice_idle;
+ struct timer_list idle_slice_timer;
+ struct work_struct unplug_work;
+
+ unsigned int elv_slice[2];
+};
+
+extern int elv_slice_idle;
+extern int elv_slice_async;
+
+/* Logging facilities. */
+#define elv_log_ioq(efqd, ioq, fmt, args...) \
+ blk_add_trace_msg((efqd)->queue, "elv%d%c " fmt, (ioq)->pid, \
+ elv_ioq_sync(ioq) ? 'S' : 'A', ##args)
+
+#define elv_log(efqd, fmt, args...) \
+ blk_add_trace_msg((efqd)->queue, "elv " fmt, ##args)
+
+#define ioq_sample_valid(samples) ((samples) > 80)
+
+/* Some shared queue flag manipulation functions among elevators */
+
+enum elv_queue_state_flags {
+ ELV_QUEUE_FLAG_busy = 0, /* has requests or is under service */
+ ELV_QUEUE_FLAG_sync, /* synchronous queue */
+ ELV_QUEUE_FLAG_idle_window, /* elevator slice idling enabled */
+ ELV_QUEUE_FLAG_wait_request, /* waiting for a request */
+ ELV_QUEUE_FLAG_must_dispatch, /* must be allowed a dispatch */
+ ELV_QUEUE_FLAG_slice_new, /* no requests dispatched in slice */
+ ELV_QUEUE_FLAG_NR,
+};
+
+#define ELV_IO_QUEUE_FLAG_FNS(name) \
+static inline void elv_mark_ioq_##name(struct io_queue *ioq) \
+{ \
+ (ioq)->flags |= (1 << ELV_QUEUE_FLAG_##name); \
+} \
+static inline void elv_clear_ioq_##name(struct io_queue *ioq) \
+{ \
+ (ioq)->flags &= ~(1 << ELV_QUEUE_FLAG_##name); \
+} \
+static inline int elv_ioq_##name(struct io_queue *ioq) \
+{ \
+ return ((ioq)->flags & (1 << ELV_QUEUE_FLAG_##name)) != 0; \
+}
+
+ELV_IO_QUEUE_FLAG_FNS(busy)
+ELV_IO_QUEUE_FLAG_FNS(sync)
+ELV_IO_QUEUE_FLAG_FNS(wait_request)
+ELV_IO_QUEUE_FLAG_FNS(must_dispatch)
+ELV_IO_QUEUE_FLAG_FNS(idle_window)
+ELV_IO_QUEUE_FLAG_FNS(slice_new)
+
+static inline struct io_service_tree *
+io_entity_service_tree(struct io_entity *entity)
+{
+ struct io_sched_data *sched_data = entity->sched_data;
+ unsigned int idx = entity->ioprio_class - 1;
+
+ BUG_ON(idx >= IO_IOPRIO_CLASSES);
+ BUG_ON(sched_data == NULL);
+
+ return sched_data->service_tree + idx;
+}
+
+/* A request got dispatched from the io_queue. Do the accounting. */
+static inline void elv_ioq_request_dispatched(struct io_queue *ioq)
+{
+ ioq->dispatched++;
+}
+
+static inline int elv_ioq_slice_used(struct io_queue *ioq)
+{
+ if (elv_ioq_slice_new(ioq))
+ return 0;
+ if (time_before(jiffies, ioq->slice_end))
+ return 0;
+
+ return 1;
+}
+
+/* How many request are currently dispatched from the queue */
+static inline int elv_ioq_nr_dispatched(struct io_queue *ioq)
+{
+ return ioq->dispatched;
+}
+
+/* How many request are currently queued in the queue */
+static inline int elv_ioq_nr_queued(struct io_queue *ioq)
+{
+ return ioq->nr_queued;
+}
+
+static inline pid_t elv_ioq_pid(struct io_queue *ioq)
+{
+ return ioq->pid;
+}
+
+static inline unsigned long elv_ioq_ttime_mean(struct io_queue *ioq)
+{
+ return ioq->ttime_mean;
+}
+
+static inline unsigned long elv_ioq_sample_valid(struct io_queue *ioq)
+{
+ return ioq_sample_valid(ioq->ttime_samples);
+}
+
+static inline void elv_get_ioq(struct io_queue *ioq)
+{
+ atomic_inc(&ioq->ref);
+}
+
+static inline void elv_ioq_set_slice_end(struct io_queue *ioq,
+ unsigned long slice_end)
+{
+ ioq->slice_end = slice_end;
+}
+
+static inline int elv_ioq_class_idle(struct io_queue *ioq)
+{
+ return ioq->entity.ioprio_class == IOPRIO_CLASS_IDLE;
+}
+
+static inline int elv_ioq_class_rt(struct io_queue *ioq)
+{
+ return ioq->entity.ioprio_class == IOPRIO_CLASS_RT;
+}
+
+static inline int elv_ioq_ioprio_class(struct io_queue *ioq)
+{
+ return ioq->entity.new_ioprio_class;
+}
+
+static inline int elv_ioq_ioprio(struct io_queue *ioq)
+{
+ return ioq->entity.new_ioprio;
+}
+
+static inline void elv_ioq_set_ioprio_class(struct io_queue *ioq,
+ int ioprio_class)
+{
+ ioq->entity.new_ioprio_class = ioprio_class;
+ ioq->entity.ioprio_changed = 1;
+}
+
+static inline void elv_ioq_set_ioprio(struct io_queue *ioq, int ioprio)
+{
+ ioq->entity.new_ioprio = ioprio;
+ ioq->entity.ioprio_changed = 1;
+}
+
+static inline void *ioq_sched_queue(struct io_queue *ioq)
+{
+ if (ioq)
+ return ioq->sched_queue;
+ return NULL;
+}
+
+static inline struct io_group *ioq_to_io_group(struct io_queue *ioq)
+{
+ return container_of(ioq->entity.sched_data, struct io_group,
+ sched_data);
+}
+
+/* Functions used by blksysfs.c */
+extern ssize_t elv_slice_idle_show(struct request_queue *q, char *name);
+extern ssize_t elv_slice_idle_store(struct request_queue *q, const char *name,
+ size_t count);
+extern ssize_t elv_slice_sync_show(struct request_queue *q, char *name);
+extern ssize_t elv_slice_sync_store(struct request_queue *q, const char *name,
+ size_t count);
+extern ssize_t elv_slice_async_show(struct request_queue *q, char *name);
+extern ssize_t elv_slice_async_store(struct request_queue *q, const char *name,
+ size_t count);
+
+/* Functions used by elevator.c */
+extern int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e);
+extern void elv_exit_fq_data(struct elevator_queue *e);
+extern void elv_exit_fq_data_post(struct elevator_queue *e);
+
+extern void elv_ioq_request_add(struct request_queue *q, struct request *rq);
+extern void elv_ioq_request_removed(struct elevator_queue *e,
+ struct request *rq);
+extern void elv_fq_dispatched_request(struct elevator_queue *e,
+ struct request *rq);
+
+extern void elv_fq_activate_rq(struct request_queue *q, struct request *rq);
+extern void elv_fq_deactivate_rq(struct request_queue *q, struct request *rq);
+
+extern void elv_ioq_completed_request(struct request_queue *q,
+ struct request *rq);
+
+extern void *elv_fq_select_ioq(struct request_queue *q, int force);
+extern struct io_queue *rq_ioq(struct request *rq);
+
+/* Functions used by io schedulers */
+extern void elv_put_ioq(struct io_queue *ioq);
+extern void __elv_ioq_slice_expired(struct request_queue *q,
+ struct io_queue *ioq);
+extern int elv_init_ioq(struct elevator_queue *eq, struct io_queue *ioq,
+ void *sched_queue, int ioprio_class, int ioprio, int is_sync);
+extern void elv_schedule_dispatch(struct request_queue *q);
+extern int elv_hw_tag(struct elevator_queue *e);
+extern void *elv_active_sched_queue(struct elevator_queue *e);
+extern int elv_mod_idle_slice_timer(struct elevator_queue *eq,
+ unsigned long expires);
+extern int elv_del_idle_slice_timer(struct elevator_queue *eq);
+extern unsigned int elv_get_slice_idle(struct elevator_queue *eq);
+extern void *io_group_async_queue_prio(struct io_group *iog, int ioprio_class,
+ int ioprio);
+extern void io_group_set_async_queue(struct io_group *iog, int ioprio_class,
+ int ioprio, struct io_queue *ioq);
+extern struct io_group *io_lookup_io_group_current(struct request_queue *q);
+extern int elv_nr_busy_ioq(struct elevator_queue *e);
+extern int elv_nr_busy_rt_ioq(struct elevator_queue *e);
+extern struct io_queue *elv_alloc_ioq(struct request_queue *q, gfp_t gfp_mask);
+extern void elv_free_ioq(struct io_queue *ioq);
+
+#else /* CONFIG_ELV_FAIR_QUEUING */
+
+static inline int elv_init_fq_data(struct request_queue *q,
+ struct elevator_queue *e)
+{
+ return 0;
+}
+
+static inline void elv_exit_fq_data(struct elevator_queue *e) {}
+static inline void elv_exit_fq_data_post(struct elevator_queue *e) {}
+
+static inline void elv_fq_activate_rq(struct request_queue *q,
+ struct request *rq)
+{
+}
+
+static inline void elv_fq_deactivate_rq(struct request_queue *q,
+ struct request *rq)
+{
+}
+
+static inline void elv_fq_dispatched_request(struct elevator_queue *e,
+ struct request *rq)
+{
+}
+
+static inline void elv_ioq_request_removed(struct elevator_queue *e,
+ struct request *rq)
+{
+}
+
+static inline void elv_ioq_request_add(struct request_queue *q,
+ struct request *rq)
+{
+}
+
+static inline void elv_ioq_completed_request(struct request_queue *q,
+ struct request *rq)
+{
+}
+
+static inline void *ioq_sched_queue(struct io_queue *ioq) { return NULL; }
+static inline struct io_queue *rq_ioq(struct request *rq) { return NULL; }
+static inline void *elv_fq_select_ioq(struct request_queue *q, int force)
+{
+ return NULL;
+}
+#endif /* CONFIG_ELV_FAIR_QUEUING */
+#endif /* _BFQ_SCHED_H */
@@ -231,6 +231,9 @@ static struct elevator_queue *elevator_alloc(struct request_queue *q,
for (i = 0; i < ELV_HASH_ENTRIES; i++)
INIT_HLIST_HEAD(&eq->hash[i]);
+ if (elv_init_fq_data(q, eq))
+ goto err;
+
return eq;
err:
kfree(eq);
@@ -301,9 +304,11 @@ EXPORT_SYMBOL(elevator_init);
void elevator_exit(struct elevator_queue *e)
{
mutex_lock(&e->sysfs_lock);
+ elv_exit_fq_data(e);
if (e->ops->elevator_exit_fn)
e->ops->elevator_exit_fn(e);
e->ops = NULL;
+ elv_exit_fq_data_post(e);
mutex_unlock(&e->sysfs_lock);
kobject_put(&e->kobj);
@@ -314,6 +319,8 @@ static void elv_activate_rq(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
+ elv_fq_activate_rq(q, rq);
+
if (e->ops->elevator_activate_req_fn)
e->ops->elevator_activate_req_fn(q, rq);
}
@@ -322,6 +329,8 @@ static void elv_deactivate_rq(struct request_queue *q, struct request *rq)
{
struct elevator_queue *e = q->elevator;
+ elv_fq_deactivate_rq(q, rq);
+
if (e->ops->elevator_deactivate_req_fn)
e->ops->elevator_deactivate_req_fn(q, rq);
}
@@ -446,6 +455,7 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
elv_rqhash_del(q, rq);
q->nr_sorted--;
+ elv_fq_dispatched_request(q->elevator, rq);
boundary = q->end_sector;
stop_flags = REQ_SOFTBARRIER | REQ_HARDBARRIER | REQ_STARTED;
@@ -486,6 +496,7 @@ void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
elv_rqhash_del(q, rq);
q->nr_sorted--;
+ elv_fq_dispatched_request(q->elevator, rq);
q->end_sector = rq_end_sector(rq);
q->boundary_rq = rq;
@@ -553,6 +564,7 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
elv_rqhash_del(q, next);
q->nr_sorted--;
+ elv_ioq_request_removed(e, next);
q->last_merge = rq;
}
@@ -657,12 +669,8 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
q->last_merge = rq;
}
- /*
- * Some ioscheds (cfq) run q->request_fn directly, so
- * rq cannot be accessed after calling
- * elevator_add_req_fn.
- */
q->elevator->ops->elevator_add_req_fn(q, rq);
+ elv_ioq_request_add(q, rq);
break;
case ELEVATOR_INSERT_REQUEUE:
@@ -872,13 +880,12 @@ void elv_dequeue_request(struct request_queue *q, struct request *rq)
int elv_queue_empty(struct request_queue *q)
{
- struct elevator_queue *e = q->elevator;
-
if (!list_empty(&q->queue_head))
return 0;
- if (e->ops->elevator_queue_empty_fn)
- return e->ops->elevator_queue_empty_fn(q);
+ /* Hopefully nr_sorted works and no need to call queue_empty_fn */
+ if (q->nr_sorted)
+ return 0;
return 1;
}
@@ -953,8 +960,11 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
*/
if (blk_account_rq(rq)) {
q->in_flight--;
- if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
- e->ops->elevator_completed_req_fn(q, rq);
+ if (blk_sorted_rq(rq)) {
+ if (e->ops->elevator_completed_req_fn)
+ e->ops->elevator_completed_req_fn(q, rq);
+ elv_ioq_completed_request(q, rq);
+ }
}
/*
@@ -1242,3 +1252,17 @@ struct request *elv_rb_latter_request(struct request_queue *q,
return NULL;
}
EXPORT_SYMBOL(elv_rb_latter_request);
+
+/* Get the io scheduler queue pointer. For cfq, it is stored in rq->ioq*/
+void *elv_get_sched_queue(struct request_queue *q, struct request *rq)
+{
+ return ioq_sched_queue(rq_ioq(rq));
+}
+EXPORT_SYMBOL(elv_get_sched_queue);
+
+/* Select an ioscheduler queue to dispatch request from. */
+void *elv_select_sched_queue(struct request_queue *q, int force)
+{
+ return ioq_sched_queue(elv_fq_select_ioq(q, force));
+}
+EXPORT_SYMBOL(elv_select_sched_queue);
@@ -245,6 +245,11 @@ struct request {
/* for bidi */
struct request *next_rq;
+
+#ifdef CONFIG_ELV_FAIR_QUEUING
+ /* io queue request belongs to */
+ struct io_queue *ioq;
+#endif
};
static inline unsigned short req_get_ioprio(struct request *req)
@@ -2,6 +2,7 @@
#define _LINUX_ELEVATOR_H
#include <linux/percpu.h>
+#include "../../block/elevator-fq.h"
#ifdef CONFIG_BLOCK
@@ -29,6 +30,18 @@ typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct reques
typedef void *(elevator_init_fn) (struct request_queue *);
typedef void (elevator_exit_fn) (struct elevator_queue *);
+#ifdef CONFIG_ELV_FAIR_QUEUING
+typedef void (elevator_free_sched_queue_fn) (struct elevator_queue*, void *);
+typedef void (elevator_active_ioq_set_fn) (struct request_queue*, void *, int);
+typedef void (elevator_active_ioq_reset_fn) (struct request_queue *, void*);
+typedef void (elevator_arm_slice_timer_fn) (struct request_queue*, void*);
+typedef int (elevator_should_preempt_fn) (struct request_queue*, void*,
+ struct request*);
+typedef int (elevator_update_idle_window_fn) (struct elevator_queue*, void*,
+ struct request*);
+typedef struct io_queue* (elevator_close_cooperator_fn) (struct request_queue*,
+ void*, int probe);
+#endif
struct elevator_ops
{
@@ -56,6 +69,17 @@ struct elevator_ops
elevator_init_fn *elevator_init_fn;
elevator_exit_fn *elevator_exit_fn;
void (*trim)(struct io_context *);
+
+#ifdef CONFIG_ELV_FAIR_QUEUING
+ elevator_free_sched_queue_fn *elevator_free_sched_queue_fn;
+ elevator_active_ioq_set_fn *elevator_active_ioq_set_fn;
+ elevator_active_ioq_reset_fn *elevator_active_ioq_reset_fn;
+
+ elevator_arm_slice_timer_fn *elevator_arm_slice_timer_fn;
+ elevator_should_preempt_fn *elevator_should_preempt_fn;
+ elevator_update_idle_window_fn *elevator_update_idle_window_fn;
+ elevator_close_cooperator_fn *elevator_close_cooperator_fn;
+#endif
};
#define ELV_NAME_MAX (16)
@@ -76,6 +100,9 @@ struct elevator_type
struct elv_fs_entry *elevator_attrs;
char elevator_name[ELV_NAME_MAX];
struct module *elevator_owner;
+#ifdef CONFIG_ELV_FAIR_QUEUING
+ int elevator_features;
+#endif
};
/*
@@ -89,6 +116,10 @@ struct elevator_queue
struct elevator_type *elevator_type;
struct mutex sysfs_lock;
struct hlist_head *hash;
+#ifdef CONFIG_ELV_FAIR_QUEUING
+ /* fair queuing data */
+ struct elv_fq_data efqd;
+#endif
};
/*
@@ -209,5 +240,25 @@ enum {
__val; \
})
+/* iosched can let elevator know their feature set/capability */
+#ifdef CONFIG_ELV_FAIR_QUEUING
+
+/* iosched wants to use fq logic of elevator layer */
+#define ELV_IOSCHED_NEED_FQ 1
+
+static inline int elv_iosched_fair_queuing_enabled(struct elevator_queue *e)
+{
+ return (e->elevator_type->elevator_features) & ELV_IOSCHED_NEED_FQ;
+}
+
+#else /* ELV_IOSCHED_FAIR_QUEUING */
+
+static inline int elv_iosched_fair_queuing_enabled(struct elevator_queue *e)
+{
+ return 0;
+}
+#endif /* ELV_IOSCHED_FAIR_QUEUING */
+extern void *elv_get_sched_queue(struct request_queue *q, struct request *rq);
+extern void *elv_select_sched_queue(struct request_queue *q, int force);
#endif /* CONFIG_BLOCK */
#endif