@@ -1354,6 +1354,9 @@ alloc_cfqq:
cfqq->ioq = ioq;
cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
cfq_init_prio_data(cfqq, ioc);
+
+ /* ioq reference on iog */
+ elv_get_iog(iog);
cfq_log_cfqq(cfqd, cfqq, "alloced");
} else {
cfqq = &cfqd->oom_cfqq;
@@ -41,6 +41,9 @@ static struct kmem_cache *elv_ioq_pool;
*/
#define WFQ_SERVICE_SHIFT 22
+static void
+elv_release_ioq(struct elevator_queue *eq, struct io_queue **ioq_ptr);
+
#ifdef CONFIG_GROUP_IOSCHED
#define for_each_entity(entity) \
for (; entity != NULL; entity = entity->parent)
@@ -86,6 +89,69 @@ static inline void bfq_check_next_active(struct io_sched_data *sd,
{
BUG_ON(sd->next_active != entity);
}
+
+static inline int iog_deleting(struct io_group *iog)
+{
+ return iog->deleting;
+}
+
+/* Do the two (enqueued) entities belong to the same group? */
+static inline int
+is_same_group(struct io_entity *entity, struct io_entity *new_entity)
+{
+ if (entity->sched_data == new_entity->sched_data)
+ return 1;
+
+ return 0;
+}
+
+static inline struct io_entity *parent_entity(struct io_entity *entity)
+{
+ return entity->parent;
+}
+
+/* Return the depth at which an io entity is present in the hierarchy */
+static inline int depth_entity(struct io_entity *entity)
+{
+ int depth = 0;
+
+ for_each_entity(entity)
+ depth++;
+
+ return depth;
+}
+
+static void bfq_find_matching_entity(struct io_entity **entity,
+ struct io_entity **new_entity)
+{
+ int entity_depth, new_entity_depth;
+
+ /*
+	 * The preemption test can only be made between sibling entities
+	 * that are in the same group, i.e. that have a common parent. Walk
+	 * up the hierarchy of both entities until we find ancestors that
+	 * are children of a common parent.
+ */
+
+	/* First walk up until both entities are at the same depth */
+ entity_depth = depth_entity(*entity);
+ new_entity_depth = depth_entity(*new_entity);
+
+ while (entity_depth > new_entity_depth) {
+ entity_depth--;
+ *entity = parent_entity(*entity);
+ }
+
+ while (new_entity_depth > entity_depth) {
+ new_entity_depth--;
+ *new_entity = parent_entity(*new_entity);
+ }
+
+ while (!is_same_group(*entity, *new_entity)) {
+ *entity = parent_entity(*entity);
+ *new_entity = parent_entity(*new_entity);
+ }
+}
#else /* GROUP_IOSCHED */
#define for_each_entity(entity) \
for (; entity != NULL; entity = NULL)
@@ -102,6 +168,23 @@ static inline void bfq_check_next_active(struct io_sched_data *sd,
struct io_entity *entity)
{
}
+
+static inline int iog_deleting(struct io_group *iog)
+{
+ /* In flat mode, root cgroup can't be deleted. */
+ return 0;
+}
+
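+/*
+ * In flat mode all queues are children of the root group, so the two
+ * entities already share a parent and there is nothing to walk up.
+ */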
+static void bfq_find_matching_entity(struct io_entity **entity,
+ struct io_entity **new_entity)
+{
+}
+
+static inline int
+is_same_group(struct io_entity *entity, struct io_entity *new_entity)
+{
+ return 1;
+}
#endif /* GROUP_IOSCHED */
static inline int elv_prio_slice(struct elv_fq_data *efqd, int sync,
@@ -359,12 +442,6 @@ static void bfq_get_entity(struct io_entity *entity)
elv_get_ioq(ioq);
}
-static inline void
-bfq_init_entity(struct io_entity *entity, struct io_group *iog)
-{
- entity->sched_data = &iog->sched_data;
-}
-
/**
* bfq_find_deepest - find the deepest node that an extraction can modify.
* @node: the node being removed.
@@ -832,8 +909,26 @@ static int __bfq_deactivate_entity(struct io_entity *entity, int requeue)
static void bfq_deactivate_entity(struct io_entity *entity, int requeue)
{
struct io_sched_data *sd;
+ struct io_group *iog, *__iog;
struct io_entity *parent;
+ iog = container_of(entity->sched_data, struct io_group, sched_data);
+
+ /*
+	 * Hold a reference to the entity's iog until we are done. This
+	 * function walks up the hierarchy and we don't want the group to
+	 * be freed while we are still traversing it. It is possible that
+	 * this group's cgroup has already been removed, hence the cgroup
+	 * reference is gone. If this entity was the active entity, its
+	 * group will not be on any of the trees and will be freed the
+	 * moment the queue is freed in __bfq_deactivate_entity().
+	 *
+	 * Hence, hold a reference, deactivate the hierarchy of entities
+	 * and then drop the reference, which should free up the whole
+	 * chain of groups.
+ */
+ elv_get_iog(iog);
+
for_each_entity_safe(entity, parent) {
sd = entity->sched_data;
@@ -851,6 +946,7 @@ static void bfq_deactivate_entity(struct io_entity *entity, int requeue)
* the budgets on the path towards the root
* need to be updated.
*/
+ elv_put_iog(iog);
goto update;
}
@@ -858,11 +954,16 @@ static void bfq_deactivate_entity(struct io_entity *entity, int requeue)
* If we reach there the parent is no more backlogged and
* we want to propagate the dequeue upwards.
*
+	 * If the entity's group has been marked for deletion, don't
+	 * requeue the group on the idle tree, so that it can be freed.
*/
-
- requeue = 1;
+ __iog = container_of(entity->sched_data, struct io_group,
+ sched_data);
+ if (!iog_deleting(__iog))
+ requeue = 1;
}
+ elv_put_iog(iog);
return;
update:
@@ -901,8 +1002,57 @@ static void io_flush_idle_tree(struct io_service_tree *st)
__bfq_deactivate_entity(entity, 0);
}
+/*
+ * Release all of the io group's references to its async queues.
+ */
+static void
+io_put_io_group_queues(struct elevator_queue *e, struct io_group *iog)
+{
+ int i, j;
+
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < IOPRIO_BE_NR; j++)
+ elv_release_ioq(e, &iog->async_queue[i][j]);
+
+ /* Free up async idle queue */
+ elv_release_ioq(e, &iog->async_idle_queue);
+}
+
/* Mainly hierarchical grouping code */
#ifdef CONFIG_GROUP_IOSCHED
+static void iocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup);
+
+static inline void
+bfq_init_entity(struct io_entity *entity, struct io_group *iog)
+{
+ entity->parent = iog->my_entity;
+ entity->sched_data = &iog->sched_data;
+}
+
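+/* Initialize the group's schedulable entity from its cgroup's parameters */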
+static void io_group_init_entity(struct io_cgroup *iocg, struct io_group *iog)
+{
+ struct io_entity *entity = &iog->entity;
+
+ entity->weight = iocg->weight;
+ entity->ioprio_class = entity->new_ioprio_class = iocg->ioprio_class;
+ entity->ioprio_changed = 1;
+ entity->my_sched_data = &iog->sched_data;
+}
+
+static void io_group_set_parent(struct io_group *iog, struct io_group *parent)
+{
+ struct io_entity *entity;
+
+ BUG_ON(parent == NULL);
+ BUG_ON(iog == NULL);
+
+ entity = &iog->entity;
+ entity->parent = parent->my_entity;
+ entity->sched_data = &parent->sched_data;
+ if (entity->parent)
+ /* Child group reference on parent group. */
+ elv_get_iog(parent);
+}
struct io_cgroup io_root_cgroup = {
.weight = IO_DEFAULT_GRP_WEIGHT,
@@ -915,6 +1065,26 @@ static struct io_cgroup *cgroup_to_io_cgroup(struct cgroup *cgroup)
struct io_cgroup, css);
}
+/*
+ * Search the io_cgroup's group_data (for now a plain list rather than a
+ * hash table) for the io_group matching @key. Must be called under
+ * rcu_read_lock().
+ */
+static struct io_group *
+io_cgroup_lookup_group(struct io_cgroup *iocg, void *key)
+{
+ struct io_group *iog;
+ struct hlist_node *n;
+ void *__key;
+
+ hlist_for_each_entry_rcu(iog, n, &iocg->group_data, group_node) {
+ __key = rcu_dereference(iog->key);
+ if (__key == key)
+ return iog;
+ }
+
+ return NULL;
+}
+
#define SHOW_FUNCTION(__VAR) \
static u64 io_cgroup_##__VAR##_read(struct cgroup *cgroup, \
struct cftype *cftype) \
@@ -1055,12 +1225,6 @@ static void iocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup,
task_unlock(tsk);
}
-static void iocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
-{
-
- /* Implemented in later patch */
-}
-
struct cgroup_subsys io_subsys = {
.name = "io",
.create = iocg_create,
@@ -1071,7 +1235,597 @@ struct cgroup_subsys io_subsys = {
.subsys_id = io_subsys_id,
.use_id = 1,
};
+
+static inline unsigned int iog_weight(struct io_group *iog)
+{
+ return iog->entity.weight;
+}
+
+/**
+ * io_group_chain_alloc - allocate a chain of groups.
+ * @efqd: queue descriptor.
+ * @cgroup: the leaf cgroup this chain starts from.
+ *
+ * Allocate a chain of groups starting from the one belonging to
+ * @cgroup up to the root cgroup. Stop if a cgroup on the chain
+ * to the root already has an allocated group on @efqd.
+ */
+static struct io_group *
+io_group_chain_alloc(struct request_queue *q, void *key, struct cgroup *cgroup)
+{
+ struct io_cgroup *iocg;
+ struct io_group *iog, *leaf = NULL, *prev = NULL;
+ gfp_t flags = GFP_ATOMIC | __GFP_ZERO;
+
+ for (; cgroup != NULL; cgroup = cgroup->parent) {
+ iocg = cgroup_to_io_cgroup(cgroup);
+
+ iog = io_cgroup_lookup_group(iocg, key);
+ if (iog != NULL) {
+ /*
+ * All the cgroups in the path from there to the
+			 * root must have an io_group for efqd, so we don't
+ * need any more allocations.
+ */
+ break;
+ }
+
+ iog = kzalloc_node(sizeof(*iog), flags, q->node);
+ if (!iog)
+ goto cleanup;
+
+ iog->iocg_id = css_id(&iocg->css);
+
+ io_group_init_entity(iocg, iog);
+ iog->my_entity = &iog->entity;
+
+ atomic_set(&iog->ref, 0);
+ iog->deleting = 0;
+
+ /*
+		 * Take the initial reference that will be released on
+		 * destroy. This can be thought of as a joint reference by
+		 * the cgroup and the elevator, which will be dropped by
+		 * either the elevator exit or the cgroup deletion path,
+		 * depending on which exits first.
+ */
+ elv_get_iog(iog);
+
+ if (leaf == NULL) {
+ leaf = iog;
+ prev = leaf;
+ } else {
+ io_group_set_parent(prev, iog);
+ /*
+			 * Build a list of allocated nodes using the key
+			 * field; it is still unused here and will be
+			 * initialized only once the node is connected.
+ */
+ prev->key = iog;
+ prev = iog;
+ }
+ }
+
+ return leaf;
+
+cleanup:
+ while (leaf != NULL) {
+ prev = leaf;
+ leaf = leaf->key;
+ kfree(prev);
+ }
+
+ return NULL;
+}
+
+/**
+ * io_group_chain_link - link an allocated group chain to a cgroup hierarchy.
+ * @efqd: the queue descriptor.
+ * @cgroup: the leaf cgroup to start from.
+ * @leaf: the leaf group (to be associated to @cgroup).
+ *
+ * Try to link a chain of groups to a cgroup hierarchy, connecting the
+ * nodes bottom-up, so we can be sure that when we find a cgroup in the
+ * hierarchy that already has a group associated to @efqd all the nodes
+ * in the path to the root cgroup have one too.
+ *
+ * On locking: the queue lock protects the hierarchy (there is a hierarchy
+ * per device) while the io_cgroup lock protects the list of groups
+ * belonging to the same cgroup.
+ */
+static void io_group_chain_link(struct request_queue *q, void *key,
+ struct cgroup *cgroup,
+ struct io_group *leaf,
+ struct elv_fq_data *efqd)
+{
+ struct io_cgroup *iocg;
+ struct io_group *iog, *next, *prev = NULL;
+ unsigned long flags;
+
+ assert_spin_locked(q->queue_lock);
+
+ for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) {
+ iocg = cgroup_to_io_cgroup(cgroup);
+ next = leaf->key;
+
+ iog = io_cgroup_lookup_group(iocg, key);
+ BUG_ON(iog != NULL);
+
+ spin_lock_irqsave(&iocg->lock, flags);
+
+ rcu_assign_pointer(leaf->key, key);
+ hlist_add_head_rcu(&leaf->group_node, &iocg->group_data);
+ hlist_add_head(&leaf->elv_data_node, &efqd->group_list);
+
+ spin_unlock_irqrestore(&iocg->lock, flags);
+
+ prev = leaf;
+ leaf = next;
+ }
+
+ BUG_ON(cgroup == NULL && leaf != NULL);
+
+ if (cgroup != NULL && prev != NULL) {
+ iocg = cgroup_to_io_cgroup(cgroup);
+ iog = io_cgroup_lookup_group(iocg, key);
+ io_group_set_parent(prev, iog);
+ }
+}
+
+/**
+ * io_find_alloc_group - return the group associated to @efqd in @cgroup.
+ * @efqd: queue descriptor.
+ * @cgroup: cgroup being searched for.
+ * @create: if set to 1, create the io group if it has not been created yet.
+ *
+ * Return a group associated to @efqd in @cgroup, allocating one if
+ * necessary. When a group is returned all the cgroups in the path
+ * to the root have a group associated to @efqd.
+ *
+ * If the allocation fails, return the root group: this breaks guarantees
+ * but is a safe fallback. If this loss becomes a problem it can be
+ * mitigated using the equivalent weight (given by the product of the
+ * weights of the groups in the path from @group to the root) in the
+ * root scheduler.
+ *
+ * We allocate all the missing nodes in the path from the leaf cgroup
+ * to the root and we connect the nodes only after all the allocations
+ * have been successful.
+ */
+static struct io_group *io_find_alloc_group(struct request_queue *q,
+ struct cgroup *cgroup, struct elv_fq_data *efqd,
+ int create)
+{
+ struct io_cgroup *iocg = cgroup_to_io_cgroup(cgroup);
+ struct io_group *iog = NULL;
+ /* Note: Use efqd as key */
+ void *key = efqd;
+
+ /*
+	 * Take a reference to the css object. We don't want to map a bio
+	 * to a group whose cgroup has been marked for deletion.
+ */
+
+ if (!css_tryget(&iocg->css))
+ return iog;
+
+ iog = io_cgroup_lookup_group(iocg, key);
+ if (iog != NULL || !create)
+ goto end;
+
+ iog = io_group_chain_alloc(q, key, cgroup);
+ if (iog != NULL)
+ io_group_chain_link(q, key, cgroup, iog, efqd);
+
+end:
+ css_put(&iocg->css);
+ return iog;
+}
+
+/*
+ * Search for the io group the current task belongs to. If create=1, also
+ * create the io group if it is not already there.
+ *
+ * Note: This function should be called with the queue lock held. It returns
+ * a pointer to the io group without taking any reference. The group stays
+ * around as long as the queue lock is not dropped (group reclaim code needs
+ * to take the queue lock). So if the group pointer is needed even after
+ * dropping the queue lock, take a reference to the group before dropping
+ * the lock.
+ */
+struct io_group *io_get_io_group(struct request_queue *q, int create)
+{
+ struct cgroup *cgroup;
+ struct io_group *iog;
+ struct elv_fq_data *efqd = &q->elevator->efqd;
+
+ assert_spin_locked(q->queue_lock);
+
+ rcu_read_lock();
+ cgroup = task_cgroup(current, io_subsys_id);
+ iog = io_find_alloc_group(q, cgroup, efqd, create);
+ if (!iog) {
+ if (create)
+ iog = efqd->root_group;
+ else
+ /*
+ * bio merge functions doing lookup don't want to
+ * map bio to root group by default
+ */
+ iog = NULL;
+ }
+ rcu_read_unlock();
+ return iog;
+}
+EXPORT_SYMBOL(io_get_io_group);
+
+static void io_free_root_group(struct elevator_queue *e)
+{
+ struct io_cgroup *iocg = &io_root_cgroup;
+ struct elv_fq_data *efqd = &e->efqd;
+ struct io_group *iog = efqd->root_group;
+ struct io_service_tree *st;
+ int i;
+
+ BUG_ON(!iog);
+ spin_lock_irq(&iocg->lock);
+ hlist_del_rcu(&iog->group_node);
+ spin_unlock_irq(&iocg->lock);
+
+ for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
+ st = iog->sched_data.service_tree + i;
+ io_flush_idle_tree(st);
+ }
+
+ io_put_io_group_queues(e, iog);
+ elv_put_iog(iog);
+}
+
+static struct io_group *io_alloc_root_group(struct request_queue *q,
+ struct elevator_queue *e, void *key)
+{
+ struct io_group *iog;
+ struct io_cgroup *iocg;
+ int i;
+
+ iog = kmalloc_node(sizeof(*iog), GFP_KERNEL | __GFP_ZERO, q->node);
+ if (iog == NULL)
+ return NULL;
+
+ elv_get_iog(iog);
+ iog->entity.parent = NULL;
+ for (i = 0; i < IO_IOPRIO_CLASSES; i++)
+ iog->sched_data.service_tree[i] = IO_SERVICE_TREE_INIT;
+
+ iocg = &io_root_cgroup;
+ spin_lock_irq(&iocg->lock);
+ rcu_assign_pointer(iog->key, key);
+ hlist_add_head_rcu(&iog->group_node, &iocg->group_data);
+ iog->iocg_id = css_id(&iocg->css);
+ spin_unlock_irq(&iocg->lock);
+
+ return iog;
+}
+
+static void io_group_free_rcu(struct rcu_head *head)
+{
+ struct io_group *iog;
+
+ iog = container_of(head, struct io_group, rcu_head);
+ kfree(iog);
+}
+
+/*
+ * This cleanup function does the final steps of destroying an io group.
+ * It should only be called after __io_destroy_group() has been invoked.
+ */
+static void io_group_cleanup(struct io_group *iog)
+{
+ struct io_service_tree *st;
+ struct io_entity *entity = iog->my_entity;
+ int i;
+
+ for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
+ st = iog->sched_data.service_tree + i;
+
+ BUG_ON(!RB_EMPTY_ROOT(&st->active));
+ BUG_ON(!RB_EMPTY_ROOT(&st->idle));
+ BUG_ON(st->wsum != 0);
+ }
+
+ BUG_ON(iog->sched_data.next_active != NULL);
+ BUG_ON(iog->sched_data.active_entity != NULL);
+ BUG_ON(entity != NULL && entity->tree != NULL);
+
+ /*
+ * Wait for any rcu readers to exit before freeing up the group.
+ * Primarily useful when io_get_io_group() is called without queue
+ * lock to access some group data from bdi_congested_group() path.
+ */
+ call_rcu(&iog->rcu_head, io_group_free_rcu);
+}
+
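+/*
+ * Drop a reference to the io group. Once the last reference is gone, the
+ * group is cleaned up and the reference it held on its parent group is
+ * dropped as well, so an unused chain of groups unwinds bottom-up.
+ */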
+void elv_put_iog(struct io_group *iog)
+{
+ struct io_group *parent = NULL;
+ struct io_entity *entity;
+
+ BUG_ON(!iog);
+
+ entity = iog->my_entity;
+
+ BUG_ON(atomic_read(&iog->ref) <= 0);
+ if (!atomic_dec_and_test(&iog->ref))
+ return;
+
+ if (entity)
+ parent = container_of(iog->my_entity->parent,
+ struct io_group, entity);
+
+ io_group_cleanup(iog);
+
+ if (parent)
+ elv_put_iog(parent);
+}
+EXPORT_SYMBOL(elv_put_iog);
+
+/*
+ * Check whether a given group has any active entities on any of the
+ * service trees.
+ */
+static inline int io_group_has_active_entities(struct io_group *iog)
+{
+ int i;
+ struct io_service_tree *st;
+
+ for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
+ st = iog->sched_data.service_tree + i;
+ if (!RB_EMPTY_ROOT(&st->active))
+ return 1;
+ }
+
+ /*
+	 * Also check for an entity that is currently being served; it is
+	 * not kept on the active tree while it is being serviced.
+ */
+
+ if (iog->sched_data.active_entity)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * After the group is destroyed, no new sync IO should come to the group.
+ * It might still have pending IOs in some busy queues. It should be able to
+ * send those IOs down to the disk. The async IOs (due to dirty page writeback)
+ * would go in the root group queues after this, as the group does not exist
+ * anymore.
+ */
+static void __io_destroy_group(struct elv_fq_data *efqd, struct io_group *iog)
+{
+ struct elevator_queue *eq;
+ struct io_service_tree *st;
+ int i;
+
+ BUG_ON(iog->my_entity == NULL);
+
+ /*
+	 * Mark the io group for deletion so that no new entry goes onto
+	 * the idle tree. Any active queue will be removed from the active
+	 * tree and not put onto the idle tree.
+ */
+ iog->deleting = 1;
+
+ /* We flush idle tree now, and don't put things in there any more. */
+ for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
+ st = iog->sched_data.service_tree + i;
+
+ io_flush_idle_tree(st);
+ }
+
+ eq = container_of(efqd, struct elevator_queue, efqd);
+ hlist_del(&iog->elv_data_node);
+ io_put_io_group_queues(eq, iog);
+
+ /*
+	 * We can get here either through the cgroup deletion path or
+	 * through the elevator exit path. Check whether the io group has
+	 * any active entities; if not, deactivate it to make sure it is
+	 * removed from any idle tree it might be on. If this group was on
+	 * an idle tree, this is probably the last reference and the group
+	 * will be freed when the reference is dropped.
+ */
+
+ if (!io_group_has_active_entities(iog)) {
+ /*
+		 * The io group does not have any active entities. Because
+		 * this group has been decoupled from the io_cgroup list and
+		 * the cgroup is being deleted, it should not receive any
+		 * new IO. Hence it should be safe to deactivate this io
+		 * group and remove it from the scheduling tree.
+ */
+ __bfq_deactivate_entity(iog->my_entity, 0);
+ }
+
+ /*
+ * Put the reference taken at the time of creation so that when all
+ * queues are gone, cgroup can be destroyed.
+ */
+ elv_put_iog(iog);
+}
+
+static void iocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
+{
+ struct io_cgroup *iocg = cgroup_to_io_cgroup(cgroup);
+ struct io_group *iog;
+ struct elv_fq_data *efqd;
+ unsigned long uninitialized_var(flags);
+
+ /*
+	 * io groups are linked in two lists. One list is maintained in the
+	 * elevator (efqd->group_list) and the other is maintained per
+	 * cgroup (iocg->group_data).
+	 *
+	 * While a cgroup is being deleted, the elevator might also be
+	 * exiting, and both might try to clean up the same io group, so we
+	 * need to be a little careful.
+	 *
+	 * iocg->group_data is protected by iocg->lock. To avoid deadlock,
+	 * we can't take the queue lock while holding iocg->lock. So we
+	 * first remove the iog from iocg->group_data under iocg->lock;
+	 * whoever removes an iog from iocg->group_data should then call
+	 * __io_destroy_group() to tear it down.
+ */
+
+ rcu_read_lock();
+
+remove_entry:
+ spin_lock_irqsave(&iocg->lock, flags);
+
+ if (hlist_empty(&iocg->group_data)) {
+ spin_unlock_irqrestore(&iocg->lock, flags);
+ goto done;
+ }
+ iog = hlist_entry(iocg->group_data.first, struct io_group,
+ group_node);
+ efqd = rcu_dereference(iog->key);
+ hlist_del_rcu(&iog->group_node);
+ iog->iocg_id = 0;
+ spin_unlock_irqrestore(&iocg->lock, flags);
+
+ spin_lock_irqsave(efqd->queue->queue_lock, flags);
+ __io_destroy_group(efqd, iog);
+ spin_unlock_irqrestore(efqd->queue->queue_lock, flags);
+ goto remove_entry;
+
+done:
+ free_css_id(&io_subsys, &iocg->css);
+ rcu_read_unlock();
+ BUG_ON(!hlist_empty(&iocg->group_data));
+ kfree(iocg);
+}
+
+/*
+ * This function checks if iog is still in iocg->group_data, and removes it.
+ * If iog is not in that list, the cgroup destroy path has already removed
+ * it, and we do not need to remove it again.
+ */
+static void io_group_check_and_destroy(struct elv_fq_data *efqd,
+ struct io_group *iog)
+{
+ struct io_cgroup *iocg;
+ unsigned long flags;
+ struct cgroup_subsys_state *css;
+
+ rcu_read_lock();
+
+ css = css_lookup(&io_subsys, iog->iocg_id);
+
+ if (!css)
+ goto out;
+
+ iocg = container_of(css, struct io_cgroup, css);
+
+ spin_lock_irqsave(&iocg->lock, flags);
+
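+	/*
+	 * A zero iocg_id means the cgroup deletion path has already
+	 * removed this group; otherwise remove and destroy it here.
+	 */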
+ if (iog->iocg_id) {
+ hlist_del_rcu(&iog->group_node);
+ __io_destroy_group(efqd, iog);
+ }
+
+ spin_unlock_irqrestore(&iocg->lock, flags);
+out:
+ rcu_read_unlock();
+}
+
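+/*
+ * Called from the elevator exit path. Tear down every io group on this
+ * elevator that the cgroup deletion path has not already destroyed.
+ */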
+static void io_disconnect_groups(struct elevator_queue *e)
+{
+ struct hlist_node *pos, *n;
+ struct io_group *iog;
+ struct elv_fq_data *efqd = &e->efqd;
+
+ hlist_for_each_entry_safe(iog, pos, n, &efqd->group_list,
+ elv_data_node) {
+ io_group_check_and_destroy(efqd, iog);
+ }
+}
+
+/*
+ * If the bio-submitting task and the rq don't belong to the same io_group,
+ * they can't be merged.
+ */
+int io_group_allow_merge(struct request *rq, struct bio *bio)
+{
+ struct request_queue *q = rq->q;
+ struct io_queue *ioq = rq->ioq;
+ struct io_group *iog, *__iog;
+
+ if (!elv_iosched_fair_queuing_enabled(q->elevator))
+ return 1;
+
+ /* Determine the io group of the bio submitting task */
+ iog = io_get_io_group(q, 0);
+ if (!iog) {
+		/* Maybe the task belongs to a different cgroup for which
+		 * the io group has not been set up yet. */
+ return 0;
+ }
+
+	/* Determine the io group of the ioq the rq belongs to */
+ __iog = ioq_to_io_group(ioq);
+
+ return (iog == __iog);
+}
+#else /* GROUP_IOSCHED */
+static inline void
+bfq_init_entity(struct io_entity *entity, struct io_group *iog)
+{
+ entity->sched_data = &iog->sched_data;
+}
+
+static inline void io_disconnect_groups(struct elevator_queue *e) {}
+static inline unsigned int iog_weight(struct io_group *iog) { return 0; }
+
+static struct io_group *io_alloc_root_group(struct request_queue *q,
+ struct elevator_queue *e, void *key)
+{
+ struct io_group *iog;
+ int i;
+
+ iog = kmalloc_node(sizeof(*iog), GFP_KERNEL | __GFP_ZERO, q->node);
+ if (iog == NULL)
+ return NULL;
+
+ for (i = 0; i < IO_IOPRIO_CLASSES; i++)
+ iog->sched_data.service_tree[i] = IO_SERVICE_TREE_INIT;
+
+ return iog;
+}
+
+static void io_free_root_group(struct elevator_queue *e)
+{
+ struct io_group *iog = e->efqd.root_group;
+ struct io_service_tree *st;
+ int i;
+
+ for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
+ st = iog->sched_data.service_tree + i;
+ io_flush_idle_tree(st);
+ }
+
+ io_put_io_group_queues(e, iog);
+ kfree(iog);
+}
+
+struct io_group *io_get_io_group(struct request_queue *q, int create)
+{
+ /* In flat mode, there is only root group */
+ return q->elevator->efqd.root_group;
+}
+EXPORT_SYMBOL(io_get_io_group);
#endif /* GROUP_IOSCHED */
+
/* Elevator fair queuing function */
static inline struct io_queue *elv_active_ioq(struct elevator_queue *e)
{
@@ -1284,10 +2038,14 @@ void elv_put_ioq(struct io_queue *ioq)
struct elv_fq_data *efqd = ioq->efqd;
struct elevator_queue *e = container_of(efqd, struct elevator_queue,
efqd);
+ struct io_group *iog;
BUG_ON(atomic_read(&ioq->ref) <= 0);
if (!atomic_dec_and_test(&ioq->ref))
return;
+
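+	/* Remember the group; drop its reference once the ioq is freed */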
+ iog = ioq_to_io_group(ioq);
+
BUG_ON(ioq->nr_queued);
BUG_ON(ioq->entity.tree != NULL);
BUG_ON(elv_ioq_busy(ioq));
@@ -1299,10 +2057,11 @@ void elv_put_ioq(struct io_queue *ioq)
e->ops->elevator_free_sched_queue_fn(e, ioq->sched_queue);
elv_log_ioq(efqd, ioq, "put_queue");
elv_free_ioq(ioq);
+ elv_put_iog(iog);
}
EXPORT_SYMBOL(elv_put_ioq);
-void elv_release_ioq(struct elevator_queue *e, struct io_queue **ioq_ptr)
+static void elv_release_ioq(struct elevator_queue *e, struct io_queue **ioq_ptr)
{
struct io_queue *ioq = *ioq_ptr;
@@ -1321,6 +2080,7 @@ static void elv_activate_ioq(struct io_queue *ioq, int add_front)
static void elv_deactivate_ioq(struct elv_fq_data *efqd, struct io_queue *ioq,
int requeue)
{
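+	/* Don't requeue on the idle tree if the group is being deleted */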
+ requeue = update_requeue(ioq, requeue);
bfq_deactivate_entity(&ioq->entity, requeue);
}
@@ -1399,8 +2159,12 @@ static void __elv_set_active_ioq(struct elv_fq_data *efqd, struct io_queue *ioq,
struct request_queue *q = efqd->queue;
if (ioq) {
- elv_log_ioq(efqd, ioq, "set_active, busy=%d",
- efqd->busy_queues);
+ struct io_group *iog = ioq_to_io_group(ioq);
+ elv_log_ioq(efqd, ioq, "set_active, busy=%d ioprio=%d"
+ " weight=%u group_weight=%u",
+ efqd->busy_queues,
+ ioq->entity.ioprio, ioq->entity.weight,
+ iog_weight(iog));
ioq->slice_end = 0;
ioq->slice_start = jiffies;
@@ -1585,6 +2349,7 @@ static int elv_should_preempt(struct request_queue *q, struct io_queue *new_ioq,
struct io_queue *ioq;
struct elevator_queue *eq = q->elevator;
struct io_entity *entity, *new_entity;
+ struct io_group *iog = NULL, *new_iog = NULL;
ioq = elv_active_ioq(eq);
@@ -1595,6 +2360,13 @@ static int elv_should_preempt(struct request_queue *q, struct io_queue *new_ioq,
new_entity = &new_ioq->entity;
/*
+	 * In a hierarchical setup, one needs to traverse up the hierarchy
+	 * until both queues are children of the same parent to decide
+	 * whether to do the preemption or not.
+ */
+ bfq_find_matching_entity(&entity, &new_entity);
+
+ /*
* Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
*/
@@ -1610,9 +2382,17 @@ static int elv_should_preempt(struct request_queue *q, struct io_queue *new_ioq,
return 1;
/*
- * Check with io scheduler if it has additional criterion based on
- * which it wants to preempt existing queue.
+	 * If both queues belong to the same group, check with the io
+	 * scheduler whether it has an additional criterion based on which
+	 * it wants to preempt the existing queue.
*/
+ iog = ioq_to_io_group(ioq);
+ new_iog = ioq_to_io_group(new_ioq);
+
+ if (iog != new_iog)
+ return 0;
+
+
return eq->ops->elevator_should_preempt_fn(q,
ioq_sched_queue(new_ioq), rq);
@@ -1758,6 +2538,10 @@ static inline struct io_queue *elv_close_cooperator(struct request_queue *q,
if (q->elevator->ops->elevator_close_cooperator_fn)
new_ioq = e->ops->elevator_close_cooperator_fn(q, sched_queue);
+	/* Only select a co-operating queue from the same group as ioq */
+ if (new_ioq && !is_same_group(&ioq->entity, &new_ioq->entity))
+ return NULL;
+
if (new_ioq)
elv_log_ioq(&e->efqd, ioq, "cooperating ioq=%d", new_ioq->pid);
@@ -1939,15 +2723,6 @@ void elv_ioq_completed_request(struct request_queue *q, struct request *rq)
elv_schedule_dispatch(q);
}
-struct io_group *io_get_io_group(struct request_queue *q)
-{
- struct elv_fq_data *efqd = &q->elevator->efqd;
-
- /* In flat mode, there is only root group */
- return efqd->root_group;
-}
-EXPORT_SYMBOL(io_get_io_group);
-
void *io_group_async_queue_prio(struct io_group *iog, int ioprio_class,
int ioprio)
{
@@ -1998,53 +2773,6 @@ void io_group_set_async_queue(struct io_group *iog, int ioprio_class,
}
EXPORT_SYMBOL(io_group_set_async_queue);
-/*
- * Release all the io group references to its async queues.
- */
-static void
-io_put_io_group_queues(struct elevator_queue *e, struct io_group *iog)
-{
- int i, j;
-
- for (i = 0; i < 2; i++)
- for (j = 0; j < IOPRIO_BE_NR; j++)
- elv_release_ioq(e, &iog->async_queue[i][j]);
-
- /* Free up async idle queue */
- elv_release_ioq(e, &iog->async_idle_queue);
-}
-
-static struct io_group *io_alloc_root_group(struct request_queue *q,
- struct elevator_queue *e, void *key)
-{
- struct io_group *iog;
- int i;
-
- iog = kmalloc_node(sizeof(*iog), GFP_KERNEL | __GFP_ZERO, q->node);
- if (iog == NULL)
- return NULL;
-
- for (i = 0; i < IO_IOPRIO_CLASSES; i++)
- iog->sched_data.service_tree[i] = IO_SERVICE_TREE_INIT;
-
- return iog;
-}
-
-static void io_free_root_group(struct elevator_queue *e)
-{
- struct io_group *iog = e->efqd.root_group;
- struct io_service_tree *st;
- int i;
-
- for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
- st = iog->sched_data.service_tree + i;
- io_flush_idle_tree(st);
- }
-
- io_put_io_group_queues(e, iog);
- kfree(iog);
-}
-
static void elv_slab_kill(void)
{
/*
@@ -2099,6 +2827,7 @@ int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e)
efqd->idle_slice_timer.data = (unsigned long) efqd;
INIT_WORK(&efqd->unplug_work, elv_kick_queue);
+ INIT_HLIST_HEAD(&efqd->group_list);
efqd->elv_slice[0] = elv_slice_async;
efqd->elv_slice[1] = elv_slice_sync;
@@ -2116,12 +2845,23 @@ int elv_init_fq_data(struct request_queue *q, struct elevator_queue *e)
void elv_exit_fq_data(struct elevator_queue *e)
{
struct elv_fq_data *efqd = &e->efqd;
+ struct request_queue *q = efqd->queue;
if (!elv_iosched_fair_queuing_enabled(e))
return;
elv_shutdown_timer_wq(e);
+ spin_lock_irq(q->queue_lock);
+	/* This should drop the io group references held by the async queues */
+ io_disconnect_groups(e);
+ spin_unlock_irq(q->queue_lock);
+
+ elv_shutdown_timer_wq(e);
+
+ /* Wait for iog->key accessors to exit their grace periods. */
+ synchronize_rcu();
+
BUG_ON(timer_pending(&efqd->idle_slice_timer));
io_free_root_group(e);
}
@@ -181,19 +181,57 @@ struct io_queue {
};
#ifdef CONFIG_GROUP_IOSCHED
+/**
+ * struct io_group - per (device, cgroup) data structure.
+ * @entity: schedulable entity to insert into the parent group sched_data.
+ * @sched_data: own sched_data, to contain child entities (they may be
+ * both io_queues and io_groups).
+ * @group_node: node to be inserted into the io_cgroup->group_data
+ * list of the containing cgroup's io_cgroup.
+ * @elv_data_node: node to be inserted into the @efqd->group_list list
+ * of the groups active on the same device; used for cleanup.
+ * @async_queue: array of async queues for all the tasks belonging to
+ * the group, one queue per ioprio value per ioprio_class,
+ * except for the idle class that has only one queue.
+ * @async_idle_queue: async queue for the idle class (ioprio is ignored).
+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
+ * to avoid too many special cases during group creation/migration.
+ *
+ * Each (device, cgroup) pair has its own io_group, i.e., for each cgroup
+ * there is a set of io_groups, each one collecting the lower-level
+ * entities belonging to the group that are acting on the same device.
+ *
+ * Locking works as follows:
+ * o @group_node is protected by the io_cgroup lock, and is accessed
+ * via RCU from its readers.
+ *    o @key (the efqd pointer) is protected by the queue lock, RCU is
+ *      used to access it from the readers.
+ * o All the other fields are protected by the @efqd queue lock.
+ */
struct io_group {
struct io_entity entity;
+ struct hlist_node elv_data_node;
struct hlist_node group_node;
struct io_sched_data sched_data;
+ atomic_t ref;
struct io_entity *my_entity;
/*
+ * A cgroup has multiple io_groups, one for each request queue.
+	 * To find the io group belonging to a particular queue, the
+	 * elv_fq_data pointer is stored as the key.
+ */
+ void *key;
+
+ /*
* async queue for each priority case for RT and BE class.
* Used only for cfq.
*/
struct io_queue *async_queue[2][IOPRIO_BE_NR];
struct io_queue *async_idle_queue;
+ struct rcu_head rcu_head;
+ int deleting;
unsigned short iocg_id;
};
@@ -231,6 +269,9 @@ struct io_group {
struct elv_fq_data {
struct io_group *root_group;
+ /* List of io groups hanging on this elevator */
+ struct hlist_head group_list;
+
struct request_queue *queue;
unsigned int busy_queues;
@@ -366,7 +407,7 @@ static inline void elv_ioq_set_ioprio_class(struct io_queue *ioq,
static inline unsigned int bfq_ioprio_to_weight(int ioprio)
{
WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
- return IOPRIO_BE_NR - ioprio;
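+	/*
+	 * Map ioprio [0..IOPRIO_BE_NR-1] linearly onto (0..WEIGHT_MAX]:
+	 * with IOPRIO_BE_NR == 8, ioprio 0 gets WEIGHT_MAX and ioprio 7
+	 * gets WEIGHT_MAX/8.
+	 */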
+ return ((IOPRIO_BE_NR - ioprio) * WEIGHT_MAX)/IOPRIO_BE_NR;
}
static inline void elv_ioq_set_ioprio(struct io_queue *ioq, int ioprio)
@@ -389,6 +430,46 @@ static inline struct io_group *ioq_to_io_group(struct io_queue *ioq)
sched_data);
}
+#ifdef CONFIG_GROUP_IOSCHED
+extern int io_group_allow_merge(struct request *rq, struct bio *bio);
+extern void elv_put_iog(struct io_group *iog);
+
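+/* Take a reference on the io group */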
+static inline void elv_get_iog(struct io_group *iog)
+{
+ atomic_inc(&iog->ref);
+}
+
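+/*
+ * If the queue's io group is being deleted, don't requeue the queue on
+ * the idle tree, so that the group can eventually be freed.
+ */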
+static inline int update_requeue(struct io_queue *ioq, int requeue)
+{
+ struct io_group *iog = ioq_to_io_group(ioq);
+
+ if (iog->deleting == 1)
+ return 0;
+
+ return requeue;
+}
+
+#else /* !GROUP_IOSCHED */
+static inline int io_group_allow_merge(struct request *rq, struct bio *bio)
+{
+ return 1;
+}
+
+static inline void elv_get_iog(struct io_group *iog)
+{
+}
+
+static inline void elv_put_iog(struct io_group *iog)
+{
+}
+
+static inline int update_requeue(struct io_queue *ioq, int requeue)
+{
+ return requeue;
+}
+
+#endif /* GROUP_IOSCHED */
+
extern ssize_t elv_slice_sync_show(struct elevator_queue *q, char *name);
extern ssize_t elv_slice_sync_store(struct elevator_queue *q, const char *name,
size_t count);
@@ -437,7 +518,7 @@ extern void *io_group_async_queue_prio(struct io_group *iog, int ioprio_class,
int ioprio);
extern void io_group_set_async_queue(struct io_group *iog, int ioprio_class,
int ioprio, struct io_queue *ioq);
-extern struct io_group *io_get_io_group(struct request_queue *q);
+extern struct io_group *io_get_io_group(struct request_queue *q, int create);
extern int elv_nr_busy_ioq(struct elevator_queue *e);
extern int elv_rq_in_driver(struct elevator_queue *e);
extern struct io_queue *elv_alloc_ioq(struct request_queue *q, gfp_t gfp_mask);
@@ -489,5 +570,11 @@ static inline void *elv_fq_select_ioq(struct request_queue *q, int force)
{
return NULL;
}
+
+static inline int io_group_allow_merge(struct request *rq, struct bio *bio)
+
+{
+ return 1;
+}
#endif /* CONFIG_ELV_FAIR_QUEUING */
#endif /* _BFQ_SCHED_H */
@@ -113,6 +113,10 @@ int elv_rq_merge_ok(struct request *rq, struct bio *bio)
!bio_failfast_driver(bio) != !blk_failfast_driver(rq))
return 0;
+	/* If rq and bio belong to different groups, don't allow merging */
+ if (!io_group_allow_merge(rq, bio))
+ return 0;
+
if (!elv_iosched_allow_merge(rq, bio))
return 0;