@@ -51,6 +51,7 @@
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-throttle.h"
+#include "blk-rq-qos.h"
struct dentry *blk_debugfs_root;
@@ -377,6 +378,7 @@ void blk_cleanup_queue(struct request_queue *q)
* it is safe to free requests now.
*/
mutex_lock(&q->sysfs_lock);
+ rq_qos_exit(q);
if (q->elevator)
blk_mq_sched_free_rqs(q);
mutex_unlock(&q->sysfs_lock);
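Note: rq_qos_exit() moves here from del_gendisk() (see the genhd.c hunks
below), so all policies are torn down under sysfs_lock, serialized against
the qos switching path, before the elevator's requests are freed.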
@@ -662,7 +662,7 @@ static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
static struct ioc *q_to_ioc(struct request_queue *q)
{
- return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
+ return rqos_to_ioc(rq_qos_by_id(q, RQ_QOS_COST));
}
static const char *q_name(struct request_queue *q)
@@ -3162,6 +3162,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
size_t nbytes, loff_t off)
{
struct block_device *bdev;
+ struct rq_qos *rqos;
struct ioc *ioc;
u32 qos[NR_QOS_PARAMS];
bool enable, user;
@@ -3172,14 +3173,15 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
if (IS_ERR(bdev))
return PTR_ERR(bdev);
- ioc = q_to_ioc(bdev_get_queue(bdev));
- if (!ioc) {
+ rqos = rq_qos_get(bdev_get_queue(bdev), RQ_QOS_COST);
+ if (!rqos) {
ret = blk_iocost_init(bdev_get_queue(bdev));
if (ret)
goto err;
- ioc = q_to_ioc(bdev_get_queue(bdev));
+ rqos = rq_qos_get(bdev_get_queue(bdev), RQ_QOS_COST);
+ /* lost a race with concurrent policy removal */
+ if (!rqos) {
+ ret = -ENODEV;
+ goto err;
+ }
}
+ ioc = rqos_to_ioc(rqos);
spin_lock_irq(&ioc->lock);
memcpy(qos, ioc->params.qos, sizeof(qos));
enable = ioc->enabled;
@@ -3272,10 +3274,12 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
ioc_refresh_params(ioc, true);
spin_unlock_irq(&ioc->lock);
+ rq_qos_put(rqos);
blkdev_put_no_open(bdev);
return nbytes;
einval:
ret = -EINVAL;
+ rq_qos_put(rqos);
err:
blkdev_put_no_open(bdev);
return ret;
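Both iocost handlers switch from a bare q_to_ioc() lookup to rq_qos_get():
the cgroup file paths hold neither sysfs_lock nor queue_lock for the whole
operation, so the rqos must be pinned or a concurrent policy removal could
free it underneath them. Condensed, the flow both writers now follow
(ioc_cost_model_write below has the same shape; error paths trimmed):

    rqos = rq_qos_get(q, RQ_QOS_COST);  /* ref+1; NULL if absent or dying */
    if (!rqos) {
        ret = blk_iocost_init(q);       /* attach the policy */
        if (ret)
            goto err;
        rqos = rq_qos_get(q, RQ_QOS_COST);
    }
    ioc = rqos_to_ioc(rqos);
    /* ... read/update ioc under ioc->lock ... */
    rq_qos_put(rqos);                   /* unpin; may wake rq_qos_deactivate() */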
@@ -3329,6 +3333,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
size_t nbytes, loff_t off)
{
struct block_device *bdev;
+ struct rq_qos *rqos;
struct ioc *ioc;
u64 u[NR_I_LCOEFS];
bool user;
@@ -3339,14 +3344,15 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
if (IS_ERR(bdev))
return PTR_ERR(bdev);
- ioc = q_to_ioc(bdev_get_queue(bdev));
+ rqos = rq_qos_get(bdev_get_queue(bdev), RQ_QOS_COST);
- if (!ioc) {
+ if (!rqos) {
ret = blk_iocost_init(bdev_get_queue(bdev));
if (ret)
goto err;
- ioc = q_to_ioc(bdev_get_queue(bdev));
+ rqos = rq_qos_get(bdev_get_queue(bdev), RQ_QOS_COST);
+ /* lost a race with concurrent policy removal */
+ if (!rqos) {
+ ret = -ENODEV;
+ goto err;
+ }
}
+ ioc = rqos_to_ioc(rqos);
spin_lock_irq(&ioc->lock);
memcpy(u, ioc->params.i_lcoefs, sizeof(u));
user = ioc->user_cost_model;
@@ -3397,11 +3403,13 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
ioc_refresh_params(ioc, true);
spin_unlock_irq(&ioc->lock);
+ rq_qos_put(rqos);
blkdev_put_no_open(bdev);
return nbytes;
einval:
ret = -EINVAL;
+ rq_qos_put(rqos);
err:
blkdev_put_no_open(bdev);
return ret;
@@ -841,7 +841,9 @@ void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
{
struct request_queue *q = rqos->q;
- const char *dir_name = rq_qos_id_to_name(rqos->id);
+ const char *dir_name;
+
+ dir_name = rqos->ops->name ? rqos->ops->name : rq_qos_id_to_name(rqos->id);
if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs)
return;
@@ -2,6 +2,11 @@
#include "blk-rq-qos.h"
+static DEFINE_IDA(rq_qos_ida);
+static int nr_rqos_blkcg_pols;
+static DEFINE_MUTEX(rq_qos_mutex);
+static LIST_HEAD(rq_qos_list);
+
/*
* Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
* false if 'v' + 1 would be bigger than 'below'.
@@ -294,11 +299,316 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
void rq_qos_exit(struct request_queue *q)
{
- blk_mq_debugfs_unregister_queue_rqos(q);
+ WARN_ON(!mutex_is_locked(&q->sysfs_lock));
while (q->rq_qos) {
struct rq_qos *rqos = q->rq_qos;
q->rq_qos = rqos->next;
+ if (rqos->ops->owner)
+ module_put(rqos->ops->owner);
rqos->ops->exit(rqos);
}
+ blk_mq_debugfs_unregister_queue_rqos(q);
+}
+
+/*
+ * With pluggable blk-rq-qos, the lifecycle of an rqos becomes more
+ * involved: the qos switching path can add and delete an rqos to/from a
+ * request_queue at runtime, under sysfs_lock and queue_lock. The
+ * following places may access an rqos through rq_qos_by_id()
+ * concurrently:
+ * (1) the normal IO path, under q_usage_counter,
+ * (2) the queue sysfs interfaces, under sysfs_lock,
+ * (3) blkg_create, whose .pd_init_fn() may access the rqos, under
+ *     queue_lock,
+ * (4) cgroup files, such as ioc_qos_write and ioc_cost_model_write.
+ *
+ * (1), (2) and (3) are safe as they stand. Case (4) is the tricky one:
+ * rq_qos_get() and rq_qos_put() exist to pin the rqos across it.
+ */
+struct rq_qos *rq_qos_get(struct request_queue *q, int id)
+{
+ struct rq_qos *rqos;
+
+ spin_lock_irq(&q->queue_lock);
+ rqos = rq_qos_by_id(q, id);
+ if (rqos && rqos->dying)
+ rqos = NULL;
+ if (rqos)
+ refcount_inc(&rqos->ref);
+ spin_unlock_irq(&q->queue_lock);
+ return rqos;
+}
+EXPORT_SYMBOL_GPL(rq_qos_get);
+
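+/*
+ * Drop a reference taken with rq_qos_get(). The last external put wakes
+ * rq_qos_deactivate() if it is waiting for the policy to go idle.
+ */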
+void rq_qos_put(struct rq_qos *rqos)
+{
+ struct request_queue *q = rqos->q;
+
+ spin_lock_irq(&q->queue_lock);
+ refcount_dec(&rqos->ref);
+ if (rqos->dying)
+ wake_up(&rqos->waitq);
+ spin_unlock_irq(&q->queue_lock);
+}
+EXPORT_SYMBOL_GPL(rq_qos_put);
+
+void rq_qos_activate(struct request_queue *q,
+ struct rq_qos *rqos, const struct rq_qos_ops *ops)
+{
+ struct rq_qos *pos;
+ bool rq_alloc_time = false;
+
+ WARN_ON(!mutex_is_locked(&q->sysfs_lock));
+
+ rqos->dying = false;
+ refcount_set(&rqos->ref, 1);
+ init_waitqueue_head(&rqos->waitq);
+ rqos->id = ops->id;
+ rqos->ops = ops;
+ rqos->q = q;
+ rqos->next = NULL;
+
+ spin_lock_irq(&q->queue_lock);
+ pos = q->rq_qos;
+ if (pos) {
+ while (pos->next) {
+ if (pos->ops->flags & RQOS_FLAG_RQ_ALLOC_TIME)
+ rq_alloc_time = true;
+ pos = pos->next;
+ }
+ pos->next = rqos;
+ } else {
+ q->rq_qos = rqos;
+ }
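+ /*
+ * Enable alloc-time stamping if this policy needs it and no
+ * already-attached policy does; re-setting the flag would be harmless.
+ */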
+ if (ops->flags & RQOS_FLAG_RQ_ALLOC_TIME &&
+ !rq_alloc_time)
+ blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, q);
+
+ spin_unlock_irq(&q->queue_lock);
+
+ if (rqos->ops->debugfs_attrs)
+ blk_mq_debugfs_register_rqos(rqos);
+}
+EXPORT_SYMBOL_GPL(rq_qos_activate);
+
+void rq_qos_deactivate(struct rq_qos *rqos)
+{
+ struct request_queue *q = rqos->q;
+ struct rq_qos **cur, *pos;
+ bool rq_alloc_time = false;
+
+ WARN_ON(!mutex_is_locked(&q->sysfs_lock));
+
+ spin_lock_irq(&q->queue_lock);
+ rqos->dying = true;
+ /*
+ * Drain every reference taken through rq_qos_get(); only the initial
+ * reference from rq_qos_activate() may remain. rq_qos_put() wakes us
+ * once the count is back down to 1.
+ */
+ wait_event_lock_irq(rqos->waitq,
+ refcount_read(&rqos->ref) == 1, q->queue_lock);
+ for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
+ if (*cur == rqos) {
+ *cur = rqos->next;
+ break;
+ }
+ }
+
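+ /* Recheck whether any remaining policy still needs alloc-time stamping */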
+ for (pos = q->rq_qos; pos; pos = pos->next) {
+ if (pos->ops->flags & RQOS_FLAG_RQ_ALLOC_TIME)
+ rq_alloc_time = true;
+ }
+
+ if (rqos->ops->flags & RQOS_FLAG_RQ_ALLOC_TIME &&
+ !rq_alloc_time)
+ blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, q);
+
+ spin_unlock_irq(&q->queue_lock);
+ blk_mq_debugfs_unregister_rqos(rqos);
+}
+EXPORT_SYMBOL_GPL(rq_qos_deactivate);
+
+static struct rq_qos_ops *rq_qos_find_by_name(const char *name)
+{
+ struct rq_qos_ops *pos;
+
+ list_for_each_entry(pos, &rq_qos_list, node) {
+ if (!strcmp(pos->name, name))
+ return pos;
+ }
+
+ return NULL;
+}
+
+int rq_qos_register(struct rq_qos_ops *ops)
+{
+ int ret, start;
+
+ mutex_lock(&rq_qos_mutex);
+
+ if (rq_qos_find_by_name(ops->name)) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ if (ops->flags & RQOS_FLAG_CGRP_POL &&
+ nr_rqos_blkcg_pols >= (BLKCG_MAX_POLS - BLKCG_NON_RQOS_POLS)) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ start = RQ_QOS_IOPRIO + 1;
+ ret = ida_simple_get(&rq_qos_ida, start, INT_MAX, GFP_KERNEL);
+ if (ret < 0)
+ goto out;
+
+ if (ops->flags & RQOS_FLAG_CGRP_POL)
+ nr_rqos_blkcg_pols++;
+
+ ops->id = ret;
+ ret = 0;
+ INIT_LIST_HEAD(&ops->node);
+ list_add_tail(&ops->node, &rq_qos_list);
+out:
+ mutex_unlock(&rq_qos_mutex);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rq_qos_register);
+
+void rq_qos_unregister(struct rq_qos_ops *ops)
+{
+ mutex_lock(&rq_qos_mutex);
+
+ if (ops->flags & RQOS_FLAG_CGRP_POL)
+ nr_rqos_blkcg_pols--;
+ list_del_init(&ops->node);
+ ida_simple_remove(&rq_qos_ida, ops->id);
+ mutex_unlock(&rq_qos_mutex);
+}
+EXPORT_SYMBOL_GPL(rq_qos_unregister);
+
+ssize_t queue_qos_show(struct request_queue *q, char *buf)
+{
+ struct rq_qos_ops *ops;
+ struct rq_qos *rqos;
+ int ret = 0;
+
+ mutex_lock(&rq_qos_mutex);
+ /*
+ * Show active policies bracketed, in the order they are invoked,
+ * then the registered but inactive ones.
+ */
+ for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+ if (!rqos->ops->name)
+ continue;
+ ret += sprintf(buf + ret, "[%s] ", rqos->ops->name);
+ }
+ list_for_each_entry(ops, &rq_qos_list, node) {
+ if (!rq_qos_by_name(q, ops->name))
+ ret += sprintf(buf + ret, "%s ", ops->name);
+ }
+
+ if (ret > 0)
+ ret--; /* overwrite the trailing space */
+ ret += sprintf(buf + ret, "\n");
+ mutex_unlock(&rq_qos_mutex);
+
+ return ret;
+}
+
+int rq_qos_switch(struct request_queue *q,
+ const struct rq_qos_ops *ops,
+ struct rq_qos *rqos)
+{
+ int ret;
+
+ WARN_ON(!mutex_is_locked(&q->sysfs_lock));
+
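+ /* Freeze the queue so no IO is in flight while the policy list changes */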
+ blk_mq_freeze_queue(q);
+ if (!rqos) {
+ ret = ops->init(q);
+ } else {
+ ops->exit(rqos);
+ ret = 0;
+ }
+ blk_mq_unfreeze_queue(q);
+
+ return ret;
+}
+
+ssize_t queue_qos_store(struct request_queue *q, const char *page,
+ size_t count)
+{
+ const struct rq_qos_ops *ops;
+ struct rq_qos *rqos;
+ const char *qosname;
+ char *buf, *qos;
+ bool add;
+ int ret;
+
+ buf = kstrdup(page, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ /* strim() may skip leading spaces; keep buf intact for kfree() */
+ qos = strim(buf);
+ if (qos[0] != '+' && qos[0] != '-') {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ add = qos[0] == '+';
+ qosname = qos + 1;
+
+ rqos = rq_qos_by_name(q, qosname);
+ if (add && rqos) {
+ ret = -EEXIST;
+ goto out;
+ }
+
+ if (!add && !rqos) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ mutex_lock(&rq_qos_mutex);
+ if (add) {
+ ops = rq_qos_find_by_name(qosname);
+ if (!ops) {
+ /*
+ * The module's init callback may take rq_qos_mutex via
+ * rq_qos_register(), so drop it across request_module().
+ */
+ mutex_unlock(&rq_qos_mutex);
+ request_module("%s", qosname);
+ mutex_lock(&rq_qos_mutex);
+ ops = rq_qos_find_by_name(qosname);
+ }
+ } else {
+ ops = rqos->ops;
+ }
+
+ if (!ops) {
+ ret = -EINVAL;
+ } else if (ops->owner && !try_module_get(ops->owner)) {
+ ops = NULL;
+ ret = -EAGAIN;
+ }
+ mutex_unlock(&rq_qos_mutex);
+
+ if (!ops)
+ goto out;
+
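+ /*
+ * try_module_get() above pinned ops->owner for the duration of the
+ * switch. On a successful add, __module_get() takes the long-term
+ * reference that is held while the policy stays attached (dropped in
+ * rq_qos_exit() or in the removal branch here); the final module_put()
+ * below releases the temporary pin.
+ */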
+ if (add) {
+ ret = rq_qos_switch(q, ops, NULL);
+ if (!ret && ops->owner)
+ __module_get(ops->owner);
+ } else {
+ rq_qos_switch(q, ops, rqos);
+ ret = 0;
+ if (ops->owner)
+ module_put(ops->owner);
+ }
+
+ if (ops->owner)
+ module_put(ops->owner);
+out:
+ kfree(buf);
+ return ret ? ret : count;
}
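Taken together, rq_qos_register(), rq_qos_activate(), rq_qos_deactivate()
and rq_qos_unregister() give a policy module its full lifecycle. A minimal
sketch of such a module against the interfaces above (the sample_* names
are hypothetical; a real policy would also fill in throttle/track/etc.
callbacks):

    #include <linux/module.h>
    #include <linux/slab.h>
    #include "blk-rq-qos.h"

    struct sample_rqos {
        struct rq_qos rqos;
        /* policy-private state goes here */
    };

    static struct rq_qos_ops sample_ops;

    /* called by rq_qos_switch() on "echo +sample > .../queue/qos" */
    static int sample_init(struct request_queue *q)
    {
        struct sample_rqos *sample;

        sample = kzalloc(sizeof(*sample), GFP_KERNEL);
        if (!sample)
            return -ENOMEM;

        /* link into q->rq_qos and register debugfs entries */
        rq_qos_activate(q, &sample->rqos, &sample_ops);
        return 0;
    }

    /* called on "echo -sample" via rq_qos_switch(), or from rq_qos_exit() */
    static void sample_exit(struct rq_qos *rqos)
    {
        struct sample_rqos *sample =
            container_of(rqos, struct sample_rqos, rqos);

        /* drains rq_qos_get() references, then unlinks from q->rq_qos */
        rq_qos_deactivate(rqos);
        kfree(sample);
    }

    static struct rq_qos_ops sample_ops = {
        .name  = "sample",
        .owner = THIS_MODULE,
        .init  = sample_init,
        .exit  = sample_exit,
    };

    static int __init sample_mod_init(void)
    {
        return rq_qos_register(&sample_ops);
    }

    static void __exit sample_mod_exit(void)
    {
        rq_qos_unregister(&sample_ops);
    }

    module_init(sample_mod_init);
    module_exit(sample_mod_exit);
    MODULE_LICENSE("GPL");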
@@ -26,7 +26,10 @@ struct rq_wait {
};
struct rq_qos {
- struct rq_qos_ops *ops;
+ refcount_t ref;
+ wait_queue_head_t waitq;
+ bool dying;
+ const struct rq_qos_ops *ops;
struct request_queue *q;
enum rq_qos_id id;
struct rq_qos *next;
@@ -35,7 +38,17 @@ struct rq_qos {
#endif
};
+enum {
+ RQOS_FLAG_CGRP_POL = 1 << 0,
+ RQOS_FLAG_RQ_ALLOC_TIME = 1 << 1
+};
+
struct rq_qos_ops {
+ struct list_head node;
+ struct module *owner;
+ const char *name;
+ int flags;
+ int id;
void (*throttle)(struct rq_qos *, struct bio *);
void (*track)(struct rq_qos *, struct request *, struct bio *);
void (*merge)(struct rq_qos *, struct request *, struct bio *);
@@ -46,6 +59,7 @@ struct rq_qos_ops {
void (*cleanup)(struct rq_qos *, struct bio *);
void (*queue_depth_changed)(struct rq_qos *);
void (*exit)(struct rq_qos *);
+ int (*init)(struct request_queue *);
const struct blk_mq_debugfs_attr *debugfs_attrs;
};
@@ -59,10 +73,12 @@ struct rq_depth {
unsigned int default_depth;
};
-static inline struct rq_qos *rq_qos_id(struct request_queue *q,
- enum rq_qos_id id)
+static inline struct rq_qos *rq_qos_by_id(struct request_queue *q, int id)
{
struct rq_qos *rqos;
+
+ WARN_ON(!mutex_is_locked(&q->sysfs_lock) && !spin_is_locked(&q->queue_lock));
+
for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
if (rqos->id == id)
break;
@@ -72,12 +88,12 @@ static inline struct rq_qos *rq_qos_id(struct request_queue *q,
static inline struct rq_qos *wbt_rq_qos(struct request_queue *q)
{
- return rq_qos_id(q, RQ_QOS_WBT);
+ return rq_qos_by_id(q, RQ_QOS_WBT);
}
static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
{
- return rq_qos_id(q, RQ_QOS_LATENCY);
+ return rq_qos_by_id(q, RQ_QOS_LATENCY);
}
static inline void rq_wait_init(struct rq_wait *rq_wait)
@@ -132,6 +148,35 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
blk_mq_debugfs_unregister_rqos(rqos);
}
+int rq_qos_register(struct rq_qos_ops *ops);
+void rq_qos_unregister(struct rq_qos_ops *ops);
+void rq_qos_activate(struct request_queue *q,
+ struct rq_qos *rqos, const struct rq_qos_ops *ops);
+void rq_qos_deactivate(struct rq_qos *rqos);
+ssize_t queue_qos_show(struct request_queue *q, char *buf);
+ssize_t queue_qos_store(struct request_queue *q, const char *page,
+ size_t count);
+struct rq_qos *rq_qos_get(struct request_queue *q, int id);
+void rq_qos_put(struct rq_qos *rqos);
+
+static inline struct rq_qos *rq_qos_by_name(struct request_queue *q,
+ const char *name)
+{
+ struct rq_qos *rqos;
+
+ WARN_ON(!mutex_is_locked(&q->sysfs_lock));
+
+ for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+ if (!rqos->ops->name)
+ continue;
+
+ if (!strcmp(rqos->ops->name, name))
+ return rqos;
+ }
+ return NULL;
+}
+
typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data);
@@ -573,6 +573,7 @@ QUEUE_RO_ENTRY(queue_max_segments, "max_segments");
QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size");
QUEUE_RW_ENTRY(elv_iosched, "scheduler");
+QUEUE_RW_ENTRY(queue_qos, "qos");
QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size");
QUEUE_RO_ENTRY(queue_physical_block_size, "physical_block_size");
@@ -632,6 +633,7 @@ static struct attribute *queue_attrs[] = {
&queue_max_integrity_segments_entry.attr,
&queue_max_segment_size_entry.attr,
&elv_iosched_entry.attr,
+ &queue_qos_entry.attr,
&queue_hw_sector_size_entry.attr,
&queue_logical_block_size_entry.attr,
&queue_physical_block_size_entry.attr,
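The new "qos" attribute is the user-facing switch: writing "+<policy>"
attaches a policy (loading its module on demand through request_module())
and "-<policy>" detaches it, while reading lists the active policies
bracketed, in invocation order, followed by the registered but inactive
ones; see queue_qos_show()/queue_qos_store() above.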
@@ -628,9 +628,13 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
{
- struct rq_qos *rqos = wbt_rq_qos(q);
+ struct rq_qos *rqos;
+
+ spin_lock_irq(&q->queue_lock);
+ rqos = wbt_rq_qos(q);
if (rqos)
RQWB(rqos)->wc = write_cache_on;
+ spin_unlock_irq(&q->queue_lock);
}
/*
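wbt_set_write_cache() needs the new queue_lock pair because wbt_rq_qos()
resolves the rqos through rq_qos_by_id(), which asserts that sysfs_lock
or queue_lock is held now that the qos list can mutate at runtime.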
@@ -701,12 +701,15 @@ void elevator_init_mq(struct request_queue *q)
* requests, then no need to quiesce queue which may add long boot
* latency, especially when lots of disks are involved.
*/
+
+ mutex_lock(&q->sysfs_lock);
blk_mq_freeze_queue(q);
blk_mq_cancel_work_sync(q);
err = blk_mq_init_sched(q, e);
blk_mq_unfreeze_queue(q);
+ mutex_unlock(&q->sysfs_lock);
if (err) {
pr_warn("\"%s\" elevator initialization failed, "
@@ -27,7 +27,6 @@
#include <linux/badblocks.h>
#include "blk.h"
-#include "blk-rq-qos.h"
static struct kobject *block_depr;
@@ -621,8 +620,6 @@ void del_gendisk(struct gendisk *disk)
device_del(disk_to_dev(disk));
blk_mq_freeze_queue_wait(q);
-
- rq_qos_exit(q);
blk_sync_queue(q);
blk_flush_integrity();
/*
@@ -43,6 +43,10 @@ struct blk_crypto_profile;
* Defined here to simplify include dependency.
*/
#define BLKCG_MAX_POLS 6
+/*
+ * Number of blkcg policies that are not based on blk-rq-qos,
+ * i.e. blk-throttle and bfq
+ */
+#define BLKCG_NON_RQOS_POLS 2
static inline int blk_validate_block_size(unsigned int bsize)
{