
[RFC,V4,1/6] blk: prepare to make blk-rq-qos pluggable and modular

Message ID 20220217031349.98561-2-jianchao.wan9@gmail.com (mailing list archive)
State New, archived
Series blk: make blk-rq-qos policies pluggable and modular

Commit Message

Wang Jianchao Feb. 17, 2022, 3:13 a.m. UTC
blk-rq-qos is a standalone framework, separate from the io-schedulers, which
can be used to control or observe IO progress in the block layer through
hooks. blk-rq-qos is a great design, but right now it is completely fixed and
built-in, which shuts out people who want to use it from an external module.

This patch makes blk-rq-qos policies pluggable and modular.
(1) Add code to maintain the rq_qos_ops. An rq-qos module needs to
    register itself with rq_qos_register(). The original enum
    rq_qos_id will be removed in a following patch. Policies will use
    a dynamic id maintained by rq_qos_ida instead.
(2) Add an .init callback to rq_qos_ops. It is used to initialize the
    policy's per-queue resources.
(3) Add /sys/block/x/queue/qos
    We can use '+name' or "-name" to open or close the blk-rq-qos
    policy.

This patch mainly prepares helper interfaces and makes no functional changes.
Following patches will adapt the code of wbt, iolatency, iocost and ioprio to
make them pluggable and modular one by one. After that, the
/sys/block/xxx/queue/qos interface will be exported.
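
For illustration only (not part of the patch itself), a rough sketch of how an
out-of-tree policy could plug into these interfaces; the "qos_demo" module and
all of its names are hypothetical, and only the .init/.exit callbacks are shown:

#include <linux/module.h>
#include <linux/slab.h>
#include "blk-rq-qos.h"

struct qos_demo {
	struct rq_qos rqos;
	/* policy-specific state would live here */
};

static struct rq_qos_ops qos_demo_ops;

static int qos_demo_init(struct request_queue *q)
{
	struct qos_demo *demo;

	demo = kzalloc(sizeof(*demo), GFP_KERNEL);
	if (!demo)
		return -ENOMEM;

	/* Hook the instance into q->rq_qos; this also takes a module reference. */
	rq_qos_activate(q, &demo->rqos, &qos_demo_ops);
	return 0;
}

static void qos_demo_exit(struct rq_qos *rqos)
{
	struct qos_demo *demo = container_of(rqos, struct qos_demo, rqos);

	/* Unhook from q->rq_qos and drop the module reference. */
	rq_qos_deactivate(rqos);
	kfree(demo);
}

static struct rq_qos_ops qos_demo_ops = {
	.name	= "qos_demo",
	.owner	= THIS_MODULE,
	.init	= qos_demo_init,
	.exit	= qos_demo_exit,
};

static int __init qos_demo_mod_init(void)
{
	return rq_qos_register(&qos_demo_ops);
}

static void __exit qos_demo_mod_exit(void)
{
	rq_qos_unregister(&qos_demo_ops);
}

module_init(qos_demo_mod_init);
module_exit(qos_demo_mod_exit);
MODULE_LICENSE("GPL");

With such a module registered, writing '+qos_demo' or '-qos_demo' to
/sys/block/<dev>/queue/qos would switch the policy on or off for that queue.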

Signed-off-by: Wang Jianchao (Kuaishou) <jianchao.wan9@gmail.com>
---
 block/blk-mq-debugfs.c |   9 +-
 block/blk-rq-qos.c     | 301 ++++++++++++++++++++++++++++++++++++++++-
 block/blk-rq-qos.h     |  39 +++++-
 include/linux/blkdev.h |   4 +
 4 files changed, 348 insertions(+), 5 deletions(-)

Comments

Christoph Hellwig Feb. 17, 2022, 8:48 a.m. UTC | #1
>  {
>  	struct request_queue *q = rqos->q;
> -	const char *dir_name = rq_qos_id_to_name(rqos->id);
> +	const char *dir_name;
> +
> +	dir_name = rqos->ops->name ? rqos->ops->name : rq_qos_id_to_name(rqos->id);

Overly long line here.  And it would be much more readable if you used
a good old if/else.
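
For reference, the if/else form being suggested would read:

	if (rqos->ops->name)
		dir_name = rqos->ops->name;
	else
		dir_name = rq_qos_id_to_name(rqos->id);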

> +static DEFINE_IDA(rq_qos_ida);
> +static int nr_rqos_blkcg_pols;
> +static DEFINE_MUTEX(rq_qos_mutex);
> +static LIST_HEAD(rq_qos_list);

Please use an allocating xarray instead of an IDA plus list.
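
For reference, a rough sketch of that direction (not part of the posted patch),
with an allocating xarray keyed by the policy id replacing the IDA plus list:

#include <linux/xarray.h>

static DEFINE_XARRAY_ALLOC(rq_qos_xa);

int rq_qos_register(struct rq_qos_ops *ops)
{
	u32 id;
	int ret;

	/* Allocate an id and store ops under it in one step. */
	ret = xa_alloc(&rq_qos_xa, &id, ops,
		       XA_LIMIT(RQ_QOS_IOPRIO + 1, INT_MAX), GFP_KERNEL);
	if (!ret)
		ops->id = id;
	return ret;
}

static struct rq_qos_ops *rq_qos_op_find(const char *name)
{
	struct rq_qos_ops *ops;
	unsigned long index;

	/* Lookup by name iterates the registered entries. */
	xa_for_each(&rq_qos_xa, index, ops) {
		if (!strcmp(ops->name, name))
			return ops;
	}
	return NULL;
}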

> +	/*
> +	 * queue must have been unregistered here, it is safe to iterate
> +	 * the list w/o lock
> +	 */

Please capitalize multi-line comments.

> + * After the pluggable blk-qos, rqos's life cycle become complicated,
> + * as we may modify the rqos list there. Except for the places where
> + * queue is not registered, there are following places may access rqos
> + * list concurrently:

Code comments are not the place to explain history.  Please explain the
current situation.

> +struct rq_qos *rq_qos_get(struct request_queue *q, int id)
> +{
> +	struct rq_qos *rqos;
> +
> +	spin_lock_irq(&q->queue_lock);

Please don't use the grab all queue_lock for new code.  It badly needs
to be split and documented, and new code is the best place to start
that.

Also with all the new code please add a new config option that is
selected by all rq-qos implementations so that blk-rq-qos.c only gets
built when actually needed.

> +static inline struct rq_qos *rq_qos_by_id(struct request_queue *q, int id)
> +{
> +	struct rq_qos *rqos;
> +
> +	WARN_ON(!mutex_is_locked(&q->sysfs_lock) && !spin_is_locked(&q->queue_lock));

Another overly long line.  And if in doubt, split this into two helpers
so that you can use lockdep_assert_held instead of doing the incorrect
asserts.
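
A sketch of the split being asked for (again, not part of the posted patch),
with one helper per lock so that lockdep_assert_held() can be used:

static inline struct rq_qos *__rq_qos_by_id(struct request_queue *q, int id)
{
	struct rq_qos *rqos;

	for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
		if (rqos->id == id)
			break;
	}
	return rqos;
}

/* For callers holding q->queue_lock */
static inline struct rq_qos *rq_qos_by_id_locked(struct request_queue *q, int id)
{
	lockdep_assert_held(&q->queue_lock);
	return __rq_qos_by_id(q, id);
}

/* For callers holding q->sysfs_lock */
static inline struct rq_qos *rq_qos_by_id_sysfs(struct request_queue *q, int id)
{
	lockdep_assert_held(&q->sysfs_lock);
	return __rq_qos_by_id(q, id);
}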
Wang Jianchao Feb. 18, 2022, 3:34 a.m. UTC | #2
On 2022/2/17 4:48 PM, Christoph Hellwig wrote:
>>  {
>>  	struct request_queue *q = rqos->q;
>> -	const char *dir_name = rq_qos_id_to_name(rqos->id);
>> +	const char *dir_name;
>> +
>> +	dir_name = rqos->ops->name ? rqos->ops->name : rq_qos_id_to_name(rqos->id);
> 
> Overly long line here.  And it would be much more readable if you used
> a good old if/else.
> 
>> +static DEFINE_IDA(rq_qos_ida);
>> +static int nr_rqos_blkcg_pols;
>> +static DEFINE_MUTEX(rq_qos_mutex);
>> +static LIST_HEAD(rq_qos_list);
> 
> Please use an allocating xarray instead of an IDA plus list.
> 
>> +	/*
>> +	 * queue must have been unregistered here, it is safe to iterate
>> +	 * the list w/o lock
>> +	 */
> 
> Please capitalize multi-line comments.
> 
>> + * After the pluggable blk-qos, rqos's life cycle become complicated,
>> + * as we may modify the rqos list there. Except for the places where
>> + * queue is not registered, there are following places may access rqos
>> + * list concurrently:
> 
> Code comments are not the place to explain history.  Please explain the
> current situation.
> 
>> +struct rq_qos *rq_qos_get(struct request_queue *q, int id)
>> +{
>> +	struct rq_qos *rqos;
>> +
>> +	spin_lock_irq(&q->queue_lock);
> 
> Please don't use the grab all queue_lock for new code.  It badly needs
> to be split and documented, and new code is the best place to start
> that.
> 
> Also with all the new code please add a new config option that is
> selected by all rq-qos implementations so that blk-rq-qos.c only gets
> built when actually needed.
> 
>> +static inline struct rq_qos *rq_qos_by_id(struct request_queue *q, int id)
>> +{
>> +	struct rq_qos *rqos;
>> +
>> +	WARN_ON(!mutex_is_locked(&q->sysfs_lock) && !spin_is_locked(&q->queue_lock));
> 
> Another overly long line.  And if in doubt, split this into two helpers
> so that you can use lockdep_assert_held instead of doing the incorrect
> asserts.

Thanks so much for your kind comments. I'll change the code in the next version.

Regards
Jianchao
Tejun Heo Feb. 22, 2022, 5:19 p.m. UTC | #3
Hello,

On Thu, Feb 17, 2022 at 11:13:44AM +0800, Wang Jianchao (Kuaishou) wrote:
> (3) Add /sys/block/x/queue/qos
>     We can use '+name' or "-name" to open or close the blk-rq-qos
>     policy.

I don't understand why we're modularizing rq-qos in this non-standard way
instead of modprobing to enable a policy and rmmoding to disable. Why are we
building in qos names into the kernel and adding an extra module handling
interface?

Thanks.
Wang Jianchao Feb. 23, 2022, 3:08 a.m. UTC | #4
On 2022/2/23 1:19 AM, Tejun Heo wrote:
> Hello,
> 
> On Thu, Feb 17, 2022 at 11:13:44AM +0800, Wang Jianchao (Kuaishou) wrote:
>> (3) Add /sys/block/x/queue/qos
>>     We can use '+name' or "-name" to open or close the blk-rq-qos
>>     policy.
> 
> I don't understand why we're modularizing rq-qos in this non-standard way
> instead of modprobing to enable a policy and rmmoding to disable. Why are we
> building in qos names into the kernel and adding an extra module handling
> interface?

Hi Tejun

We just want to provide the flexibility for the user to open/close a policy
per device. If we need to the policy on a device, we needn't to waste cpu
cycles and memory for it.

Thanks
Jianchao
Wang Jianchao Feb. 23, 2022, 3:10 a.m. UTC | #5
On 2022/2/23 11:08 AM, Wang Jianchao wrote:
> 
> 
> On 2022/2/23 1:19 上午, Tejun Heo wrote:
>> Hello,
>>
>> On Thu, Feb 17, 2022 at 11:13:44AM +0800, Wang Jianchao (Kuaishou) wrote:
>>> (3) Add /sys/block/x/queue/qos
>>>     We can use '+name' or "-name" to open or close the blk-rq-qos
>>>     policy.
>>
>> I don't understand why we're modularizing rq-qos in this non-standard way
>> instead of modprobing to enable a policy and rmmoding to disable. Why are we
>> building in qos names into the kernel and adding an extra module handling
>> interface?
> 
> Hi Tejun
> 
> We just want to provide the flexibility for the user to open/close a policy
> per device. If we need to the policy on a device, we needn't to waste cpu
sorry, it should be "If we don't need the policy on a device" ;)

Thanks
Jianchao
> cycles and memory for it.
> 
> Thanks
> Jianchao
Tejun Heo Feb. 23, 2022, 9:37 p.m. UTC | #6
Hello,

> > We just want to provide the flexibility for the user to open/close a policy
> > per device. If we need to the policy on a device, we needn't to waste cpu
> sorry, it should be "If we don't need the policy on a device" ;)

Yeah, that's what modularization does but why does it need a separate user
interface for loading? Everything else inits on insmod and exits on rmmod
and autoloading has been delegated to udev a very long time ago. The
interface you added for loading module doesn't make sense to me.

Thanks.
Wang Jianchao Feb. 24, 2022, 1:51 a.m. UTC | #7
On 2022/2/24 5:37 AM, Tejun Heo wrote:
> Hello,
> 
>>> We just want to provide the flexibility for the user to open/close a policy
>>> per device. If we need to the policy on a device, we needn't to waste cpu
>> sorry, it should be "If we don't need the policy on a device" ;)
> 
> Yeah, that's what modularization does but why does it need a separate user
> interface for loading? Everything else inits on insmod and exits on rmmod
> and autoloading has been delegated to udev a very long time ago. The
> interface you added for loading module doesn't make sense to me.
> 

The initial version of this patchset has two targets:
(1) Add a sysfs interface to open/close a policy per device. Then we needn't
    waste cpu cycles and memory if the device doesn't need the policy.
(2) Make the policies modular, which makes it easy to maintain the policy code
    in a production environment, as we only need to close the policy and
    replace the .ko file.

Loading the module when a policy is opened via the sysfs interface is just to
avoid a manual modprobe. There is a similar operation when switching the io
scheduler.

And as Christoph suggested, the modular part has been dropped in the next version.

Thanks
Jianchao
Tejun Heo Feb. 24, 2022, 2:07 a.m. UTC | #8
Hello,

On Thu, Feb 24, 2022 at 09:51:04AM +0800, Wang Jianchao wrote:
> The initial version of this patchset has two targets:
> (1) Add a sysfs interface to open/close a policy per device. Then we needn't
>     waste cpu cycles and memory if the device doesn't need the policy.
> (2) Make the policies modular, which makes it easy to maintain the policy code
>     in a production environment, as we only need to close the policy and
>     replace the .ko file.
> 
> Loading the module when a policy is opened via the sysfs interface is just to
> avoid a manual modprobe. There is a similar operation when switching the io
> scheduler.

Each rq-qos mechanism already needs and has a way to turn off itself.
There's no reason to add another layer on top. If the current way of
disabling isn't efficient, we should improve that instead of adding a new
layer of interface on top.

And please don't add a custom interface to avoid modprobing. All it adds is
unnecessary deviation. There's no benefit to echoing a selector to a custom
sysfs file compared to explicitly modprobing it.

Thanks.
Wang Jianchao Feb. 24, 2022, 2:50 a.m. UTC | #9
On 2022/2/24 10:07 AM, Tejun Heo wrote:
> Hello,
> 
> On Thu, Feb 24, 2022 at 09:51:04AM +0800, Wang Jianchao wrote:
>> The initial version of this patchset has two targets:
>> (1) Add a sysfs interface to open/close a policy per device. Then we needn't
>>     waste cpu cycles and memory if the device doesn't need the policy.
>> (2) Make the policies modular, which makes it easy to maintain the policy code
>>     in a production environment, as we only need to close the policy and
>>     replace the .ko file.
>>
>> Loading the module when a policy is opened via the sysfs interface is just to
>> avoid a manual modprobe. There is a similar operation when switching the io
>> scheduler.
> 
> Each rq-qos mechanism already needs and has a way to turn off itself.
> There's no reason to add another layer on top. If the current way of
> disabling isn't efficient, we should improve that instead of adding a new
> layer of interface on top.

Yes, right now every policy has its own way to turn itself off, but we always
need to iterate the rqos list and enter the policy's callback to check it. And
every blkio cgroup needs to allocate memory for it even if we don't use it.

I don't think this patchset is adding a new layer; the blk-rq-qos layer is
already there, we just add a unified interface to open/close the policies.

Thanks
Jianchao
Tejun Heo Feb. 24, 2022, 4:53 p.m. UTC | #10
On Thu, Feb 24, 2022 at 10:50:22AM +0800, Wang Jianchao wrote:
> Yes, right now every policy has its own way to turn itself off, but we always
> need to iterate the rqos list and enter the policy's callback to check it. And
> every blkio cgroup needs to allocate memory for it even if we don't use it.
> 
> I don't think this patchset is adding a new layer; the blk-rq-qos layer is
> already there, we just add a unified interface to open/close the policies.

We're talking in circles. We already know when a policy is inactive. If it
sits in hot path in that state, take it off whatever gets iterated in hot
path and put it back on when it actually gets enabled. The same goes for
memory allocation. If there's a substantial amount of memory allocated while
not used, make that dynamic and trigger it when the policy starts getting
used. It makes no sense to add another enable/disable interface on top.

FWIW, please consider the series nacked on this side.

Thanks.
Wang Jianchao Feb. 25, 2022, 2:02 a.m. UTC | #11
On 2022/2/25 12:53 AM, Tejun Heo wrote:
> On Thu, Feb 24, 2022 at 10:50:22AM +0800, Wang Jianchao wrote:
>> Yes, right now every policy has its own way to turn itself off, but we always
>> need to iterate the rqos list and enter the policy's callback to check it. And
>> every blkio cgroup needs to allocate memory for it even if we don't use it.
>>
>> I don't think this patchset is adding a new layer; the blk-rq-qos layer is
>> already there, we just add a unified interface to open/close the policies.
> 
> We're talking in circles. We already know when a policy is inactive. If it
> sits in hot path in that state, take it off whatever gets iterated in hot
> path and put it back on when it actually gets enabled. The same goes for
> memory allocation. If there's a substantial amount of memory allocated while
> not used, make that dynamic and trigger it when the policy starts getting
> used. It makes no sense to add another enable/disable interface on top.
>
It can make things more complicated if we do as above...

> FWIW, please consider the series nacked on this side.

Anyway, thanks so much for all of your comments ;)

Regards
Jianchao

Patch

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3a790eb4995c..8b6d557e1ad6 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -729,7 +729,10 @@  void blk_mq_debugfs_register(struct request_queue *q)
 
 	if (q->rq_qos) {
 		struct rq_qos *rqos = q->rq_qos;
-
+		/*
+		 * queue has not been registered right now, it is safe to
+		 * iterate the rqos w/o lock
+		 */
 		while (rqos) {
 			blk_mq_debugfs_register_rqos(rqos);
 			rqos = rqos->next;
@@ -844,7 +847,9 @@  void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
 void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
 {
 	struct request_queue *q = rqos->q;
-	const char *dir_name = rq_qos_id_to_name(rqos->id);
+	const char *dir_name;
+
+	dir_name = rqos->ops->name ? rqos->ops->name : rq_qos_id_to_name(rqos->id);
 
 	if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs)
 		return;
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index e83af7bc7591..db13581ae878 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -2,6 +2,11 @@ 
 
 #include "blk-rq-qos.h"
 
+static DEFINE_IDA(rq_qos_ida);
+static int nr_rqos_blkcg_pols;
+static DEFINE_MUTEX(rq_qos_mutex);
+static LIST_HEAD(rq_qos_list);
+
 /*
  * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
  * false if 'v' + 1 would be bigger than 'below'.
@@ -294,11 +299,303 @@  void rq_qos_wait(struct rq_wait *rqw, void *private_data,
 
 void rq_qos_exit(struct request_queue *q)
 {
-	blk_mq_debugfs_unregister_queue_rqos(q);
-
+	/*
+	 * queue must have been unregistered here, it is safe to iterate
+	 * the list w/o lock
+	 */
 	while (q->rq_qos) {
 		struct rq_qos *rqos = q->rq_qos;
 		q->rq_qos = rqos->next;
 		rqos->ops->exit(rqos);
 	}
+	blk_mq_debugfs_unregister_queue_rqos(q);
+}
+
+static struct rq_qos *rq_qos_by_name(struct request_queue *q,
+		const char *name)
+{
+	struct rq_qos *rqos;
+
+	for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+		if (!rqos->ops->name)
+			continue;
+
+		if (!strncmp(rqos->ops->name, name,
+					strlen(rqos->ops->name)))
+			return rqos;
+	}
+	return NULL;
+}
+
+/*
+ * After the pluggable blk-qos, rqos's life cycle become complicated,
+ * as we may modify the rqos list there. Except for the places where
+ * queue is not registered, there are following places may access rqos
+ * list concurrently:
+ * (1) normal IO path, can be serialized by queue freezing
+ * (2) blkg_create, the .pd_init_fn() may access rqos, can be serialized
+ *     by queue_lock.
+ * (3) cgroup file, such as ioc_cost_model_write, rq_qos_get is for this
+ *     case to keep the rqos alive.
+ */
+struct rq_qos *rq_qos_get(struct request_queue *q, int id)
+{
+	struct rq_qos *rqos;
+
+	spin_lock_irq(&q->queue_lock);
+	rqos = rq_qos_by_id(q, id);
+	if (rqos && rqos->dying)
+		rqos = NULL;
+	if (rqos)
+		refcount_inc(&rqos->ref);
+	spin_unlock_irq(&q->queue_lock);
+	return rqos;
+}
+EXPORT_SYMBOL_GPL(rq_qos_get);
+
+void rq_qos_put(struct rq_qos *rqos)
+{
+	struct request_queue *q = rqos->q;
+
+	spin_lock_irq(&q->queue_lock);
+	refcount_dec(&rqos->ref);
+	if (rqos->dying)
+		wake_up(&rqos->waitq);
+	spin_unlock_irq(&q->queue_lock);
+}
+EXPORT_SYMBOL_GPL(rq_qos_put);
+
+void rq_qos_activate(struct request_queue *q,
+		struct rq_qos *rqos, const struct rq_qos_ops *ops)
+{
+	struct rq_qos *pos;
+
+	rqos->dying = false;
+	refcount_set(&rqos->ref, 1);
+	init_waitqueue_head(&rqos->waitq);
+	rqos->id = ops->id;
+	rqos->ops = ops;
+	rqos->q = q;
+	rqos->next = NULL;
+
+	spin_lock_irq(&q->queue_lock);
+	pos = q->rq_qos;
+	if (pos) {
+		while (pos->next)
+			pos = pos->next;
+		pos->next = rqos;
+	} else {
+		q->rq_qos = rqos;
+	}
+	spin_unlock_irq(&q->queue_lock);
+
+	if (rqos->ops->debugfs_attrs)
+		blk_mq_debugfs_register_rqos(rqos);
+
+	if (ops->owner)
+		__module_get(ops->owner);
+}
+EXPORT_SYMBOL_GPL(rq_qos_activate);
+
+void rq_qos_deactivate(struct rq_qos *rqos)
+{
+	struct request_queue *q = rqos->q;
+	struct rq_qos **cur;
+
+	spin_lock_irq(&q->queue_lock);
+	rqos->dying = true;
+	/*
+	 * Drain all of the usage of get/put_rqos()
+	 */
+	wait_event_lock_irq(rqos->waitq,
+		refcount_read(&rqos->ref) == 1, q->queue_lock);
+	for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
+		if (*cur == rqos) {
+			*cur = rqos->next;
+			break;
+		}
+	}
+	spin_unlock_irq(&q->queue_lock);
+	blk_mq_debugfs_unregister_rqos(rqos);
+
+	if (rqos->ops->owner)
+		module_put(rqos->ops->owner);
+}
+EXPORT_SYMBOL_GPL(rq_qos_deactivate);
+
+static struct rq_qos_ops *rq_qos_op_find(const char *name)
+{
+	struct rq_qos_ops *pos;
+
+	list_for_each_entry(pos, &rq_qos_list, node) {
+		if (!strncmp(pos->name, name, strlen(pos->name)))
+			return pos;
+	}
+
+	return NULL;
+}
+
+int rq_qos_register(struct rq_qos_ops *ops)
+{
+	int ret, start;
+
+	mutex_lock(&rq_qos_mutex);
+
+	if (rq_qos_op_find(ops->name)) {
+		ret = -EEXIST;
+		goto out;
+	}
+
+	if (ops->flags & RQOS_FLAG_CGRP_POL &&
+	    nr_rqos_blkcg_pols >= (BLKCG_MAX_POLS - BLKCG_NON_RQOS_POLS)) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	start = RQ_QOS_IOPRIO + 1;
+	ret = ida_simple_get(&rq_qos_ida, start, INT_MAX, GFP_KERNEL);
+	if (ret < 0)
+		goto out;
+
+	if (ops->flags & RQOS_FLAG_CGRP_POL)
+		nr_rqos_blkcg_pols++;
+
+	ops->id = ret;
+	ret = 0;
+	INIT_LIST_HEAD(&ops->node);
+	list_add_tail(&ops->node, &rq_qos_list);
+out:
+	mutex_unlock(&rq_qos_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rq_qos_register);
+
+void rq_qos_unregister(struct rq_qos_ops *ops)
+{
+	mutex_lock(&rq_qos_mutex);
+
+	if (ops->flags & RQOS_FLAG_CGRP_POL)
+		nr_rqos_blkcg_pols--;
+	list_del_init(&ops->node);
+	ida_simple_remove(&rq_qos_ida, ops->id);
+	mutex_unlock(&rq_qos_mutex);
+}
+EXPORT_SYMBOL_GPL(rq_qos_unregister);
+
+ssize_t queue_qos_show(struct request_queue *q, char *buf)
+{
+	struct rq_qos_ops *ops;
+	struct rq_qos *rqos;
+	int ret = 0;
+
+	mutex_lock(&rq_qos_mutex);
+	/*
+	 * Show the policies in the order of being invoked.
+	 * queue_lock is not needed here as the sysfs_lock is
+	 * protected us from the queue_qos_store()
+	 */
+	for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+		if (!rqos->ops->name)
+			continue;
+		ret += sprintf(buf + ret, "[%s] ", rqos->ops->name);
+	}
+	list_for_each_entry(ops, &rq_qos_list, node) {
+		if (!rq_qos_by_name(q, ops->name))
+			ret += sprintf(buf + ret, "%s ", ops->name);
+	}
+
+	ret--; /* overwrite the last space */
+	ret += sprintf(buf + ret, "\n");
+	mutex_unlock(&rq_qos_mutex);
+
+	return ret;
+}
+
+static int rq_qos_switch(struct request_queue *q,
+		const struct rq_qos_ops *ops,
+		struct rq_qos *rqos)
+{
+	int ret;
+
+	blk_mq_freeze_queue(q);
+	if (!rqos) {
+		ret = ops->init(q);
+	} else {
+		ops->exit(rqos);
+		ret = 0;
+	}
+	blk_mq_unfreeze_queue(q);
+
+	return ret;
+}
+
+ssize_t queue_qos_store(struct request_queue *q, const char *page,
+			  size_t count)
+{
+	const struct rq_qos_ops *ops;
+	struct rq_qos *rqos;
+	const char *qosname;
+	char *buf;
+	bool add;
+	int ret;
+
+	if (!blk_queue_registered(q))
+		return -ENOENT;
+
+	buf = kstrdup(page, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	buf = strim(buf);
+	if (buf[0] != '+' && buf[0] != '-') {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	add = buf[0] == '+';
+	qosname = buf + 1;
+
+	rqos = rq_qos_by_name(q, qosname);
+	if ((buf[0] == '+' && rqos)) {
+		ret = -EEXIST;
+		goto out;
+	}
+
+	if ((buf[0] == '-' && !rqos)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (add) {
+		mutex_lock(&rq_qos_mutex);
+		ops = rq_qos_op_find(qosname);
+		if (!ops) {
+			/*
+			 * module_init callback may request this mutex
+			 */
+			mutex_unlock(&rq_qos_mutex);
+			request_module("%s", qosname);
+			mutex_lock(&rq_qos_mutex);
+			ops = rq_qos_op_find(qosname);
+		}
+		if (!ops) {
+			ret = -EINVAL;
+		} else if (ops->owner && !try_module_get(ops->owner)) {
+			ops = NULL;
+			ret = -EAGAIN;
+		}
+		mutex_unlock(&rq_qos_mutex);
+		if (!ops)
+			goto out;
+	} else {
+		ops = rqos->ops;
+	}
+
+	ret = rq_qos_switch(q, ops, add ? NULL : rqos);
+
+	if (add)
+		module_put(ops->owner);
+out:
+	kfree(buf);
+	return ret ? ret : count;
 }
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 3cfbc8668cba..586c3f5ec152 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -26,16 +26,28 @@  struct rq_wait {
 };
 
 struct rq_qos {
-	struct rq_qos_ops *ops;
+	const struct rq_qos_ops *ops;
 	struct request_queue *q;
 	enum rq_qos_id id;
+	refcount_t ref;
+	wait_queue_head_t waitq;
+	bool dying;
 	struct rq_qos *next;
 #ifdef CONFIG_BLK_DEBUG_FS
 	struct dentry *debugfs_dir;
 #endif
 };
 
+enum {
+	RQOS_FLAG_CGRP_POL = 1 << 0,
+};
+
 struct rq_qos_ops {
+	struct list_head node;
+	struct module *owner;
+	const char *name;
+	int flags;
+	int id;
 	void (*throttle)(struct rq_qos *, struct bio *);
 	void (*track)(struct rq_qos *, struct request *, struct bio *);
 	void (*merge)(struct rq_qos *, struct request *, struct bio *);
@@ -46,6 +58,7 @@  struct rq_qos_ops {
 	void (*cleanup)(struct rq_qos *, struct bio *);
 	void (*queue_depth_changed)(struct rq_qos *);
 	void (*exit)(struct rq_qos *);
+	int (*init)(struct request_queue *);
 	const struct blk_mq_debugfs_attr *debugfs_attrs;
 };
 
@@ -70,6 +83,19 @@  static inline struct rq_qos *rq_qos_id(struct request_queue *q,
 	return rqos;
 }
 
+static inline struct rq_qos *rq_qos_by_id(struct request_queue *q, int id)
+{
+	struct rq_qos *rqos;
+
+	WARN_ON(!mutex_is_locked(&q->sysfs_lock) && !spin_is_locked(&q->queue_lock));
+
+	for (rqos = q->rq_qos; rqos; rqos = rqos->next) {
+		if (rqos->id == id)
+			break;
+	}
+	return rqos;
+}
+
 static inline struct rq_qos *wbt_rq_qos(struct request_queue *q)
 {
 	return rq_qos_id(q, RQ_QOS_WBT);
@@ -132,6 +158,17 @@  static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
 	blk_mq_debugfs_unregister_rqos(rqos);
 }
 
+int rq_qos_register(struct rq_qos_ops *ops);
+void rq_qos_unregister(struct rq_qos_ops *ops);
+void rq_qos_activate(struct request_queue *q,
+		struct rq_qos *rqos, const struct rq_qos_ops *ops);
+void rq_qos_deactivate(struct rq_qos *rqos);
+ssize_t queue_qos_show(struct request_queue *q, char *buf);
+ssize_t queue_qos_store(struct request_queue *q, const char *page,
+			  size_t count);
+struct rq_qos *rq_qos_get(struct request_queue *q, int id);
+void rq_qos_put(struct rq_qos *rqos);
+
 typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
 typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f35aea98bc35..d5698a7cda67 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -44,6 +44,10 @@  struct blk_crypto_profile;
  * Defined here to simplify include dependency.
  */
 #define BLKCG_MAX_POLS		6
+/*
+ * Non blk-rq-qos blkcg policies include blk-throttle and bfq
+ */
+#define BLKCG_NON_RQOS_POLS		2
 
 static inline int blk_validate_block_size(unsigned long bsize)
 {