@@ -431,6 +431,7 @@ struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
mutex_init(&q->debugfs_mutex);
mutex_init(&q->sysfs_lock);
mutex_init(&q->limits_lock);
+ mutex_init(&q->elevator_lock);
mutex_init(&q->rq_qos_mutex);
spin_lock_init(&q->queue_lock);
@@ -4467,7 +4467,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
unsigned long i, j;
/* protect against switching io scheduler */
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->elevator_lock);
for (i = 0; i < set->nr_hw_queues; i++) {
int old_node;
int node = blk_mq_get_hctx_node(set, i);
@@ -4500,7 +4500,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
xa_for_each_start(&q->hctx_table, j, hctx, j)
blk_mq_exit_hctx(q, set, hctx, j);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
/* unregister cpuhp callbacks for exited hctxs */
blk_mq_remove_hw_queues_cpuhp(q);
@@ -4934,7 +4934,7 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
return false;
- /* q->elevator needs protection from ->sysfs_lock */
- mutex_lock(&q->sysfs_lock);
+ /* q->elevator needs protection from ->elevator_lock */
+ mutex_lock(&q->elevator_lock);
- /* the check has to be done with holding sysfs_lock */
+ /* the check has to be done while holding elevator_lock */
if (!q->elevator) {
@@ -4950,7 +4950,7 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
list_add(&qe->node, head);
elevator_disable(q);
unlock:
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
return true;
}
@@ -4980,11 +4980,11 @@ static void blk_mq_elv_switch_back(struct list_head *head,
list_del(&qe->node);
kfree(qe);
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->elevator_lock);
elevator_switch(q, t);
/* drop the reference acquired in blk_mq_elv_switch_none */
elevator_put(t);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
}
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
@@ -58,7 +58,13 @@ queue_var_store(unsigned long *var, const char *page, size_t count)
static ssize_t queue_requests_show(struct gendisk *disk, char *page)
{
- return queue_var_show(disk->queue->nr_requests, page);
+ int ret;
+
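+ /* queue_attr_show() no longer takes sysfs_lock for us; take it here */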
+ mutex_lock(&disk->queue->sysfs_lock);
+ ret = queue_var_show(disk->queue->nr_requests, page);
+ mutex_unlock(&disk->queue->sysfs_lock);
+
+ return ret;
}
static ssize_t
@@ -66,27 +72,42 @@ queue_requests_store(struct gendisk *disk, const char *page, size_t count)
{
unsigned long nr;
int ret, err;
+ unsigned int memflags;
+ struct request_queue *q = disk->queue;
- if (!queue_is_mq(disk->queue))
- return -EINVAL;
+ mutex_lock(&q->sysfs_lock);
+ memflags = blk_mq_freeze_queue(q);
+
+ if (!queue_is_mq(q)) {
+ ret = -EINVAL;
+ goto out;
+ }
ret = queue_var_store(&nr, page, count);
if (ret < 0)
- return ret;
+ goto out;
if (nr < BLKDEV_MIN_RQ)
nr = BLKDEV_MIN_RQ;
err = blk_mq_update_nr_requests(disk->queue, nr);
if (err)
- return err;
-
+ ret = err;
+out:
+ blk_mq_unfreeze_queue(q, memflags);
+ mutex_unlock(&q->sysfs_lock);
return ret;
}
static ssize_t queue_ra_show(struct gendisk *disk, char *page)
{
- return queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page);
+ int ret;
+
+ mutex_lock(&disk->queue->sysfs_lock);
+ ret = queue_var_show(disk->bdi->ra_pages << (PAGE_SHIFT - 10), page);
+ mutex_unlock(&disk->queue->sysfs_lock);
+
+ return ret;
}
static ssize_t
@@ -94,11 +115,19 @@ queue_ra_store(struct gendisk *disk, const char *page, size_t count)
{
unsigned long ra_kb;
ssize_t ret;
+ unsigned int memflags;
+ struct request_queue *q = disk->queue;
+
+ mutex_lock(&q->sysfs_lock);
+ memflags = blk_mq_freeze_queue(q);
ret = queue_var_store(&ra_kb, page, count);
if (ret < 0)
- return ret;
+ goto out;
disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10);
+out:
+ blk_mq_unfreeze_queue(q, memflags);
+ mutex_unlock(&q->sysfs_lock);
return ret;
}
@@ -534,14 +563,24 @@ static ssize_t queue_var_store64(s64 *var, const char *page)
static ssize_t queue_wb_lat_show(struct gendisk *disk, char *page)
{
- if (!wbt_rq_qos(disk->queue))
- return -EINVAL;
+ int ret;
+
+ mutex_lock(&disk->queue->sysfs_lock);
+
+ if (!wbt_rq_qos(disk->queue)) {
+ ret = -EINVAL;
+ goto out;
+ }
if (wbt_disabled(disk->queue))
- return sysfs_emit(page, "0\n");
+ ret = sysfs_emit(page, "0\n");
+ else
+ ret = sysfs_emit(page, "%llu\n",
+ div_u64(wbt_get_min_lat(disk->queue), 1000));
- return sysfs_emit(page, "%llu\n",
- div_u64(wbt_get_min_lat(disk->queue), 1000));
+out:
+ mutex_unlock(&disk->queue->sysfs_lock);
+ return ret;
}
static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
@@ -551,18 +590,24 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
struct rq_qos *rqos;
ssize_t ret;
s64 val;
+ unsigned int memflags;
+
+ mutex_lock(&q->sysfs_lock);
+ memflags = blk_mq_freeze_queue(q);
ret = queue_var_store64(&val, page);
if (ret < 0)
- return ret;
- if (val < -1)
- return -EINVAL;
+ goto out;
+ if (val < -1) {
+ ret = -EINVAL;
+ goto out;
+ }
rqos = wbt_rq_qos(q);
if (!rqos) {
ret = wbt_init(disk);
if (ret)
- return ret;
+ goto out;
}
if (val == -1)
@@ -570,8 +615,10 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
else if (val >= 0)
val *= 1000ULL;
- if (wbt_get_min_lat(q) == val)
- return count;
+ if (wbt_get_min_lat(q) == val) {
+ ret = count;
+ goto out;
+ }
/*
* Ensure that the queue is idled, in case the latency update
* happens below when we switch modes
*/
@@ -584,7 +631,10 @@ static ssize_t queue_wb_lat_store(struct gendisk *disk, const char *page,
blk_mq_unquiesce_queue(q);
- return count;
+out:
+ blk_mq_unfreeze_queue(q, memflags);
+ mutex_unlock(&q->sysfs_lock);
+ return ret;
}
QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
@@ -593,7 +643,7 @@ QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
/* Common attributes for bio-based and request-based queues. */
static struct attribute *queue_attrs[] = {
/*
- * attributes protected with q->sysfs_lock
+ * attributes which require some form of locking
*/
&queue_ra_entry.attr,
@@ -652,10 +702,10 @@ static struct attribute *queue_attrs[] = {
/* Request-based queue attributes that are not relevant for bio-based queues. */
static struct attribute *blk_mq_queue_attrs[] = {
/*
- * attributes protected with q->sysfs_lock
+ * attributes which require some form of locking
*/
- &queue_requests_entry.attr,
&elv_iosched_entry.attr,
+ &queue_requests_entry.attr,
#ifdef CONFIG_BLK_WBT
&queue_wb_lat_entry.attr,
#endif
@@ -729,10 +779,7 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
return res;
}
- mutex_lock(&disk->queue->sysfs_lock);
- res = entry->show(disk, page);
- mutex_unlock(&disk->queue->sysfs_lock);
- return res;
+ return entry->show(disk, page);
}
static ssize_t
@@ -778,12 +825,7 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
return length;
}
- mutex_lock(&q->sysfs_lock);
- memflags = blk_mq_freeze_queue(q);
- res = entry->store(disk, page, length);
- blk_mq_unfreeze_queue(q, memflags);
- mutex_unlock(&q->sysfs_lock);
- return res;
+ return entry->store(disk, page, length);
}
static const struct sysfs_ops queue_sysfs_ops = {
@@ -852,15 +894,19 @@ int blk_register_queue(struct gendisk *disk)
if (ret)
goto out_debugfs_remove;
+ ret = blk_crypto_sysfs_register(disk);
+ if (ret)
+ goto out_unregister_ia_ranges;
+
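+ /* hold elevator_lock to serialize with concurrent elevator switches */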
+ mutex_lock(&q->elevator_lock);
if (q->elevator) {
ret = elv_register_queue(q, false);
- if (ret)
- goto out_unregister_ia_ranges;
+ if (ret) {
+ mutex_unlock(&q->elevator_lock);
+ goto out_crypto_sysfs_unregister;
+ }
}
-
- ret = blk_crypto_sysfs_register(disk);
- if (ret)
- goto out_elv_unregister;
+ mutex_unlock(&q->elevator_lock);
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(disk);
@@ -885,8 +931,8 @@ int blk_register_queue(struct gendisk *disk)
return ret;
-out_elv_unregister:
- elv_unregister_queue(q);
+out_crypto_sysfs_unregister:
+ blk_crypto_sysfs_unregister(disk);
out_unregister_ia_ranges:
disk_unregister_independent_access_ranges(disk);
out_debugfs_remove:
@@ -932,8 +978,11 @@ void blk_unregister_queue(struct gendisk *disk)
blk_mq_sysfs_unregister(disk);
blk_crypto_sysfs_unregister(disk);
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->elevator_lock);
elv_unregister_queue(q);
+ mutex_unlock(&q->elevator_lock);
+
+ mutex_lock(&q->sysfs_lock);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
@@ -457,7 +457,7 @@ int elv_register_queue(struct request_queue *q, bool uevent)
struct elevator_queue *e = q->elevator;
int error;
- lockdep_assert_held(&q->sysfs_lock);
+ lockdep_assert_held(&q->elevator_lock);
error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
if (!error) {
@@ -481,7 +481,7 @@ void elv_unregister_queue(struct request_queue *q)
{
struct elevator_queue *e = q->elevator;
- lockdep_assert_held(&q->sysfs_lock);
+ lockdep_assert_held(&q->elevator_lock);
if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) {
kobject_uevent(&e->kobj, KOBJ_REMOVE);
@@ -618,7 +618,7 @@ int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
unsigned int memflags;
int ret;
- lockdep_assert_held(&q->sysfs_lock);
+ lockdep_assert_held(&q->elevator_lock);
memflags = blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
@@ -655,7 +655,7 @@ void elevator_disable(struct request_queue *q)
{
unsigned int memflags;
- lockdep_assert_held(&q->sysfs_lock);
+ lockdep_assert_held(&q->elevator_lock);
memflags = blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
@@ -723,11 +723,19 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
{
char elevator_name[ELV_NAME_MAX];
int ret;
+ unsigned int memflags;
+ struct request_queue *q = disk->queue;
strscpy(elevator_name, buf, sizeof(elevator_name));
+
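+ /* freeze first, then take elevator_lock, matching __blk_mq_update_nr_hw_queues() */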
+ memflags = blk_mq_freeze_queue(q);
+ mutex_lock(&q->elevator_lock);
ret = elevator_change(disk->queue, strstrip(elevator_name));
+ mutex_unlock(&q->elevator_lock);
+ blk_mq_unfreeze_queue(q, memflags);
if (!ret)
return count;
+
return ret;
}
@@ -738,6 +746,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name)
struct elevator_type *cur = NULL, *e;
int len = 0;
+ mutex_lock(&q->elevator_lock);
if (!q->elevator) {
len += sprintf(name+len, "[none] ");
} else {
@@ -755,6 +764,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name)
spin_unlock(&elv_list_lock);
len += sprintf(name+len, "\n");
+ mutex_unlock(&q->elevator_lock);
return len;
}
@@ -565,8 +565,11 @@ int __must_check add_disk_fwnode(struct device *parent, struct gendisk *disk,
if (disk->major == BLOCK_EXT_MAJOR)
blk_free_ext_minor(disk->first_minor);
out_exit_elevator:
- if (disk->queue->elevator)
+ if (disk->queue->elevator) {
+ mutex_lock(&disk->queue->elevator_lock);
elevator_exit(disk->queue);
+ mutex_unlock(&disk->queue->elevator_lock);
+ }
return ret;
}
EXPORT_SYMBOL_GPL(add_disk_fwnode);
@@ -742,9 +745,9 @@ void del_gendisk(struct gendisk *disk)
blk_mq_quiesce_queue(q);
if (q->elevator) {
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->elevator_lock);
elevator_exit(q);
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->elevator_lock);
}
rq_qos_exit(q);
blk_mq_unquiesce_queue(q);
@@ -562,6 +562,7 @@ struct request_queue {
struct mutex sysfs_lock;
struct mutex limits_lock;
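+ /* protects elevator switching and q->elevator updates */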
+ struct mutex elevator_lock;
/*
* for reusing dead hctx instance in case of updating
* nr_hw_queues
*/
A queue's elevator can be updated either when modifying nr_hw_queues
or through the sysfs scheduler attribute. To prevent simultaneous
updates from causing race conditions, introduce a dedicated lock,
q->elevator_lock.

Currently, elevator switching/updating is protected using
q->sysfs_lock, but this has led to lockdep splats[1] due to
inconsistent lock ordering between q->sysfs_lock and the freeze-lock
at multiple block layer call sites. As the scope of q->sysfs_lock is
not well-defined, its misuse has resulted in numerous lockdep
warnings.

To resolve this, replace q->sysfs_lock with the new dedicated
q->elevator_lock, which is used exclusively to protect elevator
switching and updates.

[1] https://lore.kernel.org/all/67637e70.050a0220.3157ee.000c.GAE@google.com/

Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
---
 block/blk-core.c       |   1 +
 block/blk-mq.c         |  12 ++--
 block/blk-sysfs.c      | 133 ++++++++++++++++++++++++++++-------------
 block/elevator.c       |  18 ++++--
 block/genhd.c          |   9 ++-
 include/linux/blkdev.h |   1 +
 6 files changed, 119 insertions(+), 55 deletions(-)
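
Illustrative sketch (not part of the patch; queue_foo_store() and
apply_foo() are made-up names): with queue_attr_show() and
queue_attr_store() no longer taking q->sysfs_lock and freezing the
queue on behalf of every handler, each attribute handler now follows
this pattern, mirroring queue_requests_store() above:

#include <linux/blk-mq.h>
#include <linux/blkdev.h>

/* hypothetical helper standing in for the attribute-specific logic */
static ssize_t apply_foo(struct request_queue *q, const char *page,
			 size_t count)
{
	return count;
}

static ssize_t queue_foo_store(struct gendisk *disk, const char *page,
			       size_t count)
{
	struct request_queue *q = disk->queue;
	unsigned int memflags;
	ssize_t ret;

	/* take sysfs_lock locally, then freeze while applying the update */
	mutex_lock(&q->sysfs_lock);
	memflags = blk_mq_freeze_queue(q);

	ret = apply_foo(q, page, count);

	blk_mq_unfreeze_queue(q, memflags);
	mutex_unlock(&q->sysfs_lock);
	return ret;
}

Elevator-related paths (elv_iosched_store(), elv_register_queue()/
elv_unregister_queue(), nr_hw_queues updates, and elevator exit during
add/del of a gendisk) take the new q->elevator_lock instead, as shown
in the hunks above.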