@@ -4467,7 +4467,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
unsigned long i, j;
/* protect against switching io scheduler */
- mutex_lock(&q->sysfs_lock);
+ lockdep_assert_held(&q->sysfs_lock);
+
for (i = 0; i < set->nr_hw_queues; i++) {
int old_node;
int node = blk_mq_get_hctx_node(set, i);
@@ -4500,13 +4501,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
xa_for_each_start(&q->hctx_table, j, hctx, j)
blk_mq_exit_hctx(q, set, hctx, j);
- mutex_unlock(&q->sysfs_lock);
-
- /* unregister cpuhp callbacks for exited hctxs */
- blk_mq_remove_hw_queues_cpuhp(q);
-
- /* register cpuhp for new initialized hctxs */
- blk_mq_add_hw_queues_cpuhp(q);
}
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
@@ -4532,10 +4526,19 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
xa_init(&q->hctx_table);
+ mutex_lock(&q->sysfs_lock);
blk_mq_realloc_hw_ctxs(set, q);
+ mutex_unlock(&q->sysfs_lock);
if (!q->nr_hw_queues)
goto err_hctxs;
+ /*
+ * Register cpuhp for new initialized hctxs and ensure that the cpuhp
+ * registration happens outside of q->sysfs_lock to avoid any lock
+ * ordering issue between q->sysfs_lock and global cpuhp lock.
+ */
+ blk_mq_add_hw_queues_cpuhp(q);
+
INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
@@ -4934,12 +4937,12 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
return false;
/* q->elevator needs protection from ->sysfs_lock */
- mutex_lock(&q->sysfs_lock);
+ lockdep_assert_held(&q->sysfs_lock);
/* the check has to be done with holding sysfs_lock */
if (!q->elevator) {
kfree(qe);
- goto unlock;
+ goto out;
}
INIT_LIST_HEAD(&qe->node);
@@ -4949,8 +4952,7 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
__elevator_get(qe->type);
list_add(&qe->node, head);
elevator_disable(q);
-unlock:
- mutex_unlock(&q->sysfs_lock);
+out:
return true;
}
@@ -4973,6 +4975,8 @@ static void blk_mq_elv_switch_back(struct list_head *head,
struct blk_mq_qe_pair *qe;
struct elevator_type *t;
+ lockdep_assert_held(&q->sysfs_lock);
+
qe = blk_lookup_qe_pair(head, q);
if (!qe)
return;
@@ -4980,11 +4984,9 @@ static void blk_mq_elv_switch_back(struct list_head *head,
list_del(&qe->node);
kfree(qe);
- mutex_lock(&q->sysfs_lock);
elevator_switch(q, t);
/* drop the reference acquired in blk_mq_elv_switch_none */
elevator_put(t);
- mutex_unlock(&q->sysfs_lock);
}
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
@@ -5006,8 +5008,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
return;
memflags = memalloc_noio_save();
- list_for_each_entry(q, &set->tag_list, tag_set_list)
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ mutex_lock(&q->sysfs_lock);
blk_mq_freeze_queue_nomemsave(q);
+ }
/*
* Switch IO scheduler to 'none', cleaning up the data associated
@@ -5055,8 +5059,21 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_elv_switch_back(&head, q);
- list_for_each_entry(q, &set->tag_list, tag_set_list)
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ mutex_unlock(&q->sysfs_lock);
+
+ /*
+ * Unregister cpuhp callbacks for exited hctxs and register
+ * cpuhp for new initialized hctxs. Ensure that unregister/
+ * register cpuhp is called outside of q->sysfs_lock to avoid
+ * lock ordering issue between q->sysfs_lock and global cpuhp
+ * lock.
+ */
+ blk_mq_remove_hw_queues_cpuhp(q);
+ blk_mq_add_hw_queues_cpuhp(q);
+
blk_mq_unfreeze_queue_nomemrestore(q);
+ }
memalloc_noio_restore(memflags);
/* Free the excess tags when nr_hw_queues shrink. */
@@ -725,7 +725,16 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
int ret;
strscpy(elevator_name, buf, sizeof(elevator_name));
+
+ /*
+ * The elevator change/switch code expects that the q->sysfs_lock
+ * is held while we update the iosched to protect against the
+ * simultaneous hctx update.
+ */
+ mutex_lock(&disk->queue->sysfs_lock);
ret = elevator_change(disk->queue, strstrip(elevator_name));
+ mutex_unlock(&disk->queue->sysfs_lock);
+
if (!ret)
return count;
return ret;
Lockdep reports [1] have identified inconsistent lock ordering between
q->sysfs_lock and the freeze-lock at several call sites in the block
layer. This patch resolves the issue by enforcing a consistent lock
acquisition order: q->sysfs_lock is always acquired before the
freeze-lock. This change eliminates the observed lockdep splats caused
by the inconsistent ordering.

Additionally, while rearranging the locking order, we ensure that no new
lock ordering issue is introduced between the global CPU hotplug (cpuhp)
lock and q->sysfs_lock, as previously reported [2]. To address this,
blk_mq_add_hw_queues_cpuhp() and blk_mq_remove_hw_queues_cpuhp() are now
called outside the critical section protected by q->sysfs_lock.

Since blk_mq_add_hw_queues_cpuhp() and blk_mq_remove_hw_queues_cpuhp()
were invoked during hardware context allocation via
blk_mq_realloc_hw_ctxs(), which runs while holding q->sysfs_lock, the
add/remove cpuhp calls have been relocated to
__blk_mq_update_nr_hw_queues() and blk_mq_init_allocated_queue(), after
q->sysfs_lock is released. This ensures proper lock ordering without
introducing regressions.

[1] https://lore.kernel.org/all/67637e70.050a0220.3157ee.000c.GAE@google.com/
[2] https://lore.kernel.org/all/20241206082202.949142-1-ming.lei@redhat.com/

Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
---
 block/blk-mq.c   | 49 ++++++++++++++++++++++++++++++++----------------
 block/elevator.c |  9 +++++++++
 2 files changed, 42 insertions(+), 16 deletions(-)
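
For readers following the locking changes, the sequence that
__blk_mq_update_nr_hw_queues() ends up with can be summarised by the
small userspace sketch below. It is only an illustration, not kernel
code: the pthread mutexes merely stand in for q->sysfs_lock, the queue
freeze protection and the global cpuhp lock (none of which are plain
mutexes in the kernel), and the function and variable names are invented
for the example. It demonstrates the two invariants of this patch:
q->sysfs_lock is always taken before freezing the queue, and the cpuhp
lock is only ever taken after q->sysfs_lock has been dropped.

	/* Illustrative userspace model of the resulting lock ordering. */
	#include <pthread.h>
	#include <stdio.h>

	static pthread_mutex_t sysfs_lock  = PTHREAD_MUTEX_INITIALIZER; /* models q->sysfs_lock */
	static pthread_mutex_t freeze_lock = PTHREAD_MUTEX_INITIALIZER; /* models the queue freeze protection */
	static pthread_mutex_t cpuhp_lock  = PTHREAD_MUTEX_INITIALIZER; /* models the global cpuhp lock */

	static void update_nr_hw_queues_model(void)
	{
		/* Rule 1: q->sysfs_lock is acquired before the freeze "lock". */
		pthread_mutex_lock(&sysfs_lock);
		pthread_mutex_lock(&freeze_lock);

		printf("switch elevator to none, realloc hw ctxs, switch elevator back\n");

		/* Drop sysfs_lock before touching anything cpuhp related. */
		pthread_mutex_unlock(&sysfs_lock);

		/* Rule 2: the cpuhp lock never nests inside q->sysfs_lock. */
		pthread_mutex_lock(&cpuhp_lock);
		printf("unregister cpuhp for exited hctxs, register for new ones\n");
		pthread_mutex_unlock(&cpuhp_lock);

		/* Finally unfreeze the queue. */
		pthread_mutex_unlock(&freeze_lock);
	}

	int main(void)
	{
		update_nr_hw_queues_model();
		return 0;
	}

The blk_mq_init_allocated_queue() path follows the same idea without the
freeze step: q->sysfs_lock is held only around blk_mq_realloc_hw_ctxs(),
and blk_mq_add_hw_queues_cpuhp() runs after the lock is released.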