@@ -4234,9 +4234,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
if (nvme_update_ns_info(ns, info))
goto out_unlink_ns;
+ mutex_lock(&ctrl->namespaces_sync_io_queues_mutex);
down_write(&ctrl->namespaces_rwsem);
nvme_ns_add_to_ctrl_list(ns);
up_write(&ctrl->namespaces_rwsem);
+ mutex_unlock(&ctrl->namespaces_sync_io_queues_mutex);
nvme_get_ctrl(ctrl);
if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
@@ -4252,9 +4254,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
out_cleanup_ns_from_list:
nvme_put_ctrl(ctrl);
+ mutex_lock(&ctrl->namespaces_sync_io_queues_mutex);
down_write(&ctrl->namespaces_rwsem);
list_del_init(&ns->list);
up_write(&ctrl->namespaces_rwsem);
+ mutex_unlock(&ctrl->namespaces_sync_io_queues_mutex);
out_unlink_ns:
mutex_lock(&ctrl->subsys->lock);
list_del_rcu(&ns->siblings);
@@ -4304,9 +4308,11 @@ static void nvme_ns_remove(struct nvme_ns *ns)
nvme_cdev_del(&ns->cdev, &ns->cdev_device);
del_gendisk(ns->disk);
+ mutex_lock(&ns->ctrl->namespaces_sync_io_queues_mutex);
down_write(&ns->ctrl->namespaces_rwsem);
list_del_init(&ns->list);
up_write(&ns->ctrl->namespaces_rwsem);
+ mutex_unlock(&ns->ctrl->namespaces_sync_io_queues_mutex);
if (last_path)
nvme_mpath_shutdown_disk(ns->head);
@@ -4399,12 +4405,14 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
struct nvme_ns *ns, *next;
LIST_HEAD(rm_list);
+ mutex_lock(&ctrl->namespaces_sync_io_queues_mutex);
down_write(&ctrl->namespaces_rwsem);
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
list_move_tail(&ns->list, &rm_list);
}
up_write(&ctrl->namespaces_rwsem);
+ mutex_unlock(&ctrl->namespaces_sync_io_queues_mutex);
list_for_each_entry_safe(ns, next, &rm_list, list)
nvme_ns_remove(ns);
@@ -4565,9 +4573,11 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
/* this is a no-op when called from the controller reset handler */
nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
+ mutex_lock(&ctrl->namespaces_sync_io_queues_mutex);
down_write(&ctrl->namespaces_rwsem);
list_splice_init(&ctrl->namespaces, &ns_list);
up_write(&ctrl->namespaces_rwsem);
+ mutex_unlock(&ctrl->namespaces_sync_io_queues_mutex);
list_for_each_entry_safe(ns, next, &ns_list, list)
nvme_ns_remove(ns);
@@ -4901,6 +4911,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
INIT_LIST_HEAD(&ctrl->namespaces);
xa_init(&ctrl->cels);
init_rwsem(&ctrl->namespaces_rwsem);
+ mutex_init(&ctrl->namespaces_sync_io_queues_mutex);
ctrl->dev = dev;
ctrl->ops = ops;
ctrl->quirks = quirks;
@@ -5121,10 +5132,16 @@ void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
- down_read(&ctrl->namespaces_rwsem);
+ /*
+ * Guard ctrl->namespaces with namespaces_sync_io_queues_mutex instead
+ * of namespaces_rwsem to avoid deadlock warning. namespace_rwsem lock
+ * here for cancel_work_sync() via blk_sync_queue() would conflict with
+ * nvme_timeout() which locks namespaces_rwsem.
+ */
+ mutex_lock(&ctrl->namespaces_sync_io_queues_mutex);
list_for_each_entry(ns, &ctrl->namespaces, list)
blk_sync_queue(ns->queue);
- up_read(&ctrl->namespaces_rwsem);
+ mutex_unlock(&ctrl->namespaces_sync_io_queues_mutex);
}
EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
@@ -250,6 +250,7 @@ struct nvme_ctrl {
struct blk_mq_tag_set *admin_tagset;
struct list_head namespaces;
struct rw_semaphore namespaces_rwsem;
+ struct mutex namespaces_sync_io_queues_mutex;
struct device ctrl_device;
struct device *device; /* char device */
#ifdef CONFIG_NVME_HWMON
The commit c0feea594e05 ("workqueue: don't skip lockdep work dependency in cancel_work_sync()") restored a lockdep check and it triggered deadlock warning in sync_io_queues() path. This function locks namespaces_rwsem to traverse namespace list and calls blk_sync_queue() for each namespace. blk_sync_queue() calls cancel_work_sync() to wait for nvme_timeout() completion which has paths to lock the namespaces_rwsem again. Hence, the deadlock warning was reported. This deadlock is not expected to happen in real use cases since the queues to call blk_sync_queue() are already quiesced at that point. Left issue is confusion by the deadlock warning. It is not ideal to suppress the warning with lockdep_off/on() since it will hide unseen deadlock scenarios. It is desired to suppress the warning without losing deadlock detection ability. Instead of lockdep_off/on(), this patch suppresses the warning by adding another lock named 'namespaces_sync_io_queues_mutex'. It works together with namespaces_rwsem. Lock only namespaces_rwsem for fast read access to the namespaces in hot paths. Lock both the locks to modify the list. Lock only namespaces_sync_io_queues_mutex to call blk_sync_queue() so that namespaces_rwsem can be locked in nvme_timeout() without the deadlock warning. Link: https://lore.kernel.org/linux-block/20220930001943.zdbvolc3gkekfmcv@shindev/ Suggested-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp> Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com> --- drivers/nvme/host/core.c | 21 +++++++++++++++++++-- drivers/nvme/host/nvme.h | 1 + 2 files changed, 20 insertions(+), 2 deletions(-)