Message ID | 20190417034410.31957-7-ming.lei@redhat.com
---|---
State | Superseded
Series | blk-mq: fix races related with freeing queue
On 4/17/19 5:44 AM, Ming Lei wrote:
> In normal queue cleanup path, hctx is released after request queue
> is freed, see blk_mq_release().
>
> However, in __blk_mq_update_nr_hw_queues(), hctx may be freed because
> of hw queues shrinking. This way is easy to cause use-after-free,
> because: one implicit rule is that it is safe to call almost all block
> layer APIs if the request queue is alive; and one hctx may be retrieved
> by one API, then the hctx can be freed by blk_mq_update_nr_hw_queues();
> finally use-after-free is triggered.
>
> Fixes this issue by always freeing hctx after releasing request queue.
> If some hctxs are removed in blk_mq_update_nr_hw_queues(), introduce
> a per-queue list to hold them, then try to resuse these hctxs if numa
> node is matched.
[ .. ]
Hmm.
I don't particularly like this approach.
The much saner approach would be to avoid having I/O in-flight in the
first place by setting the queue to something else than live, no?

Cheers,

Hannes
On Wed, Apr 17, 2019 at 02:08:59PM +0200, Hannes Reinecke wrote:
> On 4/17/19 5:44 AM, Ming Lei wrote:
> > In normal queue cleanup path, hctx is released after request queue
> > is freed, see blk_mq_release().
> >
> > However, in __blk_mq_update_nr_hw_queues(), hctx may be freed because
> > of hw queues shrinking. This way is easy to cause use-after-free,
> > because: one implicit rule is that it is safe to call almost all block
> > layer APIs if the request queue is alive; and one hctx may be retrieved
> > by one API, then the hctx can be freed by blk_mq_update_nr_hw_queues();
> > finally use-after-free is triggered.
[ .. ]
> Hmm.
> I don't particularly like this approach.
> The much saner approach would be to avoid having I/O in-flight in the first
> place by setting the queue to something else than live, no?

There isn't any in-flight IO since blk_mq_freeze_queue() is returned
from __blk_mq_update_nr_hw_queues().

However, there can be reference to hctx in other code paths via
arbitrary block layer APIs.

Quiesce can't avoid the use-after-free too given RCU read lock may
not be held in lots of reference to hctx, such as queue_for_each_hw_ctx().

So this patch might be the simplest approach to fix this issue, IMO.

However, I am open to any solution for this issue.

Thanks,
Ming
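The failure mode described above can be sketched as follows. walk_hctxs() is a hypothetical caller, not code from the patch or the kernel; it only stands in for any block layer path that iterates hw contexts while holding nothing more than a request queue reference:

/*
 * Sketch only: a caller that holds a reference to the request queue
 * and walks its hw contexts.  Before this patch, a concurrent
 * blk_mq_update_nr_hw_queues() that shrinks nr_hw_queues frees the
 * trailing hctxs immediately, so this loop can dereference freed
 * memory even though 'q' itself is still alive.
 */
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/printk.h>

static void walk_hctxs(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i)
		pr_info("hctx %u: numa node %d\n", i, hctx->numa_node);
}

With the patch applied, the hctxs reached by such a walk are only freed from blk_mq_release(), i.e. after the request queue itself goes away, so the loop can no longer touch freed memory.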
On Wed, Apr 17, 2019 at 08:59:43PM +0800, Ming Lei wrote:
> On Wed, Apr 17, 2019 at 02:08:59PM +0200, Hannes Reinecke wrote:
> > On 4/17/19 5:44 AM, Ming Lei wrote:
[ .. ]
> > Hmm.
> > I don't particularly like this approach.
> > The much saner approach would be to avoid having I/O in-flight in the first
> > place by setting the queue to something else than live, no?
>
> There isn't any in-flight IO since blk_mq_freeze_queue() is returned
> from __blk_mq_update_nr_hw_queues().
>
> However, there can be reference to hctx in other code paths via
> arbitrary block layer APIs.
>
> Quiesce can't avoid the use-after-free too given RCU read lock may
> not be held in lots of reference to hctx, such as queue_for_each_hw_ctx().
>
> So this patch might be the simplest approach to fix this issue, IMO.
>
> However, I am open to any solution for this issue.

Hi Hannes,

Could you please let us know if you have better idea for this issue?
Otherwise, I think we need to move on since it is real issue, and users
want to fix that.

Thanks,
Ming
On 4/22/19 5:30 AM, Ming Lei wrote:
> On Wed, Apr 17, 2019 at 08:59:43PM +0800, Ming Lei wrote:
>> On Wed, Apr 17, 2019 at 02:08:59PM +0200, Hannes Reinecke wrote:
>>> On 4/17/19 5:44 AM, Ming Lei wrote:
[ .. ]
>> So this patch might be the simplest approach to fix this issue, IMO.
>>
>> However, I am open to any solution for this issue.
>
> Hi Hannes,
>
> Could you please let us know if you have better idea for this issue?
> Otherwise, I think we need to move on since it is real issue, and users
> want to fix that.
>
Okay. Having looked over the problem and possible alternatives, it looks
indeed like a viable solution.
I do agree that it's a sensible design to have an additional holding area
for hardware context elements, given that they might be reassigned during
blk_mq_realloc_hw_ctxs().

However, I would rename the 'dead' elements to 'unused' (ie
unused_hctx_list etc).

And I would deallocate q->queue_hw_ctx in blk_mq_exit_queue() to make
things more consistent.

Problem with the current patch is that in blk_mq_release() we iterate
the 'dead_hctx_list' and free up everything in there, but then blindly
call 'kfree(q->queue_hw_ctx)' without checking if there are any context
pointers left.
When moving the call to blk_mq_exit_queue() we could to a simple
WARN_ON(q->queue_hw_ctx) in blk_mq_release() to ensure that everything
got deallocated properly.

Cheers,

Hannes
Hi Hannes,

Thanks for your response.

On Tue, Apr 23, 2019 at 01:19:33PM +0200, Hannes Reinecke wrote:
> On 4/22/19 5:30 AM, Ming Lei wrote:
[ .. ]
> Okay. Having looked over the problem and possible alternatives, it looks
> indeed like a viable solution.
> I do agree that it's a sensible design to have an additional holding area
> for hardware context elements, given that they might be reassigned during
> blk_mq_realloc_hw_ctxs().
>
> However, I would rename the 'dead' elements to 'unused' (ie unused_hctx_list
> etc).

OK, looks the name of 'unused' is better.

> And I would deallocate q->queue_hw_ctx in blk_mq_exit_queue() to make things
> more consistent.

No, that is wrong.

The request queue's refcount is often held when blk_cleanup_queue() is running,
and blk_mq_exit_queue() is called directly by blk_cleanup_queue(). One invariant
is that we have to allow most APIs running well if the request queue is live
from kobject view. If q->queue_hw_ctx is freed in blk_mq_exit_queue(),
it is quite easy to cause use-after-free.

> Problem with the current patch is that in blk_mq_release() we iterate
> the 'dead_hctx_list' and free up everything in there, but then blindly call
> 'kfree(q->queue_hw_ctx)' without checking if there are any context pointers
> left.

If request queue is dead, it is safe to assume that there isn't any
reference to request queue and q->queue_hw_ctx. Otherwise, it must be
a bug somewhere.

> When moving the call to blk_mq_exit_queue() we could to a simple
> WARN_ON(q->queue_hw_ctx) in blk_mq_release() to ensure that everything got
> deallocated properly.

At that time, hctx instance might be active, but that is fine given hctx
is covered by its own kobject. What we need to do is to make sure that no
any references to q->queue_hw_ctx and the request queue.

Thanks,
Ming
On 4/23/19 3:30 PM, Ming Lei wrote:
> Hi Hannes,
>
> Thanks for your response.
>
> On Tue, Apr 23, 2019 at 01:19:33PM +0200, Hannes Reinecke wrote:
>> On 4/22/19 5:30 AM, Ming Lei wrote:
[ .. ]
>> And I would deallocate q->queue_hw_ctx in blk_mq_exit_queue() to make things
>> more consistent.
>
> No, that is wrong.
>
> The request queue's refcount is often held when blk_cleanup_queue() is running,
> and blk_mq_exit_queue() is called directly by blk_cleanup_queue(). One invariant
> is that we have to allow most APIs running well if the request queue is live
> from kobject view. If q->queue_hw_ctx is freed in blk_mq_exit_queue(),
> it is quite easy to cause use-after-free.
>
Ah. Thought as much.
But then in most cases the ->queue_hw_ctx pointer is immaterial as we're
accessing things via the hctx pointer, which remains valid.

>> Problem with the current patch is that in blk_mq_release() we iterate
>> the 'dead_hctx_list' and free up everything in there, but then blindly call
>> 'kfree(q->queue_hw_ctx)' without checking if there are any context pointers
>> left.
>
> If request queue is dead, it is safe to assume that there isn't any
> reference to request queue and q->queue_hw_ctx. Otherwise, it must be
> a bug somewhere.
>
Precisely.
What I'm trying to achieve with this is to protect against such issues,
which are quite easy to introduce given the complexity of the code ...

>> When moving the call to blk_mq_exit_queue() we could to a simple
>> WARN_ON(q->queue_hw_ctx) in blk_mq_release() to ensure that everything got
>> deallocated properly.
>
> At that time, hctx instance might be active, but that is fine given hctx
> is covered by its own kobject. What we need to do is to make sure that no
> any references to q->queue_hw_ctx and the request queue.
>
My point being here:

void blk_mq_release(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx, *next;

	cancel_delayed_work_sync(&q->requeue_work);

	/* all hctx are in .dead_hctx_list now */
	list_for_each_entry_safe(hctx, next, &q->dead_hctx_list, hctx_list) {
		list_del_init(&hctx->hctx_list);
		kobject_put(&hctx->kobj);
	}

	kfree(q->queue_hw_ctx);

	/*
	 * release .mq_kobj and sw queue's kobject now because
	 * both share lifetime with request queue.
	 */
	blk_mq_sysfs_deinit(q);
}

This assumes that _all_ hctx pointers are being removed from
q->queue_hw_ctx, and are moved to the 'dead' list.
If for some reason this is not the case we'll be leaking hctx pointers here.
And as the reference counting _really_ is tricky it would be a good idea to
have some safety checking here (ie if all hctx pointers from ->queue_hw_ctx
are indeed NULL) before calling kfree().

This is what I was attempting by moving the kfree() to blk_mq_exit_queue(),
as then we could validate that we've moved all hctx pointers to the dead
list by simply checking if ->queue_hw_ctx is NULL.

But just blindly calling 'kfree()' here is dangerous IMO.

Cheers,

Hannes
On Tue, Apr 23, 2019 at 04:07:49PM +0200, Hannes Reinecke wrote:
> On 4/23/19 3:30 PM, Ming Lei wrote:
[ .. ]
> Precisely.
> What I'm trying to achieve with this is to protect against such issues,
> which are quite easy to introduce given the complexity of the code ...

But releasing q->queue_hw_ctx in blk_cleanup_queue() is easy to cause
use-after-free even though the request queue's refcount is held. We can't
do that simply.

If someone is still trying to use q->queue_hw_ctx[] after the request
queue is dead, the bug is in the caller of block layer API, not in
block layer.

What the patchset is trying to fix is the race in block layer, not
users of block layer, not drivers. So far, I don't see such driver
issue.

Just thought q->queue_hw_ctx as the request queue's resource, you will
see it is pretty reasonable to free q->queue_hw_ctx in the queue's
release handler.

> My point being here:
> void blk_mq_release(struct request_queue *q)
[ .. ]
> This assumes that _all_ hctx pointers are being removed from
> q->queue_hw_ctx, and are moved to the 'dead' list.
> If for some reason this is not the case we'll be leaking hctx pointers here.

IMO, there aren't such some reasons. When blk_mq_release() is called,
every hctx of this request queue has been "exited" via blk_mq_exit_hctx(),
either from blk_mq_update_nr_hw_queues() or blk_cleanup_queue().

If there are hctxs not moved to the 'dead'(or 'unused') list here, it is
simply a bug in blk-mq. However, I don't see such case now.

> And as the reference counting _really_ is tricky it would be a good idea to
> have some safety checking here (ie if all hctx pointers from ->queue_hw_ctx
> are indeed NULL) before calling kfree().
>
> This is what I was attempting by moving the kfree() to blk_mq_exit_queue(),
> as then we could validate that we've moved all hctx pointers to the dead
> list by simply checking if ->queue_hw_ctx is NULL.

As I mentioned, it will break the current code either NULL the hctx
pointer or kfree(hctx) early, given we allow APIs to run if the queue's
refcount is held.

> But just blindly calling 'kfree()' here is dangerous IMO.

How can that be dangerous? As I mentioned, if the queue's refcount isn't
held, it is simply a bug of users to try to use 'q' or 'q->queue_hw_ctx'.
That is the basic use pattern of refcount, isn't it?

Thanks,
Ming
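The refcount contract referred to above can be sketched as follows. use_queue() is a hypothetical caller, not kernel code; blk_get_queue() and blk_put_queue() are the existing helpers for pinning and dropping a request queue reference:

/* Sketch only: the lifetime contract for callers of block layer APIs. */
#include <linux/blkdev.h>

static void use_queue(struct request_queue *q)
{
	/* Pin the queue so its release handler cannot run underneath us. */
	if (!blk_get_queue(q))
		return;		/* queue is already dying, do not touch it */

	/*
	 * While the reference is held, q and q->queue_hw_ctx stay
	 * allocated; with this patch the hctx instances do as well,
	 * since they are only freed from blk_mq_release().
	 */

	blk_put_queue(q);	/* the final put ends up in the queue's release handler */
}

Under this contract, freeing q->queue_hw_ctx any earlier than the queue's release handler would break callers that still legitimately hold a reference, which is why the patch keeps the kfree() in blk_mq_release().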
On Wed, Apr 24, 2019 at 09:12:42AM +0800, Ming Lei wrote:
> On Tue, Apr 23, 2019 at 04:07:49PM +0200, Hannes Reinecke wrote:
> > On 4/23/19 3:30 PM, Ming Lei wrote:
[ .. ]
> > This assumes that _all_ hctx pointers are being removed from
> > q->queue_hw_ctx, and are moved to the 'dead' list.
> > If for some reason this is not the case we'll be leaking hctx pointers here.
>
> IMO, there aren't such some reasons. When blk_mq_release() is called,
> every hctx of this request queue has been "exited" via blk_mq_exit_hctx(),
> either from blk_mq_update_nr_hw_queues() or blk_cleanup_queue().
>
> If there are hctxs not moved to the 'dead'(or 'unused') list here, it is
> simply a bug in blk-mq. However, I don't see such case now.

Or we can add the following check in blk_mq_release() for capturing
the impossible blk-mq bug:

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2ca4395f794d..f0d08087b8f6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2366,6 +2366,8 @@ blk_mq_alloc_hctx(struct request_queue *q,
 	hctx->queue = q;
 	hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
 
+	INIT_LIST_HEAD(&hctx->hctx_list);
+
 	/*
 	 * Allocate space for all possible cpus to avoid allocation at
 	 * runtime
@@ -2680,9 +2682,14 @@ static int blk_mq_alloc_ctxs(struct request_queue *q)
 void blk_mq_release(struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx, *next;
+	int i;
 
 	cancel_delayed_work_sync(&q->requeue_work);
 
+	/* warn if live hctx is found, that shouldn't happen */
+	queue_for_each_hw_ctx(q, hctx, i)
+		WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
+
 	/* all hctx are in .dead_hctx_list now */
 	list_for_each_entry_safe(hctx, next, &q->dead_hctx_list, hctx_list) {
 		list_del_init(&hctx->hctx_list);

Thanks,
Ming
On 4/24/19 3:45 AM, Ming Lei wrote:
> On Wed, Apr 24, 2019 at 09:12:42AM +0800, Ming Lei wrote:
>> On Tue, Apr 23, 2019 at 04:07:49PM +0200, Hannes Reinecke wrote:
>>> On 4/23/19 3:30 PM, Ming Lei wrote:
[ .. ]
> Or we can add the following check in blk_mq_release() for capturing
> the impossible blk-mq bug:
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 2ca4395f794d..f0d08087b8f6 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
[ .. ]
> @@ -2680,9 +2682,14 @@ static int blk_mq_alloc_ctxs(struct request_queue *q)
>  void blk_mq_release(struct request_queue *q)
>  {
>  	struct blk_mq_hw_ctx *hctx, *next;
> +	int i;
>  
>  	cancel_delayed_work_sync(&q->requeue_work);
>  
> +	/* warn if live hctx is found, that shouldn't happen */
> +	queue_for_each_hw_ctx(q, hctx, i)
> +		WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
> +
>  	/* all hctx are in .dead_hctx_list now */
>  	list_for_each_entry_safe(hctx, next, &q->dead_hctx_list, hctx_list) {
>  		list_del_init(&hctx->hctx_list);
>
Precisely, that would've been my other choice of fixing it, but I didn't
as I tried to avoid the loop.
But if you agree then I'm fine with it, too.

Please add this hunk to the patch.

Cheers,

Hannes
diff --git a/block/blk-mq.c b/block/blk-mq.c
index eeebba6ec0f7..2ca4395f794d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2274,6 +2274,10 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 		set->ops->exit_hctx(hctx, hctx_idx);
 
 	blk_mq_remove_cpuhp(hctx);
+
+	spin_lock(&q->dead_hctx_lock);
+	list_add(&hctx->hctx_list, &q->dead_hctx_list);
+	spin_unlock(&q->dead_hctx_lock);
 }
 
 static void blk_mq_exit_hw_queues(struct request_queue *q,
@@ -2675,15 +2679,13 @@ static int blk_mq_alloc_ctxs(struct request_queue *q)
  */
 void blk_mq_release(struct request_queue *q)
 {
-	struct blk_mq_hw_ctx *hctx;
-	unsigned int i;
+	struct blk_mq_hw_ctx *hctx, *next;
 
 	cancel_delayed_work_sync(&q->requeue_work);
 
-	/* hctx kobj stays in hctx */
-	queue_for_each_hw_ctx(q, hctx, i) {
-		if (!hctx)
-			continue;
+	/* all hctx are in .dead_hctx_list now */
+	list_for_each_entry_safe(hctx, next, &q->dead_hctx_list, hctx_list) {
+		list_del_init(&hctx->hctx_list);
 		kobject_put(&hctx->kobj);
 	}
 
@@ -2750,9 +2752,22 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
 		struct blk_mq_tag_set *set, struct request_queue *q,
 		int hctx_idx, int node)
 {
-	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_hw_ctx *hctx = NULL, *tmp;
+
+	/* reuse dead hctx first */
+	spin_lock(&q->dead_hctx_lock);
+	list_for_each_entry(tmp, &q->dead_hctx_list, hctx_list) {
+		if (tmp->numa_node == node) {
+			hctx = tmp;
+			break;
+		}
+	}
+	if (hctx)
+		list_del_init(&hctx->hctx_list);
+	spin_unlock(&q->dead_hctx_lock);
 
-	hctx = blk_mq_alloc_hctx(q, set, hctx_idx, node);
+	if (!hctx)
+		hctx = blk_mq_alloc_hctx(q, set, hctx_idx, node);
 	if (!hctx)
 		goto fail;
 
@@ -2790,10 +2805,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 
 		hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
 		if (hctx) {
-			if (hctxs[i]) {
+			if (hctxs[i])
 				blk_mq_exit_hctx(q, set, hctxs[i], i);
-				kobject_put(&hctxs[i]->kobj);
-			}
 			hctxs[i] = hctx;
 		} else {
 			if (hctxs[i])
@@ -2824,9 +2837,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 			if (hctx->tags)
 				blk_mq_free_map_and_requests(set, j);
 			blk_mq_exit_hctx(q, set, hctx, j);
-			kobject_put(&hctx->kobj);
 			hctxs[j] = NULL;
-
 		}
 	}
 	mutex_unlock(&q->sysfs_lock);
@@ -2869,6 +2880,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	if (!q->queue_hw_ctx)
 		goto err_sys_init;
 
+	INIT_LIST_HEAD(&q->dead_hctx_list);
+	spin_lock_init(&q->dead_hctx_lock);
+
 	blk_mq_realloc_hw_ctxs(set, q);
 	if (!q->nr_hw_queues)
 		goto err_hctxs;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index db29928de467..15d1aa53d96c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -70,6 +70,8 @@ struct blk_mq_hw_ctx {
 	struct dentry		*sched_debugfs_dir;
 #endif
 
+	struct list_head	hctx_list;
+
 	/* Must be the last member - see also blk_mq_hw_ctx_size(). */
 	struct srcu_struct	srcu[0];
 };
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4b85dc066264..1325f941f0be 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -535,6 +535,13 @@ struct request_queue {
 
 	struct mutex		sysfs_lock;
 
+	/*
+	 * for reusing dead hctx instance in case of updating
+	 * nr_hw_queues
+	 */
+	struct list_head	dead_hctx_list;
+	spinlock_t		dead_hctx_lock;
+
 	atomic_t		mq_freeze_depth;
 
 #if defined(CONFIG_BLK_DEV_BSG)
In the normal queue cleanup path, hctx is released after the request
queue is freed, see blk_mq_release().

However, in __blk_mq_update_nr_hw_queues(), hctx may be freed because
of hw queue shrinking. This can easily cause a use-after-free: one
implicit rule is that it is safe to call almost all block layer APIs
while the request queue is alive, so an hctx may be retrieved by one
API and then freed by blk_mq_update_nr_hw_queues(), triggering the
use-after-free.

Fix this issue by always freeing hctx after the request queue is
released. If some hctxs are removed in blk_mq_update_nr_hw_queues(),
introduce a per-queue list to hold them, then try to reuse these hctxs
if the numa node matches.

Cc: Dongli Zhang <dongli.zhang@oracle.com>
Cc: James Smart <james.smart@broadcom.com>
Cc: Bart Van Assche <bart.vanassche@wdc.com>
Cc: linux-scsi@vger.kernel.org
Cc: Martin K. Petersen <martin.petersen@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: James E.J. Bottomley <jejb@linux.vnet.ibm.com>
Cc: jianchao wang <jianchao.w.wang@oracle.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq.c         | 40 +++++++++++++++++++++++++++-------------
 include/linux/blk-mq.h |  2 ++
 include/linux/blkdev.h |  7 +++++++
 3 files changed, 36 insertions(+), 13 deletions(-)