Message ID | 20220223112601.2902761-1-yukuai3@huawei.com |
---|---|
State | New, archived |
Series | [RFC] blk-mq: fix potential uaf for 'queue_hw_ctx' |
On Wed, Feb 23, 2022 at 07:26:01PM +0800, Yu Kuai wrote:
> blk_mq_realloc_hw_ctxs() will free 'queue_hw_ctx' (e.g. when updating
> submit_queues through configfs for null_blk), while it might still be
> used from another context (e.g. when switching the elevator to none):
>
> t1                                  t2
> elevator_switch
>  blk_mq_unquiesce_queue
>   blk_mq_run_hw_queues
>    queue_for_each_hw_ctx
>     // assembly code for hctx = (q)->queue_hw_ctx[i]
>     mov 0x48(%rbp),%rdx  -> read old queue_hw_ctx
>                                     __blk_mq_update_nr_hw_queues
>                                      blk_mq_realloc_hw_ctxs
>                                       hctxs = q->queue_hw_ctx
>                                       q->queue_hw_ctx = new_hctxs
>                                       kfree(hctxs)
>     movslq %ebx,%rax
>     mov (%rdx,%rax,8),%rdi  -> uaf
>

Not only is there a UAF on queue_hw_ctx, but also similar issues on other
structures; I think the correct and easy fix is to quiesce the request
queue while updating nr_hw_queues, something like the following patch:

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a05ce7725031..d8e7c3cce0dd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4467,8 +4467,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
 		return;
 
-	list_for_each_entry(q, &set->tag_list, tag_set_list)
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
 		blk_mq_freeze_queue(q);
+		blk_mq_quiesce_queue(q);
+	}
 	/*
 	 * Switch IO scheduler to 'none', cleaning up the data associated
 	 * with the previous scheduler. We will switch back once we are done
@@ -4518,8 +4520,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	list_for_each_entry(q, &set->tag_list, tag_set_list)
 		blk_mq_elv_switch_back(&head, q);
 
-	list_for_each_entry(q, &set->tag_list, tag_set_list)
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		blk_mq_unquiesce_queue(q);
 		blk_mq_unfreeze_queue(q);
+	}
 }
 
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)

Thanks,
Ming
On 2022/02/23 22:30, Ming Lei wrote:
> On Wed, Feb 23, 2022 at 07:26:01PM +0800, Yu Kuai wrote:
>> blk_mq_realloc_hw_ctxs() will free 'queue_hw_ctx' (e.g. when updating
>> submit_queues through configfs for null_blk), while it might still be
>> used from another context (e.g. when switching the elevator to none):
>>
>> t1                                  t2
>> elevator_switch
>>  blk_mq_unquiesce_queue
>>   blk_mq_run_hw_queues
>>    queue_for_each_hw_ctx
>>     // assembly code for hctx = (q)->queue_hw_ctx[i]
>>     mov 0x48(%rbp),%rdx  -> read old queue_hw_ctx
>>                                     __blk_mq_update_nr_hw_queues
>>                                      blk_mq_realloc_hw_ctxs
>>                                       hctxs = q->queue_hw_ctx
>>                                       q->queue_hw_ctx = new_hctxs
>>                                       kfree(hctxs)
>>     movslq %ebx,%rax
>>     mov (%rdx,%rax,8),%rdi  -> uaf
>>
>
> Not only is there a UAF on queue_hw_ctx, but also similar issues on other
> structures; I think the correct and easy fix is to quiesce the request
> queue while updating nr_hw_queues, something like the following patch:
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index a05ce7725031..d8e7c3cce0dd 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -4467,8 +4467,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
>  	if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
>  		return;
> 
> -	list_for_each_entry(q, &set->tag_list, tag_set_list)
> +	list_for_each_entry(q, &set->tag_list, tag_set_list) {
>  		blk_mq_freeze_queue(q);
> +		blk_mq_quiesce_queue(q);
> +	}
>  	/*
>  	 * Switch IO scheduler to 'none', cleaning up the data associated
>  	 * with the previous scheduler. We will switch back once we are done
> @@ -4518,8 +4520,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
>  	list_for_each_entry(q, &set->tag_list, tag_set_list)
>  		blk_mq_elv_switch_back(&head, q);
> 
> -	list_for_each_entry(q, &set->tag_list, tag_set_list)
> +	list_for_each_entry(q, &set->tag_list, tag_set_list) {
> +		blk_mq_unquiesce_queue(q);
>  		blk_mq_unfreeze_queue(q);
> +	}
>  }
> 
>  void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)

Hi, Ming

If blk_mq_quiesce_queue() is called from __blk_mq_update_nr_hw_queues()
first, then switching the elevator to none won't trigger the problem.
However, if blk_mq_unquiesce_queue() from the elevator switch decreases
quiesce_depth to 0 first, and blk_mq_quiesce_queue() is only called from
__blk_mq_update_nr_hw_queues() after that, it seems to me that such
concurrent scenarios still exist.

Thanks,
Kuai

>
> Thanks,
> Ming
On Thu, Feb 24, 2022 at 09:29:09AM +0800, yukuai (C) wrote:
> On 2022/02/23 22:30, Ming Lei wrote:
> > On Wed, Feb 23, 2022 at 07:26:01PM +0800, Yu Kuai wrote:
> > > blk_mq_realloc_hw_ctxs() will free 'queue_hw_ctx' (e.g. when updating
> > > submit_queues through configfs for null_blk), while it might still be
> > > used from another context (e.g. when switching the elevator to none):
> > >
> > > t1                                  t2
> > > elevator_switch
> > >  blk_mq_unquiesce_queue
> > >   blk_mq_run_hw_queues
> > >    queue_for_each_hw_ctx
> > >     // assembly code for hctx = (q)->queue_hw_ctx[i]
> > >     mov 0x48(%rbp),%rdx  -> read old queue_hw_ctx
> > >                                     __blk_mq_update_nr_hw_queues
> > >                                      blk_mq_realloc_hw_ctxs
> > >                                       hctxs = q->queue_hw_ctx
> > >                                       q->queue_hw_ctx = new_hctxs
> > >                                       kfree(hctxs)
> > >     movslq %ebx,%rax
> > >     mov (%rdx,%rax,8),%rdi  -> uaf
> > >
> >
> > Not only is there a UAF on queue_hw_ctx, but also similar issues on other
> > structures; I think the correct and easy fix is to quiesce the request
> > queue while updating nr_hw_queues, something like the following patch:
> >
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index a05ce7725031..d8e7c3cce0dd 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -4467,8 +4467,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
> >  	if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
> >  		return;
> > 
> > -	list_for_each_entry(q, &set->tag_list, tag_set_list)
> > +	list_for_each_entry(q, &set->tag_list, tag_set_list) {
> >  		blk_mq_freeze_queue(q);
> > +		blk_mq_quiesce_queue(q);
> > +	}
> >  	/*
> >  	 * Switch IO scheduler to 'none', cleaning up the data associated
> >  	 * with the previous scheduler. We will switch back once we are done
> > @@ -4518,8 +4520,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
> >  	list_for_each_entry(q, &set->tag_list, tag_set_list)
> >  		blk_mq_elv_switch_back(&head, q);
> > 
> > -	list_for_each_entry(q, &set->tag_list, tag_set_list)
> > +	list_for_each_entry(q, &set->tag_list, tag_set_list) {
> > +		blk_mq_unquiesce_queue(q);
> >  		blk_mq_unfreeze_queue(q);
> > +	}
> >  }
> > 
> >  void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
>
> Hi, Ming
>
> If blk_mq_quiesce_queue() is called from __blk_mq_update_nr_hw_queues()
> first, then switching the elevator to none won't trigger the problem.
> However, if blk_mq_unquiesce_queue() from the elevator switch decreases
> quiesce_depth to 0 first, and blk_mq_quiesce_queue() is only called from
> __blk_mq_update_nr_hw_queues() after that, it seems to me that such
> concurrent scenarios still exist.

No, the scenario won't exist: once blk_mq_quiesce_queue() returns, it is
guaranteed that:

- any in-progress run queue is drained
- no new run queue can be started

Thanks,
Ming
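For context on the guarantee Ming cites here: blk_mq_quiesce_queue() works by setting a "quiesced" flag and then waiting out an RCU/SRCU grace period covering the dispatch path. The following is a condensed sketch of that idea, not the verbatim kernel code (in the real function, BLK_MQ_F_BLOCKING hctxs are waited on with per-hctx SRCU rather than plain RCU):

```c
/* Condensed sketch of the quiesce guarantee -- not verbatim kernel code. */
void quiesce_sketch(struct request_queue *q)
{
	/* Mark the queue quiesced (sets QUEUE_FLAG_QUIESCED). */
	blk_mq_quiesce_queue_nowait(q);

	/*
	 * Dispatch runs inside an (S)RCU read-side section that re-checks
	 * the flag, so after this grace period:
	 *   - any dispatch that started before the flag was set has finished
	 *   - any later dispatch attempt sees the flag and bails out
	 */
	synchronize_rcu();
}
```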
On 2022/02/24 10:15, Ming Lei wrote:
>> Hi, Ming
>>
>> If blk_mq_quiesce_queue() is called from __blk_mq_update_nr_hw_queues()
>> first, then switching the elevator to none won't trigger the problem.
>> However, if blk_mq_unquiesce_queue() from the elevator switch decreases
>> quiesce_depth to 0 first, and blk_mq_quiesce_queue() is only called from
>> __blk_mq_update_nr_hw_queues() after that, it seems to me that such
>> concurrent scenarios still exist.
>
> No, the scenario won't exist: once blk_mq_quiesce_queue() returns, it is
> guaranteed that:
>
> - any in-progress run queue is drained
> - no new run queue can be started

I understand that. What I mean by the concurrent scenario is the read of
queue_hw_ctx in blk_mq_run_hw_queues(), not the actual queue run in
blk_mq_run_hw_queue():

t1                                  t2
elevator_switch
 blk_mq_quiesce_queue
 -> quiesce_depth = 1
 blk_mq_unquiesce_queue
 -> quiesce_depth = 0
 blk_mq_run_hw_queues
                                    __blk_mq_update_nr_hw_queues
                                     blk_mq_quiesce_queue
  queue_for_each_hw_ctx
  -> quiesce_queue can't prevent reading queue_hw_ctx
  blk_mq_run_hw_queue
  // need_run is always false, nothing to do

Am I missing something about blk_mq_quiesce_queue()?

Thanks,
Kuai
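To make Kuai's point concrete, here is a simplified sketch of the reader side, modeled loosely on blk_mq_run_hw_queues() (the helper names exist in the real code, but the body is condensed for illustration). The load from q->queue_hw_ctx happens unconditionally, before any quiesce state is consulted, so quiescing only makes need_run false inside blk_mq_run_hw_queue(); it does nothing to order the array read against the kfree() in blk_mq_realloc_hw_ctxs():

```c
/* Simplified illustration of the racy reader -- not verbatim kernel code. */
void run_hw_queues_sketch(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	for (i = 0; i < q->nr_hw_queues; i++) {
		/*
		 * The racy load: if another thread swaps and kfree()s
		 * q->queue_hw_ctx between reading the array pointer and
		 * this indexed read, we dereference freed memory.
		 * Quiescing the queue cannot prevent reaching this line.
		 */
		hctx = q->queue_hw_ctx[i];

		if (blk_mq_hctx_stopped(hctx))
			continue;

		/*
		 * Only here does quiesce matter: on a quiesced queue,
		 * blk_mq_run_hw_queue() computes need_run as false and
		 * returns without dispatching anything.
		 */
		blk_mq_run_hw_queue(hctx, async);
	}
}
```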
On Wed, Feb 23, 2022 at 07:26:01PM +0800, Yu Kuai wrote:
> blk_mq_realloc_hw_ctxs() will free 'queue_hw_ctx' (e.g. when updating
> submit_queues through configfs for null_blk), while it might still be
> used from another context (e.g. when switching the elevator to none):
>
> t1                                  t2
> elevator_switch
>  blk_mq_unquiesce_queue
>   blk_mq_run_hw_queues
>    queue_for_each_hw_ctx
>     // assembly code for hctx = (q)->queue_hw_ctx[i]
>     mov 0x48(%rbp),%rdx  -> read old queue_hw_ctx
>                                     __blk_mq_update_nr_hw_queues
>                                      blk_mq_realloc_hw_ctxs
>                                       hctxs = q->queue_hw_ctx
>                                       q->queue_hw_ctx = new_hctxs
>                                       kfree(hctxs)
>     movslq %ebx,%rax
>     mov (%rdx,%rax,8),%rdi  -> uaf
>
> This problem was found by code review, and I confirmed that the concurrent
> scenarios do exist (specifically, 'q->queue_hw_ctx' can be changed during
> blk_mq_run_hw_queues()); however, the UAF problem hasn't been reproduced
> yet without hacking the kernel.
>
> Since the queue is frozen in __blk_mq_update_nr_hw_queues(), fix the
> problem by protecting 'queue_hw_ctx' through RCU where it can be accessed
> without grabbing 'q_usage_counter'.
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>  block/blk-mq.c         |  8 +++++++-
>  include/linux/blk-mq.h |  2 +-
>  include/linux/blkdev.h | 13 ++++++++++++-
>  3 files changed, 20 insertions(+), 3 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 6c59ffe765fd..79367457d555 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -3955,7 +3955,13 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
>  		if (hctxs)
>  			memcpy(new_hctxs, hctxs, q->nr_hw_queues *
>  			       sizeof(*hctxs));
> -		q->queue_hw_ctx = new_hctxs;
> +
> +		rcu_assign_pointer(q->queue_hw_ctx, new_hctxs);
> +		/*
> +		 * Make sure reading the old queue_hw_ctx from other
> +		 * context concurrently won't trigger uaf.
> +		 */
> +		synchronize_rcu();
>  		kfree(hctxs);
>  		hctxs = new_hctxs;
>  	}
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index d319ffa59354..edcf8ead76c6 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -918,7 +918,7 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
> 
>  #define queue_for_each_hw_ctx(q, hctx, i)				\
>  	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
> -	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
> +	     ({ hctx = queue_hctx((q), i); 1; }); (i)++)
> 
>  #define hctx_for_each_ctx(hctx, ctx, i)					\
>  	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 3bfc75a2a450..2018a4dd2028 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -354,7 +354,7 @@ struct request_queue {
>  	unsigned int		queue_depth;
> 
>  	/* hw dispatch queues */
> -	struct blk_mq_hw_ctx	**queue_hw_ctx;
> +	struct blk_mq_hw_ctx __rcu **queue_hw_ctx;
>  	unsigned int		nr_hw_queues;
> 
>  	/*
> @@ -622,6 +622,17 @@ static inline bool queue_is_mq(struct request_queue *q)
>  	return q->mq_ops;
>  }
> 
> +static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id)
> +{
> +	struct blk_mq_hw_ctx *hctx;
> +
> +	rcu_read_lock();
> +	hctx = *(rcu_dereference(q->queue_hw_ctx) + id);
> +	rcu_read_unlock();
> +
> +	return hctx;
> +}

queue_hctx() should be moved into linux/blk-mq.h; otherwise feel free to
add:

Reviewed-by: Ming Lei <ming.lei@redhat.com>

Also it should be fine to implement queue_for_each_hw_ctx() as a list; then
we can avoid the allocation for q->queue_hw_ctx without extra cost. I will
work in that direction to improve the code.

Thanks,
Ming
On 2022/02/25 10:40, Ming Lei wrote:
>> +static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id)
>> +{
>> +	struct blk_mq_hw_ctx *hctx;
>> +
>> +	rcu_read_lock();
>> +	hctx = *(rcu_dereference(q->queue_hw_ctx) + id);
>> +	rcu_read_unlock();
>> +
>> +	return hctx;
>> +}
>
> queue_hctx() should be moved into linux/blk-mq.h; otherwise feel free to
> add:
>
> Reviewed-by: Ming Lei <ming.lei@redhat.com>

Thanks for the review. I will send a new patch and move queue_hctx().

Kuai

>
> Also it should be fine to implement queue_for_each_hw_ctx() as a list; then
> we can avoid the allocation for q->queue_hw_ctx without extra cost. I will
> work in that direction to improve the code.
>
> Thanks,
> Ming
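Ming's closing idea, iterating hctxs through a per-queue list rather than a reallocated array, could look roughly like the sketch below. This is purely hypothetical: the type, field, and macro names are invented here, and the eventual in-tree rework may differ. The point is that adding or removing an hctx becomes a list operation, so no backing array ever has to be freed out from under a concurrent walker (a walk concurrent with removal would still need the RCU list primitives, e.g. list_for_each_entry_rcu()):

```c
/* Hypothetical sketch only -- type, field, and macro names are invented. */
#include <linux/list.h>

struct hctx_sketch {
	/* ... existing blk_mq_hw_ctx fields ... */
	struct list_head node;		/* linked into the queue's hctx list */
};

struct queue_sketch {
	/* ... existing request_queue fields ... */
	struct list_head hctx_list;	/* would replace queue_hw_ctx[] */
};

/*
 * No index, no backing array: growing or shrinking nr_hw_queues never
 * frees memory that a concurrent iteration still holds a pointer into.
 */
#define queue_for_each_hw_ctx_sketch(q, hctx) \
	list_for_each_entry(hctx, &(q)->hctx_list, node)
```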
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6c59ffe765fd..79367457d555 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3955,7 +3955,13 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 		if (hctxs)
 			memcpy(new_hctxs, hctxs, q->nr_hw_queues *
 			       sizeof(*hctxs));
-		q->queue_hw_ctx = new_hctxs;
+
+		rcu_assign_pointer(q->queue_hw_ctx, new_hctxs);
+		/*
+		 * Make sure reading the old queue_hw_ctx from other
+		 * context concurrently won't trigger uaf.
+		 */
+		synchronize_rcu();
 		kfree(hctxs);
 		hctxs = new_hctxs;
 	}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d319ffa59354..edcf8ead76c6 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -918,7 +918,7 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
 
 #define queue_for_each_hw_ctx(q, hctx, i)				\
 	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
-	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
+	     ({ hctx = queue_hctx((q), i); 1; }); (i)++)
 
 #define hctx_for_each_ctx(hctx, ctx, i)					\
 	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3bfc75a2a450..2018a4dd2028 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -354,7 +354,7 @@ struct request_queue {
 	unsigned int		queue_depth;
 
 	/* hw dispatch queues */
-	struct blk_mq_hw_ctx	**queue_hw_ctx;
+	struct blk_mq_hw_ctx __rcu **queue_hw_ctx;
 	unsigned int		nr_hw_queues;
 
 	/*
@@ -622,6 +622,17 @@ static inline bool queue_is_mq(struct request_queue *q)
 	return q->mq_ops;
 }
 
+static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id)
+{
+	struct blk_mq_hw_ctx *hctx;
+
+	rcu_read_lock();
+	hctx = *(rcu_dereference(q->queue_hw_ctx) + id);
+	rcu_read_unlock();
+
+	return hctx;
+}
+
 #ifdef CONFIG_PM
 static inline enum rpm_status queue_rpm_status(struct request_queue *q)
 {
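The patch is an instance of the classic RCU publish/retire pattern: publish the new array with rcu_assign_pointer(), wait for every pre-existing reader with synchronize_rcu(), then free the old array. Below is a minimal, generic sketch of that pattern (kernel-context code, so it builds only in-tree; "table" and "item" are illustrative stand-ins, not names from the patch). Note the same caveat that applies to queue_hctx(): the reader returns its pointer after rcu_read_unlock(), so the element itself must stay alive by other means — in the patch, the frozen queue holding 'q_usage_counter' — and RCU protects only the array of pointers:

```c
/* Generic RCU publish/retire sketch (kernel context; illustrative names). */
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>

struct item;
static struct item __rcu **table;	/* stands in for q->queue_hw_ctx */

/* Reader: lockless lookup of slot i in the current array. */
static struct item *table_lookup(int i)
{
	struct item *it;

	rcu_read_lock();
	it = *(rcu_dereference(table) + i);
	rcu_read_unlock();

	/* 'it' must be pinned by other means (cf. the frozen queue). */
	return it;
}

/* Updater: publish a larger copy, wait out readers, free the old array. */
static int table_grow(int old_n, int new_n)
{
	struct item **old = rcu_dereference_protected(table, true);
	struct item **new = kcalloc(new_n, sizeof(*new), GFP_KERNEL);

	if (!new)
		return -ENOMEM;

	if (old)
		memcpy(new, old, old_n * sizeof(*new));
	rcu_assign_pointer(table, new);	/* publish the new array */
	synchronize_rcu();		/* every pre-existing reader is done */
	kfree(old);			/* now safe to free the old array */
	return 0;
}
```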
blk_mq_realloc_hw_ctxs() will free 'queue_hw_ctx' (e.g. when updating
submit_queues through configfs for null_blk), while it might still be
used from another context (e.g. when switching the elevator to none):

t1                                  t2
elevator_switch
 blk_mq_unquiesce_queue
  blk_mq_run_hw_queues
   queue_for_each_hw_ctx
    // assembly code for hctx = (q)->queue_hw_ctx[i]
    mov 0x48(%rbp),%rdx  -> read old queue_hw_ctx
                                    __blk_mq_update_nr_hw_queues
                                     blk_mq_realloc_hw_ctxs
                                      hctxs = q->queue_hw_ctx
                                      q->queue_hw_ctx = new_hctxs
                                      kfree(hctxs)
    movslq %ebx,%rax
    mov (%rdx,%rax,8),%rdi  -> uaf

This problem was found by code review, and I confirmed that the concurrent
scenarios do exist (specifically, 'q->queue_hw_ctx' can be changed during
blk_mq_run_hw_queues()); however, the UAF problem hasn't been reproduced
yet without hacking the kernel.

Since the queue is frozen in __blk_mq_update_nr_hw_queues(), fix the
problem by protecting 'queue_hw_ctx' through RCU where it can be accessed
without grabbing 'q_usage_counter'.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-mq.c         |  8 +++++++-
 include/linux/blk-mq.h |  2 +-
 include/linux/blkdev.h | 13 ++++++++++++-
 3 files changed, 20 insertions(+), 3 deletions(-)