Message ID | 20220223112601.2902761-1-yukuai3@huawei.com |
---|---|
State | New, archived |
Series | [RFC] blk-mq: fix potential uaf for 'queue_hw_ctx' |
On Wed, Feb 23, 2022 at 07:26:01PM +0800, Yu Kuai wrote:
> blk_mq_realloc_hw_ctxs() will free 'queue_hw_ctx' (e.g. when updating
> submit_queues through configfs for null_blk), while it might still be
> used from another context (e.g. when switching the elevator to none):
>
> t1                                  t2
> elevator_switch
>  blk_mq_unquiesce_queue
>   blk_mq_run_hw_queues
>    queue_for_each_hw_ctx
>     // assembly code for hctx = (q)->queue_hw_ctx[i]
>     mov 0x48(%rbp),%rdx  -> read old queue_hw_ctx
>                                     __blk_mq_update_nr_hw_queues
>                                      blk_mq_realloc_hw_ctxs
>                                       hctxs = q->queue_hw_ctx
>                                       q->queue_hw_ctx = new_hctxs
>                                       kfree(hctxs)
>     movslq %ebx,%rax
>     mov (%rdx,%rax,8),%rdi  -> uaf
>

Not only is there a UAF on queue_hw_ctx, but also similar issues on other
structures; I think the correct and easy fix is to quiesce the request
queue while updating nr_hw_queues, something like the following patch:

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a05ce7725031..d8e7c3cce0dd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -4467,8 +4467,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
 		return;
 
-	list_for_each_entry(q, &set->tag_list, tag_set_list)
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
 		blk_mq_freeze_queue(q);
+		blk_mq_quiesce_queue(q);
+	}
 	/*
 	 * Switch IO scheduler to 'none', cleaning up the data associated
 	 * with the previous scheduler. We will switch back once we are done
@@ -4518,8 +4520,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
 	list_for_each_entry(q, &set->tag_list, tag_set_list)
 		blk_mq_elv_switch_back(&head, q);
 
-	list_for_each_entry(q, &set->tag_list, tag_set_list)
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		blk_mq_unquiesce_queue(q);
 		blk_mq_unfreeze_queue(q);
+	}
 }
 
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)

Thanks,
Ming
On 2022/02/23 22:30, Ming Lei wrote:
> On Wed, Feb 23, 2022 at 07:26:01PM +0800, Yu Kuai wrote:
>> blk_mq_realloc_hw_ctxs() will free 'queue_hw_ctx' (e.g. when updating
>> submit_queues through configfs for null_blk), while it might still be
>> used from another context (e.g. when switching the elevator to none):
>>
>> t1                                  t2
>> elevator_switch
>>  blk_mq_unquiesce_queue
>>   blk_mq_run_hw_queues
>>    queue_for_each_hw_ctx
>>     // assembly code for hctx = (q)->queue_hw_ctx[i]
>>     mov 0x48(%rbp),%rdx  -> read old queue_hw_ctx
>>                                     __blk_mq_update_nr_hw_queues
>>                                      blk_mq_realloc_hw_ctxs
>>                                       hctxs = q->queue_hw_ctx
>>                                       q->queue_hw_ctx = new_hctxs
>>                                       kfree(hctxs)
>>     movslq %ebx,%rax
>>     mov (%rdx,%rax,8),%rdi  -> uaf
>>
>
> Not only is there a UAF on queue_hw_ctx, but also similar issues on other
> structures; I think the correct and easy fix is to quiesce the request
> queue while updating nr_hw_queues, something like the following patch:
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index a05ce7725031..d8e7c3cce0dd 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -4467,8 +4467,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
>  	if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
>  		return;
> 
> -	list_for_each_entry(q, &set->tag_list, tag_set_list)
> +	list_for_each_entry(q, &set->tag_list, tag_set_list) {
>  		blk_mq_freeze_queue(q);
> +		blk_mq_quiesce_queue(q);
> +	}
>  	/*
>  	 * Switch IO scheduler to 'none', cleaning up the data associated
>  	 * with the previous scheduler. We will switch back once we are done
> @@ -4518,8 +4520,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
>  	list_for_each_entry(q, &set->tag_list, tag_set_list)
>  		blk_mq_elv_switch_back(&head, q);
> 
> -	list_for_each_entry(q, &set->tag_list, tag_set_list)
> +	list_for_each_entry(q, &set->tag_list, tag_set_list) {
> +		blk_mq_unquiesce_queue(q);
>  		blk_mq_unfreeze_queue(q);
> +	}
>  }
> 
>  void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)

Hi, Ming

If blk_mq_quiesce_queue() is called from __blk_mq_update_nr_hw_queues()
first, then switching the elevator to none won't trigger the problem.
However, if blk_mq_unquiesce_queue() from the elevator switch decreases
quiesce_depth to 0 first, and blk_mq_quiesce_queue() is only called from
__blk_mq_update_nr_hw_queues() after that, it seems to me that such
concurrent scenarios still exist.

Thanks,
Kuai

>
> Thanks,
> Ming
On Thu, Feb 24, 2022 at 09:29:09AM +0800, yukuai (C) wrote:
> On 2022/02/23 22:30, Ming Lei wrote:
> > On Wed, Feb 23, 2022 at 07:26:01PM +0800, Yu Kuai wrote:
> > > blk_mq_realloc_hw_ctxs() will free 'queue_hw_ctx' (e.g. when updating
> > > submit_queues through configfs for null_blk), while it might still be
> > > used from another context (e.g. when switching the elevator to none):
> > >
> > > t1                                  t2
> > > elevator_switch
> > >  blk_mq_unquiesce_queue
> > >   blk_mq_run_hw_queues
> > >    queue_for_each_hw_ctx
> > >     // assembly code for hctx = (q)->queue_hw_ctx[i]
> > >     mov 0x48(%rbp),%rdx  -> read old queue_hw_ctx
> > >                                     __blk_mq_update_nr_hw_queues
> > >                                      blk_mq_realloc_hw_ctxs
> > >                                       hctxs = q->queue_hw_ctx
> > >                                       q->queue_hw_ctx = new_hctxs
> > >                                       kfree(hctxs)
> > >     movslq %ebx,%rax
> > >     mov (%rdx,%rax,8),%rdi  -> uaf
> > >
> >
> > Not only is there a UAF on queue_hw_ctx, but also similar issues on other
> > structures; I think the correct and easy fix is to quiesce the request
> > queue while updating nr_hw_queues, something like the following patch:
> >
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index a05ce7725031..d8e7c3cce0dd 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -4467,8 +4467,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
> >  	if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
> >  		return;
> > 
> > -	list_for_each_entry(q, &set->tag_list, tag_set_list)
> > +	list_for_each_entry(q, &set->tag_list, tag_set_list) {
> >  		blk_mq_freeze_queue(q);
> > +		blk_mq_quiesce_queue(q);
> > +	}
> >  	/*
> >  	 * Switch IO scheduler to 'none', cleaning up the data associated
> >  	 * with the previous scheduler. We will switch back once we are done
> > @@ -4518,8 +4520,10 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
> >  	list_for_each_entry(q, &set->tag_list, tag_set_list)
> >  		blk_mq_elv_switch_back(&head, q);
> > 
> > -	list_for_each_entry(q, &set->tag_list, tag_set_list)
> > +	list_for_each_entry(q, &set->tag_list, tag_set_list) {
> > +		blk_mq_unquiesce_queue(q);
> >  		blk_mq_unfreeze_queue(q);
> > +	}
> >  }
> > 
> >  void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
>
> Hi, Ming
>
> If blk_mq_quiesce_queue() is called from __blk_mq_update_nr_hw_queues()
> first, then switching the elevator to none won't trigger the problem.
> However, if blk_mq_unquiesce_queue() from the elevator switch decreases
> quiesce_depth to 0 first, and blk_mq_quiesce_queue() is only called from
> __blk_mq_update_nr_hw_queues() after that, it seems to me that such
> concurrent scenarios still exist.

No, the scenario won't exist: once blk_mq_quiesce_queue() returns, it is
guaranteed that:

- any in-progress run queue is drained
- no new run queue can be started

Thanks,
Ming
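For context on the guarantee Ming cites here: blk_mq_quiesce_queue() works by setting a "quiesced" flag and then waiting out an RCU/SRCU grace period covering the dispatch path. The following is a condensed sketch of that idea, not the verbatim kernel code (in the real function, BLK_MQ_F_BLOCKING hctxs are waited on with per-hctx SRCU rather than plain RCU):

```c
/* Condensed sketch of the quiesce guarantee -- not verbatim kernel code. */
void quiesce_sketch(struct request_queue *q)
{
	/* Mark the queue quiesced (sets QUEUE_FLAG_QUIESCED). */
	blk_mq_quiesce_queue_nowait(q);

	/*
	 * Dispatch runs inside an (S)RCU read-side section that re-checks
	 * the flag, so after this grace period:
	 *   - any dispatch that started before the flag was set has finished
	 *   - any later dispatch attempt sees the flag and bails out
	 */
	synchronize_rcu();
}
```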
On 2022/02/24 10:15, Ming Lei wrote:
>> Hi, Ming
>>
>> If blk_mq_quiesce_queue() is called from __blk_mq_update_nr_hw_queues()
>> first, then switching the elevator to none won't trigger the problem.
>> However, if blk_mq_unquiesce_queue() from the elevator switch decreases
>> quiesce_depth to 0 first, and blk_mq_quiesce_queue() is only called from
>> __blk_mq_update_nr_hw_queues() after that, it seems to me that such
>> concurrent scenarios still exist.
>
> No, the scenario won't exist: once blk_mq_quiesce_queue() returns, it is
> guaranteed that:
>
> - any in-progress run queue is drained
> - no new run queue can be started

I understand that. What I mean by the concurrent scenario is the read of
queue_hw_ctx in blk_mq_run_hw_queues(), not the actual queue run in
blk_mq_run_hw_queue():

t1                                  t2
elevator_switch
 blk_mq_quiesce_queue
 -> quiesce_depth = 1
 blk_mq_unquiesce_queue
 -> quiesce_depth = 0
 blk_mq_run_hw_queues
                                    __blk_mq_update_nr_hw_queues
                                     blk_mq_quiesce_queue
  queue_for_each_hw_ctx
  -> quiesce_queue can't prevent reading queue_hw_ctx
  blk_mq_run_hw_queue
  // need_run is always false, nothing to do

Am I missing something about blk_mq_quiesce_queue()?

Thanks,
Kuai
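To make Kuai's point concrete, here is a simplified sketch of the reader side, modeled loosely on blk_mq_run_hw_queues() (the helper names exist in the real code, but the body is condensed for illustration). The load from q->queue_hw_ctx happens unconditionally, before any quiesce state is consulted, so quiescing only makes need_run false inside blk_mq_run_hw_queue(); it does nothing to order the array read against the kfree() in blk_mq_realloc_hw_ctxs():

```c
/* Simplified illustration of the racy reader -- not verbatim kernel code. */
void run_hw_queues_sketch(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	for (i = 0; i < q->nr_hw_queues; i++) {
		/*
		 * The racy load: if another thread swaps and kfree()s
		 * q->queue_hw_ctx between reading the array pointer and
		 * this indexed read, we dereference freed memory.
		 * Quiescing the queue cannot prevent reaching this line.
		 */
		hctx = q->queue_hw_ctx[i];

		if (blk_mq_hctx_stopped(hctx))
			continue;

		/*
		 * Only here does quiesce matter: on a quiesced queue,
		 * blk_mq_run_hw_queue() computes need_run as false and
		 * returns without dispatching anything.
		 */
		blk_mq_run_hw_queue(hctx, async);
	}
}
```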
On Wed, Feb 23, 2022 at 07:26:01PM +0800, Yu Kuai wrote:
> blk_mq_realloc_hw_ctxs() will free 'queue_hw_ctx' (e.g. when updating
> submit_queues through configfs for null_blk), while it might still be
> used from another context (e.g. when switching the elevator to none):
>
> t1                                  t2
> elevator_switch
>  blk_mq_unquiesce_queue
>   blk_mq_run_hw_queues
>    queue_for_each_hw_ctx
>     // assembly code for hctx = (q)->queue_hw_ctx[i]
>     mov 0x48(%rbp),%rdx  -> read old queue_hw_ctx
>                                     __blk_mq_update_nr_hw_queues
>                                      blk_mq_realloc_hw_ctxs
>                                       hctxs = q->queue_hw_ctx
>                                       q->queue_hw_ctx = new_hctxs
>                                       kfree(hctxs)
>     movslq %ebx,%rax
>     mov (%rdx,%rax,8),%rdi  -> uaf
>
> This problem was found by code review, and I confirmed that the concurrent
> scenarios do exist (specifically, 'q->queue_hw_ctx' can be changed during
> blk_mq_run_hw_queues()); however, the UAF problem hasn't been reproduced
> yet without hacking the kernel.
>
> Since the queue is frozen in __blk_mq_update_nr_hw_queues(), fix the
> problem by protecting 'queue_hw_ctx' through RCU where it can be accessed
> without grabbing 'q_usage_counter'.
>
> Signed-off-by: Yu Kuai <yukuai3@huawei.com>
> ---
>  block/blk-mq.c         |  8 +++++++-
>  include/linux/blk-mq.h |  2 +-
>  include/linux/blkdev.h | 13 ++++++++++++-
>  3 files changed, 20 insertions(+), 3 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 6c59ffe765fd..79367457d555 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -3955,7 +3955,13 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
>  		if (hctxs)
>  			memcpy(new_hctxs, hctxs, q->nr_hw_queues *
>  			       sizeof(*hctxs));
> -		q->queue_hw_ctx = new_hctxs;
> +
> +		rcu_assign_pointer(q->queue_hw_ctx, new_hctxs);
> +		/*
> +		 * Make sure reading the old queue_hw_ctx from other
> +		 * context concurrently won't trigger uaf.
> +		 */
> +		synchronize_rcu();
>  		kfree(hctxs);
>  		hctxs = new_hctxs;
>  	}
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index d319ffa59354..edcf8ead76c6 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -918,7 +918,7 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
> 
>  #define queue_for_each_hw_ctx(q, hctx, i)				\
>  	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
> -	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
> +	     ({ hctx = queue_hctx((q), i); 1; }); (i)++)
> 
>  #define hctx_for_each_ctx(hctx, ctx, i)					\
>  	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 3bfc75a2a450..2018a4dd2028 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -354,7 +354,7 @@ struct request_queue {
>  	unsigned int		queue_depth;
> 
>  	/* hw dispatch queues */
> -	struct blk_mq_hw_ctx	**queue_hw_ctx;
> +	struct blk_mq_hw_ctx __rcu **queue_hw_ctx;
>  	unsigned int		nr_hw_queues;
> 
>  	/*
> @@ -622,6 +622,17 @@ static inline bool queue_is_mq(struct request_queue *q)
>  	return q->mq_ops;
>  }
> 
> +static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id)
> +{
> +	struct blk_mq_hw_ctx *hctx;
> +
> +	rcu_read_lock();
> +	hctx = *(rcu_dereference(q->queue_hw_ctx) + id);
> +	rcu_read_unlock();
> +
> +	return hctx;
> +}

queue_hctx() should be moved into linux/blk-mq.h; otherwise feel free to
add:

Reviewed-by: Ming Lei <ming.lei@redhat.com>

Also it should be fine to implement queue_for_each_hw_ctx() as a list; then
we can avoid the allocation for q->queue_hw_ctx without extra cost. I will
work in that direction to improve the code.

Thanks,
Ming
On 2022/02/25 10:40, Ming Lei wrote:
>> +static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id)
>> +{
>> +	struct blk_mq_hw_ctx *hctx;
>> +
>> +	rcu_read_lock();
>> +	hctx = *(rcu_dereference(q->queue_hw_ctx) + id);
>> +	rcu_read_unlock();
>> +
>> +	return hctx;
>> +}
>
> queue_hctx() should be moved into linux/blk-mq.h; otherwise feel free to
> add:
>
> Reviewed-by: Ming Lei <ming.lei@redhat.com>

Thanks for the review. I will send a new patch and move queue_hctx().

Kuai

>
> Also it should be fine to implement queue_for_each_hw_ctx() as a list; then
> we can avoid the allocation for q->queue_hw_ctx without extra cost. I will
> work in that direction to improve the code.
>
> Thanks,
> Ming
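Ming's closing idea, iterating hctxs through a per-queue list rather than a reallocated array, could look roughly like the sketch below. This is purely hypothetical: the type, field, and macro names are invented here, and the eventual in-tree rework may differ. The point is that adding or removing an hctx becomes a list operation, so no backing array ever has to be freed out from under a concurrent walker (a walk concurrent with removal would still need the RCU list primitives, e.g. list_for_each_entry_rcu()):

```c
/* Hypothetical sketch only -- type, field, and macro names are invented. */
#include <linux/list.h>

struct hctx_sketch {
	/* ... existing blk_mq_hw_ctx fields ... */
	struct list_head node;		/* linked into the queue's hctx list */
};

struct queue_sketch {
	/* ... existing request_queue fields ... */
	struct list_head hctx_list;	/* would replace queue_hw_ctx[] */
};

/*
 * No index, no backing array: growing or shrinking nr_hw_queues never
 * frees memory that a concurrent iteration still holds a pointer into.
 */
#define queue_for_each_hw_ctx_sketch(q, hctx) \
	list_for_each_entry(hctx, &(q)->hctx_list, node)
```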
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6c59ffe765fd..79367457d555 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3955,7 +3955,13 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 		if (hctxs)
 			memcpy(new_hctxs, hctxs, q->nr_hw_queues *
 			       sizeof(*hctxs));
-		q->queue_hw_ctx = new_hctxs;
+
+		rcu_assign_pointer(q->queue_hw_ctx, new_hctxs);
+		/*
+		 * Make sure reading the old queue_hw_ctx from other
+		 * context concurrently won't trigger uaf.
+		 */
+		synchronize_rcu();
 		kfree(hctxs);
 		hctxs = new_hctxs;
 	}
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d319ffa59354..edcf8ead76c6 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -918,7 +918,7 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
 
 #define queue_for_each_hw_ctx(q, hctx, i)				\
 	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
-	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
+	     ({ hctx = queue_hctx((q), i); 1; }); (i)++)
 
 #define hctx_for_each_ctx(hctx, ctx, i)					\
 	for ((i) = 0; (i) < (hctx)->nr_ctx &&				\
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3bfc75a2a450..2018a4dd2028 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -354,7 +354,7 @@ struct request_queue {
 	unsigned int		queue_depth;
 
 	/* hw dispatch queues */
-	struct blk_mq_hw_ctx	**queue_hw_ctx;
+	struct blk_mq_hw_ctx __rcu **queue_hw_ctx;
 	unsigned int		nr_hw_queues;
 
 	/*
@@ -622,6 +622,17 @@ static inline bool queue_is_mq(struct request_queue *q)
 	return q->mq_ops;
 }
 
+static inline struct blk_mq_hw_ctx *queue_hctx(struct request_queue *q, int id)
+{
+	struct blk_mq_hw_ctx *hctx;
+
+	rcu_read_lock();
+	hctx = *(rcu_dereference(q->queue_hw_ctx) + id);
+	rcu_read_unlock();
+
+	return hctx;
+}
+
 #ifdef CONFIG_PM
 static inline enum rpm_status queue_rpm_status(struct request_queue *q)
 {
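The patch is an instance of the classic RCU publish/retire pattern: publish the new array with rcu_assign_pointer(), wait for every pre-existing reader with synchronize_rcu(), then free the old array. Below is a minimal, generic sketch of that pattern (kernel-context code, so it builds only in-tree; "table" and "item" are illustrative stand-ins, not names from the patch). Note the same caveat that applies to queue_hctx(): the reader returns its pointer after rcu_read_unlock(), so the element itself must stay alive by other means — in the patch, the frozen queue holding 'q_usage_counter' — and RCU protects only the array of pointers:

```c
/* Generic RCU publish/retire sketch (kernel context; illustrative names). */
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>

struct item;
static struct item __rcu **table;	/* stands in for q->queue_hw_ctx */

/* Reader: lockless lookup of slot i in the current array. */
static struct item *table_lookup(int i)
{
	struct item *it;

	rcu_read_lock();
	it = *(rcu_dereference(table) + i);
	rcu_read_unlock();

	/* 'it' must be pinned by other means (cf. the frozen queue). */
	return it;
}

/* Updater: publish a larger copy, wait out readers, free the old array. */
static int table_grow(int old_n, int new_n)
{
	struct item **old = rcu_dereference_protected(table, true);
	struct item **new = kcalloc(new_n, sizeof(*new), GFP_KERNEL);

	if (!new)
		return -ENOMEM;

	if (old)
		memcpy(new, old, old_n * sizeof(*new));
	rcu_assign_pointer(table, new);	/* publish the new array */
	synchronize_rcu();		/* every pre-existing reader is done */
	kfree(old);			/* now safe to free the old array */
	return 0;
}
```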
blk_mq_realloc_hw_ctxs() will free 'queue_hw_ctx' (e.g. when updating
submit_queues through configfs for null_blk), while it might still be
used from another context (e.g. when switching the elevator to none):

t1                                  t2
elevator_switch
 blk_mq_unquiesce_queue
  blk_mq_run_hw_queues
   queue_for_each_hw_ctx
    // assembly code for hctx = (q)->queue_hw_ctx[i]
    mov 0x48(%rbp),%rdx  -> read old queue_hw_ctx
                                    __blk_mq_update_nr_hw_queues
                                     blk_mq_realloc_hw_ctxs
                                      hctxs = q->queue_hw_ctx
                                      q->queue_hw_ctx = new_hctxs
                                      kfree(hctxs)
    movslq %ebx,%rax
    mov (%rdx,%rax,8),%rdi  -> uaf

This problem was found by code review, and I confirmed that the concurrent
scenarios do exist (specifically, 'q->queue_hw_ctx' can be changed during
blk_mq_run_hw_queues()); however, the UAF problem hasn't been reproduced
yet without hacking the kernel.

Since the queue is frozen in __blk_mq_update_nr_hw_queues(), fix the
problem by protecting 'queue_hw_ctx' through RCU where it can be accessed
without grabbing 'q_usage_counter'.

Signed-off-by: Yu Kuai <yukuai3@huawei.com>
---
 block/blk-mq.c         |  8 +++++++-
 include/linux/blk-mq.h |  2 +-
 include/linux/blkdev.h | 13 ++++++++++++-
 3 files changed, 20 insertions(+), 3 deletions(-)