diff mbox series

[RFC] block: fix access of uninitialized pointer address in bt_for_each()

Message ID 20200417125134.45117-1-yukuai3@huawei.com (mailing list archive)
State New, archived
Headers show
Series [RFC] block: fix access of uninitialized pointer address in bt_for_each() | expand

Commit Message

Yu Kuai April 17, 2020, 12:51 p.m. UTC
I recently got a KASAN warning like this in our 4.19 kernel:

 ==================================================================
 BUG: KASAN: slab-out-of-bounds in bt_for_each+0x1dc/0x2c0
 Read of size 8 at addr ffff8000c0865000 by task sh/2023305

 Call trace:
 dump_backtrace+0x0/0x310
 show_stack+0x28/0x38
 dump_stack+0xd8/0x108
 print_address_description+0x68/0x2d0
 kasan_report+0x124/0x2e0
 __asan_load8+0x88/0xb0
 bt_for_each+0x1dc/0x2c0
 blk_mq_queue_tag_busy_iter+0x1f0/0x3e8
 blk_mq_in_flight+0xb4/0xe0
 part_in_flight+0x124/0x178
 part_round_stats+0x128/0x3b0
 blk_account_io_start+0x2b4/0x3f0
 blk_mq_bio_to_request+0x170/0x258
 blk_mq_make_request+0x734/0xdd8
 generic_make_request+0x388/0x740
 submit_bio+0xd8/0x3d0
 ext4_io_submit+0xb4/0xe0 [ext4]
 ext4_writepages+0xb44/0x1c00 [ext4]
 do_writepages+0xc8/0x1f8
 __filemap_fdatawrite_range+0x200/0x2a0
 filemap_flush+0x30/0x40
 ext4_alloc_da_blocks+0x54/0x200 [ext4]
 ext4_release_file+0xfc/0x150 [ext4]
 __fput+0x15c/0x3a8
 ____fput+0x24/0x30
 task_work_run+0x1a4/0x208
 do_notify_resume+0x1a8/0x1c0
 work_pending+0x8/0x10

 Allocated by task 3515778:
 kasan_kmalloc+0xe0/0x190
 kmem_cache_alloc_trace+0x18c/0x418
 alloc_pipe_info+0x74/0x240
 create_pipe_files+0x74/0x2f8
 __do_pipe_flags+0x48/0x168
 do_pipe2+0xa0/0x1d0
 __arm64_sys_pipe2+0x3c/0x50
 el0_svc_common+0xb4/0x1d8
 el0_svc_handler+0x50/0xa8
 el0_svc+0x8/0xc

 Freed by task 3515778:
 __kasan_slab_free+0x120/0x228
 kasan_slab_free+0x10/0x18
 kfree+0x88/0x3d8
 free_pipe_info+0x150/0x178
 put_pipe_info+0x138/0x1c0
 pipe_release+0xe8/0x120
 __fput+0x15c/0x3a8
 ____fput+0x24/0x30
 task_work_run+0x1a4/0x208
 do_notify_resume+0x1a8/0x1c0
 work_pending+0x8/0x10

 The buggy address belongs to the object at ffff8000c0864f00 which belongs to the cache kmalloc-256 of size 256
 The buggy address is located 0 bytes to the right of the 256-byte region [ffff8000c0864f00, ffff8000c0865000)
 The buggy address belongs to the page:
 page:ffff7e0003021900 count:1 mapcount:0 mapping:ffff80036d00fc00 index:0x0 compound_mapcount: 0
 flags: 0xffffe0000008100(slab|head)
 raw: 0ffffe0000008100 ffff7e0003617f88 ffff7e000d1a6208 ffff80036d00fc00
 raw: 0000000000000000 0000000000150015 00000001ffffffff 0000000000000000
 page dumped because: kasan: bad access detected

 Memory state around the buggy address:
 ffff8000c0864f00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff8000c0864f80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 >ffff8000c0865000: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
 ^
 ffff8000c0865080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ffff8000c0865100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
 ==================================================================

After looking into it, I think it's because bt_for_each() accessed
uninitialized pointer address:

thread1			thread2
blk_mq_alloc_tag_set
...
blk_mq_init_queue
...
submit_bio(bio1)	submit_bio(bio2)
 blk_mq_get_request
  blk_mq_get_tag
  			 ...
  			 bt_for_each
  			  bt_iter
  			   rq = tags->rqs[b]
  			    rq->q		----> here
  blk_mq_rq_ctx_init
   tags->rqs[a] = rq

blk_mq_get_tag() is called before blk_mq_rq_ctx_init(), which leaves a
window for bt_for_each() to access 'tags->rqs[tag]->q' before
'tags->rqs[tag]' is set in blk_mq_rq_ctx_init(). While blk_mq_init_tags()
is using 'kcalloc()' for 'tags->rqs'. And I think the problem exists in
mainline, too.

The problem hasn't been reproduced unless I manually sleep a while
before 'tags->rqs[tag]' is set:

@@ -275,6 +275,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        struct request *rq = tags->static_rqs[tag];
        req_flags_t rq_flags = 0;
+       static int debug_count = 0;

        if (data->flags & BLK_MQ_REQ_INTERNAL) {
                rq->tag = -1;
@@ -286,6 +287,12 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
                }
                rq->tag = tag;
                rq->internal_tag = -1;
+               if (!strcmp(dev_name(data->q->backing_dev_info->dev), "250:0")) {
+                       if (debug_count == 0) {
+                               debug_count++;
+                               msleep(5000);
+                       }
+               }
                data->hctx->tags->rqs[rq->tag] = rq;
        }

BTW, I noticed there is a similar problem that haven't been solved yet:
https://lore.kernel.org/linux-block/1545261885.185366.488.camel@acm.org/

I'm trying to fix the problem by replacing 'kcalloc' with 'kzalloc' for
'tags->rqs', and set 'tags->rqs[tag]' to 'NULL' before putting the tag.

---
 block/blk-mq.c | 3 ++-
 block/blk-mq.h | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

Comments

Bart Van Assche April 17, 2020, 2:26 p.m. UTC | #1
On 2020-04-17 05:51, yu kuai wrote:
> I recently got a KASAN warning like this in our 4.19 kernel:
> 
>  ==================================================================
>  BUG: KASAN: slab-out-of-bounds in bt_for_each+0x1dc/0x2c0
>  Read of size 8 at addr ffff8000c0865000 by task sh/2023305
> 
>  Call trace:
>  dump_backtrace+0x0/0x310
>  show_stack+0x28/0x38
>  dump_stack+0xd8/0x108
>  print_address_description+0x68/0x2d0
>  kasan_report+0x124/0x2e0
>  __asan_load8+0x88/0xb0
>  bt_for_each+0x1dc/0x2c0
>  blk_mq_queue_tag_busy_iter+0x1f0/0x3e8
>  blk_mq_in_flight+0xb4/0xe0
>  part_in_flight+0x124/0x178
>  part_round_stats+0x128/0x3b0
>  blk_account_io_start+0x2b4/0x3f0
>  blk_mq_bio_to_request+0x170/0x258
>  blk_mq_make_request+0x734/0xdd8
>  generic_make_request+0x388/0x740
>  submit_bio+0xd8/0x3d0
>  ext4_io_submit+0xb4/0xe0 [ext4]
>  ext4_writepages+0xb44/0x1c00 [ext4]
>  do_writepages+0xc8/0x1f8
>  __filemap_fdatawrite_range+0x200/0x2a0
>  filemap_flush+0x30/0x40
>  ext4_alloc_da_blocks+0x54/0x200 [ext4]
>  ext4_release_file+0xfc/0x150 [ext4]
>  __fput+0x15c/0x3a8
>  ____fput+0x24/0x30
>  task_work_run+0x1a4/0x208
>  do_notify_resume+0x1a8/0x1c0
>  work_pending+0x8/0x10
> 
>  Allocated by task 3515778:
>  kasan_kmalloc+0xe0/0x190
>  kmem_cache_alloc_trace+0x18c/0x418
>  alloc_pipe_info+0x74/0x240
>  create_pipe_files+0x74/0x2f8
>  __do_pipe_flags+0x48/0x168
>  do_pipe2+0xa0/0x1d0
>  __arm64_sys_pipe2+0x3c/0x50
>  el0_svc_common+0xb4/0x1d8
>  el0_svc_handler+0x50/0xa8
>  el0_svc+0x8/0xc
> 
>  Freed by task 3515778:
>  __kasan_slab_free+0x120/0x228
>  kasan_slab_free+0x10/0x18
>  kfree+0x88/0x3d8
>  free_pipe_info+0x150/0x178
>  put_pipe_info+0x138/0x1c0
>  pipe_release+0xe8/0x120
>  __fput+0x15c/0x3a8
>  ____fput+0x24/0x30
>  task_work_run+0x1a4/0x208
>  do_notify_resume+0x1a8/0x1c0
>  work_pending+0x8/0x10

The alloc/free info refers to a data structure owned by the pipe
implementation. The use-after-free report refers to a data structure
owned by the block layer. How can that report make sense?

> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 7ed16ed13976..48b74d0085c7 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -485,6 +485,7 @@ static void __blk_mq_free_request(struct request *rq)
>  	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
>  	const int sched_tag = rq->internal_tag;
>  
> +	hctx->tags->rqs[rq->tag] = NULL;
>  	if (rq->tag != -1)
>  		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
>  	if (sched_tag != -1)

Can the above change trigger the following assignment?

hctx->tags->rqs[-1] = NULL?

> @@ -1999,7 +2000,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
>  	if (!tags)
>  		return NULL;
>  
> -	tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
> +	tags->rqs = kzalloc_node(nr_tags, sizeof(struct request *),
>  				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
>  				 node);

From include/linux/slab.h:

static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags,
                                 int node)
{
	return kmalloc_array_node(n, size, flags | __GFP_ZERO, node);
}

I think this means that kcalloc_node() already zeroes the allocated
memory and hence that changing kcalloc() into kzalloc() is not necessary.

>  	if (!tags->rqs) {
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index a6094c27b827..2a55292d3d51 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -196,6 +196,7 @@ static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
>  	if (rq->tag == -1 || rq->internal_tag == -1)
>  		return;
>  
> +	hctx->tags->rqs[rq->tag] = NULL;
>  	__blk_mq_put_driver_tag(hctx, rq);
>  }
>  
> @@ -207,6 +208,7 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
>  		return;
>  
>  	hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
> +	hctx->tags->rqs[rq->tag] = NULL;
>  	__blk_mq_put_driver_tag(hctx, rq);
>  }

I don't think the above changes are sufficient to fix the
use-after-free. Has it been considered to free the memory that backs
tags->bitmap_tags only after an RCU grace period has expired? See also
blk_mq_free_tags().

Thanks,

Bart.
Ming Lei April 18, 2020, 2:11 a.m. UTC | #2
On Fri, Apr 17, 2020 at 08:51:34PM +0800, yu kuai wrote:
> I recently got a KASAN warning like this in our 4.19 kernel:
> 
>  ==================================================================
>  BUG: KASAN: slab-out-of-bounds in bt_for_each+0x1dc/0x2c0
>  Read of size 8 at addr ffff8000c0865000 by task sh/2023305
> 
>  Call trace:
>  dump_backtrace+0x0/0x310
>  show_stack+0x28/0x38
>  dump_stack+0xd8/0x108
>  print_address_description+0x68/0x2d0
>  kasan_report+0x124/0x2e0
>  __asan_load8+0x88/0xb0
>  bt_for_each+0x1dc/0x2c0
>  blk_mq_queue_tag_busy_iter+0x1f0/0x3e8
>  blk_mq_in_flight+0xb4/0xe0
>  part_in_flight+0x124/0x178
>  part_round_stats+0x128/0x3b0

This code path is killed since 5b18b5a73760 ("block: delete part_round_stats and
switch to less precise counting").

However, it still can be triggered via readding proc & sysfs iostat.

Jian Chao worked patches for this issue before, please refer to:

https://lore.kernel.org/linux-block/1553492318-1810-1-git-send-email-jianchao.w.wang@oracle.com/

but didn't get chance to merge.

Thanks, 
Ming
Yu Kuai April 18, 2020, 3:24 a.m. UTC | #3
on 2020/4/17 22:26, Bart Van Assche wrote:

> The alloc/free info refers to a data structure owned by the pipe
> implementation. The use-after-free report refers to a data structure
> owned by the block layer. How can that report make sense?

Indeed, I'm confused here, too.

>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 7ed16ed13976..48b74d0085c7 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -485,6 +485,7 @@ static void __blk_mq_free_request(struct request *rq)
>>   	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
>>   	const int sched_tag = rq->internal_tag;
>>   
>> +	hctx->tags->rqs[rq->tag] = NULL;
>>   	if (rq->tag != -1)
>>   		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
>>   	if (sched_tag != -1)
> 
> Can the above change trigger the following assignment?
> 
> hctx->tags->rqs[-1] = NULL?

My bad, should be inside 'if'.

> static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags,
>                                   int node)
> {
> 	return kmalloc_array_node(n, size, flags | __GFP_ZERO, node);
> }
> 
> I think this means that kcalloc_node() already zeroes the allocated
> memory and hence that changing kcalloc() into kzalloc() is not necessary.

You are right.

>> @@ -196,6 +196,7 @@ static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
>>   	if (rq->tag == -1 || rq->internal_tag == -1)
>>   		return;
>>   
>> +	hctx->tags->rqs[rq->tag] = NULL;
>>   	__blk_mq_put_driver_tag(hctx, rq);
>>   }
>>   
>> @@ -207,6 +208,7 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
>>   		return;
>>   
>>   	hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
>> +	hctx->tags->rqs[rq->tag] = NULL;
>>   	__blk_mq_put_driver_tag(hctx, rq);
>>   }
> 
> I don't think the above changes are sufficient to fix the
> use-after-free. Has it been considered to free the memory that backs
> tags->bitmap_tags only after an RCU grace period has expired? See also
> blk_mq_free_tags().

As you pointed out, kcalloc_node() already zeroes out the memory. What I 
don't understand is how a 'slab-out-of-bounds in bt_for_each' could be
triggered instead of a use-after-free.

Thanks!
Yu Kuai
Yu Kuai April 18, 2020, 9:42 a.m. UTC | #4
On 2020/4/17 22:26, Bart Van Assche wrote:

> I don't think the above changes are sufficient to fix the
> use-after-free. Has it been considered to free the memory that backs
> tags->bitmap_tags only after an RCU grace period has expired? See also
> blk_mq_free_tags().
> 
I tried to reproduce the problem:
1. manually sleep before set 'tags->rqs'
@@ -280,6 +280,7 @@ static struct request *blk_mq_rq_ctx_init(struct 
blk_mq_alloc_data *data,
  	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
  	struct request *rq = tags->static_rqs[tag];
  	req_flags_t rq_flags = 0;
+	static int debug_time = 0;

  	if (data->flags & BLK_MQ_REQ_INTERNAL) {
  		rq->tag = -1;
@@ -291,6 +292,15 @@ static struct request *blk_mq_rq_ctx_init(struct 
blk_mq_alloc_data *data,
  		}
  		rq->tag = tag;
  		rq->internal_tag = -1;
+		if (!strcmp(dev_name(data->q->backing_dev_info->dev), "8:0")) {
+			if (debug_time == 0) {
+				printk("This is sda, will sleep 5s, tag(%d)\n", tag);
+				debug_time++;
+				msleep(5000);
+			} else {
+				printk("sda: get tag (%d)\n", tag);
+			}
+		}
  		data->hctx->tags->rqs[rq->tag] = rq;
  	}
2. echo none > /sys/block/sda/queue/scheduler
3. dd if=/dev/zero of=/dev/sda bs=4k count=1 oflag=direct
will sleep 5s
4. dd if=/dev/zero of=/dev/sda bs=4k count=1 oflag=direct
do this before step 3 finish.

Test result:
[   60.585789] This is sda, will sleep 5s, tag(42)
[   61.983732] sda: get tag (117)
[   61.988268] 
==================================================================
[   61.988933] BUG: KASAN: use-after-free in bt_iter+0x29e/0x310
[   61.989446] Read of size 8 at addr ffff88824f5d8c00 by task dd/2659
[   61.989996]
[   61.990136] CPU: 2 PID: 2659 Comm: dd Not tainted 
4.19.90-00001-g9c3fb8226112-dirty #44
[   61.990830] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), 
BIOS 1.12.0-2.fc30 04/01/2014
[   61.991591] Call Trace:
[   61.991819]  dump_stack+0x108/0x15f
[   61.992127]  print_address_description+0xa5/0x372
[   61.992551]  kasan_report.cold+0x236/0x2a8
[   61.992909]  ? bt_iter+0x29e/0x310
[   61.993210]  __asan_report_load8_noabort+0x25/0x30
[   61.993625]  bt_iter+0x29e/0x310
[   61.993917]  blk_mq_queue_tag_busy_iter+0x405/0x910
[   61.994342]  ? __blkdev_issue_zeroout+0x190/0x190
[   61.994759]  ? blk_mq_put_tag+0x1a0/0x1a0
[   61.995111]  ? serial8250_start_tx+0xa10/0xa10
[   61.995501]  ? __blkdev_issue_zeroout+0x190/0x190
[   61.995916]  blk_mq_in_flight+0xda/0x140
[   61.996279]  ? blk_mq_end_request+0x3a0/0x3a0
[   61.996665]  part_in_flight+0x59/0x300
[   61.996994]  part_round_stats+0x332/0x6d0
[   61.997345]  ? blk_mq_get_tag+0x7d5/0xb90
[   61.997702]  ? blk_requeue_request+0x2b0/0x2b0
[   61.998090]  ? vprintk_func+0x6c/0x1a2
[   61.998421]  ? printk+0xb9/0xf1
[   61.998700]  blk_account_io_start+0x408/0x810
[   61.999077]  ? blkg_create+0x622/0x1010
[   61.999413]  ? blk_end_request+0xc0/0xc0
[   61.999763]  ? blk_rq_bio_prep+0xc2/0x330
[   62.000111]  blk_mq_bio_to_request+0x238/0x3c0
[   62.000500]  blk_mq_make_request+0x927/0x1690
[   62.000891]  ? blk_mq_try_issue_directly+0x1a0/0x1a0
[   62.001321]  ? blk_put_request+0x130/0x130
[   62.001683]  generic_make_request+0x53b/0xff0
[   62.002071]  ? direct_make_request+0x360/0x360
[   62.002464]  submit_bio+0xa8/0x3d0
[   62.002763]  ? generic_make_request+0xff0/0xff0
[   62.003153]  ? bio_phys_segments+0xc0/0xc0
[   62.003516]  __blkdev_direct_IO_simple+0x400/0xa90
[   62.003934]  ? blkdev_bio_end_io+0x5f0/0x5f0
[   62.004309]  ? kasan_check_read+0x1d/0x30
[   62.004663]  ? mutex_lock+0xa6/0x110
[   62.004975]  ? __handle_mm_fault+0x19ed/0x36c0
[   62.005366]  ? kasan_check_write+0x20/0x30
[   62.005722]  ? mutex_unlock+0x25/0x70
[   62.006040]  ? __pmd_alloc+0x4d0/0x4d0
[   62.006370]  ? bdput+0x50/0x50
[   62.006645]  ? handle_mm_fault+0x3f8/0x840
[   62.007002]  blkdev_direct_IO+0x93a/0xfc0
[   62.007354]  ? inode_owner_or_capable+0x1e0/0x1e0
[   62.007769]  ? mem_cgroup_oom_trylock+0x1c0/0x1c0
[   62.008175]  ? kasan_alloc_pages+0x3c/0x50
[   62.008534]  ? __blkdev_direct_IO_simple+0xa90/0xa90
[   62.008965]  ? current_time+0xe0/0x180
[   62.009289]  ? timespec64_trunc+0x190/0x190
[   62.009660]  ? generic_update_time+0x1aa/0x330
[   62.010051]  generic_file_direct_write+0x1d1/0x490
[   62.010471]  __generic_file_write_iter+0x2c1/0x680
[   62.010892]  blkdev_write_iter+0x1ed/0x420
[   62.011247]  ? blkdev_close+0xe0/0xe0
[   62.011569]  ? __pte_alloc+0x350/0x350
[   62.011895]  ? vvar_fault+0xcb/0xe0
[   62.012201]  ? _cond_resched+0x25/0x50
[   62.012538]  ? read_iter_zero+0x78/0x1a0
[   62.012884]  __vfs_write+0x495/0x760
[   62.013199]  ? __pmd_alloc+0x4d0/0x4d0
[   62.013537]  ? kernel_read+0x150/0x150
[   62.013867]  ? __fsnotify_inode_delete+0x30/0x30
[   62.014272]  vfs_write+0x17b/0x530
[   62.014570]  ksys_write+0x103/0x270
[   62.014876]  ? __x64_sys_read+0xc0/0xc0
[   62.015218]  ? kasan_check_write+0x20/0x30
[   62.015580]  ? fput+0x29/0x1b0
[   62.015851]  ? filp_close+0x119/0x170
[   62.016171]  __x64_sys_write+0x77/0xc0
[   62.016502]  do_syscall_64+0x106/0x260
[   62.016832]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
[   62.017274] RIP: 0033:0x7f84f6274130
[   62.017591] Code: 73 01 c3 48 8b 0d 58 ed 2c 00 f7 d8 64 89 01 48 83 
c8 ff c3 66 0f 1f 44 00 00 83 3d b9 45 2d 00 00 75 10 b8 01 00 00 00 0f 
05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e4
[   62.019186] RSP: 002b:00007ffd43abdcd8 EFLAGS: 00000246 ORIG_RAX: 
0000000000000001
[   62.019833] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 
00007f84f6274130
[   62.020460] RDX: 0000000000001000 RSI: 000055976e38b000 RDI: 
0000000000000001
[   62.021090] RBP: 0000000000001000 R08: 000055976e38a030 R09: 
0000000000000077
[   62.021709] R10: 00007f84f6543b38 R11: 0000000000000246 R12: 
000055976e38b000
[   62.022328] R13: 0000000000000000 R14: 0000000000000000 R15: 
000055976e38b000
[   62.022941]
[   62.023080] The buggy address belongs to the page:
[   62.023502] page:ffffea00093d7600 count:0 mapcount:0 
mapping:0000000000000000 index:0x0
[   62.024187] flags: 0x2fffff80000000()
[   62.024523] raw: 002fffff80000000 ffffea00093d7608 ffffea00093d7608 
0000000000000000
[   62.025186] raw: 0000000000000000 0000000000000000 00000000ffffffff 
0000000000000000
[   62.025852] page dumped because: kasan: bad access detected
[   62.026332]
[   62.026469] Memory state around the buggy address:
[   62.026887]  ffff88824f5d8b00: ff ff ff ff ff ff ff ff ff ff ff ff ff 
ff ff ff
[   62.027513]  ffff88824f5d8b80: ff ff ff ff ff ff ff ff ff ff ff ff ff 
ff ff ff
[   62.028127] >ffff88824f5d8c00: ff ff ff ff ff ff ff ff ff ff ff ff ff 
ff ff ff
[   62.028752]                    ^
[   62.029036]  ffff88824f5d8c80: ff ff ff ff ff ff ff ff ff ff ff ff ff 
ff ff ff
[   62.029657]  ffff88824f5d8d00: ff ff ff ff ff ff ff ff ff ff ff ff ff 
ff ff ff
[   62.030284] 
==================================================================

And I tried to fix the problem by setting tags->rqs to NULL in 
blk_mq_free_rqs(), and use rcu to avoid concurrency between 
bt_for_each() and blk_mq_free_rqs(). (The rcu implementation is from 
https://lore.kernel.org/linux-block/71fb9eff-43eb-24aa-fb67-be56a3a97983@kernel.dk/)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 256fa1ccc2bd..c3ae8455e4eb 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -605,12 +605,22 @@ struct blk_flush_queue 
*blk_alloc_flush_queue(struct request_queue *q,
  	return NULL;
  }

+static void blk_fq_rcu_free(struct work_struct *work)
+{
+	struct blk_flush_queue *fq = container_of(to_rcu_work(work),
+							struct blk_flush_queue,
+							rcu_work);
+
+	kfree(fq->flush_rq);
+	kfree(fq);
+}
+
  void blk_free_flush_queue(struct blk_flush_queue *fq)
  {
  	/* bio based request queue hasn't flush queue */
  	if (!fq)
  		return;

-	kfree(fq->flush_rq);
-	kfree(fq);
+	INIT_RCU_WORK(&fq->rcu_work, blk_fq_rcu_free);
+	queue_rcu_work(system_wq, &fq->rcu_work);
  }
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 41317c50a446..f744dd144f06 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -229,6 +229,8 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned 
int bitnr, void *data)
  	if (!reserved)
  		bitnr += tags->nr_reserved_tags;
  	rq = tags->rqs[bitnr];
+//	if (!strcmp(dev_name(hctx->queue->backing_dev_info->dev), "8:0"))
+//		printk("tags->rqs[%d]: %p",bitnr, rq);

  	/*
  	 * We can hit rq == NULL here, because the tagging functions
@@ -249,7 +251,9 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx, 
struct sbitmap_queue *bt,
  		.reserved = reserved,
  	};

+	rcu_read_lock();
  	sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
+	rcu_read_unlock();
  }

  struct bt_tags_iter_data {
@@ -290,8 +294,11 @@ static void bt_tags_for_each(struct blk_mq_tags 
*tags, struct sbitmap_queue *bt,
  		.reserved = reserved,
  	};

-	if (tags->rqs)
+	if (tags->rqs) {
+		rcu_read_lock();
  		sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
+		rcu_read_unlock();
+	}
  }

  static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8a7c3d841661..2c2dd578a27a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c

@@ -1932,12 +1942,38 @@ static blk_qc_t blk_mq_make_request(struct 
request_queue *q, struct bio *bio)
  	return cookie;
  }

+struct rq_page_list {
+	struct rcu_work rcu_work;
+	struct list_head list;
+	bool on_stack;
+};
+
+static void blk_mq_rcu_free_pages(struct work_struct *work)
+{
+	struct rq_page_list *rpl = container_of(to_rcu_work(work),
+						struct rq_page_list, rcu_work);
+	struct page *page;
+
+	while (!list_empty(&rpl->list)) {
+		page = list_first_entry(&rpl->list, struct page, lru);
+		list_del_init(&page->lru);
+		/*
+		 * Remove kmemleak object previously allocated in
+		 * blk_mq_init_rq_map().
+		 */
+		kmemleak_free(page_address(page));
+		__free_pages(page, page->private);
+	}
+	if (!rpl->on_stack)
+		kfree(rpl);
+}
+
  void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
  		     unsigned int hctx_idx)
  {
-	struct page *page;
+	struct rq_page_list *rpl;

-	if (tags->rqs && set->ops->exit_request) {
+	if (tags->rqs) {
  		int i;

  		for (i = 0; i < tags->nr_tags; i++) {
@@ -1945,20 +1981,38 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, 
struct blk_mq_tags *tags,

  			if (!rq)
  				continue;
-			set->ops->exit_request(set, rq, hctx_idx);
+			if (set->ops->exit_request)
+				set->ops->exit_request(set, rq, hctx_idx);
  			tags->static_rqs[i] = NULL;
+			if (set->tags[hctx_idx]->rqs[rq->tag] == rq)
+				set->tags[hctx_idx]->rqs[rq->tag] = NULL;
  		}
  	}

-	while (!list_empty(&tags->page_list)) {
-		page = list_first_entry(&tags->page_list, struct page, lru);
-		list_del_init(&page->lru);
+	if (list_empty(&tags->page_list))
+		return;
+
+	rpl = kmalloc(sizeof(*rpl), GFP_NOIO);
+	if (rpl) {
+		INIT_LIST_HEAD(&rpl->list);
+		list_splice_init(&tags->page_list, &rpl->list);
+
+		/* Punt to RCU free, so we don't race with tag iteration */
+		INIT_RCU_WORK(&rpl->rcu_work, blk_mq_rcu_free_pages);
+		rpl->on_stack = false;
+		queue_rcu_work(system_wq, &rpl->rcu_work);
+	} else {
+		struct rq_page_list stack_rpl;
+
  		/*
-		 * Remove kmemleak object previously allocated in
-		 * blk_mq_init_rq_map().
+		 * Fail alloc, punt to on-stack, we just have to synchronize
+		 * RCU first to ensure readers are done.
  		 */
-		kmemleak_free(page_address(page));
-		__free_pages(page, page->private);
+		INIT_LIST_HEAD(&stack_rpl.list);
+		list_splice_init(&tags->page_list, &stack_rpl.list);
+		stack_rpl.on_stack = true;
+		synchronize_rcu();
+		blk_mq_rcu_free_pages(&stack_rpl.rcu_work.work);
  	}
  }

diff --git a/block/blk.h b/block/blk.h
index 1a5b67b57e6b..8378074ad51e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -35,6 +35,8 @@ struct blk_flush_queue {
  	 */
  	struct request		*orig_rq;
  	spinlock_t		mq_flush_lock;
+
+	struct rcu_work		rcu_work;
  };

  extern struct kmem_cache *blk_requestq_cachep;
Bart Van Assche April 18, 2020, 3:26 p.m. UTC | #5
On 2020-04-18 02:42, yukuai (C) wrote:
> [   61.988933] BUG: KASAN: use-after-free in bt_iter+0x29e/0x310
> [   61.989446] Read of size 8 at addr ffff88824f5d8c00 by task dd/2659
> [   61.989996]
> [   61.990136] CPU: 2 PID: 2659 Comm: dd Not tainted
> 4.19.90-00001-g9c3fb8226112-dirty #44

Hi Yu Kuai,

So this use-after-free was encountered with kernel version 4.19? Please
develop block layer kernel patches against Jens' for-next branch from
git://git.kernel.dk/linux-block. If it wouldn't be possible to reproduce
this issue with Jens' for-next branch, the next step is to check which
patch(es) fixed this issue and to ask Greg KH to backport these patches
to the stable tree.

Thanks,

Bart.
diff mbox series

Patch

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 7ed16ed13976..48b74d0085c7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -485,6 +485,7 @@  static void __blk_mq_free_request(struct request *rq)
 	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
 	const int sched_tag = rq->internal_tag;
 
+	hctx->tags->rqs[rq->tag] = NULL;
 	if (rq->tag != -1)
 		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
 	if (sched_tag != -1)
@@ -1999,7 +2000,7 @@  struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 	if (!tags)
 		return NULL;
 
-	tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
+	tags->rqs = kzalloc_node(nr_tags, sizeof(struct request *),
 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
 				 node);
 	if (!tags->rqs) {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index a6094c27b827..2a55292d3d51 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -196,6 +196,7 @@  static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
 	if (rq->tag == -1 || rq->internal_tag == -1)
 		return;
 
+	hctx->tags->rqs[rq->tag] = NULL;
 	__blk_mq_put_driver_tag(hctx, rq);
 }
 
@@ -207,6 +208,7 @@  static inline void blk_mq_put_driver_tag(struct request *rq)
 		return;
 
 	hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
+	hctx->tags->rqs[rq->tag] = NULL;
 	__blk_mq_put_driver_tag(hctx, rq);
 }