diff mbox series

block: reexpand iov_iter after read/write

Message ID 20210401071807.3328235-1-yangerkun@huawei.com (mailing list archive)
State New, archived
Headers show
Series block: reexpand iov_iter after read/write | expand

Commit Message

yangerkun April 1, 2021, 7:18 a.m. UTC
We get a bug:

BUG: KASAN: slab-out-of-bounds in iov_iter_revert+0x11c/0x404
lib/iov_iter.c:1139
Read of size 8 at addr ffff0000d3fb11f8 by task

CPU: 0 PID: 12582 Comm: syz-executor.2 Not tainted
5.10.0-00843-g352c8610ccd2 #2
Hardware name: linux,dummy-virt (DT)
Call trace:
 dump_backtrace+0x0/0x2d0 arch/arm64/kernel/stacktrace.c:132
 show_stack+0x28/0x34 arch/arm64/kernel/stacktrace.c:196
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x110/0x164 lib/dump_stack.c:118
 print_address_description+0x78/0x5c8 mm/kasan/report.c:385
 __kasan_report mm/kasan/report.c:545 [inline]
 kasan_report+0x148/0x1e4 mm/kasan/report.c:562
 check_memory_region_inline mm/kasan/generic.c:183 [inline]
 __asan_load8+0xb4/0xbc mm/kasan/generic.c:252
 iov_iter_revert+0x11c/0x404 lib/iov_iter.c:1139
 io_read fs/io_uring.c:3421 [inline]
 io_issue_sqe+0x2344/0x2d64 fs/io_uring.c:5943
 __io_queue_sqe+0x19c/0x520 fs/io_uring.c:6260
 io_queue_sqe+0x2a4/0x590 fs/io_uring.c:6326
 io_submit_sqe fs/io_uring.c:6395 [inline]
 io_submit_sqes+0x4c0/0xa04 fs/io_uring.c:6624
 __do_sys_io_uring_enter fs/io_uring.c:9013 [inline]
 __se_sys_io_uring_enter fs/io_uring.c:8960 [inline]
 __arm64_sys_io_uring_enter+0x190/0x708 fs/io_uring.c:8960
 __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline]
 invoke_syscall arch/arm64/kernel/syscall.c:48 [inline]
 el0_svc_common arch/arm64/kernel/syscall.c:158 [inline]
 do_el0_svc+0x120/0x290 arch/arm64/kernel/syscall.c:227
 el0_svc+0x1c/0x28 arch/arm64/kernel/entry-common.c:367
 el0_sync_handler+0x98/0x170 arch/arm64/kernel/entry-common.c:383
 el0_sync+0x140/0x180 arch/arm64/kernel/entry.S:670

Allocated by task 12570:
 stack_trace_save+0x80/0xb8 kernel/stacktrace.c:121
 kasan_save_stack mm/kasan/common.c:48 [inline]
 kasan_set_track mm/kasan/common.c:56 [inline]
 __kasan_kmalloc+0xdc/0x120 mm/kasan/common.c:461
 kasan_kmalloc+0xc/0x14 mm/kasan/common.c:475
 __kmalloc+0x23c/0x334 mm/slub.c:3970
 kmalloc include/linux/slab.h:557 [inline]
 __io_alloc_async_data+0x68/0x9c fs/io_uring.c:3210
 io_setup_async_rw fs/io_uring.c:3229 [inline]
 io_read fs/io_uring.c:3436 [inline]
 io_issue_sqe+0x2954/0x2d64 fs/io_uring.c:5943
 __io_queue_sqe+0x19c/0x520 fs/io_uring.c:6260
 io_queue_sqe+0x2a4/0x590 fs/io_uring.c:6326
 io_submit_sqe fs/io_uring.c:6395 [inline]
 io_submit_sqes+0x4c0/0xa04 fs/io_uring.c:6624
 __do_sys_io_uring_enter fs/io_uring.c:9013 [inline]
 __se_sys_io_uring_enter fs/io_uring.c:8960 [inline]
 __arm64_sys_io_uring_enter+0x190/0x708 fs/io_uring.c:8960
 __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline]
 invoke_syscall arch/arm64/kernel/syscall.c:48 [inline]
 el0_svc_common arch/arm64/kernel/syscall.c:158 [inline]
 do_el0_svc+0x120/0x290 arch/arm64/kernel/syscall.c:227
 el0_svc+0x1c/0x28 arch/arm64/kernel/entry-common.c:367
 el0_sync_handler+0x98/0x170 arch/arm64/kernel/entry-common.c:383
 el0_sync+0x140/0x180 arch/arm64/kernel/entry.S:670

Freed by task 12570:
 stack_trace_save+0x80/0xb8 kernel/stacktrace.c:121
 kasan_save_stack mm/kasan/common.c:48 [inline]
 kasan_set_track+0x38/0x6c mm/kasan/common.c:56
 kasan_set_free_info+0x20/0x40 mm/kasan/generic.c:355
 __kasan_slab_free+0x124/0x150 mm/kasan/common.c:422
 kasan_slab_free+0x10/0x1c mm/kasan/common.c:431
 slab_free_hook mm/slub.c:1544 [inline]
 slab_free_freelist_hook mm/slub.c:1577 [inline]
 slab_free mm/slub.c:3142 [inline]
 kfree+0x104/0x38c mm/slub.c:4124
 io_dismantle_req fs/io_uring.c:1855 [inline]
 __io_free_req+0x70/0x254 fs/io_uring.c:1867
 io_put_req_find_next fs/io_uring.c:2173 [inline]
 __io_queue_sqe+0x1fc/0x520 fs/io_uring.c:6279
 __io_req_task_submit+0x154/0x21c fs/io_uring.c:2051
 io_req_task_submit+0x2c/0x44 fs/io_uring.c:2063
 task_work_run+0xdc/0x128 kernel/task_work.c:151
 get_signal+0x6f8/0x980 kernel/signal.c:2562
 do_signal+0x108/0x3a4 arch/arm64/kernel/signal.c:658
 do_notify_resume+0xbc/0x25c arch/arm64/kernel/signal.c:722
 work_pending+0xc/0x180

blkdev_read_iter can truncate iov_iter's count since the count + pos may
exceed the size of the blkdev. This will confuse io_read that we have
consume the iovec. And once we do the iov_iter_revert in io_read, we
will trigger the slab-out-of-bounds. Fix it by reexpand the count with
size has been truncated.

blkdev_write_iter can trigger the problem too.

Signed-off-by: yangerkun <yangerkun@huawei.com>
---
 fs/block_dev.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

Comments

yangerkun April 6, 2021, 1:28 a.m. UTC | #1
Ping...

在 2021/4/1 15:18, yangerkun 写道:
> We get a bug:
> 
> BUG: KASAN: slab-out-of-bounds in iov_iter_revert+0x11c/0x404
> lib/iov_iter.c:1139
> Read of size 8 at addr ffff0000d3fb11f8 by task
> 
> CPU: 0 PID: 12582 Comm: syz-executor.2 Not tainted
> 5.10.0-00843-g352c8610ccd2 #2
> Hardware name: linux,dummy-virt (DT)
> Call trace:
>   dump_backtrace+0x0/0x2d0 arch/arm64/kernel/stacktrace.c:132
>   show_stack+0x28/0x34 arch/arm64/kernel/stacktrace.c:196
>   __dump_stack lib/dump_stack.c:77 [inline]
>   dump_stack+0x110/0x164 lib/dump_stack.c:118
>   print_address_description+0x78/0x5c8 mm/kasan/report.c:385
>   __kasan_report mm/kasan/report.c:545 [inline]
>   kasan_report+0x148/0x1e4 mm/kasan/report.c:562
>   check_memory_region_inline mm/kasan/generic.c:183 [inline]
>   __asan_load8+0xb4/0xbc mm/kasan/generic.c:252
>   iov_iter_revert+0x11c/0x404 lib/iov_iter.c:1139
>   io_read fs/io_uring.c:3421 [inline]
>   io_issue_sqe+0x2344/0x2d64 fs/io_uring.c:5943
>   __io_queue_sqe+0x19c/0x520 fs/io_uring.c:6260
>   io_queue_sqe+0x2a4/0x590 fs/io_uring.c:6326
>   io_submit_sqe fs/io_uring.c:6395 [inline]
>   io_submit_sqes+0x4c0/0xa04 fs/io_uring.c:6624
>   __do_sys_io_uring_enter fs/io_uring.c:9013 [inline]
>   __se_sys_io_uring_enter fs/io_uring.c:8960 [inline]
>   __arm64_sys_io_uring_enter+0x190/0x708 fs/io_uring.c:8960
>   __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline]
>   invoke_syscall arch/arm64/kernel/syscall.c:48 [inline]
>   el0_svc_common arch/arm64/kernel/syscall.c:158 [inline]
>   do_el0_svc+0x120/0x290 arch/arm64/kernel/syscall.c:227
>   el0_svc+0x1c/0x28 arch/arm64/kernel/entry-common.c:367
>   el0_sync_handler+0x98/0x170 arch/arm64/kernel/entry-common.c:383
>   el0_sync+0x140/0x180 arch/arm64/kernel/entry.S:670
> 
> Allocated by task 12570:
>   stack_trace_save+0x80/0xb8 kernel/stacktrace.c:121
>   kasan_save_stack mm/kasan/common.c:48 [inline]
>   kasan_set_track mm/kasan/common.c:56 [inline]
>   __kasan_kmalloc+0xdc/0x120 mm/kasan/common.c:461
>   kasan_kmalloc+0xc/0x14 mm/kasan/common.c:475
>   __kmalloc+0x23c/0x334 mm/slub.c:3970
>   kmalloc include/linux/slab.h:557 [inline]
>   __io_alloc_async_data+0x68/0x9c fs/io_uring.c:3210
>   io_setup_async_rw fs/io_uring.c:3229 [inline]
>   io_read fs/io_uring.c:3436 [inline]
>   io_issue_sqe+0x2954/0x2d64 fs/io_uring.c:5943
>   __io_queue_sqe+0x19c/0x520 fs/io_uring.c:6260
>   io_queue_sqe+0x2a4/0x590 fs/io_uring.c:6326
>   io_submit_sqe fs/io_uring.c:6395 [inline]
>   io_submit_sqes+0x4c0/0xa04 fs/io_uring.c:6624
>   __do_sys_io_uring_enter fs/io_uring.c:9013 [inline]
>   __se_sys_io_uring_enter fs/io_uring.c:8960 [inline]
>   __arm64_sys_io_uring_enter+0x190/0x708 fs/io_uring.c:8960
>   __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline]
>   invoke_syscall arch/arm64/kernel/syscall.c:48 [inline]
>   el0_svc_common arch/arm64/kernel/syscall.c:158 [inline]
>   do_el0_svc+0x120/0x290 arch/arm64/kernel/syscall.c:227
>   el0_svc+0x1c/0x28 arch/arm64/kernel/entry-common.c:367
>   el0_sync_handler+0x98/0x170 arch/arm64/kernel/entry-common.c:383
>   el0_sync+0x140/0x180 arch/arm64/kernel/entry.S:670
> 
> Freed by task 12570:
>   stack_trace_save+0x80/0xb8 kernel/stacktrace.c:121
>   kasan_save_stack mm/kasan/common.c:48 [inline]
>   kasan_set_track+0x38/0x6c mm/kasan/common.c:56
>   kasan_set_free_info+0x20/0x40 mm/kasan/generic.c:355
>   __kasan_slab_free+0x124/0x150 mm/kasan/common.c:422
>   kasan_slab_free+0x10/0x1c mm/kasan/common.c:431
>   slab_free_hook mm/slub.c:1544 [inline]
>   slab_free_freelist_hook mm/slub.c:1577 [inline]
>   slab_free mm/slub.c:3142 [inline]
>   kfree+0x104/0x38c mm/slub.c:4124
>   io_dismantle_req fs/io_uring.c:1855 [inline]
>   __io_free_req+0x70/0x254 fs/io_uring.c:1867
>   io_put_req_find_next fs/io_uring.c:2173 [inline]
>   __io_queue_sqe+0x1fc/0x520 fs/io_uring.c:6279
>   __io_req_task_submit+0x154/0x21c fs/io_uring.c:2051
>   io_req_task_submit+0x2c/0x44 fs/io_uring.c:2063
>   task_work_run+0xdc/0x128 kernel/task_work.c:151
>   get_signal+0x6f8/0x980 kernel/signal.c:2562
>   do_signal+0x108/0x3a4 arch/arm64/kernel/signal.c:658
>   do_notify_resume+0xbc/0x25c arch/arm64/kernel/signal.c:722
>   work_pending+0xc/0x180
> 
> blkdev_read_iter can truncate iov_iter's count since the count + pos may
> exceed the size of the blkdev. This will confuse io_read that we have
> consume the iovec. And once we do the iov_iter_revert in io_read, we
> will trigger the slab-out-of-bounds. Fix it by reexpand the count with
> size has been truncated.
> 
> blkdev_write_iter can trigger the problem too.
> 
> Signed-off-by: yangerkun <yangerkun@huawei.com>
> ---
>   fs/block_dev.c | 20 +++++++++++++++++---
>   1 file changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 92ed7d5df677..788e1014576f 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -1680,6 +1680,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
>   	struct inode *bd_inode = bdev_file_inode(file);
>   	loff_t size = i_size_read(bd_inode);
>   	struct blk_plug plug;
> +	size_t shorted = 0;
>   	ssize_t ret;
>   
>   	if (bdev_read_only(I_BDEV(bd_inode)))
> @@ -1697,12 +1698,17 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
>   	if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
>   		return -EOPNOTSUPP;
>   
> -	iov_iter_truncate(from, size - iocb->ki_pos);
> +	size -= iocb->ki_pos;
> +	if (iov_iter_count(from) > size) {
> +		shorted = iov_iter_count(from) - size;
> +		iov_iter_truncate(from, size);
> +	}
>   
>   	blk_start_plug(&plug);
>   	ret = __generic_file_write_iter(iocb, from);
>   	if (ret > 0)
>   		ret = generic_write_sync(iocb, ret);
> +	iov_iter_reexpand(from, iov_iter_count(from) + shorted);
>   	blk_finish_plug(&plug);
>   	return ret;
>   }
> @@ -1714,13 +1720,21 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>   	struct inode *bd_inode = bdev_file_inode(file);
>   	loff_t size = i_size_read(bd_inode);
>   	loff_t pos = iocb->ki_pos;
> +	size_t shorted = 0;
> +	ssize_t ret;
>   
>   	if (pos >= size)
>   		return 0;
>   
>   	size -= pos;
> -	iov_iter_truncate(to, size);
> -	return generic_file_read_iter(iocb, to);
> +	if (iov_iter_count(to) > size) {
> +		shorted = iov_iter_count(to) - size;
> +		iov_iter_truncate(to, size);
> +	}
> +
> +	ret = generic_file_read_iter(iocb, to);
> +	iov_iter_reexpand(to, iov_iter_count(to) + shorted);
> +	return ret;
>   }
>   EXPORT_SYMBOL_GPL(blkdev_read_iter);
>   
>
Pavel Begunkov April 6, 2021, 11:04 a.m. UTC | #2
On 06/04/2021 02:28, yangerkun wrote:
> Ping...

It wasn't forgotten, but wouln't have worked because of
other reasons. With these two already queued, that's a
different story.

https://git.kernel.dk/cgit/linux-block/commit/?h=io_uring-5.12&id=07204f21577a1d882f0259590c3553fe6a476381
https://git.kernel.dk/cgit/linux-block/commit/?h=io_uring-5.12&id=230d50d448acb6639991440913299e50cacf1daf

Can you re-confirm, that the bug is still there (should be)
and your patch fixes it?

> 
> 在 2021/4/1 15:18, yangerkun 写道:
>> We get a bug:
>>
>> BUG: KASAN: slab-out-of-bounds in iov_iter_revert+0x11c/0x404
>> lib/iov_iter.c:1139
>> Read of size 8 at addr ffff0000d3fb11f8 by task
>>
>> CPU: 0 PID: 12582 Comm: syz-executor.2 Not tainted
>> 5.10.0-00843-g352c8610ccd2 #2
>> Hardware name: linux,dummy-virt (DT)
>> Call trace:
>>   dump_backtrace+0x0/0x2d0 arch/arm64/kernel/stacktrace.c:132
>>   show_stack+0x28/0x34 arch/arm64/kernel/stacktrace.c:196
>>   __dump_stack lib/dump_stack.c:77 [inline]
>>   dump_stack+0x110/0x164 lib/dump_stack.c:118
>>   print_address_description+0x78/0x5c8 mm/kasan/report.c:385
>>   __kasan_report mm/kasan/report.c:545 [inline]
>>   kasan_report+0x148/0x1e4 mm/kasan/report.c:562
>>   check_memory_region_inline mm/kasan/generic.c:183 [inline]
>>   __asan_load8+0xb4/0xbc mm/kasan/generic.c:252
>>   iov_iter_revert+0x11c/0x404 lib/iov_iter.c:1139
>>   io_read fs/io_uring.c:3421 [inline]
>>   io_issue_sqe+0x2344/0x2d64 fs/io_uring.c:5943
>>   __io_queue_sqe+0x19c/0x520 fs/io_uring.c:6260
>>   io_queue_sqe+0x2a4/0x590 fs/io_uring.c:6326
>>   io_submit_sqe fs/io_uring.c:6395 [inline]
>>   io_submit_sqes+0x4c0/0xa04 fs/io_uring.c:6624
>>   __do_sys_io_uring_enter fs/io_uring.c:9013 [inline]
>>   __se_sys_io_uring_enter fs/io_uring.c:8960 [inline]
>>   __arm64_sys_io_uring_enter+0x190/0x708 fs/io_uring.c:8960
>>   __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline]
>>   invoke_syscall arch/arm64/kernel/syscall.c:48 [inline]
>>   el0_svc_common arch/arm64/kernel/syscall.c:158 [inline]
>>   do_el0_svc+0x120/0x290 arch/arm64/kernel/syscall.c:227
>>   el0_svc+0x1c/0x28 arch/arm64/kernel/entry-common.c:367
>>   el0_sync_handler+0x98/0x170 arch/arm64/kernel/entry-common.c:383
>>   el0_sync+0x140/0x180 arch/arm64/kernel/entry.S:670
>>
>> Allocated by task 12570:
>>   stack_trace_save+0x80/0xb8 kernel/stacktrace.c:121
>>   kasan_save_stack mm/kasan/common.c:48 [inline]
>>   kasan_set_track mm/kasan/common.c:56 [inline]
>>   __kasan_kmalloc+0xdc/0x120 mm/kasan/common.c:461
>>   kasan_kmalloc+0xc/0x14 mm/kasan/common.c:475
>>   __kmalloc+0x23c/0x334 mm/slub.c:3970
>>   kmalloc include/linux/slab.h:557 [inline]
>>   __io_alloc_async_data+0x68/0x9c fs/io_uring.c:3210
>>   io_setup_async_rw fs/io_uring.c:3229 [inline]
>>   io_read fs/io_uring.c:3436 [inline]
>>   io_issue_sqe+0x2954/0x2d64 fs/io_uring.c:5943
>>   __io_queue_sqe+0x19c/0x520 fs/io_uring.c:6260
>>   io_queue_sqe+0x2a4/0x590 fs/io_uring.c:6326
>>   io_submit_sqe fs/io_uring.c:6395 [inline]
>>   io_submit_sqes+0x4c0/0xa04 fs/io_uring.c:6624
>>   __do_sys_io_uring_enter fs/io_uring.c:9013 [inline]
>>   __se_sys_io_uring_enter fs/io_uring.c:8960 [inline]
>>   __arm64_sys_io_uring_enter+0x190/0x708 fs/io_uring.c:8960
>>   __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline]
>>   invoke_syscall arch/arm64/kernel/syscall.c:48 [inline]
>>   el0_svc_common arch/arm64/kernel/syscall.c:158 [inline]
>>   do_el0_svc+0x120/0x290 arch/arm64/kernel/syscall.c:227
>>   el0_svc+0x1c/0x28 arch/arm64/kernel/entry-common.c:367
>>   el0_sync_handler+0x98/0x170 arch/arm64/kernel/entry-common.c:383
>>   el0_sync+0x140/0x180 arch/arm64/kernel/entry.S:670
>>
>> Freed by task 12570:
>>   stack_trace_save+0x80/0xb8 kernel/stacktrace.c:121
>>   kasan_save_stack mm/kasan/common.c:48 [inline]
>>   kasan_set_track+0x38/0x6c mm/kasan/common.c:56
>>   kasan_set_free_info+0x20/0x40 mm/kasan/generic.c:355
>>   __kasan_slab_free+0x124/0x150 mm/kasan/common.c:422
>>   kasan_slab_free+0x10/0x1c mm/kasan/common.c:431
>>   slab_free_hook mm/slub.c:1544 [inline]
>>   slab_free_freelist_hook mm/slub.c:1577 [inline]
>>   slab_free mm/slub.c:3142 [inline]
>>   kfree+0x104/0x38c mm/slub.c:4124
>>   io_dismantle_req fs/io_uring.c:1855 [inline]
>>   __io_free_req+0x70/0x254 fs/io_uring.c:1867
>>   io_put_req_find_next fs/io_uring.c:2173 [inline]
>>   __io_queue_sqe+0x1fc/0x520 fs/io_uring.c:6279
>>   __io_req_task_submit+0x154/0x21c fs/io_uring.c:2051
>>   io_req_task_submit+0x2c/0x44 fs/io_uring.c:2063
>>   task_work_run+0xdc/0x128 kernel/task_work.c:151
>>   get_signal+0x6f8/0x980 kernel/signal.c:2562
>>   do_signal+0x108/0x3a4 arch/arm64/kernel/signal.c:658
>>   do_notify_resume+0xbc/0x25c arch/arm64/kernel/signal.c:722
>>   work_pending+0xc/0x180
>>
>> blkdev_read_iter can truncate iov_iter's count since the count + pos may
>> exceed the size of the blkdev. This will confuse io_read that we have
>> consume the iovec. And once we do the iov_iter_revert in io_read, we
>> will trigger the slab-out-of-bounds. Fix it by reexpand the count with
>> size has been truncated.
>>
>> blkdev_write_iter can trigger the problem too.
>>
>> Signed-off-by: yangerkun <yangerkun@huawei.com>
>> ---
>>   fs/block_dev.c | 20 +++++++++++++++++---
>>   1 file changed, 17 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>> index 92ed7d5df677..788e1014576f 100644
>> --- a/fs/block_dev.c
>> +++ b/fs/block_dev.c
>> @@ -1680,6 +1680,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>       struct inode *bd_inode = bdev_file_inode(file);
>>       loff_t size = i_size_read(bd_inode);
>>       struct blk_plug plug;
>> +    size_t shorted = 0;
>>       ssize_t ret;
>>         if (bdev_read_only(I_BDEV(bd_inode)))
>> @@ -1697,12 +1698,17 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>       if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
>>           return -EOPNOTSUPP;
>>   -    iov_iter_truncate(from, size - iocb->ki_pos);
>> +    size -= iocb->ki_pos;
>> +    if (iov_iter_count(from) > size) {
>> +        shorted = iov_iter_count(from) - size;
>> +        iov_iter_truncate(from, size);
>> +    }
>>         blk_start_plug(&plug);
>>       ret = __generic_file_write_iter(iocb, from);
>>       if (ret > 0)
>>           ret = generic_write_sync(iocb, ret);
>> +    iov_iter_reexpand(from, iov_iter_count(from) + shorted);
>>       blk_finish_plug(&plug);
>>       return ret;
>>   }
>> @@ -1714,13 +1720,21 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>       struct inode *bd_inode = bdev_file_inode(file);
>>       loff_t size = i_size_read(bd_inode);
>>       loff_t pos = iocb->ki_pos;
>> +    size_t shorted = 0;
>> +    ssize_t ret;
>>         if (pos >= size)
>>           return 0;
>>         size -= pos;
>> -    iov_iter_truncate(to, size);
>> -    return generic_file_read_iter(iocb, to);
>> +    if (iov_iter_count(to) > size) {
>> +        shorted = iov_iter_count(to) - size;
>> +        iov_iter_truncate(to, size);
>> +    }
>> +
>> +    ret = generic_file_read_iter(iocb, to);
>> +    iov_iter_reexpand(to, iov_iter_count(to) + shorted);
>> +    return ret;
>>   }
>>   EXPORT_SYMBOL_GPL(blkdev_read_iter);
>>
yangerkun April 7, 2021, 2:16 p.m. UTC | #3
在 2021/4/6 19:04, Pavel Begunkov 写道:
> On 06/04/2021 02:28, yangerkun wrote:
>> Ping...
> 
> It wasn't forgotten, but wouln't have worked because of
> other reasons. With these two already queued, that's a
> different story.
> 
> https://git.kernel.dk/cgit/linux-block/commit/?h=io_uring-5.12&id=07204f21577a1d882f0259590c3553fe6a476381
> https://git.kernel.dk/cgit/linux-block/commit/?h=io_uring-5.12&id=230d50d448acb6639991440913299e50cacf1daf
> 
> Can you re-confirm, that the bug is still there (should be)
> and your patch fixes it?

Hi,

This problem still exists in mainline (2d743660786e Merge branch 'fixes' 
of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs), and this 
patch will fix it.

The io_read for loop will return -EAGAIN. This will lead a 
iov_iter_revert in io_read. Once we truncate iov_iter in 
blkdev_read_iter, we will see this bug...


[  181.204371][ T4241] loop0: detected capacity change from 0 to 232 

[  181.253683][ T4241] 
==================================================================
[  181.255313][ T4241] BUG: KASAN: slab-out-of-bounds in 
iov_iter_revert+0xd0/0x3e0
[  181.256723][ T4241] Read of size 8 at addr ffff0000cfbc8ff8 by task 
a.out/4241
[  181.257776][ T4241] 

[  181.258749][ T4241] CPU: 5 PID: 4241 Comm: a.out Not tainted 
5.12.0-rc6-00006-g2d743660786e
#1 

[  181.260149][ T4241] Hardware name: linux,dummy-virt (DT) 

[  181.261468][ T4241] Call trace: 

[  181.262052][ T4241]  dump_backtrace+0x0/0x348 

[  181.263139][ T4241]  show_stack+0x28/0x38 

[  181.264234][ T4241]  dump_stack+0x134/0x1a4 

[  181.265175][ T4241]  print_address_description.constprop.0+0x68/0x304 

[  181.266430][ T4241]  kasan_report+0x1d0/0x238 

[  181.267308][ T4241]  __asan_load8+0x88/0xc0 

[  181.268317][ T4241]  iov_iter_revert+0xd0/0x3e0 

[  181.269251][ T4241]  io_read+0x310/0x5c0 

[  181.270208][ T4241]  io_issue_sqe+0x3fc/0x25d8 

[  181.271134][ T4241]  __io_queue_sqe+0xf8/0x480 

[  181.272142][ T4241]  io_queue_sqe+0x3a4/0x4c8 

[  181.273053][ T4241]  io_submit_sqes+0xd9c/0x22d0 

[  181.274375][ T4241]  __arm64_sys_io_uring_enter+0x3d0/0xce0 

[  181.275554][ T4241]  do_el0_svc+0xc4/0x228 

[  181.276411][ T4241]  el0_svc+0x24/0x30 

[  181.277323][ T4241]  el0_sync_handler+0x158/0x160 

[  181.278241][ T4241]  el0_sync+0x13c/0x140 

[  181.279287][ T4241] 

[  181.279820][ T4241] Allocated by task 4241: 

[  181.280699][ T4241]  kasan_save_stack+0x24/0x50 

[  181.281626][ T4241]  __kasan_kmalloc+0x84/0xa8 

[  181.282578][ T4241]  io_wq_create+0x94/0x668 

[  181.283469][ T4241]  io_uring_alloc_task_context+0x164/0x368 

[  181.284748][ T4241]  io_uring_add_task_file+0x1b0/0x208 

[  181.285865][ T4241]  io_uring_setup+0xaac/0x12a0 

[  181.286823][ T4241]  __arm64_sys_io_uring_setup+0x34/0x40 

[  181.287957][ T4241]  do_el0_svc+0xc4/0x228 

[  181.288906][ T4241]  el0_svc+0x24/0x30 

[  181.289816][ T4241]  el0_sync_handler+0x158/0x160 

[  181.290751][ T4241]  el0_sync+0x13c/0x140 

[  181.291697][ T4241] 


> 
>>
>> 在 2021/4/1 15:18, yangerkun 写道:
>>> We get a bug:
>>>
>>> BUG: KASAN: slab-out-of-bounds in iov_iter_revert+0x11c/0x404
>>> lib/iov_iter.c:1139
>>> Read of size 8 at addr ffff0000d3fb11f8 by task
>>>
>>> CPU: 0 PID: 12582 Comm: syz-executor.2 Not tainted
>>> 5.10.0-00843-g352c8610ccd2 #2
>>> Hardware name: linux,dummy-virt (DT)
>>> Call trace:
>>>    dump_backtrace+0x0/0x2d0 arch/arm64/kernel/stacktrace.c:132
>>>    show_stack+0x28/0x34 arch/arm64/kernel/stacktrace.c:196
>>>    __dump_stack lib/dump_stack.c:77 [inline]
>>>    dump_stack+0x110/0x164 lib/dump_stack.c:118
>>>    print_address_description+0x78/0x5c8 mm/kasan/report.c:385
>>>    __kasan_report mm/kasan/report.c:545 [inline]
>>>    kasan_report+0x148/0x1e4 mm/kasan/report.c:562
>>>    check_memory_region_inline mm/kasan/generic.c:183 [inline]
>>>    __asan_load8+0xb4/0xbc mm/kasan/generic.c:252
>>>    iov_iter_revert+0x11c/0x404 lib/iov_iter.c:1139
>>>    io_read fs/io_uring.c:3421 [inline]
>>>    io_issue_sqe+0x2344/0x2d64 fs/io_uring.c:5943
>>>    __io_queue_sqe+0x19c/0x520 fs/io_uring.c:6260
>>>    io_queue_sqe+0x2a4/0x590 fs/io_uring.c:6326
>>>    io_submit_sqe fs/io_uring.c:6395 [inline]
>>>    io_submit_sqes+0x4c0/0xa04 fs/io_uring.c:6624
>>>    __do_sys_io_uring_enter fs/io_uring.c:9013 [inline]
>>>    __se_sys_io_uring_enter fs/io_uring.c:8960 [inline]
>>>    __arm64_sys_io_uring_enter+0x190/0x708 fs/io_uring.c:8960
>>>    __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline]
>>>    invoke_syscall arch/arm64/kernel/syscall.c:48 [inline]
>>>    el0_svc_common arch/arm64/kernel/syscall.c:158 [inline]
>>>    do_el0_svc+0x120/0x290 arch/arm64/kernel/syscall.c:227
>>>    el0_svc+0x1c/0x28 arch/arm64/kernel/entry-common.c:367
>>>    el0_sync_handler+0x98/0x170 arch/arm64/kernel/entry-common.c:383
>>>    el0_sync+0x140/0x180 arch/arm64/kernel/entry.S:670
>>>
>>> Allocated by task 12570:
>>>    stack_trace_save+0x80/0xb8 kernel/stacktrace.c:121
>>>    kasan_save_stack mm/kasan/common.c:48 [inline]
>>>    kasan_set_track mm/kasan/common.c:56 [inline]
>>>    __kasan_kmalloc+0xdc/0x120 mm/kasan/common.c:461
>>>    kasan_kmalloc+0xc/0x14 mm/kasan/common.c:475
>>>    __kmalloc+0x23c/0x334 mm/slub.c:3970
>>>    kmalloc include/linux/slab.h:557 [inline]
>>>    __io_alloc_async_data+0x68/0x9c fs/io_uring.c:3210
>>>    io_setup_async_rw fs/io_uring.c:3229 [inline]
>>>    io_read fs/io_uring.c:3436 [inline]
>>>    io_issue_sqe+0x2954/0x2d64 fs/io_uring.c:5943
>>>    __io_queue_sqe+0x19c/0x520 fs/io_uring.c:6260
>>>    io_queue_sqe+0x2a4/0x590 fs/io_uring.c:6326
>>>    io_submit_sqe fs/io_uring.c:6395 [inline]
>>>    io_submit_sqes+0x4c0/0xa04 fs/io_uring.c:6624
>>>    __do_sys_io_uring_enter fs/io_uring.c:9013 [inline]
>>>    __se_sys_io_uring_enter fs/io_uring.c:8960 [inline]
>>>    __arm64_sys_io_uring_enter+0x190/0x708 fs/io_uring.c:8960
>>>    __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline]
>>>    invoke_syscall arch/arm64/kernel/syscall.c:48 [inline]
>>>    el0_svc_common arch/arm64/kernel/syscall.c:158 [inline]
>>>    do_el0_svc+0x120/0x290 arch/arm64/kernel/syscall.c:227
>>>    el0_svc+0x1c/0x28 arch/arm64/kernel/entry-common.c:367
>>>    el0_sync_handler+0x98/0x170 arch/arm64/kernel/entry-common.c:383
>>>    el0_sync+0x140/0x180 arch/arm64/kernel/entry.S:670
>>>
>>> Freed by task 12570:
>>>    stack_trace_save+0x80/0xb8 kernel/stacktrace.c:121
>>>    kasan_save_stack mm/kasan/common.c:48 [inline]
>>>    kasan_set_track+0x38/0x6c mm/kasan/common.c:56
>>>    kasan_set_free_info+0x20/0x40 mm/kasan/generic.c:355
>>>    __kasan_slab_free+0x124/0x150 mm/kasan/common.c:422
>>>    kasan_slab_free+0x10/0x1c mm/kasan/common.c:431
>>>    slab_free_hook mm/slub.c:1544 [inline]
>>>    slab_free_freelist_hook mm/slub.c:1577 [inline]
>>>    slab_free mm/slub.c:3142 [inline]
>>>    kfree+0x104/0x38c mm/slub.c:4124
>>>    io_dismantle_req fs/io_uring.c:1855 [inline]
>>>    __io_free_req+0x70/0x254 fs/io_uring.c:1867
>>>    io_put_req_find_next fs/io_uring.c:2173 [inline]
>>>    __io_queue_sqe+0x1fc/0x520 fs/io_uring.c:6279
>>>    __io_req_task_submit+0x154/0x21c fs/io_uring.c:2051
>>>    io_req_task_submit+0x2c/0x44 fs/io_uring.c:2063
>>>    task_work_run+0xdc/0x128 kernel/task_work.c:151
>>>    get_signal+0x6f8/0x980 kernel/signal.c:2562
>>>    do_signal+0x108/0x3a4 arch/arm64/kernel/signal.c:658
>>>    do_notify_resume+0xbc/0x25c arch/arm64/kernel/signal.c:722
>>>    work_pending+0xc/0x180
>>>
>>> blkdev_read_iter can truncate iov_iter's count since the count + pos may
>>> exceed the size of the blkdev. This will confuse io_read that we have
>>> consume the iovec. And once we do the iov_iter_revert in io_read, we
>>> will trigger the slab-out-of-bounds. Fix it by reexpand the count with
>>> size has been truncated.
>>>
>>> blkdev_write_iter can trigger the problem too.
>>>
>>> Signed-off-by: yangerkun <yangerkun@huawei.com>
>>> ---
>>>    fs/block_dev.c | 20 +++++++++++++++++---
>>>    1 file changed, 17 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>>> index 92ed7d5df677..788e1014576f 100644
>>> --- a/fs/block_dev.c
>>> +++ b/fs/block_dev.c
>>> @@ -1680,6 +1680,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>>        struct inode *bd_inode = bdev_file_inode(file);
>>>        loff_t size = i_size_read(bd_inode);
>>>        struct blk_plug plug;
>>> +    size_t shorted = 0;
>>>        ssize_t ret;
>>>          if (bdev_read_only(I_BDEV(bd_inode)))
>>> @@ -1697,12 +1698,17 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>>        if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
>>>            return -EOPNOTSUPP;
>>>    -    iov_iter_truncate(from, size - iocb->ki_pos);
>>> +    size -= iocb->ki_pos;
>>> +    if (iov_iter_count(from) > size) {
>>> +        shorted = iov_iter_count(from) - size;
>>> +        iov_iter_truncate(from, size);
>>> +    }
>>>          blk_start_plug(&plug);
>>>        ret = __generic_file_write_iter(iocb, from);
>>>        if (ret > 0)
>>>            ret = generic_write_sync(iocb, ret);
>>> +    iov_iter_reexpand(from, iov_iter_count(from) + shorted);
>>>        blk_finish_plug(&plug);
>>>        return ret;
>>>    }
>>> @@ -1714,13 +1720,21 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>        struct inode *bd_inode = bdev_file_inode(file);
>>>        loff_t size = i_size_read(bd_inode);
>>>        loff_t pos = iocb->ki_pos;
>>> +    size_t shorted = 0;
>>> +    ssize_t ret;
>>>          if (pos >= size)
>>>            return 0;
>>>          size -= pos;
>>> -    iov_iter_truncate(to, size);
>>> -    return generic_file_read_iter(iocb, to);
>>> +    if (iov_iter_count(to) > size) {
>>> +        shorted = iov_iter_count(to) - size;
>>> +        iov_iter_truncate(to, size);
>>> +    }
>>> +
>>> +    ret = generic_file_read_iter(iocb, to);
>>> +    iov_iter_reexpand(to, iov_iter_count(to) + shorted);
>>> +    return ret;
>>>    }
>>>    EXPORT_SYMBOL_GPL(blkdev_read_iter);
>>>   
>
Pavel Begunkov April 9, 2021, 2:49 p.m. UTC | #4
On 01/04/2021 08:18, yangerkun wrote:
> We get a bug:
> 
> BUG: KASAN: slab-out-of-bounds in iov_iter_revert+0x11c/0x404
> lib/iov_iter.c:1139
> Read of size 8 at addr ffff0000d3fb11f8 by task
> 
> CPU: 0 PID: 12582 Comm: syz-executor.2 Not tainted
> 5.10.0-00843-g352c8610ccd2 #2
> Hardware name: linux,dummy-virt (DT)
> Call trace:
>  dump_backtrace+0x0/0x2d0 arch/arm64/kernel/stacktrace.c:132
>  show_stack+0x28/0x34 arch/arm64/kernel/stacktrace.c:196
>  __dump_stack lib/dump_stack.c:77 [inline]
>  dump_stack+0x110/0x164 lib/dump_stack.c:118
>  print_address_description+0x78/0x5c8 mm/kasan/report.c:385
>  __kasan_report mm/kasan/report.c:545 [inline]
>  kasan_report+0x148/0x1e4 mm/kasan/report.c:562
>  check_memory_region_inline mm/kasan/generic.c:183 [inline]
>  __asan_load8+0xb4/0xbc mm/kasan/generic.c:252
>  iov_iter_revert+0x11c/0x404 lib/iov_iter.c:1139
>  io_read fs/io_uring.c:3421 [inline]
>  io_issue_sqe+0x2344/0x2d64 fs/io_uring.c:5943
>  __io_queue_sqe+0x19c/0x520 fs/io_uring.c:6260
>  io_queue_sqe+0x2a4/0x590 fs/io_uring.c:6326
>  io_submit_sqe fs/io_uring.c:6395 [inline]
>  io_submit_sqes+0x4c0/0xa04 fs/io_uring.c:6624
>  __do_sys_io_uring_enter fs/io_uring.c:9013 [inline]
>  __se_sys_io_uring_enter fs/io_uring.c:8960 [inline]
>  __arm64_sys_io_uring_enter+0x190/0x708 fs/io_uring.c:8960
>  __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline]
>  invoke_syscall arch/arm64/kernel/syscall.c:48 [inline]
>  el0_svc_common arch/arm64/kernel/syscall.c:158 [inline]
>  do_el0_svc+0x120/0x290 arch/arm64/kernel/syscall.c:227
>  el0_svc+0x1c/0x28 arch/arm64/kernel/entry-common.c:367
>  el0_sync_handler+0x98/0x170 arch/arm64/kernel/entry-common.c:383
>  el0_sync+0x140/0x180 arch/arm64/kernel/entry.S:670
> 
> Allocated by task 12570:
>  stack_trace_save+0x80/0xb8 kernel/stacktrace.c:121
>  kasan_save_stack mm/kasan/common.c:48 [inline]
>  kasan_set_track mm/kasan/common.c:56 [inline]
>  __kasan_kmalloc+0xdc/0x120 mm/kasan/common.c:461
>  kasan_kmalloc+0xc/0x14 mm/kasan/common.c:475
>  __kmalloc+0x23c/0x334 mm/slub.c:3970
>  kmalloc include/linux/slab.h:557 [inline]
>  __io_alloc_async_data+0x68/0x9c fs/io_uring.c:3210
>  io_setup_async_rw fs/io_uring.c:3229 [inline]
>  io_read fs/io_uring.c:3436 [inline]
>  io_issue_sqe+0x2954/0x2d64 fs/io_uring.c:5943
>  __io_queue_sqe+0x19c/0x520 fs/io_uring.c:6260
>  io_queue_sqe+0x2a4/0x590 fs/io_uring.c:6326
>  io_submit_sqe fs/io_uring.c:6395 [inline]
>  io_submit_sqes+0x4c0/0xa04 fs/io_uring.c:6624
>  __do_sys_io_uring_enter fs/io_uring.c:9013 [inline]
>  __se_sys_io_uring_enter fs/io_uring.c:8960 [inline]
>  __arm64_sys_io_uring_enter+0x190/0x708 fs/io_uring.c:8960
>  __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline]
>  invoke_syscall arch/arm64/kernel/syscall.c:48 [inline]
>  el0_svc_common arch/arm64/kernel/syscall.c:158 [inline]
>  do_el0_svc+0x120/0x290 arch/arm64/kernel/syscall.c:227
>  el0_svc+0x1c/0x28 arch/arm64/kernel/entry-common.c:367
>  el0_sync_handler+0x98/0x170 arch/arm64/kernel/entry-common.c:383
>  el0_sync+0x140/0x180 arch/arm64/kernel/entry.S:670
> 
> Freed by task 12570:
>  stack_trace_save+0x80/0xb8 kernel/stacktrace.c:121
>  kasan_save_stack mm/kasan/common.c:48 [inline]
>  kasan_set_track+0x38/0x6c mm/kasan/common.c:56
>  kasan_set_free_info+0x20/0x40 mm/kasan/generic.c:355
>  __kasan_slab_free+0x124/0x150 mm/kasan/common.c:422
>  kasan_slab_free+0x10/0x1c mm/kasan/common.c:431
>  slab_free_hook mm/slub.c:1544 [inline]
>  slab_free_freelist_hook mm/slub.c:1577 [inline]
>  slab_free mm/slub.c:3142 [inline]
>  kfree+0x104/0x38c mm/slub.c:4124
>  io_dismantle_req fs/io_uring.c:1855 [inline]
>  __io_free_req+0x70/0x254 fs/io_uring.c:1867
>  io_put_req_find_next fs/io_uring.c:2173 [inline]
>  __io_queue_sqe+0x1fc/0x520 fs/io_uring.c:6279
>  __io_req_task_submit+0x154/0x21c fs/io_uring.c:2051
>  io_req_task_submit+0x2c/0x44 fs/io_uring.c:2063
>  task_work_run+0xdc/0x128 kernel/task_work.c:151
>  get_signal+0x6f8/0x980 kernel/signal.c:2562
>  do_signal+0x108/0x3a4 arch/arm64/kernel/signal.c:658
>  do_notify_resume+0xbc/0x25c arch/arm64/kernel/signal.c:722
>  work_pending+0xc/0x180
> 
> blkdev_read_iter can truncate iov_iter's count since the count + pos may
> exceed the size of the blkdev. This will confuse io_read that we have
> consume the iovec. And once we do the iov_iter_revert in io_read, we
> will trigger the slab-out-of-bounds. Fix it by reexpand the count with
> size has been truncated.

Looks right,

Acked-by: Pavel Begunkov <asml.silencec@gmail.com>

> 
> blkdev_write_iter can trigger the problem too.
> 
> Signed-off-by: yangerkun <yangerkun@huawei.com>
> ---
>  fs/block_dev.c | 20 +++++++++++++++++---
>  1 file changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 92ed7d5df677..788e1014576f 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -1680,6 +1680,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	struct inode *bd_inode = bdev_file_inode(file);
>  	loff_t size = i_size_read(bd_inode);
>  	struct blk_plug plug;
> +	size_t shorted = 0;
>  	ssize_t ret;
>  
>  	if (bdev_read_only(I_BDEV(bd_inode)))
> @@ -1697,12 +1698,17 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
>  		return -EOPNOTSUPP;
>  
> -	iov_iter_truncate(from, size - iocb->ki_pos);
> +	size -= iocb->ki_pos;
> +	if (iov_iter_count(from) > size) {
> +		shorted = iov_iter_count(from) - size;
> +		iov_iter_truncate(from, size);
> +	}
>  
>  	blk_start_plug(&plug);
>  	ret = __generic_file_write_iter(iocb, from);
>  	if (ret > 0)
>  		ret = generic_write_sync(iocb, ret);
> +	iov_iter_reexpand(from, iov_iter_count(from) + shorted);
>  	blk_finish_plug(&plug);
>  	return ret;
>  }
> @@ -1714,13 +1720,21 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  	struct inode *bd_inode = bdev_file_inode(file);
>  	loff_t size = i_size_read(bd_inode);
>  	loff_t pos = iocb->ki_pos;
> +	size_t shorted = 0;
> +	ssize_t ret;
>  
>  	if (pos >= size)
>  		return 0;
>  
>  	size -= pos;
> -	iov_iter_truncate(to, size);
> -	return generic_file_read_iter(iocb, to);
> +	if (iov_iter_count(to) > size) {
> +		shorted = iov_iter_count(to) - size;
> +		iov_iter_truncate(to, size);
> +	}
> +
> +	ret = generic_file_read_iter(iocb, to);
> +	iov_iter_reexpand(to, iov_iter_count(to) + shorted);
> +	return ret;
>  }
>  EXPORT_SYMBOL_GPL(blkdev_read_iter);
>  
>
Pavel Begunkov April 15, 2021, 5:37 p.m. UTC | #5
On 09/04/2021 15:49, Pavel Begunkov wrote:
> On 01/04/2021 08:18, yangerkun wrote:
>> We get a bug:
>>
>> BUG: KASAN: slab-out-of-bounds in iov_iter_revert+0x11c/0x404
>> lib/iov_iter.c:1139
>> Read of size 8 at addr ffff0000d3fb11f8 by task
>>
>> CPU: 0 PID: 12582 Comm: syz-executor.2 Not tainted
>> 5.10.0-00843-g352c8610ccd2 #2
>> Hardware name: linux,dummy-virt (DT)
>> Call trace:
...
>>  __asan_load8+0xb4/0xbc mm/kasan/generic.c:252
>>  iov_iter_revert+0x11c/0x404 lib/iov_iter.c:1139
>>  io_read fs/io_uring.c:3421 [inline]
>>  io_issue_sqe+0x2344/0x2d64 fs/io_uring.c:5943
>>  __io_queue_sqe+0x19c/0x520 fs/io_uring.c:6260
>>  io_queue_sqe+0x2a4/0x590 fs/io_uring.c:6326
>>  io_submit_sqe fs/io_uring.c:6395 [inline]
>>  io_submit_sqes+0x4c0/0xa04 fs/io_uring.c:6624
...
>>
>> blkdev_read_iter can truncate iov_iter's count since the count + pos may
>> exceed the size of the blkdev. This will confuse io_read that we have
>> consume the iovec. And once we do the iov_iter_revert in io_read, we
>> will trigger the slab-out-of-bounds. Fix it by reexpand the count with
>> size has been truncated.
> 
> Looks right,
> 
> Acked-by: Pavel Begunkov <asml.silencec@gmail.com>

Fwiw, we need to forget to drag it through 5.13 + stable


>>
>> blkdev_write_iter can trigger the problem too.
>>
>> Signed-off-by: yangerkun <yangerkun@huawei.com>
>> ---
>>  fs/block_dev.c | 20 +++++++++++++++++---
>>  1 file changed, 17 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>> index 92ed7d5df677..788e1014576f 100644
>> --- a/fs/block_dev.c
>> +++ b/fs/block_dev.c
>> @@ -1680,6 +1680,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>  	struct inode *bd_inode = bdev_file_inode(file);
>>  	loff_t size = i_size_read(bd_inode);
>>  	struct blk_plug plug;
>> +	size_t shorted = 0;
>>  	ssize_t ret;
>>  
>>  	if (bdev_read_only(I_BDEV(bd_inode)))
>> @@ -1697,12 +1698,17 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>  	if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
>>  		return -EOPNOTSUPP;
>>  
>> -	iov_iter_truncate(from, size - iocb->ki_pos);
>> +	size -= iocb->ki_pos;
>> +	if (iov_iter_count(from) > size) {
>> +		shorted = iov_iter_count(from) - size;
>> +		iov_iter_truncate(from, size);
>> +	}
>>  
>>  	blk_start_plug(&plug);
>>  	ret = __generic_file_write_iter(iocb, from);
>>  	if (ret > 0)
>>  		ret = generic_write_sync(iocb, ret);
>> +	iov_iter_reexpand(from, iov_iter_count(from) + shorted);
>>  	blk_finish_plug(&plug);
>>  	return ret;
>>  }
>> @@ -1714,13 +1720,21 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>  	struct inode *bd_inode = bdev_file_inode(file);
>>  	loff_t size = i_size_read(bd_inode);
>>  	loff_t pos = iocb->ki_pos;
>> +	size_t shorted = 0;
>> +	ssize_t ret;
>>  
>>  	if (pos >= size)
>>  		return 0;
>>  
>>  	size -= pos;
>> -	iov_iter_truncate(to, size);
>> -	return generic_file_read_iter(iocb, to);
>> +	if (iov_iter_count(to) > size) {
>> +		shorted = iov_iter_count(to) - size;
>> +		iov_iter_truncate(to, size);
>> +	}
>> +
>> +	ret = generic_file_read_iter(iocb, to);
>> +	iov_iter_reexpand(to, iov_iter_count(to) + shorted);
>> +	return ret;
>>  }
>>  EXPORT_SYMBOL_GPL(blkdev_read_iter);
>>  
>>
>
Pavel Begunkov April 15, 2021, 5:39 p.m. UTC | #6
On 15/04/2021 18:37, Pavel Begunkov wrote:
> On 09/04/2021 15:49, Pavel Begunkov wrote:
>> On 01/04/2021 08:18, yangerkun wrote:
>>> We get a bug:
>>>
>>> BUG: KASAN: slab-out-of-bounds in iov_iter_revert+0x11c/0x404
>>> lib/iov_iter.c:1139
>>> Read of size 8 at addr ffff0000d3fb11f8 by task
>>>
>>> CPU: 0 PID: 12582 Comm: syz-executor.2 Not tainted
>>> 5.10.0-00843-g352c8610ccd2 #2
>>> Hardware name: linux,dummy-virt (DT)
>>> Call trace:
> ...
>>>  __asan_load8+0xb4/0xbc mm/kasan/generic.c:252
>>>  iov_iter_revert+0x11c/0x404 lib/iov_iter.c:1139
>>>  io_read fs/io_uring.c:3421 [inline]
>>>  io_issue_sqe+0x2344/0x2d64 fs/io_uring.c:5943
>>>  __io_queue_sqe+0x19c/0x520 fs/io_uring.c:6260
>>>  io_queue_sqe+0x2a4/0x590 fs/io_uring.c:6326
>>>  io_submit_sqe fs/io_uring.c:6395 [inline]
>>>  io_submit_sqes+0x4c0/0xa04 fs/io_uring.c:6624
> ...
>>>
>>> blkdev_read_iter can truncate iov_iter's count since the count + pos may
>>> exceed the size of the blkdev. This will confuse io_read that we have
>>> consume the iovec. And once we do the iov_iter_revert in io_read, we
>>> will trigger the slab-out-of-bounds. Fix it by reexpand the count with
>>> size has been truncated.
>>
>> Looks right,
>>
>> Acked-by: Pavel Begunkov <asml.silencec@gmail.com>
> 
> Fwiw, we need to forget to drag it through 5.13 + stable

Err, yypo, to _not_ forget to 5.13 + stable...

> 
> 
>>>
>>> blkdev_write_iter can trigger the problem too.
>>>
>>> Signed-off-by: yangerkun <yangerkun@huawei.com>
>>> ---
>>>  fs/block_dev.c | 20 +++++++++++++++++---
>>>  1 file changed, 17 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>>> index 92ed7d5df677..788e1014576f 100644
>>> --- a/fs/block_dev.c
>>> +++ b/fs/block_dev.c
>>> @@ -1680,6 +1680,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>>  	struct inode *bd_inode = bdev_file_inode(file);
>>>  	loff_t size = i_size_read(bd_inode);
>>>  	struct blk_plug plug;
>>> +	size_t shorted = 0;
>>>  	ssize_t ret;
>>>  
>>>  	if (bdev_read_only(I_BDEV(bd_inode)))
>>> @@ -1697,12 +1698,17 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>>  	if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
>>>  		return -EOPNOTSUPP;
>>>  
>>> -	iov_iter_truncate(from, size - iocb->ki_pos);
>>> +	size -= iocb->ki_pos;
>>> +	if (iov_iter_count(from) > size) {
>>> +		shorted = iov_iter_count(from) - size;
>>> +		iov_iter_truncate(from, size);
>>> +	}
>>>  
>>>  	blk_start_plug(&plug);
>>>  	ret = __generic_file_write_iter(iocb, from);
>>>  	if (ret > 0)
>>>  		ret = generic_write_sync(iocb, ret);
>>> +	iov_iter_reexpand(from, iov_iter_count(from) + shorted);
>>>  	blk_finish_plug(&plug);
>>>  	return ret;
>>>  }
>>> @@ -1714,13 +1720,21 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>  	struct inode *bd_inode = bdev_file_inode(file);
>>>  	loff_t size = i_size_read(bd_inode);
>>>  	loff_t pos = iocb->ki_pos;
>>> +	size_t shorted = 0;
>>> +	ssize_t ret;
>>>  
>>>  	if (pos >= size)
>>>  		return 0;
>>>  
>>>  	size -= pos;
>>> -	iov_iter_truncate(to, size);
>>> -	return generic_file_read_iter(iocb, to);
>>> +	if (iov_iter_count(to) > size) {
>>> +		shorted = iov_iter_count(to) - size;
>>> +		iov_iter_truncate(to, size);
>>> +	}
>>> +
>>> +	ret = generic_file_read_iter(iocb, to);
>>> +	iov_iter_reexpand(to, iov_iter_count(to) + shorted);
>>> +	return ret;
>>>  }
>>>  EXPORT_SYMBOL_GPL(blkdev_read_iter);
>>>  
>>>
>>
>
yangerkun April 28, 2021, 6:16 a.m. UTC | #7
Hi,

Should we pick this patch for 5.13?

在 2021/4/16 1:39, Pavel Begunkov 写道:
> On 15/04/2021 18:37, Pavel Begunkov wrote:
>> On 09/04/2021 15:49, Pavel Begunkov wrote:
>>> On 01/04/2021 08:18, yangerkun wrote:
>>>> We get a bug:
>>>>
>>>> BUG: KASAN: slab-out-of-bounds in iov_iter_revert+0x11c/0x404
>>>> lib/iov_iter.c:1139
>>>> Read of size 8 at addr ffff0000d3fb11f8 by task
>>>>
>>>> CPU: 0 PID: 12582 Comm: syz-executor.2 Not tainted
>>>> 5.10.0-00843-g352c8610ccd2 #2
>>>> Hardware name: linux,dummy-virt (DT)
>>>> Call trace:
>> ...
>>>>   __asan_load8+0xb4/0xbc mm/kasan/generic.c:252
>>>>   iov_iter_revert+0x11c/0x404 lib/iov_iter.c:1139
>>>>   io_read fs/io_uring.c:3421 [inline]
>>>>   io_issue_sqe+0x2344/0x2d64 fs/io_uring.c:5943
>>>>   __io_queue_sqe+0x19c/0x520 fs/io_uring.c:6260
>>>>   io_queue_sqe+0x2a4/0x590 fs/io_uring.c:6326
>>>>   io_submit_sqe fs/io_uring.c:6395 [inline]
>>>>   io_submit_sqes+0x4c0/0xa04 fs/io_uring.c:6624
>> ...
>>>>
>>>> blkdev_read_iter can truncate iov_iter's count since the count + pos may
>>>> exceed the size of the blkdev. This will confuse io_read that we have
>>>> consume the iovec. And once we do the iov_iter_revert in io_read, we
>>>> will trigger the slab-out-of-bounds. Fix it by reexpand the count with
>>>> size has been truncated.
>>>
>>> Looks right,
>>>
>>> Acked-by: Pavel Begunkov <asml.silencec@gmail.com>
>>
>> Fwiw, we need to forget to drag it through 5.13 + stable
> 
> Err, yypo, to _not_ forget to 5.13 + stable...
> 
>>
>>
>>>>
>>>> blkdev_write_iter can trigger the problem too.
>>>>
>>>> Signed-off-by: yangerkun <yangerkun@huawei.com>
>>>> ---
>>>>   fs/block_dev.c | 20 +++++++++++++++++---
>>>>   1 file changed, 17 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>>>> index 92ed7d5df677..788e1014576f 100644
>>>> --- a/fs/block_dev.c
>>>> +++ b/fs/block_dev.c
>>>> @@ -1680,6 +1680,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>>>   	struct inode *bd_inode = bdev_file_inode(file);
>>>>   	loff_t size = i_size_read(bd_inode);
>>>>   	struct blk_plug plug;
>>>> +	size_t shorted = 0;
>>>>   	ssize_t ret;
>>>>   
>>>>   	if (bdev_read_only(I_BDEV(bd_inode)))
>>>> @@ -1697,12 +1698,17 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>>>   	if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
>>>>   		return -EOPNOTSUPP;
>>>>   
>>>> -	iov_iter_truncate(from, size - iocb->ki_pos);
>>>> +	size -= iocb->ki_pos;
>>>> +	if (iov_iter_count(from) > size) {
>>>> +		shorted = iov_iter_count(from) - size;
>>>> +		iov_iter_truncate(from, size);
>>>> +	}
>>>>   
>>>>   	blk_start_plug(&plug);
>>>>   	ret = __generic_file_write_iter(iocb, from);
>>>>   	if (ret > 0)
>>>>   		ret = generic_write_sync(iocb, ret);
>>>> +	iov_iter_reexpand(from, iov_iter_count(from) + shorted);
>>>>   	blk_finish_plug(&plug);
>>>>   	return ret;
>>>>   }
>>>> @@ -1714,13 +1720,21 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>>   	struct inode *bd_inode = bdev_file_inode(file);
>>>>   	loff_t size = i_size_read(bd_inode);
>>>>   	loff_t pos = iocb->ki_pos;
>>>> +	size_t shorted = 0;
>>>> +	ssize_t ret;
>>>>   
>>>>   	if (pos >= size)
>>>>   		return 0;
>>>>   
>>>>   	size -= pos;
>>>> -	iov_iter_truncate(to, size);
>>>> -	return generic_file_read_iter(iocb, to);
>>>> +	if (iov_iter_count(to) > size) {
>>>> +		shorted = iov_iter_count(to) - size;
>>>> +		iov_iter_truncate(to, size);
>>>> +	}
>>>> +
>>>> +	ret = generic_file_read_iter(iocb, to);
>>>> +	iov_iter_reexpand(to, iov_iter_count(to) + shorted);
>>>> +	return ret;
>>>>   }
>>>>   EXPORT_SYMBOL_GPL(blkdev_read_iter);
>>>>   
>>>>
>>>
>>
>
Pavel Begunkov April 30, 2021, 12:57 p.m. UTC | #8
On 4/28/21 7:16 AM, yangerkun wrote:
> Hi,
> 
> Should we pick this patch for 5.13?

Looks ok to me

> 
> 在 2021/4/16 1:39, Pavel Begunkov 写道:
>> On 15/04/2021 18:37, Pavel Begunkov wrote:
>>> On 09/04/2021 15:49, Pavel Begunkov wrote:
>>>> On 01/04/2021 08:18, yangerkun wrote:
>>>>> We get a bug:
>>>>>
>>>>> BUG: KASAN: slab-out-of-bounds in iov_iter_revert+0x11c/0x404
>>>>> lib/iov_iter.c:1139
>>>>> Read of size 8 at addr ffff0000d3fb11f8 by task
>>>>>
>>>>> CPU: 0 PID: 12582 Comm: syz-executor.2 Not tainted
>>>>> 5.10.0-00843-g352c8610ccd2 #2
>>>>> Hardware name: linux,dummy-virt (DT)
>>>>> Call trace:
>>> ...
>>>>>   __asan_load8+0xb4/0xbc mm/kasan/generic.c:252
>>>>>   iov_iter_revert+0x11c/0x404 lib/iov_iter.c:1139
>>>>>   io_read fs/io_uring.c:3421 [inline]
>>>>>   io_issue_sqe+0x2344/0x2d64 fs/io_uring.c:5943
>>>>>   __io_queue_sqe+0x19c/0x520 fs/io_uring.c:6260
>>>>>   io_queue_sqe+0x2a4/0x590 fs/io_uring.c:6326
>>>>>   io_submit_sqe fs/io_uring.c:6395 [inline]
>>>>>   io_submit_sqes+0x4c0/0xa04 fs/io_uring.c:6624
>>> ...
>>>>>
>>>>> blkdev_read_iter can truncate iov_iter's count since the count + pos may
>>>>> exceed the size of the blkdev. This will confuse io_read that we have
>>>>> consume the iovec. And once we do the iov_iter_revert in io_read, we
>>>>> will trigger the slab-out-of-bounds. Fix it by reexpand the count with
>>>>> size has been truncated.
>>>>
>>>> Looks right,
>>>>
>>>> Acked-by: Pavel Begunkov <asml.silencec@gmail.com>
>>>
>>> Fwiw, we need to forget to drag it through 5.13 + stable
>>
>> Err, yypo, to _not_ forget to 5.13 + stable...
>>
>>>
>>>
>>>>>
>>>>> blkdev_write_iter can trigger the problem too.
>>>>>
>>>>> Signed-off-by: yangerkun <yangerkun@huawei.com>
>>>>> ---
>>>>>   fs/block_dev.c | 20 +++++++++++++++++---
>>>>>   1 file changed, 17 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/fs/block_dev.c b/fs/block_dev.c
>>>>> index 92ed7d5df677..788e1014576f 100644
>>>>> --- a/fs/block_dev.c
>>>>> +++ b/fs/block_dev.c
>>>>> @@ -1680,6 +1680,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>>>>       struct inode *bd_inode = bdev_file_inode(file);
>>>>>       loff_t size = i_size_read(bd_inode);
>>>>>       struct blk_plug plug;
>>>>> +    size_t shorted = 0;
>>>>>       ssize_t ret;
>>>>>         if (bdev_read_only(I_BDEV(bd_inode)))
>>>>> @@ -1697,12 +1698,17 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>>>>       if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
>>>>>           return -EOPNOTSUPP;
>>>>>   -    iov_iter_truncate(from, size - iocb->ki_pos);
>>>>> +    size -= iocb->ki_pos;
>>>>> +    if (iov_iter_count(from) > size) {
>>>>> +        shorted = iov_iter_count(from) - size;
>>>>> +        iov_iter_truncate(from, size);
>>>>> +    }
>>>>>         blk_start_plug(&plug);
>>>>>       ret = __generic_file_write_iter(iocb, from);
>>>>>       if (ret > 0)
>>>>>           ret = generic_write_sync(iocb, ret);
>>>>> +    iov_iter_reexpand(from, iov_iter_count(from) + shorted);
>>>>>       blk_finish_plug(&plug);
>>>>>       return ret;
>>>>>   }
>>>>> @@ -1714,13 +1720,21 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>>>>>       struct inode *bd_inode = bdev_file_inode(file);
>>>>>       loff_t size = i_size_read(bd_inode);
>>>>>       loff_t pos = iocb->ki_pos;
>>>>> +    size_t shorted = 0;
>>>>> +    ssize_t ret;
>>>>>         if (pos >= size)
>>>>>           return 0;
>>>>>         size -= pos;
>>>>> -    iov_iter_truncate(to, size);
>>>>> -    return generic_file_read_iter(iocb, to);
>>>>> +    if (iov_iter_count(to) > size) {
>>>>> +        shorted = iov_iter_count(to) - size;
>>>>> +        iov_iter_truncate(to, size);
>>>>> +    }
>>>>> +
>>>>> +    ret = generic_file_read_iter(iocb, to);
>>>>> +    iov_iter_reexpand(to, iov_iter_count(to) + shorted);
>>>>> +    return ret;
>>>>>   }
>>>>>   EXPORT_SYMBOL_GPL(blkdev_read_iter);
>>>>>  
>>>>
>>>
>>
Al Viro April 30, 2021, 2:35 p.m. UTC | #9
On Fri, Apr 30, 2021 at 01:57:22PM +0100, Pavel Begunkov wrote:
> On 4/28/21 7:16 AM, yangerkun wrote:
> > Hi,
> > 
> > Should we pick this patch for 5.13?
> 
> Looks ok to me

	Looks sane.  BTW, Pavel, could you go over #untested.iov_iter
and give it some beating?  Ideally - with per-commit profiling to see
what speedups/slowdowns do they come with...

	It's not in the final state (if nothing else, it needs to be
rebased on top of xarray stuff, and there will be followup cleanups
as well), but I'd appreciate testing and profiling data...

	It does survive xfstests + LTP syscall tests, but that's about
it.
Pavel Begunkov May 6, 2021, 4:57 p.m. UTC | #10
On 4/30/21 3:35 PM, Al Viro wrote:
> On Fri, Apr 30, 2021 at 01:57:22PM +0100, Pavel Begunkov wrote:
>> On 4/28/21 7:16 AM, yangerkun wrote:
>>> Hi,
>>>
>>> Should we pick this patch for 5.13?
>>
>> Looks ok to me
> 
> 	Looks sane.  BTW, Pavel, could you go over #untested.iov_iter
> and give it some beating?  Ideally - with per-commit profiling to see
> what speedups/slowdowns do they come with...

I've heard Jens already tested it out. Jens, is that right? Can you
share? especially since you have much more fitting hardware.

> 
> 	It's not in the final state (if nothing else, it needs to be
> rebased on top of xarray stuff, and there will be followup cleanups
> as well), but I'd appreciate testing and profiling data...
> 
> 	It does survive xfstests + LTP syscall tests, but that's about
> it.
>
Al Viro May 6, 2021, 5:17 p.m. UTC | #11
On Thu, May 06, 2021 at 05:57:02PM +0100, Pavel Begunkov wrote:
> On 4/30/21 3:35 PM, Al Viro wrote:
> > On Fri, Apr 30, 2021 at 01:57:22PM +0100, Pavel Begunkov wrote:
> >> On 4/28/21 7:16 AM, yangerkun wrote:
> >>> Hi,
> >>>
> >>> Should we pick this patch for 5.13?
> >>
> >> Looks ok to me
> > 
> > 	Looks sane.  BTW, Pavel, could you go over #untested.iov_iter
> > and give it some beating?  Ideally - with per-commit profiling to see
> > what speedups/slowdowns do they come with...
> 
> I've heard Jens already tested it out. Jens, is that right? Can you
> share? especially since you have much more fitting hardware.

FWIW, the current branch is #untested.iov_iter-3 and the code generated
by it at least _looks_ better than with mainline; how much of an improvement
does it make would have to be found by profiling...
Jens Axboe May 6, 2021, 5:19 p.m. UTC | #12
On 4/30/21 8:35 AM, Al Viro wrote:
> On Fri, Apr 30, 2021 at 01:57:22PM +0100, Pavel Begunkov wrote:
>> On 4/28/21 7:16 AM, yangerkun wrote:
>>> Hi,
>>>
>>> Should we pick this patch for 5.13?
>>
>> Looks ok to me
> 
> 	Looks sane.  BTW, Pavel, could you go over #untested.iov_iter
> and give it some beating?  Ideally - with per-commit profiling to see
> what speedups/slowdowns do they come with...
> 
> 	It's not in the final state (if nothing else, it needs to be
> rebased on top of xarray stuff, and there will be followup cleanups
> as well), but I'd appreciate testing and profiling data...
> 
> 	It does survive xfstests + LTP syscall tests, but that's about
> it.

Al, I ran your v3 branch of that and I didn't see anything in terms
of speedups. The test case is something that just writes to eventfd
a ton of times, enough to get a picture of the overall runtime. First
I ran with the existing baseline, which is eventfd using ->write():

Executed in  436.58 millis    fish           external
   usr time  106.21 millis  121.00 micros  106.09 millis
   sys time  331.32 millis   33.00 micros  331.29 millis

Executed in  436.84 millis    fish           external
   usr time  113.38 millis    0.00 micros  113.38 millis
   sys time  324.32 millis  226.00 micros  324.10 millis

Then I ran it with the eventfd ->write_iter() patch I posted:

Executed in  484.54 millis    fish           external
   usr time   93.19 millis  119.00 micros   93.07 millis
   sys time  391.35 millis   46.00 micros  391.30 millis

Executed in  485.45 millis    fish           external
   usr time   96.05 millis    0.00 micros   96.05 millis
   sys time  389.42 millis  216.00 micros  389.20 millis

Doing a quick profile, on the latter run with ->write_iter() we're
spending 8% of the time in _copy_from_iter(), and 4% in
new_sync_write(). That's obviously not there at all for the first case.
Both have about 4% in eventfd_write(). Non-iter case spends 1% in
copy_from_user().

Finally with your branch pulled in as well, iow using ->write_iter() for
eventfd and your iov changes:

Executed in  485.26 millis    fish           external
   usr time  103.09 millis   70.00 micros  103.03 millis
   sys time  382.18 millis   83.00 micros  382.09 millis

Executed in  485.16 millis    fish           external
   usr time  104.07 millis   69.00 micros  104.00 millis
   sys time  381.09 millis   94.00 micros  381.00 millis

and there's no real difference there. We're spending less time in
_copy_from_iter() (8% -> 6%) and less time in new_sync_write(), but
doesn't seem to manifest itself in reduced runtime.
Al Viro May 6, 2021, 6:55 p.m. UTC | #13
On Thu, May 06, 2021 at 11:19:03AM -0600, Jens Axboe wrote:

> Doing a quick profile, on the latter run with ->write_iter() we're
> spending 8% of the time in _copy_from_iter(), and 4% in
> new_sync_write(). That's obviously not there at all for the first case.
> Both have about 4% in eventfd_write(). Non-iter case spends 1% in
> copy_from_user().
> 
> Finally with your branch pulled in as well, iow using ->write_iter() for
> eventfd and your iov changes:
> 
> Executed in  485.26 millis    fish           external
>    usr time  103.09 millis   70.00 micros  103.03 millis
>    sys time  382.18 millis   83.00 micros  382.09 millis
> 
> Executed in  485.16 millis    fish           external
>    usr time  104.07 millis   69.00 micros  104.00 millis
>    sys time  381.09 millis   94.00 micros  381.00 millis
> 
> and there's no real difference there. We're spending less time in
> _copy_from_iter() (8% -> 6%) and less time in new_sync_write(), but
> doesn't seem to manifest itself in reduced runtime.

Interesting... do you have instruction-level profiles for _copy_from_iter()
and new_sync_write() on the last of those trees?
Jens Axboe May 6, 2021, 7:15 p.m. UTC | #14
On 5/6/21 12:55 PM, Al Viro wrote:
> On Thu, May 06, 2021 at 11:19:03AM -0600, Jens Axboe wrote:
> 
>> Doing a quick profile, on the latter run with ->write_iter() we're
>> spending 8% of the time in _copy_from_iter(), and 4% in
>> new_sync_write(). That's obviously not there at all for the first case.
>> Both have about 4% in eventfd_write(). Non-iter case spends 1% in
>> copy_from_user().
>>
>> Finally with your branch pulled in as well, iow using ->write_iter() for
>> eventfd and your iov changes:
>>
>> Executed in  485.26 millis    fish           external
>>    usr time  103.09 millis   70.00 micros  103.03 millis
>>    sys time  382.18 millis   83.00 micros  382.09 millis
>>
>> Executed in  485.16 millis    fish           external
>>    usr time  104.07 millis   69.00 micros  104.00 millis
>>    sys time  381.09 millis   94.00 micros  381.00 millis
>>
>> and there's no real difference there. We're spending less time in
>> _copy_from_iter() (8% -> 6%) and less time in new_sync_write(), but
>> doesn't seem to manifest itself in reduced runtime.
> 
> Interesting... do you have instruction-level profiles for _copy_from_iter()
> and new_sync_write() on the last of those trees?

Attached output of perf annotate <func> for that last run.
Al Viro May 6, 2021, 9:08 p.m. UTC | #15
On Thu, May 06, 2021 at 01:15:01PM -0600, Jens Axboe wrote:

> Attached output of perf annotate <func> for that last run.

Heh...  I wonder if keeping the value of iocb_flags(file) in
struct file itself would have a visible effect...
Matthew Wilcox (Oracle) May 6, 2021, 9:17 p.m. UTC | #16
On Thu, May 06, 2021 at 09:08:50PM +0000, Al Viro wrote:
> On Thu, May 06, 2021 at 01:15:01PM -0600, Jens Axboe wrote:
> 
> > Attached output of perf annotate <func> for that last run.
> 
> Heh...  I wonder if keeping the value of iocb_flags(file) in
> struct file itself would have a visible effect...

I suggested that ...
https://lore.kernel.org/linux-fsdevel/20210110061321.GC35215@casper.infradead.org/
Jens Axboe May 7, 2021, 2:59 p.m. UTC | #17
On 5/6/21 3:08 PM, Al Viro wrote:
> On Thu, May 06, 2021 at 01:15:01PM -0600, Jens Axboe wrote:
> 
>> Attached output of perf annotate <func> for that last run.
> 
> Heh...  I wonder if keeping the value of iocb_flags(file) in
> struct file itself would have a visible effect...

A quick hack to get rid of the init_sync_kiocb() in new_sync_write() and
just eliminate the ki_flags read in eventfd_write(), as the test case is
blocking. That brings us closer to the ->write() method, down 7% vs the
previous 10%:

Executed in  468.23 millis    fish           external
   usr time   95.09 millis  114.00 micros   94.98 millis
   sys time  372.98 millis   76.00 micros  372.90 millis

Executed in  468.97 millis    fish           external
   usr time   91.05 millis   89.00 micros   90.96 millis
   sys time  377.92 millis   69.00 micros  377.85 millis
diff mbox series

Patch

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 92ed7d5df677..788e1014576f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1680,6 +1680,7 @@  ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct inode *bd_inode = bdev_file_inode(file);
 	loff_t size = i_size_read(bd_inode);
 	struct blk_plug plug;
+	size_t shorted = 0;
 	ssize_t ret;
 
 	if (bdev_read_only(I_BDEV(bd_inode)))
@@ -1697,12 +1698,17 @@  ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
 		return -EOPNOTSUPP;
 
-	iov_iter_truncate(from, size - iocb->ki_pos);
+	size -= iocb->ki_pos;
+	if (iov_iter_count(from) > size) {
+		shorted = iov_iter_count(from) - size;
+		iov_iter_truncate(from, size);
+	}
 
 	blk_start_plug(&plug);
 	ret = __generic_file_write_iter(iocb, from);
 	if (ret > 0)
 		ret = generic_write_sync(iocb, ret);
+	iov_iter_reexpand(from, iov_iter_count(from) + shorted);
 	blk_finish_plug(&plug);
 	return ret;
 }
@@ -1714,13 +1720,21 @@  ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	struct inode *bd_inode = bdev_file_inode(file);
 	loff_t size = i_size_read(bd_inode);
 	loff_t pos = iocb->ki_pos;
+	size_t shorted = 0;
+	ssize_t ret;
 
 	if (pos >= size)
 		return 0;
 
 	size -= pos;
-	iov_iter_truncate(to, size);
-	return generic_file_read_iter(iocb, to);
+	if (iov_iter_count(to) > size) {
+		shorted = iov_iter_count(to) - size;
+		iov_iter_truncate(to, size);
+	}
+
+	ret = generic_file_read_iter(iocb, to);
+	iov_iter_reexpand(to, iov_iter_count(to) + shorted);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_read_iter);