diff mbox series

[1/7] btrfs: Preallocate chunks in cow_file_range_async

Message ID 20190312152030.31987-2-nborisov@suse.com (mailing list archive)
State New, archived
Headers show
Series Compress path cleanups | expand

Commit Message

Nikolay Borisov March 12, 2019, 3:20 p.m. UTC
This commit changes the implementation of cow_file_range_async in order
to get rid of the BUG_ON in the middle of the loop. Additionally it
reworks the inner loop in the hopes of making it more understandable.

The idea is to make async_cow be a top-level structure, shared
amongst all chunks being sent for compression. This allows us to perform
one memory allocation at the beginning and gracefully fail the IO if
there isn't enough memory. Now, each chunk is going to be described
by an async_chunk struct. It's the responsibility of the final chunk
to actually free the memory.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
---
 fs/btrfs/inode.c | 105 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 71 insertions(+), 34 deletions(-)

Comments

Johannes Thumshirn March 12, 2019, 3:35 p.m. UTC | #1
Looks good,
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
David Sterba March 27, 2019, 5:23 p.m. UTC | #2
On Tue, Mar 12, 2019 at 05:20:24PM +0200, Nikolay Borisov wrote:
> @@ -1190,45 +1201,71 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
>  				unsigned int write_flags)
>  {
>  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> -	struct async_cow *async_cow;
> +	struct async_cow *ctx;
> +	struct async_chunk *async_chunk;
>  	unsigned long nr_pages;
>  	u64 cur_end;
> +	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
> +	int i;
> +	bool should_compress;
>  
>  	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
>  			 1, 0, NULL);
> -	while (start < end) {
> -		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
> -		BUG_ON(!async_cow); /* -ENOMEM */
> +
> +	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
> +	    !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
> +		num_chunks = 1;
> +		should_compress = false;
> +	} else {
> +		should_compress = true;
> +	}
> +
> +	ctx = kmalloc(struct_size(ctx, chunks, num_chunks), GFP_NOFS);

This leads to OOM due to high order allocation. And this is worse than
the previous state, where there are many small allocation that could
potentially fail (but most likely will not due to GFP_NOFS and size <
PAGE_SIZE).

So this needs to be reworked to avoid the costly allocations or reverted
to the previous state.

btrfs/138               [19:44:05][ 4034.368157] run fstests btrfs/138 at 2019-03-25 19:44:05
[ 4034.559716] BTRFS: device fsid 9300f07a-78f4-4ac6-8376-1a902ef26830 devid 1 transid 5 /dev/vdb
[ 4034.573670] BTRFS info (device vdb): disk space caching is enabled
[ 4034.575068] BTRFS info (device vdb): has skinny extents
[ 4034.576258] BTRFS info (device vdb): flagging fs with big metadata feature
[ 4034.580226] BTRFS info (device vdb): checking UUID tree
[ 4066.104734] BTRFS info (device vdb): disk space caching is enabled
[ 4066.108558] BTRFS info (device vdb): has skinny extents
[ 4066.186856] BTRFS info (device vdb): setting 8 feature flag
[ 4074.017307] BTRFS info (device vdb): disk space caching is enabled
[ 4074.019646] BTRFS info (device vdb): has skinny extents
[ 4074.065117] BTRFS info (device vdb): setting 16 feature flag
[ 4075.787401] kworker/u8:12: page allocation failure: order:4, mode:0x604040(GFP_NOFS|__GFP_COMP), nodemask=(null)
[ 4075.789581] CPU: 0 PID: 31258 Comm: kworker/u8:12 Not tainted 5.0.0-rc8-default+ #524
[ 4075.791235] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-0-ga698c89-prebuilt.qemu.org 04/01/2014
[ 4075.793334] Workqueue: writeback wb_workfn (flush-btrfs-718)
[ 4075.794455] Call Trace:
[ 4075.795029]  dump_stack+0x67/0x90
[ 4075.795756]  warn_alloc.cold.131+0x73/0xf3
[ 4075.796601]  __alloc_pages_slowpath+0xa0e/0xb50
[ 4075.797595]  ? __wake_up_common_lock+0x89/0xc0
[ 4075.798558]  __alloc_pages_nodemask+0x2bd/0x310
[ 4075.799537]  kmalloc_order+0x14/0x60
[ 4075.800382]  kmalloc_order_trace+0x1d/0x120
[ 4075.801341]  btrfs_run_delalloc_range+0x3e6/0x4b0 [btrfs]
[ 4075.802344]  writepage_delalloc+0xf8/0x150 [btrfs]
[ 4075.802991]  __extent_writepage+0x113/0x420 [btrfs]
[ 4075.803640]  extent_write_cache_pages+0x2a6/0x400 [btrfs]
[ 4075.804340]  extent_writepages+0x52/0xa0 [btrfs]
[ 4075.804951]  do_writepages+0x3e/0xe0
[ 4075.805480]  ? writeback_sb_inodes+0x133/0x550
[ 4075.806406]  __writeback_single_inode+0x54/0x640
[ 4075.807315]  writeback_sb_inodes+0x204/0x550
[ 4075.808112]  __writeback_inodes_wb+0x5d/0xb0
[ 4075.808692]  wb_writeback+0x337/0x4a0
[ 4075.809207]  wb_workfn+0x3a7/0x590
[ 4075.809849]  process_one_work+0x246/0x610
[ 4075.810665]  worker_thread+0x3c/0x390
[ 4075.811415]  ? rescuer_thread+0x360/0x360
[ 4075.812293]  kthread+0x116/0x130
[ 4075.812965]  ? kthread_create_on_node+0x60/0x60
[ 4075.813870]  ret_from_fork+0x24/0x30
[ 4075.814664] Mem-Info:
[ 4075.815167] active_anon:2942 inactive_anon:15105 isolated_anon:0
[ 4075.815167]  active_file:2749 inactive_file:454876 isolated_file:0
[ 4075.815167]  unevictable:0 dirty:68316 writeback:0 unstable:0
[ 4075.815167]  slab_reclaimable:5500 slab_unreclaimable:6458
[ 4075.815167]  mapped:940 shmem:15483 pagetables:51 bounce:0
[ 4075.815167]  free:7068 free_pcp:297 free_cma:0
[ 4075.823236] Node 0 active_anon:11768kB inactive_anon:60420kB active_file:10996kB inactive_file:1827676kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:3760kB dirty:277360kB writeback:0kB shmem:61932kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no
[ 4075.828200] Node 0 DMA free:7860kB min:44kB low:56kB high:68kB active_anon:0kB inactive_anon:4kB active_file:0kB inactive_file:8012kB unevictable:0kB writepending:0kB present:15992kB managed:15908kB mlocked:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB
[ 4075.834484] lowmem_reserve[]: 0 1955 1955 1955
[ 4075.835419] Node 0 DMA32 free:11292kB min:5632kB low:7632kB high:9632kB active_anon:11768kB inactive_anon:60416kB active_file:10996kB inactive_file:1820532kB unevictable:0kB writepending:281184kB present:2080568kB managed:2009324kB mlocked:0kB kernel_stack:1984kB pagetables:204kB bounce:0kB free_pcp:132kB local_pcp:0kB free_cma:0k 
[ 4075.841848] lowmem_reserve[]: 0 0 0 0
[ 4075.842677] Node 0 DMA: 1*4kB (U) 2*8kB (U) 4*16kB (UME) 5*32kB (UME) 1*64kB (E) 3*128kB (UME) 2*256kB (UE) 1*512kB (E) 2*1024kB (UE) 2*2048kB (ME) 0*4096kB = 7860kB
[ 4075.844961] Node 0 DMA32: 234*4kB (UME) 238*8kB (UME) 426*16kB (UM) 43*32kB (UM) 28*64kB (UM) 11*128kB (UM) 0*256kB 0*512kB 0*1024kB 1*2048kB (H) 0*4096kB = 16280kB
[ 4075.847915] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
[ 4075.849266] 474599 total pagecache pages
[ 4075.850058] 0 pages in swap cache
[ 4075.850808] Swap cache stats: add 0, delete 0, find 0/0
[ 4075.851990] Free swap  = 0kB
[ 4075.852811] Total swap = 0kB
[ 4075.853635] 524140 pages RAM
[ 4075.854351] 0 pages HighMem/MovableOnly
[ 4075.855048] 17832 pages reserved
Nikolay Borisov March 28, 2019, 12:49 p.m. UTC | #3
On 27.03.19 г. 19:23 ч., David Sterba wrote:
> On Tue, Mar 12, 2019 at 05:20:24PM +0200, Nikolay Borisov wrote:
>> @@ -1190,45 +1201,71 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
>>  				unsigned int write_flags)
>>  {
>>  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>> -	struct async_cow *async_cow;
>> +	struct async_cow *ctx;
>> +	struct async_chunk *async_chunk;
>>  	unsigned long nr_pages;
>>  	u64 cur_end;
>> +	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
>> +	int i;
>> +	bool should_compress;
>>  
>>  	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
>>  			 1, 0, NULL);
>> -	while (start < end) {
>> -		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
>> -		BUG_ON(!async_cow); /* -ENOMEM */
>> +
>> +	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
>> +	    !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
>> +		num_chunks = 1;
>> +		should_compress = false;
>> +	} else {
>> +		should_compress = true;
>> +	}
>> +
>> +	ctx = kmalloc(struct_size(ctx, chunks, num_chunks), GFP_NOFS);
> 
> This leads to OOM due to high order allocation. And this is worse than
> the previous state, where there are many small allocation that could
> potentially fail (but most likely will not due to GFP_NOFS and size <
> PAGE_SIZE).
> 
> So this needs to be reworked to avoid the costly allocations or reverted
> to the previous state.

Right, makes sense. In order to have a simplified submission logic I
think to rework the allocation to have a loop that allocates a single
item for every chunk or alternatively switch to using kvmalloc? I think
the fact that vmalloced memory might not be contiguous is not critical
for the metadata structures in this case?

> 
> btrfs/138               [19:44:05][ 4034.368157] run fstests btrfs/138 at 2019-03-25 19:44:05
> [ 4034.559716] BTRFS: device fsid 9300f07a-78f4-4ac6-8376-1a902ef26830 devid 1 transid 5 /dev/vdb
> [ 4034.573670] BTRFS info (device vdb): disk space caching is enabled
> [ 4034.575068] BTRFS info (device vdb): has skinny extents
> [ 4034.576258] BTRFS info (device vdb): flagging fs with big metadata feature
> [ 4034.580226] BTRFS info (device vdb): checking UUID tree
> [ 4066.104734] BTRFS info (device vdb): disk space caching is enabled
> [ 4066.108558] BTRFS info (device vdb): has skinny extents
> [ 4066.186856] BTRFS info (device vdb): setting 8 feature flag
> [ 4074.017307] BTRFS info (device vdb): disk space caching is enabled
> [ 4074.019646] BTRFS info (device vdb): has skinny extents
> [ 4074.065117] BTRFS info (device vdb): setting 16 feature flag
> [ 4075.787401] kworker/u8:12: page allocation failure: order:4, mode:0x604040(GFP_NOFS|__GFP_COMP), nodemask=(null)
> [ 4075.789581] CPU: 0 PID: 31258 Comm: kworker/u8:12 Not tainted 5.0.0-rc8-default+ #524
> [ 4075.791235] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-0-ga698c89-prebuilt.qemu.org 04/01/2014
> [ 4075.793334] Workqueue: writeback wb_workfn (flush-btrfs-718)
> [ 4075.794455] Call Trace:
> [ 4075.795029]  dump_stack+0x67/0x90
> [ 4075.795756]  warn_alloc.cold.131+0x73/0xf3
> [ 4075.796601]  __alloc_pages_slowpath+0xa0e/0xb50
> [ 4075.797595]  ? __wake_up_common_lock+0x89/0xc0
> [ 4075.798558]  __alloc_pages_nodemask+0x2bd/0x310
> [ 4075.799537]  kmalloc_order+0x14/0x60
> [ 4075.800382]  kmalloc_order_trace+0x1d/0x120
> [ 4075.801341]  btrfs_run_delalloc_range+0x3e6/0x4b0 [btrfs]
> [ 4075.802344]  writepage_delalloc+0xf8/0x150 [btrfs]
> [ 4075.802991]  __extent_writepage+0x113/0x420 [btrfs]
> [ 4075.803640]  extent_write_cache_pages+0x2a6/0x400 [btrfs]
> [ 4075.804340]  extent_writepages+0x52/0xa0 [btrfs]
> [ 4075.804951]  do_writepages+0x3e/0xe0
> [ 4075.805480]  ? writeback_sb_inodes+0x133/0x550
> [ 4075.806406]  __writeback_single_inode+0x54/0x640
> [ 4075.807315]  writeback_sb_inodes+0x204/0x550
> [ 4075.808112]  __writeback_inodes_wb+0x5d/0xb0
> [ 4075.808692]  wb_writeback+0x337/0x4a0
> [ 4075.809207]  wb_workfn+0x3a7/0x590
> [ 4075.809849]  process_one_work+0x246/0x610
> [ 4075.810665]  worker_thread+0x3c/0x390
> [ 4075.811415]  ? rescuer_thread+0x360/0x360
> [ 4075.812293]  kthread+0x116/0x130
> [ 4075.812965]  ? kthread_create_on_node+0x60/0x60
> [ 4075.813870]  ret_from_fork+0x24/0x30
> [ 4075.814664] Mem-Info:
> [ 4075.815167] active_anon:2942 inactive_anon:15105 isolated_anon:0
> [ 4075.815167]  active_file:2749 inactive_file:454876 isolated_file:0
> [ 4075.815167]  unevictable:0 dirty:68316 writeback:0 unstable:0
> [ 4075.815167]  slab_reclaimable:5500 slab_unreclaimable:6458
> [ 4075.815167]  mapped:940 shmem:15483 pagetables:51 bounce:0
> [ 4075.815167]  free:7068 free_pcp:297 free_cma:0
> [ 4075.823236] Node 0 active_anon:11768kB inactive_anon:60420kB active_file:10996kB inactive_file:1827676kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:3760kB dirty:277360kB writeback:0kB shmem:61932kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no
> [ 4075.828200] Node 0 DMA free:7860kB min:44kB low:56kB high:68kB active_anon:0kB inactive_anon:4kB active_file:0kB inactive_file:8012kB unevictable:0kB writepending:0kB present:15992kB managed:15908kB mlocked:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB
> [ 4075.834484] lowmem_reserve[]: 0 1955 1955 1955
> [ 4075.835419] Node 0 DMA32 free:11292kB min:5632kB low:7632kB high:9632kB active_anon:11768kB inactive_anon:60416kB active_file:10996kB inactive_file:1820532kB unevictable:0kB writepending:281184kB present:2080568kB managed:2009324kB mlocked:0kB kernel_stack:1984kB pagetables:204kB bounce:0kB free_pcp:132kB local_pcp:0kB free_cma:0k 
> [ 4075.841848] lowmem_reserve[]: 0 0 0 0
> [ 4075.842677] Node 0 DMA: 1*4kB (U) 2*8kB (U) 4*16kB (UME) 5*32kB (UME) 1*64kB (E) 3*128kB (UME) 2*256kB (UE) 1*512kB (E) 2*1024kB (UE) 2*2048kB (ME) 0*4096kB = 7860kB
> [ 4075.844961] Node 0 DMA32: 234*4kB (UME) 238*8kB (UME) 426*16kB (UM) 43*32kB (UM) 28*64kB (UM) 11*128kB (UM) 0*256kB 0*512kB 0*1024kB 1*2048kB (H) 0*4096kB = 16280kB
> [ 4075.847915] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
> [ 4075.849266] 474599 total pagecache pages
> [ 4075.850058] 0 pages in swap cache
> [ 4075.850808] Swap cache stats: add 0, delete 0, find 0/0
> [ 4075.851990] Free swap  = 0kB
> [ 4075.852811] Total swap = 0kB
> [ 4075.853635] 524140 pages RAM
> [ 4075.854351] 0 pages HighMem/MovableOnly
> [ 4075.855048] 17832 pages reserved
>
Nikolay Borisov March 28, 2019, 12:52 p.m. UTC | #4
On 28.03.19 г. 14:49 ч., Nikolay Borisov wrote:
> 
> 
> On 27.03.19 г. 19:23 ч., David Sterba wrote:
>> On Tue, Mar 12, 2019 at 05:20:24PM +0200, Nikolay Borisov wrote:
>>> @@ -1190,45 +1201,71 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
>>>  				unsigned int write_flags)
>>>  {
>>>  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>>> -	struct async_cow *async_cow;
>>> +	struct async_cow *ctx;
>>> +	struct async_chunk *async_chunk;
>>>  	unsigned long nr_pages;
>>>  	u64 cur_end;
>>> +	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
>>> +	int i;
>>> +	bool should_compress;
>>>  
>>>  	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
>>>  			 1, 0, NULL);
>>> -	while (start < end) {
>>> -		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
>>> -		BUG_ON(!async_cow); /* -ENOMEM */
>>> +
>>> +	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
>>> +	    !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
>>> +		num_chunks = 1;
>>> +		should_compress = false;
>>> +	} else {
>>> +		should_compress = true;
>>> +	}
>>> +
>>> +	ctx = kmalloc(struct_size(ctx, chunks, num_chunks), GFP_NOFS);
>>
>> This leads to OOM due to high order allocation. And this is worse than
>> the previous state, where there are many small allocation that could
>> potentially fail (but most likely will not due to GFP_NOFS and size <
>> PAGE_SIZE).
>>
>> So this needs to be reworked to avoid the costly allocations or reverted
>> to the previous state.
> 
> Right, makes sense. In order to have a simplified submission logic I
> think to rework the allocation to have a loop that allocates a single
> item for every chunk or alternatively switch to using kvmalloc? I think
> the fact that vmalloced memory might not be contiguous is not critical
> for the metadata structures in this case?

Just had a quick read through gfp_mask-from-fs-io.rst:

vmalloc doesn't support GFP_NOFS semantic because there are hardcoded
GFP_KERNEL allocations deep inside the allocator which are quite
non-trivial to fix up. That means that calling ``vmalloc`` with
GFP_NOFS/GFP_NOIO is almost always a bug. The good news is that the
NOFS/NOIO semantic can be achieved by the scope API.


> 
>>
>> btrfs/138               [19:44:05][ 4034.368157] run fstests btrfs/138 at 2019-03-25 19:44:05
>> [ 4034.559716] BTRFS: device fsid 9300f07a-78f4-4ac6-8376-1a902ef26830 devid 1 transid 5 /dev/vdb
>> [ 4034.573670] BTRFS info (device vdb): disk space caching is enabled
>> [ 4034.575068] BTRFS info (device vdb): has skinny extents
>> [ 4034.576258] BTRFS info (device vdb): flagging fs with big metadata feature
>> [ 4034.580226] BTRFS info (device vdb): checking UUID tree
>> [ 4066.104734] BTRFS info (device vdb): disk space caching is enabled
>> [ 4066.108558] BTRFS info (device vdb): has skinny extents
>> [ 4066.186856] BTRFS info (device vdb): setting 8 feature flag
>> [ 4074.017307] BTRFS info (device vdb): disk space caching is enabled
>> [ 4074.019646] BTRFS info (device vdb): has skinny extents
>> [ 4074.065117] BTRFS info (device vdb): setting 16 feature flag
>> [ 4075.787401] kworker/u8:12: page allocation failure: order:4, mode:0x604040(GFP_NOFS|__GFP_COMP), nodemask=(null)
>> [ 4075.789581] CPU: 0 PID: 31258 Comm: kworker/u8:12 Not tainted 5.0.0-rc8-default+ #524
>> [ 4075.791235] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-0-ga698c89-prebuilt.qemu.org 04/01/2014
>> [ 4075.793334] Workqueue: writeback wb_workfn (flush-btrfs-718)
>> [ 4075.794455] Call Trace:
>> [ 4075.795029]  dump_stack+0x67/0x90
>> [ 4075.795756]  warn_alloc.cold.131+0x73/0xf3
>> [ 4075.796601]  __alloc_pages_slowpath+0xa0e/0xb50
>> [ 4075.797595]  ? __wake_up_common_lock+0x89/0xc0
>> [ 4075.798558]  __alloc_pages_nodemask+0x2bd/0x310
>> [ 4075.799537]  kmalloc_order+0x14/0x60
>> [ 4075.800382]  kmalloc_order_trace+0x1d/0x120
>> [ 4075.801341]  btrfs_run_delalloc_range+0x3e6/0x4b0 [btrfs]
>> [ 4075.802344]  writepage_delalloc+0xf8/0x150 [btrfs]
>> [ 4075.802991]  __extent_writepage+0x113/0x420 [btrfs]
>> [ 4075.803640]  extent_write_cache_pages+0x2a6/0x400 [btrfs]
>> [ 4075.804340]  extent_writepages+0x52/0xa0 [btrfs]
>> [ 4075.804951]  do_writepages+0x3e/0xe0
>> [ 4075.805480]  ? writeback_sb_inodes+0x133/0x550
>> [ 4075.806406]  __writeback_single_inode+0x54/0x640
>> [ 4075.807315]  writeback_sb_inodes+0x204/0x550
>> [ 4075.808112]  __writeback_inodes_wb+0x5d/0xb0
>> [ 4075.808692]  wb_writeback+0x337/0x4a0
>> [ 4075.809207]  wb_workfn+0x3a7/0x590
>> [ 4075.809849]  process_one_work+0x246/0x610
>> [ 4075.810665]  worker_thread+0x3c/0x390
>> [ 4075.811415]  ? rescuer_thread+0x360/0x360
>> [ 4075.812293]  kthread+0x116/0x130
>> [ 4075.812965]  ? kthread_create_on_node+0x60/0x60
>> [ 4075.813870]  ret_from_fork+0x24/0x30
>> [ 4075.814664] Mem-Info:
>> [ 4075.815167] active_anon:2942 inactive_anon:15105 isolated_anon:0
>> [ 4075.815167]  active_file:2749 inactive_file:454876 isolated_file:0
>> [ 4075.815167]  unevictable:0 dirty:68316 writeback:0 unstable:0
>> [ 4075.815167]  slab_reclaimable:5500 slab_unreclaimable:6458
>> [ 4075.815167]  mapped:940 shmem:15483 pagetables:51 bounce:0
>> [ 4075.815167]  free:7068 free_pcp:297 free_cma:0
>> [ 4075.823236] Node 0 active_anon:11768kB inactive_anon:60420kB active_file:10996kB inactive_file:1827676kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:3760kB dirty:277360kB writeback:0kB shmem:61932kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no
>> [ 4075.828200] Node 0 DMA free:7860kB min:44kB low:56kB high:68kB active_anon:0kB inactive_anon:4kB active_file:0kB inactive_file:8012kB unevictable:0kB writepending:0kB present:15992kB managed:15908kB mlocked:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB
>> [ 4075.834484] lowmem_reserve[]: 0 1955 1955 1955
>> [ 4075.835419] Node 0 DMA32 free:11292kB min:5632kB low:7632kB high:9632kB active_anon:11768kB inactive_anon:60416kB active_file:10996kB inactive_file:1820532kB unevictable:0kB writepending:281184kB present:2080568kB managed:2009324kB mlocked:0kB kernel_stack:1984kB pagetables:204kB bounce:0kB free_pcp:132kB local_pcp:0kB free_cma:0k 
>> [ 4075.841848] lowmem_reserve[]: 0 0 0 0
>> [ 4075.842677] Node 0 DMA: 1*4kB (U) 2*8kB (U) 4*16kB (UME) 5*32kB (UME) 1*64kB (E) 3*128kB (UME) 2*256kB (UE) 1*512kB (E) 2*1024kB (UE) 2*2048kB (ME) 0*4096kB = 7860kB
>> [ 4075.844961] Node 0 DMA32: 234*4kB (UME) 238*8kB (UME) 426*16kB (UM) 43*32kB (UM) 28*64kB (UM) 11*128kB (UM) 0*256kB 0*512kB 0*1024kB 1*2048kB (H) 0*4096kB = 16280kB
>> [ 4075.847915] Node 0 hugepages_total=0 hugepages_free=0 hugepages_surp=0 hugepages_size=2048kB
>> [ 4075.849266] 474599 total pagecache pages
>> [ 4075.850058] 0 pages in swap cache
>> [ 4075.850808] Swap cache stats: add 0, delete 0, find 0/0
>> [ 4075.851990] Free swap  = 0kB
>> [ 4075.852811] Total swap = 0kB
>> [ 4075.853635] 524140 pages RAM
>> [ 4075.854351] 0 pages HighMem/MovableOnly
>> [ 4075.855048] 17832 pages reserved
>>
David Sterba March 28, 2019, 2:11 p.m. UTC | #5
On Thu, Mar 28, 2019 at 02:49:30PM +0200, Nikolay Borisov wrote:
> 
> 
> On 27.03.19 г. 19:23 ч., David Sterba wrote:
> > On Tue, Mar 12, 2019 at 05:20:24PM +0200, Nikolay Borisov wrote:
> >> @@ -1190,45 +1201,71 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
> >>  				unsigned int write_flags)
> >>  {
> >>  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> >> -	struct async_cow *async_cow;
> >> +	struct async_cow *ctx;
> >> +	struct async_chunk *async_chunk;
> >>  	unsigned long nr_pages;
> >>  	u64 cur_end;
> >> +	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
> >> +	int i;
> >> +	bool should_compress;
> >>  
> >>  	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
> >>  			 1, 0, NULL);
> >> -	while (start < end) {
> >> -		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
> >> -		BUG_ON(!async_cow); /* -ENOMEM */
> >> +
> >> +	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
> >> +	    !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
> >> +		num_chunks = 1;
> >> +		should_compress = false;
> >> +	} else {
> >> +		should_compress = true;
> >> +	}
> >> +
> >> +	ctx = kmalloc(struct_size(ctx, chunks, num_chunks), GFP_NOFS);
> > 
> > This leads to OOM due to high order allocation. And this is worse than
> > the previous state, where there are many small allocation that could
> > potentially fail (but most likely will not due to GFP_NOFS and size <
> > PAGE_SIZE).
> > 
> > So this needs to be reworked to avoid the costly allocations or reverted
> > to the previous state.
> 
> Right, makes sense. In order to have a simplified submission logic I
> think to rework the allocation to have a loop that allocates a single
> item for every chunk or alternatively switch to using kvmalloc? I think
> the fact that vmalloced memory might not be contiguous is not critical
> for the metadata structures in this case?

kvmalloc would work here, though there is some overhead involved
compared to bare kmalloc (the mappings). As the call happens on writeout
path, I'd be cautious about unnecessary overhead.

If looping over the range works, then we can allocate the largest size
that does not require kvmalloc (PAGE_ALLOC_COSTLY_ORDER) and then reuse
it for each iteration.
Nikolay Borisov March 28, 2019, 3:10 p.m. UTC | #6
On 28.03.19 г. 16:11 ч., David Sterba wrote:
> On Thu, Mar 28, 2019 at 02:49:30PM +0200, Nikolay Borisov wrote:
>>
>>
>> On 27.03.19 г. 19:23 ч., David Sterba wrote:
>>> On Tue, Mar 12, 2019 at 05:20:24PM +0200, Nikolay Borisov wrote:
>>>> @@ -1190,45 +1201,71 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
>>>>  				unsigned int write_flags)
>>>>  {
>>>>  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
>>>> -	struct async_cow *async_cow;
>>>> +	struct async_cow *ctx;
>>>> +	struct async_chunk *async_chunk;
>>>>  	unsigned long nr_pages;
>>>>  	u64 cur_end;
>>>> +	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
>>>> +	int i;
>>>> +	bool should_compress;
>>>>  
>>>>  	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
>>>>  			 1, 0, NULL);
>>>> -	while (start < end) {
>>>> -		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
>>>> -		BUG_ON(!async_cow); /* -ENOMEM */
>>>> +
>>>> +	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
>>>> +	    !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
>>>> +		num_chunks = 1;
>>>> +		should_compress = false;
>>>> +	} else {
>>>> +		should_compress = true;
>>>> +	}
>>>> +
>>>> +	ctx = kmalloc(struct_size(ctx, chunks, num_chunks), GFP_NOFS);
>>>
>>> This leads to OOM due to high order allocation. And this is worse than
>>> the previous state, where there are many small allocation that could
>>> potentially fail (but most likely will not due to GFP_NOFS and size <
>>> PAGE_SIZE).
>>>
>>> So this needs to be reworked to avoid the costly allocations or reverted
>>> to the previous state.
>>
>> Right, makes sense. In order to have a simplified submission logic I
>> think to rework the allocation to have a loop that allocates a single
>> item for every chunk or alternatively switch to using kvmalloc? I think
>> the fact that vmalloced memory might not be contiguous is not critical
>> for the metadata structures in this case?
> 
> kvmalloc would work here, though there is some overhead involved
> compared to bare kmalloc (the mappings). As the call happens on writeout
> path, I'd be cautious about unnecessary overhead.
> 
> If looping over the range works, then we can allocate the largest size
> that does not require kvmalloc (PAGE_ALLOC_COSTLY_ORDER) and then reuse
> it for each iteration.

I'm afraid I don't understand your hybrid suggestion. Ie something along 
the lines of : 

if (struct_size(ctx, chunks, num_chunks) < ((2 << PAGE_ALLOC_COSTLY_ORDER) * PAGE_SIZE))  {
     use kmalloc
} else { 

????
}

>
David Sterba March 28, 2019, 3:45 p.m. UTC | #7
On Thu, Mar 28, 2019 at 05:10:39PM +0200, Nikolay Borisov wrote:
> 
> 
> On 28.03.19 г. 16:11 ч., David Sterba wrote:
> > On Thu, Mar 28, 2019 at 02:49:30PM +0200, Nikolay Borisov wrote:
> >>
> >>
> >> On 27.03.19 г. 19:23 ч., David Sterba wrote:
> >>> On Tue, Mar 12, 2019 at 05:20:24PM +0200, Nikolay Borisov wrote:
> >>>> @@ -1190,45 +1201,71 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
> >>>>  				unsigned int write_flags)
> >>>>  {
> >>>>  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> >>>> -	struct async_cow *async_cow;
> >>>> +	struct async_cow *ctx;
> >>>> +	struct async_chunk *async_chunk;
> >>>>  	unsigned long nr_pages;
> >>>>  	u64 cur_end;
> >>>> +	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
> >>>> +	int i;
> >>>> +	bool should_compress;
> >>>>  
> >>>>  	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
> >>>>  			 1, 0, NULL);
> >>>> -	while (start < end) {
> >>>> -		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
> >>>> -		BUG_ON(!async_cow); /* -ENOMEM */
> >>>> +
> >>>> +	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
> >>>> +	    !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
> >>>> +		num_chunks = 1;
> >>>> +		should_compress = false;
> >>>> +	} else {
> >>>> +		should_compress = true;
> >>>> +	}
> >>>> +
> >>>> +	ctx = kmalloc(struct_size(ctx, chunks, num_chunks), GFP_NOFS);
> >>>
> >>> This leads to OOM due to high order allocation. And this is worse than
> >>> the previous state, where there are many small allocation that could
> >>> potentially fail (but most likely will not due to GFP_NOFS and size <
> >>> PAGE_SIZE).
> >>>
> >>> So this needs to be reworked to avoid the costly allocations or reverted
> >>> to the previous state.
> >>
> >> Right, makes sense. In order to have a simplified submission logic I
> >> think to rework the allocation to have a loop that allocates a single
> >> item for every chunk or alternatively switch to using kvmalloc? I think
> >> the fact that vmalloced memory might not be contiguous is not critical
> >> for the metadata structures in this case?
> > 
> > kvmalloc would work here, though there is some overhead involved
> > compared to bare kmalloc (the mappings). As the call happens on writeout
> > path, I'd be cautious about unnecessary overhead.
> > 
> > If looping over the range works, then we can allocate the largest size
> > that does not require kvmalloc (PAGE_ALLOC_COSTLY_ORDER) and then reuse
> > it for each iteration.
> 
> I'm afraid I don't understand your hybrid suggestion. Ie something along 
> the lines of : 

The idea was to split the range to several chunks, allocate buffer and
the reuse the memory, but this does not work as each submission requires
its own memory.  So let's do kvmalloc for now.
diff mbox series

Patch

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 05bbfd02ea49..2c13915c6f71 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -366,7 +366,7 @@  struct async_extent {
 	struct list_head list;
 };
 
-struct async_cow {
+struct async_chunk {
 	struct inode *inode;
 	struct btrfs_fs_info *fs_info;
 	struct page *locked_page;
@@ -375,9 +375,15 @@  struct async_cow {
 	unsigned int write_flags;
 	struct list_head extents;
 	struct btrfs_work work;
+	atomic_t *pending;
+};
+
+struct async_cow {
+	atomic_t num_chunks; /* Number of chunks in flight */
+	struct async_chunk chunks[];
 };
 
-static noinline int add_async_extent(struct async_cow *cow,
+static noinline int add_async_extent(struct async_chunk *cow,
 				     u64 start, u64 ram_size,
 				     u64 compressed_size,
 				     struct page **pages,
@@ -447,7 +453,7 @@  static inline void inode_should_defrag(struct btrfs_inode *inode,
 static noinline void compress_file_range(struct inode *inode,
 					struct page *locked_page,
 					u64 start, u64 end,
-					struct async_cow *async_cow,
+					struct async_chunk *async_cow,
 					int *num_added)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -713,7 +719,7 @@  static void free_async_extent_pages(struct async_extent *async_extent)
  * queued.  We walk all the async extents created by compress_file_range
  * and send them down to the disk.
  */
-static noinline void submit_compressed_extents(struct async_cow *async_cow)
+static noinline void submit_compressed_extents(struct async_chunk *async_cow)
 {
 	struct inode *inode = async_cow->inode;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -1132,9 +1138,9 @@  static noinline int cow_file_range(struct inode *inode,
  */
 static noinline void async_cow_start(struct btrfs_work *work)
 {
-	struct async_cow *async_cow;
+	struct async_chunk *async_cow;
 	int num_added = 0;
-	async_cow = container_of(work, struct async_cow, work);
+	async_cow = container_of(work, struct async_chunk, work);
 
 	compress_file_range(async_cow->inode, async_cow->locked_page,
 			    async_cow->start, async_cow->end, async_cow,
@@ -1151,10 +1157,10 @@  static noinline void async_cow_start(struct btrfs_work *work)
 static noinline void async_cow_submit(struct btrfs_work *work)
 {
 	struct btrfs_fs_info *fs_info;
-	struct async_cow *async_cow;
+	struct async_chunk *async_cow;
 	unsigned long nr_pages;
 
-	async_cow = container_of(work, struct async_cow, work);
+	async_cow = container_of(work, struct async_chunk, work);
 
 	fs_info = async_cow->fs_info;
 	nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
@@ -1177,11 +1183,16 @@  static noinline void async_cow_submit(struct btrfs_work *work)
 
 static noinline void async_cow_free(struct btrfs_work *work)
 {
-	struct async_cow *async_cow;
-	async_cow = container_of(work, struct async_cow, work);
+	struct async_chunk *async_cow;
+	async_cow = container_of(work, struct async_chunk, work);
 	if (async_cow->inode)
 		btrfs_add_delayed_iput(async_cow->inode);
-	kfree(async_cow);
+	/*
+	 * Since the pointer to 'pending' is at the beginning of the array of
+	 * async_cow's, freeing it ensures the whole array has been freed.
+	 */
+	if (atomic_dec_and_test(async_cow->pending))
+		kfree(async_cow->pending);
 }
 
 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
@@ -1190,45 +1201,71 @@  static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 				unsigned int write_flags)
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-	struct async_cow *async_cow;
+	struct async_cow *ctx;
+	struct async_chunk *async_chunk;
 	unsigned long nr_pages;
 	u64 cur_end;
+	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
+	int i;
+	bool should_compress;
 
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
 			 1, 0, NULL);
-	while (start < end) {
-		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
-		BUG_ON(!async_cow); /* -ENOMEM */
+
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
+	    !btrfs_test_opt(fs_info, FORCE_COMPRESS)) {
+		num_chunks = 1;
+		should_compress = false;
+	} else {
+		should_compress = true;
+	}
+
+	ctx = kmalloc(struct_size(ctx, chunks, num_chunks), GFP_NOFS);
+	if (!ctx) {
+		unsigned clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC |
+			EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
+			EXTENT_DO_ACCOUNTING;
+		unsigned long page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+			PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
+			PAGE_SET_ERROR;
+		extent_clear_unlock_delalloc(inode, start, end, 0, locked_page,
+					     clear_bits, page_ops);
+		return -ENOMEM;
+	}
+
+	async_chunk = ctx->chunks;
+	atomic_set(&ctx->num_chunks, num_chunks);
+
+	for (i = 0; i < num_chunks; i++) {
+
+		if (should_compress)
+			cur_end = min(end, start + SZ_512K - 1);
+		else
+			cur_end = end;
+
 		/*
 		 * igrab is called higher up in the call chain, take only the
 		 * lightweight reference for the callback lifetime
 		 */
 		ihold(inode);
-		async_cow->inode = inode;
-		async_cow->fs_info = fs_info;
-		async_cow->locked_page = locked_page;
-		async_cow->start = start;
-		async_cow->write_flags = write_flags;
-
-		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
-		    !btrfs_test_opt(fs_info, FORCE_COMPRESS))
-			cur_end = end;
-		else
-			cur_end = min(end, start + SZ_512K - 1);
-
-		async_cow->end = cur_end;
-		INIT_LIST_HEAD(&async_cow->extents);
-
-		btrfs_init_work(&async_cow->work,
+		async_chunk[i].pending= &ctx->num_chunks;
+		async_chunk[i].inode = inode;
+		async_chunk[i].start = start;
+		async_chunk[i].end = cur_end;
+		async_chunk[i].fs_info = fs_info;
+		async_chunk[i].locked_page = locked_page;
+		async_chunk[i].write_flags = write_flags;
+		INIT_LIST_HEAD(&async_chunk[i].extents);
+
+		btrfs_init_work(&async_chunk[i].work,
 				btrfs_delalloc_helper,
 				async_cow_start, async_cow_submit,
 				async_cow_free);
 
-		nr_pages = (cur_end - start + PAGE_SIZE) >>
-			PAGE_SHIFT;
+		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
 		atomic_add(nr_pages, &fs_info->async_delalloc_pages);
 
-		btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);
+		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
 
 		*nr_written += nr_pages;
 		start = cur_end + 1;