btrfs: use tagged writepage to mitigate livelock of snapshot

Message ID 20181101064903.11638-1-ethanlien@synology.com (mailing list archive)
State New, archived
Series: btrfs: use tagged writepage to mitigate livelock of snapshot

Commit Message

ethanlien Nov. 1, 2018, 6:49 a.m. UTC
Snapshot is expected to be fast. But if there are writers steadily
creating dirty pages in our subvolume, the snapshot may take a very long
time to complete. To fix the problem, we use tagged writepage for the
snapshot flusher, as we do in the generic write_cache_pages(), so we can
omit pages dirtied after the snapshot command.
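
For context, the livelock avoidance pattern borrowed here, as a minimal
sketch of what the generic write_cache_pages() in mm/page-writeback.c
does (simplified: locking, error handling and the cyclic-range logic are
elided, and the mapping/index/end/wbc locals of the surrounding writeback
function are assumed; helper signatures as in ~4.20 kernels):

	struct pagevec pvec;
	unsigned int i, nr_pages;

	pagevec_init(&pvec);
	/*
	 * Convert every page tagged PAGECACHE_TAG_DIRTY in [index, end] to
	 * PAGECACHE_TAG_TOWRITE, then iterate only over TOWRITE pages.
	 * Pages dirtied after this point receive only the DIRTY tag, so a
	 * steady writer cannot keep the loop below running forever.
	 */
	tag_pages_for_writeback(mapping, index, end);
	while ((nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
					end, PAGECACHE_TAG_TOWRITE))) {
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			lock_page(page);
			if (clear_page_dirty_for_io(page))
				mapping->a_ops->writepage(page, wbc);
			else
				unlock_page(page);
		}
		pagevec_release(&pvec);
		cond_resched();
	}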

We do a simple snapshot speed test on an Intel D-1531 box:

fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
--direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
--filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio

original: 1m58sec
patched:  6.54sec

This is the best case for this patch, since for a sequential write we
omit nearly all pages dirtied after the snapshot command.

For a multi-writer, random write test:

fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
--direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
--filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio

original: 15.83sec
patched:  10.35sec

The improvement is smaller than in the sequential write case, since we
omit only half of the pages dirtied after the snapshot command.

Signed-off-by: Ethan Lien <ethanlien@synology.com>
---
 fs/btrfs/btrfs_inode.h |  1 +
 fs/btrfs/ctree.h       |  2 +-
 fs/btrfs/extent_io.c   | 16 ++++++++++++++--
 fs/btrfs/inode.c       | 10 ++++++----
 fs/btrfs/ioctl.c       |  2 +-
 5 files changed, 23 insertions(+), 8 deletions(-)

Comments

Nikolay Borisov Nov. 1, 2018, 8:59 a.m. UTC | #1
On 1.11.18 at 8:49, Ethan Lien wrote:
> Snapshot is expected to be fast. But if there are writers steadily
> create dirty pages in our subvolume, the snapshot may take a very long
> time to complete. To fix the problem, we use tagged writepage for snapshot
> flusher as we do in the generic write_cache_pages(), so we can omit pages
> dirtied after the snapshot command.

So the gist of this commit really is that you detect when filemap_flush
has been called from snapshot context, tag all pages dirty at *that* time
as PAGECACHE_TAG_TOWRITE, and then write them, ignoring any pages that
might be dirtied afterwards. Your description kind of dances around this
idea without really saying it explicitly. Those semantics make sense,
however I'd like them to be stated more explicitly in the changelog.

However, this is done at the expense of consistency, so we have faster
snapshots, but depending on the files stored in them they might be
broken (e.g. a database) since they will be missing pages.

> 
> We do a simple snapshot speed test on a Intel D-1531 box:
> 
> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
> --direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
> 
> original: 1m58sec
> patched:  6.54sec

Nice.

> 
> This is the best case for this patch since for a sequential write case,
> we omit nearly all pages dirtied after the snapshot command.
> 
> For a multi writers, random write test:
> 
> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
> --direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
> 
> original: 15.83sec
> patched:  10.35sec
> 
> The improvement is less compared with the sequential write case, since
> we omit only half of the pages dirtied after snapshot command.
> 
> Signed-off-by: Ethan Lien <ethanlien@synology.com>
> ---
>  fs/btrfs/btrfs_inode.h |  1 +
>  fs/btrfs/ctree.h       |  2 +-
>  fs/btrfs/extent_io.c   | 16 ++++++++++++++--
>  fs/btrfs/inode.c       | 10 ++++++----
>  fs/btrfs/ioctl.c       |  2 +-
>  5 files changed, 23 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index 1343ac57b438..4182bfbb56be 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -29,6 +29,7 @@ enum {
>  	BTRFS_INODE_IN_DELALLOC_LIST,
>  	BTRFS_INODE_READDIO_NEED_LOCK,
>  	BTRFS_INODE_HAS_PROPS,
> +	BTRFS_INODE_TAGGED_FLUSH,

IMO the name of the flag should be different. Something like "lie flush"
or whatever, because all flushes are tagged, i.e. TAG_DIRTY or TAG_TOWRITE.

>  };
>  
>  /* in memory btrfs inode */
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 2cddfe7806a4..82682da5a40d 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -3155,7 +3155,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
>  			       struct inode *inode, u64 new_size,
>  			       u32 min_type);
>  
> -int btrfs_start_delalloc_inodes(struct btrfs_root *root);
> +int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
>  int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
>  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
>  			      unsigned int extra_bits,
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 4dd6faab02bb..c21d8a0e010a 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -3928,12 +3928,24 @@ static int extent_write_cache_pages(struct address_space *mapping,
>  			range_whole = 1;
>  		scanned = 1;
>  	}
> -	if (wbc->sync_mode == WB_SYNC_ALL)
> +
> +	/*
> +	 * We don't care if we are the one who set BTRFS_INODE_TAGGED_FLUSH in
> +	 * start_delalloc_inodes(). We do the tagged writepage as long as we are
> +	 * the first one who do the filemap_flush() on this inode.
> +	 */

I think the first sentence of this comment could be removed.

> +	if (range_whole && wbc->nr_to_write == LONG_MAX &&
> +			wbc->sync_mode == WB_SYNC_NONE &&
> +			test_and_clear_bit(BTRFS_INODE_TAGGED_FLUSH,
> +				&BTRFS_I(inode)->runtime_flags))
> +		wbc->tagged_writepages = 1;
> +
> +	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
>  		tag = PAGECACHE_TAG_TOWRITE;
>  	else
>  		tag = PAGECACHE_TAG_DIRTY;
>  retry:
> -	if (wbc->sync_mode == WB_SYNC_ALL)
> +	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
>  		tag_pages_for_writeback(mapping, index, end);
>  	done_index = index;
>  	while (!done && !nr_to_write_done && (index <= end) &&
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 3ea5339603cf..3df3cbbe91c5 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -9975,7 +9975,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
>   * some fairly slow code that needs optimization. This walks the list
>   * of all the inodes with pending delalloc and forces them to disk.
>   */
> -static int start_delalloc_inodes(struct btrfs_root *root, int nr)
> +static int start_delalloc_inodes(struct btrfs_root *root, int nr, int snapshot)

snapshot is a boolean value, so define it as such and not an int. I know
the codebase is inconsistent in that manner, but long-term all boolean
'int's shall be converted to bools.

>  {
>  	struct btrfs_inode *binode;
>  	struct inode *inode;
> @@ -10003,6 +10003,8 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr)
>  		}
>  		spin_unlock(&root->delalloc_lock);
>  
> +		if (snapshot)
> +			set_bit(BTRFS_INODE_TAGGED_FLUSH, &binode->runtime_flags);
>  		work = btrfs_alloc_delalloc_work(inode);
>  		if (!work) {
>  			iput(inode);
> @@ -10036,7 +10038,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr)
>  	return ret;
>  }
>  
> -int btrfs_start_delalloc_inodes(struct btrfs_root *root)
> +int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
>  {
>  	struct btrfs_fs_info *fs_info = root->fs_info;
>  	int ret;
> @@ -10044,7 +10046,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root)
>  	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
>  		return -EROFS;
>  
> -	ret = start_delalloc_inodes(root, -1);
> +	ret = start_delalloc_inodes(root, -1, 1);
>  	if (ret > 0)
>  		ret = 0;
>  	return ret;
> @@ -10073,7 +10075,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
>  			       &fs_info->delalloc_roots);
>  		spin_unlock(&fs_info->delalloc_root_lock);
>  
> -		ret = start_delalloc_inodes(root, nr);
> +		ret = start_delalloc_inodes(root, nr, 0);
>  		btrfs_put_fs_root(root);
>  		if (ret < 0)
>  			goto out;
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index d60b6caf09e8..d1293b6c31f6 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -775,7 +775,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
>  	wait_event(root->subv_writers->wait,
>  		   percpu_counter_sum(&root->subv_writers->counter) == 0);
>  
> -	ret = btrfs_start_delalloc_inodes(root);
> +	ret = btrfs_start_delalloc_snapshot(root);
>  	if (ret)
>  		goto dec_and_free;
>  
>
ethanlien Nov. 1, 2018, 9:56 a.m. UTC | #2
Nikolay Borisov wrote on 2018-11-01 16:59:
> On 1.11.18 at 8:49, Ethan Lien wrote:
>> Snapshot is expected to be fast. But if there are writers steadily
>> create dirty pages in our subvolume, the snapshot may take a very long
>> time to complete. To fix the problem, we use tagged writepage for
>> snapshot flusher as we do in the generic write_cache_pages(), so we can
>> omit pages dirtied after the snapshot command.
> 
> So the gist of this commit really is that you detect when filemap_flush
> has been called from snapshot context and tag all pages at *that* time
> as PAGECACHE_TAG_TOWRITE and then write them, ignoring any pages that
> might have been dirtied beforehand. Your description kind of dances
> around this idea without really saying it explicitly. Those semantics
> make sense, however i'd like to be stated more explicitly in the change
>  log.
> 
> However, this is done at the expense of consistency, so we have faster
> snapshots but depending the file which are stored in them they might be
> broken (i.e a database) since they will be missing pages.
> 

tag_pages_for_writeback() will tag all pages currently marked with
PAGECACHE_TAG_DIRTY. If a dirty page gets missed here, it means someone
has initiated the flush before us, so we will wait for that dirty page to
complete in create_snapshot() -> btrfs_wait_ordered_extents().
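
For reference, the flush-then-wait ordering in create_snapshot()
(fs/btrfs/ioctl.c, paraphrased here with error handling trimmed) that
provides this guarantee:

	/* Kick off writeback of the tagged set of dirty pages... */
	ret = btrfs_start_delalloc_snapshot(root);
	if (ret)
		goto dec_and_free;
	/*
	 * ...then wait for all ordered extents on the subvolume, including
	 * writeback that another flusher started before us.
	 */
	btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);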

>> 
>> We do a simple snapshot speed test on a Intel D-1531 box:
>> 
>> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
>> --direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
>> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
>> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
>> 
>> original: 1m58sec
>> patched:  6.54sec
> 
> Nice.
> 
>> 
>> This is the best case for this patch since for a sequential write 
>> case,
>> we omit nearly all pages dirtied after the snapshot command.
>> 
>> For a multi writers, random write test:
>> 
>> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
>> --direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
>> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
>> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
>> 
>> original: 15.83sec
>> patched:  10.35sec
>> 
>> The improvement is less compared with the sequential write case, since
>> we omit only half of the pages dirtied after snapshot command.
>> 
>> Signed-off-by: Ethan Lien <ethanlien@synology.com>
>> ---
>>  fs/btrfs/btrfs_inode.h |  1 +
>>  fs/btrfs/ctree.h       |  2 +-
>>  fs/btrfs/extent_io.c   | 16 ++++++++++++++--
>>  fs/btrfs/inode.c       | 10 ++++++----
>>  fs/btrfs/ioctl.c       |  2 +-
>>  5 files changed, 23 insertions(+), 8 deletions(-)
>> 
>> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
>> index 1343ac57b438..4182bfbb56be 100644
>> --- a/fs/btrfs/btrfs_inode.h
>> +++ b/fs/btrfs/btrfs_inode.h
>> @@ -29,6 +29,7 @@ enum {
>>  	BTRFS_INODE_IN_DELALLOC_LIST,
>>  	BTRFS_INODE_READDIO_NEED_LOCK,
>>  	BTRFS_INODE_HAS_PROPS,
>> +	BTRFS_INODE_TAGGED_FLUSH,
> 
> IMO the name of the flag should be different. Something like "lie 
> flush"
> or whatever. Because all flushes are tagged i.e TAG_DIRTY or 
> TAG_TOWRITE.
> 

OK I'll use another name.

>>  };
>> 
>>  /* in memory btrfs inode */
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index 2cddfe7806a4..82682da5a40d 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -3155,7 +3155,7 @@ int btrfs_truncate_inode_items(struct 
>> btrfs_trans_handle *trans,
>>  			       struct inode *inode, u64 new_size,
>>  			       u32 min_type);
>> 
>> -int btrfs_start_delalloc_inodes(struct btrfs_root *root);
>> +int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
>>  int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int 
>> nr);
>>  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 
>> end,
>>  			      unsigned int extra_bits,
>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>> index 4dd6faab02bb..c21d8a0e010a 100644
>> --- a/fs/btrfs/extent_io.c
>> +++ b/fs/btrfs/extent_io.c
>> @@ -3928,12 +3928,24 @@ static int extent_write_cache_pages(struct 
>> address_space *mapping,
>>  			range_whole = 1;
>>  		scanned = 1;
>>  	}
>> -	if (wbc->sync_mode == WB_SYNC_ALL)
>> +
>> +	/*
>> +	 * We don't care if we are the one who set BTRFS_INODE_TAGGED_FLUSH 
>> in
>> +	 * start_delalloc_inodes(). We do the tagged writepage as long as we 
>> are
>> +	 * the first one who do the filemap_flush() on this inode.
>> +	 */
> 
> I think the first sentence of this comment could be removed.
> 

OK.

>> +	if (range_whole && wbc->nr_to_write == LONG_MAX &&
>> +			wbc->sync_mode == WB_SYNC_NONE &&
>> +			test_and_clear_bit(BTRFS_INODE_TAGGED_FLUSH,
>> +				&BTRFS_I(inode)->runtime_flags))
>> +		wbc->tagged_writepages = 1;
>> +
>> +	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
>>  		tag = PAGECACHE_TAG_TOWRITE;
>>  	else
>>  		tag = PAGECACHE_TAG_DIRTY;
>>  retry:
>> -	if (wbc->sync_mode == WB_SYNC_ALL)
>> +	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
>>  		tag_pages_for_writeback(mapping, index, end);
>>  	done_index = index;
>>  	while (!done && !nr_to_write_done && (index <= end) &&
>> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>> index 3ea5339603cf..3df3cbbe91c5 100644
>> --- a/fs/btrfs/inode.c
>> +++ b/fs/btrfs/inode.c
>> @@ -9975,7 +9975,7 @@ static struct btrfs_delalloc_work 
>> *btrfs_alloc_delalloc_work(struct inode *inode
>>   * some fairly slow code that needs optimization. This walks the list
>>   * of all the inodes with pending delalloc and forces them to disk.
>>   */
>> -static int start_delalloc_inodes(struct btrfs_root *root, int nr)
>> +static int start_delalloc_inodes(struct btrfs_root *root, int nr, int 
>> snapshot)
> 
> snapshot is a boolean value, so define it as such and not an int. I 
> know
> the codebase is inconsistent in that manner but long-term all boolean
> 'int's shall be converted to bools.
> 

OK, I'll use a boolean here.

Nikolay Borisov Nov. 1, 2018, 10:01 a.m. UTC | #3
On 1.11.18 at 11:56, ethanlien wrote:
> Nikolay Borisov wrote on 2018-11-01 16:59:
>> On 1.11.18 at 8:49, Ethan Lien wrote:
>>> Snapshot is expected to be fast. But if there are writers steadily
>>> create dirty pages in our subvolume, the snapshot may take a very long
>>> time to complete. To fix the problem, we use tagged writepage for
>>> snapshot flusher as we do in the generic write_cache_pages(), so we can
>>> omit pages dirtied after the snapshot command.
>>
>> So the gist of this commit really is that you detect when filemap_flush
>> has been called from snapshot context and tag all pages at *that* time
>> as PAGECACHE_TAG_TOWRITE and then write them, ignoring any pages that
>> might have been dirtied beforehand. Your description kind of dances
>> around this idea without really saying it explicitly. Those semantics
>> make sense, however i'd like to be stated more explicitly in the change
>>  log.
>>
>> However, this is done at the expense of consistency, so we have faster
>> snapshots but depending the file which are stored in them they might be
>> broken (i.e a database) since they will be missing pages.
>>
> 
> tag_pages_for_writeback() will tag all pages with PAGECACHE_TAG_DIRTY.
> If a dirty page get missed here, it means someone has initiated the
> flush before us, so we will wait that dirty page to complete in
> create_snapshot() -> btrfs_wait_ordered_extents().

And what happens in the scenario where you have someone dirtying pages:
you issue the snapshot ioctl, mark all currently dirtied pages as
TOWRITE, and then you have more delalloc pages being dirtied following
the initial call to tag_pages_for_writeback(). You will miss those, no?

> 
>>>
>>> We do a simple snapshot speed test on a Intel D-1531 box:
>>>
>>> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
>>> --direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
>>> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
>>> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
>>>
>>> original: 1m58sec
>>> patched:  6.54sec
>>
>> Nice.
>>
>>>
>>> This is the best case for this patch since for a sequential write case,
>>> we omit nearly all pages dirtied after the snapshot command.
>>>
>>> For a multi writers, random write test:
>>>
>>> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
>>> --direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
>>> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
>>> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
>>>
>>> original: 15.83sec
>>> patched:  10.35sec
>>>
>>> The improvement is less compared with the sequential write case, since
>>> we omit only half of the pages dirtied after snapshot command.
>>>
>>> Signed-off-by: Ethan Lien <ethanlien@synology.com>
>>> ---
>>>  fs/btrfs/btrfs_inode.h |  1 +
>>>  fs/btrfs/ctree.h       |  2 +-
>>>  fs/btrfs/extent_io.c   | 16 ++++++++++++++--
>>>  fs/btrfs/inode.c       | 10 ++++++----
>>>  fs/btrfs/ioctl.c       |  2 +-
>>>  5 files changed, 23 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
>>> index 1343ac57b438..4182bfbb56be 100644
>>> --- a/fs/btrfs/btrfs_inode.h
>>> +++ b/fs/btrfs/btrfs_inode.h
>>> @@ -29,6 +29,7 @@ enum {
>>>      BTRFS_INODE_IN_DELALLOC_LIST,
>>>      BTRFS_INODE_READDIO_NEED_LOCK,
>>>      BTRFS_INODE_HAS_PROPS,
>>> +    BTRFS_INODE_TAGGED_FLUSH,
>>
>> IMO the name of the flag should be different. Something like "lie flush"

I meant "lite flush" or "strict flush" or something like that, alluding
to the fact we are restricting the set of pages being flushed.

ethanlien Nov. 1, 2018, 10:21 a.m. UTC | #4
Nikolay Borisov wrote on 2018-11-01 18:01:
> On 1.11.18 at 11:56, ethanlien wrote:
>> Nikolay Borisov wrote on 2018-11-01 16:59:
>>> On 1.11.18 at 8:49, Ethan Lien wrote:
>>>> Snapshot is expected to be fast. But if there are writers steadily
>>>> create dirty pages in our subvolume, the snapshot may take a very long
>>>> time to complete. To fix the problem, we use tagged writepage for
>>>> snapshot flusher as we do in the generic write_cache_pages(), so we can
>>>> omit pages dirtied after the snapshot command.
>>> 
>>> So the gist of this commit really is that you detect when 
>>> filemap_flush
>>> has been called from snapshot context and tag all pages at *that* 
>>> time
>>> as PAGECACHE_TAG_TOWRITE and then write them, ignoring any pages that
>>> might have been dirtied beforehand. Your description kind of dances
>>> around this idea without really saying it explicitly. Those semantics
>>> make sense, however i'd like to be stated more explicitly in the 
>>> change
>>>  log.
>>> 
>>> However, this is done at the expense of consistency, so we have 
>>> faster
>>> snapshots but depending the file which are stored in them they might 
>>> be
>>> broken (i.e a database) since they will be missing pages.
>>> 
>> 
>> tag_pages_for_writeback() will tag all pages with PAGECACHE_TAG_DIRTY.
>> If a dirty page get missed here, it means someone has initiated the
>> flush before us, so we will wait that dirty page to complete in
>> create_snapshot() -> btrfs_wait_ordered_extents().
> 
> And what happens in the scenario where you have someone dirtying pages,
> you issue the snapshot ioctl, mark all currently dirtied pages as
> TOWRITE and then you have more delalloc pages being dirtied following
> initial call to tag_pages_for_writeback, you will miss those, no?
> 

We don't freeze the filesystem when doing a snapshot, so there is no
guarantee that a page dirtied right after we send the ioctl will be
included in the snapshot. If a page is dirtied while we scan the dirty
pages and its offset is before our current index, we miss it in our
current snapshot implementation too.

Nikolay Borisov Nov. 1, 2018, 11:57 a.m. UTC | #5
On 1.11.18 at 8:49, Ethan Lien wrote:
> Snapshot is expected to be fast. But if there are writers steadily
> create dirty pages in our subvolume, the snapshot may take a very long
> time to complete. To fix the problem, we use tagged writepage for snapshot
> flusher as we do in the generic write_cache_pages(), so we can omit pages
> dirtied after the snapshot command.
> 
> We do a simple snapshot speed test on a Intel D-1531 box:
> 
> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
> --direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
> 
> original: 1m58sec
> patched:  6.54sec
> 
> This is the best case for this patch since for a sequential write case,
> we omit nearly all pages dirtied after the snapshot command.
> 
> For a multi writers, random write test:
> 
> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
> --direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
> 
> original: 15.83sec
> patched:  10.35sec
> 
> The improvement is less compared with the sequential write case, since
> we omit only half of the pages dirtied after snapshot command.
> 
> Signed-off-by: Ethan Lien <ethanlien@synology.com>
> ---
>  fs/btrfs/btrfs_inode.h |  1 +
>  fs/btrfs/ctree.h       |  2 +-
>  fs/btrfs/extent_io.c   | 16 ++++++++++++++--
>  fs/btrfs/inode.c       | 10 ++++++----
>  fs/btrfs/ioctl.c       |  2 +-
>  5 files changed, 23 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index 1343ac57b438..4182bfbb56be 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -29,6 +29,7 @@ enum {
>  	BTRFS_INODE_IN_DELALLOC_LIST,
>  	BTRFS_INODE_READDIO_NEED_LOCK,
>  	BTRFS_INODE_HAS_PROPS,
> +	BTRFS_INODE_TAGGED_FLUSH,
>  };
>  
>  /* in memory btrfs inode */
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 2cddfe7806a4..82682da5a40d 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -3155,7 +3155,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
>  			       struct inode *inode, u64 new_size,
>  			       u32 min_type);
>  
> -int btrfs_start_delalloc_inodes(struct btrfs_root *root);
> +int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
>  int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
>  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
>  			      unsigned int extra_bits,
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 4dd6faab02bb..c21d8a0e010a 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -3928,12 +3928,24 @@ static int extent_write_cache_pages(struct address_space *mapping,
>  			range_whole = 1;
>  		scanned = 1;
>  	}
> -	if (wbc->sync_mode == WB_SYNC_ALL)
> +
> +	/*
> +	 * We don't care if we are the one who set BTRFS_INODE_TAGGED_FLUSH in
> +	 * start_delalloc_inodes(). We do the tagged writepage as long as we are
> +	 * the first one who do the filemap_flush() on this inode.
> +	 */
> +	if (range_whole && wbc->nr_to_write == LONG_MAX &&
> +			wbc->sync_mode == WB_SYNC_NONE &&
> +			test_and_clear_bit(BTRFS_INODE_TAGGED_FLUSH,
> +				&BTRFS_I(inode)->runtime_flags))

Actually this check can be simplified to:

range_whole && test_and_clear_bit. filemap_flush() triggers range_whole =
1 and then you care about TAGGED_FLUSH (or whatever it's going to be
named) being set. The nr_to_write and sync_mode checks just make it a tad
more difficult to reason about the code.
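
That is, roughly (a sketch using the names from this patch):

	if (range_whole && test_and_clear_bit(BTRFS_INODE_TAGGED_FLUSH,
					      &BTRFS_I(inode)->runtime_flags))
		wbc->tagged_writepages = 1;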


Chris Mason Nov. 1, 2018, 1:24 p.m. UTC | #6
On 1 Nov 2018, at 6:21, ethanlien wrote:

> Nikolay Borisov wrote on 2018-11-01 18:01:
>> On 1.11.18 at 11:56, ethanlien wrote:
>>> Nikolay Borisov wrote on 2018-11-01 16:59:
>>>> On 1.11.18 at 8:49, Ethan Lien wrote:
>>>>> Snapshot is expected to be fast. But if there are writers steadily
>>>>> create dirty pages in our subvolume, the snapshot may take a very long
>>>>> time to complete. To fix the problem, we use tagged writepage for
>>>>> snapshot flusher as we do in the generic write_cache_pages(), so we can
>>>>> omit pages dirtied after the snapshot command.
>>>>
>>>> So the gist of this commit really is that you detect when 
>>>> filemap_flush
>>>> has been called from snapshot context and tag all pages at *that* 
>>>> time
>>>> as PAGECACHE_TAG_TOWRITE and then write them, ignoring any pages 
>>>> that
>>>> might have been dirtied beforehand. Your description kind of dances
>>>> around this idea without really saying it explicitly. Those 
>>>> semantics
>>>> make sense, however i'd like to be stated more explicitly in the 
>>>> change
>>>>  log.
>>>>
>>>> However, this is done at the expense of consistency, so we have 
>>>> faster
>>>> snapshots but depending the file which are stored in them they 
>>>> might be
>>>> broken (i.e a database) since they will be missing pages.
>>>>
>>>
>>> tag_pages_for_writeback() will tag all pages with PAGECACHE_TAG_DIRTY.
>>> If a dirty page get missed here, it means someone has initiated the
>>> flush before us, so we will wait that dirty page to complete in
>>> create_snapshot() -> btrfs_wait_ordered_extents().
>>
>> And what happens in the scenario where you have someone dirtying pages,
>> you issue the snapshot ioctl, mark all currently dirtied pages as
>> TOWRITE and then you have more delalloc pages being dirtied following
>> initial call to tag_pages_for_writeback, you will miss those, no?
>>
>
> We don't freeze the filesystem when doing snapshot, so there is no
> guarantee the page dirtied right after we send the ioctl, will be
> included in snapshot. If a page is dirtied while we scan the dirty pages
> and its offset is before our current index, we miss it in our current
> snapshot implementation too.

This looks like a pretty good compromise to me between btrfs v0's
don't-flush-at-all on snapshot and today's full sync on snapshot. The
full sync is always going to be a little racy wrt concurrent writes.

-chris
David Sterba Nov. 1, 2018, 6:02 p.m. UTC | #7
On Thu, Nov 01, 2018 at 02:49:03PM +0800, Ethan Lien wrote:
> Snapshot is expected to be fast. But if there are writers steadily
> create dirty pages in our subvolume, the snapshot may take a very long
> time to complete. To fix the problem, we use tagged writepage for snapshot
> flusher as we do in the generic write_cache_pages(), so we can omit pages
> dirtied after the snapshot command.
> 
> We do a simple snapshot speed test on a Intel D-1531 box:
> 
> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
> --direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
> 
> original: 1m58sec
> patched:  6.54sec
> 
> This is the best case for this patch since for a sequential write case,
> we omit nearly all pages dirtied after the snapshot command.
> 
> For a multi writers, random write test:
> 
> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
> --direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
> 
> original: 15.83sec
> patched:  10.35sec
> 
> The improvement is less compared with the sequential write case, since
> we omit only half of the pages dirtied after snapshot command.
> 
> Signed-off-by: Ethan Lien <ethanlien@synology.com>

This looks nice, thanks. I agree with the suggestions from Nikolay,
please update and resend.

I was a bit curious about the 'livelock'; what you describe does not seem
to be one. A system under heavy IO can make the snapshot dead slow but can
recover from that once the IO stops.

Regarding the sync semantics, there's AFAIK no change to the current
state where the sync is done before the snapshot but without further
guarantees. From that point I think it's safe to select only a subset of
pages and make things faster.

As the requested changes are not functional I'll add the patch to
for-next for testing.
ethanlien Nov. 2, 2018, 7 a.m. UTC | #8
Nikolay Borisov wrote on 2018-11-01 19:57:
> On 1.11.18 at 8:49, Ethan Lien wrote:
>> Snapshot is expected to be fast. But if there are writers steadily
>> create dirty pages in our subvolume, the snapshot may take a very long
>> time to complete. To fix the problem, we use tagged writepage for
>> snapshot flusher as we do in the generic write_cache_pages(), so we can
>> omit pages dirtied after the snapshot command.
>> 
>> We do a simple snapshot speed test on a Intel D-1531 box:
>> 
>> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
>> --direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
>> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
>> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
>> 
>> original: 1m58sec
>> patched:  6.54sec
>> 
>> This is the best case for this patch since for a sequential write 
>> case,
>> we omit nearly all pages dirtied after the snapshot command.
>> 
>> For a multi writers, random write test:
>> 
>> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
>> --direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
>> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
>> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
>> 
>> original: 15.83sec
>> patched:  10.35sec
>> 
>> The improvement is less compared with the sequential write case, since
>> we omit only half of the pages dirtied after snapshot command.
>> 
>> Signed-off-by: Ethan Lien <ethanlien@synology.com>
>> ---
>>  fs/btrfs/btrfs_inode.h |  1 +
>>  fs/btrfs/ctree.h       |  2 +-
>>  fs/btrfs/extent_io.c   | 16 ++++++++++++++--
>>  fs/btrfs/inode.c       | 10 ++++++----
>>  fs/btrfs/ioctl.c       |  2 +-
>>  5 files changed, 23 insertions(+), 8 deletions(-)
>> 
>> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
>> index 1343ac57b438..4182bfbb56be 100644
>> --- a/fs/btrfs/btrfs_inode.h
>> +++ b/fs/btrfs/btrfs_inode.h
>> @@ -29,6 +29,7 @@ enum {
>>  	BTRFS_INODE_IN_DELALLOC_LIST,
>>  	BTRFS_INODE_READDIO_NEED_LOCK,
>>  	BTRFS_INODE_HAS_PROPS,
>> +	BTRFS_INODE_TAGGED_FLUSH,
>>  };
>> 
>>  /* in memory btrfs inode */
>> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
>> index 2cddfe7806a4..82682da5a40d 100644
>> --- a/fs/btrfs/ctree.h
>> +++ b/fs/btrfs/ctree.h
>> @@ -3155,7 +3155,7 @@ int btrfs_truncate_inode_items(struct 
>> btrfs_trans_handle *trans,
>>  			       struct inode *inode, u64 new_size,
>>  			       u32 min_type);
>> 
>> -int btrfs_start_delalloc_inodes(struct btrfs_root *root);
>> +int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
>>  int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int 
>> nr);
>>  int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 
>> end,
>>  			      unsigned int extra_bits,
>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>> index 4dd6faab02bb..c21d8a0e010a 100644
>> --- a/fs/btrfs/extent_io.c
>> +++ b/fs/btrfs/extent_io.c
>> @@ -3928,12 +3928,24 @@ static int extent_write_cache_pages(struct 
>> address_space *mapping,
>>  			range_whole = 1;
>>  		scanned = 1;
>>  	}
>> -	if (wbc->sync_mode == WB_SYNC_ALL)
>> +
>> +	/*
>> +	 * We don't care if we are the one who set BTRFS_INODE_TAGGED_FLUSH 
>> in
>> +	 * start_delalloc_inodes(). We do the tagged writepage as long as we 
>> are
>> +	 * the first one who do the filemap_flush() on this inode.
>> +	 */
>> +	if (range_whole && wbc->nr_to_write == LONG_MAX &&
>> +			wbc->sync_mode == WB_SYNC_NONE &&
>> +			test_and_clear_bit(BTRFS_INODE_TAGGED_FLUSH,
>> +				&BTRFS_I(inode)->runtime_flags))
> Actually this check can be simplified to:
> 
> range_whole && test_and_clear_bit. filemap_flush triggers range_whole =
> 1 and then you care about TAGGED_FLUSH (or w/e it's going to be named)
> to be set. The nr_to_write && syncmode just make it a tad more
> difficult to reason about the code.
> 
> 

Yes, the sync_mode check is not necessary. For nr_to_write, since a
pagevec contains only a limited number of pages in one loop, nr_to_write
essentially controls whether we scan all of the dirty pages or not. Since
there is a window between start_delalloc_inodes() and
extent_write_cache_pages(), another flusher with range_whole == 1 but
nr_to_write < LONG_MAX may come in and clear the flush bit before the
filemap_flush().
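
Roughly, the problematic interleaving would look like this (hypothetical
sequence, names as in this patch):

	/*
	 * snapshot:      set_bit(BTRFS_INODE_TAGGED_FLUSH) in
	 *                start_delalloc_inodes()
	 * other flusher: enters extent_write_cache_pages() with
	 *                range_whole == 1 but nr_to_write < LONG_MAX;
	 *                without the nr_to_write == LONG_MAX check it would
	 *                test_and_clear_bit() and consume the flag while
	 *                writing back only part of the inode
	 * snapshot:      filemap_flush() finds the bit already cleared and
	 *                falls back to an untagged DIRTY scan
	 */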

ethanlien Nov. 2, 2018, 7:13 a.m. UTC | #9
David Sterba wrote on 2018-11-02 02:02:
> On Thu, Nov 01, 2018 at 02:49:03PM +0800, Ethan Lien wrote:
>> Snapshot is expected to be fast. But if there are writers steadily
>> create dirty pages in our subvolume, the snapshot may take a very long
>> time to complete. To fix the problem, we use tagged writepage for
>> snapshot flusher as we do in the generic write_cache_pages(), so we can
>> omit pages dirtied after the snapshot command.
>> 
>> We do a simple snapshot speed test on a Intel D-1531 box:
>> 
>> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
>> --direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
>> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
>> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
>> 
>> original: 1m58sec
>> patched:  6.54sec
>> 
>> This is the best case for this patch since for a sequential write 
>> case,
>> we omit nearly all pages dirtied after the snapshot command.
>> 
>> For a multi writers, random write test:
>> 
>> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
>> --direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
>> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
>> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
>> 
>> original: 15.83sec
>> patched:  10.35sec
>> 
>> The improvement is less compared with the sequential write case, since
>> we omit only half of the pages dirtied after snapshot command.
>> 
>> Signed-off-by: Ethan Lien <ethanlien@synology.com>
> 
> This looks nice, thanks. I agree with the suggestions from Nikolay,
> please update and resend.
> 
> I was a bit curious about the 'livelock'; what you describe does not
> seem to be one. A system under heavy IO can make the snapshot dead slow
> but can recover from that once the IO stops.

I'm not sure if this is indeed a case of 'livelock'. I learned the term
from commit f446daaea9d4a420d ("mm: implement writeback livelock
avoidance using page tagging"). If this is not the case, I can use
another term.

Nikolay Borisov Nov. 2, 2018, 8:43 a.m. UTC | #10
On 2.11.18 at 9:13, ethanlien wrote:
> David Sterba wrote on 2018-11-02 02:02:
>> On Thu, Nov 01, 2018 at 02:49:03PM +0800, Ethan Lien wrote:
>>> Snapshot is expected to be fast. But if there are writers steadily
>>> create dirty pages in our subvolume, the snapshot may take a very long
>>> time to complete. To fix the problem, we use tagged writepage for
>>> snapshot flusher as we do in the generic write_cache_pages(), so we can
>>> omit pages dirtied after the snapshot command.
>>>
>>> We do a simple snapshot speed test on an Intel D-1531 box:
>>>
>>> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G
>>> --direct=0 --thread=1 --numjobs=1 --time_based --runtime=120
>>> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
>>> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
>>>
>>> original: 1m58sec
>>> patched:  6.54sec
>>>
>>> This is the best case for this patch since for a sequential write case,
>>> we omit nearly all pages dirtied after the snapshot command.
>>>
>>> For a multi-writer, random write test:
>>>
>>> fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G
>>> --direct=0 --thread=1 --numjobs=4 --time_based --runtime=120
>>> --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5;
>>> time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio
>>>
>>> original: 15.83sec
>>> patched:  10.35sec
>>>
>>> The improvement is less compared with the sequential write case, since
>>> we omit only half of the pages dirtied after the snapshot command.
>>>
>>> Signed-off-by: Ethan Lien <ethanlien@synology.com>
>>
>> This looks nice, thanks. I agree with the suggestions from Nikolay,
>> please update and resend.
>>
>> I was a bit curious about the 'livelock'; what you describe does not
>> seem to be one. A system under heavy IO can make the snapshot dead
>> slow, but it can recover from that once the IO stops.
>
> I'm not sure if this is indeed a case of 'livelock'. I learned the
> term from commit f446daaea9d4a420d ("mm: implement writeback livelock
> avoidance using page tagging").
> If this is not the right term, I can use another one.

It's really bad to use terms you don't fully comprehend because that way
you can, inadvertently, mislead people who don't necessarily have the
same context as you do. A brief description of what a livelock is can be
found here:

https://stackoverflow.com/questions/6155951/whats-the-difference-between-deadlock-and-livelock

In the context of writeback, a livelock could occur since you have two
processes: a producer (dirtying pages) and a consumer (writeback). A
livelock is when both are doing useful work yet the system doesn't
make forward progress (in this case there is a risk of the snapshot
never being created, since we will never finish doing the writeback for
constantly dirtied pages).

David, indeed what Ethan fixes and what Jan fixed in f446daaea9d4a420d
are the same class of issue. And whether to call it a livelock really
depends on one's definition of how long a
no-forward-progress-but-useful-work-being-done situation must persist.
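
To make that concrete, a deliberately simplified sketch of the untagged
walk (the helpers are hypothetical, this is not code from the tree):

	pgoff_t index = 0;
	struct page *page;

	/* Consumer: try to flush every dirty page of the inode. */
	while ((page = lookup_next_dirty_page(mapping, &index))) {
		write_out_page(page);	/* useful work ... */
		/*
		 * ... but the producer keeps dirtying pages, so the
		 * lookup keeps finding new work and the "no more
		 * dirty pages" exit is never taken: both tasks make
		 * progress, the snapshot does not.
		 */
	}

Tagging at the start bounds the walk to the pages that were dirty at
that instant, which is exactly what this patch enables for the snapshot
flush.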

> 
>> Regarding the sync semantics, there's AFAIK no change to the current
>> state where the sync is done before the snapshot but without other
>> guarantees. From that point I think it's safe to select only a subset
>> of pages and make things faster.
>>
>> As the requested changes are not functional I'll add the patch to
>> for-next for testing.
> 
>
diff mbox series

Patch

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1343ac57b438..4182bfbb56be 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -29,6 +29,7 @@  enum {
 	BTRFS_INODE_IN_DELALLOC_LIST,
 	BTRFS_INODE_READDIO_NEED_LOCK,
 	BTRFS_INODE_HAS_PROPS,
+	BTRFS_INODE_TAGGED_FLUSH,
 };
 
 /* in memory btrfs inode */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2cddfe7806a4..82682da5a40d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3155,7 +3155,7 @@  int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       struct inode *inode, u64 new_size,
 			       u32 min_type);
 
-int btrfs_start_delalloc_inodes(struct btrfs_root *root);
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      unsigned int extra_bits,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4dd6faab02bb..c21d8a0e010a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3928,12 +3928,24 @@  static int extent_write_cache_pages(struct address_space *mapping,
 			range_whole = 1;
 		scanned = 1;
 	}
-	if (wbc->sync_mode == WB_SYNC_ALL)
+
+	/*
+	 * We don't care if we are the one who set BTRFS_INODE_TAGGED_FLUSH in
+	 * start_delalloc_inodes(). We do the tagged writepage as long as we are
+	 * the first one to do the filemap_flush() on this inode.
+	 */
+	if (range_whole && wbc->nr_to_write == LONG_MAX &&
+			wbc->sync_mode == WB_SYNC_NONE &&
+			test_and_clear_bit(BTRFS_INODE_TAGGED_FLUSH,
+				&BTRFS_I(inode)->runtime_flags))
+		wbc->tagged_writepages = 1;
+
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
 		tag = PAGECACHE_TAG_TOWRITE;
 	else
 		tag = PAGECACHE_TAG_DIRTY;
 retry:
-	if (wbc->sync_mode == WB_SYNC_ALL)
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
 		tag_pages_for_writeback(mapping, index, end);
 	done_index = index;
 	while (!done && !nr_to_write_done && (index <= end) &&
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3ea5339603cf..3df3cbbe91c5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9975,7 +9975,7 @@  static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-static int start_delalloc_inodes(struct btrfs_root *root, int nr)
+static int start_delalloc_inodes(struct btrfs_root *root, int nr, int snapshot)
 {
 	struct btrfs_inode *binode;
 	struct inode *inode;
@@ -10003,6 +10003,8 @@  static int start_delalloc_inodes(struct btrfs_root *root, int nr)
 		}
 		spin_unlock(&root->delalloc_lock);
 
+		if (snapshot)
+			set_bit(BTRFS_INODE_TAGGED_FLUSH, &binode->runtime_flags);
 		work = btrfs_alloc_delalloc_work(inode);
 		if (!work) {
 			iput(inode);
@@ -10036,7 +10038,7 @@  static int start_delalloc_inodes(struct btrfs_root *root, int nr)
 	return ret;
 }
 
-int btrfs_start_delalloc_inodes(struct btrfs_root *root)
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
@@ -10044,7 +10046,7 @@  int btrfs_start_delalloc_inodes(struct btrfs_root *root)
 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
 		return -EROFS;
 
-	ret = start_delalloc_inodes(root, -1);
+	ret = start_delalloc_inodes(root, -1, 1);
 	if (ret > 0)
 		ret = 0;
 	return ret;
@@ -10073,7 +10075,7 @@  int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr)
 			       &fs_info->delalloc_roots);
 		spin_unlock(&fs_info->delalloc_root_lock);
 
-		ret = start_delalloc_inodes(root, nr);
+		ret = start_delalloc_inodes(root, nr, 0);
 		btrfs_put_fs_root(root);
 		if (ret < 0)
 			goto out;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d60b6caf09e8..d1293b6c31f6 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -775,7 +775,7 @@  static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 	wait_event(root->subv_writers->wait,
 		   percpu_counter_sum(&root->subv_writers->counter) == 0);
 
-	ret = btrfs_start_delalloc_inodes(root);
+	ret = btrfs_start_delalloc_snapshot(root);
 	if (ret)
 		goto dec_and_free;
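
For readers skimming the diff, the pieces connect as follows; this is a
condensed restatement of the hunks above, not new code:

	/* ioctl.c: create_snapshot() now calls the snapshot variant. */
	ret = btrfs_start_delalloc_snapshot(root);

	/* inode.c: the snapshot variant marks each delalloc inode. */
	if (snapshot)
		set_bit(BTRFS_INODE_TAGGED_FLUSH, &binode->runtime_flags);

	/*
	 * extent_io.c: the first full-range async flush of that inode
	 * consumes the bit and switches to tagged writeback.
	 */
	if (range_whole && wbc->nr_to_write == LONG_MAX &&
	    wbc->sync_mode == WB_SYNC_NONE &&
	    test_and_clear_bit(BTRFS_INODE_TAGGED_FLUSH,
			       &BTRFS_I(inode)->runtime_flags))
		wbc->tagged_writepages = 1;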