diff mbox

[RFC,v2,2/2] Btrfs: improve fsync for nocow file

Message ID 1434527672-5762-2-git-send-email-bo.li.liu@oracle.com (mailing list archive)
State New, archived
Headers show

Commit Message

Liu Bo June 17, 2015, 7:54 a.m. UTC
If  we're overwriting an allocated file without changing timestamp
and inode version, and the file is with NODATACOW, we don't have any metadata to
commit, thus we can just flush the data device cache and go forward.

However, if there's have any change on extents' disk bytenr, inode size,
timestamp or inode version, we need to go through the normal btrfs_log_inode
path.

Test:
----------------------------
1. sysbench test of
"1 file + 1 thread + bs=4k + size=40k + synchronous I/O mode + randomwrite +
fsync_on_each_write",
2. loop device associated with tmpfs file
3.
  - For btrfs, "-o nodatacow" and "-o noi_version" option
  - For ext4 and xfs, no extra mount options
----------------------------

Results:
----------------------------
- btrfs:
w/o: ~30Mb/sec
w:   ~131Mb/sec

- other filesystems: (both don't enable i_version by default)
ext4:  203Mb/sec
xfs:   212Mb/sec
----------------------------

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
v2: Catch errors from data writeback and skip barrier if necessary.

 fs/btrfs/btrfs_inode.h |    2 +
 fs/btrfs/disk-io.c     |    2 +-
 fs/btrfs/disk-io.h     |    1 +
 fs/btrfs/file.c        |   54 +++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/inode.c       |    3 ++
 5 files changed, 56 insertions(+), 6 deletions(-)

Comments

David Sterba June 17, 2015, 3:58 p.m. UTC | #1
On Wed, Jun 17, 2015 at 03:54:32PM +0800, Liu Bo wrote:
> If  we're overwriting an allocated file without changing timestamp
> and inode version, and the file is with NODATACOW, we don't have any metadata to
> commit, thus we can just flush the data device cache and go forward.
> 
> However, if there's have any change on extents' disk bytenr, inode size,
> timestamp or inode version, we need to go through the normal btrfs_log_inode
> path.
> 
> Test:
> ----------------------------
> 1. sysbench test of
> "1 file + 1 thread + bs=4k + size=40k + synchronous I/O mode + randomwrite +
> fsync_on_each_write",
> 2. loop device associated with tmpfs file
> 3.
>   - For btrfs, "-o nodatacow" and "-o noi_version" option
>   - For ext4 and xfs, no extra mount options
> ----------------------------
> 
> Results:
> ----------------------------
> - btrfs:
> w/o: ~30Mb/sec
> w:   ~131Mb/sec
> 
> - other filesystems: (both don't enable i_version by default)
> ext4:  203Mb/sec
> xfs:   212Mb/sec
> ----------------------------
> 
> Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
> ---
> v2: Catch errors from data writeback and skip barrier if necessary.
> 
>  fs/btrfs/btrfs_inode.h |    2 +
>  fs/btrfs/disk-io.c     |    2 +-
>  fs/btrfs/disk-io.h     |    1 +
>  fs/btrfs/file.c        |   54 +++++++++++++++++++++++++++++++++++++++++++----
>  fs/btrfs/inode.c       |    3 ++
>  5 files changed, 56 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index de5e4f2..f7b99b6 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -44,6 +44,8 @@
>  #define BTRFS_INODE_IN_DELALLOC_LIST		9
>  #define BTRFS_INODE_READDIO_NEED_LOCK		10
>  #define BTRFS_INODE_HAS_PROPS		        11
> +#define BTRFS_INODE_NOTIMESTAMP			12
> +#define BTRFS_INODE_NOISIZE			13

It's not clear what the flags mean and that they're related to syncing
under some conditions.

>  /*
>   * The following 3 bits are meant only for the btree inode.
>   * When any of them is set, it means an error happened while writing an
> --- a/fs/btrfs/disk-io.h
> +++ b/fs/btrfs/disk-io.h
> @@ -60,6 +60,7 @@ void close_ctree(struct btrfs_root *root);
>  int write_ctree_super(struct btrfs_trans_handle *trans,
>  		      struct btrfs_root *root, int max_mirrors);
>  struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
> +int barrier_all_devices(struct btrfs_fs_info *info);
>  int btrfs_commit_super(struct btrfs_root *root);
>  struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
>  					    u64 bytenr);
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index faa7d39..180a3e1 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -523,8 +523,12 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
>  	 * the disk i_size.  There is no need to log the inode
>  	 * at this time.
>  	 */
> -	if (end_pos > isize)
> +	if (end_pos > isize) {
>  		i_size_write(inode, end_pos);
> +		clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> +	} else {
> +		set_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> +	}
>  	return 0;
>  }
>  
> @@ -1715,19 +1719,33 @@ out:
>  static void update_time_for_write(struct inode *inode)
>  {
>  	struct timespec now;
> +	int sync_it = 0;
>  
> -	if (IS_NOCMTIME(inode))
> +	if (IS_NOCMTIME(inode)) {
> +		set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
>  		return;
> +	}
>  
>  	now = current_fs_time(inode->i_sb);
> -	if (!timespec_equal(&inode->i_mtime, &now))
> +	if (!timespec_equal(&inode->i_mtime, &now)) {
>  		inode->i_mtime = now;
> +		sync_it = S_MTIME;
> +	}
>  
> -	if (!timespec_equal(&inode->i_ctime, &now))
> +	if (!timespec_equal(&inode->i_ctime, &now)) {
>  		inode->i_ctime = now;
> +		sync_it |= S_CTIME;
> +	}
>  
> -	if (IS_I_VERSION(inode))
> +	if (IS_I_VERSION(inode)) {
>  		inode_inc_iversion(inode);
> +		sync_it |= S_VERSION;
> +	}
> +
> +	if (!sync_it)
> +		set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
> +	else
> +		clear_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
>  }
>  
>  static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
> @@ -1983,6 +2001,32 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
>  		goto out;
>  	}
>  
> +	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
> +		if (test_and_clear_bit(BTRFS_INODE_NOTIMESTAMP,
> +					&BTRFS_I(inode)->runtime_flags) &&
> +		    test_and_clear_bit(BTRFS_INODE_NOISIZE,
> +					&BTRFS_I(inode)->runtime_flags)) {
> +
> +			/* make sure data is on disk and catch error */
> +			if (!full_sync)
> +				ret = filemap_fdatawait_range(inode->i_mapping,
> +							      start, end);
> +
> +			if (!ret && !btrfs_test_opt(root, NOBARRIER)) {
> +				mutex_lock(&root->fs_info->
> +					   fs_devices->device_list_mutex);
> +				ret = barrier_all_devices(root->fs_info);

Calling barrier devices at this point looks very fishy, taking global
device locks to sync one file as well. All files in the filesystem will
pay the penalty for just one nodatacow file that's being synced.

I'm not sure that handling the NOISIZE bit is safe regarding size
extending and sync, ie. if it's properly synchronized with i_mutex from
all contexts.

> +				if (ret)
> +					btrfs_error(root->fs_info, ret,
> +						"errors while submitting device barriers.");	
> +				mutex_unlock(&root->fs_info->
> +					   fs_devices->device_list_mutex);
> +			}
> +			mutex_unlock(&inode->i_mutex);
> +			goto out;
> +		}
> +	}


> +
>  	/*
>  	 * ok we haven't committed the transaction yet, lets do a commit
>  	 */
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 43192e1..e2912c8 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -1383,6 +1383,7 @@ out_check:
>  
>  		btrfs_release_path(path);
>  		if (cow_start != (u64)-1) {
> +			clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
>  			ret = cow_file_range(inode, locked_page,
>  					     cow_start, found_key.offset - 1,
>  					     page_started, nr_written, 1);
> @@ -1425,6 +1426,7 @@ out_check:
>  						em->start + em->len - 1, 0);
>  			}
>  			type = BTRFS_ORDERED_PREALLOC;
> +			clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
>  		} else {
>  			type = BTRFS_ORDERED_NOCOW;
>  		}
> @@ -1463,6 +1465,7 @@ out_check:
>  	}
>  
>  	if (cow_start != (u64)-1) {
> +		clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
>  		ret = cow_file_range(inode, locked_page, cow_start, end,
>  				     page_started, nr_written, 1);
>  		if (ret)
> -- 
> 1.7.7.6
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Liu Bo June 18, 2015, 3:27 a.m. UTC | #2
On Wed, Jun 17, 2015 at 05:58:36PM +0200, David Sterba wrote:
> On Wed, Jun 17, 2015 at 03:54:32PM +0800, Liu Bo wrote:
> > If  we're overwriting an allocated file without changing timestamp
> > and inode version, and the file is with NODATACOW, we don't have any metadata to
> > commit, thus we can just flush the data device cache and go forward.
> > 
> > However, if there's have any change on extents' disk bytenr, inode size,
> > timestamp or inode version, we need to go through the normal btrfs_log_inode
> > path.
> > 
> > Test:
> > ----------------------------
> > 1. sysbench test of
> > "1 file + 1 thread + bs=4k + size=40k + synchronous I/O mode + randomwrite +
> > fsync_on_each_write",
> > 2. loop device associated with tmpfs file
> > 3.
> >   - For btrfs, "-o nodatacow" and "-o noi_version" option
> >   - For ext4 and xfs, no extra mount options
> > ----------------------------
> > 
> > Results:
> > ----------------------------
> > - btrfs:
> > w/o: ~30Mb/sec
> > w:   ~131Mb/sec
> > 
> > - other filesystems: (both don't enable i_version by default)
> > ext4:  203Mb/sec
> > xfs:   212Mb/sec
> > ----------------------------
> > 
> > Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
> > ---
> > v2: Catch errors from data writeback and skip barrier if necessary.
> > 
> >  fs/btrfs/btrfs_inode.h |    2 +
> >  fs/btrfs/disk-io.c     |    2 +-
> >  fs/btrfs/disk-io.h     |    1 +
> >  fs/btrfs/file.c        |   54 +++++++++++++++++++++++++++++++++++++++++++----
> >  fs/btrfs/inode.c       |    3 ++
> >  5 files changed, 56 insertions(+), 6 deletions(-)
> > 
> > diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> > index de5e4f2..f7b99b6 100644
> > --- a/fs/btrfs/btrfs_inode.h
> > +++ b/fs/btrfs/btrfs_inode.h
> > @@ -44,6 +44,8 @@
> >  #define BTRFS_INODE_IN_DELALLOC_LIST		9
> >  #define BTRFS_INODE_READDIO_NEED_LOCK		10
> >  #define BTRFS_INODE_HAS_PROPS		        11
> > +#define BTRFS_INODE_NOTIMESTAMP			12
> > +#define BTRFS_INODE_NOISIZE			13
> 
> It's not clear what the flags mean and that they're related to syncing
> under some conditions.

What do you think about BTRFS_ILOG_NOTIMESTAMP and BTRFS_ILOG_NOISIZE? 

> 
> >  /*
> >   * The following 3 bits are meant only for the btree inode.
> >   * When any of them is set, it means an error happened while writing an
> > --- a/fs/btrfs/disk-io.h
> > +++ b/fs/btrfs/disk-io.h
> > @@ -60,6 +60,7 @@ void close_ctree(struct btrfs_root *root);
> >  int write_ctree_super(struct btrfs_trans_handle *trans,
> >  		      struct btrfs_root *root, int max_mirrors);
> >  struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
> > +int barrier_all_devices(struct btrfs_fs_info *info);
> >  int btrfs_commit_super(struct btrfs_root *root);
> >  struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
> >  					    u64 bytenr);
> > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> > index faa7d39..180a3e1 100644
> > --- a/fs/btrfs/file.c
> > +++ b/fs/btrfs/file.c
> > @@ -523,8 +523,12 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
> >  	 * the disk i_size.  There is no need to log the inode
> >  	 * at this time.
> >  	 */
> > -	if (end_pos > isize)
> > +	if (end_pos > isize) {
> >  		i_size_write(inode, end_pos);
> > +		clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> > +	} else {
> > +		set_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> > +	}
> >  	return 0;
> >  }
> >  
> > @@ -1715,19 +1719,33 @@ out:
> >  static void update_time_for_write(struct inode *inode)
> >  {
> >  	struct timespec now;
> > +	int sync_it = 0;
> >  
> > -	if (IS_NOCMTIME(inode))
> > +	if (IS_NOCMTIME(inode)) {
> > +		set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
> >  		return;
> > +	}
> >  
> >  	now = current_fs_time(inode->i_sb);
> > -	if (!timespec_equal(&inode->i_mtime, &now))
> > +	if (!timespec_equal(&inode->i_mtime, &now)) {
> >  		inode->i_mtime = now;
> > +		sync_it = S_MTIME;
> > +	}
> >  
> > -	if (!timespec_equal(&inode->i_ctime, &now))
> > +	if (!timespec_equal(&inode->i_ctime, &now)) {
> >  		inode->i_ctime = now;
> > +		sync_it |= S_CTIME;
> > +	}
> >  
> > -	if (IS_I_VERSION(inode))
> > +	if (IS_I_VERSION(inode)) {
> >  		inode_inc_iversion(inode);
> > +		sync_it |= S_VERSION;
> > +	}
> > +
> > +	if (!sync_it)
> > +		set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
> > +	else
> > +		clear_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
> >  }
> >  
> >  static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
> > @@ -1983,6 +2001,32 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
> >  		goto out;
> >  	}
> >  
> > +	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
> > +		if (test_and_clear_bit(BTRFS_INODE_NOTIMESTAMP,
> > +					&BTRFS_I(inode)->runtime_flags) &&
> > +		    test_and_clear_bit(BTRFS_INODE_NOISIZE,
> > +					&BTRFS_I(inode)->runtime_flags)) {
> > +
> > +			/* make sure data is on disk and catch error */
> > +			if (!full_sync)
> > +				ret = filemap_fdatawait_range(inode->i_mapping,
> > +							      start, end);
> > +
> > +			if (!ret && !btrfs_test_opt(root, NOBARRIER)) {
> > +				mutex_lock(&root->fs_info->
> > +					   fs_devices->device_list_mutex);
> > +				ret = barrier_all_devices(root->fs_info);
> 
> Calling barrier devices at this point looks very fishy, taking global
> device locks to sync one file as well. All files in the filesystem will
> pay the penalty for just one nodatacow file that's being synced.

Well, I'm afraid this is necessary as this is a fsync, an expensive operation,
in the normal case, each fsync issues a superblock flush which calls barrier devices.

I was expecting to not take the global device lock but btrfs is able to
manage multiple devices which requires us to do so.

> 
> I'm not sure that handling the NOISIZE bit is safe regarding size
> extending and sync, ie. if it's properly synchronized with i_mutex from
> all contexts.

That's also my concern, but the worst case is that someone clears
NOISIZE bit and we continue on the normal fsync path.

And this NOISIZE bit not only stands for i_size change, but also will be
cleared when we do COW, I'm not sure if we need to use another bit for
the COW change or not.

Thanks,

-liubo

> 
> > +				if (ret)
> > +					btrfs_error(root->fs_info, ret,
> > +						"errors while submitting device barriers.");	
> > +				mutex_unlock(&root->fs_info->
> > +					   fs_devices->device_list_mutex);
> > +			}
> > +			mutex_unlock(&inode->i_mutex);
> > +			goto out;
> > +		}
> > +	}
> 
> 
> > +
> >  	/*
> >  	 * ok we haven't committed the transaction yet, lets do a commit
> >  	 */
> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > index 43192e1..e2912c8 100644
> > --- a/fs/btrfs/inode.c
> > +++ b/fs/btrfs/inode.c
> > @@ -1383,6 +1383,7 @@ out_check:
> >  
> >  		btrfs_release_path(path);
> >  		if (cow_start != (u64)-1) {
> > +			clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> >  			ret = cow_file_range(inode, locked_page,
> >  					     cow_start, found_key.offset - 1,
> >  					     page_started, nr_written, 1);
> > @@ -1425,6 +1426,7 @@ out_check:
> >  						em->start + em->len - 1, 0);
> >  			}
> >  			type = BTRFS_ORDERED_PREALLOC;
> > +			clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> >  		} else {
> >  			type = BTRFS_ORDERED_NOCOW;
> >  		}
> > @@ -1463,6 +1465,7 @@ out_check:
> >  	}
> >  
> >  	if (cow_start != (u64)-1) {
> > +		clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> >  		ret = cow_file_range(inode, locked_page, cow_start, end,
> >  				     page_started, nr_written, 1);
> >  		if (ret)
> > -- 
> > 1.7.7.6
> > 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba June 24, 2015, 6:21 p.m. UTC | #3
On Thu, Jun 18, 2015 at 11:27:24AM +0800, Liu Bo wrote:
> > >  #define BTRFS_INODE_IN_DELALLOC_LIST		9
> > >  #define BTRFS_INODE_READDIO_NEED_LOCK		10
> > >  #define BTRFS_INODE_HAS_PROPS		        11
> > > +#define BTRFS_INODE_NOTIMESTAMP			12
> > > +#define BTRFS_INODE_NOISIZE			13
> > 
> > It's not clear what the flags mean and that they're related to syncing
> > under some conditions.
> 
> What do you think about BTRFS_ILOG_NOTIMESTAMP and BTRFS_ILOG_NOISIZE? 

I'd say BTRFS_INODE_FSYNC_NOTIMESTAMP and BTRFS_INODE_FSYNC_NOSIZE

> > > @@ -1983,6 +2001,32 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
> > >  		goto out;
> > >  	}
> > >  
> > > +	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
> > > +		if (test_and_clear_bit(BTRFS_INODE_NOTIMESTAMP,
> > > +					&BTRFS_I(inode)->runtime_flags) &&
> > > +		    test_and_clear_bit(BTRFS_INODE_NOISIZE,
> > > +					&BTRFS_I(inode)->runtime_flags)) {
> > > +
> > > +			/* make sure data is on disk and catch error */
> > > +			if (!full_sync)
> > > +				ret = filemap_fdatawait_range(inode->i_mapping,
> > > +							      start, end);
> > > +
> > > +			if (!ret && !btrfs_test_opt(root, NOBARRIER)) {
> > > +				mutex_lock(&root->fs_info->
> > > +					   fs_devices->device_list_mutex);
> > > +				ret = barrier_all_devices(root->fs_info);
> > 
> > Calling barrier devices at this point looks very fishy, taking global
> > device locks to sync one file as well. All files in the filesystem will
> > pay the penalty for just one nodatacow file that's being synced.
> 
> Well, I'm afraid this is necessary as this is a fsync, an expensive operation,
> in the normal case, each fsync issues a superblock flush which calls barrier devices.
> 
> I was expecting to not take the global device lock but btrfs is able to
> manage multiple devices which requires us to do so.

I've read the code again and came to the same conclusion, objections
withdrawn.

> > I'm not sure that handling the NOISIZE bit is safe regarding size
> > extending and sync, ie. if it's properly synchronized with i_mutex from
> > all contexts.
> 
> That's also my concern, but the worst case is that someone clears
> NOISIZE bit and we continue on the normal fsync path.

Sounds safe.

> And this NOISIZE bit not only stands for i_size change, but also will be
> cleared when we do COW, I'm not sure if we need to use another bit for
> the COW change or not.

I'm not sure I understand, you mean split the NOISIZE into two bits and
use NOISIZE just for inode size change and the other one for the
cow_file_range case?

Btw, shouln't the NOISIZE bit get cleared inside cow_file_range? Both
calls are in run_delalloc_nocow, this makes sense, but I'm a bit worried
that we could forget to add it somewhere else. I don't think this would
hurt performance, cow_file_range is pretty big.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Liu Bo June 25, 2015, 2:24 a.m. UTC | #4
On Wed, Jun 24, 2015 at 08:21:30PM +0200, David Sterba wrote:
> On Thu, Jun 18, 2015 at 11:27:24AM +0800, Liu Bo wrote:
> > > >  #define BTRFS_INODE_IN_DELALLOC_LIST		9
> > > >  #define BTRFS_INODE_READDIO_NEED_LOCK		10
> > > >  #define BTRFS_INODE_HAS_PROPS		        11
> > > > +#define BTRFS_INODE_NOTIMESTAMP			12
> > > > +#define BTRFS_INODE_NOISIZE			13
> > > 
> > > It's not clear what the flags mean and that they're related to syncing
> > > under some conditions.
> > 
> > What do you think about BTRFS_ILOG_NOTIMESTAMP and BTRFS_ILOG_NOISIZE? 
> 
> I'd say BTRFS_INODE_FSYNC_NOTIMESTAMP and BTRFS_INODE_FSYNC_NOSIZE

Looks good.

> 
> > > > @@ -1983,6 +2001,32 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
> > > >  		goto out;
> > > >  	}
> > > >  
> > > > +	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
> > > > +		if (test_and_clear_bit(BTRFS_INODE_NOTIMESTAMP,
> > > > +					&BTRFS_I(inode)->runtime_flags) &&
> > > > +		    test_and_clear_bit(BTRFS_INODE_NOISIZE,
> > > > +					&BTRFS_I(inode)->runtime_flags)) {
> > > > +
> > > > +			/* make sure data is on disk and catch error */
> > > > +			if (!full_sync)
> > > > +				ret = filemap_fdatawait_range(inode->i_mapping,
> > > > +							      start, end);
> > > > +
> > > > +			if (!ret && !btrfs_test_opt(root, NOBARRIER)) {
> > > > +				mutex_lock(&root->fs_info->
> > > > +					   fs_devices->device_list_mutex);
> > > > +				ret = barrier_all_devices(root->fs_info);
> > > 
> > > Calling barrier devices at this point looks very fishy, taking global
> > > device locks to sync one file as well. All files in the filesystem will
> > > pay the penalty for just one nodatacow file that's being synced.
> > 
> > Well, I'm afraid this is necessary as this is a fsync, an expensive operation,
> > in the normal case, each fsync issues a superblock flush which calls barrier devices.
> > 
> > I was expecting to not take the global device lock but btrfs is able to
> > manage multiple devices which requires us to do so.
> 
> I've read the code again and came to the same conclusion, objections
> withdrawn.
> 
> > > I'm not sure that handling the NOISIZE bit is safe regarding size
> > > extending and sync, ie. if it's properly synchronized with i_mutex from
> > > all contexts.
> > 
> > That's also my concern, but the worst case is that someone clears
> > NOISIZE bit and we continue on the normal fsync path.
> 
> Sounds safe.
> 
> > And this NOISIZE bit not only stands for i_size change, but also will be
> > cleared when we do COW, I'm not sure if we need to use another bit for
> > the COW change or not.
> 
> I'm not sure I understand, you mean split the NOISIZE into two bits and
> use NOISIZE just for inode size change and the other one for the
> cow_file_range case?

Yes, for now it has mixed meanings, either changing i_size or doing Cow.
But I think it'd better to leave it mixed if we document it well.

> 
> Btw, shouln't the NOISIZE bit get cleared inside cow_file_range? Both
> calls are in run_delalloc_nocow, this makes sense, but I'm a bit worried
> that we could forget to add it somewhere else. I don't think this would
> hurt performance, cow_file_range is pretty big.

That sounds be better.

Thanks,

-liubo
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Sterba June 25, 2015, 4:10 p.m. UTC | #5
On Thu, Jun 25, 2015 at 10:24:25AM +0800, Liu Bo wrote:
> > I'm not sure I understand, you mean split the NOISIZE into two bits and
> > use NOISIZE just for inode size change and the other one for the
> > cow_file_range case?
> 
> Yes, for now it has mixed meanings, either changing i_size or doing Cow.
> But I think it'd better to leave it mixed if we document it well.

Agreed, please also comment the new fsync code that actually uses the
nocow + notimestamp + noisize bits.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index de5e4f2..f7b99b6 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,6 +44,8 @@ 
 #define BTRFS_INODE_IN_DELALLOC_LIST		9
 #define BTRFS_INODE_READDIO_NEED_LOCK		10
 #define BTRFS_INODE_HAS_PROPS		        11
+#define BTRFS_INODE_NOTIMESTAMP			12
+#define BTRFS_INODE_NOISIZE			13
 /*
  * The following 3 bits are meant only for the btree inode.
  * When any of them is set, it means an error happened while writing an
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 639f266..8a41df1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3299,7 +3299,7 @@  static int write_dev_flush(struct btrfs_device *device, int wait)
  * send an empty flush down to each device in parallel,
  * then wait for them
  */
-static int barrier_all_devices(struct btrfs_fs_info *info)
+int barrier_all_devices(struct btrfs_fs_info *info)
 {
 	struct list_head *head;
 	struct btrfs_device *dev;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 27d44c0..bea982c 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -60,6 +60,7 @@  void close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
+int barrier_all_devices(struct btrfs_fs_info *info);
 int btrfs_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index faa7d39..180a3e1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -523,8 +523,12 @@  int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 	 * the disk i_size.  There is no need to log the inode
 	 * at this time.
 	 */
-	if (end_pos > isize)
+	if (end_pos > isize) {
 		i_size_write(inode, end_pos);
+		clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
+	} else {
+		set_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
+	}
 	return 0;
 }
 
@@ -1715,19 +1719,33 @@  out:
 static void update_time_for_write(struct inode *inode)
 {
 	struct timespec now;
+	int sync_it = 0;
 
-	if (IS_NOCMTIME(inode))
+	if (IS_NOCMTIME(inode)) {
+		set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
 		return;
+	}
 
 	now = current_fs_time(inode->i_sb);
-	if (!timespec_equal(&inode->i_mtime, &now))
+	if (!timespec_equal(&inode->i_mtime, &now)) {
 		inode->i_mtime = now;
+		sync_it = S_MTIME;
+	}
 
-	if (!timespec_equal(&inode->i_ctime, &now))
+	if (!timespec_equal(&inode->i_ctime, &now)) {
 		inode->i_ctime = now;
+		sync_it |= S_CTIME;
+	}
 
-	if (IS_I_VERSION(inode))
+	if (IS_I_VERSION(inode)) {
 		inode_inc_iversion(inode);
+		sync_it |= S_VERSION;
+	}
+
+	if (!sync_it)
+		set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
+	else
+		clear_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
 }
 
 static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
@@ -1983,6 +2001,32 @@  int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		goto out;
 	}
 
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
+		if (test_and_clear_bit(BTRFS_INODE_NOTIMESTAMP,
+					&BTRFS_I(inode)->runtime_flags) &&
+		    test_and_clear_bit(BTRFS_INODE_NOISIZE,
+					&BTRFS_I(inode)->runtime_flags)) {
+
+			/* make sure data is on disk and catch error */
+			if (!full_sync)
+				ret = filemap_fdatawait_range(inode->i_mapping,
+							      start, end);
+
+			if (!ret && !btrfs_test_opt(root, NOBARRIER)) {
+				mutex_lock(&root->fs_info->
+					   fs_devices->device_list_mutex);
+				ret = barrier_all_devices(root->fs_info);
+				if (ret)
+					btrfs_error(root->fs_info, ret,
+						"errors while submitting device barriers.");	
+				mutex_unlock(&root->fs_info->
+					   fs_devices->device_list_mutex);
+			}
+			mutex_unlock(&inode->i_mutex);
+			goto out;
+		}
+	}
+
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
 	 */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 43192e1..e2912c8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1383,6 +1383,7 @@  out_check:
 
 		btrfs_release_path(path);
 		if (cow_start != (u64)-1) {
+			clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
 			ret = cow_file_range(inode, locked_page,
 					     cow_start, found_key.offset - 1,
 					     page_started, nr_written, 1);
@@ -1425,6 +1426,7 @@  out_check:
 						em->start + em->len - 1, 0);
 			}
 			type = BTRFS_ORDERED_PREALLOC;
+			clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
 		} else {
 			type = BTRFS_ORDERED_NOCOW;
 		}
@@ -1463,6 +1465,7 @@  out_check:
 	}
 
 	if (cow_start != (u64)-1) {
+		clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
 		ret = cow_file_range(inode, locked_page, cow_start, end,
 				     page_started, nr_written, 1);
 		if (ret)