diff mbox

[5/5] btrfs: don't update mtime on deduped inodes

Message ID 1435352461-17689-6-git-send-email-mfasheh@suse.de (mailing list archive)
State Under Review
Headers show

Commit Message

Mark Fasheh June 26, 2015, 9:01 p.m. UTC
One issue users have reported is that dedupe changes mtime on files,
resulting in tools like rsync thinking that their contents have changed when
in fact the data is exactly the same. Clone still wants an mtime change, so
we special case this in the code.

This was tested with the btrfs-extent-same tool.

Signed-off-by: Mark Fasheh <mfasheh@suse.de>
---
 fs/btrfs/ioctl.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

Comments

Zygo Blaxell June 27, 2015, 9:44 p.m. UTC | #1
On Fri, Jun 26, 2015 at 02:01:01PM -0700, Mark Fasheh wrote:
> One issue users have reported is that dedupe changes mtime on files,
> resulting in tools like rsync thinking that their contents have changed when
> in fact the data is exactly the same. Clone still wants an mtime change, so
> we special case this in the code.
> 
> This was tested with the btrfs-extent-same tool.
> 
> Signed-off-by: Mark Fasheh <mfasheh@suse.de>
> ---
>  fs/btrfs/ioctl.c | 25 +++++++++++++++----------
>  1 file changed, 15 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index 83f4679..0af0f13 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -87,7 +87,8 @@ struct btrfs_ioctl_received_subvol_args_32 {
>  
>  
>  static int btrfs_clone(struct inode *src, struct inode *inode,
> -		       u64 off, u64 olen, u64 olen_aligned, u64 destoff);
> +		       u64 off, u64 olen, u64 olen_aligned, u64 destoff,
> +		       int no_mtime);
>  
>  /* Mask out flags that are inappropriate for the given type of inode. */
>  static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
> @@ -3054,7 +3055,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
>  	/* pass original length for comparison so we stay within i_size */
>  	ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
>  	if (ret == 0)
> -		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff);
> +		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
>  
>  	if (same_inode)
>  		unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
> @@ -3219,13 +3220,17 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
>  				     struct inode *inode,
>  				     u64 endoff,
>  				     const u64 destoff,
> -				     const u64 olen)
> +				     const u64 olen,
> +				     int no_mtime)
>  {
>  	struct btrfs_root *root = BTRFS_I(inode)->root;
>  	int ret;
>  
>  	inode_inc_iversion(inode);
> -	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
> +	if (no_mtime)
> +		inode->i_ctime = CURRENT_TIME;

I don't see a good reason to modify the ctime either.  Again, nothing
is changing here.  All we are doing is shuffling physical storage around.

Defrag and balance (which also move physical extents around) don't
touch ctime, mtime, or even atime.

> +	else
> +		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
>  	/*
>  	 * We round up to the block size at eof when determining which
>  	 * extents to clone above, but shouldn't round up the file size.
> @@ -3310,13 +3315,13 @@ static void clone_update_extent_map(struct inode *inode,
>   * @inode: Inode to clone to
>   * @off: Offset within source to start clone from
>   * @olen: Original length, passed by user, of range to clone
> - * @olen_aligned: Block-aligned value of olen, extent_same uses
> - *               identical values here
> + * @olen_aligned: Block-aligned value of olen
>   * @destoff: Offset within @inode to start clone
> + * @no_mtime: Whether to update mtime on the target inode
>   */
>  static int btrfs_clone(struct inode *src, struct inode *inode,
>  		       const u64 off, const u64 olen, const u64 olen_aligned,
> -		       const u64 destoff)
> +		       const u64 destoff, int no_mtime)
>  {
>  	struct btrfs_root *root = BTRFS_I(inode)->root;
>  	struct btrfs_path *path = NULL;
> @@ -3640,7 +3645,7 @@ process_slot:
>  					      root->sectorsize);
>  			ret = clone_finish_inode_update(trans, inode,
>  							last_dest_end,
> -							destoff, olen);
> +							destoff, olen, no_mtime);
>  			if (ret)
>  				goto out;
>  			if (new_key.offset + datal >= destoff + len)
> @@ -3678,7 +3683,7 @@ process_slot:
>  		clone_update_extent_map(inode, trans, NULL, last_dest_end,
>  					destoff + len - last_dest_end);
>  		ret = clone_finish_inode_update(trans, inode, destoff + len,
> -						destoff, olen);
> +						destoff, olen, no_mtime);
>  	}
>  
>  out:
> @@ -3808,7 +3813,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
>  		btrfs_double_extent_lock(src, off, inode, destoff, len);
>  	}
>  
> -	ret = btrfs_clone(src, inode, off, olen, len, destoff);
> +	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
>  
>  	if (same_inode) {
>  		u64 lock_start = min_t(u64, off, destoff);
> -- 
> 2.1.2
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark Fasheh June 29, 2015, 5:52 p.m. UTC | #2
On Sat, Jun 27, 2015 at 05:44:28PM -0400, Zygo Blaxell wrote:
> On Fri, Jun 26, 2015 at 02:01:01PM -0700, Mark Fasheh wrote:
> > One issue users have reported is that dedupe changes mtime on files,
> > resulting in tools like rsync thinking that their contents have changed when
> > in fact the data is exactly the same. Clone still wants an mtime change, so
> > we special case this in the code.
> > 
> > This was tested with the btrfs-extent-same tool.
> > 
> > Signed-off-by: Mark Fasheh <mfasheh@suse.de>
> > ---
> >  fs/btrfs/ioctl.c | 25 +++++++++++++++----------
> >  1 file changed, 15 insertions(+), 10 deletions(-)
> > 
> > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> > index 83f4679..0af0f13 100644
> > --- a/fs/btrfs/ioctl.c
> > +++ b/fs/btrfs/ioctl.c
> > @@ -87,7 +87,8 @@ struct btrfs_ioctl_received_subvol_args_32 {
> >  
> >  
> >  static int btrfs_clone(struct inode *src, struct inode *inode,
> > -		       u64 off, u64 olen, u64 olen_aligned, u64 destoff);
> > +		       u64 off, u64 olen, u64 olen_aligned, u64 destoff,
> > +		       int no_mtime);
> >  
> >  /* Mask out flags that are inappropriate for the given type of inode. */
> >  static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
> > @@ -3054,7 +3055,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
> >  	/* pass original length for comparison so we stay within i_size */
> >  	ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
> >  	if (ret == 0)
> > -		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff);
> > +		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
> >  
> >  	if (same_inode)
> >  		unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
> > @@ -3219,13 +3220,17 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
> >  				     struct inode *inode,
> >  				     u64 endoff,
> >  				     const u64 destoff,
> > -				     const u64 olen)
> > +				     const u64 olen,
> > +				     int no_mtime)
> >  {
> >  	struct btrfs_root *root = BTRFS_I(inode)->root;
> >  	int ret;
> >  
> >  	inode_inc_iversion(inode);
> > -	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
> > +	if (no_mtime)
> > +		inode->i_ctime = CURRENT_TIME;
> 
> I don't see a good reason to modify the ctime either.  Again, nothing
> is changing here.  All we are doing is shuffling physical storage around.
> 
> Defrag and balance (which also move physical extents around) don't
> touch ctime, mtime, or even atime.

To be fair, those may actually be oversights, it's not uncommon to update
ctime on metadata changes.

Does a ctime change hurt any backup software (the reason for my first
patch)? I guess it could cause revaluation of meta data, but does that
actually happen? From what I can tell stuff like rsync is using mtime +
i_size to see if an inode changed.

Is there any software out there that monitors an inodes extent state which
might *want* ctime updates when this happens? Is that kind of usage a
stretch (or even something we care about?).

So my thinking is if it doesn't hurt anything, leave it in. Obviously if it
*is* causing issues then we should take it right out :)

Thanks for the discussion and review btw,
	--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Zygo Blaxell June 29, 2015, 7:35 p.m. UTC | #3
On Mon, Jun 29, 2015 at 10:52:41AM -0700, Mark Fasheh wrote:
> On Sat, Jun 27, 2015 at 05:44:28PM -0400, Zygo Blaxell wrote:
> > On Fri, Jun 26, 2015 at 02:01:01PM -0700, Mark Fasheh wrote:
> > > One issue users have reported is that dedupe changes mtime on files,
> > > resulting in tools like rsync thinking that their contents have changed when
> > > in fact the data is exactly the same. Clone still wants an mtime change, so
> > > we special case this in the code.
> > > 
> > > This was tested with the btrfs-extent-same tool.
> > > 
> > > Signed-off-by: Mark Fasheh <mfasheh@suse.de>
> > > ---
> > >  fs/btrfs/ioctl.c | 25 +++++++++++++++----------
> > >  1 file changed, 15 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> > > index 83f4679..0af0f13 100644
> > > --- a/fs/btrfs/ioctl.c
> > > +++ b/fs/btrfs/ioctl.c
> > > @@ -87,7 +87,8 @@ struct btrfs_ioctl_received_subvol_args_32 {
> > >  
> > >  
> > >  static int btrfs_clone(struct inode *src, struct inode *inode,
> > > -		       u64 off, u64 olen, u64 olen_aligned, u64 destoff);
> > > +		       u64 off, u64 olen, u64 olen_aligned, u64 destoff,
> > > +		       int no_mtime);
> > >  
> > >  /* Mask out flags that are inappropriate for the given type of inode. */
> > >  static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
> > > @@ -3054,7 +3055,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
> > >  	/* pass original length for comparison so we stay within i_size */
> > >  	ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
> > >  	if (ret == 0)
> > > -		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff);
> > > +		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
> > >  
> > >  	if (same_inode)
> > >  		unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
> > > @@ -3219,13 +3220,17 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
> > >  				     struct inode *inode,
> > >  				     u64 endoff,
> > >  				     const u64 destoff,
> > > -				     const u64 olen)
> > > +				     const u64 olen,
> > > +				     int no_mtime)
> > >  {
> > >  	struct btrfs_root *root = BTRFS_I(inode)->root;
> > >  	int ret;
> > >  
> > >  	inode_inc_iversion(inode);
> > > -	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
> > > +	if (no_mtime)
> > > +		inode->i_ctime = CURRENT_TIME;
> > 
> > I don't see a good reason to modify the ctime either.  Again, nothing
> > is changing here.  All we are doing is shuffling physical storage around.
> > 
> > Defrag and balance (which also move physical extents around) don't
> > touch ctime, mtime, or even atime.
> 
> To be fair, those may actually be oversights, it's not uncommon to update
> ctime on metadata changes.

It makes no sense semantically.  There are no changes to any inode
fields here.  Normally when extents are moved around inodes don't change
at all.

The current balance behavior is definitely not an oversight.  Balance
would have to rewrite every inode on the filesystem (actually every
*snapshot* of every inode) to update ctime.

Defrag is copy-in-place while holding a lock to prevent concurrent
modifications.  Defrag does the complementary operation to extent-same.
Defrag will also not change any file contents, and it's already got the
correct ctime behavior.  ;)

> Does a ctime change hurt any backup software (the reason for my first
> patch)? I guess it could cause revaluation of meta data, but does that
> actually happen? From what I can tell stuff like rsync is using mtime +
> i_size to see if an inode changed.

Off the top of my head, git uses ctime to figure out whether its index is
up to date.  It probably borrowed that from other SCMs.

Some admins (myself included) build ad-hoc (and even formal) forensic
timelines from ctime data.  This doesn't work if dedup wipes out the
ctime, especially if you are doing aggressive dedup across multiple
snapshots.

The core problem is that deduping stops being a transparent rearrangement
of the physical layout of the data (like defrag or balance) if the ctime
changes whenever you do it.

> Is there any software out there that monitors an inodes extent state which
> might *want* ctime updates when this happens? Is that kind of usage a
> stretch (or even something we care about?).
> 
> So my thinking is if it doesn't hurt anything, leave it in. Obviously if it
> *is* causing issues then we should take it right out :)
> 
> Thanks for the discussion and review btw,
> 	--Mark
> 
> --
> Mark Fasheh
>
Mark Fasheh June 29, 2015, 8:29 p.m. UTC | #4
On Mon, Jun 29, 2015 at 03:35:02PM -0400, Zygo Blaxell wrote:
> On Mon, Jun 29, 2015 at 10:52:41AM -0700, Mark Fasheh wrote:
> > On Sat, Jun 27, 2015 at 05:44:28PM -0400, Zygo Blaxell wrote:
> > > On Fri, Jun 26, 2015 at 02:01:01PM -0700, Mark Fasheh wrote:
> > > > One issue users have reported is that dedupe changes mtime on files,
> > > > resulting in tools like rsync thinking that their contents have changed when
> > > > in fact the data is exactly the same. Clone still wants an mtime change, so
> > > > we special case this in the code.
> > > > 
> > > > This was tested with the btrfs-extent-same tool.
> > > > 
> > > > Signed-off-by: Mark Fasheh <mfasheh@suse.de>
> > > > ---
> > > >  fs/btrfs/ioctl.c | 25 +++++++++++++++----------
> > > >  1 file changed, 15 insertions(+), 10 deletions(-)
> > > > 
> > > > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> > > > index 83f4679..0af0f13 100644
> > > > --- a/fs/btrfs/ioctl.c
> > > > +++ b/fs/btrfs/ioctl.c
> > > > @@ -87,7 +87,8 @@ struct btrfs_ioctl_received_subvol_args_32 {
> > > >  
> > > >  
> > > >  static int btrfs_clone(struct inode *src, struct inode *inode,
> > > > -		       u64 off, u64 olen, u64 olen_aligned, u64 destoff);
> > > > +		       u64 off, u64 olen, u64 olen_aligned, u64 destoff,
> > > > +		       int no_mtime);
> > > >  
> > > >  /* Mask out flags that are inappropriate for the given type of inode. */
> > > >  static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
> > > > @@ -3054,7 +3055,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
> > > >  	/* pass original length for comparison so we stay within i_size */
> > > >  	ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
> > > >  	if (ret == 0)
> > > > -		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff);
> > > > +		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
> > > >  
> > > >  	if (same_inode)
> > > >  		unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
> > > > @@ -3219,13 +3220,17 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
> > > >  				     struct inode *inode,
> > > >  				     u64 endoff,
> > > >  				     const u64 destoff,
> > > > -				     const u64 olen)
> > > > +				     const u64 olen,
> > > > +				     int no_mtime)
> > > >  {
> > > >  	struct btrfs_root *root = BTRFS_I(inode)->root;
> > > >  	int ret;
> > > >  
> > > >  	inode_inc_iversion(inode);
> > > > -	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
> > > > +	if (no_mtime)
> > > > +		inode->i_ctime = CURRENT_TIME;
> > > 
> > > I don't see a good reason to modify the ctime either.  Again, nothing
> > > is changing here.  All we are doing is shuffling physical storage around.
> > > 
> > > Defrag and balance (which also move physical extents around) don't
> > > touch ctime, mtime, or even atime.
> > 
> > To be fair, those may actually be oversights, it's not uncommon to update
> > ctime on metadata changes.
> 
> It makes no sense semantically.  There are no changes to any inode
> fields here.  Normally when extents are moved around inodes don't change
> at all.
> 
> The current balance behavior is definitely not an oversight.  Balance
> would have to rewrite every inode on the filesystem (actually every
> *snapshot* of every inode) to update ctime.
> 
> Defrag is copy-in-place while holding a lock to prevent concurrent
> modifications.  Defrag does the complementary operation to extent-same.
> Defrag will also not change any file contents, and it's already got the
> correct ctime behavior.  ;)

Ahh ok sure that makes sense :)


> > Does a ctime change hurt any backup software (the reason for my first
> > patch)? I guess it could cause revaluation of meta data, but does that
> > actually happen? From what I can tell stuff like rsync is using mtime +
> > i_size to see if an inode changed.
> 
> Off the top of my head, git uses ctime to figure out whether its index is
> up to date.  It probably borrowed that from other SCMs.
> 
> Some admins (myself included) build ad-hoc (and even formal) forensic
> timelines from ctime data.  This doesn't work if dedup wipes out the
> ctime, especially if you are doing aggressive dedup across multiple
> snapshots.
> 
> The core problem is that deduping stops being a transparent rearrangement
> of the physical layout of the data (like defrag or balance) if the ctime
> changes whenever you do it.

Ok, thanks for describing those use cases. It makes sense now to me why we
would want to avoid the ctime changes. I should have another patch out
shortly.

Thanks again,
	--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 83f4679..0af0f13 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -87,7 +87,8 @@  struct btrfs_ioctl_received_subvol_args_32 {
 
 
 static int btrfs_clone(struct inode *src, struct inode *inode,
-		       u64 off, u64 olen, u64 olen_aligned, u64 destoff);
+		       u64 off, u64 olen, u64 olen_aligned, u64 destoff,
+		       int no_mtime);
 
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -3054,7 +3055,7 @@  static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
 	/* pass original length for comparison so we stay within i_size */
 	ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp);
 	if (ret == 0)
-		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff);
+		ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1);
 
 	if (same_inode)
 		unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start,
@@ -3219,13 +3220,17 @@  static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
 				     struct inode *inode,
 				     u64 endoff,
 				     const u64 destoff,
-				     const u64 olen)
+				     const u64 olen,
+				     int no_mtime)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 
 	inode_inc_iversion(inode);
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	if (no_mtime)
+		inode->i_ctime = CURRENT_TIME;
+	else
+		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	/*
 	 * We round up to the block size at eof when determining which
 	 * extents to clone above, but shouldn't round up the file size.
@@ -3310,13 +3315,13 @@  static void clone_update_extent_map(struct inode *inode,
  * @inode: Inode to clone to
  * @off: Offset within source to start clone from
  * @olen: Original length, passed by user, of range to clone
- * @olen_aligned: Block-aligned value of olen, extent_same uses
- *               identical values here
+ * @olen_aligned: Block-aligned value of olen
  * @destoff: Offset within @inode to start clone
+ * @no_mtime: Whether to update mtime on the target inode
  */
 static int btrfs_clone(struct inode *src, struct inode *inode,
 		       const u64 off, const u64 olen, const u64 olen_aligned,
-		       const u64 destoff)
+		       const u64 destoff, int no_mtime)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_path *path = NULL;
@@ -3640,7 +3645,7 @@  process_slot:
 					      root->sectorsize);
 			ret = clone_finish_inode_update(trans, inode,
 							last_dest_end,
-							destoff, olen);
+							destoff, olen, no_mtime);
 			if (ret)
 				goto out;
 			if (new_key.offset + datal >= destoff + len)
@@ -3678,7 +3683,7 @@  process_slot:
 		clone_update_extent_map(inode, trans, NULL, last_dest_end,
 					destoff + len - last_dest_end);
 		ret = clone_finish_inode_update(trans, inode, destoff + len,
-						destoff, olen);
+						destoff, olen, no_mtime);
 	}
 
 out:
@@ -3808,7 +3813,7 @@  static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 		btrfs_double_extent_lock(src, off, inode, destoff, len);
 	}
 
-	ret = btrfs_clone(src, inode, off, olen, len, destoff);
+	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
 
 	if (same_inode) {
 		u64 lock_start = min_t(u64, off, destoff);