diff mbox series

[12/15] vfs: implement opportunistic short dedupe

Message ID 153870036143.29072.11970142092673351715.stgit@magnolia (mailing list archive)
State New, archived
Headers show
Series fs: fixes for serious clone/dedupe problems | expand

Commit Message

Darrick J. Wong Oct. 5, 2018, 12:46 a.m. UTC
From: Darrick J. Wong <darrick.wong@oracle.com>

For a given dedupe request, the bytes_deduped field in the control
structure tells userspace if we managed to deduplicate some, but not all
of, the requested regions starting from the file offsets supplied.
However, due to sloppy coding, the current dedupe code returns
FILE_DEDUPE_RANGE_DIFFERS if any part of the range is different.
Fix this so that we can actually support partial request completion.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/read_write.c    |   44 +++++++++++++++++++++++++++++++++++---------
 include/linux/fs.h |    2 +-
 2 files changed, 36 insertions(+), 10 deletions(-)

Comments

Amir Goldstein Oct. 5, 2018, 6:40 a.m. UTC | #1
On Fri, Oct 5, 2018 at 3:46 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
>
> From: Darrick J. Wong <darrick.wong@oracle.com>
>
> For a given dedupe request, the bytes_deduped field in the control
> structure tells userspace if we managed to deduplicate some, but not all
> of, the requested regions starting from the file offsets supplied.
> However, due to sloppy coding, the current dedupe code returns
> FILE_DEDUPE_RANGE_DIFFERS if any part of the range is different.
> Fix this so that we can actually support partial request completion.
>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
>  fs/read_write.c    |   44 +++++++++++++++++++++++++++++++++++---------
>  include/linux/fs.h |    2 +-
>  2 files changed, 36 insertions(+), 10 deletions(-)
>
>
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 292d68c2f47c..9be9f261edd2 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -1781,13 +1781,11 @@ int vfs_clone_file_prep(struct file *file_in, loff_t pos_in,
>          * Check that the extents are the same.
>          */
>         if (is_dedupe) {
> -               bool            is_same = false;
> -
>                 ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
> -                               inode_out, pos_out, *len, &is_same);
> +                               inode_out, pos_out, len);
>                 if (ret)
>                         return ret;
> -               if (!is_same)
> +               if (*len == 0)
>                         return -EBADE;
>         }
>
> @@ -1872,13 +1870,30 @@ static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
>         return page;
>  }
>
> +static unsigned int vfs_dedupe_memcmp(const char *s1, const char *s2,
> +                                     unsigned int cmp_len)
> +{
> +       const char *orig_s1 = s1;
> +       const char *e1 = s1 + cmp_len;
> +       const char *e2 = s2 + cmp_len;
> +
> +       while (s1 < e1 && s2 < e2) {
> +               if (*s1 != *s2)
> +                       break;
> +               s1++;
> +               s2++;
> +       }
> +
> +       return s1 - orig_s1;
> +}
> +

A few nits:
'len' wouldn't have been ambiguous in this context.
I find the for loop in memcmp more elegant. It is definitely shorter.
Not sure how differently the variants compile, but decrementing
count/len seems much more sane then checking 2 conditions that
always have the same result.

Thanks,
Amir.
Darrick J. Wong Oct. 5, 2018, 5:42 p.m. UTC | #2
On Fri, Oct 05, 2018 at 09:40:44AM +0300, Amir Goldstein wrote:
> On Fri, Oct 5, 2018 at 3:46 AM Darrick J. Wong <darrick.wong@oracle.com> wrote:
> >
> > From: Darrick J. Wong <darrick.wong@oracle.com>
> >
> > For a given dedupe request, the bytes_deduped field in the control
> > structure tells userspace if we managed to deduplicate some, but not all
> > of, the requested regions starting from the file offsets supplied.
> > However, due to sloppy coding, the current dedupe code returns
> > FILE_DEDUPE_RANGE_DIFFERS if any part of the range is different.
> > Fix this so that we can actually support partial request completion.
> >
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> >  fs/read_write.c    |   44 +++++++++++++++++++++++++++++++++++---------
> >  include/linux/fs.h |    2 +-
> >  2 files changed, 36 insertions(+), 10 deletions(-)
> >
> >
> > diff --git a/fs/read_write.c b/fs/read_write.c
> > index 292d68c2f47c..9be9f261edd2 100644
> > --- a/fs/read_write.c
> > +++ b/fs/read_write.c
> > @@ -1781,13 +1781,11 @@ int vfs_clone_file_prep(struct file *file_in, loff_t pos_in,
> >          * Check that the extents are the same.
> >          */
> >         if (is_dedupe) {
> > -               bool            is_same = false;
> > -
> >                 ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
> > -                               inode_out, pos_out, *len, &is_same);
> > +                               inode_out, pos_out, len);
> >                 if (ret)
> >                         return ret;
> > -               if (!is_same)
> > +               if (*len == 0)
> >                         return -EBADE;
> >         }
> >
> > @@ -1872,13 +1870,30 @@ static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
> >         return page;
> >  }
> >
> > +static unsigned int vfs_dedupe_memcmp(const char *s1, const char *s2,
> > +                                     unsigned int cmp_len)
> > +{
> > +       const char *orig_s1 = s1;
> > +       const char *e1 = s1 + cmp_len;
> > +       const char *e2 = s2 + cmp_len;
> > +
> > +       while (s1 < e1 && s2 < e2) {
> > +               if (*s1 != *s2)
> > +                       break;
> > +               s1++;
> > +               s2++;
> > +       }
> > +
> > +       return s1 - orig_s1;
> > +}
> > +
> 
> A few nits:
> 'len' wouldn't have been ambiguous in this context.
> I find the for loop in memcmp more elegant. It is definitely shorter.
> Not sure how differently the variants compile, but decrementing
> count/len seems much more sane then checking 2 conditions that
> always have the same result.

Fair enough; will fix.

--D

> Thanks,
> Amir.
diff mbox series

Patch

diff --git a/fs/read_write.c b/fs/read_write.c
index 292d68c2f47c..9be9f261edd2 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1781,13 +1781,11 @@  int vfs_clone_file_prep(struct file *file_in, loff_t pos_in,
 	 * Check that the extents are the same.
 	 */
 	if (is_dedupe) {
-		bool		is_same = false;
-
 		ret = vfs_dedupe_file_range_compare(inode_in, pos_in,
-				inode_out, pos_out, *len, &is_same);
+				inode_out, pos_out, len);
 		if (ret)
 			return ret;
-		if (!is_same)
+		if (*len == 0)
 			return -EBADE;
 	}
 
@@ -1872,13 +1870,30 @@  static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
 	return page;
 }
 
+static unsigned int vfs_dedupe_memcmp(const char *s1, const char *s2,
+				      unsigned int cmp_len)
+{
+	const char *orig_s1 = s1;
+	const char *e1 = s1 + cmp_len;
+	const char *e2 = s2 + cmp_len;
+
+	while (s1 < e1 && s2 < e2) {
+		if (*s1 != *s2)
+			break;
+		s1++;
+		s2++;
+	}
+
+	return s1 - orig_s1;
+}
+
 /*
  * Compare extents of two files to see if they are the same.
  * Caller must have locked both inodes to prevent write races.
  */
 int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
 				  struct inode *dest, loff_t destoff,
-				  loff_t len, bool *is_same)
+				  loff_t *req_len)
 {
 	loff_t src_poff;
 	loff_t dest_poff;
@@ -1886,8 +1901,11 @@  int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
 	void *dest_addr;
 	struct page *src_page;
 	struct page *dest_page;
-	loff_t cmp_len;
+	loff_t len = *req_len;
+	loff_t same_len = 0;
 	bool same;
+	unsigned int cmp_len;
+	unsigned int cmp_same;
 	int error;
 
 	error = -EINVAL;
@@ -1897,7 +1915,7 @@  int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
 		dest_poff = destoff & (PAGE_SIZE - 1);
 		cmp_len = min(PAGE_SIZE - src_poff,
 			      PAGE_SIZE - dest_poff);
-		cmp_len = min(cmp_len, len);
+		cmp_len = min_t(loff_t, cmp_len, len);
 		if (cmp_len <= 0)
 			goto out_error;
 
@@ -1919,7 +1937,10 @@  int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
 		flush_dcache_page(src_page);
 		flush_dcache_page(dest_page);
 
-		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+		cmp_same = vfs_dedupe_memcmp(src_addr + src_poff,
+					     dest_addr + dest_poff, cmp_len);
+		same_len += cmp_same;
+		if (cmp_same != cmp_len)
 			same = false;
 
 		kunmap_atomic(dest_addr);
@@ -1937,7 +1958,12 @@  int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
 		len -= cmp_len;
 	}
 
-	*is_same = same;
+	/*
+	 * If less than the whole range matched, we have to back down to the
+	 * nearest block boundary.
+	 */
+	if (*req_len != same_len)
+		*req_len = ALIGN_DOWN(same_len, dest->i_sb->s_blocksize);
 	return 0;
 
 out_error:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index eb35363478e5..490128b84d10 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1840,7 +1840,7 @@  extern int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
 		struct file *file_out, loff_t pos_out, u64 len);
 extern int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
 					 struct inode *dest, loff_t destoff,
-					 loff_t len, bool *is_same);
+					 loff_t *len);
 extern int vfs_dedupe_file_range(struct file *file,
 				 struct file_dedupe_range *same);
 extern s64 vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,