diff mbox

[4/4] btrfs: offline dedupe

Message ID 1369160908-26195-5-git-send-email-mfasheh@suse.de (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Mark Fasheh May 21, 2013, 6:28 p.m. UTC
This patch adds an ioctl, BTRFS_IOC_FILE_EXTENT_SAME which will try to
de-duplicate a list of extents across a range of files.

Internally, the ioctl re-uses code from the clone ioctl. This avoids
rewriting a large chunk of extent handling code.

Userspace passes in an array of file, offset pairs along with a length
argument. The ioctl will then (for each dedupe) do a byte-by-byte comparison
of the user data before deduping the extent. Status and number of bytes
deduped are returned for each operation.

Signed-off-by: Mark Fasheh <mfasheh@suse.de>
---
 fs/btrfs/ioctl.c           | 290 +++++++++++++++++++++++++++++++++++++++++++++
 include/uapi/linux/btrfs.h |  27 +++++
 2 files changed, 317 insertions(+)

Comments

David Sterba May 24, 2013, 2:05 p.m. UTC | #1
On Tue, May 21, 2013 at 11:28:28AM -0700, Mark Fasheh wrote:
> +static noinline int fill_data(struct inode *inode, u64 off, u64 len,
> +			      char **cur_buffer)
> +{
> +	struct page *page;
> +	void *addr;
> +	char *buffer;
> +	pgoff_t index;
> +	pgoff_t last_index;
> +	int ret = 0;
> +	int bytes_copied = 0;
> +	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
> +
> +	buffer = kmalloc(len, GFP_NOFS);

I've looked at the caller chain and len can be as big as 1MB, this is
likely to fail to allocate. More comments below.

> +	if (!buffer)
> +		return -ENOMEM;
> +
> +	index = off >> PAGE_CACHE_SHIFT;
> +	last_index = (off + len - 1) >> PAGE_CACHE_SHIFT;
> +
> +	while (index <= last_index) {
> +		page = grab_cache_page(inode->i_mapping, index);
> +		if (!page) {
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +
> +		if (!PageUptodate(page)) {
> +			extent_read_full_page_nolock(tree, page,
> +						     btrfs_get_extent, 0);
> +			lock_page(page);
> +			if (!PageUptodate(page)) {
> +				unlock_page(page);
> +				page_cache_release(page);
> +				ret = -EINVAL;
> +				goto out;
> +			}
> +		}
> +
> +		addr = kmap(page);
> +		memcpy(buffer + bytes_copied, addr, PAGE_CACHE_SIZE);
> +		kunmap(page);
> +		unlock_page(page);
> +		page_cache_release(page);
> +		bytes_copied += PAGE_CACHE_SIZE;
> +		index++;
> +	}
> +
> +	*cur_buffer = buffer;
> +
> +out:
> +	if (ret)
> +		kfree(buffer);
> +	return ret;
> +}
> +
>  static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
>  {
>  	/* do any pending delalloc/csum calc on src, one way or
> @@ -2476,6 +2534,236 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
> +static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
> +			     struct inode *dst, u64 dst_loff)
> +{
> +	char *orig_buffer = NULL;
> +	char *dst_inode_buffer = NULL;
> +	int ret;
> +
> +	/*
> +	 * btrfs_clone() can't handle extents in the same file
> +	 * yet. Once that works, we can drop this check and replace it
> +	 * with a check for the same inode, but overlapping extents.
> +	 */
> +	if (src == dst)
> +		return -EINVAL;
> +
> +	btrfs_double_lock(src, loff, dst, dst_loff, len);
> +

Let's see how fill_data is actually used:

> +	ret = fill_data(src, loff, len, &orig_buffer);

Now orig_buffer is allocated from inside fill_data

> +	ret = fill_data(dst, dst_loff, len, &dst_inode_buffer);

And again, although the buffers are just memcmp'ed:

> +	ret = memcmp(orig_buffer, dst_inode_buffer, len);

extents linked:

> +	ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
> +
> +out:
> +	btrfs_double_unlock(src, loff, dst, dst_loff, len);
> +
and thrown away, all of this to be repeated on next call of
btrfs_extent_same:

> +	kfree(dst_inode_buffer);
> +	kfree(orig_buffer);
> +	return ret;
> +}

Apart from the potential kmalloc failure, I think at least the second
fill_data can be replaced with memcpy with orig_buffer. (Possibly both
fill_data/memcpy could be replaced by memcmp reading from both inodes in
parallel.)

> +#define BTRFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
> +#define BTRFS_ONE_DEDUPE_LEN	(1 * 1024 * 1024)
> +
> +static long btrfs_ioctl_file_extent_same(struct file *file,
> +					 void __user *argp)
> +{
> +	struct btrfs_ioctl_same_args *args;
> +	struct btrfs_ioctl_same_args tmp;
> +	struct btrfs_ioctl_same_extent_info *info;
> +	struct inode *src = file->f_dentry->d_inode;
> +	struct file *dst_file = NULL;
> +	struct inode *dst;
> +	u64 off;
> +	u64 len;
> +	int args_size;
> +	int i;
> +	int ret;
> +	u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;

The ioctl is available to non-root, so an extra care should be taken to
potentail overflows etc. I haven't spotted anything so far.

> +	if (copy_from_user(&tmp,
> +			   (struct btrfs_ioctl_same_args __user *)argp,
> +			   sizeof(tmp)))
> +		return -EFAULT;
> +
> +	args_size = sizeof(tmp) + (tmp.dest_count *
> +			sizeof(struct btrfs_ioctl_same_extent_info));

A minor note: computing the size like this works as long as the
structures do not require extra padding between elements, ie the gap at
the end of an element until alignmed start of the next element. That's
ok now and I don't think that the compiler would use alignment larger
than 8 bytes (ie for u64).

size of btrfs_ioctl_same_extent_info is 4x u64, and 'info' starts at 3rd
u64 inside btrfs_ioctl_same_args.

A proposed paranoid-safety check is something like

	struct btrfs_ioctl_same_extent_info dummy[2];

	ASSERT(sizeof(struct btrfs_ioctl_same_extent_info)
		== (&dummy[1] - &dummy[0]))

> +	/* Keep size of ioctl argument sane */
> +	if (args_size > PAGE_CACHE_SIZE)
> +		return -E2BIG;
> +
> +	args = kmalloc(args_size, GFP_NOFS);
> +	if (!args)
> +		return -ENOMEM;
> +
> +	ret = -EFAULT;
> +	if (copy_from_user(args,
> +			   (struct btrfs_ioctl_same_args __user *)argp,
> +			   args_size))
> +		goto out;
> +	/* Make sure args didn't change magically between copies. */

Not much fun left :)

> +	if (memcmp(&tmp, args, sizeof(tmp)))
> +		goto out;
> +
> +	if ((sizeof(tmp) + (sizeof(*info) * args->dest_count)) > args_size)
> +		goto out;
> +
> +	/* pre-format 'out' fields to sane default values */
> +	for (i = 0; i < args->dest_count; i++) {
> +		info = &args->info[i];
> +		info->bytes_deduped = 0;
> +		info->status = 0;
> +	}
> +
> +	off = args->logical_offset;
> +	len = args->length;
> +
> +	/*
> +	 * Limit the total length we will dedupe for each operation. 
> +	 * This is intended to bound the entire ioctl to something sane.
> +	 */
> +	if (len > BTRFS_MAX_DEDUPE_LEN)
> +		len = BTRFS_MAX_DEDUPE_LEN;
> +
> +	ret = -EINVAL;
> +	if (off + len > src->i_size || off + len < off)
> +		goto out;
> +	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
> +		goto out;
> +
> +	ret = -EISDIR;
> +	if (S_ISDIR(src->i_mode))
> +		goto out;
> +
> +	ret = 0;
> +	for (i = 0; i < args->dest_count; i++) {
> +		u64 dest_off;
> +		u64 src_off;
> +		u64 op_len;
> +
> +		info = &args->info[i];
> +
> +		dst_file = fget(info->fd);
> +		if (!dst_file) {
> +			printk(KERN_ERR "btrfs: invalid fd %lld\n", info->fd);

I think adding the keyword 'dedup' to the messages would make it more
clear at which operation it happend.

> +			info->status = -EBADF;
> +			continue;
> +		}
> +
> +		dst = dst_file->f_dentry->d_inode;
> +		if (S_ISDIR(dst->i_mode)) {
> +			printk(KERN_ERR "btrfs: file is dir %lld\n", info->fd);
> +			info->status = -EISDIR;
> +			goto next;
> +		}
> +
> +		info->status = -EINVAL;
> +		if (dst == src) {
> +			printk(KERN_ERR "btrfs: file dup %lld\n", info->fd);
> +			goto next;
> +		}
> +
> +		dest_off = info->logical_offset;
> +
> +		if (dest_off + len > dst->i_size || dest_off + len < dest_off)
> +			goto next;
> +		if (!IS_ALIGNED(dest_off, bs))
> +			goto next;
> +
> +		/*
> +		 * The purpose of this loop is to limit the number of
> +		 * bytes we dedupe during a single call to
> +		 * btrfs_extent_same().
> +		 *
> +		 * In order to memcmp the data we have to allocate a
> +		 * pair of buffers. We don't want to allocate too
> +		 * large a buffer, so limiting the size for each
> +		 * dedupe is an easy way to do this.
> +		 */
> +		src_off = off;
> +		op_len = len;
> +		while (op_len) {
> +			u64 tmp_len;
> +
> +			tmp_len = op_len;
> +			if (op_len > BTRFS_ONE_DEDUPE_LEN)
> +				tmp_len = BTRFS_ONE_DEDUPE_LEN;
> +
> +			info->status = btrfs_extent_same(src, src_off, tmp_len,
> +							 dst, dest_off);
> +			if (info->status == 0) {
> +				info->bytes_deduped += tmp_len;
> +			} else
> +				break;
> +
> +			dest_off += tmp_len;
> +			src_off += tmp_len;
> +			op_len -= tmp_len;
> +		}
> +
> +next:
> +		fput(dst_file);
> +		dst_file = NULL;
> +	}
> +
> +	if (copy_to_user(argp, args, args_size))
> +		ret = -EFAULT;
> +
> +out:
> +	if (dst_file)
> +		fput(dst_file);
> +	kfree(args);
> +	return ret;
> +}
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark Fasheh May 24, 2013, 6:17 p.m. UTC | #2
Hey David, thanks again for the review! Comments are inline below.

On Fri, May 24, 2013 at 04:05:36PM +0200, David Sterba wrote:
> On Tue, May 21, 2013 at 11:28:28AM -0700, Mark Fasheh wrote:
> > +static noinline int fill_data(struct inode *inode, u64 off, u64 len,
> > +			      char **cur_buffer)
> > +{
> > +	struct page *page;
> > +	void *addr;
> > +	char *buffer;
> > +	pgoff_t index;
> > +	pgoff_t last_index;
> > +	int ret = 0;
> > +	int bytes_copied = 0;
> > +	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
> > +
> > +	buffer = kmalloc(len, GFP_NOFS);
> 
> I've looked at the caller chain and len can be as big as 1MB, this is
> likely to fail to allocate. More comments below.

Well it's there in the description that we'll do this 1MB at a time at max :)


> > @@ -2476,6 +2534,236 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
> > +static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
> > +			     struct inode *dst, u64 dst_loff)
> > +{
> > +	char *orig_buffer = NULL;
> > +	char *dst_inode_buffer = NULL;
> > +	int ret;
> > +
> > +	/*
> > +	 * btrfs_clone() can't handle extents in the same file
> > +	 * yet. Once that works, we can drop this check and replace it
> > +	 * with a check for the same inode, but overlapping extents.
> > +	 */
> > +	if (src == dst)
> > +		return -EINVAL;
> > +
> > +	btrfs_double_lock(src, loff, dst, dst_loff, len);
> > +
> 
> Let's see how fill_data is actually used:
> 
> > +	ret = fill_data(src, loff, len, &orig_buffer);
> 
> Now orig_buffer is allocated from inside fill_data
> 
> > +	ret = fill_data(dst, dst_loff, len, &dst_inode_buffer);
> 
> And again, although the buffers are just memcmp'ed:
> 
> > +	ret = memcmp(orig_buffer, dst_inode_buffer, len);
> 
> extents linked:
> 
> > +	ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
> > +
> > +out:
> > +	btrfs_double_unlock(src, loff, dst, dst_loff, len);
> > +
> and thrown away, all of this to be repeated on next call of
> btrfs_extent_same:
> 
> > +	kfree(dst_inode_buffer);
> > +	kfree(orig_buffer);
> > +	return ret;
> > +}
> 
> Apart from the potential kmalloc failure, I think at least the second
> fill_data can be replaced with memcpy with orig_buffer. (Possibly both
> fill_data/memcpy could be replaced by memcmp reading from both inodes in
> parallel.)

We have to do the read (fill_data) on each inode for each iteration. The
design of this is that we only lock down the inodes when we're deduping and
then relock on the next pass. Otherwise we could be doing a ton of work
while preventing access to file data. The obvious downside is that data can
change between locks so we have to always check.

Keeping the kmalloc'd buffers around isn't a bad idea and luckily is trivial
to handle. I can just alloc them up front in the caller and pass them
through on each call.


> > +#define BTRFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
> > +#define BTRFS_ONE_DEDUPE_LEN	(1 * 1024 * 1024)
> > +
> > +static long btrfs_ioctl_file_extent_same(struct file *file,
> > +					 void __user *argp)
> > +{
> > +	struct btrfs_ioctl_same_args *args;
> > +	struct btrfs_ioctl_same_args tmp;
> > +	struct btrfs_ioctl_same_extent_info *info;
> > +	struct inode *src = file->f_dentry->d_inode;
> > +	struct file *dst_file = NULL;
> > +	struct inode *dst;
> > +	u64 off;
> > +	u64 len;
> > +	int args_size;
> > +	int i;
> > +	int ret;
> > +	u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
> 
> The ioctl is available to non-root, so an extra care should be taken to
> potentail overflows etc. I haven't spotted anything so far.


Sure. Actually, you got me thinking about some sanity checks... I need to
add at least this check:

	if (btrfs_root_readonly(root))
		return -EROFS;

which isn't in there as of now.


Also I don't really check the open mode (read, write, etc) on files passed
in. We do this in the clone ioctl and it makes sense there since data (to
the user) can change. With this ioctl though data won't ever change (even if
the underlying extent does). So I left the checks out. A part of me is
thinking we might want to be conservative to start with though and just add
those type of checks in. Basically, I figure the source should be open for
read at least and target files need write access.


> > +	if (copy_from_user(&tmp,
> > +			   (struct btrfs_ioctl_same_args __user *)argp,
> > +			   sizeof(tmp)))
> > +		return -EFAULT;
> > +
> > +	args_size = sizeof(tmp) + (tmp.dest_count *
> > +			sizeof(struct btrfs_ioctl_same_extent_info));
> 
> A minor note: computing the size like this works as long as the
> structures do not require extra padding between elements, ie the gap at
> the end of an element until alignmed start of the next element. That's
> ok now and I don't think that the compiler would use alignment larger
> than 8 bytes (ie for u64).
> 
> size of btrfs_ioctl_same_extent_info is 4x u64, and 'info' starts at 3rd
> u64 inside btrfs_ioctl_same_args.
> 
> A proposed paranoid-safety check is something like
> 
> 	struct btrfs_ioctl_same_extent_info dummy[2];
> 
> 	ASSERT(sizeof(struct btrfs_ioctl_same_extent_info)
> 		== (&dummy[1] - &dummy[0]))

Maybe BUILD_BUG_ON is even better actually. If someone messed this up the
build would catch it and we'd never even get a module with bad checks.

I'll have to think up how to catch it in that context though.


> > +
> > +	ret = 0;
> > +	for (i = 0; i < args->dest_count; i++) {
> > +		u64 dest_off;
> > +		u64 src_off;
> > +		u64 op_len;
> > +
> > +		info = &args->info[i];
> > +
> > +		dst_file = fget(info->fd);
> > +		if (!dst_file) {
> > +			printk(KERN_ERR "btrfs: invalid fd %lld\n", info->fd);
> 
> I think adding the keyword 'dedup' to the messages would make it more
> clear at which operation it happend.

Good idea.... Btw, I was actually thinking of just ditching those printks in
the next iteration of this series. We don't tend to printk as often in other
parts of the btrfs code. Any opinion on that? :)
	--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gabriel de Perthuis May 24, 2013, 7:50 p.m. UTC | #3
>>> +#define BTRFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
>>> +#define BTRFS_ONE_DEDUPE_LEN	(1 * 1024 * 1024)
>>> +
>>> +static long btrfs_ioctl_file_extent_same(struct file *file,
>>> +					 void __user *argp)
>>> +{
>>> +	struct btrfs_ioctl_same_args *args;
>>> +	struct btrfs_ioctl_same_args tmp;
>>> +	struct btrfs_ioctl_same_extent_info *info;
>>> +	struct inode *src = file->f_dentry->d_inode;
>>> +	struct file *dst_file = NULL;
>>> +	struct inode *dst;
>>> +	u64 off;
>>> +	u64 len;
>>> +	int args_size;
>>> +	int i;
>>> +	int ret;
>>> +	u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
>>
>> The ioctl is available to non-root, so an extra care should be taken to
>> potentail overflows etc. I haven't spotted anything so far.
> 
> 
> Sure. Actually, you got me thinking about some sanity checks... I need to
> add at least this check:
> 
> 	if (btrfs_root_readonly(root))
> 		return -EROFS;
> 
> which isn't in there as of now.

It's not needed and I'd rather do without, read-only snapshots and deduplication go together well for backups.
Data and metadata are guaranteed to be immutable, extent storage isn't.  This is also the case with raid.


> Also I don't really check the open mode (read, write, etc) on files passed
> in. We do this in the clone ioctl and it makes sense there since data (to
> the user) can change. With this ioctl though data won't ever change (even if
> the underlying extent does). So I left the checks out. A part of me is
> thinking we might want to be conservative to start with though and just add
> those type of checks in. Basically, I figure the source should be open for
> read at least and target files need write access.

I don't know of any privileged files that one would be able to open(2), but if this is available to unprivileged users the files all need to be open for reading so that it can't be used to guess at their contents.
As long as root gets to bypass the checks (no btrfs_root_readonly) it doesn't hurt my use case.

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark Fasheh May 24, 2013, 10:38 p.m. UTC | #4
On Fri, May 24, 2013 at 09:50:14PM +0200, Gabriel de Perthuis wrote:
> > Sure. Actually, you got me thinking about some sanity checks... I need to
> > add at least this check:
> > 
> > 	if (btrfs_root_readonly(root))
> > 		return -EROFS;
> > 
> > which isn't in there as of now.
> 
> It's not needed and I'd rather do without, read-only snapshots and deduplication go together well for backups.
> Data and metadata are guaranteed to be immutable, extent storage isn't.  This is also the case with raid.

You're absolutely right - I miswrote the check I meant.
Specifically, I was thinking about when the entire fs is readonly due to
either some error or the user mounted with -oro. So something more like:

	if (root->fs_info->sb->s_flags & MS_RDONLY)
		return -EROFS;

I think that should be reasonable and wouldn't affect most use cases,
right?


> > Also I don't really check the open mode (read, write, etc) on files passed
> > in. We do this in the clone ioctl and it makes sense there since data (to
> > the user) can change. With this ioctl though data won't ever change (even if
> > the underlying extent does). So I left the checks out. A part of me is
> > thinking we might want to be conservative to start with though and just add
> > those type of checks in. Basically, I figure the source should be open for
> > read at least and target files need write access.
> 
> I don't know of any privileged files that one would be able to open(2),
> but if this is available to unprivileged users the files all need to be
> open for reading so that it can't be used to guess at their contents. As
> long as root gets to bypass the checks (no btrfs_root_readonly) it doesn't
> hurt my use case.

Oh ok so this seems to make sense. How does this logic sound:

We're not going to worry about write access since it would be entirely
reasonable for the user to want to do this on a readonly submount
(specifically for the purpose of deduplicating backups).

Read access needs to be provided however so we know that the user has access
to the file data.

So basically, if a user can open any files for read, they can check their
contents and dedupe them.

Letting users dedupe files in say, /etc seems kind of weird to me but I'm
struggling to come up with a good explanation of why that should mean we
limit this ioctl to root.
	--Mark

--
Mark Fasheh
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Gabriel de Perthuis May 24, 2013, 11:36 p.m. UTC | #5
Le sam. 25 mai 2013 00:38:27 CEST, Mark Fasheh a écrit :
> On Fri, May 24, 2013 at 09:50:14PM +0200, Gabriel de Perthuis wrote:
>>> Sure. Actually, you got me thinking about some sanity checks... I need to
>>> add at least this check:
>>>
>>> 	if (btrfs_root_readonly(root))
>>> 		return -EROFS;
>>>
>>> which isn't in there as of now.
>>
>> It's not needed and I'd rather do without, read-only snapshots and deduplication go together well for backups.
>> Data and metadata are guaranteed to be immutable, extent storage isn't.  This is also the case with raid.
>
> You're absolutely right - I miswrote the check I meant.
> Specifically, I was thinking about when the entire fs is readonly due to
> either some error or the user mounted with -oro. So something more like:
>
> 	if (root->fs_info->sb->s_flags & MS_RDONLY)
> 		return -EROFS;
>
> I think that should be reasonable and wouldn't affect most use cases,
> right?

That's all right.

>>> Also I don't really check the open mode (read, write, etc) on files passed
>>> in. We do this in the clone ioctl and it makes sense there since data (to
>>> the user) can change. With this ioctl though data won't ever change (even if
>>> the underlying extent does). So I left the checks out. A part of me is
>>> thinking we might want to be conservative to start with though and just add
>>> those type of checks in. Basically, I figure the source should be open for
>>> read at least and target files need write access.
>>
>> I don't know of any privileged files that one would be able to open(2),
>> but if this is available to unprivileged users the files all need to be
>> open for reading so that it can't be used to guess at their contents. As
>> long as root gets to bypass the checks (no btrfs_root_readonly) it doesn't
>> hurt my use case.
>
> Oh ok so this seems to make sense. How does this logic sound:
>
> We're not going to worry about write access since it would be entirely
> reasonable for the user to want to do this on a readonly submount
> (specifically for the purpose of deduplicating backups).
>
> Read access needs to be provided however so we know that the user has access
> to the file data.
>
> So basically, if a user can open any files for read, they can check their
> contents and dedupe them.
>
> Letting users dedupe files in say, /etc seems kind of weird to me but I'm
> struggling to come up with a good explanation of why that should mean we
> limit this ioctl to root.
> 	--Mark

I agree with that model.  Most of the code is shared with clone (and the
copy_range RFC) which are unprivileged, so it doesn't increase the potential
surface for bugs much.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e90c519..54fcb90 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -57,6 +57,9 @@ 
 #include "send.h"
 #include "dev-replace.h"
 
+static int btrfs_clone(struct inode *src, struct inode *inode,
+		       u64 off, u64 olen, u64 olen_aligned, u64 destoff);
+
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
 {
@@ -2456,6 +2459,61 @@  out:
 	return ret;
 }
 
+static noinline int fill_data(struct inode *inode, u64 off, u64 len,
+			      char **cur_buffer)
+{
+	struct page *page;
+	void *addr;
+	char *buffer;
+	pgoff_t index;
+	pgoff_t last_index;
+	int ret = 0;
+	int bytes_copied = 0;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+
+	buffer = kmalloc(len, GFP_NOFS);
+	if (!buffer)
+		return -ENOMEM;
+
+	index = off >> PAGE_CACHE_SHIFT;
+	last_index = (off + len - 1) >> PAGE_CACHE_SHIFT;
+
+	while (index <= last_index) {
+		page = grab_cache_page(inode->i_mapping, index);
+		if (!page) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		if (!PageUptodate(page)) {
+			extent_read_full_page_nolock(tree, page,
+						     btrfs_get_extent, 0);
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				unlock_page(page);
+				page_cache_release(page);
+				ret = -EINVAL;
+				goto out;
+			}
+		}
+
+		addr = kmap(page);
+		memcpy(buffer + bytes_copied, addr, PAGE_CACHE_SIZE);
+		kunmap(page);
+		unlock_page(page);
+		page_cache_release(page);
+		bytes_copied += PAGE_CACHE_SIZE;
+		index++;
+	}
+
+	*cur_buffer = buffer;
+
+out:
+	if (ret)
+		kfree(buffer);
+	return ret;
+}
+
 static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
 {
 	/* do any pending delalloc/csum calc on src, one way or
@@ -2476,6 +2534,236 @@  static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
 	}
 }
 
+static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
+				struct inode *inode2, u64 loff2, u64 len)
+{
+	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+
+	mutex_unlock(&inode1->i_mutex);
+	mutex_unlock(&inode2->i_mutex);
+}
+
+static void btrfs_double_lock(struct inode *inode1, u64 loff1,
+			      struct inode *inode2, u64 loff2, u64 len)
+{
+	if (inode1 < inode2) {
+		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+		lock_extent_range(inode1, loff1, len);
+		lock_extent_range(inode2, loff2, len);
+	} else {
+		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD);
+		lock_extent_range(inode2, loff2, len);
+		lock_extent_range(inode1, loff1, len);
+	}
+}
+
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
+			     struct inode *dst, u64 dst_loff)
+{
+	char *orig_buffer = NULL;
+	char *dst_inode_buffer = NULL;
+	int ret;
+
+	/*
+	 * btrfs_clone() can't handle extents in the same file
+	 * yet. Once that works, we can drop this check and replace it
+	 * with a check for the same inode, but overlapping extents.
+	 */
+	if (src == dst)
+		return -EINVAL;
+
+	btrfs_double_lock(src, loff, dst, dst_loff, len);
+
+	ret = fill_data(src, loff, len, &orig_buffer);
+	if (ret) {
+		printk(KERN_ERR "btrfs: unable to source populate data "
+		       "buffer.\n");
+		goto out;
+	}
+
+	ret = fill_data(dst, dst_loff, len, &dst_inode_buffer);
+	if (ret) {
+		printk(KERN_ERR "btrfs: unable to populate destination data "
+		       "buffer.\n");
+		goto out;
+	}
+
+	ret = memcmp(orig_buffer, dst_inode_buffer, len);
+	if (ret) {
+		ret = BTRFS_SAME_DATA_DIFFERS;
+		printk(KERN_ERR "btrfs: data for inode %lu does not "
+		       "match\n", dst->i_ino);
+		goto out;
+	}
+
+	ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
+
+out:
+	btrfs_double_unlock(src, loff, dst, dst_loff, len);
+
+	kfree(dst_inode_buffer);
+	kfree(orig_buffer);
+	return ret;
+}
+
+#define BTRFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
+#define BTRFS_ONE_DEDUPE_LEN	(1 * 1024 * 1024)
+
+static long btrfs_ioctl_file_extent_same(struct file *file,
+					 void __user *argp)
+{
+	struct btrfs_ioctl_same_args *args;
+	struct btrfs_ioctl_same_args tmp;
+	struct btrfs_ioctl_same_extent_info *info;
+	struct inode *src = file->f_dentry->d_inode;
+	struct file *dst_file = NULL;
+	struct inode *dst;
+	u64 off;
+	u64 len;
+	int args_size;
+	int i;
+	int ret;
+	u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+
+	if (copy_from_user(&tmp,
+			   (struct btrfs_ioctl_same_args __user *)argp,
+			   sizeof(tmp)))
+		return -EFAULT;
+
+	args_size = sizeof(tmp) + (tmp.dest_count *
+			sizeof(struct btrfs_ioctl_same_extent_info));
+
+	/* Keep size of ioctl argument sane */
+	if (args_size > PAGE_CACHE_SIZE)
+		return -E2BIG;
+
+	args = kmalloc(args_size, GFP_NOFS);
+	if (!args)
+		return -ENOMEM;
+
+	ret = -EFAULT;
+	if (copy_from_user(args,
+			   (struct btrfs_ioctl_same_args __user *)argp,
+			   args_size))
+		goto out;
+	/* Make sure args didn't change magically between copies. */
+	if (memcmp(&tmp, args, sizeof(tmp)))
+		goto out;
+
+	if ((sizeof(tmp) + (sizeof(*info) * args->dest_count)) > args_size)
+		goto out;
+
+	/* pre-format 'out' fields to sane default values */
+	for (i = 0; i < args->dest_count; i++) {
+		info = &args->info[i];
+		info->bytes_deduped = 0;
+		info->status = 0;
+	}
+
+	off = args->logical_offset;
+	len = args->length;
+
+	/*
+	 * Limit the total length we will dedupe for each operation. 
+	 * This is intended to bound the entire ioctl to something sane.
+	 */
+	if (len > BTRFS_MAX_DEDUPE_LEN)
+		len = BTRFS_MAX_DEDUPE_LEN;
+
+	ret = -EINVAL;
+	if (off + len > src->i_size || off + len < off)
+		goto out;
+	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
+		goto out;
+
+	ret = -EISDIR;
+	if (S_ISDIR(src->i_mode))
+		goto out;
+
+	ret = 0;
+	for (i = 0; i < args->dest_count; i++) {
+		u64 dest_off;
+		u64 src_off;
+		u64 op_len;
+
+		info = &args->info[i];
+
+		dst_file = fget(info->fd);
+		if (!dst_file) {
+			printk(KERN_ERR "btrfs: invalid fd %lld\n", info->fd);
+			info->status = -EBADF;
+			continue;
+		}
+
+		dst = dst_file->f_dentry->d_inode;
+		if (S_ISDIR(dst->i_mode)) {
+			printk(KERN_ERR "btrfs: file is dir %lld\n", info->fd);
+			info->status = -EISDIR;
+			goto next;
+		}
+
+		info->status = -EINVAL;
+		if (dst == src) {
+			printk(KERN_ERR "btrfs: file dup %lld\n", info->fd);
+			goto next;
+		}
+
+		dest_off = info->logical_offset;
+
+		if (dest_off + len > dst->i_size || dest_off + len < dest_off)
+			goto next;
+		if (!IS_ALIGNED(dest_off, bs))
+			goto next;
+
+		/*
+		 * The purpose of this loop is to limit the number of
+		 * bytes we dedupe during a single call to
+		 * btrfs_extent_same().
+		 *
+		 * In order to memcmp the data we have to allocate a
+		 * pair of buffers. We don't want to allocate too
+		 * large a buffer, so limiting the size for each
+		 * dedupe is an easy way to do this.
+		 */
+		src_off = off;
+		op_len = len;
+		while (op_len) {
+			u64 tmp_len;
+
+			tmp_len = op_len;
+			if (op_len > BTRFS_ONE_DEDUPE_LEN)
+				tmp_len = BTRFS_ONE_DEDUPE_LEN;
+
+			info->status = btrfs_extent_same(src, src_off, tmp_len,
+							 dst, dest_off);
+			if (info->status == 0) {
+				info->bytes_deduped += tmp_len;
+			} else
+				break;
+
+			dest_off += tmp_len;
+			src_off += tmp_len;
+			op_len -= tmp_len;
+		}
+
+next:
+		fput(dst_file);
+		dst_file = NULL;
+	}
+
+	if (copy_to_user(argp, args, args_size))
+		ret = -EFAULT;
+
+out:
+	if (dst_file)
+		fput(dst_file);
+	kfree(args);
+	return ret;
+}
+
 /**
  * btrfs_clone() - clone a range from inode file to another
  *
@@ -4151,6 +4439,8 @@  long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_get_fslabel(file, argp);
 	case BTRFS_IOC_SET_FSLABEL:
 		return btrfs_ioctl_set_fslabel(file, argp);
+	case BTRFS_IOC_FILE_EXTENT_SAME:
+		return btrfs_ioctl_file_extent_same(file, argp);
 	}
 
 	return -ENOTTY;
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index fa3a5f9..5465bc2 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -305,6 +305,31 @@  struct btrfs_ioctl_clone_range_args {
 #define BTRFS_DEFRAG_RANGE_COMPRESS 1
 #define BTRFS_DEFRAG_RANGE_START_IO 2
 
+#define BTRFS_SAME_DATA_DIFFERS	1
+/* For extent-same ioctl */
+struct btrfs_ioctl_same_extent_info {
+	__s64 fd;		/* in - destination file */
+	__u64 logical_offset;	/* in - start of extent in destination */
+	__u64 bytes_deduped;	/* out - total # of bytes we were able
+				 * to dedupe from this file */
+	/* status of this dedupe operation:
+	 * 0 if dedup succeeds
+	 * < 0 for error
+	 * == BTRFS_SAME_DATA_DIFFERS if data differs
+	 */
+	__s32 status;		/* out - see above description */
+	__u32 reserved;
+};
+
+struct btrfs_ioctl_same_args {
+	__u64 logical_offset;	/* in - start of extent in source */
+	__u64 length;		/* in - length of extent */
+	__u16 dest_count;	/* in - total elements in info array */
+	__u16 reserved1;
+	__u32 reserved2;
+	struct btrfs_ioctl_same_extent_info info[0];
+};
+
 struct btrfs_ioctl_space_info {
 	__u64 flags;
 	__u64 total_bytes;
@@ -510,5 +535,7 @@  struct btrfs_ioctl_send_args {
 				      struct btrfs_ioctl_get_dev_stats)
 #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
 				    struct btrfs_ioctl_dev_replace_args)
+#define BTRFS_IOC_FILE_EXTENT_SAME _IOWR(BTRFS_IOCTL_MAGIC, 54, \
+					 struct btrfs_ioctl_same_args)
 
 #endif /* _UAPI_LINUX_BTRFS_H */