Add support for RENAME_{EXCHANGE,WHITEOUT}
diff mbox

Message ID CAO8L9g5qet=wcqYFusNtVOZ8GJ_ZcQp++frjZmyFRNhKZ3xCbA@mail.gmail.com
State New
Headers show

Commit Message

Dan Fuhry Jan. 28, 2016, 5:40 p.m. UTC
Google didn't attach the patch the second time around. Herp-a-derp
derp, sorry for the spam.

On Thu, Jan 28, 2016 at 10:52 AM, Dan Fuhry <dan@fuhry.com> wrote:
> Add support for the RENAME_WHITEOUT and RENAME_EXCHANGE flags in
> renameat2(). This brings us pretty close to having btrfs ready to be
> an upper layer for overlayfs. (The last remaining issue is in
> btrfs_sync_file, which I'm looking at next if I have time.)
>
> This includes Davide Italiano's implementation of RENAME_EXCHANGE. I
> quickly pinged him on github [1] last week to confirm resubmission of
> his patch which originally went to this list on 2 April 2015 and
> didn't make it in.
>
> This is my first time submitting a kernel patch, I think I've covered
> all the style recommendations, and also addressed Filipe David
> Manana's review notes [2] on Davide Italiano's patch. Certainly let me
> know if there's anything else that should be cleaned up. Thanks!
>
> [1] https://github.com/fuhry/linux/commit/9a7300d973a192193857c595997f292a14c14197#commitcomment-15570553
> [2] https://www.marc.info/?l=linux-btrfs&m=142796914520768&w=3
>
> --
> Dan Fuhry
> Senior Software Engineer, Datto Inc
> (203) 529-4949 x402 / dfuhry@datto.com

Patch
diff mbox

btrfs: add support for RENAME_EXCHANGE and RENAME_WHITEOUT

Two new flags, RENAME_EXCHANGE and RENAME_WHITEOUT, provide for new
behavior in the renameat2() syscall. This behavior is primarily used by
overlayfs. This patch adds support for these flags to btrfs, enabling it to
be used as a fully functional upper layer for overlayfs.

RENAME_EXCHANGE support was written by Davide Italiano
<dccitaliano@gmail.com>, originally submitted on 2 April 2015.

Signed-off-by: Dan Fuhry <dfuhry@datto.com>

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a70c579..274f854a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9220,8 +9220,246 @@  static int btrfs_getattr(struct vfsmount *mnt,
 	return 0;
 }
 
+static int btrfs_rename_exchange(struct inode *old_dir,
+			      struct dentry *old_dentry,
+			      struct inode *new_dir,
+			      struct dentry *new_dentry)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = BTRFS_I(old_dir)->root;
+	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
+	struct inode *new_inode = new_dentry->d_inode;
+	struct inode *old_inode = old_dentry->d_inode;
+	struct timespec ctime = CURRENT_TIME;
+	struct dentry *parent;
+	u64 old_ino = btrfs_ino(old_inode);
+	u64 new_ino = btrfs_ino(new_inode);
+	u64 old_idx = 0;
+	u64 new_idx = 0;
+	u64 root_objectid;
+	int ret;
+
+	/* we only allow rename subvolume link between subvolumes */
+	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+		return -EXDEV;
+
+	/* close the racy window with snapshot create/destroy ioctl */
+	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+		down_read(&root->fs_info->subvol_sem);
+	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+		down_read(&dest->fs_info->subvol_sem);
+
+	/*
+	 * We want to reserve the absolute worst case amount of items.  So if
+	 * both inodes are subvols and we need to unlink them then that would
+	 * require 4 item modifications, but if they are both normal inodes it
+	 * would require 5 item modifications, so we'll assume their normal
+	 * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
+	 * should cover the worst case number of items we'll modify.
+	 */
+	trans = btrfs_start_transaction(root, 12);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_notrans;
+	}
+
+	/*
+	 * We need to find a free sequence number both in the source and
+	 * in the destination directory for the exchange.
+	 */
+	ret = btrfs_set_inode_index(new_dir, &old_idx);
+	if (ret)
+		goto out_fail;
+	ret = btrfs_set_inode_index(old_dir, &new_idx);
+	if (ret)
+		goto out_fail;
+
+	BTRFS_I(old_inode)->dir_index = 0ULL;
+	BTRFS_I(new_inode)->dir_index = 0ULL;
+
+	/* Reference for the source. */
+	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+		/* force full log commit if subvolume involved. */
+		btrfs_set_log_full_commit(root->fs_info, trans);
+	} else {
+		ret = btrfs_insert_inode_ref(trans, dest,
+					     new_dentry->d_name.name,
+					     new_dentry->d_name.len,
+					     old_ino,
+					     btrfs_ino(new_dir), old_idx);
+		if (ret)
+			goto out_fail;
+		btrfs_pin_log_trans(root);
+	}
+
+	/* And now for the dest. */
+	if (unlikely(new_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+		/* force full log commit if subvolume involved. */
+		btrfs_set_log_full_commit(dest->fs_info, trans);
+	} else {
+		ret = btrfs_insert_inode_ref(trans, root,
+					     old_dentry->d_name.name,
+					     old_dentry->d_name.len,
+					     new_ino,
+					     btrfs_ino(old_dir), new_idx);
+		if (ret)
+			goto out_fail;
+		btrfs_pin_log_trans(dest);
+	}
+
+	/*
+	 * Update i-node version and ctime/mtime.
+	 */
+	inode_inc_iversion(old_dir);
+	inode_inc_iversion(new_dir);
+	inode_inc_iversion(old_inode);
+	inode_inc_iversion(new_inode);
+	old_dir->i_ctime = old_dir->i_mtime = ctime;
+	new_dir->i_ctime = new_dir->i_mtime = ctime;
+	old_inode->i_ctime = ctime;
+	new_inode->i_ctime = ctime;
+
+	if (old_dentry->d_parent != new_dentry->d_parent) {
+		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+		btrfs_record_unlink_dir(trans, new_dir, new_inode, 1);
+	}
+
+	/* src is a subvolume */
+	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
+		ret = btrfs_unlink_subvol(trans, root, old_dir,
+					  root_objectid,
+					  old_dentry->d_name.name,
+					  old_dentry->d_name.len);
+	} else { /* src is a inode */
+		ret = __btrfs_unlink_inode(trans, root, old_dir,
+					   old_dentry->d_inode,
+					   old_dentry->d_name.name,
+					   old_dentry->d_name.len);
+		if (!ret)
+			ret = btrfs_update_inode(trans, root, old_inode);
+	}
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_fail;
+	}
+
+	/* dest is a subvolume */
+	if (unlikely(new_ino == BTRFS_FIRST_FREE_OBJECTID)) {
+		root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
+		ret = btrfs_unlink_subvol(trans, dest, new_dir,
+					  root_objectid,
+					  new_dentry->d_name.name,
+					  new_dentry->d_name.len);
+	} else { /* dest is an inode */
+		ret = __btrfs_unlink_inode(trans, dest, new_dir,
+					   new_dentry->d_inode,
+					   new_dentry->d_name.name,
+					   new_dentry->d_name.len);
+		if (!ret)
+			ret = btrfs_update_inode(trans, dest, new_inode);
+	}
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_fail;
+	}
+
+	ret = btrfs_add_link(trans, new_dir, old_inode,
+			     new_dentry->d_name.name,
+			     new_dentry->d_name.len, 0, old_idx);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_fail;
+	}
+
+	ret = btrfs_add_link(trans, old_dir, new_inode,
+			     old_dentry->d_name.name,
+			     old_dentry->d_name.len, 0, new_idx);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_fail;
+	}
+
+	if (old_inode->i_nlink == 1)
+		BTRFS_I(old_inode)->dir_index = old_idx;
+	if (new_inode->i_nlink == 1)
+		BTRFS_I(new_inode)->dir_index = new_idx;
+
+	if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+		parent = new_dentry->d_parent;
+		btrfs_log_new_name(trans, old_inode, old_dir, parent);
+		btrfs_end_log_trans(root);
+	}
+	if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+		parent = old_dentry->d_parent;
+		btrfs_log_new_name(trans, new_inode, new_dir, parent);
+		btrfs_end_log_trans(dest);
+	}
+out_fail:
+	ret = btrfs_end_transaction(trans, root);
+out_notrans:
+	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+		up_read(&dest->fs_info->subvol_sem);
+	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+		up_read(&root->fs_info->subvol_sem);
+
+	return ret;
+}
+
+static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     struct inode *dir,
+				     struct dentry *dentry)
+{
+	int ret;
+	struct inode *inode;
+	u64 objectid;
+	u64 index;
+
+	ret = btrfs_find_free_ino(root, &objectid);
+	if (ret)
+		return ret;
+
+	inode = btrfs_new_inode(trans, root, dir,
+				dentry->d_name.name,
+				dentry->d_name.len,
+				btrfs_ino(dir),
+				objectid,
+				S_IFCHR | WHITEOUT_MODE,
+				&index);
+
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		return ret;
+	}
+
+	inode->i_op = &btrfs_special_inode_operations;
+	init_special_inode(inode, inode->i_mode,
+		WHITEOUT_DEV);
+
+	ret = btrfs_init_inode_security(trans, inode, dir,
+				&dentry->d_name);
+	if (ret)
+		return ret;
+
+	ret = btrfs_add_nondir(trans, dir, dentry,
+				inode, 0, index);
+	if (ret)
+		return ret;
+
+	ret = btrfs_update_inode(trans, root, inode);
+	if (ret)
+		return ret;
+
+	unlock_new_inode(inode);
+	iput(inode);
+
+	return 0;
+}
+
 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
-			   struct inode *new_dir, struct dentry *new_dentry)
+			   struct inode *new_dir, struct dentry *new_dentry,
+			   unsigned int flags)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
@@ -9283,15 +9521,15 @@  static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 * We want to reserve the absolute worst case amount of items.  So if
 	 * both inodes are subvols and we need to unlink them then that would
 	 * require 4 item modifications, but if they are both normal inodes it
-	 * would require 5 item modifications, so we'll assume their normal
+	 * would require 5 item modifications, so we'll assume they are normal
 	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
 	 * should cover the worst case number of items we'll modify.
 	 */
 	trans = btrfs_start_transaction(root, 11);
 	if (IS_ERR(trans)) {
-                ret = PTR_ERR(trans);
-                goto out_notrans;
-        }
+		ret = PTR_ERR(trans);
+		goto out_notrans;
+	}
 
 	if (dest != root)
 		btrfs_record_root_in_trans(trans, dest);
@@ -9391,6 +9629,16 @@  static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		btrfs_log_new_name(trans, old_inode, old_dir, parent);
 		btrfs_end_log_trans(root);
 	}
+
+	if (flags & RENAME_WHITEOUT) {
+		ret = btrfs_whiteout_for_rename(trans, root, old_dir,
+						old_dentry);
+
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out_fail;
+		}
+	}
 out_fail:
 	btrfs_end_transaction(trans, root);
 out_notrans:
@@ -9404,10 +9652,14 @@  static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
 			 struct inode *new_dir, struct dentry *new_dentry,
 			 unsigned int flags)
 {
-	if (flags & ~RENAME_NOREPLACE)
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
 		return -EINVAL;
 
-	return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry);
+	if (flags & RENAME_EXCHANGE)
+		return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
+					  new_dentry);
+
+	return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
 }
 
 static void btrfs_run_delalloc_work(struct btrfs_work *work)