diff mbox

[RFC,v3,1/2] fs: add AT_REPLACE flag for linkat() which replaces the target

Message ID eac9480f80c689504148b5d658ee4218cc1e421e.1524549513.git.osandov@fb.com (mailing list archive)
State New, archived
Headers show

Commit Message

Omar Sandoval April 24, 2018, 6:19 a.m. UTC
From: Omar Sandoval <osandov@fb.com>

One of the most common uses of temporary files is the classic atomic
replacement pattern, i.e.,

- write temporary file
- fsync temporary file
- rename temporary file over real file
- fsync parent directory

Now, we have O_TMPFILE, which gives us a much better way to create
temporary files, but it's not possible to use it for this pattern.

This patch introduces an AT_REPLACE flag which allows linkat() to
replace the target file. Now, the temporary file in the pattern above
can be a proper O_TMPFILE. Even without O_TMPFILE, this is a new
primitive which might be useful in other contexts.

The implementation on the VFS side mimics sys_renameat2().

Cc: Xi Wang <xi@cs.washington.edu>
Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 fs/ecryptfs/inode.c        |   2 +-
 fs/namei.c                 | 181 +++++++++++++++++++++++++++++--------
 fs/nfsd/vfs.c              |   2 +-
 fs/overlayfs/overlayfs.h   |   2 +-
 include/linux/fs.h         |   3 +-
 include/uapi/linux/fcntl.h |   1 +
 6 files changed, 150 insertions(+), 41 deletions(-)

Comments

Omar Sandoval April 24, 2018, 10:14 p.m. UTC | #1
On Mon, Apr 23, 2018 at 11:19:41PM -0700, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> One of the most common uses of temporary files is the classic atomic
> replacement pattern, i.e.,
> 
> - write temporary file
> - fsync temporary file
> - rename temporary file over real file
> - fsync parent directory
> 
> Now, we have O_TMPFILE, which gives us a much better way to create
> temporary files, but it's not possible to use it for this pattern.
> 
> This patch introduces an AT_REPLACE flag which allows linkat() to
> replace the target file. Now, the temporary file in the pattern above
> can be a proper O_TMPFILE. Even without O_TMPFILE, this is a new
> primitive which might be useful in other contexts.
> 
> The implementation on the VFS side mimics sys_renameat2().
> 
> Cc: Xi Wang <xi@cs.washington.edu>
> Signed-off-by: Omar Sandoval <osandov@fb.com>
> ---
>  fs/ecryptfs/inode.c        |   2 +-
>  fs/namei.c                 | 181 +++++++++++++++++++++++++++++--------
>  fs/nfsd/vfs.c              |   2 +-
>  fs/overlayfs/overlayfs.h   |   2 +-
>  include/linux/fs.h         |   3 +-
>  include/uapi/linux/fcntl.h |   1 +
>  6 files changed, 150 insertions(+), 41 deletions(-)
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index 186bd2464fd5..2cc2b1deaa12 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -4149,6 +4149,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
>   * @dir:	new parent
>   * @new_dentry:	where to create the new link
>   * @delegated_inode: returns inode needing a delegation break
> + * @flags:      link flags
>   *
>   * The caller must hold dir->i_mutex
>   *
> @@ -4162,16 +4163,26 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
>   * be appropriate for callers that expect the underlying filesystem not
>   * to be NFS exported.
>   */
> -int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
> +int vfs_link(struct dentry *old_dentry, struct inode *dir,
> +	     struct dentry *new_dentry, struct inode **delegated_inode,
> +	     unsigned int flags)
>  {
>  	struct inode *inode = old_dentry->d_inode;
> +	struct inode *target = new_dentry->d_inode;
>  	unsigned max_links = dir->i_sb->s_max_links;
>  	int error;
>  
>  	if (!inode)
>  		return -ENOENT;
>  
> -	error = may_create(dir, new_dentry);
> +	if (target) {
> +		if (flags & AT_REPLACE)

This needs an

	if (inode == target)
		return 0;
		
I'll fix this in v4.

> +			error = may_delete(dir, new_dentry, d_is_dir(old_dentry));
> +		else
> +			error = -EEXIST;
> +	} else {
> +		error = may_create(dir, new_dentry);
> +	}
>  	if (error)
>  		return error;
>
diff mbox

Patch

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 97d17eaeba07..d3d29bf5d6b7 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -437,7 +437,7 @@  static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
 	dget(lower_new_dentry);
 	lower_dir_dentry = lock_parent(lower_new_dentry);
 	rc = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry),
-		      lower_new_dentry, NULL);
+		      lower_new_dentry, NULL, 0);
 	if (rc || d_really_is_negative(lower_new_dentry))
 		goto out_lock;
 	rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
diff --git a/fs/namei.c b/fs/namei.c
index 186bd2464fd5..2cc2b1deaa12 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4149,6 +4149,7 @@  SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
  * @dir:	new parent
  * @new_dentry:	where to create the new link
  * @delegated_inode: returns inode needing a delegation break
+ * @flags:      link flags
  *
  * The caller must hold dir->i_mutex
  *
@@ -4162,16 +4163,26 @@  SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
  * be appropriate for callers that expect the underlying filesystem not
  * to be NFS exported.
  */
-int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
+int vfs_link(struct dentry *old_dentry, struct inode *dir,
+	     struct dentry *new_dentry, struct inode **delegated_inode,
+	     unsigned int flags)
 {
 	struct inode *inode = old_dentry->d_inode;
+	struct inode *target = new_dentry->d_inode;
 	unsigned max_links = dir->i_sb->s_max_links;
 	int error;
 
 	if (!inode)
 		return -ENOENT;
 
-	error = may_create(dir, new_dentry);
+	if (target) {
+		if (flags & AT_REPLACE)
+			error = may_delete(dir, new_dentry, d_is_dir(old_dentry));
+		else
+			error = -EEXIST;
+	} else {
+		error = may_create(dir, new_dentry);
+	}
 	if (error)
 		return error;
 
@@ -4190,8 +4201,10 @@  int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	 */
 	if (HAS_UNMAPPED_ID(inode))
 		return -EPERM;
-	if (!dir->i_op->link)
+	if (!dir->i_op->link && !dir->i_op->link2)
 		return -EPERM;
+	if (flags && !dir->i_op->link2)
+		return -EINVAL;
 	if (S_ISDIR(inode->i_mode))
 		return -EPERM;
 
@@ -4199,26 +4212,58 @@  int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	if (error)
 		return error;
 
-	inode_lock(inode);
+	dget(new_dentry);
+	lock_two_nondirectories(inode, target);
+
+	if (is_local_mountpoint(new_dentry)) {
+		error = -EBUSY;
+		goto out;
+	}
+
 	/* Make sure we don't allow creating hardlink to an unlinked file */
-	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
+	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) {
 		error =  -ENOENT;
-	else if (max_links && inode->i_nlink >= max_links)
+		goto out;
+	}
+	if (max_links && inode->i_nlink >= max_links) {
 		error = -EMLINK;
-	else {
-		error = try_break_deleg(inode, delegated_inode);
-		if (!error)
-			error = dir->i_op->link(old_dentry, dir, new_dentry);
+		goto out;
+	}
+
+	error = try_break_deleg(inode, delegated_inode);
+	if (error)
+		goto out;
+	if (target) {
+		error = try_break_deleg(target, delegated_inode);
+		if (error)
+			goto out;
+	}
+
+	if (dir->i_op->link)
+		error = dir->i_op->link(old_dentry, dir, new_dentry);
+	else
+		error = dir->i_op->link2(old_dentry, dir, new_dentry, flags);
+	if (error)
+		goto out;
+
+	if (target) {
+		dont_mount(new_dentry);
+		detach_mounts(new_dentry);
 	}
 
-	if (!error && (inode->i_state & I_LINKABLE)) {
+	if (inode->i_state & I_LINKABLE) {
 		spin_lock(&inode->i_lock);
 		inode->i_state &= ~I_LINKABLE;
 		spin_unlock(&inode->i_lock);
 	}
-	inode_unlock(inode);
-	if (!error)
+out:
+	unlock_two_nondirectories(inode, target);
+	dput(new_dentry);
+	if (!error) {
+		if (target)
+			fsnotify_link_count(target);
 		fsnotify_link(dir, inode, new_dentry);
+	}
 	return error;
 }
 EXPORT_SYMBOL(vfs_link);
@@ -4237,11 +4282,15 @@  int do_linkat(int olddfd, const char __user *oldname, int newdfd,
 {
 	struct dentry *new_dentry;
 	struct path old_path, new_path;
+	struct qstr new_last;
+	int new_type;
 	struct inode *delegated_inode = NULL;
-	int how = 0;
+	struct filename *to;
+	unsigned int how = 0, target_flags;
+	bool should_retry = false;
 	int error;
 
-	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
+	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_REPLACE)) != 0)
 		return -EINVAL;
 	/*
 	 * To use null names we require CAP_DAC_READ_SEARCH
@@ -4256,44 +4305,102 @@  int do_linkat(int olddfd, const char __user *oldname, int newdfd,
 
 	if (flags & AT_SYMLINK_FOLLOW)
 		how |= LOOKUP_FOLLOW;
+
+	if (flags & AT_REPLACE)
+		target_flags = LOOKUP_RENAME_TARGET;
+	else
+		target_flags = LOOKUP_CREATE | LOOKUP_EXCL;
 retry:
 	error = user_path_at(olddfd, oldname, how, &old_path);
 	if (error)
 		return error;
 
-	new_dentry = user_path_create(newdfd, newname, &new_path,
-					(how & LOOKUP_REVAL));
-	error = PTR_ERR(new_dentry);
-	if (IS_ERR(new_dentry))
-		goto out;
+	to = filename_parentat(newdfd, getname(newname), how & LOOKUP_REVAL,
+			       &new_path, &new_last, &new_type);
+	if (IS_ERR(to)) {
+		error = PTR_ERR(to);
+		goto exit1;
+	}
+
+	if (old_path.mnt != new_path.mnt) {
+		error = -EXDEV;
+		goto exit2;
+	}
+
+	if (new_type != LAST_NORM) {
+		if (flags & AT_REPLACE)
+			error = -EBUSY;
+		else
+			error = -EEXIST;
+		goto exit2;
+	}
+
+	error = mnt_want_write(old_path.mnt);
+	if (error)
+		goto exit2;
+
+retry_deleg:
+	inode_lock_nested(new_path.dentry->d_inode, I_MUTEX_PARENT);
+
+	new_dentry = __lookup_hash(&new_last, new_path.dentry,
+				   (how & LOOKUP_REVAL) | target_flags);
+	if (IS_ERR(new_dentry)) {
+		error = PTR_ERR(new_dentry);
+		goto exit3;
+	}
+	if (!(flags & AT_REPLACE) && d_is_positive(new_dentry)) {
+		error = -EEXIST;
+		goto exit4;
+	}
+	if (new_last.name[new_last.len]) {
+		/* trailing slash on negative dentry gives -ENOENT */
+		if (d_is_negative(new_dentry)) {
+			error = -ENOENT;
+			goto exit4;
+		}
+
+		/*
+		 * unless the source is a directory, trailing slash gives
+		 * -ENOTDIR (this can only happen in the AT_REPLACE case, so we
+		 * make this consistent with sys_renameat2() even though a
+		 * source directory will fail later with -EPERM)
+		 */
+		if (!d_is_dir(old_path.dentry)) {
+			error = -ENOTDIR;
+			goto exit4;
+		}
+	}
 
-	error = -EXDEV;
-	if (old_path.mnt != new_path.mnt)
-		goto out_dput;
 	error = may_linkat(&old_path);
 	if (unlikely(error))
-		goto out_dput;
+		goto exit4;
 	error = security_path_link(old_path.dentry, &new_path, new_dentry);
 	if (error)
-		goto out_dput;
-	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
-out_dput:
-	done_path_create(&new_path, new_dentry);
+		goto exit4;
+	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry,
+			 &delegated_inode, flags & AT_REPLACE);
+exit4:
+	dput(new_dentry);
+exit3:
+	inode_unlock(new_path.dentry->d_inode);
 	if (delegated_inode) {
 		error = break_deleg_wait(&delegated_inode);
-		if (!error) {
-			path_put(&old_path);
-			goto retry;
-		}
+		if (!error)
+			goto retry_deleg;
 	}
-	if (retry_estale(error, how)) {
-		path_put(&old_path);
+	mnt_drop_write(old_path.mnt);
+exit2:
+	if (retry_estale(error, how))
+		should_retry = true;
+	path_put(&new_path);
+	putname(to);
+exit1:
+	path_put(&old_path);
+	if (should_retry) {
+		should_retry = false;
 		how |= LOOKUP_REVAL;
 		goto retry;
 	}
-out:
-	path_put(&old_path);
-
 	return error;
 }
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 2410b093a2e6..541a78a6d684 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1594,7 +1594,7 @@  nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 	err = nfserr_noent;
 	if (d_really_is_negative(dold))
 		goto out_dput;
-	host_err = vfs_link(dold, dirp, dnew, NULL);
+	host_err = vfs_link(dold, dirp, dnew, NULL, 0);
 	if (!host_err) {
 		err = nfserrno(commit_metadata(ffhp));
 		if (!err)
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index e0b7de799f6b..906d0ee1de74 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -100,7 +100,7 @@  static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
 static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
 			      struct dentry *new_dentry, bool debug)
 {
-	int err = vfs_link(old_dentry, dir, new_dentry, NULL);
+	int err = vfs_link(old_dentry, dir, new_dentry, NULL, 0);
 	if (debug) {
 		pr_debug("link(%pd2, %pd2) = %i\n",
 			 old_dentry, new_dentry, err);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 760d8da1b6c7..8e44b12023c2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1607,7 +1607,7 @@  extern int vfs_create(struct inode *, struct dentry *, umode_t, bool);
 extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
 extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
 extern int vfs_symlink(struct inode *, struct dentry *, const char *);
-extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **);
+extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
 extern int vfs_rmdir(struct inode *, struct dentry *);
 extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
 extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
@@ -1752,6 +1752,7 @@  struct inode_operations {
 
 	int (*create) (struct inode *,struct dentry *, umode_t, bool);
 	int (*link) (struct dentry *,struct inode *,struct dentry *);
+	int (*link2) (struct dentry *,struct inode *,struct dentry *,unsigned int);
 	int (*unlink) (struct inode *,struct dentry *);
 	int (*symlink) (struct inode *,struct dentry *,const char *);
 	int (*mkdir) (struct inode *,struct dentry *,umode_t);
diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h
index 6448cdd9a350..b601ad36e726 100644
--- a/include/uapi/linux/fcntl.h
+++ b/include/uapi/linux/fcntl.h
@@ -84,6 +84,7 @@ 
 #define AT_SYMLINK_FOLLOW	0x400   /* Follow symbolic links.  */
 #define AT_NO_AUTOMOUNT		0x800	/* Suppress terminal automount traversal */
 #define AT_EMPTY_PATH		0x1000	/* Allow empty relative pathname */
+#define AT_REPLACE		0x2000	/* Replace new path */
 
 #define AT_STATX_SYNC_TYPE	0x6000	/* Type of synchronisation required from statx() */
 #define AT_STATX_SYNC_AS_STAT	0x0000	/* - Do whatever stat() does */