[v3,1/6] vfs: create vfs helper vfs_tmpfile()
diff mbox

Message ID 1484588765-9397-2-git-send-email-amir73il@gmail.com
State New
Headers show

Commit Message

Amir Goldstein Jan. 16, 2017, 5:46 p.m. UTC
Factor out some common vfs bits from do_tmpfile()
to be used by overlayfs for concurrent copy up.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/namei.c         | 66 +++++++++++++++++++++++++++++++++++-------------------
 include/linux/fs.h |  3 +++
 2 files changed, 46 insertions(+), 23 deletions(-)

Comments

Miklos Szeredi Jan. 16, 2017, 7:47 p.m. UTC | #1
On Mon, Jan 16, 2017 at 6:46 PM, Amir Goldstein <amir73il@gmail.com> wrote:
> Factor out some common vfs bits from do_tmpfile()
> to be used by overlayfs for concurrent copy up.
>
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> ---
>  fs/namei.c         | 66 +++++++++++++++++++++++++++++++++++-------------------
>  include/linux/fs.h |  3 +++
>  2 files changed, 46 insertions(+), 23 deletions(-)
>
> diff --git a/fs/namei.c b/fs/namei.c
> index ad74877..3e7c7a6 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -3353,11 +3353,49 @@ static int do_last(struct nameidata *nd,
>         return error;
>  }
>
> +struct dentry *vfs_tmpfile(struct inode *dir, struct dentry *dentry,

dir and dentry refer to the same thing; can just pass the dentry.

> +                          umode_t mode, int open_flag)
> +{
> +       static const struct qstr name = QSTR_INIT("/", 1);
> +       struct dentry *child = NULL;
> +       struct inode *inode;
> +       int error;
> +
> +       /* we want directory to be writable */
> +       error = inode_permission(dir, MAY_WRITE | MAY_EXEC);

This is not in the scope of this patch, but shouldn't we be using
may_create() here?   Or at least a variant without the audit thing...

Al?

Thanks,
Miklos

> +       if (error)
> +               goto out_err;
> +       error = -EOPNOTSUPP;
> +       if (!dir->i_op->tmpfile)
> +               goto out_err;
> +       error = -ENOMEM;
> +       child = d_alloc(dentry, &name);
> +       if (unlikely(!child))
> +               goto out_err;
> +       error = dir->i_op->tmpfile(dir, child, mode);
> +       if (error)
> +               goto out_err;
> +       error = -ENOENT;
> +       inode = child->d_inode;
> +       if (unlikely(!inode))
> +               goto out_err;
> +       if (!(open_flag & O_EXCL)) {
> +               spin_lock(&inode->i_lock);
> +               inode->i_state |= I_LINKABLE;
> +               spin_unlock(&inode->i_lock);
> +       }
> +       return child;
> +
> +out_err:
> +       dput(child);
> +       return ERR_PTR(error);
> +}
> +EXPORT_SYMBOL(vfs_tmpfile);
> +
>  static int do_tmpfile(struct nameidata *nd, unsigned flags,
>                 const struct open_flags *op,
>                 struct file *file, int *opened)
>  {
> -       static const struct qstr name = QSTR_INIT("/", 1);
>         struct dentry *child;
>         struct inode *dir;
>         struct path path;
> @@ -3368,24 +3406,12 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
>         if (unlikely(error))
>                 goto out;
>         dir = path.dentry->d_inode;
> -       /* we want directory to be writable */
> -       error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
> -       if (error)
> -               goto out2;
> -       if (!dir->i_op->tmpfile) {
> -               error = -EOPNOTSUPP;
> -               goto out2;
> -       }
> -       child = d_alloc(path.dentry, &name);
> -       if (unlikely(!child)) {
> -               error = -ENOMEM;
> +       child = vfs_tmpfile(dir, path.dentry, op->mode, op->open_flag);
> +       error = PTR_ERR(child);
> +       if (unlikely(IS_ERR(child)))
>                 goto out2;
> -       }
>         dput(path.dentry);
>         path.dentry = child;
> -       error = dir->i_op->tmpfile(dir, child, op->mode);
> -       if (error)
> -               goto out2;
>         audit_inode(nd->name, child, 0);
>         /* Don't check for other permissions, the inode was just created */
>         error = may_open(&path, 0, op->open_flag);
> @@ -3396,14 +3422,8 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
>         if (error)
>                 goto out2;
>         error = open_check_o_direct(file);
> -       if (error) {
> +       if (error)
>                 fput(file);
> -       } else if (!(op->open_flag & O_EXCL)) {
> -               struct inode *inode = file_inode(file);
> -               spin_lock(&inode->i_lock);
> -               inode->i_state |= I_LINKABLE;
> -               spin_unlock(&inode->i_lock);
> -       }
>  out2:
>         mnt_drop_write(path.mnt);
>  out:
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 2ba0743..8c7cbcb 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -1561,6 +1561,9 @@ extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
>  extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
>  extern int vfs_whiteout(struct inode *, struct dentry *);
>
> +extern struct dentry *vfs_tmpfile(struct inode *dir, struct dentry *dentry,
> +                                 umode_t mode, int open_flag);
> +
>  /*
>   * VFS file helper functions.
>   */
> --
> 2.7.4
>
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Al Viro Feb. 19, 2017, 3:27 a.m. UTC | #2
On Mon, Jan 16, 2017 at 08:47:32PM +0100, Miklos Szeredi wrote:

> > +                          umode_t mode, int open_flag)
> > +{
> > +       static const struct qstr name = QSTR_INIT("/", 1);
> > +       struct dentry *child = NULL;
> > +       struct inode *inode;
> > +       int error;
> > +
> > +       /* we want directory to be writable */
> > +       error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
> 
> This is not in the scope of this patch, but shoudln't we be using
> may_create() here?   Or at least a variant without the audit thing...
> 
> Al?

may_create() expects directory + child dentry; here we have only parent.
IS_DEADDIR is rather pointless here - directory is not locked, for
starters, so rmdir might happen right under you.  Or right after you've
returned from your function, for that matter.  userns checks...
FWIW, no such checks are done in ->atomic_open() paths, so I'm not sure
how much are those worth...
Miklos Szeredi March 9, 2017, 11:13 a.m. UTC | #3
On Sun, Feb 19, 2017 at 4:27 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Mon, Jan 16, 2017 at 08:47:32PM +0100, Miklos Szeredi wrote:
>
>> > +                          umode_t mode, int open_flag)
>> > +{
>> > +       static const struct qstr name = QSTR_INIT("/", 1);
>> > +       struct dentry *child = NULL;
>> > +       struct inode *inode;
>> > +       int error;
>> > +
>> > +       /* we want directory to be writable */
>> > +       error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
>>
>> This is not in the scope of this patch, but shoudln't we be using
>> may_create() here?   Or at least a variant without the audit thing...
>>
>> Al?
>
> may_create() expects directory + child dentry; here we have only parent.
> IS_DEADDIR is rather pointless here - directory is not locked, for
> starters, so rmdir might happen right under you.  Or right after you've
> returned from your function, for that matter.  userns checks...
> FWIW, no such checks are done in ->atomic_open() paths, so I'm not sure
> how much are those worth...

Eric would know since he added those checks.

Thanks,
Miklos
Eric W. Biederman March 9, 2017, 5:31 p.m. UTC | #4
Miklos Szeredi <miklos@szeredi.hu> writes:

> On Sun, Feb 19, 2017 at 4:27 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
>> On Mon, Jan 16, 2017 at 08:47:32PM +0100, Miklos Szeredi wrote:
>>
>>> > +                          umode_t mode, int open_flag)
>>> > +{
>>> > +       static const struct qstr name = QSTR_INIT("/", 1);
>>> > +       struct dentry *child = NULL;
>>> > +       struct inode *inode;
>>> > +       int error;
>>> > +
>>> > +       /* we want directory to be writable */
>>> > +       error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
>>>
>>> This is not in the scope of this patch, but shoudln't we be using
>>> may_create() here?   Or at least a variant without the audit thing...
>>>
>>> Al?
>>
>> may_create() expects directory + child dentry; here we have only parent.
>> IS_DEADDIR is rather pointless here - directory is not locked, for
>> starters, so rmdir might happen right under you.  Or right after you've
>> returned from your function, for that matter.  userns checks...
>> FWIW, no such checks are done in ->atomic_open() paths, so I'm not sure
>> how much are those worth...
>
> Eric would know since he added those checks.

Unless I am missing something, the atomic_open path was fixed this merge
window when may_o_create was fixed.  Missing these checks in any place
where we create files is an oversight.

The point of those checks is that when we have a filesystem mounted by root
in a user namespace, like tmpfs or hopefully soon fuse, the vfs will
filter out uids and gids that the filesystem does not know how
to map and thus has no hope of understanding.  Since the filesystem does not
care about the uids and gids, odds are filesystems won't be bothered to
test or deal with that case and corruption will result.  As far as I can
see, not filtering out unmappable uids and gids is just laying a trap for
filesystem developers.

Which means vfs_tmpfile is definitely something that needs to be patched
to verify that the current_fsuid and current_fsgid are valid from
the filesystem's point of view.

At the same time this only matters for filesystems that set
FS_USERNS_MOUNT and implement tmpfile.  Which right now is tmpfs.  Given
that tmpfs actually only uses the vfs inode, there are no corruption or
other filesystem misbehaviors right now.  So it won't kill us if we
don't fix this for 4.11.

I am hoping things are far enough along that we can merge the patches to
fuse that make it safe to set FS_USERNS_MOUNT for 4.12-rc1, and have truly
unprivileged fuse mounts.  At which point this will matter more.

Eric

Patch
diff mbox

diff --git a/fs/namei.c b/fs/namei.c
index ad74877..3e7c7a6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3353,11 +3353,49 @@  static int do_last(struct nameidata *nd,
 	return error;
 }
 
+struct dentry *vfs_tmpfile(struct inode *dir, struct dentry *dentry,
+			   umode_t mode, int open_flag)
+{
+	static const struct qstr name = QSTR_INIT("/", 1);
+	struct dentry *child = NULL;
+	struct inode *inode;
+	int error;
+
+	/* we want directory to be writable */
+	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+	if (error)
+		goto out_err;
+	error = -EOPNOTSUPP;
+	if (!dir->i_op->tmpfile)
+		goto out_err;
+	error = -ENOMEM;
+	child = d_alloc(dentry, &name);
+	if (unlikely(!child))
+		goto out_err;
+	error = dir->i_op->tmpfile(dir, child, mode);
+	if (error)
+		goto out_err;
+	error = -ENOENT;
+	inode = child->d_inode;
+	if (unlikely(!inode))
+		goto out_err;
+	if (!(open_flag & O_EXCL)) {
+		spin_lock(&inode->i_lock);
+		inode->i_state |= I_LINKABLE;
+		spin_unlock(&inode->i_lock);
+	}
+	return child;
+
+out_err:
+	dput(child);
+	return ERR_PTR(error);
+}
+EXPORT_SYMBOL(vfs_tmpfile);
+
 static int do_tmpfile(struct nameidata *nd, unsigned flags,
 		const struct open_flags *op,
 		struct file *file, int *opened)
 {
-	static const struct qstr name = QSTR_INIT("/", 1);
 	struct dentry *child;
 	struct inode *dir;
 	struct path path;
@@ -3368,24 +3406,12 @@  static int do_tmpfile(struct nameidata *nd, unsigned flags,
 	if (unlikely(error))
 		goto out;
 	dir = path.dentry->d_inode;
-	/* we want directory to be writable */
-	error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
-	if (error)
-		goto out2;
-	if (!dir->i_op->tmpfile) {
-		error = -EOPNOTSUPP;
-		goto out2;
-	}
-	child = d_alloc(path.dentry, &name);
-	if (unlikely(!child)) {
-		error = -ENOMEM;
+	child = vfs_tmpfile(dir, path.dentry, op->mode, op->open_flag);
+	error = PTR_ERR(child);
+	if (unlikely(IS_ERR(child)))
 		goto out2;
-	}
 	dput(path.dentry);
 	path.dentry = child;
-	error = dir->i_op->tmpfile(dir, child, op->mode);
-	if (error)
-		goto out2;
 	audit_inode(nd->name, child, 0);
 	/* Don't check for other permissions, the inode was just created */
 	error = may_open(&path, 0, op->open_flag);
@@ -3396,14 +3422,8 @@  static int do_tmpfile(struct nameidata *nd, unsigned flags,
 	if (error)
 		goto out2;
 	error = open_check_o_direct(file);
-	if (error) {
+	if (error)
 		fput(file);
-	} else if (!(op->open_flag & O_EXCL)) {
-		struct inode *inode = file_inode(file);
-		spin_lock(&inode->i_lock);
-		inode->i_state |= I_LINKABLE;
-		spin_unlock(&inode->i_lock);
-	}
 out2:
 	mnt_drop_write(path.mnt);
 out:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2ba0743..8c7cbcb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1561,6 +1561,9 @@  extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
 extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
 extern int vfs_whiteout(struct inode *, struct dentry *);
 
+extern struct dentry *vfs_tmpfile(struct inode *dir, struct dentry *dentry,
+				  umode_t mode, int open_flag);
+
 /*
  * VFS file helper functions.
  */