diff mbox series

[RFC] autofs: find_autofs_mount overmounted parent support

Message ID 20210303152931.771996-1-alexander.mikhalitsyn@virtuozzo.com (mailing list archive)
State New, archived
Headers show
Series [RFC] autofs: find_autofs_mount overmounted parent support | expand

Commit Message

Alexander Mikhalitsyn March 3, 2021, 3:28 p.m. UTC
It was discovered that find_autofs_mount() function
in autofs not support cases when autofs mount
parent is overmounted. In this case this function will
always return -ENOENT.

Real-life reproducer is fairly simple.
Consider the following mounts on root mntns:
--
35 24 0:36 / /proc/sys/fs/binfmt_misc ... shared:16 - autofs systemd-1 ...
654 35 0:57 / /proc/sys/fs/binfmt_misc ... shared:322 - binfmt_misc ...
--
and some process which calls ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT)
$ unshare -m -p --fork --mount-proc ./process-bin

Due to "mount-proc" /proc will be overmounted and
ioctl() will fail with -ENOENT

Cc: Matthew Wilcox <willy@infradead.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
Cc: autofs@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
---
 fs/autofs/dev-ioctl.c | 127 +++++++++++++++++++++++++++++++++++++-----
 fs/namespace.c        |  44 +++++++++++++++
 include/linux/mount.h |   5 ++
 3 files changed, 162 insertions(+), 14 deletions(-)

Comments

Ian Kent March 4, 2021, 6:54 a.m. UTC | #1
On Wed, 2021-03-03 at 18:28 +0300, Alexander Mikhalitsyn wrote:
> It was discovered that find_autofs_mount() function
> in autofs not support cases when autofs mount
> parent is overmounted. In this case this function will
> always return -ENOENT.

Ok, I get this shouldn't happen.

> 
> Real-life reproducer is fairly simple.
> Consider the following mounts on root mntns:
> --
> 35 24 0:36 / /proc/sys/fs/binfmt_misc ... shared:16 - autofs systemd-
> 1 ...
> 654 35 0:57 / /proc/sys/fs/binfmt_misc ... shared:322 - binfmt_misc
> ...
> --
> and some process which calls ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT)
> $ unshare -m -p --fork --mount-proc ./process-bin
> 
> Due to "mount-proc" /proc will be overmounted and
> ioctl() will fail with -ENOENT

I think I need a better explanation ...

What's being said here?

For a start your talking about direct mounts, I'm pretty sure this
use case can't occur with indirect mounts in the sense that the
indirect mount base should/must never be over mounted and IIRC that
base can't be /proc (but maybe that's just mounts inside proc ...),
can't remember now but from a common sense POV an indirect mount
won't/can't be on /proc.

And why is this ioctl be called?

If the mount is over mounted should that prevent expiration of the
over mounted /proc anyway, so maybe the return is correct ... or
not ...

I get that the mount namespaces should be independent and intuitively
this is a bug but what is the actual use and expected result.

But anyway, aren't you saying that the VFS path walk isn't handling
mount namespaces properly or are you saying that a process outside
this new mount namespace becomes broken because of it?

Either way the solution looks more complicated than I'd expect so
some explanation along these lines would be good.

Ian
> 
> Cc: Matthew Wilcox <willy@infradead.org>
> Cc: Al Viro <viro@zeniv.linux.org.uk>
> Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
> Cc: autofs@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Alexander Mikhalitsyn <
> alexander.mikhalitsyn@virtuozzo.com>
> ---
>  fs/autofs/dev-ioctl.c | 127 +++++++++++++++++++++++++++++++++++++---
> --
>  fs/namespace.c        |  44 +++++++++++++++
>  include/linux/mount.h |   5 ++
>  3 files changed, 162 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
> index 5bf781ea6d67..55edd3eba8ce 100644
> --- a/fs/autofs/dev-ioctl.c
> +++ b/fs/autofs/dev-ioctl.c
> @@ -10,6 +10,7 @@
>  #include <linux/fdtable.h>
>  #include <linux/magic.h>
>  #include <linux/nospec.h>
> +#include <linux/nsproxy.h>
>  
>  #include "autofs_i.h"
>  
> @@ -179,32 +180,130 @@ static int autofs_dev_ioctl_protosubver(struct
> file *fp,
>  	return 0;
>  }
>  
> +struct filter_autofs_data {
> +	char *pathbuf;
> +	const char *fpathname;
> +	int (*test)(const struct path *path, void *data);
> +	void *data;
> +};
> +
> +static int filter_autofs(const struct path *path, void *p)
> +{
> +	struct filter_autofs_data *data = p;
> +	char *name;
> +	int err;
> +
> +	if (path->mnt->mnt_sb->s_magic != AUTOFS_SUPER_MAGIC)
> +		return 0;
> +
> +	name = d_path(path, data->pathbuf, PATH_MAX);
> +	if (IS_ERR(name)) {
> +		err = PTR_ERR(name);
> +		pr_err("d_path failed, errno %d\n", err);
> +		return 0;
> +	}
> +
> +	if (strncmp(data->fpathname, name, PATH_MAX))
> +		return 0;
> +
> +	if (!data->test(path, data->data))
> +		return 0;
> +
> +	return 1;
> +}
> +
>  /* Find the topmost mount satisfying test() */
>  static int find_autofs_mount(const char *pathname,
>  			     struct path *res,
>  			     int test(const struct path *path, void
> *data),
>  			     void *data)
>  {
> -	struct path path;
> +	struct filter_autofs_data mdata = {
> +		.pathbuf = NULL,
> +		.test = test,
> +		.data = data,
> +	};
> +	struct mnt_namespace *mnt_ns = current->nsproxy->mnt_ns;
> +	struct path path = {};
> +	char *fpathbuf = NULL;
>  	int err;
>  
> +	/*
> +	 * In most cases user will provide full path to autofs mount
> point
> +	 * as it is in /proc/X/mountinfo. But if not, then we need to
> +	 * open provided relative path and calculate full path.
> +	 * It will not work in case when parent mount of autofs mount
> +	 * is overmounted:
> +	 * cd /root
> +	 * ./autofs_mount /root/autofs_yard/mnt
> +	 * mount -t tmpfs tmpfs /root/autofs_yard/mnt
> +	 * mount -t tmpfs tmpfs /root/autofs_yard
> +	 * ./call_ioctl /root/autofs_yard/mnt <- all fine here because
> we
> +	 * 					 have full path and
> don't
> +	 * 					 need to call
> kern_path()
> +	 * 					 and d_path()
> +	 * ./call_ioctl autofs_yard/mnt <- will fail because
> kern_path()
> +	 * 				   can't lookup
> /root/autofs_yard/mnt
> +	 * 				   (/root/autofs_yard
> directory is
> +	 * 				    empty)
> +	 *
> +	 * TO DISCUSS: we can write special algorithm for relative path
> case
> +	 * by getting cwd path combining it with relative path from
> user. But
> +	 * is it worth it? User also may use paths with symlinks in
> components
> +	 * of path.
> +	 *
> +	 */
>  	err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path);
> -	if (err)
> -		return err;
> -	err = -ENOENT;
> -	while (path.dentry == path.mnt->mnt_root) {
> -		if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) {
> -			if (test(&path, data)) {
> -				path_get(&path);
> -				*res = path;
> -				err = 0;
> -				break;
> -			}
> +	if (err) {
> +		if (pathname[0] == '/') {
> +			/*
> +			 * pathname looks like full path let's try to
> use it
> +			 * as it is when searching autofs mount
> +			 */
> +			mdata.fpathname = pathname;
> +			err = 0;
> +			pr_debug("kern_path failed on %s, errno %d.
> Will use path as it is to search mount\n",
> +				 pathname, err);
> +		} else {
> +			pr_err("kern_path failed on %s, errno %d\n",
> +			       pathname, err);
> +			return err;
> +		}
> +	} else {
> +		pr_debug("find_autofs_mount: let's resolve full path
> %s\n",
> +			 pathname);
> +
> +		fpathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> +		if (!fpathbuf) {
> +			err = -ENOMEM;
> +			goto err;
> +		}
> +
> +		/*
> +		 * We have pathname from user but it may be relative,
> we need to
> +		 * have full path because we want to compare it with
> mountpoints
> +		 * paths later.
> +		 */
> +		mdata.fpathname = d_path(&path, fpathbuf, PATH_MAX);
> +		if (IS_ERR(mdata.fpathname)) {
> +			err = PTR_ERR(mdata.fpathname);
> +			pr_err("d_path failed, errno %d\n", err);
> +			goto err;
>  		}
> -		if (!follow_up(&path))
> -			break;
>  	}
> +
> +	mdata.pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> +	if (!mdata.pathbuf) {
> +		err = -ENOMEM;
> +		goto err;
> +	}
> +
> +	err = lookup_mount_path(mnt_ns, res, filter_autofs, &mdata);
> +
> +err:
>  	path_put(&path);
> +	kfree(fpathbuf);
> +	kfree(mdata.pathbuf);
>  	return err;
>  }
>  
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 56bb5a5fdc0d..e1d006dbdfe2 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1367,6 +1367,50 @@ void mnt_cursor_del(struct mnt_namespace *ns,
> struct mount *cursor)
>  }
>  #endif  /* CONFIG_PROC_FS */
>  
> +/**
> + * lookup_mount_path - traverse all mounts in mount namespace
> + *                     and filter using test() probe callback
> + * As a result struct path will be provided.
> + * @ns: root of mount tree
> + * @res: struct path pointer where resulting path will be written
> + * @test: filter callback
> + * @data: will be provided as argument to test() callback
> + *
> + */
> +int lookup_mount_path(struct mnt_namespace *ns,
> +		      struct path *res,
> +		      int test(const struct path *mnt, void *data),
> +		      void *data)
> +{
> +	struct mount *mnt;
> +	int err = -ENOENT;
> +
> +	down_read(&namespace_sem);
> +	lock_ns_list(ns);
> +	list_for_each_entry(mnt, &ns->list, mnt_list) {
> +		struct path tmppath;
> +
> +		if (mnt_is_cursor(mnt))
> +			continue;
> +
> +		tmppath.dentry = mnt->mnt.mnt_root;
> +		tmppath.mnt = &mnt->mnt;
> +
> +		if (test(&tmppath, data)) {
> +			path_get(&tmppath);
> +			*res = tmppath;
> +			err = 0;
> +			break;
> +		}
> +	}
> +	unlock_ns_list(ns);
> +	up_read(&namespace_sem);
> +
> +	return err;
> +}
> +
> +EXPORT_SYMBOL(lookup_mount_path);
> +
>  /**
>   * may_umount_tree - check if a mount tree is busy
>   * @mnt: root of mount tree
> diff --git a/include/linux/mount.h b/include/linux/mount.h
> index 5d92a7e1a742..a79e6392e38e 100644
> --- a/include/linux/mount.h
> +++ b/include/linux/mount.h
> @@ -118,6 +118,11 @@ extern unsigned int sysctl_mount_max;
>  
>  extern bool path_is_mountpoint(const struct path *path);
>  
> +extern int lookup_mount_path(struct mnt_namespace *ns,
> +			     struct path *res,
> +			     int test(const struct path *mnt, void
> *data),
> +			     void *data);
> +
>  extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int
> num);
>  
>  #endif /* _LINUX_MOUNT_H */
Alexander Mikhalitsyn March 4, 2021, 10:11 a.m. UTC | #2
On Thu, 04 Mar 2021 14:54:11 +0800
Ian Kent <raven@themaw.net> wrote:

> On Wed, 2021-03-03 at 18:28 +0300, Alexander Mikhalitsyn wrote:
> > It was discovered that find_autofs_mount() function
> > in autofs not support cases when autofs mount
> > parent is overmounted. In this case this function will
> > always return -ENOENT.
> 
> Ok, I get this shouldn't happen.
> 
> > 
> > Real-life reproducer is fairly simple.
> > Consider the following mounts on root mntns:
> > --
> > 35 24 0:36 / /proc/sys/fs/binfmt_misc ... shared:16 - autofs systemd-
> > 1 ...
> > 654 35 0:57 / /proc/sys/fs/binfmt_misc ... shared:322 - binfmt_misc
> > ...
> > --
> > and some process which calls ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT)
> > $ unshare -m -p --fork --mount-proc ./process-bin
> > 
> > Due to "mount-proc" /proc will be overmounted and
> > ioctl() will fail with -ENOENT
> 
> I think I need a better explanation ...

Thank you for the quick reply, Ian.
I'm sorry If my patch description was not sufficiently clear and detailed.

That problem connected with CRIU (Checkpoint-Restore in Userspace) project.
In CRIU we have support of autofs mounts C/R. To acheive that we need to use
ioctl's from /dev/autofs to get data about mounts, restore mount as catatonic
(if needed), change pipe fd and so on. But the problem is that during CRIU
dump we may meet situation when VFS subtree where autofs mount present was
overmounted as whole.

Simpliest example is /proc/sys/fs/binfmt_misc. This mount present on most
GNU/Linux distributions by default. For instance on my Fedora 33:

trigger automount of binfmt_misc
$ ls /proc/sys/fs/binfmt_misc

$ cat /proc/1/mountinfo | grep binfmt
35 24 0:36 / /proc/sys/fs/binfmt_misc rw,relatime shared:16 - autofs systemd-1 rw,...,direct,pipe_ino=223
632 35 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime shared:315 - binfmt_misc binfmt_misc rw

$ sudo unshare -m -p --fork --mount-proc sh
# cat /proc/self/mountinfo | grep "/proc"
828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs systemd-1 rw,...,direct,pipe_ino=223
943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime - binfmt_misc binfmt_misc rw
949 828 0:57 / /proc rw...,relatime - proc proc rw

As we can see now autofs mount /proc/sys/fs/binfmt_misc is inaccessible.
If we do something like:

struct autofs_dev_ioctl *param;
param = malloc(...);
devfd = open("/dev/autofs", O_RDONLY);
init_autofs_dev_ioctl(param);
param->size = size;
strcpy(param->path, "/proc/sys/fs/binfmt_misc");
param->openmount.devid = 36;
err = ioctl(devfd, AUTOFS_DEV_IOCTL_OPENMOUNT, param)

now we get err = -ENOENT.

> 
> What's being said here?
> 
> For a start your talking about direct mounts, I'm pretty sure this
> use case can't occur with indirect mounts in the sense that the
> indirect mount base should/must never be over mounted and IIRC that
> base can't be /proc (but maybe that's just mounts inside proc ...),
> can't remember now but from a common sense POV an indirect mount
> won't/can't be on /proc.
> 
> And why is this ioctl be called?

We call this ioctl during criu dump stage to open fd from autofs
mount dentry. This fd is used later to call ioctl(AUTOFS_IOC_CATATONIC)
(we do that on criu dump if we see that control process of autofs mount
is dead or pipe is dead).

> 
> If the mount is over mounted should that prevent expiration of the
> over mounted /proc anyway, so maybe the return is correct ... or
> not ...

I agree that case with overmounted subtree with autofs mount is weird case.
But it may be easily created by user and we in CRIU try to handle that.

> 
> I get that the mount namespaces should be independent and intuitively
> this is a bug but what is the actual use and expected result.
> 
> But anyway, aren't you saying that the VFS path walk isn't handling
> mount namespaces properly or are you saying that a process outside
> this new mount namespace becomes broken because of it?

No-no, it's only about opening autofs mount by device id + path.

> 
> Either way the solution looks more complicated than I'd expect so
> some explanation along these lines would be good.
> 
> Ian

Regards,
Alex

> > 
> > Cc: Matthew Wilcox <willy@infradead.org>
> > Cc: Al Viro <viro@zeniv.linux.org.uk>
> > Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> > Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
> > Cc: autofs@vger.kernel.org
> > Cc: linux-kernel@vger.kernel.org
> > Signed-off-by: Alexander Mikhalitsyn <
> > alexander.mikhalitsyn@virtuozzo.com>
> > ---
> >  fs/autofs/dev-ioctl.c | 127 +++++++++++++++++++++++++++++++++++++---
> > --
> >  fs/namespace.c        |  44 +++++++++++++++
> >  include/linux/mount.h |   5 ++
> >  3 files changed, 162 insertions(+), 14 deletions(-)
> > 
> > diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
> > index 5bf781ea6d67..55edd3eba8ce 100644
> > --- a/fs/autofs/dev-ioctl.c
> > +++ b/fs/autofs/dev-ioctl.c
> > @@ -10,6 +10,7 @@
> >  #include <linux/fdtable.h>
> >  #include <linux/magic.h>
> >  #include <linux/nospec.h>
> > +#include <linux/nsproxy.h>
> >  
> >  #include "autofs_i.h"
> >  
> > @@ -179,32 +180,130 @@ static int autofs_dev_ioctl_protosubver(struct
> > file *fp,
> >  	return 0;
> >  }
> >  
> > +struct filter_autofs_data {
> > +	char *pathbuf;
> > +	const char *fpathname;
> > +	int (*test)(const struct path *path, void *data);
> > +	void *data;
> > +};
> > +
> > +static int filter_autofs(const struct path *path, void *p)
> > +{
> > +	struct filter_autofs_data *data = p;
> > +	char *name;
> > +	int err;
> > +
> > +	if (path->mnt->mnt_sb->s_magic != AUTOFS_SUPER_MAGIC)
> > +		return 0;
> > +
> > +	name = d_path(path, data->pathbuf, PATH_MAX);
> > +	if (IS_ERR(name)) {
> > +		err = PTR_ERR(name);
> > +		pr_err("d_path failed, errno %d\n", err);
> > +		return 0;
> > +	}
> > +
> > +	if (strncmp(data->fpathname, name, PATH_MAX))
> > +		return 0;
> > +
> > +	if (!data->test(path, data->data))
> > +		return 0;
> > +
> > +	return 1;
> > +}
> > +
> >  /* Find the topmost mount satisfying test() */
> >  static int find_autofs_mount(const char *pathname,
> >  			     struct path *res,
> >  			     int test(const struct path *path, void
> > *data),
> >  			     void *data)
> >  {
> > -	struct path path;
> > +	struct filter_autofs_data mdata = {
> > +		.pathbuf = NULL,
> > +		.test = test,
> > +		.data = data,
> > +	};
> > +	struct mnt_namespace *mnt_ns = current->nsproxy->mnt_ns;
> > +	struct path path = {};
> > +	char *fpathbuf = NULL;
> >  	int err;
> >  
> > +	/*
> > +	 * In most cases user will provide full path to autofs mount
> > point
> > +	 * as it is in /proc/X/mountinfo. But if not, then we need to
> > +	 * open provided relative path and calculate full path.
> > +	 * It will not work in case when parent mount of autofs mount
> > +	 * is overmounted:
> > +	 * cd /root
> > +	 * ./autofs_mount /root/autofs_yard/mnt
> > +	 * mount -t tmpfs tmpfs /root/autofs_yard/mnt
> > +	 * mount -t tmpfs tmpfs /root/autofs_yard
> > +	 * ./call_ioctl /root/autofs_yard/mnt <- all fine here because
> > we
> > +	 * 					 have full path and
> > don't
> > +	 * 					 need to call
> > kern_path()
> > +	 * 					 and d_path()
> > +	 * ./call_ioctl autofs_yard/mnt <- will fail because
> > kern_path()
> > +	 * 				   can't lookup
> > /root/autofs_yard/mnt
> > +	 * 				   (/root/autofs_yard
> > directory is
> > +	 * 				    empty)
> > +	 *
> > +	 * TO DISCUSS: we can write special algorithm for relative path
> > case
> > +	 * by getting cwd path combining it with relative path from
> > user. But
> > +	 * is it worth it? User also may use paths with symlinks in
> > components
> > +	 * of path.
> > +	 *
> > +	 */
> >  	err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path);
> > -	if (err)
> > -		return err;
> > -	err = -ENOENT;
> > -	while (path.dentry == path.mnt->mnt_root) {
> > -		if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) {
> > -			if (test(&path, data)) {
> > -				path_get(&path);
> > -				*res = path;
> > -				err = 0;
> > -				break;
> > -			}
> > +	if (err) {
> > +		if (pathname[0] == '/') {
> > +			/*
> > +			 * pathname looks like full path let's try to
> > use it
> > +			 * as it is when searching autofs mount
> > +			 */
> > +			mdata.fpathname = pathname;
> > +			err = 0;
> > +			pr_debug("kern_path failed on %s, errno %d.
> > Will use path as it is to search mount\n",
> > +				 pathname, err);
> > +		} else {
> > +			pr_err("kern_path failed on %s, errno %d\n",
> > +			       pathname, err);
> > +			return err;
> > +		}
> > +	} else {
> > +		pr_debug("find_autofs_mount: let's resolve full path
> > %s\n",
> > +			 pathname);
> > +
> > +		fpathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> > +		if (!fpathbuf) {
> > +			err = -ENOMEM;
> > +			goto err;
> > +		}
> > +
> > +		/*
> > +		 * We have pathname from user but it may be relative,
> > we need to
> > +		 * have full path because we want to compare it with
> > mountpoints
> > +		 * paths later.
> > +		 */
> > +		mdata.fpathname = d_path(&path, fpathbuf, PATH_MAX);
> > +		if (IS_ERR(mdata.fpathname)) {
> > +			err = PTR_ERR(mdata.fpathname);
> > +			pr_err("d_path failed, errno %d\n", err);
> > +			goto err;
> >  		}
> > -		if (!follow_up(&path))
> > -			break;
> >  	}
> > +
> > +	mdata.pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> > +	if (!mdata.pathbuf) {
> > +		err = -ENOMEM;
> > +		goto err;
> > +	}
> > +
> > +	err = lookup_mount_path(mnt_ns, res, filter_autofs, &mdata);
> > +
> > +err:
> >  	path_put(&path);
> > +	kfree(fpathbuf);
> > +	kfree(mdata.pathbuf);
> >  	return err;
> >  }
> >  
> > diff --git a/fs/namespace.c b/fs/namespace.c
> > index 56bb5a5fdc0d..e1d006dbdfe2 100644
> > --- a/fs/namespace.c
> > +++ b/fs/namespace.c
> > @@ -1367,6 +1367,50 @@ void mnt_cursor_del(struct mnt_namespace *ns,
> > struct mount *cursor)
> >  }
> >  #endif  /* CONFIG_PROC_FS */
> >  
> > +/**
> > + * lookup_mount_path - traverse all mounts in mount namespace
> > + *                     and filter using test() probe callback
> > + * As a result struct path will be provided.
> > + * @ns: root of mount tree
> > + * @res: struct path pointer where resulting path will be written
> > + * @test: filter callback
> > + * @data: will be provided as argument to test() callback
> > + *
> > + */
> > +int lookup_mount_path(struct mnt_namespace *ns,
> > +		      struct path *res,
> > +		      int test(const struct path *mnt, void *data),
> > +		      void *data)
> > +{
> > +	struct mount *mnt;
> > +	int err = -ENOENT;
> > +
> > +	down_read(&namespace_sem);
> > +	lock_ns_list(ns);
> > +	list_for_each_entry(mnt, &ns->list, mnt_list) {
> > +		struct path tmppath;
> > +
> > +		if (mnt_is_cursor(mnt))
> > +			continue;
> > +
> > +		tmppath.dentry = mnt->mnt.mnt_root;
> > +		tmppath.mnt = &mnt->mnt;
> > +
> > +		if (test(&tmppath, data)) {
> > +			path_get(&tmppath);
> > +			*res = tmppath;
> > +			err = 0;
> > +			break;
> > +		}
> > +	}
> > +	unlock_ns_list(ns);
> > +	up_read(&namespace_sem);
> > +
> > +	return err;
> > +}
> > +
> > +EXPORT_SYMBOL(lookup_mount_path);
> > +
> >  /**
> >   * may_umount_tree - check if a mount tree is busy
> >   * @mnt: root of mount tree
> > diff --git a/include/linux/mount.h b/include/linux/mount.h
> > index 5d92a7e1a742..a79e6392e38e 100644
> > --- a/include/linux/mount.h
> > +++ b/include/linux/mount.h
> > @@ -118,6 +118,11 @@ extern unsigned int sysctl_mount_max;
> >  
> >  extern bool path_is_mountpoint(const struct path *path);
> >  
> > +extern int lookup_mount_path(struct mnt_namespace *ns,
> > +			     struct path *res,
> > +			     int test(const struct path *mnt, void
> > *data),
> > +			     void *data);
> > +
> >  extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int
> > num);
> >  
> >  #endif /* _LINUX_MOUNT_H */
>
Ian Kent March 5, 2021, 10:10 a.m. UTC | #3
On Thu, 2021-03-04 at 13:11 +0300, Alexander Mikhalitsyn wrote:
> On Thu, 04 Mar 2021 14:54:11 +0800
> Ian Kent <raven@themaw.net> wrote:
> 
> > On Wed, 2021-03-03 at 18:28 +0300, Alexander Mikhalitsyn wrote:
> > > It was discovered that find_autofs_mount() function
> > > in autofs not support cases when autofs mount
> > > parent is overmounted. In this case this function will
> > > always return -ENOENT.
> > 
> > Ok, I get this shouldn't happen.
> > 
> > > Real-life reproducer is fairly simple.
> > > Consider the following mounts on root mntns:
> > > --
> > > 35 24 0:36 / /proc/sys/fs/binfmt_misc ... shared:16 - autofs
> > > systemd-
> > > 1 ...
> > > 654 35 0:57 / /proc/sys/fs/binfmt_misc ... shared:322 -
> > > binfmt_misc
> > > ...
> > > --
> > > and some process which calls ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT)
> > > $ unshare -m -p --fork --mount-proc ./process-bin
> > > 
> > > Due to "mount-proc" /proc will be overmounted and
> > > ioctl() will fail with -ENOENT
> > 
> > I think I need a better explanation ...
> 
> Thank you for the quick reply, Ian.
> I'm sorry If my patch description was not sufficiently clear and
> detailed.
> 
> That problem connected with CRIU (Checkpoint-Restore in Userspace)
> project.
> In CRIU we have support of autofs mounts C/R. To acheive that we need
> to use
> ioctl's from /dev/autofs to get data about mounts, restore mount as
> catatonic
> (if needed), change pipe fd and so on. But the problem is that during
> CRIU
> dump we may meet situation when VFS subtree where autofs mount
> present was
> overmounted as whole.
> 
> Simpliest example is /proc/sys/fs/binfmt_misc. This mount present on
> most
> GNU/Linux distributions by default. For instance on my Fedora 33:

Yes, I don't know why systemd uses this direct mount, there must
have been a reason for it.

> 
> trigger automount of binfmt_misc
> $ ls /proc/sys/fs/binfmt_misc
> 
> $ cat /proc/1/mountinfo | grep binfmt
> 35 24 0:36 / /proc/sys/fs/binfmt_misc rw,relatime shared:16 - autofs
> systemd-1 rw,...,direct,pipe_ino=223
> 632 35 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime shared:315 -
> binfmt_misc binfmt_misc rw

Yes, I think this looks normal.

> 
> $ sudo unshare -m -p --fork --mount-proc sh
> # cat /proc/self/mountinfo | grep "/proc"
> 828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
> 829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs systemd-
> 1 rw,...,direct,pipe_ino=223
> 943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime - binfmt_misc
> binfmt_misc rw
> 949 828 0:57 / /proc rw...,relatime - proc proc rw

Isn't this screwed up, /proc is on top of the binfmt_misc mount ...

Is this what's seen from the root namespace?

> 
> As we can see now autofs mount /proc/sys/fs/binfmt_misc is
> inaccessible.
> If we do something like:
> 
> struct autofs_dev_ioctl *param;
> param = malloc(...);
> devfd = open("/dev/autofs", O_RDONLY);
> init_autofs_dev_ioctl(param);
> param->size = size;
> strcpy(param->path, "/proc/sys/fs/binfmt_misc");
> param->openmount.devid = 36;
> err = ioctl(devfd, AUTOFS_DEV_IOCTL_OPENMOUNT, param)
> 
> now we get err = -ENOENT.

Maybe that should be EINVAL, not sure about cases though.

> 
> > What's being said here?
> > 
> > For a start your talking about direct mounts, I'm pretty sure this
> > use case can't occur with indirect mounts in the sense that the
> > indirect mount base should/must never be over mounted and IIRC that
> > base can't be /proc (but maybe that's just mounts inside proc ...),
> > can't remember now but from a common sense POV an indirect mount
> > won't/can't be on /proc.
> > 
> > And why is this ioctl be called?
> 
> We call this ioctl during criu dump stage to open fd from autofs
> mount dentry. This fd is used later to call
> ioctl(AUTOFS_IOC_CATATONIC)
> (we do that on criu dump if we see that control process of autofs
> mount
> is dead or pipe is dead).

Right so your usage "is" the way it's intended, ;)

> 
> > If the mount is over mounted should that prevent expiration of the
> > over mounted /proc anyway, so maybe the return is correct ... or
> > not ...
> 
> I agree that case with overmounted subtree with autofs mount is weird
> case.
> But it may be easily created by user and we in CRIU try to handle
> that.

I'm not yet ready to make a call on how I think this this should
be done.

Since you seem to be clear on what this should be used for I'll
need to look more closely at the patch.

But, at first glance, it looked like it would break the existing
function of the ioctl.

Could you explain how the patch works, in particular why it doesn't
break the existing functionality.

Long ago I'm pretty sure I continued to follow up but IIRC that
went away and was replaced by a single follow_up(), but since
the changes didn't break the existing function of autofs I
didn't pay that much attention to them, I'll need to look at
that too. Not only that, the namespace code has moved a long
way too however there's still little attention given to
sanitizing the mounts in the new namespace by anything that I'm
aware of that uses the feature. TBH I'm not sure why I don't
see a lot more problems of that nature.

I have to wonder if what's needed is attention to the follow up
but that /proc covering the earlier mounts is a bit of a concern.

> 
> > I get that the mount namespaces should be independent and
> > intuitively
> > this is a bug but what is the actual use and expected result.
> > 
> > But anyway, aren't you saying that the VFS path walk isn't handling
> > mount namespaces properly or are you saying that a process outside
> > this new mount namespace becomes broken because of it?
> 
> No-no, it's only about opening autofs mount by device id + path.

That's right, specifically getting a file handle to a covered autofs
mount for things like bringing it back to life etc. But that silently
implies the same mount namespace.

Let me look at the patch and think about it a bit.
I'll probably need to run some tests too.
I am a little busy right now so it may take a bit of time.

Ian 
> 
> > Either way the solution looks more complicated than I'd expect so
> > some explanation along these lines would be good.
> > 
> > Ian
> 
> Regards,
> Alex
> 
> > > Cc: Matthew Wilcox <willy@infradead.org>
> > > Cc: Al Viro <viro@zeniv.linux.org.uk>
> > > Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> > > Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
> > > Cc: autofs@vger.kernel.org
> > > Cc: linux-kernel@vger.kernel.org
> > > Signed-off-by: Alexander Mikhalitsyn <
> > > alexander.mikhalitsyn@virtuozzo.com>
> > > ---
> > >  fs/autofs/dev-ioctl.c | 127
> > > +++++++++++++++++++++++++++++++++++++---
> > > --
> > >  fs/namespace.c        |  44 +++++++++++++++
> > >  include/linux/mount.h |   5 ++
> > >  3 files changed, 162 insertions(+), 14 deletions(-)
> > > 
> > > diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
> > > index 5bf781ea6d67..55edd3eba8ce 100644
> > > --- a/fs/autofs/dev-ioctl.c
> > > +++ b/fs/autofs/dev-ioctl.c
> > > @@ -10,6 +10,7 @@
> > >  #include <linux/fdtable.h>
> > >  #include <linux/magic.h>
> > >  #include <linux/nospec.h>
> > > +#include <linux/nsproxy.h>
> > >  
> > >  #include "autofs_i.h"
> > >  
> > > @@ -179,32 +180,130 @@ static int
> > > autofs_dev_ioctl_protosubver(struct
> > > file *fp,
> > >  	return 0;
> > >  }
> > >  
> > > +struct filter_autofs_data {
> > > +	char *pathbuf;
> > > +	const char *fpathname;
> > > +	int (*test)(const struct path *path, void *data);
> > > +	void *data;
> > > +};
> > > +
> > > +static int filter_autofs(const struct path *path, void *p)
> > > +{
> > > +	struct filter_autofs_data *data = p;
> > > +	char *name;
> > > +	int err;
> > > +
> > > +	if (path->mnt->mnt_sb->s_magic != AUTOFS_SUPER_MAGIC)
> > > +		return 0;
> > > +
> > > +	name = d_path(path, data->pathbuf, PATH_MAX);
> > > +	if (IS_ERR(name)) {
> > > +		err = PTR_ERR(name);
> > > +		pr_err("d_path failed, errno %d\n", err);
> > > +		return 0;
> > > +	}
> > > +
> > > +	if (strncmp(data->fpathname, name, PATH_MAX))
> > > +		return 0;
> > > +
> > > +	if (!data->test(path, data->data))
> > > +		return 0;
> > > +
> > > +	return 1;
> > > +}
> > > +
> > >  /* Find the topmost mount satisfying test() */
> > >  static int find_autofs_mount(const char *pathname,
> > >  			     struct path *res,
> > >  			     int test(const struct path *path, void
> > > *data),
> > >  			     void *data)
> > >  {
> > > -	struct path path;
> > > +	struct filter_autofs_data mdata = {
> > > +		.pathbuf = NULL,
> > > +		.test = test,
> > > +		.data = data,
> > > +	};
> > > +	struct mnt_namespace *mnt_ns = current->nsproxy->mnt_ns;
> > > +	struct path path = {};
> > > +	char *fpathbuf = NULL;
> > >  	int err;
> > >  
> > > +	/*
> > > +	 * In most cases user will provide full path to autofs mount
> > > point
> > > +	 * as it is in /proc/X/mountinfo. But if not, then we need to
> > > +	 * open provided relative path and calculate full path.
> > > +	 * It will not work in case when parent mount of autofs mount
> > > +	 * is overmounted:
> > > +	 * cd /root
> > > +	 * ./autofs_mount /root/autofs_yard/mnt
> > > +	 * mount -t tmpfs tmpfs /root/autofs_yard/mnt
> > > +	 * mount -t tmpfs tmpfs /root/autofs_yard
> > > +	 * ./call_ioctl /root/autofs_yard/mnt <- all fine here because
> > > we
> > > +	 * 					 have full path and
> > > don't
> > > +	 * 					 need to call
> > > kern_path()
> > > +	 * 					 and d_path()
> > > +	 * ./call_ioctl autofs_yard/mnt <- will fail because
> > > kern_path()
> > > +	 * 				   can't lookup
> > > /root/autofs_yard/mnt
> > > +	 * 				   (/root/autofs_yard
> > > directory is
> > > +	 * 				    empty)
> > > +	 *
> > > +	 * TO DISCUSS: we can write special algorithm for relative path
> > > case
> > > +	 * by getting cwd path combining it with relative path from
> > > user. But
> > > +	 * is it worth it? User also may use paths with symlinks in
> > > components
> > > +	 * of path.
> > > +	 *
> > > +	 */
> > >  	err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path);
> > > -	if (err)
> > > -		return err;
> > > -	err = -ENOENT;
> > > -	while (path.dentry == path.mnt->mnt_root) {
> > > -		if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) {
> > > -			if (test(&path, data)) {
> > > -				path_get(&path);
> > > -				*res = path;
> > > -				err = 0;
> > > -				break;
> > > -			}
> > > +	if (err) {
> > > +		if (pathname[0] == '/') {
> > > +			/*
> > > +			 * pathname looks like full path let's try to
> > > use it
> > > +			 * as it is when searching autofs mount
> > > +			 */
> > > +			mdata.fpathname = pathname;
> > > +			err = 0;
> > > +			pr_debug("kern_path failed on %s, errno %d.
> > > Will use path as it is to search mount\n",
> > > +				 pathname, err);
> > > +		} else {
> > > +			pr_err("kern_path failed on %s, errno %d\n",
> > > +			       pathname, err);
> > > +			return err;
> > > +		}
> > > +	} else {
> > > +		pr_debug("find_autofs_mount: let's resolve full path
> > > %s\n",
> > > +			 pathname);
> > > +
> > > +		fpathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> > > +		if (!fpathbuf) {
> > > +			err = -ENOMEM;
> > > +			goto err;
> > > +		}
> > > +
> > > +		/*
> > > +		 * We have pathname from user but it may be relative,
> > > we need to
> > > +		 * have full path because we want to compare it with
> > > mountpoints
> > > +		 * paths later.
> > > +		 */
> > > +		mdata.fpathname = d_path(&path, fpathbuf, PATH_MAX);
> > > +		if (IS_ERR(mdata.fpathname)) {
> > > +			err = PTR_ERR(mdata.fpathname);
> > > +			pr_err("d_path failed, errno %d\n", err);
> > > +			goto err;
> > >  		}
> > > -		if (!follow_up(&path))
> > > -			break;
> > >  	}
> > > +
> > > +	mdata.pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> > > +	if (!mdata.pathbuf) {
> > > +		err = -ENOMEM;
> > > +		goto err;
> > > +	}
> > > +
> > > +	err = lookup_mount_path(mnt_ns, res, filter_autofs, &mdata);
> > > +
> > > +err:
> > >  	path_put(&path);
> > > +	kfree(fpathbuf);
> > > +	kfree(mdata.pathbuf);
> > >  	return err;
> > >  }
> > >  
> > > diff --git a/fs/namespace.c b/fs/namespace.c
> > > index 56bb5a5fdc0d..e1d006dbdfe2 100644
> > > --- a/fs/namespace.c
> > > +++ b/fs/namespace.c
> > > @@ -1367,6 +1367,50 @@ void mnt_cursor_del(struct mnt_namespace
> > > *ns,
> > > struct mount *cursor)
> > >  }
> > >  #endif  /* CONFIG_PROC_FS */
> > >  
> > > +/**
> > > + * lookup_mount_path - traverse all mounts in mount namespace
> > > + *                     and filter using test() probe callback
> > > + * As a result struct path will be provided.
> > > + * @ns: root of mount tree
> > > + * @res: struct path pointer where resulting path will be
> > > written
> > > + * @test: filter callback
> > > + * @data: will be provided as argument to test() callback
> > > + *
> > > + */
> > > +int lookup_mount_path(struct mnt_namespace *ns,
> > > +		      struct path *res,
> > > +		      int test(const struct path *mnt, void *data),
> > > +		      void *data)
> > > +{
> > > +	struct mount *mnt;
> > > +	int err = -ENOENT;
> > > +
> > > +	down_read(&namespace_sem);
> > > +	lock_ns_list(ns);
> > > +	list_for_each_entry(mnt, &ns->list, mnt_list) {
> > > +		struct path tmppath;
> > > +
> > > +		if (mnt_is_cursor(mnt))
> > > +			continue;
> > > +
> > > +		tmppath.dentry = mnt->mnt.mnt_root;
> > > +		tmppath.mnt = &mnt->mnt;
> > > +
> > > +		if (test(&tmppath, data)) {
> > > +			path_get(&tmppath);
> > > +			*res = tmppath;
> > > +			err = 0;
> > > +			break;
> > > +		}
> > > +	}
> > > +	unlock_ns_list(ns);
> > > +	up_read(&namespace_sem);
> > > +
> > > +	return err;
> > > +}
> > > +
> > > +EXPORT_SYMBOL(lookup_mount_path);
> > > +
> > >  /**
> > >   * may_umount_tree - check if a mount tree is busy
> > >   * @mnt: root of mount tree
> > > diff --git a/include/linux/mount.h b/include/linux/mount.h
> > > index 5d92a7e1a742..a79e6392e38e 100644
> > > --- a/include/linux/mount.h
> > > +++ b/include/linux/mount.h
> > > @@ -118,6 +118,11 @@ extern unsigned int sysctl_mount_max;
> > >  
> > >  extern bool path_is_mountpoint(const struct path *path);
> > >  
> > > +extern int lookup_mount_path(struct mnt_namespace *ns,
> > > +			     struct path *res,
> > > +			     int test(const struct path *mnt, void
> > > *data),
> > > +			     void *data);
> > > +
> > >  extern void kern_unmount_array(struct vfsmount *mnt[], unsigned
> > > int
> > > num);
> > >  
> > >  #endif /* _LINUX_MOUNT_H */
Alexander Mikhalitsyn March 5, 2021, 11:55 a.m. UTC | #4
On Fri, 05 Mar 2021 18:10:02 +0800
Ian Kent <raven@themaw.net> wrote:

> On Thu, 2021-03-04 at 13:11 +0300, Alexander Mikhalitsyn wrote:
> > On Thu, 04 Mar 2021 14:54:11 +0800
> > Ian Kent <raven@themaw.net> wrote:
> > 
> > > On Wed, 2021-03-03 at 18:28 +0300, Alexander Mikhalitsyn wrote:
> > > > It was discovered that find_autofs_mount() function
> > > > in autofs not support cases when autofs mount
> > > > parent is overmounted. In this case this function will
> > > > always return -ENOENT.
> > > 
> > > Ok, I get this shouldn't happen.
> > > 
> > > > Real-life reproducer is fairly simple.
> > > > Consider the following mounts on root mntns:
> > > > --
> > > > 35 24 0:36 / /proc/sys/fs/binfmt_misc ... shared:16 - autofs
> > > > systemd-
> > > > 1 ...
> > > > 654 35 0:57 / /proc/sys/fs/binfmt_misc ... shared:322 -
> > > > binfmt_misc
> > > > ...
> > > > --
> > > > and some process which calls ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT)
> > > > $ unshare -m -p --fork --mount-proc ./process-bin
> > > > 
> > > > Due to "mount-proc" /proc will be overmounted and
> > > > ioctl() will fail with -ENOENT
> > > 
> > > I think I need a better explanation ...
> > 
> > Thank you for the quick reply, Ian.
> > I'm sorry If my patch description was not sufficiently clear and
> > detailed.
> > 
> > That problem connected with CRIU (Checkpoint-Restore in Userspace)
> > project.
> > In CRIU we have support of autofs mounts C/R. To acheive that we need
> > to use
> > ioctl's from /dev/autofs to get data about mounts, restore mount as
> > catatonic
> > (if needed), change pipe fd and so on. But the problem is that during
> > CRIU
> > dump we may meet situation when VFS subtree where autofs mount
> > present was
> > overmounted as whole.
> > 
> > Simpliest example is /proc/sys/fs/binfmt_misc. This mount present on
> > most
> > GNU/Linux distributions by default. For instance on my Fedora 33:
> 
> Yes, I don't know why systemd uses this direct mount, there must
> have been a reason for it.
> 
> > 
> > trigger automount of binfmt_misc
> > $ ls /proc/sys/fs/binfmt_misc
> > 
> > $ cat /proc/1/mountinfo | grep binfmt
> > 35 24 0:36 / /proc/sys/fs/binfmt_misc rw,relatime shared:16 - autofs
> > systemd-1 rw,...,direct,pipe_ino=223
> > 632 35 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime shared:315 -
> > binfmt_misc binfmt_misc rw
> 
> Yes, I think this looks normal.
> 
> > 
> > $ sudo unshare -m -p --fork --mount-proc sh
> > # cat /proc/self/mountinfo | grep "/proc"
> > 828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
> > 829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs systemd-
> > 1 rw,...,direct,pipe_ino=223
> > 943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime - binfmt_misc
> > binfmt_misc rw
> > 949 828 0:57 / /proc rw...,relatime - proc proc rw
> 
> Isn't this screwed up, /proc is on top of the binfmt_misc mount ...
> 
> Is this what's seen from the root namespace?

No-no, after issuing
$ sudo unshare -m -p --fork --mount-proc sh

we enter to the pid+mount namespace and:

# cat /proc/self/mountinfo | grep "/proc"

So, it's picture from inside namespaces.

> 
> > 
> > As we can see now autofs mount /proc/sys/fs/binfmt_misc is
> > inaccessible.
> > If we do something like:
> > 
> > struct autofs_dev_ioctl *param;
> > param = malloc(...);
> > devfd = open("/dev/autofs", O_RDONLY);
> > init_autofs_dev_ioctl(param);
> > param->size = size;
> > strcpy(param->path, "/proc/sys/fs/binfmt_misc");
> > param->openmount.devid = 36;
> > err = ioctl(devfd, AUTOFS_DEV_IOCTL_OPENMOUNT, param)
> > 
> > now we get err = -ENOENT.
> 
> Maybe that should be EINVAL, not sure about cases though.

in current version -ENOENT is returned in this particular case

> 
> > 
> > > What's being said here?
> > > 
> > > For a start your talking about direct mounts, I'm pretty sure this
> > > use case can't occur with indirect mounts in the sense that the
> > > indirect mount base should/must never be over mounted and IIRC that
> > > base can't be /proc (but maybe that's just mounts inside proc ...),
> > > can't remember now but from a common sense POV an indirect mount
> > > won't/can't be on /proc.
> > > 
> > > And why is this ioctl be called?
> > 
> > We call this ioctl during criu dump stage to open fd from autofs
> > mount dentry. This fd is used later to call
> > ioctl(AUTOFS_IOC_CATATONIC)
> > (we do that on criu dump if we see that control process of autofs
> > mount
> > is dead or pipe is dead).
> 
> Right so your usage "is" the way it's intended, ;)

That's good! ;)

> 
> > 
> > > If the mount is over mounted should that prevent expiration of the
> > > over mounted /proc anyway, so maybe the return is correct ... or
> > > not ...
> > 
> > I agree that case with overmounted subtree with autofs mount is weird
> > case.
> > But it may be easily created by user and we in CRIU try to handle
> > that.
> 
> I'm not yet ready to make a call on how I think this this should
> be done.
> 
> Since you seem to be clear on what this should be used for I'll
> need to look more closely at the patch.
> 
> But, at first glance, it looked like it would break the existing
> function of the ioctl.
> 
> Could you explain how the patch works, in particular why it doesn't
> break the existing functionality.

Sure. With pleasure. Idea of patch is naive:
1. find_autofs_mount() function called only from syscall context, so,
we always can determine current mount namespace of caller.
So, I've introduced

> > > > + int lookup_mount_path(struct mnt_namespace *ns,
> > > > +		      struct path *res,
> > > > +		      int test(const struct path *mnt, void *data),
> > > > +		      void *data)

lookup_mount_path() helper function, which allows to traverse mounts list of
mount namespace and find proper autofs mount by user-provided helper test().

2. Helper function is fairly simple:
a) it checks that mount is autofs mount (by magic number on superblock)
b) it calculates full path to mount point of each mount in mount namespace
and compare it with path which user was provided to the ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT)
parameters.

Problem here is case when user provided relative path in ioctl parameters
(struct autofs_dev_ioctl). In this case we may fail to resolve user provided path to
struct path. For instance:

# cat /proc/self/mountinfo | grep "/proc"
828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs systemd-
1 rw,...,direct,pipe_ino=223
943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime - binfmt_misc
binfmt_misc rw
949 828 0:57 / /proc rw...,relatime - proc proc rw

in this case 
kern_path("/proc/sys/fs/binfmt_misc", LOOKUP_MOUNTPOINT, &path) == -ENOENT

To overcome this issue, if kern_path() failed with -ENOENT
AND user-provided mount path looks like fullpath (starts from /)
we just try to find autofs mount in mounts list just by searching
autofs mounts in mounts list with mount point path equal to user-provided
path. This covers our problem case.

This patch is fully compatible with old behaviour - if parent mounts of
autofs mount is not overmounted - then
kern_path("/proc/sys/fs/binfmt_misc", LOOKUP_MOUNTPOINT, &path)
will not fail, and we also easily find needed autofs mount in mounts list
of caller mount namespace.

> 
> Long ago I'm pretty sure I continued to follow up but IIRC that
> went away and was replaced by a single follow_up(), but since
> the changes didn't break the existing function of autofs I
> didn't pay that much attention to them, I'll need to look at
> that too. Not only that, the namespace code has moved a long
> way too however there's still little attention given to
> sanitizing the mounts in the new namespace by anything that I'm
> aware of that uses the feature. TBH I'm not sure why I don't
> see a lot more problems of that nature.
> 
> I have to wonder if what's needed is attention to the follow up
> but that /proc covering the earlier mounts is a bit of a concern.
> 
> > 
> > > I get that the mount namespaces should be independent and
> > > intuitively
> > > this is a bug but what is the actual use and expected result.
> > > 
> > > But anyway, aren't you saying that the VFS path walk isn't handling
> > > mount namespaces properly or are you saying that a process outside
> > > this new mount namespace becomes broken because of it?
> > 
> > No-no, it's only about opening autofs mount by device id + path.
> 
> That's right, specifically getting a file handle to a covered autofs
> mount for things like bringing it back to life etc. But that silently
> implies the same mount namespace.
> 
> Let me look at the patch and think about it a bit.
> I'll probably need to run some tests too.
> I am a little busy right now so it may take a bit of time.
> 
> Ian 

Thank you very much for your attention to the patch and comments.

Regards,
Alex

> > 
> > > Either way the solution looks more complicated than I'd expect so
> > > some explanation along these lines would be good.
> > > 
> > > Ian
> > 
> > Regards,
> > Alex
> > 
> > > > Cc: Matthew Wilcox <willy@infradead.org>
> > > > Cc: Al Viro <viro@zeniv.linux.org.uk>
> > > > Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> > > > Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
> > > > Cc: autofs@vger.kernel.org
> > > > Cc: linux-kernel@vger.kernel.org
> > > > Signed-off-by: Alexander Mikhalitsyn <
> > > > alexander.mikhalitsyn@virtuozzo.com>
> > > > ---
> > > >  fs/autofs/dev-ioctl.c | 127
> > > > +++++++++++++++++++++++++++++++++++++---
> > > > --
> > > >  fs/namespace.c        |  44 +++++++++++++++
> > > >  include/linux/mount.h |   5 ++
> > > >  3 files changed, 162 insertions(+), 14 deletions(-)
> > > > 
> > > > diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
> > > > index 5bf781ea6d67..55edd3eba8ce 100644
> > > > --- a/fs/autofs/dev-ioctl.c
> > > > +++ b/fs/autofs/dev-ioctl.c
> > > > @@ -10,6 +10,7 @@
> > > >  #include <linux/fdtable.h>
> > > >  #include <linux/magic.h>
> > > >  #include <linux/nospec.h>
> > > > +#include <linux/nsproxy.h>
> > > >  
> > > >  #include "autofs_i.h"
> > > >  
> > > > @@ -179,32 +180,130 @@ static int
> > > > autofs_dev_ioctl_protosubver(struct
> > > > file *fp,
> > > >  	return 0;
> > > >  }
> > > >  
> > > > +struct filter_autofs_data {
> > > > +	char *pathbuf;
> > > > +	const char *fpathname;
> > > > +	int (*test)(const struct path *path, void *data);
> > > > +	void *data;
> > > > +};
> > > > +
> > > > +static int filter_autofs(const struct path *path, void *p)
> > > > +{
> > > > +	struct filter_autofs_data *data = p;
> > > > +	char *name;
> > > > +	int err;
> > > > +
> > > > +	if (path->mnt->mnt_sb->s_magic != AUTOFS_SUPER_MAGIC)
> > > > +		return 0;
> > > > +
> > > > +	name = d_path(path, data->pathbuf, PATH_MAX);
> > > > +	if (IS_ERR(name)) {
> > > > +		err = PTR_ERR(name);
> > > > +		pr_err("d_path failed, errno %d\n", err);
> > > > +		return 0;
> > > > +	}
> > > > +
> > > > +	if (strncmp(data->fpathname, name, PATH_MAX))
> > > > +		return 0;
> > > > +
> > > > +	if (!data->test(path, data->data))
> > > > +		return 0;
> > > > +
> > > > +	return 1;
> > > > +}
> > > > +
> > > >  /* Find the topmost mount satisfying test() */
> > > >  static int find_autofs_mount(const char *pathname,
> > > >  			     struct path *res,
> > > >  			     int test(const struct path *path, void
> > > > *data),
> > > >  			     void *data)
> > > >  {
> > > > -	struct path path;
> > > > +	struct filter_autofs_data mdata = {
> > > > +		.pathbuf = NULL,
> > > > +		.test = test,
> > > > +		.data = data,
> > > > +	};
> > > > +	struct mnt_namespace *mnt_ns = current->nsproxy->mnt_ns;
> > > > +	struct path path = {};
> > > > +	char *fpathbuf = NULL;
> > > >  	int err;
> > > >  
> > > > +	/*
> > > > +	 * In most cases user will provide full path to autofs mount
> > > > point
> > > > +	 * as it is in /proc/X/mountinfo. But if not, then we need to
> > > > +	 * open provided relative path and calculate full path.
> > > > +	 * It will not work in case when parent mount of autofs mount
> > > > +	 * is overmounted:
> > > > +	 * cd /root
> > > > +	 * ./autofs_mount /root/autofs_yard/mnt
> > > > +	 * mount -t tmpfs tmpfs /root/autofs_yard/mnt
> > > > +	 * mount -t tmpfs tmpfs /root/autofs_yard
> > > > +	 * ./call_ioctl /root/autofs_yard/mnt <- all fine here because
> > > > we
> > > > +	 * 					 have full path and
> > > > don't
> > > > +	 * 					 need to call
> > > > kern_path()
> > > > +	 * 					 and d_path()
> > > > +	 * ./call_ioctl autofs_yard/mnt <- will fail because
> > > > kern_path()
> > > > +	 * 				   can't lookup
> > > > /root/autofs_yard/mnt
> > > > +	 * 				   (/root/autofs_yard
> > > > directory is
> > > > +	 * 				    empty)
> > > > +	 *
> > > > +	 * TO DISCUSS: we can write special algorithm for relative path
> > > > case
> > > > +	 * by getting cwd path combining it with relative path from
> > > > user. But
> > > > +	 * is it worth it? User also may use paths with symlinks in
> > > > components
> > > > +	 * of path.
> > > > +	 *
> > > > +	 */
> > > >  	err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path);
> > > > -	if (err)
> > > > -		return err;
> > > > -	err = -ENOENT;
> > > > -	while (path.dentry == path.mnt->mnt_root) {
> > > > -		if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) {
> > > > -			if (test(&path, data)) {
> > > > -				path_get(&path);
> > > > -				*res = path;
> > > > -				err = 0;
> > > > -				break;
> > > > -			}
> > > > +	if (err) {
> > > > +		if (pathname[0] == '/') {
> > > > +			/*
> > > > +			 * pathname looks like full path let's try to
> > > > use it
> > > > +			 * as it is when searching autofs mount
> > > > +			 */
> > > > +			mdata.fpathname = pathname;
> > > > +			err = 0;
> > > > +			pr_debug("kern_path failed on %s, errno %d.
> > > > Will use path as it is to search mount\n",
> > > > +				 pathname, err);
> > > > +		} else {
> > > > +			pr_err("kern_path failed on %s, errno %d\n",
> > > > +			       pathname, err);
> > > > +			return err;
> > > > +		}
> > > > +	} else {
> > > > +		pr_debug("find_autofs_mount: let's resolve full path
> > > > %s\n",
> > > > +			 pathname);
> > > > +
> > > > +		fpathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> > > > +		if (!fpathbuf) {
> > > > +			err = -ENOMEM;
> > > > +			goto err;
> > > > +		}
> > > > +
> > > > +		/*
> > > > +		 * We have pathname from user but it may be relative,
> > > > we need to
> > > > +		 * have full path because we want to compare it with
> > > > mountpoints
> > > > +		 * paths later.
> > > > +		 */
> > > > +		mdata.fpathname = d_path(&path, fpathbuf, PATH_MAX);
> > > > +		if (IS_ERR(mdata.fpathname)) {
> > > > +			err = PTR_ERR(mdata.fpathname);
> > > > +			pr_err("d_path failed, errno %d\n", err);
> > > > +			goto err;
> > > >  		}
> > > > -		if (!follow_up(&path))
> > > > -			break;
> > > >  	}
> > > > +
> > > > +	mdata.pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> > > > +	if (!mdata.pathbuf) {
> > > > +		err = -ENOMEM;
> > > > +		goto err;
> > > > +	}
> > > > +
> > > > +	err = lookup_mount_path(mnt_ns, res, filter_autofs, &mdata);
> > > > +
> > > > +err:
> > > >  	path_put(&path);
> > > > +	kfree(fpathbuf);
> > > > +	kfree(mdata.pathbuf);
> > > >  	return err;
> > > >  }
> > > >  
> > > > diff --git a/fs/namespace.c b/fs/namespace.c
> > > > index 56bb5a5fdc0d..e1d006dbdfe2 100644
> > > > --- a/fs/namespace.c
> > > > +++ b/fs/namespace.c
> > > > @@ -1367,6 +1367,50 @@ void mnt_cursor_del(struct mnt_namespace
> > > > *ns,
> > > > struct mount *cursor)
> > > >  }
> > > >  #endif  /* CONFIG_PROC_FS */
> > > >  
> > > > +/**
> > > > + * lookup_mount_path - traverse all mounts in mount namespace
> > > > + *                     and filter using test() probe callback
> > > > + * As a result struct path will be provided.
> > > > + * @ns: root of mount tree
> > > > + * @res: struct path pointer where resulting path will be
> > > > written
> > > > + * @test: filter callback
> > > > + * @data: will be provided as argument to test() callback
> > > > + *
> > > > + */
> > > > +int lookup_mount_path(struct mnt_namespace *ns,
> > > > +		      struct path *res,
> > > > +		      int test(const struct path *mnt, void *data),
> > > > +		      void *data)
> > > > +{
> > > > +	struct mount *mnt;
> > > > +	int err = -ENOENT;
> > > > +
> > > > +	down_read(&namespace_sem);
> > > > +	lock_ns_list(ns);
> > > > +	list_for_each_entry(mnt, &ns->list, mnt_list) {
> > > > +		struct path tmppath;
> > > > +
> > > > +		if (mnt_is_cursor(mnt))
> > > > +			continue;
> > > > +
> > > > +		tmppath.dentry = mnt->mnt.mnt_root;
> > > > +		tmppath.mnt = &mnt->mnt;
> > > > +
> > > > +		if (test(&tmppath, data)) {
> > > > +			path_get(&tmppath);
> > > > +			*res = tmppath;
> > > > +			err = 0;
> > > > +			break;
> > > > +		}
> > > > +	}
> > > > +	unlock_ns_list(ns);
> > > > +	up_read(&namespace_sem);
> > > > +
> > > > +	return err;
> > > > +}
> > > > +
> > > > +EXPORT_SYMBOL(lookup_mount_path);
> > > > +
> > > >  /**
> > > >   * may_umount_tree - check if a mount tree is busy
> > > >   * @mnt: root of mount tree
> > > > diff --git a/include/linux/mount.h b/include/linux/mount.h
> > > > index 5d92a7e1a742..a79e6392e38e 100644
> > > > --- a/include/linux/mount.h
> > > > +++ b/include/linux/mount.h
> > > > @@ -118,6 +118,11 @@ extern unsigned int sysctl_mount_max;
> > > >  
> > > >  extern bool path_is_mountpoint(const struct path *path);
> > > >  
> > > > +extern int lookup_mount_path(struct mnt_namespace *ns,
> > > > +			     struct path *res,
> > > > +			     int test(const struct path *mnt, void
> > > > *data),
> > > > +			     void *data);
> > > > +
> > > >  extern void kern_unmount_array(struct vfsmount *mnt[], unsigned
> > > > int
> > > > num);
> > > >  
> > > >  #endif /* _LINUX_MOUNT_H */
>
Ian Kent March 6, 2021, 9:13 a.m. UTC | #5
On Fri, 2021-03-05 at 14:55 +0300, Alexander Mikhalitsyn wrote:
> On Fri, 05 Mar 2021 18:10:02 +0800
> Ian Kent <raven@themaw.net> wrote:
> 
> > On Thu, 2021-03-04 at 13:11 +0300, Alexander Mikhalitsyn wrote:
> > > On Thu, 04 Mar 2021 14:54:11 +0800
> > > Ian Kent <raven@themaw.net> wrote:
> > > 
> > > > On Wed, 2021-03-03 at 18:28 +0300, Alexander Mikhalitsyn wrote:
> > > > > It was discovered that find_autofs_mount() function
> > > > > in autofs not support cases when autofs mount
> > > > > parent is overmounted. In this case this function will
> > > > > always return -ENOENT.
> > > > 
> > > > Ok, I get this shouldn't happen.
> > > > 
> > > > > Real-life reproducer is fairly simple.
> > > > > Consider the following mounts on root mntns:
> > > > > --
> > > > > 35 24 0:36 / /proc/sys/fs/binfmt_misc ... shared:16 - autofs
> > > > > systemd-
> > > > > 1 ...
> > > > > 654 35 0:57 / /proc/sys/fs/binfmt_misc ... shared:322 -
> > > > > binfmt_misc
> > > > > ...
> > > > > --
> > > > > and some process which calls
> > > > > ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT)
> > > > > $ unshare -m -p --fork --mount-proc ./process-bin
> > > > > 
> > > > > Due to "mount-proc" /proc will be overmounted and
> > > > > ioctl() will fail with -ENOENT
> > > > 
> > > > I think I need a better explanation ...
> > > 
> > > Thank you for the quick reply, Ian.
> > > I'm sorry If my patch description was not sufficiently clear and
> > > detailed.
> > > 
> > > That problem connected with CRIU (Checkpoint-Restore in
> > > Userspace)
> > > project.
> > > In CRIU we have support of autofs mounts C/R. To acheive that we
> > > need
> > > to use
> > > ioctl's from /dev/autofs to get data about mounts, restore mount
> > > as
> > > catatonic
> > > (if needed), change pipe fd and so on. But the problem is that
> > > during
> > > CRIU
> > > dump we may meet situation when VFS subtree where autofs mount
> > > present was
> > > overmounted as whole.
> > > 
> > > Simpliest example is /proc/sys/fs/binfmt_misc. This mount present
> > > on
> > > most
> > > GNU/Linux distributions by default. For instance on my Fedora 33:
> > 
> > Yes, I don't know why systemd uses this direct mount, there must
> > have been a reason for it.
> > 
> > > trigger automount of binfmt_misc
> > > $ ls /proc/sys/fs/binfmt_misc
> > > 
> > > $ cat /proc/1/mountinfo | grep binfmt
> > > 35 24 0:36 / /proc/sys/fs/binfmt_misc rw,relatime shared:16 -
> > > autofs
> > > systemd-1 rw,...,direct,pipe_ino=223
> > > 632 35 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime shared:315
> > > -
> > > binfmt_misc binfmt_misc rw
> > 
> > Yes, I think this looks normal.
> > 
> > > $ sudo unshare -m -p --fork --mount-proc sh
> > > # cat /proc/self/mountinfo | grep "/proc"
> > > 828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc proc
> > > rw
> > > 829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs
> > > systemd-
> > > 1 rw,...,direct,pipe_ino=223
> > > 943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime -
> > > binfmt_misc
> > > binfmt_misc rw
> > > 949 828 0:57 / /proc rw...,relatime - proc proc rw
> > 
> > Isn't this screwed up, /proc is on top of the binfmt_misc mount ...
> > 
> > Is this what's seen from the root namespace?
> 
> No-no, after issuing
> $ sudo unshare -m -p --fork --mount-proc sh
> 
> we enter to the pid+mount namespace and:
> 
> # cat /proc/self/mountinfo | grep "/proc"
> 
> So, it's picture from inside namespaces.

Ok, so potentially some of those have been propagated from the
original mount namespace.

It seems to me the sensible thing would be those mounts would
not propagate when a new proc has been requested. It doesn't
make sense to me to carry around mounts that are not accessible
because of something requested by the mount namespace creator.

But that's nothing new and isn't likely to change any time soon.

> 
> > > As we can see now autofs mount /proc/sys/fs/binfmt_misc is
> > > inaccessible.
> > > If we do something like:
> > > 
> > > struct autofs_dev_ioctl *param;
> > > param = malloc(...);
> > > devfd = open("/dev/autofs", O_RDONLY);
> > > init_autofs_dev_ioctl(param);
> > > param->size = size;
> > > strcpy(param->path, "/proc/sys/fs/binfmt_misc");
> > > param->openmount.devid = 36;
> > > err = ioctl(devfd, AUTOFS_DEV_IOCTL_OPENMOUNT, param)
> > > 
> > > now we get err = -ENOENT.
> > 
> > Maybe that should be EINVAL, not sure about cases though.
> 
> in current version -ENOENT is returned in this particular case
> 
> > > > What's being said here?
> > > > 
> > > > For a start your talking about direct mounts, I'm pretty sure
> > > > this
> > > > use case can't occur with indirect mounts in the sense that the
> > > > indirect mount base should/must never be over mounted and IIRC
> > > > that
> > > > base can't be /proc (but maybe that's just mounts inside proc
> > > > ...),
> > > > can't remember now but from a common sense POV an indirect
> > > > mount
> > > > won't/can't be on /proc.
> > > > 
> > > > And why is this ioctl be called?
> > > 
> > > We call this ioctl during criu dump stage to open fd from autofs
> > > mount dentry. This fd is used later to call
> > > ioctl(AUTOFS_IOC_CATATONIC)
> > > (we do that on criu dump if we see that control process of autofs
> > > mount
> > > is dead or pipe is dead).
> > 
> > Right so your usage "is" the way it's intended, ;)
> 
> That's good! ;)
> 
> > > > If the mount is over mounted should that prevent expiration of
> > > > the
> > > > over mounted /proc anyway, so maybe the return is correct ...
> > > > or
> > > > not ...
> > > 
> > > I agree that case with overmounted subtree with autofs mount is
> > > weird
> > > case.
> > > But it may be easily created by user and we in CRIU try to handle
> > > that.
> > 
> > I'm not yet ready to make a call on how I think this this should
> > be done.
> > 
> > Since you seem to be clear on what this should be used for I'll
> > need to look more closely at the patch.
> > 
> > But, at first glance, it looked like it would break the existing
> > function of the ioctl.
> > 
> > Could you explain how the patch works, in particular why it doesn't
> > break the existing functionality.
> 
> Sure. With pleasure. Idea of patch is naive:
> 1. find_autofs_mount() function called only from syscall context, so,
> we always can determine current mount namespace of caller.
> So, I've introduced
> 
> > > > > + int lookup_mount_path(struct mnt_namespace *ns,
> > > > > +		      struct path *res,
> > > > > +		      int test(const struct path *mnt, void
> > > > > *data),
> > > > > +		      void *data)
> 
> lookup_mount_path() helper function, which allows to traverse mounts
> list of
> mount namespace and find proper autofs mount by user-provided helper
> test().
> 
> 2. Helper function is fairly simple:
> a) it checks that mount is autofs mount (by magic number on
> superblock)
> b) it calculates full path to mount point of each mount in mount
> namespace
> and compare it with path which user was provided to the
> ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT)
> parameters.

Oh right, it's using the mounts list, it isn't a path walk oriented
search. That's probably why I didn't see the expected follow call.

Another problem with the existing code is it will get it wrong if
there is more than one autofs mount in the stack. For example an
autofs submount mounted on a direct mount which is rare and not
all that sensible but possible.

So, if we change this, there will need to be some agreed policy
about which mount would is selected.

The originally code (long before what is there now) selected the
lowest mount in the stack because this mechanism is only needed
for direct mounts and, as long as there's not something seriously
wrong, that is the mount you would need. That's what would be needed
for the case above and I think it's what's needed in your case too.

I still haven't yet looked closely at your change, I'll need to do
that.

> 
> Problem here is case when user provided relative path in ioctl
> parameters
> (struct autofs_dev_ioctl). In this case we may fail to resolve user
> provided path to
> struct path. For instance:

I don't like the idea of allowing a relative path as an a parameter
at all. I think that should be a failure case.

These direct mounts come from a table (a map in autofs or a unit
in systemd) and they should always be identified by that full path.

Ian

> 
> # cat /proc/self/mountinfo | grep "/proc"
> 828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
> 829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs systemd-
> 1 rw,...,direct,pipe_ino=223
> 943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime - binfmt_misc
> binfmt_misc rw
> 949 828 0:57 / /proc rw...,relatime - proc proc rw
> 
> in this case 
> kern_path("/proc/sys/fs/binfmt_misc", LOOKUP_MOUNTPOINT, &path) ==
> -ENOENT
> 
> To overcome this issue, if kern_path() failed with -ENOENT
> AND user-provided mount path looks like fullpath (starts from /)
> we just try to find autofs mount in mounts list just by searching
> autofs mounts in mounts list with mount point path equal to user-
> provided
> path. This covers our problem case.
> 
> This patch is fully compatible with old behaviour - if parent mounts
> of
> autofs mount is not overmounted - then
> kern_path("/proc/sys/fs/binfmt_misc", LOOKUP_MOUNTPOINT, &path)
> will not fail, and we also easily find needed autofs mount in mounts
> list
> of caller mount namespace.
> 
> > Long ago I'm pretty sure I continued to follow up but IIRC that
> > went away and was replaced by a single follow_up(), but since
> > the changes didn't break the existing function of autofs I
> > didn't pay that much attention to them, I'll need to look at
> > that too. Not only that, the namespace code has moved a long
> > way too however there's still little attention given to
> > sanitizing the mounts in the new namespace by anything that I'm
> > aware of that uses the feature. TBH I'm not sure why I don't
> > see a lot more problems of that nature.
> > 
> > I have to wonder if what's needed is attention to the follow up
> > but that /proc covering the earlier mounts is a bit of a concern.
> > 
> > > > I get that the mount namespaces should be independent and
> > > > intuitively
> > > > this is a bug but what is the actual use and expected result.
> > > > 
> > > > But anyway, aren't you saying that the VFS path walk isn't
> > > > handling
> > > > mount namespaces properly or are you saying that a process
> > > > outside
> > > > this new mount namespace becomes broken because of it?
> > > 
> > > No-no, it's only about opening autofs mount by device id + path.
> > 
> > That's right, specifically getting a file handle to a covered
> > autofs
> > mount for things like bringing it back to life etc. But that
> > silently
> > implies the same mount namespace.
> > 
> > Let me look at the patch and think about it a bit.
> > I'll probably need to run some tests too.
> > I am a little busy right now so it may take a bit of time.
> > 
> > Ian 
> 
> Thank you very much for your attention to the patch and comments.
> 
> Regards,
> Alex
> 
> > > > Either way the solution looks more complicated than I'd expect
> > > > so
> > > > some explanation along these lines would be good.
> > > > 
> > > > Ian
> > > 
> > > Regards,
> > > Alex
> > > 
> > > > > Cc: Matthew Wilcox <willy@infradead.org>
> > > > > Cc: Al Viro <viro@zeniv.linux.org.uk>
> > > > > Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> > > > > Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
> > > > > Cc: autofs@vger.kernel.org
> > > > > Cc: linux-kernel@vger.kernel.org
> > > > > Signed-off-by: Alexander Mikhalitsyn <
> > > > > alexander.mikhalitsyn@virtuozzo.com>
> > > > > ---
> > > > >  fs/autofs/dev-ioctl.c | 127
> > > > > +++++++++++++++++++++++++++++++++++++---
> > > > > --
> > > > >  fs/namespace.c        |  44 +++++++++++++++
> > > > >  include/linux/mount.h |   5 ++
> > > > >  3 files changed, 162 insertions(+), 14 deletions(-)
> > > > > 
> > > > > diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
> > > > > index 5bf781ea6d67..55edd3eba8ce 100644
> > > > > --- a/fs/autofs/dev-ioctl.c
> > > > > +++ b/fs/autofs/dev-ioctl.c
> > > > > @@ -10,6 +10,7 @@
> > > > >  #include <linux/fdtable.h>
> > > > >  #include <linux/magic.h>
> > > > >  #include <linux/nospec.h>
> > > > > +#include <linux/nsproxy.h>
> > > > >  
> > > > >  #include "autofs_i.h"
> > > > >  
> > > > > @@ -179,32 +180,130 @@ static int
> > > > > autofs_dev_ioctl_protosubver(struct
> > > > > file *fp,
> > > > >  	return 0;
> > > > >  }
> > > > >  
> > > > > +struct filter_autofs_data {
> > > > > +	char *pathbuf;
> > > > > +	const char *fpathname;
> > > > > +	int (*test)(const struct path *path, void *data);
> > > > > +	void *data;
> > > > > +};
> > > > > +
> > > > > +static int filter_autofs(const struct path *path, void *p)
> > > > > +{
> > > > > +	struct filter_autofs_data *data = p;
> > > > > +	char *name;
> > > > > +	int err;
> > > > > +
> > > > > +	if (path->mnt->mnt_sb->s_magic != AUTOFS_SUPER_MAGIC)
> > > > > +		return 0;
> > > > > +
> > > > > +	name = d_path(path, data->pathbuf, PATH_MAX);
> > > > > +	if (IS_ERR(name)) {
> > > > > +		err = PTR_ERR(name);
> > > > > +		pr_err("d_path failed, errno %d\n", err);
> > > > > +		return 0;
> > > > > +	}
> > > > > +
> > > > > +	if (strncmp(data->fpathname, name, PATH_MAX))
> > > > > +		return 0;
> > > > > +
> > > > > +	if (!data->test(path, data->data))
> > > > > +		return 0;
> > > > > +
> > > > > +	return 1;
> > > > > +}
> > > > > +
> > > > >  /* Find the topmost mount satisfying test() */
> > > > >  static int find_autofs_mount(const char *pathname,
> > > > >  			     struct path *res,
> > > > >  			     int test(const struct path *path,
> > > > > void
> > > > > *data),
> > > > >  			     void *data)
> > > > >  {
> > > > > -	struct path path;
> > > > > +	struct filter_autofs_data mdata = {
> > > > > +		.pathbuf = NULL,
> > > > > +		.test = test,
> > > > > +		.data = data,
> > > > > +	};
> > > > > +	struct mnt_namespace *mnt_ns = current->nsproxy-
> > > > > >mnt_ns;
> > > > > +	struct path path = {};
> > > > > +	char *fpathbuf = NULL;
> > > > >  	int err;
> > > > >  
> > > > > +	/*
> > > > > +	 * In most cases user will provide full path to autofs
> > > > > mount
> > > > > point
> > > > > +	 * as it is in /proc/X/mountinfo. But if not, then we
> > > > > need to
> > > > > +	 * open provided relative path and calculate full path.
> > > > > +	 * It will not work in case when parent mount of autofs
> > > > > mount
> > > > > +	 * is overmounted:
> > > > > +	 * cd /root
> > > > > +	 * ./autofs_mount /root/autofs_yard/mnt
> > > > > +	 * mount -t tmpfs tmpfs /root/autofs_yard/mnt
> > > > > +	 * mount -t tmpfs tmpfs /root/autofs_yard
> > > > > +	 * ./call_ioctl /root/autofs_yard/mnt <- all fine here
> > > > > because
> > > > > we
> > > > > +	 * 					 have full
> > > > > path and
> > > > > don't
> > > > > +	 * 					 need to call
> > > > > kern_path()
> > > > > +	 * 					 and d_path()
> > > > > +	 * ./call_ioctl autofs_yard/mnt <- will fail because
> > > > > kern_path()
> > > > > +	 * 				   can't lookup
> > > > > /root/autofs_yard/mnt
> > > > > +	 * 				   (/root/autofs_yard
> > > > > directory is
> > > > > +	 * 				    empty)
> > > > > +	 *
> > > > > +	 * TO DISCUSS: we can write special algorithm for
> > > > > relative path
> > > > > case
> > > > > +	 * by getting cwd path combining it with relative path
> > > > > from
> > > > > user. But
> > > > > +	 * is it worth it? User also may use paths with
> > > > > symlinks in
> > > > > components
> > > > > +	 * of path.
> > > > > +	 *
> > > > > +	 */
> > > > >  	err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path);
> > > > > -	if (err)
> > > > > -		return err;
> > > > > -	err = -ENOENT;
> > > > > -	while (path.dentry == path.mnt->mnt_root) {
> > > > > -		if (path.dentry->d_sb->s_magic ==
> > > > > AUTOFS_SUPER_MAGIC) {
> > > > > -			if (test(&path, data)) {
> > > > > -				path_get(&path);
> > > > > -				*res = path;
> > > > > -				err = 0;
> > > > > -				break;
> > > > > -			}
> > > > > +	if (err) {
> > > > > +		if (pathname[0] == '/') {
> > > > > +			/*
> > > > > +			 * pathname looks like full path let's
> > > > > try to
> > > > > use it
> > > > > +			 * as it is when searching autofs mount
> > > > > +			 */
> > > > > +			mdata.fpathname = pathname;
> > > > > +			err = 0;
> > > > > +			pr_debug("kern_path failed on %s, errno
> > > > > %d.
> > > > > Will use path as it is to search mount\n",
> > > > > +				 pathname, err);
> > > > > +		} else {
> > > > > +			pr_err("kern_path failed on %s, errno
> > > > > %d\n",
> > > > > +			       pathname, err);
> > > > > +			return err;
> > > > > +		}
> > > > > +	} else {
> > > > > +		pr_debug("find_autofs_mount: let's resolve full
> > > > > path
> > > > > %s\n",
> > > > > +			 pathname);
> > > > > +
> > > > > +		fpathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> > > > > +		if (!fpathbuf) {
> > > > > +			err = -ENOMEM;
> > > > > +			goto err;
> > > > > +		}
> > > > > +
> > > > > +		/*
> > > > > +		 * We have pathname from user but it may be
> > > > > relative,
> > > > > we need to
> > > > > +		 * have full path because we want to compare it
> > > > > with
> > > > > mountpoints
> > > > > +		 * paths later.
> > > > > +		 */
> > > > > +		mdata.fpathname = d_path(&path, fpathbuf,
> > > > > PATH_MAX);
> > > > > +		if (IS_ERR(mdata.fpathname)) {
> > > > > +			err = PTR_ERR(mdata.fpathname);
> > > > > +			pr_err("d_path failed, errno %d\n",
> > > > > err);
> > > > > +			goto err;
> > > > >  		}
> > > > > -		if (!follow_up(&path))
> > > > > -			break;
> > > > >  	}
> > > > > +
> > > > > +	mdata.pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> > > > > +	if (!mdata.pathbuf) {
> > > > > +		err = -ENOMEM;
> > > > > +		goto err;
> > > > > +	}
> > > > > +
> > > > > +	err = lookup_mount_path(mnt_ns, res, filter_autofs,
> > > > > &mdata);
> > > > > +
> > > > > +err:
> > > > >  	path_put(&path);
> > > > > +	kfree(fpathbuf);
> > > > > +	kfree(mdata.pathbuf);
> > > > >  	return err;
> > > > >  }
> > > > >  
> > > > > diff --git a/fs/namespace.c b/fs/namespace.c
> > > > > index 56bb5a5fdc0d..e1d006dbdfe2 100644
> > > > > --- a/fs/namespace.c
> > > > > +++ b/fs/namespace.c
> > > > > @@ -1367,6 +1367,50 @@ void mnt_cursor_del(struct
> > > > > mnt_namespace
> > > > > *ns,
> > > > > struct mount *cursor)
> > > > >  }
> > > > >  #endif  /* CONFIG_PROC_FS */
> > > > >  
> > > > > +/**
> > > > > + * lookup_mount_path - traverse all mounts in mount
> > > > > namespace
> > > > > + *                     and filter using test() probe
> > > > > callback
> > > > > + * As a result struct path will be provided.
> > > > > + * @ns: root of mount tree
> > > > > + * @res: struct path pointer where resulting path will be
> > > > > written
> > > > > + * @test: filter callback
> > > > > + * @data: will be provided as argument to test() callback
> > > > > + *
> > > > > + */
> > > > > +int lookup_mount_path(struct mnt_namespace *ns,
> > > > > +		      struct path *res,
> > > > > +		      int test(const struct path *mnt, void
> > > > > *data),
> > > > > +		      void *data)
> > > > > +{
> > > > > +	struct mount *mnt;
> > > > > +	int err = -ENOENT;
> > > > > +
> > > > > +	down_read(&namespace_sem);
> > > > > +	lock_ns_list(ns);
> > > > > +	list_for_each_entry(mnt, &ns->list, mnt_list) {
> > > > > +		struct path tmppath;
> > > > > +
> > > > > +		if (mnt_is_cursor(mnt))
> > > > > +			continue;
> > > > > +
> > > > > +		tmppath.dentry = mnt->mnt.mnt_root;
> > > > > +		tmppath.mnt = &mnt->mnt;
> > > > > +
> > > > > +		if (test(&tmppath, data)) {
> > > > > +			path_get(&tmppath);
> > > > > +			*res = tmppath;
> > > > > +			err = 0;
> > > > > +			break;
> > > > > +		}
> > > > > +	}
> > > > > +	unlock_ns_list(ns);
> > > > > +	up_read(&namespace_sem);
> > > > > +
> > > > > +	return err;
> > > > > +}
> > > > > +
> > > > > +EXPORT_SYMBOL(lookup_mount_path);
> > > > > +
> > > > >  /**
> > > > >   * may_umount_tree - check if a mount tree is busy
> > > > >   * @mnt: root of mount tree
> > > > > diff --git a/include/linux/mount.h b/include/linux/mount.h
> > > > > index 5d92a7e1a742..a79e6392e38e 100644
> > > > > --- a/include/linux/mount.h
> > > > > +++ b/include/linux/mount.h
> > > > > @@ -118,6 +118,11 @@ extern unsigned int sysctl_mount_max;
> > > > >  
> > > > >  extern bool path_is_mountpoint(const struct path *path);
> > > > >  
> > > > > +extern int lookup_mount_path(struct mnt_namespace *ns,
> > > > > +			     struct path *res,
> > > > > +			     int test(const struct path *mnt,
> > > > > void
> > > > > *data),
> > > > > +			     void *data);
> > > > > +
> > > > >  extern void kern_unmount_array(struct vfsmount *mnt[],
> > > > > unsigned
> > > > > int
> > > > > num);
> > > > >  
> > > > >  #endif /* _LINUX_MOUNT_H */
Al Viro March 7, 2021, 11:51 p.m. UTC | #6
On Thu, Mar 04, 2021 at 01:11:33PM +0300, Alexander Mikhalitsyn wrote:

> That problem connected with CRIU (Checkpoint-Restore in Userspace) project.
> In CRIU we have support of autofs mounts C/R. To acheive that we need to use
> ioctl's from /dev/autofs to get data about mounts, restore mount as catatonic
> (if needed), change pipe fd and so on. But the problem is that during CRIU
> dump we may meet situation when VFS subtree where autofs mount present was
> overmounted as whole.
> 
> Simpliest example is /proc/sys/fs/binfmt_misc. This mount present on most
> GNU/Linux distributions by default. For instance on my Fedora 33:
> 
> trigger automount of binfmt_misc
> $ ls /proc/sys/fs/binfmt_misc
> 
> $ cat /proc/1/mountinfo | grep binfmt
> 35 24 0:36 / /proc/sys/fs/binfmt_misc rw,relatime shared:16 - autofs systemd-1 rw,...,direct,pipe_ino=223
> 632 35 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime shared:315 - binfmt_misc binfmt_misc rw
> 
> $ sudo unshare -m -p --fork --mount-proc sh
> # cat /proc/self/mountinfo | grep "/proc"
> 828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
> 829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs systemd-1 rw,...,direct,pipe_ino=223
> 943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime - binfmt_misc binfmt_misc rw
> 949 828 0:57 / /proc rw...,relatime - proc proc rw
> 
> As we can see now autofs mount /proc/sys/fs/binfmt_misc is inaccessible.
> If we do something like:
> 
> struct autofs_dev_ioctl *param;
> param = malloc(...);
> devfd = open("/dev/autofs", O_RDONLY);
> init_autofs_dev_ioctl(param);
> param->size = size;
> strcpy(param->path, "/proc/sys/fs/binfmt_misc");
> param->openmount.devid = 36;
> err = ioctl(devfd, AUTOFS_DEV_IOCTL_OPENMOUNT, param)
> 
> now we get err = -ENOENT.

Where does that -ENOENT come from?  AFAICS, pathwalk ought to succeed and
return you the root of overmounting binfmt_misc.  Why doesn't the loop in
find_autofs_mount() locate anything it would accept?

I really dislike the patch - the whole "normalize path" thing is fundamentally
bogus, not to mention the iterator over all mounts, etc., so I would like to
understand what the hell is going on before even thinking of *not* NAKing
it on sight.
Al Viro March 8, 2021, 12:12 a.m. UTC | #7
On Sun, Mar 07, 2021 at 11:51:20PM +0000, Al Viro wrote:
> On Thu, Mar 04, 2021 at 01:11:33PM +0300, Alexander Mikhalitsyn wrote:
> 
> > That problem connected with CRIU (Checkpoint-Restore in Userspace) project.
> > In CRIU we have support of autofs mounts C/R. To acheive that we need to use
> > ioctl's from /dev/autofs to get data about mounts, restore mount as catatonic
> > (if needed), change pipe fd and so on. But the problem is that during CRIU
> > dump we may meet situation when VFS subtree where autofs mount present was
> > overmounted as whole.
> > 
> > Simpliest example is /proc/sys/fs/binfmt_misc. This mount present on most
> > GNU/Linux distributions by default. For instance on my Fedora 33:
> > 
> > trigger automount of binfmt_misc
> > $ ls /proc/sys/fs/binfmt_misc
> > 
> > $ cat /proc/1/mountinfo | grep binfmt
> > 35 24 0:36 / /proc/sys/fs/binfmt_misc rw,relatime shared:16 - autofs systemd-1 rw,...,direct,pipe_ino=223
> > 632 35 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime shared:315 - binfmt_misc binfmt_misc rw
> > 
> > $ sudo unshare -m -p --fork --mount-proc sh
> > # cat /proc/self/mountinfo | grep "/proc"
> > 828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
> > 829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs systemd-1 rw,...,direct,pipe_ino=223
> > 943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime - binfmt_misc binfmt_misc rw
> > 949 828 0:57 / /proc rw...,relatime - proc proc rw
> > 
> > As we can see now autofs mount /proc/sys/fs/binfmt_misc is inaccessible.
> > If we do something like:
> > 
> > struct autofs_dev_ioctl *param;
> > param = malloc(...);
> > devfd = open("/dev/autofs", O_RDONLY);
> > init_autofs_dev_ioctl(param);
> > param->size = size;
> > strcpy(param->path, "/proc/sys/fs/binfmt_misc");
> > param->openmount.devid = 36;
> > err = ioctl(devfd, AUTOFS_DEV_IOCTL_OPENMOUNT, param)
> > 
> > now we get err = -ENOENT.
> 
> Where does that -ENOENT come from?  AFAICS, pathwalk ought to succeed and
> return you the root of overmounting binfmt_misc.  Why doesn't the loop in
> find_autofs_mount() locate anything it would accept?
> 
> I really dislike the patch - the whole "normalize path" thing is fundamentally
> bogus, not to mention the iterator over all mounts, etc., so I would like to
> understand what the hell is going on before even thinking of *not* NAKing
> it on sight.

	Wait, so you have /proc overmounted, without anything autofs-related on
/proc/sys/fs/binfmt_misc and still want to have the pathname resolved, just
because it would've resolved with that overmount of /proc removed?

	I hope I'm misreading you; in case I'm not, the ABI is extremely
tasteless and until you manage to document the exact semantics you want
for param->path, consider it NAKed.
Al Viro March 8, 2021, 12:28 a.m. UTC | #8
On Mon, Mar 08, 2021 at 12:12:22AM +0000, Al Viro wrote:

> 	Wait, so you have /proc overmounted, without anything autofs-related on
> /proc/sys/fs/binfmt_misc and still want to have the pathname resolved, just
> because it would've resolved with that overmount of /proc removed?
> 
> 	I hope I'm misreading you; in case I'm not, the ABI is extremely
> tasteless and until you manage to document the exact semantics you want
> for param->path, consider it NAKed.

	BTW, if that thing would be made to work, what's to stop somebody from
doing ...at() syscalls with the resulting fd as a starting point and pathnames
starting with ".."?  "/foo is overmounted, but we can get to anything under
/foo/bar/ in the underlying tree since there's an autofs mount somewhere in
/foo/bar/splat/puke/*"?

	IOW, the real question (aside of "WTF?") is what are you using the
resulting descriptor for and what do you need to be able to do with it.
Details, please.
Alexander Mikhalitsyn March 9, 2021, 10:43 a.m. UTC | #9
On Sat, 06 Mar 2021 17:13:32 +0800
Ian Kent <raven@themaw.net> wrote:

> On Fri, 2021-03-05 at 14:55 +0300, Alexander Mikhalitsyn wrote:
> > On Fri, 05 Mar 2021 18:10:02 +0800
> > Ian Kent <raven@themaw.net> wrote:
> > 
> > > On Thu, 2021-03-04 at 13:11 +0300, Alexander Mikhalitsyn wrote:
> > > > On Thu, 04 Mar 2021 14:54:11 +0800
> > > > Ian Kent <raven@themaw.net> wrote:
> > > > 
> > > > > On Wed, 2021-03-03 at 18:28 +0300, Alexander Mikhalitsyn wrote:
> > > > > > It was discovered that find_autofs_mount() function
> > > > > > in autofs not support cases when autofs mount
> > > > > > parent is overmounted. In this case this function will
> > > > > > always return -ENOENT.
> > > > > 
> > > > > Ok, I get this shouldn't happen.
> > > > > 
> > > > > > Real-life reproducer is fairly simple.
> > > > > > Consider the following mounts on root mntns:
> > > > > > --
> > > > > > 35 24 0:36 / /proc/sys/fs/binfmt_misc ... shared:16 - autofs
> > > > > > systemd-
> > > > > > 1 ...
> > > > > > 654 35 0:57 / /proc/sys/fs/binfmt_misc ... shared:322 -
> > > > > > binfmt_misc
> > > > > > ...
> > > > > > --
> > > > > > and some process which calls
> > > > > > ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT)
> > > > > > $ unshare -m -p --fork --mount-proc ./process-bin
> > > > > > 
> > > > > > Due to "mount-proc" /proc will be overmounted and
> > > > > > ioctl() will fail with -ENOENT
> > > > > 
> > > > > I think I need a better explanation ...
> > > > 
> > > > Thank you for the quick reply, Ian.
> > > > I'm sorry If my patch description was not sufficiently clear and
> > > > detailed.
> > > > 
> > > > That problem connected with CRIU (Checkpoint-Restore in
> > > > Userspace)
> > > > project.
> > > > In CRIU we have support of autofs mounts C/R. To acheive that we
> > > > need
> > > > to use
> > > > ioctl's from /dev/autofs to get data about mounts, restore mount
> > > > as
> > > > catatonic
> > > > (if needed), change pipe fd and so on. But the problem is that
> > > > during
> > > > CRIU
> > > > dump we may meet situation when VFS subtree where autofs mount
> > > > present was
> > > > overmounted as whole.
> > > > 
> > > > Simpliest example is /proc/sys/fs/binfmt_misc. This mount present
> > > > on
> > > > most
> > > > GNU/Linux distributions by default. For instance on my Fedora 33:
> > > 
> > > Yes, I don't know why systemd uses this direct mount, there must
> > > have been a reason for it.
> > > 
> > > > trigger automount of binfmt_misc
> > > > $ ls /proc/sys/fs/binfmt_misc
> > > > 
> > > > $ cat /proc/1/mountinfo | grep binfmt
> > > > 35 24 0:36 / /proc/sys/fs/binfmt_misc rw,relatime shared:16 -
> > > > autofs
> > > > systemd-1 rw,...,direct,pipe_ino=223
> > > > 632 35 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime shared:315
> > > > -
> > > > binfmt_misc binfmt_misc rw
> > > 
> > > Yes, I think this looks normal.
> > > 
> > > > $ sudo unshare -m -p --fork --mount-proc sh
> > > > # cat /proc/self/mountinfo | grep "/proc"
> > > > 828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc proc
> > > > rw
> > > > 829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs
> > > > systemd-
> > > > 1 rw,...,direct,pipe_ino=223
> > > > 943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime -
> > > > binfmt_misc
> > > > binfmt_misc rw
> > > > 949 828 0:57 / /proc rw...,relatime - proc proc rw
> > > 
> > > Isn't this screwed up, /proc is on top of the binfmt_misc mount ...
> > > 
> > > Is this what's seen from the root namespace?
> > 
> > No-no, after issuing
> > $ sudo unshare -m -p --fork --mount-proc sh
> > 
> > we enter to the pid+mount namespace and:
> > 
> > # cat /proc/self/mountinfo | grep "/proc"
> > 
> > So, it's picture from inside namespaces.
> 
> Ok, so potentially some of those have been propagated from the
> original mount namespace.
> 
> It seems to me the sensible thing would be those mounts would
> not propagate when a new proc has been requested. It doesn't
> make sense to me to carry around mounts that are not accessible
> because of something requested by the mount namespace creator.
> 
> But that's nothing new and isn't likely to change any time soon.
> 
> > 
> > > > As we can see now autofs mount /proc/sys/fs/binfmt_misc is
> > > > inaccessible.
> > > > If we do something like:
> > > > 
> > > > struct autofs_dev_ioctl *param;
> > > > param = malloc(...);
> > > > devfd = open("/dev/autofs", O_RDONLY);
> > > > init_autofs_dev_ioctl(param);
> > > > param->size = size;
> > > > strcpy(param->path, "/proc/sys/fs/binfmt_misc");
> > > > param->openmount.devid = 36;
> > > > err = ioctl(devfd, AUTOFS_DEV_IOCTL_OPENMOUNT, param)
> > > > 
> > > > now we get err = -ENOENT.
> > > 
> > > Maybe that should be EINVAL, not sure about cases though.
> > 
> > in current version -ENOENT is returned in this particular case
> > 
> > > > > What's being said here?
> > > > > 
> > > > > For a start your talking about direct mounts, I'm pretty sure
> > > > > this
> > > > > use case can't occur with indirect mounts in the sense that the
> > > > > indirect mount base should/must never be over mounted and IIRC
> > > > > that
> > > > > base can't be /proc (but maybe that's just mounts inside proc
> > > > > ...),
> > > > > can't remember now but from a common sense POV an indirect
> > > > > mount
> > > > > won't/can't be on /proc.
> > > > > 
> > > > > And why is this ioctl be called?
> > > > 
> > > > We call this ioctl during criu dump stage to open fd from autofs
> > > > mount dentry. This fd is used later to call
> > > > ioctl(AUTOFS_IOC_CATATONIC)
> > > > (we do that on criu dump if we see that control process of autofs
> > > > mount
> > > > is dead or pipe is dead).
> > > 
> > > Right so your usage "is" the way it's intended, ;)
> > 
> > That's good! ;)
> > 
> > > > > If the mount is over mounted should that prevent expiration of
> > > > > the
> > > > > over mounted /proc anyway, so maybe the return is correct ...
> > > > > or
> > > > > not ...
> > > > 
> > > > I agree that case with overmounted subtree with autofs mount is
> > > > weird
> > > > case.
> > > > But it may be easily created by user and we in CRIU try to handle
> > > > that.
> > > 
> > > I'm not yet ready to make a call on how I think this this should
> > > be done.
> > > 
> > > Since you seem to be clear on what this should be used for I'll
> > > need to look more closely at the patch.
> > > 
> > > But, at first glance, it looked like it would break the existing
> > > function of the ioctl.
> > > 
> > > Could you explain how the patch works, in particular why it doesn't
> > > break the existing functionality.
> > 
> > Sure. With pleasure. Idea of patch is naive:
> > 1. find_autofs_mount() function called only from syscall context, so,
> > we always can determine current mount namespace of caller.
> > So, I've introduced
> > 
> > > > > > + int lookup_mount_path(struct mnt_namespace *ns,
> > > > > > +		      struct path *res,
> > > > > > +		      int test(const struct path *mnt, void
> > > > > > *data),
> > > > > > +		      void *data)
> > 
> > lookup_mount_path() helper function, which allows to traverse mounts
> > list of
> > mount namespace and find proper autofs mount by user-provided helper
> > test().
> > 
> > 2. Helper function is fairly simple:
> > a) it checks that mount is autofs mount (by magic number on
> > superblock)
> > b) it calculates full path to mount point of each mount in mount
> > namespace
> > and compare it with path which user was provided to the
> > ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT)
> > parameters.
> 
> Oh right, it's using the mounts list, it isn't a path walk oriented
> search. That's probably why I didn't see the expected follow call.
> 
> Another problem with the existing code is it will get it wrong if
> there is more than one autofs mount in the stack. For example an
> autofs submount mounted on a direct mount which is rare and not
> all that sensible but possible.

Oh. :(

> 
> So, if we change this, there will need to be some agreed policy
> about which mount would is selected.

Sure. I'm ready to write some ltp (?) tests which will cover this
autofs ioctl and all usecases.

> 
> The originally code (long before what is there now) selected the
> lowest mount in the stack because this mechanism is only needed
> for direct mounts and, as long as there's not something seriously
> wrong, that is the mount you would need. That's what would be needed
> for the case above and I think it's what's needed in your case too.
> 
> I still haven't yet looked closely at your change, I'll need to do
> that.
> 
> > 
> > Problem here is case when user provided relative path in ioctl
> > parameters
> > (struct autofs_dev_ioctl). In this case we may fail to resolve user
> > provided path to
> > struct path. For instance:
> 
> I don't like the idea of allowing a relative path as an a parameter
> at all. I think that should be a failure case.

I'm too, and I've even thought about to restrict use relative paths here,
but actual kernel code allows that, so is it will be okay to discard compatibility here?
I know that autofs daemon implementation uses full paths and if we restrict relative paths here
it will not break systemd/automount daemons.

> 
> These direct mounts come from a table (a map in autofs or a unit
> in systemd) and they should always be identified by that full path.
> 
> Ian

Thank you!

Regards,
Alex

> 
> > 
> > # cat /proc/self/mountinfo | grep "/proc"
> > 828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
> > 829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs systemd-
> > 1 rw,...,direct,pipe_ino=223
> > 943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime - binfmt_misc
> > binfmt_misc rw
> > 949 828 0:57 / /proc rw...,relatime - proc proc rw
> > 
> > in this case 
> > kern_path("/proc/sys/fs/binfmt_misc", LOOKUP_MOUNTPOINT, &path) ==
> > -ENOENT
> > 
> > To overcome this issue, if kern_path() failed with -ENOENT
> > AND user-provided mount path looks like fullpath (starts from /)
> > we just try to find autofs mount in mounts list just by searching
> > autofs mounts in mounts list with mount point path equal to user-
> > provided
> > path. This covers our problem case.
> > 
> > This patch is fully compatible with old behaviour - if parent mounts
> > of
> > autofs mount is not overmounted - then
> > kern_path("/proc/sys/fs/binfmt_misc", LOOKUP_MOUNTPOINT, &path)
> > will not fail, and we also easily find needed autofs mount in mounts
> > list
> > of caller mount namespace.
> > 
> > > Long ago I'm pretty sure I continued to follow up but IIRC that
> > > went away and was replaced by a single follow_up(), but since
> > > the changes didn't break the existing function of autofs I
> > > didn't pay that much attention to them, I'll need to look at
> > > that too. Not only that, the namespace code has moved a long
> > > way too however there's still little attention given to
> > > sanitizing the mounts in the new namespace by anything that I'm
> > > aware of that uses the feature. TBH I'm not sure why I don't
> > > see a lot more problems of that nature.
> > > 
> > > I have to wonder if what's needed is attention to the follow up
> > > but that /proc covering the earlier mounts is a bit of a concern.
> > > 
> > > > > I get that the mount namespaces should be independent and
> > > > > intuitively
> > > > > this is a bug but what is the actual use and expected result.
> > > > > 
> > > > > But anyway, aren't you saying that the VFS path walk isn't
> > > > > handling
> > > > > mount namespaces properly or are you saying that a process
> > > > > outside
> > > > > this new mount namespace becomes broken because of it?
> > > > 
> > > > No-no, it's only about opening autofs mount by device id + path.
> > > 
> > > That's right, specifically getting a file handle to a covered
> > > autofs
> > > mount for things like bringing it back to life etc. But that
> > > silently
> > > implies the same mount namespace.
> > > 
> > > Let me look at the patch and think about it a bit.
> > > I'll probably need to run some tests too.
> > > I am a little busy right now so it may take a bit of time.
> > > 
> > > Ian 
> > 
> > Thank you very much for your attention to the patch and comments.
> > 
> > Regards,
> > Alex
> > 
> > > > > Either way the solution looks more complicated than I'd expect
> > > > > so
> > > > > some explanation along these lines would be good.
> > > > > 
> > > > > Ian
> > > > 
> > > > Regards,
> > > > Alex
> > > > 
> > > > > > Cc: Matthew Wilcox <willy@infradead.org>
> > > > > > Cc: Al Viro <viro@zeniv.linux.org.uk>
> > > > > > Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> > > > > > Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
> > > > > > Cc: autofs@vger.kernel.org
> > > > > > Cc: linux-kernel@vger.kernel.org
> > > > > > Signed-off-by: Alexander Mikhalitsyn <
> > > > > > alexander.mikhalitsyn@virtuozzo.com>
> > > > > > ---
> > > > > >  fs/autofs/dev-ioctl.c | 127
> > > > > > +++++++++++++++++++++++++++++++++++++---
> > > > > > --
> > > > > >  fs/namespace.c        |  44 +++++++++++++++
> > > > > >  include/linux/mount.h |   5 ++
> > > > > >  3 files changed, 162 insertions(+), 14 deletions(-)
> > > > > > 
> > > > > > diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
> > > > > > index 5bf781ea6d67..55edd3eba8ce 100644
> > > > > > --- a/fs/autofs/dev-ioctl.c
> > > > > > +++ b/fs/autofs/dev-ioctl.c
> > > > > > @@ -10,6 +10,7 @@
> > > > > >  #include <linux/fdtable.h>
> > > > > >  #include <linux/magic.h>
> > > > > >  #include <linux/nospec.h>
> > > > > > +#include <linux/nsproxy.h>
> > > > > >  
> > > > > >  #include "autofs_i.h"
> > > > > >  
> > > > > > @@ -179,32 +180,130 @@ static int
> > > > > > autofs_dev_ioctl_protosubver(struct
> > > > > > file *fp,
> > > > > >  	return 0;
> > > > > >  }
> > > > > >  
> > > > > > +struct filter_autofs_data {
> > > > > > +	char *pathbuf;
> > > > > > +	const char *fpathname;
> > > > > > +	int (*test)(const struct path *path, void *data);
> > > > > > +	void *data;
> > > > > > +};
> > > > > > +
> > > > > > +static int filter_autofs(const struct path *path, void *p)
> > > > > > +{
> > > > > > +	struct filter_autofs_data *data = p;
> > > > > > +	char *name;
> > > > > > +	int err;
> > > > > > +
> > > > > > +	if (path->mnt->mnt_sb->s_magic != AUTOFS_SUPER_MAGIC)
> > > > > > +		return 0;
> > > > > > +
> > > > > > +	name = d_path(path, data->pathbuf, PATH_MAX);
> > > > > > +	if (IS_ERR(name)) {
> > > > > > +		err = PTR_ERR(name);
> > > > > > +		pr_err("d_path failed, errno %d\n", err);
> > > > > > +		return 0;
> > > > > > +	}
> > > > > > +
> > > > > > +	if (strncmp(data->fpathname, name, PATH_MAX))
> > > > > > +		return 0;
> > > > > > +
> > > > > > +	if (!data->test(path, data->data))
> > > > > > +		return 0;
> > > > > > +
> > > > > > +	return 1;
> > > > > > +}
> > > > > > +
> > > > > >  /* Find the topmost mount satisfying test() */
> > > > > >  static int find_autofs_mount(const char *pathname,
> > > > > >  			     struct path *res,
> > > > > >  			     int test(const struct path *path,
> > > > > > void
> > > > > > *data),
> > > > > >  			     void *data)
> > > > > >  {
> > > > > > -	struct path path;
> > > > > > +	struct filter_autofs_data mdata = {
> > > > > > +		.pathbuf = NULL,
> > > > > > +		.test = test,
> > > > > > +		.data = data,
> > > > > > +	};
> > > > > > +	struct mnt_namespace *mnt_ns = current->nsproxy-
> > > > > > >mnt_ns;
> > > > > > +	struct path path = {};
> > > > > > +	char *fpathbuf = NULL;
> > > > > >  	int err;
> > > > > >  
> > > > > > +	/*
> > > > > > +	 * In most cases user will provide full path to autofs
> > > > > > mount
> > > > > > point
> > > > > > +	 * as it is in /proc/X/mountinfo. But if not, then we
> > > > > > need to
> > > > > > +	 * open provided relative path and calculate full path.
> > > > > > +	 * It will not work in case when parent mount of autofs
> > > > > > mount
> > > > > > +	 * is overmounted:
> > > > > > +	 * cd /root
> > > > > > +	 * ./autofs_mount /root/autofs_yard/mnt
> > > > > > +	 * mount -t tmpfs tmpfs /root/autofs_yard/mnt
> > > > > > +	 * mount -t tmpfs tmpfs /root/autofs_yard
> > > > > > +	 * ./call_ioctl /root/autofs_yard/mnt <- all fine here
> > > > > > because
> > > > > > we
> > > > > > +	 * 					 have full
> > > > > > path and
> > > > > > don't
> > > > > > +	 * 					 need to call
> > > > > > kern_path()
> > > > > > +	 * 					 and d_path()
> > > > > > +	 * ./call_ioctl autofs_yard/mnt <- will fail because
> > > > > > kern_path()
> > > > > > +	 * 				   can't lookup
> > > > > > /root/autofs_yard/mnt
> > > > > > +	 * 				   (/root/autofs_yard
> > > > > > directory is
> > > > > > +	 * 				    empty)
> > > > > > +	 *
> > > > > > +	 * TO DISCUSS: we can write special algorithm for
> > > > > > relative path
> > > > > > case
> > > > > > +	 * by getting cwd path combining it with relative path
> > > > > > from
> > > > > > user. But
> > > > > > +	 * is it worth it? User also may use paths with
> > > > > > symlinks in
> > > > > > components
> > > > > > +	 * of path.
> > > > > > +	 *
> > > > > > +	 */
> > > > > >  	err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path);
> > > > > > -	if (err)
> > > > > > -		return err;
> > > > > > -	err = -ENOENT;
> > > > > > -	while (path.dentry == path.mnt->mnt_root) {
> > > > > > -		if (path.dentry->d_sb->s_magic ==
> > > > > > AUTOFS_SUPER_MAGIC) {
> > > > > > -			if (test(&path, data)) {
> > > > > > -				path_get(&path);
> > > > > > -				*res = path;
> > > > > > -				err = 0;
> > > > > > -				break;
> > > > > > -			}
> > > > > > +	if (err) {
> > > > > > +		if (pathname[0] == '/') {
> > > > > > +			/*
> > > > > > +			 * pathname looks like full path let's
> > > > > > try to
> > > > > > use it
> > > > > > +			 * as it is when searching autofs mount
> > > > > > +			 */
> > > > > > +			mdata.fpathname = pathname;
> > > > > > +			err = 0;
> > > > > > +			pr_debug("kern_path failed on %s, errno
> > > > > > %d.
> > > > > > Will use path as it is to search mount\n",
> > > > > > +				 pathname, err);
> > > > > > +		} else {
> > > > > > +			pr_err("kern_path failed on %s, errno
> > > > > > %d\n",
> > > > > > +			       pathname, err);
> > > > > > +			return err;
> > > > > > +		}
> > > > > > +	} else {
> > > > > > +		pr_debug("find_autofs_mount: let's resolve full
> > > > > > path
> > > > > > %s\n",
> > > > > > +			 pathname);
> > > > > > +
> > > > > > +		fpathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> > > > > > +		if (!fpathbuf) {
> > > > > > +			err = -ENOMEM;
> > > > > > +			goto err;
> > > > > > +		}
> > > > > > +
> > > > > > +		/*
> > > > > > +		 * We have pathname from user but it may be
> > > > > > relative,
> > > > > > we need to
> > > > > > +		 * have full path because we want to compare it
> > > > > > with
> > > > > > mountpoints
> > > > > > +		 * paths later.
> > > > > > +		 */
> > > > > > +		mdata.fpathname = d_path(&path, fpathbuf,
> > > > > > PATH_MAX);
> > > > > > +		if (IS_ERR(mdata.fpathname)) {
> > > > > > +			err = PTR_ERR(mdata.fpathname);
> > > > > > +			pr_err("d_path failed, errno %d\n",
> > > > > > err);
> > > > > > +			goto err;
> > > > > >  		}
> > > > > > -		if (!follow_up(&path))
> > > > > > -			break;
> > > > > >  	}
> > > > > > +
> > > > > > +	mdata.pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> > > > > > +	if (!mdata.pathbuf) {
> > > > > > +		err = -ENOMEM;
> > > > > > +		goto err;
> > > > > > +	}
> > > > > > +
> > > > > > +	err = lookup_mount_path(mnt_ns, res, filter_autofs,
> > > > > > &mdata);
> > > > > > +
> > > > > > +err:
> > > > > >  	path_put(&path);
> > > > > > +	kfree(fpathbuf);
> > > > > > +	kfree(mdata.pathbuf);
> > > > > >  	return err;
> > > > > >  }
> > > > > >  
> > > > > > diff --git a/fs/namespace.c b/fs/namespace.c
> > > > > > index 56bb5a5fdc0d..e1d006dbdfe2 100644
> > > > > > --- a/fs/namespace.c
> > > > > > +++ b/fs/namespace.c
> > > > > > @@ -1367,6 +1367,50 @@ void mnt_cursor_del(struct
> > > > > > mnt_namespace
> > > > > > *ns,
> > > > > > struct mount *cursor)
> > > > > >  }
> > > > > >  #endif  /* CONFIG_PROC_FS */
> > > > > >  
> > > > > > +/**
> > > > > > + * lookup_mount_path - traverse all mounts in mount
> > > > > > namespace
> > > > > > + *                     and filter using test() probe
> > > > > > callback
> > > > > > + * As a result struct path will be provided.
> > > > > > + * @ns: root of mount tree
> > > > > > + * @res: struct path pointer where resulting path will be
> > > > > > written
> > > > > > + * @test: filter callback
> > > > > > + * @data: will be provided as argument to test() callback
> > > > > > + *
> > > > > > + */
> > > > > > +int lookup_mount_path(struct mnt_namespace *ns,
> > > > > > +		      struct path *res,
> > > > > > +		      int test(const struct path *mnt, void
> > > > > > *data),
> > > > > > +		      void *data)
> > > > > > +{
> > > > > > +	struct mount *mnt;
> > > > > > +	int err = -ENOENT;
> > > > > > +
> > > > > > +	down_read(&namespace_sem);
> > > > > > +	lock_ns_list(ns);
> > > > > > +	list_for_each_entry(mnt, &ns->list, mnt_list) {
> > > > > > +		struct path tmppath;
> > > > > > +
> > > > > > +		if (mnt_is_cursor(mnt))
> > > > > > +			continue;
> > > > > > +
> > > > > > +		tmppath.dentry = mnt->mnt.mnt_root;
> > > > > > +		tmppath.mnt = &mnt->mnt;
> > > > > > +
> > > > > > +		if (test(&tmppath, data)) {
> > > > > > +			path_get(&tmppath);
> > > > > > +			*res = tmppath;
> > > > > > +			err = 0;
> > > > > > +			break;
> > > > > > +		}
> > > > > > +	}
> > > > > > +	unlock_ns_list(ns);
> > > > > > +	up_read(&namespace_sem);
> > > > > > +
> > > > > > +	return err;
> > > > > > +}
> > > > > > +
> > > > > > +EXPORT_SYMBOL(lookup_mount_path);
> > > > > > +
> > > > > >  /**
> > > > > >   * may_umount_tree - check if a mount tree is busy
> > > > > >   * @mnt: root of mount tree
> > > > > > diff --git a/include/linux/mount.h b/include/linux/mount.h
> > > > > > index 5d92a7e1a742..a79e6392e38e 100644
> > > > > > --- a/include/linux/mount.h
> > > > > > +++ b/include/linux/mount.h
> > > > > > @@ -118,6 +118,11 @@ extern unsigned int sysctl_mount_max;
> > > > > >  
> > > > > >  extern bool path_is_mountpoint(const struct path *path);
> > > > > >  
> > > > > > +extern int lookup_mount_path(struct mnt_namespace *ns,
> > > > > > +			     struct path *res,
> > > > > > +			     int test(const struct path *mnt,
> > > > > > void
> > > > > > *data),
> > > > > > +			     void *data);
> > > > > > +
> > > > > >  extern void kern_unmount_array(struct vfsmount *mnt[],
> > > > > > unsigned
> > > > > > int
> > > > > > num);
> > > > > >  
> > > > > >  #endif /* _LINUX_MOUNT_H */
>
Alexander Mikhalitsyn March 9, 2021, 11:31 a.m. UTC | #10
On Mon, 8 Mar 2021 00:12:22 +0000
Al Viro <viro@zeniv.linux.org.uk> wrote:

> On Sun, Mar 07, 2021 at 11:51:20PM +0000, Al Viro wrote:
> > On Thu, Mar 04, 2021 at 01:11:33PM +0300, Alexander Mikhalitsyn wrote:
> > 
> > > That problem connected with CRIU (Checkpoint-Restore in Userspace) project.
> > > In CRIU we have support of autofs mounts C/R. To acheive that we need to use
> > > ioctl's from /dev/autofs to get data about mounts, restore mount as catatonic
> > > (if needed), change pipe fd and so on. But the problem is that during CRIU
> > > dump we may meet situation when VFS subtree where autofs mount present was
> > > overmounted as whole.
> > > 
> > > Simpliest example is /proc/sys/fs/binfmt_misc. This mount present on most
> > > GNU/Linux distributions by default. For instance on my Fedora 33:
> > > 
> > > trigger automount of binfmt_misc
> > > $ ls /proc/sys/fs/binfmt_misc
> > > 
> > > $ cat /proc/1/mountinfo | grep binfmt
> > > 35 24 0:36 / /proc/sys/fs/binfmt_misc rw,relatime shared:16 - autofs systemd-1 rw,...,direct,pipe_ino=223
> > > 632 35 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime shared:315 - binfmt_misc binfmt_misc rw
> > > 
> > > $ sudo unshare -m -p --fork --mount-proc sh
> > > # cat /proc/self/mountinfo | grep "/proc"
> > > 828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
> > > 829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs systemd-1 rw,...,direct,pipe_ino=223
> > > 943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime - binfmt_misc binfmt_misc rw
> > > 949 828 0:57 / /proc rw...,relatime - proc proc rw
> > > 
> > > As we can see now autofs mount /proc/sys/fs/binfmt_misc is inaccessible.
> > > If we do something like:
> > > 
> > > struct autofs_dev_ioctl *param;
> > > param = malloc(...);
> > > devfd = open("/dev/autofs", O_RDONLY);
> > > init_autofs_dev_ioctl(param);
> > > param->size = size;
> > > strcpy(param->path, "/proc/sys/fs/binfmt_misc");
> > > param->openmount.devid = 36;
> > > err = ioctl(devfd, AUTOFS_DEV_IOCTL_OPENMOUNT, param)
> > > 
> > > now we get err = -ENOENT.
> > 
> Where does that -ENOENT come from?  AFAICS, pathwalk ought to succeed and
> return you the root of overmounting binfmt_misc.  Why doesn't the loop in
> find_autofs_mount() locate anything it would accept?
> 

Consider our mounts tree:
> > > # cat /proc/self/mountinfo | grep "/proc"
> > > 828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
> > > 829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs systemd-1 rw,...,direct,pipe_ino=223
> > > 943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime - binfmt_misc binfmt_misc rw
> > > 949 828 0:57 / /proc rw...,relatime - proc proc rw

ENOENT comes from here (current kernel code):
/* Find the topmost mount satisfying test() */
static int find_autofs_mount(const char *pathname,
			     struct path *res,
			     int test(const struct path *path, void *data),
			     void *data)
{
	struct path path;
	int err;

	err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path);
	if (err)
		return err;
<-------- here we successfuly open root dentry (/proc/sys/fs/binfmt_misc) of /proc (mnt_id = 949)

	err = -ENOENT;
<---- set err and start search autofs mount
/*
 here we use follow_up to move through upper dentries and find overmounted autofs.
 But in our case we opened dentry from /proc (mnt_id = 949) and this concrete dentry is *NOT*
overmounted (whole /proc overmounted).
*/
	while (path.dentry == path.mnt->mnt_root) {
		if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) {
			if (test(&path, data)) {
/*
 we never get there
*/
				path_get(&path);
				*res = path;
				err = 0;
				break;
			}
		}
		if (!follow_up(&path))
			break;
	}
/*
loop finished. err stays as it was err = -ENOENT
*/
	path_put(&path);
	return err;
}

Source: https://github.com/torvalds/linux/blob/master/fs/autofs/dev-ioctl.c#L194

> I really dislike the patch - the whole "normalize path" thing is fundamentally
> bogus, not to mention the iterator over all mounts, etc., so I would like to
> understand what the hell is going on before even thinking of *not* NAKing
> it on sight.

I'm not trying to break current API or something similar. I'm just prepared
RFC patch with my proposal. I'm ready to rework all of that to make it good.
But without maintainers/community comments/suggestions it's unreal :)

Please, explain what do you mean by "normalize path thing"?
I'm not changing semantics of current ioctl() I've just trying to extend it to make
it work in case when parent mount of autofs mount is overmounted.

> 
> 	Wait, so you have /proc overmounted, without anything autofs-related on
> /proc/sys/fs/binfmt_misc and still want to have the pathname resolved, just
> because it would've resolved with that overmount of /proc removed?

Something like that.

1. I don't expect that /proc/sys/fs/binfmt_misc path will be resolved
(because, for instance we can overmount /proc by empty tmpfs and in this case after
overmounting we can't even open /proc/sys/fs/binfmt_misc and that's okay).

2. We talking about autofs specific function which is used in several autofs-specific
ioctls. One of that ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT) which is designed to open
overmounted autofs mounts. Because it's frequent case when autofs mount is overmounted
(when we talk about direct mounts). This ioctl allows to open file desciptor
of autofs root dentry and later, autofs daemon use it to manage mount (call another autofs
ioctls on that fd).

I've just meet problem, that this API not works when parent mount of autofs mount is overmounted.
For example:
tmpfs     /some-dir
autofs    /some-dir/autofs1 <-autofs direct mount
nfs       /some-dir/autofs1 <-automounted fs on top of autofs

ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT) will work in this case. Because loop
with follow_up() starts from /some-dir/autofs1 dentry of nfs, then follow_up()
and move to /some-dir/autofs1 dentry of autofs.

But if we change picture to:
tmpfs1     /some-dir
autofs     /some-dir/autofs1 <-autofs direct mount
nfs        /some-dir/autofs1 <-automounted fs on top of autofs
tmpfs2     /some-dir

This will breaks API. Because know we can't even open /some-dir/autofs1
dentry.

Ok. We can create this dentry at first by mkdir /some-dir/autofs1.
But it will not help because our loop:
	while (path.dentry == path.mnt->mnt_root) {
		if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) {
			if (test(&path, data)) {
...
		if (!follow_up(&path))
			break;
	}
will start from dentry /some-dir/autofs1 from tmpfs2. And after follow_up
on that dentry we will move to / dentry => loop finishes => user get ENOENT.

> 
> 	I hope I'm misreading you; in case I'm not, the ABI is extremely
> tasteless and until you manage to document the exact semantics you want
> for param->path, consider it NAKed.
> 
> 	BTW, if that thing would be made to work, what's to stop somebody from
> doing ...at() syscalls with the resulting fd as a starting point and pathnames
> starting with ".."?  "/foo is overmounted, but we can get to anything under
> /foo/bar/ in the underlying tree since there's an autofs mount somewhere in
> /foo/bar/splat/puke/*"?

Interesting point. Thank you!
I'm not sure. But is it serious problem for us? What stop somebody to open
and hold fd to any directory before overmounting?

> 
> 	IOW, the real question (aside of "WTF?") is what are you using the
> resulting descriptor for and what do you need to be able to do with it.
> Details, please.

Sure. I've covered use cases of file descriptor returned by ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT)
above.

Thanks for your reply!
I'm sorry If my patch description is unclear. I'm newbie here :)

Regards,
Alex
Alexander Mikhalitsyn March 22, 2021, 7:53 a.m. UTC | #11
On Tue, 9 Mar 2021 14:31:05 +0300
Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com> wrote:

> On Mon, 8 Mar 2021 00:12:22 +0000
> Al Viro <viro@zeniv.linux.org.uk> wrote:
> 
> > On Sun, Mar 07, 2021 at 11:51:20PM +0000, Al Viro wrote:
> > > On Thu, Mar 04, 2021 at 01:11:33PM +0300, Alexander Mikhalitsyn wrote:
> > > 
> > > > That problem connected with CRIU (Checkpoint-Restore in Userspace) project.
> > > > In CRIU we have support of autofs mounts C/R. To acheive that we need to use
> > > > ioctl's from /dev/autofs to get data about mounts, restore mount as catatonic
> > > > (if needed), change pipe fd and so on. But the problem is that during CRIU
> > > > dump we may meet situation when VFS subtree where autofs mount present was
> > > > overmounted as whole.
> > > > 
> > > > Simpliest example is /proc/sys/fs/binfmt_misc. This mount present on most
> > > > GNU/Linux distributions by default. For instance on my Fedora 33:
> > > > 
> > > > trigger automount of binfmt_misc
> > > > $ ls /proc/sys/fs/binfmt_misc
> > > > 
> > > > $ cat /proc/1/mountinfo | grep binfmt
> > > > 35 24 0:36 / /proc/sys/fs/binfmt_misc rw,relatime shared:16 - autofs systemd-1 rw,...,direct,pipe_ino=223
> > > > 632 35 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime shared:315 - binfmt_misc binfmt_misc rw
> > > > 
> > > > $ sudo unshare -m -p --fork --mount-proc sh
> > > > # cat /proc/self/mountinfo | grep "/proc"
> > > > 828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
> > > > 829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs systemd-1 rw,...,direct,pipe_ino=223
> > > > 943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime - binfmt_misc binfmt_misc rw
> > > > 949 828 0:57 / /proc rw...,relatime - proc proc rw
> > > > 
> > > > As we can see now autofs mount /proc/sys/fs/binfmt_misc is inaccessible.
> > > > If we do something like:
> > > > 
> > > > struct autofs_dev_ioctl *param;
> > > > param = malloc(...);
> > > > devfd = open("/dev/autofs", O_RDONLY);
> > > > init_autofs_dev_ioctl(param);
> > > > param->size = size;
> > > > strcpy(param->path, "/proc/sys/fs/binfmt_misc");
> > > > param->openmount.devid = 36;
> > > > err = ioctl(devfd, AUTOFS_DEV_IOCTL_OPENMOUNT, param)
> > > > 
> > > > now we get err = -ENOENT.
> > > 
> > Where does that -ENOENT come from?  AFAICS, pathwalk ought to succeed and
> > return you the root of overmounting binfmt_misc.  Why doesn't the loop in
> > find_autofs_mount() locate anything it would accept?
> > 
> 
> Consider our mounts tree:
> > > > # cat /proc/self/mountinfo | grep "/proc"
> > > > 828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw
> > > > 829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs systemd-1 rw,...,direct,pipe_ino=223
> > > > 943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime - binfmt_misc binfmt_misc rw
> > > > 949 828 0:57 / /proc rw...,relatime - proc proc rw
> 
> ENOENT comes from here (current kernel code):
> /* Find the topmost mount satisfying test() */
> static int find_autofs_mount(const char *pathname,
> 			     struct path *res,
> 			     int test(const struct path *path, void *data),
> 			     void *data)
> {
> 	struct path path;
> 	int err;
> 
> 	err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path);
> 	if (err)
> 		return err;
> <-------- here we successfuly open root dentry (/proc/sys/fs/binfmt_misc) of /proc (mnt_id = 949)
> 
> 	err = -ENOENT;
> <---- set err and start search autofs mount
> /*
>  here we use follow_up to move through upper dentries and find overmounted autofs.
>  But in our case we opened dentry from /proc (mnt_id = 949) and this concrete dentry is *NOT*
> overmounted (whole /proc overmounted).
> */
> 	while (path.dentry == path.mnt->mnt_root) {
> 		if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) {
> 			if (test(&path, data)) {
> /*
>  we never get there
> */
> 				path_get(&path);
> 				*res = path;
> 				err = 0;
> 				break;
> 			}
> 		}
> 		if (!follow_up(&path))
> 			break;
> 	}
> /*
> loop finished. err stays as it was err = -ENOENT
> */
> 	path_put(&path);
> 	return err;
> }
> 
> Source: https://github.com/torvalds/linux/blob/master/fs/autofs/dev-ioctl.c#L194
> 
> > I really dislike the patch - the whole "normalize path" thing is fundamentally
> > bogus, not to mention the iterator over all mounts, etc., so I would like to
> > understand what the hell is going on before even thinking of *not* NAKing
> > it on sight.
> 
> I'm not trying to break current API or something similar. I'm just prepared
> RFC patch with my proposal. I'm ready to rework all of that to make it good.
> But without maintainers/community comments/suggestions it's unreal :)
> 
> Please, explain what do you mean by "normalize path thing"?
> I'm not changing semantics of current ioctl() I've just trying to extend it to make
> it work in case when parent mount of autofs mount is overmounted.
> 
> > 
> > 	Wait, so you have /proc overmounted, without anything autofs-related on
> > /proc/sys/fs/binfmt_misc and still want to have the pathname resolved, just
> > because it would've resolved with that overmount of /proc removed?
> 
> Something like that.
> 
> 1. I don't expect that /proc/sys/fs/binfmt_misc path will be resolved
> (because, for instance we can overmount /proc by empty tmpfs and in this case after
> overmounting we can't even open /proc/sys/fs/binfmt_misc and that's okay).
> 
> 2. We talking about autofs specific function which is used in several autofs-specific
> ioctls. One of that ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT) which is designed to open
> overmounted autofs mounts. Because it's frequent case when autofs mount is overmounted
> (when we talk about direct mounts). This ioctl allows to open file desciptor
> of autofs root dentry and later, autofs daemon use it to manage mount (call another autofs
> ioctls on that fd).
> 
> I've just meet problem, that this API not works when parent mount of autofs mount is overmounted.
> For example:
> tmpfs     /some-dir
> autofs    /some-dir/autofs1 <-autofs direct mount
> nfs       /some-dir/autofs1 <-automounted fs on top of autofs
> 
> ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT) will work in this case. Because loop
> with follow_up() starts from /some-dir/autofs1 dentry of nfs, then follow_up()
> and move to /some-dir/autofs1 dentry of autofs.
> 
> But if we change picture to:
> tmpfs1     /some-dir
> autofs     /some-dir/autofs1 <-autofs direct mount
> nfs        /some-dir/autofs1 <-automounted fs on top of autofs
> tmpfs2     /some-dir
> 
> This will breaks API. Because know we can't even open /some-dir/autofs1
> dentry.
> 
> Ok. We can create this dentry at first by mkdir /some-dir/autofs1.
> But it will not help because our loop:
> 	while (path.dentry == path.mnt->mnt_root) {
> 		if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) {
> 			if (test(&path, data)) {
> ...
> 		if (!follow_up(&path))
> 			break;
> 	}
> will start from dentry /some-dir/autofs1 from tmpfs2. And after follow_up
> on that dentry we will move to / dentry => loop finishes => user get ENOENT.
> 
> > 
> > 	I hope I'm misreading you; in case I'm not, the ABI is extremely
> > tasteless and until you manage to document the exact semantics you want
> > for param->path, consider it NAKed.
> > 
> > 	BTW, if that thing would be made to work, what's to stop somebody from
> > doing ...at() syscalls with the resulting fd as a starting point and pathnames
> > starting with ".."?  "/foo is overmounted, but we can get to anything under
> > /foo/bar/ in the underlying tree since there's an autofs mount somewhere in
> > /foo/bar/splat/puke/*"?
> 
> Interesting point. Thank you!
> I'm not sure. But is it serious problem for us? What stop somebody to open
> and hold fd to any directory before overmounting?
> 
> > 
> > 	IOW, the real question (aside of "WTF?") is what are you using the
> > resulting descriptor for and what do you need to be able to do with it.
> > Details, please.
> 
> Sure. I've covered use cases of file descriptor returned by ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT)
> above.
> 
> Thanks for your reply!
> I'm sorry If my patch description is unclear. I'm newbie here :)
> 
> Regards,
> Alex

Gentle ping.

Thanks,
Alex
Ian Kent March 23, 2021, 8:44 a.m. UTC | #12
On Tue, 2021-03-09 at 13:43 +0300, Alexander Mikhalitsyn wrote:
> On Sat, 06 Mar 2021 17:13:32 +0800
> Ian Kent <raven@themaw.net> wrote:
> 
> > On Fri, 2021-03-05 at 14:55 +0300, Alexander Mikhalitsyn wrote:
> > > On Fri, 05 Mar 2021 18:10:02 +0800
> > > Ian Kent <raven@themaw.net> wrote:
> > > 
> > > > On Thu, 2021-03-04 at 13:11 +0300, Alexander Mikhalitsyn wrote:
> > > > > On Thu, 04 Mar 2021 14:54:11 +0800
> > > > > Ian Kent <raven@themaw.net> wrote:
> > > > > 
> > > > > > On Wed, 2021-03-03 at 18:28 +0300, Alexander Mikhalitsyn
> > > > > > wrote:
> > > > > > > It was discovered that find_autofs_mount() function
> > > > > > > in autofs not support cases when autofs mount
> > > > > > > parent is overmounted. In this case this function will
> > > > > > > always return -ENOENT.
> > > > > > 
> > > > > > Ok, I get this shouldn't happen.
> > > > > > 
> > > > > > > Real-life reproducer is fairly simple.
> > > > > > > Consider the following mounts on root mntns:
> > > > > > > --
> > > > > > > 35 24 0:36 / /proc/sys/fs/binfmt_misc ... shared:16 -
> > > > > > > autofs
> > > > > > > systemd-
> > > > > > > 1 ...
> > > > > > > 654 35 0:57 / /proc/sys/fs/binfmt_misc ... shared:322 -
> > > > > > > binfmt_misc
> > > > > > > ...
> > > > > > > --
> > > > > > > and some process which calls
> > > > > > > ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT)
> > > > > > > $ unshare -m -p --fork --mount-proc ./process-bin
> > > > > > > 
> > > > > > > Due to "mount-proc" /proc will be overmounted and
> > > > > > > ioctl() will fail with -ENOENT
> > > > > > 
> > > > > > I think I need a better explanation ...
> > > > > 
> > > > > Thank you for the quick reply, Ian.
> > > > > I'm sorry If my patch description was not sufficiently clear
> > > > > and
> > > > > detailed.
> > > > > 
> > > > > That problem connected with CRIU (Checkpoint-Restore in
> > > > > Userspace)
> > > > > project.
> > > > > In CRIU we have support of autofs mounts C/R. To acheive that
> > > > > we
> > > > > need
> > > > > to use
> > > > > ioctl's from /dev/autofs to get data about mounts, restore
> > > > > mount
> > > > > as
> > > > > catatonic
> > > > > (if needed), change pipe fd and so on. But the problem is
> > > > > that
> > > > > during
> > > > > CRIU
> > > > > dump we may meet situation when VFS subtree where autofs
> > > > > mount
> > > > > present was
> > > > > overmounted as whole.
> > > > > 
> > > > > Simpliest example is /proc/sys/fs/binfmt_misc. This mount
> > > > > present
> > > > > on
> > > > > most
> > > > > GNU/Linux distributions by default. For instance on my Fedora
> > > > > 33:
> > > > 
> > > > Yes, I don't know why systemd uses this direct mount, there
> > > > must
> > > > have been a reason for it.
> > > > 
> > > > > trigger automount of binfmt_misc
> > > > > $ ls /proc/sys/fs/binfmt_misc
> > > > > 
> > > > > $ cat /proc/1/mountinfo | grep binfmt
> > > > > 35 24 0:36 / /proc/sys/fs/binfmt_misc rw,relatime shared:16 -
> > > > > autofs
> > > > > systemd-1 rw,...,direct,pipe_ino=223
> > > > > 632 35 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime
> > > > > shared:315
> > > > > -
> > > > > binfmt_misc binfmt_misc rw
> > > > 
> > > > Yes, I think this looks normal.
> > > > 
> > > > > $ sudo unshare -m -p --fork --mount-proc sh
> > > > > # cat /proc/self/mountinfo | grep "/proc"
> > > > > 828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc
> > > > > proc
> > > > > rw
> > > > > 829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs
> > > > > systemd-
> > > > > 1 rw,...,direct,pipe_ino=223
> > > > > 943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime -
> > > > > binfmt_misc
> > > > > binfmt_misc rw
> > > > > 949 828 0:57 / /proc rw...,relatime - proc proc rw
> > > > 
> > > > Isn't this screwed up, /proc is on top of the binfmt_misc mount
> > > > ...
> > > > 
> > > > Is this what's seen from the root namespace?
> > > 
> > > No-no, after issuing
> > > $ sudo unshare -m -p --fork --mount-proc sh
> > > 
> > > we enter to the pid+mount namespace and:
> > > 
> > > # cat /proc/self/mountinfo | grep "/proc"
> > > 
> > > So, it's picture from inside namespaces.
> > 
> > Ok, so potentially some of those have been propagated from the
> > original mount namespace.
> > 
> > It seems to me the sensible thing would be those mounts would
> > not propagate when a new proc has been requested. It doesn't
> > make sense to me to carry around mounts that are not accessible
> > because of something requested by the mount namespace creator.
> > 
> > But that's nothing new and isn't likely to change any time soon.
> > 
> > > > > As we can see now autofs mount /proc/sys/fs/binfmt_misc is
> > > > > inaccessible.
> > > > > If we do something like:
> > > > > 
> > > > > struct autofs_dev_ioctl *param;
> > > > > param = malloc(...);
> > > > > devfd = open("/dev/autofs", O_RDONLY);
> > > > > init_autofs_dev_ioctl(param);
> > > > > param->size = size;
> > > > > strcpy(param->path, "/proc/sys/fs/binfmt_misc");
> > > > > param->openmount.devid = 36;
> > > > > err = ioctl(devfd, AUTOFS_DEV_IOCTL_OPENMOUNT, param)
> > > > > 
> > > > > now we get err = -ENOENT.
> > > > 
> > > > Maybe that should be EINVAL, not sure about cases though.
> > > 
> > > in current version -ENOENT is returned in this particular case
> > > 
> > > > > > What's being said here?
> > > > > > 
> > > > > > For a start your talking about direct mounts, I'm pretty
> > > > > > sure
> > > > > > this
> > > > > > use case can't occur with indirect mounts in the sense that
> > > > > > the
> > > > > > indirect mount base should/must never be over mounted and
> > > > > > IIRC
> > > > > > that
> > > > > > base can't be /proc (but maybe that's just mounts inside
> > > > > > proc
> > > > > > ...),
> > > > > > can't remember now but from a common sense POV an indirect
> > > > > > mount
> > > > > > won't/can't be on /proc.
> > > > > > 
> > > > > > And why is this ioctl be called?
> > > > > 
> > > > > We call this ioctl during criu dump stage to open fd from
> > > > > autofs
> > > > > mount dentry. This fd is used later to call
> > > > > ioctl(AUTOFS_IOC_CATATONIC)
> > > > > (we do that on criu dump if we see that control process of
> > > > > autofs
> > > > > mount
> > > > > is dead or pipe is dead).
> > > > 
> > > > Right so your usage "is" the way it's intended, ;)
> > > 
> > > That's good! ;)
> > > 
> > > > > > If the mount is over mounted should that prevent expiration
> > > > > > of
> > > > > > the
> > > > > > over mounted /proc anyway, so maybe the return is correct
> > > > > > ...
> > > > > > or
> > > > > > not ...
> > > > > 
> > > > > I agree that case with overmounted subtree with autofs mount
> > > > > is
> > > > > weird
> > > > > case.
> > > > > But it may be easily created by user and we in CRIU try to
> > > > > handle
> > > > > that.
> > > > 
> > > > I'm not yet ready to make a call on how I think this this
> > > > should
> > > > be done.
> > > > 
> > > > Since you seem to be clear on what this should be used for I'll
> > > > need to look more closely at the patch.
> > > > 
> > > > But, at first glance, it looked like it would break the
> > > > existing
> > > > function of the ioctl.
> > > > 
> > > > Could you explain how the patch works, in particular why it
> > > > doesn't
> > > > break the existing functionality.
> > > 
> > > Sure. With pleasure. Idea of patch is naive:
> > > 1. find_autofs_mount() function called only from syscall context,
> > > so,
> > > we always can determine current mount namespace of caller.
> > > So, I've introduced
> > > 
> > > > > > > + int lookup_mount_path(struct mnt_namespace *ns,
> > > > > > > +		      struct path *res,
> > > > > > > +		      int test(const struct path *mnt, void
> > > > > > > *data),
> > > > > > > +		      void *data)
> > > 
> > > lookup_mount_path() helper function, which allows to traverse
> > > mounts
> > > list of
> > > mount namespace and find proper autofs mount by user-provided
> > > helper
> > > test().
> > > 
> > > 2. Helper function is fairly simple:
> > > a) it checks that mount is autofs mount (by magic number on
> > > superblock)
> > > b) it calculates full path to mount point of each mount in mount
> > > namespace
> > > and compare it with path which user was provided to the
> > > ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT)
> > > parameters.
> > 
> > Oh right, it's using the mounts list, it isn't a path walk oriented
> > search. That's probably why I didn't see the expected follow call.
> > 
> > Another problem with the existing code is it will get it wrong if
> > there is more than one autofs mount in the stack. For example an
> > autofs submount mounted on a direct mount which is rare and not
> > all that sensible but possible.
> 
> Oh. :(
> 
> > So, if we change this, there will need to be some agreed policy
> > about which mount would is selected.
> 
> Sure. I'm ready to write some ltp (?) tests which will cover this
> autofs ioctl and all usecases.
> 
> > The originally code (long before what is there now) selected the
> > lowest mount in the stack because this mechanism is only needed
> > for direct mounts and, as long as there's not something seriously
> > wrong, that is the mount you would need. That's what would be
> > needed
> > for the case above and I think it's what's needed in your case too.
> > 
> > I still haven't yet looked closely at your change, I'll need to do
> > that.
> > 
> > > Problem here is case when user provided relative path in ioctl
> > > parameters
> > > (struct autofs_dev_ioctl). In this case we may fail to resolve
> > > user
> > > provided path to
> > > struct path. For instance:
> > 
> > I don't like the idea of allowing a relative path as an a parameter
> > at all. I think that should be a failure case.
> 
> I'm too, and I've even thought about to restrict use relative paths
> here,
> but actual kernel code allows that, so is it will be okay to discard
> compatibility here?
> I know that autofs daemon implementation uses full paths and if we
> restrict relative paths here
> it will not break systemd/automount daemons.

Sorry for not responding earlier, I've been distracted by other
work.

I have thought about this quite a bit though and the news is not
good I'm afraid.

First, as Al mentioned, the iterator over all the name space mounts
isn't ok, that itself is a significant problem. The only way that
searching for a mount would be ok is if you could use one of the
mount hash lists but I don't think you have enough information to
do that. For example the mount point hash uses the mount point
dentry to get the hash index and you don't have that in this case.

The problem is that the number of mounts can be very large, and
autofs can be one of the applications with a large number of mounts
itself but it's by no means the only one, so iterating over the
whole list of name space mounts is a no-go.

But there's more.

The ioctl is meant to allow you to get a file handle of a direct
mount that is covered by a mount at that mount point only (ie.
something mounted on the same path directory). Anything else
probably shouldn't be allowed.

There's also a problem with allowing access from a mount name
space other than the one in which the mount was originally
mounted and that's because of mount propagation that can occur.
Processes in another mount name space (that can receive these
mounts due to admin mount propagation setup) shouldn't be allowed
to alter these mounts using administrative ioctls. I don't check
for that but I probably should.

There's the notion of ownership, even processes in the same name
space that aren't the mounting process shouldn't be able to use
the admin type ioctls if they aren't the process that mounted it
unless the mount owner process has gone away. I also don't check
the mount is not owned, that probably can be done using the struct
pid of the owner process in the super block info., it should be
stale when the process has gone away.

And finally there's the issue of allowing access to mounts that
are covered by a mount further up the directory tree, historically
that is meant to hide everything below that mount point so it's
questionable (and possibly a security problem) to try and come up
with some way that is available modules to bypass it.

Sorry to be so negative but cross name space access has been a
concern of mine for quite a while and that's why I have fairly
specific views on it.

Ian
> 
> > These direct mounts come from a table (a map in autofs or a unit
> > in systemd) and they should always be identified by that full path.
> > 
> > Ian
> 
> Thank you!
> 
> Regards,
> Alex
> 
> > > # cat /proc/self/mountinfo | grep "/proc"
> > > 828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc proc
> > > rw
> > > 829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs
> > > systemd-
> > > 1 rw,...,direct,pipe_ino=223
> > > 943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime -
> > > binfmt_misc
> > > binfmt_misc rw
> > > 949 828 0:57 / /proc rw...,relatime - proc proc rw
> > > 
> > > in this case 
> > > kern_path("/proc/sys/fs/binfmt_misc", LOOKUP_MOUNTPOINT, &path)
> > > ==
> > > -ENOENT
> > > 
> > > To overcome this issue, if kern_path() failed with -ENOENT
> > > AND user-provided mount path looks like fullpath (starts from /)
> > > we just try to find autofs mount in mounts list just by searching
> > > autofs mounts in mounts list with mount point path equal to user-
> > > provided
> > > path. This covers our problem case.
> > > 
> > > This patch is fully compatible with old behaviour - if parent
> > > mounts
> > > of
> > > autofs mount is not overmounted - then
> > > kern_path("/proc/sys/fs/binfmt_misc", LOOKUP_MOUNTPOINT, &path)
> > > will not fail, and we also easily find needed autofs mount in
> > > mounts
> > > list
> > > of caller mount namespace.
> > > 
> > > > Long ago I'm pretty sure I continued to follow up but IIRC that
> > > > went away and was replaced by a single follow_up(), but since
> > > > the changes didn't break the existing function of autofs I
> > > > didn't pay that much attention to them, I'll need to look at
> > > > that too. Not only that, the namespace code has moved a long
> > > > way too however there's still little attention given to
> > > > sanitizing the mounts in the new namespace by anything that I'm
> > > > aware of that uses the feature. TBH I'm not sure why I don't
> > > > see a lot more problems of that nature.
> > > > 
> > > > I have to wonder if what's needed is attention to the follow up
> > > > but that /proc covering the earlier mounts is a bit of a
> > > > concern.
> > > > 
> > > > > > I get that the mount namespaces should be independent and
> > > > > > intuitively
> > > > > > this is a bug but what is the actual use and expected
> > > > > > result.
> > > > > > 
> > > > > > But anyway, aren't you saying that the VFS path walk isn't
> > > > > > handling
> > > > > > mount namespaces properly or are you saying that a process
> > > > > > outside
> > > > > > this new mount namespace becomes broken because of it?
> > > > > 
> > > > > No-no, it's only about opening autofs mount by device id +
> > > > > path.
> > > > 
> > > > That's right, specifically getting a file handle to a covered
> > > > autofs
> > > > mount for things like bringing it back to life etc. But that
> > > > silently
> > > > implies the same mount namespace.
> > > > 
> > > > Let me look at the patch and think about it a bit.
> > > > I'll probably need to run some tests too.
> > > > I am a little busy right now so it may take a bit of time.
> > > > 
> > > > Ian 
> > > 
> > > Thank you very much for your attention to the patch and comments.
> > > 
> > > Regards,
> > > Alex
> > > 
> > > > > > Either way the solution looks more complicated than I'd
> > > > > > expect
> > > > > > so
> > > > > > some explanation along these lines would be good.
> > > > > > 
> > > > > > Ian
> > > > > 
> > > > > Regards,
> > > > > Alex
> > > > > 
> > > > > > > Cc: Matthew Wilcox <willy@infradead.org>
> > > > > > > Cc: Al Viro <viro@zeniv.linux.org.uk>
> > > > > > > Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> > > > > > > Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
> > > > > > > Cc: autofs@vger.kernel.org
> > > > > > > Cc: linux-kernel@vger.kernel.org
> > > > > > > Signed-off-by: Alexander Mikhalitsyn <
> > > > > > > alexander.mikhalitsyn@virtuozzo.com>
> > > > > > > ---
> > > > > > >  fs/autofs/dev-ioctl.c | 127
> > > > > > > +++++++++++++++++++++++++++++++++++++---
> > > > > > > --
> > > > > > >  fs/namespace.c        |  44 +++++++++++++++
> > > > > > >  include/linux/mount.h |   5 ++
> > > > > > >  3 files changed, 162 insertions(+), 14 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-
> > > > > > > ioctl.c
> > > > > > > index 5bf781ea6d67..55edd3eba8ce 100644
> > > > > > > --- a/fs/autofs/dev-ioctl.c
> > > > > > > +++ b/fs/autofs/dev-ioctl.c
> > > > > > > @@ -10,6 +10,7 @@
> > > > > > >  #include <linux/fdtable.h>
> > > > > > >  #include <linux/magic.h>
> > > > > > >  #include <linux/nospec.h>
> > > > > > > +#include <linux/nsproxy.h>
> > > > > > >  
> > > > > > >  #include "autofs_i.h"
> > > > > > >  
> > > > > > > @@ -179,32 +180,130 @@ static int
> > > > > > > autofs_dev_ioctl_protosubver(struct
> > > > > > > file *fp,
> > > > > > >  	return 0;
> > > > > > >  }
> > > > > > >  
> > > > > > > +struct filter_autofs_data {
> > > > > > > +	char *pathbuf;
> > > > > > > +	const char *fpathname;
> > > > > > > +	int (*test)(const struct path *path, void *data);
> > > > > > > +	void *data;
> > > > > > > +};
> > > > > > > +
> > > > > > > +static int filter_autofs(const struct path *path, void
> > > > > > > *p)
> > > > > > > +{
> > > > > > > +	struct filter_autofs_data *data = p;
> > > > > > > +	char *name;
> > > > > > > +	int err;
> > > > > > > +
> > > > > > > +	if (path->mnt->mnt_sb->s_magic != AUTOFS_SUPER_MAGIC)
> > > > > > > +		return 0;
> > > > > > > +
> > > > > > > +	name = d_path(path, data->pathbuf, PATH_MAX);
> > > > > > > +	if (IS_ERR(name)) {
> > > > > > > +		err = PTR_ERR(name);
> > > > > > > +		pr_err("d_path failed, errno %d\n", err);
> > > > > > > +		return 0;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	if (strncmp(data->fpathname, name, PATH_MAX))
> > > > > > > +		return 0;
> > > > > > > +
> > > > > > > +	if (!data->test(path, data->data))
> > > > > > > +		return 0;
> > > > > > > +
> > > > > > > +	return 1;
> > > > > > > +}
> > > > > > > +
> > > > > > >  /* Find the topmost mount satisfying test() */
> > > > > > >  static int find_autofs_mount(const char *pathname,
> > > > > > >  			     struct path *res,
> > > > > > >  			     int test(const struct path *path,
> > > > > > > void
> > > > > > > *data),
> > > > > > >  			     void *data)
> > > > > > >  {
> > > > > > > -	struct path path;
> > > > > > > +	struct filter_autofs_data mdata = {
> > > > > > > +		.pathbuf = NULL,
> > > > > > > +		.test = test,
> > > > > > > +		.data = data,
> > > > > > > +	};
> > > > > > > +	struct mnt_namespace *mnt_ns = current->nsproxy-
> > > > > > > > mnt_ns;
> > > > > > > +	struct path path = {};
> > > > > > > +	char *fpathbuf = NULL;
> > > > > > >  	int err;
> > > > > > >  
> > > > > > > +	/*
> > > > > > > +	 * In most cases user will provide full path to autofs
> > > > > > > mount
> > > > > > > point
> > > > > > > +	 * as it is in /proc/X/mountinfo. But if not, then we
> > > > > > > need to
> > > > > > > +	 * open provided relative path and calculate full path.
> > > > > > > +	 * It will not work in case when parent mount of autofs
> > > > > > > mount
> > > > > > > +	 * is overmounted:
> > > > > > > +	 * cd /root
> > > > > > > +	 * ./autofs_mount /root/autofs_yard/mnt
> > > > > > > +	 * mount -t tmpfs tmpfs /root/autofs_yard/mnt
> > > > > > > +	 * mount -t tmpfs tmpfs /root/autofs_yard
> > > > > > > +	 * ./call_ioctl /root/autofs_yard/mnt <- all fine here
> > > > > > > because
> > > > > > > we
> > > > > > > +	 * 					 have full
> > > > > > > path and
> > > > > > > don't
> > > > > > > +	 * 					 need to call
> > > > > > > kern_path()
> > > > > > > +	 * 					 and d_path()
> > > > > > > +	 * ./call_ioctl autofs_yard/mnt <- will fail because
> > > > > > > kern_path()
> > > > > > > +	 * 				   can't lookup
> > > > > > > /root/autofs_yard/mnt
> > > > > > > +	 * 				   (/root/autofs_yard
> > > > > > > directory is
> > > > > > > +	 * 				    empty)
> > > > > > > +	 *
> > > > > > > +	 * TO DISCUSS: we can write special algorithm for
> > > > > > > relative path
> > > > > > > case
> > > > > > > +	 * by getting cwd path combining it with relative path
> > > > > > > from
> > > > > > > user. But
> > > > > > > +	 * is it worth it? User also may use paths with
> > > > > > > symlinks in
> > > > > > > components
> > > > > > > +	 * of path.
> > > > > > > +	 *
> > > > > > > +	 */
> > > > > > >  	err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path);
> > > > > > > -	if (err)
> > > > > > > -		return err;
> > > > > > > -	err = -ENOENT;
> > > > > > > -	while (path.dentry == path.mnt->mnt_root) {
> > > > > > > -		if (path.dentry->d_sb->s_magic ==
> > > > > > > AUTOFS_SUPER_MAGIC) {
> > > > > > > -			if (test(&path, data)) {
> > > > > > > -				path_get(&path);
> > > > > > > -				*res = path;
> > > > > > > -				err = 0;
> > > > > > > -				break;
> > > > > > > -			}
> > > > > > > +	if (err) {
> > > > > > > +		if (pathname[0] == '/') {
> > > > > > > +			/*
> > > > > > > +			 * pathname looks like full path let's
> > > > > > > try to
> > > > > > > use it
> > > > > > > +			 * as it is when searching autofs mount
> > > > > > > +			 */
> > > > > > > +			mdata.fpathname = pathname;
> > > > > > > +			err = 0;
> > > > > > > +			pr_debug("kern_path failed on %s, errno
> > > > > > > %d.
> > > > > > > Will use path as it is to search mount\n",
> > > > > > > +				 pathname, err);
> > > > > > > +		} else {
> > > > > > > +			pr_err("kern_path failed on %s, errno
> > > > > > > %d\n",
> > > > > > > +			       pathname, err);
> > > > > > > +			return err;
> > > > > > > +		}
> > > > > > > +	} else {
> > > > > > > +		pr_debug("find_autofs_mount: let's resolve full
> > > > > > > path
> > > > > > > %s\n",
> > > > > > > +			 pathname);
> > > > > > > +
> > > > > > > +		fpathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> > > > > > > +		if (!fpathbuf) {
> > > > > > > +			err = -ENOMEM;
> > > > > > > +			goto err;
> > > > > > > +		}
> > > > > > > +
> > > > > > > +		/*
> > > > > > > +		 * We have pathname from user but it may be
> > > > > > > relative,
> > > > > > > we need to
> > > > > > > +		 * have full path because we want to compare it
> > > > > > > with
> > > > > > > mountpoints
> > > > > > > +		 * paths later.
> > > > > > > +		 */
> > > > > > > +		mdata.fpathname = d_path(&path, fpathbuf,
> > > > > > > PATH_MAX);
> > > > > > > +		if (IS_ERR(mdata.fpathname)) {
> > > > > > > +			err = PTR_ERR(mdata.fpathname);
> > > > > > > +			pr_err("d_path failed, errno %d\n",
> > > > > > > err);
> > > > > > > +			goto err;
> > > > > > >  		}
> > > > > > > -		if (!follow_up(&path))
> > > > > > > -			break;
> > > > > > >  	}
> > > > > > > +
> > > > > > > +	mdata.pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> > > > > > > +	if (!mdata.pathbuf) {
> > > > > > > +		err = -ENOMEM;
> > > > > > > +		goto err;
> > > > > > > +	}
> > > > > > > +
> > > > > > > +	err = lookup_mount_path(mnt_ns, res, filter_autofs,
> > > > > > > &mdata);
> > > > > > > +
> > > > > > > +err:
> > > > > > >  	path_put(&path);
> > > > > > > +	kfree(fpathbuf);
> > > > > > > +	kfree(mdata.pathbuf);
> > > > > > >  	return err;
> > > > > > >  }
> > > > > > >  
> > > > > > > diff --git a/fs/namespace.c b/fs/namespace.c
> > > > > > > index 56bb5a5fdc0d..e1d006dbdfe2 100644
> > > > > > > --- a/fs/namespace.c
> > > > > > > +++ b/fs/namespace.c
> > > > > > > @@ -1367,6 +1367,50 @@ void mnt_cursor_del(struct
> > > > > > > mnt_namespace
> > > > > > > *ns,
> > > > > > > struct mount *cursor)
> > > > > > >  }
> > > > > > >  #endif  /* CONFIG_PROC_FS */
> > > > > > >  
> > > > > > > +/**
> > > > > > > + * lookup_mount_path - traverse all mounts in mount
> > > > > > > namespace
> > > > > > > + *                     and filter using test() probe
> > > > > > > callback
> > > > > > > + * As a result struct path will be provided.
> > > > > > > + * @ns: root of mount tree
> > > > > > > + * @res: struct path pointer where resulting path will
> > > > > > > be
> > > > > > > written
> > > > > > > + * @test: filter callback
> > > > > > > + * @data: will be provided as argument to test()
> > > > > > > callback
> > > > > > > + *
> > > > > > > + */
> > > > > > > +int lookup_mount_path(struct mnt_namespace *ns,
> > > > > > > +		      struct path *res,
> > > > > > > +		      int test(const struct path *mnt, void
> > > > > > > *data),
> > > > > > > +		      void *data)
> > > > > > > +{
> > > > > > > +	struct mount *mnt;
> > > > > > > +	int err = -ENOENT;
> > > > > > > +
> > > > > > > +	down_read(&namespace_sem);
> > > > > > > +	lock_ns_list(ns);
> > > > > > > +	list_for_each_entry(mnt, &ns->list, mnt_list) {
> > > > > > > +		struct path tmppath;
> > > > > > > +
> > > > > > > +		if (mnt_is_cursor(mnt))
> > > > > > > +			continue;
> > > > > > > +
> > > > > > > +		tmppath.dentry = mnt->mnt.mnt_root;
> > > > > > > +		tmppath.mnt = &mnt->mnt;
> > > > > > > +
> > > > > > > +		if (test(&tmppath, data)) {
> > > > > > > +			path_get(&tmppath);
> > > > > > > +			*res = tmppath;
> > > > > > > +			err = 0;
> > > > > > > +			break;
> > > > > > > +		}
> > > > > > > +	}
> > > > > > > +	unlock_ns_list(ns);
> > > > > > > +	up_read(&namespace_sem);
> > > > > > > +
> > > > > > > +	return err;
> > > > > > > +}
> > > > > > > +
> > > > > > > +EXPORT_SYMBOL(lookup_mount_path);
> > > > > > > +
> > > > > > >  /**
> > > > > > >   * may_umount_tree - check if a mount tree is busy
> > > > > > >   * @mnt: root of mount tree
> > > > > > > diff --git a/include/linux/mount.h
> > > > > > > b/include/linux/mount.h
> > > > > > > index 5d92a7e1a742..a79e6392e38e 100644
> > > > > > > --- a/include/linux/mount.h
> > > > > > > +++ b/include/linux/mount.h
> > > > > > > @@ -118,6 +118,11 @@ extern unsigned int
> > > > > > > sysctl_mount_max;
> > > > > > >  
> > > > > > >  extern bool path_is_mountpoint(const struct path *path);
> > > > > > >  
> > > > > > > +extern int lookup_mount_path(struct mnt_namespace *ns,
> > > > > > > +			     struct path *res,
> > > > > > > +			     int test(const struct path *mnt,
> > > > > > > void
> > > > > > > *data),
> > > > > > > +			     void *data);
> > > > > > > +
> > > > > > >  extern void kern_unmount_array(struct vfsmount *mnt[],
> > > > > > > unsigned
> > > > > > > int
> > > > > > > num);
> > > > > > >  
> > > > > > >  #endif /* _LINUX_MOUNT_H */
Alexander Mikhalitsyn March 24, 2021, 2:20 p.m. UTC | #13
On Tue, Mar 23, 2021 at 11:44 AM Ian Kent <raven@themaw.net> wrote:
>
> On Tue, 2021-03-09 at 13:43 +0300, Alexander Mikhalitsyn wrote:
> > On Sat, 06 Mar 2021 17:13:32 +0800
> > Ian Kent <raven@themaw.net> wrote:
> >
> > > On Fri, 2021-03-05 at 14:55 +0300, Alexander Mikhalitsyn wrote:
> > > > On Fri, 05 Mar 2021 18:10:02 +0800
> > > > Ian Kent <raven@themaw.net> wrote:
> > > >
> > > > > On Thu, 2021-03-04 at 13:11 +0300, Alexander Mikhalitsyn wrote:
> > > > > > On Thu, 04 Mar 2021 14:54:11 +0800
> > > > > > Ian Kent <raven@themaw.net> wrote:
> > > > > >
> > > > > > > On Wed, 2021-03-03 at 18:28 +0300, Alexander Mikhalitsyn
> > > > > > > wrote:
> > > > > > > > It was discovered that find_autofs_mount() function
> > > > > > > > in autofs not support cases when autofs mount
> > > > > > > > parent is overmounted. In this case this function will
> > > > > > > > always return -ENOENT.
> > > > > > >
> > > > > > > Ok, I get this shouldn't happen.
> > > > > > >
> > > > > > > > Real-life reproducer is fairly simple.
> > > > > > > > Consider the following mounts on root mntns:
> > > > > > > > --
> > > > > > > > 35 24 0:36 / /proc/sys/fs/binfmt_misc ... shared:16 -
> > > > > > > > autofs
> > > > > > > > systemd-
> > > > > > > > 1 ...
> > > > > > > > 654 35 0:57 / /proc/sys/fs/binfmt_misc ... shared:322 -
> > > > > > > > binfmt_misc
> > > > > > > > ...
> > > > > > > > --
> > > > > > > > and some process which calls
> > > > > > > > ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT)
> > > > > > > > $ unshare -m -p --fork --mount-proc ./process-bin
> > > > > > > >
> > > > > > > > Due to "mount-proc" /proc will be overmounted and
> > > > > > > > ioctl() will fail with -ENOENT
> > > > > > >
> > > > > > > I think I need a better explanation ...
> > > > > >
> > > > > > Thank you for the quick reply, Ian.
> > > > > > I'm sorry If my patch description was not sufficiently clear
> > > > > > and
> > > > > > detailed.
> > > > > >
> > > > > > That problem connected with CRIU (Checkpoint-Restore in
> > > > > > Userspace)
> > > > > > project.
> > > > > > In CRIU we have support of autofs mounts C/R. To acheive that
> > > > > > we
> > > > > > need
> > > > > > to use
> > > > > > ioctl's from /dev/autofs to get data about mounts, restore
> > > > > > mount
> > > > > > as
> > > > > > catatonic
> > > > > > (if needed), change pipe fd and so on. But the problem is
> > > > > > that
> > > > > > during
> > > > > > CRIU
> > > > > > dump we may meet situation when VFS subtree where autofs
> > > > > > mount
> > > > > > present was
> > > > > > overmounted as whole.
> > > > > >
> > > > > > Simpliest example is /proc/sys/fs/binfmt_misc. This mount
> > > > > > present
> > > > > > on
> > > > > > most
> > > > > > GNU/Linux distributions by default. For instance on my Fedora
> > > > > > 33:
> > > > >
> > > > > Yes, I don't know why systemd uses this direct mount, there
> > > > > must
> > > > > have been a reason for it.
> > > > >
> > > > > > trigger automount of binfmt_misc
> > > > > > $ ls /proc/sys/fs/binfmt_misc
> > > > > >
> > > > > > $ cat /proc/1/mountinfo | grep binfmt
> > > > > > 35 24 0:36 / /proc/sys/fs/binfmt_misc rw,relatime shared:16 -
> > > > > > autofs
> > > > > > systemd-1 rw,...,direct,pipe_ino=223
> > > > > > 632 35 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime
> > > > > > shared:315
> > > > > > -
> > > > > > binfmt_misc binfmt_misc rw
> > > > >
> > > > > Yes, I think this looks normal.
> > > > >
> > > > > > $ sudo unshare -m -p --fork --mount-proc sh
> > > > > > # cat /proc/self/mountinfo | grep "/proc"
> > > > > > 828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc
> > > > > > proc
> > > > > > rw
> > > > > > 829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs
> > > > > > systemd-
> > > > > > 1 rw,...,direct,pipe_ino=223
> > > > > > 943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime -
> > > > > > binfmt_misc
> > > > > > binfmt_misc rw
> > > > > > 949 828 0:57 / /proc rw...,relatime - proc proc rw
> > > > >
> > > > > Isn't this screwed up, /proc is on top of the binfmt_misc mount
> > > > > ...
> > > > >
> > > > > Is this what's seen from the root namespace?
> > > >
> > > > No-no, after issuing
> > > > $ sudo unshare -m -p --fork --mount-proc sh
> > > >
> > > > we enter to the pid+mount namespace and:
> > > >
> > > > # cat /proc/self/mountinfo | grep "/proc"
> > > >
> > > > So, it's picture from inside namespaces.
> > >
> > > Ok, so potentially some of those have been propagated from the
> > > original mount namespace.
> > >
> > > It seems to me the sensible thing would be those mounts would
> > > not propagate when a new proc has been requested. It doesn't
> > > make sense to me to carry around mounts that are not accessible
> > > because of something requested by the mount namespace creator.
> > >
> > > But that's nothing new and isn't likely to change any time soon.
> > >
> > > > > > As we can see now autofs mount /proc/sys/fs/binfmt_misc is
> > > > > > inaccessible.
> > > > > > If we do something like:
> > > > > >
> > > > > > struct autofs_dev_ioctl *param;
> > > > > > param = malloc(...);
> > > > > > devfd = open("/dev/autofs", O_RDONLY);
> > > > > > init_autofs_dev_ioctl(param);
> > > > > > param->size = size;
> > > > > > strcpy(param->path, "/proc/sys/fs/binfmt_misc");
> > > > > > param->openmount.devid = 36;
> > > > > > err = ioctl(devfd, AUTOFS_DEV_IOCTL_OPENMOUNT, param)
> > > > > >
> > > > > > now we get err = -ENOENT.
> > > > >
> > > > > Maybe that should be EINVAL, not sure about cases though.
> > > >
> > > > in current version -ENOENT is returned in this particular case
> > > >
> > > > > > > What's being said here?
> > > > > > >
> > > > > > > For a start your talking about direct mounts, I'm pretty
> > > > > > > sure
> > > > > > > this
> > > > > > > use case can't occur with indirect mounts in the sense that
> > > > > > > the
> > > > > > > indirect mount base should/must never be over mounted and
> > > > > > > IIRC
> > > > > > > that
> > > > > > > base can't be /proc (but maybe that's just mounts inside
> > > > > > > proc
> > > > > > > ...),
> > > > > > > can't remember now but from a common sense POV an indirect
> > > > > > > mount
> > > > > > > won't/can't be on /proc.
> > > > > > >
> > > > > > > And why is this ioctl be called?
> > > > > >
> > > > > > We call this ioctl during criu dump stage to open fd from
> > > > > > autofs
> > > > > > mount dentry. This fd is used later to call
> > > > > > ioctl(AUTOFS_IOC_CATATONIC)
> > > > > > (we do that on criu dump if we see that control process of
> > > > > > autofs
> > > > > > mount
> > > > > > is dead or pipe is dead).
> > > > >
> > > > > Right so your usage "is" the way it's intended, ;)
> > > >
> > > > That's good! ;)
> > > >
> > > > > > > If the mount is over mounted should that prevent expiration
> > > > > > > of
> > > > > > > the
> > > > > > > over mounted /proc anyway, so maybe the return is correct
> > > > > > > ...
> > > > > > > or
> > > > > > > not ...
> > > > > >
> > > > > > I agree that case with overmounted subtree with autofs mount
> > > > > > is
> > > > > > weird
> > > > > > case.
> > > > > > But it may be easily created by user and we in CRIU try to
> > > > > > handle
> > > > > > that.
> > > > >
> > > > > I'm not yet ready to make a call on how I think this this
> > > > > should
> > > > > be done.
> > > > >
> > > > > Since you seem to be clear on what this should be used for I'll
> > > > > need to look more closely at the patch.
> > > > >
> > > > > But, at first glance, it looked like it would break the
> > > > > existing
> > > > > function of the ioctl.
> > > > >
> > > > > Could you explain how the patch works, in particular why it
> > > > > doesn't
> > > > > break the existing functionality.
> > > >
> > > > Sure. With pleasure. Idea of patch is naive:
> > > > 1. find_autofs_mount() function called only from syscall context,
> > > > so,
> > > > we always can determine current mount namespace of caller.
> > > > So, I've introduced
> > > >
> > > > > > > > + int lookup_mount_path(struct mnt_namespace *ns,
> > > > > > > > +               struct path *res,
> > > > > > > > +               int test(const struct path *mnt, void
> > > > > > > > *data),
> > > > > > > > +               void *data)
> > > >
> > > > lookup_mount_path() helper function, which allows to traverse
> > > > mounts
> > > > list of
> > > > mount namespace and find proper autofs mount by user-provided
> > > > helper
> > > > test().
> > > >
> > > > 2. Helper function is fairly simple:
> > > > a) it checks that mount is autofs mount (by magic number on
> > > > superblock)
> > > > b) it calculates full path to mount point of each mount in mount
> > > > namespace
> > > > and compare it with path which user was provided to the
> > > > ioctl(AUTOFS_DEV_IOCTL_OPENMOUNT)
> > > > parameters.
> > >
> > > Oh right, it's using the mounts list, it isn't a path walk oriented
> > > search. That's probably why I didn't see the expected follow call.
> > >
> > > Another problem with the existing code is it will get it wrong if
> > > there is more than one autofs mount in the stack. For example an
> > > autofs submount mounted on a direct mount which is rare and not
> > > all that sensible but possible.
> >
> > Oh. :(
> >
> > > So, if we change this, there will need to be some agreed policy
> > > about which mount would is selected.
> >
> > Sure. I'm ready to write some ltp (?) tests which will cover this
> > autofs ioctl and all usecases.
> >
> > > The originally code (long before what is there now) selected the
> > > lowest mount in the stack because this mechanism is only needed
> > > for direct mounts and, as long as there's not something seriously
> > > wrong, that is the mount you would need. That's what would be
> > > needed
> > > for the case above and I think it's what's needed in your case too.
> > >
> > > I still haven't yet looked closely at your change, I'll need to do
> > > that.
> > >
> > > > Problem here is case when user provided relative path in ioctl
> > > > parameters
> > > > (struct autofs_dev_ioctl). In this case we may fail to resolve
> > > > user
> > > > provided path to
> > > > struct path. For instance:
> > >
> > > I don't like the idea of allowing a relative path as an a parameter
> > > at all. I think that should be a failure case.
> >
> > I'm too, and I've even thought about to restrict use relative paths
> > here,
> > but actual kernel code allows that, so is it will be okay to discard
> > compatibility here?
> > I know that autofs daemon implementation uses full paths and if we
> > restrict relative paths here
> > it will not break systemd/automount daemons.
>
> Sorry for not responding earlier, I've been distracted by other
> work.

No problem, thank you for your attention to the patch and discussion!

>
> I have thought about this quite a bit though and the news is not
> good I'm afraid.
>
> First, as Al mentioned, the iterator over all the name space mounts
> isn't ok, that itself is a significant problem. The only way that
> searching for a mount would be ok is if you could use one of the
> mount hash lists but I don't think you have enough information to
> do that. For example the mount point hash uses the mount point
> dentry to get the hash index and you don't have that in this case.
>
> The problem is that the number of mounts can be very large, and
> autofs can be one of the applications with a large number of mounts
> itself but it's by no means the only one, so iterating over the
> whole list of name space mounts is a no-go.

Yes, but we can overcome that by combining old search approach +
new approach which will be called only when old finds nothing.

>
> But there's more.
>
> The ioctl is meant to allow you to get a file handle of a direct
> mount that is covered by a mount at that mount point only (ie.
> something mounted on the same path directory). Anything else
> probably shouldn't be allowed.

If I understand you right, then if we have some direct autofs mount
and some parent mount of that mount was overmounted it's okay if autofs
daemon will loose control over that mount?

Example:
/mnt
/mnt/nfs <- autofs
/mnt/nfs <- nft

now overmount /mnt with something
restart autofs daemon

At this point, autofs daemon looses control over /mnt/nfs.

I don't insisting that this is the problem because we can treat that case
as weird and not common. Maybe it's worth adding some notice about parent
overmounts in autofs docs? (I can try to prepare a patch with notices
about that.)

>
> There's also a problem with allowing access from a mount name
> space other than the one in which the mount was originally
> mounted and that's because of mount propagation that can occur.
> Processes in another mount name space (that can receive these
> mounts due to admin mount propagation setup) shouldn't be allowed
> to alter these mounts using administrative ioctls. I don't check
> for that but I probably should.

Looks like they can. Because we just filtering by path in current
mnt ns + devid. devid stays the same during propagation, path
resolved in current mntns. I will check.

>
> There's the notion of ownership, even processes in the same name
> space that aren't the mounting process shouldn't be able to use
> the admin type ioctls if they aren't the process that mounted it
> unless the mount owner process has gone away. I also don't check
> the mount is not owned, that probably can be done using the struct
> pid of the owner process in the super block info., it should be
> stale when the process has gone away.
>
> And finally there's the issue of allowing access to mounts that
> are covered by a mount further up the directory tree, historically
> that is meant to hide everything below that mount point so it's
> questionable (and possibly a security problem) to try and come up
> with some way that is available modules to bypass it.

I've got it. So, some users may overmount things to restrict access
to some underlying mounts/files. It looks weird for me (user may spawn
processes with opened file descriptors to the underlying mounts before
overmounting and continue to use hidden mounts/files), but If
it's a supported case then of course we must preserve that behavior
and restrict opening overmounted things.

>
> Sorry to be so negative but cross name space access has been a
> concern of mine for quite a while and that's why I have fairly
> specific views on it.

Thank you for the detailed answer!

I've thought about ways to solve the problem with CRIU. I will
try another approach that should not involve that ioctl in overmounted
cases. Idea is to treat autofs mount which parents was overmounted
as catatonic and dump it as a catatonic from start. There is no requirement
to change that's mounts modes during the dump procedures.

I hope that we continue the discussion about that problem with
overmounted autofs and
I'm ready to prepare additional info to the Documentation to inform
users that autofs + overmounted parent is not a good idea and I will check case
when administrative ioctl() used from different mount ns.

Regards,
Alex

>
> Ian
> >
> > > These direct mounts come from a table (a map in autofs or a unit
> > > in systemd) and they should always be identified by that full path.
> > >
> > > Ian
> >
> > Thank you!
> >
> > Regards,
> > Alex
> >
> > > > # cat /proc/self/mountinfo | grep "/proc"
> > > > 828 809 0:23 / /proc rw,nosuid,nodev,noexec,relatime - proc proc
> > > > rw
> > > > 829 828 0:36 / /proc/sys/fs/binfmt_misc rw,relatime - autofs
> > > > systemd-
> > > > 1 rw,...,direct,pipe_ino=223
> > > > 943 829 0:56 / /proc/sys/fs/binfmt_misc rw,...,relatime -
> > > > binfmt_misc
> > > > binfmt_misc rw
> > > > 949 828 0:57 / /proc rw...,relatime - proc proc rw
> > > >
> > > > in this case
> > > > kern_path("/proc/sys/fs/binfmt_misc", LOOKUP_MOUNTPOINT, &path)
> > > > ==
> > > > -ENOENT
> > > >
> > > > To overcome this issue, if kern_path() failed with -ENOENT
> > > > AND user-provided mount path looks like fullpath (starts from /)
> > > > we just try to find autofs mount in mounts list just by searching
> > > > autofs mounts in mounts list with mount point path equal to user-
> > > > provided
> > > > path. This covers our problem case.
> > > >
> > > > This patch is fully compatible with old behaviour - if parent
> > > > mounts
> > > > of
> > > > autofs mount is not overmounted - then
> > > > kern_path("/proc/sys/fs/binfmt_misc", LOOKUP_MOUNTPOINT, &path)
> > > > will not fail, and we also easily find needed autofs mount in
> > > > mounts
> > > > list
> > > > of caller mount namespace.
> > > >
> > > > > Long ago I'm pretty sure I continued to follow up but IIRC that
> > > > > went away and was replaced by a single follow_up(), but since
> > > > > the changes didn't break the existing function of autofs I
> > > > > didn't pay that much attention to them, I'll need to look at
> > > > > that too. Not only that, the namespace code has moved a long
> > > > > way too however there's still little attention given to
> > > > > sanitizing the mounts in the new namespace by anything that I'm
> > > > > aware of that uses the feature. TBH I'm not sure why I don't
> > > > > see a lot more problems of that nature.
> > > > >
> > > > > I have to wonder if what's needed is attention to the follow up
> > > > > but that /proc covering the earlier mounts is a bit of a
> > > > > concern.
> > > > >
> > > > > > > I get that the mount namespaces should be independent and
> > > > > > > intuitively
> > > > > > > this is a bug but what is the actual use and expected
> > > > > > > result.
> > > > > > >
> > > > > > > But anyway, aren't you saying that the VFS path walk isn't
> > > > > > > handling
> > > > > > > mount namespaces properly or are you saying that a process
> > > > > > > outside
> > > > > > > this new mount namespace becomes broken because of it?
> > > > > >
> > > > > > No-no, it's only about opening autofs mount by device id +
> > > > > > path.
> > > > >
> > > > > That's right, specifically getting a file handle to a covered
> > > > > autofs
> > > > > mount for things like bringing it back to life etc. But that
> > > > > silently
> > > > > implies the same mount namespace.
> > > > >
> > > > > Let me look at the patch and think about it a bit.
> > > > > I'll probably need to run some tests too.
> > > > > I am a little busy right now so it may take a bit of time.
> > > > >
> > > > > Ian
> > > >
> > > > Thank you very much for your attention to the patch and comments.
> > > >
> > > > Regards,
> > > > Alex
> > > >
> > > > > > > Either way the solution looks more complicated than I'd
> > > > > > > expect
> > > > > > > so
> > > > > > > some explanation along these lines would be good.
> > > > > > >
> > > > > > > Ian
> > > > > >
> > > > > > Regards,
> > > > > > Alex
> > > > > >
> > > > > > > > Cc: Matthew Wilcox <willy@infradead.org>
> > > > > > > > Cc: Al Viro <viro@zeniv.linux.org.uk>
> > > > > > > > Cc: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
> > > > > > > > Cc: Kirill Tkhai <ktkhai@virtuozzo.com>
> > > > > > > > Cc: autofs@vger.kernel.org
> > > > > > > > Cc: linux-kernel@vger.kernel.org
> > > > > > > > Signed-off-by: Alexander Mikhalitsyn <
> > > > > > > > alexander.mikhalitsyn@virtuozzo.com>
> > > > > > > > ---
> > > > > > > >  fs/autofs/dev-ioctl.c | 127
> > > > > > > > +++++++++++++++++++++++++++++++++++++---
> > > > > > > > --
> > > > > > > >  fs/namespace.c        |  44 +++++++++++++++
> > > > > > > >  include/linux/mount.h |   5 ++
> > > > > > > >  3 files changed, 162 insertions(+), 14 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-
> > > > > > > > ioctl.c
> > > > > > > > index 5bf781ea6d67..55edd3eba8ce 100644
> > > > > > > > --- a/fs/autofs/dev-ioctl.c
> > > > > > > > +++ b/fs/autofs/dev-ioctl.c
> > > > > > > > @@ -10,6 +10,7 @@
> > > > > > > >  #include <linux/fdtable.h>
> > > > > > > >  #include <linux/magic.h>
> > > > > > > >  #include <linux/nospec.h>
> > > > > > > > +#include <linux/nsproxy.h>
> > > > > > > >
> > > > > > > >  #include "autofs_i.h"
> > > > > > > >
> > > > > > > > @@ -179,32 +180,130 @@ static int
> > > > > > > > autofs_dev_ioctl_protosubver(struct
> > > > > > > > file *fp,
> > > > > > > >   return 0;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +struct filter_autofs_data {
> > > > > > > > + char *pathbuf;
> > > > > > > > + const char *fpathname;
> > > > > > > > + int (*test)(const struct path *path, void *data);
> > > > > > > > + void *data;
> > > > > > > > +};
> > > > > > > > +
> > > > > > > > +static int filter_autofs(const struct path *path, void
> > > > > > > > *p)
> > > > > > > > +{
> > > > > > > > + struct filter_autofs_data *data = p;
> > > > > > > > + char *name;
> > > > > > > > + int err;
> > > > > > > > +
> > > > > > > > + if (path->mnt->mnt_sb->s_magic != AUTOFS_SUPER_MAGIC)
> > > > > > > > +         return 0;
> > > > > > > > +
> > > > > > > > + name = d_path(path, data->pathbuf, PATH_MAX);
> > > > > > > > + if (IS_ERR(name)) {
> > > > > > > > +         err = PTR_ERR(name);
> > > > > > > > +         pr_err("d_path failed, errno %d\n", err);
> > > > > > > > +         return 0;
> > > > > > > > + }
> > > > > > > > +
> > > > > > > > + if (strncmp(data->fpathname, name, PATH_MAX))
> > > > > > > > +         return 0;
> > > > > > > > +
> > > > > > > > + if (!data->test(path, data->data))
> > > > > > > > +         return 0;
> > > > > > > > +
> > > > > > > > + return 1;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  /* Find the topmost mount satisfying test() */
> > > > > > > >  static int find_autofs_mount(const char *pathname,
> > > > > > > >                        struct path *res,
> > > > > > > >                        int test(const struct path *path,
> > > > > > > > void
> > > > > > > > *data),
> > > > > > > >                        void *data)
> > > > > > > >  {
> > > > > > > > - struct path path;
> > > > > > > > + struct filter_autofs_data mdata = {
> > > > > > > > +         .pathbuf = NULL,
> > > > > > > > +         .test = test,
> > > > > > > > +         .data = data,
> > > > > > > > + };
> > > > > > > > + struct mnt_namespace *mnt_ns = current->nsproxy-
> > > > > > > > > mnt_ns;
> > > > > > > > + struct path path = {};
> > > > > > > > + char *fpathbuf = NULL;
> > > > > > > >   int err;
> > > > > > > >
> > > > > > > > + /*
> > > > > > > > +  * In most cases user will provide full path to autofs
> > > > > > > > mount
> > > > > > > > point
> > > > > > > > +  * as it is in /proc/X/mountinfo. But if not, then we
> > > > > > > > need to
> > > > > > > > +  * open provided relative path and calculate full path.
> > > > > > > > +  * It will not work in case when parent mount of autofs
> > > > > > > > mount
> > > > > > > > +  * is overmounted:
> > > > > > > > +  * cd /root
> > > > > > > > +  * ./autofs_mount /root/autofs_yard/mnt
> > > > > > > > +  * mount -t tmpfs tmpfs /root/autofs_yard/mnt
> > > > > > > > +  * mount -t tmpfs tmpfs /root/autofs_yard
> > > > > > > > +  * ./call_ioctl /root/autofs_yard/mnt <- all fine here
> > > > > > > > because
> > > > > > > > we
> > > > > > > > +  *                                       have full
> > > > > > > > path and
> > > > > > > > don't
> > > > > > > > +  *                                       need to call
> > > > > > > > kern_path()
> > > > > > > > +  *                                       and d_path()
> > > > > > > > +  * ./call_ioctl autofs_yard/mnt <- will fail because
> > > > > > > > kern_path()
> > > > > > > > +  *                                 can't lookup
> > > > > > > > /root/autofs_yard/mnt
> > > > > > > > +  *                                 (/root/autofs_yard
> > > > > > > > directory is
> > > > > > > > +  *                                  empty)
> > > > > > > > +  *
> > > > > > > > +  * TO DISCUSS: we can write special algorithm for
> > > > > > > > relative path
> > > > > > > > case
> > > > > > > > +  * by getting cwd path combining it with relative path
> > > > > > > > from
> > > > > > > > user. But
> > > > > > > > +  * is it worth it? User also may use paths with
> > > > > > > > symlinks in
> > > > > > > > components
> > > > > > > > +  * of path.
> > > > > > > > +  *
> > > > > > > > +  */
> > > > > > > >   err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path);
> > > > > > > > - if (err)
> > > > > > > > -         return err;
> > > > > > > > - err = -ENOENT;
> > > > > > > > - while (path.dentry == path.mnt->mnt_root) {
> > > > > > > > -         if (path.dentry->d_sb->s_magic ==
> > > > > > > > AUTOFS_SUPER_MAGIC) {
> > > > > > > > -                 if (test(&path, data)) {
> > > > > > > > -                         path_get(&path);
> > > > > > > > -                         *res = path;
> > > > > > > > -                         err = 0;
> > > > > > > > -                         break;
> > > > > > > > -                 }
> > > > > > > > + if (err) {
> > > > > > > > +         if (pathname[0] == '/') {
> > > > > > > > +                 /*
> > > > > > > > +                  * pathname looks like full path let's
> > > > > > > > try to
> > > > > > > > use it
> > > > > > > > +                  * as it is when searching autofs mount
> > > > > > > > +                  */
> > > > > > > > +                 mdata.fpathname = pathname;
> > > > > > > > +                 err = 0;
> > > > > > > > +                 pr_debug("kern_path failed on %s, errno
> > > > > > > > %d.
> > > > > > > > Will use path as it is to search mount\n",
> > > > > > > > +                          pathname, err);
> > > > > > > > +         } else {
> > > > > > > > +                 pr_err("kern_path failed on %s, errno
> > > > > > > > %d\n",
> > > > > > > > +                        pathname, err);
> > > > > > > > +                 return err;
> > > > > > > > +         }
> > > > > > > > + } else {
> > > > > > > > +         pr_debug("find_autofs_mount: let's resolve full
> > > > > > > > path
> > > > > > > > %s\n",
> > > > > > > > +                  pathname);
> > > > > > > > +
> > > > > > > > +         fpathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> > > > > > > > +         if (!fpathbuf) {
> > > > > > > > +                 err = -ENOMEM;
> > > > > > > > +                 goto err;
> > > > > > > > +         }
> > > > > > > > +
> > > > > > > > +         /*
> > > > > > > > +          * We have pathname from user but it may be
> > > > > > > > relative,
> > > > > > > > we need to
> > > > > > > > +          * have full path because we want to compare it
> > > > > > > > with
> > > > > > > > mountpoints
> > > > > > > > +          * paths later.
> > > > > > > > +          */
> > > > > > > > +         mdata.fpathname = d_path(&path, fpathbuf,
> > > > > > > > PATH_MAX);
> > > > > > > > +         if (IS_ERR(mdata.fpathname)) {
> > > > > > > > +                 err = PTR_ERR(mdata.fpathname);
> > > > > > > > +                 pr_err("d_path failed, errno %d\n",
> > > > > > > > err);
> > > > > > > > +                 goto err;
> > > > > > > >           }
> > > > > > > > -         if (!follow_up(&path))
> > > > > > > > -                 break;
> > > > > > > >   }
> > > > > > > > +
> > > > > > > > + mdata.pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
> > > > > > > > + if (!mdata.pathbuf) {
> > > > > > > > +         err = -ENOMEM;
> > > > > > > > +         goto err;
> > > > > > > > + }
> > > > > > > > +
> > > > > > > > + err = lookup_mount_path(mnt_ns, res, filter_autofs,
> > > > > > > > &mdata);
> > > > > > > > +
> > > > > > > > +err:
> > > > > > > >   path_put(&path);
> > > > > > > > + kfree(fpathbuf);
> > > > > > > > + kfree(mdata.pathbuf);
> > > > > > > >   return err;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > diff --git a/fs/namespace.c b/fs/namespace.c
> > > > > > > > index 56bb5a5fdc0d..e1d006dbdfe2 100644
> > > > > > > > --- a/fs/namespace.c
> > > > > > > > +++ b/fs/namespace.c
> > > > > > > > @@ -1367,6 +1367,50 @@ void mnt_cursor_del(struct
> > > > > > > > mnt_namespace
> > > > > > > > *ns,
> > > > > > > > struct mount *cursor)
> > > > > > > >  }
> > > > > > > >  #endif  /* CONFIG_PROC_FS */
> > > > > > > >
> > > > > > > > +/**
> > > > > > > > + * lookup_mount_path - traverse all mounts in mount
> > > > > > > > namespace
> > > > > > > > + *                     and filter using test() probe
> > > > > > > > callback
> > > > > > > > + * As a result struct path will be provided.
> > > > > > > > + * @ns: root of mount tree
> > > > > > > > + * @res: struct path pointer where resulting path will
> > > > > > > > be
> > > > > > > > written
> > > > > > > > + * @test: filter callback
> > > > > > > > + * @data: will be provided as argument to test()
> > > > > > > > callback
> > > > > > > > + *
> > > > > > > > + */
> > > > > > > > +int lookup_mount_path(struct mnt_namespace *ns,
> > > > > > > > +               struct path *res,
> > > > > > > > +               int test(const struct path *mnt, void
> > > > > > > > *data),
> > > > > > > > +               void *data)
> > > > > > > > +{
> > > > > > > > + struct mount *mnt;
> > > > > > > > + int err = -ENOENT;
> > > > > > > > +
> > > > > > > > + down_read(&namespace_sem);
> > > > > > > > + lock_ns_list(ns);
> > > > > > > > + list_for_each_entry(mnt, &ns->list, mnt_list) {
> > > > > > > > +         struct path tmppath;
> > > > > > > > +
> > > > > > > > +         if (mnt_is_cursor(mnt))
> > > > > > > > +                 continue;
> > > > > > > > +
> > > > > > > > +         tmppath.dentry = mnt->mnt.mnt_root;
> > > > > > > > +         tmppath.mnt = &mnt->mnt;
> > > > > > > > +
> > > > > > > > +         if (test(&tmppath, data)) {
> > > > > > > > +                 path_get(&tmppath);
> > > > > > > > +                 *res = tmppath;
> > > > > > > > +                 err = 0;
> > > > > > > > +                 break;
> > > > > > > > +         }
> > > > > > > > + }
> > > > > > > > + unlock_ns_list(ns);
> > > > > > > > + up_read(&namespace_sem);
> > > > > > > > +
> > > > > > > > + return err;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +EXPORT_SYMBOL(lookup_mount_path);
> > > > > > > > +
> > > > > > > >  /**
> > > > > > > >   * may_umount_tree - check if a mount tree is busy
> > > > > > > >   * @mnt: root of mount tree
> > > > > > > > diff --git a/include/linux/mount.h
> > > > > > > > b/include/linux/mount.h
> > > > > > > > index 5d92a7e1a742..a79e6392e38e 100644
> > > > > > > > --- a/include/linux/mount.h
> > > > > > > > +++ b/include/linux/mount.h
> > > > > > > > @@ -118,6 +118,11 @@ extern unsigned int
> > > > > > > > sysctl_mount_max;
> > > > > > > >
> > > > > > > >  extern bool path_is_mountpoint(const struct path *path);
> > > > > > > >
> > > > > > > > +extern int lookup_mount_path(struct mnt_namespace *ns,
> > > > > > > > +                      struct path *res,
> > > > > > > > +                      int test(const struct path *mnt,
> > > > > > > > void
> > > > > > > > *data),
> > > > > > > > +                      void *data);
> > > > > > > > +
> > > > > > > >  extern void kern_unmount_array(struct vfsmount *mnt[],
> > > > > > > > unsigned
> > > > > > > > int
> > > > > > > > num);
> > > > > > > >
> > > > > > > >  #endif /* _LINUX_MOUNT_H */
>
diff mbox series

Patch

diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 5bf781ea6d67..55edd3eba8ce 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -10,6 +10,7 @@ 
 #include <linux/fdtable.h>
 #include <linux/magic.h>
 #include <linux/nospec.h>
+#include <linux/nsproxy.h>
 
 #include "autofs_i.h"
 
@@ -179,32 +180,130 @@  static int autofs_dev_ioctl_protosubver(struct file *fp,
 	return 0;
 }
 
+struct filter_autofs_data {
+	char *pathbuf;
+	const char *fpathname;
+	int (*test)(const struct path *path, void *data);
+	void *data;
+};
+
+static int filter_autofs(const struct path *path, void *p)
+{
+	struct filter_autofs_data *data = p;
+	char *name;
+	int err;
+
+	if (path->mnt->mnt_sb->s_magic != AUTOFS_SUPER_MAGIC)
+		return 0;
+
+	name = d_path(path, data->pathbuf, PATH_MAX);
+	if (IS_ERR(name)) {
+		err = PTR_ERR(name);
+		pr_err("d_path failed, errno %d\n", err);
+		return 0;
+	}
+
+	if (strncmp(data->fpathname, name, PATH_MAX))
+		return 0;
+
+	if (!data->test(path, data->data))
+		return 0;
+
+	return 1;
+}
+
 /* Find the topmost mount satisfying test() */
 static int find_autofs_mount(const char *pathname,
 			     struct path *res,
 			     int test(const struct path *path, void *data),
 			     void *data)
 {
-	struct path path;
+	struct filter_autofs_data mdata = {
+		.pathbuf = NULL,
+		.test = test,
+		.data = data,
+	};
+	struct mnt_namespace *mnt_ns = current->nsproxy->mnt_ns;
+	struct path path = {};
+	char *fpathbuf = NULL;
 	int err;
 
+	/*
+	 * In most cases user will provide full path to autofs mount point
+	 * as it is in /proc/X/mountinfo. But if not, then we need to
+	 * open provided relative path and calculate full path.
+	 * It will not work in case when parent mount of autofs mount
+	 * is overmounted:
+	 * cd /root
+	 * ./autofs_mount /root/autofs_yard/mnt
+	 * mount -t tmpfs tmpfs /root/autofs_yard/mnt
+	 * mount -t tmpfs tmpfs /root/autofs_yard
+	 * ./call_ioctl /root/autofs_yard/mnt <- all fine here because we
+	 * 					 have full path and don't
+	 * 					 need to call kern_path()
+	 * 					 and d_path()
+	 * ./call_ioctl autofs_yard/mnt <- will fail because kern_path()
+	 * 				   can't lookup /root/autofs_yard/mnt
+	 * 				   (/root/autofs_yard directory is
+	 * 				    empty)
+	 *
+	 * TO DISCUSS: we can write special algorithm for relative path case
+	 * by getting cwd path combining it with relative path from user. But
+	 * is it worth it? User also may use paths with symlinks in components
+	 * of path.
+	 *
+	 */
 	err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path);
-	if (err)
-		return err;
-	err = -ENOENT;
-	while (path.dentry == path.mnt->mnt_root) {
-		if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) {
-			if (test(&path, data)) {
-				path_get(&path);
-				*res = path;
-				err = 0;
-				break;
-			}
+	if (err) {
+		if (pathname[0] == '/') {
+			/*
+			 * pathname looks like full path let's try to use it
+			 * as it is when searching autofs mount
+			 */
+			mdata.fpathname = pathname;
+			err = 0;
+			pr_debug("kern_path failed on %s, errno %d. Will use path as it is to search mount\n",
+				 pathname, err);
+		} else {
+			pr_err("kern_path failed on %s, errno %d\n",
+			       pathname, err);
+			return err;
+		}
+	} else {
+		pr_debug("find_autofs_mount: let's resolve full path %s\n",
+			 pathname);
+
+		fpathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+		if (!fpathbuf) {
+			err = -ENOMEM;
+			goto err;
+		}
+
+		/*
+		 * We have pathname from user but it may be relative, we need to
+		 * have full path because we want to compare it with mountpoints
+		 * paths later.
+		 */
+		mdata.fpathname = d_path(&path, fpathbuf, PATH_MAX);
+		if (IS_ERR(mdata.fpathname)) {
+			err = PTR_ERR(mdata.fpathname);
+			pr_err("d_path failed, errno %d\n", err);
+			goto err;
 		}
-		if (!follow_up(&path))
-			break;
 	}
+
+	mdata.pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!mdata.pathbuf) {
+		err = -ENOMEM;
+		goto err;
+	}
+
+	err = lookup_mount_path(mnt_ns, res, filter_autofs, &mdata);
+
+err:
 	path_put(&path);
+	kfree(fpathbuf);
+	kfree(mdata.pathbuf);
 	return err;
 }
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 56bb5a5fdc0d..e1d006dbdfe2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1367,6 +1367,50 @@  void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
 }
 #endif  /* CONFIG_PROC_FS */
 
+/**
+ * lookup_mount_path - traverse all mounts in mount namespace
+ *                     and filter using test() probe callback
+ * As a result struct path will be provided.
+ * @ns: root of mount tree
+ * @res: struct path pointer where resulting path will be written
+ * @test: filter callback
+ * @data: will be provided as argument to test() callback
+ *
+ */
+int lookup_mount_path(struct mnt_namespace *ns,
+		      struct path *res,
+		      int test(const struct path *mnt, void *data),
+		      void *data)
+{
+	struct mount *mnt;
+	int err = -ENOENT;
+
+	down_read(&namespace_sem);
+	lock_ns_list(ns);
+	list_for_each_entry(mnt, &ns->list, mnt_list) {
+		struct path tmppath;
+
+		if (mnt_is_cursor(mnt))
+			continue;
+
+		tmppath.dentry = mnt->mnt.mnt_root;
+		tmppath.mnt = &mnt->mnt;
+
+		if (test(&tmppath, data)) {
+			path_get(&tmppath);
+			*res = tmppath;
+			err = 0;
+			break;
+		}
+	}
+	unlock_ns_list(ns);
+	up_read(&namespace_sem);
+
+	return err;
+}
+
+EXPORT_SYMBOL(lookup_mount_path);
+
 /**
  * may_umount_tree - check if a mount tree is busy
  * @mnt: root of mount tree
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 5d92a7e1a742..a79e6392e38e 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -118,6 +118,11 @@  extern unsigned int sysctl_mount_max;
 
 extern bool path_is_mountpoint(const struct path *path);
 
+extern int lookup_mount_path(struct mnt_namespace *ns,
+			     struct path *res,
+			     int test(const struct path *mnt, void *data),
+			     void *data);
+
 extern void kern_unmount_array(struct vfsmount *mnt[], unsigned int num);
 
 #endif /* _LINUX_MOUNT_H */