diff mbox series

[v3,08/13] fanotify: enable FAN_REPORT_FID init flag

Message ID 20181125134352.21499-9-amir73il@gmail.com (mailing list archive)
State New, archived
Headers show
Series fanotify: add support for more event types | expand

Commit Message

Amir Goldstein Nov. 25, 2018, 1:43 p.m. UTC
When setting up an fanotify listener, user may request to get fid
information in event instead of an open file descriptor.

The fid obtained with event on a watched object contains the file
handle returned by name_to_handle_at(2) and fsid returned by statfs(2).

When setting a mark, we need to make sure that the filesystem
supports encoding file handles with name_to_handle_at(2) and that
statfs(2) encodes a non-zero fsid.

Cc: <linux-api@vger.kernel.org>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/notify/fanotify/fanotify_user.c | 54 +++++++++++++++++++++++++++++-
 fs/statfs.c                        |  4 ++-
 include/linux/fanotify.h           |  2 +-
 include/linux/statfs.h             |  3 ++
 4 files changed, 60 insertions(+), 3 deletions(-)

Comments

Jan Kara Nov. 29, 2018, 9:46 a.m. UTC | #1
On Sun 25-11-18 15:43:47, Amir Goldstein wrote:
> When setting up an fanotify listener, user may request to get fid
> information in event instead of an open file descriptor.
> 
> The fid obtained with event on a watched object contains the file
> handle returned by name_to_handle_at(2) and fsid returned by statfs(2).
> 
> When setting a mark, we need to make sure that the filesystem
> supports encoding file handles with name_to_handle_at(2) and that
> statfs(2) encodes a non-zero fsid.
...
> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> index ea8e81a3e80b..d7aa2f392a64 100644
> --- a/fs/notify/fanotify/fanotify_user.c
> +++ b/fs/notify/fanotify/fanotify_user.c
> @@ -857,6 +859,49 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
>  	return fd;
>  }
>  
> +/* Check if filesystem can encode a unique fid */
> +static int fanotify_test_fid(struct path *path)
> +{
> +	struct kstatfs stat, root_stat;
> +	int err;
> +
> +	/*
> +	 * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
> +	 * TODO: cache fsid in the mark connector.
> +	 */

TODO in a submitted patch looks strange (looks like the patch isn't done
yet ;)) and in this particular case the code is perfectly justified even
without talking about future functionality...

> +	err = vfs_statfs(path, &stat);
> +	if (err)
> +		return err;
> +
> +	if (!stat.f_fsid.val[0] && !stat.f_fsid.val[1])
> +		return -ENODEV;
> +
> +	/*
> +	 * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
> +	 * which uses a different fsid than sb root.
> +	 */
> +	err = statfs_by_dentry(path->dentry->d_sb->s_root, &root_stat);
> +	if (err)
> +		return err;
> +
> +	if (root_stat.f_fsid.val[0] != stat.f_fsid.val[0] ||
> +	    root_stat.f_fsid.val[1] != stat.f_fsid.val[1])
> +		return -EXDEV;

I think inode watches in subvolumes are actually fine? The same fs object
is going to get different struct inode for different subvolumes if I
remember right. So there won't be any surprises with unexpected fsid being
reported.

Also mount watches are actually fine for subvolume as different subvolumes
appear as different mountpoints, don't they? And I think implementation
that would have different fsid for inodes in the same mountpoint would be
indeed very weird. So again no problem with fsid mismatch.

So we need this check only for superblock watches.

> diff --git a/fs/statfs.c b/fs/statfs.c
> index f0216629621d..6a5d840a2d8d 100644
> --- a/fs/statfs.c
> +++ b/fs/statfs.c
> @@ -50,7 +50,8 @@ static int calculate_f_flags(struct vfsmount *mnt)
>  		flags_by_sb(mnt->mnt_sb->s_flags);
>  }
>  
> -static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
> +/* Does not set buf->f_flags */
> +int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
>  {
>  	int retval;
>  
> @@ -66,6 +67,7 @@ static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
>  		buf->f_frsize = buf->f_bsize;
>  	return retval;
>  }
> +EXPORT_SYMBOL(statfs_by_dentry);
>  
>  int vfs_statfs(const struct path *path, struct kstatfs *buf)
>  {

Make this export a separate patch, CC Al Viro on it. Honestly I'm not very
happy about statfs_by_dentry() interface as it may return different result
than vfs_statfs() so it just waits for someone careless to use
statfs_by_dentry() and get burnt. How about implementing vfs_get_fsid()
that will get dentry and return fsid, that will be just internally
implemented using statfs_by_dentry()? We can then use that everywhere in
fsnotify and the interface is going to be much cleaner like that. The
comment regarding CC to Al Viro and separate patch still applies though.

								Honza
Jan Kara Nov. 29, 2018, 10:52 a.m. UTC | #2
On Thu 29-11-18 10:46:36, Jan Kara wrote:
> On Sun 25-11-18 15:43:47, Amir Goldstein wrote:
> > When setting up an fanotify listener, user may request to get fid
> > information in event instead of an open file descriptor.
> > 
> > The fid obtained with event on a watched object contains the file
> > handle returned by name_to_handle_at(2) and fsid returned by statfs(2).
> > 
> > When setting a mark, we need to make sure that the filesystem
> > supports encoding file handles with name_to_handle_at(2) and that
> > statfs(2) encodes a non-zero fsid.
> ...
> > diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> > index ea8e81a3e80b..d7aa2f392a64 100644
> > --- a/fs/notify/fanotify/fanotify_user.c
> > +++ b/fs/notify/fanotify/fanotify_user.c
> > @@ -857,6 +859,49 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
> >  	return fd;
> >  }
> >  
> > +/* Check if filesystem can encode a unique fid */
> > +static int fanotify_test_fid(struct path *path)
> > +{
> > +	struct kstatfs stat, root_stat;
> > +	int err;
> > +
> > +	/*
> > +	 * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
> > +	 * TODO: cache fsid in the mark connector.
> > +	 */
> 
> TODO in a submitted patch looks strange (looks like the patch isn't done
> yet ;)) and in this particular case the code is perfectly justified even
> without talking about future functionality...
> 
> > +	err = vfs_statfs(path, &stat);
> > +	if (err)
> > +		return err;
> > +
> > +	if (!stat.f_fsid.val[0] && !stat.f_fsid.val[1])
> > +		return -ENODEV;
> > +
> > +	/*
> > +	 * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
> > +	 * which uses a different fsid than sb root.
> > +	 */
> > +	err = statfs_by_dentry(path->dentry->d_sb->s_root, &root_stat);
> > +	if (err)
> > +		return err;
> > +
> > +	if (root_stat.f_fsid.val[0] != stat.f_fsid.val[0] ||
> > +	    root_stat.f_fsid.val[1] != stat.f_fsid.val[1])
> > +		return -EXDEV;
> 
> I think inode watches in subvolumes are actually fine? The same fs object
> is going to get different struct inode for different subvolumes if I
> remember right. So there won't be any surprises with unexpected fsid being
> reported.
> 
> Also mount watches are actually fine for subvolume as different subvolumes
> appear as different mountpoints, don't they? And I think implementation
> that would have different fsid for inodes in the same mountpoint would be
> indeed very weird. So again no problem with fsid mismatch.
> 
> So we need this check only for superblock watches.

See my reply to the next patch for more on this. Also I was thinking about
the "no zero fsid" restriction. I understand where you are coming from but
IMO FAN_REPORT_FID can be useful even if the filesystem doesn't support
fsid - e.g. if you create a notification group and put there marks  only for
one filesystem, then you don't need the fsid part at all. I don't have a
strong opinion on this (we can always enable this functionality later if we
want) but wanted to run this by you.

								Honza
Amir Goldstein Nov. 29, 2018, 11:03 a.m. UTC | #3
On Thu, Nov 29, 2018 at 11:46 AM Jan Kara <jack@suse.cz> wrote:
>
> On Sun 25-11-18 15:43:47, Amir Goldstein wrote:
> > When setting up an fanotify listener, user may request to get fid
> > information in event instead of an open file descriptor.
> >
> > The fid obtained with event on a watched object contains the file
> > handle returned by name_to_handle_at(2) and fsid returned by statfs(2).
> >
> > When setting a mark, we need to make sure that the filesystem
> > supports encoding file handles with name_to_handle_at(2) and that
> > statfs(2) encodes a non-zero fsid.
> ...
> > diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> > index ea8e81a3e80b..d7aa2f392a64 100644
> > --- a/fs/notify/fanotify/fanotify_user.c
> > +++ b/fs/notify/fanotify/fanotify_user.c
> > @@ -857,6 +859,49 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
> >       return fd;
> >  }
> >
> > +/* Check if filesystem can encode a unique fid */
> > +static int fanotify_test_fid(struct path *path)
> > +{
> > +     struct kstatfs stat, root_stat;
> > +     int err;
> > +
> > +     /*
> > +      * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
> > +      * TODO: cache fsid in the mark connector.
> > +      */
>
> TODO in a submitted patch looks strange (looks like the patch isn't done
> yet ;)) and in this particular case the code is perfectly justified even
> without talking about future functionality...
>

Not to mention that I forgot to remove the TODO in the patch that adds
cached fsid ;-)

> > +     err = vfs_statfs(path, &stat);
> > +     if (err)
> > +             return err;
> > +
> > +     if (!stat.f_fsid.val[0] && !stat.f_fsid.val[1])
> > +             return -ENODEV;
> > +
> > +     /*
> > +      * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
> > +      * which uses a different fsid than sb root.
> > +      */
> > +     err = statfs_by_dentry(path->dentry->d_sb->s_root, &root_stat);
> > +     if (err)
> > +             return err;
> > +
> > +     if (root_stat.f_fsid.val[0] != stat.f_fsid.val[0] ||
> > +         root_stat.f_fsid.val[1] != stat.f_fsid.val[1])
> > +             return -EXDEV;
>
> I think inode watches in subvolumes are actually fine? The same fs object
> is going to get different struct inode for different subvolumes if I
> remember right. So there won't be any surprises with unexpected fsid being
> reported.
>
> Also mount watches are actually fine for subvolume as different subvolumes
> appear as different mountpoints, don't they? And I think implementation
> that would have different fsid for inodes in the same mountpoint would be
> indeed very weird. So again no problem with fsid mismatch.
>
> So we need this check only for superblock watches.
>

Not so simple (or is it?).
If a group has inode, mount and filesystem marks, not all added at the
same time.
When event on an object that is associated with all the above marks,
which cached fsid should be used in the report?
Naturally, it makes sense to prefer to more accurate fsid of mount/inode
over the broader fsid of filesystem. Right?
But what happens when mount/inode marks are removed?
Or if filesystem has events in the mask that inode/mount do not?
Then the same object reports events with different fsid depending
on the type of event and time it took place (which marks existed).

Not a good situation to get ourselves into.

The simple way out of this is: we do not support FAN_REPORT_FID
on marks using path that is not relative to main volume. period.

Considering the fact that FAN_REPORT_FID is mainly indented to
enable reporting directory modification events and mount marks
are not supported with reporting directory modification events, we
only loose the ability to watch modification on selective directories
inside btrfs subvolume.

I also don't like the fact that I disabled filesystem watch over tmpfs,
because for the case of watching a single filesystem or single
directory, which is quite a common case, we don't need fsid
to be non-zero and we don't care if it mismatches with s_root fsid.

A solution I was contemplating was to allow zero fsid and non
root fsid as long as it is the only sb watched by the group, so
for non unique fsid:
- store group->sb and group->fsid
- return -EXDEV for an attempt to add mark from a different sb
(no matter if it is inode/mount/sb mark)
- when trying to add mark with zero or non root fsid (common case)
set group->sb to a special value so no fs will match it and then
attempt to add any mark with zero/non-root fsid will fail

This is something that is quite easy for me to implement and less
easy to document the expected behavior.
I donno, maybe:
EXDEV    watching several filesystems and either new mark or existing marks
                 are on filesystems with non unique fsid

The easy way out of it for me was: no support for FAN_REPORT_FID
on btrfs subvolumes at the moment - it could be added with restrictions
in the future.

Do you have a different view of the problem than mine?

> > diff --git a/fs/statfs.c b/fs/statfs.c
> > index f0216629621d..6a5d840a2d8d 100644
> > --- a/fs/statfs.c
> > +++ b/fs/statfs.c
> > @@ -50,7 +50,8 @@ static int calculate_f_flags(struct vfsmount *mnt)
> >               flags_by_sb(mnt->mnt_sb->s_flags);
> >  }
> >
> > -static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
> > +/* Does not set buf->f_flags */
> > +int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
> >  {
> >       int retval;
> >
> > @@ -66,6 +67,7 @@ static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
> >               buf->f_frsize = buf->f_bsize;
> >       return retval;
> >  }
> > +EXPORT_SYMBOL(statfs_by_dentry);
> >
> >  int vfs_statfs(const struct path *path, struct kstatfs *buf)
> >  {
>
> Make this export a separate patch, CC Al Viro on it. Honestly I'm not very
> happy about statfs_by_dentry() interface as it may return different result
> than vfs_statfs() so it just waits for someone careless to use
> statfs_by_dentry() and get burnt. How about implementing vfs_get_fsid()
> that will get dentry and return fsid, that will be just internally
> implemented using statfs_by_dentry()? We can then use that everywhere in
> fsnotify and the interface is going to be much cleaner like that. The
> comment regarding CC to Al Viro and separate patch still applies though.
>

OK. sounds good.

Thanks,
Amir.
Jan Kara Nov. 29, 2018, 1:08 p.m. UTC | #4
On Thu 29-11-18 13:03:03, Amir Goldstein wrote:
> On Thu, Nov 29, 2018 at 11:46 AM Jan Kara <jack@suse.cz> wrote:
> > > +     err = vfs_statfs(path, &stat);
> > > +     if (err)
> > > +             return err;
> > > +
> > > +     if (!stat.f_fsid.val[0] && !stat.f_fsid.val[1])
> > > +             return -ENODEV;
> > > +
> > > +     /*
> > > +      * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
> > > +      * which uses a different fsid than sb root.
> > > +      */
> > > +     err = statfs_by_dentry(path->dentry->d_sb->s_root, &root_stat);
> > > +     if (err)
> > > +             return err;
> > > +
> > > +     if (root_stat.f_fsid.val[0] != stat.f_fsid.val[0] ||
> > > +         root_stat.f_fsid.val[1] != stat.f_fsid.val[1])
> > > +             return -EXDEV;
> >
> > I think inode watches in subvolumes are actually fine? The same fs object
> > is going to get different struct inode for different subvolumes if I
> > remember right. So there won't be any surprises with unexpected fsid being
> > reported.
> >
> > Also mount watches are actually fine for subvolume as different subvolumes
> > appear as different mountpoints, don't they? And I think implementation
> > that would have different fsid for inodes in the same mountpoint would be
> > indeed very weird. So again no problem with fsid mismatch.
> >
> > So we need this check only for superblock watches.
> >
> 
> Not so simple (or is it?).
> If a group has inode, mount and filesystem marks, not all added at the
> same time.
> When event on an object that is associated with all the above marks,
> which cached fsid should be used in the report?
> Naturally, it makes sense to prefer to more accurate fsid of mount/inode
> over the broader fsid of filesystem. Right?
> But what happens when mount/inode marks are removed?
> Or if filesystem has events in the mask that inode/mount do not?
> Then the same object reports events with different fsid depending
> on the type of event and time it took place (which marks existed).
> 
> Not a good situation to get ourselves into.
> 
> The simple way out of this is: we do not support FAN_REPORT_FID
> on marks using path that is not relative to main volume. period.
> 
> Considering the fact that FAN_REPORT_FID is mainly indented to
> enable reporting directory modification events and mount marks
> are not supported with reporting directory modification events, we
> only loose the ability to watch modification on selective directories
> inside btrfs subvolume.
> 
> I also don't like the fact that I disabled filesystem watch over tmpfs,
> because for the case of watching a single filesystem or single
> directory, which is quite a common case, we don't need fsid
> to be non-zero and we don't care if it mismatches with s_root fsid.
> 
> A solution I was contemplating was to allow zero fsid and non
> root fsid as long as it is the only sb watched by the group, so
> for non unique fsid:
> - store group->sb and group->fsid
> - return -EXDEV for an attempt to add mark from a different sb
> (no matter if it is inode/mount/sb mark)
> - when trying to add mark with zero or non root fsid (common case)
> set group->sb to a special value so no fs will match it and then
> attempt to add any mark with zero/non-root fsid will fail
> 
> This is something that is quite easy for me to implement and less
> easy to document the expected behavior.
> I donno, maybe:
> EXDEV    watching several filesystems and either new mark or existing marks
>                  are on filesystems with non unique fsid
> 
> The easy way out of it for me was: no support for FAN_REPORT_FID
> on btrfs subvolumes at the moment - it could be added with restrictions
> in the future.
> 
> Do you have a different view of the problem than mine?

Yeah, OK, you're right the semantics isn't really obvious. So I'm fine with
going for EXDEV now and we can open that can of worms later.

								Honza
diff mbox series

Patch

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index ea8e81a3e80b..d7aa2f392a64 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -17,6 +17,8 @@ 
 #include <linux/compat.h>
 #include <linux/sched/signal.h>
 #include <linux/memcontrol.h>
+#include <linux/statfs.h>
+#include <linux/exportfs.h>
 
 #include <asm/ioctls.h>
 
@@ -857,6 +859,49 @@  SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	return fd;
 }
 
+/* Check if filesystem can encode a unique fid */
+static int fanotify_test_fid(struct path *path)
+{
+	struct kstatfs stat, root_stat;
+	int err;
+
+	/*
+	 * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
+	 * TODO: cache fsid in the mark connector.
+	 */
+	err = vfs_statfs(path, &stat);
+	if (err)
+		return err;
+
+	if (!stat.f_fsid.val[0] && !stat.f_fsid.val[1])
+		return -ENODEV;
+
+	/*
+	 * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
+	 * which uses a different fsid than sb root.
+	 */
+	err = statfs_by_dentry(path->dentry->d_sb->s_root, &root_stat);
+	if (err)
+		return err;
+
+	if (root_stat.f_fsid.val[0] != stat.f_fsid.val[0] ||
+	    root_stat.f_fsid.val[1] != stat.f_fsid.val[1])
+		return -EXDEV;
+
+	/*
+	 * We need to make sure that the file system supports at least
+	 * encoding a file handle so user can use name_to_handle_at() to
+	 * compare fid returned with event to the file handle of watched
+	 * objects. However, name_to_handle_at() requires that the
+	 * filesystem also supports decoding file handles.
+	 */
+	if (!path->dentry->d_sb->s_export_op ||
+	    !path->dentry->d_sb->s_export_op->fh_to_dentry)
+		return -EOPNOTSUPP;
+
+	return 0;
+}
+
 static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 			    int dfd, const char  __user *pathname)
 {
@@ -942,6 +987,12 @@  static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	if (ret)
 		goto fput_and_out;
 
+	if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) {
+		ret = fanotify_test_fid(&path);
+		if (ret)
+			goto path_put_and_out;
+	}
+
 	/* inode held in place by reference to path; group by fget on fd */
 	if (mark_type == FAN_MARK_INODE)
 		inode = path.dentry->d_inode;
@@ -970,6 +1021,7 @@  static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 		ret = -EINVAL;
 	}
 
+path_put_and_out:
 	path_put(&path);
 fput_and_out:
 	fdput(f);
@@ -1006,7 +1058,7 @@  COMPAT_SYSCALL_DEFINE6(fanotify_mark,
  */
 static int __init fanotify_user_setup(void)
 {
-	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 7);
+	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 8);
 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
 
 	fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
diff --git a/fs/statfs.c b/fs/statfs.c
index f0216629621d..6a5d840a2d8d 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -50,7 +50,8 @@  static int calculate_f_flags(struct vfsmount *mnt)
 		flags_by_sb(mnt->mnt_sb->s_flags);
 }
 
-static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
+/* Does not set buf->f_flags */
+int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
 {
 	int retval;
 
@@ -66,6 +67,7 @@  static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
 		buf->f_frsize = buf->f_bsize;
 	return retval;
 }
+EXPORT_SYMBOL(statfs_by_dentry);
 
 int vfs_statfs(const struct path *path, struct kstatfs *buf)
 {
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index 9e2142795335..f59be967f72b 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -19,7 +19,7 @@ 
 				 FAN_CLASS_PRE_CONTENT)
 
 #define FANOTIFY_INIT_FLAGS	(FANOTIFY_CLASS_BITS | \
-				 FAN_REPORT_TID | \
+				 FAN_REPORT_TID | FAN_REPORT_FID | \
 				 FAN_CLOEXEC | FAN_NONBLOCK | \
 				 FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS)
 
diff --git a/include/linux/statfs.h b/include/linux/statfs.h
index 3142e98546ac..2c3ca7cb8c98 100644
--- a/include/linux/statfs.h
+++ b/include/linux/statfs.h
@@ -41,4 +41,7 @@  struct kstatfs {
 #define ST_NODIRATIME	0x0800	/* do not update directory access times */
 #define ST_RELATIME	0x1000	/* update atime relative to mtime/ctime */
 
+struct dentry;
+extern int statfs_by_dentry(struct dentry *dentry, struct kstatfs *stat);
+
 #endif