diff mbox

[RFC,2/2] fanotify: emit FAN_MODIFY_DIR on filesystem changes

Message ID a7ad913cf04481b0295832e3e201159a7e66ea03.1489445257.git.p@regnarg.cz (mailing list archive)
State New, archived
Headers show

Commit Message

Filip Štědronský March 13, 2017, 11:03 p.m. UTC
Besause fanotify requires `struct path`, the event cannot be generated
directly in `fsnotify_move` and friends because they only get the inode
(and their callers, `vfs_rename`&co. cannot supply any better info).
So instead it needs to be generated higher in the call chain, i.e. in
the callers of functions like `vfs_rename`.

This leads to some code duplication. Currently, there are several places
whence functions like `vfs_rename` or `vfs_unlink` are called:

  * syscall handlers (done)
  * NFS server (done)
  * stacked filesystems
      - ecryptfs (done)
      - overlayfs
        (Currently doesn't report even ordinary fanotify events, because
         it internally clones the upper mount; not sure about the
         rationale.  One can always watch the overlay mount instead.)
  * few rather minor things
      - devtmpfs
        (its internal changes are not tied to any vfsmount so it cannot
         emit mount-scoped events)
      - cachefiles (done)
      - ipc/mqueue.c (done)
      - fs/nfsd/nfs4recover.c (done)
      - kernel/bpf/inode.c (done)
        net/unix/af_unix.c (done)

(grep -rE '\bvfs_(rename|unlink|mknod|whiteout|create|mkdir|rmdir|symlink|link)\(')

Signed-off-by: Filip Štědronský <r.lkml@regnarg.cz>

---

An alternative might be to create wrapper functions like
vfs_path_(rename|unlink|...). They could also take care of calling
security_path_(rename|unlink|...), which is currently also up to
the indvidual callers (possibly with a flag because it might not
be always desired).
---
 fs/cachefiles/namei.c |  9 +++++++
 fs/ecryptfs/inode.c   | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/namei.c            | 23 +++++++++++++++++-
 fs/nfsd/nfs4recover.c |  7 ++++++
 fs/nfsd/vfs.c         | 24 ++++++++++++++++--
 ipc/mqueue.c          |  9 +++++++
 kernel/bpf/inode.c    |  3 +++
 net/unix/af_unix.c    |  2 ++
 8 files changed, 141 insertions(+), 3 deletions(-)

Comments

Amir Goldstein March 14, 2017, 11:18 a.m. UTC | #1
On Tue, Mar 14, 2017 at 1:03 AM, Filip Štědronský <r.lklm@regnarg.cz> wrote:
> Besause fanotify requires `struct path`, the event cannot be generated
> directly in `fsnotify_move` and friends because they only get the inode
> (and their callers, `vfs_rename`&co. cannot supply any better info).
> So instead it needs to be generated higher in the call chain, i.e. in
> the callers of functions like `vfs_rename`.
>
> This leads to some code duplication. Currently, there are several places
> whence functions like `vfs_rename` or `vfs_unlink` are called:
>
>   * syscall handlers (done)
>   * NFS server (done)
>   * stacked filesystems
>       - ecryptfs (done)
>       - overlayfs
>         (Currently doesn't report even ordinary fanotify events, because
>          it internally clones the upper mount; not sure about the
>          rationale.  One can always watch the overlay mount instead.)
>   * few rather minor things
>       - devtmpfs
>         (its internal changes are not tied to any vfsmount so it cannot
>          emit mount-scoped events)
>       - cachefiles (done)
>       - ipc/mqueue.c (done)
>       - fs/nfsd/nfs4recover.c (done)
>       - kernel/bpf/inode.c (done)
>         net/unix/af_unix.c (done)
>
> (grep -rE '\bvfs_(rename|unlink|mknod|whiteout|create|mkdir|rmdir|symlink|link)\(')
>
> Signed-off-by: Filip Štědronský <r.lkml@regnarg.cz>
>
> ---
>
> An alternative might be to create wrapper functions like
> vfs_path_(rename|unlink|...). They could also take care of calling
> security_path_(rename|unlink|...), which is currently also up to
> the indvidual callers (possibly with a flag because it might not
> be always desired).

That's an interesting idea. There is some duplicity between security/audit
hook and fsnotify hooks. It should be interesting to try and deduplicate
some of this code.

> ---
>  fs/cachefiles/namei.c |  9 +++++++
>  fs/ecryptfs/inode.c   | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/namei.c            | 23 +++++++++++++++++-
>  fs/nfsd/nfs4recover.c |  7 ++++++
>  fs/nfsd/vfs.c         | 24 ++++++++++++++++--
>  ipc/mqueue.c          |  9 +++++++
>  kernel/bpf/inode.c    |  3 +++
>  net/unix/af_unix.c    |  2 ++
>  8 files changed, 141 insertions(+), 3 deletions(-)
>

OK, just for comparison, I am going to put here the diff of the sub set of
my patches that are needed to support fanotify filename events.


$ git diff --stat fsnotify_sb..fanotify_dentry
 fs/notify/fanotify/fanotify.c      | 94
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
 fs/notify/fanotify/fanotify.h      | 25 ++++++++++++++++++++++++-
 fs/notify/fanotify/fanotify_user.c | 92
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
 fs/notify/fdinfo.c                 | 25 +++----------------------
 fs/notify/inode_mark.c             |  1 +
 fs/notify/mark.c                   | 15 ++++++++++++---
 include/linux/fsnotify_backend.h   | 21 ++++++++++++++++-----
 include/uapi/linux/fanotify.h      | 41
+++++++++++++++++++++++++++++++++++------
 8 files changed, 255 insertions(+), 59 deletions(-)

Yes, it is a bit more code, much mostly because it adds more functionality
(optionally reporting the filename).
But most of the code is contained within the fsnotify/fanotify subsystem.

The altenative to sprinkle fsnotify_modify_dir() hooks is much less
maintainable IMO.

Of course I am presenting biased information :-)
so for full disclosure, these patches also depend on a previous
cleanup series, with the following diffstat.
But as I claimed before and am going to claim again,
the cleanup series improves the code IMO regardless of
the additional functionality that it enables:

 $ git diff --stat base..fsnotify_dentry
 arch/powerpc/platforms/cell/spufs/inode.c |  2 +-
 fs/btrfs/ioctl.c                          |  2 +-
 fs/debugfs/inode.c                        |  8 ++++----
 fs/devpts/inode.c                         |  2 +-
 fs/namei.c                                | 23 +++++++++++++----------
 fs/notify/fsnotify.c                      |  2 +-
 fs/ocfs2/refcounttree.c                   |  2 +-
 fs/overlayfs/inode.c                      | 15 ++++++++-------
 fs/tracefs/inode.c                        |  4 ++--
 include/linux/fsnotify.h                  | 78
+++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------
 include/linux/fsnotify_backend.h          |  3 ++-
 net/sunrpc/rpc_pipe.c                     |  6 +++---
 12 files changed, 90 insertions(+), 57 deletions(-)

But even when taking the cleanup series into account,
the changes outside of the fsnotify subsystem and include files
are still a lot smaller then in your counter proposal.

This does not come without a price though.
I managed to stay a way from cross subsystems changes,
by allowing to loose some information about the event.

When a filename event is generated (rename|delete|create)
the path of the parent fd that will be reported to user is NOT
the actual path that the process executing the operation used,
but the path from which the watching process has added the mark.

So for example, if you have a bind mount:
    mount -o bind /a/b/c/d/e/f/g /tmp/g

And you add a watch on mount /a
you *will* get events for create,delete,rename in directory /tmp/g
but that path in the associated fd with point to /a/b/c/d/e/f/g

Some would claim that it is wrong to report the events
if they were not originated from the mount were the
watch was added.

I claim that fanotify filters event by mount not because it
was a requirement, but because it was an implementation challenge
to do otherwise.
And I claim that what mount watchers are really interested in is
"all the changes that happen in the file system in the area
 that is visible to me through this mount point".

In other words, an indexer needs to know if files were modified\
create/deleted if that indexer sits in container host namespace
regardless if those files were modified from within a container
namespace.

It's not a matter of security/isolation. It's a matter of functionality.
I agree that for some event (e.g. permission events) it is possible
to argue both ways (i.e. that the namespace context should be used
as a filter for events).
But for the new proposed events (FS_MODIFY_DIR), I really don't
see the point in isolation by mount/namespace.

> diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
> index 41df8a27d7eb..8c86699424d1 100644
> --- a/fs/cachefiles/namei.c
> +++ b/fs/cachefiles/namei.c
> @@ -313,6 +313,8 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
>                         cachefiles_io_error(cache, "Unlink security error");
>                 } else {
>                         ret = vfs_unlink(d_inode(dir), rep, NULL);
> +                       if (ret == 0)
> +                               fsnotify_modify_dir(&path);
>
>                         if (preemptive)
>                                 cachefiles_mark_object_buried(cache, rep, why);
> @@ -418,6 +420,10 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
>                 if (ret != 0 && ret != -ENOMEM)
>                         cachefiles_io_error(cache,
>                                             "Rename failed with error %d", ret);
> +               if (ret == 0) {
> +                       fsnotify_modify_dir(&path);
> +                       fsnotify_modify_dir(&path_to_graveyard);
> +               }
>
>                 if (preemptive)
>                         cachefiles_mark_object_buried(cache, rep, why);
> @@ -560,6 +566,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
>                         cachefiles_hist(cachefiles_mkdir_histogram, start);
>                         if (ret < 0)
>                                 goto create_error;
> +                       fsnotify_modify_dir(&path);
>
>                         ASSERT(d_backing_inode(next));
>
> @@ -589,6 +596,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent,
>                         cachefiles_hist(cachefiles_create_histogram, start);
>                         if (ret < 0)
>                                 goto create_error;
> +                       fsnotify_modify_dir(&path);
>
>                         ASSERT(d_backing_inode(next));
>
> @@ -779,6 +787,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
>                 ret = vfs_mkdir(d_inode(dir), subdir, 0700);
>                 if (ret < 0)
>                         goto mkdir_error;
> +               fsnotify_modify_dir(&path);
>
>                 ASSERT(d_backing_inode(subdir));
>
> diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
> index e7413f82d27b..88a41b270bcc 100644
> --- a/fs/ecryptfs/inode.c
> +++ b/fs/ecryptfs/inode.c
> @@ -29,6 +29,8 @@
>  #include <linux/dcache.h>
>  #include <linux/namei.h>
>  #include <linux/mount.h>
> +#include <linux/path.h>
> +#include <linux/fsnotify.h>
>  #include <linux/fs_stack.h>
>  #include <linux/slab.h>
>  #include <linux/xattr.h>
> @@ -144,16 +146,22 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
>  {
>         struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
>         struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
> +       struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
> +       struct path lower_dir_path = {lower_mnt, NULL};
>         struct dentry *lower_dir_dentry;
>         int rc;
>
>         dget(lower_dentry);
>         lower_dir_dentry = lock_parent(lower_dentry);
> +       lower_dir_path.dentry = lower_dir_dentry;
>         rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
>         if (rc) {
>                 printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
>                 goto out_unlock;
>         }
> +
> +       fsnotify_modify_dir(&lower_dir_path);
> +
>         fsstack_copy_attr_times(dir, lower_dir_inode);
>         set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink);
>         inode->i_ctime = dir->i_ctime;
> @@ -184,9 +192,13 @@ ecryptfs_do_create(struct inode *directory_inode,
>         struct dentry *lower_dentry;
>         struct dentry *lower_dir_dentry;
>         struct inode *inode;
> +       struct path lower_dir_path;
>
>         lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
>         lower_dir_dentry = lock_parent(lower_dentry);
> +       lower_dir_path.dentry = lower_dir_dentry;
> +       lower_dir_path.mnt = ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry);
> +
>         rc = vfs_create(d_inode(lower_dir_dentry), lower_dentry, mode, true);
>         if (rc) {
>                 printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
> @@ -194,10 +206,14 @@ ecryptfs_do_create(struct inode *directory_inode,
>                 inode = ERR_PTR(rc);
>                 goto out_lock;
>         }
> +
> +       fsnotify_modify_dir(&lower_dir_path);
> +
>         inode = __ecryptfs_get_inode(d_inode(lower_dentry),
>                                      directory_inode->i_sb);
>         if (IS_ERR(inode)) {
>                 vfs_unlink(d_inode(lower_dir_dentry), lower_dentry, NULL);
> +               fsnotify_modify_dir(&lower_dir_path);
>                 goto out_lock;
>         }
>         fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry));
> @@ -432,6 +448,7 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
>         struct dentry *lower_old_dentry;
>         struct dentry *lower_new_dentry;
>         struct dentry *lower_dir_dentry;
> +       struct path lower_dir_path;
>         u64 file_size_save;
>         int rc;
>
> @@ -441,10 +458,16 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
>         dget(lower_old_dentry);
>         dget(lower_new_dentry);
>         lower_dir_dentry = lock_parent(lower_new_dentry);
> +       lower_dir_path.dentry = lower_dir_dentry;
> +       lower_dir_path.mnt = ecryptfs_dentry_to_lower_mnt(new_dentry);
> +
>         rc = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry),
>                       lower_new_dentry, NULL);
>         if (rc || d_really_is_negative(lower_new_dentry))
>                 goto out_lock;
> +
> +       fsnotify_modify_dir(&lower_dir_path);
> +
>         rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
>         if (rc)
>                 goto out_lock;
> @@ -471,6 +494,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
>         int rc;
>         struct dentry *lower_dentry;
>         struct dentry *lower_dir_dentry;
> +       struct path lower_dir_path;
>         char *encoded_symname;
>         size_t encoded_symlen;
>         struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
> @@ -478,6 +502,9 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
>         lower_dentry = ecryptfs_dentry_to_lower(dentry);
>         dget(lower_dentry);
>         lower_dir_dentry = lock_parent(lower_dentry);
> +       lower_dir_path.dentry = lower_dir_dentry;
> +       lower_dir_path.mnt = ecryptfs_dentry_to_lower_mnt(dentry);
> +
>         mount_crypt_stat = &ecryptfs_superblock_to_private(
>                 dir->i_sb)->mount_crypt_stat;
>         rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
> @@ -491,6 +518,9 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
>         kfree(encoded_symname);
>         if (rc || d_really_is_negative(lower_dentry))
>                 goto out_lock;
> +
> +       fsnotify_modify_dir(&lower_dir_path);
> +
>         rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
>         if (rc)
>                 goto out_lock;
> @@ -509,12 +539,18 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
>         int rc;
>         struct dentry *lower_dentry;
>         struct dentry *lower_dir_dentry;
> +       struct path lower_dir_path;
>
>         lower_dentry = ecryptfs_dentry_to_lower(dentry);
>         lower_dir_dentry = lock_parent(lower_dentry);
> +       lower_dir_path.dentry = lower_dir_dentry;
> +       lower_dir_path.mnt = ecryptfs_dentry_to_lower_mnt(dentry);
>         rc = vfs_mkdir(d_inode(lower_dir_dentry), lower_dentry, mode);
>         if (rc || d_really_is_negative(lower_dentry))
>                 goto out;
> +
> +       fsnotify_modify_dir(&lower_dir_path);
> +
>         rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
>         if (rc)
>                 goto out;
> @@ -532,16 +568,24 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
>  {
>         struct dentry *lower_dentry;
>         struct dentry *lower_dir_dentry;
> +       struct path lower_dir_path;
>         int rc;
>
>         lower_dentry = ecryptfs_dentry_to_lower(dentry);
>         dget(dentry);
>         lower_dir_dentry = lock_parent(lower_dentry);
> +       lower_dir_path.dentry = lower_dir_dentry;
> +       lower_dir_path.mnt = ecryptfs_dentry_to_lower_mnt(dentry);
>         dget(lower_dentry);
> +
>         rc = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry);
>         dput(lower_dentry);
>         if (!rc && d_really_is_positive(dentry))
>                 clear_nlink(d_inode(dentry));
> +
> +       if (rc)
> +               fsnotify_modify_dir(&lower_dir_path);
> +
>         fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
>         set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink);
>         unlock_dir(lower_dir_dentry);
> @@ -557,12 +601,19 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev
>         int rc;
>         struct dentry *lower_dentry;
>         struct dentry *lower_dir_dentry;
> +       struct path lower_dir_path;
>
>         lower_dentry = ecryptfs_dentry_to_lower(dentry);
>         lower_dir_dentry = lock_parent(lower_dentry);
> +       lower_dir_path.dentry = lower_dir_dentry;
> +       lower_dir_path.mnt = ecryptfs_dentry_to_lower_mnt(dentry);
> +
>         rc = vfs_mknod(d_inode(lower_dir_dentry), lower_dentry, mode, dev);
>         if (rc || d_really_is_negative(lower_dentry))
>                 goto out;
> +
> +       fsnotify_modify_dir(&lower_dir_path);
> +
>         rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
>         if (rc)
>                 goto out;
> @@ -585,6 +636,9 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
>         struct dentry *lower_new_dentry;
>         struct dentry *lower_old_dir_dentry;
>         struct dentry *lower_new_dir_dentry;
> +       struct vfsmount *lower_mnt;
> +       struct path lower_old_dir_path;
> +       struct path lower_new_dir_path;
>         struct dentry *trap = NULL;
>         struct inode *target_inode;
>
> @@ -593,10 +647,15 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
>
>         lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
>         lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
> +       lower_mnt = ecryptfs_dentry_to_lower_mnt(old_dentry);
>         dget(lower_old_dentry);
>         dget(lower_new_dentry);
>         lower_old_dir_dentry = dget_parent(lower_old_dentry);
>         lower_new_dir_dentry = dget_parent(lower_new_dentry);
> +       lower_old_dir_path.dentry = lower_old_dir_dentry;
> +       lower_old_dir_path.mnt = lower_mnt;
> +       lower_new_dir_path.dentry = lower_new_dir_dentry;
> +       lower_new_dir_path.mnt = lower_mnt;
>         target_inode = d_inode(new_dentry);
>         trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
>         /* source should not be ancestor of target */
> @@ -614,6 +673,14 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
>                         NULL, 0);
>         if (rc)
>                 goto out_lock;
> +
> +       /* ecryptfs does not support crossing mount boundaries, we can take
> +        * vfsmount from an arbitrary dentry.
> +        */
> +       fsnotify_modify_dir(&lower_old_dir_path);
> +       if (!path_equal(&lower_old_dir_path, &lower_new_dir_path))
> +               fsnotify_modify_dir(&lower_new_dir_path);
> +
>         if (target_inode)
>                 fsstack_copy_attr_all(target_inode,
>                                       ecryptfs_inode_to_lower(target_inode));
> diff --git a/fs/namei.c b/fs/namei.c
> index ad74877e1442..17667f0c89e5 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -3009,8 +3009,12 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
>                                 dput(dentry);
>                                 dentry = file->f_path.dentry;
>                         }
> -                       if (*opened & FILE_CREATED)
> +                       if (*opened & FILE_CREATED) {
> +                               struct path parent_path = {file->f_path.mnt,
> +                                                       dentry->d_parent};
>                                 fsnotify_create(dir, dentry);
> +                               fsnotify_modify_dir(&parent_path);
> +                       }
>                         if (unlikely(d_is_negative(dentry))) {
>                                 error = -ENOENT;
>                         } else {
> @@ -3157,6 +3161,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
>                 if (error)
>                         goto out_dput;
>                 fsnotify_create(dir_inode, dentry);
> +               fsnotify_modify_dir(&nd->path);
>         }
>         if (unlikely(create_error) && !dentry->d_inode) {
>                 error = create_error;
> @@ -3702,6 +3707,7 @@ SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
>                         error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
>                         break;
>         }
> +       fsnotify_modify_dir(&path);
>  out:
>         done_path_create(&path, dentry);
>         if (retry_estale(error, lookup_flags)) {
> @@ -3759,6 +3765,8 @@ SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
>         error = security_path_mkdir(&path, dentry, mode);
>         if (!error)
>                 error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
> +       if (!error)
> +               fsnotify_modify_dir(&path);
>         done_path_create(&path, dentry);
>         if (retry_estale(error, lookup_flags)) {
>                 lookup_flags |= LOOKUP_REVAL;
> @@ -3855,6 +3863,8 @@ static long do_rmdir(int dfd, const char __user *pathname)
>         if (error)
>                 goto exit3;
>         error = vfs_rmdir(path.dentry->d_inode, dentry);
> +       if (!error)
> +               fsnotify_modify_dir(&path);
>  exit3:
>         dput(dentry);
>  exit2:
> @@ -3979,6 +3989,8 @@ static long do_unlinkat(int dfd, const char __user *pathname)
>                 if (error)
>                         goto exit2;
>                 error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
> +               if (!error)
> +                       fsnotify_modify_dir(&path);
>  exit2:
>                 dput(dentry);
>         }
> @@ -4070,6 +4082,8 @@ SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
>         error = security_path_symlink(&path, dentry, from->name);
>         if (!error)
>                 error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
> +       if (!error)
> +               fsnotify_modify_dir(&path);
>         done_path_create(&path, dentry);
>         if (retry_estale(error, lookup_flags)) {
>                 lookup_flags |= LOOKUP_REVAL;
> @@ -4219,6 +4233,8 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
>         if (error)
>                 goto out_dput;
>         error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
> +       if (!error)
> +               fsnotify_modify_dir(&new_path);
>  out_dput:
>         done_path_create(&new_path, new_dentry);
>         if (delegated_inode) {
> @@ -4532,6 +4548,11 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
>         error = vfs_rename(old_path.dentry->d_inode, old_dentry,
>                            new_path.dentry->d_inode, new_dentry,
>                            &delegated_inode, flags);
> +       if (error == 0) {
> +               fsnotify_modify_dir(&old_path);
> +               if (!path_equal(&old_path, &new_path))
> +                       fsnotify_modify_dir(&new_path);
> +       }
>  exit5:
>         dput(new_dentry);
>  exit4:
> diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
> index 66eaeb1e8c2c..58f70bbaac38 100644
> --- a/fs/nfsd/nfs4recover.c
> +++ b/fs/nfsd/nfs4recover.c
> @@ -36,6 +36,7 @@
>  #include <linux/file.h>
>  #include <linux/slab.h>
>  #include <linux/namei.h>
> +#include <linux/fsnotify.h>
>  #include <linux/sched.h>
>  #include <linux/fs.h>
>  #include <linux/module.h>
> @@ -216,6 +217,8 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
>                  */
>                 goto out_put;
>         status = vfs_mkdir(d_inode(dir), dentry, S_IRWXU);
> +       if (status == 0)
> +               fsnotify_modify_dir(&nn->rec_file->f_path);
>  out_put:
>         dput(dentry);
>  out_unlock:
> @@ -338,6 +341,8 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
>         if (d_really_is_negative(dentry))
>                 goto out;
>         status = vfs_rmdir(d_inode(dir), dentry);
> +       if (status == 0)
> +               fsnotify_modify_dir(&nn->rec_file->f_path);
>  out:
>         dput(dentry);
>  out_unlock:
> @@ -401,6 +406,8 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
>         if (status)
>                 printk("failed to remove client recovery directory %pd\n",
>                                 child);
> +       else
> +               fsnotify_modify_dir(&nn->rec_file->f_path);
>         /* Keep trying, success or failure: */
>         return 0;
>  }
> diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
> index 26c6fdb4bf67..7632ab3fd99e 100644
> --- a/fs/nfsd/vfs.c
> +++ b/fs/nfsd/vfs.c
> @@ -364,6 +364,18 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
>  }
>
>  /*
> + * Helper to emit fsnotify modify_dir event. Call with fph locked.
> + */
> +static void nfsd_fsnotify_modify_dir(struct svc_fh *fhp)
> +{
> +       struct path path;
> +
> +       path.mnt = fhp->fh_export->ex_path.mnt;
> +       path.dentry = fhp->fh_dentry;
> +       fsnotify_modify_dir(&path);
> +}
> +
> +/*
>   * Set various file attributes.  After this call fhp needs an fh_put.
>   */
>  __be32
> @@ -1207,6 +1219,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
>                 goto out_nfserr;
>
>         err = nfsd_create_setattr(rqstp, resfhp, iap);
> +       nfsd_fsnotify_modify_dir(fhp);
>
>         /*
>          * nfsd_create_setattr already committed the child.  Transactional
> @@ -1525,8 +1538,10 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
>
>         host_err = vfs_symlink(d_inode(dentry), dnew, path);
>         err = nfserrno(host_err);
> -       if (!err)
> +       if (!err) {
> +               nfsd_fsnotify_modify_dir(fhp);
>                 err = nfserrno(commit_metadata(fhp));
> +       }
>         fh_unlock(fhp);
>
>         fh_drop_write(fhp);
> @@ -1593,6 +1608,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
>                 goto out_dput;
>         host_err = vfs_link(dold, dirp, dnew, NULL);
>         if (!host_err) {
> +               nfsd_fsnotify_modify_dir(tfhp);
>                 err = nfserrno(commit_metadata(ffhp));
>                 if (!err)
>                         err = nfserrno(commit_metadata(tfhp));
> @@ -1686,6 +1702,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
>
>         host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
>         if (!host_err) {
> +               nfsd_fsnotify_modify_dir(tfhp);
> +               nfsd_fsnotify_modify_dir(ffhp);
>                 host_err = commit_metadata(tfhp);
>                 if (!host_err)
>                         host_err = commit_metadata(ffhp);
> @@ -1757,8 +1775,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
>                 host_err = vfs_unlink(dirp, rdentry, NULL);
>         else
>                 host_err = vfs_rmdir(dirp, rdentry);
> -       if (!host_err)
> +       if (!host_err) {
> +               nfsd_fsnotify_modify_dir(fhp);
>                 host_err = commit_metadata(fhp);
> +       }
>         dput(rdentry);
>
>  out_nfserr:
> diff --git a/ipc/mqueue.c b/ipc/mqueue.c
> index 7a2d8f0c8ae5..10e413c2216f 100644
> --- a/ipc/mqueue.c
> +++ b/ipc/mqueue.c
> @@ -19,6 +19,7 @@
>  #include <linux/file.h>
>  #include <linux/mount.h>
>  #include <linux/namei.h>
> +#include <linux/fsnotify.h>
>  #include <linux/sysctl.h>
>  #include <linux/poll.h>
>  #include <linux/mqueue.h>
> @@ -818,6 +819,10 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
>                         filp = do_create(ipc_ns, d_inode(root),
>                                                 &path, oflag, mode,
>                                                 u_attr ? &attr : NULL);
> +                       if (!IS_ERR(filp)) {
> +                               struct path root_path = {mnt, mnt->mnt_root};
> +                               fsnotify_modify_dir(&root_path);
> +                       }
>                 }
>         } else {
>                 if (d_really_is_negative(path.dentry)) {
> @@ -878,6 +883,10 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
>         } else {
>                 ihold(inode);
>                 err = vfs_unlink(d_inode(dentry->d_parent), dentry, NULL);
> +               if (!err) {
> +                       struct path path = {mnt, dentry->d_parent};
> +                       fsnotify_modify_dir(&path);
> +               }
>         }
>         dput(dentry);
>
> diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
> index 0b030c9126d3..93137292b051 100644
> --- a/kernel/bpf/inode.c
> +++ b/kernel/bpf/inode.c
> @@ -16,6 +16,7 @@
>  #include <linux/major.h>
>  #include <linux/mount.h>
>  #include <linux/namei.h>
> +#include <linux/fsnotify.h>
>  #include <linux/fs.h>
>  #include <linux/kdev_t.h>
>  #include <linux/parser.h>
> @@ -255,6 +256,8 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
>
>         dentry->d_fsdata = raw;
>         ret = vfs_mknod(dir, dentry, mode, devt);
> +       if (ret == 0)
> +               fsnotify_modify_dir(&path);
>         dentry->d_fsdata = NULL;
>  out:
>         done_path_create(&path, dentry);
> diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
> index cef79873b09d..5049bd4bd1d8 100644
> --- a/net/unix/af_unix.c
> +++ b/net/unix/af_unix.c
> @@ -91,6 +91,7 @@
>  #include <linux/stat.h>
>  #include <linux/dcache.h>
>  #include <linux/namei.h>
> +#include <linux/fsnotify.h>
>  #include <linux/socket.h>
>  #include <linux/un.h>
>  #include <linux/fcntl.h>
> @@ -976,6 +977,7 @@ static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
>         if (!err) {
>                 err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
>                 if (!err) {
> +                       fsnotify_modify_dir(&path);
>                         res->mnt = mntget(path.mnt);
>                         res->dentry = dget(dentry);
>                 }
> --
> 2.11.1
>
Filip Štědronský March 14, 2017, 2:58 p.m. UTC | #2
Hi,

On Tue, Mar 14, 2017 at 01:18:01PM +0200, Amir Goldstein wrote:
> I claim that fanotify filters event by mount not because it
> was a requirement, but because it was an implementation challenge
> to do otherwise.
>
> And I claim that what mount watchers are really interested in is
> "all the changes that happen in the file system in the area
>  that is visible to me through this mount point".
>
> In other words, an indexer needs to know if files were modified\
> create/deleted if that indexer sits in container host namespace
> regardless if those files were modified from within a container
> namespace.
> 
> It's not a matter of security/isolation. It's a matter of functionality.
> I agree that for some event (e.g. permission events) it is possible
> to argue both ways (i.e. that the namespace context should be used
> as a filter for events).
> But for the new proposed events (FS_MODIFY_DIR), I really don't
> see the point in isolation by mount/namespace.

there are basically two classes of uses for a fantotify-like
interface:

(1) Keeping an up-to-date representation of the file system.
    For this, superblock watches are clearly what you want.

      * You are interested to know the current state of the
        filesystem so you need to know about every change, 
        regardless of where it came from.
      * As I mentioned earlier, in case of remote, ditributed
        and virtual filesystems, the change might come from
        within the filesystem itself (if the protocol supports
        reporting such changes). This can probably be
        implemented only with superblock-scoped watches because
        the change is fundamentally not related to any mount.
      * Some filesystems might also support change journalling
        and it might be concievable to extend the API in the
        future to report "past" events (for example by passing
        sequence number of last seen event or similar).
      * The argument about containers escaping change notification
        you mentioned earlier.

    All those factors speak greatly in favour of superblock
    watches.

(2) Tracking filesystem *activity*. Now you are not building
    an image of current filesystem state but rather a log of
    what happened. Perhaps you are also interested in who
    (user/process/...) did what. Permission events also fit
    mostly in this category.

    For those it *might* make sense to have mount-scoped
    watches, for example if you want to monitor only one
    container or a subset of processes.

We both concentrate on the first but we shouldn't forget about
the second, which was one of the original motivations for
fanotify.

Thus I conclude that it might be desirable to implement
mount-scoped filename events in the long run. Even though
I agree that the sb-scoped events are more important because
they cover more use cases and you can do additional filtering
(e.g. by pid) if deemed necessary.

This would require:

(a) Sprinkling the callers of vfs_* with fanotify calls
    as I did, or
(b) Creating wrapper functions like vfs_path_unlink & co.
    that would make the necessary fanotify call (and probably
    tell the lower function not to generate another
    notification), as I suggested earlier.
(c) Give the vfs_* functions an *optional* vfsmount argument.

In the end I probably find (c) the most elegant but this
can be discussed later, even after your changes are merged.

Filip
Amir Goldstein March 14, 2017, 3:35 p.m. UTC | #3
On Tue, Mar 14, 2017 at 4:58 PM, Filip Štědronský <r.lkml@regnarg.cz> wrote:
> Hi,
>
> On Tue, Mar 14, 2017 at 01:18:01PM +0200, Amir Goldstein wrote:
>> I claim that fanotify filters event by mount not because it
>> was a requirement, but because it was an implementation challenge
>> to do otherwise.
>>
>> And I claim that what mount watchers are really interested in is
>> "all the changes that happen in the file system in the area
>>  that is visible to me through this mount point".
>>
>> In other words, an indexer needs to know if files were modified\
>> create/deleted if that indexer sits in container host namespace
>> regardless if those files were modified from within a container
>> namespace.
>>
>> It's not a matter of security/isolation. It's a matter of functionality.
>> I agree that for some event (e.g. permission events) it is possible
>> to argue both ways (i.e. that the namespace context should be used
>> as a filter for events).
>> But for the new proposed events (FS_MODIFY_DIR), I really don't
>> see the point in isolation by mount/namespace.
>
> there are basically two classes of uses for a fantotify-like
> interface:
>
> (1) Keeping an up-to-date representation of the file system.
>     For this, superblock watches are clearly what you want.
>
>       * You are interested to know the current state of the
>         filesystem so you need to know about every change,
>         regardless of where it came from.
>       * As I mentioned earlier, in case of remote, ditributed
>         and virtual filesystems, the change might come from
>         within the filesystem itself (if the protocol supports
>         reporting such changes). This can probably be
>         implemented only with superblock-scoped watches because
>         the change is fundamentally not related to any mount.
>       * Some filesystems might also support change journalling
>         and it might be concievable to extend the API in the
>         future to report "past" events (for example by passing
>         sequence number of last seen event or similar).
>       * The argument about containers escaping change notification
>         you mentioned earlier.
>
>     All those factors speak greatly in favour of superblock
>     watches.
>
> (2) Tracking filesystem *activity*. Now you are not building
>     an image of current filesystem state but rather a log of
>     what happened. Perhaps you are also interested in who
>     (user/process/...) did what. Permission events also fit
>     mostly in this category.
>
>     For those it *might* make sense to have mount-scoped
>     watches, for example if you want to monitor only one
>     container or a subset of processes.
>
> We both concentrate on the first but we shouldn't forget about
> the second, which was one of the original motivations for
> fanotify.
>
> Thus I conclude that it might be desirable to implement
> mount-scoped filename events in the long run. Even though
> I agree that the sb-scoped events are more important because
> they cover more use cases and you can do additional filtering
> (e.g. by pid) if deemed necessary.
>
> This would require:
>
> (a) Sprinkling the callers of vfs_* with fanotify calls
>     as I did, or
> (b) Creating wrapper functions like vfs_path_unlink & co.
>     that would make the necessary fanotify call (and probably
>     tell the lower function not to generate another
>     notification), as I suggested earlier.
> (c) Give the vfs_* functions an *optional* vfsmount argument.
>
> In the end I probably find (c) the most elegant but this
> can be discussed later, even after your changes are merged.
>

Agreed. That is an independent question.
Thanks for the thorough summary.

Amir.
Marko Rauhamaa March 15, 2017, 8:19 a.m. UTC | #4
Filip Štědronský <r.lkml@regnarg.cz>:

> there are basically two classes of uses for a fantotify-like
> interface:
>
> (1) Keeping an up-to-date representation of the file system. For this,
>     superblock watches are clearly what you want.
>
>     [...]
>
>     All those factors speak greatly in favour of superblock
>     watches.
>
> (2) Tracking filesystem *activity*. Now you are not building
>     an image of current filesystem state but rather a log of what
>     happened. Perhaps you are also interested in who
>     (user/process/...) did what. Permission events also fit mostly in
>     this category.
>
>     For those it *might* make sense to have mount-scoped watches, for
>     example if you want to monitor only one container or a subset of
>     processes.
>
> We both concentrate on the first but we shouldn't forget about the
> second, which was one of the original motivations for fanotify.

My (employer's) needs are centered around (2). We definitely crave
permission events with a filesystem scope. At the moment, you can avoid
permission checks with a simple unshare command (<URL:
https://lkml.org/lkml/2016/12/21/144>).

So I must be able to see everything that is happening in my universe. It
might also be useful to monitor a subuniverse of mine, but the former
need is critical at the moment.

As for "who (user/process/...) did what", the fanotify API is flawed in
that we don't have a CLOSE_WRITE_PERM event. The hit-and-run process is
long gone by the time we receive the event. That's more of a rule than
an exception.


Marko
Jan Kara March 15, 2017, 1:39 p.m. UTC | #5
On Wed 15-03-17 10:19:52, Marko Rauhamaa wrote:
> Filip Štědronský <r.lkml@regnarg.cz>:
> 
> > there are basically two classes of uses for a fantotify-like
> > interface:
> >
> > (1) Keeping an up-to-date representation of the file system. For this,
> >     superblock watches are clearly what you want.
> >
> >     [...]
> >
> >     All those factors speak greatly in favour of superblock
> >     watches.
> >
> > (2) Tracking filesystem *activity*. Now you are not building
> >     an image of current filesystem state but rather a log of what
> >     happened. Perhaps you are also interested in who
> >     (user/process/...) did what. Permission events also fit mostly in
> >     this category.
> >
> >     For those it *might* make sense to have mount-scoped watches, for
> >     example if you want to monitor only one container or a subset of
> >     processes.
> >
> > We both concentrate on the first but we shouldn't forget about the
> > second, which was one of the original motivations for fanotify.
> 
> My (employer's) needs are centered around (2). We definitely crave
> permission events with a filesystem scope. At the moment, you can avoid
> permission checks with a simple unshare command (<URL:
> https://lkml.org/lkml/2016/12/21/144>).

Yes, that is bad.

> So I must be able to see everything that is happening in my universe. It
> might also be useful to monitor a subuniverse of mine, but the former
> need is critical at the moment.

So I understand your need. However with superblock watches I'm still
concerned that the process would be able to see too much. E.g. if it is
restricted to see only some subtree of a filesystem (by bind mounts &
namespaces), it should not be able to see events on the same filesystem
outside of that subtree. I have not found a good solution for that yet.

> As for "who (user/process/...) did what", the fanotify API is flawed in
> that we don't have a CLOSE_WRITE_PERM event. The hit-and-run process is
> long gone by the time we receive the event. That's more of a rule than
> an exception.

Adding CLOSE_WRITE_PERM would not be that difficult I assume. What do you
need it for?

								Honza
Marko Rauhamaa March 15, 2017, 2:18 p.m. UTC | #6
Jan Kara <jack@suse.cz>:

> On Wed 15-03-17 10:19:52, Marko Rauhamaa wrote:
>> As for "who (user/process/...) did what", the fanotify API is flawed
>> in that we don't have a CLOSE_WRITE_PERM event. The hit-and-run
>> process is long gone by the time we receive the event. That's more of
>> a rule than an exception.
>
> Adding CLOSE_WRITE_PERM would not be that difficult I assume. What do you
> need it for?

Mainly to hold the process hostage until I have verified the content
change. If I disqualify the content change, I will need to report on the
process. CLOSE_WRITE only gives me a pid that is often stale as it
doesn't block the process.

(Another possibility would be to keep the process around as a zombie as
long as the CLOSE_WRITE event's file descriptor is open. That sounds
more complicated and questionable, though.)


Marko
Amir Goldstein March 15, 2017, 2:44 p.m. UTC | #7
On Wed, Mar 15, 2017 at 3:39 PM, Jan Kara <jack@suse.cz> wrote:
> On Wed 15-03-17 10:19:52, Marko Rauhamaa wrote:
>> Filip Štědronský <r.lkml@regnarg.cz>:
>>
>> > there are basically two classes of uses for a fantotify-like
>> > interface:
>> >
>> > (1) Keeping an up-to-date representation of the file system. For this,
>> >     superblock watches are clearly what you want.
>> >
>> >     [...]
>> >
>> >     All those factors speak greatly in favour of superblock
>> >     watches.
>> >
>> > (2) Tracking filesystem *activity*. Now you are not building
>> >     an image of current filesystem state but rather a log of what
>> >     happened. Perhaps you are also interested in who
>> >     (user/process/...) did what. Permission events also fit mostly in
>> >     this category.
>> >
>> >     For those it *might* make sense to have mount-scoped watches, for
>> >     example if you want to monitor only one container or a subset of
>> >     processes.
>> >
>> > We both concentrate on the first but we shouldn't forget about the
>> > second, which was one of the original motivations for fanotify.
>>
>> My (employer's) needs are centered around (2). We definitely crave
>> permission events with a filesystem scope. At the moment, you can avoid
>> permission checks with a simple unshare command (<URL:
>> https://lkml.org/lkml/2016/12/21/144>).
>
> Yes, that is bad.
>
>> So I must be able to see everything that is happening in my universe. It
>> might also be useful to monitor a subuniverse of mine, but the former
>> need is critical at the moment.
>
> So I understand your need. However with superblock watches I'm still
> concerned that the process would be able to see too much. E.g. if it is
> restricted to see only some subtree of a filesystem (by bind mounts &
> namespaces), it should not be able to see events on the same filesystem
> outside of that subtree. I have not found a good solution for that yet.
>

See the last patch in my series. The cherry on the top ;-)

commit 5e3b5bd943991cdf5b72745c1e24833bc998b7ed
Author: Amir Goldstein <amir73il@gmail.com>
Date:   Sun Dec 18 11:25:55 2016 +0200

    fanotify: filter events by root mark mount point

    When adding a super block root watch from a mount point that is not mounted
    on the root of the file system, filter out events on file system objects
    that happen outside this mount point directory (on non decendant objects).

    This is not like FAN_MARK_MOUNT which filters only events that happened
    on the mount of the mark. All events on file system objects are reported
    as long as these objects are accessible from the mark mount point.

    Signed-off-by: Amir Goldstein <amir73il@gmail.com>

 fs/notify/fanotify/fanotify.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

Our use case is monitoring a large directory tree, not the entire file system.

I used a simple check in should_send_event()

+       /*
+        * Only interesetd in dentry events visible from the mount
+        * from which the root watch was added
+        */
+       if (mark_mnt && mark_mnt->mnt_root != dentry &&
+           d_ancestor(mark_mnt->mnt_root, dentry) == NULL)
+               return false;
+

This does not cover the case of events on objects that are hidden
under another mount in my mount namespace, but it covers the
simple case of bind mount.

Note that 'mark_mnt' does NOT stand for the vfs_mark mount,
because root watch is an inode_mark (on the root inode).
It stands for the mount from which the root watch was added.
Same mount that is used to construct the event->fd for all
the dentry events.

This scheme does NOT allow multiple root watches with
different mount point filter on the same group.
Every group can have just one root watch per sb with a single
mount filter.

Amir.
Jan Kara March 19, 2017, 10:19 a.m. UTC | #8
On Tue 14-03-17 13:18:01, Amir Goldstein wrote:
> On Tue, Mar 14, 2017 at 1:03 AM, Filip Štědronský <r.lklm@regnarg.cz> wrote:
> > Besause fanotify requires `struct path`, the event cannot be generated
> > directly in `fsnotify_move` and friends because they only get the inode
> > (and their callers, `vfs_rename`&co. cannot supply any better info).
> > So instead it needs to be generated higher in the call chain, i.e. in
> > the callers of functions like `vfs_rename`.
> >
> > This leads to some code duplication. Currently, there are several places
> > whence functions like `vfs_rename` or `vfs_unlink` are called:
> >
> >   * syscall handlers (done)
> >   * NFS server (done)
> >   * stacked filesystems
> >       - ecryptfs (done)
> >       - overlayfs
> >         (Currently doesn't report even ordinary fanotify events, because
> >          it internally clones the upper mount; not sure about the
> >          rationale.  One can always watch the overlay mount instead.)
> >   * few rather minor things
> >       - devtmpfs
> >         (its internal changes are not tied to any vfsmount so it cannot
> >          emit mount-scoped events)
> >       - cachefiles (done)
> >       - ipc/mqueue.c (done)
> >       - fs/nfsd/nfs4recover.c (done)
> >       - kernel/bpf/inode.c (done)
> >         net/unix/af_unix.c (done)
> >
> > (grep -rE '\bvfs_(rename|unlink|mknod|whiteout|create|mkdir|rmdir|symlink|link)\(')
> >
> > Signed-off-by: Filip Štědronský <r.lkml@regnarg.cz>
> >
> > ---
> >
> > An alternative might be to create wrapper functions like
> > vfs_path_(rename|unlink|...). They could also take care of calling
> > security_path_(rename|unlink|...), which is currently also up to
> > the indvidual callers (possibly with a flag because it might not
> > be always desired).
> 
> That's an interesting idea. There is some duplicity between security/audit
> hook and fsnotify hooks. It should be interesting to try and deduplicate
> some of this code.

Yeah, but ecryptfs or nfsd don't actually call these security hooks AFAICT.
And at least for NFSD it seems correct they don't call them since you are
running in a context of an NFSD server process and don't have security
context of the process actually issuing the IO. So I'm not sure
"deduplication" is really possible.

However if you can really call fsnotify hooks with 'path' available in all
the places, it should be equally hard to just pass 'path' to
vfs_(create|mkdir|...) and that way we don't have to sprinkle fsnotify
calls into several call sites but keep them local to vfs_(create|mkdir|...)
helpers. Hmm?

> 
> > ---
> >  fs/cachefiles/namei.c |  9 +++++++
> >  fs/ecryptfs/inode.c   | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/namei.c            | 23 +++++++++++++++++-
> >  fs/nfsd/nfs4recover.c |  7 ++++++
> >  fs/nfsd/vfs.c         | 24 ++++++++++++++++--
> >  ipc/mqueue.c          |  9 +++++++
> >  kernel/bpf/inode.c    |  3 +++
> >  net/unix/af_unix.c    |  2 ++
> >  8 files changed, 141 insertions(+), 3 deletions(-)
> >
> 
> OK, just for comparison, I am going to put here the diff of the sub set of
> my patches that are needed to support fanotify filename events.
> 
> 
> $ git diff --stat fsnotify_sb..fanotify_dentry
>  fs/notify/fanotify/fanotify.c      | 94
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
>  fs/notify/fanotify/fanotify.h      | 25 ++++++++++++++++++++++++-
>  fs/notify/fanotify/fanotify_user.c | 92
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
>  fs/notify/fdinfo.c                 | 25 +++----------------------
>  fs/notify/inode_mark.c             |  1 +
>  fs/notify/mark.c                   | 15 ++++++++++++---
>  include/linux/fsnotify_backend.h   | 21 ++++++++++++++++-----
>  include/uapi/linux/fanotify.h      | 41
> +++++++++++++++++++++++++++++++++++------
>  8 files changed, 255 insertions(+), 59 deletions(-)
> 
> Yes, it is a bit more code, much mostly because it adds more functionality
> (optionally reporting the filename).
> But most of the code is contained within the fsnotify/fanotify subsystem.

So I'm not that concerned about actual line numbers. They play some role
but they don't tell much about how maintainable the result is in the end.

> The altenative to sprinkle fsnotify_modify_dir() hooks is much less
> maintainable IMO.

Agreed on this.

> I managed to stay a way from cross subsystems changes,
> by allowing to loose some information about the event.
> 
> When a filename event is generated (rename|delete|create)
> the path of the parent fd that will be reported to user is NOT
> the actual path that the process executing the operation used,
> but the path from which the watching process has added the mark.

Yeah, and frankly I'm not yet convinced this is a sane thing to do. I'd
much rather propagate path to vfs_(create|mkdir|...) helpers.

> So for example, if you have a bind mount:
>     mount -o bind /a/b/c/d/e/f/g /tmp/g
> 
> And you add a watch on mount /a
> you *will* get events for create,delete,rename in directory /tmp/g
> but that path in the associated fd with point to /a/b/c/d/e/f/g
> 
> Some would claim that it is wrong to report the events
> if they were not originated from the mount were the
> watch was added.
> 
> I claim that fanotify filters event by mount not because it
> was a requirement, but because it was an implementation challenge
> to do otherwise.
> And I claim that what mount watchers are really interested in is
> "all the changes that happen in the file system in the area
>  that is visible to me through this mount point".
> 
> In other words, an indexer needs to know if files were modified\
> create/deleted if that indexer sits in container host namespace
> regardless if those files were modified from within a container
> namespace.
> 
> It's not a matter of security/isolation. It's a matter of functionality.
> I agree that for some event (e.g. permission events) it is possible
> to argue both ways (i.e. that the namespace context should be used
> as a filter for events).
> But for the new proposed events (FS_MODIFY_DIR), I really don't
> see the point in isolation by mount/namespace.

I would be *very* careful with such assumptions. People are very inventive
in ways how they can abuse stuff. What I'm most nervous about is that the
(mnt, dentry) pair you create may not be reachable path. When you combine
that with xyz_at(2) syscalls allowing you to traverse directory hierarchy
from fd which you got from fanotify event, things can get really
interesting. So unless there are very clear rules proving this cannot be
misused, I'm against hand-crafting of path structures inside fsnotify.

								Honza
Filip Štědronský March 19, 2017, 10:37 a.m. UTC | #9
On Sun, Mar 19, 2017 at 11:19:43AM +0100, Jan Kara wrote:
> However if you can really call fsnotify hooks with 'path' available in all
> the places, it should be equally hard to just pass 'path' to
> vfs_(create|mkdir|...) and that way we don't have to sprinkle fsnotify
> calls into several call sites but keep them local to vfs_(create|mkdir|...)
> helpers. Hmm?

the problem is: not absolutely all. One illuminating example is the use
of vfs_mknod in devtmpfs. There a struct path is not only unavailable
but makes not semantic sense: the changes do not go thru any mountpoint.
And in general I think there will be situations where you would need
to call VFS functions without paths.

Thus I suggested either
(a) wrapping the VFS functions with path variants, or
(b) giving them an optional vfsmount argument that can be set to NULL
    when it does not make sense

Filip
Jan Kara March 19, 2017, 6:04 p.m. UTC | #10
On Sun 19-03-17 11:37:39, Filip Štědronský wrote:
> On Sun, Mar 19, 2017 at 11:19:43AM +0100, Jan Kara wrote:
> > However if you can really call fsnotify hooks with 'path' available in all
> > the places, it should be equally hard to just pass 'path' to
> > vfs_(create|mkdir|...) and that way we don't have to sprinkle fsnotify
> > calls into several call sites but keep them local to vfs_(create|mkdir|...)
> > helpers. Hmm?
> 
> the problem is: not absolutely all. One illuminating example is the use
> of vfs_mknod in devtmpfs. There a struct path is not only unavailable
> but makes not semantic sense: the changes do not go thru any mountpoint.

How come? In current kernel the call looks like:

vfs_mknod(d_inode(path.dentry), dentry, mode, dev->devt);

so the path is available there... I've actually quickly checked all
vfs_mknod() callers and they all seem to have path available.

> And in general I think there will be situations where you would need
> to call VFS functions without paths.
> 
> Thus I suggested either
> (a) wrapping the VFS functions with path variants, or
> (b) giving them an optional vfsmount argument that can be set to NULL
>     when it does not make sense

So my first take is that fsnotify calls happen still relatively high in the
call stack where we should mostly have mount point available from the path
lookup. That being said there may be places where we've lost that
information and it will not be easy to propagate it there and that would
have to be dealt with on case-by-case basis. But mountpoint is needed for
other stuff like security checks these days as well so we should have it
available in principle.

								Honza
Amir Goldstein March 20, 2017, 11:40 a.m. UTC | #11
On Sun, Mar 19, 2017 at 2:04 PM, Jan Kara <jack@suse.cz> wrote:
> On Sun 19-03-17 11:37:39, Filip Štědronský wrote:
>> On Sun, Mar 19, 2017 at 11:19:43AM +0100, Jan Kara wrote:
>> > However if you can really call fsnotify hooks with 'path' available in all
>> > the places, it should be equally hard to just pass 'path' to
>> > vfs_(create|mkdir|...) and that way we don't have to sprinkle fsnotify
>> > calls into several call sites but keep them local to vfs_(create|mkdir|...)
>> > helpers. Hmm?
>>
>> the problem is: not absolutely all. One illuminating example is the use
>> of vfs_mknod in devtmpfs. There a struct path is not only unavailable
>> but makes not semantic sense: the changes do not go thru any mountpoint.
>
> How come? In current kernel the call looks like:
>
> vfs_mknod(d_inode(path.dentry), dentry, mode, dev->devt);
>
> so the path is available there... I've actually quickly checked all
> vfs_mknod() callers and they all seem to have path available.
>
>> And in general I think there will be situations where you would need
>> to call VFS functions without paths.
>>
>> Thus I suggested either
>> (a) wrapping the VFS functions with path variants, or
>> (b) giving them an optional vfsmount argument that can be set to NULL
>>     when it does not make sense
>
> So my first take is that fsnotify calls happen still relatively high in the
> call stack where we should mostly have mount point available from the path
> lookup. That being said there may be places where we've lost that
> information and it will not be easy to propagate it there and that would
> have to be dealt with on case-by-case basis. But mountpoint is needed for
> other stuff like security checks these days as well so we should have it
> available in principle.
>

I agree that propagating the path to fsnotify seem like the right thing to do.
fsnotify_inoderemove() is an example (the only one I know of) where path
is not available (when called down from from dput()), but frankly, I can't
think of any use cases that really needs to make use of the
FS_DELETE_SELF event in that case.

d_delete() which also calls fsnotify_inoderemove() already calls
fsnotify_nameremove() hook with the exact same dentry, so
the FS_DELETE_SELF event can be generated by that hook as well
as the FS_DELETE event.
Filip Štědronský March 20, 2017, 11:52 a.m. UTC | #12
Hi,

On Sun, Mar 19, 2017 at 07:04:13PM +0100, Jan Kara wrote:
> How come? In current kernel the call looks like:
> 
> vfs_mknod(d_inode(path.dentry), dentry, mode, dev->devt);
> 
> so the path is available there... I've actually quickly checked all
> vfs_mknod() callers and they all seem to have path available.

terribly sorry, must have misremembered something. Been staring at the
code long into the night. You are quite right.

But it is an internal mount so userspace never gets the notifications.
The same goes for the cloned upper mount in overlayfs. This might be
considered ok, because the change is semantically "internal" and does
not originate through any userspace-visible mountpoint. Superblock
watches would solve this case.

Otherwise it seems feasible to pass a path to all VFS functions.

Filip
J. Bruce Fields March 21, 2017, 3:38 p.m. UTC | #13
On Sun, Mar 19, 2017 at 11:19:43AM +0100, Jan Kara wrote:
> On Tue 14-03-17 13:18:01, Amir Goldstein wrote:
> > On Tue, Mar 14, 2017 at 1:03 AM, Filip Štědronský <r.lklm@regnarg.cz> wrote:
> > > Besause fanotify requires `struct path`, the event cannot be generated
> > > directly in `fsnotify_move` and friends because they only get the inode
> > > (and their callers, `vfs_rename`&co. cannot supply any better info).
> > > So instead it needs to be generated higher in the call chain, i.e. in
> > > the callers of functions like `vfs_rename`.
> > >
> > > This leads to some code duplication. Currently, there are several places
> > > whence functions like `vfs_rename` or `vfs_unlink` are called:
> > >
> > >   * syscall handlers (done)
> > >   * NFS server (done)
> > >   * stacked filesystems
> > >       - ecryptfs (done)
> > >       - overlayfs
> > >         (Currently doesn't report even ordinary fanotify events, because
> > >          it internally clones the upper mount; not sure about the
> > >          rationale.  One can always watch the overlay mount instead.)
> > >   * few rather minor things
> > >       - devtmpfs
> > >         (its internal changes are not tied to any vfsmount so it cannot
> > >          emit mount-scoped events)
> > >       - cachefiles (done)
> > >       - ipc/mqueue.c (done)
> > >       - fs/nfsd/nfs4recover.c (done)
> > >       - kernel/bpf/inode.c (done)
> > >         net/unix/af_unix.c (done)
> > >
> > > (grep -rE '\bvfs_(rename|unlink|mknod|whiteout|create|mkdir|rmdir|symlink|link)\(')
> > >
> > > Signed-off-by: Filip Štědronský <r.lkml@regnarg.cz>
> > >
> > > ---
> > >
> > > An alternative might be to create wrapper functions like
> > > vfs_path_(rename|unlink|...). They could also take care of calling
> > > security_path_(rename|unlink|...), which is currently also up to
> > > the indvidual callers (possibly with a flag because it might not
> > > be always desired).
> > 
> > That's an interesting idea. There is some duplicity between security/audit
> > hook and fsnotify hooks. It should be interesting to try and deduplicate
> > some of this code.
> 
> Yeah, but ecryptfs or nfsd don't actually call these security hooks AFAICT.

We don't?  E.g. nfsd_unlink calls vfs_unlink which calls
security_inode_unlink().

> And at least for NFSD it seems correct they don't call them since you are
> running in a context of an NFSD server process and don't have security
> context of the process actually issuing the IO.

# ps -eo comm,label|grep nfsd
nfsd            system_u:system_r:kernel_t:s0
...

So I guess that's the process context they see.

There is actually a way for the protocol to pass the security context,
in the case it's running over kerberos.  So some day this might
optionally start using the context of the client process, for what it's
worth.

--b.

> So I'm not sure
> "deduplication" is really possible.
> 
> However if you can really call fsnotify hooks with 'path' available in all
> the places, it should be equally hard to just pass 'path' to
> vfs_(create|mkdir|...) and that way we don't have to sprinkle fsnotify
> calls into several call sites but keep them local to vfs_(create|mkdir|...)
> helpers. Hmm?
> 
> > 
> > > ---
> > >  fs/cachefiles/namei.c |  9 +++++++
> > >  fs/ecryptfs/inode.c   | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++
> > >  fs/namei.c            | 23 +++++++++++++++++-
> > >  fs/nfsd/nfs4recover.c |  7 ++++++
> > >  fs/nfsd/vfs.c         | 24 ++++++++++++++++--
> > >  ipc/mqueue.c          |  9 +++++++
> > >  kernel/bpf/inode.c    |  3 +++
> > >  net/unix/af_unix.c    |  2 ++
> > >  8 files changed, 141 insertions(+), 3 deletions(-)
> > >
> > 
> > OK, just for comparison, I am going to put here the diff of the sub set of
> > my patches that are needed to support fanotify filename events.
> > 
> > 
> > $ git diff --stat fsnotify_sb..fanotify_dentry
> >  fs/notify/fanotify/fanotify.c      | 94
> > ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
> >  fs/notify/fanotify/fanotify.h      | 25 ++++++++++++++++++++++++-
> >  fs/notify/fanotify/fanotify_user.c | 92
> > ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
> >  fs/notify/fdinfo.c                 | 25 +++----------------------
> >  fs/notify/inode_mark.c             |  1 +
> >  fs/notify/mark.c                   | 15 ++++++++++++---
> >  include/linux/fsnotify_backend.h   | 21 ++++++++++++++++-----
> >  include/uapi/linux/fanotify.h      | 41
> > +++++++++++++++++++++++++++++++++++------
> >  8 files changed, 255 insertions(+), 59 deletions(-)
> > 
> > Yes, it is a bit more code, much mostly because it adds more functionality
> > (optionally reporting the filename).
> > But most of the code is contained within the fsnotify/fanotify subsystem.
> 
> So I'm not that concerned about actual line numbers. They play some role
> but they don't tell much about how maintainable the result is in the end.
> 
> > The altenative to sprinkle fsnotify_modify_dir() hooks is much less
> > maintainable IMO.
> 
> Agreed on this.
> 
> > I managed to stay a way from cross subsystems changes,
> > by allowing to loose some information about the event.
> > 
> > When a filename event is generated (rename|delete|create)
> > the path of the parent fd that will be reported to user is NOT
> > the actual path that the process executing the operation used,
> > but the path from which the watching process has added the mark.
> 
> Yeah, and frankly I'm not yet convinced this is a sane thing to do. I'd
> much rather propagate path to vfs_(create|mkdir|...) helpers.
> 
> > So for example, if you have a bind mount:
> >     mount -o bind /a/b/c/d/e/f/g /tmp/g
> > 
> > And you add a watch on mount /a
> > you *will* get events for create,delete,rename in directory /tmp/g
> > but that path in the associated fd with point to /a/b/c/d/e/f/g
> > 
> > Some would claim that it is wrong to report the events
> > if they were not originated from the mount were the
> > watch was added.
> > 
> > I claim that fanotify filters event by mount not because it
> > was a requirement, but because it was an implementation challenge
> > to do otherwise.
> > And I claim that what mount watchers are really interested in is
> > "all the changes that happen in the file system in the area
> >  that is visible to me through this mount point".
> > 
> > In other words, an indexer needs to know if files were modified\
> > create/deleted if that indexer sits in container host namespace
> > regardless if those files were modified from within a container
> > namespace.
> > 
> > It's not a matter of security/isolation. It's a matter of functionality.
> > I agree that for some event (e.g. permission events) it is possible
> > to argue both ways (i.e. that the namespace context should be used
> > as a filter for events).
> > But for the new proposed events (FS_MODIFY_DIR), I really don't
> > see the point in isolation by mount/namespace.
> 
> I would be *very* careful with such assumptions. People are very inventive
> in ways how they can abuse stuff. What I'm most nervous about is that the
> (mnt, dentry) pair you create may not be reachable path. When you combine
> that with xyz_at(2) syscalls allowing you to traverse directory hierarchy
> from fd which you got from fanotify event, things can get really
> interesting. So unless there are very clear rules proving this cannot be
> misused, I'm against hand-crafting of path structures inside fsnotify.
> 
> 								Honza
> -- 
> Jan Kara <jack@suse.com>
> SUSE Labs, CR
Jan Kara March 21, 2017, 4:41 p.m. UTC | #14
On Tue 21-03-17 11:38:49, J. Bruce Fields wrote:
> On Sun, Mar 19, 2017 at 11:19:43AM +0100, Jan Kara wrote:
> > On Tue 14-03-17 13:18:01, Amir Goldstein wrote:
> > > On Tue, Mar 14, 2017 at 1:03 AM, Filip Štědronský <r.lklm@regnarg.cz> wrote:
> > > > Besause fanotify requires `struct path`, the event cannot be generated
> > > > directly in `fsnotify_move` and friends because they only get the inode
> > > > (and their callers, `vfs_rename`&co. cannot supply any better info).
> > > > So instead it needs to be generated higher in the call chain, i.e. in
> > > > the callers of functions like `vfs_rename`.
> > > >
> > > > This leads to some code duplication. Currently, there are several places
> > > > whence functions like `vfs_rename` or `vfs_unlink` are called:
> > > >
> > > >   * syscall handlers (done)
> > > >   * NFS server (done)
> > > >   * stacked filesystems
> > > >       - ecryptfs (done)
> > > >       - overlayfs
> > > >         (Currently doesn't report even ordinary fanotify events, because
> > > >          it internally clones the upper mount; not sure about the
> > > >          rationale.  One can always watch the overlay mount instead.)
> > > >   * few rather minor things
> > > >       - devtmpfs
> > > >         (its internal changes are not tied to any vfsmount so it cannot
> > > >          emit mount-scoped events)
> > > >       - cachefiles (done)
> > > >       - ipc/mqueue.c (done)
> > > >       - fs/nfsd/nfs4recover.c (done)
> > > >       - kernel/bpf/inode.c (done)
> > > >         net/unix/af_unix.c (done)
> > > >
> > > > (grep -rE '\bvfs_(rename|unlink|mknod|whiteout|create|mkdir|rmdir|symlink|link)\(')
> > > >
> > > > Signed-off-by: Filip Štědronský <r.lkml@regnarg.cz>
> > > >
> > > > ---
> > > >
> > > > An alternative might be to create wrapper functions like
> > > > vfs_path_(rename|unlink|...). They could also take care of calling
> > > > security_path_(rename|unlink|...), which is currently also up to
> > > > the indvidual callers (possibly with a flag because it might not
> > > > be always desired).
> > > 
> > > That's an interesting idea. There is some duplicity between security/audit
> > > hook and fsnotify hooks. It should be interesting to try and deduplicate
> > > some of this code.
> > 
> > Yeah, but ecryptfs or nfsd don't actually call these security hooks AFAICT.
> 
> We don't?  E.g. nfsd_unlink calls vfs_unlink which calls
> security_inode_unlink().

OK, I have not been specific enough :). ecryptfs or nfsd don't call *path*
security hooks AFAICT - e.g. security_path_unlink() from nfsd_unlink().

								Honza
J. Bruce Fields March 21, 2017, 5:45 p.m. UTC | #15
On Tue, Mar 21, 2017 at 05:41:22PM +0100, Jan Kara wrote:
> On Tue 21-03-17 11:38:49, J. Bruce Fields wrote:
> > On Sun, Mar 19, 2017 at 11:19:43AM +0100, Jan Kara wrote:
> > > On Tue 14-03-17 13:18:01, Amir Goldstein wrote:
> > > > On Tue, Mar 14, 2017 at 1:03 AM, Filip Štědronský <r.lklm@regnarg.cz> wrote:
> > > > > An alternative might be to create wrapper functions like
> > > > > vfs_path_(rename|unlink|...). They could also take care of calling
> > > > > security_path_(rename|unlink|...), which is currently also up to
> > > > > the indvidual callers (possibly with a flag because it might not
> > > > > be always desired).
> > > > 
> > > > That's an interesting idea. There is some duplicity between security/audit
> > > > hook and fsnotify hooks. It should be interesting to try and deduplicate
> > > > some of this code.
> > > 
> > > Yeah, but ecryptfs or nfsd don't actually call these security hooks AFAICT.
> > 
> > We don't?  E.g. nfsd_unlink calls vfs_unlink which calls
> > security_inode_unlink().
> 
> OK, I have not been specific enough :). ecryptfs or nfsd don't call *path*
> security hooks AFAICT - e.g. security_path_unlink() from nfsd_unlink().

Oh, got it, thanks.

But, no, nfsd is definitely is not meant to be invisible to security
modules, so that's just a bug.

--b.
diff mbox

Patch

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 41df8a27d7eb..8c86699424d1 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -313,6 +313,8 @@  static int cachefiles_bury_object(struct cachefiles_cache *cache,
 			cachefiles_io_error(cache, "Unlink security error");
 		} else {
 			ret = vfs_unlink(d_inode(dir), rep, NULL);
+			if (ret == 0)
+				fsnotify_modify_dir(&path);
 
 			if (preemptive)
 				cachefiles_mark_object_buried(cache, rep, why);
@@ -418,6 +420,10 @@  static int cachefiles_bury_object(struct cachefiles_cache *cache,
 		if (ret != 0 && ret != -ENOMEM)
 			cachefiles_io_error(cache,
 					    "Rename failed with error %d", ret);
+		if (ret == 0) {
+			fsnotify_modify_dir(&path);
+			fsnotify_modify_dir(&path_to_graveyard);
+		}
 
 		if (preemptive)
 			cachefiles_mark_object_buried(cache, rep, why);
@@ -560,6 +566,7 @@  int cachefiles_walk_to_object(struct cachefiles_object *parent,
 			cachefiles_hist(cachefiles_mkdir_histogram, start);
 			if (ret < 0)
 				goto create_error;
+			fsnotify_modify_dir(&path);
 
 			ASSERT(d_backing_inode(next));
 
@@ -589,6 +596,7 @@  int cachefiles_walk_to_object(struct cachefiles_object *parent,
 			cachefiles_hist(cachefiles_create_histogram, start);
 			if (ret < 0)
 				goto create_error;
+			fsnotify_modify_dir(&path);
 
 			ASSERT(d_backing_inode(next));
 
@@ -779,6 +787,7 @@  struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
 		ret = vfs_mkdir(d_inode(dir), subdir, 0700);
 		if (ret < 0)
 			goto mkdir_error;
+		fsnotify_modify_dir(&path);
 
 		ASSERT(d_backing_inode(subdir));
 
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index e7413f82d27b..88a41b270bcc 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -29,6 +29,8 @@ 
 #include <linux/dcache.h>
 #include <linux/namei.h>
 #include <linux/mount.h>
+#include <linux/path.h>
+#include <linux/fsnotify.h>
 #include <linux/fs_stack.h>
 #include <linux/slab.h>
 #include <linux/xattr.h>
@@ -144,16 +146,22 @@  static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
 {
 	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
+	struct vfsmount *lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
+	struct path lower_dir_path = {lower_mnt, NULL};
 	struct dentry *lower_dir_dentry;
 	int rc;
 
 	dget(lower_dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
+	lower_dir_path.dentry = lower_dir_dentry;
 	rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
 	if (rc) {
 		printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
 		goto out_unlock;
 	}
+
+	fsnotify_modify_dir(&lower_dir_path);
+
 	fsstack_copy_attr_times(dir, lower_dir_inode);
 	set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink);
 	inode->i_ctime = dir->i_ctime;
@@ -184,9 +192,13 @@  ecryptfs_do_create(struct inode *directory_inode,
 	struct dentry *lower_dentry;
 	struct dentry *lower_dir_dentry;
 	struct inode *inode;
+	struct path lower_dir_path;
 
 	lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
+	lower_dir_path.dentry = lower_dir_dentry;
+	lower_dir_path.mnt = ecryptfs_dentry_to_lower_mnt(ecryptfs_dentry);
+
 	rc = vfs_create(d_inode(lower_dir_dentry), lower_dentry, mode, true);
 	if (rc) {
 		printk(KERN_ERR "%s: Failure to create dentry in lower fs; "
@@ -194,10 +206,14 @@  ecryptfs_do_create(struct inode *directory_inode,
 		inode = ERR_PTR(rc);
 		goto out_lock;
 	}
+
+	fsnotify_modify_dir(&lower_dir_path);
+
 	inode = __ecryptfs_get_inode(d_inode(lower_dentry),
 				     directory_inode->i_sb);
 	if (IS_ERR(inode)) {
 		vfs_unlink(d_inode(lower_dir_dentry), lower_dentry, NULL);
+		fsnotify_modify_dir(&lower_dir_path);
 		goto out_lock;
 	}
 	fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry));
@@ -432,6 +448,7 @@  static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
 	struct dentry *lower_old_dentry;
 	struct dentry *lower_new_dentry;
 	struct dentry *lower_dir_dentry;
+	struct path lower_dir_path;
 	u64 file_size_save;
 	int rc;
 
@@ -441,10 +458,16 @@  static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir,
 	dget(lower_old_dentry);
 	dget(lower_new_dentry);
 	lower_dir_dentry = lock_parent(lower_new_dentry);
+	lower_dir_path.dentry = lower_dir_dentry;
+	lower_dir_path.mnt = ecryptfs_dentry_to_lower_mnt(new_dentry);
+
 	rc = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry),
 		      lower_new_dentry, NULL);
 	if (rc || d_really_is_negative(lower_new_dentry))
 		goto out_lock;
+
+	fsnotify_modify_dir(&lower_dir_path);
+
 	rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb);
 	if (rc)
 		goto out_lock;
@@ -471,6 +494,7 @@  static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
 	int rc;
 	struct dentry *lower_dentry;
 	struct dentry *lower_dir_dentry;
+	struct path lower_dir_path;
 	char *encoded_symname;
 	size_t encoded_symlen;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
@@ -478,6 +502,9 @@  static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	dget(lower_dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
+	lower_dir_path.dentry = lower_dir_dentry;
+	lower_dir_path.mnt = ecryptfs_dentry_to_lower_mnt(dentry);
+
 	mount_crypt_stat = &ecryptfs_superblock_to_private(
 		dir->i_sb)->mount_crypt_stat;
 	rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname,
@@ -491,6 +518,9 @@  static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry,
 	kfree(encoded_symname);
 	if (rc || d_really_is_negative(lower_dentry))
 		goto out_lock;
+
+	fsnotify_modify_dir(&lower_dir_path);
+
 	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
 	if (rc)
 		goto out_lock;
@@ -509,12 +539,18 @@  static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
 	int rc;
 	struct dentry *lower_dentry;
 	struct dentry *lower_dir_dentry;
+	struct path lower_dir_path;
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
+	lower_dir_path.dentry = lower_dir_dentry;
+	lower_dir_path.mnt = ecryptfs_dentry_to_lower_mnt(dentry);
 	rc = vfs_mkdir(d_inode(lower_dir_dentry), lower_dentry, mode);
 	if (rc || d_really_is_negative(lower_dentry))
 		goto out;
+
+	fsnotify_modify_dir(&lower_dir_path);
+
 	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
 	if (rc)
 		goto out;
@@ -532,16 +568,24 @@  static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	struct dentry *lower_dentry;
 	struct dentry *lower_dir_dentry;
+	struct path lower_dir_path;
 	int rc;
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	dget(dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
+	lower_dir_path.dentry = lower_dir_dentry;
+	lower_dir_path.mnt = ecryptfs_dentry_to_lower_mnt(dentry);
 	dget(lower_dentry);
+
 	rc = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry);
 	dput(lower_dentry);
 	if (!rc && d_really_is_positive(dentry))
 		clear_nlink(d_inode(dentry));
+
+	if (rc)
+		fsnotify_modify_dir(&lower_dir_path);
+
 	fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
 	set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink);
 	unlock_dir(lower_dir_dentry);
@@ -557,12 +601,19 @@  ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev
 	int rc;
 	struct dentry *lower_dentry;
 	struct dentry *lower_dir_dentry;
+	struct path lower_dir_path;
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	lower_dir_dentry = lock_parent(lower_dentry);
+	lower_dir_path.dentry = lower_dir_dentry;
+	lower_dir_path.mnt = ecryptfs_dentry_to_lower_mnt(dentry);
+
 	rc = vfs_mknod(d_inode(lower_dir_dentry), lower_dentry, mode, dev);
 	if (rc || d_really_is_negative(lower_dentry))
 		goto out;
+
+	fsnotify_modify_dir(&lower_dir_path);
+
 	rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb);
 	if (rc)
 		goto out;
@@ -585,6 +636,9 @@  ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct dentry *lower_new_dentry;
 	struct dentry *lower_old_dir_dentry;
 	struct dentry *lower_new_dir_dentry;
+	struct vfsmount *lower_mnt;
+	struct path lower_old_dir_path;
+	struct path lower_new_dir_path;
 	struct dentry *trap = NULL;
 	struct inode *target_inode;
 
@@ -593,10 +647,15 @@  ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
 	lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
+	lower_mnt = ecryptfs_dentry_to_lower_mnt(old_dentry);
 	dget(lower_old_dentry);
 	dget(lower_new_dentry);
 	lower_old_dir_dentry = dget_parent(lower_old_dentry);
 	lower_new_dir_dentry = dget_parent(lower_new_dentry);
+	lower_old_dir_path.dentry = lower_old_dir_dentry;
+	lower_old_dir_path.mnt = lower_mnt;
+	lower_new_dir_path.dentry = lower_new_dir_dentry;
+	lower_new_dir_path.mnt = lower_mnt;
 	target_inode = d_inode(new_dentry);
 	trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
 	/* source should not be ancestor of target */
@@ -614,6 +673,14 @@  ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			NULL, 0);
 	if (rc)
 		goto out_lock;
+
+	/* ecryptfs does not support crossing mount boundaries, we can take
+	 * vfsmount from an arbitrary dentry.
+	 */
+	fsnotify_modify_dir(&lower_old_dir_path);
+	if (!path_equal(&lower_old_dir_path, &lower_new_dir_path))
+		fsnotify_modify_dir(&lower_new_dir_path);
+
 	if (target_inode)
 		fsstack_copy_attr_all(target_inode,
 				      ecryptfs_inode_to_lower(target_inode));
diff --git a/fs/namei.c b/fs/namei.c
index ad74877e1442..17667f0c89e5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3009,8 +3009,12 @@  static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 				dput(dentry);
 				dentry = file->f_path.dentry;
 			}
-			if (*opened & FILE_CREATED)
+			if (*opened & FILE_CREATED) {
+				struct path parent_path = {file->f_path.mnt,
+							dentry->d_parent};
 				fsnotify_create(dir, dentry);
+				fsnotify_modify_dir(&parent_path);
+			}
 			if (unlikely(d_is_negative(dentry))) {
 				error = -ENOENT;
 			} else {
@@ -3157,6 +3161,7 @@  static int lookup_open(struct nameidata *nd, struct path *path,
 		if (error)
 			goto out_dput;
 		fsnotify_create(dir_inode, dentry);
+		fsnotify_modify_dir(&nd->path);
 	}
 	if (unlikely(create_error) && !dentry->d_inode) {
 		error = create_error;
@@ -3702,6 +3707,7 @@  SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode,
 			error = vfs_mknod(path.dentry->d_inode,dentry,mode,0);
 			break;
 	}
+	fsnotify_modify_dir(&path);
 out:
 	done_path_create(&path, dentry);
 	if (retry_estale(error, lookup_flags)) {
@@ -3759,6 +3765,8 @@  SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
 	error = security_path_mkdir(&path, dentry, mode);
 	if (!error)
 		error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
+	if (!error)
+		fsnotify_modify_dir(&path);
 	done_path_create(&path, dentry);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
@@ -3855,6 +3863,8 @@  static long do_rmdir(int dfd, const char __user *pathname)
 	if (error)
 		goto exit3;
 	error = vfs_rmdir(path.dentry->d_inode, dentry);
+	if (!error)
+		fsnotify_modify_dir(&path);
 exit3:
 	dput(dentry);
 exit2:
@@ -3979,6 +3989,8 @@  static long do_unlinkat(int dfd, const char __user *pathname)
 		if (error)
 			goto exit2;
 		error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
+		if (!error)
+			fsnotify_modify_dir(&path);
 exit2:
 		dput(dentry);
 	}
@@ -4070,6 +4082,8 @@  SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
 	error = security_path_symlink(&path, dentry, from->name);
 	if (!error)
 		error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
+	if (!error)
+		fsnotify_modify_dir(&path);
 	done_path_create(&path, dentry);
 	if (retry_estale(error, lookup_flags)) {
 		lookup_flags |= LOOKUP_REVAL;
@@ -4219,6 +4233,8 @@  SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
 	if (error)
 		goto out_dput;
 	error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
+	if (!error)
+		fsnotify_modify_dir(&new_path);
 out_dput:
 	done_path_create(&new_path, new_dentry);
 	if (delegated_inode) {
@@ -4532,6 +4548,11 @@  SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
 	error = vfs_rename(old_path.dentry->d_inode, old_dentry,
 			   new_path.dentry->d_inode, new_dentry,
 			   &delegated_inode, flags);
+	if (error == 0) {
+		fsnotify_modify_dir(&old_path);
+		if (!path_equal(&old_path, &new_path))
+			fsnotify_modify_dir(&new_path);
+	}
 exit5:
 	dput(new_dentry);
 exit4:
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 66eaeb1e8c2c..58f70bbaac38 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -36,6 +36,7 @@ 
 #include <linux/file.h>
 #include <linux/slab.h>
 #include <linux/namei.h>
+#include <linux/fsnotify.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/module.h>
@@ -216,6 +217,8 @@  nfsd4_create_clid_dir(struct nfs4_client *clp)
 		 */
 		goto out_put;
 	status = vfs_mkdir(d_inode(dir), dentry, S_IRWXU);
+	if (status == 0)
+		fsnotify_modify_dir(&nn->rec_file->f_path);
 out_put:
 	dput(dentry);
 out_unlock:
@@ -338,6 +341,8 @@  nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
 	if (d_really_is_negative(dentry))
 		goto out;
 	status = vfs_rmdir(d_inode(dir), dentry);
+	if (status == 0)
+		fsnotify_modify_dir(&nn->rec_file->f_path);
 out:
 	dput(dentry);
 out_unlock:
@@ -401,6 +406,8 @@  purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn)
 	if (status)
 		printk("failed to remove client recovery directory %pd\n",
 				child);
+	else
+		fsnotify_modify_dir(&nn->rec_file->f_path);
 	/* Keep trying, success or failure: */
 	return 0;
 }
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 26c6fdb4bf67..7632ab3fd99e 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -364,6 +364,18 @@  nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
 }
 
 /*
+ * Helper to emit fsnotify modify_dir event. Call with fph locked.
+ */
+static void nfsd_fsnotify_modify_dir(struct svc_fh *fhp)
+{
+	struct path path;
+
+	path.mnt = fhp->fh_export->ex_path.mnt;
+	path.dentry = fhp->fh_dentry;
+	fsnotify_modify_dir(&path);
+}
+
+/*
  * Set various file attributes.  After this call fhp needs an fh_put.
  */
 __be32
@@ -1207,6 +1219,7 @@  nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		goto out_nfserr;
 
 	err = nfsd_create_setattr(rqstp, resfhp, iap);
+	nfsd_fsnotify_modify_dir(fhp);
 
 	/*
 	 * nfsd_create_setattr already committed the child.  Transactional
@@ -1525,8 +1538,10 @@  nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
 
 	host_err = vfs_symlink(d_inode(dentry), dnew, path);
 	err = nfserrno(host_err);
-	if (!err)
+	if (!err) {
+		nfsd_fsnotify_modify_dir(fhp);
 		err = nfserrno(commit_metadata(fhp));
+	}
 	fh_unlock(fhp);
 
 	fh_drop_write(fhp);
@@ -1593,6 +1608,7 @@  nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
 		goto out_dput;
 	host_err = vfs_link(dold, dirp, dnew, NULL);
 	if (!host_err) {
+		nfsd_fsnotify_modify_dir(tfhp);
 		err = nfserrno(commit_metadata(ffhp));
 		if (!err)
 			err = nfserrno(commit_metadata(tfhp));
@@ -1686,6 +1702,8 @@  nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
 
 	host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
 	if (!host_err) {
+		nfsd_fsnotify_modify_dir(tfhp);
+		nfsd_fsnotify_modify_dir(ffhp);
 		host_err = commit_metadata(tfhp);
 		if (!host_err)
 			host_err = commit_metadata(ffhp);
@@ -1757,8 +1775,10 @@  nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 		host_err = vfs_unlink(dirp, rdentry, NULL);
 	else
 		host_err = vfs_rmdir(dirp, rdentry);
-	if (!host_err)
+	if (!host_err) {
+		nfsd_fsnotify_modify_dir(fhp);
 		host_err = commit_metadata(fhp);
+	}
 	dput(rdentry);
 
 out_nfserr:
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 7a2d8f0c8ae5..10e413c2216f 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -19,6 +19,7 @@ 
 #include <linux/file.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/fsnotify.h>
 #include <linux/sysctl.h>
 #include <linux/poll.h>
 #include <linux/mqueue.h>
@@ -818,6 +819,10 @@  SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode,
 			filp = do_create(ipc_ns, d_inode(root),
 						&path, oflag, mode,
 						u_attr ? &attr : NULL);
+			if (!IS_ERR(filp)) {
+				struct path root_path = {mnt, mnt->mnt_root};
+				fsnotify_modify_dir(&root_path);
+			}
 		}
 	} else {
 		if (d_really_is_negative(path.dentry)) {
@@ -878,6 +883,10 @@  SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
 	} else {
 		ihold(inode);
 		err = vfs_unlink(d_inode(dentry->d_parent), dentry, NULL);
+		if (!err) {
+			struct path path = {mnt, dentry->d_parent};
+			fsnotify_modify_dir(&path);
+		}
 	}
 	dput(dentry);
 
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 0b030c9126d3..93137292b051 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -16,6 +16,7 @@ 
 #include <linux/major.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
+#include <linux/fsnotify.h>
 #include <linux/fs.h>
 #include <linux/kdev_t.h>
 #include <linux/parser.h>
@@ -255,6 +256,8 @@  static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
 
 	dentry->d_fsdata = raw;
 	ret = vfs_mknod(dir, dentry, mode, devt);
+	if (ret == 0)
+		fsnotify_modify_dir(&path);
 	dentry->d_fsdata = NULL;
 out:
 	done_path_create(&path, dentry);
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index cef79873b09d..5049bd4bd1d8 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -91,6 +91,7 @@ 
 #include <linux/stat.h>
 #include <linux/dcache.h>
 #include <linux/namei.h>
+#include <linux/fsnotify.h>
 #include <linux/socket.h>
 #include <linux/un.h>
 #include <linux/fcntl.h>
@@ -976,6 +977,7 @@  static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
 	if (!err) {
 		err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
 		if (!err) {
+			fsnotify_modify_dir(&path);
 			res->mnt = mntget(path.mnt);
 			res->dentry = dget(dentry);
 		}