diff mbox

[RFC,1/2] fanotify: new event FAN_MODIFY_DIR

Message ID a0fa4e7890ea16e7f90a17524e817755f7dde8b5.1489445257.git.p@regnarg.cz (mailing list archive)
State New, archived
Headers show

Commit Message

Filip Štědronský March 13, 2017, 11:02 p.m. UTC
Fanotify currently does not report directory modification events
(rename, unlink, etc.). But these events are essential for about half of
concievable fanotify use cases, especially:

  - file system indexers / desktop search tools
  - file synchronization tools (like Dropbox, Nextcloud, etc.),
    online backup tools

and pretty much any app that needs to maintain and up-to-date internal
representation of current contents of the file system.

All applications of the above classes that I'm aware of currently use
recursive inotify watches, which do not scale (my home dir has ~200k
directories, creating all the watches takes ~2min and eats several tens
of MB of kernel memory).

There have been many calls for such a feature, pretty much since the
creation of fanotify in 2009:
  * By GNOME developers:
    https://wiki.gnome.org/BastienNocera/KernelWishlist#VFS.2C_filesystems
  * By KDE developers:
    http://lkml.kernel.org/r/201211011352.42476.Martin%40lichtvoll.de
    'Better support for (desktop) file search / indexing applications'
  * And others:
    http://lkml.kernel.org/r/AANLkTi=owK=WZW4oNtpm5WpAZhqCQUdTR2K5gzJ_MqZ+%40mail.gmail.com
    'Fanotify mv/rename?'
    http://lkml.kernel.org/r/1238158043.23703.20.camel%40fatty
    'Issues with using fanotify for a filesystem indexer'

Add a new fanotify event, FAN_MODIFY_DIR, that is emitted whenever the
contents of a directory change (a directory entry is added, removed or
renamed). This covers all the currently missing events: rename, unlink,
mknod, mkdir, rmdir, etc.

Included with the event is a file descriptor to the modified directory
but no further info. The userspace application is expected to rescan the
whole directory and update its model accordingly. This needs to be done
carefully to prevent race conditions. A cross-directory rename generates
two FAN_MODIFY_DIR events.

This minimalistic approach has several advantages:
  * Does not require changes to the fanotify userspace API or the
    fsnotify in-kernel framework, apart from adding a new event.
    Especially doesn't complicate it by adding string fields.
  * Has simple and clear semantics, even with multiple renames occurring
    in parallel etc. In case of any inconsistencies, one can simply wait
    for a while and rescan again.
  * FAN_MODIFY_DIR events are easily merged in case of multiple
    operations on a directory (e.g. deleting all files).

Signed-off-by: Filip Štědronský <r.lkml@regnarg.cz>

---

An alternative might be to create wrapper functions like
vfs_path_(rename|unlink|...). They could also take care of calling
security_path_(rename|unlink|...), which is currently also up to
the indvidual callers (possibly with a flag because it might not
be always desired).

An alternative was proposed by Amir Goldstein in several long series of
patches that add superblock-scoped (as opposed to vfsmount-scoped)
fanotify watches and specific dentry manipulation events with filenames:

    http://lkml.kernel.org/r/1481984434-13293-1-git-send-email-amir73il%40gmail.com
    http://lkml.kernel.org/r/1482247207-4424-1-git-send-email-amir73il%40gmail.com
    http://lkml.kernel.org/r/1476126784-12520-1-git-send-email-amir73il%40gmail.com
    http://lkml.kernel.org/r/1489411223-12081-1-git-send-email-amir73il%40gmail.com

There is large but not complete overlap between that proposal and
mine (which is was originally created over a year ago, before Amir's
work, but never posted).

I think the superblock watch idea is very interesting because it might
in the future allow reporing fsnotify events from remote and virtual
filesystems. So I'm posting this more as a potential source of more
ideas for discussion, or a fallback proposal in case Amir's patches
don't make it.
---
 fs/notify/fanotify/fanotify.c    |  1 +
 include/linux/fsnotify.h         | 17 +++++++++++++++++
 include/linux/fsnotify_backend.h |  1 +
 include/uapi/linux/fanotify.h    |  5 ++++-
 4 files changed, 23 insertions(+), 1 deletion(-)

Comments

Filip Štědronský March 13, 2017, 11:16 p.m. UTC | #1
An example userspace program that uses FAN_MODIFY_DIR to reliably keep
an up-to-date internal representation of the file system. It uses some
filehandle trickery to identify inodes, other heuristics could be also
used.

---

//#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/fanotify.h>
#include <stdint.h>
#include <dirent.h>
#include <assert.h>
#include <string.h>

#include <map>
#include <set>
#include <list>
using namespace std;

#ifndef FAN_MODIFY_DIR
#define FAN_MODIFY_DIR 0x00040000
#endif

// die-on-error helpers
#define CHK(x) ({ __typeof__(x) r = x; if (r == -1) { perror(#x); abort(); } r; })
#define CHKN(x) ({ __typeof__(x) r = x; if (r == NULL) { perror(#x); abort(); } r; })
struct inode_info;
struct dentry_info;

struct inode_info {
    ino_t ino;
    mode_t mode;
    char handle[MAX_HANDLE_SZ];
    set<struct dentry_info *> links;
    map<string, struct dentry_info *> children; // for directory inodes
};

struct dentry_info {
    struct inode_info *parent, *inode;
    string name;
};


map<ino_t, inode_info*> inodes;

int root_fd;
int fan_fd;

bool compare_handles(const void *h1, const void *h2) {
    const struct file_handle *fh1 = (const struct file_handle*) h1;
    const struct file_handle *fh2 = (const struct file_handle*) h2;
    return (fh1->handle_bytes == fh2->handle_bytes
                && memcmp(h1, h2, fh1->handle_bytes) == 0);
}

bool handle_valid(void *handle) {
    int check_fd = open_by_handle_at(root_fd, (struct file_handle*)handle, O_PATH);
    if (check_fd >= 0) {
        CHK(close(check_fd));
        return true;
    } else if (errno == ESTALE) {
        return false;
    } else {
        perror("open_by_handle_at");
        exit(1);
    }
}

// Get the path corresponding to an inode (one of its paths, in the presence of
// hardlinks).
void inode_path(const struct inode_info *inode, char *buf, size_t bufsiz) {
    list<string> components;
    while (true) {
        if (inode->links.empty()) break;
        struct dentry_info *dentry = *inode->links.begin();
        components.push_front(dentry->name);
        inode = dentry->parent;
    }
    buf[0] = '\0';
    for (auto name: components) {
        int len = snprintf(buf, bufsiz, "/%s", name.c_str());
        buf += len;
        bufsiz -= len;
    }
}


void delete_dentry(struct dentry_info *dentry) {
    assert(dentry->parent->children[dentry->name] == dentry);

    char path_buf[4096];
    inode_path(dentry->parent, path_buf, sizeof(path_buf));
    printf("unlinked %s/%s (ino %lu, parent %lu)\n", path_buf, dentry->name.c_str(),
           dentry->inode->ino, dentry->parent->ino);

    dentry->parent->children.erase(dentry->name.c_str());
    dentry->inode->links.erase(dentry);
    // TODO: If this was the last dentry pointing to an inode, schedule removing
    //       the inode after a timeout (we cannot remove it immediately because
    //       the zero-link situation might occur during a rename when the source
    //       directory has been processed but the target directory hasn't).
    delete dentry;
}

struct dentry_info *add_dentry(struct inode_info *parent, const char *name,
                                struct inode_info *child) {
    struct dentry_info *dentry = new dentry_info();
    dentry->parent = parent;
    dentry->name = name;
    dentry->inode = child;
    parent->children[name] = dentry;
    child->links.insert(dentry);

    char path_buf[4096] = "\0";
    inode_path(parent, path_buf, sizeof(path_buf));
    printf("linked %s/%s (ino %lu, parent %lu)\n", path_buf, name, child->ino, parent->ino);

    return dentry;
}

void delete_inode(struct inode_info *inode) {
    for (auto dentry: inode->links) {
        delete_dentry(dentry);
    }
    delete inode;
}

// Given a file descriptor, find the corresponding inode object in our database,
// or create a new one if it does not exist. An O_PATH fd suffices.
struct inode_info *find_inode(int fd) {
    struct stat st;
    CHK(fstat(fd, &st));
    char handle[sizeof(struct file_handle) + MAX_HANDLE_SZ];
    struct file_handle *fh = (struct file_handle*)handle;
    fh->handle_bytes = sizeof(handle);
    int mntid;
    CHK(name_to_handle_at(fd, "", (struct file_handle*)handle, &mntid,
                            AT_EMPTY_PATH));

    struct inode_info *info = inodes[st.st_ino];
    if (info) {
        // Handles can refer to the same file despite not being equal.
        // If the old handle can still be opened, we can be assured
        // that the inode number has not been recycled.
        if (compare_handles(handle, info->handle) || handle_valid(info->handle)) {
            return info;
        } else {
            delete_inode(info);
            info = NULL;
        }
    }

    inodes[st.st_ino] = info = new inode_info();
    info->ino = st.st_ino;
    info->mode = st.st_mode;
    memcpy(info->handle, handle, fh->handle_bytes);
    return info;
}

// Scan directory and update internal filesystem representation accordingly.
// Closes `dirfd`.
void scan(int dirfd, bool recursive) {
    struct inode_info *dir = find_inode(dirfd);

    char path_buf[4096] = "\0";
    inode_path(dir, path_buf, sizeof(path_buf));
    printf("scan %s (%lu)\n", path_buf, dir->ino);

    DIR *dp = CHKN(fdopendir(dirfd));
    set<string> seen;
    while (struct dirent *ent = readdir(dp)) {
        if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) continue;
        seen.insert(ent->d_name);
        if (dir->children.find(ent->d_name) != dir->children.end()
                && dir->children[ent->d_name]->inode->ino == ent->d_ino) {
            // Heuristic: It is massively unlikely that an inode number
            // would be recylced at the same path as before. So if we
            // see the same inode for the same child, we skip the more
            // expensive checks altogether. This saves us a buttload of
            // syscalls, especially given that most directory entries
            // will be unchanged after a FAN_MODIFY_DIR.
            //
            // This can be skipped if strict correctness is preferred
            // over speed.
            continue;
        }
        int fd = openat(dirfd, ent->d_name, O_PATH|O_NOFOLLOW);
        if (fd < 0) continue;
        struct inode_info *child = find_inode(fd);
        if (dir->children.find(ent->d_name) != dir->children.end()) {
            struct dentry_info *old_dentry = dir->children[ent->d_name];
            if (child != old_dentry->inode) {
                delete_dentry(old_dentry);
                add_dentry(dir, ent->d_name, child);
            }
        } else {
            add_dentry(dir, ent->d_name, child);
        }
        if (recursive && S_ISDIR(child->mode)) {
            // `fd' is just an O_PATH fd. For scanning we need O_RDONLY.
            int scan_fd = CHK(openat(fd, ".", O_RDONLY|O_DIRECTORY));
            scan(scan_fd, true); // closes scan_fd
        }
        close(fd);
    }
    for (auto it: dir->children) {
        if (seen.find(it.second->name) == seen.end()) delete_dentry(it.second);
    }
    closedir(dp);
}

void event_loop() {
    while (true) {
        char buf[4096];
        ssize_t len = CHK(read(fan_fd, buf, sizeof(buf)));
        const struct fanotify_event_metadata *event;
        event = (const struct fanotify_event_metadata*) buf;
        while (FAN_EVENT_OK(event, len)) {
            if (event->vers != FANOTIFY_METADATA_VERSION) abort();
            if (event->mask & FAN_MODIFY_DIR) {
                scan(event->fd, false);
            } else if (event->mask & FAN_Q_OVERFLOW) {
                abort(); // TODO: full rescan needed
            } else {
                close(event->fd);
            }
            event = FAN_EVENT_NEXT(event, len);
        }
    }
}

int main(int argc, char **argv) {
    if (argc != 2) { fprintf(stderr, "Usage: %s MOUNTPOINT\n", argv[0]); return 1; }

    root_fd = CHK(open(argv[1], O_RDONLY|O_DIRECTORY));
    // In a real application, FAN_UNLIMITED_QUEUE would be replaced with a secondary
    // userspace queue filled during scanning.
    fan_fd = CHK(fanotify_init(FAN_UNLIMITED_QUEUE, O_RDONLY));
    CHK(fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_MODIFY_DIR|FAN_ONDIR,
                        root_fd, NULL));
    
    scan(dup(root_fd), true);

    event_loop();

    return 0;
}
Amir Goldstein March 14, 2017, 10:11 a.m. UTC | #2
On Tue, Mar 14, 2017 at 1:02 AM, Filip Štědronský <r.lklm@regnarg.cz> wrote:
> Fanotify currently does not report directory modification events
> (rename, unlink, etc.). But these events are essential for about half of
> concievable fanotify use cases, especially:
>
>   - file system indexers / desktop search tools
>   - file synchronization tools (like Dropbox, Nextcloud, etc.),
>     online backup tools

This last one is the use case of my employer, Ctera Networks.
Out of curiosity, what is the use case that you are focusing on?

>
> and pretty much any app that needs to maintain and up-to-date internal
> representation of current contents of the file system.
>
> All applications of the above classes that I'm aware of currently use
> recursive inotify watches, which do not scale (my home dir has ~200k
> directories, creating all the watches takes ~2min and eats several tens
> of MB of kernel memory).
>
> There have been many calls for such a feature, pretty much since the
> creation of fanotify in 2009:
>   * By GNOME developers:
>     https://wiki.gnome.org/BastienNocera/KernelWishlist#VFS.2C_filesystems
>   * By KDE developers:
>     http://lkml.kernel.org/r/201211011352.42476.Martin%40lichtvoll.de
>     'Better support for (desktop) file search / indexing applications'
>   * And others:
>     http://lkml.kernel.org/r/AANLkTi=owK=WZW4oNtpm5WpAZhqCQUdTR2K5gzJ_MqZ+%40mail.gmail.com
>     'Fanotify mv/rename?'
>     http://lkml.kernel.org/r/1238158043.23703.20.camel%40fatty
>     'Issues with using fanotify for a filesystem indexer'
>

Thanks for sharing this summary!
I had the feeling that all recursive inotify users are hiding in the shadows,
but was missing more concrete evidence.

> Add a new fanotify event, FAN_MODIFY_DIR, that is emitted whenever the
> contents of a directory change (a directory entry is added, removed or
> renamed). This covers all the currently missing events: rename, unlink,
> mknod, mkdir, rmdir, etc.
>
> Included with the event is a file descriptor to the modified directory
> but no further info. The userspace application is expected to rescan the
> whole directory and update its model accordingly. This needs to be done
> carefully to prevent race conditions. A cross-directory rename generates
> two FAN_MODIFY_DIR events.
>

Your approach is interesting and I am glad you shared it with us.
I do like it and it gives me an idea, I am going to prepare a branch
with a subset
of my patches, so you can try them with your userspace sample program.

In comments to your patches I am going to argue that, as a matter of fact,
you can take a small sub set of my patches and get the same functionality
of FAN_MODIFY_DIR, with code that is *less* complex then what you propose.
So please treat my comments as technical comments how FAN_MODIFY_DIR
should be implemented IMO and not as an attempt to reject an opposing proposal.

> This minimalistic approach has several advantages:
>   * Does not require changes to the fanotify userspace API or the
>     fsnotify in-kernel framework, apart from adding a new event.

About the argument of not having to change in-kernel framework,
I don't think it should be a consideration at all.
I claim that my 1st fsnotify cleanup series is an improvement of the framework
(https://github.com/amir73il/linux/commits/fsnotify_dentry)
regardless of additional functionality.
So changing the in-kernel framework by adding complexity may be bad,
but changing it by simplifying and making code more readable should not
be an argument against the "non-minimalistic" approach.

About the change to usespace API, if you strip off the *added* functionality
of super block watch from my patches, your proposal for user API looks
like this:

    fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, \
                           FAN_MODIFY_DIR|FAN_ONDIR,

And my proposal for user API looks like this:

    fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, \
                           FAN_MOVE|FAN_CREATE|FAN_DELETE,

You can see why my proposal for user API is not any less minimalistic
and it is fully compatible with existing intotify API for the same events.

I understand why it is hard to see this behind my added functionality,
so I will post a minimalistic branch and test program as an example.

>     Especially doesn't complicate it by adding string fields.

So the string fields in my proposal are optional.
userspace opts-in to get them by specifying the  FAN_EVENT_INFO_NAME flag:

    fanotify_init(FAN_CLOEXEC | FAN_CLASS_NOTIF |
                       FAN_EVENT_INFO_PARENT | FAN_EVENT_INFO_NAME,

If you don't specify FAN_EVENT_INFO_NAME, you can get filename events
FAN_MOVE|FAN_CREATE|FAN_DELETE without the name.

What you do get is the file descriptor of the parent. sounds familiar? ;-)

The flag FAN_EVENT_INFO_PARENT in an explicit opt-in to get parent fd
instead of victim id on specific events.
If  FAN_EVENT_INFO_PARENT is not specified in fanotify_init()
the filename events FAN_MOVE|FAN_CREATE|FAN_DELETE are masked
out of fanotify_mark().

This is important because your patchs adds FAN_MODIFY_DIR to
FAN_ALL_EVENTS (as does mine).
So old fanotify userspace code, compiled with new kernel headers, with:

    fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT,
                           FAN_ALL_EVENTS,

Will start getting your FAN_MODIFY_DIR events and those programs
might have wrong assumptions about the provided fd.

So I chose to have stronger backward compatibility, and added the opt-in
FAN_EVENT_INFO_PARENT flag.

>   * Has simple and clear semantics, even with multiple renames occurring
>     in parallel etc. In case of any inconsistencies, one can simply wait
>     for a while and rescan again.
>   * FAN_MODIFY_DIR events are easily merged in case of multiple
>     operations on a directory (e.g. deleting all files).
>

I agree those 2 are important advantages.
You could get them with my proposed API when dropping the
FAN_EVENT_INFO_NAME flag.

Thanks for pointing that out.

> Signed-off-by: Filip Štědronský <r.lkml@regnarg.cz>
>
> ---
>
> An alternative might be to create wrapper functions like
> vfs_path_(rename|unlink|...). They could also take care of calling
> security_path_(rename|unlink|...), which is currently also up to
> the indvidual callers (possibly with a flag because it might not
> be always desired).
>
> An alternative was proposed by Amir Goldstein in several long series of
> patches that add superblock-scoped (as opposed to vfsmount-scoped)
> fanotify watches and specific dentry manipulation events with filenames:
>
>     http://lkml.kernel.org/r/1481984434-13293-1-git-send-email-amir73il%40gmail.com
>     http://lkml.kernel.org/r/1482247207-4424-1-git-send-email-amir73il%40gmail.com
>     http://lkml.kernel.org/r/1476126784-12520-1-git-send-email-amir73il%40gmail.com
>     http://lkml.kernel.org/r/1489411223-12081-1-git-send-email-amir73il%40gmail.com
>
> There is large but not complete overlap between that proposal and
> mine (which is was originally created over a year ago, before Amir's
> work, but never posted).
>

I am proud of the role I played to help your proposal see the day of light :-)

> I think the superblock watch idea is very interesting because it might
> in the future allow reporing fsnotify events from remote and virtual
> filesystems. So I'm posting this more as a potential source of more
> ideas for discussion, or a fallback proposal in case Amir's patches
> don't make it.
> ---
>  fs/notify/fanotify/fanotify.c    |  1 +
>  include/linux/fsnotify.h         | 17 +++++++++++++++++
>  include/linux/fsnotify_backend.h |  1 +
>  include/uapi/linux/fanotify.h    |  5 ++++-
>  4 files changed, 23 insertions(+), 1 deletion(-)
>
> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
> index bbc175d4213d..5178b06c338c 100644
> --- a/fs/notify/fanotify/fanotify.c
> +++ b/fs/notify/fanotify/fanotify.c
> @@ -186,6 +186,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
>
>         BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
>         BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
> +       BUILD_BUG_ON(FAN_MODIFY_DIR != FS_MODIFY_DIR);
>         BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
>         BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
>         BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
> diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
> index b43d3f5bd9ea..00fb87c975d6 100644
> --- a/include/linux/fsnotify.h
> +++ b/include/linux/fsnotify.h
> @@ -208,6 +208,23 @@ static inline void fsnotify_modify(struct file *file)
>  }
>
>  /*
> + * fsnotify_modifydir - directory contents were changed
> + * (as a result of rename, creat, unlink, etc.)
> + */
> +static inline void fsnotify_modify_dir(struct path *path)
> +{
> +       struct inode *inode = path->dentry->d_inode;
> +       __u32 mask = FS_MODIFY_DIR;
> +
> +       if (S_ISDIR(inode->i_mode))
> +               mask |= FS_ISDIR;

It is going to be somewhat confusing for users to understand
the difference between FS_MODIFY_DIR and FS_MODIFY_DIR|FS_ISDIR.
IMO, it is much more intuitive (and compatible with intoify semantics)
when users
get specific event information, e.g. FS_DELETE and FS_DELETE|FS_ISDIR.


> +       else
> +               return;
> +
> +       fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
> +}
> +
> +/*
>   * fsnotify_open - file was opened
>   */
>  static inline void fsnotify_open(struct file *file)
> diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> index 487246546ebe..7751b337ec31 100644
> --- a/include/linux/fsnotify_backend.h
> +++ b/include/linux/fsnotify_backend.h
> @@ -42,6 +42,7 @@
>
>  #define FS_OPEN_PERM           0x00010000      /* open event in an permission hook */
>  #define FS_ACCESS_PERM         0x00020000      /* access event in a permissions hook */
> +#define FS_MODIFY_DIR          0x00040000      /* directory changed (rename/unlink/...) */
>
>  #define FS_EXCL_UNLINK         0x04000000      /* do not send events if object is unlinked */
>  #define FS_ISDIR               0x40000000      /* event occurred against dir */
> diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
> index 030508d195d3..f14e048d492a 100644
> --- a/include/uapi/linux/fanotify.h
> +++ b/include/uapi/linux/fanotify.h
> @@ -15,6 +15,8 @@
>  #define FAN_OPEN_PERM          0x00010000      /* File open in perm check */
>  #define FAN_ACCESS_PERM                0x00020000      /* File accessed in perm check */
>
> +#define FAN_MODIFY_DIR         0x00040000      /* directory changed (rename/unlink/...) */
> +
>  #define FAN_ONDIR              0x40000000      /* event occurred against dir */
>
>  #define FAN_EVENT_ON_CHILD     0x08000000      /* interested in child events */
> @@ -67,7 +69,8 @@
>  #define FAN_ALL_EVENTS (FAN_ACCESS |\
>                         FAN_MODIFY |\
>                         FAN_CLOSE |\
> -                       FAN_OPEN)
> +                       FAN_OPEN |\
> +                       FAN_MODIFY_DIR)
>
>  /*
>   * All events which require a permission response from userspace
> --
> 2.11.1
>
Amir Goldstein March 14, 2017, 10:40 a.m. UTC | #3
On Tue, Mar 14, 2017 at 1:16 AM, Filip Štědronský <r.lklm@regnarg.cz> wrote:
> An example userspace program that uses FAN_MODIFY_DIR to reliably keep
> an up-to-date internal representation of the file system. It uses some
> filehandle trickery to identify inodes, other heuristics could be also
> used.
>


An I am very happy that you used filehandles to keep track of objects
in your example, because it fits my proposal really well.

See, if you used my proposed API, you would have had

      fan_fd = CHK(fanotify_init(FAN_UNLIMITED_QUEUE| \
                             FAN_EVENT_INFO_PARENT | FAN_EVENT_INFO_FH,
                             O_RDONLY));

And then you would have gotten the filhandle of parent with the event
and you would need find_inode().

Furthermore, my proposal records the filehandle at the time of the event
and therefore, does not have to keep an elevated refcount of the object
in the events queue.

Again, this needs some work, but I will try to post a testing branch
which demonstrates how my patches should be used with this test
program.

Thanks again for sharing it.

Amir.

> ---
>
> //#define _GNU_SOURCE
>
> #include <stdio.h>
> #include <stdlib.h>
> #include <unistd.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <fcntl.h>
> #include <sys/fanotify.h>
> #include <stdint.h>
> #include <dirent.h>
> #include <assert.h>
> #include <string.h>
>
> #include <map>
> #include <set>
> #include <list>
> using namespace std;
>
> #ifndef FAN_MODIFY_DIR
> #define FAN_MODIFY_DIR 0x00040000
> #endif
>
> // die-on-error helpers
> #define CHK(x) ({ __typeof__(x) r = x; if (r == -1) { perror(#x); abort(); } r; })
> #define CHKN(x) ({ __typeof__(x) r = x; if (r == NULL) { perror(#x); abort(); } r; })
> struct inode_info;
> struct dentry_info;
>
> struct inode_info {
>     ino_t ino;
>     mode_t mode;
>     char handle[MAX_HANDLE_SZ];
>     set<struct dentry_info *> links;
>     map<string, struct dentry_info *> children; // for directory inodes
> };
>
> struct dentry_info {
>     struct inode_info *parent, *inode;
>     string name;
> };
>
>
> map<ino_t, inode_info*> inodes;
>
> int root_fd;
> int fan_fd;
>
> bool compare_handles(const void *h1, const void *h2) {
>     const struct file_handle *fh1 = (const struct file_handle*) h1;
>     const struct file_handle *fh2 = (const struct file_handle*) h2;
>     return (fh1->handle_bytes == fh2->handle_bytes
>                 && memcmp(h1, h2, fh1->handle_bytes) == 0);
> }
>
> bool handle_valid(void *handle) {
>     int check_fd = open_by_handle_at(root_fd, (struct file_handle*)handle, O_PATH);
>     if (check_fd >= 0) {
>         CHK(close(check_fd));
>         return true;
>     } else if (errno == ESTALE) {
>         return false;
>     } else {
>         perror("open_by_handle_at");
>         exit(1);
>     }
> }
>
> // Get the path corresponding to an inode (one of its paths, in the presence of
> // hardlinks).
> void inode_path(const struct inode_info *inode, char *buf, size_t bufsiz) {
>     list<string> components;
>     while (true) {
>         if (inode->links.empty()) break;
>         struct dentry_info *dentry = *inode->links.begin();
>         components.push_front(dentry->name);
>         inode = dentry->parent;
>     }
>     buf[0] = '\0';
>     for (auto name: components) {
>         int len = snprintf(buf, bufsiz, "/%s", name.c_str());
>         buf += len;
>         bufsiz -= len;
>     }
> }
>
>
> void delete_dentry(struct dentry_info *dentry) {
>     assert(dentry->parent->children[dentry->name] == dentry);
>
>     char path_buf[4096];
>     inode_path(dentry->parent, path_buf, sizeof(path_buf));
>     printf("unlinked %s/%s (ino %lu, parent %lu)\n", path_buf, dentry->name.c_str(),
>            dentry->inode->ino, dentry->parent->ino);
>
>     dentry->parent->children.erase(dentry->name.c_str());
>     dentry->inode->links.erase(dentry);
>     // TODO: If this was the last dentry pointing to an inode, schedule removing
>     //       the inode after a timeout (we cannot remove it immediately because
>     //       the zero-link situation might occur during a rename when the source
>     //       directory has been processed but the target directory hasn't).
>     delete dentry;
> }
>
> struct dentry_info *add_dentry(struct inode_info *parent, const char *name,
>                                 struct inode_info *child) {
>     struct dentry_info *dentry = new dentry_info();
>     dentry->parent = parent;
>     dentry->name = name;
>     dentry->inode = child;
>     parent->children[name] = dentry;
>     child->links.insert(dentry);
>
>     char path_buf[4096] = "\0";
>     inode_path(parent, path_buf, sizeof(path_buf));
>     printf("linked %s/%s (ino %lu, parent %lu)\n", path_buf, name, child->ino, parent->ino);
>
>     return dentry;
> }
>
> void delete_inode(struct inode_info *inode) {
>     for (auto dentry: inode->links) {
>         delete_dentry(dentry);
>     }
>     delete inode;
> }
>
> // Given a file descriptor, find the corresponding inode object in our database,
> // or create a new one if it does not exist. An O_PATH fd suffices.
> struct inode_info *find_inode(int fd) {
>     struct stat st;
>     CHK(fstat(fd, &st));
>     char handle[sizeof(struct file_handle) + MAX_HANDLE_SZ];
>     struct file_handle *fh = (struct file_handle*)handle;
>     fh->handle_bytes = sizeof(handle);
>     int mntid;
>     CHK(name_to_handle_at(fd, "", (struct file_handle*)handle, &mntid,
>                             AT_EMPTY_PATH));
>
>     struct inode_info *info = inodes[st.st_ino];
>     if (info) {
>         // Handles can refer to the same file despite not being equal.
>         // If the old handle can still be opened, we can be assured
>         // that the inode number has not been recycled.
>         if (compare_handles(handle, info->handle) || handle_valid(info->handle)) {
>             return info;
>         } else {
>             delete_inode(info);
>             info = NULL;
>         }
>     }
>
>     inodes[st.st_ino] = info = new inode_info();
>     info->ino = st.st_ino;
>     info->mode = st.st_mode;
>     memcpy(info->handle, handle, fh->handle_bytes);
>     return info;
> }
>
> // Scan directory and update internal filesystem representation accordingly.
> // Closes `dirfd`.
> void scan(int dirfd, bool recursive) {
>     struct inode_info *dir = find_inode(dirfd);
>
>     char path_buf[4096] = "\0";
>     inode_path(dir, path_buf, sizeof(path_buf));
>     printf("scan %s (%lu)\n", path_buf, dir->ino);
>
>     DIR *dp = CHKN(fdopendir(dirfd));
>     set<string> seen;
>     while (struct dirent *ent = readdir(dp)) {
>         if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) continue;
>         seen.insert(ent->d_name);
>         if (dir->children.find(ent->d_name) != dir->children.end()
>                 && dir->children[ent->d_name]->inode->ino == ent->d_ino) {
>             // Heuristic: It is massively unlikely that an inode number
>             // would be recylced at the same path as before. So if we
>             // see the same inode for the same child, we skip the more
>             // expensive checks altogether. This saves us a buttload of
>             // syscalls, especially given that most directory entries
>             // will be unchanged after a FAN_MODIFY_DIR.
>             //
>             // This can be skipped if strict correctness is preferred
>             // over speed.
>             continue;
>         }
>         int fd = openat(dirfd, ent->d_name, O_PATH|O_NOFOLLOW);
>         if (fd < 0) continue;
>         struct inode_info *child = find_inode(fd);
>         if (dir->children.find(ent->d_name) != dir->children.end()) {
>             struct dentry_info *old_dentry = dir->children[ent->d_name];
>             if (child != old_dentry->inode) {
>                 delete_dentry(old_dentry);
>                 add_dentry(dir, ent->d_name, child);
>             }
>         } else {
>             add_dentry(dir, ent->d_name, child);
>         }
>         if (recursive && S_ISDIR(child->mode)) {
>             // `fd' is just an O_PATH fd. For scanning we need O_RDONLY.
>             int scan_fd = CHK(openat(fd, ".", O_RDONLY|O_DIRECTORY));
>             scan(scan_fd, true); // closes scan_fd
>         }
>         close(fd);
>     }
>     for (auto it: dir->children) {
>         if (seen.find(it.second->name) == seen.end()) delete_dentry(it.second);
>     }
>     closedir(dp);
> }
>
> void event_loop() {
>     while (true) {
>         char buf[4096];
>         ssize_t len = CHK(read(fan_fd, buf, sizeof(buf)));
>         const struct fanotify_event_metadata *event;
>         event = (const struct fanotify_event_metadata*) buf;
>         while (FAN_EVENT_OK(event, len)) {
>             if (event->vers != FANOTIFY_METADATA_VERSION) abort();
>             if (event->mask & FAN_MODIFY_DIR) {
>                 scan(event->fd, false);
>             } else if (event->mask & FAN_Q_OVERFLOW) {
>                 abort(); // TODO: full rescan needed
>             } else {
>                 close(event->fd);
>             }
>             event = FAN_EVENT_NEXT(event, len);
>         }
>     }
> }
>
> int main(int argc, char **argv) {
>     if (argc != 2) { fprintf(stderr, "Usage: %s MOUNTPOINT\n", argv[0]); return 1; }
>
>     root_fd = CHK(open(argv[1], O_RDONLY|O_DIRECTORY));
>     // In a real application, FAN_UNLIMITED_QUEUE would be replaced with a secondary
>     // userspace queue filled during scanning.
>     fan_fd = CHK(fanotify_init(FAN_UNLIMITED_QUEUE, O_RDONLY));
>     CHK(fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_MODIFY_DIR|FAN_ONDIR,
>                         root_fd, NULL));
>
>     scan(dup(root_fd), true);
>
>     event_loop();
>
>     return 0;
> }
Filip Štědronský March 14, 2017, 12:41 p.m. UTC | #4
Hi,

On Tue, Mar 14, 2017 at 12:11:40PM +0200, Amir Goldstein wrote:
> >   - file system indexers / desktop search tools
> >   - file synchronization tools (like Dropbox, Nextcloud, etc.),
> >     online backup tools
> 
> This last one is the use case of my employer, Ctera Networks.
> Out of curiosity, what is the use case that you are focusing on?

I'm working on a file synchronization tool as part of my bachelor
thesis at Charles University.

When I started (now over a year ago ... long story), there were AFIK no
attempts at solving the recursive inotify issue, only a lot of
complaints, so I cobbled up together something simple (I'm not a kernel
developer by trade, this was my first patch) that would allow me to
work on the userspace parts, which are the main bulk.

I try to focus on algorithmic and implementation efficiency as opposed
to fancy GUIs and similar. I want it to be fast with 100k directories
and 10M files in home dir. But it's very WIP so we'll see how that turns
out.

> I had the feeling that all recursive inotify users are hiding in the shadows,
> but was missing more concrete evidence.

Yes, even Dropbox uses inotify. They have articles in their help on how
to increase inotify.max_user_watches: https://www.dropbox.com/help/145.
That's not good PR. ;-) (I'm not affiliated with DB, just pointing that
out.)

> About the argument of not having to change in-kernel framework,
> I don't think it should be a consideration at all.

Understood. I tried to stay conservative and non-controversial because I
imagined that radical framework changes would take months of discussions
(look at the history of statx) and this issue seems to be pressing for
quite a lot of people.  But rushing is of course not the best strategy
either, there are always compromises.

> If you don't specify FAN_EVENT_INFO_NAME, you can get filename events
> FAN_MOVE|FAN_CREATE|FAN_DELETE without the name.
> 
> What you do get is the file descriptor of the parent. sounds familiar? ;-)

I didn't notice this bit. That sounds like a win-win.

Filip
Filip Štědronský March 14, 2017, 1:46 p.m. UTC | #5
Hi,

On Tue, Mar 14, 2017 at 12:40:56PM +0200, Amir Goldstein wrote:
> An I am very happy that you used filehandles to keep track of objects
> in your example, because it fits my proposal really well.

you can read more about the rationales in the WIP draft of my thesis
(especially the chapter Identifying Inodes and its surroundings):

    http://regnarg.cz/tmp/thesis.pdf
    https://github.com/regnarg/bc

Either way, one can never use pathnames as identifiers because in
a world with parallel renames on multiple levels of the hierarchy
(between which no ordering is guaranteed unless they are cross-dir),
pathnames are not even a well-defined concept.

NB: I used a simple script for testing that does precisely this
(a lot of parallel within-directory renames at many levels):
    https://github.com/regnarg/bc/blob/master/experiments/fanotify/parallel_renamist.sh
It seems like a good stress test for any application that tries to
do reliable filesystem monitoring.

> See, if you used my proposed API, you would have had
> 
>       fan_fd = CHK(fanotify_init(FAN_UNLIMITED_QUEUE| \
>                              FAN_EVENT_INFO_PARENT | FAN_EVENT_INFO_FH,
>                              O_RDONLY));

Sure, I'll definitely try to implement your interface as an
optional backend and mention it in my thesis and definitely
give it some heavy testing.

> Furthermore, my proposal records the filehandle at the time of the event
> and therefore, does not have to keep an elevated refcount of the object
> in the events queue.

That sounds good but from what I've understood,  not all
filesystems support handles. Is this a concern? (Maybe the
right answer is to extend handle support...)

Filip
Amir Goldstein March 14, 2017, 1:55 p.m. UTC | #6
On Tue, Mar 14, 2017 at 2:41 PM, Filip Štědronský <r.lkml@regnarg.cz> wrote:
> Hi,
>
> On Tue, Mar 14, 2017 at 12:11:40PM +0200, Amir Goldstein wrote:
>> >   - file system indexers / desktop search tools
>> >   - file synchronization tools (like Dropbox, Nextcloud, etc.),
>> >     online backup tools
>>
>> This last one is the use case of my employer, Ctera Networks.
>> Out of curiosity, what is the use case that you are focusing on?
>
> I'm working on a file synchronization tool as part of my bachelor
> thesis at Charles University.
>
> When I started (now over a year ago ... long story), there were AFIK no
> attempts at solving the recursive inotify issue, only a lot of
> complaints, so I cobbled up together something simple (I'm not a kernel
> developer by trade, this was my first patch) that would allow me to
> work on the userspace parts, which are the main bulk.
>

That sounds very useful. I was actually looking for an open userspace
implementation were I could demonstrate my patches, but all the projects
I looked at (beagle etc..) seems to have died from desperation waiting
for intotify scalabiltiy to be fixed...

> I try to focus on algorithmic and implementation efficiency as opposed
> to fancy GUIs and similar. I want it to be fast with 100k directories
> and 10M files in home dir. But it's very WIP so we'll see how that turns
> out.
>

I did some basic tests of my super block watch with 1M directories,
and currently the code is under integration testing in our company.

I prepared a drop-in replacement for our fs change listener class
that was previously implemented with inotify.
Currently, the userspace code still "thinks" it is adding watches on directories
recursively, but what it really does is just gets all their file handles.
Then, the code that maps an event to "watch descriptor" actually matches
the fhandle in the event to the directory fhandle (much like in your example).

If you are not married to using your kernel patch, you can just go a head and
try my patch set.
You can fetch it directly from:
https://github.com/amir73il/linux/tree/fanotify_sb

It has been recently rebased over v4.11-rc1.

There are also ready ports for stable kernels v4.1 v4.4 and v4.9
in my github if that serves you better.

Marko Rauhamaa [CCed] has also tested my patches to his satisfaction.


>> I had the feeling that all recursive inotify users are hiding in the shadows,
>> but was missing more concrete evidence.
>
> Yes, even Dropbox uses inotify. They have articles in their help on how
> to increase inotify.max_user_watches: https://www.dropbox.com/help/145.
> That's not good PR. ;-) (I'm not affiliated with DB, just pointing that
> out.)
>
>> About the argument of not having to change in-kernel framework,
>> I don't think it should be a consideration at all.
>
> Understood. I tried to stay conservative and non-controversial because I
> imagined that radical framework changes would take months of discussions
> (look at the history of statx) and this issue seems to be pressing for
> quite a lot of people.  But rushing is of course not the best strategy
> either, there are always compromises.
>

Don't get me wrong, I certainly do *understand* why you would want to stay
away from making radical framework changes.
But I believe it is the best interest of the community to choose the
best solution
going forward while eliminating the fear-of-change factor.
This is why I say that fear of change SHOULD not be a consideration.

>> If you don't specify FAN_EVENT_INFO_NAME, you can get filename events
>> FAN_MOVE|FAN_CREATE|FAN_DELETE without the name.
>>
>> What you do get is the file descriptor of the parent. sounds familiar? ;-)
>
> I didn't notice this bit. That sounds like a win-win.
>

Well, I may me giving you half the truth here.
With current patches not specifying FAN_EVENT_INFO_NAME
will mask out the filename events from the mark, but
1. The implementation could be easily changed
2. You can set FAN_EVENT_INFO_NAME and ignore the
    filename info in userspace. You still get the parent fd
    as your program expects and you get a bonus fhandle
    of the parent for free with the event, see my test program:

github.com/amir73il/fsnotify-utils/blob/master/src/test/fanotify_demo.c#L101

Please let me know if that is sufficient for your needs
or if you need me to prepare a version that delivers filename events
without filename info, therefore allowing to merge filename events.

Sounds to me like if you get an event FAN_DELETE, "aaa",
your implementation can update the db directly without having
to scan the directory, so it should be useful.
For your consideration.

Amir.
Filip Štědronský March 14, 2017, 2:48 p.m. UTC | #7
Hi,

On Tue, Mar 14, 2017 at 03:55:19PM +0200, Amir Goldstein wrote:
> Please let me know if that is sufficient for your needs
> or if you need me to prepare a version that delivers filename events
> without filename info, therefore allowing to merge filename events.
> 
> Sounds to me like if you get an event FAN_DELETE, "aaa",
> your implementation can update the db directly without having
> to scan the directory, so it should be useful.
> For your consideration.

both approaches might be useful and there are tradeoffs. Direct updates
save us from rescanning but give more events and more context switches.

On the other hand, with an unlimited queue and well-mergable events I
could e.g. sleep for five seconds each time after emptying the queue,
thus giving the events more potential to merge and reducing context
switches.

But one risks blowing up the queue. However, I have some ideas.
Basically we could implement some kind of "bulk read" mode for fanotify
that would pass events to userspace only when
(a) a given event count thresold is hit (e.g. half the queue limit
    if queue is limited), or
(b) the oldest event is older than a specified maximum latency. That
    might vary from 1 second to 5 minutes depending on specific
    application needs (e.g. background backup does not need to be
    low-latency).

This would greatly reduce extra context switches when watching
a large portion (or whole) of the file system. All of this presumably
could be implemented on top of your suggested patches.

Either way, I suggest that implementing the nameless filename events
is a good idea. I do not know whether they will be the best choice
for my application but they probably will be useful for some
applications.

Filip
Amir Goldstein March 14, 2017, 3:07 p.m. UTC | #8
On Tue, Mar 14, 2017 at 3:46 PM, Filip Štědronský <r.lkml@regnarg.cz> wrote:
> Hi,
>
> On Tue, Mar 14, 2017 at 12:40:56PM +0200, Amir Goldstein wrote:
>> An I am very happy that you used filehandles to keep track of objects
>> in your example, because it fits my proposal really well.
>
> you can read more about the rationales in the WIP draft of my thesis
> (especially the chapter Identifying Inodes and its surroundings):
>
>     http://regnarg.cz/tmp/thesis.pdf
>     https://github.com/regnarg/bc
>
> Either way, one can never use pathnames as identifiers because in
> a world with parallel renames on multiple levels of the hierarchy
> (between which no ordering is guaranteed unless they are cross-dir),
> pathnames are not even a well-defined concept.
>
> NB: I used a simple script for testing that does precisely this
> (a lot of parallel within-directory renames at many levels):
>     https://github.com/regnarg/bc/blob/master/experiments/fanotify/parallel_renamist.sh
> It seems like a good stress test for any application that tries to
> do reliable filesystem monitoring.

I agree. not only to filesystem monitoring.
filesystems could use a test like this in general.
It would be good to stress test overlayfs dir rename like this.

>
>> See, if you used my proposed API, you would have had
>>
>>       fan_fd = CHK(fanotify_init(FAN_UNLIMITED_QUEUE| \
>>                              FAN_EVENT_INFO_PARENT | FAN_EVENT_INFO_FH,
>>                              O_RDONLY));
>
> Sure, I'll definitely try to implement your interface as an
> optional backend and mention it in my thesis and definitely
> give it some heavy testing.
>
>> Furthermore, my proposal records the filehandle at the time of the event
>> and therefore, does not have to keep an elevated refcount of the object
>> in the events queue.
>
> That sounds good but from what I've understood,  not all
> filesystems support handles. Is this a concern? (Maybe the
> right answer is to extend handle support...)
>

Very interesting question.
Extending fhandle support is not always easy.

The majority of local file system do support fhandles,
because they are required for exporting a file system over NFS.
see: git grep -lw s_export_op fs/

The reason that some file systems "don't support" fhandles
is usually because they either do not have an efficient way to implement
open_by_handle() or because they do not have a way to encode a unique
and persistent (over reboot) identifer for name_to_handle().

An example of a commonly used file system that currently does not support
file handles is overlayfs.

The interesting thing to note wrt fanotify and overlayfs is that while it is
technically hard to implement open_by_handle() for overlayfs, it is quite
trivial to implement name_to_handle() for the special common case of all
layers on the same underlying fs (e.g. docker).
The trivial implementation (at least for directories) is to return fhandle
of lower most dir in case of lower or merge dir and fhandle of upper dir
otherwise.

I don't think there is any filesystem that implements  name_to_handle()
without implementing open_by_handle(), but technically it is possible.
And for fanotify FAN_EVENT_INFO_FH only the former is required.

But in any case, FAN_EVENT_INFO_FH is optional.
If your userspace implementation can settle with parent fd information
then filesystems that do not support fhandles can use this mode.

Amir.
Amir Goldstein March 14, 2017, 10:30 p.m. UTC | #9
On Tue, Mar 14, 2017 at 4:48 PM, Filip Štědronský <r.lkml@regnarg.cz> wrote:
> Hi,
>
> On Tue, Mar 14, 2017 at 03:55:19PM +0200, Amir Goldstein wrote:
>> Please let me know if that is sufficient for your needs
>> or if you need me to prepare a version that delivers filename events
>> without filename info, therefore allowing to merge filename events.
>>
>> Sounds to me like if you get an event FAN_DELETE, "aaa",
>> your implementation can update the db directly without having
>> to scan the directory, so it should be useful.
>> For your consideration.
>
> both approaches might be useful and there are tradeoffs. Direct updates
> save us from rescanning but give more events and more context switches.
>
> On the other hand, with an unlimited queue and well-mergable events I
> could e.g. sleep for five seconds each time after emptying the queue,
> thus giving the events more potential to merge and reducing context
> switches.
>
> But one risks blowing up the queue. However, I have some ideas.
> Basically we could implement some kind of "bulk read" mode for fanotify
> that would pass events to userspace only when
> (a) a given event count thresold is hit (e.g. half the queue limit
>     if queue is limited), or
> (b) the oldest event is older than a specified maximum latency. That
>     might vary from 1 second to 5 minutes depending on specific
>     application needs (e.g. background backup does not need to be
>     low-latency).
>
> This would greatly reduce extra context switches when watching
> a large portion (or whole) of the file system. All of this presumably
> could be implemented on top of your suggested patches.
>
> Either way, I suggest that implementing the nameless filename events
> is a good idea. I do not know whether they will be the best choice
> for my application but they probably will be useful for some
> applications.
>

Done and pushed all branches.
Added 2 new patches to the fanotify_dentry set:
ce0c223 fanotify: allow merge of filename events
cd872d1 fanotify: make filename info optional for filename events

You should test with branch fanotify_sb, using super block watch,
because filename events are not supported in mount scope.
You can follow user API used by my fanotify_demo.c program.

Tested with my fanotify_demo test
both without FAN_EVENT_INFO_NAME and without FAN_EVENT_INFO_FH.

Without FAN_EVENT_INFO_NAME more filename events are merged,
but in my test that runs fs ops by a script, most events do not get merged
because of different pid value.

I guess that for the filesync/indexer use case, we would need to add a new
FAN_EVENT_INFO_NOPID flag to loose the irrelevant pid information
in order to be able to merge a lot more filename event on a directory.

I also plan to add FAN_EVENT_INFO_NOFD, to get rid of redundant
fd (when INFO_FH is available) and then be able to drop the refcount
on the victim object.

Cheers,
Amir.
Michael Kerrisk (man-pages) March 15, 2017, 4:52 a.m. UTC | #10
[CC += linux-api@vger.kernel.org]

Filip,

Since this is a kernel-user-space API change, please CC linux-api@
(and on future iterations of this patch). The kernel source file
Documentation/SubmitChecklist notes that all Linux kernel patches that
change userspace interfaces should be CCed to
linux-api@vger.kernel.org, so that the various parties who are
interested in API changes are informed. For further information, see
https://www.kernel.org/doc/man-pages/linux-api-ml.html

Thanks,

Michael



On Tue, Mar 14, 2017 at 12:02 AM, Filip Štědronský <r.lklm@regnarg.cz> wrote:
> Fanotify currently does not report directory modification events
> (rename, unlink, etc.). But these events are essential for about half of
> concievable fanotify use cases, especially:
>
>   - file system indexers / desktop search tools
>   - file synchronization tools (like Dropbox, Nextcloud, etc.),
>     online backup tools
>
> and pretty much any app that needs to maintain and up-to-date internal
> representation of current contents of the file system.
>
> All applications of the above classes that I'm aware of currently use
> recursive inotify watches, which do not scale (my home dir has ~200k
> directories, creating all the watches takes ~2min and eats several tens
> of MB of kernel memory).
>
> There have been many calls for such a feature, pretty much since the
> creation of fanotify in 2009:
>   * By GNOME developers:
>     https://wiki.gnome.org/BastienNocera/KernelWishlist#VFS.2C_filesystems
>   * By KDE developers:
>     http://lkml.kernel.org/r/201211011352.42476.Martin%40lichtvoll.de
>     'Better support for (desktop) file search / indexing applications'
>   * And others:
>     http://lkml.kernel.org/r/AANLkTi=owK=WZW4oNtpm5WpAZhqCQUdTR2K5gzJ_MqZ+%40mail.gmail.com
>     'Fanotify mv/rename?'
>     http://lkml.kernel.org/r/1238158043.23703.20.camel%40fatty
>     'Issues with using fanotify for a filesystem indexer'
>
> Add a new fanotify event, FAN_MODIFY_DIR, that is emitted whenever the
> contents of a directory change (a directory entry is added, removed or
> renamed). This covers all the currently missing events: rename, unlink,
> mknod, mkdir, rmdir, etc.
>
> Included with the event is a file descriptor to the modified directory
> but no further info. The userspace application is expected to rescan the
> whole directory and update its model accordingly. This needs to be done
> carefully to prevent race conditions. A cross-directory rename generates
> two FAN_MODIFY_DIR events.
>
> This minimalistic approach has several advantages:
>   * Does not require changes to the fanotify userspace API or the
>     fsnotify in-kernel framework, apart from adding a new event.
>     Especially doesn't complicate it by adding string fields.
>   * Has simple and clear semantics, even with multiple renames occurring
>     in parallel etc. In case of any inconsistencies, one can simply wait
>     for a while and rescan again.
>   * FAN_MODIFY_DIR events are easily merged in case of multiple
>     operations on a directory (e.g. deleting all files).
>
> Signed-off-by: Filip Štědronský <r.lkml@regnarg.cz>
>
> ---
>
> An alternative might be to create wrapper functions like
> vfs_path_(rename|unlink|...). They could also take care of calling
> security_path_(rename|unlink|...), which is currently also up to
> the indvidual callers (possibly with a flag because it might not
> be always desired).
>
> An alternative was proposed by Amir Goldstein in several long series of
> patches that add superblock-scoped (as opposed to vfsmount-scoped)
> fanotify watches and specific dentry manipulation events with filenames:
>
>     http://lkml.kernel.org/r/1481984434-13293-1-git-send-email-amir73il%40gmail.com
>     http://lkml.kernel.org/r/1482247207-4424-1-git-send-email-amir73il%40gmail.com
>     http://lkml.kernel.org/r/1476126784-12520-1-git-send-email-amir73il%40gmail.com
>     http://lkml.kernel.org/r/1489411223-12081-1-git-send-email-amir73il%40gmail.com
>
> There is large but not complete overlap between that proposal and
> mine (which is was originally created over a year ago, before Amir's
> work, but never posted).
>
> I think the superblock watch idea is very interesting because it might
> in the future allow reporing fsnotify events from remote and virtual
> filesystems. So I'm posting this more as a potential source of more
> ideas for discussion, or a fallback proposal in case Amir's patches
> don't make it.
> ---
>  fs/notify/fanotify/fanotify.c    |  1 +
>  include/linux/fsnotify.h         | 17 +++++++++++++++++
>  include/linux/fsnotify_backend.h |  1 +
>  include/uapi/linux/fanotify.h    |  5 ++++-
>  4 files changed, 23 insertions(+), 1 deletion(-)
>
> diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
> index bbc175d4213d..5178b06c338c 100644
> --- a/fs/notify/fanotify/fanotify.c
> +++ b/fs/notify/fanotify/fanotify.c
> @@ -186,6 +186,7 @@ static int fanotify_handle_event(struct fsnotify_group *group,
>
>         BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
>         BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
> +       BUILD_BUG_ON(FAN_MODIFY_DIR != FS_MODIFY_DIR);
>         BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
>         BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
>         BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
> diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
> index b43d3f5bd9ea..00fb87c975d6 100644
> --- a/include/linux/fsnotify.h
> +++ b/include/linux/fsnotify.h
> @@ -208,6 +208,23 @@ static inline void fsnotify_modify(struct file *file)
>  }
>
>  /*
> + * fsnotify_modifydir - directory contents were changed
> + * (as a result of rename, creat, unlink, etc.)
> + */
> +static inline void fsnotify_modify_dir(struct path *path)
> +{
> +       struct inode *inode = path->dentry->d_inode;
> +       __u32 mask = FS_MODIFY_DIR;
> +
> +       if (S_ISDIR(inode->i_mode))
> +               mask |= FS_ISDIR;
> +       else
> +               return;
> +
> +       fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
> +}
> +
> +/*
>   * fsnotify_open - file was opened
>   */
>  static inline void fsnotify_open(struct file *file)
> diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> index 487246546ebe..7751b337ec31 100644
> --- a/include/linux/fsnotify_backend.h
> +++ b/include/linux/fsnotify_backend.h
> @@ -42,6 +42,7 @@
>
>  #define FS_OPEN_PERM           0x00010000      /* open event in an permission hook */
>  #define FS_ACCESS_PERM         0x00020000      /* access event in a permissions hook */
> +#define FS_MODIFY_DIR          0x00040000      /* directory changed (rename/unlink/...) */
>
>  #define FS_EXCL_UNLINK         0x04000000      /* do not send events if object is unlinked */
>  #define FS_ISDIR               0x40000000      /* event occurred against dir */
> diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
> index 030508d195d3..f14e048d492a 100644
> --- a/include/uapi/linux/fanotify.h
> +++ b/include/uapi/linux/fanotify.h
> @@ -15,6 +15,8 @@
>  #define FAN_OPEN_PERM          0x00010000      /* File open in perm check */
>  #define FAN_ACCESS_PERM                0x00020000      /* File accessed in perm check */
>
> +#define FAN_MODIFY_DIR         0x00040000      /* directory changed (rename/unlink/...) */
> +
>  #define FAN_ONDIR              0x40000000      /* event occurred against dir */
>
>  #define FAN_EVENT_ON_CHILD     0x08000000      /* interested in child events */
> @@ -67,7 +69,8 @@
>  #define FAN_ALL_EVENTS (FAN_ACCESS |\
>                         FAN_MODIFY |\
>                         FAN_CLOSE |\
> -                       FAN_OPEN)
> +                       FAN_OPEN |\
> +                       FAN_MODIFY_DIR)
>
>  /*
>   * All events which require a permission response from userspace
> --
> 2.11.1
>
Jan Kara March 15, 2017, 2:05 p.m. UTC | #11
Hello,

On Tue 14-03-17 12:11:40, Amir Goldstein wrote:
> > Add a new fanotify event, FAN_MODIFY_DIR, that is emitted whenever the
> > contents of a directory change (a directory entry is added, removed or
> > renamed). This covers all the currently missing events: rename, unlink,
> > mknod, mkdir, rmdir, etc.
> >
> > Included with the event is a file descriptor to the modified directory
> > but no further info. The userspace application is expected to rescan the
> > whole directory and update its model accordingly. This needs to be done
> > carefully to prevent race conditions. A cross-directory rename generates
> > two FAN_MODIFY_DIR events.
> >
> 
> Your approach is interesting and I am glad you shared it with us.
> I do like it and it gives me an idea, I am going to prepare a branch
> with a subset
> of my patches, so you can try them with your userspace sample program.
> 
> In comments to your patches I am going to argue that, as a matter of fact,
> you can take a small sub set of my patches and get the same functionality
> of FAN_MODIFY_DIR, with code that is *less* complex then what you propose.
> So please treat my comments as technical comments how FAN_MODIFY_DIR
> should be implemented IMO and not as an attempt to reject an opposing proposal.

Yeah, so I have yet to read both your patch sets in detail (good news is
that this is getting towards top of my TODO stack and I think I'll do this
when flying to LSF/MM this weekend). With Filip's design I like the
minimalistic approach of adding just DIR_CHANGED event with (albeit
optional) messing with names. That just seems to fit well with the fanotify
design.

> > This minimalistic approach has several advantages:
> >   * Does not require changes to the fanotify userspace API or the
> >     fsnotify in-kernel framework, apart from adding a new event.
> 
> About the argument of not having to change in-kernel framework,
> I don't think it should be a consideration at all.
> I claim that my 1st fsnotify cleanup series is an improvement of the framework
> (https://github.com/amir73il/linux/commits/fsnotify_dentry)
> regardless of additional functionality.
> So changing the in-kernel framework by adding complexity may be bad,
> but changing it by simplifying and making code more readable should not
> be an argument against the "non-minimalistic" approach.
> 
> About the change to usespace API, if you strip off the *added* functionality
> of super block watch from my patches, your proposal for user API looks
> like this:
> 
>     fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, \
>                            FAN_MODIFY_DIR|FAN_ONDIR,
> 
> And my proposal for user API looks like this:
> 
>     fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, \
>                            FAN_MOVE|FAN_CREATE|FAN_DELETE,
> 
> You can see why my proposal for user API is not any less minimalistic
> and it is fully compatible with existing intotify API for the same events.
> 
> I understand why it is hard to see this behind my added functionality,
> so I will post a minimalistic branch and test program as an example.
> 
> >     Especially doesn't complicate it by adding string fields.
> 
> So the string fields in my proposal are optional.
> userspace opts-in to get them by specifying the  FAN_EVENT_INFO_NAME flag:
> 
>     fanotify_init(FAN_CLOEXEC | FAN_CLASS_NOTIF |
>                        FAN_EVENT_INFO_PARENT | FAN_EVENT_INFO_NAME,
> 
> If you don't specify FAN_EVENT_INFO_NAME, you can get filename events
> FAN_MOVE|FAN_CREATE|FAN_DELETE without the name.

So when thinking about this again, I'm again concerned that the names need
not tell userspace what it thinks. I know we already discussed this but
in our last discussion I think I forgot to point out that inotify directory
events (and fanotify would be the same AFAICT) may come out of order compared
to real filesystem changes. E.g. sequence:

Task 1		Task 2

mv a b
		mv b c

may come out of inotify as:

IN_MOVED_FROM "b" COOKIE 1
IN_MOVED_TO   "c" COOKIE 1
IN_MOVED_FROM "a" COOKIE 2
IN_MOVED_TO   "b" COOKIE 2

and if user program just blindly belives this sequence its internal
representation of the filesystem will be different from the real state of
the filesystem.

So for now I'm more inclined towards the trivial approach (possibly using
your patches and stripping additional functionality from them). But I'll
leave final decision to when I'll be able to read everything in detail.

								Honza
Amir Goldstein March 15, 2017, 2:34 p.m. UTC | #12
On Wed, Mar 15, 2017 at 4:05 PM, Jan Kara <jack@suse.cz> wrote:
> Hello,
>
> On Tue 14-03-17 12:11:40, Amir Goldstein wrote:
>> > Add a new fanotify event, FAN_MODIFY_DIR, that is emitted whenever the
>> > contents of a directory change (a directory entry is added, removed or
>> > renamed). This covers all the currently missing events: rename, unlink,
>> > mknod, mkdir, rmdir, etc.
>> >
>> > Included with the event is a file descriptor to the modified directory
>> > but no further info. The userspace application is expected to rescan the
>> > whole directory and update its model accordingly. This needs to be done
>> > carefully to prevent race conditions. A cross-directory rename generates
>> > two FAN_MODIFY_DIR events.
>> >
>>
>> Your approach is interesting and I am glad you shared it with us.
>> I do like it and it gives me an idea, I am going to prepare a branch
>> with a subset
>> of my patches, so you can try them with your userspace sample program.
>>
>> In comments to your patches I am going to argue that, as a matter of fact,
>> you can take a small sub set of my patches and get the same functionality
>> of FAN_MODIFY_DIR, with code that is *less* complex then what you propose.
>> So please treat my comments as technical comments how FAN_MODIFY_DIR
>> should be implemented IMO and not as an attempt to reject an opposing proposal.
>
> Yeah, so I have yet to read both your patch sets in detail (good news is
> that this is getting towards top of my TODO stack and I think I'll do this
> when flying to LSF/MM this weekend). With Filip's design I like the
> minimalistic approach of adding just DIR_CHANGED event with (albeit
> optional) messing with names. That just seems to fit well with the fanotify
> design.
>
>> > This minimalistic approach has several advantages:
>> >   * Does not require changes to the fanotify userspace API or the
>> >     fsnotify in-kernel framework, apart from adding a new event.
>>
>> About the argument of not having to change in-kernel framework,
>> I don't think it should be a consideration at all.
>> I claim that my 1st fsnotify cleanup series is an improvement of the framework
>> (https://github.com/amir73il/linux/commits/fsnotify_dentry)
>> regardless of additional functionality.
>> So changing the in-kernel framework by adding complexity may be bad,
>> but changing it by simplifying and making code more readable should not
>> be an argument against the "non-minimalistic" approach.
>>
>> About the change to usespace API, if you strip off the *added* functionality
>> of super block watch from my patches, your proposal for user API looks
>> like this:
>>
>>     fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, \
>>                            FAN_MODIFY_DIR|FAN_ONDIR,
>>
>> And my proposal for user API looks like this:
>>
>>     fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, \
>>                            FAN_MOVE|FAN_CREATE|FAN_DELETE,
>>
>> You can see why my proposal for user API is not any less minimalistic
>> and it is fully compatible with existing intotify API for the same events.
>>
>> I understand why it is hard to see this behind my added functionality,
>> so I will post a minimalistic branch and test program as an example.
>>
>> >     Especially doesn't complicate it by adding string fields.
>>
>> So the string fields in my proposal are optional.
>> userspace opts-in to get them by specifying the  FAN_EVENT_INFO_NAME flag:
>>
>>     fanotify_init(FAN_CLOEXEC | FAN_CLASS_NOTIF |
>>                        FAN_EVENT_INFO_PARENT | FAN_EVENT_INFO_NAME,
>>
>> If you don't specify FAN_EVENT_INFO_NAME, you can get filename events
>> FAN_MOVE|FAN_CREATE|FAN_DELETE without the name.
>
> So when thinking about this again, I'm again concerned that the names need
> not tell userspace what it thinks. I know we already discussed this but
> in our last discussion I think I forgot to point out that inotify directory
> events (and fanotify would be the same AFAICT) may come out of order compared
> to real filesystem changes. E.g. sequence:
>
> Task 1          Task 2
>
> mv a b
>                 mv b c
>
> may come out of inotify as:
>
> IN_MOVED_FROM "b" COOKIE 1
> IN_MOVED_TO   "c" COOKIE 1
> IN_MOVED_FROM "a" COOKIE 2
> IN_MOVED_TO   "b" COOKIE 2
>

Really? How come?

fsnotify_move() inside vfs_rename() is serialized with lock_rename()

I should use this opportunity to note that fanotify event does not have
a cookie field and that I did not add one to my implementation, so that
is a problem.

> and if user program just blindly belives this sequence its internal
> representation of the filesystem will be different from the real state of
> the filesystem.
>
> So for now I'm more inclined towards the trivial approach (possibly using
> your patches and stripping additional functionality from them). But I'll
> leave final decision to when I'll be able to read everything in detail.
>

The addition of filename info is completely optional and the relevant patch
("fanotify: pass filename info for filename events")
Can be yanked out of the series and pushed back to the 'maybe' pile.

Amir.
Jan Kara March 16, 2017, 10:38 a.m. UTC | #13
On Wed 15-03-17 16:34:11, Amir Goldstein wrote:
> On Wed, Mar 15, 2017 at 4:05 PM, Jan Kara <jack@suse.cz> wrote:
> >> So the string fields in my proposal are optional.
> >> userspace opts-in to get them by specifying the  FAN_EVENT_INFO_NAME flag:
> >>
> >>     fanotify_init(FAN_CLOEXEC | FAN_CLASS_NOTIF |
> >>                        FAN_EVENT_INFO_PARENT | FAN_EVENT_INFO_NAME,
> >>
> >> If you don't specify FAN_EVENT_INFO_NAME, you can get filename events
> >> FAN_MOVE|FAN_CREATE|FAN_DELETE without the name.
> >
> > So when thinking about this again, I'm again concerned that the names need
> > not tell userspace what it thinks. I know we already discussed this but
> > in our last discussion I think I forgot to point out that inotify directory
> > events (and fanotify would be the same AFAICT) may come out of order compared
> > to real filesystem changes. E.g. sequence:
> >
> > Task 1          Task 2
> >
> > mv a b
> >                 mv b c
> >
> > may come out of inotify as:
> >
> > IN_MOVED_FROM "b" COOKIE 1
> > IN_MOVED_TO   "c" COOKIE 1
> > IN_MOVED_FROM "a" COOKIE 2
> > IN_MOVED_TO   "b" COOKIE 2
> >
> 
> Really? How come?
> 
> fsnotify_move() inside vfs_rename() is serialized with lock_rename()

Sorry, I got confused by the unlocking in vfs_rename() however that unlocks
only the moved file / directory but not the directory in which the rename
is happening. Taking my objection back.

								Honza
Amir Goldstein March 20, 2017, 12:10 p.m. UTC | #14
On Tue, Mar 14, 2017 at 9:46 AM, Filip Štědronský <r.lkml@regnarg.cz> wrote:
> Hi,
>
> On Tue, Mar 14, 2017 at 12:40:56PM +0200, Amir Goldstein wrote:
>> An I am very happy that you used filehandles to keep track of objects
>> in your example, because it fits my proposal really well.
>
> you can read more about the rationales in the WIP draft of my thesis
> (especially the chapter Identifying Inodes and its surroundings):
>
>     http://regnarg.cz/tmp/thesis.pdf
>     https://github.com/regnarg/bc
>

Filip,

I read your draft and it's very interesting.
Looking forward for the unfinished chapters ;-)

open_by_handle() is important for your implementation in order
to be able to get inode number (unique object id) from the file handle.
In practice, although file handles are opaque descriptors, they usually
contain the inode number information is a well known format.

If it will be possible to take those formats and make them standard ABI,
or alternatively, to call a new get_inum_by_handle() syscall, then
the indexer/file sync won't need to have the extra privileges needed
for  open_by_handle() and file systems that can provide "view only"
file handles (e.g. overlayfs) could be used with the indexer.

Apropos chapter 1.3 "Filesystem based change detection"
It's on my todo list to try and implement something with XFS
using the LSN information embedded in every inode.
It would be great if we can collaborate on this in the future.

Amir.
diff mbox

Patch

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index bbc175d4213d..5178b06c338c 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -186,6 +186,7 @@  static int fanotify_handle_event(struct fsnotify_group *group,
 
 	BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
 	BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
+	BUILD_BUG_ON(FAN_MODIFY_DIR != FS_MODIFY_DIR);
 	BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
 	BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE);
 	BUILD_BUG_ON(FAN_OPEN != FS_OPEN);
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index b43d3f5bd9ea..00fb87c975d6 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -208,6 +208,23 @@  static inline void fsnotify_modify(struct file *file)
 }
 
 /*
+ * fsnotify_modifydir - directory contents were changed
+ * (as a result of rename, creat, unlink, etc.)
+ */
+static inline void fsnotify_modify_dir(struct path *path)
+{
+	struct inode *inode = path->dentry->d_inode;
+	__u32 mask = FS_MODIFY_DIR;
+
+	if (S_ISDIR(inode->i_mode))
+		mask |= FS_ISDIR;
+	else
+		return;
+
+	fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
+}
+
+/*
  * fsnotify_open - file was opened
  */
 static inline void fsnotify_open(struct file *file)
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 487246546ebe..7751b337ec31 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -42,6 +42,7 @@ 
 
 #define FS_OPEN_PERM		0x00010000	/* open event in an permission hook */
 #define FS_ACCESS_PERM		0x00020000	/* access event in a permissions hook */
+#define FS_MODIFY_DIR		0x00040000	/* directory changed (rename/unlink/...) */
 
 #define FS_EXCL_UNLINK		0x04000000	/* do not send events if object is unlinked */
 #define FS_ISDIR		0x40000000	/* event occurred against dir */
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index 030508d195d3..f14e048d492a 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -15,6 +15,8 @@ 
 #define FAN_OPEN_PERM		0x00010000	/* File open in perm check */
 #define FAN_ACCESS_PERM		0x00020000	/* File accessed in perm check */
 
+#define FAN_MODIFY_DIR		0x00040000	/* directory changed (rename/unlink/...) */
+
 #define FAN_ONDIR		0x40000000	/* event occurred against dir */
 
 #define FAN_EVENT_ON_CHILD	0x08000000	/* interested in child events */
@@ -67,7 +69,8 @@ 
 #define FAN_ALL_EVENTS (FAN_ACCESS |\
 			FAN_MODIFY |\
 			FAN_CLOSE |\
-			FAN_OPEN)
+			FAN_OPEN |\
+			FAN_MODIFY_DIR)
 
 /*
  * All events which require a permission response from userspace