Message ID | a0fa4e7890ea16e7f90a17524e817755f7dde8b5.1489445257.git.p@regnarg.cz (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
An example userspace program that uses FAN_MODIFY_DIR to reliably keep an up-to-date internal representation of the file system. It uses some filehandle trickery to identify inodes, other heuristics could be also used. --- //#define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <sys/fanotify.h> #include <stdint.h> #include <dirent.h> #include <assert.h> #include <string.h> #include <map> #include <set> #include <list> using namespace std; #ifndef FAN_MODIFY_DIR #define FAN_MODIFY_DIR 0x00040000 #endif // die-on-error helpers #define CHK(x) ({ __typeof__(x) r = x; if (r == -1) { perror(#x); abort(); } r; }) #define CHKN(x) ({ __typeof__(x) r = x; if (r == NULL) { perror(#x); abort(); } r; }) struct inode_info; struct dentry_info; struct inode_info { ino_t ino; mode_t mode; char handle[MAX_HANDLE_SZ]; set<struct dentry_info *> links; map<string, struct dentry_info *> children; // for directory inodes }; struct dentry_info { struct inode_info *parent, *inode; string name; }; map<ino_t, inode_info*> inodes; int root_fd; int fan_fd; bool compare_handles(const void *h1, const void *h2) { const struct file_handle *fh1 = (const struct file_handle*) h1; const struct file_handle *fh2 = (const struct file_handle*) h2; return (fh1->handle_bytes == fh2->handle_bytes && memcmp(h1, h2, fh1->handle_bytes) == 0); } bool handle_valid(void *handle) { int check_fd = open_by_handle_at(root_fd, (struct file_handle*)handle, O_PATH); if (check_fd >= 0) { CHK(close(check_fd)); return true; } else if (errno == ESTALE) { return false; } else { perror("open_by_handle_at"); exit(1); } } // Get the path corresponding to an inode (one of its paths, in the presence of // hardlinks). void inode_path(const struct inode_info *inode, char *buf, size_t bufsiz) { list<string> components; while (true) { if (inode->links.empty()) break; struct dentry_info *dentry = *inode->links.begin(); components.push_front(dentry->name); inode = dentry->parent; } buf[0] = '\0'; for (auto name: components) { int len = snprintf(buf, bufsiz, "/%s", name.c_str()); buf += len; bufsiz -= len; } } void delete_dentry(struct dentry_info *dentry) { assert(dentry->parent->children[dentry->name] == dentry); char path_buf[4096]; inode_path(dentry->parent, path_buf, sizeof(path_buf)); printf("unlinked %s/%s (ino %lu, parent %lu)\n", path_buf, dentry->name.c_str(), dentry->inode->ino, dentry->parent->ino); dentry->parent->children.erase(dentry->name.c_str()); dentry->inode->links.erase(dentry); // TODO: If this was the last dentry pointing to an inode, schedule removing // the inode after a timeout (we cannot remove it immediately because // the zero-link situation might occur during a rename when the source // directory has been processed but the target directory hasn't). delete dentry; } struct dentry_info *add_dentry(struct inode_info *parent, const char *name, struct inode_info *child) { struct dentry_info *dentry = new dentry_info(); dentry->parent = parent; dentry->name = name; dentry->inode = child; parent->children[name] = dentry; child->links.insert(dentry); char path_buf[4096] = "\0"; inode_path(parent, path_buf, sizeof(path_buf)); printf("linked %s/%s (ino %lu, parent %lu)\n", path_buf, name, child->ino, parent->ino); return dentry; } void delete_inode(struct inode_info *inode) { for (auto dentry: inode->links) { delete_dentry(dentry); } delete inode; } // Given a file descriptor, find the corresponding inode object in our database, // or create a new one if it does not exist. An O_PATH fd suffices. struct inode_info *find_inode(int fd) { struct stat st; CHK(fstat(fd, &st)); char handle[sizeof(struct file_handle) + MAX_HANDLE_SZ]; struct file_handle *fh = (struct file_handle*)handle; fh->handle_bytes = sizeof(handle); int mntid; CHK(name_to_handle_at(fd, "", (struct file_handle*)handle, &mntid, AT_EMPTY_PATH)); struct inode_info *info = inodes[st.st_ino]; if (info) { // Handles can refer to the same file despite not being equal. // If the old handle can still be opened, we can be assured // that the inode number has not been recycled. if (compare_handles(handle, info->handle) || handle_valid(info->handle)) { return info; } else { delete_inode(info); info = NULL; } } inodes[st.st_ino] = info = new inode_info(); info->ino = st.st_ino; info->mode = st.st_mode; memcpy(info->handle, handle, fh->handle_bytes); return info; } // Scan directory and update internal filesystem representation accordingly. // Closes `dirfd`. void scan(int dirfd, bool recursive) { struct inode_info *dir = find_inode(dirfd); char path_buf[4096] = "\0"; inode_path(dir, path_buf, sizeof(path_buf)); printf("scan %s (%lu)\n", path_buf, dir->ino); DIR *dp = CHKN(fdopendir(dirfd)); set<string> seen; while (struct dirent *ent = readdir(dp)) { if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) continue; seen.insert(ent->d_name); if (dir->children.find(ent->d_name) != dir->children.end() && dir->children[ent->d_name]->inode->ino == ent->d_ino) { // Heuristic: It is massively unlikely that an inode number // would be recylced at the same path as before. So if we // see the same inode for the same child, we skip the more // expensive checks altogether. This saves us a buttload of // syscalls, especially given that most directory entries // will be unchanged after a FAN_MODIFY_DIR. // // This can be skipped if strict correctness is preferred // over speed. continue; } int fd = openat(dirfd, ent->d_name, O_PATH|O_NOFOLLOW); if (fd < 0) continue; struct inode_info *child = find_inode(fd); if (dir->children.find(ent->d_name) != dir->children.end()) { struct dentry_info *old_dentry = dir->children[ent->d_name]; if (child != old_dentry->inode) { delete_dentry(old_dentry); add_dentry(dir, ent->d_name, child); } } else { add_dentry(dir, ent->d_name, child); } if (recursive && S_ISDIR(child->mode)) { // `fd' is just an O_PATH fd. For scanning we need O_RDONLY. int scan_fd = CHK(openat(fd, ".", O_RDONLY|O_DIRECTORY)); scan(scan_fd, true); // closes scan_fd } close(fd); } for (auto it: dir->children) { if (seen.find(it.second->name) == seen.end()) delete_dentry(it.second); } closedir(dp); } void event_loop() { while (true) { char buf[4096]; ssize_t len = CHK(read(fan_fd, buf, sizeof(buf))); const struct fanotify_event_metadata *event; event = (const struct fanotify_event_metadata*) buf; while (FAN_EVENT_OK(event, len)) { if (event->vers != FANOTIFY_METADATA_VERSION) abort(); if (event->mask & FAN_MODIFY_DIR) { scan(event->fd, false); } else if (event->mask & FAN_Q_OVERFLOW) { abort(); // TODO: full rescan needed } else { close(event->fd); } event = FAN_EVENT_NEXT(event, len); } } } int main(int argc, char **argv) { if (argc != 2) { fprintf(stderr, "Usage: %s MOUNTPOINT\n", argv[0]); return 1; } root_fd = CHK(open(argv[1], O_RDONLY|O_DIRECTORY)); // In a real application, FAN_UNLIMITED_QUEUE would be replaced with a secondary // userspace queue filled during scanning. fan_fd = CHK(fanotify_init(FAN_UNLIMITED_QUEUE, O_RDONLY)); CHK(fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_MODIFY_DIR|FAN_ONDIR, root_fd, NULL)); scan(dup(root_fd), true); event_loop(); return 0; }
On Tue, Mar 14, 2017 at 1:02 AM, Filip Štědronský <r.lklm@regnarg.cz> wrote: > Fanotify currently does not report directory modification events > (rename, unlink, etc.). But these events are essential for about half of > concievable fanotify use cases, especially: > > - file system indexers / desktop search tools > - file synchronization tools (like Dropbox, Nextcloud, etc.), > online backup tools This last one is the use case of my employer, Ctera Networks. Out of curiosity, what is the use case that you are focusing on? > > and pretty much any app that needs to maintain and up-to-date internal > representation of current contents of the file system. > > All applications of the above classes that I'm aware of currently use > recursive inotify watches, which do not scale (my home dir has ~200k > directories, creating all the watches takes ~2min and eats several tens > of MB of kernel memory). > > There have been many calls for such a feature, pretty much since the > creation of fanotify in 2009: > * By GNOME developers: > https://wiki.gnome.org/BastienNocera/KernelWishlist#VFS.2C_filesystems > * By KDE developers: > http://lkml.kernel.org/r/201211011352.42476.Martin%40lichtvoll.de > 'Better support for (desktop) file search / indexing applications' > * And others: > http://lkml.kernel.org/r/AANLkTi=owK=WZW4oNtpm5WpAZhqCQUdTR2K5gzJ_MqZ+%40mail.gmail.com > 'Fanotify mv/rename?' > http://lkml.kernel.org/r/1238158043.23703.20.camel%40fatty > 'Issues with using fanotify for a filesystem indexer' > Thanks for sharing this summary! I had the feeling that all recursive inotify users are hiding in the shadows, but was missing more concrete evidence. > Add a new fanotify event, FAN_MODIFY_DIR, that is emitted whenever the > contents of a directory change (a directory entry is added, removed or > renamed). This covers all the currently missing events: rename, unlink, > mknod, mkdir, rmdir, etc. > > Included with the event is a file descriptor to the modified directory > but no further info. The userspace application is expected to rescan the > whole directory and update its model accordingly. This needs to be done > carefully to prevent race conditions. A cross-directory rename generates > two FAN_MODIFY_DIR events. > Your approach is interesting and I am glad you shared it with us. I do like it and it gives me an idea, I am going to prepare a branch with a subset of my patches, so you can try them with your userspace sample program. In comments to your patches I am going to argue that, as a matter of fact, you can take a small sub set of my patches and get the same functionality of FAN_MODIFY_DIR, with code that is *less* complex then what you propose. So please treat my comments as technical comments how FAN_MODIFY_DIR should be implemented IMO and not as an attempt to reject an opposing proposal. > This minimalistic approach has several advantages: > * Does not require changes to the fanotify userspace API or the > fsnotify in-kernel framework, apart from adding a new event. About the argument of not having to change in-kernel framework, I don't think it should be a consideration at all. I claim that my 1st fsnotify cleanup series is an improvement of the framework (https://github.com/amir73il/linux/commits/fsnotify_dentry) regardless of additional functionality. So changing the in-kernel framework by adding complexity may be bad, but changing it by simplifying and making code more readable should not be an argument against the "non-minimalistic" approach. About the change to usespace API, if you strip off the *added* functionality of super block watch from my patches, your proposal for user API looks like this: fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, \ FAN_MODIFY_DIR|FAN_ONDIR, And my proposal for user API looks like this: fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, \ FAN_MOVE|FAN_CREATE|FAN_DELETE, You can see why my proposal for user API is not any less minimalistic and it is fully compatible with existing intotify API for the same events. I understand why it is hard to see this behind my added functionality, so I will post a minimalistic branch and test program as an example. > Especially doesn't complicate it by adding string fields. So the string fields in my proposal are optional. userspace opts-in to get them by specifying the FAN_EVENT_INFO_NAME flag: fanotify_init(FAN_CLOEXEC | FAN_CLASS_NOTIF | FAN_EVENT_INFO_PARENT | FAN_EVENT_INFO_NAME, If you don't specify FAN_EVENT_INFO_NAME, you can get filename events FAN_MOVE|FAN_CREATE|FAN_DELETE without the name. What you do get is the file descriptor of the parent. sounds familiar? ;-) The flag FAN_EVENT_INFO_PARENT in an explicit opt-in to get parent fd instead of victim id on specific events. If FAN_EVENT_INFO_PARENT is not specified in fanotify_init() the filename events FAN_MOVE|FAN_CREATE|FAN_DELETE are masked out of fanotify_mark(). This is important because your patchs adds FAN_MODIFY_DIR to FAN_ALL_EVENTS (as does mine). So old fanotify userspace code, compiled with new kernel headers, with: fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_ALL_EVENTS, Will start getting your FAN_MODIFY_DIR events and those programs might have wrong assumptions about the provided fd. So I chose to have stronger backward compatibility, and added the opt-in FAN_EVENT_INFO_PARENT flag. > * Has simple and clear semantics, even with multiple renames occurring > in parallel etc. In case of any inconsistencies, one can simply wait > for a while and rescan again. > * FAN_MODIFY_DIR events are easily merged in case of multiple > operations on a directory (e.g. deleting all files). > I agree those 2 are important advantages. You could get them with my proposed API when dropping the FAN_EVENT_INFO_NAME flag. Thanks for pointing that out. > Signed-off-by: Filip Štědronský <r.lkml@regnarg.cz> > > --- > > An alternative might be to create wrapper functions like > vfs_path_(rename|unlink|...). They could also take care of calling > security_path_(rename|unlink|...), which is currently also up to > the indvidual callers (possibly with a flag because it might not > be always desired). > > An alternative was proposed by Amir Goldstein in several long series of > patches that add superblock-scoped (as opposed to vfsmount-scoped) > fanotify watches and specific dentry manipulation events with filenames: > > http://lkml.kernel.org/r/1481984434-13293-1-git-send-email-amir73il%40gmail.com > http://lkml.kernel.org/r/1482247207-4424-1-git-send-email-amir73il%40gmail.com > http://lkml.kernel.org/r/1476126784-12520-1-git-send-email-amir73il%40gmail.com > http://lkml.kernel.org/r/1489411223-12081-1-git-send-email-amir73il%40gmail.com > > There is large but not complete overlap between that proposal and > mine (which is was originally created over a year ago, before Amir's > work, but never posted). > I am proud of the role I played to help your proposal see the day of light :-) > I think the superblock watch idea is very interesting because it might > in the future allow reporing fsnotify events from remote and virtual > filesystems. So I'm posting this more as a potential source of more > ideas for discussion, or a fallback proposal in case Amir's patches > don't make it. > --- > fs/notify/fanotify/fanotify.c | 1 + > include/linux/fsnotify.h | 17 +++++++++++++++++ > include/linux/fsnotify_backend.h | 1 + > include/uapi/linux/fanotify.h | 5 ++++- > 4 files changed, 23 insertions(+), 1 deletion(-) > > diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c > index bbc175d4213d..5178b06c338c 100644 > --- a/fs/notify/fanotify/fanotify.c > +++ b/fs/notify/fanotify/fanotify.c > @@ -186,6 +186,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, > > BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); > BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); > + BUILD_BUG_ON(FAN_MODIFY_DIR != FS_MODIFY_DIR); > BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); > BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); > BUILD_BUG_ON(FAN_OPEN != FS_OPEN); > diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h > index b43d3f5bd9ea..00fb87c975d6 100644 > --- a/include/linux/fsnotify.h > +++ b/include/linux/fsnotify.h > @@ -208,6 +208,23 @@ static inline void fsnotify_modify(struct file *file) > } > > /* > + * fsnotify_modifydir - directory contents were changed > + * (as a result of rename, creat, unlink, etc.) > + */ > +static inline void fsnotify_modify_dir(struct path *path) > +{ > + struct inode *inode = path->dentry->d_inode; > + __u32 mask = FS_MODIFY_DIR; > + > + if (S_ISDIR(inode->i_mode)) > + mask |= FS_ISDIR; It is going to be somewhat confusing for users to understand the difference between FS_MODIFY_DIR and FS_MODIFY_DIR|FS_ISDIR. IMO, it is much more intuitive (and compatible with intoify semantics) when users get specific event information, e.g. FS_DELETE and FS_DELETE|FS_ISDIR. > + else > + return; > + > + fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0); > +} > + > +/* > * fsnotify_open - file was opened > */ > static inline void fsnotify_open(struct file *file) > diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h > index 487246546ebe..7751b337ec31 100644 > --- a/include/linux/fsnotify_backend.h > +++ b/include/linux/fsnotify_backend.h > @@ -42,6 +42,7 @@ > > #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ > #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */ > +#define FS_MODIFY_DIR 0x00040000 /* directory changed (rename/unlink/...) */ > > #define FS_EXCL_UNLINK 0x04000000 /* do not send events if object is unlinked */ > #define FS_ISDIR 0x40000000 /* event occurred against dir */ > diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h > index 030508d195d3..f14e048d492a 100644 > --- a/include/uapi/linux/fanotify.h > +++ b/include/uapi/linux/fanotify.h > @@ -15,6 +15,8 @@ > #define FAN_OPEN_PERM 0x00010000 /* File open in perm check */ > #define FAN_ACCESS_PERM 0x00020000 /* File accessed in perm check */ > > +#define FAN_MODIFY_DIR 0x00040000 /* directory changed (rename/unlink/...) */ > + > #define FAN_ONDIR 0x40000000 /* event occurred against dir */ > > #define FAN_EVENT_ON_CHILD 0x08000000 /* interested in child events */ > @@ -67,7 +69,8 @@ > #define FAN_ALL_EVENTS (FAN_ACCESS |\ > FAN_MODIFY |\ > FAN_CLOSE |\ > - FAN_OPEN) > + FAN_OPEN |\ > + FAN_MODIFY_DIR) > > /* > * All events which require a permission response from userspace > -- > 2.11.1 >
On Tue, Mar 14, 2017 at 1:16 AM, Filip Štědronský <r.lklm@regnarg.cz> wrote: > An example userspace program that uses FAN_MODIFY_DIR to reliably keep > an up-to-date internal representation of the file system. It uses some > filehandle trickery to identify inodes, other heuristics could be also > used. > An I am very happy that you used filehandles to keep track of objects in your example, because it fits my proposal really well. See, if you used my proposed API, you would have had fan_fd = CHK(fanotify_init(FAN_UNLIMITED_QUEUE| \ FAN_EVENT_INFO_PARENT | FAN_EVENT_INFO_FH, O_RDONLY)); And then you would have gotten the filhandle of parent with the event and you would need find_inode(). Furthermore, my proposal records the filehandle at the time of the event and therefore, does not have to keep an elevated refcount of the object in the events queue. Again, this needs some work, but I will try to post a testing branch which demonstrates how my patches should be used with this test program. Thanks again for sharing it. Amir. > --- > > //#define _GNU_SOURCE > > #include <stdio.h> > #include <stdlib.h> > #include <unistd.h> > #include <sys/types.h> > #include <sys/stat.h> > #include <fcntl.h> > #include <sys/fanotify.h> > #include <stdint.h> > #include <dirent.h> > #include <assert.h> > #include <string.h> > > #include <map> > #include <set> > #include <list> > using namespace std; > > #ifndef FAN_MODIFY_DIR > #define FAN_MODIFY_DIR 0x00040000 > #endif > > // die-on-error helpers > #define CHK(x) ({ __typeof__(x) r = x; if (r == -1) { perror(#x); abort(); } r; }) > #define CHKN(x) ({ __typeof__(x) r = x; if (r == NULL) { perror(#x); abort(); } r; }) > struct inode_info; > struct dentry_info; > > struct inode_info { > ino_t ino; > mode_t mode; > char handle[MAX_HANDLE_SZ]; > set<struct dentry_info *> links; > map<string, struct dentry_info *> children; // for directory inodes > }; > > struct dentry_info { > struct inode_info *parent, *inode; > string name; > }; > > > map<ino_t, inode_info*> inodes; > > int root_fd; > int fan_fd; > > bool compare_handles(const void *h1, const void *h2) { > const struct file_handle *fh1 = (const struct file_handle*) h1; > const struct file_handle *fh2 = (const struct file_handle*) h2; > return (fh1->handle_bytes == fh2->handle_bytes > && memcmp(h1, h2, fh1->handle_bytes) == 0); > } > > bool handle_valid(void *handle) { > int check_fd = open_by_handle_at(root_fd, (struct file_handle*)handle, O_PATH); > if (check_fd >= 0) { > CHK(close(check_fd)); > return true; > } else if (errno == ESTALE) { > return false; > } else { > perror("open_by_handle_at"); > exit(1); > } > } > > // Get the path corresponding to an inode (one of its paths, in the presence of > // hardlinks). > void inode_path(const struct inode_info *inode, char *buf, size_t bufsiz) { > list<string> components; > while (true) { > if (inode->links.empty()) break; > struct dentry_info *dentry = *inode->links.begin(); > components.push_front(dentry->name); > inode = dentry->parent; > } > buf[0] = '\0'; > for (auto name: components) { > int len = snprintf(buf, bufsiz, "/%s", name.c_str()); > buf += len; > bufsiz -= len; > } > } > > > void delete_dentry(struct dentry_info *dentry) { > assert(dentry->parent->children[dentry->name] == dentry); > > char path_buf[4096]; > inode_path(dentry->parent, path_buf, sizeof(path_buf)); > printf("unlinked %s/%s (ino %lu, parent %lu)\n", path_buf, dentry->name.c_str(), > dentry->inode->ino, dentry->parent->ino); > > dentry->parent->children.erase(dentry->name.c_str()); > dentry->inode->links.erase(dentry); > // TODO: If this was the last dentry pointing to an inode, schedule removing > // the inode after a timeout (we cannot remove it immediately because > // the zero-link situation might occur during a rename when the source > // directory has been processed but the target directory hasn't). > delete dentry; > } > > struct dentry_info *add_dentry(struct inode_info *parent, const char *name, > struct inode_info *child) { > struct dentry_info *dentry = new dentry_info(); > dentry->parent = parent; > dentry->name = name; > dentry->inode = child; > parent->children[name] = dentry; > child->links.insert(dentry); > > char path_buf[4096] = "\0"; > inode_path(parent, path_buf, sizeof(path_buf)); > printf("linked %s/%s (ino %lu, parent %lu)\n", path_buf, name, child->ino, parent->ino); > > return dentry; > } > > void delete_inode(struct inode_info *inode) { > for (auto dentry: inode->links) { > delete_dentry(dentry); > } > delete inode; > } > > // Given a file descriptor, find the corresponding inode object in our database, > // or create a new one if it does not exist. An O_PATH fd suffices. > struct inode_info *find_inode(int fd) { > struct stat st; > CHK(fstat(fd, &st)); > char handle[sizeof(struct file_handle) + MAX_HANDLE_SZ]; > struct file_handle *fh = (struct file_handle*)handle; > fh->handle_bytes = sizeof(handle); > int mntid; > CHK(name_to_handle_at(fd, "", (struct file_handle*)handle, &mntid, > AT_EMPTY_PATH)); > > struct inode_info *info = inodes[st.st_ino]; > if (info) { > // Handles can refer to the same file despite not being equal. > // If the old handle can still be opened, we can be assured > // that the inode number has not been recycled. > if (compare_handles(handle, info->handle) || handle_valid(info->handle)) { > return info; > } else { > delete_inode(info); > info = NULL; > } > } > > inodes[st.st_ino] = info = new inode_info(); > info->ino = st.st_ino; > info->mode = st.st_mode; > memcpy(info->handle, handle, fh->handle_bytes); > return info; > } > > // Scan directory and update internal filesystem representation accordingly. > // Closes `dirfd`. > void scan(int dirfd, bool recursive) { > struct inode_info *dir = find_inode(dirfd); > > char path_buf[4096] = "\0"; > inode_path(dir, path_buf, sizeof(path_buf)); > printf("scan %s (%lu)\n", path_buf, dir->ino); > > DIR *dp = CHKN(fdopendir(dirfd)); > set<string> seen; > while (struct dirent *ent = readdir(dp)) { > if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) continue; > seen.insert(ent->d_name); > if (dir->children.find(ent->d_name) != dir->children.end() > && dir->children[ent->d_name]->inode->ino == ent->d_ino) { > // Heuristic: It is massively unlikely that an inode number > // would be recylced at the same path as before. So if we > // see the same inode for the same child, we skip the more > // expensive checks altogether. This saves us a buttload of > // syscalls, especially given that most directory entries > // will be unchanged after a FAN_MODIFY_DIR. > // > // This can be skipped if strict correctness is preferred > // over speed. > continue; > } > int fd = openat(dirfd, ent->d_name, O_PATH|O_NOFOLLOW); > if (fd < 0) continue; > struct inode_info *child = find_inode(fd); > if (dir->children.find(ent->d_name) != dir->children.end()) { > struct dentry_info *old_dentry = dir->children[ent->d_name]; > if (child != old_dentry->inode) { > delete_dentry(old_dentry); > add_dentry(dir, ent->d_name, child); > } > } else { > add_dentry(dir, ent->d_name, child); > } > if (recursive && S_ISDIR(child->mode)) { > // `fd' is just an O_PATH fd. For scanning we need O_RDONLY. > int scan_fd = CHK(openat(fd, ".", O_RDONLY|O_DIRECTORY)); > scan(scan_fd, true); // closes scan_fd > } > close(fd); > } > for (auto it: dir->children) { > if (seen.find(it.second->name) == seen.end()) delete_dentry(it.second); > } > closedir(dp); > } > > void event_loop() { > while (true) { > char buf[4096]; > ssize_t len = CHK(read(fan_fd, buf, sizeof(buf))); > const struct fanotify_event_metadata *event; > event = (const struct fanotify_event_metadata*) buf; > while (FAN_EVENT_OK(event, len)) { > if (event->vers != FANOTIFY_METADATA_VERSION) abort(); > if (event->mask & FAN_MODIFY_DIR) { > scan(event->fd, false); > } else if (event->mask & FAN_Q_OVERFLOW) { > abort(); // TODO: full rescan needed > } else { > close(event->fd); > } > event = FAN_EVENT_NEXT(event, len); > } > } > } > > int main(int argc, char **argv) { > if (argc != 2) { fprintf(stderr, "Usage: %s MOUNTPOINT\n", argv[0]); return 1; } > > root_fd = CHK(open(argv[1], O_RDONLY|O_DIRECTORY)); > // In a real application, FAN_UNLIMITED_QUEUE would be replaced with a secondary > // userspace queue filled during scanning. > fan_fd = CHK(fanotify_init(FAN_UNLIMITED_QUEUE, O_RDONLY)); > CHK(fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_MODIFY_DIR|FAN_ONDIR, > root_fd, NULL)); > > scan(dup(root_fd), true); > > event_loop(); > > return 0; > }
Hi, On Tue, Mar 14, 2017 at 12:11:40PM +0200, Amir Goldstein wrote: > > - file system indexers / desktop search tools > > - file synchronization tools (like Dropbox, Nextcloud, etc.), > > online backup tools > > This last one is the use case of my employer, Ctera Networks. > Out of curiosity, what is the use case that you are focusing on? I'm working on a file synchronization tool as part of my bachelor thesis at Charles University. When I started (now over a year ago ... long story), there were AFIK no attempts at solving the recursive inotify issue, only a lot of complaints, so I cobbled up together something simple (I'm not a kernel developer by trade, this was my first patch) that would allow me to work on the userspace parts, which are the main bulk. I try to focus on algorithmic and implementation efficiency as opposed to fancy GUIs and similar. I want it to be fast with 100k directories and 10M files in home dir. But it's very WIP so we'll see how that turns out. > I had the feeling that all recursive inotify users are hiding in the shadows, > but was missing more concrete evidence. Yes, even Dropbox uses inotify. They have articles in their help on how to increase inotify.max_user_watches: https://www.dropbox.com/help/145. That's not good PR. ;-) (I'm not affiliated with DB, just pointing that out.) > About the argument of not having to change in-kernel framework, > I don't think it should be a consideration at all. Understood. I tried to stay conservative and non-controversial because I imagined that radical framework changes would take months of discussions (look at the history of statx) and this issue seems to be pressing for quite a lot of people. But rushing is of course not the best strategy either, there are always compromises. > If you don't specify FAN_EVENT_INFO_NAME, you can get filename events > FAN_MOVE|FAN_CREATE|FAN_DELETE without the name. > > What you do get is the file descriptor of the parent. sounds familiar? ;-) I didn't notice this bit. That sounds like a win-win. Filip
Hi, On Tue, Mar 14, 2017 at 12:40:56PM +0200, Amir Goldstein wrote: > An I am very happy that you used filehandles to keep track of objects > in your example, because it fits my proposal really well. you can read more about the rationales in the WIP draft of my thesis (especially the chapter Identifying Inodes and its surroundings): http://regnarg.cz/tmp/thesis.pdf https://github.com/regnarg/bc Either way, one can never use pathnames as identifiers because in a world with parallel renames on multiple levels of the hierarchy (between which no ordering is guaranteed unless they are cross-dir), pathnames are not even a well-defined concept. NB: I used a simple script for testing that does precisely this (a lot of parallel within-directory renames at many levels): https://github.com/regnarg/bc/blob/master/experiments/fanotify/parallel_renamist.sh It seems like a good stress test for any application that tries to do reliable filesystem monitoring. > See, if you used my proposed API, you would have had > > fan_fd = CHK(fanotify_init(FAN_UNLIMITED_QUEUE| \ > FAN_EVENT_INFO_PARENT | FAN_EVENT_INFO_FH, > O_RDONLY)); Sure, I'll definitely try to implement your interface as an optional backend and mention it in my thesis and definitely give it some heavy testing. > Furthermore, my proposal records the filehandle at the time of the event > and therefore, does not have to keep an elevated refcount of the object > in the events queue. That sounds good but from what I've understood, not all filesystems support handles. Is this a concern? (Maybe the right answer is to extend handle support...) Filip
On Tue, Mar 14, 2017 at 2:41 PM, Filip Štědronský <r.lkml@regnarg.cz> wrote: > Hi, > > On Tue, Mar 14, 2017 at 12:11:40PM +0200, Amir Goldstein wrote: >> > - file system indexers / desktop search tools >> > - file synchronization tools (like Dropbox, Nextcloud, etc.), >> > online backup tools >> >> This last one is the use case of my employer, Ctera Networks. >> Out of curiosity, what is the use case that you are focusing on? > > I'm working on a file synchronization tool as part of my bachelor > thesis at Charles University. > > When I started (now over a year ago ... long story), there were AFIK no > attempts at solving the recursive inotify issue, only a lot of > complaints, so I cobbled up together something simple (I'm not a kernel > developer by trade, this was my first patch) that would allow me to > work on the userspace parts, which are the main bulk. > That sounds very useful. I was actually looking for an open userspace implementation were I could demonstrate my patches, but all the projects I looked at (beagle etc..) seems to have died from desperation waiting for intotify scalabiltiy to be fixed... > I try to focus on algorithmic and implementation efficiency as opposed > to fancy GUIs and similar. I want it to be fast with 100k directories > and 10M files in home dir. But it's very WIP so we'll see how that turns > out. > I did some basic tests of my super block watch with 1M directories, and currently the code is under integration testing in our company. I prepared a drop-in replacement for our fs change listener class that was previously implemented with inotify. Currently, the userspace code still "thinks" it is adding watches on directories recursively, but what it really does is just gets all their file handles. Then, the code that maps an event to "watch descriptor" actually matches the fhandle in the event to the directory fhandle (much like in your example). If you are not married to using your kernel patch, you can just go a head and try my patch set. You can fetch it directly from: https://github.com/amir73il/linux/tree/fanotify_sb It has been recently rebased over v4.11-rc1. There are also ready ports for stable kernels v4.1 v4.4 and v4.9 in my github if that serves you better. Marko Rauhamaa [CCed] has also tested my patches to his satisfaction. >> I had the feeling that all recursive inotify users are hiding in the shadows, >> but was missing more concrete evidence. > > Yes, even Dropbox uses inotify. They have articles in their help on how > to increase inotify.max_user_watches: https://www.dropbox.com/help/145. > That's not good PR. ;-) (I'm not affiliated with DB, just pointing that > out.) > >> About the argument of not having to change in-kernel framework, >> I don't think it should be a consideration at all. > > Understood. I tried to stay conservative and non-controversial because I > imagined that radical framework changes would take months of discussions > (look at the history of statx) and this issue seems to be pressing for > quite a lot of people. But rushing is of course not the best strategy > either, there are always compromises. > Don't get me wrong, I certainly do *understand* why you would want to stay away from making radical framework changes. But I believe it is the best interest of the community to choose the best solution going forward while eliminating the fear-of-change factor. This is why I say that fear of change SHOULD not be a consideration. >> If you don't specify FAN_EVENT_INFO_NAME, you can get filename events >> FAN_MOVE|FAN_CREATE|FAN_DELETE without the name. >> >> What you do get is the file descriptor of the parent. sounds familiar? ;-) > > I didn't notice this bit. That sounds like a win-win. > Well, I may me giving you half the truth here. With current patches not specifying FAN_EVENT_INFO_NAME will mask out the filename events from the mark, but 1. The implementation could be easily changed 2. You can set FAN_EVENT_INFO_NAME and ignore the filename info in userspace. You still get the parent fd as your program expects and you get a bonus fhandle of the parent for free with the event, see my test program: github.com/amir73il/fsnotify-utils/blob/master/src/test/fanotify_demo.c#L101 Please let me know if that is sufficient for your needs or if you need me to prepare a version that delivers filename events without filename info, therefore allowing to merge filename events. Sounds to me like if you get an event FAN_DELETE, "aaa", your implementation can update the db directly without having to scan the directory, so it should be useful. For your consideration. Amir.
Hi, On Tue, Mar 14, 2017 at 03:55:19PM +0200, Amir Goldstein wrote: > Please let me know if that is sufficient for your needs > or if you need me to prepare a version that delivers filename events > without filename info, therefore allowing to merge filename events. > > Sounds to me like if you get an event FAN_DELETE, "aaa", > your implementation can update the db directly without having > to scan the directory, so it should be useful. > For your consideration. both approaches might be useful and there are tradeoffs. Direct updates save us from rescanning but give more events and more context switches. On the other hand, with an unlimited queue and well-mergable events I could e.g. sleep for five seconds each time after emptying the queue, thus giving the events more potential to merge and reducing context switches. But one risks blowing up the queue. However, I have some ideas. Basically we could implement some kind of "bulk read" mode for fanotify that would pass events to userspace only when (a) a given event count thresold is hit (e.g. half the queue limit if queue is limited), or (b) the oldest event is older than a specified maximum latency. That might vary from 1 second to 5 minutes depending on specific application needs (e.g. background backup does not need to be low-latency). This would greatly reduce extra context switches when watching a large portion (or whole) of the file system. All of this presumably could be implemented on top of your suggested patches. Either way, I suggest that implementing the nameless filename events is a good idea. I do not know whether they will be the best choice for my application but they probably will be useful for some applications. Filip
On Tue, Mar 14, 2017 at 3:46 PM, Filip Štědronský <r.lkml@regnarg.cz> wrote: > Hi, > > On Tue, Mar 14, 2017 at 12:40:56PM +0200, Amir Goldstein wrote: >> An I am very happy that you used filehandles to keep track of objects >> in your example, because it fits my proposal really well. > > you can read more about the rationales in the WIP draft of my thesis > (especially the chapter Identifying Inodes and its surroundings): > > http://regnarg.cz/tmp/thesis.pdf > https://github.com/regnarg/bc > > Either way, one can never use pathnames as identifiers because in > a world with parallel renames on multiple levels of the hierarchy > (between which no ordering is guaranteed unless they are cross-dir), > pathnames are not even a well-defined concept. > > NB: I used a simple script for testing that does precisely this > (a lot of parallel within-directory renames at many levels): > https://github.com/regnarg/bc/blob/master/experiments/fanotify/parallel_renamist.sh > It seems like a good stress test for any application that tries to > do reliable filesystem monitoring. I agree. not only to filesystem monitoring. filesystems could use a test like this in general. It would be good to stress test overlayfs dir rename like this. > >> See, if you used my proposed API, you would have had >> >> fan_fd = CHK(fanotify_init(FAN_UNLIMITED_QUEUE| \ >> FAN_EVENT_INFO_PARENT | FAN_EVENT_INFO_FH, >> O_RDONLY)); > > Sure, I'll definitely try to implement your interface as an > optional backend and mention it in my thesis and definitely > give it some heavy testing. > >> Furthermore, my proposal records the filehandle at the time of the event >> and therefore, does not have to keep an elevated refcount of the object >> in the events queue. > > That sounds good but from what I've understood, not all > filesystems support handles. Is this a concern? (Maybe the > right answer is to extend handle support...) > Very interesting question. Extending fhandle support is not always easy. The majority of local file system do support fhandles, because they are required for exporting a file system over NFS. see: git grep -lw s_export_op fs/ The reason that some file systems "don't support" fhandles is usually because they either do not have an efficient way to implement open_by_handle() or because they do not have a way to encode a unique and persistent (over reboot) identifer for name_to_handle(). An example of a commonly used file system that currently does not support file handles is overlayfs. The interesting thing to note wrt fanotify and overlayfs is that while it is technically hard to implement open_by_handle() for overlayfs, it is quite trivial to implement name_to_handle() for the special common case of all layers on the same underlying fs (e.g. docker). The trivial implementation (at least for directories) is to return fhandle of lower most dir in case of lower or merge dir and fhandle of upper dir otherwise. I don't think there is any filesystem that implements name_to_handle() without implementing open_by_handle(), but technically it is possible. And for fanotify FAN_EVENT_INFO_FH only the former is required. But in any case, FAN_EVENT_INFO_FH is optional. If your userspace implementation can settle with parent fd information then filesystems that do not support fhandles can use this mode. Amir.
On Tue, Mar 14, 2017 at 4:48 PM, Filip Štědronský <r.lkml@regnarg.cz> wrote: > Hi, > > On Tue, Mar 14, 2017 at 03:55:19PM +0200, Amir Goldstein wrote: >> Please let me know if that is sufficient for your needs >> or if you need me to prepare a version that delivers filename events >> without filename info, therefore allowing to merge filename events. >> >> Sounds to me like if you get an event FAN_DELETE, "aaa", >> your implementation can update the db directly without having >> to scan the directory, so it should be useful. >> For your consideration. > > both approaches might be useful and there are tradeoffs. Direct updates > save us from rescanning but give more events and more context switches. > > On the other hand, with an unlimited queue and well-mergable events I > could e.g. sleep for five seconds each time after emptying the queue, > thus giving the events more potential to merge and reducing context > switches. > > But one risks blowing up the queue. However, I have some ideas. > Basically we could implement some kind of "bulk read" mode for fanotify > that would pass events to userspace only when > (a) a given event count thresold is hit (e.g. half the queue limit > if queue is limited), or > (b) the oldest event is older than a specified maximum latency. That > might vary from 1 second to 5 minutes depending on specific > application needs (e.g. background backup does not need to be > low-latency). > > This would greatly reduce extra context switches when watching > a large portion (or whole) of the file system. All of this presumably > could be implemented on top of your suggested patches. > > Either way, I suggest that implementing the nameless filename events > is a good idea. I do not know whether they will be the best choice > for my application but they probably will be useful for some > applications. > Done and pushed all branches. Added 2 new patches to the fanotify_dentry set: ce0c223 fanotify: allow merge of filename events cd872d1 fanotify: make filename info optional for filename events You should test with branch fanotify_sb, using super block watch, because filename events are not supported in mount scope. You can follow user API used by my fanotify_demo.c program. Tested with my fanotify_demo test both without FAN_EVENT_INFO_NAME and without FAN_EVENT_INFO_FH. Without FAN_EVENT_INFO_NAME more filename events are merged, but in my test that runs fs ops by a script, most events do not get merged because of different pid value. I guess that for the filesync/indexer use case, we would need to add a new FAN_EVENT_INFO_NOPID flag to loose the irrelevant pid information in order to be able to merge a lot more filename event on a directory. I also plan to add FAN_EVENT_INFO_NOFD, to get rid of redundant fd (when INFO_FH is available) and then be able to drop the refcount on the victim object. Cheers, Amir.
[CC += linux-api@vger.kernel.org] Filip, Since this is a kernel-user-space API change, please CC linux-api@ (and on future iterations of this patch). The kernel source file Documentation/SubmitChecklist notes that all Linux kernel patches that change userspace interfaces should be CCed to linux-api@vger.kernel.org, so that the various parties who are interested in API changes are informed. For further information, see https://www.kernel.org/doc/man-pages/linux-api-ml.html Thanks, Michael On Tue, Mar 14, 2017 at 12:02 AM, Filip Štědronský <r.lklm@regnarg.cz> wrote: > Fanotify currently does not report directory modification events > (rename, unlink, etc.). But these events are essential for about half of > concievable fanotify use cases, especially: > > - file system indexers / desktop search tools > - file synchronization tools (like Dropbox, Nextcloud, etc.), > online backup tools > > and pretty much any app that needs to maintain and up-to-date internal > representation of current contents of the file system. > > All applications of the above classes that I'm aware of currently use > recursive inotify watches, which do not scale (my home dir has ~200k > directories, creating all the watches takes ~2min and eats several tens > of MB of kernel memory). > > There have been many calls for such a feature, pretty much since the > creation of fanotify in 2009: > * By GNOME developers: > https://wiki.gnome.org/BastienNocera/KernelWishlist#VFS.2C_filesystems > * By KDE developers: > http://lkml.kernel.org/r/201211011352.42476.Martin%40lichtvoll.de > 'Better support for (desktop) file search / indexing applications' > * And others: > http://lkml.kernel.org/r/AANLkTi=owK=WZW4oNtpm5WpAZhqCQUdTR2K5gzJ_MqZ+%40mail.gmail.com > 'Fanotify mv/rename?' > http://lkml.kernel.org/r/1238158043.23703.20.camel%40fatty > 'Issues with using fanotify for a filesystem indexer' > > Add a new fanotify event, FAN_MODIFY_DIR, that is emitted whenever the > contents of a directory change (a directory entry is added, removed or > renamed). This covers all the currently missing events: rename, unlink, > mknod, mkdir, rmdir, etc. > > Included with the event is a file descriptor to the modified directory > but no further info. The userspace application is expected to rescan the > whole directory and update its model accordingly. This needs to be done > carefully to prevent race conditions. A cross-directory rename generates > two FAN_MODIFY_DIR events. > > This minimalistic approach has several advantages: > * Does not require changes to the fanotify userspace API or the > fsnotify in-kernel framework, apart from adding a new event. > Especially doesn't complicate it by adding string fields. > * Has simple and clear semantics, even with multiple renames occurring > in parallel etc. In case of any inconsistencies, one can simply wait > for a while and rescan again. > * FAN_MODIFY_DIR events are easily merged in case of multiple > operations on a directory (e.g. deleting all files). > > Signed-off-by: Filip Štědronský <r.lkml@regnarg.cz> > > --- > > An alternative might be to create wrapper functions like > vfs_path_(rename|unlink|...). They could also take care of calling > security_path_(rename|unlink|...), which is currently also up to > the indvidual callers (possibly with a flag because it might not > be always desired). > > An alternative was proposed by Amir Goldstein in several long series of > patches that add superblock-scoped (as opposed to vfsmount-scoped) > fanotify watches and specific dentry manipulation events with filenames: > > http://lkml.kernel.org/r/1481984434-13293-1-git-send-email-amir73il%40gmail.com > http://lkml.kernel.org/r/1482247207-4424-1-git-send-email-amir73il%40gmail.com > http://lkml.kernel.org/r/1476126784-12520-1-git-send-email-amir73il%40gmail.com > http://lkml.kernel.org/r/1489411223-12081-1-git-send-email-amir73il%40gmail.com > > There is large but not complete overlap between that proposal and > mine (which is was originally created over a year ago, before Amir's > work, but never posted). > > I think the superblock watch idea is very interesting because it might > in the future allow reporing fsnotify events from remote and virtual > filesystems. So I'm posting this more as a potential source of more > ideas for discussion, or a fallback proposal in case Amir's patches > don't make it. > --- > fs/notify/fanotify/fanotify.c | 1 + > include/linux/fsnotify.h | 17 +++++++++++++++++ > include/linux/fsnotify_backend.h | 1 + > include/uapi/linux/fanotify.h | 5 ++++- > 4 files changed, 23 insertions(+), 1 deletion(-) > > diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c > index bbc175d4213d..5178b06c338c 100644 > --- a/fs/notify/fanotify/fanotify.c > +++ b/fs/notify/fanotify/fanotify.c > @@ -186,6 +186,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, > > BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); > BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); > + BUILD_BUG_ON(FAN_MODIFY_DIR != FS_MODIFY_DIR); > BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); > BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); > BUILD_BUG_ON(FAN_OPEN != FS_OPEN); > diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h > index b43d3f5bd9ea..00fb87c975d6 100644 > --- a/include/linux/fsnotify.h > +++ b/include/linux/fsnotify.h > @@ -208,6 +208,23 @@ static inline void fsnotify_modify(struct file *file) > } > > /* > + * fsnotify_modifydir - directory contents were changed > + * (as a result of rename, creat, unlink, etc.) > + */ > +static inline void fsnotify_modify_dir(struct path *path) > +{ > + struct inode *inode = path->dentry->d_inode; > + __u32 mask = FS_MODIFY_DIR; > + > + if (S_ISDIR(inode->i_mode)) > + mask |= FS_ISDIR; > + else > + return; > + > + fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0); > +} > + > +/* > * fsnotify_open - file was opened > */ > static inline void fsnotify_open(struct file *file) > diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h > index 487246546ebe..7751b337ec31 100644 > --- a/include/linux/fsnotify_backend.h > +++ b/include/linux/fsnotify_backend.h > @@ -42,6 +42,7 @@ > > #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ > #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */ > +#define FS_MODIFY_DIR 0x00040000 /* directory changed (rename/unlink/...) */ > > #define FS_EXCL_UNLINK 0x04000000 /* do not send events if object is unlinked */ > #define FS_ISDIR 0x40000000 /* event occurred against dir */ > diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h > index 030508d195d3..f14e048d492a 100644 > --- a/include/uapi/linux/fanotify.h > +++ b/include/uapi/linux/fanotify.h > @@ -15,6 +15,8 @@ > #define FAN_OPEN_PERM 0x00010000 /* File open in perm check */ > #define FAN_ACCESS_PERM 0x00020000 /* File accessed in perm check */ > > +#define FAN_MODIFY_DIR 0x00040000 /* directory changed (rename/unlink/...) */ > + > #define FAN_ONDIR 0x40000000 /* event occurred against dir */ > > #define FAN_EVENT_ON_CHILD 0x08000000 /* interested in child events */ > @@ -67,7 +69,8 @@ > #define FAN_ALL_EVENTS (FAN_ACCESS |\ > FAN_MODIFY |\ > FAN_CLOSE |\ > - FAN_OPEN) > + FAN_OPEN |\ > + FAN_MODIFY_DIR) > > /* > * All events which require a permission response from userspace > -- > 2.11.1 >
Hello, On Tue 14-03-17 12:11:40, Amir Goldstein wrote: > > Add a new fanotify event, FAN_MODIFY_DIR, that is emitted whenever the > > contents of a directory change (a directory entry is added, removed or > > renamed). This covers all the currently missing events: rename, unlink, > > mknod, mkdir, rmdir, etc. > > > > Included with the event is a file descriptor to the modified directory > > but no further info. The userspace application is expected to rescan the > > whole directory and update its model accordingly. This needs to be done > > carefully to prevent race conditions. A cross-directory rename generates > > two FAN_MODIFY_DIR events. > > > > Your approach is interesting and I am glad you shared it with us. > I do like it and it gives me an idea, I am going to prepare a branch > with a subset > of my patches, so you can try them with your userspace sample program. > > In comments to your patches I am going to argue that, as a matter of fact, > you can take a small sub set of my patches and get the same functionality > of FAN_MODIFY_DIR, with code that is *less* complex then what you propose. > So please treat my comments as technical comments how FAN_MODIFY_DIR > should be implemented IMO and not as an attempt to reject an opposing proposal. Yeah, so I have yet to read both your patch sets in detail (good news is that this is getting towards top of my TODO stack and I think I'll do this when flying to LSF/MM this weekend). With Filip's design I like the minimalistic approach of adding just DIR_CHANGED event with (albeit optional) messing with names. That just seems to fit well with the fanotify design. > > This minimalistic approach has several advantages: > > * Does not require changes to the fanotify userspace API or the > > fsnotify in-kernel framework, apart from adding a new event. > > About the argument of not having to change in-kernel framework, > I don't think it should be a consideration at all. > I claim that my 1st fsnotify cleanup series is an improvement of the framework > (https://github.com/amir73il/linux/commits/fsnotify_dentry) > regardless of additional functionality. > So changing the in-kernel framework by adding complexity may be bad, > but changing it by simplifying and making code more readable should not > be an argument against the "non-minimalistic" approach. > > About the change to usespace API, if you strip off the *added* functionality > of super block watch from my patches, your proposal for user API looks > like this: > > fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, \ > FAN_MODIFY_DIR|FAN_ONDIR, > > And my proposal for user API looks like this: > > fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, \ > FAN_MOVE|FAN_CREATE|FAN_DELETE, > > You can see why my proposal for user API is not any less minimalistic > and it is fully compatible with existing intotify API for the same events. > > I understand why it is hard to see this behind my added functionality, > so I will post a minimalistic branch and test program as an example. > > > Especially doesn't complicate it by adding string fields. > > So the string fields in my proposal are optional. > userspace opts-in to get them by specifying the FAN_EVENT_INFO_NAME flag: > > fanotify_init(FAN_CLOEXEC | FAN_CLASS_NOTIF | > FAN_EVENT_INFO_PARENT | FAN_EVENT_INFO_NAME, > > If you don't specify FAN_EVENT_INFO_NAME, you can get filename events > FAN_MOVE|FAN_CREATE|FAN_DELETE without the name. So when thinking about this again, I'm again concerned that the names need not tell userspace what it thinks. I know we already discussed this but in our last discussion I think I forgot to point out that inotify directory events (and fanotify would be the same AFAICT) may come out of order compared to real filesystem changes. E.g. sequence: Task 1 Task 2 mv a b mv b c may come out of inotify as: IN_MOVED_FROM "b" COOKIE 1 IN_MOVED_TO "c" COOKIE 1 IN_MOVED_FROM "a" COOKIE 2 IN_MOVED_TO "b" COOKIE 2 and if user program just blindly belives this sequence its internal representation of the filesystem will be different from the real state of the filesystem. So for now I'm more inclined towards the trivial approach (possibly using your patches and stripping additional functionality from them). But I'll leave final decision to when I'll be able to read everything in detail. Honza
On Wed, Mar 15, 2017 at 4:05 PM, Jan Kara <jack@suse.cz> wrote: > Hello, > > On Tue 14-03-17 12:11:40, Amir Goldstein wrote: >> > Add a new fanotify event, FAN_MODIFY_DIR, that is emitted whenever the >> > contents of a directory change (a directory entry is added, removed or >> > renamed). This covers all the currently missing events: rename, unlink, >> > mknod, mkdir, rmdir, etc. >> > >> > Included with the event is a file descriptor to the modified directory >> > but no further info. The userspace application is expected to rescan the >> > whole directory and update its model accordingly. This needs to be done >> > carefully to prevent race conditions. A cross-directory rename generates >> > two FAN_MODIFY_DIR events. >> > >> >> Your approach is interesting and I am glad you shared it with us. >> I do like it and it gives me an idea, I am going to prepare a branch >> with a subset >> of my patches, so you can try them with your userspace sample program. >> >> In comments to your patches I am going to argue that, as a matter of fact, >> you can take a small sub set of my patches and get the same functionality >> of FAN_MODIFY_DIR, with code that is *less* complex then what you propose. >> So please treat my comments as technical comments how FAN_MODIFY_DIR >> should be implemented IMO and not as an attempt to reject an opposing proposal. > > Yeah, so I have yet to read both your patch sets in detail (good news is > that this is getting towards top of my TODO stack and I think I'll do this > when flying to LSF/MM this weekend). With Filip's design I like the > minimalistic approach of adding just DIR_CHANGED event with (albeit > optional) messing with names. That just seems to fit well with the fanotify > design. > >> > This minimalistic approach has several advantages: >> > * Does not require changes to the fanotify userspace API or the >> > fsnotify in-kernel framework, apart from adding a new event. >> >> About the argument of not having to change in-kernel framework, >> I don't think it should be a consideration at all. >> I claim that my 1st fsnotify cleanup series is an improvement of the framework >> (https://github.com/amir73il/linux/commits/fsnotify_dentry) >> regardless of additional functionality. >> So changing the in-kernel framework by adding complexity may be bad, >> but changing it by simplifying and making code more readable should not >> be an argument against the "non-minimalistic" approach. >> >> About the change to usespace API, if you strip off the *added* functionality >> of super block watch from my patches, your proposal for user API looks >> like this: >> >> fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, \ >> FAN_MODIFY_DIR|FAN_ONDIR, >> >> And my proposal for user API looks like this: >> >> fanotify_mark(fan_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, \ >> FAN_MOVE|FAN_CREATE|FAN_DELETE, >> >> You can see why my proposal for user API is not any less minimalistic >> and it is fully compatible with existing intotify API for the same events. >> >> I understand why it is hard to see this behind my added functionality, >> so I will post a minimalistic branch and test program as an example. >> >> > Especially doesn't complicate it by adding string fields. >> >> So the string fields in my proposal are optional. >> userspace opts-in to get them by specifying the FAN_EVENT_INFO_NAME flag: >> >> fanotify_init(FAN_CLOEXEC | FAN_CLASS_NOTIF | >> FAN_EVENT_INFO_PARENT | FAN_EVENT_INFO_NAME, >> >> If you don't specify FAN_EVENT_INFO_NAME, you can get filename events >> FAN_MOVE|FAN_CREATE|FAN_DELETE without the name. > > So when thinking about this again, I'm again concerned that the names need > not tell userspace what it thinks. I know we already discussed this but > in our last discussion I think I forgot to point out that inotify directory > events (and fanotify would be the same AFAICT) may come out of order compared > to real filesystem changes. E.g. sequence: > > Task 1 Task 2 > > mv a b > mv b c > > may come out of inotify as: > > IN_MOVED_FROM "b" COOKIE 1 > IN_MOVED_TO "c" COOKIE 1 > IN_MOVED_FROM "a" COOKIE 2 > IN_MOVED_TO "b" COOKIE 2 > Really? How come? fsnotify_move() inside vfs_rename() is serialized with lock_rename() I should use this opportunity to note that fanotify event does not have a cookie field and that I did not add one to my implementation, so that is a problem. > and if user program just blindly belives this sequence its internal > representation of the filesystem will be different from the real state of > the filesystem. > > So for now I'm more inclined towards the trivial approach (possibly using > your patches and stripping additional functionality from them). But I'll > leave final decision to when I'll be able to read everything in detail. > The addition of filename info is completely optional and the relevant patch ("fanotify: pass filename info for filename events") Can be yanked out of the series and pushed back to the 'maybe' pile. Amir.
On Wed 15-03-17 16:34:11, Amir Goldstein wrote: > On Wed, Mar 15, 2017 at 4:05 PM, Jan Kara <jack@suse.cz> wrote: > >> So the string fields in my proposal are optional. > >> userspace opts-in to get them by specifying the FAN_EVENT_INFO_NAME flag: > >> > >> fanotify_init(FAN_CLOEXEC | FAN_CLASS_NOTIF | > >> FAN_EVENT_INFO_PARENT | FAN_EVENT_INFO_NAME, > >> > >> If you don't specify FAN_EVENT_INFO_NAME, you can get filename events > >> FAN_MOVE|FAN_CREATE|FAN_DELETE without the name. > > > > So when thinking about this again, I'm again concerned that the names need > > not tell userspace what it thinks. I know we already discussed this but > > in our last discussion I think I forgot to point out that inotify directory > > events (and fanotify would be the same AFAICT) may come out of order compared > > to real filesystem changes. E.g. sequence: > > > > Task 1 Task 2 > > > > mv a b > > mv b c > > > > may come out of inotify as: > > > > IN_MOVED_FROM "b" COOKIE 1 > > IN_MOVED_TO "c" COOKIE 1 > > IN_MOVED_FROM "a" COOKIE 2 > > IN_MOVED_TO "b" COOKIE 2 > > > > Really? How come? > > fsnotify_move() inside vfs_rename() is serialized with lock_rename() Sorry, I got confused by the unlocking in vfs_rename() however that unlocks only the moved file / directory but not the directory in which the rename is happening. Taking my objection back. Honza
On Tue, Mar 14, 2017 at 9:46 AM, Filip Štědronský <r.lkml@regnarg.cz> wrote: > Hi, > > On Tue, Mar 14, 2017 at 12:40:56PM +0200, Amir Goldstein wrote: >> An I am very happy that you used filehandles to keep track of objects >> in your example, because it fits my proposal really well. > > you can read more about the rationales in the WIP draft of my thesis > (especially the chapter Identifying Inodes and its surroundings): > > http://regnarg.cz/tmp/thesis.pdf > https://github.com/regnarg/bc > Filip, I read your draft and it's very interesting. Looking forward for the unfinished chapters ;-) open_by_handle() is important for your implementation in order to be able to get inode number (unique object id) from the file handle. In practice, although file handles are opaque descriptors, they usually contain the inode number information is a well known format. If it will be possible to take those formats and make them standard ABI, or alternatively, to call a new get_inum_by_handle() syscall, then the indexer/file sync won't need to have the extra privileges needed for open_by_handle() and file systems that can provide "view only" file handles (e.g. overlayfs) could be used with the indexer. Apropos chapter 1.3 "Filesystem based change detection" It's on my todo list to try and implement something with XFS using the LSN information embedded in every inode. It would be great if we can collaborate on this in the future. Amir.
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index bbc175d4213d..5178b06c338c 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -186,6 +186,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(FAN_MODIFY_DIR != FS_MODIFY_DIR); BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); BUILD_BUG_ON(FAN_OPEN != FS_OPEN); diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index b43d3f5bd9ea..00fb87c975d6 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -208,6 +208,23 @@ static inline void fsnotify_modify(struct file *file) } /* + * fsnotify_modifydir - directory contents were changed + * (as a result of rename, creat, unlink, etc.) + */ +static inline void fsnotify_modify_dir(struct path *path) +{ + struct inode *inode = path->dentry->d_inode; + __u32 mask = FS_MODIFY_DIR; + + if (S_ISDIR(inode->i_mode)) + mask |= FS_ISDIR; + else + return; + + fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0); +} + +/* * fsnotify_open - file was opened */ static inline void fsnotify_open(struct file *file) diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 487246546ebe..7751b337ec31 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -42,6 +42,7 @@ #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */ +#define FS_MODIFY_DIR 0x00040000 /* directory changed (rename/unlink/...) */ #define FS_EXCL_UNLINK 0x04000000 /* do not send events if object is unlinked */ #define FS_ISDIR 0x40000000 /* event occurred against dir */ diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 030508d195d3..f14e048d492a 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -15,6 +15,8 @@ #define FAN_OPEN_PERM 0x00010000 /* File open in perm check */ #define FAN_ACCESS_PERM 0x00020000 /* File accessed in perm check */ +#define FAN_MODIFY_DIR 0x00040000 /* directory changed (rename/unlink/...) */ + #define FAN_ONDIR 0x40000000 /* event occurred against dir */ #define FAN_EVENT_ON_CHILD 0x08000000 /* interested in child events */ @@ -67,7 +69,8 @@ #define FAN_ALL_EVENTS (FAN_ACCESS |\ FAN_MODIFY |\ FAN_CLOSE |\ - FAN_OPEN) + FAN_OPEN |\ + FAN_MODIFY_DIR) /* * All events which require a permission response from userspace
Fanotify currently does not report directory modification events (rename, unlink, etc.). But these events are essential for about half of concievable fanotify use cases, especially: - file system indexers / desktop search tools - file synchronization tools (like Dropbox, Nextcloud, etc.), online backup tools and pretty much any app that needs to maintain and up-to-date internal representation of current contents of the file system. All applications of the above classes that I'm aware of currently use recursive inotify watches, which do not scale (my home dir has ~200k directories, creating all the watches takes ~2min and eats several tens of MB of kernel memory). There have been many calls for such a feature, pretty much since the creation of fanotify in 2009: * By GNOME developers: https://wiki.gnome.org/BastienNocera/KernelWishlist#VFS.2C_filesystems * By KDE developers: http://lkml.kernel.org/r/201211011352.42476.Martin%40lichtvoll.de 'Better support for (desktop) file search / indexing applications' * And others: http://lkml.kernel.org/r/AANLkTi=owK=WZW4oNtpm5WpAZhqCQUdTR2K5gzJ_MqZ+%40mail.gmail.com 'Fanotify mv/rename?' http://lkml.kernel.org/r/1238158043.23703.20.camel%40fatty 'Issues with using fanotify for a filesystem indexer' Add a new fanotify event, FAN_MODIFY_DIR, that is emitted whenever the contents of a directory change (a directory entry is added, removed or renamed). This covers all the currently missing events: rename, unlink, mknod, mkdir, rmdir, etc. Included with the event is a file descriptor to the modified directory but no further info. The userspace application is expected to rescan the whole directory and update its model accordingly. This needs to be done carefully to prevent race conditions. A cross-directory rename generates two FAN_MODIFY_DIR events. This minimalistic approach has several advantages: * Does not require changes to the fanotify userspace API or the fsnotify in-kernel framework, apart from adding a new event. Especially doesn't complicate it by adding string fields. * Has simple and clear semantics, even with multiple renames occurring in parallel etc. In case of any inconsistencies, one can simply wait for a while and rescan again. * FAN_MODIFY_DIR events are easily merged in case of multiple operations on a directory (e.g. deleting all files). Signed-off-by: Filip Štědronský <r.lkml@regnarg.cz> --- An alternative might be to create wrapper functions like vfs_path_(rename|unlink|...). They could also take care of calling security_path_(rename|unlink|...), which is currently also up to the indvidual callers (possibly with a flag because it might not be always desired). An alternative was proposed by Amir Goldstein in several long series of patches that add superblock-scoped (as opposed to vfsmount-scoped) fanotify watches and specific dentry manipulation events with filenames: http://lkml.kernel.org/r/1481984434-13293-1-git-send-email-amir73il%40gmail.com http://lkml.kernel.org/r/1482247207-4424-1-git-send-email-amir73il%40gmail.com http://lkml.kernel.org/r/1476126784-12520-1-git-send-email-amir73il%40gmail.com http://lkml.kernel.org/r/1489411223-12081-1-git-send-email-amir73il%40gmail.com There is large but not complete overlap between that proposal and mine (which is was originally created over a year ago, before Amir's work, but never posted). I think the superblock watch idea is very interesting because it might in the future allow reporing fsnotify events from remote and virtual filesystems. So I'm posting this more as a potential source of more ideas for discussion, or a fallback proposal in case Amir's patches don't make it. --- fs/notify/fanotify/fanotify.c | 1 + include/linux/fsnotify.h | 17 +++++++++++++++++ include/linux/fsnotify_backend.h | 1 + include/uapi/linux/fanotify.h | 5 ++++- 4 files changed, 23 insertions(+), 1 deletion(-)