diff mbox series

[v2,13/16] fanotify: implement "evictable" inode marks

Message ID 20220329074904.2980320-14-amir73il@gmail.com (mailing list archive)
State New, archived
Headers show
Series Evictable fanotify marks | expand

Commit Message

Amir Goldstein March 29, 2022, 7:49 a.m. UTC
When an inode mark is created with flag FAN_MARK_EVICTABLE, it will not
pin the marked inode to inode cache, so when inode is evicted from cache
due to memory pressure, the mark will be lost.

When an inode mark with flag FAN_MARK_EVICATBLE is updated without using
this flag, the marked inode is pinned to inode cache.

When an inode mark is updated with flag FAN_MARK_EVICTABLE but an
existing mark already has the inode pinned, the mark update fails with
error EEXIST.

Evictable inode marks can be used to setup inode marks with ignored mask
to suppress events from uninteresting files or directories in a lazy
manner, upon receiving the first event, without having to iterate all
the uninteresting files or directories before hand.

The evictbale inode mark feature allows performing this lazy marks setup
without exhausting the system memory with pinned inodes.

This change does not enable the feature yet.

Link: https://lore.kernel.org/linux-fsdevel/CAOQ4uxiRDpuS=2uA6+ZUM7yG9vVU-u212tkunBmSnP_u=mkv=Q@mail.gmail.com/
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/notify/fanotify/fanotify.h      |  2 ++
 fs/notify/fanotify/fanotify_user.c | 31 +++++++++++++++++++++++++++++-
 include/uapi/linux/fanotify.h      |  1 +
 3 files changed, 33 insertions(+), 1 deletion(-)

Comments

Jan Kara April 11, 2022, 11:47 a.m. UTC | #1
On Tue 29-03-22 10:49:01, Amir Goldstein wrote:
> When an inode mark is created with flag FAN_MARK_EVICTABLE, it will not
> pin the marked inode to inode cache, so when inode is evicted from cache
> due to memory pressure, the mark will be lost.
> 
> When an inode mark with flag FAN_MARK_EVICATBLE is updated without using
> this flag, the marked inode is pinned to inode cache.
> 
> When an inode mark is updated with flag FAN_MARK_EVICTABLE but an
> existing mark already has the inode pinned, the mark update fails with
> error EEXIST.

I was thinking about this. FAN_MARK_EVICTABLE is effectively a hint to the
kernel - you can drop this if you wish. So does it make sense to return
error when we cannot follow the hint? Doesn't this just add unnecessary
work (determining whether the mark should be evictable or not) to the
userspace application using FAN_MARK_EVICTABLE?

I'd also note that FSNOTIFY_MARK_FLAG_NO_IREF needs to be stored only
because of this error checking behavior. Otherwise it would be enough to
have a flag on the connector (whether it holds iref or not) and
fsnotify_add_mark() would update the connector as needed given the added
mark. No need to mess with fsnotify_recalc_mask(). But this would be just
a small simplification. The API question in the above paragraph is more
important to me.

								Honza

> ---
>  fs/notify/fanotify/fanotify.h      |  2 ++
>  fs/notify/fanotify/fanotify_user.c | 31 +++++++++++++++++++++++++++++-
>  include/uapi/linux/fanotify.h      |  1 +
>  3 files changed, 33 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
> index 87142bc0131a..80e0ec95b113 100644
> --- a/fs/notify/fanotify/fanotify.h
> +++ b/fs/notify/fanotify/fanotify.h
> @@ -497,6 +497,8 @@ static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark)
>  
>  	if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
>  		mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
> +	if (mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)
> +		mflags |= FAN_MARK_EVICTABLE;
>  
>  	return mflags;
>  }
> diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
> index 6e78ea12239c..2c65038da4ce 100644
> --- a/fs/notify/fanotify/fanotify_user.c
> +++ b/fs/notify/fanotify/fanotify_user.c
> @@ -1084,6 +1084,8 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
>  static int fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
>  				      unsigned int flags, bool *recalc)
>  {
> +	bool want_iref = !(flags & FAN_MARK_EVICTABLE);
> +
>  	/*
>  	 * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
>  	 * the removal of the FS_MODIFY bit in calculated mask if it was set
> @@ -1097,6 +1099,20 @@ static int fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
>  			*recalc = true;
>  	}
>  
> +	if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE ||
> +	    want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
> +		return 0;
> +
> +	/*
> +	 * NO_IREF may be removed from a mark, but not added.
> +	 * When removed, fsnotify_recalc_mask() will take the inode ref.
> +	 */
> +	if (!want_iref)
> +		return -EEXIST;
> +
> +	fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF;
> +	*recalc = true;
> +
>  	return 0;
>  }
>  
> @@ -1130,6 +1146,7 @@ static int fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
>  static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
>  						   fsnotify_connp_t *connp,
>  						   unsigned int obj_type,
> +						   unsigned int fan_flags,
>  						   __kernel_fsid_t *fsid)
>  {
>  	struct ucounts *ucounts = group->fanotify_data.ucounts;
> @@ -1152,6 +1169,9 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
>  	}
>  
>  	fsnotify_init_mark(mark, group);
> +	if (fan_flags & FAN_MARK_EVICTABLE)
> +		mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;
> +
>  	ret = fsnotify_add_mark_locked(mark, connp, obj_type, fsid);
>  	if (ret) {
>  		fsnotify_put_mark(mark);
> @@ -1187,7 +1207,8 @@ static int fanotify_add_mark(struct fsnotify_group *group,
>  	mutex_lock(&group->mark_mutex);
>  	fsn_mark = fsnotify_find_mark(connp, group);
>  	if (!fsn_mark) {
> -		fsn_mark = fanotify_add_new_mark(group, connp, obj_type, fsid);
> +		fsn_mark = fanotify_add_new_mark(group, connp, obj_type, flags,
> +						 fsid);
>  		if (IS_ERR(fsn_mark)) {
>  			mutex_unlock(&group->mark_mutex);
>  			return PTR_ERR(fsn_mark);
> @@ -1602,6 +1623,14 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
>  	    mark_type != FAN_MARK_FILESYSTEM)
>  		goto fput_and_out;
>  
> +	/*
> +	 * Evictable is only relevant for inode marks, because only inode object
> +	 * can be evicted on memory pressure.
> +	 */
> +	if (flags & FAN_MARK_EVICTABLE &&
> +	     mark_type != FAN_MARK_INODE)
> +		goto fput_and_out;
> +
>  	/*
>  	 * Events that do not carry enough information to report
>  	 * event->fd require a group that supports reporting fid.  Those
> diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
> index e8ac38cc2fd6..f1f89132d60e 100644
> --- a/include/uapi/linux/fanotify.h
> +++ b/include/uapi/linux/fanotify.h
> @@ -82,6 +82,7 @@
>  #define FAN_MARK_IGNORED_SURV_MODIFY	0x00000040
>  #define FAN_MARK_FLUSH		0x00000080
>  /* FAN_MARK_FILESYSTEM is	0x00000100 */
> +#define FAN_MARK_EVICTABLE	0x00000200
>  
>  /* These are NOT bitwise flags.  Both bits can be used togther.  */
>  #define FAN_MARK_INODE		0x00000000
> -- 
> 2.25.1
>
Amir Goldstein April 11, 2022, 12:57 p.m. UTC | #2
On Mon, Apr 11, 2022 at 2:47 PM Jan Kara <jack@suse.cz> wrote:
>
> On Tue 29-03-22 10:49:01, Amir Goldstein wrote:
> > When an inode mark is created with flag FAN_MARK_EVICTABLE, it will not
> > pin the marked inode to inode cache, so when inode is evicted from cache
> > due to memory pressure, the mark will be lost.
> >
> > When an inode mark with flag FAN_MARK_EVICATBLE is updated without using
> > this flag, the marked inode is pinned to inode cache.
> >
> > When an inode mark is updated with flag FAN_MARK_EVICTABLE but an
> > existing mark already has the inode pinned, the mark update fails with
> > error EEXIST.
>
> I was thinking about this. FAN_MARK_EVICTABLE is effectively a hint to the
> kernel - you can drop this if you wish. So does it make sense to return
> error when we cannot follow the hint? Doesn't this just add unnecessary
> work (determining whether the mark should be evictable or not) to the
> userspace application using FAN_MARK_EVICTABLE?

I do not fully agree about your definition of  "hint to the kernel".
Yes, for a single inode it may be a hint, but for a million inodes it is pretty
much a directive that setting a very large number of evictable marks
CANNOT be used to choke the system.

It's true that the application should be able to avoid shooting its own
foot and we do not need to be the ones providing this protection, but
I rather prefer to keep the API more strict and safe than being sorry later.
After all, I don't think this complicates the implementation nor documentation
too much. Is it? see:

https://github.com/amir73il/man-pages/commit/b52eb7d1a8478cbd1456f4d9463902bbc4e80f0d

>
> I'd also note that FSNOTIFY_MARK_FLAG_NO_IREF needs to be stored only
> because of this error checking behavior. Otherwise it would be enough to
> have a flag on the connector (whether it holds iref or not) and
> fsnotify_add_mark() would update the connector as needed given the added

I am not sure I agree to that.
Maybe I am missing something, but the way fsnotify_recalc_mask() works now
is by checking if there is any mark without FSNOTIFY_MARK_FLAG_NO_IREF
attached to the object, so fsnotify_put_mark() knows to drop the inode when the
last non-evictable mark is removed.

Thanks,
Amir.
Jan Kara April 11, 2022, 2:19 p.m. UTC | #3
On Mon 11-04-22 15:57:30, Amir Goldstein wrote:
> On Mon, Apr 11, 2022 at 2:47 PM Jan Kara <jack@suse.cz> wrote:
> >
> > On Tue 29-03-22 10:49:01, Amir Goldstein wrote:
> > > When an inode mark is created with flag FAN_MARK_EVICTABLE, it will not
> > > pin the marked inode to inode cache, so when inode is evicted from cache
> > > due to memory pressure, the mark will be lost.
> > >
> > > When an inode mark with flag FAN_MARK_EVICATBLE is updated without using
> > > this flag, the marked inode is pinned to inode cache.
> > >
> > > When an inode mark is updated with flag FAN_MARK_EVICTABLE but an
> > > existing mark already has the inode pinned, the mark update fails with
> > > error EEXIST.
> >
> > I was thinking about this. FAN_MARK_EVICTABLE is effectively a hint to the
> > kernel - you can drop this if you wish. So does it make sense to return
> > error when we cannot follow the hint? Doesn't this just add unnecessary
> > work (determining whether the mark should be evictable or not) to the
> > userspace application using FAN_MARK_EVICTABLE?
> 
> I do not fully agree about your definition of  "hint to the kernel".
> Yes, for a single inode it may be a hint, but for a million inodes it is pretty
> much a directive that setting a very large number of evictable marks
> CANNOT be used to choke the system.
> 
> It's true that the application should be able to avoid shooting its own
> foot and we do not need to be the ones providing this protection, but
> I rather prefer to keep the API more strict and safe than being sorry later.
> After all, I don't think this complicates the implementation nor documentation
> too much. Is it? see:
> 
> https://github.com/amir73il/man-pages/commit/b52eb7d1a8478cbd1456f4d9463902bbc4e80f0d

No, it is not complicating things too much and you're probably right that
having things stricter now may pay off in the future. I was just thinking
that app adding ignore mark now needs to remember whether it has already
added something else for the inode or not to know whether it can use
FAN_MARK_EVICTABLE. Which seemed like unnecessary complication.

> > I'd also note that FSNOTIFY_MARK_FLAG_NO_IREF needs to be stored only
> > because of this error checking behavior. Otherwise it would be enough to
> > have a flag on the connector (whether it holds iref or not) and
> > fsnotify_add_mark() would update the connector as needed given the added
> 
> I am not sure I agree to that.
> Maybe I am missing something, but the way fsnotify_recalc_mask() works now
> is by checking if there is any mark without FSNOTIFY_MARK_FLAG_NO_IREF
> attached to the object, so fsnotify_put_mark() knows to drop the inode when the
> last non-evictable mark is removed.

Right, I was confused and somehow thought that once connector has iref, it
will drop it only when the mark list gets empty which is obviously not
true.

								Honza
Amir Goldstein April 12, 2022, 8:07 a.m. UTC | #4
On Mon, Apr 11, 2022 at 5:19 PM Jan Kara <jack@suse.cz> wrote:
>
> On Mon 11-04-22 15:57:30, Amir Goldstein wrote:
> > On Mon, Apr 11, 2022 at 2:47 PM Jan Kara <jack@suse.cz> wrote:
> > >
> > > On Tue 29-03-22 10:49:01, Amir Goldstein wrote:
> > > > When an inode mark is created with flag FAN_MARK_EVICTABLE, it will not
> > > > pin the marked inode to inode cache, so when inode is evicted from cache
> > > > due to memory pressure, the mark will be lost.
> > > >
> > > > When an inode mark with flag FAN_MARK_EVICATBLE is updated without using
> > > > this flag, the marked inode is pinned to inode cache.
> > > >
> > > > When an inode mark is updated with flag FAN_MARK_EVICTABLE but an
> > > > existing mark already has the inode pinned, the mark update fails with
> > > > error EEXIST.
> > >
> > > I was thinking about this. FAN_MARK_EVICTABLE is effectively a hint to the
> > > kernel - you can drop this if you wish. So does it make sense to return
> > > error when we cannot follow the hint? Doesn't this just add unnecessary
> > > work (determining whether the mark should be evictable or not) to the
> > > userspace application using FAN_MARK_EVICTABLE?
> >
> > I do not fully agree about your definition of  "hint to the kernel".
> > Yes, for a single inode it may be a hint, but for a million inodes it is pretty
> > much a directive that setting a very large number of evictable marks
> > CANNOT be used to choke the system.
> >
> > It's true that the application should be able to avoid shooting its own
> > foot and we do not need to be the ones providing this protection, but
> > I rather prefer to keep the API more strict and safe than being sorry later.
> > After all, I don't think this complicates the implementation nor documentation
> > too much. Is it? see:
> >
> > https://github.com/amir73il/man-pages/commit/b52eb7d1a8478cbd1456f4d9463902bbc4e80f0d
>
> No, it is not complicating things too much and you're probably right that
> having things stricter now may pay off in the future. I was just thinking
> that app adding ignore mark now needs to remember whether it has already
> added something else for the inode or not to know whether it can use
> FAN_MARK_EVICTABLE. Which seemed like unnecessary complication.
>

Well the way my app does it, it has a hardcoded list of "must exclude" dirs
which is sets on startup and then evictable ignore masks are added lazily
when events in non-interesting path show up.

If app would somehow get an event with path that was in the "must exclude"
set, then adding evictable mark would fail and nothing to it because it is
an optimization anyway. There is no tracking involved.

Anyway, I see there is a bug in EEXIST case, the error is returned *after*
updating the mask - I will fix it and re-post v3 with the rest of the fixes.

Thanks,
Amir.
diff mbox series

Patch

diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 87142bc0131a..80e0ec95b113 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -497,6 +497,8 @@  static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark)
 
 	if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
 		mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
+	if (mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)
+		mflags |= FAN_MARK_EVICTABLE;
 
 	return mflags;
 }
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 6e78ea12239c..2c65038da4ce 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1084,6 +1084,8 @@  static int fanotify_remove_inode_mark(struct fsnotify_group *group,
 static int fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
 				      unsigned int flags, bool *recalc)
 {
+	bool want_iref = !(flags & FAN_MARK_EVICTABLE);
+
 	/*
 	 * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
 	 * the removal of the FS_MODIFY bit in calculated mask if it was set
@@ -1097,6 +1099,20 @@  static int fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
 			*recalc = true;
 	}
 
+	if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE ||
+	    want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
+		return 0;
+
+	/*
+	 * NO_IREF may be removed from a mark, but not added.
+	 * When removed, fsnotify_recalc_mask() will take the inode ref.
+	 */
+	if (!want_iref)
+		return -EEXIST;
+
+	fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF;
+	*recalc = true;
+
 	return 0;
 }
 
@@ -1130,6 +1146,7 @@  static int fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
 						   fsnotify_connp_t *connp,
 						   unsigned int obj_type,
+						   unsigned int fan_flags,
 						   __kernel_fsid_t *fsid)
 {
 	struct ucounts *ucounts = group->fanotify_data.ucounts;
@@ -1152,6 +1169,9 @@  static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
 	}
 
 	fsnotify_init_mark(mark, group);
+	if (fan_flags & FAN_MARK_EVICTABLE)
+		mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;
+
 	ret = fsnotify_add_mark_locked(mark, connp, obj_type, fsid);
 	if (ret) {
 		fsnotify_put_mark(mark);
@@ -1187,7 +1207,8 @@  static int fanotify_add_mark(struct fsnotify_group *group,
 	mutex_lock(&group->mark_mutex);
 	fsn_mark = fsnotify_find_mark(connp, group);
 	if (!fsn_mark) {
-		fsn_mark = fanotify_add_new_mark(group, connp, obj_type, fsid);
+		fsn_mark = fanotify_add_new_mark(group, connp, obj_type, flags,
+						 fsid);
 		if (IS_ERR(fsn_mark)) {
 			mutex_unlock(&group->mark_mutex);
 			return PTR_ERR(fsn_mark);
@@ -1602,6 +1623,14 @@  static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
 	    mark_type != FAN_MARK_FILESYSTEM)
 		goto fput_and_out;
 
+	/*
+	 * Evictable is only relevant for inode marks, because only inode object
+	 * can be evicted on memory pressure.
+	 */
+	if (flags & FAN_MARK_EVICTABLE &&
+	     mark_type != FAN_MARK_INODE)
+		goto fput_and_out;
+
 	/*
 	 * Events that do not carry enough information to report
 	 * event->fd require a group that supports reporting fid.  Those
diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
index e8ac38cc2fd6..f1f89132d60e 100644
--- a/include/uapi/linux/fanotify.h
+++ b/include/uapi/linux/fanotify.h
@@ -82,6 +82,7 @@ 
 #define FAN_MARK_IGNORED_SURV_MODIFY	0x00000040
 #define FAN_MARK_FLUSH		0x00000080
 /* FAN_MARK_FILESYSTEM is	0x00000100 */
+#define FAN_MARK_EVICTABLE	0x00000200
 
 /* These are NOT bitwise flags.  Both bits can be used togther.  */
 #define FAN_MARK_INODE		0x00000000