diff mbox series

[v4,03/10] fsnotify: send event to parent and child with single callback

Message ID 20200702125744.10535-4-amir73il@gmail.com (mailing list archive)
State New, archived
Headers show
Series fanotify events with name info | expand

Commit Message

Amir Goldstein July 2, 2020, 12:57 p.m. UTC
Instead of calling fsnotify() twice, once with parent inode and once
with child inode, if event should be sent to parent inode, send it
with both parent and child inodes marks in object type iterator and call
the backend handle_event() callback only once.

The parent inode is assigned to the standard "inode" iterator type and
the child inode is assigned to the special "child" iterator type.

In that case, the bit FS_EVENT_ON_CHILD will be set in the event mask,
the dir argment to handle_event will be the parent inode, the file_name
argument to handle_event is non NULL and refers to the name of the child
and the child inode can be accessed with fsnotify_data_inode().

This will allow fanotify to make decisions based on child or parent's
ignored mask.  For example, when a parent is interested in a specific
event on its children, but a specific child wishes to ignore this event,
the event will not be reported.  This is not what happens with current
code, but according to man page, it is the expected behavior.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/kernfs/file.c     | 10 ++++++----
 fs/notify/fsnotify.c | 40 ++++++++++++++++++++++++++--------------
 2 files changed, 32 insertions(+), 18 deletions(-)

Comments

Jan Kara July 14, 2020, 10:34 a.m. UTC | #1
On Thu 02-07-20 15:57:37, Amir Goldstein wrote:
> Instead of calling fsnotify() twice, once with parent inode and once
> with child inode, if event should be sent to parent inode, send it
> with both parent and child inodes marks in object type iterator and call
> the backend handle_event() callback only once.
> 
> The parent inode is assigned to the standard "inode" iterator type and
> the child inode is assigned to the special "child" iterator type.
> 
> In that case, the bit FS_EVENT_ON_CHILD will be set in the event mask,
> the dir argment to handle_event will be the parent inode, the file_name
> argument to handle_event is non NULL and refers to the name of the child
> and the child inode can be accessed with fsnotify_data_inode().
> 
> This will allow fanotify to make decisions based on child or parent's
> ignored mask.  For example, when a parent is interested in a specific
> event on its children, but a specific child wishes to ignore this event,
> the event will not be reported.  This is not what happens with current
> code, but according to man page, it is the expected behavior.
> 
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>

I like the direction where this is going. But can't we push it even a bit
further? I like the fact that we now have "one fs event" -> "one fsnotify()
call". Ideally I'd like to get rid of FS_EVENT_ON_CHILD in the event mask
because it's purpose seems very weak now and it complicates code (and now
it became even a bit of a misnomer) - intuitively, ->handle_event is now
passed sb, mnt, parent, child so it should have all the info to decide
where the event should be reported and I don't see a need for
FS_EVENT_ON_CHILD flag. With fsnotify() call itself we still use
FS_EVENT_ON_CHILD to determine what the arguments mean but can't we just
mandate that 'data' always points to the child, 'to_tell' is always the
parent dir if watching or NULL (and I'd rename that argument to 'dir' and
maybe move it after 'data_type' argument). What do you think?

Some further comments about the current implementation are below...

> ---
>  fs/kernfs/file.c     | 10 ++++++----
>  fs/notify/fsnotify.c | 40 ++++++++++++++++++++++++++--------------
>  2 files changed, 32 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
> index e23b3f62483c..5b1468bc509e 100644
> --- a/fs/kernfs/file.c
> +++ b/fs/kernfs/file.c
> @@ -883,6 +883,7 @@ static void kernfs_notify_workfn(struct work_struct *work)
>  
>  	list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
>  		struct kernfs_node *parent;
> +		struct inode *p_inode = NULL;
>  		struct inode *inode;
>  		struct qstr name;
>  
> @@ -899,8 +900,6 @@ static void kernfs_notify_workfn(struct work_struct *work)
>  		name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name));
>  		parent = kernfs_get_parent(kn);
>  		if (parent) {
> -			struct inode *p_inode;
> -
>  			p_inode = ilookup(info->sb, kernfs_ino(parent));
>  			if (p_inode) {
>  				fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD,
> @@ -911,8 +910,11 @@ static void kernfs_notify_workfn(struct work_struct *work)
>  			kernfs_put(parent);
>  		}
>  
> -		fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
> -			 NULL, 0);
> +		if (!p_inode) {
> +			fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
> +				 NULL, 0);
> +		}
> +
>  		iput(inode);
>  	}
>  
> diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> index 51ada3cfd2ff..7c6e624b24c9 100644
> --- a/fs/notify/fsnotify.c
> +++ b/fs/notify/fsnotify.c
> @@ -145,15 +145,17 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
>  /*
>   * Notify this dentry's parent about a child's events with child name info
>   * if parent is watching.
> - * Notify also the child without name info if child inode is watching.
> + * Notify only the child without name info if parent is not watching.
>   */
>  int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
>  		      int data_type)
>  {
> +	struct inode *inode = d_inode(dentry);
>  	struct dentry *parent;
>  	struct inode *p_inode;
>  	int ret = 0;
>  
> +	parent = NULL;
>  	if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
>  		goto notify_child;
>  
> @@ -165,23 +167,23 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
>  	} else if (p_inode->i_fsnotify_mask & mask & ALL_FSNOTIFY_EVENTS) {
>  		struct name_snapshot name;
>  
> -		/*
> -		 * We are notifying a parent, so set a flag in mask to inform
> -		 * backend that event has information about a child entry.
> -		 */
> +		/* When notifying parent, child should be passed as data */
> +		WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type));
> +
> +		/* Notify both parent and child with child name info */
>  		take_dentry_name_snapshot(&name, dentry);
>  		ret = fsnotify(p_inode, mask | FS_EVENT_ON_CHILD, data,
>  			       data_type, &name.name, 0);
>  		release_dentry_name_snapshot(&name);
> +	} else {
> +notify_child:
> +		/* Notify child without child name info */
> +		ret = fsnotify(inode, mask, data, data_type, NULL, 0);
>  	}

AFAICT this will miss notifying the child if the condition
	!fsnotify_inode_watches_children(p_inode)
above is true... And I've noticed this because jumping into a branch in an
if block is usually a bad idea and so I gave it a closer look. Exactly
because of problems like this. Usually it's better to restructure
conditions instead.

In this case I think we could structure the code like:
	struct name_snapshot name
	struct qstr *namestr = NULL;

	if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
		goto notify;
	parent = dget_parent(dentry);
	p_inode = parent->d_inode;

	if (unlikely(!fsnotify_inode_watches_children(p_inode))) {
		__fsnotify_update_child_dentry_flags(p_inode);
	} else if (p_inode->i_fsnotify_mask & mask & ALL_FSNOTIFY_EVENTS) {
		take_dentry_name_snapshot(&name, dentry);
		namestr = &name.name;
		mask |= FS_EVENT_ON_CHILD;
	}
notify:
	ret = fsnotify(p_inode, mask, data, data_type, namestr, 0);
	if (namestr)
		release_dentry_name_snapshot(&name);
	dput(parent);
	return ret;
>  
>  	dput(parent);
>  
> -	if (ret)
> -		return ret;
> -
> -notify_child:
> -	return fsnotify(d_inode(dentry), mask, data, data_type, NULL, 0);
> +	return ret;
>  }
>  EXPORT_SYMBOL_GPL(__fsnotify_parent);
>  
> @@ -322,12 +324,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_type,
>  	struct super_block *sb = to_tell->i_sb;
>  	struct inode *dir = S_ISDIR(to_tell->i_mode) ? to_tell : NULL;
>  	struct mount *mnt = NULL;
> +	struct inode *child = NULL;
>  	int ret = 0;
>  	__u32 test_mask, marks_mask;
>  
>  	if (path)
>  		mnt = real_mount(path->mnt);
>  
> +	if (mask & FS_EVENT_ON_CHILD)
> +		child = fsnotify_data_inode(data, data_type);
> +
>  	/*
>  	 * Optimization: srcu_read_lock() has a memory barrier which can
>  	 * be expensive.  It protects walking the *_fsnotify_marks lists.
> @@ -336,21 +342,23 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_type,
>  	 * need SRCU to keep them "alive".
>  	 */
>  	if (!to_tell->i_fsnotify_marks && !sb->s_fsnotify_marks &&
> -	    (!mnt || !mnt->mnt_fsnotify_marks))
> +	    (!mnt || !mnt->mnt_fsnotify_marks) &&
> +	    (!child || !child->i_fsnotify_marks))
>  		return 0;
>  
>  	/* An event "on child" is not intended for a mount/sb mark */
>  	marks_mask = to_tell->i_fsnotify_mask;
> -	if (!(mask & FS_EVENT_ON_CHILD)) {
> +	if (!child) {
>  		marks_mask |= sb->s_fsnotify_mask;
>  		if (mnt)
>  			marks_mask |= mnt->mnt_fsnotify_mask;
> +	} else {
> +		marks_mask |= child->i_fsnotify_mask;
>  	}

I don't think this is correct. The FS_EVENT_ON_CHILD events should
also be reported to sb & mnt marks because we now generate only
FS_EVENT_ON_CHILD if parent is watching but that doesn't mean e.g. sb
shouldn't receive the event. Am I missing something? If I'm indeed right,
maybe we should extend our LTP coverage a bit to catch breakage like
this...

>  	/*
>  	 * if this is a modify event we may need to clear the ignored masks
> -	 * otherwise return if neither the inode nor the vfsmount/sb care about
> -	 * this type of event.
> +	 * otherwise return if none of the marks care about this type of event.
>  	 */
>  	test_mask = (mask & ALL_FSNOTIFY_EVENTS);
>  	if (!(mask & FS_MODIFY) && !(test_mask & marks_mask))
> @@ -366,6 +374,10 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_type,
>  		iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] =
>  			fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
>  	}
> +	if (child) {
> +		iter_info.marks[FSNOTIFY_OBJ_TYPE_CHILD] =
> +			fsnotify_first_mark(&child->i_fsnotify_marks);
> +	}
>  
>  	/*
>  	 * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark
> -- 
> 2.17.1
>
Amir Goldstein July 14, 2020, 11:54 a.m. UTC | #2
On Tue, Jul 14, 2020 at 1:34 PM Jan Kara <jack@suse.cz> wrote:
>
> On Thu 02-07-20 15:57:37, Amir Goldstein wrote:
> > Instead of calling fsnotify() twice, once with parent inode and once
> > with child inode, if event should be sent to parent inode, send it
> > with both parent and child inodes marks in object type iterator and call
> > the backend handle_event() callback only once.
> >
> > The parent inode is assigned to the standard "inode" iterator type and
> > the child inode is assigned to the special "child" iterator type.
> >
> > In that case, the bit FS_EVENT_ON_CHILD will be set in the event mask,
> > the dir argment to handle_event will be the parent inode, the file_name
> > argument to handle_event is non NULL and refers to the name of the child
> > and the child inode can be accessed with fsnotify_data_inode().
> >
> > This will allow fanotify to make decisions based on child or parent's
> > ignored mask.  For example, when a parent is interested in a specific
> > event on its children, but a specific child wishes to ignore this event,
> > the event will not be reported.  This is not what happens with current
> > code, but according to man page, it is the expected behavior.
> >
> > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
>
> I like the direction where this is going. But can't we push it even a bit
> further? I like the fact that we now have "one fs event" -> "one fsnotify()
> call". Ideally I'd like to get rid of FS_EVENT_ON_CHILD in the event mask
> because it's purpose seems very weak now and it complicates code (and now

Can you give an example where it complicates the code?
Don't confuse this with the code in fanotify_user.c that subscribes for events
on child/with name.

In one before the last patch of the series I am testing FS_EVENT_ON_CHILD
in mask to know how to report the event inside fanotify_alloc_event().
I may be able to carry this information not in mask, but the flag space is
already taken anyway by FAN_EVENT_ON_CHILD input arg, so not sure
what is there to gain from not setting FS_EVENT_ON_CHILD.

> it became even a bit of a misnomer) - intuitively, ->handle_event is now

I thought of changing the name to FS_EVENT_WITH_NAME, but that was
confusing because create/delete are also events with a name.
Maybe FS_EVENT_WITH_CHILD_NAME :-/

> passed sb, mnt, parent, child so it should have all the info to decide
> where the event should be reported and I don't see a need for
> FS_EVENT_ON_CHILD flag.

Do you mean something like this?

        const struct path *inode = fsnotify_data_inode(data, data_type);
        bool event_on_child = !!file_name && dir != inode;

It may be true that all information is there, but I think the above is
a bit ugly and quite not trivial to explain, whereas the flag is quite
intuitive (to me) and adds no extra complexity (IMO).

> With fsnotify() call itself we still use
> FS_EVENT_ON_CHILD to determine what the arguments mean but can't we just
> mandate that 'data' always points to the child, 'to_tell' is always the
> parent dir if watching or NULL (and I'd rename that argument to 'dir' and
> maybe move it after 'data_type' argument). What do you think?
>

I think what you are missing is the calls from fsnotify_name().
For those calls, data is the dir and so is to_tell and name is the entry name.

> Some further comments about the current implementation are below...
>
> > ---
> >  fs/kernfs/file.c     | 10 ++++++----
> >  fs/notify/fsnotify.c | 40 ++++++++++++++++++++++++++--------------
> >  2 files changed, 32 insertions(+), 18 deletions(-)
> >
> > diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
> > index e23b3f62483c..5b1468bc509e 100644
> > --- a/fs/kernfs/file.c
> > +++ b/fs/kernfs/file.c
> > @@ -883,6 +883,7 @@ static void kernfs_notify_workfn(struct work_struct *work)
> >
> >       list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
> >               struct kernfs_node *parent;
> > +             struct inode *p_inode = NULL;
> >               struct inode *inode;
> >               struct qstr name;
> >
> > @@ -899,8 +900,6 @@ static void kernfs_notify_workfn(struct work_struct *work)
> >               name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name));
> >               parent = kernfs_get_parent(kn);
> >               if (parent) {
> > -                     struct inode *p_inode;
> > -
> >                       p_inode = ilookup(info->sb, kernfs_ino(parent));
> >                       if (p_inode) {
> >                               fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD,
> > @@ -911,8 +910,11 @@ static void kernfs_notify_workfn(struct work_struct *work)
> >                       kernfs_put(parent);
> >               }
> >
> > -             fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
> > -                      NULL, 0);
> > +             if (!p_inode) {
> > +                     fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
> > +                              NULL, 0);
> > +             }
> > +
> >               iput(inode);
> >       }
> >
> > diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> > index 51ada3cfd2ff..7c6e624b24c9 100644
> > --- a/fs/notify/fsnotify.c
> > +++ b/fs/notify/fsnotify.c
> > @@ -145,15 +145,17 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
> >  /*
> >   * Notify this dentry's parent about a child's events with child name info
> >   * if parent is watching.
> > - * Notify also the child without name info if child inode is watching.
> > + * Notify only the child without name info if parent is not watching.
> >   */
> >  int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
> >                     int data_type)
> >  {
> > +     struct inode *inode = d_inode(dentry);
> >       struct dentry *parent;
> >       struct inode *p_inode;
> >       int ret = 0;
> >
> > +     parent = NULL;
> >       if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
> >               goto notify_child;
> >
> > @@ -165,23 +167,23 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
> >       } else if (p_inode->i_fsnotify_mask & mask & ALL_FSNOTIFY_EVENTS) {
> >               struct name_snapshot name;
> >
> > -             /*
> > -              * We are notifying a parent, so set a flag in mask to inform
> > -              * backend that event has information about a child entry.
> > -              */
> > +             /* When notifying parent, child should be passed as data */
> > +             WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type));
> > +
> > +             /* Notify both parent and child with child name info */
> >               take_dentry_name_snapshot(&name, dentry);
> >               ret = fsnotify(p_inode, mask | FS_EVENT_ON_CHILD, data,
> >                              data_type, &name.name, 0);
> >               release_dentry_name_snapshot(&name);
> > +     } else {
> > +notify_child:
> > +             /* Notify child without child name info */
> > +             ret = fsnotify(inode, mask, data, data_type, NULL, 0);
> >       }
>
> AFAICT this will miss notifying the child if the condition
>         !fsnotify_inode_watches_children(p_inode)
> above is true... And I've noticed this because jumping into a branch in an
> if block is usually a bad idea and so I gave it a closer look. Exactly
> because of problems like this. Usually it's better to restructure
> conditions instead.
>
> In this case I think we could structure the code like:
>         struct name_snapshot name
>         struct qstr *namestr = NULL;
>
>         if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
>                 goto notify;
>         parent = dget_parent(dentry);
>         p_inode = parent->d_inode;
>
>         if (unlikely(!fsnotify_inode_watches_children(p_inode))) {
>                 __fsnotify_update_child_dentry_flags(p_inode);
>         } else if (p_inode->i_fsnotify_mask & mask & ALL_FSNOTIFY_EVENTS) {
>                 take_dentry_name_snapshot(&name, dentry);
>                 namestr = &name.name;
>                 mask |= FS_EVENT_ON_CHILD;
>         }
> notify:
>         ret = fsnotify(p_inode, mask, data, data_type, namestr, 0);
>         if (namestr)
>                 release_dentry_name_snapshot(&name);
>         dput(parent);
>         return ret;

I will look into this proposal, but please be aware that this function
completely changes
in the next patch "send event with parent/name info to sb/mount/non-dir marks",
so some of the things that look weird here or possibly even bugs might go away.
That is not to say that I won't fix them, but please review with the
next patch in mind
when considering reconstruct.

> >
> >       dput(parent);
> >
> > -     if (ret)
> > -             return ret;
> > -
> > -notify_child:
> > -     return fsnotify(d_inode(dentry), mask, data, data_type, NULL, 0);
> > +     return ret;
> >  }
> >  EXPORT_SYMBOL_GPL(__fsnotify_parent);
> >
> > @@ -322,12 +324,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_type,
> >       struct super_block *sb = to_tell->i_sb;
> >       struct inode *dir = S_ISDIR(to_tell->i_mode) ? to_tell : NULL;
> >       struct mount *mnt = NULL;
> > +     struct inode *child = NULL;
> >       int ret = 0;
> >       __u32 test_mask, marks_mask;
> >
> >       if (path)
> >               mnt = real_mount(path->mnt);
> >
> > +     if (mask & FS_EVENT_ON_CHILD)
> > +             child = fsnotify_data_inode(data, data_type);
> > +
> >       /*
> >        * Optimization: srcu_read_lock() has a memory barrier which can
> >        * be expensive.  It protects walking the *_fsnotify_marks lists.
> > @@ -336,21 +342,23 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_type,
> >        * need SRCU to keep them "alive".
> >        */
> >       if (!to_tell->i_fsnotify_marks && !sb->s_fsnotify_marks &&
> > -         (!mnt || !mnt->mnt_fsnotify_marks))
> > +         (!mnt || !mnt->mnt_fsnotify_marks) &&
> > +         (!child || !child->i_fsnotify_marks))
> >               return 0;
> >
> >       /* An event "on child" is not intended for a mount/sb mark */
> >       marks_mask = to_tell->i_fsnotify_mask;
> > -     if (!(mask & FS_EVENT_ON_CHILD)) {
> > +     if (!child) {
> >               marks_mask |= sb->s_fsnotify_mask;
> >               if (mnt)
> >                       marks_mask |= mnt->mnt_fsnotify_mask;
> > +     } else {
> > +             marks_mask |= child->i_fsnotify_mask;
> >       }
>
> I don't think this is correct. The FS_EVENT_ON_CHILD events should
> also be reported to sb & mnt marks because we now generate only
> FS_EVENT_ON_CHILD if parent is watching but that doesn't mean e.g. sb
> shouldn't receive the event. Am I missing something? If I'm indeed right,
> maybe we should extend our LTP coverage a bit to catch breakage like
> this...
>

You are right.
I will see if the bug is interim or stays with the next patch and will check
if LTP coverage covers this mid series breakage.

Thanks,
Amir.
Jan Kara July 15, 2020, 5:09 p.m. UTC | #3
On Tue 14-07-20 14:54:44, Amir Goldstein wrote:
> On Tue, Jul 14, 2020 at 1:34 PM Jan Kara <jack@suse.cz> wrote:
> >
> > On Thu 02-07-20 15:57:37, Amir Goldstein wrote:
> > > Instead of calling fsnotify() twice, once with parent inode and once
> > > with child inode, if event should be sent to parent inode, send it
> > > with both parent and child inodes marks in object type iterator and call
> > > the backend handle_event() callback only once.
> > >
> > > The parent inode is assigned to the standard "inode" iterator type and
> > > the child inode is assigned to the special "child" iterator type.
> > >
> > > In that case, the bit FS_EVENT_ON_CHILD will be set in the event mask,
> > > the dir argment to handle_event will be the parent inode, the file_name
> > > argument to handle_event is non NULL and refers to the name of the child
> > > and the child inode can be accessed with fsnotify_data_inode().
> > >
> > > This will allow fanotify to make decisions based on child or parent's
> > > ignored mask.  For example, when a parent is interested in a specific
> > > event on its children, but a specific child wishes to ignore this event,
> > > the event will not be reported.  This is not what happens with current
> > > code, but according to man page, it is the expected behavior.
> > >
> > > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> >
> > I like the direction where this is going. But can't we push it even a bit
> > further? I like the fact that we now have "one fs event" -> "one fsnotify()
> > call". Ideally I'd like to get rid of FS_EVENT_ON_CHILD in the event mask
> > because it's purpose seems very weak now and it complicates code (and now
> 
> Can you give an example where it complicates the code?
> Don't confuse this with the code in fanotify_user.c that subscribes for
> events on child/with name.

I refer mostly to the stuff like:

        /* An event "on child" is not intended for a mount/sb mark */
        if (mask & FS_EVENT_ON_CHILD)
 		...

They are not big complications. But it would be nice to get rid of special
cases like this. Basically my thinking was like: Now that we generate each
event exactly once (i.e., no event duplication once with FS_EVENT_ON_CHILD
and once without it), we should just be able to deliver all events to sb,
mnt, parent, child and they'll just ignore it if they don't care. No
special cases needed. But I understand I'm omitting a lot of detail in this
highlevel "feeling" and these details may make this impractical.

> In one before the last patch of the series I am testing FS_EVENT_ON_CHILD
> in mask to know how to report the event inside fanotify_alloc_event().
> I may be able to carry this information not in mask, but the flag space is
> already taken anyway by FAN_EVENT_ON_CHILD input arg, so not sure
> what is there to gain from not setting FS_EVENT_ON_CHILD.
> 
> > it became even a bit of a misnomer) - intuitively, ->handle_event is now
> 
> I thought of changing the name to FS_EVENT_WITH_NAME, but that was
> confusing because create/delete are also events with a name.
> Maybe FS_EVENT_WITH_CHILD_NAME :-/

Yeah, FS_EVENT_WITH_CHILD_NAME would describe the use better now but then
the aliasing with FAN_EVENT_ON_CHILD will be confusing as well. So if we
keep passing the flag, I guess keeping the name is the least confusing.

> > passed sb, mnt, parent, child so it should have all the info to decide
> > where the event should be reported and I don't see a need for
> > FS_EVENT_ON_CHILD flag.
> 
> Do you mean something like this?
> 
>         const struct path *inode = fsnotify_data_inode(data, data_type);
>         bool event_on_child = !!file_name && dir != inode;

Not quite. E.g. in fanotify_group_event_mask() we could replace the
FS_EVENT_ON_CHILD usage with something like:

	/* If parent isn't interested in events on child, skip adding its mask */
	if (type == FSNOTIFY_OBJ_TYPE_INODE &&
	    !(mark->mask & FS_EVENT_ON_CHILD))
		continue;

And AFAIU this should do just what we need if we always fill in the
TYPE_CHILD field and TYPE_INODE only if we need the parent information
(either for reporting to parent or for parent info in the event).

> It may be true that all information is there, but I think the above is
> a bit ugly and quite not trivial to explain, whereas the flag is quite
> intuitive (to me) and adds no extra complexity (IMO).
> 
> > With fsnotify() call itself we still use
> > FS_EVENT_ON_CHILD to determine what the arguments mean but can't we just
> > mandate that 'data' always points to the child, 'to_tell' is always the
> > parent dir if watching or NULL (and I'd rename that argument to 'dir' and
> > maybe move it after 'data_type' argument). What do you think?
> >
> 
> I think what you are missing is the calls from fsnotify_name().
> For those calls, data is the dir and so is to_tell and name is the entry name.

Really? I can see in the final version:

static inline void fsnotify_name(struct inode *dir, __u32 mask,
				 struct inode *child,
				 const struct qstr *name, u32 cookie)
{
	fsnotify(dir, mask, child, FSNOTIFY_EVENT_INODE, name, cookie);
}

so it appears 'to_tell' is dir and data is the child inode...

> > > diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> > > index 51ada3cfd2ff..7c6e624b24c9 100644
> > > --- a/fs/notify/fsnotify.c
> > > +++ b/fs/notify/fsnotify.c
> > > @@ -145,15 +145,17 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
> > >  /*
> > >   * Notify this dentry's parent about a child's events with child name info
> > >   * if parent is watching.
> > > - * Notify also the child without name info if child inode is watching.
> > > + * Notify only the child without name info if parent is not watching.
> > >   */
> > >  int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
> > >                     int data_type)
> > >  {
> > > +     struct inode *inode = d_inode(dentry);
> > >       struct dentry *parent;
> > >       struct inode *p_inode;
> > >       int ret = 0;
> > >
> > > +     parent = NULL;
> > >       if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
> > >               goto notify_child;
> > >
> > > @@ -165,23 +167,23 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
> > >       } else if (p_inode->i_fsnotify_mask & mask & ALL_FSNOTIFY_EVENTS) {
> > >               struct name_snapshot name;
> > >
> > > -             /*
> > > -              * We are notifying a parent, so set a flag in mask to inform
> > > -              * backend that event has information about a child entry.
> > > -              */
> > > +             /* When notifying parent, child should be passed as data */
> > > +             WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type));
> > > +
> > > +             /* Notify both parent and child with child name info */
> > >               take_dentry_name_snapshot(&name, dentry);
> > >               ret = fsnotify(p_inode, mask | FS_EVENT_ON_CHILD, data,
> > >                              data_type, &name.name, 0);
> > >               release_dentry_name_snapshot(&name);
> > > +     } else {
> > > +notify_child:
> > > +             /* Notify child without child name info */
> > > +             ret = fsnotify(inode, mask, data, data_type, NULL, 0);
> > >       }
> >
> > AFAICT this will miss notifying the child if the condition
> >         !fsnotify_inode_watches_children(p_inode)
> > above is true... And I've noticed this because jumping into a branch in an
> > if block is usually a bad idea and so I gave it a closer look. Exactly
> > because of problems like this. Usually it's better to restructure
> > conditions instead.
> >
> > In this case I think we could structure the code like:
> >         struct name_snapshot name
> >         struct qstr *namestr = NULL;
> >
> >         if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
> >                 goto notify;
> >         parent = dget_parent(dentry);
> >         p_inode = parent->d_inode;
> >
> >         if (unlikely(!fsnotify_inode_watches_children(p_inode))) {
> >                 __fsnotify_update_child_dentry_flags(p_inode);
> >         } else if (p_inode->i_fsnotify_mask & mask & ALL_FSNOTIFY_EVENTS) {
> >                 take_dentry_name_snapshot(&name, dentry);
> >                 namestr = &name.name;
> >                 mask |= FS_EVENT_ON_CHILD;
> >         }
> > notify:
> >         ret = fsnotify(p_inode, mask, data, data_type, namestr, 0);
> >         if (namestr)
> >                 release_dentry_name_snapshot(&name);
> >         dput(parent);
> >         return ret;
> 
> I will look into this proposal, but please be aware that this function
> completely changes in the next patch "send event with parent/name info to
> sb/mount/non-dir marks", so some of the things that look weird here or
> possibly even bugs might go away.  That is not to say that I won't fix
> them, but please review with the next patch in mind when considering
> reconstruct.

Yes, I've then noticed the function changes significantly later and the bug
actually gets silently fixed. So maybe what I proposed here isn't ideal and
the fix should look differently. But my main dislike was the jump into the
if branch which stays till the end AFAICT.

								Honza
Amir Goldstein July 15, 2020, 5:42 p.m. UTC | #4
On Wed, Jul 15, 2020 at 8:09 PM Jan Kara <jack@suse.cz> wrote:
>
> On Tue 14-07-20 14:54:44, Amir Goldstein wrote:
> > On Tue, Jul 14, 2020 at 1:34 PM Jan Kara <jack@suse.cz> wrote:
> > >
> > > On Thu 02-07-20 15:57:37, Amir Goldstein wrote:
> > > > Instead of calling fsnotify() twice, once with parent inode and once
> > > > with child inode, if event should be sent to parent inode, send it
> > > > with both parent and child inodes marks in object type iterator and call
> > > > the backend handle_event() callback only once.
> > > >
> > > > The parent inode is assigned to the standard "inode" iterator type and
> > > > the child inode is assigned to the special "child" iterator type.
> > > >
> > > > In that case, the bit FS_EVENT_ON_CHILD will be set in the event mask,
> > > > the dir argment to handle_event will be the parent inode, the file_name
> > > > argument to handle_event is non NULL and refers to the name of the child
> > > > and the child inode can be accessed with fsnotify_data_inode().
> > > >
> > > > This will allow fanotify to make decisions based on child or parent's
> > > > ignored mask.  For example, when a parent is interested in a specific
> > > > event on its children, but a specific child wishes to ignore this event,
> > > > the event will not be reported.  This is not what happens with current
> > > > code, but according to man page, it is the expected behavior.
> > > >
> > > > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > >
> > > I like the direction where this is going. But can't we push it even a bit
> > > further? I like the fact that we now have "one fs event" -> "one fsnotify()
> > > call". Ideally I'd like to get rid of FS_EVENT_ON_CHILD in the event mask
> > > because it's purpose seems very weak now and it complicates code (and now
> >
> > Can you give an example where it complicates the code?
> > Don't confuse this with the code in fanotify_user.c that subscribes for
> > events on child/with name.
>
> I refer mostly to the stuff like:
>
>         /* An event "on child" is not intended for a mount/sb mark */
>         if (mask & FS_EVENT_ON_CHILD)
>                 ...
>
> They are not big complications. But it would be nice to get rid of special
> cases like this. Basically my thinking was like: Now that we generate each
> event exactly once (i.e., no event duplication once with FS_EVENT_ON_CHILD
> and once without it), we should just be able to deliver all events to sb,
> mnt, parent, child and they'll just ignore it if they don't care. No
> special cases needed. But I understand I'm omitting a lot of detail in this
> highlevel "feeling" and these details may make this impractical.
>

Well, you may be right.
I have the opposite "feeling" but you often prove me wrong.
I will try to look closer at your proposal, but I must say, even
if the flag is redundant information, so what?
You may say the same thing about FS_ISDIR now.
I agree that we should simplify code that doesn't need to use the
flag, but if in the end the need to report the child FID is determined by
this flag, why should we recalculate it if we can conveniently set the
flag in the right case in fsnotify_parent().

If I am not mistaken, the alternative would mean special casing the
DIRENT_EVENTS in the event handler.
Yes, we already do special case them, but in some places special
casing them can be avoided by consulting the FS_EVENT_ON_CHILD flag.

> > In one before the last patch of the series I am testing FS_EVENT_ON_CHILD
> > in mask to know how to report the event inside fanotify_alloc_event().
> > I may be able to carry this information not in mask, but the flag space is
> > already taken anyway by FAN_EVENT_ON_CHILD input arg, so not sure
> > what is there to gain from not setting FS_EVENT_ON_CHILD.
> >
> > > it became even a bit of a misnomer) - intuitively, ->handle_event is now
> >
> > I thought of changing the name to FS_EVENT_WITH_NAME, but that was
> > confusing because create/delete are also events with a name.
> > Maybe FS_EVENT_WITH_CHILD_NAME :-/
>
> Yeah, FS_EVENT_WITH_CHILD_NAME would describe the use better now but then
> the aliasing with FAN_EVENT_ON_CHILD will be confusing as well. So if we
> keep passing the flag, I guess keeping the name is the least confusing.
>
> > > passed sb, mnt, parent, child so it should have all the info to decide
> > > where the event should be reported and I don't see a need for
> > > FS_EVENT_ON_CHILD flag.
> >
> > Do you mean something like this?
> >
> >         const struct path *inode = fsnotify_data_inode(data, data_type);
> >         bool event_on_child = !!file_name && dir != inode;
>
> Not quite. E.g. in fanotify_group_event_mask() we could replace the
> FS_EVENT_ON_CHILD usage with something like:
>
>         /* If parent isn't interested in events on child, skip adding its mask */
>         if (type == FSNOTIFY_OBJ_TYPE_INODE &&
>             !(mark->mask & FS_EVENT_ON_CHILD))
>                 continue;
>
> And AFAIU this should do just what we need if we always fill in the
> TYPE_CHILD field and TYPE_INODE only if we need the parent information
> (either for reporting to parent or for parent info in the event).
>

Yes, this specific condition could be simplified like you wrote.
It is a relic from before I unified the two event flavors and I forgot to
change it.

> > It may be true that all information is there, but I think the above is
> > a bit ugly and quite not trivial to explain, whereas the flag is quite
> > intuitive (to me) and adds no extra complexity (IMO).
> >
> > > With fsnotify() call itself we still use
> > > FS_EVENT_ON_CHILD to determine what the arguments mean but can't we just
> > > mandate that 'data' always points to the child, 'to_tell' is always the
> > > parent dir if watching or NULL (and I'd rename that argument to 'dir' and
> > > maybe move it after 'data_type' argument). What do you think?
> > >
> >
> > I think what you are missing is the calls from fsnotify_name().
> > For those calls, data is the dir and so is to_tell and name is the entry name.
>
> Really? I can see in the final version:
>
> static inline void fsnotify_name(struct inode *dir, __u32 mask,
>                                  struct inode *child,
>                                  const struct qstr *name, u32 cookie)
> {
>         fsnotify(dir, mask, child, FSNOTIFY_EVENT_INODE, name, cookie);
> }
>
> so it appears 'to_tell' is dir and data is the child inode...
>

Yes. right again.

> > > > diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> > > > index 51ada3cfd2ff..7c6e624b24c9 100644
> > > > --- a/fs/notify/fsnotify.c
> > > > +++ b/fs/notify/fsnotify.c
> > > > @@ -145,15 +145,17 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
> > > >  /*
> > > >   * Notify this dentry's parent about a child's events with child name info
> > > >   * if parent is watching.
> > > > - * Notify also the child without name info if child inode is watching.
> > > > + * Notify only the child without name info if parent is not watching.
> > > >   */
> > > >  int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
> > > >                     int data_type)
> > > >  {
> > > > +     struct inode *inode = d_inode(dentry);
> > > >       struct dentry *parent;
> > > >       struct inode *p_inode;
> > > >       int ret = 0;
> > > >
> > > > +     parent = NULL;
> > > >       if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
> > > >               goto notify_child;
> > > >
> > > > @@ -165,23 +167,23 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
> > > >       } else if (p_inode->i_fsnotify_mask & mask & ALL_FSNOTIFY_EVENTS) {
> > > >               struct name_snapshot name;
> > > >
> > > > -             /*
> > > > -              * We are notifying a parent, so set a flag in mask to inform
> > > > -              * backend that event has information about a child entry.
> > > > -              */
> > > > +             /* When notifying parent, child should be passed as data */
> > > > +             WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type));
> > > > +
> > > > +             /* Notify both parent and child with child name info */
> > > >               take_dentry_name_snapshot(&name, dentry);
> > > >               ret = fsnotify(p_inode, mask | FS_EVENT_ON_CHILD, data,
> > > >                              data_type, &name.name, 0);
> > > >               release_dentry_name_snapshot(&name);
> > > > +     } else {
> > > > +notify_child:
> > > > +             /* Notify child without child name info */
> > > > +             ret = fsnotify(inode, mask, data, data_type, NULL, 0);
> > > >       }
> > >
> > > AFAICT this will miss notifying the child if the condition
> > >         !fsnotify_inode_watches_children(p_inode)
> > > above is true... And I've noticed this because jumping into a branch in an
> > > if block is usually a bad idea and so I gave it a closer look. Exactly
> > > because of problems like this. Usually it's better to restructure
> > > conditions instead.
> > >
> > > In this case I think we could structure the code like:
> > >         struct name_snapshot name
> > >         struct qstr *namestr = NULL;
> > >
> > >         if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
> > >                 goto notify;
> > >         parent = dget_parent(dentry);
> > >         p_inode = parent->d_inode;
> > >
> > >         if (unlikely(!fsnotify_inode_watches_children(p_inode))) {
> > >                 __fsnotify_update_child_dentry_flags(p_inode);
> > >         } else if (p_inode->i_fsnotify_mask & mask & ALL_FSNOTIFY_EVENTS) {
> > >                 take_dentry_name_snapshot(&name, dentry);
> > >                 namestr = &name.name;
> > >                 mask |= FS_EVENT_ON_CHILD;
> > >         }
> > > notify:
> > >         ret = fsnotify(p_inode, mask, data, data_type, namestr, 0);
> > >         if (namestr)
> > >                 release_dentry_name_snapshot(&name);
> > >         dput(parent);
> > >         return ret;
> >
> > I will look into this proposal, but please be aware that this function
> > completely changes in the next patch "send event with parent/name info to
> > sb/mount/non-dir marks", so some of the things that look weird here or
> > possibly even bugs might go away.  That is not to say that I won't fix
> > them, but please review with the next patch in mind when considering
> > reconstruct.
>
> Yes, I've then noticed the function changes significantly later and the bug
> actually gets silently fixed. So maybe what I proposed here isn't ideal and
> the fix should look differently. But my main dislike was the jump into the
> if branch which stays till the end AFAICT.
>

And you were right.
I implemented all re-structure suggestions and pushed to fsnotify_name branch.
Will try to look at reducing the use of the FS_EVENT_ON_CHILD flag.

Thanks,
Amir.
Amir Goldstein July 16, 2020, 6:38 a.m. UTC | #5
On Wed, Jul 15, 2020 at 8:42 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Wed, Jul 15, 2020 at 8:09 PM Jan Kara <jack@suse.cz> wrote:
> >
> > On Tue 14-07-20 14:54:44, Amir Goldstein wrote:
> > > On Tue, Jul 14, 2020 at 1:34 PM Jan Kara <jack@suse.cz> wrote:
> > > >
> > > > On Thu 02-07-20 15:57:37, Amir Goldstein wrote:
> > > > > Instead of calling fsnotify() twice, once with parent inode and once
> > > > > with child inode, if event should be sent to parent inode, send it
> > > > > with both parent and child inodes marks in object type iterator and call
> > > > > the backend handle_event() callback only once.
> > > > >
> > > > > The parent inode is assigned to the standard "inode" iterator type and
> > > > > the child inode is assigned to the special "child" iterator type.
> > > > >
> > > > > In that case, the bit FS_EVENT_ON_CHILD will be set in the event mask,
> > > > > the dir argment to handle_event will be the parent inode, the file_name
> > > > > argument to handle_event is non NULL and refers to the name of the child
> > > > > and the child inode can be accessed with fsnotify_data_inode().
> > > > >
> > > > > This will allow fanotify to make decisions based on child or parent's
> > > > > ignored mask.  For example, when a parent is interested in a specific
> > > > > event on its children, but a specific child wishes to ignore this event,
> > > > > the event will not be reported.  This is not what happens with current
> > > > > code, but according to man page, it is the expected behavior.
> > > > >
> > > > > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > > >
> > > > I like the direction where this is going. But can't we push it even a bit
> > > > further? I like the fact that we now have "one fs event" -> "one fsnotify()
> > > > call". Ideally I'd like to get rid of FS_EVENT_ON_CHILD in the event mask
> > > > because it's purpose seems very weak now and it complicates code (and now
> > >
> > > Can you give an example where it complicates the code?
> > > Don't confuse this with the code in fanotify_user.c that subscribes for
> > > events on child/with name.
> >
> > I refer mostly to the stuff like:
> >
> >         /* An event "on child" is not intended for a mount/sb mark */
> >         if (mask & FS_EVENT_ON_CHILD)
> >                 ...
> >

I need to explain something that was not an obvious decision for me.

When sending the same event on two inodes marks I considered a few options:

1. TYPE_INODE is the mark on the object referred to in data
    TYPE_PARENT is the mark on the parent if event is sent to a watching
                               parent or to sb/mnt/child with parent/name info
2. TYPE_CHILD is the mark on the object referred to in data
    TYPE_INODE is the mark on the fsnotify to_tell inode if not same as data
3. TYPE_INODE is the mark on the fsnotify to_tell inode
    TYPE_CHILD is the mark on the object referred to in data if it is
not to_tell

The first option with TYPE_PARENT  would require changing audit
and dnotify to look at TYPE_PARENT mark in addition to TYPE_INODE
mark, so it adds more friction and I ruled it out.

I think you had option #2 in mind when reading the code, but I went
for option #3.
There is a minor difference between them related to how we deal with the case
that the parent is watching and the case that only the child is watching.

If the parent is not watching (and child/sb/mnt not interested in name) we do
not snapshot the name and do not set the ON_CHILD flag in the mask.
In that case, should we add the child mark as TYPE_INODE or TYPE_CHILD?

I chose TYPE_INODE because this meant I did not have to change audit/dnotify
for that case. I didn't even care to look if they needed to be changed or not,
just wanted to keep things as they were.

Looking now, I see that dnotify would have needed to check TYPE_CHILD to
get FS_ATTRIB event on self.

It looks like audit would not have needed to change because although they set
FS_EVENT_ON_CHILD in mask, none of the events they care about are
"possible on child":
 #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
                        FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT)
#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
                         FS_MOVE_SELF | FS_EVENT_ON_CHILD)

Having written that decision process down made me realize there is a bug in
my unified inotify event handler implementation - it does not clear
FS_EVENT_ON_CHILD when reporting without name.

It is interesting to note that the result of sending FS_OPEN only to a watching
child to inotify_handle_event() is the same for design choices #2 and #3 above.
But the bug fix of clearing FS_EVENT_ON_CHILD when reporting without name
would look different depending on said choice.

Since I had to change inotify handler anyway, I prefer to stick with my choice
and fix inotify handler using goto notify_child which is a bit uglier,
instead of
having to adapt dnotify to choice #2.

> > They are not big complications. But it would be nice to get rid of special
> > cases like this. Basically my thinking was like: Now that we generate each
> > event exactly once (i.e., no event duplication once with FS_EVENT_ON_CHILD
> > and once without it), we should just be able to deliver all events to sb,
> > mnt, parent, child and they'll just ignore it if they don't care. No
> > special cases needed. But I understand I'm omitting a lot of detail in this
> > highlevel "feeling" and these details may make this impractical.
> >
>
[...]
> > > > passed sb, mnt, parent, child so it should have all the info to decide
> > > > where the event should be reported and I don't see a need for
> > > > FS_EVENT_ON_CHILD flag.
> > >
> > > Do you mean something like this?
> > >
> > >         const struct path *inode = fsnotify_data_inode(data, data_type);
> > >         bool event_on_child = !!file_name && dir != inode;
> >
> > Not quite. E.g. in fanotify_group_event_mask() we could replace the
> > FS_EVENT_ON_CHILD usage with something like:
> >
> >         /* If parent isn't interested in events on child, skip adding its mask */
> >         if (type == FSNOTIFY_OBJ_TYPE_INODE &&
> >             !(mark->mask & FS_EVENT_ON_CHILD))
> >                 continue;
> >

That looks wrong. FAN_CREATE does not have the ON_CHILD flag and
should very well be reported to inode mark.
Trying to special case as little as possible the different types of events
(on child/dirent/self) is what drove my choices here, but if we can find
ways to further simplify the code all the better.

Thanks,
Amir.
Amir Goldstein July 16, 2020, 7:39 a.m. UTC | #6
On Thu, Jul 16, 2020 at 9:38 AM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Wed, Jul 15, 2020 at 8:42 PM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Wed, Jul 15, 2020 at 8:09 PM Jan Kara <jack@suse.cz> wrote:
> > >
> > > On Tue 14-07-20 14:54:44, Amir Goldstein wrote:
> > > > On Tue, Jul 14, 2020 at 1:34 PM Jan Kara <jack@suse.cz> wrote:
> > > > >
> > > > > On Thu 02-07-20 15:57:37, Amir Goldstein wrote:
> > > > > > Instead of calling fsnotify() twice, once with parent inode and once
> > > > > > with child inode, if event should be sent to parent inode, send it
> > > > > > with both parent and child inodes marks in object type iterator and call
> > > > > > the backend handle_event() callback only once.
> > > > > >
> > > > > > The parent inode is assigned to the standard "inode" iterator type and
> > > > > > the child inode is assigned to the special "child" iterator type.
> > > > > >
> > > > > > In that case, the bit FS_EVENT_ON_CHILD will be set in the event mask,
> > > > > > the dir argment to handle_event will be the parent inode, the file_name
> > > > > > argument to handle_event is non NULL and refers to the name of the child
> > > > > > and the child inode can be accessed with fsnotify_data_inode().
> > > > > >
> > > > > > This will allow fanotify to make decisions based on child or parent's
> > > > > > ignored mask.  For example, when a parent is interested in a specific
> > > > > > event on its children, but a specific child wishes to ignore this event,
> > > > > > the event will not be reported.  This is not what happens with current
> > > > > > code, but according to man page, it is the expected behavior.
> > > > > >
> > > > > > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > > > >
> > > > > I like the direction where this is going. But can't we push it even a bit
> > > > > further? I like the fact that we now have "one fs event" -> "one fsnotify()
> > > > > call". Ideally I'd like to get rid of FS_EVENT_ON_CHILD in the event mask
> > > > > because it's purpose seems very weak now and it complicates code (and now
> > > >
> > > > Can you give an example where it complicates the code?
> > > > Don't confuse this with the code in fanotify_user.c that subscribes for
> > > > events on child/with name.
> > >
> > > I refer mostly to the stuff like:
> > >
> > >         /* An event "on child" is not intended for a mount/sb mark */
> > >         if (mask & FS_EVENT_ON_CHILD)
> > >                 ...
> > >
>
> I need to explain something that was not an obvious decision for me.
>
> When sending the same event on two inodes marks I considered a few options:
>
> 1. TYPE_INODE is the mark on the object referred to in data
>     TYPE_PARENT is the mark on the parent if event is sent to a watching
>                                parent or to sb/mnt/child with parent/name info
> 2. TYPE_CHILD is the mark on the object referred to in data
>     TYPE_INODE is the mark on the fsnotify to_tell inode if not same as data
> 3. TYPE_INODE is the mark on the fsnotify to_tell inode
>     TYPE_CHILD is the mark on the object referred to in data if it is
> not to_tell
>
> The first option with TYPE_PARENT  would require changing audit
> and dnotify to look at TYPE_PARENT mark in addition to TYPE_INODE
> mark, so it adds more friction and I ruled it out.
>
> I think you had option #2 in mind when reading the code, but I went
> for option #3.
> There is a minor difference between them related to how we deal with the case
> that the parent is watching and the case that only the child is watching.
>
> If the parent is not watching (and child/sb/mnt not interested in name) we do
> not snapshot the name and do not set the ON_CHILD flag in the mask.
> In that case, should we add the child mark as TYPE_INODE or TYPE_CHILD?
>
> I chose TYPE_INODE because this meant I did not have to change audit/dnotify
> for that case. I didn't even care to look if they needed to be changed or not,
> just wanted to keep things as they were.
>
> Looking now, I see that dnotify would have needed to check TYPE_CHILD to
> get FS_ATTRIB event on self.
>
> It looks like audit would not have needed to change because although they set
> FS_EVENT_ON_CHILD in mask, none of the events they care about are
> "possible on child":
>  #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
>                         FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT)
> #define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
>                          FS_MOVE_SELF | FS_EVENT_ON_CHILD)
>
> Having written that decision process down made me realize there is a bug in
> my unified inotify event handler implementation - it does not clear
> FS_EVENT_ON_CHILD when reporting without name.
>
> It is interesting to note that the result of sending FS_OPEN only to a watching
> child to inotify_handle_event() is the same for design choices #2 and #3 above.
> But the bug fix of clearing FS_EVENT_ON_CHILD when reporting without name
> would look different depending on said choice.
>
> Since I had to change inotify handler anyway, I prefer to stick with my choice
> and fix inotify handler using goto notify_child which is a bit uglier,
> instead of
> having to adapt dnotify to choice #2.
>

It turns out it's the other way around.
inotify handler has no bug (FS_EVENT_ON_CHILD is not exposed to the user)
just a confusing comment, so I will fix that.
But dnotify does have a bug - it also needs to be taught about the unified event
so that DN_ATTRIB event can be reported twice on both parent dir and child
subdir if both are watching.
Alas, we have no test coverage for dnotify in LTP...

This means that we could also go with choice #2.
But we can also make that internal change later on, because it does not
impact the logic.

Thanks,
Amir.
Amir Goldstein July 16, 2020, 9:55 a.m. UTC | #7
On Thu, Jul 16, 2020 at 10:39 AM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Thu, Jul 16, 2020 at 9:38 AM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Wed, Jul 15, 2020 at 8:42 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > >
> > > On Wed, Jul 15, 2020 at 8:09 PM Jan Kara <jack@suse.cz> wrote:
> > > >
> > > > On Tue 14-07-20 14:54:44, Amir Goldstein wrote:
> > > > > On Tue, Jul 14, 2020 at 1:34 PM Jan Kara <jack@suse.cz> wrote:
> > > > > >
> > > > > > On Thu 02-07-20 15:57:37, Amir Goldstein wrote:
> > > > > > > Instead of calling fsnotify() twice, once with parent inode and once
> > > > > > > with child inode, if event should be sent to parent inode, send it
> > > > > > > with both parent and child inodes marks in object type iterator and call
> > > > > > > the backend handle_event() callback only once.
> > > > > > >
> > > > > > > The parent inode is assigned to the standard "inode" iterator type and
> > > > > > > the child inode is assigned to the special "child" iterator type.
> > > > > > >
> > > > > > > In that case, the bit FS_EVENT_ON_CHILD will be set in the event mask,
> > > > > > > the dir argment to handle_event will be the parent inode, the file_name
> > > > > > > argument to handle_event is non NULL and refers to the name of the child
> > > > > > > and the child inode can be accessed with fsnotify_data_inode().
> > > > > > >
> > > > > > > This will allow fanotify to make decisions based on child or parent's
> > > > > > > ignored mask.  For example, when a parent is interested in a specific
> > > > > > > event on its children, but a specific child wishes to ignore this event,
> > > > > > > the event will not be reported.  This is not what happens with current
> > > > > > > code, but according to man page, it is the expected behavior.
> > > > > > >
> > > > > > > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > > > > >
> > > > > > I like the direction where this is going. But can't we push it even a bit
> > > > > > further? I like the fact that we now have "one fs event" -> "one fsnotify()
> > > > > > call". Ideally I'd like to get rid of FS_EVENT_ON_CHILD in the event mask
> > > > > > because it's purpose seems very weak now and it complicates code (and now
> > > > >
> > > > > Can you give an example where it complicates the code?
> > > > > Don't confuse this with the code in fanotify_user.c that subscribes for
> > > > > events on child/with name.
> > > >
> > > > I refer mostly to the stuff like:
> > > >
> > > >         /* An event "on child" is not intended for a mount/sb mark */
> > > >         if (mask & FS_EVENT_ON_CHILD)
> > > >                 ...
> > > >
> >
> > I need to explain something that was not an obvious decision for me.
> >
> > When sending the same event on two inodes marks I considered a few options:
> >
> > 1. TYPE_INODE is the mark on the object referred to in data
> >     TYPE_PARENT is the mark on the parent if event is sent to a watching
> >                                parent or to sb/mnt/child with parent/name info
> > 2. TYPE_CHILD is the mark on the object referred to in data
> >     TYPE_INODE is the mark on the fsnotify to_tell inode if not same as data
> > 3. TYPE_INODE is the mark on the fsnotify to_tell inode
> >     TYPE_CHILD is the mark on the object referred to in data if it is
> > not to_tell
> >
> > The first option with TYPE_PARENT  would require changing audit
> > and dnotify to look at TYPE_PARENT mark in addition to TYPE_INODE
> > mark, so it adds more friction and I ruled it out.
> >
> > I think you had option #2 in mind when reading the code, but I went
> > for option #3.
> > There is a minor difference between them related to how we deal with the case
> > that the parent is watching and the case that only the child is watching.
> >
> > If the parent is not watching (and child/sb/mnt not interested in name) we do
> > not snapshot the name and do not set the ON_CHILD flag in the mask.
> > In that case, should we add the child mark as TYPE_INODE or TYPE_CHILD?
> >
> > I chose TYPE_INODE because this meant I did not have to change audit/dnotify
> > for that case. I didn't even care to look if they needed to be changed or not,
> > just wanted to keep things as they were.
> >
> > Looking now, I see that dnotify would have needed to check TYPE_CHILD to
> > get FS_ATTRIB event on self.
> >
> > It looks like audit would not have needed to change because although they set
> > FS_EVENT_ON_CHILD in mask, none of the events they care about are
> > "possible on child":
> >  #define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
> >                         FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT)
> > #define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
> >                          FS_MOVE_SELF | FS_EVENT_ON_CHILD)
> >
> > Having written that decision process down made me realize there is a bug in
> > my unified inotify event handler implementation - it does not clear
> > FS_EVENT_ON_CHILD when reporting without name.
> >
> > It is interesting to note that the result of sending FS_OPEN only to a watching
> > child to inotify_handle_event() is the same for design choices #2 and #3 above.
> > But the bug fix of clearing FS_EVENT_ON_CHILD when reporting without name
> > would look different depending on said choice.
> >
> > Since I had to change inotify handler anyway, I prefer to stick with my choice
> > and fix inotify handler using goto notify_child which is a bit uglier,
> > instead of
> > having to adapt dnotify to choice #2.
> >
>
> It turns out it's the other way around.
> inotify handler has no bug (FS_EVENT_ON_CHILD is not exposed to the user)
> just a confusing comment, so I will fix that.
> But dnotify does have a bug - it also needs to be taught about the unified event
> so that DN_ATTRIB event can be reported twice on both parent dir and child
> subdir if both are watching.
> Alas, we have no test coverage for dnotify in LTP...

FYI I verified this dnotify regression and fix manually after adding
DN_ATTRIB to
tools/testing/selftests/filesystems/dnotify_test.c:

$ cd dir/
$ dnotify_test &
$ cd subdir/
$ dnotify_test &
$ chmod 777 .
Got event on fd=3
Got event on fd=3

I will write an LTP test to cover this and see if we have similar
tests for inotify
and fanotify.

Thanks,
Amir.
diff mbox series

Patch

diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index e23b3f62483c..5b1468bc509e 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -883,6 +883,7 @@  static void kernfs_notify_workfn(struct work_struct *work)
 
 	list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
 		struct kernfs_node *parent;
+		struct inode *p_inode = NULL;
 		struct inode *inode;
 		struct qstr name;
 
@@ -899,8 +900,6 @@  static void kernfs_notify_workfn(struct work_struct *work)
 		name = (struct qstr)QSTR_INIT(kn->name, strlen(kn->name));
 		parent = kernfs_get_parent(kn);
 		if (parent) {
-			struct inode *p_inode;
-
 			p_inode = ilookup(info->sb, kernfs_ino(parent));
 			if (p_inode) {
 				fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD,
@@ -911,8 +910,11 @@  static void kernfs_notify_workfn(struct work_struct *work)
 			kernfs_put(parent);
 		}
 
-		fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
-			 NULL, 0);
+		if (!p_inode) {
+			fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
+				 NULL, 0);
+		}
+
 		iput(inode);
 	}
 
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 51ada3cfd2ff..7c6e624b24c9 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -145,15 +145,17 @@  void __fsnotify_update_child_dentry_flags(struct inode *inode)
 /*
  * Notify this dentry's parent about a child's events with child name info
  * if parent is watching.
- * Notify also the child without name info if child inode is watching.
+ * Notify only the child without name info if parent is not watching.
  */
 int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
 		      int data_type)
 {
+	struct inode *inode = d_inode(dentry);
 	struct dentry *parent;
 	struct inode *p_inode;
 	int ret = 0;
 
+	parent = NULL;
 	if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
 		goto notify_child;
 
@@ -165,23 +167,23 @@  int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
 	} else if (p_inode->i_fsnotify_mask & mask & ALL_FSNOTIFY_EVENTS) {
 		struct name_snapshot name;
 
-		/*
-		 * We are notifying a parent, so set a flag in mask to inform
-		 * backend that event has information about a child entry.
-		 */
+		/* When notifying parent, child should be passed as data */
+		WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type));
+
+		/* Notify both parent and child with child name info */
 		take_dentry_name_snapshot(&name, dentry);
 		ret = fsnotify(p_inode, mask | FS_EVENT_ON_CHILD, data,
 			       data_type, &name.name, 0);
 		release_dentry_name_snapshot(&name);
+	} else {
+notify_child:
+		/* Notify child without child name info */
+		ret = fsnotify(inode, mask, data, data_type, NULL, 0);
 	}
 
 	dput(parent);
 
-	if (ret)
-		return ret;
-
-notify_child:
-	return fsnotify(d_inode(dentry), mask, data, data_type, NULL, 0);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(__fsnotify_parent);
 
@@ -322,12 +324,16 @@  int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_type,
 	struct super_block *sb = to_tell->i_sb;
 	struct inode *dir = S_ISDIR(to_tell->i_mode) ? to_tell : NULL;
 	struct mount *mnt = NULL;
+	struct inode *child = NULL;
 	int ret = 0;
 	__u32 test_mask, marks_mask;
 
 	if (path)
 		mnt = real_mount(path->mnt);
 
+	if (mask & FS_EVENT_ON_CHILD)
+		child = fsnotify_data_inode(data, data_type);
+
 	/*
 	 * Optimization: srcu_read_lock() has a memory barrier which can
 	 * be expensive.  It protects walking the *_fsnotify_marks lists.
@@ -336,21 +342,23 @@  int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_type,
 	 * need SRCU to keep them "alive".
 	 */
 	if (!to_tell->i_fsnotify_marks && !sb->s_fsnotify_marks &&
-	    (!mnt || !mnt->mnt_fsnotify_marks))
+	    (!mnt || !mnt->mnt_fsnotify_marks) &&
+	    (!child || !child->i_fsnotify_marks))
 		return 0;
 
 	/* An event "on child" is not intended for a mount/sb mark */
 	marks_mask = to_tell->i_fsnotify_mask;
-	if (!(mask & FS_EVENT_ON_CHILD)) {
+	if (!child) {
 		marks_mask |= sb->s_fsnotify_mask;
 		if (mnt)
 			marks_mask |= mnt->mnt_fsnotify_mask;
+	} else {
+		marks_mask |= child->i_fsnotify_mask;
 	}
 
 	/*
 	 * if this is a modify event we may need to clear the ignored masks
-	 * otherwise return if neither the inode nor the vfsmount/sb care about
-	 * this type of event.
+	 * otherwise return if none of the marks care about this type of event.
 	 */
 	test_mask = (mask & ALL_FSNOTIFY_EVENTS);
 	if (!(mask & FS_MODIFY) && !(test_mask & marks_mask))
@@ -366,6 +374,10 @@  int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_type,
 		iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] =
 			fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
 	}
+	if (child) {
+		iter_info.marks[FSNOTIFY_OBJ_TYPE_CHILD] =
+			fsnotify_first_mark(&child->i_fsnotify_marks);
+	}
 
 	/*
 	 * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark