diff mbox series

[v2] fsnotify: fix unlink performance regression

Message ID 20190505200728.5892-1-amir73il@gmail.com (mailing list archive)
State New, archived
Headers show
Series [v2] fsnotify: fix unlink performance regression | expand

Commit Message

Amir Goldstein May 5, 2019, 8:07 p.m. UTC
__fsnotify_parent() has an optimization in place to avoid unneeded
take_dentry_name_snapshot().  When fsnotify_nameremove() was changed
not to call __fsnotify_parent(), we left out the optimization.
Kernel test robot reported a 5% performance regression in concurrent
unlink() workload.

Modify __fsnotify_parent() so that it can be called also to report
directory modififcation events and use it from fsnotify_nameremove().

Reported-by: kernel test robot <rong.a.chen@intel.com>
Link: https://lore.kernel.org/lkml/20190505062153.GG29809@shao2-debian/
Link: https://lore.kernel.org/linux-fsdevel/20190104090357.GD22409@quack2.suse.cz/
Fixes: 5f02a8776384 ("fsnotify: annotate directory entry modification events")
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---

Jan,

A nicer approach reusing __fsnotify_parent() instead of copying code
from it.

This version does not apply cleanly to Al's for-next branch (there are
some fsnotify changes in work.dcache). The conflict is trivial and
resolved on my fsnotify branch [1].

Thanks,
Amir.

Changes since v1:
- Fix build without CONFIG_FSNOTIFY
- Use __fsnotify_parent() for reporting FS_DELETE

[1] https://github.com/amir73il/linux/commits/fsnotify

 fs/notify/fsnotify.c     | 22 +++++++++++-----------
 include/linux/fsnotify.h | 15 +++------------
 2 files changed, 14 insertions(+), 23 deletions(-)

Comments

Jan Kara May 7, 2019, 4:19 p.m. UTC | #1
On Sun 05-05-19 23:07:28, Amir Goldstein wrote:
> __fsnotify_parent() has an optimization in place to avoid unneeded
> take_dentry_name_snapshot().  When fsnotify_nameremove() was changed
> not to call __fsnotify_parent(), we left out the optimization.
> Kernel test robot reported a 5% performance regression in concurrent
> unlink() workload.
> 
> Modify __fsnotify_parent() so that it can be called also to report
> directory modififcation events and use it from fsnotify_nameremove().
> 
> Reported-by: kernel test robot <rong.a.chen@intel.com>
> Link: https://lore.kernel.org/lkml/20190505062153.GG29809@shao2-debian/
> Link: https://lore.kernel.org/linux-fsdevel/20190104090357.GD22409@quack2.suse.cz/
> Fixes: 5f02a8776384 ("fsnotify: annotate directory entry modification events")
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> ---
> 
> Jan,
> 
> A nicer approach reusing __fsnotify_parent() instead of copying code
> from it.
> 
> This version does not apply cleanly to Al's for-next branch (there are
> some fsnotify changes in work.dcache). The conflict is trivial and
> resolved on my fsnotify branch [1].

Hum, let me check if I understand the situation right. We've changed
fsnotify_nameremove() to not use fsnotify_parent() as we don't want to
report FS_EVENT_ON_CHILD with it anymore. We should use fsnotify_dirent()
as for all other directory event notification handlers but that's
problematic due to different locking context and possible instability of
parent.

Honestly I don't like the patch below much. How we are notifying parent
without sending FS_EVENT_ON_CHILD and modify behavior based on that flag
just looks subtle. So I'd rather move the fsnotify call to vfs_unlink(),
vfs_rmdir(), simple_unlink(), simple_rmdir(), and then those few callers of
d_delete() that remain as you suggest elsewhere in this thread. And then we
get more consistent context for fsnotify_nameremove() and could just use
fsnotify_dirent(). 

								Honza

> diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> index df06f3da166c..265b726d6e8d 100644
> --- a/fs/notify/fsnotify.c
> +++ b/fs/notify/fsnotify.c
> @@ -151,31 +151,31 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
>  	spin_unlock(&inode->i_lock);
>  }
>  
> -/* Notify this dentry's parent about a child's events. */
> +/*
> + * Notify this dentry's parent about an event and make sure that name is stable.
> + * Events "on child" are only reported if parent is watching.
> + * Directory modification events are also reported if super block is watching.
> + */
>  int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask)
>  {
>  	struct dentry *parent;
>  	struct inode *p_inode;
>  	int ret = 0;
> +	bool on_child = (mask & FS_EVENT_ON_CHILD);
> +	__u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
>  
> -	if (!dentry)
> -		dentry = path->dentry;
> -
> -	if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
> +	if (on_child && !(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
>  		return 0;
>  
>  	parent = dget_parent(dentry);
>  	p_inode = parent->d_inode;
>  
> -	if (unlikely(!fsnotify_inode_watches_children(p_inode))) {
> +	if (on_child && unlikely(!fsnotify_inode_watches_children(p_inode))) {
>  		__fsnotify_update_child_dentry_flags(p_inode);
> -	} else if (p_inode->i_fsnotify_mask & mask & ALL_FSNOTIFY_EVENTS) {
> +	} else if ((p_inode->i_fsnotify_mask & test_mask) ||
> +		   (!on_child && (dentry->d_sb->s_fsnotify_mask & test_mask))) {
>  		struct name_snapshot name;
>  
> -		/* we are notifying a parent so come up with the new mask which
> -		 * specifies these are events which came from a child. */
> -		mask |= FS_EVENT_ON_CHILD;
> -
>  		take_dentry_name_snapshot(&name, dentry);
>  		if (path)
>  			ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
> diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
> index 09587e2860b5..8641bf9a1eda 100644
> --- a/include/linux/fsnotify.h
> +++ b/include/linux/fsnotify.h
> @@ -37,7 +37,7 @@ static inline int fsnotify_parent(const struct path *path,
>  	if (!dentry)
>  		dentry = path->dentry;
>  
> -	return __fsnotify_parent(path, dentry, mask);
> +	return __fsnotify_parent(path, dentry, mask | FS_EVENT_ON_CHILD);
>  }
>  
>  /*
> @@ -158,13 +158,11 @@ static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt)
>   * dentry->d_parent should be stable. However there are some corner cases where
>   * inode lock is not held. So to be on the safe side and be reselient to future
>   * callers and out of tree users of d_delete(), we do not assume that d_parent
> - * and d_name are stable and we use dget_parent() and
> + * and d_name are stable and we use __fsnotify_parent() to
>   * take_dentry_name_snapshot() to grab stable references.
>   */
>  static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
>  {
> -	struct dentry *parent;
> -	struct name_snapshot name;
>  	__u32 mask = FS_DELETE;
>  
>  	/* d_delete() of pseudo inode? (e.g. __ns_get_path() playing tricks) */
> @@ -174,14 +172,7 @@ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
>  	if (isdir)
>  		mask |= FS_ISDIR;
>  
> -	parent = dget_parent(dentry);
> -	take_dentry_name_snapshot(&name, dentry);
> -
> -	fsnotify(d_inode(parent), mask, d_inode(dentry), FSNOTIFY_EVENT_INODE,
> -		 name.name, 0);
> -
> -	release_dentry_name_snapshot(&name);
> -	dput(parent);
> +	__fsnotify_parent(NULL, dentry, mask);
>  }
>  
>  /*
> -- 
> 2.17.1
>
Amir Goldstein May 7, 2019, 7:12 p.m. UTC | #2
On Tue, May 7, 2019 at 7:19 PM Jan Kara <jack@suse.cz> wrote:
>
> On Sun 05-05-19 23:07:28, Amir Goldstein wrote:
> > __fsnotify_parent() has an optimization in place to avoid unneeded
> > take_dentry_name_snapshot().  When fsnotify_nameremove() was changed
> > not to call __fsnotify_parent(), we left out the optimization.
> > Kernel test robot reported a 5% performance regression in concurrent
> > unlink() workload.
> >
> > Modify __fsnotify_parent() so that it can be called also to report
> > directory modififcation events and use it from fsnotify_nameremove().
> >
> > Reported-by: kernel test robot <rong.a.chen@intel.com>
> > Link: https://lore.kernel.org/lkml/20190505062153.GG29809@shao2-debian/
> > Link: https://lore.kernel.org/linux-fsdevel/20190104090357.GD22409@quack2.suse.cz/
> > Fixes: 5f02a8776384 ("fsnotify: annotate directory entry modification events")
> > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > ---
> >
> > Jan,
> >
> > A nicer approach reusing __fsnotify_parent() instead of copying code
> > from it.
> >
> > This version does not apply cleanly to Al's for-next branch (there are
> > some fsnotify changes in work.dcache). The conflict is trivial and
> > resolved on my fsnotify branch [1].
>
> Hum, let me check if I understand the situation right. We've changed
> fsnotify_nameremove() to not use fsnotify_parent() as we don't want to
> report FS_EVENT_ON_CHILD with it anymore. We should use fsnotify_dirent()
> as for all other directory event notification handlers but that's
> problematic due to different locking context and possible instability of
> parent.
>

Yes. Not only do we not want to report FS_EVENT_ON_CHILD with
FS_DELETE, but we also need to report it to dir inode and to fs sb
regardless of DCACHE_FSNOTIFY_PARENT_WATCHED.

> Honestly I don't like the patch below much. How we are notifying parent
> without sending FS_EVENT_ON_CHILD and modify behavior based on that flag
> just looks subtle.

I see, although please note that reporting FS_EVENT_ON_CHILD
is strongly related to the "modified behavior", because unless this an
a report of event on_child, DCACHE_FSNOTIFY_PARENT_WATCHED
is not relevant.

> So I'd rather move the fsnotify call to vfs_unlink(),
> vfs_rmdir(), simple_unlink(), simple_rmdir(), and then those few callers of
> d_delete() that remain as you suggest elsewhere in this thread. And then we
> get more consistent context for fsnotify_nameremove() and could just use
> fsnotify_dirent().
>

Yes, I much prefer this solution myself and I will follow up with it,
but it would not be honest to suggest said solution as a stable fix
to the performance regression that was introduced in v5.1.
I think is it better if you choose between lesser evil:
v1 with ifdef CONFIG_FSNOTIFY to fix build issue
v2 as subtle as it is
OR another obviously safe stable fix that you can think of

The change of cleansing d_delete() from fsnotify_nameremove()
requires more research and is anyway not stable tree material -
if not for the level of complexity, then because all the users of
FS_DELETE from pseudo and clustered filesystems need more time
to find regressions (we do not have test coverage for those in LTP).

Thanks,
Amir.
Amir Goldstein May 8, 2019, 4:09 p.m. UTC | #3
On Tue, May 7, 2019 at 10:12 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Tue, May 7, 2019 at 7:19 PM Jan Kara <jack@suse.cz> wrote:
> >
> > On Sun 05-05-19 23:07:28, Amir Goldstein wrote:
> > > __fsnotify_parent() has an optimization in place to avoid unneeded
> > > take_dentry_name_snapshot().  When fsnotify_nameremove() was changed
> > > not to call __fsnotify_parent(), we left out the optimization.
> > > Kernel test robot reported a 5% performance regression in concurrent
> > > unlink() workload.
> > >
> > > Modify __fsnotify_parent() so that it can be called also to report
> > > directory modififcation events and use it from fsnotify_nameremove().
> > >
> > > Reported-by: kernel test robot <rong.a.chen@intel.com>
> > > Link: https://lore.kernel.org/lkml/20190505062153.GG29809@shao2-debian/
> > > Link: https://lore.kernel.org/linux-fsdevel/20190104090357.GD22409@quack2.suse.cz/
> > > Fixes: 5f02a8776384 ("fsnotify: annotate directory entry modification events")
> > > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > > ---
> > >
> > > Jan,
> > >
> > > A nicer approach reusing __fsnotify_parent() instead of copying code
> > > from it.
> > >
> > > This version does not apply cleanly to Al's for-next branch (there are
> > > some fsnotify changes in work.dcache). The conflict is trivial and
> > > resolved on my fsnotify branch [1].
> >
> > Hum, let me check if I understand the situation right. We've changed
> > fsnotify_nameremove() to not use fsnotify_parent() as we don't want to
> > report FS_EVENT_ON_CHILD with it anymore. We should use fsnotify_dirent()
> > as for all other directory event notification handlers but that's
> > problematic due to different locking context and possible instability of
> > parent.
> >
>
> Yes. Not only do we not want to report FS_EVENT_ON_CHILD with
> FS_DELETE, but we also need to report it to dir inode and to fs sb
> regardless of DCACHE_FSNOTIFY_PARENT_WATCHED.
>
> > Honestly I don't like the patch below much. How we are notifying parent
> > without sending FS_EVENT_ON_CHILD and modify behavior based on that flag
> > just looks subtle.
>
> I see, although please note that reporting FS_EVENT_ON_CHILD
> is strongly related to the "modified behavior", because unless this an
> a report of event on_child, DCACHE_FSNOTIFY_PARENT_WATCHED
> is not relevant.
>
> > So I'd rather move the fsnotify call to vfs_unlink(),
> > vfs_rmdir(), simple_unlink(), simple_rmdir(), and then those few callers of
> > d_delete() that remain as you suggest elsewhere in this thread. And then we
> > get more consistent context for fsnotify_nameremove() and could just use
> > fsnotify_dirent().
> >
>
> Yes, I much prefer this solution myself and I will follow up with it,
> but it would not be honest to suggest said solution as a stable fix
> to the performance regression that was introduced in v5.1.
> I think is it better if you choose between lesser evil:
> v1 with ifdef CONFIG_FSNOTIFY to fix build issue
> v2 as subtle as it is
> OR another obviously safe stable fix that you can think of
>
> The change of cleansing d_delete() from fsnotify_nameremove()
> requires more research and is anyway not stable tree material -
> if not for the level of complexity, then because all the users of
> FS_DELETE from pseudo and clustered filesystems need more time
> to find regressions (we do not have test coverage for those in LTP).
>

Something like this:
https://github.com/amir73il/linux/commits/fsnotify_nameremove

Only partially tested. Obviously haven't tested all callers.

Thanks,
Amir.
Jan Kara May 9, 2019, 9:46 a.m. UTC | #4
On Tue 07-05-19 22:12:57, Amir Goldstein wrote:
> On Tue, May 7, 2019 at 7:19 PM Jan Kara <jack@suse.cz> wrote:
> > So I'd rather move the fsnotify call to vfs_unlink(),
> > vfs_rmdir(), simple_unlink(), simple_rmdir(), and then those few callers of
> > d_delete() that remain as you suggest elsewhere in this thread. And then we
> > get more consistent context for fsnotify_nameremove() and could just use
> > fsnotify_dirent().
> >
> 
> Yes, I much prefer this solution myself and I will follow up with it,
> but it would not be honest to suggest said solution as a stable fix
> to the performance regression that was introduced in v5.1.
> I think is it better if you choose between lesser evil:
> v1 with ifdef CONFIG_FSNOTIFY to fix build issue
> v2 as subtle as it is
> OR another obviously safe stable fix that you can think of

OK, fair enough. I'll go with v1 + build fix for current merge window +
stable as it's local to fsnotify_nameremove().

> The change of cleansing d_delete() from fsnotify_nameremove()
> requires more research and is anyway not stable tree material -
> if not for the level of complexity, then because all the users of
> FS_DELETE from pseudo and clustered filesystems need more time
> to find regressions (we do not have test coverage for those in LTP).

Agreed.

								Honza
Jan Kara May 9, 2019, 10:31 a.m. UTC | #5
On Wed 08-05-19 19:09:56, Amir Goldstein wrote:
> On Tue, May 7, 2019 at 10:12 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > Yes, I much prefer this solution myself and I will follow up with it,
> > but it would not be honest to suggest said solution as a stable fix
> > to the performance regression that was introduced in v5.1.
> > I think is it better if you choose between lesser evil:
> > v1 with ifdef CONFIG_FSNOTIFY to fix build issue
> > v2 as subtle as it is
> > OR another obviously safe stable fix that you can think of
> >
> > The change of cleansing d_delete() from fsnotify_nameremove()
> > requires more research and is anyway not stable tree material -
> > if not for the level of complexity, then because all the users of
> > FS_DELETE from pseudo and clustered filesystems need more time
> > to find regressions (we do not have test coverage for those in LTP).
> >
> 
> Something like this:
> https://github.com/amir73il/linux/commits/fsnotify_nameremove
> 
> Only partially tested. Obviously haven't tested all callers.

Not quite. I'd add the fsnotify_nameremove() call also to simple_rmdir()
and simple_unlink(). That takes care of:
arch/s390/hypfs/inode.c, drivers/infiniband/hw/qib/qib_fs.c,
fs/configfs/dir.c, fs/debugfs/inode.c, fs/tracefs/inode.c,
net/sunrpc/rpc_pipe.c

So you're left only with:
drivers/usb/gadget/function/f_fs.c, fs/btrfs/ioctl.c, fs/devpts/inode.c,
fs/reiserfs/xattr.c

Out of these drivers/usb/gadget/function/f_fs.c and fs/reiserfs/xattr.c
actually also don't want the notifications to be generated. They don't
generate events on file creation AFAICS and at least in case of reiserfs I
know that xattrs are handled in "hidden" system files so notification does
not make any sense. So we are left with exactly *two* places that need
explicit fsnotify_nameremove() call. Since both these callers already take
care of calling fsnotify_create() I think that having explicit
fsnotify_nameremove() calls there is clearer than the current state.

								Honza
Amir Goldstein May 10, 2019, 2:57 p.m. UTC | #6
On Thu, May 9, 2019 at 12:46 PM Jan Kara <jack@suse.cz> wrote:
>
> On Tue 07-05-19 22:12:57, Amir Goldstein wrote:
> > On Tue, May 7, 2019 at 7:19 PM Jan Kara <jack@suse.cz> wrote:
> > > So I'd rather move the fsnotify call to vfs_unlink(),
> > > vfs_rmdir(), simple_unlink(), simple_rmdir(), and then those few callers of
> > > d_delete() that remain as you suggest elsewhere in this thread. And then we
> > > get more consistent context for fsnotify_nameremove() and could just use
> > > fsnotify_dirent().
> > >
> >
> > Yes, I much prefer this solution myself and I will follow up with it,
> > but it would not be honest to suggest said solution as a stable fix
> > to the performance regression that was introduced in v5.1.
> > I think is it better if you choose between lesser evil:
> > v1 with ifdef CONFIG_FSNOTIFY to fix build issue
> > v2 as subtle as it is
> > OR another obviously safe stable fix that you can think of
>
> OK, fair enough. I'll go with v1 + build fix for current merge window +
> stable as it's local to fsnotify_nameremove().

Please note that the patch on your fsnotify branch conflicts with
fsnotify_nameremove() changes in master:
230c6402b1b3 ovl_lookup_real_one(): don't bother with strlen()
25b229dff4ff fsnotify(): switch to passing const struct qstr * for file_name

Thanks,
Amir.
Amir Goldstein May 10, 2019, 3:24 p.m. UTC | #7
On Thu, May 9, 2019 at 1:31 PM Jan Kara <jack@suse.cz> wrote:
>
> On Wed 08-05-19 19:09:56, Amir Goldstein wrote:
> > On Tue, May 7, 2019 at 10:12 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > > Yes, I much prefer this solution myself and I will follow up with it,
> > > but it would not be honest to suggest said solution as a stable fix
> > > to the performance regression that was introduced in v5.1.
> > > I think is it better if you choose between lesser evil:
> > > v1 with ifdef CONFIG_FSNOTIFY to fix build issue
> > > v2 as subtle as it is
> > > OR another obviously safe stable fix that you can think of
> > >
> > > The change of cleansing d_delete() from fsnotify_nameremove()
> > > requires more research and is anyway not stable tree material -
> > > if not for the level of complexity, then because all the users of
> > > FS_DELETE from pseudo and clustered filesystems need more time
> > > to find regressions (we do not have test coverage for those in LTP).
> > >
> >
> > Something like this:
> > https://github.com/amir73il/linux/commits/fsnotify_nameremove
> >
> > Only partially tested. Obviously haven't tested all callers.
>
> Not quite. I'd add the fsnotify_nameremove() call also to simple_rmdir()
> and simple_unlink(). That takes care of:
> arch/s390/hypfs/inode.c, drivers/infiniband/hw/qib/qib_fs.c,
> fs/configfs/dir.c, fs/debugfs/inode.c, fs/tracefs/inode.c,
> net/sunrpc/rpc_pipe.c
>

simple_unlink() is used as i_op->unlink() implementation of simple
filesystems, such as: fs/pstore/inode.c fs/ramfs/inode.c
fs/ocfs2/dlmfs/dlmfs.c fs/hugetlbfs/inode.c kernel/bpf/inode.c

If we place fsnotify hook in the filesystem implementation and not
in vfs_unlink(), what will cover normal fs? If we do place fsnotify hook
in vfs_unlink(), then we have a double call to hook.

The places we add explicit fsnofity hooks should only be call sites that
do not originate from vfs_unlink/vfs_rmdir.

> So you're left only with:
> drivers/usb/gadget/function/f_fs.c, fs/btrfs/ioctl.c, fs/devpts/inode.c,
> fs/reiserfs/xattr.c
>
> Out of these drivers/usb/gadget/function/f_fs.c and fs/reiserfs/xattr.c
> actually also don't want the notifications to be generated. They don't
> generate events on file creation AFAICS and at least in case of reiserfs I
> know that xattrs are handled in "hidden" system files so notification does
> not make any sense. So we are left with exactly *two* places that need

OK. good to know.

> explicit fsnotify_nameremove() call. Since both these callers already take
> care of calling fsnotify_create() I think that having explicit
> fsnotify_nameremove() calls there is clearer than the current state.
>

I though so too, but then I did not feel comfortable with "regressing"
delete notifications on many filesystems that did seem plausible for
having notification users like configfs, even though they do not have
create notification, so I decided to do the safer option of converting all
plausible callers of d_delete() that abide to the parent/name stable
constrains to use the d_delete_and_notify() wrapper.

For readers that did not follow my link, those are the call site that were
converted to opt-in for d_delete_and_notify() in the linked branch:
arch/s390/hypfs/inode.c
drivers/infiniband/hw/qib/qib_fs.c
drivers/usb/gadget/function/f_fs.c
fs/btrfs/ioctl.c
fs/configfs/dir.c
fs/debugfs/inode.c
fs/devpts/inode.c
fs/reiserfs/xattr.c
fs/tracefs/inode.c
net/sunrpc/rpc_pipe.c

In addition, nfs and afs no longer need to call fsnotify hook explcitly on
completion of silly rename (delayed unlink).

Thanks,
Amir.
Jan Kara May 13, 2019, 4:23 p.m. UTC | #8
On Fri 10-05-19 17:57:49, Amir Goldstein wrote:
> On Thu, May 9, 2019 at 12:46 PM Jan Kara <jack@suse.cz> wrote:
> >
> > On Tue 07-05-19 22:12:57, Amir Goldstein wrote:
> > > On Tue, May 7, 2019 at 7:19 PM Jan Kara <jack@suse.cz> wrote:
> > > > So I'd rather move the fsnotify call to vfs_unlink(),
> > > > vfs_rmdir(), simple_unlink(), simple_rmdir(), and then those few callers of
> > > > d_delete() that remain as you suggest elsewhere in this thread. And then we
> > > > get more consistent context for fsnotify_nameremove() and could just use
> > > > fsnotify_dirent().
> > > >
> > >
> > > Yes, I much prefer this solution myself and I will follow up with it,
> > > but it would not be honest to suggest said solution as a stable fix
> > > to the performance regression that was introduced in v5.1.
> > > I think is it better if you choose between lesser evil:
> > > v1 with ifdef CONFIG_FSNOTIFY to fix build issue
> > > v2 as subtle as it is
> > > OR another obviously safe stable fix that you can think of
> >
> > OK, fair enough. I'll go with v1 + build fix for current merge window +
> > stable as it's local to fsnotify_nameremove().
> 
> Please note that the patch on your fsnotify branch conflicts with
> fsnotify_nameremove() changes in master:
> 230c6402b1b3 ovl_lookup_real_one(): don't bother with strlen()
> 25b229dff4ff fsnotify(): switch to passing const struct qstr * for file_name

Thanks for the heads up! The conflict is easy enough to resolve but I've
notified Linus about it in my pull request. Hum, which reminds me that I've
forgotten to pull the latest fix in fsnotify branch into my for_next branch
and so I didn't get a notification about the conflict myself. Too late to
fix that now I guess...

								Honza
Jan Kara May 13, 2019, 4:33 p.m. UTC | #9
On Fri 10-05-19 18:24:42, Amir Goldstein wrote:
> On Thu, May 9, 2019 at 1:31 PM Jan Kara <jack@suse.cz> wrote:
> > On Wed 08-05-19 19:09:56, Amir Goldstein wrote:
> > > On Tue, May 7, 2019 at 10:12 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > > > Yes, I much prefer this solution myself and I will follow up with it,
> > > > but it would not be honest to suggest said solution as a stable fix
> > > > to the performance regression that was introduced in v5.1.
> > > > I think is it better if you choose between lesser evil:
> > > > v1 with ifdef CONFIG_FSNOTIFY to fix build issue
> > > > v2 as subtle as it is
> > > > OR another obviously safe stable fix that you can think of
> > > >
> > > > The change of cleansing d_delete() from fsnotify_nameremove()
> > > > requires more research and is anyway not stable tree material -
> > > > if not for the level of complexity, then because all the users of
> > > > FS_DELETE from pseudo and clustered filesystems need more time
> > > > to find regressions (we do not have test coverage for those in LTP).
> > > >
> > >
> > > Something like this:
> > > https://github.com/amir73il/linux/commits/fsnotify_nameremove
> > >
> > > Only partially tested. Obviously haven't tested all callers.
> >
> > Not quite. I'd add the fsnotify_nameremove() call also to simple_rmdir()
> > and simple_unlink(). That takes care of:
> > arch/s390/hypfs/inode.c, drivers/infiniband/hw/qib/qib_fs.c,
> > fs/configfs/dir.c, fs/debugfs/inode.c, fs/tracefs/inode.c,
> > net/sunrpc/rpc_pipe.c
> >
> 
> simple_unlink() is used as i_op->unlink() implementation of simple
> filesystems, such as: fs/pstore/inode.c fs/ramfs/inode.c
> fs/ocfs2/dlmfs/dlmfs.c fs/hugetlbfs/inode.c kernel/bpf/inode.c
> 
> If we place fsnotify hook in the filesystem implementation and not
> in vfs_unlink(), what will cover normal fs? If we do place fsnotify hook
> in vfs_unlink(), then we have a double call to hook.
> 
> The places we add explicit fsnofity hooks should only be call sites that
> do not originate from vfs_unlink/vfs_rmdir.

Hum, right. I didn't realize simple_unlink() gets also called through
vfs_unlink() for some filesystems. But then I'd rather create variants
simple_unlink_notify() and simple_rmdir_notify() than messing with
d_delete(). As I just think that fsnotify call in d_delete() happens at a
wrong layer. d_delete() is about dcache management, not really about
filesystem name removal which is what we want to notify about.

> > So you're left only with:
> > drivers/usb/gadget/function/f_fs.c, fs/btrfs/ioctl.c, fs/devpts/inode.c,
> > fs/reiserfs/xattr.c
> >
> > Out of these drivers/usb/gadget/function/f_fs.c and fs/reiserfs/xattr.c
> > actually also don't want the notifications to be generated. They don't
> > generate events on file creation AFAICS and at least in case of reiserfs I
> > know that xattrs are handled in "hidden" system files so notification does
> > not make any sense. So we are left with exactly *two* places that need
> 
> OK. good to know.
> 
> > explicit fsnotify_nameremove() call. Since both these callers already take
> > care of calling fsnotify_create() I think that having explicit
> > fsnotify_nameremove() calls there is clearer than the current state.
> >
> 
> I though so too, but then I did not feel comfortable with "regressing"
> delete notifications on many filesystems that did seem plausible for
> having notification users like configfs, even though they do not have
> create notification, so I decided to do the safer option of converting all
> plausible callers of d_delete() that abide to the parent/name stable
> constrains to use the d_delete_and_notify() wrapper.

As I said above, I don't want to regress anyone either. But I just think
that d_delete() is a wrong function to call fsnotify hook from (as much as
it is convenient) and that's what's causing all these problems.

								Honza
Amir Goldstein May 13, 2019, 4:47 p.m. UTC | #10
On Mon, May 13, 2019 at 7:33 PM Jan Kara <jack@suse.cz> wrote:
>
> On Fri 10-05-19 18:24:42, Amir Goldstein wrote:
> > On Thu, May 9, 2019 at 1:31 PM Jan Kara <jack@suse.cz> wrote:
> > > On Wed 08-05-19 19:09:56, Amir Goldstein wrote:
> > > > On Tue, May 7, 2019 at 10:12 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > > > > Yes, I much prefer this solution myself and I will follow up with it,
> > > > > but it would not be honest to suggest said solution as a stable fix
> > > > > to the performance regression that was introduced in v5.1.
> > > > > I think is it better if you choose between lesser evil:
> > > > > v1 with ifdef CONFIG_FSNOTIFY to fix build issue
> > > > > v2 as subtle as it is
> > > > > OR another obviously safe stable fix that you can think of
> > > > >
> > > > > The change of cleansing d_delete() from fsnotify_nameremove()
> > > > > requires more research and is anyway not stable tree material -
> > > > > if not for the level of complexity, then because all the users of
> > > > > FS_DELETE from pseudo and clustered filesystems need more time
> > > > > to find regressions (we do not have test coverage for those in LTP).
> > > > >
> > > >
> > > > Something like this:
> > > > https://github.com/amir73il/linux/commits/fsnotify_nameremove
> > > >
> > > > Only partially tested. Obviously haven't tested all callers.
> > >
> > > Not quite. I'd add the fsnotify_nameremove() call also to simple_rmdir()
> > > and simple_unlink(). That takes care of:
> > > arch/s390/hypfs/inode.c, drivers/infiniband/hw/qib/qib_fs.c,
> > > fs/configfs/dir.c, fs/debugfs/inode.c, fs/tracefs/inode.c,
> > > net/sunrpc/rpc_pipe.c
> > >
> >
> > simple_unlink() is used as i_op->unlink() implementation of simple
> > filesystems, such as: fs/pstore/inode.c fs/ramfs/inode.c
> > fs/ocfs2/dlmfs/dlmfs.c fs/hugetlbfs/inode.c kernel/bpf/inode.c
> >
> > If we place fsnotify hook in the filesystem implementation and not
> > in vfs_unlink(), what will cover normal fs? If we do place fsnotify hook
> > in vfs_unlink(), then we have a double call to hook.
> >
> > The places we add explicit fsnofity hooks should only be call sites that
> > do not originate from vfs_unlink/vfs_rmdir.
>
> Hum, right. I didn't realize simple_unlink() gets also called through
> vfs_unlink() for some filesystems. But then I'd rather create variants
> simple_unlink_notify() and simple_rmdir_notify() than messing with
> d_delete(). As I just think that fsnotify call in d_delete() happens at a
> wrong layer. d_delete() is about dcache management, not really about
> filesystem name removal which is what we want to notify about.
>

Agreed.
I'll follow up with this solution, hopefully after the stable regression
fix is already merged.

Thanks,
Amir.
diff mbox series

Patch

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index df06f3da166c..265b726d6e8d 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -151,31 +151,31 @@  void __fsnotify_update_child_dentry_flags(struct inode *inode)
 	spin_unlock(&inode->i_lock);
 }
 
-/* Notify this dentry's parent about a child's events. */
+/*
+ * Notify this dentry's parent about an event and make sure that name is stable.
+ * Events "on child" are only reported if parent is watching.
+ * Directory modification events are also reported if super block is watching.
+ */
 int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask)
 {
 	struct dentry *parent;
 	struct inode *p_inode;
 	int ret = 0;
+	bool on_child = (mask & FS_EVENT_ON_CHILD);
+	__u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
 
-	if (!dentry)
-		dentry = path->dentry;
-
-	if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
+	if (on_child && !(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
 		return 0;
 
 	parent = dget_parent(dentry);
 	p_inode = parent->d_inode;
 
-	if (unlikely(!fsnotify_inode_watches_children(p_inode))) {
+	if (on_child && unlikely(!fsnotify_inode_watches_children(p_inode))) {
 		__fsnotify_update_child_dentry_flags(p_inode);
-	} else if (p_inode->i_fsnotify_mask & mask & ALL_FSNOTIFY_EVENTS) {
+	} else if ((p_inode->i_fsnotify_mask & test_mask) ||
+		   (!on_child && (dentry->d_sb->s_fsnotify_mask & test_mask))) {
 		struct name_snapshot name;
 
-		/* we are notifying a parent so come up with the new mask which
-		 * specifies these are events which came from a child. */
-		mask |= FS_EVENT_ON_CHILD;
-
 		take_dentry_name_snapshot(&name, dentry);
 		if (path)
 			ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 09587e2860b5..8641bf9a1eda 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -37,7 +37,7 @@  static inline int fsnotify_parent(const struct path *path,
 	if (!dentry)
 		dentry = path->dentry;
 
-	return __fsnotify_parent(path, dentry, mask);
+	return __fsnotify_parent(path, dentry, mask | FS_EVENT_ON_CHILD);
 }
 
 /*
@@ -158,13 +158,11 @@  static inline void fsnotify_vfsmount_delete(struct vfsmount *mnt)
  * dentry->d_parent should be stable. However there are some corner cases where
  * inode lock is not held. So to be on the safe side and be reselient to future
  * callers and out of tree users of d_delete(), we do not assume that d_parent
- * and d_name are stable and we use dget_parent() and
+ * and d_name are stable and we use __fsnotify_parent() to
  * take_dentry_name_snapshot() to grab stable references.
  */
 static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
 {
-	struct dentry *parent;
-	struct name_snapshot name;
 	__u32 mask = FS_DELETE;
 
 	/* d_delete() of pseudo inode? (e.g. __ns_get_path() playing tricks) */
@@ -174,14 +172,7 @@  static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
 	if (isdir)
 		mask |= FS_ISDIR;
 
-	parent = dget_parent(dentry);
-	take_dentry_name_snapshot(&name, dentry);
-
-	fsnotify(d_inode(parent), mask, d_inode(dentry), FSNOTIFY_EVENT_INODE,
-		 name.name, 0);
-
-	release_dentry_name_snapshot(&name);
-	dput(parent);
+	__fsnotify_parent(NULL, dentry, mask);
 }
 
 /*