diff mbox series

[v2,1/5] fsnotify: pass dentry instead of inode when available

Message ID 20181114174344.17530-2-amir73il@gmail.com (mailing list archive)
State New, archived
Headers show
Series fsnotify prep work for fanotify dentry events | expand

Commit Message

Amir Goldstein Nov. 14, 2018, 5:43 p.m. UTC
Define a new data type to pass for event FSNOTIFY_EVENT_DENTRY
and use it whenever a dentry is available instead of passing
it's ->d_inode as data type FSNOTIFY_EVENT_INODE.

None of the current fsnotify backends make use of the inode data
with data type FSNOTIFY_EVENT_INODE - only the data of type
FSNOTIFY_EVENT_PATH is ever used, so this change has no immediate
consequences.

Soon, we are going to use the dentry data type to support more
events with fanotify backend.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 fs/notify/fsnotify.c             |  8 ++++----
 include/linux/fsnotify.h         | 15 +++++++--------
 include/linux/fsnotify_backend.h |  3 ++-
 3 files changed, 13 insertions(+), 13 deletions(-)

Comments

Jan Kara Nov. 20, 2018, 11:32 a.m. UTC | #1
On Wed 14-11-18 19:43:40, Amir Goldstein wrote:
> Define a new data type to pass for event FSNOTIFY_EVENT_DENTRY
> and use it whenever a dentry is available instead of passing
> it's ->d_inode as data type FSNOTIFY_EVENT_INODE.
> 
> None of the current fsnotify backends make use of the inode data
> with data type FSNOTIFY_EVENT_INODE - only the data of type
> FSNOTIFY_EVENT_PATH is ever used, so this change has no immediate
> consequences.
> 
> Soon, we are going to use the dentry data type to support more
> events with fanotify backend.
> 
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>

One general coment: Audit is using FSNOTIFY_EVENT_INODE and this change is
going to break it. So it needs updating as well.

Also I'd prefer a more justified selection of dentry events than 'those
where we can do it'. Because eventually that set will translate to events
available to userspace in fanotify and that should be a reasonably
consistent set. So here I suspect you target at directory modification
events (you call them filename events in the next patch). So just define
which FS_<foo> events these exactly are and convert exactly those event
helpers to dentry ones...

> diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
> index 2ccb08cb5d6a..9dadc0bcd7a9 100644
> --- a/include/linux/fsnotify.h
> +++ b/include/linux/fsnotify.h
> @@ -99,14 +99,14 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
>  
>  	fsnotify(old_dir, old_dir_mask, source, FSNOTIFY_EVENT_INODE, old_name,
>  		 fs_cookie);
> -	fsnotify(new_dir, new_dir_mask, source, FSNOTIFY_EVENT_INODE, new_name,
> +	fsnotify(new_dir, new_dir_mask, moved, FSNOTIFY_EVENT_DENTRY, new_name,
>  		 fs_cookie);
>  
>  	if (target)
>  		fsnotify_link_count(target);
>  
>  	if (source)
> -		fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
> +		fsnotify(source, FS_MOVE_SELF, moved, FSNOTIFY_EVENT_DENTRY, NULL, 0);
>  	audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE);
>  }
>  

I'd postpone these fsnotify_move() changes to a patch where you make
fsnotify_move() fully dentry based. Having it converted half-way looks
confusing...

> @@ -168,7 +168,7 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct
>  	fsnotify_link_count(inode);
>  	audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE);
>  
> -	fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE, new_dentry->d_name.name, 0);
> +	fsnotify(dir, FS_CREATE, new_dentry, FSNOTIFY_EVENT_DENTRY, new_dentry->d_name.name, 0);
>  }

So a change like this makes me slightly nervous. What guarantees that
new_dentry->d_inode is going to be the same thing as 'inode' passed in?
Aren't races changing new_dentry before we look at new_dentry->d_inode
possible? In this specific case I don't think anything unexpected is
possible - we hold i_rwsem, hopefully nobody does anything fancy like
instantiating a different inode in their ->link method. But generally
non-obvious changes like this should be split out in separate commits with
justification why the change is safe. 

								Honza
Amir Goldstein Nov. 20, 2018, 2:36 p.m. UTC | #2
On Tue, Nov 20, 2018 at 1:32 PM Jan Kara <jack@suse.cz> wrote:
>
> On Wed 14-11-18 19:43:40, Amir Goldstein wrote:
> > Define a new data type to pass for event FSNOTIFY_EVENT_DENTRY
> > and use it whenever a dentry is available instead of passing
> > it's ->d_inode as data type FSNOTIFY_EVENT_INODE.
> >
> > None of the current fsnotify backends make use of the inode data
> > with data type FSNOTIFY_EVENT_INODE - only the data of type
> > FSNOTIFY_EVENT_PATH is ever used, so this change has no immediate
> > consequences.
> >
> > Soon, we are going to use the dentry data type to support more
> > events with fanotify backend.
> >
> > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
>
> One general coment: Audit is using FSNOTIFY_EVENT_INODE and this change is
> going to break it. So it needs updating as well.
>

Ouch! I should really add the audit test suite to my testing routine...

> Also I'd prefer a more justified selection of dentry events than 'those
> where we can do it'. Because eventually that set will translate to events
> available to userspace in fanotify and that should be a reasonably
> consistent set. So here I suspect you target at directory modification
> events (you call them filename events in the next patch). So just define
> which FS_<foo> events these exactly are and convert exactly those event
> helpers to dentry ones...
>

It is not accurate that I target only "directory modification" events.
I also target FS_ATTRIB and in the future also FS_DELETE_SELF,
both applicable to files as well as directories.

But when I think about it... fanotify does not really need the dentry
information,
so I might just be able to do without FSNOTIFY_EVENT_DENTRY
and avert the questions about stability of dentry->d_inode

fanotify_dentry branch uses the dentry information in 3 occasions:

1. if (d_is_dir(dentry) in fanotify_group_event_mask()
    This could be converted to S_ISDIR(inode->i_mode)
2. exportfs_encode_fh(dentry,... in fanotify_alloc_fid()
    Can be converted to exportfs_encode_inode_fh(inode,...
3. statfs_by_dentry(dentry,... in fanotify_alloc_fid()
    Can be converted to statfs_by_dentry(inode->i_sb->sb_root,...

I will leave the patch to annotate "directory change" events,
but the rest of this prep series can be discarded.

Does that sound reasonable?

Thanks,
Amir.
Jan Kara Nov. 21, 2018, 12:51 p.m. UTC | #3
On Tue 20-11-18 16:36:31, Amir Goldstein wrote:
> On Tue, Nov 20, 2018 at 1:32 PM Jan Kara <jack@suse.cz> wrote:
> >
> > On Wed 14-11-18 19:43:40, Amir Goldstein wrote:
> > > Define a new data type to pass for event FSNOTIFY_EVENT_DENTRY
> > > and use it whenever a dentry is available instead of passing
> > > it's ->d_inode as data type FSNOTIFY_EVENT_INODE.
> > >
> > > None of the current fsnotify backends make use of the inode data
> > > with data type FSNOTIFY_EVENT_INODE - only the data of type
> > > FSNOTIFY_EVENT_PATH is ever used, so this change has no immediate
> > > consequences.
> > >
> > > Soon, we are going to use the dentry data type to support more
> > > events with fanotify backend.
> > >
> > > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> >
> > One general coment: Audit is using FSNOTIFY_EVENT_INODE and this change is
> > going to break it. So it needs updating as well.
> >
> 
> Ouch! I should really add the audit test suite to my testing routine...
> 
> > Also I'd prefer a more justified selection of dentry events than 'those
> > where we can do it'. Because eventually that set will translate to events
> > available to userspace in fanotify and that should be a reasonably
> > consistent set. So here I suspect you target at directory modification
> > events (you call them filename events in the next patch). So just define
> > which FS_<foo> events these exactly are and convert exactly those event
> > helpers to dentry ones...
> >
> 
> It is not accurate that I target only "directory modification" events.
> I also target FS_ATTRIB and in the future also FS_DELETE_SELF,
> both applicable to files as well as directories.
> 
> But when I think about it... fanotify does not really need the dentry
> information,
> so I might just be able to do without FSNOTIFY_EVENT_DENTRY
> and avert the questions about stability of dentry->d_inode

OK, that would certainly make things simpler.

> fanotify_dentry branch uses the dentry information in 3 occasions:
> 
> 1. if (d_is_dir(dentry) in fanotify_group_event_mask()
>     This could be converted to S_ISDIR(inode->i_mode)

Sure.

> 2. exportfs_encode_fh(dentry,... in fanotify_alloc_fid()
>     Can be converted to exportfs_encode_inode_fh(inode,...

If you have the parent inode then yes. In lot of cases we do have it, not
sure if in all of them (but likely yes so that we can do proper
FS_EVENT_ON_CHILD) handling.

> 3. statfs_by_dentry(dentry,... in fanotify_alloc_fid()
>     Can be converted to statfs_by_dentry(inode->i_sb->sb_root,...

This might be problematic e.g. with btrfs subvolumes which have each
different fsid so the fsid-handle pair might be actually invalid or even
point to a file with different contents. Maybe we could just store the
fsid in the fsnotify_mark when it is created and use it when generating
events? That would also get rid of the overhead of calling statfs for each
generated event which I don't like...

> I will leave the patch to annotate "directory change" events,
> but the rest of this prep series can be discarded.
> 
> Does that sound reasonable?

Yes.

								Honza
Amir Goldstein Nov. 21, 2018, 4:13 p.m. UTC | #4
>
> > 2. exportfs_encode_fh(dentry,... in fanotify_alloc_fid()
> >     Can be converted to exportfs_encode_inode_fh(inode,...
>
> If you have the parent inode then yes. In lot of cases we do have it, not
> sure if in all of them (but likely yes so that we can do proper
> FS_EVENT_ON_CHILD) handling.
>

We do not need the parent inode, because we are not encoding a
"connectable" file handle. we need:
exportfs_encode_inode_fh(inode, (struct fid *)fid->f_handle,
                                  &dwords,  NULL);


> > 3. statfs_by_dentry(dentry,... in fanotify_alloc_fid()
> >     Can be converted to statfs_by_dentry(inode->i_sb->sb_root,...
>
> This might be problematic e.g. with btrfs subvolumes which have each
> different fsid so the fsid-handle pair might be actually invalid or even
> point to a file with different contents. Maybe we could just store the
> fsid in the fsnotify_mark when it is created and use it when generating
> events? That would also get rid of the overhead of calling statfs for each
> generated event which I don't like...
>

OK, I was not being accurate when I wrote that these are the only 3 places
we use dentry. Those are the only 3 places in fanotify, but we also use
dentry->d_sb in fsnotify() to get to the sb mark of course, so we will be using
inode->i_sb instead.

w.r.t btrfs - btrfs has a single sb for multiple subvolumes so by definition the
FAN_MARK_FILESYSTEM feature is only capable of watching ALL subvolumes.

If we wanted to implement subvolume watch for btrfs, we would need
to support attaching a mark to a btrfs subvolume struct (or fs_view [1]).

Basically, the purpose, of fid->fsid is:
1. A key for finding a mount point to use as mount_fd argument to
open_by_handle_at()
2. Make fid unique across the system

Since btrfs file handles encode the subvolume root object id in the file handle,
fid->fsid is only needed to find a btrfs mount of the same sb (blockdev).

Regardless, IIUC, btrfs_statfs() returns an fsid which is associated with the
single super block struct, so all dentries in all subvolumes will
return the same
fsid: btrfs_sb(dentry->d_sb)->fsid.

CC some btrfs folks to correct me if I am wrong.

Thanks,
Amir.

[1] https://lwn.net/Articles/753917/
Jan Kara Nov. 22, 2018, 9:52 a.m. UTC | #5
On Wed 21-11-18 18:13:17, Amir Goldstein wrote:
> >
> > > 2. exportfs_encode_fh(dentry,... in fanotify_alloc_fid()
> > >     Can be converted to exportfs_encode_inode_fh(inode,...
> >
> > If you have the parent inode then yes. In lot of cases we do have it, not
> > sure if in all of them (but likely yes so that we can do proper
> > FS_EVENT_ON_CHILD) handling.
> >
> 
> We do not need the parent inode, because we are not encoding a
> "connectable" file handle. we need:
> exportfs_encode_inode_fh(inode, (struct fid *)fid->f_handle,
>                                   &dwords,  NULL);

Ah, you're right. I thought any handle has to be "connectable" as you say
but apparently open-by-handle is fine even with not connectable ones.
Thanks for enlightening me :)

> > > 3. statfs_by_dentry(dentry,... in fanotify_alloc_fid()
> > >     Can be converted to statfs_by_dentry(inode->i_sb->sb_root,...
> >
> > This might be problematic e.g. with btrfs subvolumes which have each
> > different fsid so the fsid-handle pair might be actually invalid or even
> > point to a file with different contents. Maybe we could just store the
> > fsid in the fsnotify_mark when it is created and use it when generating
> > events? That would also get rid of the overhead of calling statfs for each
> > generated event which I don't like...
> >
> 
> OK, I was not being accurate when I wrote that these are the only 3 places
> we use dentry. Those are the only 3 places in fanotify, but we also use
> dentry->d_sb in fsnotify() to get to the sb mark of course, so we will be using
> inode->i_sb instead.

Sure.

> w.r.t btrfs - btrfs has a single sb for multiple subvolumes so by
> definition the FAN_MARK_FILESYSTEM feature is only capable of watching
> ALL subvolumes.

Agreed but then if some event happens on inode A in subvolume S, you have
to be sure the handle+fsid you return will allow you to open the file with
that content and not inode A on subvolume T which has a different
contents...

> If we wanted to implement subvolume watch for btrfs, we would need
> to support attaching a mark to a btrfs subvolume struct (or fs_view [1]).

No, that's not what I meant.

> Basically, the purpose, of fid->fsid is:
> 1. A key for finding a mount point to use as mount_fd argument to
> open_by_handle_at()
> 2. Make fid unique across the system

Understood, that's what I thought.

> Since btrfs file handles encode the subvolume root object id in the file
> handle, fid->fsid is only needed to find a btrfs mount of the same sb
> (blockdev).

Ah good. I didn't realize btrfs will record subvolume root in the file
handle. On a second thought how else could NFS mount work, right, but it
didn't occur to me yesterday.

> Regardless, IIUC, btrfs_statfs() returns an fsid which is associated with
> the single super block struct, so all dentries in all subvolumes will
> return the same fsid: btrfs_sb(dentry->d_sb)->fsid.

This is not true AFAICT. Looking at btrfs_statfs() I can see:

        buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
        buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
        /* Mask in the root object ID too, to disambiguate subvols */
        buf->f_fsid.val[0] ^=
                BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
        buf->f_fsid.val[1] ^=
                BTRFS_I(d_inode(dentry))->root->root_key.objectid;

So subvolume root is xored into the FSID. Thus dentries from different
subvolumes are going to return different fsids...

								Honza
Amir Goldstein Nov. 22, 2018, 12:36 p.m. UTC | #6
> > w.r.t btrfs - btrfs has a single sb for multiple subvolumes so by
> > definition the FAN_MARK_FILESYSTEM feature is only capable of watching
> > ALL subvolumes.
>
> Agreed but then if some event happens on inode A in subvolume S, you have
> to be sure the handle+fsid you return will allow you to open the file with
> that content and not inode A on subvolume T which has a different
> contents...

So as written below, the btrfs file handle is unique across all subvolumes.
fsid is therefore not needed to find the subvolume. It is only needed to find
the btrfs super block and listening on many block devices and/or many
filesystem types.

> > Basically, the purpose, of fid->fsid is:
> > 1. A key for finding a mount point to use as mount_fd argument to
> > open_by_handle_at()
> > 2. Make fid unique across the system
>
> Understood, that's what I thought.
>
> > Since btrfs file handles encode the subvolume root object id in the file
> > handle, fid->fsid is only needed to find a btrfs mount of the same sb
> > (blockdev).
>
> Ah good. I didn't realize btrfs will record subvolume root in the file
> handle. On a second thought how else could NFS mount work, right, but it
> didn't occur to me yesterday.
>
> > Regardless, IIUC, btrfs_statfs() returns an fsid which is associated with
> > the single super block struct, so all dentries in all subvolumes will
> > return the same fsid: btrfs_sb(dentry->d_sb)->fsid.
>
> This is not true AFAICT. Looking at btrfs_statfs() I can see:
>
>         buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
>         buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
>         /* Mask in the root object ID too, to disambiguate subvols */
>         buf->f_fsid.val[0] ^=
>                 BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
>         buf->f_fsid.val[1] ^=
>                 BTRFS_I(d_inode(dentry))->root->root_key.objectid;
>
> So subvolume root is xored into the FSID. Thus dentries from different
> subvolumes are going to return different fsids...
>

Right... how could I have missed that :-/

I do not want to bring back FSNOTIFY_EVENT_DENTRY just for that
and I saw how many flaws you pointed to in $SUBJECT patch.
Instead, I will use:
statfs_by_dentry(d_find_any_alias(inode) ?: inode->i_sb->sb_root,...

So we are only left with the corner case of fsnotify_inoderemove()
on btrfs and the like. I checked that for all the rest of fsnotify hooks
an alias is guarantied to exist at the time of the hook.

I could go with either of two options:
1. No support for FAN_DELETE_SELF as my v2 patch set already does
2. Best effort support for FAN_DELETE_SELF - it works for non-btrfs
    and if application listens on a single filesystem and ignores fsid
    (document this culprit).

What do you think?

Thanks,
Amir.
Jan Kara Nov. 22, 2018, 1:26 p.m. UTC | #7
On Thu 22-11-18 14:36:35, Amir Goldstein wrote:
> > > Regardless, IIUC, btrfs_statfs() returns an fsid which is associated with
> > > the single super block struct, so all dentries in all subvolumes will
> > > return the same fsid: btrfs_sb(dentry->d_sb)->fsid.
> >
> > This is not true AFAICT. Looking at btrfs_statfs() I can see:
> >
> >         buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
> >         buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
> >         /* Mask in the root object ID too, to disambiguate subvols */
> >         buf->f_fsid.val[0] ^=
> >                 BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
> >         buf->f_fsid.val[1] ^=
> >                 BTRFS_I(d_inode(dentry))->root->root_key.objectid;
> >
> > So subvolume root is xored into the FSID. Thus dentries from different
> > subvolumes are going to return different fsids...
> >
> 
> Right... how could I have missed that :-/
> 
> I do not want to bring back FSNOTIFY_EVENT_DENTRY just for that
> and I saw how many flaws you pointed to in $SUBJECT patch.
> Instead, I will use:
> statfs_by_dentry(d_find_any_alias(inode) ?: inode->i_sb->sb_root,...

So what about my proposal to store fsid in the notification mark when it is
created and then use it when that mark results in event being generated?
When mark is created, we have full path available, so getting proper fsid
is trivial. Furthermore if the behavior is documented like that, it is
fairly easy for userspace to track fsids it should care about and translate
them to proper file descriptors for open_by_handle().

This would get rid of statfs() on every event creation (which I don't like
much) and also avoids these problems "how to get fsid for inode". What do
you think?

								Honza

> So we are only left with the corner case of fsnotify_inoderemove()
> on btrfs and the like. I checked that for all the rest of fsnotify hooks
> an alias is guarantied to exist at the time of the hook.
> 
> I could go with either of two options:
> 1. No support for FAN_DELETE_SELF as my v2 patch set already does
> 2. Best effort support for FAN_DELETE_SELF - it works for non-btrfs
>     and if application listens on a single filesystem and ignores fsid
>     (document this culprit).
> 
> What do you think?
> 
> Thanks,
> Amir.
Amir Goldstein Nov. 22, 2018, 3:18 p.m. UTC | #8
On Thu, Nov 22, 2018 at 3:26 PM Jan Kara <jack@suse.cz> wrote:
>
> On Thu 22-11-18 14:36:35, Amir Goldstein wrote:
> > > > Regardless, IIUC, btrfs_statfs() returns an fsid which is associated with
> > > > the single super block struct, so all dentries in all subvolumes will
> > > > return the same fsid: btrfs_sb(dentry->d_sb)->fsid.
> > >
> > > This is not true AFAICT. Looking at btrfs_statfs() I can see:
> > >
> > >         buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
> > >         buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
> > >         /* Mask in the root object ID too, to disambiguate subvols */
> > >         buf->f_fsid.val[0] ^=
> > >                 BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
> > >         buf->f_fsid.val[1] ^=
> > >                 BTRFS_I(d_inode(dentry))->root->root_key.objectid;
> > >
> > > So subvolume root is xored into the FSID. Thus dentries from different
> > > subvolumes are going to return different fsids...
> > >
> >
> > Right... how could I have missed that :-/
> >
> > I do not want to bring back FSNOTIFY_EVENT_DENTRY just for that
> > and I saw how many flaws you pointed to in $SUBJECT patch.
> > Instead, I will use:
> > statfs_by_dentry(d_find_any_alias(inode) ?: inode->i_sb->sb_root,...
>
> So what about my proposal to store fsid in the notification mark when it is
> created and then use it when that mark results in event being generated?
> When mark is created, we have full path available, so getting proper fsid
> is trivial. Furthermore if the behavior is documented like that, it is
> fairly easy for userspace to track fsids it should care about and translate
> them to proper file descriptors for open_by_handle().
>
> This would get rid of statfs() on every event creation (which I don't like
> much) and also avoids these problems "how to get fsid for inode". What do
> you think?
>

That's interesting. I like the simplicity.
What happens when there are 2 btrfs subvols /subvol1 /subvol2
and the application obviously doesn't know about this and does:
fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM, ... /subvol1);
statfs("/subvol1",...);
fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM, ... /subvol2);
statfs("/subvol2",...);

Now the second fanotify_mark() call just updates the existing super block
mark mask, but does not change the fsid on the mark, so all events will have
fsid of subvol1 that was stored when first creating the mark.

fanotify-watch application (for example) hashes the watches (paths) under
/subvol2 by fid with fsid of subvol2, so events cannot get matched back to
"watch" (i.e. path).

And when trying to open_by_handle fid with fhandle from /subvol2
using mount_fd of /subvol1, I suppose we can either get ESTALE
or a disconnected dentry, because object from /subvol2 cannot
have a path inside /subvol1....

Am I making the issue clear? or maybe I am missing something?

Thanks,
Amir.
Amir Goldstein Nov. 22, 2018, 7:42 p.m. UTC | #9
On Thu, Nov 22, 2018 at 5:18 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Thu, Nov 22, 2018 at 3:26 PM Jan Kara <jack@suse.cz> wrote:
> >
> > On Thu 22-11-18 14:36:35, Amir Goldstein wrote:
> > > > > Regardless, IIUC, btrfs_statfs() returns an fsid which is associated with
> > > > > the single super block struct, so all dentries in all subvolumes will
> > > > > return the same fsid: btrfs_sb(dentry->d_sb)->fsid.
> > > >
> > > > This is not true AFAICT. Looking at btrfs_statfs() I can see:
> > > >
> > > >         buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
> > > >         buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
> > > >         /* Mask in the root object ID too, to disambiguate subvols */
> > > >         buf->f_fsid.val[0] ^=
> > > >                 BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
> > > >         buf->f_fsid.val[1] ^=
> > > >                 BTRFS_I(d_inode(dentry))->root->root_key.objectid;
> > > >
> > > > So subvolume root is xored into the FSID. Thus dentries from different
> > > > subvolumes are going to return different fsids...
> > > >
> > >
> > > Right... how could I have missed that :-/
> > >
> > > I do not want to bring back FSNOTIFY_EVENT_DENTRY just for that
> > > and I saw how many flaws you pointed to in $SUBJECT patch.
> > > Instead, I will use:
> > > statfs_by_dentry(d_find_any_alias(inode) ?: inode->i_sb->sb_root,...
> >
> > So what about my proposal to store fsid in the notification mark when it is
> > created and then use it when that mark results in event being generated?
> > When mark is created, we have full path available, so getting proper fsid
> > is trivial. Furthermore if the behavior is documented like that, it is
> > fairly easy for userspace to track fsids it should care about and translate
> > them to proper file descriptors for open_by_handle().
> >
> > This would get rid of statfs() on every event creation (which I don't like
> > much) and also avoids these problems "how to get fsid for inode". What do
> > you think?
> >
>
> That's interesting. I like the simplicity.
> What happens when there are 2 btrfs subvols /subvol1 /subvol2
> and the application obviously doesn't know about this and does:
> fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM, ... /subvol1);
> statfs("/subvol1",...);
> fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM, ... /subvol2);
> statfs("/subvol2",...);
>
> Now the second fanotify_mark() call just updates the existing super block
> mark mask, but does not change the fsid on the mark, so all events will have
> fsid of subvol1 that was stored when first creating the mark.
>
> fanotify-watch application (for example) hashes the watches (paths) under
> /subvol2 by fid with fsid of subvol2, so events cannot get matched back to
> "watch" (i.e. path).
>
> And when trying to open_by_handle fid with fhandle from /subvol2
> using mount_fd of /subvol1, I suppose we can either get ESTALE
> or a disconnected dentry, because object from /subvol2 cannot
> have a path inside /subvol1....
>
> Am I making the issue clear? or maybe I am missing something?
>

How about this compromise:

- On fanotify_mark() call statfs() for both dentry and dentry->d_sb->sb_root
- If they produce the same fsid, cache the fsid in the mark
  If they do not match invalidate existing cache and never check again
- When encoding fid, use cached fsid if exists, otherwise fallback to
  statfs_by_dentry(find_any_alias(inode) ?: inode->i_sb->sb_root)

Maybe better to return -EXDEV from fanotify_mark() on mismatch of fsid
of dentry and dentry->d_sb->sb_root? because it doesn't look like the
FAN_MARK_FILESYSTEM is going to be very useful for btrfs (??).

Other ideas?

Amir.
Jan Kara Nov. 23, 2018, 12:52 p.m. UTC | #10
Changed subject to better match what we discuss and added btrfs list to CC.

On Thu 22-11-18 17:18:25, Amir Goldstein wrote:
> On Thu, Nov 22, 2018 at 3:26 PM Jan Kara <jack@suse.cz> wrote:
> >
> > On Thu 22-11-18 14:36:35, Amir Goldstein wrote:
> > > > > Regardless, IIUC, btrfs_statfs() returns an fsid which is associated with
> > > > > the single super block struct, so all dentries in all subvolumes will
> > > > > return the same fsid: btrfs_sb(dentry->d_sb)->fsid.
> > > >
> > > > This is not true AFAICT. Looking at btrfs_statfs() I can see:
> > > >
> > > >         buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
> > > >         buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
> > > >         /* Mask in the root object ID too, to disambiguate subvols */
> > > >         buf->f_fsid.val[0] ^=
> > > >                 BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
> > > >         buf->f_fsid.val[1] ^=
> > > >                 BTRFS_I(d_inode(dentry))->root->root_key.objectid;
> > > >
> > > > So subvolume root is xored into the FSID. Thus dentries from different
> > > > subvolumes are going to return different fsids...
> > > >
> > >
> > > Right... how could I have missed that :-/
> > >
> > > I do not want to bring back FSNOTIFY_EVENT_DENTRY just for that
> > > and I saw how many flaws you pointed to in $SUBJECT patch.
> > > Instead, I will use:
> > > statfs_by_dentry(d_find_any_alias(inode) ?: inode->i_sb->sb_root,...
> >
> > So what about my proposal to store fsid in the notification mark when it is
> > created and then use it when that mark results in event being generated?
> > When mark is created, we have full path available, so getting proper fsid
> > is trivial. Furthermore if the behavior is documented like that, it is
> > fairly easy for userspace to track fsids it should care about and translate
> > them to proper file descriptors for open_by_handle().
> >
> > This would get rid of statfs() on every event creation (which I don't like
> > much) and also avoids these problems "how to get fsid for inode". What do
> > you think?
> >
> 
> That's interesting. I like the simplicity.
> What happens when there are 2 btrfs subvols /subvol1 /subvol2
> and the application obviously doesn't know about this and does:
> fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM, ... /subvol1);
> statfs("/subvol1",...);
> fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM, ... /subvol2);
> statfs("/subvol2",...);
> 
> Now the second fanotify_mark() call just updates the existing super block
> mark mask, but does not change the fsid on the mark, so all events will have
> fsid of subvol1 that was stored when first creating the mark.

Yes.

> fanotify-watch application (for example) hashes the watches (paths) under
> /subvol2 by fid with fsid of subvol2, so events cannot get matched back to
> "watch" (i.e. path).

I agree this can be confusing... but with btrfs fanotify-watch will be
confused even with your current code, won't it? Because FAN_MARK_FILESYSTEM
on /subvol1 (with fsid A) is going to return also events on inodes from
/subvol2 (with fsid B). So your current code will return event with fsid B
which fanotify-watch has no way to match back and can get confused.

So currently application can get events with fsid it has never seen, with
the code as I suggest it can get "wrong" fsid. That is confusing but still
somewhat better?

The core of the problem is that btrfs does not have "the superblock
identifier" that would correspond to FAN_MARK_FILESYSTEM scope of events
that we could use.

> And when trying to open_by_handle fid with fhandle from /subvol2
> using mount_fd of /subvol1, I suppose we can either get ESTALE
> or a disconnected dentry, because object from /subvol2 cannot
> have a path inside /subvol1....

So open_by_handle() should work fine even if we get mount_fd of /subvol1
and handle for inode on /subvol2. mount_fd in open_by_handle() is really
only used to get the superblock and that is the same.

								Honza
Jan Kara Nov. 23, 2018, 12:56 p.m. UTC | #11
On Thu 22-11-18 21:42:45, Amir Goldstein wrote:
> On Thu, Nov 22, 2018 at 5:18 PM Amir Goldstein <amir73il@gmail.com> wrote:
> >
> > On Thu, Nov 22, 2018 at 3:26 PM Jan Kara <jack@suse.cz> wrote:
> > >
> > > On Thu 22-11-18 14:36:35, Amir Goldstein wrote:
> > > > > > Regardless, IIUC, btrfs_statfs() returns an fsid which is associated with
> > > > > > the single super block struct, so all dentries in all subvolumes will
> > > > > > return the same fsid: btrfs_sb(dentry->d_sb)->fsid.
> > > > >
> > > > > This is not true AFAICT. Looking at btrfs_statfs() I can see:
> > > > >
> > > > >         buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
> > > > >         buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
> > > > >         /* Mask in the root object ID too, to disambiguate subvols */
> > > > >         buf->f_fsid.val[0] ^=
> > > > >                 BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
> > > > >         buf->f_fsid.val[1] ^=
> > > > >                 BTRFS_I(d_inode(dentry))->root->root_key.objectid;
> > > > >
> > > > > So subvolume root is xored into the FSID. Thus dentries from different
> > > > > subvolumes are going to return different fsids...
> > > > >
> > > >
> > > > Right... how could I have missed that :-/
> > > >
> > > > I do not want to bring back FSNOTIFY_EVENT_DENTRY just for that
> > > > and I saw how many flaws you pointed to in $SUBJECT patch.
> > > > Instead, I will use:
> > > > statfs_by_dentry(d_find_any_alias(inode) ?: inode->i_sb->sb_root,...
> > >
> > > So what about my proposal to store fsid in the notification mark when it is
> > > created and then use it when that mark results in event being generated?
> > > When mark is created, we have full path available, so getting proper fsid
> > > is trivial. Furthermore if the behavior is documented like that, it is
> > > fairly easy for userspace to track fsids it should care about and translate
> > > them to proper file descriptors for open_by_handle().
> > >
> > > This would get rid of statfs() on every event creation (which I don't like
> > > much) and also avoids these problems "how to get fsid for inode". What do
> > > you think?
> > >
> >
> > That's interesting. I like the simplicity.
> > What happens when there are 2 btrfs subvols /subvol1 /subvol2
> > and the application obviously doesn't know about this and does:
> > fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM, ... /subvol1);
> > statfs("/subvol1",...);
> > fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM, ... /subvol2);
> > statfs("/subvol2",...);
> >
> > Now the second fanotify_mark() call just updates the existing super block
> > mark mask, but does not change the fsid on the mark, so all events will have
> > fsid of subvol1 that was stored when first creating the mark.
> >
> > fanotify-watch application (for example) hashes the watches (paths) under
> > /subvol2 by fid with fsid of subvol2, so events cannot get matched back to
> > "watch" (i.e. path).
> >
> > And when trying to open_by_handle fid with fhandle from /subvol2
> > using mount_fd of /subvol1, I suppose we can either get ESTALE
> > or a disconnected dentry, because object from /subvol2 cannot
> > have a path inside /subvol1....
> >
> > Am I making the issue clear? or maybe I am missing something?
> >
> 
> How about this compromise:
> 
> - On fanotify_mark() call statfs() for both dentry and dentry->d_sb->sb_root
> - If they produce the same fsid, cache the fsid in the mark
>   If they do not match invalidate existing cache and never check again
> - When encoding fid, use cached fsid if exists, otherwise fallback to
>   statfs_by_dentry(find_any_alias(inode) ?: inode->i_sb->sb_root)

I don't think this is really better - see my previous email.

> Maybe better to return -EXDEV from fanotify_mark() on mismatch of fsid
> of dentry and dentry->d_sb->sb_root? because it doesn't look like the
> FAN_MARK_FILESYSTEM is going to be very useful for btrfs (??).

Well, I think FAN_MARK_FILESYSTEM is useful. Path events have no problem,
events using FID will work as well when using open_by_handle(), just the
reported fsid may be confusing... I'm undecided what is better: returning
EXDEV or just silently dropping the other fsid.

								Honza
Amir Goldstein Nov. 23, 2018, 1:34 p.m. UTC | #12
On Fri, Nov 23, 2018 at 2:52 PM Jan Kara <jack@suse.cz> wrote:
>
> Changed subject to better match what we discuss and added btrfs list to CC.
>
> On Thu 22-11-18 17:18:25, Amir Goldstein wrote:
> > On Thu, Nov 22, 2018 at 3:26 PM Jan Kara <jack@suse.cz> wrote:
> > >
> > > On Thu 22-11-18 14:36:35, Amir Goldstein wrote:
> > > > > > Regardless, IIUC, btrfs_statfs() returns an fsid which is associated with
> > > > > > the single super block struct, so all dentries in all subvolumes will
> > > > > > return the same fsid: btrfs_sb(dentry->d_sb)->fsid.
> > > > >
> > > > > This is not true AFAICT. Looking at btrfs_statfs() I can see:
> > > > >
> > > > >         buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
> > > > >         buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
> > > > >         /* Mask in the root object ID too, to disambiguate subvols */
> > > > >         buf->f_fsid.val[0] ^=
> > > > >                 BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
> > > > >         buf->f_fsid.val[1] ^=
> > > > >                 BTRFS_I(d_inode(dentry))->root->root_key.objectid;
> > > > >
> > > > > So subvolume root is xored into the FSID. Thus dentries from different
> > > > > subvolumes are going to return different fsids...
> > > > >
> > > >
> > > > Right... how could I have missed that :-/
> > > >
> > > > I do not want to bring back FSNOTIFY_EVENT_DENTRY just for that
> > > > and I saw how many flaws you pointed to in $SUBJECT patch.
> > > > Instead, I will use:
> > > > statfs_by_dentry(d_find_any_alias(inode) ?: inode->i_sb->sb_root,...
> > >
> > > So what about my proposal to store fsid in the notification mark when it is
> > > created and then use it when that mark results in event being generated?
> > > When mark is created, we have full path available, so getting proper fsid
> > > is trivial. Furthermore if the behavior is documented like that, it is
> > > fairly easy for userspace to track fsids it should care about and translate
> > > them to proper file descriptors for open_by_handle().
> > >
> > > This would get rid of statfs() on every event creation (which I don't like
> > > much) and also avoids these problems "how to get fsid for inode". What do
> > > you think?
> > >
> >
> > That's interesting. I like the simplicity.
> > What happens when there are 2 btrfs subvols /subvol1 /subvol2
> > and the application obviously doesn't know about this and does:
> > fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM, ... /subvol1);
> > statfs("/subvol1",...);
> > fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM, ... /subvol2);
> > statfs("/subvol2",...);
> >
> > Now the second fanotify_mark() call just updates the existing super block
> > mark mask, but does not change the fsid on the mark, so all events will have
> > fsid of subvol1 that was stored when first creating the mark.
>
> Yes.
>
> > fanotify-watch application (for example) hashes the watches (paths) under
> > /subvol2 by fid with fsid of subvol2, so events cannot get matched back to
> > "watch" (i.e. path).
>
> I agree this can be confusing... but with btrfs fanotify-watch will be
> confused even with your current code, won't it? Because FAN_MARK_FILESYSTEM
> on /subvol1 (with fsid A) is going to return also events on inodes from
> /subvol2 (with fsid B). So your current code will return event with fsid B
> which fanotify-watch has no way to match back and can get confused.
>
> So currently application can get events with fsid it has never seen, with
> the code as I suggest it can get "wrong" fsid. That is confusing but still
> somewhat better?

It's not better IMO because the purpose of the fid in event is a unique key
that application can use to match a path it already indexed.
wrong fsid => no match.
Using open_by_handle_at() to resolve unknown fid is a limited solution
and I don't think it is going to work in this case (see below).

>
> The core of the problem is that btrfs does not have "the superblock
> identifier" that would correspond to FAN_MARK_FILESYSTEM scope of events
> that we could use.
>
> > And when trying to open_by_handle fid with fhandle from /subvol2
> > using mount_fd of /subvol1, I suppose we can either get ESTALE
> > or a disconnected dentry, because object from /subvol2 cannot
> > have a path inside /subvol1....
>
> So open_by_handle() should work fine even if we get mount_fd of /subvol1
> and handle for inode on /subvol2. mount_fd in open_by_handle() is really
> only used to get the superblock and that is the same.

I don't think it will work fine.
do_handle_to_path() will compose a path from /subvol1 mnt and dentry from
/subvol2. This may resolve to a full path that does not really exist,
so application
cannot match it to anything that is can use name_to_handle_at() to identify.

The whole thing just sounds too messy. At the very least we need more time
to communicate with btrfs developers and figure this out, so I am going to
go with -EXDEV for any attempt to set *any* mark on a group with
FAN_REPORT_FID if fsid of fanotify_mark() path argument is different
from fsid of path->dentry->d_sb->s_root.

We can relax that later if we figure out a better way.

BTW, I am also going to go with -ENODEV for zero fsid (e.g. tmpfs).
tmpfs can be easily fixed to have non zero fsid if filesystem watch on tmpfs is
important.

Thanks,
Amir.
Amir Goldstein Nov. 23, 2018, 1:42 p.m. UTC | #13
> > How about this compromise:
> >
> > - On fanotify_mark() call statfs() for both dentry and dentry->d_sb->sb_root
> > - If they produce the same fsid, cache the fsid in the mark
> >   If they do not match invalidate existing cache and never check again
> > - When encoding fid, use cached fsid if exists, otherwise fallback to
> >   statfs_by_dentry(find_any_alias(inode) ?: inode->i_sb->sb_root)
>
> I don't think this is really better - see my previous email.

No I think it is horid, because results are unexpected and the same
type of event
on the same object can produce different fsid over time.

>
> > Maybe better to return -EXDEV from fanotify_mark() on mismatch of fsid
> > of dentry and dentry->d_sb->sb_root? because it doesn't look like the
> > FAN_MARK_FILESYSTEM is going to be very useful for btrfs (??).
>
> Well, I think FAN_MARK_FILESYSTEM is useful. Path events have no problem,

I wasn't indenting to deny btrfs of FAN_MARK_FILESYSTEM with event->fd.
Using a group with FAN_REPORT_FID will have limited support on btrfs
(only for setting marks on root volume?).

> events using FID will work as well when using open_by_handle(), just the
> reported fsid may be confusing... I'm undecided what is better: returning
> EXDEV or just silently dropping the other fsid.
>

As I wrote in other thread, I don't think open_by_handle() is really going to
work. you may get an open file, but with an invalid path that is not useful
for filesystem monitoring application.

So if you don't object, I'll go with EXDEV and we can see later about
relaxing it.

Thanks,
Amir.
Amir Goldstein Nov. 23, 2018, 5:53 p.m. UTC | #14
On Fri, Nov 23, 2018 at 3:34 PM Amir Goldstein <amir73il@gmail.com> wrote:
>
> On Fri, Nov 23, 2018 at 2:52 PM Jan Kara <jack@suse.cz> wrote:
> >
> > Changed subject to better match what we discuss and added btrfs list to CC.
> >
> > On Thu 22-11-18 17:18:25, Amir Goldstein wrote:
> > > On Thu, Nov 22, 2018 at 3:26 PM Jan Kara <jack@suse.cz> wrote:
> > > >
> > > > On Thu 22-11-18 14:36:35, Amir Goldstein wrote:
> > > > > > > Regardless, IIUC, btrfs_statfs() returns an fsid which is associated with
> > > > > > > the single super block struct, so all dentries in all subvolumes will
> > > > > > > return the same fsid: btrfs_sb(dentry->d_sb)->fsid.
> > > > > >
> > > > > > This is not true AFAICT. Looking at btrfs_statfs() I can see:
> > > > > >
> > > > > >         buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
> > > > > >         buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
> > > > > >         /* Mask in the root object ID too, to disambiguate subvols */
> > > > > >         buf->f_fsid.val[0] ^=
> > > > > >                 BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
> > > > > >         buf->f_fsid.val[1] ^=
> > > > > >                 BTRFS_I(d_inode(dentry))->root->root_key.objectid;
> > > > > >
> > > > > > So subvolume root is xored into the FSID. Thus dentries from different
> > > > > > subvolumes are going to return different fsids...
> > > > > >
> > > > >
> > > > > Right... how could I have missed that :-/
> > > > >
> > > > > I do not want to bring back FSNOTIFY_EVENT_DENTRY just for that
> > > > > and I saw how many flaws you pointed to in $SUBJECT patch.
> > > > > Instead, I will use:
> > > > > statfs_by_dentry(d_find_any_alias(inode) ?: inode->i_sb->sb_root,...
> > > >
> > > > So what about my proposal to store fsid in the notification mark when it is
> > > > created and then use it when that mark results in event being generated?
> > > > When mark is created, we have full path available, so getting proper fsid
> > > > is trivial. Furthermore if the behavior is documented like that, it is
> > > > fairly easy for userspace to track fsids it should care about and translate
> > > > them to proper file descriptors for open_by_handle().
> > > >
> > > > This would get rid of statfs() on every event creation (which I don't like
> > > > much) and also avoids these problems "how to get fsid for inode". What do
> > > > you think?
> > > >
> > >
> > > That's interesting. I like the simplicity.
> > > What happens when there are 2 btrfs subvols /subvol1 /subvol2
> > > and the application obviously doesn't know about this and does:
> > > fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM, ... /subvol1);
> > > statfs("/subvol1",...);
> > > fanotify_mark(fd, FAN_MARK_ADD|FAN_MARK_FILESYSTEM, ... /subvol2);
> > > statfs("/subvol2",...);
> > >
> > > Now the second fanotify_mark() call just updates the existing super block
> > > mark mask, but does not change the fsid on the mark, so all events will have
> > > fsid of subvol1 that was stored when first creating the mark.
> >
> > Yes.
> >
> > > fanotify-watch application (for example) hashes the watches (paths) under
> > > /subvol2 by fid with fsid of subvol2, so events cannot get matched back to
> > > "watch" (i.e. path).
> >
> > I agree this can be confusing... but with btrfs fanotify-watch will be
> > confused even with your current code, won't it? Because FAN_MARK_FILESYSTEM
> > on /subvol1 (with fsid A) is going to return also events on inodes from
> > /subvol2 (with fsid B). So your current code will return event with fsid B
> > which fanotify-watch has no way to match back and can get confused.
> >
> > So currently application can get events with fsid it has never seen, with
> > the code as I suggest it can get "wrong" fsid. That is confusing but still
> > somewhat better?
>
> It's not better IMO because the purpose of the fid in event is a unique key
> that application can use to match a path it already indexed.
> wrong fsid => no match.
> Using open_by_handle_at() to resolve unknown fid is a limited solution
> and I don't think it is going to work in this case (see below).
>
> >
> > The core of the problem is that btrfs does not have "the superblock
> > identifier" that would correspond to FAN_MARK_FILESYSTEM scope of events
> > that we could use.
> >
> > > And when trying to open_by_handle fid with fhandle from /subvol2
> > > using mount_fd of /subvol1, I suppose we can either get ESTALE
> > > or a disconnected dentry, because object from /subvol2 cannot
> > > have a path inside /subvol1....
> >
> > So open_by_handle() should work fine even if we get mount_fd of /subvol1
> > and handle for inode on /subvol2. mount_fd in open_by_handle() is really
> > only used to get the superblock and that is the same.
>
> I don't think it will work fine.
> do_handle_to_path() will compose a path from /subvol1 mnt and dentry from
> /subvol2. This may resolve to a full path that does not really exist,
> so application
> cannot match it to anything that is can use name_to_handle_at() to identify.
>
> The whole thing just sounds too messy. At the very least we need more time
> to communicate with btrfs developers and figure this out, so I am going to
> go with -EXDEV for any attempt to set *any* mark on a group with
> FAN_REPORT_FID if fsid of fanotify_mark() path argument is different
> from fsid of path->dentry->d_sb->s_root.
>
> We can relax that later if we figure out a better way.
>
> BTW, I am also going to go with -ENODEV for zero fsid (e.g. tmpfs).
> tmpfs can be easily fixed to have non zero fsid if filesystem watch on tmpfs is
> important.
>

Well, this is interesting... I have implemented the -EXDEV logic and it
works as expected. I can set a filesystem global watch on the main
btrfs mount and not allowed to set a global watch on a subvolume.

The interesting part is that the global watch on the main btrfs volume
more useful than I though it would be. The file handles reported by the
main volume global watch are resolved to correct paths in the main volume.
I guess this is because a btrfs subvolume looks like a directory tree
in the global
namespace to vfs. See below.

So I will continue based on this working POC:
https://github.com/amir73il/linux/commits/fanotify_fid

Note that in the POC, fsid is cached in mark connector as you suggested.
It is still buggy, but my prototype always decodes file handles from the first
path argument given to the program, so it just goes to show that by getting
fsid of the main btrfs volume, the application will be able to properly decode
file handles and resolve correct paths.

The bottom line is that btrfs will have the full functionality of super block
monitoring with no ability to watch (or ignore) a single subvolume
(unless by using a mount mark).

Thanks,
Amir.

=============

root@kvm-xfstests:~# mount -t btrfs
/dev/vde on /vde type btrfs (rw,relatime,space_cache,subvolid=5,subvol=/)
/dev/vde on /mnt type btrfs
(rw,relatime,space_cache,subvolid=257,subvol=/subvol)

root@kvm-xfstests:~# /vtmp/inotifywait -m -e open,close -g /mnt
Setting up global filesystem watches.
Couldn't add global watch(es) /mnt...: Invalid cross-device link

root@kvm-xfstests:~# /vtmp/inotifywait -m -e open,close -g /vde &
Setting up global filesystem watches.
Watches established.

root@kvm-xfstests:~# mkdir -p /mnt/a/b/c/d/e/ && touch /mnt/a/b/c/d/e/x
Start watching /vde/subvol/a (fid=107...).
/vde/subvol/a OPEN
/vde/subvol/a CLOSE_NOWRITE,CLOSE
/vde/subvol/a CLOSE_NOWRITE,OPEN,CLOSE
Start watching /vde/subvol/a/b (fid=108...).
/vde/subvol/a/b CLOSE_NOWRITE,OPEN,CLOSE
Start watching /vde/subvol/a/b/c (fid=109...).
/vde/subvol/a/b/c CLOSE_NOWRITE,OPEN,CLOSE
Start watching /vde/subvol/a/b/c/d (fid=10a...).
/vde/subvol/a/b/c/d CLOSE_NOWRITE,OPEN,CLOSE
/vde/subvol/a/b CLOSE_NOWRITE,OPEN,CLOSE
/vde/subvol/a/b/c CLOSE_NOWRITE,OPEN,CLOSE
Start watching /vde/subvol/a/b/c/d/e/x (fid=10c...).
/vde/subvol/a/b/c/d/e/x CLOSE_WRITE,OPEN,CLOSE
/vde/subvol/a/b/c/d CLOSE_NOWRITE,OPEN,CLOSE
/vde/subvol/a/b/c/d/e/x CLOSE_NOWRITE,OPEN,CLOSE
Jan Kara Nov. 27, 2018, 8:35 a.m. UTC | #15
On Fri 23-11-18 19:53:11, Amir Goldstein wrote:
> On Fri, Nov 23, 2018 at 3:34 PM Amir Goldstein <amir73il@gmail.com> wrote:
> > > So open_by_handle() should work fine even if we get mount_fd of /subvol1
> > > and handle for inode on /subvol2. mount_fd in open_by_handle() is really
> > > only used to get the superblock and that is the same.
> >
> > I don't think it will work fine.
> > do_handle_to_path() will compose a path from /subvol1 mnt and dentry from
> > /subvol2. This may resolve to a full path that does not really exist,
> > so application
> > cannot match it to anything that is can use name_to_handle_at() to identify.
> >
> > The whole thing just sounds too messy. At the very least we need more time
> > to communicate with btrfs developers and figure this out, so I am going to
> > go with -EXDEV for any attempt to set *any* mark on a group with
> > FAN_REPORT_FID if fsid of fanotify_mark() path argument is different
> > from fsid of path->dentry->d_sb->s_root.
> >
> > We can relax that later if we figure out a better way.
> >
> > BTW, I am also going to go with -ENODEV for zero fsid (e.g. tmpfs).
> > tmpfs can be easily fixed to have non zero fsid if filesystem watch on tmpfs is
> > important.
> >
> 
> Well, this is interesting... I have implemented the -EXDEV logic and it
> works as expected. I can set a filesystem global watch on the main
> btrfs mount and not allowed to set a global watch on a subvolume.
> 
> The interesting part is that the global watch on the main btrfs volume
> more useful than I though it would be. The file handles reported by the
> main volume global watch are resolved to correct paths in the main volume.
> I guess this is because a btrfs subvolume looks like a directory tree
> in the global
> namespace to vfs. See below.
> 
> So I will continue based on this working POC:
> https://github.com/amir73il/linux/commits/fanotify_fid
> 
> Note that in the POC, fsid is cached in mark connector as you suggested.
> It is still buggy, but my prototype always decodes file handles from the first
> path argument given to the program, so it just goes to show that by getting
> fsid of the main btrfs volume, the application will be able to properly decode
> file handles and resolve correct paths.
> 
> The bottom line is that btrfs will have the full functionality of super block
> monitoring with no ability to watch (or ignore) a single subvolume
> (unless by using a mount mark).

Sounds good. I'll check the new version of your series.

								Honza
diff mbox series

Patch

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ecf09b6243d9..6a120b7f8b94 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -178,11 +178,11 @@  int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask
 
 		take_dentry_name_snapshot(&name, dentry);
 		if (path)
-			ret = fsnotify(p_inode, mask, path, FSNOTIFY_EVENT_PATH,
-				       name.name, 0);
+			ret = fsnotify(p_inode, mask, path,
+				       FSNOTIFY_EVENT_PATH, name.name, 0);
 		else
-			ret = fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
-				       name.name, 0);
+			ret = fsnotify(p_inode, mask, dentry,
+				       FSNOTIFY_EVENT_DENTRY, name.name, 0);
 		release_dentry_name_snapshot(&name);
 	}
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 2ccb08cb5d6a..9dadc0bcd7a9 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -99,14 +99,14 @@  static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 
 	fsnotify(old_dir, old_dir_mask, source, FSNOTIFY_EVENT_INODE, old_name,
 		 fs_cookie);
-	fsnotify(new_dir, new_dir_mask, source, FSNOTIFY_EVENT_INODE, new_name,
+	fsnotify(new_dir, new_dir_mask, moved, FSNOTIFY_EVENT_DENTRY, new_name,
 		 fs_cookie);
 
 	if (target)
 		fsnotify_link_count(target);
 
 	if (source)
-		fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+		fsnotify(source, FS_MOVE_SELF, moved, FSNOTIFY_EVENT_DENTRY, NULL, 0);
 	audit_inode_child(new_dir, moved, AUDIT_TYPE_CHILD_CREATE);
 }
 
@@ -155,7 +155,7 @@  static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 {
 	audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE);
 
-	fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0);
+	fsnotify(inode, FS_CREATE, dentry, FSNOTIFY_EVENT_DENTRY, dentry->d_name.name, 0);
 }
 
 /*
@@ -168,7 +168,7 @@  static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct
 	fsnotify_link_count(inode);
 	audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE);
 
-	fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE, new_dentry->d_name.name, 0);
+	fsnotify(dir, FS_CREATE, new_dentry, FSNOTIFY_EVENT_DENTRY, new_dentry->d_name.name, 0);
 }
 
 /*
@@ -177,11 +177,10 @@  static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct
 static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 {
 	__u32 mask = (FS_CREATE | FS_ISDIR);
-	struct inode *d_inode = dentry->d_inode;
 
 	audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE);
 
-	fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0);
+	fsnotify(inode, mask, dentry, FSNOTIFY_EVENT_DENTRY, dentry->d_name.name, 0);
 }
 
 /*
@@ -262,7 +261,7 @@  static inline void fsnotify_xattr(struct dentry *dentry)
 		mask |= FS_ISDIR;
 
 	fsnotify_parent(NULL, dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+	fsnotify(inode, mask, dentry, FSNOTIFY_EVENT_DENTRY, NULL, 0);
 }
 
 /*
@@ -297,7 +296,7 @@  static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 			mask |= FS_ISDIR;
 
 		fsnotify_parent(NULL, dentry, mask);
-		fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+		fsnotify(inode, mask, dentry, FSNOTIFY_EVENT_DENTRY, NULL, 0);
 	}
 }
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 7639774e7475..24d92455be03 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -208,10 +208,11 @@  struct fsnotify_group {
 	};
 };
 
-/* when calling fsnotify tell it if the data is a path or inode */
+/* when calling fsnotify tell it if the data is a path or inode or dentry */
 #define FSNOTIFY_EVENT_NONE	0
 #define FSNOTIFY_EVENT_PATH	1
 #define FSNOTIFY_EVENT_INODE	2
+#define FSNOTIFY_EVENT_DENTRY	3
 
 enum fsnotify_obj_type {
 	FSNOTIFY_OBJ_TYPE_INODE,