diff mbox series

[06/18] fsinfo: Add a uniquifier ID to struct mount [ver #21]

Message ID 159646183662.1784947.5709738540440380373.stgit@warthog.procyon.org.uk (mailing list archive)
State New, archived
Headers show
Series VFS: Filesystem information [ver #21] | expand

Commit Message

David Howells Aug. 3, 2020, 1:37 p.m. UTC
Add a uniquifier ID to struct mount that is effectively unique over the
kernel lifetime to deal around mnt_id values being reused.  This can then
be exported through fsinfo() to allow detection of replacement mounts that
happen to end up with the same mount ID.

The normal mount handle is still used for referring to a particular mount.

The mount notification is then changed to convey these unique mount IDs
rather than the mount handle.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 fs/mount.h        |    3 +++
 fs/mount_notify.c |    4 ++--
 fs/namespace.c    |    3 +++
 3 files changed, 8 insertions(+), 2 deletions(-)

Comments

Miklos Szeredi Aug. 4, 2020, 10:41 a.m. UTC | #1
On Mon, Aug 03, 2020 at 02:37:16PM +0100, David Howells wrote:
> Add a uniquifier ID to struct mount that is effectively unique over the
> kernel lifetime to deal around mnt_id values being reused.  This can then
> be exported through fsinfo() to allow detection of replacement mounts that
> happen to end up with the same mount ID.
> 
> The normal mount handle is still used for referring to a particular mount.
> 
> The mount notification is then changed to convey these unique mount IDs
> rather than the mount handle.
> 
> Signed-off-by: David Howells <dhowells@redhat.com>
> ---
> 
>  fs/mount.h        |    3 +++
>  fs/mount_notify.c |    4 ++--
>  fs/namespace.c    |    3 +++
>  3 files changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/mount.h b/fs/mount.h
> index 85456a5f5a3a..1037781be055 100644
> --- a/fs/mount.h
> +++ b/fs/mount.h
> @@ -79,6 +79,9 @@ struct mount {
>  	int mnt_expiry_mark;		/* true if marked for expiry */
>  	struct hlist_head mnt_pins;
>  	struct hlist_head mnt_stuck_children;
> +#ifdef CONFIG_FSINFO
> +	u64	mnt_unique_id;		/* ID unique over lifetime of kernel */
> +#endif

Not sure if it's worth making conditional.

>  #ifdef CONFIG_MOUNT_NOTIFICATIONS
>  	struct watch_list *mnt_watchers; /* Watches on dentries within this mount */
>  #endif
> diff --git a/fs/mount_notify.c b/fs/mount_notify.c
> index 44f570e4cebe..d8ba66ed5f77 100644
> --- a/fs/mount_notify.c
> +++ b/fs/mount_notify.c
> @@ -90,7 +90,7 @@ void notify_mount(struct mount *trigger,
>  	n.watch.type	= WATCH_TYPE_MOUNT_NOTIFY;
>  	n.watch.subtype	= subtype;
>  	n.watch.info	= info_flags | watch_sizeof(n);
> -	n.triggered_on	= trigger->mnt_id;
> +	n.triggered_on	= trigger->mnt_unique_id;
>  
>  	switch (subtype) {
>  	case NOTIFY_MOUNT_EXPIRY:
> @@ -102,7 +102,7 @@ void notify_mount(struct mount *trigger,
>  	case NOTIFY_MOUNT_UNMOUNT:
>  	case NOTIFY_MOUNT_MOVE_FROM:
>  	case NOTIFY_MOUNT_MOVE_TO:
> -		n.auxiliary_mount	= aux->mnt_id;
> +		n.auxiliary_mount = aux->mnt_unique_id;

Hmm, so we now have two ID's:

 - one can be used to look up the mount
 - one is guaranteed to be unique

With this change the mount cannot be looked up with FSINFO_FLAGS_QUERY_MOUNT,
right?

Should we be merging the two ID's into a single one which has both properties?

>  		break;
>  
>  	default:
> diff --git a/fs/namespace.c b/fs/namespace.c
> index b2b9920ffd3c..1db8a64cd76f 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -115,6 +115,9 @@ static int mnt_alloc_id(struct mount *mnt)
>  	if (res < 0)
>  		return res;
>  	mnt->mnt_id = res;
> +#ifdef CONFIG_FSINFO
> +	mnt->mnt_unique_id = atomic64_inc_return(&vfs_unique_counter);
> +#endif
>  	return 0;
>  }
>  
> 
>
Ian Kent Aug. 4, 2020, 12:32 p.m. UTC | #2
On Tue, 2020-08-04 at 12:41 +0200, Miklos Szeredi wrote:
> On Mon, Aug 03, 2020 at 02:37:16PM +0100, David Howells wrote:
> > Add a uniquifier ID to struct mount that is effectively unique over
> > the
> > kernel lifetime to deal around mnt_id values being reused.  This
> > can then
> > be exported through fsinfo() to allow detection of replacement
> > mounts that
> > happen to end up with the same mount ID.
> > 
> > The normal mount handle is still used for referring to a particular
> > mount.
> > 
> > The mount notification is then changed to convey these unique mount
> > IDs
> > rather than the mount handle.
> > 
> > Signed-off-by: David Howells <dhowells@redhat.com>
> > ---
> > 
> >  fs/mount.h        |    3 +++
> >  fs/mount_notify.c |    4 ++--
> >  fs/namespace.c    |    3 +++
> >  3 files changed, 8 insertions(+), 2 deletions(-)
> > 
> > diff --git a/fs/mount.h b/fs/mount.h
> > index 85456a5f5a3a..1037781be055 100644
> > --- a/fs/mount.h
> > +++ b/fs/mount.h
> > @@ -79,6 +79,9 @@ struct mount {
> >  	int mnt_expiry_mark;		/* true if marked for
> > expiry */
> >  	struct hlist_head mnt_pins;
> >  	struct hlist_head mnt_stuck_children;
> > +#ifdef CONFIG_FSINFO
> > +	u64	mnt_unique_id;		/* ID unique over lifetime of
> > kernel */
> > +#endif
> 
> Not sure if it's worth making conditional.
> 
> >  #ifdef CONFIG_MOUNT_NOTIFICATIONS
> >  	struct watch_list *mnt_watchers; /* Watches on dentries within
> > this mount */
> >  #endif
> > diff --git a/fs/mount_notify.c b/fs/mount_notify.c
> > index 44f570e4cebe..d8ba66ed5f77 100644
> > --- a/fs/mount_notify.c
> > +++ b/fs/mount_notify.c
> > @@ -90,7 +90,7 @@ void notify_mount(struct mount *trigger,
> >  	n.watch.type	= WATCH_TYPE_MOUNT_NOTIFY;
> >  	n.watch.subtype	= subtype;
> >  	n.watch.info	= info_flags | watch_sizeof(n);
> > -	n.triggered_on	= trigger->mnt_id;
> > +	n.triggered_on	= trigger->mnt_unique_id;
> >  
> >  	switch (subtype) {
> >  	case NOTIFY_MOUNT_EXPIRY:
> > @@ -102,7 +102,7 @@ void notify_mount(struct mount *trigger,
> >  	case NOTIFY_MOUNT_UNMOUNT:
> >  	case NOTIFY_MOUNT_MOVE_FROM:
> >  	case NOTIFY_MOUNT_MOVE_TO:
> > -		n.auxiliary_mount	= aux->mnt_id;
> > +		n.auxiliary_mount = aux->mnt_unique_id;
> 
> Hmm, so we now have two ID's:
> 
>  - one can be used to look up the mount
>  - one is guaranteed to be unique
> 
> With this change the mount cannot be looked up with
> FSINFO_FLAGS_QUERY_MOUNT,
> right?
> 
> Should we be merging the two ID's into a single one which has both
> properties?

I'd been thinking we would probably need to change to 64 bit ids
for a while now and I thought that was what was going to happen.

We'll need to change libmount and current code but better early
on than later.

Ian

> 
> >  		break;
> >  
> >  	default:
> > diff --git a/fs/namespace.c b/fs/namespace.c
> > index b2b9920ffd3c..1db8a64cd76f 100644
> > --- a/fs/namespace.c
> > +++ b/fs/namespace.c
> > @@ -115,6 +115,9 @@ static int mnt_alloc_id(struct mount *mnt)
> >  	if (res < 0)
> >  		return res;
> >  	mnt->mnt_id = res;
> > +#ifdef CONFIG_FSINFO
> > +	mnt->mnt_unique_id = atomic64_inc_return(&vfs_unique_counter);
> > +#endif
> >  	return 0;
> >  }
> >  
> > 
> >
David Howells Aug. 5, 2020, 2:13 p.m. UTC | #3
Miklos Szeredi <miklos@szeredi.hu> wrote:

> > +#ifdef CONFIG_FSINFO
> > +	u64	mnt_unique_id;		/* ID unique over lifetime of kernel */
> > +#endif
>
> Not sure if it's worth making conditional.

You can't get at it without CONFIG_FSINFO=y as it stands, but making it
unconditional might be reasonable.

> > -		n.auxiliary_mount	= aux->mnt_id;
> > +		n.auxiliary_mount = aux->mnt_unique_id;
>
> Hmm, so we now have two ID's:
>
>  - one can be used to look up the mount
>  - one is guaranteed to be unique
>
> With this change the mount cannot be looked up with FSINFO_FLAGS_QUERY_MOUNT,
> right?
>
> Should we be merging the two ID's into a single one which has both properties?

Ideally, yes... but...  The 31-bit mnt_id is currently exposed to userspace in
various places, e.g. /proc, sys_name_to_handle_at().  So we have to keep that
as is and we can't expand it.

For fsinfo(), however, it might make sense to only use the 64-bit uniquifier
as the identifier to use for direct look up.

However, looking up that identifier requires some sort of structure for doing
this and it's kind of worst case for the IDR tree as the keys are gradually
going to spread out, causing it to eat more memory.  It may be a tradeoff
worth making, and the memory consumption might not be that bad - or we could
use some other data structure such as an rbtree.

That's why I was going for the 31-bit identifier + uniquifier so that you can at
least tell if the identifier got recycled reasonably quickly.

David
Miklos Szeredi Aug. 5, 2020, 2:46 p.m. UTC | #4
On Wed, Aug 5, 2020 at 4:14 PM David Howells <dhowells@redhat.com> wrote:

> However, looking up that identifier requires some sort of structure for doing
> this and it's kind of worst case for the IDR tree as the keys are gradually
> going to spread out, causing it to eat more memory.  It may be a tradeoff
> worth making, and the memory consumption might not be that bad - or we could
> use some other data structure such as an rbtree.

idr_alloc_cyclic() seems to be a good template for doing the lower
32bit allocation, and we can add code to increment the high 32bit on
wraparound.

Lots of code uses idr_alloc_cyclic() so I guess it shouldn't be too
bad in terms of memory use or performance.

Thanks,
Miklos
David Howells Aug. 5, 2020, 3:30 p.m. UTC | #5
Miklos Szeredi <miklos@szeredi.hu> wrote:

> idr_alloc_cyclic() seems to be a good template for doing the lower
> 32bit allocation, and we can add code to increment the high 32bit on
> wraparound.
> 
> Lots of code uses idr_alloc_cyclic() so I guess it shouldn't be too
> bad in terms of memory use or performance.

It's optimised for shortness of path and trades memory for performance.  It's
currently implemented using an xarray, so memory usage is dependent on the
sparseness of the tree.  Each node in the tree is 576 bytes and in the worst
case, each one node will contain one mount - and then you have to backfill the
ancestry, though for lower memory costs.

Systemd makes life more interesting since it sets up a whole load of
propagations.  Each mount you make may cause several others to be created, but
that would likely make the tree more efficient.

David
Matthew Wilcox (Oracle) Aug. 5, 2020, 7:33 p.m. UTC | #6
On Wed, Aug 05, 2020 at 04:30:10PM +0100, David Howells wrote:
> Miklos Szeredi <miklos@szeredi.hu> wrote:
> 
> > idr_alloc_cyclic() seems to be a good template for doing the lower
> > 32bit allocation, and we can add code to increment the high 32bit on
> > wraparound.
> > 
> > Lots of code uses idr_alloc_cyclic() so I guess it shouldn't be too
> > bad in terms of memory use or performance.
> 
> It's optimised for shortness of path and trades memory for performance.  It's
> currently implemented using an xarray, so memory usage is dependent on the
> sparseness of the tree.  Each node in the tree is 576 bytes and in the worst
> case, each one node will contain one mount - and then you have to backfill the
> ancestry, though for lower memory costs.
> 
> Systemd makes life more interesting since it sets up a whole load of
> propagations.  Each mount you make may cause several others to be created, but
> that would likely make the tree more efficient.

I would recommend using xa_alloc and ignoring the ID assigned from
xa_alloc.  Looking up by unique ID is then a matter of iterating every
mount (xa_for_each()) looking for a matching unique ID in the mount
struct.  That's O(n) search, but it's faster than a linked list, and we
don't have that many mounts in a system.

The maple tree will handle this case more effectively, but I can't
recommend waiting for that to be ready.
Ian Kent Aug. 6, 2020, 5:43 a.m. UTC | #7
On Wed, 2020-08-05 at 20:33 +0100, Matthew Wilcox wrote:
> On Wed, Aug 05, 2020 at 04:30:10PM +0100, David Howells wrote:
> > Miklos Szeredi <miklos@szeredi.hu> wrote:
> > 
> > > idr_alloc_cyclic() seems to be a good template for doing the
> > > lower
> > > 32bit allocation, and we can add code to increment the high 32bit
> > > on
> > > wraparound.
> > > 
> > > Lots of code uses idr_alloc_cyclic() so I guess it shouldn't be
> > > too
> > > bad in terms of memory use or performance.
> > 
> > It's optimised for shortness of path and trades memory for
> > performance.  It's
> > currently implemented using an xarray, so memory usage is dependent
> > on the
> > sparseness of the tree.  Each node in the tree is 576 bytes and in
> > the worst
> > case, each one node will contain one mount - and then you have to
> > backfill the
> > ancestry, though for lower memory costs.
> > 
> > Systemd makes life more interesting since it sets up a whole load
> > of
> > propagations.  Each mount you make may cause several others to be
> > created, but
> > that would likely make the tree more efficient.
> 
> I would recommend using xa_alloc and ignoring the ID assigned from
> xa_alloc.  Looking up by unique ID is then a matter of iterating
> every
> mount (xa_for_each()) looking for a matching unique ID in the mount
> struct.  That's O(n) search, but it's faster than a linked list, and
> we
> don't have that many mounts in a system.

How many is not many, 5000, 10000, I agree that 30000 plus is fairly
rare, even for the autofs direct mount case I hope the implementation
here will help to fix.

Ian
diff mbox series

Patch

diff --git a/fs/mount.h b/fs/mount.h
index 85456a5f5a3a..1037781be055 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -79,6 +79,9 @@  struct mount {
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	struct hlist_head mnt_pins;
 	struct hlist_head mnt_stuck_children;
+#ifdef CONFIG_FSINFO
+	u64	mnt_unique_id;		/* ID unique over lifetime of kernel */
+#endif
 #ifdef CONFIG_MOUNT_NOTIFICATIONS
 	struct watch_list *mnt_watchers; /* Watches on dentries within this mount */
 #endif
diff --git a/fs/mount_notify.c b/fs/mount_notify.c
index 44f570e4cebe..d8ba66ed5f77 100644
--- a/fs/mount_notify.c
+++ b/fs/mount_notify.c
@@ -90,7 +90,7 @@  void notify_mount(struct mount *trigger,
 	n.watch.type	= WATCH_TYPE_MOUNT_NOTIFY;
 	n.watch.subtype	= subtype;
 	n.watch.info	= info_flags | watch_sizeof(n);
-	n.triggered_on	= trigger->mnt_id;
+	n.triggered_on	= trigger->mnt_unique_id;
 
 	switch (subtype) {
 	case NOTIFY_MOUNT_EXPIRY:
@@ -102,7 +102,7 @@  void notify_mount(struct mount *trigger,
 	case NOTIFY_MOUNT_UNMOUNT:
 	case NOTIFY_MOUNT_MOVE_FROM:
 	case NOTIFY_MOUNT_MOVE_TO:
-		n.auxiliary_mount	= aux->mnt_id;
+		n.auxiliary_mount = aux->mnt_unique_id;
 		break;
 
 	default:
diff --git a/fs/namespace.c b/fs/namespace.c
index b2b9920ffd3c..1db8a64cd76f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -115,6 +115,9 @@  static int mnt_alloc_id(struct mount *mnt)
 	if (res < 0)
 		return res;
 	mnt->mnt_id = res;
+#ifdef CONFIG_FSINFO
+	mnt->mnt_unique_id = atomic64_inc_return(&vfs_unique_counter);
+#endif
 	return 0;
 }