diff mbox series

[13/17] watch_queue: Implement mount topology and attribute change notifications [ver #5]

Message ID 158454391302.2863966.1884682840541676280.stgit@warthog.procyon.org.uk (mailing list archive)
State New
Headers show
Series pipe: Keyrings, mount and superblock notifications [ver #5] | expand

Commit Message

David Howells March 18, 2020, 3:05 p.m. UTC
Add a mount notification facility whereby notifications about changes in
mount topology and configuration can be received.  Note that this only
covers vfsmount topology changes and not superblock events.  A separate
facility will be added for that.

Every mount is given a change counter that counts the number of topological
rearrangements in which it is involved and the number of attribute changes
it undergoes.  This allows notification loss to be dealt with.  Later
patches will provide a way to quickly retrieve this value, along with
information about topology and parameters for the superblock.

Firstly, a watch queue needs to be created:

	pipe2(fds, O_NOTIFICATION_PIPE);
	ioctl(fds[1], IOC_WATCH_QUEUE_SET_SIZE, 256);

then a notification can be set up to report notifications via that queue:

	struct watch_notification_filter filter = {
		.nr_filters = 1,
		.filters = {
			[0] = {
				.type = WATCH_TYPE_MOUNT_NOTIFY,
				.subtype_filter[0] = UINT_MAX,
			},
		},
	};
	ioctl(fds[1], IOC_WATCH_QUEUE_SET_FILTER, &filter);
	watch_mount(AT_FDCWD, "/", 0, fds[1], 0x02);

In this case, it would let me monitor the mount topology subtree rooted at
"/" for events.  Mount notifications propagate up the tree towards the
root, so a watch will catch all of the events happening in the subtree
rooted at the watch.

After setting the watch, records will be placed into the queue when, for
example, a mount switches between read-write and read-only.  Records
are of the following format:

	struct mount_notification {
		struct watch_notification watch;
		__u32	triggered_on;
		__u32	auxiliary_mount;
		__u32	topology_changes;
		__u32	attr_changes;
		__u32	aux_topology_changes;
	} *n;

Where:

	n->watch.type will be WATCH_TYPE_MOUNT_NOTIFY.

	n->watch.subtype will indicate the type of event, such as
	NOTIFY_MOUNT_NEW_MOUNT.

	n->watch.info & WATCH_INFO_LENGTH will indicate the length of the
	record.

	n->watch.info & WATCH_INFO_ID will be the fifth argument to
	watch_mount(), shifted.

	n->watch.info & NOTIFY_MOUNT_IN_SUBTREE if true indicates that the
	notification was generated in the mount subtree rooted at the watch,
	and not actually in the watch itself.

	n->watch.info & NOTIFY_MOUNT_IS_RECURSIVE if true indicates that
	the notification was generated by an event (eg. SETATTR) that was
	applied recursively.  The notification is only generated for the
	object that initially triggered it.

	n->watch.info & NOTIFY_MOUNT_IS_NOW_RO will be used for
	NOTIFY_MOUNT_READONLY, being set if the mount becomes R/O, and
	being cleared otherwise, and for NOTIFY_MOUNT_NEW_MOUNT, being set
	if the new mount is readonly.

	n->watch.info & NOTIFY_MOUNT_IS_SUBMOUNT if true indicates that the
	NOTIFY_MOUNT_NEW_MOUNT notification is in response to a mount
	performed by the kernel (e.g. an automount).

	n->triggered_on indicates the ID of the mount to which the change
	was accounted (e.g. the new parent of a new mount).

	n->auxiliary_mount indicates the ID of an additional mount that was
	affected (e.g. a new mount itself) or 0.

	n->topology_changes provides the value of the topology change
	counter of the triggered-on mount at the conclusion of the
	operation.

	n->attr_changes provides the value of the attribute change counter
	of the triggered-on mount at the conclusion of the operation.

	n->aux_topology_changes provides the value of the topology change
	counter of the auxiliary mount at the conclusion of the operation.

Note that it is permissible for event records to be of variable length -
or, at least, the length may be dependent on the subtype.  Note also that
the queue can be shared between multiple notifications of various types.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 Documentation/watch_queue.rst               |   12 +
 arch/alpha/kernel/syscalls/syscall.tbl      |    1 
 arch/arm/tools/syscall.tbl                  |    1 
 arch/arm64/include/asm/unistd.h             |    2 
 arch/arm64/include/asm/unistd32.h           |    2 
 arch/ia64/kernel/syscalls/syscall.tbl       |    1 
 arch/m68k/kernel/syscalls/syscall.tbl       |    1 
 arch/microblaze/kernel/syscalls/syscall.tbl |    1 
 arch/mips/kernel/syscalls/syscall_n32.tbl   |    1 
 arch/mips/kernel/syscalls/syscall_n64.tbl   |    1 
 arch/mips/kernel/syscalls/syscall_o32.tbl   |    1 
 arch/parisc/kernel/syscalls/syscall.tbl     |    1 
 arch/powerpc/kernel/syscalls/syscall.tbl    |    1 
 arch/s390/kernel/syscalls/syscall.tbl       |    1 
 arch/sh/kernel/syscalls/syscall.tbl         |    1 
 arch/sparc/kernel/syscalls/syscall.tbl      |    1 
 arch/x86/entry/syscalls/syscall_32.tbl      |    1 
 arch/x86/entry/syscalls/syscall_64.tbl      |    1 
 arch/xtensa/kernel/syscalls/syscall.tbl     |    1 
 fs/Kconfig                                  |    9 +
 fs/Makefile                                 |    1 
 fs/mount.h                                  |   21 ++
 fs/mount_notify.c                           |  228 +++++++++++++++++++++++++++
 fs/namespace.c                              |   22 +++
 include/linux/dcache.h                      |    1 
 include/linux/syscalls.h                    |    2 
 include/uapi/asm-generic/unistd.h           |    4 
 include/uapi/linux/watch_queue.h            |   36 ++++
 kernel/sys_ni.c                             |    3 
 29 files changed, 355 insertions(+), 4 deletions(-)
 create mode 100644 fs/mount_notify.c

Comments

Miklos Szeredi April 2, 2020, 3:19 p.m. UTC | #1
On Wed, Mar 18, 2020 at 4:05 PM David Howells <dhowells@redhat.com> wrote:
>
> Add a mount notification facility whereby notifications about changes in
> mount topology and configuration can be received.  Note that this only
> covers vfsmount topology changes and not superblock events.  A separate
> facility will be added for that.
>
> Every mount is given a change counter than counts the number of topological
> rearrangements in which it is involved and the number of attribute changes
> it undergoes.  This allows notification loss to be dealt with.

Isn't queue overrun signalled anyway?

If an event is lost, there's no way to know which object was affected,
so how does the counter help here?

>  Later
> patches will provide a way to quickly retrieve this value, along with
> information about topology and parameters for the superblock.

So?  If we receive a notification for MNT1 with change counter CTR1
and then receive the info for MNT1 with CTR2, then we know that we
either missed a notification or we raced and will receive the
notification later.  This helps with not having to redo the query when
we receive the notification with CTR2, but this is just an
optimization, not really useful.

> Firstly, a watch queue needs to be created:
>
>         pipe2(fds, O_NOTIFICATION_PIPE);
>         ioctl(fds[1], IOC_WATCH_QUEUE_SET_SIZE, 256);
>
> then a notification can be set up to report notifications via that queue:
>
>         struct watch_notification_filter filter = {
>                 .nr_filters = 1,
>                 .filters = {
>                         [0] = {
>                                 .type = WATCH_TYPE_MOUNT_NOTIFY,
>                                 .subtype_filter[0] = UINT_MAX,
>                         },
>                 },
>         };
>         ioctl(fds[1], IOC_WATCH_QUEUE_SET_FILTER, &filter);
>         watch_mount(AT_FDCWD, "/", 0, fds[1], 0x02);
>
> In this case, it would let me monitor the mount topology subtree rooted at
> "/" for events.  Mount notifications propagate up the tree towards the
> root, so a watch will catch all of the events happening in the subtree
> rooted at the watch.

Does it make sense to watch a single mount?  A set of mounts?   A
subtree with an exclusion list (subtrees, types, ???)?

Not asking for these to be implemented initially, just questioning
whether the API is flexible enough to allow these cases to be
implemented later if needed.

>
> After setting the watch, records will be placed into the queue when, for
> example, as superblock switches between read-write and read-only.  Records
> are of the following format:
>
>         struct mount_notification {
>                 struct watch_notification watch;
>                 __u32   triggered_on;
>                 __u32   auxiliary_mount;

What guarantees that mount_id is going to remain a 32bit entity?

>                 __u32   topology_changes;
>                 __u32   attr_changes;
>                 __u32   aux_topology_changes;

Being 32bit this introduces wraparound effects.  Is that really worth it?

>         } *n;
>
> Where:
>
>         n->watch.type will be WATCH_TYPE_MOUNT_NOTIFY.
>
>         n->watch.subtype will indicate the type of event, such as
>         NOTIFY_MOUNT_NEW_MOUNT.
>
>         n->watch.info & WATCH_INFO_LENGTH will indicate the length of the
>         record.

Hmm, size of record limited to 112bytes?  Is this verified somewhere?
Don't see a BUILD_BUG_ON() in watch_sizeof().

>
>         n->watch.info & WATCH_INFO_ID will be the fifth argument to
>         watch_mount(), shifted.
>
>         n->watch.info & NOTIFY_MOUNT_IN_SUBTREE if true indicates that the
>         notifcation was generated in the mount subtree rooted at the watch,

notification

>         and not actually in the watch itself.
>
>         n->watch.info & NOTIFY_MOUNT_IS_RECURSIVE if true indicates that
>         the notifcation was generated by an event (eg. SETATTR) that was
>         applied recursively.  The notification is only generated for the
>         object that initially triggered it.

Unused in this patchset.  Please don't add things to the API which are not used.

>
>         n->watch.info & NOTIFY_MOUNT_IS_NOW_RO will be used for
>         NOTIFY_MOUNT_READONLY, being set if the superblock becomes R/O, and
>         being cleared otherwise,

Does this refer to mount r/o flag or superblock r/o flag?  Confused.

> and for NOTIFY_MOUNT_NEW_MOUNT, being set
>         if the new mount is a submount (e.g. an automount).

Huh?  What has r/o flag do with being a submount?

>
>         n->watch.info & NOTIFY_MOUNT_IS_SUBMOUNT if true indicates that the
>         NOTIFY_MOUNT_NEW_MOUNT notification is in response to a mount
>         performed by the kernel (e.g. an automount).
>
>         n->triggered_on indicates the ID of the mount to which the change
>         was accounted (e.g. the new parent of a new mount).

For move there are two parents that are affected.  This doesn't look
sufficient to reflect that.

>
>         n->axiliary_mount indicates the ID of an additional mount that was
>         affected (e.g. a new mount itself) or 0.
>
>         n->topology_changes provides the value of the topology change
>         counter of the triggered-on mount at the conclusion of the
>         operarion.

operation

>
>         n->attr_changes provides the value of the attribute change counter
>         of the triggered-on mount at the conclusion of the operarion.

operation

>
>         n->aux_topology_changes provides the value of the topology change
>         counter of the auxiliary mount at the conclusion of the operation.
>
> Note that it is permissible for event records to be of variable length -
> or, at least, the length may be dependent on the subtype.  Note also that
> the queue can be shared between multiple notifications of various types.

Will review code later...

Thanks,
Miklos
Ian Kent June 14, 2020, 3:07 a.m. UTC | #2
On Thu, 2020-04-02 at 17:19 +0200, Miklos Szeredi wrote:
> 
> > Firstly, a watch queue needs to be created:
> > 
> >         pipe2(fds, O_NOTIFICATION_PIPE);
> >         ioctl(fds[1], IOC_WATCH_QUEUE_SET_SIZE, 256);
> > 
> > then a notification can be set up to report notifications via that
> > queue:
> > 
> >         struct watch_notification_filter filter = {
> >                 .nr_filters = 1,
> >                 .filters = {
> >                         [0] = {
> >                                 .type = WATCH_TYPE_MOUNT_NOTIFY,
> >                                 .subtype_filter[0] = UINT_MAX,
> >                         },
> >                 },
> >         };
> >         ioctl(fds[1], IOC_WATCH_QUEUE_SET_FILTER, &filter);
> >         watch_mount(AT_FDCWD, "/", 0, fds[1], 0x02);
> > 
> > In this case, it would let me monitor the mount topology subtree
> > rooted at
> > "/" for events.  Mount notifications propagate up the tree towards
> > the
> > root, so a watch will catch all of the events happening in the
> > subtree
> > rooted at the watch.
> 
> Does it make sense to watch a single mount?  A set of mounts?   A
> subtree with an exclusion list (subtrees, types, ???)?

Yes, filtering, perhaps, I'm not sure a single mount is useful
as changes generally need to be monitored for a set of mounts.

Monitoring a subtree is obviously possible because the monitor
path doesn't need to be "/".

Or am I misunderstanding what you're trying to get at?

The notion of filtering types and other things is interesting
but what I've seen that doesn't fit in the current implementation
so far probably isn't appropriate for kernel implementation.

There's a special case of acquiring a list of mounts where the
path is not a mount point itself but you need all mounts below
that path prefix.

In this case you get all mounts, including the mounts of the mount
containing the path, so you still need to traverse the list to match
the prefix and that can easily mean the whole list of mounts in the
system.

Point is it leads to multiple traversals of a larger than needed list
of mounts, one to get the list of mounts to check, and one to filter
on the prefix.

I've seen this use case with fsinfo() and that's where it's needed
although it may be useful to carry it through to notifications as
well.

While this sounds like it isn't such a big deal it can sometimes
make a considerable difference to the number of mounts you need
to traverse when there are a large number of mounts in the system.

I didn't consider it appropriate for kernel implementation but
since you asked here it is. OTOH we're checking for connectedness
in fsinfo() anyway so maybe this is something that could be done
without undue overhead.

But that's all I've seen so far.

Ian
Miklos Szeredi June 15, 2020, 8:44 a.m. UTC | #3
On Sun, Jun 14, 2020 at 5:07 AM Ian Kent <raven@themaw.net> wrote:
>
> On Thu, 2020-04-02 at 17:19 +0200, Miklos Szeredi wrote:
> >
> > > Firstly, a watch queue needs to be created:
> > >
> > >         pipe2(fds, O_NOTIFICATION_PIPE);
> > >         ioctl(fds[1], IOC_WATCH_QUEUE_SET_SIZE, 256);
> > >
> > > then a notification can be set up to report notifications via that
> > > queue:
> > >
> > >         struct watch_notification_filter filter = {
> > >                 .nr_filters = 1,
> > >                 .filters = {
> > >                         [0] = {
> > >                                 .type = WATCH_TYPE_MOUNT_NOTIFY,
> > >                                 .subtype_filter[0] = UINT_MAX,
> > >                         },
> > >                 },
> > >         };
> > >         ioctl(fds[1], IOC_WATCH_QUEUE_SET_FILTER, &filter);
> > >         watch_mount(AT_FDCWD, "/", 0, fds[1], 0x02);
> > >
> > > In this case, it would let me monitor the mount topology subtree
> > > rooted at
> > > "/" for events.  Mount notifications propagate up the tree towards
> > > the
> > > root, so a watch will catch all of the events happening in the
> > > subtree
> > > rooted at the watch.
> >
> > Does it make sense to watch a single mount?  A set of mounts?   A
> > subtree with an exclusion list (subtrees, types, ???)?
>
> Yes, filtering, perhaps, I'm not sure a single mount is useful
> as changes generally need to be monitored for a set of mounts.
>
> Monitoring a subtree is obviously possible because the monitor
> path doesn't need to be "/".
>
> Or am I misunderstanding what your trying to get at.
>
> The notion of filtering types and other things is interesting
> but what I've seen that doesn't fit in the current implementation
> so far probably isn't appropriate for kernel implementation.
>
> There's a special case of acquiring a list of mounts where the
> path is not a mount point itself but you need all mount below
> that path prefix.
>
> In this case you get all mounts, including the mounts of the mount
> containing the path, so you still need to traverse the list to match
> the prefix and that can easily mean the whole list of mounts in the
> system.
>
> Point is it leads to multiple traversals of a larger than needed list
> of mounts, one to get the list of mounts to check, and one to filter
> on the prefix.
>
> I've seen this use case with fsinfo() and that's where it's needed
> although it may be useful to carry it through to notifications as
> well.
>
> While this sounds like it isn't such a big deal it can sometimes
> make a considerable difference to the number of mounts you need
> to traverse when there are a large number of mounts in the system.
>
> I didn't consider it appropriate for kernel implementation but
> since you asked here it is. OTOH were checking for connectedness
> in fsinfo() anyway so maybe this is something that could be done
> without undue overhead.

Good point.  Filtering notifications for mounts outside of the
specified path makes sense.

Thanks,
Miklos
David Howells July 23, 2020, 10:48 a.m. UTC | #4
Miklos Szeredi <miklos@szeredi.hu> wrote:

> On Wed, Mar 18, 2020 at 4:05 PM David Howells <dhowells@redhat.com> wrote:
> >
> > Add a mount notification facility whereby notifications about changes in
> > mount topology and configuration can be received.  Note that this only
> > covers vfsmount topology changes and not superblock events.  A separate
> > facility will be added for that.
> >
> > Every mount is given a change counter than counts the number of topological
> > rearrangements in which it is involved and the number of attribute changes
> > it undergoes.  This allows notification loss to be dealt with.
> 
> Isn't queue overrun signalled anyway?
> 
> If an event is lost, there's no way to know which object was affected,
> so how does the counter help here?

An event may up the counter multiple times.  For example, imagine that you
do the following:

	mkdir /foo
	mount -t tmpfs none /foo
	mkdir /foo/b
	chroot /foo/b
	watch_mount("/")

now someone else comes along and does:

	mkdir /foo/a
	mkdir /foo/b/c
	mount -t tmpfs none /foo/a
	mount -o move /foo/a /foo/b/c

thereby moving a mount from outside your chroot window to inside of it.  The
move will generate two events (move-from and move-to), but you'll only get to
see one of them.  The usage on the mount at /foo, however, will be bumped by
2, not 1.

Also, if someone instead does this:

	mkdir /foo/a/d
	mkdir /foo/a/e
	mount -t tmpfs none /foo/a/d
	mount -o move /foo/a/d /foo/a/e

you won't get any notifications, but the counter still got bumped by 2.
You'll see an unusual bump in it at the next event, but you know you didn't
miss any events that pertain to you and can keep your copy of the counter up
to date... provided there hasn't been an overrun.

If there has been an overrun, you ask fsinfo() for a list of
{mount_id,counter} and then you have to scan anything where the counter has
changed unexpectedly.  It gives you the chance to keep up to date more
readily.

Maybe putting the counter into the notification message isn't really
necessary, but it's cheap to do if the counter is available.

> >  Later
> > patches will provide a way to quickly retrieve this value, along with
> > information about topology and parameters for the superblock.
> 
> So?  If we receive a notification for MNT1 with change counter CTR1
> and then receive the info for MNT1 with CTR2, then we know that we
> either missed a notification or we raced and will receive the
> notification later.  This helps with not having to redo the query when
> we receive the notification with CTR2, but this is just an
> optimization, not really useful.

Are optimisations ever useful?

> > In this case, it would let me monitor the mount topology subtree rooted at
> > "/" for events.  Mount notifications propagate up the tree towards the
> > root, so a watch will catch all of the events happening in the subtree
> > rooted at the watch.
> 
> Does it make sense to watch a single mount?  A set of mounts?   A
> subtree with an exclusion list (subtrees, types, ???)?
> 
> Not asking for these to be implemented initially, just questioning
> whether the API is flexible enough to allow these cases to be
> implemented later if needed.

You can watch a single mount or a whole subtree.  I could make it possible to
add exclusions into the filter list.

> >
> > After setting the watch, records will be placed into the queue when, for
> > example, as superblock switches between read-write and read-only.  Records
> > are of the following format:
> >
> >         struct mount_notification {
> >                 struct watch_notification watch;
> >                 __u32   triggered_on;
> >                 __u32   auxiliary_mount;
> 
> What guarantees that mount_id is going to remain a 32bit entity?

You think it likely we'd have >4 billion concurrent mounts on a system?  That
would require >1.2TiB of RAM just for the struct mount allocations.

But I can expand it to __u64.

> >                 __u32   topology_changes;
> >                 __u32   attr_changes;
> >                 __u32   aux_topology_changes;
> 
> Being 32bit this introduces wraparound effects.  Is that really worth it?

You'd have to make 2 billion changes without whoever's monitoring getting a
chance to update their counters.  But maybe it's not worth it putting them
here.  If you'd prefer, I can make the counters all 64-bit and just retrieve
them with fsinfo().

> >         } *n;
> >
> > Where:
> >
> >         n->watch.type will be WATCH_TYPE_MOUNT_NOTIFY.
> >
> >         n->watch.subtype will indicate the type of event, such as
> >         NOTIFY_MOUNT_NEW_MOUNT.
> >
> >         n->watch.info & WATCH_INFO_LENGTH will indicate the length of the
> >         record.
> 
> Hmm, size of record limited to 112bytes?  Is this verified somewhere?
> Don't see a BUILD_BUG_ON() in watch_sizeof().

127 bytes now, including the header.  I can add a BUILD_BUG_ON().

> >         n->watch.info & NOTIFY_MOUNT_IS_RECURSIVE if true indicates that
> >         the notifcation was generated by an event (eg. SETATTR) that was
> >         applied recursively.  The notification is only generated for the
> >         object that initially triggered it.
> 
> Unused in this patchset.  Please don't add things to the API which are not
> used.

Christian Brauner has patches for mount_setattr() that will need to use this.

> >         n->watch.info & NOTIFY_MOUNT_IS_NOW_RO will be used for
> >         NOTIFY_MOUNT_READONLY, being set if the superblock becomes R/O, and
> >         being cleared otherwise,
> 
> Does this refer to mount r/o flag or superblock r/o flag?  Confused.

Sorry, that should be "mount".

> > and for NOTIFY_MOUNT_NEW_MOUNT, being set
> >         if the new mount is a submount (e.g. an automount).
> 
> Huh?  What has r/o flag do with being a submount?

That should read "if the new mount is readonly".

> >         n->watch.info & NOTIFY_MOUNT_IS_SUBMOUNT if true indicates that the
> >         NOTIFY_MOUNT_NEW_MOUNT notification is in response to a mount
> >         performed by the kernel (e.g. an automount).
> >
> >         n->triggered_on indicates the ID of the mount to which the change
> >         was accounted (e.g. the new parent of a new mount).
> 
> For move there are two parents that are affected.  This doesn't look
> sufficient to reflect that.

You get up to two messages in that case:

	NOTIFY_MOUNT_MOVE_FROM	= 5, /* Mount moved from here */
	NOTIFY_MOUNT_MOVE_TO	= 6, /* Mount moved to here (compare op_id) */

but either message may get filtered because the event occurred outside of your
watched tree.

David
David Howells July 24, 2020, 10:19 a.m. UTC | #5
David Howells <dhowells@redhat.com> wrote:

> > What guarantees that mount_id is going to remain a 32bit entity?
> 
> You think it likely we'd have >4 billion concurrent mounts on a system?  That
> would require >1.2TiB of RAM just for the struct mount allocations.
> 
> But I can expand it to __u64.

That said, sys_name_to_handle_at() assumes it's a 32-bit signed integer, so
we're currently limited to ~2 billion concurrent mounts:-/

David
Ian Kent July 24, 2020, 10:44 a.m. UTC | #6
On Fri, 2020-07-24 at 11:19 +0100, David Howells wrote:
> David Howells <dhowells@redhat.com> wrote:
> 
> > > What guarantees that mount_id is going to remain a 32bit entity?
> > 
> > You think it likely we'd have >4 billion concurrent mounts on a
> > system?  That
> > would require >1.2TiB of RAM just for the struct mount allocations.
> > 
> > But I can expand it to __u64.
> 
> That said, sys_name_to_handle_at() assumes it's a 32-bit signed
> integer, so
> we're currently limited to ~2 billion concurrent mounts:-/

I was wondering about id re-use.

Assuming that ids that are returned to the idr db are re-used
what would be the chance that a recently used id would end up
being used?

Would that chance increase as ids are consumed and freed over
time?

Yeah, it's one of those questions ... ;)

Ian
David Howells July 24, 2020, 11:36 a.m. UTC | #7
Ian Kent <raven@themaw.net> wrote:

> I was wondering about id re-use.
> 
> Assuming that ids that are returned to the idr db are re-used
> what would the chance that a recently used id would end up
> being used?
> 
> Would that chance increase as ids are consumed and freed over
> time?

I've added something to deal with that in the fsinfo branch.  I've given each
mount object and superblock a supplementary 64-bit unique ID that's not likely
to repeat before we're no longer around to have to worry about it.

fsinfo() then allows you to retrieve them by path or by mount ID.

So, yes, mnt_id and s_dev are not unique and may be reused very quickly, but
I'm also providing uniquifiers that you can check.

David
Miklos Szeredi Aug. 3, 2020, 9:29 a.m. UTC | #8
On Thu, Jul 23, 2020 at 12:48 PM David Howells <dhowells@redhat.com> wrote:

>
> > >                 __u32   topology_changes;
> > >                 __u32   attr_changes;
> > >                 __u32   aux_topology_changes;
> >
> > Being 32bit this introduces wraparound effects.  Is that really worth it?
>
> You'd have to make 2 billion changes without whoever's monitoring getting a
> chance to update their counters.  But maybe it's not worth it putting them
> here.  If you'd prefer, I can make the counters all 64-bit and just retrieve
> them with fsinfo().

Yes, I think that would be preferable.

> > >         n->watch.info & NOTIFY_MOUNT_IS_RECURSIVE if true indicates that
> > >         the notifcation was generated by an event (eg. SETATTR) that was
> > >         applied recursively.  The notification is only generated for the
> > >         object that initially triggered it.
> >
> > Unused in this patchset.  Please don't add things to the API which are not
> > used.
>
> Christian Brauner has patches for mount_setattr() that will need to use this.

Fine, then that patch can add the flag.

Thanks,
Miklos
Miklos Szeredi Aug. 3, 2020, 10:02 a.m. UTC | #9
On Fri, Jul 24, 2020 at 1:36 PM David Howells <dhowells@redhat.com> wrote:
>
> Ian Kent <raven@themaw.net> wrote:
>
> > I was wondering about id re-use.
> >
> > Assuming that ids that are returned to the idr db are re-used
> > what would the chance that a recently used id would end up
> > being used?
> >
> > Would that chance increase as ids are consumed and freed over
> > time?
>
> I've added something to deal with that in the fsinfo branch.  I've given each
> mount object and superblock a supplementary 64-bit unique ID that's not likely
> to repeat before we're no longer around to have to worry about it.
>
> fsinfo() then allows you to retrieve them by path or by mount ID.

Shouldn't the notification interface provide the unique ID?

Thanks,
Miklos

>
> So, yes, mnt_id and s_dev are not unique and may be reused very quickly, but
> I'm also providing uniquifiers that you can check.
>
> David
>
David Howells Aug. 3, 2020, 10:08 a.m. UTC | #10
Miklos Szeredi <miklos@szeredi.hu> wrote:

> > fsinfo() then allows you to retrieve them by path or by mount ID.
> 
> Shouldn't the notification interface provide the unique ID?

It could make sense - instead of the reusable mnt_id.

David
David Howells Aug. 3, 2020, 10:18 a.m. UTC | #11
Miklos Szeredi <miklos@szeredi.hu> wrote:

> > fsinfo() then allows you to retrieve them by path or by mount ID.
> 
> Shouldn't the notification interface provide the unique ID?

Hmmm...  If I'm going to do that, I have to put the fsinfo-core branch first
otherwise you can't actually retrieve the unique ID - and thus won't be able
to make sense of the notification record.  Such a rearrangement might make
sense anyway since Ian and Karel have been primarily concentrating on fsinfo
and only more recently started adding notification support.

David
Miklos Szeredi Aug. 3, 2020, 11:17 a.m. UTC | #12
On Mon, Aug 3, 2020 at 12:18 PM David Howells <dhowells@redhat.com> wrote:
>
> Miklos Szeredi <miklos@szeredi.hu> wrote:
>
> > > fsinfo() then allows you to retrieve them by path or by mount ID.
> >
> > Shouldn't the notification interface provide the unique ID?
>
> Hmmm...  If I'm going to do that, I have to put the fsinfo-core branch first
> otherwise you can't actually retrieve the unique ID - and thus won't be able
> to make sense of the notification record.  Such a rearrangement might make
> sense anyway since Ian and Karel have been primarily concentrating on fsinfo
> and only more recently started adding notification support.

OTOH mount notification is way smaller and IMO a more mature
interface.  So just picking the unique ID patch into this set might
make sense.

Thanks,
Miklos
David Howells Aug. 3, 2020, 11:49 a.m. UTC | #13
Miklos Szeredi <miklos@szeredi.hu> wrote:

> OTOH mount notification is way smaller and IMO a more mature
> interface.  So just picking the unique ID patch into this set might
> make sense.

But userspace can't retrieve the unique ID without fsinfo() as things stand.

I'm changing it so that the fields are 64-bit, but initialised with the
existing mount ID in the notifications set.  The fsinfo set changes that to a
unique ID.  I'm tempted to make the unique IDs start at UINT_MAX+1 to
disambiguate them.

David
Ian Kent Aug. 3, 2020, 12:01 p.m. UTC | #14
On Mon, 2020-08-03 at 12:49 +0100, David Howells wrote:
> Miklos Szeredi <miklos@szeredi.hu> wrote:
> 
> > OTOH mount notification is way smaller and IMO a more mature
> > interface.  So just picking the unique ID patch into this set might
> > make sense.
> 
> But userspace can't retrieve the unique ID without fsinfo() as things
> stand.
> 
> I'm changing it so that the fields are 64-bit, but initialised with
> the
> existing mount ID in the notifications set.  The fsinfo set changes
> that to a
> unique ID.  I'm tempted to make the unique IDs start at UINT_MAX+1 to
> disambiguate them.

Mmm ... so what would I use as a mount id that's not used, like NULL
for strings?

I'm using -1 now but changing this will mean I need something
different.

Could we set aside a mount id that will never be used so it can be
used for this case?

Maybe mount ids should start at 1 instead of zero ...

Ian
David Howells Aug. 3, 2020, 12:31 p.m. UTC | #15
Ian Kent <raven@themaw.net> wrote:

> > I'm changing it so that the fields are 64-bit, but initialised with the
> > existing mount ID in the notifications set.  The fsinfo set changes that
> > to a unique ID.  I'm tempted to make the unique IDs start at UINT_MAX+1 to
> > disambiguate them.
> 
> Mmm ... so what would I use as a mount id that's not used, like NULL
> for strings?

Zero is skipped, so you could use that.

> I'm using -1 now but changing this will mean I need something
> different.

It's 64-bits, so you're not likely to see it reach -1, even if it does start
at UINT_MAX+1.

David
Ian Kent Aug. 3, 2020, 2:30 p.m. UTC | #16
On Mon, 2020-08-03 at 13:31 +0100, David Howells wrote:
> Ian Kent <raven@themaw.net> wrote:
> 
> > > I'm changing it so that the fields are 64-bit, but initialised
> > > with the
> > > existing mount ID in the notifications set.  The fsinfo set
> > > changes that
> > > to a unique ID.  I'm tempted to make the unique IDs start at
> > > UINT_MAX+1 to
> > > disambiguate them.
> > 
> > Mmm ... so what would I use as a mount id that's not used, like
> > NULL
> > for strings?
> 
> Zero is skipped, so you could use that.
> 
> > I'm using -1 now but changing this will mean I need something
> > different.
> 
> It's 64-bits, so you're not likely to see it reach -1, even if it
> does start
> at UINT_MAX+1.

Ha, either or, I don't think it will be a problem, there's
bound to be a few changes so the components using this will
need to change a bit before it's finalized, shouldn't be a
big deal I think. At least not for me and shouldn't be much
for libmount either I think.

Ian
Ian Kent Aug. 4, 2020, 11:38 a.m. UTC | #17
On Mon, 2020-08-03 at 11:29 +0200, Miklos Szeredi wrote:
> On Thu, Jul 23, 2020 at 12:48 PM David Howells <dhowells@redhat.com>
> wrote:
> 
> > > >                 __u32   topology_changes;
> > > >                 __u32   attr_changes;
> > > >                 __u32   aux_topology_changes;
> > > 
> > > Being 32bit this introduces wraparound effects.  Is that really
> > > worth it?
> > 
> > You'd have to make 2 billion changes without whoever's monitoring
> > getting a
> > chance to update their counters.  But maybe it's not worth it
> > putting them
> > here.  If you'd prefer, I can make the counters all 64-bit and just
> > retrieve
> > them with fsinfo().
> 
> Yes, I think that would be preferable.

I think this is the source of the recommendation for removing the
change counters from the notification message, correct?

While it looks like I may not need those counters for systemd message
buffer overflow handling myself I think removing them from the
notification message isn't a sensible thing to do.

If you need to detect missing messages, perhaps due to message buffer
overflow, then you need change counters that are relevant to the
notification message itself. That's so the next time you get a message
for that object you can be sure that the change counter comparisons you
make relate to object notifications you have processed.

Yes, I know it isn't quite that simple, but tallying up what you have
processed in the current batch of messages (or in multiple batches of
messages if more than one read has been possible) to perform the check
is a user space responsibility. And it simply can't be done if the
counters' consistency is in question, which it would be if you need to
perform another system call to get them.

It's way more useful to have these in the notification than obtainable
via fsinfo() IMHO.

> 
> > > >         n->watch.info & NOTIFY_MOUNT_IS_RECURSIVE if true
> > > > indicates that
> > > >         the notifcation was generated by an event (eg. SETATTR)
> > > > that was
> > > >         applied recursively.  The notification is only
> > > > generated for the
> > > >         object that initially triggered it.
> > > 
> > > Unused in this patchset.  Please don't add things to the API
> > > which are not
> > > used.
> > 
> > Christian Brauner has patches for mount_setattr() that will need to
> > use this.
> 
> Fine, then that patch can add the flag.
> 
> Thanks,
> Miklos
Miklos Szeredi Aug. 4, 2020, 1:19 p.m. UTC | #18
On Tue, Aug 4, 2020 at 1:39 PM Ian Kent <raven@themaw.net> wrote:
>
> On Mon, 2020-08-03 at 11:29 +0200, Miklos Szeredi wrote:
> > On Thu, Jul 23, 2020 at 12:48 PM David Howells <dhowells@redhat.com>
> > wrote:
> >
> > > > >                 __u32   topology_changes;
> > > > >                 __u32   attr_changes;
> > > > >                 __u32   aux_topology_changes;
> > > >
> > > > Being 32bit this introduces wraparound effects.  Is that really
> > > > worth it?
> > >
> > > You'd have to make 2 billion changes without whoever's monitoring
> > > getting a
> > > chance to update their counters.  But maybe it's not worth it
> > > putting them
> > > here.  If you'd prefer, I can make the counters all 64-bit and just
> > > retrieve
> > > them with fsinfo().
> >
> > Yes, I think that would be preferable.
>
> I think this is the source of the recommendation for removing the
> change counters from the notification message, correct?
>
> While it looks like I may not need those counters for systemd message
> buffer overflow handling myself I think removing them from the
> notification message isn't a sensible thing to do.
>
> If you need to detect missing messages, perhaps due to message buffer
> overflow, then you need change counters that are relevant to the
> notification message itself. That's so the next time you get a message
> for that object you can be sure that change counter comparisons you
> you make relate to object notifications you have processed.

I don't quite get it.  Change notification is just that: a
notification.   You need to know what object that notification relates
to, to be able to retrieve the up to date attributes of said object.

What happens if you get a change counter N in the notification
message, then get a change counter N + 1 in the attribute retrieval?
You know that another change happened, and you haven't processed
the notification yet.  So when the notification with N + 1 comes in,
you can optimize away the attribute retrieval.

Nice optimization, but it's optimizing a race condition, and I don't
think that's warranted.  I don't see any other use for the change
counter in the notification message.


> Yes, I know it isn't quite that simple, but tallying up what you have
> processed in the current batch of messages (or in multiple batches of
> messages if more than one read has been possible) to perform the check
> is a user space responsibility. And it simply can't be done if the
> counters consistency is in question which it would be if you need to
> perform another system call to get it.
>
> It's way more useful to have these in the notification than obtainable
> via fsinfo() IMHO.

What is it useful for?

If the notification itself would contain the list of updated
attributes and their new values, then yes, this would make sense.  If
the notification just tells us that the object was modified, but not
the modifications themselves, then I don't see how the change counter
in itself could add any information (other than optimizing the race
condition above).

Thanks,
Miklos

Thanks,



>
> >
> > > > >         n->watch.info & NOTIFY_MOUNT_IS_RECURSIVE if true
> > > > > indicates that
> > > > >         the notifcation was generated by an event (eg. SETATTR)
> > > > > that was
> > > > >         applied recursively.  The notification is only
> > > > > generated for the
> > > > >         object that initially triggered it.
> > > >
> > > > Unused in this patchset.  Please don't add things to the API
> > > > which are not
> > > > used.
> > >
> > > Christian Brauner has patches for mount_setattr() that will need to
> > > use this.
> >
> > Fine, then that patch can add the flag.
> >
> > Thanks,
> > Miklos
>
Ian Kent Aug. 5, 2020, 1:53 a.m. UTC | #19
On Tue, 2020-08-04 at 15:19 +0200, Miklos Szeredi wrote:
> On Tue, Aug 4, 2020 at 1:39 PM Ian Kent <raven@themaw.net> wrote:
> > On Mon, 2020-08-03 at 11:29 +0200, Miklos Szeredi wrote:
> > > On Thu, Jul 23, 2020 at 12:48 PM David Howells <
> > > dhowells@redhat.com>
> > > wrote:
> > > 
> > > > > >                 __u32   topology_changes;
> > > > > >                 __u32   attr_changes;
> > > > > >                 __u32   aux_topology_changes;
> > > > > 
> > > > > Being 32bit this introduces wraparound effects.  Is that
> > > > > really
> > > > > worth it?
> > > > 
> > > > You'd have to make 2 billion changes without whoever's
> > > > monitoring
> > > > getting a
> > > > chance to update their counters.  But maybe it's not worth it
> > > > putting them
> > > > here.  If you'd prefer, I can make the counters all 64-bit and
> > > > just
> > > > retrieve
> > > > them with fsinfo().
> > > 
> > > Yes, I think that would be preferable.
> > 
> > I think this is the source of the recommendation for removing the
> > change counters from the notification message, correct?
> > 
> > While it looks like I may not need those counters for systemd
> > message
> > buffer overflow handling myself I think removing them from the
> > notification message isn't a sensible thing to do.
> > 
> > If you need to detect missing messages, perhaps due to message
> > buffer
> > overflow, then you need change counters that are relevant to the
> > notification message itself. That's so the next time you get a
> > message
> > for that object you can be sure that change counter comparisons you
> > you make relate to object notifications you have processed.
> 
> I don't quite get it.  Change notification is just that: a
> notification.   You need to know what object that notification
> relates
> to, to be able to retrieve the up to date attributes of said object.
> 
> What happens if you get a change counter N in the notification
> message, then get a change counter N + 1 in the attribute retrieval?
> You know that another change happened, and you haven't yet processed
> the notification yet.  So when the notification with N + 1 comes in,
> you can optimize away the attribute retrieve.
> 
> Nice optimization, but it's optimizing a race condition, and I don't
> think that's warranted.  I don't see any other use for the change
> counter in the notification message.
> 
> 
> > Yes, I know it isn't quite that simple, but tallying up what you
> > have
> > processed in the current batch of messages (or in multiple batches
> > of
> > messages if more than one read has been possible) to perform the
> > check
> > is a user space responsibility. And it simply can't be done if the
> > counters consistency is in question which it would be if you need
> > to
> > perform another system call to get it.
> > 
> > It's way more useful to have these in the notification than
> > obtainable
> > via fsinfo() IMHO.
> 
> What is it useful for?

Only to verify that you have seen all the notifications.

If you have to grab that info with a separate call then the count
isn't necessarily consistent because other notifications can occur
while you grab it.

My per-object rant isn't quite right, what's needed is a consistent
way to verify you have seen everything you were supposed to.

I think your point is that if you grab the info in another call and
it doesn't match, you need to refresh, and that's fine.  But I think it's
better to be able to verify, as you go, that you have got everything that
was sent, and so avoid needing the refresh as often.

> 
> If the notification itself would contain the list of updated
> attributes and their new values, then yes, this would make sense.  If
> the notification just tells us that the object was modified, but not
> the modifications themselves, then I don't see how the change counter
> in itself could add any information (other than optimizing the race
> condition above).
> 
> Thanks,
> Miklos
> 
> Thanks,
> 
> 
> 
> > > > > >         n->watch.info & NOTIFY_MOUNT_IS_RECURSIVE if true
> > > > > > indicates that
> > > > > >         the notifcation was generated by an event (eg.
> > > > > > SETATTR)
> > > > > > that was
> > > > > >         applied recursively.  The notification is only
> > > > > > generated for the
> > > > > >         object that initially triggered it.
> > > > > 
> > > > > Unused in this patchset.  Please don't add things to the API
> > > > > which are not
> > > > > used.
> > > > 
> > > > Christian Brauner has patches for mount_setattr() that will
> > > > need to
> > > > use this.
> > > 
> > > Fine, then that patch can add the flag.
> > > 
> > > Thanks,
> > > Miklos
Miklos Szeredi Aug. 5, 2020, 7:43 a.m. UTC | #20
On Wed, Aug 5, 2020 at 3:54 AM Ian Kent <raven@themaw.net> wrote:
>

> > > It's way more useful to have these in the notification than
> > > obtainable
> > > via fsinfo() IMHO.
> >
> > What is it useful for?
>
> Only to verify that you have seen all the notifications.
>
> If you have to grab that info with a separate call then the count
> isn't necessarily consistent because other notifications can occur
> while you grab it.

No, no no.   The watch queue will signal an overflow, without any
additional overhead for the normal case.  If you think of this as a
protocol stack, then the overflow detection happens on the transport
layer, instead of the application layer.  The application layer is
responsible for restoring state in case of a transport layer error,
but detection of that error is not the responsibility of the
application layer.


Thanks,
Miklos
Ian Kent Aug. 5, 2020, 11:36 a.m. UTC | #21
On Wed, 2020-08-05 at 09:43 +0200, Miklos Szeredi wrote:
> On Wed, Aug 5, 2020 at 3:54 AM Ian Kent <raven@themaw.net> wrote:
> > > > It's way more useful to have these in the notification than
> > > > obtainable
> > > > via fsinfo() IMHO.
> > > 
> > > What is it useful for?
> > 
> > Only to verify that you have seen all the notifications.
> > 
> > If you have to grab that info with a separate call then the count
> > isn't necessarily consistent because other notifications can occur
> > while you grab it.
> 
> No, no no.   The watch queue will signal an overflow, without any
> additional overhead for the normal case.  If you think of this as a
> protocol stack, then the overflow detection happens on the transport
> layer, instead of the application layer.  The application layer is
> responsible for restoring state in case of a transport layer error,
> but detection of that error is not the responsibility of the
> application layer.

I can see in the kernel code that an error is returned if the message
buffer is full when trying to add a message, I just can't see where
to get it in the libmount code.

That's not really a communication protocol problem.

Still I need to work out how to detect it, maybe it is seen by
the code in libmount already and I simply can't see what I need
to do to recognise it ...

So I'm stuck wanting to verify I have got everything that was
sent and am having trouble moving on from that.

Ian
Miklos Szeredi Aug. 5, 2020, 11:56 a.m. UTC | #22
On Wed, Aug 5, 2020 at 1:36 PM Ian Kent <raven@themaw.net> wrote:
>

> I can see in the kernel code that an error is returned if the message
> buffer is full when trying to add a message, I just can't see where
> to get it in the libmount code.
>
> That's not really a communication protocol problem.
>
> Still I need to work out how to detect it, maybe it is seen by
> the code in libmount already and I simply can't see what I need
> to do to recognise it ...
>
> So I'm stuck wanting to verify I have got everything that was
> sent and am having trouble moving on from that.

This is the commit that should add the overrun detection capability:

e7d553d69cf6 ("pipe: Add notification lossage handling")

Thanks,
Miklos
diff mbox series

Patch

diff --git a/Documentation/watch_queue.rst b/Documentation/watch_queue.rst
index 849fad6893ef..3e647992be31 100644
--- a/Documentation/watch_queue.rst
+++ b/Documentation/watch_queue.rst
@@ -8,6 +8,7 @@  opened by userspace.  This can be used in conjunction with::
 
   * Key/keyring notifications
 
+  * Mount notifications.
 
 The notifications buffers can be enabled by:
 
@@ -233,6 +234,11 @@  Any particular buffer can be fed from multiple sources.  Sources include:
 
     See Documentation/security/keys/core.rst for more information.
 
+  * WATCH_TYPE_MOUNT_NOTIFY
+
+    Notifications of this type indicate changes to mount attributes and the
+    mount topology within the subtree at the indicated point.
+
 
 Event Filtering
 ===============
@@ -292,9 +298,10 @@  A buffer is created with something like the following::
 	pipe2(fds, O_TMPFILE);
 	ioctl(fds[1], IOC_WATCH_QUEUE_SET_SIZE, 256);
 
-It can then be set to receive keyring change notifications::
+It can then be set to receive notifications::
 
 	keyctl(KEYCTL_WATCH_KEY, KEY_SPEC_SESSION_KEYRING, fds[1], 0x01);
+	watch_mount(AT_FDCWD, "/", 0, fds[1], 0x02);
 
 The notifications can then be consumed by something like the following::
 
@@ -331,6 +338,9 @@  The notifications can then be consumed by something like the following::
 				case WATCH_TYPE_KEY_NOTIFY:
 					saw_key_change(&n.n);
 					break;
+				case WATCH_TYPE_MOUNT_NOTIFY:
+					saw_mount_change(&n.n);
+					break;
 				}
 
 				p += len;
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 36d42da7466a..b869428033ef 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -477,3 +477,4 @@ 
 # 545 reserved for clone3
 547	common	openat2				sys_openat2
 548	common	pidfd_getfd			sys_pidfd_getfd
+549	common	watch_mount			sys_watch_mount
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 4d1cf74a2caa..9c389da9efcc 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -451,3 +451,4 @@ 
 435	common	clone3				sys_clone3
 437	common	openat2				sys_openat2
 438	common	pidfd_getfd			sys_pidfd_getfd
+439	common	watch_mount			sys_watch_mount
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 1dd22da1c3a9..75f04a1023be 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@ 
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		439
+#define __NR_compat_syscalls		440
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h
index c1c61635f89c..774f0339763f 100644
--- a/arch/arm64/include/asm/unistd32.h
+++ b/arch/arm64/include/asm/unistd32.h
@@ -883,6 +883,8 @@  __SYSCALL(__NR_clone3, sys_clone3)
 __SYSCALL(__NR_openat2, sys_openat2)
 #define __NR_pidfd_getfd 438
 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
+#define __NR_watch_mount 439
+__SYSCALL(__NR_watch_mount, sys_watch_mount)
 
 /*
  * Please add new compat syscalls above this comment and update
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index 042911e670b8..6817f865cc71 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -358,3 +358,4 @@ 
 # 435 reserved for clone3
 437	common	openat2				sys_openat2
 438	common	pidfd_getfd			sys_pidfd_getfd
+439	common	watch_mount			sys_watch_mount
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index f4f49fcb76d0..fbf85da75ecb 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -437,3 +437,4 @@ 
 435	common	clone3				__sys_clone3
 437	common	openat2				sys_openat2
 438	common	pidfd_getfd			sys_pidfd_getfd
+439	common	watch_mount			sys_watch_mount
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 4c67b11f9c9e..b05b192da1e2 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -443,3 +443,4 @@ 
 435	common	clone3				sys_clone3
 437	common	openat2				sys_openat2
 438	common	pidfd_getfd			sys_pidfd_getfd
+439	common	watch_mount			sys_watch_mount
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index 1f9e8ad636cc..0f85d2a033f9 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -376,3 +376,4 @@ 
 435	n32	clone3				__sys_clone3
 437	n32	openat2				sys_openat2
 438	n32	pidfd_getfd			sys_pidfd_getfd
+439	n32	watch_mount			sys_watch_mount
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index c0b9d802dbf6..905cf9ac0792 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -352,3 +352,4 @@ 
 435	n64	clone3				__sys_clone3
 437	n64	openat2				sys_openat2
 438	n64	pidfd_getfd			sys_pidfd_getfd
+439	n64	watch_mount			sys_watch_mount
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index ac586774c980..834b26b08d74 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -425,3 +425,4 @@ 
 435	o32	clone3				__sys_clone3
 437	o32	openat2				sys_openat2
 438	o32	pidfd_getfd			sys_pidfd_getfd
+439	o32	watch_mount			sys_watch_mount
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 52a15f5cd130..badd3449db43 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -435,3 +435,4 @@ 
 435	common	clone3				sys_clone3_wrapper
 437	common	openat2				sys_openat2
 438	common	pidfd_getfd			sys_pidfd_getfd
+439	common	watch_mount			sys_watch_mount
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index 35b61bfc1b1a..b404361bc929 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -519,3 +519,4 @@ 
 435	nospu	clone3				ppc_clone3
 437	common	openat2				sys_openat2
 438	common	pidfd_getfd			sys_pidfd_getfd
+439	common	watch_mount			sys_watch_mount
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index bd7bd3581a0f..33071de24511 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -440,3 +440,4 @@ 
 435  common	clone3			sys_clone3			sys_clone3
 437  common	openat2			sys_openat2			sys_openat2
 438  common	pidfd_getfd		sys_pidfd_getfd			sys_pidfd_getfd
+439	common	watch_mount		sys_watch_mount			sys_watch_mount
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index c7a30fcd135f..682c125122f4 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -440,3 +440,4 @@ 
 # 435 reserved for clone3
 437	common	openat2				sys_openat2
 438	common	pidfd_getfd			sys_pidfd_getfd
+439	common	watch_mount			sys_watch_mount
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index f13615ecdecc..febf3cd675e3 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -483,3 +483,4 @@ 
 # 435 reserved for clone3
 437	common	openat2			sys_openat2
 438	common	pidfd_getfd			sys_pidfd_getfd
+439	common	watch_mount			sys_watch_mount
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index c17cb77eb150..085bcc5afdf1 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -442,3 +442,4 @@ 
 435	i386	clone3			sys_clone3			__ia32_sys_clone3
 437	i386	openat2			sys_openat2			__ia32_sys_openat2
 438	i386	pidfd_getfd		sys_pidfd_getfd			__ia32_sys_pidfd_getfd
+439	i386	watch_mount		sys_watch_mount			__ia32_sys_watch_mount
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 44d510bc9b78..9cfb6b2eb319 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -359,6 +359,7 @@ 
 435	common	clone3			__x64_sys_clone3/ptregs
 437	common	openat2			__x64_sys_openat2
 438	common	pidfd_getfd		__x64_sys_pidfd_getfd
+439	common	watch_mount		__x64_sys_watch_mount
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index 85a9ab1bc04d..1a066a43a58b 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -408,3 +408,4 @@ 
 435	common	clone3				sys_clone3
 437	common	openat2				sys_openat2
 438	common	pidfd_getfd			sys_pidfd_getfd
+439	common	watch_mount			sys_watch_mount
diff --git a/fs/Kconfig b/fs/Kconfig
index 708ba336e689..d7039137d538 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -117,6 +117,15 @@  source "fs/verity/Kconfig"
 
 source "fs/notify/Kconfig"
 
+config MOUNT_NOTIFICATIONS
+	bool "Mount topology change notifications"
+	select WATCH_QUEUE
+	help
+	  This option provides support for getting change notifications on the
+	  mount tree topology and mount attributes.  This makes use of a
+	  notification pipe (watch queue) to handle the notification buffer and
+	  provides the watch_mount() system call to add and remove watches.
+
 source "fs/quota/Kconfig"
 
 source "fs/autofs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 505e51166973..4477757780d0 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -22,6 +22,7 @@  obj-y +=	no-block.o
 endif
 
 obj-$(CONFIG_PROC_FS) += proc_namespace.o
+obj-$(CONFIG_MOUNT_NOTIFICATIONS) += mount_notify.o
 
 obj-y				+= notify/
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
diff --git a/fs/mount.h b/fs/mount.h
index 711a4093e475..9a49ea1e7365 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -4,6 +4,7 @@ 
 #include <linux/poll.h>
 #include <linux/ns_common.h>
 #include <linux/fs_pin.h>
+#include <linux/watch_queue.h>
 
 struct mnt_namespace {
 	atomic_t		count;
@@ -72,6 +73,12 @@  struct mount {
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	struct hlist_head mnt_pins;
 	struct hlist_head mnt_stuck_children;
+#ifdef CONFIG_MOUNT_NOTIFICATIONS
+	atomic_t mnt_topology_changes;	/* Number of topology changes applied */
+	atomic_t mnt_attr_changes;	/* Number of attribute changes applied */
+	atomic_t mnt_subtree_notifications;	/* Number of notifications in subtree */
+	struct watch_list *mnt_watchers; /* Watches on dentries within this mount */
+#endif
 } __randomize_layout;
 
 #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
@@ -153,3 +160,17 @@  static inline bool is_anon_ns(struct mnt_namespace *ns)
 {
 	return ns->seq == 0;
 }
+
+#ifdef CONFIG_MOUNT_NOTIFICATIONS
+extern void notify_mount(struct mount *triggered,
+			 struct mount *aux,
+			 enum mount_notification_subtype subtype,
+			 u32 info_flags);
+#else /* !CONFIG_MOUNT_NOTIFICATIONS */
+static inline void notify_mount(struct mount *triggered,
+				struct mount *aux,
+				enum mount_notification_subtype subtype,
+				u32 info_flags)
+{
+}
+#endif /* CONFIG_MOUNT_NOTIFICATIONS */
diff --git a/fs/mount_notify.c b/fs/mount_notify.c
new file mode 100644
index 000000000000..403d79785807
--- /dev/null
+++ b/fs/mount_notify.c
@@ -0,0 +1,265 @@ 
+// SPDX-License-Identifier: GPL-2.0
+/* Provide mount topology/attribute change notifications.
+ *
+ * Copyright (C) 2019 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include "mount.h"
+
+/*
+ * Post mount notifications to all watches going rootwards along the tree.
+ *
+ * The walk starts at the root dentry of @changed and climbs through parent
+ * dentries and then parent mounts.  @notify is posted to a mount's watch list
+ * for each visited dentry that is flagged DCACHE_MOUNT_WATCH.  The whole walk
+ * is redone if need_seqretry() on rename_lock asks for a retry.
+ *
+ * Must be called with the mount_lock held.
+ */
+static void post_mount_notification(struct mount *changed,
+				    struct mount_notification *notify)
+{
+	const struct cred *cred = current_cred();
+	struct path cursor;
+	struct mount *mnt;
+	unsigned seq;
+
+	seq = 0;
+	rcu_read_lock();
+restart:
+	cursor.mnt = &changed->mnt;
+	cursor.dentry = changed->mnt.mnt_root;
+	mnt = real_mount(cursor.mnt);
+	notify->watch.info &= ~NOTIFY_MOUNT_IN_SUBTREE;
+
+	read_seqbegin_or_lock(&rename_lock, &seq);
+	for (;;) {
+		if (mnt->mnt_watchers &&
+		    !hlist_empty(&mnt->mnt_watchers->watchers)) {
+			if (cursor.dentry->d_flags & DCACHE_MOUNT_WATCH)
+				post_watch_notification(mnt->mnt_watchers,
+							&notify->watch, cred,
+							(unsigned long)cursor.dentry);
+		} else {
+			/* No watchers here: skip to this mount's root. */
+			cursor.dentry = mnt->mnt.mnt_root;
+		}
+		/* Every later post in this walk carries IN_SUBTREE. */
+		notify->watch.info |= NOTIFY_MOUNT_IN_SUBTREE;
+
+		if (cursor.dentry == cursor.mnt->mnt_root ||
+		    IS_ROOT(cursor.dentry)) {
+			struct mount *parent = READ_ONCE(mnt->mnt_parent);
+
+			/* Escaped? */
+			if (cursor.dentry != cursor.mnt->mnt_root)
+				break;
+
+			/* Global root? */
+			if (mnt == parent)
+				break;
+
+			cursor.dentry = READ_ONCE(mnt->mnt_mountpoint);
+			mnt = parent;
+			cursor.mnt = &mnt->mnt;
+			/* NOTE(review): counted again if the walk restarts. */
+			atomic_inc(&mnt->mnt_subtree_notifications);
+		} else {
+			cursor.dentry = cursor.dentry->d_parent;
+		}
+	}
+
+	if (need_seqretry(&rename_lock, seq)) {
+		seq = 1;
+		goto restart;
+	}
+
+	done_seqretry(&rename_lock, seq);
+	rcu_read_unlock();
+}
+
+/*
+ * Generate a mount notification.
+ *
+ * @trigger is the mount the event is reported on; its mnt_id is stored in
+ * the record's triggered_on field.  @aux is the other mount involved and is
+ * only dereferenced for the NEW_MOUNT, UNMOUNT, MOVE_FROM and MOVE_TO
+ * subtypes, so callers may pass NULL for the attribute-change subtypes.
+ * @info_flags is OR'd into the record's info word.  The relevant change
+ * counters are bumped here and their values are copied into the record.
+ */
+void notify_mount(struct mount *trigger,
+		  struct mount *aux,
+		  enum mount_notification_subtype subtype,
+		  u32 info_flags)
+{
+	struct mount_notification n;
+
+	memset(&n, 0, sizeof(n));
+	n.watch.type	= WATCH_TYPE_MOUNT_NOTIFY;
+	n.watch.subtype	= subtype;
+	n.watch.info	= info_flags | watch_sizeof(n);
+	n.triggered_on	= trigger->mnt_id;
+
+	switch (subtype) {
+	case NOTIFY_MOUNT_EXPIRY:
+	case NOTIFY_MOUNT_READONLY:
+	case NOTIFY_MOUNT_SETATTR:
+		n.topology_changes	= atomic_read(&trigger->mnt_topology_changes);
+		n.attr_changes		= atomic_inc_return(&trigger->mnt_attr_changes);
+		break;
+
+	case NOTIFY_MOUNT_NEW_MOUNT:
+	case NOTIFY_MOUNT_UNMOUNT:
+	case NOTIFY_MOUNT_MOVE_FROM:
+	case NOTIFY_MOUNT_MOVE_TO:
+		n.auxiliary_mount	= aux->mnt_id;
+		n.attr_changes		= atomic_read(&trigger->mnt_attr_changes);
+		n.topology_changes	= atomic_inc_return(&trigger->mnt_topology_changes);
+		n.aux_topology_changes	= atomic_inc_return(&aux->mnt_topology_changes);
+		break;
+
+	default:
+		BUG();
+	}
+
+	post_mount_notification(trigger, &n);
+}
+
+/*
+ * Watch release hook, registered through init_watch_list().  Drops the dentry
+ * reference taken by dget() in sys_watch_mount() when the watch was added; the
+ * dentry pointer is kept in watch->id.
+ */
+static void release_mount_watch(struct watch *watch)
+{
+	struct dentry *dentry = (struct dentry *)(unsigned long)watch->id;
+
+	dput(dentry);
+}
+
+/**
+ * sys_watch_mount - Watch for mount topology/attribute changes
+ * @dfd: Base directory to pathwalk from or fd referring to mount.
+ * @filename: Path to mount to place the watch upon
+ * @at_flags: Pathwalk control flags
+ * @watch_fd: The watch queue to send notifications to.
+ * @watch_id: The watch ID to be placed in the notification (-1 to remove watch)
+ *
+ * Only AT_NO_AUTOMOUNT and AT_EMPTY_PATH are accepted in @at_flags, and
+ * @watch_id must lie in the range -1 to 0xff.
+ *
+ * Return: 0 on success or a negative error code (e.g. -EINVAL, -ENOMEM).
+ */
+SYSCALL_DEFINE5(watch_mount,
+		int, dfd,
+		const char __user *, filename,
+		unsigned int, at_flags,
+		int, watch_fd,
+		int, watch_id)
+{
+	struct watch_queue *wqueue;
+	struct watch_list *wlist = NULL;
+	struct watch *watch = NULL;
+	struct mount *m;
+	struct path path;
+	unsigned int lookup_flags =
+		LOOKUP_DIRECTORY | LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
+	int ret;
+
+	if (watch_id < -1 || watch_id > 0xff)
+		return -EINVAL;
+	if ((at_flags & ~(AT_NO_AUTOMOUNT | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+	if (at_flags & AT_NO_AUTOMOUNT)
+		lookup_flags &= ~LOOKUP_AUTOMOUNT;
+	if (at_flags & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+
+	ret = user_path_at(dfd, filename, lookup_flags, &path);
+	if (ret)
+		return ret;
+
+	ret = inode_permission(path.dentry->d_inode, MAY_EXEC);
+	if (ret)
+		goto err_path;
+
+	wqueue = get_watch_queue(watch_fd);
+	if (IS_ERR(wqueue)) {
+		/* ret is 0 here; report the lookup failure, not success. */
+		ret = PTR_ERR(wqueue);
+		goto err_path;
+	}
+
+	m = real_mount(path.mnt);
+
+	if (watch_id >= 0) {
+		ret = -ENOMEM;
+		if (!READ_ONCE(m->mnt_watchers)) {
+			wlist = kzalloc(sizeof(*wlist), GFP_KERNEL);
+			if (!wlist)
+				goto err_wqueue;
+			init_watch_list(wlist, release_mount_watch);
+		}
+
+		watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+		if (!watch)
+			goto err_wlist;
+
+		init_watch(watch, wqueue);
+		watch->id		= (unsigned long)path.dentry;
+		watch->info_id		= (u32)watch_id << 24;
+
+		ret = security_watch_mount(watch, &path);
+		if (ret < 0)
+			goto err_watch;
+
+		down_write(&m->mnt.mnt_sb->s_umount);
+		/* Install our list unless one was installed meanwhile. */
+		if (!m->mnt_watchers) {
+			m->mnt_watchers = wlist;
+			wlist = NULL;
+		}
+
+		ret = add_watch_to_object(watch, m->mnt_watchers);
+		if (ret == 0) {
+			spin_lock(&path.dentry->d_lock);
+			path.dentry->d_flags |= DCACHE_MOUNT_WATCH;
+			spin_unlock(&path.dentry->d_lock);
+			/* Ref is dropped by release_mount_watch(). */
+			dget(path.dentry);
+			watch = NULL;	/* Don't free it below. */
+		}
+		up_write(&m->mnt.mnt_sb->s_umount);
+	} else {
+		/*
+		 * If no watch list was ever installed on this mount there is
+		 * nothing to remove, and remove_watch_from_object() is not
+		 * known to accept a NULL list.  NOTE(review): -EBADSLT is meant
+		 * to match that function's "no such watch" error - confirm.
+		 */
+		ret = -EBADSLT;
+		down_write(&m->mnt.mnt_sb->s_umount);
+		if (m->mnt_watchers)
+			ret = remove_watch_from_object(m->mnt_watchers, wqueue,
+						       (unsigned long)path.dentry,
+						       false);
+		up_write(&m->mnt.mnt_sb->s_umount);
+	}
+
+err_watch:
+	kfree(watch);
+err_wlist:
+	kfree(wlist);
+err_wqueue:
+	put_watch_queue(wqueue);
+err_path:
+	path_put(&path);
+	return ret;
+}
diff --git a/fs/namespace.c b/fs/namespace.c
index 85b5f7bea82e..54d237251941 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -498,6 +498,9 @@  static int mnt_make_readonly(struct mount *mnt)
 	smp_wmb();
 	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
 	unlock_mount_hash();
+	if (ret == 0)
+		notify_mount(mnt, NULL, NOTIFY_MOUNT_READONLY,
+			     NOTIFY_MOUNT_IS_NOW_RO);
 	return ret;
 }
 
@@ -506,6 +509,7 @@  static int __mnt_unmake_readonly(struct mount *mnt)
 	lock_mount_hash();
 	mnt->mnt.mnt_flags &= ~MNT_READONLY;
 	unlock_mount_hash();
+	notify_mount(mnt, NULL, NOTIFY_MOUNT_READONLY, 0);
 	return 0;
 }
 
@@ -819,6 +823,7 @@  static struct mountpoint *unhash_mnt(struct mount *mnt)
  */
 static void umount_mnt(struct mount *mnt)
 {
+	notify_mount(mnt->mnt_parent, mnt, NOTIFY_MOUNT_UNMOUNT, 0);
 	put_mountpoint(unhash_mnt(mnt));
 }
 
@@ -1159,6 +1164,11 @@  static void mntput_no_expire(struct mount *mnt)
 	mnt->mnt.mnt_flags |= MNT_DOOMED;
 	rcu_read_unlock();
 
+#ifdef CONFIG_MOUNT_NOTIFICATIONS
+	if (mnt->mnt_watchers)
+		remove_watch_list(mnt->mnt_watchers, mnt->mnt_id);
+#endif
+
 	list_del(&mnt->mnt_instance);
 
 	if (unlikely(!list_empty(&mnt->mnt_mounts))) {
@@ -1453,6 +1463,7 @@  static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 		p = list_first_entry(&tmp_list, struct mount, mnt_list);
 		list_del_init(&p->mnt_expire);
 		list_del_init(&p->mnt_list);
+
 		ns = p->mnt_ns;
 		if (ns) {
 			ns->mounts--;
@@ -2079,7 +2090,10 @@  static int attach_recursive_mnt(struct mount *source_mnt,
 	}
 	if (moving) {
 		unhash_mnt(source_mnt);
+		notify_mount(source_mnt->mnt_parent, source_mnt,
+			     NOTIFY_MOUNT_MOVE_FROM, 0);
 		attach_mnt(source_mnt, dest_mnt, dest_mp);
+		notify_mount(dest_mnt, source_mnt, NOTIFY_MOUNT_MOVE_TO, 0);
 		touch_mnt_namespace(source_mnt->mnt_ns);
 	} else {
 		if (source_mnt->mnt_ns) {
@@ -2088,6 +2102,11 @@  static int attach_recursive_mnt(struct mount *source_mnt,
 		}
 		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
 		commit_tree(source_mnt);
+		notify_mount(dest_mnt, source_mnt, NOTIFY_MOUNT_NEW_MOUNT,
+			     (source_mnt->mnt.mnt_sb->s_flags & SB_RDONLY ?
+			      NOTIFY_MOUNT_IS_NOW_RO : 0) |
+			     (source_mnt->mnt.mnt_sb->s_flags & SB_SUBMOUNT ?
+			      NOTIFY_MOUNT_IS_SUBMOUNT : 0));
 	}
 
 	hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) {
@@ -2464,6 +2483,8 @@  static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags)
 	mnt->mnt.mnt_flags = mnt_flags;
 	touch_mnt_namespace(mnt->mnt_ns);
 	unlock_mount_hash();
+	notify_mount(mnt, NULL, NOTIFY_MOUNT_SETATTR,
+		     (mnt_flags & MNT_READONLY ? NOTIFY_MOUNT_IS_NOW_RO : 0));
 }
 
 static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt)
@@ -2899,6 +2920,7 @@  void mark_mounts_for_expiry(struct list_head *mounts)
 			propagate_mount_busy(mnt, 1))
 			continue;
 		list_move(&mnt->mnt_expire, &graveyard);
+		notify_mount(mnt, NULL, NOTIFY_MOUNT_EXPIRY, 0);
 	}
 	while (!list_empty(&graveyard)) {
 		mnt = list_first_entry(&graveyard, struct mount, mnt_expire);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index c1488cc84fd9..7b194d778155 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -217,6 +217,7 @@  struct dentry_operations {
 #define DCACHE_PAR_LOOKUP		0x10000000 /* being looked up (with parent locked shared) */
 #define DCACHE_DENTRY_CURSOR		0x20000000
 #define DCACHE_NORCU			0x40000000 /* No RCU delay for freeing */
+#define DCACHE_MOUNT_WATCH		0x80000000 /* There's a mount watch here */
 
 extern seqlock_t rename_lock;
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 1815065d52f3..1fd43af3b22d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1003,6 +1003,8 @@  asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
 				       siginfo_t __user *info,
 				       unsigned int flags);
 asmlinkage long sys_pidfd_getfd(int pidfd, int fd, unsigned int flags);
+asmlinkage long sys_watch_mount(int dfd, const char __user *path,
+				unsigned int at_flags, int watch_fd, int watch_id);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 3a3201e4618e..6b5748287883 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -855,9 +855,11 @@  __SYSCALL(__NR_clone3, sys_clone3)
 __SYSCALL(__NR_openat2, sys_openat2)
 #define __NR_pidfd_getfd 438
 __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
+#define __NR_watch_mount 439
+__SYSCALL(__NR_watch_mount, sys_watch_mount)
 
 #undef __NR_syscalls
-#define __NR_syscalls 439
+#define __NR_syscalls 440
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h
index c3d8320b5d3a..6b6cd2afc590 100644
--- a/include/uapi/linux/watch_queue.h
+++ b/include/uapi/linux/watch_queue.h
@@ -14,7 +14,8 @@ 
 enum watch_notification_type {
 	WATCH_TYPE_META		= 0,	/* Special record */
 	WATCH_TYPE_KEY_NOTIFY	= 1,	/* Key change event notification */
-	WATCH_TYPE__NR		= 2
+	WATCH_TYPE_MOUNT_NOTIFY	= 2,	/* Mount topology change notification */
+	WATCH_TYPE__NR		= 3	/* Number of notification types */
 };
 
 enum watch_meta_notification_subtype {
@@ -101,4 +102,37 @@  struct key_notification {
 	__u32	aux;		/* Per-type auxiliary data */
 };
 
+/*
+ * Type of mount topology change notification.
+ */
+enum mount_notification_subtype {
+	NOTIFY_MOUNT_NEW_MOUNT	= 0, /* New mount added */
+	NOTIFY_MOUNT_UNMOUNT	= 1, /* Mount removed manually */
+	NOTIFY_MOUNT_EXPIRY	= 2, /* Automount expired */
+	NOTIFY_MOUNT_READONLY	= 3, /* Mount R/O state changed (see NOTIFY_MOUNT_IS_NOW_RO) */
+	NOTIFY_MOUNT_SETATTR	= 4, /* Mount attributes changed */
+	NOTIFY_MOUNT_MOVE_FROM	= 5, /* Mount moved from here (followed by a MOVE_TO) */
+	NOTIFY_MOUNT_MOVE_TO	= 6, /* Mount moved to here (preceded by a MOVE_FROM) */
+};
+
+#define NOTIFY_MOUNT_IN_SUBTREE		WATCH_INFO_FLAG_0 /* Event not actually at watched dentry */
+#define NOTIFY_MOUNT_IS_RECURSIVE	WATCH_INFO_FLAG_1 /* Change applied recursively */
+#define NOTIFY_MOUNT_IS_NOW_RO		WATCH_INFO_FLAG_2 /* Mount changed to R/O */
+#define NOTIFY_MOUNT_IS_SUBMOUNT	WATCH_INFO_FLAG_3 /* New mount is submount */
+
+/*
+ * Mount topology/configuration change notification record.
+ * - watch.type = WATCH_TYPE_MOUNT_NOTIFY
+ * - watch.subtype = enum mount_notification_subtype
+ */
+struct mount_notification {
+	struct watch_notification watch; /* WATCH_TYPE_MOUNT_NOTIFY */
+	__u32	triggered_on;		/* The mount that triggered the notification */
+	__u32	auxiliary_mount;	/* Added/moved/removed mount or 0 */
+	__u32	topology_changes;	/* trigger: Number of topology changes applied */
+	__u32	attr_changes;		/* trigger: Number of attribute changes applied */
+	__u32	aux_topology_changes;	/* aux: Number of topology changes applied */
+	__u32	__padding;		/* NOTE(review): presumably reserved padding - confirm */
+};
+
 #endif /* _UAPI_LINUX_WATCH_QUEUE_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 3b69a560a7ac..3e1c5c9d2efe 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -85,6 +85,9 @@  COND_SYSCALL(ioprio_get);
 /* fs/locks.c */
 COND_SYSCALL(flock);
 
+/* fs/mount_notify.c */
+COND_SYSCALL(watch_mount);
+
 /* fs/namei.c */
 
 /* fs/namespace.c */