diff mbox series

[3/7] vfs: Add a mount-notification facility

Message ID 155905933492.7587.6968545866041839538.stgit@warthog.procyon.org.uk (mailing list archive)
State New, archived
Headers show
Series Mount, FS, Block and Keyrings notifications | expand

Commit Message

David Howells May 28, 2019, 4:02 p.m. UTC
Add a mount notification facility whereby notifications about changes in
mount topology and configuration can be received.  Note that this only
covers vfsmount topology changes and not superblock events.  A separate
facility will be added for that.

Firstly, an event queue needs to be created:

	fd = open("/dev/event_queue", O_RDWR);
	ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, page_size << n);

then a notification can be set up to report notifications via that queue:

	struct watch_notification_filter filter = {
		.nr_filters = 1,
		.filters = {
			[0] = {
				.type = WATCH_TYPE_MOUNT_NOTIFY,
				.subtype_filter[0] = UINT_MAX,
			},
		},
	};
	ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter);
	mount_notify(AT_FDCWD, "/", 0, fd, 0x02);

In this case, it would let me monitor the mount topology subtree rooted at
"/" for events.  Mount notifications propagate up the tree towards the
root, so a watch will catch all of the events happening in the subtree
rooted at the watch.

After setting the watch, records will be placed into the queue when, for
example, as superblock switches between read-write and read-only.  Records
are of the following format:

	struct mount_notification {
		struct watch_notification watch;
		__u32	triggered_on;
		__u32	changed_mount;
	} *n;

Where:

	n->watch.type will be WATCH_TYPE_MOUNT_NOTIFY.

	n->watch.subtype will indicate the type of event, such as
	NOTIFY_MOUNT_NEW_MOUNT.

	n->watch.info & WATCH_INFO_LENGTH will indicate the length of the
	record.

	n->watch.info & WATCH_INFO_ID will be the fifth argument to
	mount_notify(), shifted.

	n->watch.info & WATCH_INFO_FLAG_0 will be used for
	NOTIFY_MOUNT_READONLY, being set if the superblock becomes R/O, and
	being cleared otherwise, and for NOTIFY_MOUNT_NEW_MOUNT, being set
	if the new mount is a submount (e.g. an automount).

	n->triggered_on indicates the ID of the mount on which the watch
	was installed.

	n->changed_mount indicates the ID of the mount that was affected.

The mount IDs can be retrieved with the fsinfo() syscall, using the
fsinfo_mount_info and fsinfo_mount_child attributes.  There are
notification counters there too for when a buffer overrun occurs, thereby
allowing the mount tree to be quickly rescanned.

Note that it is permissible for event records to be of variable length -
or, at least, the length may be dependent on the subtype.  Note also that
the queue can be shared between multiple notifications of various types.

Signed-off-by: David Howells <dhowells@redhat.com>
---

 arch/x86/entry/syscalls/syscall_32.tbl |    1 
 arch/x86/entry/syscalls/syscall_64.tbl |    1 
 fs/Kconfig                             |    9 ++
 fs/Makefile                            |    1 
 fs/mount.h                             |   33 ++++--
 fs/mount_notify.c                      |  178 ++++++++++++++++++++++++++++++++
 fs/namespace.c                         |    9 +-
 include/linux/dcache.h                 |    1 
 include/linux/syscalls.h               |    2 
 include/uapi/linux/watch_queue.h       |   24 ++++
 kernel/sys_ni.c                        |    3 +
 11 files changed, 248 insertions(+), 14 deletions(-)
 create mode 100644 fs/mount_notify.c

Comments

Jann Horn May 28, 2019, 8:06 p.m. UTC | #1
On Tue, May 28, 2019 at 6:05 PM David Howells <dhowells@redhat.com> wrote:
> Add a mount notification facility whereby notifications about changes in
> mount topology and configuration can be received.  Note that this only
> covers vfsmount topology changes and not superblock events.  A separate
> facility will be added for that.
[...]
> @@ -172,4 +167,18 @@ static inline void notify_mount(struct mount *changed,
>                                 u32 info_flags)
>  {
>         atomic_inc(&changed->mnt_notify_counter);
> +
> +#ifdef CONFIG_MOUNT_NOTIFICATIONS
> +       {
> +               struct mount_notification n = {
> +                       .watch.type     = WATCH_TYPE_MOUNT_NOTIFY,
> +                       .watch.subtype  = subtype,
> +                       .watch.info     = info_flags | sizeof(n),
> +                       .triggered_on   = changed->mnt_id,
> +                       .changed_mount  = aux ? aux->mnt_id : 0,
> +               };
> +
> +               post_mount_notification(changed, &n);
> +       }
> +#endif
>  }
[...]
> +void post_mount_notification(struct mount *changed,
> +                            struct mount_notification *notify)
> +{
> +       const struct cred *cred = current_cred();

This current_cred() looks bogus to me. Can't mount topology changes
come from all sorts of places? For example, umount_mnt() from
umount_tree() from dissolve_on_fput() from __fput(), which could
happen pretty much anywhere depending on where the last reference gets
dropped?

> +       struct path cursor;
> +       struct mount *mnt;
> +       unsigned seq;
> +
> +       seq = 0;
> +       rcu_read_lock();
> +restart:
> +       cursor.mnt = &changed->mnt;
> +       cursor.dentry = changed->mnt.mnt_root;
> +       mnt = real_mount(cursor.mnt);
> +       notify->watch.info &= ~WATCH_INFO_IN_SUBTREE;
> +
> +       read_seqbegin_or_lock(&rename_lock, &seq);
> +       for (;;) {
> +               if (mnt->mnt_watchers &&
> +                   !hlist_empty(&mnt->mnt_watchers->watchers)) {
> +                       if (cursor.dentry->d_flags & DCACHE_MOUNT_WATCH)
> +                               post_watch_notification(mnt->mnt_watchers,
> +                                                       &notify->watch, cred,
> +                                                       (unsigned long)cursor.dentry);
> +               } else {
> +                       cursor.dentry = mnt->mnt.mnt_root;
> +               }
> +               notify->watch.info |= WATCH_INFO_IN_SUBTREE;
> +
> +               if (cursor.dentry == cursor.mnt->mnt_root ||
> +                   IS_ROOT(cursor.dentry)) {
> +                       struct mount *parent = READ_ONCE(mnt->mnt_parent);
> +
> +                       /* Escaped? */
> +                       if (cursor.dentry != cursor.mnt->mnt_root)
> +                               break;
> +
> +                       /* Global root? */
> +                       if (mnt != parent) {
> +                               cursor.dentry = READ_ONCE(mnt->mnt_mountpoint);
> +                               mnt = parent;
> +                               cursor.mnt = &mnt->mnt;
> +                               continue;
> +                       }
> +                       break;

(nit: this would look clearer if you inverted the condition and wrote
it as "if (mnt == parent) break;", then you also wouldn't need that
"continue" or the braces)

> +               }
> +
> +               cursor.dentry = cursor.dentry->d_parent;
> +       }
> +
> +       if (need_seqretry(&rename_lock, seq)) {
> +               seq = 1;
> +               goto restart;
> +       }
> +
> +       done_seqretry(&rename_lock, seq);
> +       rcu_read_unlock();
> +}
[...]
> +SYSCALL_DEFINE5(mount_notify,
> +               int, dfd,
> +               const char __user *, filename,
> +               unsigned int, at_flags,
> +               int, watch_fd,
> +               int, watch_id)
> +{
> +       struct watch_queue *wqueue;
> +       struct watch_list *wlist = NULL;
> +       struct watch *watch;
> +       struct mount *m;
> +       struct path path;
> +       int ret;
> +
> +       if (watch_id < -1 || watch_id > 0xff)
> +               return -EINVAL;
> +
> +       ret = user_path_at(dfd, filename, at_flags, &path);

The third argument of user_path_at() contains kernel-private lookup
flags, I'm pretty sure userspace isn't supposed to be able to control
these directly.

> +       if (ret)
> +               return ret;
> +
> +       wqueue = get_watch_queue(watch_fd);
> +       if (IS_ERR(wqueue))
> +               goto err_path;
> +
> +       m = real_mount(path.mnt);
> +
> +       if (watch_id >= 0) {
> +               if (!m->mnt_watchers) {
> +                       wlist = kzalloc(sizeof(*wlist), GFP_KERNEL);
> +                       if (!wlist)
> +                               goto err_wqueue;
> +                       INIT_HLIST_HEAD(&wlist->watchers);
> +                       spin_lock_init(&wlist->lock);
> +                       wlist->release_watch = release_mount_watch;
> +               }
> +
> +               watch = kzalloc(sizeof(*watch), GFP_KERNEL);
> +               if (!watch)
> +                       goto err_wlist;
> +
> +               init_watch(watch, wqueue);
> +               watch->id               = (unsigned long)path.dentry;
> +               watch->private          = path.mnt;
> +               watch->info_id          = (u32)watch_id << 24;
> +
> +               down_write(&m->mnt.mnt_sb->s_umount);
> +               if (!m->mnt_watchers) {
> +                       m->mnt_watchers = wlist;
> +                       wlist = NULL;
> +               }
> +
> +               ret = add_watch_to_object(watch, m->mnt_watchers);
> +               if (ret == 0) {
> +                       spin_lock(&path.dentry->d_lock);
> +                       path.dentry->d_flags |= DCACHE_MOUNT_WATCH;
> +                       spin_unlock(&path.dentry->d_lock);
> +                       path_get(&path);

So... the watches on a mountpoint create references back to the
mountpoint? Is your plan that umount_tree() breaks the loop by getting
rid of the watches?

If so: Is there anything that prevents installing new watches after
umount_tree()? Because I don't see anything.

It might make sense to redesign this stuff so that watches don't hold
references on the object being watched.

> +               }
> +               up_write(&m->mnt.mnt_sb->s_umount);
> +               if (ret < 0)
> +                       kfree(watch);
> +       } else if (m->mnt_watchers) {
> +               down_write(&m->mnt.mnt_sb->s_umount);
> +               ret = remove_watch_from_object(m->mnt_watchers, wqueue,
> +                                              (unsigned long)path.dentry,
> +                                              false);
> +               up_write(&m->mnt.mnt_sb->s_umount);
> +       } else {
> +               ret = -EBADSLT;
> +       }
> +
> +err_wlist:
> +       kfree(wlist);
> +err_wqueue:
> +       put_watch_queue(wqueue);
> +err_path:
> +       path_put(&path);
> +       return ret;
> +}
David Howells May 28, 2019, 11:04 p.m. UTC | #2
Jann Horn <jannh@google.com> wrote:

> It might make sense to redesign this stuff so that watches don't hold
> references on the object being watched.

I explicitly made it hold a reference so that if you place a watch on an
automounted mount it stops it from expiring.

Further, if I create a watch on something, *should* it be unmountable, just as
if I had a file open there or had chdir'd into there?

David
David Howells May 28, 2019, 11:08 p.m. UTC | #3
David Howells <dhowells@redhat.com> wrote:

> > It might make sense to redesign this stuff so that watches don't hold
> > references on the object being watched.
>
> I explicitly made it hold a reference so that if you place a watch on an
> automounted mount it stops it from expiring.
> 
> Further, if I create a watch on something, *should* it be unmountable, just as
> if I had a file open there or had chdir'd into there?

It gets trickier than that as I need a ref on the dentry on which the watch is
rooted to prevent it from getting culled.

David
Jann Horn May 28, 2019, 11:23 p.m. UTC | #4
On Wed, May 29, 2019 at 1:04 AM David Howells <dhowells@redhat.com> wrote:
> Jann Horn <jannh@google.com> wrote:
> > It might make sense to redesign this stuff so that watches don't hold
> > references on the object being watched.
>
> I explicitly made it hold a reference so that if you place a watch on an
> automounted mount it stops it from expiring.
>
> Further, if I create a watch on something, *should* it be unmountable, just as
> if I had a file open there or had chdir'd into there?

I don't really know. I guess it depends on how it's being used? If
someone decides to e.g. make a file browser that installs watches for
a bunch of mountpoints for some fancy sidebar showing the device
mounts on the system, or something like that, that probably shouldn't
inhibit unmounting... I don't know if that's a realistic use case.
David Howells May 29, 2019, 10:55 a.m. UTC | #5
Jann Horn <jannh@google.com> wrote:

> > +                       /* Global root? */
> > +                       if (mnt != parent) {
> > +                               cursor.dentry = READ_ONCE(mnt->mnt_mountpoint);
> > +                               mnt = parent;
> > +                               cursor.mnt = &mnt->mnt;
> > +                               continue;
> > +                       }
> > +                       break;
> 
> (nit: this would look clearer if you inverted the condition and wrote
> it as "if (mnt == parent) break;", then you also wouldn't need that
> "continue" or the braces)

It does look better with the logic inverted, but you *do* still need the
continue.  After the if-statement, there is:

	cursor.dentry = cursor.dentry->d_parent;

which we need to skip.  It might make sense to move that into an
else-statement from an aesthetic point of view.

David
David Howells May 29, 2019, 11 a.m. UTC | #6
Jann Horn <jannh@google.com> wrote:

> > +void post_mount_notification(struct mount *changed,
> > +                            struct mount_notification *notify)
> > +{
> > +       const struct cred *cred = current_cred();
> 
> This current_cred() looks bogus to me. Can't mount topology changes
> come from all sorts of places? For example, umount_mnt() from
> umount_tree() from dissolve_on_fput() from __fput(), which could
> happen pretty much anywhere depending on where the last reference gets
> dropped?

IIRC, that's what Casey argued is the right thing to do from a security PoV.
Casey?

Maybe I should pass in NULL creds in the case that an event is being generated
because an object is being destroyed due to the last usage[*] being removed.

 [*] Usage, not ref - Superblocks are a bit weird in their accounting.

David
David Howells May 29, 2019, 11:16 a.m. UTC | #7
Jann Horn <jannh@google.com> wrote:

> I don't really know. I guess it depends on how it's being used? If
> someone decides to e.g. make a file browser that installs watches for
> a bunch of mountpoints for some fancy sidebar showing the device
> mounts on the system, or something like that, that probably shouldn't
> inhibit unmounting... I don't know if that's a realistic use case.

In such a use case, I would envision the browser putting a watch on "/".  A
watch sees all events in the subtree rooted at that point and you must apply a
filter that filters them out if you're not interested (filter on
WATCH_INFO_IN_SUBTREE using info_filter and info_mask).

David
Casey Schaufler May 29, 2019, 3:53 p.m. UTC | #8
On 5/29/2019 4:00 AM, David Howells wrote:
> Jann Horn <jannh@google.com> wrote:
>
>>> +void post_mount_notification(struct mount *changed,
>>> +                            struct mount_notification *notify)
>>> +{
>>> +       const struct cred *cred = current_cred();
>> This current_cred() looks bogus to me. Can't mount topology changes
>> come from all sorts of places? For example, umount_mnt() from
>> umount_tree() from dissolve_on_fput() from __fput(), which could
>> happen pretty much anywhere depending on where the last reference gets
>> dropped?
> IIRC, that's what Casey argued is the right thing to do from a security PoV.
> Casey?

You need to identify the credential of the subject that triggered
the event. If it isn't current_cred(), the cred needs to be passed
in to post_mount_notification(), or derived by some other means.

> Maybe I should pass in NULL creds in the case that an event is being generated
> because an object is being destroyed due to the last usage[*] being removed.

You should pass the cred of the process that removed the
last usage. If the last usage was removed by something like
the power being turned off on a disk drive a system cred
should be used. Someone or something caused the event. It can
be important who it was.

>  [*] Usage, not ref - Superblocks are a bit weird in their accounting.
>
> David
Jann Horn May 29, 2019, 4:12 p.m. UTC | #9
On Wed, May 29, 2019 at 5:53 PM Casey Schaufler <casey@schaufler-ca.com> wrote:
> On 5/29/2019 4:00 AM, David Howells wrote:
> > Jann Horn <jannh@google.com> wrote:
> >
> >>> +void post_mount_notification(struct mount *changed,
> >>> +                            struct mount_notification *notify)
> >>> +{
> >>> +       const struct cred *cred = current_cred();
> >> This current_cred() looks bogus to me. Can't mount topology changes
> >> come from all sorts of places? For example, umount_mnt() from
> >> umount_tree() from dissolve_on_fput() from __fput(), which could
> >> happen pretty much anywhere depending on where the last reference gets
> >> dropped?
> > IIRC, that's what Casey argued is the right thing to do from a security PoV.
> > Casey?
>
> You need to identify the credential of the subject that triggered
> the event. If it isn't current_cred(), the cred needs to be passed
> in to post_mount_notification(), or derived by some other means.
>
> > Maybe I should pass in NULL creds in the case that an event is being generated
> > because an object is being destroyed due to the last usage[*] being removed.
>
> You should pass the cred of the process that removed the
> last usage. If the last usage was removed by something like
> the power being turned off on a disk drive a system cred
> should be used. Someone or something caused the event. It can
> be important who it was.

The kernel's normal security model means that you should be able to
e.g. accept FDs that random processes send you and perform
read()/write() calls on them without acting as a subject in any
security checks; let alone close(). If you send a file descriptor over
a unix domain socket and the unix domain socket is garbage collected,
for example, I think the close() will just come from some random,
completely unrelated task that happens to trigger the garbage
collector?

Also, I think if someone does I/O via io_uring, I think the caller's
credentials for read/write operations will probably just be normal
kernel creds?

Here the checks probably aren't all that important, but in other
places, when people try to use an LSM as the primary line of defense,
checks that don't align with the kernel's normal security model might
lead to a bunch of problems.
Casey Schaufler May 29, 2019, 5:04 p.m. UTC | #10
On 5/29/2019 9:12 AM, Jann Horn wrote:
> On Wed, May 29, 2019 at 5:53 PM Casey Schaufler <casey@schaufler-ca.com> wrote:
>> On 5/29/2019 4:00 AM, David Howells wrote:
>>> Jann Horn <jannh@google.com> wrote:
>>>
>>>>> +void post_mount_notification(struct mount *changed,
>>>>> +                            struct mount_notification *notify)
>>>>> +{
>>>>> +       const struct cred *cred = current_cred();
>>>> This current_cred() looks bogus to me. Can't mount topology changes
>>>> come from all sorts of places? For example, umount_mnt() from
>>>> umount_tree() from dissolve_on_fput() from __fput(), which could
>>>> happen pretty much anywhere depending on where the last reference gets
>>>> dropped?
>>> IIRC, that's what Casey argued is the right thing to do from a security PoV.
>>> Casey?
>> You need to identify the credential of the subject that triggered
>> the event. If it isn't current_cred(), the cred needs to be passed
>> in to post_mount_notification(), or derived by some other means.
>>
>>> Maybe I should pass in NULL creds in the case that an event is being generated
>>> because an object is being destroyed due to the last usage[*] being removed.
>> You should pass the cred of the process that removed the
>> last usage. If the last usage was removed by something like
>> the power being turned off on a disk drive a system cred
>> should be used. Someone or something caused the event. It can
>> be important who it was.
> The kernel's normal security model means that you should be able to
> e.g. accept FDs that random processes send you and perform
> read()/write() calls on them without acting as a subject in any
> security checks; let alone close().

Passed file descriptors are an anomaly in the security model
that (in this developer's opinion) should have never been
included. More than one of the "B" level UNIX systems disabled
them outright. 

>  If you send a file descriptor over
> a unix domain socket and the unix domain socket is garbage collected,
> for example, I think the close() will just come from some random,
> completely unrelated task that happens to trigger the garbage
> collector?

I never said this was going to be easy or pleasant.
Who destroyed the UDS? It didn't just spontaneously become
garbage. Well, not on modern Linux filesystems, anyway. 

> Also, I think if someone does I/O via io_uring, I think the caller's
> credentials for read/write operations will probably just be normal
> kernel creds?
>
> Here the checks probably aren't all that important, but in other
> places, when people try to use an LSM as the primary line of defense,
> checks that don't align with the kernel's normal security model might
> lead to a bunch of problems.

The kernel does not have a "normal security model". It has a
collection of disparate and almost but not quite contradictory
models for the various objects and mechanisms it implements.
It already has a bunch of problems, we're just used to them.

I can only send a signal to a process with the same UID. Why doesn't
a process have mode bits so that I could get signals from my group?

Why do IPC object have creator bits, while files don't?

Why can I send a file descriptor over a UDS, but not a message queue?

Why can't I set the mode bits on a symlink?

What can go wrong if I don't map groups into a user namespace?

LSMs (SELinux and Smack, which are classic mandatory access control
systems in particular) are more consistent, but still have to deal with
some of these differences. A symlink gets a Smack label, for example.

The point being that it's very easy to add new mechanisms that do
wonderful things but that introduce unforeseen ways to bypass one
or more of the existing protections.
Andy Lutomirski May 29, 2019, 5:13 p.m. UTC | #11
> On May 29, 2019, at 8:53 AM, Casey Schaufler <casey@schaufler-ca.com> wrote:
> 
>> On 5/29/2019 4:00 AM, David Howells wrote:
>> Jann Horn <jannh@google.com> wrote:
>> 
>>>> +void post_mount_notification(struct mount *changed,
>>>> +                            struct mount_notification *notify)
>>>> +{
>>>> +       const struct cred *cred = current_cred();
>>> This current_cred() looks bogus to me. Can't mount topology changes
>>> come from all sorts of places? For example, umount_mnt() from
>>> umount_tree() from dissolve_on_fput() from __fput(), which could
>>> happen pretty much anywhere depending on where the last reference gets
>>> dropped?
>> IIRC, that's what Casey argued is the right thing to do from a security PoV.
>> Casey?
> 
> You need to identify the credential of the subject that triggered
> the event. If it isn't current_cred(), the cred needs to be passed
> in to post_mount_notification(), or derived by some other means.

Taking a step back, why do we care who triggered the event?  It seems to me that we should care whether the event happened and whether the *receiver* is permitted to know that.

(And receiver means whoever subscribed, presumably, not whoever called read() or mmap().)
Casey Schaufler May 29, 2019, 5:46 p.m. UTC | #12
On 5/29/2019 10:13 AM, Andy Lutomirski wrote:
>
>> On May 29, 2019, at 8:53 AM, Casey Schaufler <casey@schaufler-ca.com> wrote:
>>
>>> On 5/29/2019 4:00 AM, David Howells wrote:
>>> Jann Horn <jannh@google.com> wrote:
>>>
>>>>> +void post_mount_notification(struct mount *changed,
>>>>> +                            struct mount_notification *notify)
>>>>> +{
>>>>> +       const struct cred *cred = current_cred();
>>>> This current_cred() looks bogus to me. Can't mount topology changes
>>>> come from all sorts of places? For example, umount_mnt() from
>>>> umount_tree() from dissolve_on_fput() from __fput(), which could
>>>> happen pretty much anywhere depending on where the last reference gets
>>>> dropped?
>>> IIRC, that's what Casey argued is the right thing to do from a security PoV.
>>> Casey?
>> You need to identify the credential of the subject that triggered
>> the event. If it isn't current_cred(), the cred needs to be passed
>> in to post_mount_notification(), or derived by some other means.
> Taking a step back, why do we care who triggered the event?  It seems to me that we should care whether the event happened and whether the *receiver* is permitted to know that.

There are two filesystems, "dot" and "dash". I am not allowed
to communicate with Fred on the system, and all precautions have
been taken to ensure I cannot. Fred asks for notifications on
all mount activity. I perform actions that result in notifications
on "dot" and "dash". Fred receives notifications and interprets
them using Morse code. This is not OK. If Wilma, who *is* allowed
to communicate with Fred, does the same actions, he should be
allowed to get the messages via Morse.

The event is information. The information is generated as a
result of my or Wilma's action. Fred is passive in this access.
Fred is not "reading" the event. The event is being written to
Fred. My process is the subject, and Fred's the object.

Other security modelers may disagree. The models they produce
are going to be *very* complicated and will introduce agents and
intermediate objects to justify Fred's reception of an event as
a read operation.

> (And receiver means whoever subscribed, presumably, not whoever called read() or mmap().)

The receiver is the process that gets the event. There may
be more than one receiver, and the receivers may have different
credentials. Each needs to be checked separately.

Isn't this starting to sound like the discussions on kdbus?
I'm not sure if that deserves a :) or a :( but probably one of the two.
Jann Horn May 29, 2019, 6:11 p.m. UTC | #13
On Wed, May 29, 2019 at 7:46 PM Casey Schaufler <casey@schaufler-ca.com> wrote:
> On 5/29/2019 10:13 AM, Andy Lutomirski wrote:
> >> On May 29, 2019, at 8:53 AM, Casey Schaufler <casey@schaufler-ca.com> wrote:
> >>> On 5/29/2019 4:00 AM, David Howells wrote:
> >>> Jann Horn <jannh@google.com> wrote:
> >>>
> >>>>> +void post_mount_notification(struct mount *changed,
> >>>>> +                            struct mount_notification *notify)
> >>>>> +{
> >>>>> +       const struct cred *cred = current_cred();
> >>>> This current_cred() looks bogus to me. Can't mount topology changes
> >>>> come from all sorts of places? For example, umount_mnt() from
> >>>> umount_tree() from dissolve_on_fput() from __fput(), which could
> >>>> happen pretty much anywhere depending on where the last reference gets
> >>>> dropped?
> >>> IIRC, that's what Casey argued is the right thing to do from a security PoV.
> >>> Casey?
> >> You need to identify the credential of the subject that triggered
> >> the event. If it isn't current_cred(), the cred needs to be passed
> >> in to post_mount_notification(), or derived by some other means.
> > Taking a step back, why do we care who triggered the event?  It seems to me that we should care whether the event happened and whether the *receiver* is permitted to know that.
>
> There are two filesystems, "dot" and "dash". I am not allowed
> to communicate with Fred on the system, and all precautions have
> been taken to ensure I cannot. Fred asks for notifications on
> all mount activity. I perform actions that result in notifications
> on "dot" and "dash". Fred receives notifications and interprets
> them using Morse code. This is not OK. If Wilma, who *is* allowed
> to communicate with Fred, does the same actions, he should be
> allowed to get the messages via Morse.

In other words, a classic covert channel. You can't really prevent two
cooperating processes from communicating through a covert channel on a
modern computer. You can transmit information through the scheduler,
through hyperthread resource sharing, through CPU data caches, through
disk contention, through page cache state, through RAM contention, and
probably dozens of other ways that I can't think of right now. There
have been plenty of papers that demonstrated things like an SSH
connection between two virtual machines without network access running
on the same physical host (<https://gruss.cc/files/hello.pdf>),
communication between a VM and a browser running on the host system,
and so on.
Casey Schaufler May 29, 2019, 7:28 p.m. UTC | #14
On 5/29/2019 11:11 AM, Jann Horn wrote:
> On Wed, May 29, 2019 at 7:46 PM Casey Schaufler <casey@schaufler-ca.com> wrote:
>> On 5/29/2019 10:13 AM, Andy Lutomirski wrote:
>>>> On May 29, 2019, at 8:53 AM, Casey Schaufler <casey@schaufler-ca.com> wrote:
>>>>> On 5/29/2019 4:00 AM, David Howells wrote:
>>>>> Jann Horn <jannh@google.com> wrote:
>>>>>
>>>>>>> +void post_mount_notification(struct mount *changed,
>>>>>>> +                            struct mount_notification *notify)
>>>>>>> +{
>>>>>>> +       const struct cred *cred = current_cred();
>>>>>> This current_cred() looks bogus to me. Can't mount topology changes
>>>>>> come from all sorts of places? For example, umount_mnt() from
>>>>>> umount_tree() from dissolve_on_fput() from __fput(), which could
>>>>>> happen pretty much anywhere depending on where the last reference gets
>>>>>> dropped?
>>>>> IIRC, that's what Casey argued is the right thing to do from a security PoV.
>>>>> Casey?
>>>> You need to identify the credential of the subject that triggered
>>>> the event. If it isn't current_cred(), the cred needs to be passed
>>>> in to post_mount_notification(), or derived by some other means.
>>> Taking a step back, why do we care who triggered the event?  It seems to me that we should care whether the event happened and whether the *receiver* is permitted to know that.
>> There are two filesystems, "dot" and "dash". I am not allowed
>> to communicate with Fred on the system, and all precautions have
>> been taken to ensure I cannot. Fred asks for notifications on
>> all mount activity. I perform actions that result in notifications
>> on "dot" and "dash". Fred receives notifications and interprets
>> them using Morse code. This is not OK. If Wilma, who *is* allowed
>> to communicate with Fred, does the same actions, he should be
>> allowed to get the messages via Morse.
> In other words, a classic covert channel. You can't really prevent two
> cooperating processes from communicating through a covert channel on a
> modern computer.

That doesn't give you permission to design them in.
Plus, the LSMs that implement mandatory access controls
are going to want to intervene. No unclassified user
should see notifications caused by Top Secret users.

>  You can transmit information through the scheduler,
> through hyperthread resource sharing, through CPU data caches, through
> disk contention, through page cache state, through RAM contention, and
> probably dozens of other ways that I can't think of right now.

Yeah, and there's been a lot of activity to reduce those,
which are hard to exploit, as opposed to this, which would
be trivial and obvious.

> There
> have been plenty of papers that demonstrated things like an SSH
> connection between two virtual machines without network access running
> on the same physical host (<https://gruss.cc/files/hello.pdf>),
> communication between a VM and a browser running on the host system,
> and so on.

So you're saying we shouldn't have mode bits on files because
spectre/meltdown makes them pointless?
Jann Horn May 29, 2019, 7:47 p.m. UTC | #15
On Wed, May 29, 2019 at 9:28 PM Casey Schaufler <casey@schaufler-ca.com> wrote:
> On 5/29/2019 11:11 AM, Jann Horn wrote:
> > On Wed, May 29, 2019 at 7:46 PM Casey Schaufler <casey@schaufler-ca.com> wrote:
> >> On 5/29/2019 10:13 AM, Andy Lutomirski wrote:
> >>>> On May 29, 2019, at 8:53 AM, Casey Schaufler <casey@schaufler-ca.com> wrote:
> >>>>> On 5/29/2019 4:00 AM, David Howells wrote:
> >>>>> Jann Horn <jannh@google.com> wrote:
> >>>>>
> >>>>>>> +void post_mount_notification(struct mount *changed,
> >>>>>>> +                            struct mount_notification *notify)
> >>>>>>> +{
> >>>>>>> +       const struct cred *cred = current_cred();
> >>>>>> This current_cred() looks bogus to me. Can't mount topology changes
> >>>>>> come from all sorts of places? For example, umount_mnt() from
> >>>>>> umount_tree() from dissolve_on_fput() from __fput(), which could
> >>>>>> happen pretty much anywhere depending on where the last reference gets
> >>>>>> dropped?
> >>>>> IIRC, that's what Casey argued is the right thing to do from a security PoV.
> >>>>> Casey?
> >>>> You need to identify the credential of the subject that triggered
> >>>> the event. If it isn't current_cred(), the cred needs to be passed
> >>>> in to post_mount_notification(), or derived by some other means.
> >>> Taking a step back, why do we care who triggered the event?  It seems to me that we should care whether the event happened and whether the *receiver* is permitted to know that.
> >> There are two filesystems, "dot" and "dash". I am not allowed
> >> to communicate with Fred on the system, and all precautions have
> >> been taken to ensure I cannot. Fred asks for notifications on
> >> all mount activity. I perform actions that result in notifications
> >> on "dot" and "dash". Fred receives notifications and interprets
> >> them using Morse code. This is not OK. If Wilma, who *is* allowed
> >> to communicate with Fred, does the same actions, he should be
> >> allowed to get the messages via Morse.
> > In other words, a classic covert channel. You can't really prevent two
> > cooperating processes from communicating through a covert channel on a
> > modern computer.
>
> That doesn't give you permission to design them in.
> Plus, the LSMs that implement mandatory access controls
> are going to want to intervene. No unclassified user
> should see notifications caused by Top Secret users.

But that's probably because they're worried about *side* channels, not
covert channels?

Talking about this in the context of (small) side channels: The
notification types introduced in this patch are mostly things that a
user would be able to observe anyway if they polled /proc/self/mounts,
right? It might make sense to align access controls based on that - if
you don't want it to be possible to observe events happening on some
mount points through this API, you should probably lock down
/proc/*/mounts equivalently, by introducing an LSM hook for "is @cred
allowed to see @mnt" or something like that - and if you want to
compare two cred structures, you could record the cred structure that
is responsible for the creation of the mount point, or something like
that.

For some of the other patches, I guess things get more tricky because
the notification exposes new information that wasn't really available
before.

> >  You can transmit information through the scheduler,
> > through hyperthread resource sharing, through CPU data caches, through
> > disk contention, through page cache state, through RAM contention, and
> > probably dozens of other ways that I can't think of right now.
>
> Yeah, and there's been a lot of activity to reduce those,
> which are hard to exploit, as opposed to this, which would
> be trivial and obvious.
>
> > There
> > have been plenty of papers that demonstrated things like an SSH
> > connection between two virtual machines without network access running
> > on the same physical host (<https://gruss.cc/files/hello.pdf>),
> > communication between a VM and a browser running on the host system,
> > and so on.
>
> So you're saying we shouldn't have mode bits on files because
> spectre/meltdown makes them pointless?

spectre/meltdown are vulnerabilities that are being mitigated.
Microarchitectural covert channels are an accepted fact and I haven't
heard of anyone seriously considering trying to get rid of them all.
Casey Schaufler May 29, 2019, 8:50 p.m. UTC | #16
On 5/29/2019 12:47 PM, Jann Horn wrote:
> On Wed, May 29, 2019 at 9:28 PM Casey Schaufler <casey@schaufler-ca.com> wrote:
>> On 5/29/2019 11:11 AM, Jann Horn wrote:
>>> On Wed, May 29, 2019 at 7:46 PM Casey Schaufler <casey@schaufler-ca.com> wrote:
>>>> On 5/29/2019 10:13 AM, Andy Lutomirski wrote:
>>>>>> On May 29, 2019, at 8:53 AM, Casey Schaufler <casey@schaufler-ca.com> wrote:
>>>>>>> On 5/29/2019 4:00 AM, David Howells wrote:
>>>>>>> Jann Horn <jannh@google.com> wrote:
>>>>>>>
>>>>>>>>> +void post_mount_notification(struct mount *changed,
>>>>>>>>> +                            struct mount_notification *notify)
>>>>>>>>> +{
>>>>>>>>> +       const struct cred *cred = current_cred();
>>>>>>>> This current_cred() looks bogus to me. Can't mount topology changes
>>>>>>>> come from all sorts of places? For example, umount_mnt() from
>>>>>>>> umount_tree() from dissolve_on_fput() from __fput(), which could
>>>>>>>> happen pretty much anywhere depending on where the last reference gets
>>>>>>>> dropped?
>>>>>>> IIRC, that's what Casey argued is the right thing to do from a security PoV.
>>>>>>> Casey?
>>>>>> You need to identify the credential of the subject that triggered
>>>>>> the event. If it isn't current_cred(), the cred needs to be passed
>>>>>> in to post_mount_notification(), or derived by some other means.
>>>>> Taking a step back, why do we care who triggered the event?  It seems to me that we should care whether the event happened and whether the *receiver* is permitted to know that.
>>>> There are two filesystems, "dot" and "dash". I am not allowed
>>>> to communicate with Fred on the system, and all precautions have
>>>> been taken to ensure I cannot. Fred asks for notifications on
>>>> all mount activity. I perform actions that result in notifications
>>>> on "dot" and "dash". Fred receives notifications and interprets
>>>> them using Morse code. This is not OK. If Wilma, who *is* allowed
>>>> to communicate with Fred, does the same actions, he should be
>>>> allowed to get the messages via Morse.
>>> In other words, a classic covert channel. You can't really prevent two
>>> cooperating processes from communicating through a covert channel on a
>>> modern computer.
>> That doesn't give you permission to design them in.
>> Plus, the LSMs that implement mandatory access controls
>> are going to want to intervene. No unclassified user
>> should see notifications caused by Top Secret users.
> But that's probably because they're worried about *side* channels, not
> covert channels?

The security evaluators from the 1990's considered any channel
with greater than 1 bit/second bandwidth a show-stopper. That was
true for covert and side channels. Further, if you knew that a
mechanism had a channel, as this one does, and you didn't fix it,
you didn't get your certificate. If you know about a problem
during the design/implementation phase it's really inexcusable not
to fix it before "completing" the code.

> Talking about this in the context of (small) side channels: The
> notification types introduced in this patch are mostly things that a
> user would be able to observe anyway if they polled /proc/self/mounts,
> right?

It's supposed to be a general mechanism. Of course it would
be simpler if is was restricted to things you can get at via
/proc/self.

>  It might make sense to align access controls based on that - if
> you don't want it to be possible to observe events happening on some
> mount points through this API, you should probably lock down
> /proc/*/mounts equivalently, by introducing an LSM hook for "is @cred
> allowed to see @mnt" or something like that - and if you want to
> compare two cred structures, you could record the cred structure that
> is responsible for the creation of the mount point, or something like
> that.

I'm not going to argue against that.

> For some of the other patches, I guess things get more tricky because
> the notification exposes new information that wasn't really available
> before.

We have to look not just at the information being available,
but the mechanism used. Being able to look at information about
a process in /proc doesn't mean I should be able to look at it
using ptrace(). Access control isn't done on data, it's done on
objects. That I can get information by looking in one object provides
no assurance that I can get it through a different object containing
the same information. This happens in /dev all over the place. A
file with hard links may be accessible by one path but not another.

>
>>>  You can transmit information through the scheduler,
>>> through hyperthread resource sharing, through CPU data caches, through
>>> disk contention, through page cache state, through RAM contention, and
>>> probably dozens of other ways that I can't think of right now.
>> Yeah, and there's been a lot of activity to reduce those,
>> which are hard to exploit, as opposed to this, which would
>> be trivial and obvious.
>>
>>> There
>>> have been plenty of papers that demonstrated things like an SSH
>>> connection between two virtual machines without network access running
>>> on the same physical host (<https://gruss.cc/files/hello.pdf>),
>>> communication between a VM and a browser running on the host system,
>>> and so on.
>> So you're saying we shouldn't have mode bits on files because
>> spectre/meltdown makes them pointless?
> spectre/meltdown are vulnerabilities that are being mitigated.
> Microarchitectural covert channels are an accepted fact and I haven't
> heard of anyone seriously considering trying to get rid of them all.
Andy Lutomirski May 29, 2019, 11:12 p.m. UTC | #17
> On May 29, 2019, at 10:46 AM, Casey Schaufler <casey@schaufler-ca.com> wrote:
> 
>> On 5/29/2019 10:13 AM, Andy Lutomirski wrote:
>> 
>>>> On May 29, 2019, at 8:53 AM, Casey Schaufler <casey@schaufler-ca.com> wrote:
>>>> 
>>>> On 5/29/2019 4:00 AM, David Howells wrote:
>>>> Jann Horn <jannh@google.com> wrote:
>>>> 
>>>>>> +void post_mount_notification(struct mount *changed,
>>>>>> +                            struct mount_notification *notify)
>>>>>> +{
>>>>>> +       const struct cred *cred = current_cred();
>>>>> This current_cred() looks bogus to me. Can't mount topology changes
>>>>> come from all sorts of places? For example, umount_mnt() from
>>>>> umount_tree() from dissolve_on_fput() from __fput(), which could
>>>>> happen pretty much anywhere depending on where the last reference gets
>>>>> dropped?
>>>> IIRC, that's what Casey argued is the right thing to do from a security PoV.
>>>> Casey?
>>> You need to identify the credential of the subject that triggered
>>> the event. If it isn't current_cred(), the cred needs to be passed
>>> in to post_mount_notification(), or derived by some other means.
>> Taking a step back, why do we care who triggered the event?  It seems to me that we should care whether the event happened and whether the *receiver* is permitted to know that.
> 
> There are two filesystems, "dot" and "dash". I am not allowed
> to communicate with Fred on the system, and all precautions have
> been taken to ensure I cannot. Fred asks for notifications on
> all mount activity. I perform actions that result in notifications
> on "dot" and "dash". Fred receives notifications and interprets
> them using Morse code. This is not OK. If Wilma, who *is* allowed
> to communicate with Fred, does the same actions, he should be
> allowed to get the messages via Morse.

Under this scenario, Fred should not be allowed to enable these watches. If you give yourself and Fred unconstrained access to the same FS, then can communicate.

> 
> Other security modelers may disagree. The models they produce
> are going to be *very* complicated and will introduce agents and
> intermediate objects to justify Fred's reception of an event as
> a read operation.

I disagree. They’ll model the watch as something to prevent if they want to restrict communication.

> 
>> (And receiver means whoever subscribed, presumably, not whoever called read() or mmap().)
> 
> The receiver is the process that gets the event. There may
> be more than one receiver, and the receivers may have different
> credentials. Each needs to be checked separately.

I think it’s a bit crazy to have the same event queue with two readers who read different things.
Casey Schaufler May 29, 2019, 11:56 p.m. UTC | #18
On 5/29/2019 4:12 PM, Andy Lutomirski wrote:
>
>> On May 29, 2019, at 10:46 AM, Casey Schaufler <casey@schaufler-ca.com> wrote:
>>
>>> On 5/29/2019 10:13 AM, Andy Lutomirski wrote:
>>>
>>>>> On May 29, 2019, at 8:53 AM, Casey Schaufler <casey@schaufler-ca.com> wrote:
>>>>>
>>>>> On 5/29/2019 4:00 AM, David Howells wrote:
>>>>> Jann Horn <jannh@google.com> wrote:
>>>>>
>>>>>>> +void post_mount_notification(struct mount *changed,
>>>>>>> +                            struct mount_notification *notify)
>>>>>>> +{
>>>>>>> +       const struct cred *cred = current_cred();
>>>>>> This current_cred() looks bogus to me. Can't mount topology changes
>>>>>> come from all sorts of places? For example, umount_mnt() from
>>>>>> umount_tree() from dissolve_on_fput() from __fput(), which could
>>>>>> happen pretty much anywhere depending on where the last reference gets
>>>>>> dropped?
>>>>> IIRC, that's what Casey argued is the right thing to do from a security PoV.
>>>>> Casey?
>>>> You need to identify the credential of the subject that triggered
>>>> the event. If it isn't current_cred(), the cred needs to be passed
>>>> in to post_mount_notification(), or derived by some other means.
>>> Taking a step back, why do we care who triggered the event?  It seems to me that we should care whether the event happened and whether the *receiver* is permitted to know that.
>> There are two filesystems, "dot" and "dash". I am not allowed
>> to communicate with Fred on the system, and all precautions have
>> been taken to ensure I cannot. Fred asks for notifications on
>> all mount activity. I perform actions that result in notifications
>> on "dot" and "dash". Fred receives notifications and interprets
>> them using Morse code. This is not OK. If Wilma, who *is* allowed
>> to communicate with Fred, does the same actions, he should be
>> allowed to get the messages via Morse.
> Under this scenario, Fred should not be allowed to enable these watches. If you give yourself and Fred unconstrained access to the same FS, then can communicate.

How are you going to determine at the time Fred tries to enable the watches
that I am going to do something that will trigger them? I'm not saying it isn't
possible, I'm curious how you would propose doing it. If you deny Fred the ability
to set watches because it is possible for me to trigger them, he can't use them
to get information from Wilma, either.

>
>> Other security modelers may disagree. The models they produce
>> are going to be *very* complicated and will introduce agents and
>> intermediate objects to justify Fred's reception of an event as
>> a read operation.
> I disagree. They’ll model the watch as something to prevent if they want to restrict communication.

Sorry, but that isn't sufficiently detailed to be meaningful.

>>> (And receiver means whoever subscribed, presumably, not whoever called read() or mmap().)
>> The receiver is the process that gets the event. There may
>> be more than one receiver, and the receivers may have different
>> credentials. Each needs to be checked separately.
> I think it’s a bit crazy to have the same event queue with two readers who read different things.

Look at killpg(3).

The process that creates the event has to be involved in the
access decision. Otherwise you have an uncontrolled data channel.
When the receiver reads the event queue it knows nothing about the
sender, and hence cannot make the decision unless the credential of
the sender is kept with the event message, and used when the
receiver tries to access it. I don't think that wold work well with
the mechanism as designed.
David Howells June 3, 2019, 4:30 p.m. UTC | #19
Casey Schaufler <casey@schaufler-ca.com> wrote:

> >> should be used. Someone or something caused the event. It can
> >> be important who it was.
> > The kernel's normal security model means that you should be able to
> > e.g. accept FDs that random processes send you and perform
> > read()/write() calls on them without acting as a subject in any
> > security checks; let alone close().
> 
> Passed file descriptors are an anomaly in the security model
> that (in this developer's opinion) should have never been
> included. More than one of the "B" level UNIX systems disabled
> them outright. 

Considering further on this, I think the only way to implement what you're
suggesting is to add a field to struct file to record the last fputter's creds
as the procedure of fputting is offloaded to a workqueue.

Note that's last fputter, not the last closer, as we don't track the number of
open fds linked to a file struct.

In the case of AF_UNIX sockets that contain in-the-process-of-being-passed fds
at the time of closure, this is further complicated by the socket fput being
achieved in the work item - thereby adding layers of indirection.

It might be possible to replace f_cred rather than adding a new field, but
that might get used somewhere after that point.

Note also that fsnotify_close() doesn't appear to use the last fputter's path
since it's not available if called from deferred fput.

David
diff mbox series

Patch

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 03decae51513..a8416a9a0ccb 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -439,3 +439,4 @@ 
 432	i386	fsmount			sys_fsmount			__ia32_sys_fsmount
 433	i386	fspick			sys_fspick			__ia32_sys_fspick
 434	i386	fsinfo			sys_fsinfo			__ia32_sys_fsinfo
+435	i386	mount_notify		sys_mount_notify		__ia32_sys_mount_notify
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index ea63df9a1020..ea052a94eb97 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -356,6 +356,7 @@ 
 432	common	fsmount			__x64_sys_fsmount
 433	common	fspick			__x64_sys_fspick
 434	common	fsinfo			__x64_sys_fsinfo
+435	common	mount_notify		__x64_sys_mount_notify
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/Kconfig b/fs/Kconfig
index 9e7d2f2c0111..a26bbe27a791 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -121,6 +121,15 @@  source "fs/crypto/Kconfig"
 
 source "fs/notify/Kconfig"
 
+config MOUNT_NOTIFICATIONS
+	bool "Mount topology change notifications"
+	select WATCH_QUEUE
+	help
+	  This option provides support for getting change notifications on the
+	  mount tree topology.  This makes use of the /dev/watch_queue misc
+	  device to handle the notification buffer and provides the
+	  mount_notify() system call to enable/disable watchpoints.
+
 source "fs/quota/Kconfig"
 
 source "fs/autofs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index 26eaeae4b9a1..c6a71daf2464 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -131,3 +131,4 @@  obj-$(CONFIG_F2FS_FS)		+= f2fs/
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
 obj-$(CONFIG_EFIVAR_FS)		+= efivarfs/
+obj-$(CONFIG_MOUNT_NOTIFICATIONS) += mount_notify.o
diff --git a/fs/mount.h b/fs/mount.h
index 47795802f78e..a95b805d00d8 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -4,6 +4,7 @@ 
 #include <linux/poll.h>
 #include <linux/ns_common.h>
 #include <linux/fs_pin.h>
+#include <linux/watch_queue.h>
 
 struct mnt_namespace {
 	atomic_t		count;
@@ -67,9 +68,13 @@  struct mount {
 	int mnt_id;			/* mount identifier */
 	int mnt_group_id;		/* peer group identifier */
 	int mnt_expiry_mark;		/* true if marked for expiry */
+	int mnt_nr_watchers;		/* The number of subtree watches tracking this */
 	struct hlist_head mnt_pins;
 	struct fs_pin mnt_umount;
 	struct dentry *mnt_ex_mountpoint;
+#ifdef CONFIG_MOUNT_NOTIFICATIONS
+	struct watch_list *mnt_watchers; /* Watches on dentries within this mount */
+#endif
 	atomic_t mnt_notify_counter;	/* Number of notifications generated */
 } __randomize_layout;
 
@@ -153,18 +158,8 @@  static inline bool is_anon_ns(struct mnt_namespace *ns)
 	return ns->seq == 0;
 }
 
-/*
- * Type of mount topology change notification.
- */
-enum mount_notification_subtype {
-	NOTIFY_MOUNT_NEW_MOUNT	= 0, /* New mount added */
-	NOTIFY_MOUNT_UNMOUNT	= 1, /* Mount removed manually */
-	NOTIFY_MOUNT_EXPIRY	= 2, /* Automount expired */
-	NOTIFY_MOUNT_READONLY	= 3, /* Mount R/O state changed */
-	NOTIFY_MOUNT_SETATTR	= 4, /* Mount attributes changed */
-	NOTIFY_MOUNT_MOVE_FROM	= 5, /* Mount moved from here */
-	NOTIFY_MOUNT_MOVE_TO	= 6, /* Mount moved to here (compare op_id) */
-};
+extern void post_mount_notification(struct mount *changed,
+				    struct mount_notification *notify);
 
 static inline void notify_mount(struct mount *changed,
 				struct mount *aux,
@@ -172,4 +167,18 @@  static inline void notify_mount(struct mount *changed,
 				u32 info_flags)
 {
 	atomic_inc(&changed->mnt_notify_counter);
+
+#ifdef CONFIG_MOUNT_NOTIFICATIONS
+	{
+		struct mount_notification n = {
+			.watch.type	= WATCH_TYPE_MOUNT_NOTIFY,
+			.watch.subtype	= subtype,
+			.watch.info	= info_flags | sizeof(n),
+			.triggered_on	= changed->mnt_id,
+			.changed_mount	= aux ? aux->mnt_id : 0,
+		};
+
+		post_mount_notification(changed, &n);
+	}
+#endif
 }
diff --git a/fs/mount_notify.c b/fs/mount_notify.c
new file mode 100644
index 000000000000..6c7f323dbd4f
--- /dev/null
+++ b/fs/mount_notify.c
@@ -0,0 +1,178 @@ 
+/* Provide mount topology/attribute change notifications.
+ *
+ * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include "mount.h"
+
+/*
+ * Post mount notifications to all watches going rootwards along the tree.
+ *
+ * Must be called with the mount_lock held.
+ */
+void post_mount_notification(struct mount *changed,
+			     struct mount_notification *notify)
+{
+	const struct cred *cred = current_cred();
+	struct path cursor;
+	struct mount *mnt;
+	unsigned seq;
+
+	seq = 0;
+	rcu_read_lock();
+restart:
+	cursor.mnt = &changed->mnt;
+	cursor.dentry = changed->mnt.mnt_root;
+	mnt = real_mount(cursor.mnt);
+	notify->watch.info &= ~WATCH_INFO_IN_SUBTREE;
+
+	read_seqbegin_or_lock(&rename_lock, &seq);
+	for (;;) {
+		if (mnt->mnt_watchers &&
+		    !hlist_empty(&mnt->mnt_watchers->watchers)) {
+			if (cursor.dentry->d_flags & DCACHE_MOUNT_WATCH)
+				post_watch_notification(mnt->mnt_watchers,
+							&notify->watch, cred,
+							(unsigned long)cursor.dentry);
+		} else {
+			cursor.dentry = mnt->mnt.mnt_root;
+		}
+		notify->watch.info |= WATCH_INFO_IN_SUBTREE;
+
+		if (cursor.dentry == cursor.mnt->mnt_root ||
+		    IS_ROOT(cursor.dentry)) {
+			struct mount *parent = READ_ONCE(mnt->mnt_parent);
+
+			/* Escaped? */
+			if (cursor.dentry != cursor.mnt->mnt_root)
+				break;
+
+			/* Global root? */
+			if (mnt != parent) {
+				cursor.dentry = READ_ONCE(mnt->mnt_mountpoint);
+				mnt = parent;
+				cursor.mnt = &mnt->mnt;
+				continue;
+			}
+			break;
+		}
+
+		cursor.dentry = cursor.dentry->d_parent;
+	}
+
+	if (need_seqretry(&rename_lock, seq)) {
+		seq = 1;
+		goto restart;
+	}
+
+	done_seqretry(&rename_lock, seq);
+	rcu_read_unlock();
+}
+
+static void release_mount_watch(struct watch_list *wlist, struct watch *watch)
+{
+	struct vfsmount *mnt = watch->private;
+	struct dentry *dentry = (struct dentry *)(unsigned long)watch->id;
+
+	dput(dentry);
+	mntput(mnt);
+}
+
+/**
+ * sys_mount_notify - Watch for mount topology/attribute changes
+ * @dfd: Base directory to pathwalk from or fd referring to mount.
+ * @filename: Path to mount to place the watch upon
+ * @at_flags: Pathwalk control flags
+ * @watch_fd: The watch queue to send notifications to.
+ * @watch_id: The watch ID to be placed in the notification (-1 to remove watch)
+ */
+SYSCALL_DEFINE5(mount_notify,
+		int, dfd,
+		const char __user *, filename,
+		unsigned int, at_flags,
+		int, watch_fd,
+		int, watch_id)
+{
+	struct watch_queue *wqueue;
+	struct watch_list *wlist = NULL;
+	struct watch *watch;
+	struct mount *m;
+	struct path path;
+	int ret;
+
+	if (watch_id < -1 || watch_id > 0xff)
+		return -EINVAL;
+
+	ret = user_path_at(dfd, filename, at_flags, &path);
+	if (ret)
+		return ret;
+
+	wqueue = get_watch_queue(watch_fd);
+	if (IS_ERR(wqueue))
+		goto err_path;
+
+	m = real_mount(path.mnt);
+
+	if (watch_id >= 0) {
+		if (!m->mnt_watchers) {
+			wlist = kzalloc(sizeof(*wlist), GFP_KERNEL);
+			if (!wlist)
+				goto err_wqueue;
+			INIT_HLIST_HEAD(&wlist->watchers);
+			spin_lock_init(&wlist->lock);
+			wlist->release_watch = release_mount_watch;
+		}
+
+		watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+		if (!watch)
+			goto err_wlist;
+
+		init_watch(watch, wqueue);
+		watch->id		= (unsigned long)path.dentry;
+		watch->private		= path.mnt;
+		watch->info_id		= (u32)watch_id << 24;
+
+		down_write(&m->mnt.mnt_sb->s_umount);
+		if (!m->mnt_watchers) {
+			m->mnt_watchers = wlist;
+			wlist = NULL;
+		}
+
+		ret = add_watch_to_object(watch, m->mnt_watchers);
+		if (ret == 0) {
+			spin_lock(&path.dentry->d_lock);
+			path.dentry->d_flags |= DCACHE_MOUNT_WATCH;
+			spin_unlock(&path.dentry->d_lock);
+			path_get(&path);
+		}
+		up_write(&m->mnt.mnt_sb->s_umount);
+		if (ret < 0)
+			kfree(watch);
+	} else if (m->mnt_watchers) {
+		down_write(&m->mnt.mnt_sb->s_umount);
+		ret = remove_watch_from_object(m->mnt_watchers, wqueue,
+					       (unsigned long)path.dentry,
+					       false);
+		up_write(&m->mnt.mnt_sb->s_umount);
+	} else {
+		ret = -EBADSLT;
+	}
+
+err_wlist:
+	kfree(wlist);
+err_wqueue:
+	put_watch_queue(wqueue);
+err_path:
+	path_put(&path);
+	return ret;
+}
diff --git a/fs/namespace.c b/fs/namespace.c
index ae03066b2d9b..de778b2e8ec4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -515,7 +515,8 @@  static int mnt_make_readonly(struct mount *mnt)
 	mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
 	unlock_mount_hash();
 	if (ret == 0)
-		notify_mount(mnt, NULL, NOTIFY_MOUNT_READONLY, 0x10000);
+		notify_mount(mnt, NULL, NOTIFY_MOUNT_READONLY,
+			     WATCH_INFO_FLAG_0);
 	return ret;
 }
 
@@ -1478,6 +1479,10 @@  static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
 		list_del_init(&p->mnt_expire);
 		list_del_init(&p->mnt_list);
 
+#ifdef CONFIG_MOUNT_NOTIFICATIONS
+		if (p->mnt_watchers)
+			remove_watch_list(p->mnt_watchers);
+#endif
 		ns = p->mnt_ns;
 		if (ns) {
 			ns->mounts--;
@@ -2115,7 +2120,7 @@  static int attach_recursive_mnt(struct mount *source_mnt,
 		mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt);
 		notify_mount(dest_mnt, source_mnt, NOTIFY_MOUNT_NEW_MOUNT,
 			     source_mnt->mnt.mnt_sb->s_flags & SB_SUBMOUNT ?
-			     0x10000 : 0);
+			     WATCH_INFO_FLAG_0 : 0);
 		commit_tree(source_mnt);
 	}
 
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 361305ddd75e..5db8e244d9a0 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -217,6 +217,7 @@  struct dentry_operations {
 #define DCACHE_PAR_LOOKUP		0x10000000 /* being looked up (with parent locked shared) */
 #define DCACHE_DENTRY_CURSOR		0x20000000
 #define DCACHE_NORCU			0x40000000 /* No RCU delay for freeing */
+#define DCACHE_MOUNT_WATCH		0x80000000 /* There's a mount watch here */
 
 extern seqlock_t rename_lock;
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 217d25b62b4f..7c2b66175f3c 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1001,6 +1001,8 @@  asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
 asmlinkage long sys_fsinfo(int dfd, const char __user *path,
 			   struct fsinfo_params __user *params,
 			   void __user *buffer, size_t buf_size);
+asmlinkage long sys_mount_notify(int dfd, const char __user *path,
+				 unsigned int at_flags, int watch_fd, int watch_id);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h
index e3bb35a480ae..388b4141bcee 100644
--- a/include/uapi/linux/watch_queue.h
+++ b/include/uapi/linux/watch_queue.h
@@ -104,4 +104,28 @@  struct key_notification {
 	__u32	aux;		/* Per-type auxiliary data */
 };
 
+/*
+ * Type of mount topology change notification.
+ */
+enum mount_notification_subtype {
+	NOTIFY_MOUNT_NEW_MOUNT	= 0, /* New mount added */
+	NOTIFY_MOUNT_UNMOUNT	= 1, /* Mount removed manually */
+	NOTIFY_MOUNT_EXPIRY	= 2, /* Automount expired */
+	NOTIFY_MOUNT_READONLY	= 3, /* Mount R/O state changed */
+	NOTIFY_MOUNT_SETATTR	= 4, /* Mount attributes changed */
+	NOTIFY_MOUNT_MOVE_FROM	= 5, /* Mount moved from here */
+	NOTIFY_MOUNT_MOVE_TO	= 6, /* Mount moved to here (compare op_id) */
+};
+
+/*
+ * Mount topology/configuration change notification record.
+ * - watch.type = WATCH_TYPE_MOUNT_NOTIFY
+ * - watch.subtype = enum mount_notification_subtype
+ */
+struct mount_notification {
+	struct watch_notification watch; /* WATCH_TYPE_MOUNT_NOTIFY */
+	__u32	triggered_on;		/* The mount that the notify was on */
+	__u32	changed_mount;		/* The mount that got changed */
+};
+
 #endif /* _UAPI_LINUX_WATCH_QUEUE_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d1d9d76cae1e..97b025e7863c 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -88,6 +88,9 @@  COND_SYSCALL(ioprio_get);
 /* fs/locks.c */
 COND_SYSCALL(flock);
 
+/* fs/mount_notify.c */
+COND_SYSCALL(mount_notify);
+
 /* fs/namei.c */
 
 /* fs/namespace.c */