
[15/19] vfs: Add superblock notifications [ver #16]

Message ID 158204561120.3299825.5242636508455859327.stgit@warthog.procyon.org.uk (mailing list archive)
State New, archived
Series VFS: Filesystem information and notifications [ver #16]

Commit Message

David Howells Feb. 18, 2020, 5:06 p.m. UTC
Add a superblock event notification facility whereby notifications about
superblock events, such as I/O errors (EIO), quota limits being hit
(EDQUOT) and running out of space (ENOSPC) can be reported to a monitoring
process asynchronously.  Note that this does not cover vfsmount topology
changes.  watch_mount() is used for that.

Firstly, an event queue needs to be created:

	fd = open("/dev/event_queue", O_RDWR);
	ioctl(fd, IOC_WATCH_QUEUE_SET_SIZE, page_size << n);

then a watch can be set up to report notifications via that queue:

	struct watch_notification_filter filter = {
		.nr_filters = 1,
		.filters = {
			[0] = {
				.type = WATCH_TYPE_SB_NOTIFY,
				.subtype_filter[0] = UINT_MAX,
			},
		},
	};
	ioctl(fd, IOC_WATCH_QUEUE_SET_FILTER, &filter);
	watch_sb(AT_FDCWD, "/home/dhowells", 0, fd, 0x03);

In this case, it would let me monitor my own homedir for events.  After
setting the watch, records will be placed into the queue when, for example,
a superblock switches between read-write and read-only.  Records are of
the following format:

	struct superblock_notification {
		struct watch_notification watch;
		__u64	sb_id;
	} *n;

Where:

	n->watch.type will be WATCH_TYPE_SB_NOTIFY.

	n->watch.subtype will indicate the type of event, such as
	NOTIFY_SUPERBLOCK_READONLY.

	n->watch.info & WATCH_INFO_LENGTH will indicate the length of the
	record.

	n->watch.info & WATCH_INFO_ID will be the fifth argument to
	watch_sb(), shifted.

	n->watch.info & NOTIFY_SUPERBLOCK_IS_NOW_RO will be used for
	NOTIFY_SUPERBLOCK_READONLY, being set if the superblock becomes
	R/O, and being cleared otherwise.

	n->sb_id will be the ID of the superblock, as can be retrieved with
	the fsinfo() syscall, as part of the fsinfo_sb_notifications
	attribute in the watch_id field.

Note that it is permissible for event records to be of variable length -
or, at least, the length may be dependent on the subtype.  Note also that
the queue can be shared between multiple notifications of various types.
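
As a rough illustration only (not part of this patch), a monitoring process
might decode such records along the following lines, using the UAPI
definitions added below.  How records are actually pulled out of the queue is
glossed over here, handle_sb_record() is just an illustrative name, and the
watch_sb() wrapper assumes the raw syscall number added by this series, as
there is no libc wrapper:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/watch_queue.h>

	/* No libc wrapper yet; __NR_watch_sb is the number added by this series. */
	static long watch_sb(int dfd, const char *path, unsigned int at_flags,
			     int watch_fd, int watch_id)
	{
		return syscall(__NR_watch_sb, dfd, path, at_flags, watch_fd, watch_id);
	}

	/* Decode one record that has been extracted from the notification queue. */
	static void handle_sb_record(const struct watch_notification *w)
	{
		const struct superblock_notification *n = (const void *)w;

		if (w->type != WATCH_TYPE_SB_NOTIFY)
			return;

		switch (w->subtype) {
		case NOTIFY_SUPERBLOCK_READONLY:
			printf("sb %llx is now %s\n",
			       (unsigned long long)n->sb_id,
			       (w->info & NOTIFY_SUPERBLOCK_IS_NOW_RO) ? "R/O" : "R/W");
			break;
		case NOTIFY_SUPERBLOCK_ERROR: {
			const struct superblock_error_notification *e = (const void *)w;
			printf("sb %llx: error %u (cookie %u)\n",
			       (unsigned long long)e->s.sb_id,
			       e->error_number, e->error_cookie);
			break;
		}
		case NOTIFY_SUPERBLOCK_EDQUOT:
			printf("sb %llx: quota limit hit\n",
			       (unsigned long long)n->sb_id);
			break;
		}
	}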

Signed-off-by: David Howells <dhowells@redhat.com>
---

 arch/alpha/kernel/syscalls/syscall.tbl      |    1 
 arch/arm/tools/syscall.tbl                  |    1 
 arch/arm64/include/asm/unistd.h             |    2 
 arch/ia64/kernel/syscalls/syscall.tbl       |    1 
 arch/m68k/kernel/syscalls/syscall.tbl       |    1 
 arch/microblaze/kernel/syscalls/syscall.tbl |    1 
 arch/mips/kernel/syscalls/syscall_n32.tbl   |    1 
 arch/mips/kernel/syscalls/syscall_n64.tbl   |    1 
 arch/mips/kernel/syscalls/syscall_o32.tbl   |    1 
 arch/parisc/kernel/syscalls/syscall.tbl     |    1 
 arch/powerpc/kernel/syscalls/syscall.tbl    |    1 
 arch/s390/kernel/syscalls/syscall.tbl       |    1 
 arch/sh/kernel/syscalls/syscall.tbl         |    1 
 arch/sparc/kernel/syscalls/syscall.tbl      |    1 
 arch/x86/entry/syscalls/syscall_32.tbl      |    1 
 arch/x86/entry/syscalls/syscall_64.tbl      |    1 
 arch/xtensa/kernel/syscalls/syscall.tbl     |    1 
 fs/Kconfig                                  |   12 +++
 fs/super.c                                  |  125 +++++++++++++++++++++++++++
 include/linux/fs.h                          |   77 +++++++++++++++++
 include/linux/syscalls.h                    |    2 
 include/uapi/asm-generic/unistd.h           |    4 +
 include/uapi/linux/watch_queue.h            |   31 ++++++-
 kernel/sys_ni.c                             |    1 
 24 files changed, 267 insertions(+), 3 deletions(-)

Comments

Jann Horn Feb. 19, 2020, 11:08 p.m. UTC | #1
On Tue, Feb 18, 2020 at 6:07 PM David Howells <dhowells@redhat.com> wrote:
> Add a superblock event notification facility whereby notifications about
> superblock events, such as I/O errors (EIO), quota limits being hit
> (EDQUOT) and running out of space (ENOSPC) can be reported to a monitoring
> process asynchronously.  Note that this does not cover vfsmount topology
> changes.  watch_mount() is used for that.
[...]
> @@ -354,6 +356,10 @@ void deactivate_locked_super(struct super_block *s)
>  {
>         struct file_system_type *fs = s->s_type;
>         if (atomic_dec_and_test(&s->s_active)) {
> +#ifdef CONFIG_SB_NOTIFICATIONS
> +               if (s->s_watchers)
> +                       remove_watch_list(s->s_watchers, s->s_unique_id);
> +#endif
>                 cleancache_invalidate_fs(s);
>                 unregister_shrinker(&s->s_shrink);
>                 fs->kill_sb(s);
[...]
> +/**
> + * sys_watch_sb - Watch for superblock events.
> + * @dfd: Base directory to pathwalk from or fd referring to superblock.
> + * @filename: Path to superblock to place the watch upon
> + * @at_flags: Pathwalk control flags
> + * @watch_fd: The watch queue to send notifications to.
> + * @watch_id: The watch ID to be placed in the notification (-1 to remove watch)
> + */
> +SYSCALL_DEFINE5(watch_sb,
> +               int, dfd,
> +               const char __user *, filename,
> +               unsigned int, at_flags,
> +               int, watch_fd,
> +               int, watch_id)
> +{
> +       struct watch_queue *wqueue;
> +       struct super_block *s;
> +       struct watch_list *wlist = NULL;
> +       struct watch *watch = NULL;
> +       struct path path;
> +       unsigned int lookup_flags =
> +               LOOKUP_DIRECTORY | LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
> +       int ret;
[...]
> +       wqueue = get_watch_queue(watch_fd);
> +       if (IS_ERR(wqueue))
> +               goto err_path;
> +
> +       s = path.dentry->d_sb;
> +       if (watch_id >= 0) {
> +               ret = -ENOMEM;
> +               if (!s->s_watchers) {

READ_ONCE() ?

> +                       wlist = kzalloc(sizeof(*wlist), GFP_KERNEL);
> +                       if (!wlist)
> +                               goto err_wqueue;
> +                       init_watch_list(wlist, NULL);
> +               }
> +
> +               watch = kzalloc(sizeof(*watch), GFP_KERNEL);
> +               if (!watch)
> +                       goto err_wlist;
> +
> +               init_watch(watch, wqueue);
> +               watch->id               = s->s_unique_id;
> +               watch->private          = s;
> +               watch->info_id          = (u32)watch_id << 24;
> +
> +               ret = security_watch_sb(watch, s);
> +               if (ret < 0)
> +                       goto err_watch;
> +
> +               down_write(&s->s_umount);
> +               ret = -EIO;
> +               if (atomic_read(&s->s_active)) {
> +                       if (!s->s_watchers) {
> +                               s->s_watchers = wlist;
> +                               wlist = NULL;
> +                       }
> +
> +                       ret = add_watch_to_object(watch, s->s_watchers);
> +                       if (ret == 0) {
> +                               spin_lock(&sb_lock);
> +                               s->s_count++;
> +                               spin_unlock(&sb_lock);

Where is the corresponding decrement of s->s_count? I'm guessing that
it should be in the ->release_watch() handler, except that there isn't
one...

> +                               watch = NULL;
> +                       }
> +               }
> +               up_write(&s->s_umount);
> +       } else {
> +               ret = -EBADSLT;
> +               if (READ_ONCE(s->s_watchers)) {

(Nit: I don't get why you do a lockless check here before taking the
lock - it'd be more straightforward to take the lock first, and it's
not like you want to optimize for the case where someone calls
sys_watch_sb() with invalid arguments...)

> +                       down_write(&s->s_umount);
> +                       ret = remove_watch_from_object(s->s_watchers, wqueue,
> +                                                      s->s_unique_id, false);
> +                       up_write(&s->s_umount);
> +               }
> +       }
> +
> +err_watch:
> +       kfree(watch);
> +err_wlist:
> +       kfree(wlist);
> +err_wqueue:
> +       put_watch_queue(wqueue);
> +err_path:
> +       path_put(&path);
> +       return ret;
> +}
> +#endif
[...]
> +/**
> + * notify_sb: Post simple superblock notification.
> + * @s: The superblock the notification is about.
> + * @subtype: The type of notification.
> + * @info: WATCH_INFO_FLAG_* flags to be set in the record.
> + */
> +static inline void notify_sb(struct super_block *s,
> +                            enum superblock_notification_type subtype,
> +                            u32 info)
> +{
> +#ifdef CONFIG_SB_NOTIFICATIONS
> +       if (unlikely(s->s_watchers)) {

READ_ONCE() ?

> +               struct superblock_notification n = {
> +                       .watch.type     = WATCH_TYPE_SB_NOTIFY,
> +                       .watch.subtype  = subtype,
> +                       .watch.info     = watch_sizeof(n) | info,
> +                       .sb_id          = s->s_unique_id,
> +               };
> +
> +               post_sb_notification(s, &n);
> +       }
> +
> +#endif
> +}
> +
> +/**
> + * notify_sb_error: Post superblock error notification.
> + * @s: The superblock the notification is about.
> + * @error: The error number to be recorded.
> + */
> +static inline int notify_sb_error(struct super_block *s, int error)
> +{
> +#ifdef CONFIG_SB_NOTIFICATIONS
> +       if (unlikely(s->s_watchers)) {

READ_ONCE() ?

> +               struct superblock_error_notification n = {
> +                       .s.watch.type   = WATCH_TYPE_SB_NOTIFY,
> +                       .s.watch.subtype = NOTIFY_SUPERBLOCK_ERROR,
> +                       .s.watch.info   = watch_sizeof(n),
> +                       .s.sb_id        = s->s_unique_id,
> +                       .error_number   = error,
> +                       .error_cookie   = 0,
> +               };
> +
> +               post_sb_notification(s, &n.s);
> +       }
> +#endif
> +       return error;
> +}
David Howells Feb. 21, 2020, 2:23 p.m. UTC | #2
Jann Horn <jannh@google.com> wrote:

> > +               if (!s->s_watchers) {
> 
> READ_ONCE() ?

I'm not sure it matters.  It can only be set once, and the next time we read
it we're inside the lock.  And at this point, I don't actually dereference it,
and if it's non-NULL, it's not going to change.

> > +                       ret = add_watch_to_object(watch, s->s_watchers);
> > +                       if (ret == 0) {
> > +                               spin_lock(&sb_lock);
> > +                               s->s_count++;
> > +                               spin_unlock(&sb_lock);
> 
> Where is the corresponding decrement of s->s_count? I'm guessing that
> it should be in the ->release_watch() handler, except that there isn't
> one...

Um.  Good question.  I think this should do the job:

	static void sb_release_watch(struct watch *watch)
	{
		put_super(watch->private);
	}

And this then has to be set later:

	init_watch_list(wlist, sb_release_watch);

> > +       } else {
> > +               ret = -EBADSLT;
> > +               if (READ_ONCE(s->s_watchers)) {
> 
> (Nit: I don't get why you do a lockless check here before taking the
> lock - it'd be more straightforward to take the lock first, and it's
> not like you want to optimize for the case where someone calls
> sys_watch_sb() with invalid arguments...)

Fair enough.  I'll remove it.

> > +#ifdef CONFIG_SB_NOTIFICATIONS
> > +       if (unlikely(s->s_watchers)) {
> 
> READ_ONCE() ?

Shouldn't matter.  It's only read once and then a decision is made on it
immediately thereafter.  And if it's non-NULL, the value cannot change
thereafter.

David
Jann Horn Feb. 21, 2020, 3:44 p.m. UTC | #3
On Fri, Feb 21, 2020 at 3:24 PM David Howells <dhowells@redhat.com> wrote:
>
> Jann Horn <jannh@google.com> wrote:
>
> > > +               if (!s->s_watchers) {
> >
> > READ_ONCE() ?
>
> I'm not sure it matters.  It can only be set once, and the next time we read
> it we're inside the lock.  And at this point, I don't actually dereference it,
> and if it's non-NULL, it's not going to change.

I'd really like these READ_ONCE() things to be *anywhere* the value
can concurrently change, for two reasons:

First, it tells the reader "keep in mind that this value may
concurrently change in some way, don't just assume that it'll stay the
same".

But also, it tells the compiler that if it generates multiple loads
here and assumes that they return the same value, *really* bad stuff
may happen. GCC has some really fun behavior when compiling a switch()
on a value that might change concurrently without using READ_ONCE():
It sometimes generates multiple loads, where the first load is used to
test whether the value is in a specific range and then the second load
is used for actually indexing into a table of jump destinations. If
the value is concurrently mutated from an in-bounds value to an
out-of-bounds value, this code will load a jump destination from
random out-of-bounds memory.

An example:

$ cat gcc-jump.c
int blah(int *x, int y) {
  switch (*x) {
    case 0: return y+1;
    case 1: return y*2;
    case 2: return y-3;
    case 3: return y^1;
    case 4: return y+6;
    case 5: return y-5;
    case 6: return y|1;
    case 7: return y&4;
    case 8: return y|5;
    case 9: return y-3;
    case 10: return y&8;
    case 11: return y|9;
    default: return y;
  }
}
$ gcc-9 -O2 -c -o gcc-jump.o gcc-jump.c
$ objdump -dr gcc-jump.o
[...]
0000000000000000 <blah>:
   0: 83 3f 0b              cmpl   $0xb,(%rdi)
   3: 0f 87 00 00 00 00    ja     9 <blah+0x9>
5: R_X86_64_PC32 .text.unlikely-0x4
   9: 8b 07                mov    (%rdi),%eax
   b: 48 8d 15 00 00 00 00 lea    0x0(%rip),%rdx        # 12 <blah+0x12>
e: R_X86_64_PC32 .rodata-0x4
  12: 48 63 04 82          movslq (%rdx,%rax,4),%rax
  16: 48 01 d0              add    %rdx,%rax
  19: ff e0                jmpq   *%rax
[...]


Or if you want to see a full example that actually crashes:

$ cat gcc-jump-crash.c
#include <pthread.h>

int mutating_number;

__attribute__((noinline)) int blah(int *x, int y) {
  switch (*x) {
    case 0: return y+1;
    case 1: return y*2;
    case 2: return y-3;
    case 3: return y^1;
    case 4: return y+6;
    case 5: return y-5;
    case 6: return y|1;
    case 7: return y&4;
    case 8: return y|5;
    case 9: return y-3;
    case 10: return y&8;
    case 11: return y|9;
    default: return y;
  }
}

int blah_num;
void *thread_fn(void *dummy) {
  while (1) {
    blah_num = blah(&mutating_number, blah_num);
  }
}

int main(void) {
  pthread_t thread;
  pthread_create(&thread, NULL, thread_fn, NULL);
  while (1) {
    *(volatile int *)&mutating_number = 1;
    *(volatile int *)&mutating_number = 100000000;
  }
}
$ gcc-9 -O2 -pthread -o gcc-jump-crash gcc-jump-crash.c -ggdb -Wall
$ gdb ./gcc-jump-crash
[...]
(gdb) run
[...]
Thread 2 "gcc-jump-crash" received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7ffff7db6700 (LWP 33237)]
0x00005555555551a2 in blah (x=0x555555558034 <mutating_number>, y=0)
at gcc-jump-crash.c:6
6   switch (*x) {
(gdb) x/10i blah
   0x555555555190 <blah>: cmp    DWORD PTR [rdi],0xb
   0x555555555193 <blah+3>: ja     0x555555555050 <blah+4294966976>
   0x555555555199 <blah+9>: mov    eax,DWORD PTR [rdi]
   0x55555555519b <blah+11>: lea    rdx,[rip+0xe62]        # 0x555555556004
=> 0x5555555551a2 <blah+18>: movsxd rax,DWORD PTR [rdx+rax*4]
   0x5555555551a6 <blah+22>: add    rax,rdx
   0x5555555551a9 <blah+25>: jmp    rax
   0x5555555551ab <blah+27>: nop    DWORD PTR [rax+rax*1+0x0]
   0x5555555551b0 <blah+32>: lea    eax,[rsi-0x3]
   0x5555555551b3 <blah+35>: ret
(gdb)


Here's a presentation from Felix Wilhelm, a security researcher who
managed to find a case in the Xen hypervisor where a switch() on a
value in shared memory was exploitable to compromise the hypervisor
from inside a guest (see slides 35 and following):
<https://www.blackhat.com/docs/us-16/materials/us-16-Wilhelm-Xenpwn-Breaking-Paravirtualized-Devices.pdf>

I realize that a compiler is extremely unlikely to make such an
optimization decision for a simple "if (!a->b)" branch; but still, I
would prefer to have READ_ONCE() everywhere where it is semantically
required, not just everywhere where you can think of a concrete
compiler optimization that will break stuff.
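
Concretely, for notify_sb() that would just mean loading the pointer once
and testing the local copy (untested sketch of the helper as posted, with
only the READ_ONCE() added):

	static inline void notify_sb(struct super_block *s,
				     enum superblock_notification_type subtype,
				     u32 info)
	{
	#ifdef CONFIG_SB_NOTIFICATIONS
		/* One explicit load; the compiler can't invent a second one. */
		struct watch_list *watchers = READ_ONCE(s->s_watchers);

		if (unlikely(watchers)) {
			struct superblock_notification n = {
				.watch.type	= WATCH_TYPE_SB_NOTIFY,
				.watch.subtype	= subtype,
				.watch.info	= watch_sizeof(n) | info,
				.sb_id		= s->s_unique_id,
			};

			post_sb_notification(s, &n);
		}
	#endif
	}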

> > > +                       ret = add_watch_to_object(watch, s->s_watchers);
> > > +                       if (ret == 0) {
> > > +                               spin_lock(&sb_lock);
> > > +                               s->s_count++;
> > > +                               spin_unlock(&sb_lock);
> >
> > Where is the corresponding decrement of s->s_count? I'm guessing that
> > it should be in the ->release_watch() handler, except that there isn't
> > one...
>
> Um.  Good question.  I think this should do the job:
>
>         static void sb_release_watch(struct watch *watch)
>         {
>                 put_super(watch->private);
>         }
>
> And this then has to be set later:
>
>         init_watch_list(wlist, sb_release_watch);

(And as in the other case, the s->s_count increment will probably have
to be moved above the add_watch_to_object(), unless you hold the
sb_lock around it?)
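
Something along these lines, maybe (untested sketch; __put_super() is the
existing fs/super.c helper, and the success path relies on the
sb_release_watch() handler you added above to drop the ref when the watch
eventually goes away):

		/* Take the ref the watch will own before the watch is published. */
		spin_lock(&sb_lock);
		s->s_count++;
		spin_unlock(&sb_lock);

		ret = add_watch_to_object(watch, s->s_watchers);
		if (ret == 0) {
			watch = NULL;		/* now owned by the watch list */
		} else {
			spin_lock(&sb_lock);
			__put_super(s);		/* never published; drop the ref */
			spin_unlock(&sb_lock);
		}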
David Howells Feb. 21, 2020, 4:33 p.m. UTC | #4
Jann Horn <jannh@google.com> wrote:

> (And as in the other case, the s->s_count increment will probably have
> to be moved above the add_watch_to_object(), unless you hold the
> sb_lock around it?)

It shouldn't matter as I'm holding s->s_umount across the add and increment.
That prevents the watch from being removed: watch_sb() would have to get the
lock first to do that.  It also prevents deactivate_locked_super() from
removing all the watchers.

I can move it before, but I probably have to drop s_umount before I can call
put_super().

David
Jann Horn Feb. 21, 2020, 4:41 p.m. UTC | #5
On Fri, Feb 21, 2020 at 5:33 PM David Howells <dhowells@redhat.com> wrote:
> Jann Horn <jannh@google.com> wrote:
>
> > (And as in the other case, the s->s_count increment will probably have
> > to be moved above the add_watch_to_object(), unless you hold the
> > sb_lock around it?)
>
> It shouldn't matter as I'm holding s->s_umount across the add and increment.
> That prevents the watch from being removed: watch_sb() would have to get the
> lock first to do that.  It also prevents deactivate_locked_super() from
> removing all the watchers.

Can't the same thing I already pointed out on "[PATCH 13/19] vfs: Add
a mount-notification facility [ver #16]" also happen here?

If another thread concurrently runs close(watch_fd) before the
spin_lock(&sb_lock), pipe_release -> put_pipe_info -> free_pipe_info
-> watch_queue_clear will run, correct? And then watch_queue_clear()
will find the watch that we've just created and call its
->release_watch() handler, which causes put_super(), potentially
dropping the refcount to zero? And then stuff will blow up.

> I can move it before, but I probably have to drop s_umount before I can call
> put_super().
David Howells Feb. 21, 2020, 5:11 p.m. UTC | #6
Jann Horn <jannh@google.com> wrote:

> If another thread concurrently runs close(watch_fd)

Fair point.  We have the watch queue pinned, but watch_queue_clear() is called
before the ref is released.

David

Patch

diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl
index 72bc7b33c59d..cd39492e4f7d 100644
--- a/arch/alpha/kernel/syscalls/syscall.tbl
+++ b/arch/alpha/kernel/syscalls/syscall.tbl
@@ -479,3 +479,4 @@ 
 548	common	pidfd_getfd			sys_pidfd_getfd
 549	common	fsinfo				sys_fsinfo
 550	common	watch_mount			sys_watch_mount
+551	common	watch_sb			sys_watch_sb
diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl
index 67777fd0b19e..a26bc42b9464 100644
--- a/arch/arm/tools/syscall.tbl
+++ b/arch/arm/tools/syscall.tbl
@@ -453,3 +453,4 @@ 
 438	common	pidfd_getfd			sys_pidfd_getfd
 439	common	fsinfo				sys_fsinfo
 440	common	watch_mount			sys_watch_mount
+441	common	watch_sb			sys_watch_sb
diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h
index 75f04a1023be..388eeb71cff0 100644
--- a/arch/arm64/include/asm/unistd.h
+++ b/arch/arm64/include/asm/unistd.h
@@ -38,7 +38,7 @@ 
 #define __ARM_NR_compat_set_tls		(__ARM_NR_COMPAT_BASE + 5)
 #define __ARM_NR_COMPAT_END		(__ARM_NR_COMPAT_BASE + 0x800)
 
-#define __NR_compat_syscalls		440
+#define __NR_compat_syscalls		442
 #endif
 
 #define __ARCH_WANT_SYS_CLONE
diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl
index cd18dc112902..b13b94de9a01 100644
--- a/arch/ia64/kernel/syscalls/syscall.tbl
+++ b/arch/ia64/kernel/syscalls/syscall.tbl
@@ -360,3 +360,4 @@ 
 438	common	pidfd_getfd			sys_pidfd_getfd
 439	common	fsinfo				sys_fsinfo
 440	common	watch_mount			sys_watch_mount
+441	common	watch_sb			sys_watch_sb
diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl
index de5c7303899f..4a163d0200b2 100644
--- a/arch/m68k/kernel/syscalls/syscall.tbl
+++ b/arch/m68k/kernel/syscalls/syscall.tbl
@@ -440,3 +440,4 @@ 
 # 435 reserved for clone3
 439	common	fsinfo				sys_fsinfo
 440	common	watch_mount			sys_watch_mount
+441	common	watch_sb			sys_watch_sb
diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl
index 7387a44767c3..b0fed3b73462 100644
--- a/arch/microblaze/kernel/syscalls/syscall.tbl
+++ b/arch/microblaze/kernel/syscalls/syscall.tbl
@@ -445,3 +445,4 @@ 
 438	common	pidfd_getfd			sys_pidfd_getfd
 439	common	fsinfo				sys_fsinfo
 440	common	watch_mount			sys_watch_mount
+441	common	watch_sb			sys_watch_sb
diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl
index e2c76157a580..8a33cc08ed39 100644
--- a/arch/mips/kernel/syscalls/syscall_n32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl
@@ -378,3 +378,4 @@ 
 438	n32	pidfd_getfd			sys_pidfd_getfd
 439	n32	fsinfo				sys_fsinfo
 440	n32	watch_mount			sys_watch_mount
+441	n32	watch_sb			sys_watch_sb
diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl
index e5da9a13b074..8a11d81717d3 100644
--- a/arch/mips/kernel/syscalls/syscall_n64.tbl
+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl
@@ -354,3 +354,4 @@ 
 438	n64	pidfd_getfd			sys_pidfd_getfd
 439	n64	fsinfo				sys_fsinfo
 440	n64	watch_mount			sys_watch_mount
+441	n64	watch_sb			sys_watch_sb
diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl
index fe135759d2a8..76787af4a8f2 100644
--- a/arch/mips/kernel/syscalls/syscall_o32.tbl
+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl
@@ -427,3 +427,4 @@ 
 438	o32	pidfd_getfd			sys_pidfd_getfd
 439	o32	fsinfo				sys_fsinfo
 440	o32	watch_mount			sys_watch_mount
+441	o32	watch_sb			sys_watch_sb
diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl
index 5ac7a58af305..1c35cf2c0938 100644
--- a/arch/parisc/kernel/syscalls/syscall.tbl
+++ b/arch/parisc/kernel/syscalls/syscall.tbl
@@ -437,3 +437,4 @@ 
 438	common	pidfd_getfd			sys_pidfd_getfd
 439	common	fsinfo				sys_fsinfo
 440	common	watch_mount			sys_watch_mount
+441	common	watch_sb			sys_watch_sb
diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl
index c77a1cf377ec..c5ea6f8e95b6 100644
--- a/arch/powerpc/kernel/syscalls/syscall.tbl
+++ b/arch/powerpc/kernel/syscalls/syscall.tbl
@@ -521,3 +521,4 @@ 
 438	common	pidfd_getfd			sys_pidfd_getfd
 439	common	fsinfo				sys_fsinfo
 440	common	watch_mount			sys_watch_mount
+441	common	watch_sb			sys_watch_sb
diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl
index d81d30d02aaf..4577426e09f5 100644
--- a/arch/s390/kernel/syscalls/syscall.tbl
+++ b/arch/s390/kernel/syscalls/syscall.tbl
@@ -442,3 +442,4 @@ 
 438  common	pidfd_getfd		sys_pidfd_getfd			sys_pidfd_getfd
 439  common	fsinfo			sys_fsinfo			sys_fsinfo
 440	common	watch_mount		sys_watch_mount			sys_watch_mount
+441	common	watch_sb		sys_watch_sb			sys_watch_sb
diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl
index dcdc747fa430..e57c03fd5ba3 100644
--- a/arch/sh/kernel/syscalls/syscall.tbl
+++ b/arch/sh/kernel/syscalls/syscall.tbl
@@ -442,3 +442,4 @@ 
 438	common	pidfd_getfd			sys_pidfd_getfd
 439	common	fsinfo				sys_fsinfo
 440	common	watch_mount			sys_watch_mount
+441	common	watch_sb			sys_watch_sb
diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl
index b4f82e5a08bf..1b2b19873319 100644
--- a/arch/sparc/kernel/syscalls/syscall.tbl
+++ b/arch/sparc/kernel/syscalls/syscall.tbl
@@ -485,3 +485,4 @@ 
 438	common	pidfd_getfd			sys_pidfd_getfd
 439	common	fsinfo				sys_fsinfo
 440	common	watch_mount			sys_watch_mount
+441	common	watch_sb			sys_watch_sb
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 07572644779d..8b3a00860524 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -444,3 +444,4 @@ 
 438	i386	pidfd_getfd		sys_pidfd_getfd			__ia32_sys_pidfd_getfd
 439	i386	fsinfo			sys_fsinfo			__ia32_sys_fsinfo
 440	i386	watch_mount		sys_watch_mount			__ia32_sys_watch_mount
+441	i386	watch_sb		sys_watch_sb			__ia32_sys_watch_sb
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 1b51791fe104..8522ff13308c 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -361,6 +361,7 @@ 
 438	common	pidfd_getfd		__x64_sys_pidfd_getfd
 439	common	fsinfo			__x64_sys_fsinfo
 440	common	watch_mount		__x64_sys_watch_mount
+441	common	watch_sb		__x64_sys_watch_sb
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl
index dfcdd3036d3e..70f0292ed37a 100644
--- a/arch/xtensa/kernel/syscalls/syscall.tbl
+++ b/arch/xtensa/kernel/syscalls/syscall.tbl
@@ -410,3 +410,4 @@ 
 438	common	pidfd_getfd			sys_pidfd_getfd
 439	common	fsinfo				sys_fsinfo
 440	common	watch_mount			sys_watch_mount
+441	common	watch_sb			sys_watch_sb
diff --git a/fs/Kconfig b/fs/Kconfig
index 76224bc015cb..01d0d436b3cd 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -133,6 +133,18 @@  config MOUNT_NOTIFICATIONS
 	  device to handle the notification buffer and provides the
 	  mount_notify() system call to enable/disable watchpoints.
 
+config SB_NOTIFICATIONS
+	bool "Superblock event notifications"
+	select WATCH_QUEUE
+	help
+	  This option provides support for receiving superblock event
+	  notifications.  This makes use of the /dev/watch_queue misc device to
+	  handle the notification buffer and provides the watch_sb() system
+	  call to enable/disable watches.
+
+	  Events can include things like changing between R/W and R/O, EIO
+	  generation, ENOSPC generation and EDQUOT generation.
+
 source "fs/quota/Kconfig"
 
 source "fs/autofs/Kconfig"
diff --git a/fs/super.c b/fs/super.c
index a63073e6127e..ec16e6f88c16 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -37,6 +37,8 @@ 
 #include <linux/lockdep.h>
 #include <linux/user_namespace.h>
 #include <linux/fs_context.h>
+#include <linux/syscalls.h>
+#include <linux/namei.h>
 #include <uapi/linux/mount.h>
 #include "internal.h"
 
@@ -354,6 +356,10 @@  void deactivate_locked_super(struct super_block *s)
 {
 	struct file_system_type *fs = s->s_type;
 	if (atomic_dec_and_test(&s->s_active)) {
+#ifdef CONFIG_SB_NOTIFICATIONS
+		if (s->s_watchers)
+			remove_watch_list(s->s_watchers, s->s_unique_id);
+#endif
 		cleancache_invalidate_fs(s);
 		unregister_shrinker(&s->s_shrink);
 		fs->kill_sb(s);
@@ -993,6 +999,8 @@  int reconfigure_super(struct fs_context *fc)
 	/* Needs to be ordered wrt mnt_is_readonly() */
 	smp_wmb();
 	sb->s_readonly_remount = 0;
+	notify_sb(sb, NOTIFY_SUPERBLOCK_READONLY,
+		  remount_ro ? NOTIFY_SUPERBLOCK_IS_NOW_RO : 0);
 
 	/*
 	 * Some filesystems modify their metadata via some other path than the
@@ -1891,3 +1899,120 @@  int thaw_super(struct super_block *sb)
 	return thaw_super_locked(sb);
 }
 EXPORT_SYMBOL(thaw_super);
+
+#ifdef CONFIG_SB_NOTIFICATIONS
+/*
+ * Post superblock notifications.
+ */
+void post_sb_notification(struct super_block *s, struct superblock_notification *n)
+{
+	post_watch_notification(s->s_watchers, &n->watch, current_cred(),
+				s->s_unique_id);
+}
+
+/**
+ * sys_watch_sb - Watch for superblock events.
+ * @dfd: Base directory to pathwalk from or fd referring to superblock.
+ * @filename: Path to superblock to place the watch upon
+ * @at_flags: Pathwalk control flags
+ * @watch_fd: The watch queue to send notifications to.
+ * @watch_id: The watch ID to be placed in the notification (-1 to remove watch)
+ */
+SYSCALL_DEFINE5(watch_sb,
+		int, dfd,
+		const char __user *, filename,
+		unsigned int, at_flags,
+		int, watch_fd,
+		int, watch_id)
+{
+	struct watch_queue *wqueue;
+	struct super_block *s;
+	struct watch_list *wlist = NULL;
+	struct watch *watch = NULL;
+	struct path path;
+	unsigned int lookup_flags =
+		LOOKUP_DIRECTORY | LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT;
+	int ret;
+
+	if (watch_id < -1 || watch_id > 0xff)
+		return -EINVAL;
+	if ((at_flags & ~(AT_NO_AUTOMOUNT | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+	if (at_flags & AT_NO_AUTOMOUNT)
+		lookup_flags &= ~LOOKUP_AUTOMOUNT;
+	if (at_flags & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+
+	ret = user_path_at(dfd, filename, at_flags, &path);
+	if (ret)
+		return ret;
+
+	ret = inode_permission(path.dentry->d_inode, MAY_EXEC);
+	if (ret)
+		goto err_path;
+
+	wqueue = get_watch_queue(watch_fd);
+	if (IS_ERR(wqueue))
+		goto err_path;
+
+	s = path.dentry->d_sb;
+	if (watch_id >= 0) {
+		ret = -ENOMEM;
+		if (!s->s_watchers) {
+			wlist = kzalloc(sizeof(*wlist), GFP_KERNEL);
+			if (!wlist)
+				goto err_wqueue;
+			init_watch_list(wlist, NULL);
+		}
+
+		watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+		if (!watch)
+			goto err_wlist;
+
+		init_watch(watch, wqueue);
+		watch->id		= s->s_unique_id;
+		watch->private		= s;
+		watch->info_id		= (u32)watch_id << 24;
+
+		ret = security_watch_sb(watch, s);
+		if (ret < 0)
+			goto err_watch;
+
+		down_write(&s->s_umount);
+		ret = -EIO;
+		if (atomic_read(&s->s_active)) {
+			if (!s->s_watchers) {
+				s->s_watchers = wlist;
+				wlist = NULL;
+			}
+
+			ret = add_watch_to_object(watch, s->s_watchers);
+			if (ret == 0) {
+				spin_lock(&sb_lock);
+				s->s_count++;
+				spin_unlock(&sb_lock);
+				watch = NULL;
+			}
+		}
+		up_write(&s->s_umount);
+	} else {
+		ret = -EBADSLT;
+		if (READ_ONCE(s->s_watchers)) {
+			down_write(&s->s_umount);
+			ret = remove_watch_from_object(s->s_watchers, wqueue,
+						       s->s_unique_id, false);
+			up_write(&s->s_umount);
+		}
+	}
+
+err_watch:
+	kfree(watch);
+err_wlist:
+	kfree(wlist);
+err_wqueue:
+	put_watch_queue(wqueue);
+err_path:
+	path_put(&path);
+	return ret;
+}
+#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e5db22d536a3..423a6f03cdf8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -40,6 +40,7 @@ 
 #include <linux/fs_types.h>
 #include <linux/build_bug.h>
 #include <linux/stddef.h>
+#include <linux/watch_queue.h>
 
 #include <asm/byteorder.h>
 #include <uapi/linux/fs.h>
@@ -1553,6 +1554,10 @@  struct super_block {
 
 	/* Superblock event notifications */
 	u64			s_unique_id;
+
+#ifdef CONFIG_SB_NOTIFICATIONS
+	struct watch_list	*s_watchers;
+#endif
 } __randomize_layout;
 
 /* Helper functions so that in most cases filesystems will
@@ -3659,4 +3664,76 @@  static inline int inode_drain_writes(struct inode *inode)
 	return filemap_write_and_wait(inode->i_mapping);
 }
 
+extern void post_sb_notification(struct super_block *, struct superblock_notification *);
+
+/**
+ * notify_sb: Post simple superblock notification.
+ * @s: The superblock the notification is about.
+ * @subtype: The type of notification.
+ * @info: WATCH_INFO_FLAG_* flags to be set in the record.
+ */
+static inline void notify_sb(struct super_block *s,
+			     enum superblock_notification_type subtype,
+			     u32 info)
+{
+#ifdef CONFIG_SB_NOTIFICATIONS
+	if (unlikely(s->s_watchers)) {
+		struct superblock_notification n = {
+			.watch.type	= WATCH_TYPE_SB_NOTIFY,
+			.watch.subtype	= subtype,
+			.watch.info	= watch_sizeof(n) | info,
+			.sb_id		= s->s_unique_id,
+		};
+
+		post_sb_notification(s, &n);
+	}
+
+#endif
+}
+
+/**
+ * notify_sb_error: Post superblock error notification.
+ * @s: The superblock the notification is about.
+ * @error: The error number to be recorded.
+ */
+static inline int notify_sb_error(struct super_block *s, int error)
+{
+#ifdef CONFIG_SB_NOTIFICATIONS
+	if (unlikely(s->s_watchers)) {
+		struct superblock_error_notification n = {
+			.s.watch.type	= WATCH_TYPE_SB_NOTIFY,
+			.s.watch.subtype = NOTIFY_SUPERBLOCK_ERROR,
+			.s.watch.info	= watch_sizeof(n),
+			.s.sb_id	= s->s_unique_id,
+			.error_number	= error,
+			.error_cookie	= 0,
+		};
+
+		post_sb_notification(s, &n.s);
+	}
+#endif
+	return error;
+}
+
+/**
+ * notify_sb_EDQUOT: Post superblock quota overrun notification.
+ * @s: The superblock the notification is about.
+ */
+static inline int notify_sb_EDQUOT(struct super_block *s)
+{
+#ifdef CONFIG_SB_NOTIFICATIONS
+	if (unlikely(s->s_watchers)) {
+		struct superblock_notification n = {
+			.watch.type	= WATCH_TYPE_SB_NOTIFY,
+			.watch.subtype	= NOTIFY_SUPERBLOCK_EDQUOT,
+			.watch.info	= watch_sizeof(n),
+			.sb_id		= s->s_unique_id,
+		};
+
+		post_sb_notification(s, &n);
+	}
+#endif
+	return -EDQUOT;
+}
+
 #endif /* _LINUX_FS_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 1687e064751d..af66fe97a586 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1009,6 +1009,8 @@  asmlinkage long sys_fsinfo(int dfd, const char __user *pathname,
 			   void __user *buffer, size_t buf_size);
 asmlinkage long sys_watch_mount(int dfd, const char __user *path,
 				unsigned int at_flags, int watch_fd, int watch_id);
+asmlinkage long sys_watch_sb(int dfd, const char __user *path,
+			     unsigned int at_flags, int watch_fd, int watch_id);
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index d6b6c45ad31a..882c0fae4f37 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -859,9 +859,11 @@  __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd)
 __SYSCALL(__NR_fsinfo, sys_fsinfo)
 #define __NR_watch_mount 440
 __SYSCALL(__NR_watch_mount, sys_watch_mount)
+#define __NR_watch_sb 441
+__SYSCALL(__NR_watch_sb, sys_watch_sb)
 
 #undef __NR_syscalls
-#define __NR_syscalls 441
+#define __NR_syscalls 442
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/watch_queue.h b/include/uapi/linux/watch_queue.h
index b0f35cf51394..190d27073302 100644
--- a/include/uapi/linux/watch_queue.h
+++ b/include/uapi/linux/watch_queue.h
@@ -15,7 +15,8 @@  enum watch_notification_type {
 	WATCH_TYPE_META		= 0,	/* Special record */
 	WATCH_TYPE_KEY_NOTIFY	= 1,	/* Key change event notification */
 	WATCH_TYPE_MOUNT_NOTIFY	= 2,	/* Mount topology change notification */
-	WATCH_TYPE___NR		= 3
+	WATCH_TYPE_SB_NOTIFY	= 3,	/* Superblock event notification */
+	WATCH_TYPE___NR		= 4
 };
 
 enum watch_meta_notification_subtype {
@@ -131,4 +132,32 @@  struct mount_notification {
 	__u32	changed_mount;		/* The mount that got changed */
 };
 
+/*
+ * Type of superblock notification.
+ */
+enum superblock_notification_type {
+	NOTIFY_SUPERBLOCK_READONLY	= 0, /* Filesystem toggled between R/O and R/W */
+	NOTIFY_SUPERBLOCK_ERROR		= 1, /* Error in filesystem or blockdev */
+	NOTIFY_SUPERBLOCK_EDQUOT	= 2, /* EDQUOT notification */
+	NOTIFY_SUPERBLOCK_NETWORK	= 3, /* Network status change */
+};
+
+#define NOTIFY_SUPERBLOCK_IS_NOW_RO	WATCH_INFO_FLAG_0 /* Superblock changed to R/O */
+
+/*
+ * Superblock notification record.
+ * - watch.type = WATCH_TYPE_SB_NOTIFY
+ * - watch.subtype = enum superblock_notification_type
+ */
+struct superblock_notification {
+	struct watch_notification watch; /* WATCH_TYPE_SB_NOTIFY */
+	__u64	sb_id;			/* 64-bit superblock ID [fsinfo_ids::f_sb_id] */
+};
+
+struct superblock_error_notification {
+	struct superblock_notification s; /* subtype = NOTIFY_SUPERBLOCK_ERROR */
+	__u32	error_number;
+	__u32	error_cookie;
+};
+
 #endif /* _UAPI_LINUX_WATCH_QUEUE_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1a1eb7b61914..bc2e6885ef2d 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -53,6 +53,7 @@  COND_SYSCALL(io_uring_enter);
 COND_SYSCALL(io_uring_register);
 COND_SYSCALL(fsinfo);
 COND_SYSCALL(watch_mount);
+COND_SYSCALL(watch_sb);
 
 /* fs/xattr.c */