diff mbox series

[1/2] fsnotify: add pre-content hooks on mmap()

Message ID 20250311114153.1763176-2-amir73il@gmail.com (mailing list archive)
State New
Headers show
Series Fix for potential deadlock in pre-content event | expand

Commit Message

Amir Goldstein March 11, 2025, 11:41 a.m. UTC
Pre-content hooks in page faults introduces potential deadlock of HSM
handler in userspace with filesystem freezing.

The requirement with pre-content event is that for every accessed file
range an event covering at least this range will be generated at least
once before the file data is accesses.

In preparation to disabling pre-content event hooks on page faults,
change those hooks to always use the mask MAY_ACCESS and add pre-content
hooks at mmap() variants for the entire mmaped range, so HSM can fill
content when user requests to map a portion of the file.

Note that exec() variant also calls vm_mmap_pgoff() internally to map
code sections, so pre-content hooks are also generated in this case.

Link: https://lore.kernel.org/linux-fsdevel/7ehxrhbvehlrjwvrduoxsao5k3x4aw275patsb3krkwuq573yv@o2hskrfawbnc/
Suggested-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Amir Goldstein <amir73il@gmail.com>
---
 mm/filemap.c |  3 +--
 mm/mmap.c    | 12 ++++++++++++
 mm/util.c    |  7 +++++++
 3 files changed, 20 insertions(+), 2 deletions(-)

Comments

Lorenzo Stoakes March 11, 2025, 12:33 p.m. UTC | #1
On Tue, Mar 11, 2025 at 12:41:52PM +0100, Amir Goldstein wrote:
> Pre-content hooks in page faults introduces potential deadlock of HSM
> handler in userspace with filesystem freezing.
>
> The requirement with pre-content event is that for every accessed file
> range an event covering at least this range will be generated at least
> once before the file data is accesses.
>
> In preparation to disabling pre-content event hooks on page faults,
> change those hooks to always use the mask MAY_ACCESS and add pre-content
> hooks at mmap() variants for the entire mmaped range, so HSM can fill
> content when user requests to map a portion of the file.
>
> Note that exec() variant also calls vm_mmap_pgoff() internally to map
> code sections, so pre-content hooks are also generated in this case.
>
> Link: https://lore.kernel.org/linux-fsdevel/7ehxrhbvehlrjwvrduoxsao5k3x4aw275patsb3krkwuq573yv@o2hskrfawbnc/
> Suggested-by: Josef Bacik <josef@toxicpanda.com>
> Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> ---
>  mm/filemap.c |  3 +--
>  mm/mmap.c    | 12 ++++++++++++
>  mm/util.c    |  7 +++++++
>  3 files changed, 20 insertions(+), 2 deletions(-)
>
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 2974691fdfad2..f85d288209b44 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -3350,7 +3350,6 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
>  vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
>  {
>  	struct file *fpin = NULL;
> -	int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS;
>  	loff_t pos = vmf->pgoff >> PAGE_SHIFT;
>  	size_t count = PAGE_SIZE;
>  	int err;
> @@ -3370,7 +3369,7 @@ vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
>  	if (!fpin)
>  		return VM_FAULT_SIGBUS;
>
> -	err = fsnotify_file_area_perm(fpin, mask, &pos, count);
> +	err = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos, count);
>  	fput(fpin);
>  	if (err)
>  		return VM_FAULT_SIGBUS;
> diff --git a/mm/mmap.c b/mm/mmap.c
> index cda01071c7b1f..70318936fd588 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -48,6 +48,7 @@
>  #include <linux/sched/mm.h>
>  #include <linux/ksm.h>
>  #include <linux/memfd.h>
> +#include <linux/fsnotify.h>
>
>  #include <linux/uaccess.h>
>  #include <asm/cacheflush.h>
> @@ -1151,6 +1152,17 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,

I kind of hate that we keep on extending this deprecate syscall. Is it
truly necessary here?

>  		return ret;
>  	}
>
> +	if (file && unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {

Is there a circumstance where file == NULL here? I mean get_file()
literally dereferences a field then returns the pointer, so that'd be a
null pointer deref?

Also I'm pretty sure it's impossible possible for a VMA to be VM_SHARED and
!vma->vm_file, since we need to access the address_space etc.

> +		int mask = (prot & PROT_WRITE) ? MAY_WRITE : MAY_READ;
> +		loff_t pos = pgoff >> PAGE_SHIFT;
> +
> +		ret = fsnotify_file_area_perm(file, mask, &pos, size);

All other invocations of this in fs code, this further amplifies my belief
that this belongs in fs code.

> +		if (ret) {
> +			fput(file);
> +			return ret;
> +		}
> +	}
> +
>  	ret = -EINVAL;
>
>  	/* OK security check passed, take write lock + let it rip. */
> diff --git a/mm/util.c b/mm/util.c
> index b6b9684a14388..2dddeabac6098 100644
> --- a/mm/util.c
> +++ b/mm/util.c
> @@ -23,6 +23,7 @@
>  #include <linux/processor.h>
>  #include <linux/sizes.h>
>  #include <linux/compat.h>
> +#include <linux/fsnotify.h>
>
>  #include <linux/uaccess.h>
>
> @@ -569,6 +570,12 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
>  	LIST_HEAD(uf);
>
>  	ret = security_mmap_file(file, prot, flag);
> +	if (!ret && file && unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
> +		int mask = (prot & PROT_WRITE) ? MAY_WRITE : MAY_READ;
> +		loff_t pos = pgoff >> PAGE_SHIFT;
> +
> +		ret = fsnotify_file_area_perm(file, mask, &pos, len);
> +	}
>  	if (!ret) {
>  		if (mmap_write_lock_killable(mm))
>  			return -EINTR;

You've duplicated this code in 2 places, can we please just have it in one
as a helper function?

Also I'm not a fan of having super-specific file system code relating to
HSM in general mapping code like this. Can't we have something like a hook
or something more generic?

I mean are we going to keep on expanding this for other super-specific
cases?

I would say refactor this whole thing into a check that's done in fs code
that we can call into.

If we need a new hook, then let's add one. If we can use existing hooks,
let's use them.

Also, is it valid to be accessing this file without doing a get_file()
here? It seems super inconsistent you increment ref count in one place but
not the other?

> --
> 2.34.1
>
Amir Goldstein March 11, 2025, 2:37 p.m. UTC | #2
On Tue, Mar 11, 2025 at 1:34 PM Lorenzo Stoakes
<lorenzo.stoakes@oracle.com> wrote:
>
> On Tue, Mar 11, 2025 at 12:41:52PM +0100, Amir Goldstein wrote:
> > Pre-content hooks in page faults introduces potential deadlock of HSM
> > handler in userspace with filesystem freezing.
> >
> > The requirement with pre-content event is that for every accessed file
> > range an event covering at least this range will be generated at least
> > once before the file data is accesses.
> >
> > In preparation to disabling pre-content event hooks on page faults,
> > change those hooks to always use the mask MAY_ACCESS and add pre-content
> > hooks at mmap() variants for the entire mmaped range, so HSM can fill
> > content when user requests to map a portion of the file.
> >
> > Note that exec() variant also calls vm_mmap_pgoff() internally to map
> > code sections, so pre-content hooks are also generated in this case.
> >
> > Link: https://lore.kernel.org/linux-fsdevel/7ehxrhbvehlrjwvrduoxsao5k3x4aw275patsb3krkwuq573yv@o2hskrfawbnc/
> > Suggested-by: Josef Bacik <josef@toxicpanda.com>
> > Signed-off-by: Amir Goldstein <amir73il@gmail.com>
> > ---
> >  mm/filemap.c |  3 +--
> >  mm/mmap.c    | 12 ++++++++++++
> >  mm/util.c    |  7 +++++++
> >  3 files changed, 20 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/filemap.c b/mm/filemap.c
> > index 2974691fdfad2..f85d288209b44 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -3350,7 +3350,6 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
> >  vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
> >  {
> >       struct file *fpin = NULL;
> > -     int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS;
> >       loff_t pos = vmf->pgoff >> PAGE_SHIFT;
> >       size_t count = PAGE_SIZE;
> >       int err;
> > @@ -3370,7 +3369,7 @@ vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
> >       if (!fpin)
> >               return VM_FAULT_SIGBUS;
> >
> > -     err = fsnotify_file_area_perm(fpin, mask, &pos, count);
> > +     err = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos, count);
> >       fput(fpin);
> >       if (err)
> >               return VM_FAULT_SIGBUS;
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index cda01071c7b1f..70318936fd588 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -48,6 +48,7 @@
> >  #include <linux/sched/mm.h>
> >  #include <linux/ksm.h>
> >  #include <linux/memfd.h>
> > +#include <linux/fsnotify.h>
> >
> >  #include <linux/uaccess.h>
> >  #include <asm/cacheflush.h>
> > @@ -1151,6 +1152,17 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
>
> I kind of hate that we keep on extending this deprecate syscall. Is it
> truly necessary here?
>

If my understanding of remap_file_pages() is correct then no new regions of the
file are being mapped - so no, my bad - the pre-content hook is not
needed in this case.


> >               return ret;
> >       }
> >
> > +     if (file && unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
>
> Is there a circumstance where file == NULL here? I mean get_file()
> literally dereferences a field then returns the pointer, so that'd be a
> null pointer deref?
>
> Also I'm pretty sure it's impossible possible for a VMA to be VM_SHARED and
> !vma->vm_file, since we need to access the address_space etc.
>
> > +             int mask = (prot & PROT_WRITE) ? MAY_WRITE : MAY_READ;
> > +             loff_t pos = pgoff >> PAGE_SHIFT;
> > +
> > +             ret = fsnotify_file_area_perm(file, mask, &pos, size);
>
> All other invocations of this in fs code, this further amplifies my belief
> that this belongs in fs code.
>
> > +             if (ret) {
> > +                     fput(file);
> > +                     return ret;
> > +             }
> > +     }
> > +
> >       ret = -EINVAL;
> >
> >       /* OK security check passed, take write lock + let it rip. */
> > diff --git a/mm/util.c b/mm/util.c
> > index b6b9684a14388..2dddeabac6098 100644
> > --- a/mm/util.c
> > +++ b/mm/util.c
> > @@ -23,6 +23,7 @@
> >  #include <linux/processor.h>
> >  #include <linux/sizes.h>
> >  #include <linux/compat.h>
> > +#include <linux/fsnotify.h>
> >
> >  #include <linux/uaccess.h>
> >
> > @@ -569,6 +570,12 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
> >       LIST_HEAD(uf);
> >
> >       ret = security_mmap_file(file, prot, flag);
> > +     if (!ret && file && unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
> > +             int mask = (prot & PROT_WRITE) ? MAY_WRITE : MAY_READ;
> > +             loff_t pos = pgoff >> PAGE_SHIFT;
> > +
> > +             ret = fsnotify_file_area_perm(file, mask, &pos, len);
> > +     }
> >       if (!ret) {
> >               if (mmap_write_lock_killable(mm))
> >                       return -EINTR;
>
> You've duplicated this code in 2 places, can we please just have it in one
> as a helper function?
>
> Also I'm not a fan of having super-specific file system code relating to
> HSM in general mapping code like this. Can't we have something like a hook
> or something more generic?
>
> I mean are we going to keep on expanding this for other super-specific
> cases?
>
> I would say refactor this whole thing into a check that's done in fs code
> that we can call into.
>
> If we need a new hook, then let's add one. If we can use existing hooks,
> let's use them.
>

fsnotify hooks and logic have traditionally been contained in fsnotify.h
wrappers much like the security_ hooks.

For the page fault hooks, Linus asked for the FMODE_FSNOTIFY_HSM
condition to be very visible and explicit, but I suppose there is no reason
for not having a fsnotify_mmap() wrapper here.

> Also, is it valid to be accessing this file without doing a get_file()
> here?

Caller of vm_mmap_pgoff() should have a reference on file.

> It seems super inconsistent you increment ref count in one place but
> not the other?

By other place do you mean in remap_file_pages()?
There, the file reference is coming from vma->vm_file and hook is called
outside mmap lock.

Thanks,
Amir.
diff mbox series

Patch

diff --git a/mm/filemap.c b/mm/filemap.c
index 2974691fdfad2..f85d288209b44 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3350,7 +3350,6 @@  static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
 vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
 {
 	struct file *fpin = NULL;
-	int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS;
 	loff_t pos = vmf->pgoff >> PAGE_SHIFT;
 	size_t count = PAGE_SIZE;
 	int err;
@@ -3370,7 +3369,7 @@  vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
 	if (!fpin)
 		return VM_FAULT_SIGBUS;
 
-	err = fsnotify_file_area_perm(fpin, mask, &pos, count);
+	err = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos, count);
 	fput(fpin);
 	if (err)
 		return VM_FAULT_SIGBUS;
diff --git a/mm/mmap.c b/mm/mmap.c
index cda01071c7b1f..70318936fd588 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -48,6 +48,7 @@ 
 #include <linux/sched/mm.h>
 #include <linux/ksm.h>
 #include <linux/memfd.h>
+#include <linux/fsnotify.h>
 
 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1151,6 +1152,17 @@  SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 		return ret;
 	}
 
+	if (file && unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
+		int mask = (prot & PROT_WRITE) ? MAY_WRITE : MAY_READ;
+		loff_t pos = pgoff >> PAGE_SHIFT;
+
+		ret = fsnotify_file_area_perm(file, mask, &pos, size);
+		if (ret) {
+			fput(file);
+			return ret;
+		}
+	}
+
 	ret = -EINVAL;
 
 	/* OK security check passed, take write lock + let it rip. */
diff --git a/mm/util.c b/mm/util.c
index b6b9684a14388..2dddeabac6098 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -23,6 +23,7 @@ 
 #include <linux/processor.h>
 #include <linux/sizes.h>
 #include <linux/compat.h>
+#include <linux/fsnotify.h>
 
 #include <linux/uaccess.h>
 
@@ -569,6 +570,12 @@  unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	LIST_HEAD(uf);
 
 	ret = security_mmap_file(file, prot, flag);
+	if (!ret && file && unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
+		int mask = (prot & PROT_WRITE) ? MAY_WRITE : MAY_READ;
+		loff_t pos = pgoff >> PAGE_SHIFT;
+
+		ret = fsnotify_file_area_perm(file, mask, &pos, len);
+	}
 	if (!ret) {
 		if (mmap_write_lock_killable(mm))
 			return -EINTR;