diff mbox series

[3/6] mm: Handle shared faults under the VMA lock

Message ID 20230927052505.2855872-4-willy@infradead.org (mailing list archive)
State New
Headers show
Series Handle more faults under the VMA lock | expand

Commit Message

Matthew Wilcox Sept. 27, 2023, 5:25 a.m. UTC
There are many implementations of ->fault and some of them depend on
mmap_lock being held.  All vm_ops that implement ->map_pages() end
up calling filemap_fault(), which I have audited to be sure it does
not rely on mmap_lock.  So (for now) key off ->map_pages existing
as being a flag to indicate that it's safe to call ->fault while
only holding the vma lock.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/memory.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

Comments

Suren Baghdasaryan Sept. 28, 2023, 12:46 a.m. UTC | #1
On Tue, Sep 26, 2023 at 10:25 PM Matthew Wilcox (Oracle)
<willy@infradead.org> wrote:
>
> There are many implementations of ->fault and some of them depend on
> mmap_lock being held.  All vm_ops that implement ->map_pages() end
> up calling filemap_fault(), which I have audited to be sure it does
> not rely on mmap_lock.  So (for now) key off ->map_pages existing
> as being a flag to indicate that it's safe to call ->fault while
> only holding the vma lock.
>
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  mm/memory.c | 22 ++++++++++++++++++----
>  1 file changed, 18 insertions(+), 4 deletions(-)
>
> diff --git a/mm/memory.c b/mm/memory.c
> index cff78c496728..0f3da4889230 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3042,6 +3042,21 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
>         count_vm_event(PGREUSE);
>  }
>
> +/*
> + * We could add a bitflag somewhere, but for now, we know that all
> + * vm_ops that have a ->map_pages have been audited and don't need
> + * the mmap_lock to be held.
> + */
> +static inline vm_fault_t vmf_maybe_unlock_vma(const struct vm_fault *vmf)
> +{
> +       struct vm_area_struct *vma = vmf->vma;
> +
> +       if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK))
> +               return 0;
> +       vma_end_read(vma);
> +       return VM_FAULT_RETRY;
> +}
> +
>  static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
>  {
>         struct vm_area_struct *vma = vmf->vma;
> @@ -4669,10 +4684,9 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
>         vm_fault_t ret, tmp;
>         struct folio *folio;
>
> -       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> -               vma_end_read(vma);
> -               return VM_FAULT_RETRY;
> -       }
> +       ret = vmf_maybe_unlock_vma(vmf);

The name of this new function in this context does not seem
appropriate to me. The logic of this check was that we can't rely on
VMA lock since it might not be sufficient, so we have to retry with
mmap_lock instead. With this change it seems like we intentionally try
to unlock the VMA here. IMHO this would be more understandable:

static inline bool is_vma_lock_sufficient(struct vm_area_struct *vma) {
    return vma->vm_ops->map_pages != NULL;
}

static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
{
...
        if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !is_vma_lock_sufficient(vma) {
                vma_end_read(vma);
               return VM_FAULT_RETRY;
        }

> +       if (ret)
> +               return ret;
>
>         ret = __do_fault(vmf);
>         if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
> --
> 2.40.1
>
Suren Baghdasaryan Sept. 28, 2023, 1:02 a.m. UTC | #2
On Wed, Sep 27, 2023 at 5:46 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Tue, Sep 26, 2023 at 10:25 PM Matthew Wilcox (Oracle)
> <willy@infradead.org> wrote:
> >
> > There are many implementations of ->fault and some of them depend on
> > mmap_lock being held.  All vm_ops that implement ->map_pages() end
> > up calling filemap_fault(), which I have audited to be sure it does
> > not rely on mmap_lock.  So (for now) key off ->map_pages existing
> > as being a flag to indicate that it's safe to call ->fault while
> > only holding the vma lock.
> >
> > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > ---
> >  mm/memory.c | 22 ++++++++++++++++++----
> >  1 file changed, 18 insertions(+), 4 deletions(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index cff78c496728..0f3da4889230 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -3042,6 +3042,21 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
> >         count_vm_event(PGREUSE);
> >  }
> >
> > +/*
> > + * We could add a bitflag somewhere, but for now, we know that all
> > + * vm_ops that have a ->map_pages have been audited and don't need
> > + * the mmap_lock to be held.
> > + */
> > +static inline vm_fault_t vmf_maybe_unlock_vma(const struct vm_fault *vmf)
> > +{
> > +       struct vm_area_struct *vma = vmf->vma;
> > +
> > +       if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK))
> > +               return 0;
> > +       vma_end_read(vma);
> > +       return VM_FAULT_RETRY;
> > +}
> > +
> >  static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
> >  {
> >         struct vm_area_struct *vma = vmf->vma;
> > @@ -4669,10 +4684,9 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
> >         vm_fault_t ret, tmp;
> >         struct folio *folio;
> >
> > -       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> > -               vma_end_read(vma);
> > -               return VM_FAULT_RETRY;
> > -       }
> > +       ret = vmf_maybe_unlock_vma(vmf);
>
> The name of this new function in this context does not seem
> appropriate to me. The logic of this check was that we can't rely on
> VMA lock since it might not be sufficient, so we have to retry with
> mmap_lock instead. With this change it seems like we intentionally try
> to unlock the VMA here. IMHO this would be more understandable:
>
> static inline bool is_vma_lock_sufficient(struct vm_area_struct *vma) {
>     return vma->vm_ops->map_pages != NULL;
> }
>
> static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
> {
> ...
>         if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !is_vma_lock_sufficient(vma) {
>                 vma_end_read(vma);
>                return VM_FAULT_RETRY;
>         }

Same comment for the rest of the patches where vmf_maybe_unlock_vma()
is being used. It would be great to have this logic coded in one
function like you do but I could not find an appropriate name that
would convey that "we want to check if the current lock is sufficient
and if not then we will drop it and retry". Maybe you or someone else
can think of a good name for it?

>
> > +       if (ret)
> > +               return ret;
> >
> >         ret = __do_fault(vmf);
> >         if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
> > --
> > 2.40.1
> >
Matthew Wilcox Sept. 28, 2023, 5:33 a.m. UTC | #3
On Wed, Sep 27, 2023 at 06:02:47PM -0700, Suren Baghdasaryan wrote:
> On Wed, Sep 27, 2023 at 5:46 PM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Tue, Sep 26, 2023 at 10:25 PM Matthew Wilcox (Oracle)
> > <willy@infradead.org> wrote:
> > >
> > > There are many implementations of ->fault and some of them depend on
> > > mmap_lock being held.  All vm_ops that implement ->map_pages() end
> > > up calling filemap_fault(), which I have audited to be sure it does
> > > not rely on mmap_lock.  So (for now) key off ->map_pages existing
> > > as being a flag to indicate that it's safe to call ->fault while
> > > only holding the vma lock.
> > >
> > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > > ---
> > >  mm/memory.c | 22 ++++++++++++++++++----
> > >  1 file changed, 18 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/mm/memory.c b/mm/memory.c
> > > index cff78c496728..0f3da4889230 100644
> > > --- a/mm/memory.c
> > > +++ b/mm/memory.c
> > > @@ -3042,6 +3042,21 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
> > >         count_vm_event(PGREUSE);
> > >  }
> > >
> > > +/*
> > > + * We could add a bitflag somewhere, but for now, we know that all
> > > + * vm_ops that have a ->map_pages have been audited and don't need
> > > + * the mmap_lock to be held.
> > > + */
> > > +static inline vm_fault_t vmf_maybe_unlock_vma(const struct vm_fault *vmf)
> > > +{
> > > +       struct vm_area_struct *vma = vmf->vma;
> > > +
> > > +       if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK))
> > > +               return 0;
> > > +       vma_end_read(vma);
> > > +       return VM_FAULT_RETRY;
> > > +}
> > > +
> > >  static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
> > >  {
> > >         struct vm_area_struct *vma = vmf->vma;
> > > @@ -4669,10 +4684,9 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
> > >         vm_fault_t ret, tmp;
> > >         struct folio *folio;
> > >
> > > -       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> > > -               vma_end_read(vma);
> > > -               return VM_FAULT_RETRY;
> > > -       }
> > > +       ret = vmf_maybe_unlock_vma(vmf);
> >
> > The name of this new function in this context does not seem
> > appropriate to me. The logic of this check was that we can't rely on
> > VMA lock since it might not be sufficient, so we have to retry with
> > mmap_lock instead. With this change it seems like we intentionally try
> > to unlock the VMA here. IMHO this would be more understandable:
> >
> > static inline bool is_vma_lock_sufficient(struct vm_area_struct *vma) {
> >     return vma->vm_ops->map_pages != NULL;
> > }

Originally I called this function vma_needs_mmap_lock() (with the
opposite polarity).  But I disliked the duplication of code ...

> Same comment for the rest of the patches where vmf_maybe_unlock_vma()
> is being used. It would be great to have this logic coded in one
> function like you do but I could not find an appropriate name that
> would convey that "we want to check if the current lock is sufficient
> and if not then we will drop it and retry". Maybe you or someone else
> can think of a good name for it?

Maybe

+static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
+{
+       struct vm_area_struct *vma = vmf->vma;
+
+       if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK))
+               return 0;
+       vma_end_read(vma);
+       return VM_FAULT_RETRY;
+}

I'm having trouble coming up with a name that doesn't imply it's a bool
predicate.
Suren Baghdasaryan Sept. 28, 2023, 3:02 p.m. UTC | #4
On Wed, Sep 27, 2023 at 10:33 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Wed, Sep 27, 2023 at 06:02:47PM -0700, Suren Baghdasaryan wrote:
> > On Wed, Sep 27, 2023 at 5:46 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > >
> > > On Tue, Sep 26, 2023 at 10:25 PM Matthew Wilcox (Oracle)
> > > <willy@infradead.org> wrote:
> > > >
> > > > There are many implementations of ->fault and some of them depend on
> > > > mmap_lock being held.  All vm_ops that implement ->map_pages() end
> > > > up calling filemap_fault(), which I have audited to be sure it does
> > > > not rely on mmap_lock.  So (for now) key off ->map_pages existing
> > > > as being a flag to indicate that it's safe to call ->fault while
> > > > only holding the vma lock.
> > > >
> > > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > > > ---
> > > >  mm/memory.c | 22 ++++++++++++++++++----
> > > >  1 file changed, 18 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/mm/memory.c b/mm/memory.c
> > > > index cff78c496728..0f3da4889230 100644
> > > > --- a/mm/memory.c
> > > > +++ b/mm/memory.c
> > > > @@ -3042,6 +3042,21 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
> > > >         count_vm_event(PGREUSE);
> > > >  }
> > > >
> > > > +/*
> > > > + * We could add a bitflag somewhere, but for now, we know that all
> > > > + * vm_ops that have a ->map_pages have been audited and don't need
> > > > + * the mmap_lock to be held.
> > > > + */
> > > > +static inline vm_fault_t vmf_maybe_unlock_vma(const struct vm_fault *vmf)
> > > > +{
> > > > +       struct vm_area_struct *vma = vmf->vma;
> > > > +
> > > > +       if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK))
> > > > +               return 0;
> > > > +       vma_end_read(vma);
> > > > +       return VM_FAULT_RETRY;
> > > > +}
> > > > +
> > > >  static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
> > > >  {
> > > >         struct vm_area_struct *vma = vmf->vma;
> > > > @@ -4669,10 +4684,9 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
> > > >         vm_fault_t ret, tmp;
> > > >         struct folio *folio;
> > > >
> > > > -       if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
> > > > -               vma_end_read(vma);
> > > > -               return VM_FAULT_RETRY;
> > > > -       }
> > > > +       ret = vmf_maybe_unlock_vma(vmf);
> > >
> > > The name of this new function in this context does not seem
> > > appropriate to me. The logic of this check was that we can't rely on
> > > VMA lock since it might not be sufficient, so we have to retry with
> > > mmap_lock instead. With this change it seems like we intentionally try
> > > to unlock the VMA here. IMHO this would be more understandable:
> > >
> > > static inline bool is_vma_lock_sufficient(struct vm_area_struct *vma) {
> > >     return vma->vm_ops->map_pages != NULL;
> > > }
>
> Originally I called this function vma_needs_mmap_lock() (with the
> opposite polarity).  But I disliked the duplication of code ...
>
> > Same comment for the rest of the patches where vmf_maybe_unlock_vma()
> > is being used. It would be great to have this logic coded in one
> > function like you do but I could not find an appropriate name that
> > would convey that "we want to check if the current lock is sufficient
> > and if not then we will drop it and retry". Maybe you or someone else
> > can think of a good name for it?
>
> Maybe
>
> +static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
> +{
> +       struct vm_area_struct *vma = vmf->vma;
> +
> +       if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK))
> +               return 0;
> +       vma_end_read(vma);
> +       return VM_FAULT_RETRY;
> +}
>
> I'm having trouble coming up with a name that doesn't imply it's a bool
> predicate.

Ok, I think it's better. At least it does not imply that our intention
is to unlock the VMA.

>
diff mbox series

Patch

diff --git a/mm/memory.c b/mm/memory.c
index cff78c496728..0f3da4889230 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3042,6 +3042,21 @@  static inline void wp_page_reuse(struct vm_fault *vmf)
 	count_vm_event(PGREUSE);
 }
 
+/*
+ * We could add a bitflag somewhere, but for now, we know that all
+ * vm_ops that have a ->map_pages have been audited and don't need
+ * the mmap_lock to be held.
+ */
+static inline vm_fault_t vmf_maybe_unlock_vma(const struct vm_fault *vmf)
+{
+	struct vm_area_struct *vma = vmf->vma;
+
+	if (vma->vm_ops->map_pages || !(vmf->flags & FAULT_FLAG_VMA_LOCK))
+		return 0;
+	vma_end_read(vma);
+	return VM_FAULT_RETRY;
+}
+
 static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
@@ -4669,10 +4684,9 @@  static vm_fault_t do_shared_fault(struct vm_fault *vmf)
 	vm_fault_t ret, tmp;
 	struct folio *folio;
 
-	if (vmf->flags & FAULT_FLAG_VMA_LOCK) {
-		vma_end_read(vma);
-		return VM_FAULT_RETRY;
-	}
+	ret = vmf_maybe_unlock_vma(vmf);
+	if (ret)
+		return ret;
 
 	ret = __do_fault(vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))