diff mbox series

[v2,4/5] mm: make vma cache SLAB_TYPESAFE_BY_RCU

Message ID 20241112194635.444146-5-surenb@google.com (mailing list archive)
State New
Headers show
Series move per-vma lock into vm_area_struct | expand

Commit Message

Suren Baghdasaryan Nov. 12, 2024, 7:46 p.m. UTC
To enable SLAB_TYPESAFE_BY_RCU for vma cache we need to ensure that
object reuse before RCU grace period is over will be detected inside
lock_vma_under_rcu().
lock_vma_under_rcu() enters RCU read section, finds the vma at the
given address, locks the vma and checks if it got detached or remapped
to cover a different address range. These last checks are there
to ensure that the vma was not modified after we found it but before
locking it. Vma reuse introduces a possibility that in between those
events of finding and locking the vma, it can get detached, reused,
added into a tree and be marked as attached. Current checks will help
detecting cases when:
- vma was reused but not yet added into the tree (detached check)
- vma was reused at a different address range (address check)
If vma is covering a new address range which still includes the address
we were looking for, it's not a problem unless the reused vma was added
into a different address space. Therefore checking that vma->vm_mm is
still the same is the only missing check to detect vma reuse.
Add this missing check into lock_vma_under_rcu() and change vma cache
to include SLAB_TYPESAFE_BY_RCU. This will facilitate vm_area_struct
reuse and will minimize the number of call_rcu() calls.
Adding vm_freeptr into vm_area_struct avoids bloating that structure.
lock_vma_under_rcu()'s check of the detached flag guarantees that the vma
is valid and attached to a tree, therefore unioning vm_freeptr with
vm_start/vm_end is not an issue even though lock_vma_under_rcu() is
using them.
As part of this change freeptr_t declaration is moved into mm_types.h
to avoid circular dependencies between mm_types.h and slab.h.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm_types.h | 10 +++++++---
 include/linux/slab.h     |  6 ------
 kernel/fork.c            | 29 +++++++++++++----------------
 mm/memory.c              |  2 +-
 4 files changed, 21 insertions(+), 26 deletions(-)

Comments

Suren Baghdasaryan Nov. 13, 2024, 2:57 a.m. UTC | #1
On Tue, Nov 12, 2024 at 11:46 AM 'Suren Baghdasaryan' via kernel-team
<kernel-team@android.com> wrote:
>
> To enable SLAB_TYPESAFE_BY_RCU for vma cache we need to ensure that
> object reuse before RCU grace period is over will be detected inside
> lock_vma_under_rcu().
> lock_vma_under_rcu() enters RCU read section, finds the vma at the
> given address, locks the vma and checks if it got detached or remapped
> to cover a different address range. These last checks are there
> to ensure that the vma was not modified after we found it but before
> locking it. Vma reuse introduces a possibility that in between those
> events of finding and locking the vma, it can get detached, reused,
> added into a tree and be marked as attached. Current checks will help
> detecting cases when:
> - vma was reused but not yet added into the tree (detached check)
> - vma was reused at a different address range (address check)
> If vma is covering a new address range which still includes the address
> we were looking for, it's not a problem unless the reused vma was added
> into a different address space. Therefore checking that vma->vm_mm is
> still the same is the the only missing check to detect vma reuse.

Thinking about this some more, I don't think this works. I'm relying
on vma_start_read() to stabilize the vma, however the lock I'm taking
is part of the vma which can be reused from under us. So, the lock I'm
taking might be reinitialized after I take the lock...
I need to figure out a way to stabilize the vma in some other manner
before taking this lock.

> Add this missing check into lock_vma_under_rcu() and change vma cache
> to include SLAB_TYPESAFE_BY_RCU. This will facilitate vm_area_struct
> reuse and will minimize the number of call_rcu() calls.
> Adding vm_freeptr into vm_area_struct avoids bloating that structure.
> lock_vma_under_rcu() checks of the detached flag guarantees that vma
> is valid and attached to a tree, therefore unioning vm_freeptr with
> vm_start/vm_end is not an issue even though lock_vma_under_rcu() is
> using them.
> As part of this change freeptr_t declaration is moved into mm_types.h
> to avoid circular dependencies between mm_types.h and slab.h.
>
> Signed-off-by: Suren Baghdasaryan <surenb@google.com>
> ---
>  include/linux/mm_types.h | 10 +++++++---
>  include/linux/slab.h     |  6 ------
>  kernel/fork.c            | 29 +++++++++++++----------------
>  mm/memory.c              |  2 +-
>  4 files changed, 21 insertions(+), 26 deletions(-)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 5c4bfdcfac72..37580cc7bec0 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -32,6 +32,12 @@
>  struct address_space;
>  struct mem_cgroup;
>
> +/*
> + * freeptr_t represents a SLUB freelist pointer, which might be encoded
> + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
> + */
> +typedef struct { unsigned long v; } freeptr_t;
> +
>  /*
>   * Each physical page in the system has a struct page associated with
>   * it to keep track of whatever it is we are using the page for at the
> @@ -673,9 +679,7 @@ struct vm_area_struct {
>                         unsigned long vm_start;
>                         unsigned long vm_end;
>                 };
> -#ifdef CONFIG_PER_VMA_LOCK
> -               struct rcu_head vm_rcu; /* Used for deferred freeing. */
> -#endif
> +               freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
>         };
>
>         /*
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index b35e2db7eb0e..cb45db2402ac 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -212,12 +212,6 @@ enum _slab_flag_bits {
>  #define SLAB_NO_OBJ_EXT                __SLAB_FLAG_UNUSED
>  #endif
>
> -/*
> - * freeptr_t represents a SLUB freelist pointer, which might be encoded
> - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
> - */
> -typedef struct { unsigned long v; } freeptr_t;
> -
>  /*
>   * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
>   *
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 7823797e31d2..946c3f9a9342 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -478,25 +478,15 @@ void __vm_area_free(struct vm_area_struct *vma)
>         kmem_cache_free(vm_area_cachep, vma);
>  }
>
> -#ifdef CONFIG_PER_VMA_LOCK
> -static void vm_area_free_rcu_cb(struct rcu_head *head)
> +void vm_area_free(struct vm_area_struct *vma)
>  {
> -       struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
> -                                                 vm_rcu);
> -
> +#ifdef CONFIG_PER_VMA_LOCK
> +       /* The vma should be detached while being destroyed. */
> +       VM_BUG_ON_VMA(!is_vma_detached(vma), vma);
>         /* The vma should not be locked while being destroyed. */
>         VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma);
> -       __vm_area_free(vma);
> -}
>  #endif
> -
> -void vm_area_free(struct vm_area_struct *vma)
> -{
> -#ifdef CONFIG_PER_VMA_LOCK
> -       call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
> -#else
>         __vm_area_free(vma);
> -#endif
>  }
>
>  static void account_kernel_stack(struct task_struct *tsk, int account)
> @@ -3115,6 +3105,11 @@ void __init mm_cache_init(void)
>
>  void __init proc_caches_init(void)
>  {
> +       struct kmem_cache_args args = {
> +               .use_freeptr_offset = true,
> +               .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
> +       };
> +
>         sighand_cachep = kmem_cache_create("sighand_cache",
>                         sizeof(struct sighand_struct), 0,
>                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
> @@ -3131,9 +3126,11 @@ void __init proc_caches_init(void)
>                         sizeof(struct fs_struct), 0,
>                         SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
>                         NULL);
> -       vm_area_cachep = KMEM_CACHE(vm_area_struct,
> -                       SLAB_HWCACHE_ALIGN|SLAB_NO_MERGE|SLAB_PANIC|
> +       vm_area_cachep = kmem_cache_create("vm_area_struct",
> +                       sizeof(struct vm_area_struct), &args,
> +                       SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
>                         SLAB_ACCOUNT);
> +
>         mmap_init();
>         nsproxy_cache_init();
>  }
> diff --git a/mm/memory.c b/mm/memory.c
> index d0197a0c0996..9c414c81f14a 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -6279,7 +6279,7 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
>                 goto inval;
>
>         /* Check if the VMA got isolated after we found it */
> -       if (is_vma_detached(vma)) {
> +       if (is_vma_detached(vma) || vma->vm_mm != mm) {
>                 vma_end_read(vma);
>                 count_vm_vma_lock_event(VMA_LOCK_MISS);
>                 /* The area was replaced with another one */
> --
> 2.47.0.277.g8800431eea-goog
>
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>
Hugh Dickins Nov. 13, 2024, 5:08 a.m. UTC | #2
On Tue, 12 Nov 2024, Suren Baghdasaryan wrote:
> 
> Thinking about this some more, I don't think this works. I'm relying
> on vma_start_read() to stabilize the vma, however the lock I'm taking
> is part of the vma which can be reused from under us. So, the lock I'm
> taking might be reinitialized after I take the lock...
> I need to figure out a way to stabilize the vma in some other manner
> before taking this lock.

(I'm not paying attention and following the patches, I just happened
to notice this remark: forgive me if I'm out of context and have
misunderstood, but hope this might help:)

But this is exactly the problem SLAB_TYPESAFE_BY_RCU was invented for.
You just have to be careful that the locks are initialized only when the
slab is first created (allocated from buddy), not reinitialized whenever
a new object is allocated from that slab.

Hugh
Suren Baghdasaryan Nov. 13, 2024, 6:03 a.m. UTC | #3
On Tue, Nov 12, 2024 at 9:08 PM Hugh Dickins <hughd@google.com> wrote:
>
> On Tue, 12 Nov 2024, Suren Baghdasaryan wrote:
> >
> > Thinking about this some more, I don't think this works. I'm relying
> > on vma_start_read() to stabilize the vma, however the lock I'm taking
> > is part of the vma which can be reused from under us. So, the lock I'm
> > taking might be reinitialized after I take the lock...
> > I need to figure out a way to stabilize the vma in some other manner
> > before taking this lock.
>
> (I'm not paying attention and following the patches, I just happened
> to notice this remark: forgive me if I'm out of context and have
> misunderstood, but hope this might help:)
>
> But this is exactly the problem SLAB_TYPESAFE_BY_RCU was invented for.
> You just have to be careful that the locks are initialized only when the
> slab is first created (allocated from buddy), not reinitialized whenever
> a new object is allocated from that slab.

Hi Hugh!
I'm looking into SLAB_TYPESAFE_BY_RCU implementation and trying to
figure out if initializing the lock in the ctor() of the cache as
mentioned in the comment here:
https://elixir.bootlin.com/linux/v6.12-rc7/source/include/linux/slab.h#L127
would help my case. I assume that's what you are hinting here?

>
> Hugh
Hugh Dickins Nov. 13, 2024, 6:52 a.m. UTC | #4
On Tue, 12 Nov 2024, Suren Baghdasaryan wrote:
> On Tue, Nov 12, 2024 at 9:08 PM Hugh Dickins <hughd@google.com> wrote:
> > On Tue, 12 Nov 2024, Suren Baghdasaryan wrote:
> > >
> > > Thinking about this some more, I don't think this works. I'm relying
> > > on vma_start_read() to stabilize the vma, however the lock I'm taking
> > > is part of the vma which can be reused from under us. So, the lock I'm
> > > taking might be reinitialized after I take the lock...
> > > I need to figure out a way to stabilize the vma in some other manner
> > > before taking this lock.
> >
> > (I'm not paying attention and following the patches, I just happened
> > to notice this remark: forgive me if I'm out of context and have
> > misunderstood, but hope this might help:)
> >
> > But this is exactly the problem SLAB_TYPESAFE_BY_RCU was invented for.
> > You just have to be careful that the locks are initialized only when the
> > slab is first created (allocated from buddy), not reinitialized whenever
> > a new object is allocated from that slab.
> 
> Hi Hugh!
> I'm looking into SLAB_TYPESAFE_BY_RCU implementation and trying to
> figure out if initializing the lock in the ctor() of the cache as
> mentioned in the comment here:
> https://elixir.bootlin.com/linux/v6.12-rc7/source/include/linux/slab.h#L127
> would help my case. I assume that's what you are hinting here?

Yes, if I'm "hinting", it's because offhand I forget the right names:
"ctor", yes, that sounds right.

Just grep around for examples of how it is used: there must be plenty
now. but anon_vma is what it was first used for.

But given the title of this patch, I'm surprised it's new to you.

Hugh
Suren Baghdasaryan Nov. 13, 2024, 8:19 a.m. UTC | #5
On Tue, Nov 12, 2024 at 10:52 PM Hugh Dickins <hughd@google.com> wrote:
>
> On Tue, 12 Nov 2024, Suren Baghdasaryan wrote:
> > On Tue, Nov 12, 2024 at 9:08 PM Hugh Dickins <hughd@google.com> wrote:
> > > On Tue, 12 Nov 2024, Suren Baghdasaryan wrote:
> > > >
> > > > Thinking about this some more, I don't think this works. I'm relying
> > > > on vma_start_read() to stabilize the vma, however the lock I'm taking
> > > > is part of the vma which can be reused from under us. So, the lock I'm
> > > > taking might be reinitialized after I take the lock...
> > > > I need to figure out a way to stabilize the vma in some other manner
> > > > before taking this lock.
> > >
> > > (I'm not paying attention and following the patches, I just happened
> > > to notice this remark: forgive me if I'm out of context and have
> > > misunderstood, but hope this might help:)
> > >
> > > But this is exactly the problem SLAB_TYPESAFE_BY_RCU was invented for.
> > > You just have to be careful that the locks are initialized only when the
> > > slab is first created (allocated from buddy), not reinitialized whenever
> > > a new object is allocated from that slab.
> >
> > Hi Hugh!
> > I'm looking into SLAB_TYPESAFE_BY_RCU implementation and trying to
> > figure out if initializing the lock in the ctor() of the cache as
> > mentioned in the comment here:
> > https://elixir.bootlin.com/linux/v6.12-rc7/source/include/linux/slab.h#L127
> > would help my case. I assume that's what you are hinting here?
>
> Yes, if I'm "hinting", it's because offhand I forget the right names:
> "ctor", yes, that sounds right.

Just wanted to make sure I understood you correctly. Thanks for confirmation.

>
> Just grep around for examples of how it is used: there must be plenty
> now. but anon_vma is what it was first used for.

Yeah, there are plenty of examples now.

>
> But given the title of this patch, I'm surprised it's new to you.

Thinking about issues arising from possible object reuse is indeed new
to me, that's why I missed the lock reinitialization issue. I think I
know how to fix that now.
Thanks,
Suren.

>
> Hugh
Vlastimil Babka Nov. 13, 2024, 8:58 a.m. UTC | #6
On 11/12/24 20:46, Suren Baghdasaryan wrote:
> To enable SLAB_TYPESAFE_BY_RCU for vma cache we need to ensure that
> object reuse before RCU grace period is over will be detected inside
> lock_vma_under_rcu().
> lock_vma_under_rcu() enters RCU read section, finds the vma at the
> given address, locks the vma and checks if it got detached or remapped
> to cover a different address range. These last checks are there
> to ensure that the vma was not modified after we found it but before
> locking it. Vma reuse introduces a possibility that in between those
> events of finding and locking the vma, it can get detached, reused,
> added into a tree and be marked as attached. Current checks will help
> detecting cases when:
> - vma was reused but not yet added into the tree (detached check)
> - vma was reused at a different address range (address check)
> If vma is covering a new address range which still includes the address
> we were looking for, it's not a problem unless the reused vma was added
> into a different address space. Therefore checking that vma->vm_mm is
> still the same is the the only missing check to detect vma reuse.

Hi, I was wondering if we actually need the detached flag. Couldn't
"detached" simply mean vma->vm_mm == NULL and we save 4 bytes? Do we ever
need a vma that's detached but still has a mm pointer? I'd hope the places
that set detached to false have the mm pointer around so it's not inconvenient.
Liam R. Howlett Nov. 13, 2024, 12:38 p.m. UTC | #7
* Vlastimil Babka <vbabka@suse.cz> [241113 03:58]:
> On 11/12/24 20:46, Suren Baghdasaryan wrote:
> > To enable SLAB_TYPESAFE_BY_RCU for vma cache we need to ensure that
> > object reuse before RCU grace period is over will be detected inside
> > lock_vma_under_rcu().
> > lock_vma_under_rcu() enters RCU read section, finds the vma at the
> > given address, locks the vma and checks if it got detached or remapped
> > to cover a different address range. These last checks are there
> > to ensure that the vma was not modified after we found it but before
> > locking it. Vma reuse introduces a possibility that in between those
> > events of finding and locking the vma, it can get detached, reused,
> > added into a tree and be marked as attached. Current checks will help
> > detecting cases when:
> > - vma was reused but not yet added into the tree (detached check)
> > - vma was reused at a different address range (address check)
> > If vma is covering a new address range which still includes the address
> > we were looking for, it's not a problem unless the reused vma was added
> > into a different address space. Therefore checking that vma->vm_mm is
> > still the same is the the only missing check to detect vma reuse.
> 
> Hi, I was wondering if we actually need the detached flag. Couldn't
> "detached" simply mean vma->vm_mm == NULL and we save 4 bytes? Do we ever
> need a vma that's detached but still has a mm pointer? I'd hope the places
> that set detached to false have the mm pointer around so it's not inconvenient.

I think the gate vmas ruin this plan.
Matthew Wilcox Nov. 13, 2024, 1:57 p.m. UTC | #8
On Wed, Nov 13, 2024 at 07:38:02AM -0500, Liam R. Howlett wrote:
> > Hi, I was wondering if we actually need the detached flag. Couldn't
> > "detached" simply mean vma->vm_mm == NULL and we save 4 bytes? Do we ever
> > need a vma that's detached but still has a mm pointer? I'd hope the places
> > that set detached to false have the mm pointer around so it's not inconvenient.
> 
> I think the gate vmas ruin this plan.

But the gate VMAs aren't to be found in the VMA tree.  Used to be that
was because the VMA tree was the injective RB tree and so VMAs could
only be in one tree at a time.  We could change that now!

Anyway, we could use (void *)1 instead of NULL to indicate a "detached"
VMA if we need to distinguish between a detached VMA and a gate VMA.
Suren Baghdasaryan Nov. 13, 2024, 3:25 p.m. UTC | #9
On Wed, Nov 13, 2024 at 7:23 AM 'Liam R. Howlett' via kernel-team
<kernel-team@android.com> wrote:
>
> * Matthew Wilcox <willy@infradead.org> [241113 08:57]:
> > On Wed, Nov 13, 2024 at 07:38:02AM -0500, Liam R. Howlett wrote:
> > > > Hi, I was wondering if we actually need the detached flag. Couldn't
> > > > "detached" simply mean vma->vm_mm == NULL and we save 4 bytes? Do we ever
> > > > need a vma that's detached but still has a mm pointer? I'd hope the places
> > > > that set detached to false have the mm pointer around so it's not inconvenient.
> > >
> > > I think the gate vmas ruin this plan.
> >
> > But the gate VMAs aren't to be found in the VMA tree.  Used to be that
> > was because the VMA tree was the injective RB tree and so VMAs could
> > only be in one tree at a time.  We could change that now!
>
> \o/
>
> >
> > Anyway, we could use (void *)1 instead of NULL to indicate a "detached"
> > VMA if we need to distinguish between a detached VMA and a gate VMA.
>
> I was thinking a pointer to itself vma->vm_mm = vma, then a check for
> this, instead of null like we do today.

The motivation for having a separate detached flag was that vma->vm_mm
is used when read/write locking the vma, so it has to stay valid even
when vma gets detached. Maybe we can be more cautious in
vma_start_read()/vma_start_write() about it but I don't recall if
those were the only places that was an issue.

>
> Either way, we should make it a function so it's easier to reuse for
> whatever we need in the future, wdyt?
>
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>
Liam R. Howlett Nov. 13, 2024, 3:29 p.m. UTC | #10
* Suren Baghdasaryan <surenb@google.com> [241113 10:25]:
> On Wed, Nov 13, 2024 at 7:23 AM 'Liam R. Howlett' via kernel-team
> <kernel-team@android.com> wrote:
> >
> > * Matthew Wilcox <willy@infradead.org> [241113 08:57]:
> > > On Wed, Nov 13, 2024 at 07:38:02AM -0500, Liam R. Howlett wrote:
> > > > > Hi, I was wondering if we actually need the detached flag. Couldn't
> > > > > "detached" simply mean vma->vm_mm == NULL and we save 4 bytes? Do we ever
> > > > > need a vma that's detached but still has a mm pointer? I'd hope the places
> > > > > that set detached to false have the mm pointer around so it's not inconvenient.
> > > >
> > > > I think the gate vmas ruin this plan.
> > >
> > > But the gate VMAs aren't to be found in the VMA tree.  Used to be that
> > > was because the VMA tree was the injective RB tree and so VMAs could
> > > only be in one tree at a time.  We could change that now!
> >
> > \o/
> >
> > >
> > > Anyway, we could use (void *)1 instead of NULL to indicate a "detached"
> > > VMA if we need to distinguish between a detached VMA and a gate VMA.
> >
> > I was thinking a pointer to itself vma->vm_mm = vma, then a check for
> > this, instead of null like we do today.
> 
> The motivation for having a separate detached flag was that vma->vm_mm
> is used when read/write locking the vma, so it has to stay valid even
> when vma gets detached. Maybe we can be more cautious in
> vma_start_read()/vma_start_write() about it but I don't recall if
> those were the only places that was an issue.

We have the mm from the callers though, so it could be passed in?

> 
> >
> > Either way, we should make it a function so it's easier to reuse for
> > whatever we need in the future, wdyt?
> >
> > To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
> >
Suren Baghdasaryan Nov. 13, 2024, 3:47 p.m. UTC | #11
On Wed, Nov 13, 2024 at 7:29 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
>
> * Suren Baghdasaryan <surenb@google.com> [241113 10:25]:
> > On Wed, Nov 13, 2024 at 7:23 AM 'Liam R. Howlett' via kernel-team
> > <kernel-team@android.com> wrote:
> > >
> > > * Matthew Wilcox <willy@infradead.org> [241113 08:57]:
> > > > On Wed, Nov 13, 2024 at 07:38:02AM -0500, Liam R. Howlett wrote:
> > > > > > Hi, I was wondering if we actually need the detached flag. Couldn't
> > > > > > "detached" simply mean vma->vm_mm == NULL and we save 4 bytes? Do we ever
> > > > > > need a vma that's detached but still has a mm pointer? I'd hope the places
> > > > > > that set detached to false have the mm pointer around so it's not inconvenient.
> > > > >
> > > > > I think the gate vmas ruin this plan.
> > > >
> > > > But the gate VMAs aren't to be found in the VMA tree.  Used to be that
> > > > was because the VMA tree was the injective RB tree and so VMAs could
> > > > only be in one tree at a time.  We could change that now!
> > >
> > > \o/
> > >
> > > >
> > > > Anyway, we could use (void *)1 instead of NULL to indicate a "detached"
> > > > VMA if we need to distinguish between a detached VMA and a gate VMA.
> > >
> > > I was thinking a pointer to itself vma->vm_mm = vma, then a check for
> > > this, instead of null like we do today.
> >
> > The motivation for having a separate detached flag was that vma->vm_mm
> > is used when read/write locking the vma, so it has to stay valid even
> > when vma gets detached. Maybe we can be more cautious in
> > vma_start_read()/vma_start_write() about it but I don't recall if
> > those were the only places that was an issue.
>
> We have the mm form the callers though, so it could be passed in?

Let me try and see if something else blows up. When I was implementing
per-vma locks I thought about using vma->vm_mm to indicate detached
state but there were some issues that caused me reconsider.

>
> >
> > >
> > > Either way, we should make it a function so it's easier to reuse for
> > > whatever we need in the future, wdyt?
> > >
> > > To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
> > >
Jann Horn Nov. 13, 2024, 4:44 p.m. UTC | #12
On Wed, Nov 13, 2024 at 4:23 PM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> * Matthew Wilcox <willy@infradead.org> [241113 08:57]:
> > On Wed, Nov 13, 2024 at 07:38:02AM -0500, Liam R. Howlett wrote:
> > > > Hi, I was wondering if we actually need the detached flag. Couldn't
> > > > "detached" simply mean vma->vm_mm == NULL and we save 4 bytes? Do we ever
> > > > need a vma that's detached but still has a mm pointer? I'd hope the places
> > > > that set detached to false have the mm pointer around so it's not inconvenient.
> > >
> > > I think the gate vmas ruin this plan.
> >
> > But the gate VMAs aren't to be found in the VMA tree.  Used to be that
> > was because the VMA tree was the injective RB tree and so VMAs could
> > only be in one tree at a time.  We could change that now!
>
> \o/
>
> >
> > Anyway, we could use (void *)1 instead of NULL to indicate a "detached"
> > VMA if we need to distinguish between a detached VMA and a gate VMA.
>
> I was thinking a pointer to itself vma->vm_mm = vma, then a check for
> this, instead of null like we do today.

Sidenote:
Something like NULL or (void*)1 is fine with me but please don't do
pointer-to-itself - we shouldn't unnecessarily store a pointer to an
object of one type in a pointer field of an incompatible type, that
increases the risk of creating type confusion issues (both in the
memory corruption sense and in the Spectre sense). I know MM already
has several places where similar stuff can happen (in particular
page->mapping), but here it seems like unnecessary risk to me.
Suren Baghdasaryan Nov. 13, 2024, 7:05 p.m. UTC | #13
On Wed, Nov 13, 2024 at 7:47 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Wed, Nov 13, 2024 at 7:29 AM Liam R. Howlett <Liam.Howlett@oracle.com> wrote:
> >
> > * Suren Baghdasaryan <surenb@google.com> [241113 10:25]:
> > > On Wed, Nov 13, 2024 at 7:23 AM 'Liam R. Howlett' via kernel-team
> > > <kernel-team@android.com> wrote:
> > > >
> > > > * Matthew Wilcox <willy@infradead.org> [241113 08:57]:
> > > > > On Wed, Nov 13, 2024 at 07:38:02AM -0500, Liam R. Howlett wrote:
> > > > > > > Hi, I was wondering if we actually need the detached flag. Couldn't
> > > > > > > "detached" simply mean vma->vm_mm == NULL and we save 4 bytes? Do we ever
> > > > > > > need a vma that's detached but still has a mm pointer? I'd hope the places
> > > > > > > that set detached to false have the mm pointer around so it's not inconvenient.
> > > > > >
> > > > > > I think the gate vmas ruin this plan.
> > > > >
> > > > > But the gate VMAs aren't to be found in the VMA tree.  Used to be that
> > > > > was because the VMA tree was the injective RB tree and so VMAs could
> > > > > only be in one tree at a time.  We could change that now!
> > > >
> > > > \o/
> > > >
> > > > >
> > > > > Anyway, we could use (void *)1 instead of NULL to indicate a "detached"
> > > > > VMA if we need to distinguish between a detached VMA and a gate VMA.
> > > >
> > > > I was thinking a pointer to itself vma->vm_mm = vma, then a check for
> > > > this, instead of null like we do today.
> > >
> > > The motivation for having a separate detached flag was that vma->vm_mm
> > > is used when read/write locking the vma, so it has to stay valid even
> > > when vma gets detached. Maybe we can be more cautious in
> > > vma_start_read()/vma_start_write() about it but I don't recall if
> > > those were the only places that was an issue.
> >
> > We have the mm form the callers though, so it could be passed in?
>
> Let me try and see if something else blows up. When I was implementing
> per-vma locks I thought about using vma->vm_mm to indicate detached
> state but there were some issues that caused me reconsider.

Yeah, a quick change reveals the first mine explosion:

[    2.838900] BUG: kernel NULL pointer dereference, address: 0000000000000480
[    2.840671] #PF: supervisor read access in kernel mode
[    2.841958] #PF: error_code(0x0000) - not-present page
[    2.843248] PGD 800000010835a067 P4D 800000010835a067 PUD 10835b067 PMD 0
[    2.844920] Oops: Oops: 0000 [#1] PREEMPT SMP PTI
[    2.846078] CPU: 2 UID: 0 PID: 1 Comm: init Not tainted
6.12.0-rc6-00258-ga587fcd91b06-dirty #111
[    2.848277] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[    2.850673] RIP: 0010:unmap_vmas+0x84/0x190
[    2.851717] Code: 00 00 00 00 48 c7 44 24 48 00 00 00 00 48 c7 44
24 18 00 00 00 00 48 89 44 24 28 4c 89 44 24 38 e8 b1 c0 d1 00 48 8b
44 24 28 <48> 83 b8 80 04 00 00 00 0f 85 dd 00 00 00 45 0f b6 ed 49 83
ec 01
[    2.856287] RSP: 0000:ffffa298c0017a18 EFLAGS: 00010246
[    2.857599] RAX: 0000000000000000 RBX: 00007f48ccbb4000 RCX: 00007f48ccbb4000
[    2.859382] RDX: ffff8918c26401e0 RSI: ffffa298c0017b98 RDI: ffffa298c0017ab0
[    2.861156] RBP: 00007f48ccdb6000 R08: 00007f48ccdb6000 R09: 0000000000000001
[    2.862941] R10: 0000000000000040 R11: ffff8918c2637108 R12: 0000000000000001
[    2.864719] R13: 0000000000000001 R14: ffff8918c26401e0 R15: ffffa298c0017b98
[    2.866472] FS:  0000000000000000(0000) GS:ffff8927bf080000(0000)
knlGS:0000000000000000
[    2.868439] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    2.869877] CR2: 0000000000000480 CR3: 000000010263e000 CR4: 0000000000750ef0
[    2.871661] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[    2.873419] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[    2.875185] PKRU: 55555554
[    2.875871] Call Trace:
[    2.876503]  <TASK>
[    2.877047]  ? __die+0x1e/0x60
[    2.877824]  ? page_fault_oops+0x17b/0x4a0
[    2.878857]  ? exc_page_fault+0x6b/0x150
[    2.879841]  ? asm_exc_page_fault+0x26/0x30
[    2.880886]  ? unmap_vmas+0x84/0x190
[    2.881783]  ? unmap_vmas+0x7f/0x190
[    2.882680]  vms_clear_ptes+0x106/0x160
[    2.883621]  vms_complete_munmap_vmas+0x53/0x170
[    2.884762]  do_vmi_align_munmap+0x15e/0x1d0
[    2.885838]  do_vmi_munmap+0xcb/0x160
[    2.886760]  __vm_munmap+0xa4/0x150
[    2.887637]  elf_load+0x1c4/0x250
[    2.888473]  load_elf_binary+0xabb/0x1680
[    2.889476]  ? __kernel_read+0x111/0x320
[    2.890458]  ? load_misc_binary+0x1bc/0x2c0
[    2.891510]  bprm_execve+0x23e/0x5e0
[    2.892408]  kernel_execve+0xf3/0x140
[    2.893331]  ? __pfx_kernel_init+0x10/0x10
[    2.894356]  kernel_init+0xe5/0x1c0
[    2.895241]  ret_from_fork+0x2c/0x50
[    2.896141]  ? __pfx_kernel_init+0x10/0x10
[    2.897164]  ret_from_fork_asm+0x1a/0x30
[    2.898148]  </TASK>

Looks like we are detaching VMAs and then unmapping them, where
vms_clear_ptes() uses vms->vma->vm_mm. I'll try to clean up this and
other paths and will see how many changes are required to make this
work.

>
> >
> > >
> > > >
> > > > Either way, we should make it a function so it's easier to reuse for
> > > > whatever we need in the future, wdyt?
> > > >
> > > > To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
> > > >
Matthew Wilcox Nov. 13, 2024, 8:59 p.m. UTC | #14
On Wed, Nov 13, 2024 at 05:44:00PM +0100, Jann Horn wrote:
> Something like NULL or (void*)1 is fine with me but please don't do
> pointer-to-itself - we shouldn't unnecessarily store a pointer to an
> object of one type in a pointer field of an incompatible type, that
> increases the risk of creating type confusion issues (both in the
> memory corruption sense and in the Spectre sense). I know MM already
> has several places where similar stuff can happen (in particular
> page->mapping), but here it seems like unnecessary risk to me.

Hm?  I don't think page->mapping can ever point at page.  As far as I
know, we have four cases, discriminated by the bottom two bits:

0 - NULL or address_space
1 - anon_vma
2 - movable_ops
3 - ksm_stable_node

In fact, we're almost done eliminating page->mapping.  Just a few
filesystems and device drivers left to go.

Would it be helpful if we did:

-	struct address_space *mapping;
+	union {
+		struct address_space *mapping;
+		unsigned long raw_mapping;
+	};

and had non-filesystems use raw_mapping and do the masking?
Jann Horn Nov. 13, 2024, 9:23 p.m. UTC | #15
On Wed, Nov 13, 2024 at 9:59 PM Matthew Wilcox <willy@infradead.org> wrote:
> On Wed, Nov 13, 2024 at 05:44:00PM +0100, Jann Horn wrote:
> > Something like NULL or (void*)1 is fine with me but please don't do
> > pointer-to-itself - we shouldn't unnecessarily store a pointer to an
> > object of one type in a pointer field of an incompatible type, that
> > increases the risk of creating type confusion issues (both in the
> > memory corruption sense and in the Spectre sense). I know MM already
> > has several places where similar stuff can happen (in particular
> > page->mapping), but here it seems like unnecessary risk to me.
>
> Hm?  I don't think page->mapping can ever point at page.  As far as I
> know, we have four cases, discriminated by the bottom two bits:
>
> 0 - NULL or address_space
> 1 - anon_vma
> 2 - movable_ops
> 3 - ksm_stable_node

Oh, I didn't even know about cases 2 and 3.

Ah, yes, I didn't mean it can point at itself, I meant the pattern of
having a pointer declared as "points to type A" ("struct address_space
*mapping") while it actually points at other types sometimes.

> In fact, we're almost done eliminating page->mapping.  Just a few
> filesystems and device drivers left to go.

Ah, you mean by replacing it with folio->mapping as in
https://lore.kernel.org/all/20241025190822.1319162-4-willy@infradead.org/
?

> Would it be halpful if we did:
>
> -       struct address_space *mapping;
> +       union {
> +               struct address_space *mapping;
> +               unsigned long raw_mapping;
> +       };
>
> and had non-filesystems use raw_mapping and do the masking?

While I think that would look a tiny bit tidier, I don't think it
would make a significant difference for page->mapping or
folio->mapping. In the context of the VMA field, the question is about
whether we make it possible for the same memory location to contain
values of different types, which I think increases the risk of
programmer mistakes or speculative confusions; while in the context of
the page->mapping field, the same memory location can contain values
of different types either way. So while aesthetically I think having a
union for the mapping field would look a little nicer, I don't
actually see much benefit in making such a change.
diff mbox series

Patch

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5c4bfdcfac72..37580cc7bec0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -32,6 +32,12 @@ 
 struct address_space;
 struct mem_cgroup;
 
+/*
+ * freeptr_t represents a SLUB freelist pointer, which might be encoded
+ * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
+ */
+typedef struct { unsigned long v; } freeptr_t;
+
 /*
  * Each physical page in the system has a struct page associated with
  * it to keep track of whatever it is we are using the page for at the
@@ -673,9 +679,7 @@  struct vm_area_struct {
 			unsigned long vm_start;
 			unsigned long vm_end;
 		};
-#ifdef CONFIG_PER_VMA_LOCK
-		struct rcu_head vm_rcu;	/* Used for deferred freeing. */
-#endif
+		freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
 	};
 
 	/*
diff --git a/include/linux/slab.h b/include/linux/slab.h
index b35e2db7eb0e..cb45db2402ac 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -212,12 +212,6 @@  enum _slab_flag_bits {
 #define SLAB_NO_OBJ_EXT		__SLAB_FLAG_UNUSED
 #endif
 
-/*
- * freeptr_t represents a SLUB freelist pointer, which might be encoded
- * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled.
- */
-typedef struct { unsigned long v; } freeptr_t;
-
 /*
  * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
  *
diff --git a/kernel/fork.c b/kernel/fork.c
index 7823797e31d2..946c3f9a9342 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -478,25 +478,15 @@  void __vm_area_free(struct vm_area_struct *vma)
 	kmem_cache_free(vm_area_cachep, vma);
 }
 
-#ifdef CONFIG_PER_VMA_LOCK
-static void vm_area_free_rcu_cb(struct rcu_head *head)
+void vm_area_free(struct vm_area_struct *vma)
 {
-	struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
-						  vm_rcu);
-
+#ifdef CONFIG_PER_VMA_LOCK
+	/* The vma should be detached while being destroyed. */
+	VM_BUG_ON_VMA(!is_vma_detached(vma), vma);
 	/* The vma should not be locked while being destroyed. */
 	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma);
-	__vm_area_free(vma);
-}
 #endif
-
-void vm_area_free(struct vm_area_struct *vma)
-{
-#ifdef CONFIG_PER_VMA_LOCK
-	call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb);
-#else
 	__vm_area_free(vma);
-#endif
 }
 
 static void account_kernel_stack(struct task_struct *tsk, int account)
@@ -3115,6 +3105,11 @@  void __init mm_cache_init(void)
 
 void __init proc_caches_init(void)
 {
+	struct kmem_cache_args args = {
+		.use_freeptr_offset = true,
+		.freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
+	};
+
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -3131,9 +3126,11 @@  void __init proc_caches_init(void)
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
-	vm_area_cachep = KMEM_CACHE(vm_area_struct,
-			SLAB_HWCACHE_ALIGN|SLAB_NO_MERGE|SLAB_PANIC|
+	vm_area_cachep = kmem_cache_create("vm_area_struct",
+			sizeof(struct vm_area_struct), &args,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
 			SLAB_ACCOUNT);
+
 	mmap_init();
 	nsproxy_cache_init();
 }
diff --git a/mm/memory.c b/mm/memory.c
index d0197a0c0996..9c414c81f14a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6279,7 +6279,7 @@  struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
 		goto inval;
 
 	/* Check if the VMA got isolated after we found it */
-	if (is_vma_detached(vma)) {
+	if (is_vma_detached(vma) || vma->vm_mm != mm) {
 		vma_end_read(vma);
 		count_vm_vma_lock_event(VMA_LOCK_MISS);
 		/* The area was replaced with another one */