Message ID | 20210902231813.3597709-3-surenb@google.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [v9,1/3] mm: rearrange madvise code to allow for reuse | expand |
On Thu, Sep 02, 2021 at 04:18:13PM -0700, Suren Baghdasaryan wrote: > While forking a process with high number (64K) of named anonymous vmas the > overhead caused by strdup() is noticeable. Experiments with ARM64 Android > device show up to 40% performance regression when forking a process with > 64k unpopulated anonymous vmas using the max name lengths vs the same > process with the same number of anonymous vmas having no name. > Introduce anon_vma_name refcounted structure to avoid the overhead of > copying vma names during fork() and when splitting named anonymous vmas. > When a vma is duplicated, instead of copying the name we increment the > refcount of this structure. Multiple vmas can point to the same > anon_vma_name as long as they increment the refcount. The name member of > anon_vma_name structure is assigned at structure allocation time and is > never changed. If vma name changes then the refcount of the original > structure is dropped, a new anon_vma_name structure is allocated > to hold the new name and the vma pointer is updated to point to the new > structure. > With this approach the fork() performance regressions is reduced 3-4x > times and with usecases using more reasonable number of VMAs (a few > thousand) the regressions is not measurable. > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > --- > previous version including cover letter with test results is at: > https://lore.kernel.org/linux-mm/20210827191858.2037087-1-surenb@google.com/ > > changes in v9 > - Replaced kzalloc with kmalloc in anon_vma_name_alloc, per Rolf Eike Beer > > include/linux/mm_types.h | 9 ++++++++- > mm/madvise.c | 43 +++++++++++++++++++++++++++++++++------- > 2 files changed, 44 insertions(+), 8 deletions(-) > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index 968a1d0463d8..7feb43daee6c 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -5,6 +5,7 @@ > #include <linux/mm_types_task.h> > > #include <linux/auxvec.h> > +#include <linux/kref.h> > #include <linux/list.h> > #include <linux/spinlock.h> > #include <linux/rbtree.h> > @@ -310,6 +311,12 @@ struct vm_userfaultfd_ctx { > struct vm_userfaultfd_ctx {}; > #endif /* CONFIG_USERFAULTFD */ > > +struct anon_vma_name { > + struct kref kref; > + /* The name needs to be at the end because it is dynamically sized. */ > + char name[]; > +}; > + > /* > * This struct describes a virtual memory area. There is one of these > * per VM-area/task. A VM area is any part of the process virtual memory > @@ -361,7 +368,7 @@ struct vm_area_struct { > unsigned long rb_subtree_last; > } shared; > /* Serialized by mmap_sem. */ > - char *anon_name; > + struct anon_vma_name *anon_name; > }; > > /* > diff --git a/mm/madvise.c b/mm/madvise.c > index 0c6d0f64d432..adc53edd3fe7 100644 > --- a/mm/madvise.c > +++ b/mm/madvise.c > @@ -63,6 +63,28 @@ static int madvise_need_mmap_write(int behavior) > } > } > > +static struct anon_vma_name *anon_vma_name_alloc(const char *name) > +{ > + struct anon_vma_name *anon_name; > + size_t len = strlen(name); > + > + /* Add 1 for NUL terminator at the end of the anon_name->name */ > + anon_name = kmalloc(sizeof(*anon_name) + len + 1, GFP_KERNEL); > + if (anon_name) { > + kref_init(&anon_name->kref); > + strcpy(anon_name->name, name); Please don't use strcpy(), even though we know it's safe here. We're trying to remove it globally (or at least for non-constant buffers)[1]. We can also use the struct_size() helper, along with memcpy(): /* Add 1 for NUL terminator at the end of the anon_name->name */ size_t count = strlen(name) + 1; anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); if (anon_name) { kref_init(&anon_name->kref); memcpy(anon_name->name, name, count); } [1] https://www.kernel.org/doc/html/latest/process/deprecated.html#strcpy > + } > + > + return anon_name; > +} > + > +static void vma_anon_name_free(struct kref *kref) > +{ > + struct anon_vma_name *anon_name = > + container_of(kref, struct anon_vma_name, kref); > + kfree(anon_name); > +} > + > static inline bool has_vma_anon_name(struct vm_area_struct *vma) > { > return !vma->vm_file && vma->anon_name; > @@ -75,7 +97,7 @@ const char *vma_anon_name(struct vm_area_struct *vma) > > mmap_assert_locked(vma->vm_mm); > > - return vma->anon_name; > + return vma->anon_name->name; > } > > void dup_vma_anon_name(struct vm_area_struct *orig_vma, > @@ -84,37 +106,44 @@ void dup_vma_anon_name(struct vm_area_struct *orig_vma, > if (!has_vma_anon_name(orig_vma)) > return; > > - new_vma->anon_name = kstrdup(orig_vma->anon_name, GFP_KERNEL); > + kref_get(&orig_vma->anon_name->kref); > + new_vma->anon_name = orig_vma->anon_name; > } > > void free_vma_anon_name(struct vm_area_struct *vma) > { > + struct anon_vma_name *anon_name; > + > if (!has_vma_anon_name(vma)) > return; > > - kfree(vma->anon_name); > + anon_name = vma->anon_name; > vma->anon_name = NULL; > + kref_put(&anon_name->kref, vma_anon_name_free); > } > > /* mmap_lock should be write-locked */ > static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) > { > + const char *anon_name; > + > if (!name) { > free_vma_anon_name(vma); > return 0; > } > > - if (vma->anon_name) { > + anon_name = vma_anon_name(vma); > + if (anon_name) { > /* Should never happen, to dup use dup_vma_anon_name() */ > - WARN_ON(vma->anon_name == name); > + WARN_ON(anon_name == name); > > /* Same name, nothing to do here */ > - if (!strcmp(name, vma->anon_name)) > + if (!strcmp(name, anon_name)) > return 0; > > free_vma_anon_name(vma); > } > - vma->anon_name = kstrdup(name, GFP_KERNEL); > + vma->anon_name = anon_vma_name_alloc(name); > if (!vma->anon_name) > return -ENOMEM; > > -- > 2.33.0.153.gba50c8fa24-goog > With the above tweak, please consider this: Reviewed-by: Kees Cook <keescook@chromium.org> Thanks for working on this!
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 968a1d0463d8..7feb43daee6c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -5,6 +5,7 @@ #include <linux/mm_types_task.h> #include <linux/auxvec.h> +#include <linux/kref.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rbtree.h> @@ -310,6 +311,12 @@ struct vm_userfaultfd_ctx { struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ +struct anon_vma_name { + struct kref kref; + /* The name needs to be at the end because it is dynamically sized. */ + char name[]; +}; + /* * This struct describes a virtual memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory @@ -361,7 +368,7 @@ struct vm_area_struct { unsigned long rb_subtree_last; } shared; /* Serialized by mmap_sem. */ - char *anon_name; + struct anon_vma_name *anon_name; }; /* diff --git a/mm/madvise.c b/mm/madvise.c index 0c6d0f64d432..adc53edd3fe7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -63,6 +63,28 @@ static int madvise_need_mmap_write(int behavior) } } +static struct anon_vma_name *anon_vma_name_alloc(const char *name) +{ + struct anon_vma_name *anon_name; + size_t len = strlen(name); + + /* Add 1 for NUL terminator at the end of the anon_name->name */ + anon_name = kmalloc(sizeof(*anon_name) + len + 1, GFP_KERNEL); + if (anon_name) { + kref_init(&anon_name->kref); + strcpy(anon_name->name, name); + } + + return anon_name; +} + +static void vma_anon_name_free(struct kref *kref) +{ + struct anon_vma_name *anon_name = + container_of(kref, struct anon_vma_name, kref); + kfree(anon_name); +} + static inline bool has_vma_anon_name(struct vm_area_struct *vma) { return !vma->vm_file && vma->anon_name; @@ -75,7 +97,7 @@ const char *vma_anon_name(struct vm_area_struct *vma) mmap_assert_locked(vma->vm_mm); - return vma->anon_name; + return vma->anon_name->name; } void dup_vma_anon_name(struct vm_area_struct *orig_vma, @@ -84,37 +106,44 @@ void dup_vma_anon_name(struct vm_area_struct *orig_vma, if (!has_vma_anon_name(orig_vma)) return; - new_vma->anon_name = kstrdup(orig_vma->anon_name, GFP_KERNEL); + kref_get(&orig_vma->anon_name->kref); + new_vma->anon_name = orig_vma->anon_name; } void free_vma_anon_name(struct vm_area_struct *vma) { + struct anon_vma_name *anon_name; + if (!has_vma_anon_name(vma)) return; - kfree(vma->anon_name); + anon_name = vma->anon_name; vma->anon_name = NULL; + kref_put(&anon_name->kref, vma_anon_name_free); } /* mmap_lock should be write-locked */ static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) { + const char *anon_name; + if (!name) { free_vma_anon_name(vma); return 0; } - if (vma->anon_name) { + anon_name = vma_anon_name(vma); + if (anon_name) { /* Should never happen, to dup use dup_vma_anon_name() */ - WARN_ON(vma->anon_name == name); + WARN_ON(anon_name == name); /* Same name, nothing to do here */ - if (!strcmp(name, vma->anon_name)) + if (!strcmp(name, anon_name)) return 0; free_vma_anon_name(vma); } - vma->anon_name = kstrdup(name, GFP_KERNEL); + vma->anon_name = anon_vma_name_alloc(name); if (!vma->anon_name) return -ENOMEM;
While forking a process with high number (64K) of named anonymous vmas the overhead caused by strdup() is noticeable. Experiments with ARM64 Android device show up to 40% performance regression when forking a process with 64k unpopulated anonymous vmas using the max name lengths vs the same process with the same number of anonymous vmas having no name. Introduce anon_vma_name refcounted structure to avoid the overhead of copying vma names during fork() and when splitting named anonymous vmas. When a vma is duplicated, instead of copying the name we increment the refcount of this structure. Multiple vmas can point to the same anon_vma_name as long as they increment the refcount. The name member of anon_vma_name structure is assigned at structure allocation time and is never changed. If vma name changes then the refcount of the original structure is dropped, a new anon_vma_name structure is allocated to hold the new name and the vma pointer is updated to point to the new structure. With this approach the fork() performance regressions is reduced 3-4x times and with usecases using more reasonable number of VMAs (a few thousand) the regressions is not measurable. Signed-off-by: Suren Baghdasaryan <surenb@google.com> --- previous version including cover letter with test results is at: https://lore.kernel.org/linux-mm/20210827191858.2037087-1-surenb@google.com/ changes in v9 - Replaced kzalloc with kmalloc in anon_vma_name_alloc, per Rolf Eike Beer include/linux/mm_types.h | 9 ++++++++- mm/madvise.c | 43 +++++++++++++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 8 deletions(-)