[v2] mm: batch unlink_file_vma calls in free_pgd_range

Message ID 20240521234321.359501-1-mjguzik@gmail.com
State New

Commit Message

Mateusz Guzik May 21, 2024, 11:43 p.m. UTC
Execs of dynamically linked binaries at 20-ish cores are bottlenecked on
the i_mmap_rwsem semaphore, with the biggest single contributor being
free_pgd_range, which takes the lock back-to-back for all consecutive
mappings of a given file.

Tracing the count of said acquires while building the kernel shows:
[1, 2)     799579 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[2, 3)          0 |                                                    |
[3, 4)       3009 |                                                    |
[4, 5)       3009 |                                                    |
[5, 6)     326442 |@@@@@@@@@@@@@@@@@@@@@                               |

So in particular there were 326442 opportunities to coalesce 5 acquires
into 1.

Doing so increases execs per second by 4% (~50k to ~52k) when running
the benchmark linked below.

The lock remains the main bottleneck; I have not looked at other spots
yet.
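
The helpers follow an init/add/final pattern.  A minimal usage sketch
(illustrative only: for_each_file_backed_vma is a made-up placeholder,
the real caller is free_pgtables() in the diff below):

	struct unlink_vma_file_batch vb;

	unlink_file_vma_batch_init(&vb);
	for_each_file_backed_vma(vma)
		unlink_file_vma_batch_add(&vb, vma);
	unlink_file_vma_batch_final(&vb);

_add() skips anonymous vmas and flushes on its own when the array fills
up or vm_file changes; _final() flushes whatever remains.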

Bench can be found here:
http://apollo.backplane.com/DFlyMisc/doexec.c

$ cc -O2 -o shared-doexec doexec.c
$ ./shared-doexec $(nproc)

Note this particular test makes sure binaries are separate, but the
loader is shared.

Stats were collected on the patched kernel (with
unlink_file_vma_batch_process marked "noinline" so the kprobe can
attach) using:
bpftrace -e 'kprobe:unlink_file_vma_batch_process
{ @ = lhist(((struct unlink_vma_file_batch *)arg0)->count, 0, 8, 1); }'

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---

v2:
- move new stuff to mm/internal.h

 mm/internal.h |  9 +++++++++
 mm/memory.c   | 10 ++++++++--
 mm/mmap.c     | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 58 insertions(+), 2 deletions(-)

Comments

Liam R. Howlett May 22, 2024, 3:19 p.m. UTC | #1
* Mateusz Guzik <mjguzik@gmail.com> [240521 19:43]:
> Execs of dynamically linked binaries at 20-ish cores are bottlenecked on
> the i_mmap_rwsem semaphore, with the biggest single contributor being
> free_pgd_range, which takes the lock back-to-back for all consecutive
> mappings of a given file.
> 
> Tracing the count of said acquires while building the kernel shows:
> [1, 2)     799579 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [2, 3)          0 |                                                    |
> [3, 4)       3009 |                                                    |
> [4, 5)       3009 |                                                    |
> [5, 6)     326442 |@@@@@@@@@@@@@@@@@@@@@                               |
> 
> So in particular there were 326442 opportunities to coalesce 5 acquires
> into 1.
> 
> Doing so increases execs per second by 4% (~50k to ~52k) when running
> the benchmark linked below.
> 
> The lock remains the main bottleneck; I have not looked at other spots
> yet.

Thanks.  This change is compact and allows for a performance gain.  It
looks good to me.

I guess this could cause a regression on single mappings, though
probably within the noise and probably not a real workload.  Just
something to keep in mind in case the bots yell about some contrived
benchmark.

> 
> Bench can be found here:
> http://apollo.backplane.com/DFlyMisc/doexec.c
> 
> $ cc -O2 -o shared-doexec doexec.c
> $ ./shared-doexec $(nproc)
> 
> Note this particular test makes sure binaries are separate, but the
> loader is shared.
> 
> Stats were collected on the patched kernel (with
> unlink_file_vma_batch_process marked "noinline" so the kprobe can
> attach) using:
> bpftrace -e 'kprobe:unlink_file_vma_batch_process
> { @ = lhist(((struct unlink_vma_file_batch *)arg0)->count, 0, 8, 1); }'
> 
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> ---
> 
> v2:
> - move new stuff to mm/internal.h
> 
>  mm/internal.h |  9 +++++++++
>  mm/memory.c   | 10 ++++++++--
>  mm/mmap.c     | 41 +++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 58 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/internal.h b/mm/internal.h
> index 2adabe369403..2e7be1c773f2 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1484,4 +1484,13 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
>  void workingset_update_node(struct xa_node *node);
>  extern struct list_lru shadow_nodes;
>  
> +struct unlink_vma_file_batch {
> +	int count;
> +	struct vm_area_struct *vmas[8];
> +};
> +
> +void unlink_file_vma_batch_init(struct unlink_vma_file_batch *);
> +void unlink_file_vma_batch_add(struct unlink_vma_file_batch *, struct vm_area_struct *);
> +void unlink_file_vma_batch_final(struct unlink_vma_file_batch *);
> +
>  #endif	/* __MM_INTERNAL_H */
> diff --git a/mm/memory.c b/mm/memory.c
> index b5453b86ec4b..1b96dce19796 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -365,6 +365,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
>  		   struct vm_area_struct *vma, unsigned long floor,
>  		   unsigned long ceiling, bool mm_wr_locked)
>  {
> +	struct unlink_vma_file_batch vb;
> +
>  	do {
>  		unsigned long addr = vma->vm_start;
>  		struct vm_area_struct *next;
> @@ -384,12 +386,15 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
>  		if (mm_wr_locked)
>  			vma_start_write(vma);
>  		unlink_anon_vmas(vma);
> -		unlink_file_vma(vma);
>  
>  		if (is_vm_hugetlb_page(vma)) {
> +			unlink_file_vma(vma);
>  			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
>  				floor, next ? next->vm_start : ceiling);
>  		} else {
> +			unlink_file_vma_batch_init(&vb);
> +			unlink_file_vma_batch_add(&vb, vma);
> +
>  			/*
>  			 * Optimization: gather nearby vmas into one call down
>  			 */
> @@ -402,8 +407,9 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
>  				if (mm_wr_locked)
>  					vma_start_write(vma);
>  				unlink_anon_vmas(vma);
> -				unlink_file_vma(vma);
> +				unlink_file_vma_batch_add(&vb, vma);
>  			}
> +			unlink_file_vma_batch_final(&vb);
>  			free_pgd_range(tlb, addr, vma->vm_end,
>  				floor, next ? next->vm_start : ceiling);
>  		}
> diff --git a/mm/mmap.c b/mm/mmap.c
> index d6d8ab119b72..1f9a43ecd053 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c

I see why you put them in mm/mmap.c and it's the best place right now,
for some definition of best.  The vma work is spread across several
files.

On that note, kernel/fork.c uses this lock for each cloned vma right
now.  If you saved the file pointer in your struct, it could be used
for bulk add as well.  The only complication I see is the insert order -
each copy is inserted "just after mpnt" - so a bulk-add version of the
struct would probably need two lists of vmas.  If the size of the struct
is a concern, I don't think it needs to be.
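
Roughly (untested sketch, all names made up):

	struct link_vma_file_batch {
		int count;
		struct file *file;
		struct vm_area_struct *news[8];	/* the copies ("tmp") */
		struct vm_area_struct *olds[8];	/* insert just after these ("mpnt") */
	};

	static void link_file_vma_batch_process(struct link_vma_file_batch *lb)
	{
		struct address_space *mapping = lb->file->f_mapping;
		int i;

		i_mmap_lock_write(mapping);
		flush_dcache_mmap_lock(mapping);
		for (i = 0; i < lb->count; i++)
			vma_interval_tree_insert_after(lb->news[i], lb->olds[i],
						       &mapping->i_mmap);
		flush_dcache_mmap_unlock(mapping);
		i_mmap_unlock_write(mapping);
		lb->count = 0;
	}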

> @@ -131,6 +131,47 @@ void unlink_file_vma(struct vm_area_struct *vma)
>  	}
>  }
>  
> +void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> +{
> +	vb->count = 0;
> +}
> +
> +static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
> +{
> +	struct address_space *mapping;
> +	int i;
> +
> +	mapping = vb->vmas[0]->vm_file->f_mapping;
> +	i_mmap_lock_write(mapping);
> +	for (i = 0; i < vb->count; i++) {
> +		VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
> +		__remove_shared_vm_struct(vb->vmas[i], mapping);
> +	}
> +	i_mmap_unlock_write(mapping);
> +
> +	unlink_file_vma_batch_init(vb);
> +}
> +
> +void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
> +			       struct vm_area_struct *vma)
> +{
> +	if (vma->vm_file == NULL)
> +		return;
> +

It might be worth a comment about count always pointing one past the
last vma in the array.  At first glance I was concerned about an
off-by-one here (and in the process function).  But maybe it's just me -
the increment is pretty close to this statement - I had to think about
ARRAY_SIZE() here.
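
Something like:

	/*
	 * vb->count is the number of populated slots, i.e. it always
	 * indexes the first free entry in vb->vmas[].
	 */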

> +	if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
> +	    vb->count == ARRAY_SIZE(vb->vmas))

Since you are checking vm_file and only support a single vm_file in this
version, it might be worth saving it in your unlink_vma_file_batch
struct.  It could also be used in the processing to reduce dereferencing
to f_mapping.

I'm not sure if this is worth it with modern CPUs, though.  I'm just
thinking that this step is executed the most, so any speedup here will
help you.
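
Untested, but roughly:

	struct unlink_vma_file_batch {
		int count;
		struct file *file;	/* vm_file shared by all slots */
		struct vm_area_struct *vmas[8];
	};

	/* in unlink_file_vma_batch_add(), compare against the cached file */
	if ((vb->count > 0 && vb->file != vma->vm_file) ||
	    vb->count == ARRAY_SIZE(vb->vmas))
		unlink_file_vma_batch_process(vb);
	vb->file = vma->vm_file;

	/* ...and unlink_file_vma_batch_process() would start with */
	mapping = vb->file->f_mapping;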

> +		unlink_file_vma_batch_process(vb);
> +
> +	vb->vmas[vb->count] = vma;
> +	vb->count++;
> +}
> +
> +void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
> +{
> +	if (vb->count > 0)
> +		unlink_file_vma_batch_process(vb);
> +}
> +
>  /*
>   * Close a vm structure and free it.
>   */
> -- 
> 2.39.2
> 

Feel free to add

Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Mateusz Guzik May 22, 2024, 5:22 p.m. UTC | #2
On Wed, May 22, 2024 at 11:19:45AM -0400, Liam R. Howlett wrote:
> * Mateusz Guzik <mjguzik@gmail.com> [240521 19:43]:
> > Execs of dynamically linked binaries at 20-ish cores are bottlenecked on
> > the i_mmap_rwsem semaphore, with the biggest single contributor being
> > free_pgd_range, which takes the lock back-to-back for all consecutive
> > mappings of a given file.
> > 
> > Tracing the count of said acquires while building the kernel shows:
> > [1, 2)     799579 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> > [2, 3)          0 |                                                    |
> > [3, 4)       3009 |                                                    |
> > [4, 5)       3009 |                                                    |
> > [5, 6)     326442 |@@@@@@@@@@@@@@@@@@@@@                               |
> > 
> > So in particular there were 326442 opportunities to coalesce 5 acquires
> > into 1.
> > 
> > Doing so increases execs per second by 4% (~50k to ~52k) when running
> > the benchmark linked below.
> > 
> > The lock remains the main bottleneck; I have not looked at other spots
> > yet.
> 
> Thanks.  This change is compact and allows for a performance gain.  It
> looks good to me.
> 
> I guess this could cause a regression on single mappings, though
> probably within the noise and probably not a real workload.  Just
> something to keep in mind in case the bots yell about some contrived
> benchmark.
> 

Trivial tidy-ups can be done should someone be adamant there is a
slowdown that needs to be recouped, starting with inlining the new
routines (apart from unlink_file_vma_batch_process).
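
E.g. (sketch; unlink_file_vma_batch_add() could move to mm/internal.h
the same way, with _process un-statickied):

	void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb);

	static inline void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
	{
		vb->count = 0;
	}

	static inline void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
	{
		if (vb->count > 0)
			unlink_file_vma_batch_process(vb);
	}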

> On that note, kernel/fork.c uses this lock for each cloned vma right
> now.  If you saved the file pointer in your struct, it could be used
> for bulk add as well.  The only complication I see is the insert order -
> each copy is inserted "just after mpnt" - so a bulk-add version of the
> struct would probably need two lists of vmas.  If the size of the struct
> is a concern, I don't think it needs to be.
> 

Looks like it would need a different spin on batching than the one
implemented above.

Maybe I'll get around to this some time early next month.

> > @@ -131,6 +131,47 @@ void unlink_file_vma(struct vm_area_struct *vma)
> >  	}
> >  }
> >  
> > +void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
> > +{
> > +	vb->count = 0;
> > +}
> > +
> > +static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
> > +{
> > +	struct address_space *mapping;
> > +	int i;
> > +
> > +	mapping = vb->vmas[0]->vm_file->f_mapping;
> > +	i_mmap_lock_write(mapping);
> > +	for (i = 0; i < vb->count; i++) {
> > +		VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
> > +		__remove_shared_vm_struct(vb->vmas[i], mapping);
> > +	}
> > +	i_mmap_unlock_write(mapping);
> > +
> > +	unlink_file_vma_batch_init(vb);
> > +}
> > +
> > +void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
> > +			       struct vm_area_struct *vma)
> > +{
> > +	if (vma->vm_file == NULL)
> > +		return;
> > +
> 
> It might be worth a comment about count always pointing one past the
> last vma in the array.  At first glance I was concerned about an
> off-by-one here (and in the process function).  But maybe it's just me -
> the increment is pretty close to this statement - I had to think about
> ARRAY_SIZE() here.
> 

I think that's upbringing on different codebases.

Idiomatic array iteration of n elements being "for (i = 0; i < n; i++)"
to me makes the below assignment + counter bump pair obviously correct.

That is to say some other arrangement would require me to do a double
take. :)

> > +	if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
> > +	    vb->count == ARRAY_SIZE(vb->vmas))
> 
> Since you are checking vm_file and only support a single vm_file in this
> version, it might be worth saving it in your unlink_vma_file_batch
> struct.  It could also be used in the processing to reduce dereferencing
> to f_mapping.
> 
> I'm not sure if this is worth it with modern CPUs, though.  I'm just
> thinking that this step is executed the most, so any speedup here will
> help you.
> 

I had it originally but it imo uglified the code.

> Feel free to add
> 
> Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
> 

thanks

Patch

diff --git a/mm/internal.h b/mm/internal.h
index 2adabe369403..2e7be1c773f2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1484,4 +1484,13 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
 void workingset_update_node(struct xa_node *node);
 extern struct list_lru shadow_nodes;
 
+struct unlink_vma_file_batch {
+	int count;
+	struct vm_area_struct *vmas[8];
+};
+
+void unlink_file_vma_batch_init(struct unlink_vma_file_batch *);
+void unlink_file_vma_batch_add(struct unlink_vma_file_batch *, struct vm_area_struct *);
+void unlink_file_vma_batch_final(struct unlink_vma_file_batch *);
+
 #endif	/* __MM_INTERNAL_H */
diff --git a/mm/memory.c b/mm/memory.c
index b5453b86ec4b..1b96dce19796 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -365,6 +365,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		   struct vm_area_struct *vma, unsigned long floor,
 		   unsigned long ceiling, bool mm_wr_locked)
 {
+	struct unlink_vma_file_batch vb;
+
 	do {
 		unsigned long addr = vma->vm_start;
 		struct vm_area_struct *next;
@@ -384,12 +386,15 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		if (mm_wr_locked)
 			vma_start_write(vma);
 		unlink_anon_vmas(vma);
-		unlink_file_vma(vma);
 
 		if (is_vm_hugetlb_page(vma)) {
+			unlink_file_vma(vma);
 			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next ? next->vm_start : ceiling);
 		} else {
+			unlink_file_vma_batch_init(&vb);
+			unlink_file_vma_batch_add(&vb, vma);
+
 			/*
 			 * Optimization: gather nearby vmas into one call down
 			 */
@@ -402,8 +407,9 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 				if (mm_wr_locked)
 					vma_start_write(vma);
 				unlink_anon_vmas(vma);
-				unlink_file_vma(vma);
+				unlink_file_vma_batch_add(&vb, vma);
 			}
+			unlink_file_vma_batch_final(&vb);
 			free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next ? next->vm_start : ceiling);
 		}
diff --git a/mm/mmap.c b/mm/mmap.c
index d6d8ab119b72..1f9a43ecd053 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -131,6 +131,47 @@ void unlink_file_vma(struct vm_area_struct *vma)
 	}
 }
 
+void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
+{
+	vb->count = 0;
+}
+
+static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
+{
+	struct address_space *mapping;
+	int i;
+
+	mapping = vb->vmas[0]->vm_file->f_mapping;
+	i_mmap_lock_write(mapping);
+	for (i = 0; i < vb->count; i++) {
+		VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
+		__remove_shared_vm_struct(vb->vmas[i], mapping);
+	}
+	i_mmap_unlock_write(mapping);
+
+	unlink_file_vma_batch_init(vb);
+}
+
+void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
+			       struct vm_area_struct *vma)
+{
+	if (vma->vm_file == NULL)
+		return;
+
+	if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
+	    vb->count == ARRAY_SIZE(vb->vmas))
+		unlink_file_vma_batch_process(vb);
+
+	vb->vmas[vb->count] = vma;
+	vb->count++;
+}
+
+void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
+{
+	if (vb->count > 0)
+		unlink_file_vma_batch_process(vb);
+}
+
 /*
  * Close a vm structure and free it.
  */