
mm: batch unlink_file_vma calls in free_pgd_range

Message ID 20240518062005.76129-1-mjguzik@gmail.com (mailing list archive)
State New
Series mm: batch unlink_file_vma calls in free_pgd_range

Commit Message

Mateusz Guzik May 18, 2024, 6:20 a.m. UTC
Execs of dynamically linked binaries on 20-ish cores are bottlenecked on
the i_mmap_rwsem semaphore; the biggest single contributor is
free_pgd_range, which induces back-to-back lock acquires for all
consecutive mappings of a given file.

Tracing the count of said acquires while building the kernel shows:
[1, 2)     799579 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[2, 3)          0 |                                                    |
[3, 4)       3009 |                                                    |
[4, 5)       3009 |                                                    |
[5, 6)     326442 |@@@@@@@@@@@@@@@@@@@@@                               |

So in particular there were 326442 opportunities to coalesce 5 acquires
into 1.

Doing so increases execs per second by 4% (~50k to ~52k) when running
the benchmark linked below.

The lock remains the main bottleneck; I have not looked at other spots
yet.

Bench can be found here:
http://apollo.backplane.com/DFlyMisc/doexec.c

$ cc -O2 -o shared-doexec doexec.c
$ ./shared-doexec $(nproc)

Note this particular test makes sure binaries are separate, but the
loader is shared.

Stats were collected on the patched kernel (with unlink_file_vma_batch_process
additionally marked "noinline" so the kprobe can attach) using:
bpftrace -e 'kprobe:unlink_file_vma_batch_process
{ @ = lhist(((struct unlink_vma_file_batch *)arg0)->count, 0, 8, 1); }'
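
For illustration, the caller side boils down to roughly the following
(a simplified rendering of the changed free_pgtables() loop from the
mm/memory.c hunk below; the hugetlb path, VMA write-locking and the
actual page table freeing are elided):

	struct unlink_vma_file_batch vb;

	unlink_file_vma_batch_init(&vb);
	do {
		struct vm_area_struct *next = mas_find(mas, ceiling - 1);

		unlink_anon_vmas(vma);
		/* Queue the vma; the batch flushes itself when vm_file changes. */
		unlink_file_vma_batch_add(&vb, vma);
		vma = next;
	} while (vma);
	/* Roughly one i_mmap_rwsem round trip per run of same-file vmas (capped at 8). */
	unlink_file_vma_batch_final(&vb);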

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---
 include/linux/mm.h |  8 ++++++++
 mm/memory.c        | 10 ++++++++--
 mm/mmap.c          | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 57 insertions(+), 2 deletions(-)

Comments

Matthew Wilcox May 18, 2024, 11:31 p.m. UTC | #1
On Sat, May 18, 2024 at 08:20:05AM +0200, Mateusz Guzik wrote:
> Execs of dynamically linked binaries on 20-ish cores are bottlenecked on
> the i_mmap_rwsem semaphore; the biggest single contributor is
> free_pgd_range, which induces back-to-back lock acquires for all
> consecutive mappings of a given file.
> 
> Tracing the count of said acquires while building the kernel shows:
> [1, 2)     799579 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
> [2, 3)          0 |                                                    |
> [3, 4)       3009 |                                                    |
> [4, 5)       3009 |                                                    |
> [5, 6)     326442 |@@@@@@@@@@@@@@@@@@@@@                               |

This makes sense.  A snippet of /proc/self/maps:

7f0a44725000-7f0a4474b000 r--p 00000000 fe:01 100663437                  /usr/lib/x86_64-linux-gnu/libc.so.6
7f0a4474b000-7f0a448a0000 r-xp 00026000 fe:01 100663437                  /usr/lib/x86_64-linux-gnu/libc.so.6
7f0a448a0000-7f0a448f4000 r--p 0017b000 fe:01 100663437                  /usr/lib/x86_64-linux-gnu/libc.so.6
7f0a448f4000-7f0a448f8000 r--p 001cf000 fe:01 100663437                  /usr/lib/x86_64-linux-gnu/libc.so.6
7f0a448f8000-7f0a448fa000 rw-p 001d3000 fe:01 100663437                  /usr/lib/x86_64-linux-gnu/libc.so.6

so we frequently have the same file mmaped five times in a row.
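
One quick way to eyeball how long such runs get for a given process is to
count consecutive identical pathnames in its maps file (illustrative
one-liner):

$ awk 'NF > 5 { print $6 }' /proc/$$/maps | uniq -c | sort -rn | head

File-backed lines carry the pathname as a sixth field, so uniq -c counts
back-to-back repeats of the same file.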

> The lock remains the main bottleneck; I have not looked at other spots
> yet.

You're not the first to report high contention on this lock.
https://lore.kernel.org/all/20240202093407.12536-1-JonasZhou-oc@zhaoxin.com/
for example.

> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index b6bdaa18b9e9..443d0c55df80 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h

I do object to this going into mm.h.  mm/internal.h would be better.

I haven't reviewed the patch in depth, but I don't have a problem with
the idea.  I think it's only a stopgap and we really do need a better
data structure than this.
Mateusz Guzik May 19, 2024, 10:07 a.m. UTC | #2
On Sun, May 19, 2024 at 12:31:20AM +0100, Matthew Wilcox wrote:
> On Sat, May 18, 2024 at 08:20:05AM +0200, Mateusz Guzik wrote:
> > Execs of dynamically linked binaries on 20-ish cores are bottlenecked on
> > the i_mmap_rwsem semaphore; the biggest single contributor is
> > free_pgd_range, which induces back-to-back lock acquires for all
> > consecutive mappings of a given file.
> > 
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index b6bdaa18b9e9..443d0c55df80 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> 
> I do object to this going into mm.h.  mm/internal.h would be better.
> 

Noted.

> I haven't reviewed the patch in depth, but I don't have a problem with
> the idea.  I think it's only a stopgap and we really do need a better
> data structure than this.
> 

I'll send a v2 after some more reviews pour in.

The above is indeed just a low-hanging-fruit fixup for an unpleasant
situation.

I think the real long-term fix would be to provide the loader with the
means to be more efficient about this.

strace /bin/echo shows:
[snip]
openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0\220\243\2\0\0\0\0\0"..., 832) = 832
pread64(3, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784, 64) = 784
fstat(3, {st_mode=S_IFREG|0755, st_size=2125328, ...}) = 0
pread64(3, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784, 64) = 784
mmap(NULL, 2170256, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7dbda8a00000
mmap(0x7dbda8a28000, 1605632, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x28000) = 0x7dbda8a28000
mmap(0x7dbda8bb0000, 323584, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1b0000) = 0x7dbda8bb0000
mmap(0x7dbda8bff000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1fe000) = 0x7dbda8bff000
mmap(0x7dbda8c05000, 52624, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7dbda8c05000
[/snip]

Hence the 5 mappings.

If there were a mechanism to issue all of these mmaps at the same time,
there would definitely be savings in total work done, not just the single
i_mmap_rwsem lock trip.

The mechanism should be versatile enough to replace other back-to-back mmap
uses. On top of that, it would be great if it did not require the size
argument and instead returned an address + size pair; then the typical
open + fstat + mmap combo could be shortened.
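
Purely to illustrate the idea (nothing like this exists; the names and
layout below are made up), such a facility might take an array of
requests and resolve them in a single call:

	/* Hypothetical sketch only -- not an existing or proposed API. */
	struct mmap_batch_req {
		void	*addr;		/* hint or fixed address, as with mmap() */
		size_t	len;		/* 0 could mean "the rest of the file" */
		int	prot;
		int	flags;
		int	fd;
		off_t	offset;
		void	*ret_addr;	/* filled in with the chosen address */
		size_t	ret_len;	/* filled in with the mapped length */
	};

	int mmap_batch(struct mmap_batch_req *reqs, unsigned int nreq);

The ret_addr + ret_len pair is what would let the loader skip the fstat.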

That said, this was just a quick note; I have no intention of pursuing
anything of the sort. I'll probably submit some other patches to
damage-control the current state without altering any design choices.

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b6bdaa18b9e9..443d0c55df80 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3272,6 +3272,11 @@  void anon_vma_interval_tree_verify(struct anon_vma_chain *node);
 	     avc; avc = anon_vma_interval_tree_iter_next(avc, start, last))
 
 /* mmap.c */
+struct unlink_vma_file_batch {
+	int count;
+	struct vm_area_struct *vmas[8];
+};
+
 extern int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin);
 extern int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		      unsigned long start, unsigned long end, pgoff_t pgoff,
@@ -3281,6 +3286,9 @@  extern int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void unlink_file_vma(struct vm_area_struct *);
+void unlink_file_vma_batch_init(struct unlink_vma_file_batch *);
+void unlink_file_vma_batch_add(struct unlink_vma_file_batch *, struct vm_area_struct *);
+void unlink_file_vma_batch_final(struct unlink_vma_file_batch *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
 	unsigned long addr, unsigned long len, pgoff_t pgoff,
 	bool *need_rmap_locks);
diff --git a/mm/memory.c b/mm/memory.c
index 0201f50d8307..048fde0e5a8a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -363,6 +363,8 @@  void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		   struct vm_area_struct *vma, unsigned long floor,
 		   unsigned long ceiling, bool mm_wr_locked)
 {
+	struct unlink_vma_file_batch vb;
+
 	do {
 		unsigned long addr = vma->vm_start;
 		struct vm_area_struct *next;
@@ -382,12 +384,15 @@  void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		if (mm_wr_locked)
 			vma_start_write(vma);
 		unlink_anon_vmas(vma);
-		unlink_file_vma(vma);
 
 		if (is_vm_hugetlb_page(vma)) {
+			unlink_file_vma(vma);
 			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next ? next->vm_start : ceiling);
 		} else {
+			unlink_file_vma_batch_init(&vb);
+			unlink_file_vma_batch_add(&vb, vma);
+
 			/*
 			 * Optimization: gather nearby vmas into one call down
 			 */
@@ -400,8 +405,9 @@  void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 				if (mm_wr_locked)
 					vma_start_write(vma);
 				unlink_anon_vmas(vma);
-				unlink_file_vma(vma);
+				unlink_file_vma_batch_add(&vb, vma);
 			}
+			unlink_file_vma_batch_final(&vb);
 			free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next ? next->vm_start : ceiling);
 		}
diff --git a/mm/mmap.c b/mm/mmap.c
index 3490af70f259..e928401df913 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -131,6 +131,47 @@  void unlink_file_vma(struct vm_area_struct *vma)
 	}
 }
 
+void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
+{
+	vb->count = 0;
+}
+
+static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
+{
+	struct address_space *mapping;
+	int i;
+
+	mapping = vb->vmas[0]->vm_file->f_mapping;
+	i_mmap_lock_write(mapping);
+	for (i = 0; i < vb->count; i++) {
+		VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
+		__remove_shared_vm_struct(vb->vmas[i], mapping);
+	}
+	i_mmap_unlock_write(mapping);
+
+	unlink_file_vma_batch_init(vb);
+}
+
+void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
+			       struct vm_area_struct *vma)
+{
+	if (vma->vm_file == NULL)
+		return;
+
+	if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
+	    vb->count == ARRAY_SIZE(vb->vmas))
+		unlink_file_vma_batch_process(vb);
+
+	vb->vmas[vb->count] = vma;
+	vb->count++;
+}
+
+void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
+{
+	if (vb->count > 0)
+		unlink_file_vma_batch_process(vb);
+}
+
 /*
  * Close a vm structure and free it.
  */