diff mbox series

[v2,tip/perf/core,5/5] uprobes: add speculative lockless VMA-to-inode-to-uprobe resolution

Message ID 20241001225207.2215639-6-andrii@kernel.org (mailing list archive)
State Not Applicable
Delegated to: BPF
Headers show
Series uprobes,mm: speculative lockless VMA-to-uprobe lookup | expand

Checks

Context Check Description
netdev/tree_selection success Not a local patch

Commit Message

Andrii Nakryiko Oct. 1, 2024, 10:52 p.m. UTC
Given filp_cachep is marked SLAB_TYPESAFE_BY_RCU (and FMODE_BACKING
files, a special case, now goes through RCU-delated freeing), we can
safely access vma->vm_file->f_inode field locklessly under just
rcu_read_lock() protection, which enables looking up uprobe from
uprobes_tree completely locklessly and speculatively without the need to
acquire mmap_lock for reads. In most cases, anyway, assuming that there
are no parallel mm and/or VMA modifications. The underlying struct
file's memory won't go away from under us (even if struct file can be
reused in the meantime).

We rely on newly added mmap_lock_speculation_{start,end}() helpers to
validate that mm_struct stays intact for entire duration of this
speculation. If not, we fall back to mmap_lock-protected lookup.
The speculative logic is written in such a way that it will safely
handle any garbage values that might be read from vma or file structs.

Benchmarking results speak for themselves.

BEFORE (latest tip/perf/core)
=============================
uprobe-nop            ( 1 cpus):    3.384 ± 0.004M/s  (  3.384M/s/cpu)
uprobe-nop            ( 2 cpus):    5.456 ± 0.005M/s  (  2.728M/s/cpu)
uprobe-nop            ( 3 cpus):    7.863 ± 0.015M/s  (  2.621M/s/cpu)
uprobe-nop            ( 4 cpus):    9.442 ± 0.008M/s  (  2.360M/s/cpu)
uprobe-nop            ( 5 cpus):   11.036 ± 0.013M/s  (  2.207M/s/cpu)
uprobe-nop            ( 6 cpus):   10.884 ± 0.019M/s  (  1.814M/s/cpu)
uprobe-nop            ( 7 cpus):    7.897 ± 0.145M/s  (  1.128M/s/cpu)
uprobe-nop            ( 8 cpus):   10.021 ± 0.128M/s  (  1.253M/s/cpu)
uprobe-nop            (10 cpus):    9.932 ± 0.170M/s  (  0.993M/s/cpu)
uprobe-nop            (12 cpus):    8.369 ± 0.056M/s  (  0.697M/s/cpu)
uprobe-nop            (14 cpus):    8.678 ± 0.017M/s  (  0.620M/s/cpu)
uprobe-nop            (16 cpus):    7.392 ± 0.003M/s  (  0.462M/s/cpu)
uprobe-nop            (24 cpus):    5.326 ± 0.178M/s  (  0.222M/s/cpu)
uprobe-nop            (32 cpus):    5.426 ± 0.059M/s  (  0.170M/s/cpu)
uprobe-nop            (40 cpus):    5.262 ± 0.070M/s  (  0.132M/s/cpu)
uprobe-nop            (48 cpus):    6.121 ± 0.010M/s  (  0.128M/s/cpu)
uprobe-nop            (56 cpus):    6.252 ± 0.035M/s  (  0.112M/s/cpu)
uprobe-nop            (64 cpus):    7.644 ± 0.023M/s  (  0.119M/s/cpu)
uprobe-nop            (72 cpus):    7.781 ± 0.001M/s  (  0.108M/s/cpu)
uprobe-nop            (80 cpus):    8.992 ± 0.048M/s  (  0.112M/s/cpu)

AFTER
=====
uprobe-nop            ( 1 cpus):    3.534 ± 0.033M/s  (  3.534M/s/cpu)
uprobe-nop            ( 2 cpus):    6.701 ± 0.007M/s  (  3.351M/s/cpu)
uprobe-nop            ( 3 cpus):   10.031 ± 0.007M/s  (  3.344M/s/cpu)
uprobe-nop            ( 4 cpus):   13.003 ± 0.012M/s  (  3.251M/s/cpu)
uprobe-nop            ( 5 cpus):   16.274 ± 0.006M/s  (  3.255M/s/cpu)
uprobe-nop            ( 6 cpus):   19.563 ± 0.024M/s  (  3.261M/s/cpu)
uprobe-nop            ( 7 cpus):   22.696 ± 0.054M/s  (  3.242M/s/cpu)
uprobe-nop            ( 8 cpus):   24.534 ± 0.010M/s  (  3.067M/s/cpu)
uprobe-nop            (10 cpus):   30.475 ± 0.117M/s  (  3.047M/s/cpu)
uprobe-nop            (12 cpus):   33.371 ± 0.017M/s  (  2.781M/s/cpu)
uprobe-nop            (14 cpus):   38.864 ± 0.004M/s  (  2.776M/s/cpu)
uprobe-nop            (16 cpus):   41.476 ± 0.020M/s  (  2.592M/s/cpu)
uprobe-nop            (24 cpus):   64.696 ± 0.021M/s  (  2.696M/s/cpu)
uprobe-nop            (32 cpus):   85.054 ± 0.027M/s  (  2.658M/s/cpu)
uprobe-nop            (40 cpus):  101.979 ± 0.032M/s  (  2.549M/s/cpu)
uprobe-nop            (48 cpus):  110.518 ± 0.056M/s  (  2.302M/s/cpu)
uprobe-nop            (56 cpus):  117.737 ± 0.020M/s  (  2.102M/s/cpu)
uprobe-nop            (64 cpus):  124.613 ± 0.079M/s  (  1.947M/s/cpu)
uprobe-nop            (72 cpus):  133.239 ± 0.032M/s  (  1.851M/s/cpu)
uprobe-nop            (80 cpus):  142.037 ± 0.138M/s  (  1.775M/s/cpu)

Previously total throughput was maxing out at 11mln/s, and gradually
declining past 8 cores. With this change, it now keeps growing with each
added CPU, reaching 142mln/s at 80 CPUs (this was measured on a 80-core
Intel(R) Xeon(R) Gold 6138 CPU @ 2.00GHz).

Note, results above assume that commit [0] from linux-trace tree is
applied as well, otherwise it will limit scalability to about 10mln/s
total throughput.

  [0] 10cdb82aa77f ("uprobes: turn trace_uprobe's nhit counter to be per-CPU one")

Suggested-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 kernel/events/uprobes.c | 44 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

Comments

Oleg Nesterov Oct. 2, 2024, 7:25 a.m. UTC | #1
On 10/01, Andrii Nakryiko wrote:
>
> +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> +{
> +	struct mm_struct *mm = current->mm;
> +	struct uprobe *uprobe = NULL;
> +	struct vm_area_struct *vma;
> +	struct file *vm_file;
> +	loff_t offset;
> +	long seq;
> +
> +	guard(rcu)();
> +
> +	if (!mmap_lock_speculation_start(mm, &seq))
> +		return NULL;
> +
> +	vma = vma_lookup(mm, bp_vaddr);
> +	if (!vma)
> +		return NULL;
> +
> +	/* vm_file memory can be reused for another instance of struct file,
> +	 * but can't be freed from under us, so it's safe to read fields from
> +	 * it, even if the values are some garbage values; ultimately
> +	 * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure
> +	 * that whatever we speculatively found is correct
> +	 */
> +	vm_file = READ_ONCE(vma->vm_file);
> +	if (!vm_file)
> +		return NULL;
> +
> +	offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);

LGTM. But perhaps vma->vm_pgoff and vma->vm_start need READ_ONCE() as well,
if nothing else to shut up KCSAN if this code races with, say, __split_vma() ?

> +	uprobe = find_uprobe_rcu(vm_file->f_inode, offset);

OK, I guess vm_file->f_inode is fine without READ_ONCE...

Oleg.
Andrii Nakryiko Oct. 2, 2024, 8:02 p.m. UTC | #2
On Wed, Oct 2, 2024 at 12:25 AM Oleg Nesterov <oleg@redhat.com> wrote:
>
> On 10/01, Andrii Nakryiko wrote:
> >
> > +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> > +{
> > +     struct mm_struct *mm = current->mm;
> > +     struct uprobe *uprobe = NULL;
> > +     struct vm_area_struct *vma;
> > +     struct file *vm_file;
> > +     loff_t offset;
> > +     long seq;
> > +
> > +     guard(rcu)();
> > +
> > +     if (!mmap_lock_speculation_start(mm, &seq))
> > +             return NULL;
> > +
> > +     vma = vma_lookup(mm, bp_vaddr);
> > +     if (!vma)
> > +             return NULL;
> > +
> > +     /* vm_file memory can be reused for another instance of struct file,
> > +      * but can't be freed from under us, so it's safe to read fields from
> > +      * it, even if the values are some garbage values; ultimately
> > +      * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure
> > +      * that whatever we speculatively found is correct
> > +      */
> > +     vm_file = READ_ONCE(vma->vm_file);
> > +     if (!vm_file)
> > +             return NULL;
> > +
> > +     offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);
>
> LGTM. But perhaps vma->vm_pgoff and vma->vm_start need READ_ONCE() as well,
> if nothing else to shut up KCSAN if this code races with, say, __split_vma() ?

We keep going back and forth between reading directly, using
READ_ONCE(), and annotating with data_race(). I don't think it matters
in terms of correctness or performance, so I'm happy to add whatever
incantations that will make everyone satisfied. Let's see what others
think, and I'll incorporate that into the next revision.

>
> > +     uprobe = find_uprobe_rcu(vm_file->f_inode, offset);
>
> OK, I guess vm_file->f_inode is fine without READ_ONCE...
>
> Oleg.
>
Oleg Nesterov Oct. 3, 2024, 9:32 a.m. UTC | #3
On 10/02, Andrii Nakryiko wrote:
>
> On Wed, Oct 2, 2024 at 12:25 AM Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > > +     vm_file = READ_ONCE(vma->vm_file);
> > > +     if (!vm_file)
> > > +             return NULL;
> > > +
> > > +     offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);
> >
> > LGTM. But perhaps vma->vm_pgoff and vma->vm_start need READ_ONCE() as well,
> > if nothing else to shut up KCSAN if this code races with, say, __split_vma() ?
>
> We keep going back and forth between reading directly, using
> READ_ONCE(), and annotating with data_race(). I don't think it matters
> in terms of correctness or performance, so I'm happy to add whatever
> incantations that will make everyone satisfied. Let's see what others
> think, and I'll incorporate that into the next revision.

OK, agreed...

And I guess I was wrong anyway, READ_ONCE() alone won't shutup KCSAN in
this case.

Oleg.
kernel test robot Oct. 4, 2024, 11:59 p.m. UTC | #4
Hi Andrii,

kernel test robot noticed the following build errors:

[auto build test ERROR on tip/perf/core]

url:    https://github.com/intel-lab-lkp/linux/commits/Andrii-Nakryiko/mm-introduce-mmap_lock_speculation_-start-end/20241002-065354
base:   tip/perf/core
patch link:    https://lore.kernel.org/r/20241001225207.2215639-6-andrii%40kernel.org
patch subject: [PATCH v2 tip/perf/core 5/5] uprobes: add speculative lockless VMA-to-inode-to-uprobe resolution
config: loongarch-allyesconfig (https://download.01.org/0day-ci/archive/20241005/202410050745.2Nuvusy4-lkp@intel.com/config)
compiler: loongarch64-linux-gcc (GCC) 14.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241005/202410050745.2Nuvusy4-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202410050745.2Nuvusy4-lkp@intel.com/

All errors (new ones prefixed by >>):

   kernel/events/uprobes.c: In function 'find_active_uprobe_speculative':
>> kernel/events/uprobes.c:2098:46: error: passing argument 2 of 'mmap_lock_speculation_start' from incompatible pointer type [-Wincompatible-pointer-types]
    2098 |         if (!mmap_lock_speculation_start(mm, &seq))
         |                                              ^~~~
         |                                              |
         |                                              long int *
   In file included from include/linux/mm.h:16,
                    from arch/loongarch/include/asm/cacheflush.h:8,
                    from include/linux/cacheflush.h:5,
                    from include/linux/highmem.h:8,
                    from kernel/events/uprobes.c:13:
   include/linux/mmap_lock.h:126:75: note: expected 'int *' but argument is of type 'long int *'
     126 | static inline bool mmap_lock_speculation_start(struct mm_struct *mm, int *seq) { return false; }
         |                                                                      ~~~~~^~~


vim +/mmap_lock_speculation_start +2098 kernel/events/uprobes.c

  2086	
  2087	static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
  2088	{
  2089		struct mm_struct *mm = current->mm;
  2090		struct uprobe *uprobe = NULL;
  2091		struct vm_area_struct *vma;
  2092		struct file *vm_file;
  2093		loff_t offset;
  2094		long seq;
  2095	
  2096		guard(rcu)();
  2097	
> 2098		if (!mmap_lock_speculation_start(mm, &seq))
  2099			return NULL;
  2100	
  2101		vma = vma_lookup(mm, bp_vaddr);
  2102		if (!vma)
  2103			return NULL;
  2104	
  2105		/* vm_file memory can be reused for another instance of struct file,
  2106		 * but can't be freed from under us, so it's safe to read fields from
  2107		 * it, even if the values are some garbage values; ultimately
  2108		 * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure
  2109		 * that whatever we speculatively found is correct
  2110		 */
  2111		vm_file = READ_ONCE(vma->vm_file);
  2112		if (!vm_file)
  2113			return NULL;
  2114	
  2115		offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);
  2116		uprobe = find_uprobe_rcu(vm_file->f_inode, offset);
  2117		if (!uprobe)
  2118			return NULL;
  2119	
  2120		/* now double check that nothing about MM changed */
  2121		if (!mmap_lock_speculation_end(mm, seq))
  2122			return NULL;
  2123	
  2124		return uprobe;
  2125	}
  2126
kernel test robot Oct. 5, 2024, 1:12 a.m. UTC | #5
Hi Andrii,

kernel test robot noticed the following build errors:

[auto build test ERROR on tip/perf/core]

url:    https://github.com/intel-lab-lkp/linux/commits/Andrii-Nakryiko/mm-introduce-mmap_lock_speculation_-start-end/20241002-065354
base:   tip/perf/core
patch link:    https://lore.kernel.org/r/20241001225207.2215639-6-andrii%40kernel.org
patch subject: [PATCH v2 tip/perf/core 5/5] uprobes: add speculative lockless VMA-to-inode-to-uprobe resolution
config: i386-defconfig (https://download.01.org/0day-ci/archive/20241005/202410050846.Zbsm9OI1-lkp@intel.com/config)
compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241005/202410050846.Zbsm9OI1-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202410050846.Zbsm9OI1-lkp@intel.com/

All errors (new ones prefixed by >>):

>> kernel/events/uprobes.c:2098:39: error: incompatible pointer types passing 'long *' to parameter of type 'int *' [-Werror,-Wincompatible-pointer-types]
    2098 |         if (!mmap_lock_speculation_start(mm, &seq))
         |                                              ^~~~
   include/linux/mmap_lock.h:126:75: note: passing argument to parameter 'seq' here
     126 | static inline bool mmap_lock_speculation_start(struct mm_struct *mm, int *seq) { return false; }
         |                                                                           ^
   1 error generated.


vim +2098 kernel/events/uprobes.c

  2086	
  2087	static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
  2088	{
  2089		struct mm_struct *mm = current->mm;
  2090		struct uprobe *uprobe = NULL;
  2091		struct vm_area_struct *vma;
  2092		struct file *vm_file;
  2093		loff_t offset;
  2094		long seq;
  2095	
  2096		guard(rcu)();
  2097	
> 2098		if (!mmap_lock_speculation_start(mm, &seq))
  2099			return NULL;
  2100	
  2101		vma = vma_lookup(mm, bp_vaddr);
  2102		if (!vma)
  2103			return NULL;
  2104	
  2105		/* vm_file memory can be reused for another instance of struct file,
  2106		 * but can't be freed from under us, so it's safe to read fields from
  2107		 * it, even if the values are some garbage values; ultimately
  2108		 * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure
  2109		 * that whatever we speculatively found is correct
  2110		 */
  2111		vm_file = READ_ONCE(vma->vm_file);
  2112		if (!vm_file)
  2113			return NULL;
  2114	
  2115		offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);
  2116		uprobe = find_uprobe_rcu(vm_file->f_inode, offset);
  2117		if (!uprobe)
  2118			return NULL;
  2119	
  2120		/* now double check that nothing about MM changed */
  2121		if (!mmap_lock_speculation_end(mm, seq))
  2122			return NULL;
  2123	
  2124		return uprobe;
  2125	}
  2126
diff mbox series

Patch

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7bd9111b4e8b..960130275656 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2081,6 +2081,46 @@  static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
 	return is_trap_insn(&opcode);
 }
 
+static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
+{
+	struct mm_struct *mm = current->mm;
+	struct uprobe *uprobe = NULL;
+	struct vm_area_struct *vma;
+	struct file *vm_file;
+	loff_t offset;
+	long seq;
+
+	guard(rcu)();
+
+	if (!mmap_lock_speculation_start(mm, &seq))
+		return NULL;
+
+	vma = vma_lookup(mm, bp_vaddr);
+	if (!vma)
+		return NULL;
+
+	/* vm_file memory can be reused for another instance of struct file,
+	 * but can't be freed from under us, so it's safe to read fields from
+	 * it, even if the values are some garbage values; ultimately
+	 * find_uprobe_rcu() + mmap_lock_speculation_end() check will ensure
+	 * that whatever we speculatively found is correct
+	 */
+	vm_file = READ_ONCE(vma->vm_file);
+	if (!vm_file)
+		return NULL;
+
+	offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);
+	uprobe = find_uprobe_rcu(vm_file->f_inode, offset);
+	if (!uprobe)
+		return NULL;
+
+	/* now double check that nothing about MM changed */
+	if (!mmap_lock_speculation_end(mm, seq))
+		return NULL;
+
+	return uprobe;
+}
+
 /* assumes being inside RCU protected region */
 static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
 {
@@ -2088,6 +2128,10 @@  static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
 	struct uprobe *uprobe = NULL;
 	struct vm_area_struct *vma;
 
+	uprobe = find_active_uprobe_speculative(bp_vaddr);
+	if (uprobe)
+		return uprobe;
+
 	mmap_read_lock(mm);
 	vma = vma_lookup(mm, bp_vaddr);
 	if (vma) {