
[RFC,v3,13/13] uprobes: add speculative lockless VMA to inode resolution

Message ID 20240813042917.506057-14-andrii@kernel.org (mailing list archive)
State Handled Elsewhere
Series uprobes: RCU-protected hot path optimizations


Commit Message

Andrii Nakryiko Aug. 13, 2024, 4:29 a.m. UTC
Now that files_cachep is SLAB_TYPESAFE_BY_RCU, we can safely access
vma->vm_file->f_inode locklessly, protected only by rcu_read_lock(),
and attempt the uprobe lookup speculatively.

We rely on the newly added mmap_lock_speculation_{start,end}() helpers
to validate that mm_struct stays intact for the entire duration of this
speculation. If it doesn't, we fall back to the mmap_lock-protected
lookup.

This allows us to avoid contention on mmap_lock in the absolute
majority of cases, nicely improving uprobe/uretprobe scalability.

BEFORE
======
uprobe-nop            ( 1 cpus):    3.417 ± 0.013M/s  (  3.417M/s/cpu)
uprobe-nop            ( 2 cpus):    5.724 ± 0.006M/s  (  2.862M/s/cpu)
uprobe-nop            ( 3 cpus):    8.543 ± 0.012M/s  (  2.848M/s/cpu)
uprobe-nop            ( 4 cpus):   11.094 ± 0.004M/s  (  2.774M/s/cpu)
uprobe-nop            ( 5 cpus):   13.703 ± 0.006M/s  (  2.741M/s/cpu)
uprobe-nop            ( 6 cpus):   16.350 ± 0.010M/s  (  2.725M/s/cpu)
uprobe-nop            ( 7 cpus):   19.100 ± 0.031M/s  (  2.729M/s/cpu)
uprobe-nop            ( 8 cpus):   20.138 ± 0.029M/s  (  2.517M/s/cpu)
uprobe-nop            (10 cpus):   20.161 ± 0.020M/s  (  2.016M/s/cpu)
uprobe-nop            (12 cpus):   15.129 ± 0.011M/s  (  1.261M/s/cpu)
uprobe-nop            (14 cpus):   15.013 ± 0.013M/s  (  1.072M/s/cpu)
uprobe-nop            (16 cpus):   13.352 ± 0.007M/s  (  0.834M/s/cpu)
uprobe-nop            (24 cpus):   12.470 ± 0.005M/s  (  0.520M/s/cpu)
uprobe-nop            (32 cpus):   11.252 ± 0.042M/s  (  0.352M/s/cpu)
uprobe-nop            (40 cpus):   10.308 ± 0.001M/s  (  0.258M/s/cpu)
uprobe-nop            (48 cpus):   11.037 ± 0.007M/s  (  0.230M/s/cpu)
uprobe-nop            (56 cpus):   12.055 ± 0.002M/s  (  0.215M/s/cpu)
uprobe-nop            (64 cpus):   12.895 ± 0.004M/s  (  0.201M/s/cpu)
uprobe-nop            (72 cpus):   13.995 ± 0.005M/s  (  0.194M/s/cpu)
uprobe-nop            (80 cpus):   15.224 ± 0.030M/s  (  0.190M/s/cpu)

AFTER
=====
uprobe-nop            ( 1 cpus):    3.562 ± 0.006M/s  (  3.562M/s/cpu)
uprobe-nop            ( 2 cpus):    6.751 ± 0.007M/s  (  3.376M/s/cpu)
uprobe-nop            ( 3 cpus):   10.121 ± 0.007M/s  (  3.374M/s/cpu)
uprobe-nop            ( 4 cpus):   13.100 ± 0.007M/s  (  3.275M/s/cpu)
uprobe-nop            ( 5 cpus):   16.321 ± 0.008M/s  (  3.264M/s/cpu)
uprobe-nop            ( 6 cpus):   19.612 ± 0.004M/s  (  3.269M/s/cpu)
uprobe-nop            ( 7 cpus):   22.910 ± 0.037M/s  (  3.273M/s/cpu)
uprobe-nop            ( 8 cpus):   24.705 ± 0.011M/s  (  3.088M/s/cpu)
uprobe-nop            (10 cpus):   30.772 ± 0.020M/s  (  3.077M/s/cpu)
uprobe-nop            (12 cpus):   33.614 ± 0.009M/s  (  2.801M/s/cpu)
uprobe-nop            (14 cpus):   39.166 ± 0.004M/s  (  2.798M/s/cpu)
uprobe-nop            (16 cpus):   41.692 ± 0.014M/s  (  2.606M/s/cpu)
uprobe-nop            (24 cpus):   64.802 ± 0.048M/s  (  2.700M/s/cpu)
uprobe-nop            (32 cpus):   84.226 ± 0.223M/s  (  2.632M/s/cpu)
uprobe-nop            (40 cpus):  102.071 ± 0.067M/s  (  2.552M/s/cpu)
uprobe-nop            (48 cpus):  106.603 ± 1.198M/s  (  2.221M/s/cpu)
uprobe-nop            (56 cpus):  117.695 ± 0.059M/s  (  2.102M/s/cpu)
uprobe-nop            (64 cpus):  124.291 ± 0.485M/s  (  1.942M/s/cpu)
uprobe-nop            (72 cpus):  135.527 ± 0.134M/s  (  1.882M/s/cpu)
uprobe-nop            (80 cpus):  146.195 ± 0.230M/s  (  1.827M/s/cpu)

Previously, total throughput was maxing out at 20 mln/s with 8-10
cores, declining afterwards. With this change it keeps growing with
each added CPU, reaching 146 mln/s at 80 CPUs (measured on an 80-core
Intel(R) Xeon(R) Gold 6138 CPU @ 2.00GHz system).

Suggested-by: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 kernel/events/uprobes.c | 51 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
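
Since the diff body is not reproduced above, here is the core of the
new helper as quoted in the review below, with a few comments added for
readability:

static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
{
	const vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
	struct mm_struct *mm = current->mm;
	struct uprobe *uprobe;
	struct vm_area_struct *vma;
	struct file *vm_file;
	struct inode *vm_inode;
	unsigned long vm_pgoff, vm_start;
	int seq;
	loff_t offset;

	/* snapshot mm_lock_seq; bail out if mmap_lock is write-locked */
	if (!mmap_lock_speculation_start(mm, &seq))
		return NULL;

	rcu_read_lock();

	vma = vma_lookup(mm, bp_vaddr);
	if (!vma)
		goto bail;

	/* racy reads by design; everything is revalidated at the end */
	vm_file = data_race(vma->vm_file);
	if (!vm_file || (vma->vm_flags & flags) != VM_MAYEXEC)
		goto bail;

	vm_inode = data_race(vm_file->f_inode);
	vm_pgoff = data_race(vma->vm_pgoff);
	vm_start = data_race(vma->vm_start);

	offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
	uprobe = find_uprobe_rcu(vm_inode, offset);
	if (!uprobe)
		goto bail;

	/* now double check that nothing about MM changed */
	if (!mmap_lock_speculation_end(mm, seq))
		goto bail;

	rcu_read_unlock();

	/* happy case, we speculated successfully */
	return uprobe;
bail:
	rcu_read_unlock();
	return NULL;
}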

Comments

Mateusz Guzik Aug. 13, 2024, 6:17 a.m. UTC | #1
On Mon, Aug 12, 2024 at 09:29:17PM -0700, Andrii Nakryiko wrote:
> Now that files_cachep is SLAB_TYPESAFE_BY_RCU, we can safely access
> vma->vm_file->f_inode lockless only under rcu_read_lock() protection,
> attempting uprobe look up speculatively.
> 
> We rely on newly added mmap_lock_speculation_{start,end}() helpers to
> validate that mm_struct stays intact for entire duration of this
> speculation. If not, we fall back to mmap_lock-protected lookup.
> 
> This allows to avoid contention on mmap_lock in absolutely majority of
> cases, nicely improving uprobe/uretprobe scalability.
> 

Here I have to admit to being mostly ignorant about the mm, so bear with
me. :>

I note the result of find_active_uprobe_speculative is immediately stale
in the face of modifications.

The thing I'm after is that the mmap_lock_speculation business adds
overhead on archs where a release fence is not a de facto nop, and I
don't believe the commit message justifies it. Definitely a bummer to
add it merely for uprobes. If there are bigger plans concerning it,
that's a different story of course.

With this in mind I have to ask if instead you could perhaps get away
with the already present per-vma sequence counter?
Suren Baghdasaryan Aug. 13, 2024, 3:36 p.m. UTC | #2
On Mon, Aug 12, 2024 at 11:18 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
>
> On Mon, Aug 12, 2024 at 09:29:17PM -0700, Andrii Nakryiko wrote:
> > Now that files_cachep is SLAB_TYPESAFE_BY_RCU, we can safely access
> > vma->vm_file->f_inode lockless only under rcu_read_lock() protection,
> > attempting uprobe look up speculatively.
> >
> > We rely on newly added mmap_lock_speculation_{start,end}() helpers to
> > validate that mm_struct stays intact for entire duration of this
> > speculation. If not, we fall back to mmap_lock-protected lookup.
> >
> > This allows to avoid contention on mmap_lock in absolutely majority of
> > cases, nicely improving uprobe/uretprobe scalability.
> >
>
> Here I have to admit to being mostly ignorant about the mm, so bear with
> me. :>
>
> I note the result of find_active_uprobe_speculative is immediately stale
> in face of modifications.
>
> The thing I'm after is that the mmap_lock_speculation business adds
> overhead on archs where a release fence is not a de facto nop and I
> don't believe the commit message justifies it. Definitely a bummer to
> add merely it for uprobes. If there are bigger plans concerning it
> that's a different story of course.
>
> With this in mind I have to ask if instead you could perhaps get away
> with the already present per-vma sequence counter?

per-vma sequence counter does not implement acquire/release logic, it
relies on vma->vm_lock for synchronization. So if we want to use it,
we would have to add additional memory barriers here. This is likely
possible but as I mentioned before we would need to ensure the
pagefault path does not regress. OTOH mm->mm_lock_seq is already
halfway there (it implements acquire/release logic); we just had to
ensure mmap_write_lock() increments mm->mm_lock_seq.

So, from the release fence overhead POV I think whether we use
mm->mm_lock_seq or vma->vm_lock, we would still need a proper fence
here.
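
For illustration only, such helpers built on mm->mm_lock_seq could look
roughly like the sketch below; the actual definitions are added earlier
in this series and may differ in detail (the odd/even "writer active"
encoding here is an assumption):

static inline bool mmap_lock_speculation_start(struct mm_struct *mm, int *seq)
{
	/* pairs with the release when a writer bumps mm_lock_seq */
	*seq = smp_load_acquire(&mm->mm_lock_seq);
	/* sketch: treat an odd value as "mmap_lock held for write" */
	return !(*seq & 1);
}

static inline bool mmap_lock_speculation_end(struct mm_struct *mm, int seq)
{
	/* order all speculative reads before the recheck */
	smp_rmb();
	/* any writer in the meantime would have changed the sequence */
	return READ_ONCE(mm->mm_lock_seq) == seq;
}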
Mateusz Guzik Aug. 15, 2024, 1:44 p.m. UTC | #3
On Tue, Aug 13, 2024 at 08:36:03AM -0700, Suren Baghdasaryan wrote:
> On Mon, Aug 12, 2024 at 11:18 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
> >
> > On Mon, Aug 12, 2024 at 09:29:17PM -0700, Andrii Nakryiko wrote:
> > > Now that files_cachep is SLAB_TYPESAFE_BY_RCU, we can safely access
> > > vma->vm_file->f_inode lockless only under rcu_read_lock() protection,
> > > attempting uprobe look up speculatively.
> > >
> > > We rely on newly added mmap_lock_speculation_{start,end}() helpers to
> > > validate that mm_struct stays intact for entire duration of this
> > > speculation. If not, we fall back to mmap_lock-protected lookup.
> > >
> > > This allows to avoid contention on mmap_lock in absolutely majority of
> > > cases, nicely improving uprobe/uretprobe scalability.
> > >
> >
> > Here I have to admit to being mostly ignorant about the mm, so bear with
> > me. :>
> >
> > I note the result of find_active_uprobe_speculative is immediately stale
> > in face of modifications.
> >
> > The thing I'm after is that the mmap_lock_speculation business adds
> > overhead on archs where a release fence is not a de facto nop and I
> > don't believe the commit message justifies it. Definitely a bummer to
> > add merely it for uprobes. If there are bigger plans concerning it
> > that's a different story of course.
> >
> > With this in mind I have to ask if instead you could perhaps get away
> > with the already present per-vma sequence counter?
> 
> per-vma sequence counter does not implement acquire/release logic, it
> relies on vma->vm_lock for synchronization. So if we want to use it,
> we would have to add additional memory barriers here. This is likely
> possible but as I mentioned before we would need to ensure the
> pagefault path does not regress. OTOH mm->mm_lock_seq already halfway
> there (it implements acquire/release logic), we just had to ensure
> mmap_write_lock() increments mm->mm_lock_seq.
> 
> So, from the release fence overhead POV I think whether we use
> mm->mm_lock_seq or vma->vm_lock, we would still need a proper fence
> here.
> 

Per my previous e-mail I'm not particularly familiar with mm internals,
so I'm going to handwave a little bit with my $0,03 concerning multicore
in general and if you disagree with it that's your business. For the
time being I have no interest in digging into any of this.

Before I do, to prevent this thread from being a total waste, here are
some remarks concerning the patch with the assumption that the core idea
lands.

From the commit message:
> Now that files_cachep is SLAB_TYPESAFE_BY_RCU, we can safely access
> vma->vm_file->f_inode lockless only under rcu_read_lock() protection,
> attempting uprobe look up speculatively.

Just in case I'll note a nit that this paragraph will need to be removed
since the patch adding the flag is getting dropped.

A non-nit which may or may not end up mattering is that the flag (which
*is* set on the filep slab cache) makes things more difficult to
validate. Normal RCU usage guarantees that the object itself won't be
freed as long as you follow the rules. However, the SLAB_TYPESAFE_BY_RCU
flag weakens it significantly -- the thing at hand will always be a
'struct file', but it may get reallocated to *another* file from under
you. Whether this aspect plays a role here I don't know.
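
For reference, the canonical way SLAB_TYPESAFE_BY_RCU consumers guard
against such reuse is to take a reference and then re-check identity --
a generic sketch, not code from this patch:

struct file *file;

rcu_read_lock();
file = READ_ONCE(vma->vm_file);
/*
 * The memory stays a 'struct file' under RCU, but it may have been
 * recycled into *another* file, so pin it and re-check that it is
 * still the file this vma maps before trusting anything read from it.
 */
if (file && atomic_long_inc_not_zero(&file->f_count)) {
	if (READ_ONCE(vma->vm_file) != file) {
		fput(file);
		file = NULL;
	}
} else {
	file = NULL;
}
rcu_read_unlock();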

> +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> +{
> +	const vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
> +	struct mm_struct *mm = current->mm;
> +	struct uprobe *uprobe;
> +	struct vm_area_struct *vma;
> +	struct file *vm_file;
> +	struct inode *vm_inode;
> +	unsigned long vm_pgoff, vm_start;
> +	int seq;
> +	loff_t offset;
> +
> +	if (!mmap_lock_speculation_start(mm, &seq))
> +		return NULL;
> +
> +	rcu_read_lock();
> +

I don't think there is a correctness problem here, but entering rcu
*after* deciding to speculatively do the lookup feels backwards.

> +	vma = vma_lookup(mm, bp_vaddr);
> +	if (!vma)
> +		goto bail;
> +
> +	vm_file = data_race(vma->vm_file);
> +	if (!vm_file || (vma->vm_flags & flags) != VM_MAYEXEC)
> +		goto bail;
> +

If vma teardown is allowed to progress and the file got fput'ed...

> +	vm_inode = data_race(vm_file->f_inode);

... the inode can be NULL, I don't know if that's handled.

More importantly though, per my previous description of
SLAB_TYPESAFE_BY_RCU, by now the file could have been reallocated and
the inode you did find is completely unrelated.

I understand the intent is to backpedal from everything should the mm
seqc change, but the above may happen to matter.

> +	vm_pgoff = data_race(vma->vm_pgoff);
> +	vm_start = data_race(vma->vm_start);
> +
> +	offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> +	uprobe = find_uprobe_rcu(vm_inode, offset);
> +	if (!uprobe)
> +		goto bail;
> +
> +	/* now double check that nothing about MM changed */
> +	if (!mmap_lock_speculation_end(mm, seq))
> +		goto bail;

This leaks the reference obtained by find_uprobe_rcu().

> +
> +	rcu_read_unlock();
> +
> +	/* happy case, we speculated successfully */
> +	return uprobe;
> +bail:
> +	rcu_read_unlock();
> +	return NULL;
> +}

Now to some handwaving, here it is:

The core of my concern is that adding more work to down_write on the
mmap semaphore comes with certain side-effects, and plausibly a more
than sufficient speedup can be achieved without doing it.

An mm-wide mechanism is just incredibly coarse-grained and it may happen
to perform poorly when faced with a program which likes to mess with its
address space -- the fast path is going to keep failing and only
inducing *more* overhead as the code decides to down_read the mmap
semaphore.
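
For context, in this series the speculative path sits in front of the
existing mmap_read_lock-protected lookup, roughly like the sketch below
(based on the current find_active_uprobe() logic in
kernel/events/uprobes.c; exact naming in the series may differ):

static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
{
	struct mm_struct *mm = current->mm;
	struct uprobe *uprobe = NULL;
	struct vm_area_struct *vma;

	/* lockless fast path; NULL means "retry under mmap_lock" */
	uprobe = find_active_uprobe_speculative(bp_vaddr);
	if (uprobe)
		return uprobe;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, bp_vaddr);
	if (vma) {
		if (valid_vma(vma, false)) {
			struct inode *inode = file_inode(vma->vm_file);
			loff_t offset = vaddr_to_offset(vma, bp_vaddr);

			uprobe = find_uprobe_rcu(inode, offset);
		}
		if (!uprobe)
			*is_swbp = is_trap_at_addr(mm, bp_vaddr);
	} else {
		*is_swbp = -EFAULT;
	}
	mmap_read_unlock(mm);
	return uprobe;
}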

Furthermore there may be work currently synchronized with down_write
which perhaps can transition to "merely" down_read, but by the time it
happens this and possibly other consumers expect a change in the
sequence counter, messing with it.

To my understanding the kernel supports parallel faults with per-vma
locking. I would find it surprising if the same machinery could not be
used to sort out uprobe handling above.

I presume a down_read on vma around all the work would also sort out any
issues concerning stability of the file or inode objects.

Of course single-threaded performance would take a hit due to the
atomics stemming from down/up_read, and parallel uprobe lookups on the
same vma would also get slower, but I don't know if that's a problem
for a real workload.

I would not have any comments if all speed ups were achieved without
modifying non-uprobe code.
Andrii Nakryiko Aug. 15, 2024, 4:47 p.m. UTC | #4
On Thu, Aug 15, 2024 at 6:44 AM Mateusz Guzik <mjguzik@gmail.com> wrote:
>
> On Tue, Aug 13, 2024 at 08:36:03AM -0700, Suren Baghdasaryan wrote:
> > On Mon, Aug 12, 2024 at 11:18 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
> > >
> > > On Mon, Aug 12, 2024 at 09:29:17PM -0700, Andrii Nakryiko wrote:
> > > > Now that files_cachep is SLAB_TYPESAFE_BY_RCU, we can safely access
> > > > vma->vm_file->f_inode lockless only under rcu_read_lock() protection,
> > > > attempting uprobe look up speculatively.
> > > >
> > > > We rely on newly added mmap_lock_speculation_{start,end}() helpers to
> > > > validate that mm_struct stays intact for entire duration of this
> > > > speculation. If not, we fall back to mmap_lock-protected lookup.
> > > >
> > > > This allows to avoid contention on mmap_lock in absolutely majority of
> > > > cases, nicely improving uprobe/uretprobe scalability.
> > > >
> > >
> > > Here I have to admit to being mostly ignorant about the mm, so bear with
> > > me. :>
> > >
> > > I note the result of find_active_uprobe_speculative is immediately stale
> > > in face of modifications.
> > >
> > > The thing I'm after is that the mmap_lock_speculation business adds
> > > overhead on archs where a release fence is not a de facto nop and I
> > > don't believe the commit message justifies it. Definitely a bummer to
> > > add merely it for uprobes. If there are bigger plans concerning it
> > > that's a different story of course.
> > >
> > > With this in mind I have to ask if instead you could perhaps get away
> > > with the already present per-vma sequence counter?
> >
> > per-vma sequence counter does not implement acquire/release logic, it
> > relies on vma->vm_lock for synchronization. So if we want to use it,
> > we would have to add additional memory barriers here. This is likely
> > possible but as I mentioned before we would need to ensure the
> > pagefault path does not regress. OTOH mm->mm_lock_seq already halfway
> > there (it implements acquire/release logic), we just had to ensure
> > mmap_write_lock() increments mm->mm_lock_seq.
> >
> > So, from the release fence overhead POV I think whether we use
> > mm->mm_lock_seq or vma->vm_lock, we would still need a proper fence
> > here.
> >
>
> Per my previous e-mail I'm not particularly familiar with mm internals,
> so I'm going to handwave a little bit with my $0,03 concerning multicore
> in general and if you disagree with it that's your business. For the
> time being I have no interest in digging into any of this.
>
> Before I do, to prevent this thread from being a total waste, here are
> some remarks concerning the patch with the assumption that the core idea
> lands.
>
> From the commit message:
> > Now that files_cachep is SLAB_TYPESAFE_BY_RCU, we can safely access
> > vma->vm_file->f_inode lockless only under rcu_read_lock() protection,
> > attempting uprobe look up speculatively.
>
> Just in case I'll note a nit that this paragraph will need to be removed
> since the patch adding the flag is getting dropped.

Yep, of course, I'll update all that for the next revision (I'll wait
for non-RFC patches to land first before reposting).

>
> A non-nit which may or may not end up mattering is that the flag (which
> *is* set on the filep slab cache) makes things more difficult to
> validate. Normal RCU usage guarantees that the object itself wont be
> freed as long you follow the rules. However, the SLAB_TYPESAFE_BY_RCU
> flag weakens it significantly -- the thing at hand will always be a
> 'struct file', but it may get reallocated to *another* file from under
> you. Whether this aspect plays a role here I don't know.

Yes, that's ok and is accounted for. We care about that memory not
going away from under us (I'm not even sure it matters that it is
still a struct file, tbh; I think that shouldn't matter, as we are
prepared to deal with completely garbage values read from struct
file).

>
> > +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> > +{
> > +     const vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
> > +     struct mm_struct *mm = current->mm;
> > +     struct uprobe *uprobe;
> > +     struct vm_area_struct *vma;
> > +     struct file *vm_file;
> > +     struct inode *vm_inode;
> > +     unsigned long vm_pgoff, vm_start;
> > +     int seq;
> > +     loff_t offset;
> > +
> > +     if (!mmap_lock_speculation_start(mm, &seq))
> > +             return NULL;
> > +
> > +     rcu_read_lock();
> > +
>
> I don't think there is a correctness problem here, but entering rcu
> *after* deciding to speculatively do the lookup feels backwards.

RCU should protect VMA and file, mm itself won't go anywhere, so this seems ok.

>
> > +     vma = vma_lookup(mm, bp_vaddr);
> > +     if (!vma)
> > +             goto bail;
> > +
> > +     vm_file = data_race(vma->vm_file);
> > +     if (!vm_file || (vma->vm_flags & flags) != VM_MAYEXEC)
> > +             goto bail;
> > +
>
> If vma teardown is allowed to progress and the file got fput'ed...
>
> > +     vm_inode = data_race(vm_file->f_inode);
>
> ... the inode can be NULL, I don't know if that's handled.
>

Yep, the inode pointer value is part of the RB-tree key, so if it's
NULL we just won't find a matching uprobe. Same for any other "garbage"
f_inode value. Importantly, we should never dereference such inode
pointers, at least not until we find a valid uprobe (in which case the
uprobe itself keeps a reference to the inode).
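
To make that concrete, the lookup compares the inode pointer value as
part of the tree key and never dereferences it; a rough sketch of the
comparison (illustrative, not the exact helper from the series):

static int uprobe_cmp_key(const struct inode *inode, loff_t offset,
			  const struct uprobe *u)
{
	/* the pointer value is only compared, never dereferenced */
	if (inode < u->inode)
		return -1;
	if (inode > u->inode)
		return 1;
	if (offset < u->offset)
		return -1;
	if (offset > u->offset)
		return 1;
	return 0;
}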

> More importantly though, per my previous description of
> SLAB_TYPESAFE_BY_RCU, by now the file could have been reallocated and
> the inode you did find is completely unrelated.
>
> I understand the intent is to backpedal from everything should the mm
> seqc change, but the above may happen to matter.

Yes, I think we took that into account. All that we care about is
memory "type safety", i.e., even if struct file's memory is reused,
it's ok: we'll eventually detect the change and discard the wrong
uprobe that we might have looked up by accident (though probably in
most cases we just won't find a uprobe at all).

>
> > +     vm_pgoff = data_race(vma->vm_pgoff);
> > +     vm_start = data_race(vma->vm_start);
> > +
> > +     offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > +     uprobe = find_uprobe_rcu(vm_inode, offset);
> > +     if (!uprobe)
> > +             goto bail;
> > +
> > +     /* now double check that nothing about MM changed */
> > +     if (!mmap_lock_speculation_end(mm, seq))
> > +             goto bail;
>
> This leaks the reference obtained by find_uprobe_rcu().

find_uprobe_rcu() doesn't obtain a reference; the uprobe is
RCU-protected, and if the caller needs a refcount bump it will have to
use try_get_uprobe() (which might fail).
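
Roughly, the intended pattern for a caller that does need to hold the
uprobe beyond the RCU-protected region would be (a sketch; names as
used in this series, details assumed):

rcu_read_lock();
uprobe = find_uprobe_rcu(inode, offset);  /* RCU-protected, no refcount taken */
if (uprobe && !try_get_uprobe(uprobe))    /* may fail if uprobe is going away */
	uprobe = NULL;
rcu_read_unlock();
/* if non-NULL, 'uprobe' now holds its own reference */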

>
> > +
> > +     rcu_read_unlock();
> > +
> > +     /* happy case, we speculated successfully */
> > +     return uprobe;
> > +bail:
> > +     rcu_read_unlock();
> > +     return NULL;
> > +}
>
> Now to some handwaving, here it is:
>
> The core of my concern is that adding more work to down_write on the
> mmap semaphore comes with certain side-effects and plausibly more than a
> sufficient speed up can be achieved without doing it.
>
> An mm-wide mechanism is just incredibly coarse-grained and it may happen
> to perform poorly when faced with a program which likes to mess with its
> address space -- the fast path is going to keep failing and only
> inducing *more* overhead as the code decides to down_read the mmap
> semaphore.
>
> Furthermore there may be work currently synchronized with down_write
> which perhaps can transition to "merely" down_read, but by the time it
> happens this and possibly other consumers expect a change in the
> sequence counter, messing with it.
>
> To my understanding the kernel supports parallel faults with per-vma
> locking. I would find it surprising if the same machinery could not be
> used to sort out uprobe handling above.

per-vma locking is still *locking*. Which means memory sharing between
multiple CPUs, which means limited scalability. Lots of work in this
series went to avoid even refcounting (as I pointed out for
find_uprobe_rcu()) due to the same reason, and so relying on per-VMA
locking is just shifting the bottleneck from mmap_lock to
vma->vm_lock. Worst (and not uncommon) case is the same uprobe in the
same process (and thus vma) being hit on multiple CPUs at the same
time. Whether that's protected by mmap_lock or vma->vm_lock is
immaterial at that point (from scalability standpoint).

>
> I presume a down_read on vma around all the work would also sort out any
> issues concerning stability of the file or inode objects.
>
> Of course single-threaded performance would take a hit due to atomic
> stemming from down/up_read and parallel uprobe lookups on the same vma
> would also get slower, but I don't know if that's a problem for a real
> workload.
>
> I would not have any comments if all speed ups were achieved without
> modifying non-uprobe code.

I'm also not a mm-focused person, so I'll let Suren and others address
mm-specific concerns, but I (hopefully) addressed all the
uprobe-related questions and concerns you had.
Suren Baghdasaryan Aug. 15, 2024, 5:45 p.m. UTC | #5
On Thu, Aug 15, 2024 at 9:47 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Thu, Aug 15, 2024 at 6:44 AM Mateusz Guzik <mjguzik@gmail.com> wrote:
> >
> > On Tue, Aug 13, 2024 at 08:36:03AM -0700, Suren Baghdasaryan wrote:
> > > On Mon, Aug 12, 2024 at 11:18 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
> > > >
> > > > On Mon, Aug 12, 2024 at 09:29:17PM -0700, Andrii Nakryiko wrote:
> > > > > Now that files_cachep is SLAB_TYPESAFE_BY_RCU, we can safely access
> > > > > vma->vm_file->f_inode lockless only under rcu_read_lock() protection,
> > > > > attempting uprobe look up speculatively.
> > > > >
> > > > > We rely on newly added mmap_lock_speculation_{start,end}() helpers to
> > > > > validate that mm_struct stays intact for entire duration of this
> > > > > speculation. If not, we fall back to mmap_lock-protected lookup.
> > > > >
> > > > > This allows to avoid contention on mmap_lock in absolutely majority of
> > > > > cases, nicely improving uprobe/uretprobe scalability.
> > > > >
> > > >
> > > > Here I have to admit to being mostly ignorant about the mm, so bear with
> > > > me. :>
> > > >
> > > > I note the result of find_active_uprobe_speculative is immediately stale
> > > > in face of modifications.
> > > >
> > > > The thing I'm after is that the mmap_lock_speculation business adds
> > > > overhead on archs where a release fence is not a de facto nop and I
> > > > don't believe the commit message justifies it. Definitely a bummer to
> > > > add merely it for uprobes. If there are bigger plans concerning it
> > > > that's a different story of course.
> > > >
> > > > With this in mind I have to ask if instead you could perhaps get away
> > > > with the already present per-vma sequence counter?
> > >
> > > per-vma sequence counter does not implement acquire/release logic, it
> > > relies on vma->vm_lock for synchronization. So if we want to use it,
> > > we would have to add additional memory barriers here. This is likely
> > > possible but as I mentioned before we would need to ensure the
> > > pagefault path does not regress. OTOH mm->mm_lock_seq already halfway
> > > there (it implements acquire/release logic), we just had to ensure
> > > mmap_write_lock() increments mm->mm_lock_seq.
> > >
> > > So, from the release fence overhead POV I think whether we use
> > > mm->mm_lock_seq or vma->vm_lock, we would still need a proper fence
> > > here.
> > >
> >
> > Per my previous e-mail I'm not particularly familiar with mm internals,
> > so I'm going to handwave a little bit with my $0,03 concerning multicore
> > in general and if you disagree with it that's your business. For the
> > time being I have no interest in digging into any of this.
> >
> > Before I do, to prevent this thread from being a total waste, here are
> > some remarks concerning the patch with the assumption that the core idea
> > lands.
> >
> > From the commit message:
> > > Now that files_cachep is SLAB_TYPESAFE_BY_RCU, we can safely access
> > > vma->vm_file->f_inode lockless only under rcu_read_lock() protection,
> > > attempting uprobe look up speculatively.
> >
> > Just in case I'll note a nit that this paragraph will need to be removed
> > since the patch adding the flag is getting dropped.
>
> Yep, of course, I'll update all that for the next revision (I'll wait
> for non-RFC patches to land first before reposting).
>
> >
> > A non-nit which may or may not end up mattering is that the flag (which
> > *is* set on the filep slab cache) makes things more difficult to
> > validate. Normal RCU usage guarantees that the object itself wont be
> > freed as long you follow the rules. However, the SLAB_TYPESAFE_BY_RCU
> > flag weakens it significantly -- the thing at hand will always be a
> > 'struct file', but it may get reallocated to *another* file from under
> > you. Whether this aspect plays a role here I don't know.
>
> Yes, that's ok and is accounted for. We care about that memory not
> going even from under us (I'm not even sure if it matters that it is
> still a struct file, tbh; I think that shouldn't matter as we are
> prepared to deal with completely garbage values read from struct
> file).

Correct, with SLAB_TYPESAFE_BY_RCU we do need an additional check that
vma->vm_file has not been freed and reused. That's where
mmap_lock_speculation_{start|end} help us. For vma->vm_file to change
from under us, one would have to take mmap_lock for write. If that
happens, mmap_lock_speculation_{start|end} should detect it and
terminate our speculation.

>
> >
> > > +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> > > +{
> > > +     const vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
> > > +     struct mm_struct *mm = current->mm;
> > > +     struct uprobe *uprobe;
> > > +     struct vm_area_struct *vma;
> > > +     struct file *vm_file;
> > > +     struct inode *vm_inode;
> > > +     unsigned long vm_pgoff, vm_start;
> > > +     int seq;
> > > +     loff_t offset;
> > > +
> > > +     if (!mmap_lock_speculation_start(mm, &seq))
> > > +             return NULL;
> > > +
> > > +     rcu_read_lock();
> > > +
> >
> > I don't think there is a correctness problem here, but entering rcu
> > *after* deciding to speculatively do the lookup feels backwards.
>
> RCU should protect VMA and file, mm itself won't go anywhere, so this seems ok.
>
> >
> > > +     vma = vma_lookup(mm, bp_vaddr);
> > > +     if (!vma)
> > > +             goto bail;
> > > +
> > > +     vm_file = data_race(vma->vm_file);
> > > +     if (!vm_file || (vma->vm_flags & flags) != VM_MAYEXEC)
> > > +             goto bail;
> > > +
> >
> > If vma teardown is allowed to progress and the file got fput'ed...
> >
> > > +     vm_inode = data_race(vm_file->f_inode);
> >
> > ... the inode can be NULL, I don't know if that's handled.
> >
>
> Yep, inode pointer value is part of RB-tree key, so if it's NULL, we
> just won't find a matching uprobe. Same for any other "garbage"
> f_inode value. Importantly, we never should dereference such inode
> pointers, at least until we find a valid uprobe (in which case we keep
> inode reference to it).
>
> > More importantly though, per my previous description of
> > SLAB_TYPESAFE_BY_RCU, by now the file could have been reallocated and
> > the inode you did find is completely unrelated.
> >
> > I understand the intent is to backpedal from everything should the mm
> > seqc change, but the above may happen to matter.
>
> Yes, I think we took that into account. All that we care about is
> memory "type safety", i.e., even if struct file's memory is reused,
> it's ok, we'll eventually detect the change and will discard wrong
> uprobe that we might by accident lookup (though probably in most cases
> we just won't find a uprobe at all).
>
> >
> > > +     vm_pgoff = data_race(vma->vm_pgoff);
> > > +     vm_start = data_race(vma->vm_start);
> > > +
> > > +     offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > > +     uprobe = find_uprobe_rcu(vm_inode, offset);
> > > +     if (!uprobe)
> > > +             goto bail;
> > > +
> > > +     /* now double check that nothing about MM changed */
> > > +     if (!mmap_lock_speculation_end(mm, seq))
> > > +             goto bail;
> >
> > This leaks the reference obtained by find_uprobe_rcu().
>
> find_uprobe_rcu() doesn't obtain a reference, uprobe is RCU-protected,
> and if caller need a refcount bump it will have to use
> try_get_uprobe() (which might fail).
>
> >
> > > +
> > > +     rcu_read_unlock();
> > > +
> > > +     /* happy case, we speculated successfully */
> > > +     return uprobe;
> > > +bail:
> > > +     rcu_read_unlock();
> > > +     return NULL;
> > > +}
> >
> > Now to some handwaving, here it is:
> >
> > The core of my concern is that adding more work to down_write on the
> > mmap semaphore comes with certain side-effects and plausibly more than a
> > sufficient speed up can be achieved without doing it.

AFAIK writers of mmap_lock are not considered a fast path. In a sense
yes, we made any writer a bit heavier but OTOH we also made
mm->mm_lock_seq a proper sequence count which allows us to locklessly
check if mmap_lock is write-locked. I think you asked whether there
will be other uses for mmap_lock_speculation_{start|end} and yes. For
example, I am planning to use them for printing /proc/{pid}/maps
without taking mmap_lock (when it's uncontended). If we have VMA seq
counter-based detection it would be better (see below).

> >
> > An mm-wide mechanism is just incredibly coarse-grained and it may happen
> > to perform poorly when faced with a program which likes to mess with its
> > address space -- the fast path is going to keep failing and only
> > inducing *more* overhead as the code decides to down_read the mmap
> > semaphore.
> >
> > Furthermore there may be work currently synchronized with down_write
> > which perhaps can transition to "merely" down_read, but by the time it
> > happens this and possibly other consumers expect a change in the
> > sequence counter, messing with it.
> >
> > To my understanding the kernel supports parallel faults with per-vma
> > locking. I would find it surprising if the same machinery could not be
> > used to sort out uprobe handling above.

From all the above, my understanding of your objection is that
checking mmap_lock during our speculation is too coarse-grained and
you would prefer to use the VMA seq counter to check that the VMA we
are working on is unchanged. I agree, that would be ideal. I had a
quick chat with Jann about this and the conclusion we came to is that
we would need to add an additional smp_wmb() barrier inside
vma_start_write() and a smp_rmb() in the speculation code:

static inline void vma_start_write(struct vm_area_struct *vma)
{
        int mm_lock_seq;

        if (__is_vma_write_locked(vma, &mm_lock_seq))
                return;

        down_write(&vma->vm_lock->lock);
        /*
         * We should use WRITE_ONCE() here because we can have concurrent reads
         * from the early lockless pessimistic check in vma_start_read().
         * We don't really care about the correctness of that early check, but
         * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
         */
        WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
+        smp_wmb();
        up_write(&vma->vm_lock->lock);
}

Note: up_write(&vma->vm_lock->lock) in the vma_start_write() is not
enough because it's one-way permeable (it's a "RELEASE operation") and
later vma->vm_file store (or any other VMA modification) can move
before our vma->vm_lock_seq store.

This makes vma_start_write() heavier but again, it's write-locking, so
should not be considered a fast path.
With this change we can use the code suggested by Andrii in
https://lore.kernel.org/all/CAEf4BzZeLg0WsYw2M7KFy0+APrPaPVBY7FbawB9vjcA2+6k69Q@mail.gmail.com/
with an additional smp_rmb():

rcu_read_lock()
vma = find_vma(...)
if (!vma) /* bail */

vm_lock_seq = smp_load_acquire(&vma->vm_lock_seq);
mm_lock_seq = smp_load_acquire(&vma->mm->mm_lock_seq);
/* I think vm_lock has to be acquired first to avoid the race */
if (mm_lock_seq == vm_lock_seq)
        /* bail, vma is write-locked */
... perform uprobe lookup logic based on vma->vm_file->f_inode ...
smp_rmb();
if (vma->vm_lock_seq != vm_lock_seq)
        /* bail, VMA might have changed */

The smp_rmb() is needed so that vma->vm_lock_seq load does not get
reordered and moved up before speculation.

I'm CC'ing Jann since he understands memory barriers way better than
me and will keep me honest.


>
> per-vma locking is still *locking*. Which means memory sharing between
> multiple CPUs, which means limited scalability. Lots of work in this
> series went to avoid even refcounting (as I pointed out for
> find_uprobe_rcu()) due to the same reason, and so relying on per-VMA
> locking is just shifting the bottleneck from mmap_lock to
> vma->vm_lock. Worst (and not uncommon) case is the same uprobe in the
> same process (and thus vma) being hit on multiple CPUs at the same
> time. Whether that's protected by mmap_lock or vma->vm_lock is
> immaterial at that point (from scalability standpoint).
>
> >
> > I presume a down_read on vma around all the work would also sort out any
> > issues concerning stability of the file or inode objects.
> >
> > Of course single-threaded performance would take a hit due to atomic
> > stemming from down/up_read and parallel uprobe lookups on the same vma
> > would also get slower, but I don't know if that's a problem for a real
> > workload.
> >
> > I would not have any comments if all speed ups were achieved without
> > modifying non-uprobe code.
>
> I'm also not a mm-focused person, so I'll let Suren and others address
> mm-specific concerns, but I (hopefully) addressed all the
> uprobe-related questions and concerns you had.
Mateusz Guzik Aug. 15, 2024, 6:24 p.m. UTC | #6
On Thu, Aug 15, 2024 at 10:45:45AM -0700, Suren Baghdasaryan wrote:
> >From all the above, my understanding of your objection is that
> checking mmap_lock during our speculation is too coarse-grained and
> you would prefer to use the VMA seq counter to check that the VMA we
> are working on is unchanged. I agree, that would be ideal. I had a
> quick chat with Jann about this and the conclusion we came to is that
> we would need to add an additional smp_wmb() barrier inside
> vma_start_write() and a smp_rmb() in the speculation code:
> 
> static inline void vma_start_write(struct vm_area_struct *vma)
> {
>         int mm_lock_seq;
> 
>         if (__is_vma_write_locked(vma, &mm_lock_seq))
>                 return;
> 
>         down_write(&vma->vm_lock->lock);
>         /*
>          * We should use WRITE_ONCE() here because we can have concurrent reads
>          * from the early lockless pessimistic check in vma_start_read().
>          * We don't really care about the correctness of that early check, but
>          * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
>          */
>         WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> +        smp_wmb();
>         up_write(&vma->vm_lock->lock);
> }
> 
> Note: up_write(&vma->vm_lock->lock) in the vma_start_write() is not
> enough because it's one-way permeable (it's a "RELEASE operation") and
> later vma->vm_file store (or any other VMA modification) can move
> before our vma->vm_lock_seq store.
> 
> This makes vma_start_write() heavier but again, it's write-locking, so
> should not be considered a fast path.
> With this change we can use the code suggested by Andrii in
> https://lore.kernel.org/all/CAEf4BzZeLg0WsYw2M7KFy0+APrPaPVBY7FbawB9vjcA2+6k69Q@mail.gmail.com/
> with an additional smp_rmb():
> 
> rcu_read_lock()
> vma = find_vma(...)
> if (!vma) /* bail */
> 
> vm_lock_seq = smp_load_acquire(&vma->vm_lock_seq);
> mm_lock_seq = smp_load_acquire(&vma->mm->mm_lock_seq);
> /* I think vm_lock has to be acquired first to avoid the race */
> if (mm_lock_seq == vm_lock_seq)
>         /* bail, vma is write-locked */
> ... perform uprobe lookup logic based on vma->vm_file->f_inode ...
> smp_rmb();
> if (vma->vm_lock_seq != vm_lock_seq)
>         /* bail, VMA might have changed */
> 
> The smp_rmb() is needed so that vma->vm_lock_seq load does not get
> reordered and moved up before speculation.
> 
> I'm CC'ing Jann since he understands memory barriers way better than
> me and will keep me honest.
> 

So I briefly noted that maybe down_read on the vma would do it, but per
Andrii parallel lookups on the same vma on multiple CPUs are expected,
which whacks that out.

When I initially mentioned per-vma sequence counters I blindly assumed
they worked the usual way. I don't believe any fancy rework here is
warranted especially given that the per-mm counter thing is expected to
have other uses.

However, chances are decent this can still be worked out with per-vma
granularity, all while avoiding any stores on lookup and without
invasive (or complicated) changes. The lockless uprobe code claims to
guarantee only false negatives, and a miss always falls back to the
mmap semaphore lookup. There may be something here; I'm going to chew
on it.

That said, thank you both for writeup so far.
Jann Horn Aug. 15, 2024, 6:58 p.m. UTC | #7
+brauner for "struct file" lifetime

On Thu, Aug 15, 2024 at 7:45 PM Suren Baghdasaryan <surenb@google.com> wrote:
> On Thu, Aug 15, 2024 at 9:47 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Thu, Aug 15, 2024 at 6:44 AM Mateusz Guzik <mjguzik@gmail.com> wrote:
> > >
> > > On Tue, Aug 13, 2024 at 08:36:03AM -0700, Suren Baghdasaryan wrote:
> > > > On Mon, Aug 12, 2024 at 11:18 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
> > > > >
> > > > > On Mon, Aug 12, 2024 at 09:29:17PM -0700, Andrii Nakryiko wrote:
> > > > > > Now that files_cachep is SLAB_TYPESAFE_BY_RCU, we can safely access
> > > > > > vma->vm_file->f_inode lockless only under rcu_read_lock() protection,
> > > > > > attempting uprobe look up speculatively.

Stupid question: Is this uprobe stuff actually such a hot codepath
that it makes sense to optimize it to be faster than the page fault
path?

(Sidenote: I find it kinda interesting that this is sort of going back
in the direction of the old Speculative Page Faults design.)

> > > > > > We rely on newly added mmap_lock_speculation_{start,end}() helpers to
> > > > > > validate that mm_struct stays intact for entire duration of this
> > > > > > speculation. If not, we fall back to mmap_lock-protected lookup.
> > > > > >
> > > > > > This allows to avoid contention on mmap_lock in absolutely majority of
> > > > > > cases, nicely improving uprobe/uretprobe scalability.
> > > > > >
> > > > >
> > > > > Here I have to admit to being mostly ignorant about the mm, so bear with
> > > > > me. :>
> > > > >
> > > > > I note the result of find_active_uprobe_speculative is immediately stale
> > > > > in face of modifications.
> > > > >
> > > > > The thing I'm after is that the mmap_lock_speculation business adds
> > > > > overhead on archs where a release fence is not a de facto nop and I
> > > > > don't believe the commit message justifies it. Definitely a bummer to
> > > > > add merely it for uprobes. If there are bigger plans concerning it
> > > > > that's a different story of course.
> > > > >
> > > > > With this in mind I have to ask if instead you could perhaps get away
> > > > > with the already present per-vma sequence counter?
> > > >
> > > > per-vma sequence counter does not implement acquire/release logic, it
> > > > relies on vma->vm_lock for synchronization. So if we want to use it,
> > > > we would have to add additional memory barriers here. This is likely
> > > > possible but as I mentioned before we would need to ensure the
> > > > pagefault path does not regress. OTOH mm->mm_lock_seq already halfway
> > > > there (it implements acquire/release logic), we just had to ensure
> > > > mmap_write_lock() increments mm->mm_lock_seq.
> > > >
> > > > So, from the release fence overhead POV I think whether we use
> > > > mm->mm_lock_seq or vma->vm_lock, we would still need a proper fence
> > > > here.
> > > >
> > >
> > > Per my previous e-mail I'm not particularly familiar with mm internals,
> > > so I'm going to handwave a little bit with my $0,03 concerning multicore
> > > in general and if you disagree with it that's your business. For the
> > > time being I have no interest in digging into any of this.
> > >
> > > Before I do, to prevent this thread from being a total waste, here are
> > > some remarks concerning the patch with the assumption that the core idea
> > > lands.
> > >
> > > From the commit message:
> > > > Now that files_cachep is SLAB_TYPESAFE_BY_RCU, we can safely access
> > > > vma->vm_file->f_inode lockless only under rcu_read_lock() protection,
> > > > attempting uprobe look up speculatively.
> > >
> > > Just in case I'll note a nit that this paragraph will need to be removed
> > > since the patch adding the flag is getting dropped.
> >
> > Yep, of course, I'll update all that for the next revision (I'll wait
> > for non-RFC patches to land first before reposting).
> >
> > >
> > > A non-nit which may or may not end up mattering is that the flag (which
> > > *is* set on the filep slab cache) makes things more difficult to
> > > validate. Normal RCU usage guarantees that the object itself wont be
> > > freed as long you follow the rules. However, the SLAB_TYPESAFE_BY_RCU
> > > flag weakens it significantly -- the thing at hand will always be a
> > > 'struct file', but it may get reallocated to *another* file from under
> > > you. Whether this aspect plays a role here I don't know.
> >
> > Yes, that's ok and is accounted for. We care about that memory not
> > going even from under us (I'm not even sure if it matters that it is
> > still a struct file, tbh; I think that shouldn't matter as we are
> > prepared to deal with completely garbage values read from struct
> > file).
>
> Correct, with SLAB_TYPESAFE_BY_RCU we do need an additional check that
> vma->vm_file has not been freed and reused. That's where
> mmap_lock_speculation_{start|end} helps us. For vma->vm_file to change
> from under us one would have to take mmap_lock for write. If that
> happens mmap_lock_speculation_{start|end} should detect that and
> terminate our speculation.
>
> >
> > >
> > > > +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> > > > +{
> > > > +     const vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
> > > > +     struct mm_struct *mm = current->mm;
> > > > +     struct uprobe *uprobe;
> > > > +     struct vm_area_struct *vma;
> > > > +     struct file *vm_file;
> > > > +     struct inode *vm_inode;
> > > > +     unsigned long vm_pgoff, vm_start;
> > > > +     int seq;
> > > > +     loff_t offset;
> > > > +
> > > > +     if (!mmap_lock_speculation_start(mm, &seq))
> > > > +             return NULL;
> > > > +
> > > > +     rcu_read_lock();
> > > > +
> > >
> > > I don't think there is a correctness problem here, but entering rcu
> > > *after* deciding to speculatively do the lookup feels backwards.
> >
> > RCU should protect VMA and file, mm itself won't go anywhere, so this seems ok.
> >
> > >
> > > > +     vma = vma_lookup(mm, bp_vaddr);
> > > > +     if (!vma)
> > > > +             goto bail;
> > > > +
> > > > +     vm_file = data_race(vma->vm_file);
> > > > +     if (!vm_file || (vma->vm_flags & flags) != VM_MAYEXEC)
> > > > +             goto bail;
> > > > +
> > >
> > > If vma teardown is allowed to progress and the file got fput'ed...
> > >
> > > > +     vm_inode = data_race(vm_file->f_inode);
> > >
> > > ... the inode can be NULL, I don't know if that's handled.
> > >
> >
> > Yep, inode pointer value is part of RB-tree key, so if it's NULL, we
> > just won't find a matching uprobe. Same for any other "garbage"
> > f_inode value. Importantly, we never should dereference such inode
> > pointers, at least until we find a valid uprobe (in which case we keep
> > inode reference to it).
> >
> > > More importantly though, per my previous description of
> > > SLAB_TYPESAFE_BY_RCU, by now the file could have been reallocated and
> > > the inode you did find is completely unrelated.
> > >
> > > I understand the intent is to backpedal from everything should the mm
> > > seqc change, but the above may happen to matter.
> >
> > Yes, I think we took that into account. All that we care about is
> > memory "type safety", i.e., even if struct file's memory is reused,
> > it's ok, we'll eventually detect the change and will discard wrong
> > uprobe that we might by accident lookup (though probably in most cases
> > we just won't find a uprobe at all).
> >
> > >
> > > > +     vm_pgoff = data_race(vma->vm_pgoff);
> > > > +     vm_start = data_race(vma->vm_start);
> > > > +
> > > > +     offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > > > +     uprobe = find_uprobe_rcu(vm_inode, offset);
> > > > +     if (!uprobe)
> > > > +             goto bail;
> > > > +
> > > > +     /* now double check that nothing about MM changed */
> > > > +     if (!mmap_lock_speculation_end(mm, seq))
> > > > +             goto bail;
> > >
> > > This leaks the reference obtained by find_uprobe_rcu().
> >
> > find_uprobe_rcu() doesn't obtain a reference, uprobe is RCU-protected,
> > and if caller need a refcount bump it will have to use
> > try_get_uprobe() (which might fail).
> >
> > >
> > > > +
> > > > +     rcu_read_unlock();
> > > > +
> > > > +     /* happy case, we speculated successfully */
> > > > +     return uprobe;
> > > > +bail:
> > > > +     rcu_read_unlock();
> > > > +     return NULL;
> > > > +}
> > >
> > > Now to some handwaving, here it is:
> > >
> > > The core of my concern is that adding more work to down_write on the
> > > mmap semaphore comes with certain side-effects and plausibly more than a
> > > sufficient speed up can be achieved without doing it.
>
> AFAIK writers of mmap_lock are not considered a fast path. In a sense
> yes, we made any writer a bit heavier but OTOH we also made
> mm->mm_lock_seq a proper sequence count which allows us to locklessly
> check if mmap_lock is write-locked. I think you asked whether there
> will be other uses for mmap_lock_speculation_{start|end} and yes. For
> example, I am planning to use them for printing /proc/{pid}/maps
> without taking mmap_lock (when it's uncontended).

What would be the goal of this - to avoid cacheline bouncing of the
mmap lock between readers? Or to allow mmap_write_lock() to preempt
/proc/{pid}/maps readers who started out uncontended?

Is the idea that you'd change show_map_vma() to first do something
like get_file_active() to increment the file refcount (because
otherwise the dentry can be freed under you and you need the dentry
for path printing), then recheck your sequence count on the mm or vma
(to avoid accessing the dentry of an unrelated file that hasn't become
userspace-visible yet and may not have a proper dentry pointer yet),
then print the file path, drop the file reference again, and in the
end recheck the sequence count again before actually returning the
printed data to userspace?

> If we have VMA seq
> counter-based detection it would be better (see below).
>
> > >
> > > An mm-wide mechanism is just incredibly coarse-grained and it may happen
> > > to perform poorly when faced with a program which likes to mess with its
> > > address space -- the fast path is going to keep failing and only
> > > inducing *more* overhead as the code decides to down_read the mmap
> > > semaphore.
> > >
> > > Furthermore there may be work currently synchronized with down_write
> > > which perhaps can transition to "merely" down_read, but by the time it
> > > happens this and possibly other consumers expect a change in the
> > > sequence counter, messing with it.
> > >
> > > To my understanding the kernel supports parallel faults with per-vma
> > > locking. I would find it surprising if the same machinery could not be
> > > used to sort out uprobe handling above.
>
> From all the above, my understanding of your objection is that
> checking mmap_lock during our speculation is too coarse-grained and
> you would prefer to use the VMA seq counter to check that the VMA we
> are working on is unchanged. I agree, that would be ideal. I had a
> quick chat with Jann about this and the conclusion we came to is that
> we would need to add an additional smp_wmb() barrier inside
> vma_start_write() and a smp_rmb() in the speculation code:
>
> static inline void vma_start_write(struct vm_area_struct *vma)
> {
>         int mm_lock_seq;
>
>         if (__is_vma_write_locked(vma, &mm_lock_seq))
>                 return;
>
>         down_write(&vma->vm_lock->lock);
>         /*
>          * We should use WRITE_ONCE() here because we can have concurrent reads
>          * from the early lockless pessimistic check in vma_start_read().
>          * We don't really care about the correctness of that early check, but
>          * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
>          */
>         WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> +        smp_wmb();
>         up_write(&vma->vm_lock->lock);
> }
>
> Note: up_write(&vma->vm_lock->lock) in the vma_start_write() is not
> enough because it's one-way permeable (it's a "RELEASE operation") and
> later vma->vm_file store (or any other VMA modification) can move
> before our vma->vm_lock_seq store.
>
> This makes vma_start_write() heavier but again, it's write-locking, so
> should not be considered a fast path.
> With this change we can use the code suggested by Andrii in
> https://lore.kernel.org/all/CAEf4BzZeLg0WsYw2M7KFy0+APrPaPVBY7FbawB9vjcA2+6k69Q@mail.gmail.com/
> with an additional smp_rmb():
>
> rcu_read_lock()
> vma = find_vma(...)
> if (!vma) /* bail */

And maybe add some comments like:

/*
 * Load the current VMA lock sequence - we will detect if anyone concurrently
 * locks the VMA after this point.
 * Pairs with smp_wmb() in vma_start_write().
 */
> vm_lock_seq = smp_load_acquire(&vma->vm_lock_seq);
/*
 * Now we just have to detect if the VMA is already locked with its current
 * sequence count.
 *
 * The following load is ordered against the vm_lock_seq load above (using
 * smp_load_acquire() for the load above), and pairs with implicit memory
 * ordering between the mm_lock_seq write in mmap_write_unlock() and the
 * vm_lock_seq write in the next vma_start_write() after that (which can only
 * occur after an mmap_write_lock()).
 */
> mm_lock_seq = smp_load_acquire(&vma->mm->mm_lock_seq);
> /* I think vm_lock has to be acquired first to avoid the race */
> if (mm_lock_seq == vm_lock_seq)
>         /* bail, vma is write-locked */
> ... perform uprobe lookup logic based on vma->vm_file->f_inode ...
/*
 * Order the speculative accesses above against the following vm_lock_seq
 * recheck.
 */
> smp_rmb();
> if (vma->vm_lock_seq != vm_lock_seq)

(As I said on the other thread: Since this now relies on
vma->vm_lock_seq not wrapping back to the same value for correctness,
I'd like to see vma->vm_lock_seq being at least an "unsigned long", or
even better, an atomic64_t... though I realize we don't currently do
that for seqlocks either.)

>         /* bail, VMA might have changed */
>
> The smp_rmb() is needed so that vma->vm_lock_seq load does not get
> reordered and moved up before speculation.
>
> I'm CC'ing Jann since he understands memory barriers way better than
> me and will keep me honest.
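
(Putting the sketch and the comments above together, the speculative
section would look roughly like this; illustrative only, assuming the
smp_wmb() proposed for vma_start_write() above, with declarations
omitted:)

rcu_read_lock();
vma = find_vma(...);
if (!vma)
	goto bail;

/*
 * Load the current VMA lock sequence - we will detect if anyone
 * concurrently locks the VMA after this point.
 * Pairs with the smp_wmb() added to vma_start_write().
 */
vm_lock_seq = smp_load_acquire(&vma->vm_lock_seq);
/* Ordered after the vm_lock_seq load by the acquire above. */
mm_lock_seq = smp_load_acquire(&vma->mm->mm_lock_seq);
if (mm_lock_seq == vm_lock_seq)
	goto bail;	/* vma is write-locked */

/* ... perform uprobe lookup logic based on vma->vm_file->f_inode ... */

/* Order the speculative accesses above against the recheck below. */
smp_rmb();
if (vma->vm_lock_seq != vm_lock_seq)
	goto bail;	/* VMA might have changed */
rcu_read_unlock();
/* success */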
Mateusz Guzik Aug. 15, 2024, 7:07 p.m. UTC | #8
On Thu, Aug 15, 2024 at 8:58 PM Jann Horn <jannh@google.com> wrote:
> Stupid question: Is this uprobe stuff actually such a hot codepath
> that it makes sense to optimize it to be faster than the page fault
> path?
>

That's what I implicitly asked, hoping a down_read on the vma would do
it, but Andrii claims multiple parallel lookups on the same vma are a
problem.

Even so, I suspect something *simple* is doable here which avoids any
writes to vmas and does not need the mm-wide sequence counter. It may
be that the requirements are lax enough that merely observing that some
state is the same before and after the uprobe lookup is sufficient, or
maybe some other hackery is viable without adding fences to
vma_start_write().
Suren Baghdasaryan Aug. 15, 2024, 7:44 p.m. UTC | #9
On Thu, Aug 15, 2024 at 11:58 AM Jann Horn <jannh@google.com> wrote:
>
> +brauner for "struct file" lifetime
>
> On Thu, Aug 15, 2024 at 7:45 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > On Thu, Aug 15, 2024 at 9:47 AM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Thu, Aug 15, 2024 at 6:44 AM Mateusz Guzik <mjguzik@gmail.com> wrote:
> > > >
> > > > On Tue, Aug 13, 2024 at 08:36:03AM -0700, Suren Baghdasaryan wrote:
> > > > > On Mon, Aug 12, 2024 at 11:18 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
> > > > > >
> > > > > > On Mon, Aug 12, 2024 at 09:29:17PM -0700, Andrii Nakryiko wrote:
> > > > > > > Now that files_cachep is SLAB_TYPESAFE_BY_RCU, we can safely access
> > > > > > > vma->vm_file->f_inode lockless only under rcu_read_lock() protection,
> > > > > > > attempting uprobe look up speculatively.
>
> Stupid question: Is this uprobe stuff actually such a hot codepath
> that it makes sense to optimize it to be faster than the page fault
> path?
>
> (Sidenote: I find it kinda interesting that this is sort of going back
> in the direction of the old Speculative Page Faults design.)
>
> > > > > > > We rely on newly added mmap_lock_speculation_{start,end}() helpers to
> > > > > > > validate that mm_struct stays intact for entire duration of this
> > > > > > > speculation. If not, we fall back to mmap_lock-protected lookup.
> > > > > > >
> > > > > > > This allows to avoid contention on mmap_lock in absolutely majority of
> > > > > > > cases, nicely improving uprobe/uretprobe scalability.
> > > > > > >
> > > > > >
> > > > > > Here I have to admit to being mostly ignorant about the mm, so bear with
> > > > > > me. :>
> > > > > >
> > > > > > I note the result of find_active_uprobe_speculative is immediately stale
> > > > > > in face of modifications.
> > > > > >
> > > > > > The thing I'm after is that the mmap_lock_speculation business adds
> > > > > > overhead on archs where a release fence is not a de facto nop and I
> > > > > > don't believe the commit message justifies it. Definitely a bummer to
> > > > > > add merely it for uprobes. If there are bigger plans concerning it
> > > > > > that's a different story of course.
> > > > > >
> > > > > > With this in mind I have to ask if instead you could perhaps get away
> > > > > > with the already present per-vma sequence counter?
> > > > >
> > > > > per-vma sequence counter does not implement acquire/release logic, it
> > > > > relies on vma->vm_lock for synchronization. So if we want to use it,
> > > > > we would have to add additional memory barriers here. This is likely
> > > > > possible but as I mentioned before we would need to ensure the
> > > > > pagefault path does not regress. OTOH mm->mm_lock_seq already halfway
> > > > > there (it implements acquire/release logic), we just had to ensure
> > > > > mmap_write_lock() increments mm->mm_lock_seq.
> > > > >
> > > > > So, from the release fence overhead POV I think whether we use
> > > > > mm->mm_lock_seq or vma->vm_lock, we would still need a proper fence
> > > > > here.
> > > > >
> > > >
> > > > Per my previous e-mail I'm not particularly familiar with mm internals,
> > > > so I'm going to handwave a little bit with my $0,03 concerning multicore
> > > > in general and if you disagree with it that's your business. For the
> > > > time being I have no interest in digging into any of this.
> > > >
> > > > Before I do, to prevent this thread from being a total waste, here are
> > > > some remarks concerning the patch with the assumption that the core idea
> > > > lands.
> > > >
> > > > From the commit message:
> > > > > Now that files_cachep is SLAB_TYPESAFE_BY_RCU, we can safely access
> > > > > vma->vm_file->f_inode lockless only under rcu_read_lock() protection,
> > > > > attempting uprobe look up speculatively.
> > > >
> > > > Just in case I'll note a nit that this paragraph will need to be removed
> > > > since the patch adding the flag is getting dropped.
> > >
> > > Yep, of course, I'll update all that for the next revision (I'll wait
> > > for non-RFC patches to land first before reposting).
> > >
> > > >
> > > > A non-nit which may or may not end up mattering is that the flag (which
> > > > *is* set on the filep slab cache) makes things more difficult to
> > > > validate. Normal RCU usage guarantees that the object itself wont be
> > > > freed as long you follow the rules. However, the SLAB_TYPESAFE_BY_RCU
> > > > flag weakens it significantly -- the thing at hand will always be a
> > > > 'struct file', but it may get reallocated to *another* file from under
> > > > you. Whether this aspect plays a role here I don't know.
> > >
> > > Yes, that's ok and is accounted for. We care about that memory not
> > > going even from under us (I'm not even sure if it matters that it is
> > > still a struct file, tbh; I think that shouldn't matter as we are
> > > prepared to deal with completely garbage values read from struct
> > > file).
> >
> > Correct, with SLAB_TYPESAFE_BY_RCU we do need an additional check that
> > vma->vm_file has not been freed and reused. That's where
> > mmap_lock_speculation_{start|end} helps us. For vma->vm_file to change
> > from under us one would have to take mmap_lock for write. If that
> > happens mmap_lock_speculation_{start|end} should detect that and
> > terminate our speculation.
> >
> > >
> > > >
> > > > > +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> > > > > +{
> > > > > +     const vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
> > > > > +     struct mm_struct *mm = current->mm;
> > > > > +     struct uprobe *uprobe;
> > > > > +     struct vm_area_struct *vma;
> > > > > +     struct file *vm_file;
> > > > > +     struct inode *vm_inode;
> > > > > +     unsigned long vm_pgoff, vm_start;
> > > > > +     int seq;
> > > > > +     loff_t offset;
> > > > > +
> > > > > +     if (!mmap_lock_speculation_start(mm, &seq))
> > > > > +             return NULL;
> > > > > +
> > > > > +     rcu_read_lock();
> > > > > +
> > > >
> > > > I don't think there is a correctness problem here, but entering rcu
> > > > *after* deciding to speculatively do the lookup feels backwards.
> > >
> > > RCU should protect VMA and file, mm itself won't go anywhere, so this seems ok.
> > >
> > > >
> > > > > +     vma = vma_lookup(mm, bp_vaddr);
> > > > > +     if (!vma)
> > > > > +             goto bail;
> > > > > +
> > > > > +     vm_file = data_race(vma->vm_file);
> > > > > +     if (!vm_file || (vma->vm_flags & flags) != VM_MAYEXEC)
> > > > > +             goto bail;
> > > > > +
> > > >
> > > > If vma teardown is allowed to progress and the file got fput'ed...
> > > >
> > > > > +     vm_inode = data_race(vm_file->f_inode);
> > > >
> > > > ... the inode can be NULL, I don't know if that's handled.
> > > >
> > >
> > > Yep, inode pointer value is part of RB-tree key, so if it's NULL, we
> > > just won't find a matching uprobe. Same for any other "garbage"
> > > f_inode value. Importantly, we never should dereference such inode
> > > pointers, at least until we find a valid uprobe (in which case we keep
> > > inode reference to it).
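
(To illustrate that point: a lookup keyed purely on the raw (inode
pointer, offset) pair never dereferences the inode, so a NULL or reused
pointer simply fails to compare equal. This is a hedged sketch, not the
actual uprobes tree code; names are assumptions.)

static int uprobe_key_cmp(const struct inode *inode, loff_t offset,
			  const struct uprobe *u)
{
	/* Compare the inode strictly by pointer value, never dereference it. */
	if (inode != u->inode)
		return inode < u->inode ? -1 : 1;
	if (offset != u->offset)
		return offset < u->offset ? -1 : 1;
	return 0;
}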
> > >
> > > > More importantly though, per my previous description of
> > > > SLAB_TYPESAFE_BY_RCU, by now the file could have been reallocated and
> > > > the inode you did find is completely unrelated.
> > > >
> > > > I understand the intent is to backpedal from everything should the mm
> > > > seqc change, but the above may happen to matter.
> > >
> > > Yes, I think we took that into account. All that we care about is
> > > memory "type safety", i.e., even if struct file's memory is reused,
> > > it's ok, we'll eventually detect the change and will discard wrong
> > > uprobe that we might by accident lookup (though probably in most cases
> > > we just won't find a uprobe at all).
> > >
> > > >
> > > > > +     vm_pgoff = data_race(vma->vm_pgoff);
> > > > > +     vm_start = data_race(vma->vm_start);
> > > > > +
> > > > > +     offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
> > > > > +     uprobe = find_uprobe_rcu(vm_inode, offset);
> > > > > +     if (!uprobe)
> > > > > +             goto bail;
> > > > > +
> > > > > +     /* now double check that nothing about MM changed */
> > > > > +     if (!mmap_lock_speculation_end(mm, seq))
> > > > > +             goto bail;
> > > >
> > > > This leaks the reference obtained by find_uprobe_rcu().
> > >
> > > find_uprobe_rcu() doesn't obtain a reference, uprobe is RCU-protected,
> > > and if caller need a refcount bump it will have to use
> > > try_get_uprobe() (which might fail).
> > >
> > > >
> > > > > +
> > > > > +     rcu_read_unlock();
> > > > > +
> > > > > +     /* happy case, we speculated successfully */
> > > > > +     return uprobe;
> > > > > +bail:
> > > > > +     rcu_read_unlock();
> > > > > +     return NULL;
> > > > > +}
> > > >
> > > > Now to some handwaving, here it is:
> > > >
> > > > The core of my concern is that adding more work to down_write on the
> > > > mmap semaphore comes with certain side-effects and plausibly more than a
> > > > sufficient speed up can be achieved without doing it.
> >
> > AFAIK writers of mmap_lock are not considered a fast path. In a sense
> > yes, we made any writer a bit heavier but OTOH we also made
> > mm->mm_lock_seq a proper sequence count which allows us to locklessly
> > check if mmap_lock is write-locked. I think you asked whether there
> > will be other uses for mmap_lock_speculation_{start|end} and yes. For
> > example, I am planning to use them for printing /proc/{pid}/maps
> > without taking mmap_lock (when it's uncontended).
>
> What would be the goal of this - to avoid cacheline bouncing of the
> mmap lock between readers? Or to allow mmap_write_lock() to preempt
> /proc/{pid}/maps readers who started out uncontended?

The latter, from my earlier patchset, which I need to refine
(https://lore.kernel.org/all/20240123231014.3801041-3-surenb@google.com/):

This change is designed to reduce mmap_lock contention and prevent a
process reading /proc/pid/maps files (often a low priority task, such as
monitoring/data collection services) from blocking address space updates.

>
> Is the idea that you'd change show_map_vma() to first do something
> like get_file_active() to increment the file refcount (because
> otherwise the dentry can be freed under you and you need the dentry
> for path printing), then recheck your sequence count on the mm or vma
> (to avoid accessing the dentry of an unrelated file that hasn't become
> userspace-visible yet and may not have a proper dentry pointer yet),
> then print the file path, drop the file reference again, and in the
> end recheck the sequence count again before actually returning the
> printed data to userspace?

Yeah, you can see the details in the link I posted above; see the
get_vma_snapshot() function.

>
> > If we have VMA seq
> > counter-based detection it would be better (see below).
> >
> > > >
> > > > An mm-wide mechanism is just incredibly coarse-grained and it may happen
> > > > to perform poorly when faced with a program which likes to mess with its
> > > > address space -- the fast path is going to keep failing and only
> > > > inducing *more* overhead as the code decides to down_read the mmap
> > > > semaphore.
> > > >
> > > > Furthermore there may be work currently synchronized with down_write
> > > > which perhaps can transition to "merely" down_read, but by the time it
> > > > happens this and possibly other consumers expect a change in the
> > > > sequence counter, messing with it.
> > > >
> > > > To my understanding the kernel supports parallel faults with per-vma
> > > > locking. I would find it surprising if the same machinery could not be
> > > > used to sort out uprobe handling above.
> >
> > From all the above, my understanding of your objection is that
> > checking mmap_lock during our speculation is too coarse-grained and
> > you would prefer to use the VMA seq counter to check that the VMA we
> > are working on is unchanged. I agree, that would be ideal. I had a
> > quick chat with Jann about this and the conclusion we came to is that
> > we would need to add an additional smp_wmb() barrier inside
> > vma_start_write() and a smp_rmb() in the speculation code:
> >
> > static inline void vma_start_write(struct vm_area_struct *vma)
> > {
> >         int mm_lock_seq;
> >
> >         if (__is_vma_write_locked(vma, &mm_lock_seq))
> >                 return;
> >
> >         down_write(&vma->vm_lock->lock);
> >         /*
> >          * We should use WRITE_ONCE() here because we can have concurrent reads
> >          * from the early lockless pessimistic check in vma_start_read().
> >          * We don't really care about the correctness of that early check, but
> >          * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
> >          */
> >         WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> > +        smp_wmb();
> >         up_write(&vma->vm_lock->lock);
> > }
> >
> > Note: up_write(&vma->vm_lock->lock) in the vma_start_write() is not
> > enough because it's one-way permeable (it's a "RELEASE operation") and
> > later vma->vm_file store (or any other VMA modification) can move
> > before our vma->vm_lock_seq store.
> >
> > This makes vma_start_write() heavier but again, it's write-locking, so
> > should not be considered a fast path.
> > With this change we can use the code suggested by Andrii in
> > https://lore.kernel.org/all/CAEf4BzZeLg0WsYw2M7KFy0+APrPaPVBY7FbawB9vjcA2+6k69Q@mail.gmail.com/
> > with an additional smp_rmb():
> >
> > rcu_read_lock()
> > vma = find_vma(...)
> > if (!vma) /* bail */
>
> And maybe add some comments like:
>
> /*
>  * Load the current VMA lock sequence - we will detect if anyone concurrently
>  * locks the VMA after this point.
>  * Pairs with smp_wmb() in vma_start_write().
>  */
> > vm_lock_seq = smp_load_acquire(&vma->vm_lock_seq);
> /*
>  * Now we just have to detect if the VMA is already locked with its current
>  * sequence count.
>  *
>  * The following load is ordered against the vm_lock_seq load above (using
>  * smp_load_acquire() for the load above), and pairs with implicit memory
>  * ordering between the mm_lock_seq write in mmap_write_unlock() and the
>  * vm_lock_seq write in the next vma_start_write() after that (which can only
>  * occur after an mmap_write_lock()).
>  */
> > mm_lock_seq = smp_load_acquire(&vma->mm->mm_lock_seq);
> > /* I think vm_lock has to be acquired first to avoid the race */
> > if (mm_lock_seq == vm_lock_seq)
> >         /* bail, vma is write-locked */
> > ... perform uprobe lookup logic based on vma->vm_file->f_inode ...
> /*
>  * Order the speculative accesses above against the following vm_lock_seq
>  * recheck.
>  */
> > smp_rmb();
> > if (vma->vm_lock_seq != vm_lock_seq)
>
> (As I said on the other thread: Since this now relies on
> vma->vm_lock_seq not wrapping back to the same value for correctness,
> I'd like to see vma->vm_lock_seq being at least an "unsigned long", or
> even better, an atomic64_t... though I realize we don't currently do
> that for seqlocks either.)
>
> >         /* bail, VMA might have changed */
> >
> > The smp_rmb() is needed so that vma->vm_lock_seq load does not get
> > reordered and moved up before speculation.
> >
> > I'm CC'ing Jann since he understands memory barriers way better than
> > me and will keep me honest.
Andrii Nakryiko Aug. 15, 2024, 8:17 p.m. UTC | #10
On Thu, Aug 15, 2024 at 11:58 AM Jann Horn <jannh@google.com> wrote:
>
> +brauner for "struct file" lifetime
>
> On Thu, Aug 15, 2024 at 7:45 PM Suren Baghdasaryan <surenb@google.com> wrote:
> > On Thu, Aug 15, 2024 at 9:47 AM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Thu, Aug 15, 2024 at 6:44 AM Mateusz Guzik <mjguzik@gmail.com> wrote:
> > > >
> > > > On Tue, Aug 13, 2024 at 08:36:03AM -0700, Suren Baghdasaryan wrote:
> > > > > On Mon, Aug 12, 2024 at 11:18 PM Mateusz Guzik <mjguzik@gmail.com> wrote:
> > > > > >
> > > > > > On Mon, Aug 12, 2024 at 09:29:17PM -0700, Andrii Nakryiko wrote:
> > > > > > > Now that files_cachep is SLAB_TYPESAFE_BY_RCU, we can safely access
> > > > > > > vma->vm_file->f_inode lockless only under rcu_read_lock() protection,
> > > > > > > attempting uprobe look up speculatively.
>
> Stupid question: Is this uprobe stuff actually such a hot codepath
> that it makes sense to optimize it to be faster than the page fault
> path?

Not a stupid question, but yes: generally speaking, uprobe performance
is critical for a bunch of tracing use cases. Having independent
threads implicitly contend with each other purely because of an
internal uprobe implementation detail (conceptually there should be no
dependency between triggering uprobes from multiple parallel threads)
is a big surprise to users, and it affects production use cases beyond
just the uprobe-handling BPF logic overhead ("useful overhead") that
they expect.

>
> (Sidenote: I find it kinda interesting that this is sort of going back
> in the direction of the old Speculative Page Faults design.)
>
> > > > > > > We rely on newly added mmap_lock_speculation_{start,end}() helpers to
> > > > > > > validate that mm_struct stays intact for entire duration of this
> > > > > > > speculation. If not, we fall back to mmap_lock-protected lookup.
> > > > > > >
> > > > > > > This allows to avoid contention on mmap_lock in absolutely majority of
> > > > > > > cases, nicely improving uprobe/uretprobe scalability.
> > > > > > >
> > > > > >

[...]

> > Note: up_write(&vma->vm_lock->lock) in the vma_start_write() is not
> > enough because it's one-way permeable (it's a "RELEASE operation") and
> > later vma->vm_file store (or any other VMA modification) can move
> > before our vma->vm_lock_seq store.
> >
> > This makes vma_start_write() heavier but again, it's write-locking, so
> > should not be considered a fast path.
> > With this change we can use the code suggested by Andrii in
> > https://lore.kernel.org/all/CAEf4BzZeLg0WsYw2M7KFy0+APrPaPVBY7FbawB9vjcA2+6k69Q@mail.gmail.com/
> > with an additional smp_rmb():
> >
> > rcu_read_lock()
> > vma = find_vma(...)
> > if (!vma) /* bail */
>
> And maybe add some comments like:
>
> /*
>  * Load the current VMA lock sequence - we will detect if anyone concurrently
>  * locks the VMA after this point.
>  * Pairs with smp_wmb() in vma_start_write().
>  */
> > vm_lock_seq = smp_load_acquire(&vma->vm_lock_seq);
> /*
>  * Now we just have to detect if the VMA is already locked with its current
>  * sequence count.
>  *
>  * The following load is ordered against the vm_lock_seq load above (using
>  * smp_load_acquire() for the load above), and pairs with implicit memory
>  * ordering between the mm_lock_seq write in mmap_write_unlock() and the
>  * vm_lock_seq write in the next vma_start_write() after that (which can only
>  * occur after an mmap_write_lock()).
>  */
> > mm_lock_seq = smp_load_acquire(&vma->mm->mm_lock_seq);
> > /* I think vm_lock has to be acquired first to avoid the race */
> > if (mm_lock_seq == vm_lock_seq)
> >         /* bail, vma is write-locked */
> > ... perform uprobe lookup logic based on vma->vm_file->f_inode ...
> /*
>  * Order the speculative accesses above against the following vm_lock_seq
>  * recheck.
>  */
> > smp_rmb();
> > if (vma->vm_lock_seq != vm_lock_seq)
>

thanks, will incorporate these comments into the next revision

> (As I said on the other thread: Since this now relies on
> vma->vm_lock_seq not wrapping back to the same value for correctness,
> I'd like to see vma->vm_lock_seq being at least an "unsigned long", or
> even better, an atomic64_t... though I realize we don't currently do
> that for seqlocks either.)
>
> >         /* bail, VMA might have changed */
> >
> > The smp_rmb() is needed so that vma->vm_lock_seq load does not get
> > reordered and moved up before speculation.
> >
> > I'm CC'ing Jann since he understands memory barriers way better than
> > me and will keep me honest.
diff mbox series

Patch

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 713824c8ca77..12f3edf2ffb1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2286,6 +2286,53 @@  static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
 	return is_trap_insn(&opcode);
 }
 
+static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
+{
+	const vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
+	struct mm_struct *mm = current->mm;
+	struct uprobe *uprobe;
+	struct vm_area_struct *vma;
+	struct file *vm_file;
+	struct inode *vm_inode;
+	unsigned long vm_pgoff, vm_start;
+	int seq;
+	loff_t offset;
+
+	if (!mmap_lock_speculation_start(mm, &seq))
+		return NULL;
+
+	rcu_read_lock();
+
+	vma = vma_lookup(mm, bp_vaddr);
+	if (!vma)
+		goto bail;
+
+	vm_file = data_race(vma->vm_file);
+	if (!vm_file || (vma->vm_flags & flags) != VM_MAYEXEC)
+		goto bail;
+
+	vm_inode = data_race(vm_file->f_inode);
+	vm_pgoff = data_race(vma->vm_pgoff);
+	vm_start = data_race(vma->vm_start);
+
+	offset = (loff_t)(vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vm_start);
+	uprobe = find_uprobe_rcu(vm_inode, offset);
+	if (!uprobe)
+		goto bail;
+
+	/* now double check that nothing about MM changed */
+	if (!mmap_lock_speculation_end(mm, seq))
+		goto bail;
+
+	rcu_read_unlock();
+
+	/* happy case, we speculated successfully */
+	return uprobe;
+bail:
+	rcu_read_unlock();
+	return NULL;
+}
+
 /* assumes being inside RCU protected region */
 static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
 {
@@ -2293,6 +2340,10 @@  static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
 	struct uprobe *uprobe = NULL;
 	struct vm_area_struct *vma;
 
+	uprobe = find_active_uprobe_speculative(bp_vaddr);
+	if (uprobe)
+		return uprobe;
+
 	mmap_read_lock(mm);
 	vma = vma_lookup(mm, bp_vaddr);
 	if (vma) {