x86/alternatives: remove false sharing in poke_int3_handler()

Message ID 20250323072511.2353342-1-edumazet@google.com (mailing list archive)
State Superseded
Series x86/alternatives: remove false sharing in poke_int3_handler()

Checks

netdev/tree_selection: success (Not a local patch)

Commit Message

Eric Dumazet March 23, 2025, 7:25 a.m. UTC
eBPF programs can be run 20,000,000+ times per second on busy servers.

Whenever /proc/sys/kernel/bpf_stats_enabled is turned off,
hundreds of call sites are patched from text_poke_bp_batch()
and we see a critical loss of performance due to false sharing
on bp_desc.refs lasting up to three seconds.

   51.30%  server_bin       [kernel.kallsyms]           [k] poke_int3_handler
            |
            |--46.45%--poke_int3_handler
            |          exc_int3
            |          asm_exc_int3
            |          |
            |          |--24.26%--cls_bpf_classify
            |          |          tcf_classify
            |          |          __dev_queue_xmit
            |          |          ip6_finish_output2
            |          |          ip6_output
            |          |          ip6_xmit
            |          |          inet6_csk_xmit
            |          |          __tcp_transmit_skb
            |          |          |
            |          |          |--9.00%--tcp_v6_do_rcv
            |          |          |          tcp_v6_rcv
            |          |          |          ip6_protocol_deliver_rcu
            |          |          |          ip6_rcv_finish
            |          |          |          ipv6_rcv
            |          |          |          __netif_receive_skb
            |          |          |          process_backlog
            |          |          |          __napi_poll
            |          |          |          net_rx_action
            |          |          |          __softirqentry_text_start
            |          |          |          asm_call_sysvec_on_stack
            |          |          |          do_softirq_own_stack

Fix this by replacing bp_desc.refs with a per-cpu bp_refs.
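
For illustration, here is a minimal user-space sketch of the per-CPU refcount
idea (simplified memory ordering, made-up names; the actual kernel patch is at
the end of this page):

/*
 * Illustration only, not the kernel patch: readers take a reference on
 * their own per-CPU counter instead of one shared atomic, so concurrent
 * int3 handlers no longer bounce a single cacheline; the patcher arms
 * every counter and waits for each one to drain.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define NR_CPUS 480

static struct {
	_Alignas(64) atomic_int refs;	/* one counter per CPU, own cacheline */
} bp_refs[NR_CPUS];

static bool try_get(int cpu)		/* reader: inc-not-zero, like try_get_desc() */
{
	int old = atomic_load(&bp_refs[cpu].refs);

	while (old)
		if (atomic_compare_exchange_weak(&bp_refs[cpu].refs, &old, old + 1))
			return true;
	return false;
}

static void put(int cpu)		/* reader: drop the per-CPU reference */
{
	atomic_fetch_sub(&bp_refs[cpu].refs, 1);
}

static void patch_start(void)		/* writer: arm all counters */
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		atomic_store(&bp_refs[cpu].refs, 1);
}

static void patch_finish(void)		/* writer: drop refs and wait for readers */
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (atomic_fetch_sub(&bp_refs[cpu].refs, 1) != 1)
			while (atomic_load(&bp_refs[cpu].refs))
				;	/* spin until that CPU's readers drain */
}

The reader fast path only ever touches its own CPU's counter, so the cost of
the (rare) patch operation moves to the writer, which now walks every per-CPU
counter.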

Before the patch, on a host with 240 cpus (480 threads):

echo 0 >/proc/sys/kernel/bpf_stats_enabled
text_poke_bp_batch(nr_entries=2)
        text_poke_bp_batch+1
        text_poke_finish+27
        arch_jump_label_transform_apply+22
        jump_label_update+98
        __static_key_slow_dec_cpuslocked+64
        static_key_slow_dec+31
        bpf_stats_handler+236
        proc_sys_call_handler+396
        vfs_write+761
        ksys_write+102
        do_syscall_64+107
        entry_SYSCALL_64_after_hwframe+103
Took 324 usec

text_poke_bp_batch(nr_entries=164)
        text_poke_bp_batch+1
        text_poke_finish+27
        arch_jump_label_transform_apply+22
        jump_label_update+98
        __static_key_slow_dec_cpuslocked+64
        static_key_slow_dec+31
        bpf_stats_handler+236
        proc_sys_call_handler+396
        vfs_write+761
        ksys_write+102
        do_syscall_64+107
        entry_SYSCALL_64_after_hwframe+103
Took 2655300 usec

After this patch:

echo 0 >/proc/sys/kernel/bpf_stats_enabled
text_poke_bp_batch(nr_entries=2)
        text_poke_bp_batch+1
        text_poke_finish+27
        arch_jump_label_transform_apply+22
        jump_label_update+98
        __static_key_slow_dec_cpuslocked+64
        static_key_slow_dec+31
        bpf_stats_handler+236
        proc_sys_call_handler+396
        vfs_write+761
        ksys_write+102
        do_syscall_64+107
        entry_SYSCALL_64_after_hwframe+103
Took 519 usec

text_poke_bp_batch(nr_entries=164)
        text_poke_bp_batch+1
        text_poke_finish+27
        arch_jump_label_transform_apply+22
        jump_label_update+98
        __static_key_slow_dec_cpuslocked+64
        static_key_slow_dec+31
        bpf_stats_handler+236
        proc_sys_call_handler+396
        vfs_write+761
        ksys_write+102
        do_syscall_64+107
        entry_SYSCALL_64_after_hwframe+103
Took 702 usec

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 arch/x86/kernel/alternative.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

Comments

Ingo Molnar March 23, 2025, 9:38 p.m. UTC | #1
* Eric Dumazet <edumazet@google.com> wrote:

> eBPF programs can be run 20,000,000+ times per second on busy servers.
> 
> Whenever /proc/sys/kernel/bpf_stats_enabled is turned off,
> hundreds of call sites are patched from text_poke_bp_batch()
> and we see a critical loss of performance due to false sharing
> on bp_desc.refs lasting up to three seconds.

> @@ -2413,8 +2415,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
>  	/*
>  	 * Remove and wait for refs to be zero.
>  	 */
> -	if (!atomic_dec_and_test(&bp_desc.refs))
> -		atomic_cond_read_acquire(&bp_desc.refs, !VAL);
> +	for_each_possible_cpu(i) {
> +		atomic_t *refs = per_cpu_ptr(&bp_refs, i);
> +
> +		if (!atomic_dec_and_test(refs))
> +			atomic_cond_read_acquire(refs, !VAL);
> +	}

So your patch changes text_poke_bp_batch() to busy-spin-wait for 
bp_refs to go to zero on all 480 CPUs.

Your measurement is using /proc/sys/kernel/bpf_stats_enabled on a 
single CPU, right?

What's the adversarial workload here? Spamming bpf_stats_enabled on all 
CPUs in parallel? Or mixing it with some other text_poke_bp_batch() 
user if bpf_stats_enabled serializes access?

Does anything undesirable happen in that case?

Thanks,

	Ingo
Eric Dumazet March 24, 2025, 3:59 a.m. UTC | #2
On Sun, Mar 23, 2025 at 10:38 PM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Eric Dumazet <edumazet@google.com> wrote:
>
> > eBPF programs can be run 20,000,000+ times per second on busy servers.
> >
> > Whenever /proc/sys/kernel/bpf_stats_enabled is turned off,
> > hundreds of call sites are patched from text_poke_bp_batch()
> > and we see a critical loss of performance due to false sharing
> > on bp_desc.refs lasting up to three seconds.
>
> > @@ -2413,8 +2415,12 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
> >       /*
> >        * Remove and wait for refs to be zero.
> >        */
> > -     if (!atomic_dec_and_test(&bp_desc.refs))
> > -             atomic_cond_read_acquire(&bp_desc.refs, !VAL);
> > +     for_each_possible_cpu(i) {
> > +             atomic_t *refs = per_cpu_ptr(&bp_refs, i);
> > +
> > +             if (!atomic_dec_and_test(refs))
> > +                     atomic_cond_read_acquire(refs, !VAL);
> > +     }
>
> So your patch changes text_poke_bp_batch() to busy-spin-wait for
> bp_refs to go to zero on all 480 CPUs.
>
> Your measurement is using /proc/sys/kernel/bpf_stats_enabled on a
> single CPU, right?

Yes, some daemon enables bpf_stats for a short amount of time (one second)
to gather stats on eBPF run time costs. (bpftool prog | grep run_time)

One eBPF selftest can also do this.

tools/testing/selftests/bpf/prog_tests/enable_stats.c


>
> What's the adversarial workload here? Spamming bpf_stats_enabled on all
> CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> user if bpf_stats_enabled serializes access?

The workload is ~480 cpus running various eBPF programs all over the place.

In the perf snippet I added to the changelog, we see an eBPF program
hooked at the xmit path of each packet.

But the fd = bpf_enable_stats(BPF_STATS_RUN_TIME) / .... / close(fd)
sequence only happens from time to time, because of the supposed extra
cost of fetching two extra time stamps.
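
For reference, that enable/sample/disable sequence can be approximated with
libbpf as below (a minimal sketch, not the production daemon; link with -lbpf).
Closing the last fd is what disables stats and triggers the text patching
discussed above:

/* Minimal sketch using libbpf's bpf_enable_stats(). */
#include <bpf/bpf.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = bpf_enable_stats(BPF_STATS_RUN_TIME);

	if (fd < 0) {
		perror("bpf_enable_stats");
		return 1;
	}
	sleep(1);	/* sampling window: run_time_ns / run_cnt accumulate */
	close(fd);	/* dropping the last fd disables stats -> call sites get patched */
	return 0;
}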

BTW, before the patch stats on my test host look like

105: sched_cls  name hn_egress  tag 699fc5eea64144e3  gpl run_time_ns 3009063719 run_cnt 82757845
-> average cost is 36 nsec per call

And after the patch :

105: sched_cls  name hn_egress  tag 699fc5eea64144e3  gpl run_time_ns 1928223019 run_cnt 67682728
 -> average cost is 28 nsec per call
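(i.e. 3009063719 ns / 82757845 calls is about 36 nsec, and
1928223019 ns / 67682728 calls is about 28 nsec.)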

>
> Does anything undesirable happen in that case?

The case of multiple threads trying to flip bpf_stats_enabled is
handled by bpf_stats_enabled_mutex.

Thanks !

>
> Thanks,
>
>         Ingo
Ingo Molnar March 24, 2025, 7:16 a.m. UTC | #3
* Eric Dumazet <edumazet@google.com> wrote:

> > What's the adversarial workload here? Spamming bpf_stats_enabled on all
> > CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> > user if bpf_stats_enabled serializes access?
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

> > Does anything undesirable happen in that case?
> 
> The case of multiple threads trying to flip bpf_stats_enabled is
> handled by bpf_stats_enabled_mutex.

So my suggested workload wasn't adversarial enough due to 
bpf_stats_enabled_mutex: how about some other workload that doesn't 
serialize access to text_poke_bp_batch()?

Thanks,

	Ingo
Eric Dumazet March 24, 2025, 7:47 a.m. UTC | #4
On Mon, Mar 24, 2025 at 8:16 AM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Eric Dumazet <edumazet@google.com> wrote:
>
> > > What's the adversarial workload here? Spamming bpf_stats_enabled on all
> > > CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> > > user if bpf_stats_enabled serializes access?
>             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> > > Does anything undesirable happen in that case?
> >
> > The case of multiple threads trying to flip bpf_stats_enabled is
> > handled by bpf_stats_enabled_mutex.
>
> So my suggested workload wasn't adversarial enough due to
> bpf_stats_enabled_mutex: how about some other workload that doesn't
> serialize access to text_poke_bp_batch()?

Do you have a specific case in mind that I can test on these big platforms ?

text_poke_bp_batch() calls themselves are serialized by text_mutex; it
is not clear what you are looking for.

Thanks.
Eric Dumazet March 24, 2025, 7:53 a.m. UTC | #5
On Mon, Mar 24, 2025 at 8:47 AM Eric Dumazet <edumazet@google.com> wrote:
>
> On Mon, Mar 24, 2025 at 8:16 AM Ingo Molnar <mingo@kernel.org> wrote:
> >
> >
> > * Eric Dumazet <edumazet@google.com> wrote:
> >
> > > > What's the adversarial workload here? Spamming bpf_stats_enabled on all
> > > > CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> > > > user if bpf_stats_enabled serializes access?
> >             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> >
> > > > Does anything undesirable happen in that case?
> > >
> > > The case of multiple threads trying to flip bpf_stats_enabled is
> > > handled by bpf_stats_enabled_mutex.
> >
> > So my suggested workload wasn't adversarial enough due to
> > bpf_stats_enabled_mutex: how about some other workload that doesn't
> > serialize access to text_poke_bp_batch()?
>
> Do you have a specific case in mind that I can test on these big platforms ?
>
> text_poke_bp_batch() calls themselves are serialized by text_mutex; it
> is not clear what you are looking for.


BTW the atomic_cond_read_acquire() part is never called even during my
stress test.

We could add this eventually:

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index d7afbf822c45..5d364e990055 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2418,7 +2418,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
        for_each_possible_cpu(i) {
                atomic_t *refs = per_cpu_ptr(&bp_refs, i);

-               if (!atomic_dec_and_test(refs))
+               if (unlikely(!atomic_dec_and_test(refs)))
                        atomic_cond_read_acquire(refs, !VAL);
        }
 }
Ingo Molnar March 24, 2025, 8:02 a.m. UTC | #6
* Eric Dumazet <edumazet@google.com> wrote:

> Do you have a specific case in mind that I can test on these big 
> platforms ?

No. I was thinking of large-scale kprobes or ftrace patching - but you 
are right that the text_mutex should naturally serialize all the 
write-side code here.

Mind adding your second round of test results to the changelog as well, 
which improved per call overhead from 36 to 28 nsecs?

Thanks,

	Ingo
Ingo Molnar March 24, 2025, 8:04 a.m. UTC | #7
* Eric Dumazet <edumazet@google.com> wrote:

> On Mon, Mar 24, 2025 at 8:47 AM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Mon, Mar 24, 2025 at 8:16 AM Ingo Molnar <mingo@kernel.org> wrote:
> > >
> > >
> > > * Eric Dumazet <edumazet@google.com> wrote:
> > >
> > > > > What's the adversarial workload here? Spamming bpf_stats_enabled on all
> > > > > CPUs in parallel? Or mixing it with some other text_poke_bp_batch()
> > > > > user if bpf_stats_enabled serializes access?
> > >             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
> > >
> > > > > Does anything undesirable happen in that case?
> > > >
> > > > The case of multiple threads trying to flip bpf_stats_enabled is
> > > > handled by bpf_stats_enabled_mutex.
> > >
> > > So my suggested workload wasn't adversarial enough due to
> > > bpf_stats_enabled_mutex: how about some other workload that doesn't
> > > serialize access to text_poke_bp_batch()?
> >
> > Do you have a specific case in mind that I can test on these big platforms ?
> >
> > text_poke_bp_batch() calls themselves are serialized by text_mutex; it
> > is not clear what you are looking for.
> 
> 
> BTW the atomic_cond_read_acquire() part is never called even during my
> stress test.

Yeah, that code threw me off - can it really happen with text_mutex 
serializing all of it?

> @@ -2418,7 +2418,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
>         for_each_possible_cpu(i) {
>                 atomic_t *refs = per_cpu_ptr(&bp_refs, i);
> 
> -               if (!atomic_dec_and_test(refs))
> +               if (unlikely(!atomic_dec_and_test(refs)))
>                         atomic_cond_read_acquire(refs, !VAL);

If it could never happen, then should that condition be a
WARN_ON_ONCE() perhaps?

Thanks,

	Ingo
Eric Dumazet March 24, 2025, 8:20 a.m. UTC | #8
On Mon, Mar 24, 2025 at 9:02 AM Ingo Molnar <mingo@kernel.org> wrote:
>
>
> * Eric Dumazet <edumazet@google.com> wrote:
>
> > Do you have a specific case in mind that I can test on these big
> > platforms ?
>
> No. I was thinking of large-scale kprobes or ftrace patching - but you
> are right that the text_mutex should naturally serialize all the
> write-side code here.
>
> Mind adding your second round of test results to the changelog as well,
> which improved per call overhead from 36 to 28 nsecs?

Sure thing, thanks !

Note the 36 to 28 nsec improvement was measured on a test host, not really
under production stress.

As all of our production runs the old code, I cannot really tell what the
effective change will be once new kernels are rolled out.

When updating a unique, shared atomic_t from 480 cpus (worst case
scenario), we need more than 40000 cycles per operation.

perf stat atomic_bench -T480

The atomic counter is 21904528, total_cycles=2095231571464, 95652 avg cycles per update
[05]                                                    7866 in [32,64[ cycles (53 avg)
[06]                                                    2196 in [64,128[ cycles (81 avg)
[07]                                                    2942 in [128,256[ cycles (202 avg)
[08]                                                    1865 in [256,512[ cycles (383 avg)
[09]                                                    4251 in [512,1024[ cycles (780 avg)
[10]                                                    72248 in [1024,2048[ cycles (1722 avg)
[11] ***                                                438110 in [2048,4096[ cycles (3217 avg)
[12] ***********                                        1703927 in [4096,8192[ cycles (6199 avg)
[13] **************************                         3869889 in [8192,16384[ cycles (12320 avg)
[14] ***************************                        4040952 in [16384,32768[ cycles (25185 avg)
[15] ************************************************** 7261596 in [32768,65536[ cycles (46884 avg)
[16] ******************                                 2688791 in [65536,131072[ cycles (83552 avg)
[17] *                                                  253104 in [131072,262144[ cycles (189642 avg)
[18] **                                                 326075 in [262144,524288[ cycles (349319 avg)
[19] ******                                             901293 in [524288,1048576[ cycles (890724 avg)
[20] **                                                 321711 in [1048576,2097152[ cycles (1205250 avg)
[21]                                                    6616 in [2097152,4194304[ cycles (2436096 avg)

 Performance counter stats for './atomic_bench -T480':

        964,194.88 msec task-clock                #  467.120 CPUs utilized
            13,795      context-switches          #   14.307 M/sec
               480      cpu-migrations            #    0.498 M/sec
             1,605      page-faults               #    1.665 M/sec
 3,182,241,468,867      cycles                    # 3300416.170 GHz
    11,077,646,267      instructions              #    0.00  insn per cycle
     1,711,894,269      branches                  # 1775466.627 M/sec
         3,747,877      branch-misses             #    0.22% of all branches

       2.064128692 seconds time elapsed
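
(atomic_bench is an internal tool; as a rough idea, a hypothetical
shared-counter contention benchmark along these lines, timing each update of a
single shared atomic with rdtsc, could look like the sketch below. Names, the
thread-count option and the output format are assumptions, not the real tool.)

/* Hypothetical sketch of a shared-counter contention micro-benchmark:
 * N threads hammer one atomic and each update is timed with rdtsc.
 * Build with: gcc -O2 -pthread (x86 only, because of __rdtsc()).
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>

#define UPDATES_PER_THREAD 100000

static atomic_long counter;		/* the single contended cacheline */
static atomic_ullong total_cycles;

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < UPDATES_PER_THREAD; i++) {
		unsigned long long t0 = __rdtsc();

		atomic_fetch_add(&counter, 1);
		atomic_fetch_add(&total_cycles, __rdtsc() - t0);
	}
	return NULL;
}

int main(int argc, char **argv)
{
	int nthreads = argc > 1 ? atoi(argv[1]) : 4;
	pthread_t *tids = calloc(nthreads, sizeof(*tids));

	for (int i = 0; i < nthreads; i++)
		pthread_create(&tids[i], NULL, worker, NULL);
	for (int i = 0; i < nthreads; i++)
		pthread_join(tids[i], NULL);

	printf("counter=%ld, %llu avg cycles per update\n",
	       atomic_load(&counter),
	       atomic_load(&total_cycles) / atomic_load(&counter));
	free(tids);
	return 0;
}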


I said the atomic_cond_read_acquire(refs, !VAL) was not called in my tests,
but it is a valid situation, so we should not add a WARN_ON_ONCE().

I will simply add the unlikely()

Thanks.
Peter Zijlstra March 24, 2025, 11:33 a.m. UTC | #9
On Mon, Mar 24, 2025 at 08:53:31AM +0100, Eric Dumazet wrote:

> BTW the atomic_cond_read_acquire() part is never called even during my
> stress test.

Yes, IIRC this is due to text_poke_sync() serializing the state, as that
does a synchronous IPI broadcast, which by necessity requires all
previous INT3 handlers to complete.

You can only hit that case if the INT3 remains after step-3 (IOW you're
actively writing INT3 into the text). This is exceedingly rare.
Ingo Molnar March 25, 2025, 8:41 a.m. UTC | #10
* Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Mar 24, 2025 at 08:53:31AM +0100, Eric Dumazet wrote:
> 
> > BTW the atomic_cond_read_acquire() part is never called even during my
> > stress test.
> 
> Yes, IIRC this is due to text_poke_sync() serializing the state, as that
> does a synchronous IPI broadcast, which by necessity requires all
> previous INT3 handlers to complete.
> 
> You can only hit that case if the INT3 remains after step-3 (IOW you're
> actively writing INT3 into the text). This is exceedingly rare.

Might make sense to add a comment for that.

Also, any strong objections against doing this in the namespace:

  s/bp_/int3_

?

Half of the code already calls it a variant of 'int3', half of it 'bp', 
which I had to think for a couple of seconds goes for breakpoint, not 
base pointer ... ;-)

Might as well standardize on int3_ and call it a day?

Thanks,

	Ingo
Peter Zijlstra March 25, 2025, 10:30 a.m. UTC | #11
On Tue, Mar 25, 2025 at 09:41:10AM +0100, Ingo Molnar wrote:
> 
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Mon, Mar 24, 2025 at 08:53:31AM +0100, Eric Dumazet wrote:
> > 
> > > BTW the atomic_cond_read_acquire() part is never called even during my
> > > stress test.
> > 
> > Yes, IIRC this is due to text_poke_sync() serializing the state, as that
> > does a synchronous IPI broadcast, which by necessity requires all
> > previous INT3 handlers to complete.
> > 
> > You can only hit that case if the INT3 remains after step-3 (IOW you're
> > actively writing INT3 into the text). This is exceedingly rare.
> 
> Might make sense to add a comment for that.

Sure, find below.

> Also, any strong objections against doing this in the namespace:
> 
>   s/bp_/int3_
> 
> ?
> 
> Half of the code already calls it a variant of 'int3', half of it 'bp', 
> which I had to think for a couple of seconds goes for breakpoint, not 
> base pointer ... ;-)

It actually is breakpoint, as in INT3 raises #BP. For complete confusion
the things that are commonly known as debug breakpoints, those things in
DR7, they raise #DB or debug exceptions.

> Might as well standardize on int3_ and call it a day?

Yeah, perhaps. At some point you've got to know that INT3->#BP and
DR7->#DB and it all sorta makes sense, but *shrug* :-)


---
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index bf82c6f7d690..01e94603e767 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2749,6 +2749,13 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 
 	/*
 	 * Remove and wait for refs to be zero.
+	 *
+	 * Notably, if after step-3 above the INT3 got removed, then the
+	 * text_poke_sync() will have serialized against any running INT3
+	 * handlers and the below spin-wait will not happen.
+	 *
+	 * IOW. unless the replacement instruction is INT3, this case goes
+	 * unused.
 	 */
 	if (!atomic_dec_and_test(&bp_desc.refs))
 		atomic_cond_read_acquire(&bp_desc.refs, !VAL);
Ingo Molnar March 25, 2025, 11:26 a.m. UTC | #12
* Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, Mar 25, 2025 at 09:41:10AM +0100, Ingo Molnar wrote:
> > 
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > On Mon, Mar 24, 2025 at 08:53:31AM +0100, Eric Dumazet wrote:
> > > 
> > > > BTW the atomic_cond_read_acquire() part is never called even during my
> > > > stress test.
> > > 
> > > Yes, IIRC this is due to text_poke_sync() serializing the state, as that
> > > does a synchronous IPI broadcast, which by necessity requires all
> > > previous INT3 handlers to complete.
> > > 
> > > You can only hit that case if the INT3 remains after step-3 (IOW you're
> > > actively writing INT3 into the text). This is exceedingly rare.
> > 
> > Might make sense to add a comment for that.
> 
> Sure, find below.
> 
> > Also, any strong objections against doing this in the namespace:
> > 
> >   s/bp_/int3_
> > 
> > ?
> > 
> > Half of the code already calls it a variant of 'int3', half of it 'bp', 
> > which I had to think for a couple of seconds goes for breakpoint, not 
> > base pointer ... ;-)
> 
> It actually is breakpoint, as in INT3 raises #BP. For complete confusion
> the things that are commonly known as debug breakpoints, those things in
> DR7, they raise #DB or debug exceptions.

Yeah, it's a software breakpoint, swbp, that raises the #BP trap.

'bp' is confusingly aliased (in my brain at least) with 'base pointer' 
register naming and assembler syntax: as in bp, ebp, rbp.

So I'd prefer if it was named consistently:

  text_poke_int3_batch()
  text_poke_int3_handler()
  ...

Not the current mishmash of:

  text_poke_bp_batch()
  poke_int3_handler()
  ...

Does this make more sense?

> > Might as well standardize on int3_ and call it a day?
> 
> Yeah, perhaps. At some point you've got to know that INT3->#BP and
> DR7->#DB and it all sorta makes sense, but *shrug* :-)

Yeah, so I do know what #BP is, but what the heck disambiguates the two 
meanings of _bp and why do we have the above jungle of an inconsistent 
namespace? :-)

Picking _int3 would neatly solve all of that.

> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
> index bf82c6f7d690..01e94603e767 100644
> --- a/arch/x86/kernel/alternative.c
> +++ b/arch/x86/kernel/alternative.c
> @@ -2749,6 +2749,13 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
>  
>  	/*
>  	 * Remove and wait for refs to be zero.
> +	 *
> +	 * Notably, if after step-3 above the INT3 got removed, then the
> +	 * text_poke_sync() will have serialized against any running INT3
> +	 * handlers and the below spin-wait will not happen.
> +	 *
> +	 * IOW. unless the replacement instruction is INT3, this case goes
> +	 * unused.
>  	 */
>  	if (!atomic_dec_and_test(&bp_desc.refs))
>  		atomic_cond_read_acquire(&bp_desc.refs, !VAL);

Thanks! I stuck this into tip:x86/alternatives, with your SOB.

	Ingo

Patch

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c71b575bf229..d7afbf822c45 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -2137,28 +2137,29 @@  struct text_poke_loc {
 struct bp_patching_desc {
 	struct text_poke_loc *vec;
 	int nr_entries;
-	atomic_t refs;
 };
 
+static DEFINE_PER_CPU(atomic_t, bp_refs);
+
 static struct bp_patching_desc bp_desc;
 
 static __always_inline
 struct bp_patching_desc *try_get_desc(void)
 {
-	struct bp_patching_desc *desc = &bp_desc;
+	atomic_t *refs = this_cpu_ptr(&bp_refs);
 
-	if (!raw_atomic_inc_not_zero(&desc->refs))
+	if (!raw_atomic_inc_not_zero(refs))
 		return NULL;
 
-	return desc;
+	return &bp_desc;
 }
 
 static __always_inline void put_desc(void)
 {
-	struct bp_patching_desc *desc = &bp_desc;
+	atomic_t *refs = this_cpu_ptr(&bp_refs);
 
 	smp_mb__before_atomic();
-	raw_atomic_dec(&desc->refs);
+	raw_atomic_dec(refs);
 }
 
 static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
@@ -2191,9 +2192,9 @@  noinstr int poke_int3_handler(struct pt_regs *regs)
 	 * Having observed our INT3 instruction, we now must observe
 	 * bp_desc with non-zero refcount:
 	 *
-	 *	bp_desc.refs = 1		INT3
-	 *	WMB				RMB
-	 *	write INT3			if (bp_desc.refs != 0)
+	 *	bp_refs = 1		INT3
+	 *	WMB			RMB
+	 *	write INT3		if (bp_refs != 0)
 	 */
 	smp_rmb();
 
@@ -2299,7 +2300,8 @@  static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 	 * Corresponds to the implicit memory barrier in try_get_desc() to
 	 * ensure reading a non-zero refcount provides up to date bp_desc data.
 	 */
-	atomic_set_release(&bp_desc.refs, 1);
+	for_each_possible_cpu(i)
+		atomic_set_release(per_cpu_ptr(&bp_refs, i), 1);
 
 	/*
 	 * Function tracing can enable thousands of places that need to be
@@ -2413,8 +2415,12 @@  static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 	/*
 	 * Remove and wait for refs to be zero.
 	 */
-	if (!atomic_dec_and_test(&bp_desc.refs))
-		atomic_cond_read_acquire(&bp_desc.refs, !VAL);
+	for_each_possible_cpu(i) {
+		atomic_t *refs = per_cpu_ptr(&bp_refs, i);
+
+		if (!atomic_dec_and_test(refs))
+			atomic_cond_read_acquire(refs, !VAL);
+	}
 }
 
 static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,