diff mbox series

[1/1] mm/vmalloc: Combine all TLB flush operations of KASAN shadow virtual address into one operation

Message ID 20240726165246.31326-1-ahuang12@lenovo.com (mailing list archive)
State New
Headers show
Series [1/1] mm/vmalloc: Combine all TLB flush operations of KASAN shadow virtual address into one operation | expand

Commit Message

Huang Adrian July 26, 2024, 4:52 p.m. UTC
From: Adrian Huang <ahuang12@lenovo.com>

When compiling kernel source 'make -j $(nproc)' with the up-and-running
KASAN-enabled kernel on a 256-core machine, the following soft lockup
is shown:

watchdog: BUG: soft lockup - CPU#28 stuck for 22s! [kworker/28:1:1760]
CPU: 28 PID: 1760 Comm: kworker/28:1 Kdump: loaded Not tainted 6.10.0-rc5 #95
Workqueue: events drain_vmap_area_work
RIP: 0010:smp_call_function_many_cond+0x1d8/0xbb0
Code: 38 c8 7c 08 84 c9 0f 85 49 08 00 00 8b 45 08 a8 01 74 2e 48 89 f1 49 89 f7 48 c1 e9 03 41 83 e7 07 4c 01 e9 41 83 c7 03 f3 90 <0f> b6 01 41 38 c7 7c 08 84 c0 0f 85 d4 06 00 00 8b 45 08 a8 01 75
RSP: 0018:ffffc9000cb3fb60 EFLAGS: 00000202
RAX: 0000000000000011 RBX: ffff8883bc4469c0 RCX: ffffed10776e9949
RDX: 0000000000000002 RSI: ffff8883bb74ca48 RDI: ffffffff8434dc50
RBP: ffff8883bb74ca40 R08: ffff888103585dc0 R09: ffff8884533a1800
R10: 0000000000000004 R11: ffffffffffffffff R12: ffffed1077888d39
R13: dffffc0000000000 R14: ffffed1077888d38 R15: 0000000000000003
FS:  0000000000000000(0000) GS:ffff8883bc400000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00005577b5c8d158 CR3: 0000000004850000 CR4: 0000000000350ef0
Call Trace:
 <IRQ>
 ? watchdog_timer_fn+0x2cd/0x390
 ? __pfx_watchdog_timer_fn+0x10/0x10
 ? __hrtimer_run_queues+0x300/0x6d0
 ? sched_clock_cpu+0x69/0x4e0
 ? __pfx___hrtimer_run_queues+0x10/0x10
 ? srso_return_thunk+0x5/0x5f
 ? ktime_get_update_offsets_now+0x7f/0x2a0
 ? srso_return_thunk+0x5/0x5f
 ? srso_return_thunk+0x5/0x5f
 ? hrtimer_interrupt+0x2ca/0x760
 ? __sysvec_apic_timer_interrupt+0x8c/0x2b0
 ? sysvec_apic_timer_interrupt+0x6a/0x90
 </IRQ>
 <TASK>
 ? asm_sysvec_apic_timer_interrupt+0x16/0x20
 ? smp_call_function_many_cond+0x1d8/0xbb0
 ? __pfx_do_kernel_range_flush+0x10/0x10
 on_each_cpu_cond_mask+0x20/0x40
 flush_tlb_kernel_range+0x19b/0x250
 ? srso_return_thunk+0x5/0x5f
 ? kasan_release_vmalloc+0xa7/0xc0
 purge_vmap_node+0x357/0x820
 ? __pfx_purge_vmap_node+0x10/0x10
 __purge_vmap_area_lazy+0x5b8/0xa10
 drain_vmap_area_work+0x21/0x30
 process_one_work+0x661/0x10b0
 worker_thread+0x844/0x10e0
 ? srso_return_thunk+0x5/0x5f
 ? __kthread_parkme+0x82/0x140
 ? __pfx_worker_thread+0x10/0x10
 kthread+0x2a5/0x370
 ? __pfx_kthread+0x10/0x10
 ret_from_fork+0x30/0x70
 ? __pfx_kthread+0x10/0x10
 ret_from_fork_asm+0x1a/0x30
 </TASK>

Debugging Analysis:

  1. The following ftrace log shows that the lockup CPU spends too much
     time iterating vmap_nodes and flushing TLB when purging vm_area
     structures. (Some info is trimmed).

     kworker: funcgraph_entry:              |  drain_vmap_area_work() {
     kworker: funcgraph_entry:              |   mutex_lock() {
     kworker: funcgraph_entry:  1.092 us    |     __cond_resched();
     kworker: funcgraph_exit:   3.306 us    |   }
     ...                                        ...
     kworker: funcgraph_entry:              |    flush_tlb_kernel_range() {
     ...                                          ...
     kworker: funcgraph_exit: # 7533.649 us |    }
     ...                                         ...
     kworker: funcgraph_entry:  2.344 us    |   mutex_unlock();
     kworker: funcgraph_exit: $ 23871554 us | }

     The drain_vmap_area_work() spends over 23 seconds.

     There are 2805 flush_tlb_kernel_range() calls in the ftrace log.
       * One is called in __purge_vmap_area_lazy().
       * Others are called by purge_vmap_node->kasan_release_vmalloc.
         purge_vmap_node() iteratively releases kasan vmalloc
         allocations and flushes TLB for each vmap_area.
           - [Rough calculation] Each flush_tlb_kernel_range() runs
             about 7.5ms.
               -- 2804 * 7.5ms = 21.03 seconds.
               -- That's why a soft lockup is triggered.

  2. Extending the soft lockup time can work around the issue (For example,
     # echo 60 > /proc/sys/kernel/watchdog_thresh). This confirms the
     above-mentioned speculation: drain_vmap_area_work() spends too much
     time.

If we combine all TLB flush operations of the KASAN shadow virtual
address into one operation in the call path
'purge_vmap_node()->kasan_release_vmalloc()', the running time of
drain_vmap_area_work() can be saved greatly. The idea is from the
flush_tlb_kernel_range() call in __purge_vmap_area_lazy(). And, the
soft lockup won't be triggered.

Here is the test result based on 6.10:

[6.10 wo/ the patch]
  1. ftrace latency profiling (record a trace if the latency > 20s).
     echo 20000000 > /sys/kernel/debug/tracing/tracing_thresh
     echo drain_vmap_area_work > /sys/kernel/debug/tracing/set_graph_function
     echo function_graph > /sys/kernel/debug/tracing/current_tracer
     echo 1 > /sys/kernel/debug/tracing/tracing_on

  2. Run `make -j $(nproc)` to compile the kernel source

  3. Once the soft lockup is reproduced, check the ftrace log:
     cat /sys/kernel/debug/tracing/trace
        # tracer: function_graph
        #
        # CPU  DURATION                  FUNCTION CALLS
        # |     |   |                     |   |   |   |
          76) $ 50412985 us |    } /* __purge_vmap_area_lazy */
          76) $ 50412997 us |  } /* drain_vmap_area_work */
          76) $ 29165911 us |    } /* __purge_vmap_area_lazy */
          76) $ 29165926 us |  } /* drain_vmap_area_work */
          91) $ 53629423 us |    } /* __purge_vmap_area_lazy */
          91) $ 53629434 us |  } /* drain_vmap_area_work */
          91) $ 28121014 us |    } /* __purge_vmap_area_lazy */
          91) $ 28121026 us |  } /* drain_vmap_area_work */

[6.10 w/ the patch]
  1. Repeat step 1-2 in "[6.10 wo/ the patch]"

  2. The soft lockup is not triggered and ftrace log is empty.
     cat /sys/kernel/debug/tracing/trace
     # tracer: function_graph
     #
     # CPU  DURATION                  FUNCTION CALLS
     # |     |   |                     |   |   |   |

  3. Setting 'tracing_thresh' to 10/5 seconds does not get any ftrace
     log.

  4. Setting 'tracing_thresh' to 1 second gets ftrace log.
     cat /sys/kernel/debug/tracing/trace
     # tracer: function_graph
     #
     # CPU  DURATION                  FUNCTION CALLS
     # |     |   |                     |   |   |   |
       23) $ 1074942 us  |    } /* __purge_vmap_area_lazy */
       23) $ 1074950 us  |  } /* drain_vmap_area_work */

  The worst execution time of drain_vmap_area_work() is about 1 second.

Link: https://lore.kernel.org/lkml/ZqFlawuVnOMY2k3E@pc638.lan/
Fixes: 282631cb2447 ("mm: vmalloc: remove global purge_vmap_area_root rb-tree")
Signed-off-by: Adrian Huang <ahuang12@lenovo.com>
Co-developed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Tested-by: Jiwei Sun <sunjw10@lenovo.com>
---
 include/linux/kasan.h | 12 +++++++++---
 mm/kasan/shadow.c     | 14 ++++++++++----
 mm/vmalloc.c          | 34 ++++++++++++++++++++++++++--------
 3 files changed, 45 insertions(+), 15 deletions(-)

Comments

Andrew Morton July 28, 2024, 9:18 p.m. UTC | #1
On Sat, 27 Jul 2024 00:52:46 +0800 Adrian Huang <adrianhuang0701@gmail.com> wrote:

> From: Adrian Huang <ahuang12@lenovo.com>
> 
> When compiling kernel source 'make -j $(nproc)' with the up-and-running
> KASAN-enabled kernel on a 256-core machine, the following soft lockup
> is shown:
> 
> ...
>
>         # CPU  DURATION                  FUNCTION CALLS
>         # |     |   |                     |   |   |   |
>           76) $ 50412985 us |    } /* __purge_vmap_area_lazy */
>
> ...
>
>      # CPU  DURATION                  FUNCTION CALLS
>      # |     |   |                     |   |   |   |
>        23) $ 1074942 us  |    } /* __purge_vmap_area_lazy */
>        23) $ 1074950 us  |  } /* drain_vmap_area_work */
> 
>   The worst execution time of drain_vmap_area_work() is about 1 second.

Cool, thanks.

But that's still pretty dreadful and I bet there are other workloads
which will trigger the lockup detector in this path?

(And "avoiding lockup detector warnings" isn't the objective here - the
detector is merely a tool for identifying issues)
Baoquan He July 29, 2024, 8:30 a.m. UTC | #2
On 07/27/24 at 12:52am, Adrian Huang wrote:
...... 
> If we combine all TLB flush operations of the KASAN shadow virtual
> address into one operation in the call path
> 'purge_vmap_node()->kasan_release_vmalloc()', the running time of
> drain_vmap_area_work() can be saved greatly. The idea is from the
> flush_tlb_kernel_range() call in __purge_vmap_area_lazy(). And, the
> soft lockup won't not be triggered.
              ~~~~~~~~~~~
               typo
> 
> Here is the test result based on 6.10:
> 
> [6.10 wo/ the patch]
>   1. ftrace latency profiling (record a trace if the latency > 20s).
>      echo 20000000 > /sys/kernel/debug/tracing/tracing_thresh
>      echo drain_vmap_area_work > /sys/kernel/debug/tracing/set_graph_function
>      echo function_graph > /sys/kernel/debug/tracing/current_tracer
>      echo 1 > /sys/kernel/debug/tracing/tracing_on
> 
...... 
>   The worst execution time of drain_vmap_area_work() is about 1 second.
> 
> Link: https://lore.kernel.org/lkml/ZqFlawuVnOMY2k3E@pc638.lan/
> Fixes: 282631cb2447 ("mm: vmalloc: remove global purge_vmap_area_root rb-tree")
> Signed-off-by: Adrian Huang <ahuang12@lenovo.com>
> Co-developed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
> Tested-by: Jiwei Sun <sunjw10@lenovo.com>
> ---
>  include/linux/kasan.h | 12 +++++++++---
>  mm/kasan/shadow.c     | 14 ++++++++++----
>  mm/vmalloc.c          | 34 ++++++++++++++++++++++++++--------
>  3 files changed, 45 insertions(+), 15 deletions(-)

LGTM,

Reviewed-by: Baoquan He <bhe@redhat.com>
Huang Adrian July 29, 2024, 11:12 a.m. UTC | #3
On Mon, Jul 29, 2024 at 5:18 AM Andrew Morton <akpm@linux-foundation.org> wrote:
>
> On Sat, 27 Jul 2024 00:52:46 +0800 Adrian Huang <adrianhuang0701@gmail.com> wrote:
>
> > From: Adrian Huang <ahuang12@lenovo.com>
> >
> > When compiling kernel source 'make -j $(nproc)' with the up-and-running
> > KASAN-enabled kernel on a 256-core machine, the following soft lockup
> > is shown:
> >
> > ...
> >
> >         # CPU  DURATION                  FUNCTION CALLS
> >         # |     |   |                     |   |   |   |
> >           76) $ 50412985 us |    } /* __purge_vmap_area_lazy */
> >
> > ...
> >
> >      # CPU  DURATION                  FUNCTION CALLS
> >      # |     |   |                     |   |   |   |
> >        23) $ 1074942 us  |    } /* __purge_vmap_area_lazy */
> >        23) $ 1074950 us  |  } /* drain_vmap_area_work */
> >
> >   The worst execution time of drain_vmap_area_work() is about 1 second.
>
> Cool, thanks.
>
> But that's still pretty dreadful and I bet there are other workloads
> which will trigger the lockup detector in this path?

Yes, this path can be reproduced by other workloads. The stress-ng
command `stress-ng --exec $(nproc) --timeout 5m` can also trigger the
lockup detector in this path. (Confirmed on v6.11-rc1)

-- Adrian
Huang Adrian July 29, 2024, 11:19 a.m. UTC | #4
On Mon, Jul 29, 2024 at 4:30 PM Baoquan He <bhe@redhat.com> wrote:
>
> On 07/27/24 at 12:52am, Adrian Huang wrote:
> ......
> > If we combine all TLB flush operations of the KASAN shadow virtual
> > address into one operation in the call path
> > 'purge_vmap_node()->kasan_release_vmalloc()', the running time of
> > drain_vmap_area_work() can be saved greatly. The idea is from the
> > flush_tlb_kernel_range() call in __purge_vmap_area_lazy(). And, the
> > soft lockup won't not be triggered.
>               ~~~~~~~~~~~
>                typo

Oh, my fat-finger. Thanks for pointing it out.

I saw that Andrew already added this patch to his mm branch. Let me
know if I need to send the v2 version to fix this typo. (Depend on
Andrew's decision)

-- Adrian
Uladzislau Rezki July 29, 2024, 11:29 a.m. UTC | #5
On Sun, Jul 28, 2024 at 02:18:51PM -0700, Andrew Morton wrote:
> On Sat, 27 Jul 2024 00:52:46 +0800 Adrian Huang <adrianhuang0701@gmail.com> wrote:
> 
> > From: Adrian Huang <ahuang12@lenovo.com>
> > 
> > When compiling kernel source 'make -j $(nproc)' with the up-and-running
> > KASAN-enabled kernel on a 256-core machine, the following soft lockup
> > is shown:
> > 
> > ...
> >
> >         # CPU  DURATION                  FUNCTION CALLS
> >         # |     |   |                     |   |   |   |
> >           76) $ 50412985 us |    } /* __purge_vmap_area_lazy */
> >
> > ...
> >
> >      # CPU  DURATION                  FUNCTION CALLS
> >      # |     |   |                     |   |   |   |
> >        23) $ 1074942 us  |    } /* __purge_vmap_area_lazy */
> >        23) $ 1074950 us  |  } /* drain_vmap_area_work */
> > 
> >   The worst execution time of drain_vmap_area_work() is about 1 second.
> 
> Cool, thanks.
> 
> But that's still pretty dreadful and I bet there are other workloads
> which will trigger the lockup detector in this path?
> 
> (And "avoiding lockup detector warnings" isn't the objective here - the
> detector is merely a tool for identifying issues)
> 
As for the 1-second worst-case execution time: I did some analysis with
CONFIG_LOCK_STAT enabled to see wait-time statistics across different locks:

See it here: https://lore.kernel.org/linux-mm/ZogS_04dP5LlRlXN@pc636/T/#m5d57f11d9f69aef5313f4efbe25415b3bae4c818

It would be really good if Adrian could run the "compiling workload" on
his big system and post the statistics here.

For example:
  a) v6.11-rc1 + KASAN.
  b) v6.11-rc1 + KASAN + patch. 

Thanks!

--
Uladzislau Rezki
Andrew Morton July 30, 2024, 1:18 a.m. UTC | #6
On Mon, 29 Jul 2024 19:19:33 +0800 Huang Adrian <adrianhuang0701@gmail.com> wrote:

>  Let me
> know if I need to send the v2 version to fix this typo. (Depend on
> Andew's decision)

Thanks, I fixed it.
Huang Adrian July 30, 2024, 9:36 a.m. UTC | #7
On Mon, Jul 29, 2024 at 7:29 PM Uladzislau Rezki <urezki@gmail.com> wrote:
> It would be really good if Adrian could run the "compiling workload" on
> his big system and post the statistics here.
>
> For example:
>   a) v6.11-rc1 + KASAN.
>   b) v6.11-rc1 + KASAN + patch.

Sure, please see the statistics below.

Test Result (based on 6.11-rc1)
===============================

1. Profile purge_vmap_node()

   A. Command: trace-cmd record -p function_graph -l purge_vmap_node make -j $(nproc)

   B. Average execution time of purge_vmap_node():

	no patch (us)		patched (us)	saved
	-------------		------------    -----
      	 147885.02	 	  3692.51	 97%  

   C. Total execution time of purge_vmap_node():

	no patch (us)		patched (us)	saved
	-------------		------------	-----
	  194173036		  5114138	 97%

   [ftrace log] Without patch: https://gist.github.com/AdrianHuang/a5bec861f67434e1024bbf43cea85959
   [ftrace log] With patch: https://gist.github.com/AdrianHuang/a200215955ee377288377425dbaa04e3

2. Use `time` utility to measure execution time
 
   A. Command: make clean && time make -j $(nproc)

   B. The following result is the average kernel execution time over five
      measurements ('sys' field of `time` output):

	no patch (seconds)	patched (seconds)	saved
	------------------	----------------	-----
	    36932.904		   31403.478		 15%

   [`time` log] Without patch: https://gist.github.com/AdrianHuang/987b20fd0bd2bb616b3524aa6ee43112
   [`time` log] With patch: https://gist.github.com/AdrianHuang/da2ea4e6aa0b4dcc207b4e40b202f694
Uladzislau Rezki July 30, 2024, 11:38 a.m. UTC | #8
> On Mon, Jul 29, 2024 at 7:29 PM Uladzislau Rezki <urezki@gmail.com> wrote:
> > It would be really good if Adrian could run the "compiling workload" on
> > his big system and post the statistics here.
> >
> > For example:
> >   a) v6.11-rc1 + KASAN.
> >   b) v6.11-rc1 + KASAN + patch.
> 
> Sure, please see the statistics below.
> 
> Test Result (based on 6.11-rc1)
> ===============================
> 
> 1. Profile purge_vmap_node()
> 
>    A. Command: trace-cmd record -p function_graph -l purge_vmap_node make -j $(nproc)
> 
>    B. Average execution time of purge_vmap_node():
> 
> 	no patch (us)		patched (us)	saved
> 	-------------		------------    -----
>       	 147885.02	 	  3692.51	 97%  
> 
>    C. Total execution time of purge_vmap_node():
> 
> 	no patch (us)		patched (us)	saved
> 	-------------		------------	-----
> 	  194173036		  5114138	 97%
> 
>    [ftrace log] Without patch: https://gist.github.com/AdrianHuang/a5bec861f67434e1024bbf43cea85959
>    [ftrace log] With patch: https://gist.github.com/AdrianHuang/a200215955ee377288377425dbaa04e3
> 
> 2. Use `time` utility to measure execution time
>  
>    A. Command: make clean && time make -j $(nproc)
> 
>    B. The following result is the average kernel execution time of five-time
>       measurements. ('sys' field of `time` output):
> 
> 	no patch (seconds)	patched (seconds)	saved
> 	------------------	----------------	-----
> 	    36932.904		   31403.478		 15%
> 
>    [`time` log] Without patch: https://gist.github.com/AdrianHuang/987b20fd0bd2bb616b3524aa6ee43112
>    [`time` log] With patch: https://gist.github.com/AdrianHuang/da2ea4e6aa0b4dcc207b4e40b202f694
>
I meant different statistics. As noted here https://lore.kernel.org/linux-mm/ZogS_04dP5LlRlXN@pc636/T/#m5d57f11d9f69aef5313f4efbe25415b3bae4c818
I came to the conclusion that the place and lock below:

<snip>
static void exit_notify(struct task_struct *tsk, int group_dead)
{
	bool autoreap;
	struct task_struct *p, *n;
	LIST_HEAD(dead);

	write_lock_irq(&tasklist_lock);
...
<snip>

keeps IRQs disabled, which means that purge_vmap_node() does make progress,
but it can be slow.

CPU_1:
disables IRQs
trying to grab the tasklist_lock

CPU_2:
Sends an IPI to CPU_1
waits until the specified callback is executed on CPU_1

Since CPU_1 has disabled IRQs, serving an IPI and completion of callback
takes time until CPU_1 enables IRQs back.

Could you please post lock statistics for kernel compiling use case?
KASAN + patch is enough, IMO. This just to double check whether a
tasklist_lock is a problem or not.

Thanks!

--
Uladzislau Rezki
Huang Adrian July 30, 2024, 4:27 p.m. UTC | #9
On Tue, Jul 30, 2024 at 7:38 PM Uladzislau Rezki <urezki@gmail.com> wrote:
>
> > On Mon, Jul 29, 2024 at 7:29 PM Uladzislau Rezki <urezki@gmail.com> wrote:
> > > It would be really good if Adrian could run the "compiling workload" on
> > > his big system and post the statistics here.
> > >
> > > For example:
> > >   a) v6.11-rc1 + KASAN.
> > >   b) v6.11-rc1 + KASAN + patch.
> >
> > Sure, please see the statistics below.
> >
> > Test Result (based on 6.11-rc1)
> > ===============================
> >
> > 1. Profile purge_vmap_node()
> >
> >    A. Command: trace-cmd record -p function_graph -l purge_vmap_node make -j $(nproc)
> >
> >    B. Average execution time of purge_vmap_node():
> >
> >       no patch (us)           patched (us)    saved
> >       -------------           ------------    -----
> >                147885.02                3692.51        97%
> >
> >    C. Total execution time of purge_vmap_node():
> >
> >       no patch (us)           patched (us)    saved
> >       -------------           ------------    -----
> >         194173036               5114138        97%
> >
> >    [ftrace log] Without patch: https://gist.github.com/AdrianHuang/a5bec861f67434e1024bbf43cea85959
> >    [ftrace log] With patch: https://gist.github.com/AdrianHuang/a200215955ee377288377425dbaa04e3
> >
> > 2. Use `time` utility to measure execution time
> >
> >    A. Command: make clean && time make -j $(nproc)
> >
> >    B. The following result is the average kernel execution time of five-time
> >       measurements. ('sys' field of `time` output):
> >
> >       no patch (seconds)      patched (seconds)       saved
> >       ------------------      ----------------        -----
> >           36932.904              31403.478             15%
> >
> >    [`time` log] Without patch: https://gist.github.com/AdrianHuang/987b20fd0bd2bb616b3524aa6ee43112
> >    [`time` log] With patch: https://gist.github.com/AdrianHuang/da2ea4e6aa0b4dcc207b4e40b202f694
> >
> I meant another statistics. As noted here https://lore.kernel.org/linux-mm/ZogS_04dP5LlRlXN@pc636/T/#m5d57f11d9f69aef5313f4efbe25415b3bae4c818
> i came to conclusion that below place and lock:
>
> <snip>
> static void exit_notify(struct task_struct *tsk, int group_dead)
> {
>         bool autoreap;
>         struct task_struct *p, *n;
>         LIST_HEAD(dead);
>
>         write_lock_irq(&tasklist_lock);
> ...
> <snip>
>
> keeps IRQs disabled, so it means that the purge_vmap_node() does the progress
> but it can be slow.
>
> CPU_1:
> disables IRQs
> trying to grab the tasklist_lock
>
> CPU_2:
> Sends an IPI to CPU_1
> waits until the specified callback is executed on CPU_1
>
> Since CPU_1 has disabled IRQs, serving an IPI and completion of callback
> takes time until CPU_1 enables IRQs back.
>
> Could you please post lock statistics for kernel compiling use case?
> KASAN + patch is enough, IMO. This just to double check whether a
> tasklist_lock is a problem or not.

Sorry for the misunderstanding.

Two experiments are shown as follows. I saw you think KASAN + patch is
enough. But, in case you need another one. ;-)

a) v6.11-rc1 + KASAN

The result is different from yours, so I ran two tests (make sure the
soft lockup warning was triggered).

Test #1: waittime-max = 5.4ms
<snip>
...
class name    con-bounces    contentions   waittime-min   waittime-max
waittime-total   waittime-avg    acq-bounces   acquisitions
holdtime-min   holdtime-max holdtime-total   holdtime-avg
...
tasklist_lock-W:        118762         120090           0.44
5443.22    24807413.37         206.57         429757         569051
       2.27        3222.00    69914505.87         122.86
tasklist_lock-R:        108262         108300           0.41
5381.34    23613372.10         218.04         489132         541541
       0.20        5543.40    10095470.68          18.64
    ---------------
    tasklist_lock          44594          [<0000000099d3ea35>]
exit_notify+0x82/0x900
    tasklist_lock          32041          [<0000000058f753d8>]
release_task+0x104/0x3f0
    tasklist_lock          99240          [<000000008524ff80>]
__do_wait+0xd8/0x710
    tasklist_lock          43435          [<00000000f6e82dcf>]
copy_process+0x2a46/0x50f0
    ---------------
    tasklist_lock          98334          [<0000000099d3ea35>]
exit_notify+0x82/0x900
    tasklist_lock          82649          [<0000000058f753d8>]
release_task+0x104/0x3f0
    tasklist_lock              2          [<00000000da5a7972>]
mm_update_next_owner+0xc0/0x430
    tasklist_lock          26708          [<00000000f6e82dcf>]
copy_process+0x2a46/0x50f0
...
<snip>

Test #2:waittime-max = 5.7ms
<snip>
...
class name    con-bounces    contentions   waittime-min   waittime-max
waittime-total   waittime-avg    acq-bounces   acquisitions
holdtime-min   holdtime-max holdtime-total   holdtime-avg
...
tasklist_lock-W:        121742         123167           0.43
5713.02    25252257.61         205.02         432111         569762
       2.25        3083.08    70711022.74         124.11
tasklist_lock-R:        111479         111523           0.39
5050.50    24557264.88         220.20         491404         542221
       0.20        5611.81    10007782.09          18.46
    ---------------
    tasklist_lock         102317          [<000000008524ff80>]
__do_wait+0xd8/0x710
    tasklist_lock          44606          [<00000000f6e82dcf>]
copy_process+0x2a46/0x50f0
    tasklist_lock          45584          [<0000000099d3ea35>]
exit_notify+0x82/0x900
    tasklist_lock          32969          [<0000000058f753d8>]
release_task+0x104/0x3f0
    ---------------
    tasklist_lock         100498          [<0000000099d3ea35>]
exit_notify+0x82/0x900
    tasklist_lock          27401          [<00000000f6e82dcf>]
copy_process+0x2a46/0x50f0
    tasklist_lock          85473          [<0000000058f753d8>]
release_task+0x104/0x3f0
    tasklist_lock            650          [<000000004d0b9f6b>]
tty_open_proc_set_tty+0x23/0x210
...
<snip>


b) v6.11-rc1 + KASAN + patch: waittime-max = 5.7ms
<snip>
...
class name    con-bounces    contentions   waittime-min   waittime-max
waittime-total   waittime-avg    acq-bounces   acquisitions
holdtime-min   holdtime-max holdtime-total   holdtime-avg
...
tasklist_lock-W:        108876         110087           0.33
5688.64    18622460.43         169.16         426740         568715
       1.94        2930.76    62560515.48         110.00
tasklist_lock-R:         99864          99909           0.43
5868.69    17849478.20         178.66         487654         541328
       0.20        5709.98     9207504.90          17.01
    ---------------
    tasklist_lock          91655          [<00000000a622e532>]
__do_wait+0xd8/0x710
    tasklist_lock          41100          [<00000000ccf53925>]
exit_notify+0x82/0x900
    tasklist_lock           8254          [<00000000093ccded>]
tty_open_proc_set_tty+0x23/0x210
    tasklist_lock          39542          [<00000000a0e6bf4d>]
copy_process+0x2a46/0x50f0
    ---------------
    tasklist_lock          90525          [<00000000ccf53925>]
exit_notify+0x82/0x900
    tasklist_lock          76934          [<00000000cb7ca00c>]
release_task+0x104/0x3f0
    tasklist_lock          23723          [<00000000a0e6bf4d>]
copy_process+0x2a46/0x50f0
    tasklist_lock          18223          [<00000000a622e532>]
__do_wait+0xd8/0x710
...
<snip>
Uladzislau Rezki July 30, 2024, 4:42 p.m. UTC | #10
On Wed, Jul 31, 2024 at 12:27:27AM +0800, Huang Adrian wrote:
> On Tue, Jul 30, 2024 at 7:38 PM Uladzislau Rezki <urezki@gmail.com> wrote:
> >
> > > On Mon, Jul 29, 2024 at 7:29 PM Uladzislau Rezki <urezki@gmail.com> wrote:
> > > > It would be really good if Adrian could run the "compiling workload" on
> > > > his big system and post the statistics here.
> > > >
> > > > For example:
> > > >   a) v6.11-rc1 + KASAN.
> > > >   b) v6.11-rc1 + KASAN + patch.
> > >
> > > Sure, please see the statistics below.
> > >
> > > Test Result (based on 6.11-rc1)
> > > ===============================
> > >
> > > 1. Profile purge_vmap_node()
> > >
> > >    A. Command: trace-cmd record -p function_graph -l purge_vmap_node make -j $(nproc)
> > >
> > >    B. Average execution time of purge_vmap_node():
> > >
> > >       no patch (us)           patched (us)    saved
> > >       -------------           ------------    -----
> > >                147885.02                3692.51        97%
> > >
> > >    C. Total execution time of purge_vmap_node():
> > >
> > >       no patch (us)           patched (us)    saved
> > >       -------------           ------------    -----
> > >         194173036               5114138        97%
> > >
> > >    [ftrace log] Without patch: https://gist.github.com/AdrianHuang/a5bec861f67434e1024bbf43cea85959
> > >    [ftrace log] With patch: https://gist.github.com/AdrianHuang/a200215955ee377288377425dbaa04e3
> > >
> > > 2. Use `time` utility to measure execution time
> > >
> > >    A. Command: make clean && time make -j $(nproc)
> > >
> > >    B. The following result is the average kernel execution time of five-time
> > >       measurements. ('sys' field of `time` output):
> > >
> > >       no patch (seconds)      patched (seconds)       saved
> > >       ------------------      ----------------        -----
> > >           36932.904              31403.478             15%
> > >
> > >    [`time` log] Without patch: https://gist.github.com/AdrianHuang/987b20fd0bd2bb616b3524aa6ee43112
> > >    [`time` log] With patch: https://gist.github.com/AdrianHuang/da2ea4e6aa0b4dcc207b4e40b202f694
> > >
> > I meant another statistics. As noted here https://lore.kernel.org/linux-mm/ZogS_04dP5LlRlXN@pc636/T/#m5d57f11d9f69aef5313f4efbe25415b3bae4c818
> > i came to conclusion that below place and lock:
> >
> > <snip>
> > static void exit_notify(struct task_struct *tsk, int group_dead)
> > {
> >         bool autoreap;
> >         struct task_struct *p, *n;
> >         LIST_HEAD(dead);
> >
> >         write_lock_irq(&tasklist_lock);
> > ...
> > <snip>
> >
> > keeps IRQs disabled, so it means that the purge_vmap_node() does the progress
> > but it can be slow.
> >
> > CPU_1:
> > disables IRQs
> > trying to grab the tasklist_lock
> >
> > CPU_2:
> > Sends an IPI to CPU_1
> > waits until the specified callback is executed on CPU_1
> >
> > Since CPU_1 has disabled IRQs, serving an IPI and completion of callback
> > takes time until CPU_1 enables IRQs back.
> >
> > Could you please post lock statistics for kernel compiling use case?
> > KASAN + patch is enough, IMO. This just to double check whether a
> > tasklist_lock is a problem or not.
> 
> Sorry for the misunderstanding.
> 
> Two experiments are shown as follows. I saw you think KASAN + patch is
> enough. But, in case you need another one. ;-)
> 
> a) v6.11-rc1 + KASAN
> 
> The result is different from yours, so I ran two tests (make sure the
> soft lockup warning was triggered).
> 
> Test #1: waittime-max = 5.4ms
> <snip>
> ...
> class name    con-bounces    contentions   waittime-min   waittime-max
> waittime-total   waittime-avg    acq-bounces   acquisitions
> holdtime-min   holdtime-max holdtime-total   holdtime-avg
> ...
> tasklist_lock-W:        118762         120090           0.44
> 5443.22    24807413.37         206.57         429757         569051
>        2.27        3222.00    69914505.87         122.86
> tasklist_lock-R:        108262         108300           0.41
> 5381.34    23613372.10         218.04         489132         541541
>        0.20        5543.40    10095470.68          18.64
>     ---------------
>     tasklist_lock          44594          [<0000000099d3ea35>]
> exit_notify+0x82/0x900
>     tasklist_lock          32041          [<0000000058f753d8>]
> release_task+0x104/0x3f0
>     tasklist_lock          99240          [<000000008524ff80>]
> __do_wait+0xd8/0x710
>     tasklist_lock          43435          [<00000000f6e82dcf>]
> copy_process+0x2a46/0x50f0
>     ---------------
>     tasklist_lock          98334          [<0000000099d3ea35>]
> exit_notify+0x82/0x900
>     tasklist_lock          82649          [<0000000058f753d8>]
> release_task+0x104/0x3f0
>     tasklist_lock              2          [<00000000da5a7972>]
> mm_update_next_owner+0xc0/0x430
>     tasklist_lock          26708          [<00000000f6e82dcf>]
> copy_process+0x2a46/0x50f0
> ...
> <snip>
> 
> Test #2:waittime-max = 5.7ms
> <snip>
> ...
> class name    con-bounces    contentions   waittime-min   waittime-max
> waittime-total   waittime-avg    acq-bounces   acquisitions
> holdtime-min   holdtime-max holdtime-total   holdtime-avg
> ...
> tasklist_lock-W:        121742         123167           0.43
> 5713.02    25252257.61         205.02         432111         569762
>        2.25        3083.08    70711022.74         124.11
> tasklist_lock-R:        111479         111523           0.39
> 5050.50    24557264.88         220.20         491404         542221
>        0.20        5611.81    10007782.09          18.46
>     ---------------
>     tasklist_lock         102317          [<000000008524ff80>]
> __do_wait+0xd8/0x710
>     tasklist_lock          44606          [<00000000f6e82dcf>]
> copy_process+0x2a46/0x50f0
>     tasklist_lock          45584          [<0000000099d3ea35>]
> exit_notify+0x82/0x900
>     tasklist_lock          32969          [<0000000058f753d8>]
> release_task+0x104/0x3f0
>     ---------------
>     tasklist_lock         100498          [<0000000099d3ea35>]
> exit_notify+0x82/0x900
>     tasklist_lock          27401          [<00000000f6e82dcf>]
> copy_process+0x2a46/0x50f0
>     tasklist_lock          85473          [<0000000058f753d8>]
> release_task+0x104/0x3f0
>     tasklist_lock            650          [<000000004d0b9f6b>]
> tty_open_proc_set_tty+0x23/0x210
> ...
> <snip>
> 
> 
> b) v6.11-rc1 + KASAN + patch: waittime-max = 5.7ms
> <snip>
> ...
> class name    con-bounces    contentions   waittime-min   waittime-max
> waittime-total   waittime-avg    acq-bounces   acquisitions
> holdtime-min   holdtime-max holdtime-total   holdtime-avg
> ...
> tasklist_lock-W:        108876         110087           0.33
> 5688.64    18622460.43         169.16         426740         568715
>        1.94        2930.76    62560515.48         110.00
> tasklist_lock-R:         99864          99909           0.43
> 5868.69    17849478.20         178.66         487654         541328
>        0.20        5709.98     9207504.90          17.01
>     ---------------
>     tasklist_lock          91655          [<00000000a622e532>]
> __do_wait+0xd8/0x710
>     tasklist_lock          41100          [<00000000ccf53925>]
> exit_notify+0x82/0x900
>     tasklist_lock           8254          [<00000000093ccded>]
> tty_open_proc_set_tty+0x23/0x210
>     tasklist_lock          39542          [<00000000a0e6bf4d>]
> copy_process+0x2a46/0x50f0
>     ---------------
>     tasklist_lock          90525          [<00000000ccf53925>]
> exit_notify+0x82/0x900
>     tasklist_lock          76934          [<00000000cb7ca00c>]
> release_task+0x104/0x3f0
>     tasklist_lock          23723          [<00000000a0e6bf4d>]
> copy_process+0x2a46/0x50f0
>     tasklist_lock          18223          [<00000000a622e532>]
> __do_wait+0xd8/0x710
> ...
> <snip>
>
Thank you for posting this! So tasklist_lock is not a problem.
I assume you have a full output of lock_stat. Could you please
paste it for v6.11-rc1 + KASAN?

Thank you!

--
Uladzislau Rezki
Huang Adrian July 31, 2024, 12:39 a.m. UTC | #11
On Wed, Jul 31, 2024 at 12:42 AM Uladzislau Rezki <urezki@gmail.com> wrote:
> Thank you for posting this! So tasklist_lock is not a problem.
> I assume you have a full output of lock_stat. Could you please
> paste it for v6.11-rc1 + KASAN?

Full output: https://gist.github.com/AdrianHuang/2c2c97f533ba467ff32781590279ccc9

-- Adrian
Uladzislau Rezki July 31, 2024, 1:38 p.m. UTC | #12
On Wed, Jul 31, 2024 at 08:39:00AM +0800, Huang Adrian wrote:
> On Wed, Jul 31, 2024 at 12:42 AM Uladzislau Rezki <urezki@gmail.com> wrote:
> > Thank you for posting this! So tasklist_lock is not a problem.
> > I assume you have a full output of lock_stat. Could you please
> > paste it for v6.11-rc1 + KASAN?
> 
> Full output: https://gist.github.com/AdrianHuang/2c2c97f533ba467ff32781590279ccc9
> 
I do not see anything obvious. So it means that CSD lock debugging should be done.
But this is another story :)

Thank you for helping!

--
Uladzislau Rezki
diff mbox series

Patch

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 70d6a8f6e25d..2adea4fef153 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -29,6 +29,9 @@  typedef unsigned int __bitwise kasan_vmalloc_flags_t;
 #define KASAN_VMALLOC_VM_ALLOC		((__force kasan_vmalloc_flags_t)0x02u)
 #define KASAN_VMALLOC_PROT_NORMAL	((__force kasan_vmalloc_flags_t)0x04u)
 
+#define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply existing page range */
+#define KASAN_VMALLOC_TLB_FLUSH  0x2 /* TLB flush */
+
 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
 
 #include <linux/pgtable.h>
@@ -511,7 +514,8 @@  void kasan_populate_early_vm_area_shadow(void *start, unsigned long size);
 int kasan_populate_vmalloc(unsigned long addr, unsigned long size);
 void kasan_release_vmalloc(unsigned long start, unsigned long end,
 			   unsigned long free_region_start,
-			   unsigned long free_region_end);
+			   unsigned long free_region_end,
+			   unsigned long flags);
 
 #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
 
@@ -526,7 +530,8 @@  static inline int kasan_populate_vmalloc(unsigned long start,
 static inline void kasan_release_vmalloc(unsigned long start,
 					 unsigned long end,
 					 unsigned long free_region_start,
-					 unsigned long free_region_end) { }
+					 unsigned long free_region_end,
+					 unsigned long flags) { }
 
 #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
 
@@ -561,7 +566,8 @@  static inline int kasan_populate_vmalloc(unsigned long start,
 static inline void kasan_release_vmalloc(unsigned long start,
 					 unsigned long end,
 					 unsigned long free_region_start,
-					 unsigned long free_region_end) { }
+					 unsigned long free_region_end,
+					 unsigned long flags) { }
 
 static inline void *kasan_unpoison_vmalloc(const void *start,
 					   unsigned long size,
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index d6210ca48dda..88d1c9dcb507 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -489,7 +489,8 @@  static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
  */
 void kasan_release_vmalloc(unsigned long start, unsigned long end,
 			   unsigned long free_region_start,
-			   unsigned long free_region_end)
+			   unsigned long free_region_end,
+			   unsigned long flags)
 {
 	void *shadow_start, *shadow_end;
 	unsigned long region_start, region_end;
@@ -522,12 +523,17 @@  void kasan_release_vmalloc(unsigned long start, unsigned long end,
 			__memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start);
 			return;
 		}
-		apply_to_existing_page_range(&init_mm,
+
+
+		if (flags & KASAN_VMALLOC_PAGE_RANGE)
+			apply_to_existing_page_range(&init_mm,
 					     (unsigned long)shadow_start,
 					     size, kasan_depopulate_vmalloc_pte,
 					     NULL);
-		flush_tlb_kernel_range((unsigned long)shadow_start,
-				       (unsigned long)shadow_end);
+
+		if (flags & KASAN_VMALLOC_TLB_FLUSH)
+			flush_tlb_kernel_range((unsigned long)shadow_start,
+					       (unsigned long)shadow_end);
 	}
 }
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e34ea860153f..bc21d821d506 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2186,6 +2186,25 @@  decay_va_pool_node(struct vmap_node *vn, bool full_decay)
 	reclaim_list_global(&decay_list);
 }
 
+static void
+kasan_release_vmalloc_node(struct vmap_node *vn)
+{
+	struct vmap_area *va;
+	unsigned long start, end;
+
+	start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start;
+	end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end;
+
+	list_for_each_entry(va, &vn->purge_list, list) {
+		if (is_vmalloc_or_module_addr((void *) va->va_start))
+			kasan_release_vmalloc(va->va_start, va->va_end,
+				va->va_start, va->va_end,
+				KASAN_VMALLOC_PAGE_RANGE);
+	}
+
+	kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH);
+}
+
 static void purge_vmap_node(struct work_struct *work)
 {
 	struct vmap_node *vn = container_of(work,
@@ -2193,20 +2212,17 @@  static void purge_vmap_node(struct work_struct *work)
 	struct vmap_area *va, *n_va;
 	LIST_HEAD(local_list);
 
+	if (IS_ENABLED(CONFIG_KASAN_VMALLOC))
+		kasan_release_vmalloc_node(vn);
+
 	vn->nr_purged = 0;
 
 	list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
 		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
-		unsigned long orig_start = va->va_start;
-		unsigned long orig_end = va->va_end;
 		unsigned int vn_id = decode_vn_id(va->flags);
 
 		list_del_init(&va->list);
 
-		if (is_vmalloc_or_module_addr((void *)orig_start))
-			kasan_release_vmalloc(orig_start, orig_end,
-					      va->va_start, va->va_end);
-
 		atomic_long_sub(nr, &vmap_lazy_nr);
 		vn->nr_purged++;
 
@@ -4726,7 +4742,8 @@  struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 				&free_vmap_area_list);
 		if (va)
 			kasan_release_vmalloc(orig_start, orig_end,
-				va->va_start, va->va_end);
+				va->va_start, va->va_end,
+				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
 		vas[area] = NULL;
 	}
 
@@ -4776,7 +4793,8 @@  struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 				&free_vmap_area_list);
 		if (va)
 			kasan_release_vmalloc(orig_start, orig_end,
-				va->va_start, va->va_end);
+				va->va_start, va->va_end,
+				KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH);
 		vas[area] = NULL;
 		kfree(vms[area]);
 	}