[v4,1/2] ring-buffer: Introducing ring-buffer mapping functions

Message ID 20230613083513.3312612-2-vdonnefort@google.com (mailing list archive)
State Superseded
Series Introducing trace buffer mapping by user-space

Commit Message

Vincent Donnefort June 13, 2023, 8:35 a.m. UTC
In preparation for allowing user-space to map a ring-buffer, add
a set of mapping functions:

  ring_buffer_{map,unmap}()
  ring_buffer_map_fault()

And controls on the ring-buffer:

  ring_buffer_map_get_reader_page()  /* swap reader and head */
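
As a rough illustration only (the real tracefs hook-up comes with patch
2/2 of this series), an mmap() implementation could wire these functions
into a fault handler as below. The rb_map_ctx container and how it ends
up in vm_private_data are purely hypothetical here:

#include <linux/mm.h>
#include <linux/ring_buffer.h>

struct rb_map_ctx {			/* hypothetical per-file context */
	struct trace_buffer	*buffer;
	int			cpu;
};

static vm_fault_t rb_map_vm_fault(struct vm_fault *vmf)
{
	struct rb_map_ctx *ctx = vmf->vma->vm_private_data;
	struct page *page;

	/* pgoff 0 is the meta-page, pgoff N > 0 is the data page with ID N - 1 */
	page = ring_buffer_map_fault(ctx->buffer, ctx->cpu, vmf->pgoff);
	if (!page)
		return VM_FAULT_SIGBUS;

	get_page(page);
	vmf->page = page;

	return 0;
}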

Mapping the ring-buffer also involves:

  A unique ID for each page of the ring-buffer, as currently the pages
  are only identified through their in-kernel VA.

  A meta-page, where statistics about the ring-buffer and an ordered
  list of page IDs are stored. One field gives which page is the reader
  one, and another gives where the ring-buffer starts in the list of
  data pages.

The linear mapping exposes the meta-page, followed by each page of the
ring-buffer, ordered by their unique IDs, which are assigned during the
first mapping.

Once mapped, no page can get in or out of the ring-buffer: the buffer
size will remain unmodified and the splice-enabling functions will in
reality simply memcpy the data instead of swapping pages.
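
From user-space, assuming the mmap()-able tracefs file that patch 2/2
introduces (how the fd below is obtained is therefore out of scope
here), reading the statistics boils down to a sketch like this, using
the ring_buffer_meta layout added by this patch:

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#include <linux/trace_mmap.h>

static int dump_rb_meta(int fd)
{
	long page_size = sysconf(_SC_PAGESIZE);
	struct ring_buffer_meta *meta;

	/* The meta-page is the first page of the mapping */
	meta = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
	if (meta == MAP_FAILED)
		return -1;

	printf("entries=%lu overrun=%lu reader id=%u read=%u\n",
	       meta->entries, meta->overrun,
	       meta->reader_page.id, meta->reader_page.read);

	munmap(meta, page_size);

	return 0;
}

Data pages can then be mapped right after, at page offsets 1 to
nr_data_pages, following the page-ID order set up at the first mapping.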

Signed-off-by: Vincent Donnefort <vdonnefort@google.com>

Comments

kernel test robot July 6, 2023, 7:56 a.m. UTC | #1
Hello,

kernel test robot noticed a 25.5% improvement in stress-ng.kill.ops_per_sec on:


commit: dab8a2e9ddebfc8629b0b7f527206c91d52b160e ("[PATCH v4 1/2] ring-buffer: Introducing ring-buffer mapping functions")
url: https://github.com/intel-lab-lkp/linux/commits/Vincent-Donnefort/ring-buffer-Introducing-ring-buffer-mapping-functions/20230613-163710
base: https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git fb054096aea0576f0c0a61c598e5e9676443ee86
patch link: https://lore.kernel.org/all/20230613083513.3312612-2-vdonnefort@google.com/
patch subject: [PATCH v4 1/2] ring-buffer: Introducing ring-buffer mapping functions

testcase: stress-ng
test machine: 64 threads 2 sockets Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz (Ice Lake) with 256G memory
parameters:

	nr_threads: 100%
	disk: 1HDD
	testtime: 60s
	class: interrupt
	test: kill
	cpufreq_governor: performance






Details are as below:
-------------------------------------------------------------------------------------------------->


To reproduce:

        git clone https://github.com/intel/lkp-tests.git
        cd lkp-tests
        sudo bin/lkp install job.yaml           # job file is attached in this email
        bin/lkp split-job --compatible job.yaml # generate the yaml file for lkp run
        sudo bin/lkp run generated-yaml-file

        # if you come across any failure that blocks the test,
        # please remove the ~/.lkp and /lkp dirs to run from a clean state.

=========================================================================================
class/compiler/cpufreq_governor/disk/kconfig/nr_threads/rootfs/tbox_group/test/testcase/testtime:
  interrupt/gcc-12/performance/1HDD/x86_64-rhel-8.3/100%/debian-11.1-x86_64-20220510.cgz/lkp-icl-2sp8/kill/stress-ng/60s

commit: 
  fb054096ae ("Merge tag 'mm-hotfixes-stable-2023-06-12-12-22' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm")
  dab8a2e9dd ("ring-buffer: Introducing ring-buffer mapping functions")

fb054096aea0576f dab8a2e9ddebfc8629b0b7f5272 
---------------- --------------------------- 
         %stddev     %change         %stddev
             \          |                \  
      0.01          +100.0%       0.02        turbostat.IPC
    318.00 ± 98%    +475.1%       1828 ±117%  vmstat.memory.buff
     85186           +23.8%     105494        vmstat.system.cs
      0.72            +0.1        0.78 ±  2%  mpstat.cpu.all.irq%
      0.02 ±  6%      +0.0        0.03 ±  3%  mpstat.cpu.all.soft%
      0.47            +0.3        0.81 ± 10%  mpstat.cpu.all.usr%
    167.67 ± 10%     -34.2%     110.33 ±  7%  perf-c2c.DRAM.local
      1709 ±  3%     -78.2%     372.33 ±  5%  perf-c2c.DRAM.remote
      3027 ±  4%     -20.6%       2402 ±  6%  perf-c2c.HITM.local
    540.00 ±  5%     -67.7%     174.67 ± 12%  perf-c2c.HITM.remote
    293385           +10.9%     325265        meminfo.Active
    474.67 ± 59%    +308.4%       1938 ±111%  meminfo.Active(file)
    318.00 ± 98%    +473.6%       1824 ±118%  meminfo.Buffers
     99203 ±  2%     +22.1%     121162 ±  3%  meminfo.Mapped
    354897           +13.0%     401038        meminfo.Shmem
    737.88            +6.0%     782.52        stress-ng.kill.kill_calls_per_sec
    913328           +25.5%    1146265        stress-ng.kill.ops
     15221           +25.5%      19103        stress-ng.kill.ops_per_sec
   2741307           +24.9%    3422867        stress-ng.time.involuntary_context_switches
   2719695           +25.1%    3402015        stress-ng.time.voluntary_context_switches
     72923           +10.5%      80615        proc-vmstat.nr_active_anon
    118.50 ± 59%    +308.9%     484.50 ±111%  proc-vmstat.nr_active_file
     89727            +1.8%      91366        proc-vmstat.nr_anon_pages
    772985            +1.6%     785072        proc-vmstat.nr_file_pages
    105184            +5.4%     110891        proc-vmstat.nr_inactive_anon
     24859 ±  3%     +22.5%      30450 ±  2%  proc-vmstat.nr_mapped
     88505           +13.3%     100250        proc-vmstat.nr_shmem
     72923           +10.5%      80615        proc-vmstat.nr_zone_active_anon
    118.50 ± 59%    +308.9%     484.50 ±111%  proc-vmstat.nr_zone_active_file
    105184            +5.4%     110891        proc-vmstat.nr_zone_inactive_anon
     18521 ± 15%     +45.3%      26903 ±  8%  proc-vmstat.numa_hint_faults_local
    519320            +3.3%     536424        proc-vmstat.numa_hit
    453047            +3.8%     470130 ±  2%  proc-vmstat.numa_local
    559942            +2.6%     574521        proc-vmstat.pgalloc_normal
   1730277           -10.7%    1544435        sched_debug.cfs_rq:/.min_vruntime.min
     25977 ±  8%     +82.0%      47289 ±  3%  sched_debug.cfs_rq:/.min_vruntime.stddev
     46.15 ± 41%     +63.1%      75.28 ± 24%  sched_debug.cfs_rq:/.removed.load_avg.avg
    142.13 ± 18%     +25.5%     178.42 ± 10%  sched_debug.cfs_rq:/.removed.load_avg.stddev
     15.30 ± 40%     +73.1%      26.49 ± 19%  sched_debug.cfs_rq:/.removed.runnable_avg.avg
     51.68 ± 25%     +34.2%      69.36 ±  7%  sched_debug.cfs_rq:/.removed.runnable_avg.stddev
     15.30 ± 40%     +73.1%      26.49 ± 19%  sched_debug.cfs_rq:/.removed.util_avg.avg
     51.68 ± 25%     +34.2%      69.36 ±  7%  sched_debug.cfs_rq:/.removed.util_avg.stddev
   -163513          +101.1%    -328771        sched_debug.cfs_rq:/.spread0.min
     25962 ±  8%     +82.2%      47315 ±  3%  sched_debug.cfs_rq:/.spread0.stddev
    185.25 ±  6%     -30.6%     128.58 ± 46%  sched_debug.cfs_rq:/.util_est_enqueued.min
     91.74 ± 12%     +27.1%     116.58 ± 17%  sched_debug.cfs_rq:/.util_est_enqueued.stddev
     47444           +21.0%      57414        sched_debug.cpu.nr_switches.avg
     34565 ±  2%     -56.3%      15121 ± 11%  sched_debug.cpu.nr_switches.min
     26970 ±  2%     +64.1%      44271 ±  3%  sched_debug.cpu.nr_switches.stddev
     13.70            -2.0%      13.42        perf-stat.i.MPKI
 1.579e+09           +18.3%  1.868e+09        perf-stat.i.branch-instructions
   8703552            +7.4%    9347515        perf-stat.i.branch-misses
     11.92            -8.9        2.97 ±  5%  perf-stat.i.cache-miss-rate%
  13454425           -72.4%    3712356 ±  3%  perf-stat.i.cache-misses
 1.116e+08           +19.3%  1.332e+08        perf-stat.i.cache-references
     88572           +23.9%     109756        perf-stat.i.context-switches
     27.02           -19.0%      21.88        perf-stat.i.cpi
     15970          +309.6%      65417 ±  2%  perf-stat.i.cycles-between-cache-misses
 2.318e+09           +19.5%   2.77e+09        perf-stat.i.dTLB-loads
 1.199e+09           +21.0%  1.452e+09        perf-stat.i.dTLB-stores
 8.467e+09           +18.9%  1.006e+10        perf-stat.i.instructions
    146.35           -63.4%      53.57 ± 14%  perf-stat.i.metric.K/sec
     81.18           +19.6%      97.05        perf-stat.i.metric.M/sec
     86.90            -7.0       79.92 ±  2%  perf-stat.i.node-load-miss-rate%
   2738509           -77.0%     630505 ±  3%  perf-stat.i.node-load-misses
    407911 ±  5%     -56.2%     178562 ± 14%  perf-stat.i.node-loads
     89.44           -36.9       52.51        perf-stat.i.node-store-miss-rate%
   4392752           -92.0%     351197 ±  3%  perf-stat.i.node-store-misses
      0.53            -0.1        0.47        perf-stat.overall.branch-miss-rate%
     12.05            -9.3        2.76 ±  2%  perf-stat.overall.cache-miss-rate%
     25.96           -16.1%      21.79        perf-stat.overall.cpi
     16218          +264.0%      59031 ±  3%  perf-stat.overall.cycles-between-cache-misses
      0.00 ±  7%      -0.0        0.00 ±  8%  perf-stat.overall.dTLB-store-miss-rate%
      0.04           +19.1%       0.05        perf-stat.overall.ipc
     86.90            -9.4       77.51 ±  2%  perf-stat.overall.node-load-miss-rate%
     90.81           -48.4       42.45 ±  2%  perf-stat.overall.node-store-miss-rate%
 1.546e+09           +18.1%  1.826e+09        perf-stat.ps.branch-instructions
   8207103            +5.4%    8650221        perf-stat.ps.branch-misses
  13275967           -72.6%    3639102 ±  3%  perf-stat.ps.cache-misses
 1.102e+08           +19.8%  1.319e+08        perf-stat.ps.cache-references
     87466           +24.2%     108660        perf-stat.ps.context-switches
 2.275e+09           +19.5%  2.719e+09        perf-stat.ps.dTLB-loads
  1.18e+09           +21.1%   1.43e+09        perf-stat.ps.dTLB-stores
 8.294e+09           +18.7%  9.849e+09        perf-stat.ps.instructions
   2703084           -77.1%     618728 ±  3%  perf-stat.ps.node-load-misses
    407596 ±  5%     -55.8%     180316 ± 13%  perf-stat.ps.node-loads
   4340341           -92.0%     347078 ±  3%  perf-stat.ps.node-store-misses
 5.243e+11           +19.0%  6.239e+11        perf-stat.total.instructions
      0.02 ± 19%    +158.0%       0.06 ± 50%  perf-sched.sch_delay.avg.ms.__cond_resched.__wait_for_common.affine_move_task.__set_cpus_allowed_ptr.__sched_setaffinity
      0.01 ± 39%     +61.1%       0.02 ± 20%  perf-sched.sch_delay.avg.ms.__x64_sys_pause.do_syscall_64.entry_SYSCALL_64_after_hwframe.[unknown]
      0.34 ±183%    +373.4%       1.62 ± 73%  perf-sched.sch_delay.avg.ms.io_schedule.bit_wait_io.__wait_on_bit.out_of_line_wait_on_bit
      1.92 ±  8%    +102.3%       3.88 ± 13%  perf-sched.sch_delay.avg.ms.irq_thread.kthread.ret_from_fork
      0.94 ±148%    +253.3%       3.31 ± 76%  perf-sched.sch_delay.avg.ms.schedule_timeout.ext4_lazyinit_thread.part.0.kthread
      0.63 ±167%    +440.9%       3.42 ± 59%  perf-sched.sch_delay.avg.ms.schedule_timeout.io_schedule_timeout.__wait_for_common.submit_bio_wait
      1.92 ±  9%     +27.4%       2.44 ± 15%  perf-sched.sch_delay.avg.ms.schedule_timeout.kcompactd.kthread.ret_from_fork
      0.90 ±  6%     -20.1%       0.72 ±  6%  perf-sched.sch_delay.avg.ms.smpboot_thread_fn.kthread.ret_from_fork
      2.90 ± 11%    +220.7%       9.30 ± 18%  perf-sched.sch_delay.max.ms.__cond_resched.__wait_for_common.affine_move_task.__set_cpus_allowed_ptr.__sched_setaffinity
      6.26 ± 30%    +479.0%      36.23 ± 65%  perf-sched.sch_delay.max.ms.__x64_sys_pause.do_syscall_64.entry_SYSCALL_64_after_hwframe.[unknown]
      0.68 ±183%    +372.0%       3.23 ± 73%  perf-sched.sch_delay.max.ms.io_schedule.bit_wait_io.__wait_on_bit.out_of_line_wait_on_bit
      3.96 ± 14%    +135.0%       9.32 ± 13%  perf-sched.sch_delay.max.ms.irq_thread.kthread.ret_from_fork
      0.94 ±148%    +253.3%       3.31 ± 76%  perf-sched.sch_delay.max.ms.schedule_timeout.ext4_lazyinit_thread.part.0.kthread
      0.63 ±167%    +440.9%       3.42 ± 59%  perf-sched.sch_delay.max.ms.schedule_timeout.io_schedule_timeout.__wait_for_common.submit_bio_wait
      4.02 ±  7%     +89.6%       7.63 ± 12%  perf-sched.sch_delay.max.ms.schedule_timeout.kcompactd.kthread.ret_from_fork
      5.09 ± 13%    +101.8%      10.27 ±  8%  perf-sched.sch_delay.max.ms.smpboot_thread_fn.kthread.ret_from_fork
     12.96 ±  7%     +22.8%      15.91 ±  8%  perf-sched.sch_delay.max.ms.wait_for_partner.fifo_open.do_dentry_open.do_open
      5.38 ± 13%     +89.3%      10.19 ± 12%  perf-sched.sch_delay.max.ms.worker_thread.kthread.ret_from_fork
      2.22 ±  2%     -18.6%       1.80 ±  3%  perf-sched.total_wait_and_delay.average.ms
    462455           +23.1%     569289        perf-sched.total_wait_and_delay.count.ms
      2.20 ±  2%     -18.7%       1.79 ±  3%  perf-sched.total_wait_time.average.ms
     13.15 ±  7%     -10.5%      11.77 ±  3%  perf-sched.wait_and_delay.avg.ms.__cond_resched.__wait_for_common.affine_move_task.__set_cpus_allowed_ptr.__sched_setaffinity
    118.49 ± 34%     -77.9%      26.13 ± 32%  perf-sched.wait_and_delay.avg.ms.__cond_resched.generic_perform_write.__generic_file_write_iter.generic_file_write_iter.vfs_write
      4.14           -20.8%       3.28        perf-sched.wait_and_delay.avg.ms.do_signal_stop.get_signal.arch_do_signal_or_restart.exit_to_user_mode_loop
      0.02 ±  5%     +46.7%       0.03 ± 11%  perf-sched.wait_and_delay.avg.ms.exit_to_user_mode_loop.exit_to_user_mode_prepare.syscall_exit_to_user_mode.do_syscall_64
     18.24 ±142%    +977.7%     196.58 ±124%  perf-sched.wait_and_delay.avg.ms.kjournald2.kthread.ret_from_fork
      3.24 ±144%    +244.9%      11.18 ± 53%  perf-sched.wait_and_delay.avg.ms.schedule_timeout.ext4_lazyinit_thread.part.0.kthread
      7.83 ± 58%    +153.2%      19.83 ± 34%  perf-sched.wait_and_delay.count.__cond_resched.__alloc_pages.__folio_alloc.vma_alloc_folio.shmem_alloc_folio
    352.50 ±  8%      -9.2%     320.00        perf-sched.wait_and_delay.count.__cond_resched.__wait_for_common.affine_move_task.__set_cpus_allowed_ptr.__sched_setaffinity
     25.50 ± 11%     -28.8%      18.17 ± 20%  perf-sched.wait_and_delay.count.__cond_resched.down_write_killable.exec_mmap.begin_new_exec.load_elf_binary
     20.33 ± 16%    +263.1%      73.83 ± 31%  perf-sched.wait_and_delay.count.__cond_resched.generic_perform_write.__generic_file_write_iter.generic_file_write_iter.vfs_write
    151847           +23.8%     187914        perf-sched.wait_and_delay.count.__x64_sys_pause.do_syscall_64.entry_SYSCALL_64_after_hwframe.[unknown]
     75617           +23.0%      93028        perf-sched.wait_and_delay.count.do_signal_stop.get_signal.arch_do_signal_or_restart.exit_to_user_mode_loop
    230482           +23.1%     283813        perf-sched.wait_and_delay.count.exit_to_user_mode_loop.exit_to_user_mode_prepare.syscall_exit_to_user_mode.do_syscall_64
      1246 ± 20%     -55.5%     555.15 ± 32%  perf-sched.wait_and_delay.max.ms.__cond_resched.generic_perform_write.__generic_file_write_iter.generic_file_write_iter.vfs_write
     18.24 ±142%    +977.7%     196.58 ±124%  perf-sched.wait_and_delay.max.ms.kjournald2.kthread.ret_from_fork
      3.24 ±144%    +244.9%      11.18 ± 53%  perf-sched.wait_and_delay.max.ms.schedule_timeout.ext4_lazyinit_thread.part.0.kthread
     12.96 ±  7%     +22.8%      15.91 ±  8%  perf-sched.wait_and_delay.max.ms.wait_for_partner.fifo_open.do_dentry_open.do_open
     13.12 ±  7%     -10.8%      11.70 ±  4%  perf-sched.wait_time.avg.ms.__cond_resched.__wait_for_common.affine_move_task.__set_cpus_allowed_ptr.__sched_setaffinity
      0.01 ± 29%    +656.5%       0.09 ±172%  perf-sched.wait_time.avg.ms.__cond_resched.dput.step_into.link_path_walk.part
      0.01 ±  9%   +5204.7%       0.38 ±152%  perf-sched.wait_time.avg.ms.__cond_resched.dput.step_into.open_last_lookups.path_openat
    118.49 ± 34%     -77.9%      26.13 ± 32%  perf-sched.wait_time.avg.ms.__cond_resched.generic_perform_write.__generic_file_write_iter.generic_file_write_iter.vfs_write
      4.13           -20.9%       3.27        perf-sched.wait_time.avg.ms.do_signal_stop.get_signal.arch_do_signal_or_restart.exit_to_user_mode_loop
      0.02 ±  3%     +49.1%       0.03 ± 12%  perf-sched.wait_time.avg.ms.exit_to_user_mode_loop.exit_to_user_mode_prepare.syscall_exit_to_user_mode.do_syscall_64
      0.01 ±186%    +479.6%       0.05 ± 39%  perf-sched.wait_time.avg.ms.io_schedule.bit_wait_io.__wait_on_bit.out_of_line_wait_on_bit
     16.37 ±142%   +1077.5%     192.72 ±127%  perf-sched.wait_time.avg.ms.kjournald2.kthread.ret_from_fork
      2.30 ±142%    +241.5%       7.87 ± 43%  perf-sched.wait_time.avg.ms.schedule_timeout.ext4_lazyinit_thread.part.0.kthread
      0.03 ± 53%   +3495.7%       0.97 ±200%  perf-sched.wait_time.max.ms.__cond_resched.dput.step_into.link_path_walk.part
      0.01 ± 40%  +29892.9%       3.50 ±143%  perf-sched.wait_time.max.ms.__cond_resched.dput.step_into.open_last_lookups.path_openat
      1246 ± 20%     -55.5%     555.15 ± 32%  perf-sched.wait_time.max.ms.__cond_resched.generic_perform_write.__generic_file_write_iter.generic_file_write_iter.vfs_write
     24.58 ± 13%     +45.8%      35.84 ± 21%  perf-sched.wait_time.max.ms.do_signal_stop.get_signal.arch_do_signal_or_restart.exit_to_user_mode_loop
      0.02 ±186%    +475.9%       0.10 ± 39%  perf-sched.wait_time.max.ms.io_schedule.bit_wait_io.__wait_on_bit.out_of_line_wait_on_bit
     16.37 ±142%   +1077.5%     192.72 ±127%  perf-sched.wait_time.max.ms.kjournald2.kthread.ret_from_fork
      2.30 ±142%    +241.5%       7.87 ± 43%  perf-sched.wait_time.max.ms.schedule_timeout.ext4_lazyinit_thread.part.0.kthread
      9.10 ±  2%      -5.5        3.64 ±  2%  perf-profile.calltrace.cycles-pp.aa_may_signal.apparmor_task_kill.security_task_kill.kill_something_info.__x64_sys_kill
     94.25            -2.4       91.90        perf-profile.calltrace.cycles-pp.apparmor_task_kill.security_task_kill.kill_something_info.__x64_sys_kill.do_syscall_64
     94.39            -2.0       92.42        perf-profile.calltrace.cycles-pp.security_task_kill.kill_something_info.__x64_sys_kill.do_syscall_64.entry_SYSCALL_64_after_hwframe
     98.29            -1.0       97.34        perf-profile.calltrace.cycles-pp.kill_something_info.__x64_sys_kill.do_syscall_64.entry_SYSCALL_64_after_hwframe.kill
     98.33            -0.9       97.40        perf-profile.calltrace.cycles-pp.__x64_sys_kill.do_syscall_64.entry_SYSCALL_64_after_hwframe.kill
     98.46            -0.9       97.59        perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.kill
     98.46            -0.9       97.60        perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_after_hwframe.kill
     98.49            -0.9       97.63        perf-profile.calltrace.cycles-pp.kill
      0.66            +0.2        0.83 ±  2%  perf-profile.calltrace.cycles-pp.apparmor_task_kill.security_task_kill.kill_pid_info.kill_something_info.__x64_sys_kill
      0.67            +0.2        0.84 ±  2%  perf-profile.calltrace.cycles-pp.security_task_kill.kill_pid_info.kill_something_info.__x64_sys_kill.do_syscall_64
      0.72            +0.2        0.92        perf-profile.calltrace.cycles-pp.profile_signal_perm.aa_may_signal.apparmor_task_kill.security_task_kill.kill_something_info
      0.77            +0.2        0.98        perf-profile.calltrace.cycles-pp.kill_pid_info.kill_something_info.__x64_sys_kill.do_syscall_64.entry_SYSCALL_64_after_hwframe
      0.56 ±  3%      +0.3        0.81 ±  3%  perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.pause
      0.56 ±  3%      +0.3        0.82 ±  3%  perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_after_hwframe.pause
      0.57 ±  3%      +0.3        0.83 ±  3%  perf-profile.calltrace.cycles-pp.pause
      0.00            +0.6        0.56 ±  3%  perf-profile.calltrace.cycles-pp.asm_sysvec_apic_timer_interrupt.apparmor_task_kill.security_task_kill.kill_something_info.__x64_sys_kill
      0.08 ±223%      +0.6        0.70 ±  4%  perf-profile.calltrace.cycles-pp.syscall_exit_to_user_mode.do_syscall_64.entry_SYSCALL_64_after_hwframe.pause
      0.08 ±223%      +0.6        0.70 ±  4%  perf-profile.calltrace.cycles-pp.exit_to_user_mode_prepare.syscall_exit_to_user_mode.do_syscall_64.entry_SYSCALL_64_after_hwframe.pause
      0.00            +0.6        0.65 ±  3%  perf-profile.calltrace.cycles-pp.do_filp_open.do_sys_openat2.__x64_sys_openat.do_syscall_64.entry_SYSCALL_64_after_hwframe
      0.00            +0.6        0.65 ±  3%  perf-profile.calltrace.cycles-pp.path_openat.do_filp_open.do_sys_openat2.__x64_sys_openat.do_syscall_64
      2.44 ±  3%      +0.6        3.09 ±  3%  perf-profile.calltrace.cycles-pp.queued_read_lock_slowpath.kill_something_info.__x64_sys_kill.do_syscall_64.entry_SYSCALL_64_after_hwframe
      2.43 ±  3%      +0.7        3.09 ±  3%  perf-profile.calltrace.cycles-pp.native_queued_spin_lock_slowpath.queued_read_lock_slowpath.kill_something_info.__x64_sys_kill.do_syscall_64
      0.00            +0.7        0.67 ±  4%  perf-profile.calltrace.cycles-pp.get_signal.arch_do_signal_or_restart.exit_to_user_mode_loop.exit_to_user_mode_prepare.syscall_exit_to_user_mode
      0.00            +0.7        0.68 ±  3%  perf-profile.calltrace.cycles-pp.entry_SYSCALL_64_after_hwframe.open64
      0.00            +0.7        0.68 ±  3%  perf-profile.calltrace.cycles-pp.do_syscall_64.entry_SYSCALL_64_after_hwframe.open64
      0.00            +0.7        0.68 ±  3%  perf-profile.calltrace.cycles-pp.__x64_sys_openat.do_syscall_64.entry_SYSCALL_64_after_hwframe.open64
      0.00            +0.7        0.68 ±  3%  perf-profile.calltrace.cycles-pp.do_sys_openat2.__x64_sys_openat.do_syscall_64.entry_SYSCALL_64_after_hwframe.open64
      0.00            +0.7        0.68 ±  3%  perf-profile.calltrace.cycles-pp.open64
      0.00            +0.7        0.69 ±  4%  perf-profile.calltrace.cycles-pp.arch_do_signal_or_restart.exit_to_user_mode_loop.exit_to_user_mode_prepare.syscall_exit_to_user_mode.do_syscall_64
      0.00            +0.7        0.70 ±  5%  perf-profile.calltrace.cycles-pp.exit_to_user_mode_loop.exit_to_user_mode_prepare.syscall_exit_to_user_mode.do_syscall_64.entry_SYSCALL_64_after_hwframe
     38.83            +3.0       41.81        perf-profile.calltrace.cycles-pp.aa_get_task_label.apparmor_task_kill.security_task_kill.kill_something_info.__x64_sys_kill
      9.20 ±  2%      -5.5        3.72 ±  2%  perf-profile.children.cycles-pp.aa_may_signal
     94.97            -1.8       93.15        perf-profile.children.cycles-pp.apparmor_task_kill
     95.08            -1.8       93.28        perf-profile.children.cycles-pp.security_task_kill
     98.32            -0.9       97.38        perf-profile.children.cycles-pp.kill_something_info
     98.33            -0.9       97.40        perf-profile.children.cycles-pp.__x64_sys_kill
     98.50            -0.9       97.64        perf-profile.children.cycles-pp.kill
     99.90            -0.1       99.81        perf-profile.children.cycles-pp.do_syscall_64
     99.91            -0.1       99.82        perf-profile.children.cycles-pp.entry_SYSCALL_64_after_hwframe
      0.07            -0.0        0.06 ±  6%  perf-profile.children.cycles-pp.exit_notify
      0.06 ±  9%      +0.0        0.07        perf-profile.children.cycles-pp.audit_signal_info
      0.08 ±  5%      +0.0        0.10        perf-profile.children.cycles-pp.ksys_read
      0.06 ±  7%      +0.0        0.08 ±  4%  perf-profile.children.cycles-pp.audit_signal_info_syscall
      0.09 ±  5%      +0.0        0.11 ±  4%  perf-profile.children.cycles-pp.security_file_open
      0.09 ±  5%      +0.0        0.11 ±  4%  perf-profile.children.cycles-pp.apparmor_file_open
      0.05            +0.0        0.08 ±  6%  perf-profile.children.cycles-pp.apparmor_file_alloc_security
      0.05            +0.0        0.08        perf-profile.children.cycles-pp.security_file_alloc
      0.06 ±  7%      +0.0        0.10 ±  4%  perf-profile.children.cycles-pp.__x64_sys_pause
      0.06 ±  7%      +0.0        0.10 ±  5%  perf-profile.children.cycles-pp.complete_signal
      0.11 ±  4%      +0.0        0.14        perf-profile.children.cycles-pp.__task_pid_nr_ns
      0.06            +0.0        0.10 ±  8%  perf-profile.children.cycles-pp.__d_lookup
      0.06 ±  6%      +0.0        0.10 ±  8%  perf-profile.children.cycles-pp.open_last_lookups
      0.07            +0.0        0.11 ±  4%  perf-profile.children.cycles-pp.alloc_empty_file
      0.05 ±  7%      +0.0        0.09 ±  4%  perf-profile.children.cycles-pp.__fput
      0.07 ±  7%      +0.0        0.10 ±  4%  perf-profile.children.cycles-pp.__alloc_file
      0.07 ±  5%      +0.0        0.11 ±  5%  perf-profile.children.cycles-pp.try_to_wake_up
      0.02 ± 99%      +0.0        0.06 ±  7%  perf-profile.children.cycles-pp._raw_spin_lock_irqsave
      0.07            +0.0        0.11 ±  3%  perf-profile.children.cycles-pp.task_work_run
      0.13 ±  5%      +0.0        0.17 ±  2%  perf-profile.children.cycles-pp.__kill_pgrp_info
      0.09 ±  5%      +0.0        0.14 ±  4%  perf-profile.children.cycles-pp.do_send_sig_info
      0.09 ±  4%      +0.1        0.14 ±  2%  perf-profile.children.cycles-pp.__send_signal_locked
      0.17 ±  2%      +0.1        0.22 ±  5%  perf-profile.children.cycles-pp.do_dentry_open
      0.00            +0.1        0.05 ±  7%  perf-profile.children.cycles-pp.perf_mux_hrtimer_handler
      0.09            +0.1        0.14 ±  6%  perf-profile.children.cycles-pp.lookup_fast
      0.00            +0.1        0.06 ±  9%  perf-profile.children.cycles-pp.lockref_get
      0.00            +0.1        0.06 ±  9%  perf-profile.children.cycles-pp.lockref_put_or_lock
      0.00            +0.1        0.06 ±  9%  perf-profile.children.cycles-pp.ttwu_do_activate
      0.08 ±  4%      +0.1        0.14 ±  5%  perf-profile.children.cycles-pp.inode_permission
      0.08 ±  4%      +0.1        0.14 ±  5%  perf-profile.children.cycles-pp.proc_sys_permission
      0.07            +0.1        0.13 ±  5%  perf-profile.children.cycles-pp.switch_mm_irqs_off
      0.07 ±  6%      +0.1        0.13 ±  5%  perf-profile.children.cycles-pp._raw_spin_lock_irq
      0.41            +0.1        0.47        perf-profile.children.cycles-pp.check_kill_permission
      0.09            +0.1        0.15 ±  2%  perf-profile.children.cycles-pp.__close
      0.00            +0.1        0.06 ±  6%  perf-profile.children.cycles-pp.security_file_free
      0.00            +0.1        0.06 ±  6%  perf-profile.children.cycles-pp.apparmor_file_free_security
      0.00            +0.1        0.06 ±  6%  perf-profile.children.cycles-pp.walk_component
      0.18 ±  2%      +0.1        0.24 ±  4%  perf-profile.children.cycles-pp.do_open
      0.00            +0.1        0.07 ± 19%  perf-profile.children.cycles-pp.shmem_write_begin
      0.20            +0.1        0.27 ±  4%  perf-profile.children.cycles-pp.__hrtimer_run_queues
      0.22 ±  8%      +0.1        0.29 ±  6%  perf-profile.children.cycles-pp.do_signal_stop
      0.00            +0.1        0.07 ± 18%  perf-profile.children.cycles-pp.shmem_get_folio_gfp
      0.00            +0.1        0.08 ±  6%  perf-profile.children.cycles-pp.dput
      0.11 ±  3%      +0.1        0.19 ±  3%  perf-profile.children.cycles-pp.link_path_walk
      0.25            +0.1        0.32 ±  8%  perf-profile.children.cycles-pp.__sysvec_apic_timer_interrupt
      0.24            +0.1        0.32 ±  8%  perf-profile.children.cycles-pp.hrtimer_interrupt
      0.28            +0.1        0.38 ±  7%  perf-profile.children.cycles-pp.sysvec_apic_timer_interrupt
      0.20 ±  2%      +0.1        0.31 ±  3%  perf-profile.children.cycles-pp.__schedule
      0.20 ±  4%      +0.1        0.31 ±  3%  perf-profile.children.cycles-pp.schedule
      0.00            +0.1        0.13 ± 17%  perf-profile.children.cycles-pp.generic_perform_write
      0.00            +0.1        0.13 ± 18%  perf-profile.children.cycles-pp.__generic_file_write_iter
      0.00            +0.1        0.14 ± 17%  perf-profile.children.cycles-pp.generic_file_write_iter
      0.00            +0.2        0.16 ± 18%  perf-profile.children.cycles-pp.vfs_write
      0.31 ±  3%      +0.2        0.48 ±  3%  perf-profile.children.cycles-pp._raw_spin_lock
      0.00            +0.2        0.20 ± 20%  perf-profile.children.cycles-pp.ksys_write
      0.00            +0.2        0.20 ± 19%  perf-profile.children.cycles-pp.__libc_write
      0.46 ±  3%      +0.2        0.67 ±  4%  perf-profile.children.cycles-pp.get_signal
      0.74            +0.2        0.95        perf-profile.children.cycles-pp.profile_signal_perm
      0.00            +0.2        0.21 ± 19%  perf-profile.children.cycles-pp.record__pushfn
      0.02 ± 99%      +0.2        0.24 ± 19%  perf-profile.children.cycles-pp.__libc_start_main
      0.02 ± 99%      +0.2        0.24 ± 19%  perf-profile.children.cycles-pp.main
      0.02 ± 99%      +0.2        0.24 ± 19%  perf-profile.children.cycles-pp.run_builtin
      0.00            +0.2        0.21 ± 20%  perf-profile.children.cycles-pp.writen
      0.00            +0.2        0.21 ± 20%  perf-profile.children.cycles-pp.perf_mmap__push
      0.77            +0.2        0.98        perf-profile.children.cycles-pp.kill_pid_info
      0.00            +0.2        0.22 ± 20%  perf-profile.children.cycles-pp.record__mmap_read_evlist
      0.01 ±223%      +0.2        0.23 ± 20%  perf-profile.children.cycles-pp.cmd_record
      0.01 ±223%      +0.2        0.23 ± 20%  perf-profile.children.cycles-pp.__cmd_record
      0.50 ±  2%      +0.2        0.72 ±  4%  perf-profile.children.cycles-pp.arch_do_signal_or_restart
      0.43            +0.2        0.66 ±  3%  perf-profile.children.cycles-pp.path_openat
      0.44            +0.2        0.66 ±  3%  perf-profile.children.cycles-pp.do_filp_open
      0.45            +0.2        0.69 ±  3%  perf-profile.children.cycles-pp.open64
      0.46            +0.2        0.70 ±  3%  perf-profile.children.cycles-pp.__x64_sys_openat
      0.46            +0.2        0.70 ±  3%  perf-profile.children.cycles-pp.do_sys_openat2
      0.58 ±  3%      +0.3        0.83 ±  3%  perf-profile.children.cycles-pp.pause
      0.67 ±  2%      +0.3        1.00 ±  3%  perf-profile.children.cycles-pp.exit_to_user_mode_loop
      0.70 ±  2%      +0.3        1.03 ±  3%  perf-profile.children.cycles-pp.syscall_exit_to_user_mode
      0.70 ±  2%      +0.3        1.03 ±  3%  perf-profile.children.cycles-pp.exit_to_user_mode_prepare
      2.74 ±  3%      +0.7        3.49 ±  3%  perf-profile.children.cycles-pp.queued_read_lock_slowpath
      0.32            +0.8        1.13 ±  3%  perf-profile.children.cycles-pp.asm_sysvec_apic_timer_interrupt
      2.98 ±  3%      +0.8        3.82 ±  3%  perf-profile.children.cycles-pp.native_queued_spin_lock_slowpath
     39.13            +3.5       42.64        perf-profile.children.cycles-pp.aa_get_task_label
      8.44 ±  2%      -5.7        2.78 ±  2%  perf-profile.self.cycles-pp.aa_may_signal
      0.18 ±  2%      -0.0        0.17 ±  2%  perf-profile.self.cycles-pp.apparmor_capable
      0.10 ±  4%      +0.0        0.12 ±  3%  perf-profile.self.cycles-pp.security_task_kill
      0.05 ±  7%      +0.0        0.07        perf-profile.self.cycles-pp.audit_signal_info_syscall
      0.09 ±  7%      +0.0        0.11 ±  6%  perf-profile.self.cycles-pp.apparmor_file_open
      0.09 ±  5%      +0.0        0.11 ±  4%  perf-profile.self.cycles-pp.__task_pid_nr_ns
      0.05            +0.0        0.08 ±  6%  perf-profile.self.cycles-pp.apparmor_file_alloc_security
      0.09 ±  5%      +0.0        0.12 ±  4%  perf-profile.self.cycles-pp.kill_something_info
      0.11 ±  4%      +0.0        0.15 ±  6%  perf-profile.self.cycles-pp.check_kill_permission
      0.02 ± 99%      +0.0        0.06 ±  6%  perf-profile.self.cycles-pp._raw_spin_lock_irqsave
      0.12 ±  4%      +0.0        0.16 ±  3%  perf-profile.self.cycles-pp.__kill_pgrp_info
      0.01 ±223%      +0.0        0.06 ±  6%  perf-profile.self.cycles-pp.audit_signal_info
      0.05 ±  7%      +0.1        0.11 ±  3%  perf-profile.self.cycles-pp._raw_spin_lock_irq
      0.07            +0.1        0.13 ±  5%  perf-profile.self.cycles-pp.switch_mm_irqs_off
      0.00            +0.1        0.06 ±  6%  perf-profile.self.cycles-pp.apparmor_file_free_security
      0.10 ±  4%      +0.1        0.18 ±  2%  perf-profile.self.cycles-pp._raw_spin_lock
      0.72            +0.2        0.92        perf-profile.self.cycles-pp.profile_signal_perm
      2.97 ±  3%      +0.8        3.80 ±  3%  perf-profile.self.cycles-pp.native_queued_spin_lock_slowpath
     39.00            +3.4       42.44        perf-profile.self.cycles-pp.aa_get_task_label




Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.

Patch

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 782e14f62201..980abfbd92ed 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -6,6 +6,8 @@ 
 #include <linux/seq_file.h>
 #include <linux/poll.h>
 
+#include <uapi/linux/trace_mmap.h>
+
 struct trace_buffer;
 struct ring_buffer_iter;
 
@@ -211,4 +213,9 @@  int trace_rb_cpu_prepare(unsigned int cpu, struct hlist_node *node);
 #define trace_rb_cpu_prepare	NULL
 #endif
 
+int ring_buffer_map(struct trace_buffer *buffer, int cpu);
+int ring_buffer_unmap(struct trace_buffer *buffer, int cpu);
+struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
+				   unsigned long pgoff);
+int ring_buffer_map_get_reader_page(struct trace_buffer *buffer, int cpu);
 #endif /* _LINUX_RING_BUFFER_H */
diff --git a/include/uapi/linux/trace_mmap.h b/include/uapi/linux/trace_mmap.h
new file mode 100644
index 000000000000..653176cc50bc
--- /dev/null
+++ b/include/uapi/linux/trace_mmap.h
@@ -0,0 +1,26 @@ 
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_TRACE_MMAP_H_
+#define _UAPI_TRACE_MMAP_H_
+
+#include <linux/types.h>
+
+struct ring_buffer_meta {
+	unsigned long	entries;
+	unsigned long	overrun;
+	unsigned long	read;
+
+	unsigned long	pages_touched;
+	unsigned long	pages_lost;
+	unsigned long	pages_read;
+
+	__u32		meta_page_size;
+	__u32		nr_data_pages;	/* Number of pages in the ring-buffer */
+
+	struct reader_page {
+		__u32	id;		/* Reader page ID from 0 to nr_data_pages - 1 */
+		__u32	read;		/* Number of bytes read on the reader page */
+		unsigned long	lost_events; /* Events lost at the time of the reader swap */
+	} reader_page;
+};
+
+#endif /* _UAPI_TRACE_MMAP_H_ */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 834b361a4a66..0e8137161955 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -332,6 +332,7 @@  struct buffer_page {
 	local_t		 entries;	/* entries on this page */
 	unsigned long	 real_end;	/* real end of data */
 	struct buffer_data_page *page;	/* Actual data page */
+	u32		 id;		/* ID for external mapping */
 };
 
 /*
@@ -523,6 +524,12 @@  struct ring_buffer_per_cpu {
 	rb_time_t			before_stamp;
 	u64				event_stamp[MAX_NEST];
 	u64				read_stamp;
+
+	int				mapped;
+	struct mutex			mapping_lock;
+	unsigned long			*page_ids;	/* ID to addr */
+	struct ring_buffer_meta		*meta_page;
+
 	/* ring buffer pages to update, > 0 to add, < 0 to remove */
 	long				nr_pages_to_update;
 	struct list_head		new_pages; /* new pages to add */
@@ -1561,6 +1568,13 @@  static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer,
 		/* Again, either we update tail_page or an interrupt does */
 		(void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page);
 	}
+
+	if (READ_ONCE(cpu_buffer->mapped)) {
+		/* Ensure the meta_page is ready */
+		smp_rmb();
+		WRITE_ONCE(cpu_buffer->meta_page->pages_touched,
+			   local_read(&cpu_buffer->pages_touched));
+	}
 }
 
 static void rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer,
@@ -1724,6 +1738,7 @@  rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
 	init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
 	init_waitqueue_head(&cpu_buffer->irq_work.waiters);
 	init_waitqueue_head(&cpu_buffer->irq_work.full_waiters);
+	mutex_init(&cpu_buffer->mapping_lock);
 
 	bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
 			    GFP_KERNEL, cpu_to_node(cpu));
@@ -2168,7 +2183,6 @@  int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
 	/* prevent another thread from changing buffer sizes */
 	mutex_lock(&buffer->mutex);
 
-
 	if (cpu_id == RING_BUFFER_ALL_CPUS) {
 		/*
 		 * Don't succeed if resizing is disabled, as a reader might be
@@ -2518,6 +2532,15 @@  rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
 		local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
 		local_inc(&cpu_buffer->pages_lost);
 
+		if (READ_ONCE(cpu_buffer->mapped)) {
+			/* Ensure the meta_page is ready */
+			smp_rmb();
+			WRITE_ONCE(cpu_buffer->meta_page->overrun,
+				   local_read(&cpu_buffer->overrun));
+			WRITE_ONCE(cpu_buffer->meta_page->pages_lost,
+				   local_read(&cpu_buffer->pages_lost));
+		}
+
 		/*
 		 * The entries will be zeroed out when we move the
 		 * tail page.
@@ -3180,6 +3203,14 @@  static inline void rb_event_discard(struct ring_buffer_event *event)
 static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	local_inc(&cpu_buffer->entries);
+
+	if (READ_ONCE(cpu_buffer->mapped)) {
+		/* Ensure the meta_page is ready */
+		smp_rmb();
+		WRITE_ONCE(cpu_buffer->meta_page->entries,
+			   local_read(&cpu_buffer->entries));
+	}
+
 	rb_end_commit(cpu_buffer);
 }
 
@@ -3483,7 +3514,7 @@  static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
 		return;
 
 	/*
-	 * If this interrupted another event, 
+	 * If this interrupted another event,
 	 */
 	if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
 		goto out;
@@ -4655,6 +4686,13 @@  rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 		cpu_buffer->last_overrun = overwrite;
 	}
 
+	if (cpu_buffer->mapped) {
+		WRITE_ONCE(cpu_buffer->meta_page->reader_page.read, 0);
+		WRITE_ONCE(cpu_buffer->meta_page->reader_page.id, reader->id);
+		WRITE_ONCE(cpu_buffer->meta_page->reader_page.lost_events, cpu_buffer->lost_events);
+		WRITE_ONCE(cpu_buffer->meta_page->pages_read, local_read(&cpu_buffer->pages_read));
+	}
+
 	goto again;
 
  out:
@@ -4721,6 +4759,13 @@  static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 
 	length = rb_event_length(event);
 	cpu_buffer->reader_page->read += length;
+
+	if (cpu_buffer->mapped) {
+		WRITE_ONCE(cpu_buffer->meta_page->reader_page.read,
+			   cpu_buffer->reader_page->read);
+		WRITE_ONCE(cpu_buffer->meta_page->read,
+			   cpu_buffer->read);
+	}
 }
 
 static void rb_advance_iter(struct ring_buffer_iter *iter)
@@ -5242,6 +5287,19 @@  unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_size);
 
+static void rb_reset_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	struct ring_buffer_meta *meta = cpu_buffer->meta_page;
+
+	WRITE_ONCE(meta->entries, 0);
+	WRITE_ONCE(meta->overrun, 0);
+	WRITE_ONCE(meta->read, cpu_buffer->read);
+	WRITE_ONCE(meta->pages_touched, 0);
+	WRITE_ONCE(meta->pages_lost, 0);
+	WRITE_ONCE(meta->pages_read, local_read(&cpu_buffer->pages_read));
+	WRITE_ONCE(meta->reader_page.read, cpu_buffer->reader_page->read);
+}
+
 static void
 rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 {
@@ -5288,6 +5346,9 @@  rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->lost_events = 0;
 	cpu_buffer->last_overrun = 0;
 
+	if (cpu_buffer->mapped)
+		rb_reset_meta_page(cpu_buffer);
+
 	rb_head_page_activate(cpu_buffer);
 }
 
@@ -5502,6 +5563,11 @@  int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
 	cpu_buffer_a = buffer_a->buffers[cpu];
 	cpu_buffer_b = buffer_b->buffers[cpu];
 
+	if (READ_ONCE(cpu_buffer_a->mapped) || READ_ONCE(cpu_buffer_b->mapped)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
 	/* At least make sure the two buffers are somewhat the same */
 	if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
 		goto out;
@@ -5735,7 +5801,8 @@  int ring_buffer_read_page(struct trace_buffer *buffer,
 	 * Otherwise, we can simply swap the page with the one passed in.
 	 */
 	if (read || (len < (commit - read)) ||
-	    cpu_buffer->reader_page == cpu_buffer->commit_page) {
+	    cpu_buffer->reader_page == cpu_buffer->commit_page ||
+	    cpu_buffer->mapped) {
 		struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
 		unsigned int rpos = read;
 		unsigned int pos = 0;
@@ -5852,6 +5919,255 @@  int ring_buffer_read_page(struct trace_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_page);
 
+static void rb_free_page_ids(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	int i;
+
+	for (i = 0; i < cpu_buffer->nr_pages + 1; i++)
+		virt_to_page(cpu_buffer->page_ids[i])->mapping = NULL;
+
+	kfree(cpu_buffer->page_ids);
+	cpu_buffer->page_ids = NULL;
+}
+
+static int rb_alloc_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	if (cpu_buffer->meta_page)
+		return 0;
+
+	cpu_buffer->meta_page = page_to_virt(alloc_page(GFP_USER));
+	if (!cpu_buffer->meta_page)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void rb_free_meta_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	unsigned long addr = (unsigned long)cpu_buffer->meta_page;
+
+	virt_to_page(addr)->mapping = NULL;
+	free_page(addr);
+	cpu_buffer->meta_page = NULL;
+}
+
+static void rb_setup_ids_meta_page(struct ring_buffer_per_cpu *cpu_buffer,
+				   unsigned long *page_ids)
+{
+	struct ring_buffer_meta *meta = cpu_buffer->meta_page;
+	unsigned int nr_data_pages = cpu_buffer->nr_pages + 1;
+	struct buffer_page *first_page, *bpage;
+	int id = 0;
+
+	page_ids[id] = (unsigned long)cpu_buffer->reader_page->page;
+	cpu_buffer->reader_page->id = id++;
+
+	first_page = bpage = rb_set_head_page(cpu_buffer);
+	do {
+		if (id >= nr_data_pages) {
+			WARN_ON(1);
+			break;
+		}
+
+		page_ids[id] = (unsigned long)bpage->page;
+		bpage->id = id;
+
+		rb_inc_page(&bpage);
+		id++;
+	} while (bpage != first_page);
+
+	/* install page ID to kern VA translation */
+	cpu_buffer->page_ids = page_ids;
+
+	meta->meta_page_size = PAGE_SIZE;
+	meta->nr_data_pages = nr_data_pages;
+	meta->reader_page.id = cpu_buffer->reader_page->id;
+	rb_reset_meta_page(cpu_buffer);
+}
+
+static inline struct ring_buffer_per_cpu *
+rb_get_mapped_buffer(struct trace_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return ERR_PTR(-EINVAL);
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	mutex_lock(&cpu_buffer->mapping_lock);
+
+	if (!cpu_buffer->mapped) {
+		mutex_unlock(&cpu_buffer->mapping_lock);
+		return ERR_PTR(-ENODEV);
+	}
+
+	return cpu_buffer;
+}
+
+static inline void rb_put_mapped_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	mutex_unlock(&cpu_buffer->mapping_lock);
+}
+
+int ring_buffer_map(struct trace_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long flags, *page_ids;
+	int err = 0;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return -EINVAL;
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	mutex_lock(&cpu_buffer->mapping_lock);
+
+	if (cpu_buffer->mapped) {
+		WRITE_ONCE(cpu_buffer->mapped, cpu_buffer->mapped + 1);
+		goto unlock;
+	}
+
+	/* prevent another thread from changing buffer sizes */
+	mutex_lock(&buffer->mutex);
+	atomic_inc(&cpu_buffer->resize_disabled);
+	mutex_unlock(&buffer->mutex);
+
+	err = rb_alloc_meta_page(cpu_buffer);
+	if (err) {
+		atomic_dec(&cpu_buffer->resize_disabled);
+		goto unlock;
+	}
+
+	/* page_ids include the reader page while nr_pages does not */
+	page_ids = kzalloc(sizeof(*page_ids) * (cpu_buffer->nr_pages + 1),
+			   GFP_KERNEL);
+	if (!page_ids) {
+		rb_free_meta_page(cpu_buffer);
+		atomic_dec(&cpu_buffer->resize_disabled);
+		err = -ENOMEM;
+		goto unlock;
+	}
+
+	/*
+	 * Lock all readers to block any page swap until the page IDs are
+	 * assigned.
+	 */
+	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+	rb_setup_ids_meta_page(cpu_buffer, page_ids);
+	/*
+	 * Ensure the writer will observe the meta-page before
+	 * cpu_buffer->mapped.
+	 */
+	smp_wmb();
+	WRITE_ONCE(cpu_buffer->mapped, 1);
+
+	/* Init meta_page values unless the writer did it already */
+	cmpxchg(&cpu_buffer->meta_page->entries, 0,
+		local_read(&cpu_buffer->entries));
+	cmpxchg(&cpu_buffer->meta_page->overrun, 0,
+		local_read(&cpu_buffer->overrun));
+	cmpxchg(&cpu_buffer->meta_page->pages_touched, 0,
+		local_read(&cpu_buffer->pages_touched));
+	cmpxchg(&cpu_buffer->meta_page->pages_lost, 0,
+		local_read(&cpu_buffer->pages_lost));
+
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+unlock:
+	mutex_unlock(&cpu_buffer->mapping_lock);
+
+	return err;
+}
+
+int ring_buffer_unmap(struct trace_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	int err = 0;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return -EINVAL;
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	mutex_lock(&cpu_buffer->mapping_lock);
+
+	if (!cpu_buffer->mapped) {
+		err = -ENODEV;
+		goto unlock;
+	}
+
+	WRITE_ONCE(cpu_buffer->mapped, cpu_buffer->mapped - 1);
+	if (!cpu_buffer->mapped) {
+		/* Wait the writer and readers to observe !mapped */
+		synchronize_rcu();
+
+		rb_free_page_ids(cpu_buffer);
+		rb_free_meta_page(cpu_buffer);
+		atomic_dec(&cpu_buffer->resize_disabled);
+	}
+
+unlock:
+	mutex_unlock(&cpu_buffer->mapping_lock);
+
+	return err;
+}
+
+/*
+ *   +--------------+
+ *   |   meta page  |  pgoff=0
+ *   +--------------+
+ *   |  data page1  |  page_ids=0
+ *   +--------------+
+ *   |  data page2  |  page_ids=1
+ *         ...
+ */
+struct page *ring_buffer_map_fault(struct trace_buffer *buffer, int cpu,
+				   unsigned long pgoff)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+
+	if (!pgoff)
+		return virt_to_page((void *)cpu_buffer->meta_page);
+
+	pgoff--;
+	if (pgoff > cpu_buffer->nr_pages)
+		return NULL;
+
+	return virt_to_page(cpu_buffer->page_ids[pgoff]);
+}
+
+int ring_buffer_map_get_reader_page(struct trace_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long reader_size, flags;
+
+	cpu_buffer = rb_get_mapped_buffer(buffer, cpu);
+	if (IS_ERR(cpu_buffer))
+		return (int)PTR_ERR(cpu_buffer);
+
+	raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+consume:
+	if (rb_per_cpu_empty(cpu_buffer))
+		goto out;
+	reader_size = rb_page_size(cpu_buffer->reader_page);
+	if (cpu_buffer->reader_page->read < reader_size) {
+		while (cpu_buffer->reader_page->read < reader_size)
+			rb_advance_reader(cpu_buffer);
+		goto out;
+	}
+
+	if (WARN_ON(!rb_get_reader_page(cpu_buffer)))
+		goto out;
+
+	goto consume;
+out:
+	raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+	rb_put_mapped_buffer(cpu_buffer);
+
+	return 0;
+}
+
 /*
  * We only allocate new buffers, never free them if the CPU goes down.
  * If we were to free the buffer, then the user would lose any trace that was in