diff mbox series

[v3,02/10] mm: Allow per-VMA locks on file-backed VMAs

Message ID 20230724185410.1124082-3-willy@infradead.org (mailing list archive)
State New, archived
Headers show
Series Handle most file-backed faults under the VMA lock | expand

Commit Message

Matthew Wilcox July 24, 2023, 6:54 p.m. UTC
Remove the TCP layering violation by allowing per-VMA locks on all VMAs.
The fault path will immediately fail in handle_mm_fault().  There may be
a small performance reduction from this patch as a little unnecessary work
will be done on each page fault.  See later patches for the improvement.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Cc: Arjun Roy <arjunroy@google.com>
Cc: Eric Dumazet <edumazet@google.com>
---
 MAINTAINERS            |  1 -
 include/linux/net_mm.h | 17 -----------------
 include/net/tcp.h      |  1 -
 mm/memory.c            | 12 ++++++------
 net/ipv4/tcp.c         | 11 ++++-------
 5 files changed, 10 insertions(+), 32 deletions(-)
 delete mode 100644 include/linux/net_mm.h

Comments

Conor Dooley July 25, 2023, 12:57 p.m. UTC | #1
Hey,

On Mon, Jul 24, 2023 at 07:54:02PM +0100, Matthew Wilcox (Oracle) wrote:
> Remove the TCP layering violation by allowing per-VMA locks on all VMAs.
> The fault path will immediately fail in handle_mm_fault().  There may be
> a small performance reduction from this patch as a little unnecessary work
> will be done on each page fault.  See later patches for the improvement.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Reviewed-by: Suren Baghdasaryan <surenb@google.com>
> Cc: Arjun Roy <arjunroy@google.com>
> Cc: Eric Dumazet <edumazet@google.com>

Unless my bisection has gone awry, this is causing boot failures for me
in today's linux-next w/ a splat like so.
Full log and bisection log below, it reproduces on this hardware using
the standard riscv 64-bit defconfig, although my bisection was done with
some more debugging stuff enabled.

	=====================================
	WARNING: bad unlock balance detected!
	6.5.0-rc3-next-20230725 #1 Not tainted
	-------------------------------------
	modprobe/58 is trying to release lock (&vma->vm_lock->lock) at:
	[<ffffffff8000dcfa>] vma_end_read+0x60/0xb8
	but there are no more locks to release!

	other info that might help us debug this:
	1 lock held by modprobe/58:
	 #0: ffffffff8169daa0 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire+0x0/0x2e
	
	stack backtrace:
	CPU: 3 PID: 58 Comm: modprobe Not tainted 6.5.0-rc3-next-20230725 #1
	Hardware name: Microchip PolarFire-SoC Icicle Kit (DT)
	Call Trace:
	[<ffffffff80006b48>] show_stack+0x2c/0x38
	[<ffffffff80b49bb2>] dump_stack_lvl+0x60/0x82
	[<ffffffff80b49be8>] dump_stack+0x14/0x1c
	[<ffffffff80089d5a>] print_unlock_imbalance_bug+0x1cc/0x1d6
	[<ffffffff80085e4a>] lock_release+0x236/0x3ae
	[<ffffffff8007e464>] up_read+0x16/0x26
	[<ffffffff8000dcfa>] vma_end_read+0x60/0xb8
	[<ffffffff8000d9ca>] handle_page_fault+0x19e/0x3a4
	[<ffffffff80b4a18c>] do_page_fault+0x20/0x56
	[<ffffffff80004434>] ret_from_exception+0x0/0x64
	------------[ cut here ]------------
	DEBUG_RWSEMS_WARN_ON(tmp < 0): count = 0xffffffffffffff00, magic = 0xffffffe7c63c1558, owner = 0x1, curr 0xffffffe7c0add600, list empty
	WARNING: CPU: 3 PID: 58 at kernel/locking/rwsem.c:1348 __up_read+0x1c2/0x224
	Modules linked in:
	CPU: 3 PID: 58 Comm: modprobe Not tainted 6.5.0-rc3-next-20230725 #1
	Hardware name: Microchip PolarFire-SoC Icicle Kit (DT)
	epc : __up_read+0x1c2/0x224
	 ra : __up_read+0x1c2/0x224
	epc : ffffffff8007e636 ra : ffffffff8007e636 sp : ffffffc8002b3df0
	 gp : ffffffff818ad0f0 tp : ffffffe7c0add600 t0 : ffffffc8002b3ab8
	 t1 : 0000000000000044 t2 : 5357525f47554245 s0 : ffffffc8002b3e20
	 s1 : ffffffffffffff00 a0 : b32cfaf25517f300 a1 : b32cfaf25517f300
	 a2 : b32cfaf25517f300 a3 : c0000000ffffefff a4 : 00000fff00000000
	 a5 : 0000000000000004 a6 : ffffffff81643e90 a7 : 0000000000000038
	 s2 : ffffffe7c63c1560 s3 : ffffffff81ae3578 s4 : ffffffe7c63c1558
	 s5 : ffffffc8002b3ee0 s6 : 0000000000000254 s7 : ffffffe7c63c05a0
	 s8 : 0000003f97687780 s9 : ffffffffeffffef5 s10: 000000006474e553
	 s11: 0000000000000000 t3 : ffffffff8259430f t4 : ffffffff8259430f
	 t5 : ffffffff82594310 t6 : ffffffff8259430f
	status: 0000000200000120 badaddr: 0000000000000000 cause: 0000000000000003
	[<ffffffff8007e636>] __up_read+0x1c2/0x224
	[<ffffffff8007e46a>] up_read+0x1c/0x26
	[<ffffffff8000dcfa>] vma_end_read+0x60/0xb8
	[<ffffffff8000d9ca>] handle_page_fault+0x19e/0x3a4
	[<ffffffff80b4a18c>] do_page_fault+0x20/0x56
	[<ffffffff80004434>] ret_from_exception+0x0/0x64
	irq event stamp: 371
	hardirqs last  enabled at (371): [<ffffffff8000d862>] handle_page_fault+0x36/0x3a4
	hardirqs last disabled at (370): [<ffffffff80b4aef6>] irqentry_enter+0x16/0x4c
	softirqs last  enabled at (180): [<ffffffff80b56566>] __do_softirq+0x57e/0x66e
	softirqs last disabled at (173): [<ffffffff8001f288>] __irq_exit_rcu+0x8c/0x14c
	---[ end trace 0000000000000000 ]---

Thanks,
Conor.

git bisect start
# status: waiting for both good and bad commits
# good: [06c2afb862f9da8dc5efa4b6076a0e48c3fbaaa5] Linux 6.5-rc1
git bisect good 06c2afb862f9da8dc5efa4b6076a0e48c3fbaaa5
# status: waiting for bad commit, 1 good commit known
# bad: [1e25dd7772483f477f79986d956028e9f47f990a] Add linux-next specific files for 20230725
git bisect bad 1e25dd7772483f477f79986d956028e9f47f990a
# bad: [73002c8a551db94daa4124dbe61a3340999c556e] Merge branch 'master' of git://linuxtv.org/mchehab/media-next.git
git bisect bad 73002c8a551db94daa4124dbe61a3340999c556e
# bad: [c37659958e0ff5aaffae86df9e696638f58cd3a3] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/joel/bmc.git
git bisect bad c37659958e0ff5aaffae86df9e696638f58cd3a3
# good: [bdd1d82e7d02bd2764a68a5cc54533dfc2ba452a] Merge tag 'io_uring-6.5-2023-07-21' of git://git.kernel.dk/linux
git bisect good bdd1d82e7d02bd2764a68a5cc54533dfc2ba452a
# good: [d7b4fea201483d40b7cb1f522915531c6e6b168b] mm/page_io: convert count_swpout_vm_event() to take in a folio
git bisect good d7b4fea201483d40b7cb1f522915531c6e6b168b
# good: [46dce626e248cc91b0612f7c6c31b15f48899465] Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git
git bisect good 46dce626e248cc91b0612f7c6c31b15f48899465
# bad: [e3fb201f858b63a7abf71b5bb563923b6424b98a] Merge branch 'mm-nonmm-unstable' into mm-everything
git bisect bad e3fb201f858b63a7abf71b5bb563923b6424b98a
# good: [1e0fb16464b68936b1b901a7b908c37d6f587b7c] arm64: smccc: replace custom COUNT_ARGS() & CONCATENATE() implementations
git bisect good 1e0fb16464b68936b1b901a7b908c37d6f587b7c
# good: [de32a89e11bab2e9c8c881fbbbad84a492d5ce9c] mm: set up vma iterator for vma_iter_prealloc() calls
git bisect good de32a89e11bab2e9c8c881fbbbad84a492d5ce9c
# bad: [add29438034569277ab967199af39fe42a4d858c] mm: handle PUD faults under the VMA lock
git bisect bad add29438034569277ab967199af39fe42a4d858c
# good: [ecc821b0404e19eebf393f7a5a73d80c3faa69e4] maple_tree: reduce resets during store setup
git bisect good ecc821b0404e19eebf393f7a5a73d80c3faa69e4
# good: [4e9c4f4a2949a2b47917647283799bb5952f2290] mm: remove CONFIG_PER_VMA_LOCK ifdefs
git bisect good 4e9c4f4a2949a2b47917647283799bb5952f2290
# bad: [9a709e2cca6097f66aaba411bd8758cf43a39eb9] mm: move FAULT_FLAG_VMA_LOCK check from handle_mm_fault()
git bisect bad 9a709e2cca6097f66aaba411bd8758cf43a39eb9
# bad: [78b696bb953cb0b553e1cf9084a6e09580aa4e2a] mm: allow per-VMA locks on file-backed VMAs
git bisect bad 78b696bb953cb0b553e1cf9084a6e09580aa4e2a
# first bad commit: [78b696bb953cb0b553e1cf9084a6e09580aa4e2a] mm: allow per-VMA locks on file-backed VMAs

[    0.000000] Linux version 6.5.0-rc3-00283-g78b696bb953c (conor@wendy) (ClangBuiltLinux clang version 15.0.7 (/home/conor/stuff/dev/llvm/clang 8dfdcc7b7bf66834a761bd8de445840ef68e4d1a), ClangBuiltLinux LLD 15.0.7) #1 SMP PREEMPT @666
[    0.000000] Machine model: Microchip PolarFire-SoC Icicle Kit
[    0.000000] SBI specification v1.0 detected
[    0.000000] SBI implementation ID=0x1 Version=0x10002
[    0.000000] SBI TIME extension detected
[    0.000000] SBI IPI extension detected
[    0.000000] SBI RFENCE extension detected
[    0.000000] SBI SRST extension detected
[    0.000000] earlycon: ns16550a0 at MMIO32 0x0000000020100000 (options '115200n8')
[    0.000000] printk: bootconsole [ns16550a0] enabled
[    0.000000] printk: debug: skip boot console de-registration.
[    0.000000] efi: UEFI not found.
[    0.000000] OF: reserved mem: 0x00000000bfc00000..0x00000000bfffffff (4096 KiB) nomap non-reusable region@BFC00000
[    0.000000] Zone ranges:
[    0.000000]   DMA32    [mem 0x0000000080000000-0x00000000ffffffff]
[    0.000000]   Normal   [mem 0x0000000100000000-0x000000107fffffff]
[    0.000000] Movable zone start for each node
[    0.000000] Early memory node ranges
[    0.000000]   node   0: [mem 0x0000000080000000-0x00000000bfbfffff]
[    0.000000]   node   0: [mem 0x00000000bfc00000-0x00000000bfffffff]
[    0.000000]   node   0: [mem 0x0000001040000000-0x000000107fffffff]
[    0.000000] Initmem setup node 0 [mem 0x0000000080000000-0x000000107fffffff]
[    0.000000] SBI HSM extension detected
[    0.000000] CPU with hartid=0 is not available
[    0.000000] riscv: base ISA extensions acdfim
[    0.000000] riscv: ELF capabilities acdfim
[    0.000000] percpu: Embedded 30 pages/cpu s84320 r8192 d30368 u122880
[    0.000000] Kernel command line: earlycon keep_bootcon root=/dev/mmcblk1p2 rootdelay=10 reboot=cold
[    0.000000] Dentry cache hash table entries: 262144 (order: 9, 2097152 bytes, linear)
[    0.000000] Inode-cache hash table entries: 131072 (order: 8, 1048576 bytes, linear)
[    0.000000] Built 1 zonelists, mobility grouping on.  Total pages: 517120
[    0.000000] mem auto-init: stack:all(zero), heap alloc:off, heap free:off
[    0.000000] software IO TLB: area num 4.
[    0.000000] software IO TLB: mapped [mem 0x00000000bbc00000-0x00000000bfc00000] (64MB)
[    0.000000] Virtual kernel memory layout:
[    0.000000]       fixmap : 0xffffffc6fea00000 - 0xffffffc6ff000000   (6144 kB)
[    0.000000]       pci io : 0xffffffc6ff000000 - 0xffffffc700000000   (  16 MB)
[    0.000000]      vmemmap : 0xffffffc700000000 - 0xffffffc800000000   (4096 MB)
[    0.000000]      vmalloc : 0xffffffc800000000 - 0xffffffd800000000   (  64 GB)
[    0.000000]      modules : 0xffffffff025d4000 - 0xffffffff80000000   (2010 MB)
[    0.000000]       lowmem : 0xffffffd800000000 - 0xffffffe800000000   (  64 GB)
[    0.000000]       kernel : 0xffffffff80000000 - 0xffffffffffffffff   (2047 MB)
[    0.000000] Memory: 1913560K/2097152K available (11570K kernel code, 5054K rwdata, 4096K rodata, 6069K init, 11226K bss, 183592K reserved, 0K cma-reserved)
[    0.000000] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=4, Nodes=1
[    0.000000] trace event string verifier disabled
[    0.000000] Running RCU self tests
[    0.000000] Running RCU synchronous self tests
[    0.000000] rcu: Preemptible hierarchical RCU implementation.
[    0.000000] rcu:     RCU lockdep checking is enabled.
[    0.000000] rcu:     RCU restricting CPUs from NR_CPUS=64 to nr_cpu_ids=4.
[    0.000000] rcu:     RCU debug extended QS entry/exit.
[    0.000000]  Trampoline variant of Tasks RCU enabled.
[    0.000000]  Tracing variant of Tasks RCU enabled.
[    0.000000] rcu: RCU calculated value of scheduler-enlistment delay is 25 jiffies.
[    0.000000] rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=4
[    0.000000] Running RCU synchronous self tests
[    0.000000] NR_IRQS: 64, nr_irqs: 64, preallocated irqs: 0
[    0.000000] riscv-intc: unable to find hart id for /cpus/cpu@0/interrupt-controller
[    0.000000] riscv-intc: 64 local interrupts mapped
[    0.000000] plic: interrupt-controller@c000000: mapped 186 interrupts with 4 handlers for 9 contexts.
[    0.000000] riscv: providing IPIs using SBI IPI extension
[    0.000000] rcu: srcu_init: Setting srcu_struct sizes based on contention.
[    0.000000] clocksource: riscv_clocksource: mask: 0xffffffffffffffff max_cycles: 0x1d854df40, max_idle_ns: 3526361616960 ns
[    0.000003] sched_clock: 64 bits at 1000kHz, resolution 1000ns, wraps every 2199023255500ns
[    0.010570] Console: colour dummy device 80x25
[    0.015820] printk: console [tty0] enabled
[    0.020660] Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar
[    0.029527] ... MAX_LOCKDEP_SUBCLASSES:  8
[    0.034262] ... MAX_LOCK_DEPTH:          48
[    0.039096] ... MAX_LOCKDEP_KEYS:        8192
[    0.044121] ... CLASSHASH_SIZE:          4096
[    0.049148] ... MAX_LOCKDEP_ENTRIES:     32768
[    0.054277] ... MAX_LOCKDEP_CHAINS:      65536
[    0.059407] ... CHAINHASH_SIZE:          32768
[    0.064539]  memory used by lock dependency info: 6493 kB
[    0.070744]  memory used for stack traces: 4224 kB
[    0.076263]  per task-struct memory footprint: 1920 bytes
[    0.082662] Calibrating delay loop (skipped), value calculated using timer frequency.. 2.00 BogoMIPS (lpj=4000)
[    0.094225] pid_max: default: 32768 minimum: 301
[    0.100802] Mount-cache hash table entries: 4096 (order: 3, 32768 bytes, linear)
[    0.109469] Mountpoint-cache hash table entries: 4096 (order: 3, 32768 bytes, linear)
[    0.125003] Running RCU synchronous self tests
[    0.130214] Running RCU synchronous self tests
[    0.137709] CPU node for /cpus/cpu@0 exist but the possible cpu range is :0-3
[    0.155633] RCU Tasks: Setting shift to 2 and lim to 1 rcu_task_cb_adjust=1.
[    0.164714] RCU Tasks Trace: Setting shift to 2 and lim to 1 rcu_task_cb_adjust=1.
[    0.174340] Running RCU-tasks wait API self tests
[    0.286157] riscv: ELF compat mode unsupported
[    0.286208] ASID allocator disabled (0 bits)
[    0.296532] Callback from call_rcu_tasks_trace() invoked.
[    0.304124] rcu: Hierarchical SRCU implementation.
[    0.309791] rcu:     Max phase no-delay instances is 1000.
[    0.321671] EFI services will not be available.
[    0.329990] smp: Bringing up secondary CPUs ...
[    0.354947] smp: Brought up 1 node, 4 CPUs
[    0.365578] devtmpfs: initialized
[    0.410073] Running RCU synchronous self tests
[    0.416094] Running RCU synchronous self tests
[    0.421843] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645041785100000 ns
[    0.433116] futex hash table entries: 1024 (order: 5, 131072 bytes, linear)
[    0.442918] pinctrl core: initialized pinctrl subsystem
[    0.456687] NET: Registered PF_NETLINK/PF_ROUTE protocol family
[    0.468413] DMA: preallocated 256 KiB GFP_KERNEL pool for atomic allocations
[    0.476749] DMA: preallocated 256 KiB GFP_KERNEL|GFP_DMA32 pool for atomic allocations
[    0.492183] cpuidle: using governor menu
[    0.558914] Callback from call_rcu_tasks() invoked.
[    0.613189] HugeTLB: registered 2.00 MiB page size, pre-allocated 0 pages
[    0.621137] HugeTLB: 0 KiB vmemmap can be freed for a 2.00 MiB page
[    0.635532] ACPI: Interpreter disabled.
[    0.659634] SCSI subsystem initialized
[    0.667627] usbcore: registered new interface driver usbfs
[    0.674440] usbcore: registered new interface driver hub
[    0.680969] usbcore: registered new device driver usb
[    0.690490] FPGA manager framework
[    0.703508] vgaarb: loaded
[    0.708569] clocksource: Switched to clocksource riscv_clocksource
[    0.723150] pnp: PnP ACPI: disabled
[    0.809949] NET: Registered PF_INET protocol family
[    0.818618] IP idents hash table entries: 32768 (order: 6, 262144 bytes, linear)
[    0.839293] tcp_listen_portaddr_hash hash table entries: 1024 (order: 4, 73728 bytes, linear)
[    0.850714] Table-perturb hash table entries: 65536 (order: 6, 262144 bytes, linear)
[    0.859721] TCP established hash table entries: 16384 (order: 5, 131072 bytes, linear)
[    0.872260] TCP bind hash table entries: 16384 (order: 9, 2359296 bytes, linear)
[    0.902395] TCP: Hash tables configured (established 16384 bind 16384)
[    0.911840] UDP hash table entries: 1024 (order: 5, 163840 bytes, linear)
[    0.921420] UDP-Lite hash table entries: 1024 (order: 5, 163840 bytes, linear)
[    0.932233] NET: Registered PF_UNIX/PF_LOCAL protocol family
[    0.943727] RPC: Registered named UNIX socket transport module.
[    0.950879] RPC: Registered udp transport module.
[    0.956374] RPC: Registered tcp transport module.
[    0.961926] RPC: Registered tcp-with-tls transport module.
[    0.968707] RPC: Registered tcp NFSv4.1 backchannel transport module.
[    0.976198] PCI: CLS 0 bytes, default 64
[    0.985222] Unpacking initramfs...
[    1.000909] workingset: timestamp_bits=62 max_order=19 bucket_order=0
[    1.016047] NFS: Registering the id_resolver key type
[    1.022604] Key type id_resolver registered
[    1.027625] Key type id_legacy registered
[    1.032619] nfs4filelayout_init: NFSv4 File Layout Driver Registering...
[    1.040698] nfs4flexfilelayout_init: NFSv4 Flexfile Layout Driver Registering...
[    1.050868] 9p: Installing v9fs 9p2000 file system support
[    1.059351] NET: Registered PF_ALG protocol family
[    1.065759] Block layer SCSI generic (bsg) driver version 0.4 loaded (major 246)
[    1.074615] io scheduler mq-deadline registered
[    1.079954] io scheduler kyber registered
[    1.084855] io scheduler bfq registered
[    2.691196] String selftests succeeded
[    2.695696] test_string_helpers: Running tests...
[    2.735759] microchip-pcie 3000000000.pcie: host bridge /pcie@3000000000 ranges:
[    2.744646] microchip-pcie 3000000000.pcie:      MEM 0x3008000000..0x3087ffffff -> 0x0008000000
[    2.754854] microchip-pcie 3000000000.pcie:   IB MEM 0x0000000000..0x00ffffffff -> 0x0000000000
[    2.793603] microchip-pcie 3000000000.pcie: ECAM at [mem 0x3000000000-0x3007ffffff] for [bus 00-7f]
[    2.806599] microchip-pcie 3000000000.pcie: PCI host bridge to bus 0000:00
[    2.814622] pci_bus 0000:00: root bus resource [bus 00-7f]
[    2.821060] pci_bus 0000:00: root bus resource [mem 0x3008000000-0x3087ffffff] (bus address [0x08000000-0x87ffffff])
[    2.833923] pci 0000:00:00.0: [11aa:1556] type 01 class 0x000604
[    2.841032] pci 0000:00:00.0: reg 0x10: [mem 0x00000000-0x7fffffff 64bit pref]
[    2.850230] pci 0000:00:00.0: supports D1 D2
[    2.855282] pci 0000:00:00.0: PME# supported from D0 D1 D2 D3hot D3cold
[    2.918454] pci_bus 0000:01: busn_res: [bus 01-7f] end is updated to 01
[    2.926316] pci 0000:00:00.0: BAR 0: no space for [mem size 0x80000000 64bit pref]
[    2.935124] pci 0000:00:00.0: BAR 0: failed to assign [mem size 0x80000000 64bit pref]
[    2.944337] pci 0000:00:00.0: BAR 8: assigned [mem 0x3008000000-0x30081fffff]
[    2.952729] pci 0000:00:00.0: BAR 9: assigned [mem 0x3008200000-0x30083fffff 64bit pref]
[    2.962291] pci 0000:00:00.0: BAR 7: no space for [io  size 0x1000]
[    2.969658] pci 0000:00:00.0: BAR 7: failed to assign [io  size 0x1000]
[    2.977402] pci 0000:00:00.0: PCI bridge to [bus 01]
[    2.983258] pci 0000:00:00.0:   bridge window [mem 0x3008000000-0x30081fffff]
[    2.991555] pci 0000:00:00.0:   bridge window [mem 0x3008200000-0x30083fffff 64bit pref]
[    3.012105] CCACHE: 4 banks, 16 ways, sets/bank=512, bytes/block=64
[    3.019534] CCACHE: Index of the largest way enabled: 11
[    3.553216] Serial: 8250/16550 driver, 4 ports, IRQ sharing disabled
[    3.599227] 20100000.serial: ttyS1 at MMIO 0x20100000 (irq = 53, base_baud = 9375000) is a 16550A
[    3.610933] printk: console [ttyS1] enabled
[    3.610933] printk: console [ttyS1] enabled
[    3.635869] 20102000.serial: ttyS2 at MMIO 0x20102000 (irq = 54, base_baud = 9375000) is a 16550A
[    3.670527] 20104000.serial: ttyS3 at MMIO 0x20104000 (irq = 55, base_baud = 9375000) is a 16550A
[    3.706515] 20106000.serial: ttyS0 at MMIO 0x20106000 (irq = 56, base_baud = 9375000) is a 16550A
[    3.823007] loop: module loaded
[    3.839186] zram: Added device: zram0
[    3.867557] microchip-corespi 20108000.spi: Registered SPI controller 0
[    3.887509] microchip-corespi 20109000.spi: Registered SPI controller 1
[    5.062117] macb 20110000.ethernet eth0: Cadence GEM rev 0x0107010c at 0x20110000 irq 60 (00:04:a3:bf:d2:70)
[    6.106111] Freeing initrd memory: 25480K
[    6.124243]
[    6.127470] =====================================
[    6.137414] WARNING: bad unlock balance detected!
[    6.147364] 6.5.0-rc3-next-20230725 #1 Not tainted
[    6.157500] -------------------------------------
[    6.167447] modprobe/58 is trying to release lock (&vma->vm_lock->lock) at:
[    6.182194] [<ffffffff8000dcfa>] vma_end_read+0x60/0xb8
[    6.193279] but there are no more locks to release!
[    6.203590]
[    6.203590] other info that might help us debug this:
[    6.217392] 1 lock held by modprobe/58:
[    6.225516]  #0: ffffffff8169daa0 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire+0x0/0x2e
[    6.243441]
[    6.243441] stack backtrace:
[    6.252662] CPU: 3 PID: 58 Comm: modprobe Not tainted 6.5.0-rc3-next-20230725 #1
[    6.268319] Hardware name: Microchip PolarFire-SoC Icicle Kit (DT)
[    6.281383] Call Trace:
[    6.286563] [<ffffffff80006b48>] show_stack+0x2c/0x38
[    6.297274] [<ffffffff80b49bb2>] dump_stack_lvl+0x60/0x82
[    6.308723] [<ffffffff80b49be8>] dump_stack+0x14/0x1c
[    6.319417] [<ffffffff80089d5a>] print_unlock_imbalance_bug+0x1cc/0x1d6
[    6.333417] [<ffffffff80085e4a>] lock_release+0x236/0x3ae
[    6.344867] [<ffffffff8007e464>] up_read+0x16/0x26
[    6.355013] [<ffffffff8000dcfa>] vma_end_read+0x60/0xb8
[    6.366072] [<ffffffff8000d9ca>] handle_page_fault+0x19e/0x3a4
[    6.378415] [<ffffffff80b4a18c>] do_page_fault+0x20/0x56
[    6.389656] [<ffffffff80004434>] ret_from_exception+0x0/0x64
[    6.401771] ------------[ cut here ]------------
[    6.411655] DEBUG_RWSEMS_WARN_ON(tmp < 0): count = 0xffffffffffffff00, magic = 0xffffffe7c63c1558, owner = 0x1, curr 0xffffffe7c0add600, list empty
[    6.439789] WARNING: CPU: 3 PID: 58 at kernel/locking/rwsem.c:1348 __up_read+0x1c2/0x224
[    6.439840] Modules linked in:
[    6.439855] CPU: 3 PID: 58 Comm: modprobe Not tainted 6.5.0-rc3-next-20230725 #1
[    6.439884] Hardware name: Microchip PolarFire-SoC Icicle Kit (DT)
[    6.439896] epc : __up_read+0x1c2/0x224
[    6.439923]  ra : __up_read+0x1c2/0x224
[    6.439947] epc : ffffffff8007e636 ra : ffffffff8007e636 sp : ffffffc8002b3df0
[    6.439966]  gp : ffffffff818ad0f0 tp : ffffffe7c0add600 t0 : ffffffc8002b3ab8
[    6.439986]  t1 : 0000000000000044 t2 : 5357525f47554245 s0 : ffffffc8002b3e20
[    6.440004]  s1 : ffffffffffffff00 a0 : b32cfaf25517f300 a1 : b32cfaf25517f300
[    6.440023]  a2 : b32cfaf25517f300 a3 : c0000000ffffefff a4 : 00000fff00000000
[    6.440042]  a5 : 0000000000000004 a6 : ffffffff81643e90 a7 : 0000000000000038
[    6.440060]  s2 : ffffffe7c63c1560 s3 : ffffffff81ae3578 s4 : ffffffe7c63c1558
[    6.440079]  s5 : ffffffc8002b3ee0 s6 : 0000000000000254 s7 : ffffffe7c63c05a0
[    6.440098]  s8 : 0000003f97687780 s9 : ffffffffeffffef5 s10: 000000006474e553
[    6.440117]  s11: 0000000000000000 t3 : ffffffff8259430f t4 : ffffffff8259430f
[    6.440136]  t5 : ffffffff82594310 t6 : ffffffff8259430f
[    6.440151] status: 0000000200000120 badaddr: 0000000000000000 cause: 0000000000000003
[    6.440169] [<ffffffff8007e636>] __up_read+0x1c2/0x224
[    6.440201] [<ffffffff8007e46a>] up_read+0x1c/0x26
[    6.440229] [<ffffffff8000dcfa>] vma_end_read+0x60/0xb8
[    6.440255] [<ffffffff8000d9ca>] handle_page_fault+0x19e/0x3a4
[    6.440282] [<ffffffff80b4a18c>] do_page_fault+0x20/0x56
[    6.440309] [<ffffffff80004434>] ret_from_exception+0x0/0x64
[    6.440339] irq event stamp: 371
[    6.440347] hardirqs last  enabled at (371): [<ffffffff8000d862>] handle_page_fault+0x36/0x3a4
[    6.440377] hardirqs last disabled at (370): [<ffffffff80b4aef6>] irqentry_enter+0x16/0x4c
[    6.440407] softirqs last  enabled at (180): [<ffffffff80b56566>] __do_softirq+0x57e/0x66e
[    6.440453] softirqs last disabled at (173): [<ffffffff8001f288>] __irq_exit_rcu+0x8c/0x14c
[    6.440494] ---[ end trace 0000000000000000 ]---
Suren Baghdasaryan July 25, 2023, 2:15 p.m. UTC | #2
On Tue, Jul 25, 2023 at 5:58 AM Conor Dooley <conor.dooley@microchip.com> wrote:
>
> Hey,
>
> On Mon, Jul 24, 2023 at 07:54:02PM +0100, Matthew Wilcox (Oracle) wrote:
> > Remove the TCP layering violation by allowing per-VMA locks on all VMAs.
> > The fault path will immediately fail in handle_mm_fault().  There may be
> > a small performance reduction from this patch as a little unnecessary work
> > will be done on each page fault.  See later patches for the improvement.
> >
> > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > Reviewed-by: Suren Baghdasaryan <surenb@google.com>
> > Cc: Arjun Roy <arjunroy@google.com>
> > Cc: Eric Dumazet <edumazet@google.com>
>
> Unless my bisection has gone awry, this is causing boot failures for me
> in today's linux-next w/ a splat like so.

This patch requires [1] to work correctly. It relies on the rule
introduced in [1] that any code returning VM_FAULT_RETRY must also call
vma_end_read(). [1] has been merged into mm-unstable but, it seems, has
not reached linux-next yet.

[1] https://lore.kernel.org/all/20230630211957.1341547-4-surenb@google.com/

> Full log and bisection log below, it reproduces on this hardware using
> the standard riscv 64-bit defconfig, although my bisection was done with
> some more debugging stuff enabled.
>
>         =====================================
>         WARNING: bad unlock balance detected!
>         6.5.0-rc3-next-20230725 #1 Not tainted
>         -------------------------------------
>         modprobe/58 is trying to release lock (&vma->vm_lock->lock) at:
>         [<ffffffff8000dcfa>] vma_end_read+0x60/0xb8
>         but there are no more locks to release!
>
>         other info that might help us debug this:
>         1 lock held by modprobe/58:
>          #0: ffffffff8169daa0 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire+0x0/0x2e
>
>         stack backtrace:
>         CPU: 3 PID: 58 Comm: modprobe Not tainted 6.5.0-rc3-next-20230725 #1
>         Hardware name: Microchip PolarFire-SoC Icicle Kit (DT)
>         Call Trace:
>         [<ffffffff80006b48>] show_stack+0x2c/0x38
>         [<ffffffff80b49bb2>] dump_stack_lvl+0x60/0x82
>         [<ffffffff80b49be8>] dump_stack+0x14/0x1c
>         [<ffffffff80089d5a>] print_unlock_imbalance_bug+0x1cc/0x1d6
>         [<ffffffff80085e4a>] lock_release+0x236/0x3ae
>         [<ffffffff8007e464>] up_read+0x16/0x26
>         [<ffffffff8000dcfa>] vma_end_read+0x60/0xb8
>         [<ffffffff8000d9ca>] handle_page_fault+0x19e/0x3a4
>         [<ffffffff80b4a18c>] do_page_fault+0x20/0x56
>         [<ffffffff80004434>] ret_from_exception+0x0/0x64
>         ------------[ cut here ]------------
>         DEBUG_RWSEMS_WARN_ON(tmp < 0): count = 0xffffffffffffff00, magic = 0xffffffe7c63c1558, owner = 0x1, curr 0xffffffe7c0add600, list empty
>         WARNING: CPU: 3 PID: 58 at kernel/locking/rwsem.c:1348 __up_read+0x1c2/0x224
>         Modules linked in:
>         CPU: 3 PID: 58 Comm: modprobe Not tainted 6.5.0-rc3-next-20230725 #1
>         Hardware name: Microchip PolarFire-SoC Icicle Kit (DT)
>         epc : __up_read+0x1c2/0x224
>          ra : __up_read+0x1c2/0x224
>         epc : ffffffff8007e636 ra : ffffffff8007e636 sp : ffffffc8002b3df0
>          gp : ffffffff818ad0f0 tp : ffffffe7c0add600 t0 : ffffffc8002b3ab8
>          t1 : 0000000000000044 t2 : 5357525f47554245 s0 : ffffffc8002b3e20
>          s1 : ffffffffffffff00 a0 : b32cfaf25517f300 a1 : b32cfaf25517f300
>          a2 : b32cfaf25517f300 a3 : c0000000ffffefff a4 : 00000fff00000000
>          a5 : 0000000000000004 a6 : ffffffff81643e90 a7 : 0000000000000038
>          s2 : ffffffe7c63c1560 s3 : ffffffff81ae3578 s4 : ffffffe7c63c1558
>          s5 : ffffffc8002b3ee0 s6 : 0000000000000254 s7 : ffffffe7c63c05a0
>          s8 : 0000003f97687780 s9 : ffffffffeffffef5 s10: 000000006474e553
>          s11: 0000000000000000 t3 : ffffffff8259430f t4 : ffffffff8259430f
>          t5 : ffffffff82594310 t6 : ffffffff8259430f
>         status: 0000000200000120 badaddr: 0000000000000000 cause: 0000000000000003
>         [<ffffffff8007e636>] __up_read+0x1c2/0x224
>         [<ffffffff8007e46a>] up_read+0x1c/0x26
>         [<ffffffff8000dcfa>] vma_end_read+0x60/0xb8
>         [<ffffffff8000d9ca>] handle_page_fault+0x19e/0x3a4
>         [<ffffffff80b4a18c>] do_page_fault+0x20/0x56
>         [<ffffffff80004434>] ret_from_exception+0x0/0x64
>         irq event stamp: 371
>         hardirqs last  enabled at (371): [<ffffffff8000d862>] handle_page_fault+0x36/0x3a4
>         hardirqs last disabled at (370): [<ffffffff80b4aef6>] irqentry_enter+0x16/0x4c
>         softirqs last  enabled at (180): [<ffffffff80b56566>] __do_softirq+0x57e/0x66e
>         softirqs last disabled at (173): [<ffffffff8001f288>] __irq_exit_rcu+0x8c/0x14c
>         ---[ end trace 0000000000000000 ]---
>
> Thanks,
> Conor.
>
> git bisect start
> # status: waiting for both good and bad commits
> # good: [06c2afb862f9da8dc5efa4b6076a0e48c3fbaaa5] Linux 6.5-rc1
> git bisect good 06c2afb862f9da8dc5efa4b6076a0e48c3fbaaa5
> # status: waiting for bad commit, 1 good commit known
> # bad: [1e25dd7772483f477f79986d956028e9f47f990a] Add linux-next specific files for 20230725
> git bisect bad 1e25dd7772483f477f79986d956028e9f47f990a
> # bad: [73002c8a551db94daa4124dbe61a3340999c556e] Merge branch 'master' of git://linuxtv.org/mchehab/media-next.git
> git bisect bad 73002c8a551db94daa4124dbe61a3340999c556e
> # bad: [c37659958e0ff5aaffae86df9e696638f58cd3a3] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/joel/bmc.git
> git bisect bad c37659958e0ff5aaffae86df9e696638f58cd3a3
> # good: [bdd1d82e7d02bd2764a68a5cc54533dfc2ba452a] Merge tag 'io_uring-6.5-2023-07-21' of git://git.kernel.dk/linux
> git bisect good bdd1d82e7d02bd2764a68a5cc54533dfc2ba452a
> # good: [d7b4fea201483d40b7cb1f522915531c6e6b168b] mm/page_io: convert count_swpout_vm_event() to take in a folio
> git bisect good d7b4fea201483d40b7cb1f522915531c6e6b168b
> # good: [46dce626e248cc91b0612f7c6c31b15f48899465] Merge branch 'fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/cxl/cxl.git
> git bisect good 46dce626e248cc91b0612f7c6c31b15f48899465
> # bad: [e3fb201f858b63a7abf71b5bb563923b6424b98a] Merge branch 'mm-nonmm-unstable' into mm-everything
> git bisect bad e3fb201f858b63a7abf71b5bb563923b6424b98a
> # good: [1e0fb16464b68936b1b901a7b908c37d6f587b7c] arm64: smccc: replace custom COUNT_ARGS() & CONCATENATE() implementations
> git bisect good 1e0fb16464b68936b1b901a7b908c37d6f587b7c
> # good: [de32a89e11bab2e9c8c881fbbbad84a492d5ce9c] mm: set up vma iterator for vma_iter_prealloc() calls
> git bisect good de32a89e11bab2e9c8c881fbbbad84a492d5ce9c
> # bad: [add29438034569277ab967199af39fe42a4d858c] mm: handle PUD faults under the VMA lock
> git bisect bad add29438034569277ab967199af39fe42a4d858c
> # good: [ecc821b0404e19eebf393f7a5a73d80c3faa69e4] maple_tree: reduce resets during store setup
> git bisect good ecc821b0404e19eebf393f7a5a73d80c3faa69e4
> # good: [4e9c4f4a2949a2b47917647283799bb5952f2290] mm: remove CONFIG_PER_VMA_LOCK ifdefs
> git bisect good 4e9c4f4a2949a2b47917647283799bb5952f2290
> # bad: [9a709e2cca6097f66aaba411bd8758cf43a39eb9] mm: move FAULT_FLAG_VMA_LOCK check from handle_mm_fault()
> git bisect bad 9a709e2cca6097f66aaba411bd8758cf43a39eb9
> # bad: [78b696bb953cb0b553e1cf9084a6e09580aa4e2a] mm: allow per-VMA locks on file-backed VMAs
> git bisect bad 78b696bb953cb0b553e1cf9084a6e09580aa4e2a
> # first bad commit: [78b696bb953cb0b553e1cf9084a6e09580aa4e2a] mm: allow per-VMA locks on file-backed VMAs
>
> [    0.000000] Linux version 6.5.0-rc3-00283-g78b696bb953c (conor@wendy) (ClangBuiltLinux clang version 15.0.7 (/home/conor/stuff/dev/llvm/clang 8dfdcc7b7bf66834a761bd8de445840ef68e4d1a), ClangBuiltLinux LLD 15.0.7) #1 SMP PREEMPT @666
> [    0.000000] Machine model: Microchip PolarFire-SoC Icicle Kit
> [    0.000000] SBI specification v1.0 detected
> [    0.000000] SBI implementation ID=0x1 Version=0x10002
> [    0.000000] SBI TIME extension detected
> [    0.000000] SBI IPI extension detected
> [    0.000000] SBI RFENCE extension detected
> [    0.000000] SBI SRST extension detected
> [    0.000000] earlycon: ns16550a0 at MMIO32 0x0000000020100000 (options '115200n8')
> [    0.000000] printk: bootconsole [ns16550a0] enabled
> [    0.000000] printk: debug: skip boot console de-registration.
> [    0.000000] efi: UEFI not found.
> [    0.000000] OF: reserved mem: 0x00000000bfc00000..0x00000000bfffffff (4096 KiB) nomap non-reusable region@BFC00000
> [    0.000000] Zone ranges:
> [    0.000000]   DMA32    [mem 0x0000000080000000-0x00000000ffffffff]
> [    0.000000]   Normal   [mem 0x0000000100000000-0x000000107fffffff]
> [    0.000000] Movable zone start for each node
> [    0.000000] Early memory node ranges
> [    0.000000]   node   0: [mem 0x0000000080000000-0x00000000bfbfffff]
> [    0.000000]   node   0: [mem 0x00000000bfc00000-0x00000000bfffffff]
> [    0.000000]   node   0: [mem 0x0000001040000000-0x000000107fffffff]
> [    0.000000] Initmem setup node 0 [mem 0x0000000080000000-0x000000107fffffff]
> [    0.000000] SBI HSM extension detected
> [    0.000000] CPU with hartid=0 is not available
> [    0.000000] riscv: base ISA extensions acdfim
> [    0.000000] riscv: ELF capabilities acdfim
> [    0.000000] percpu: Embedded 30 pages/cpu s84320 r8192 d30368 u122880
> [    0.000000] Kernel command line: earlycon keep_bootcon root=/dev/mmcblk1p2 rootdelay=10 reboot=cold
> [    0.000000] Dentry cache hash table entries: 262144 (order: 9, 2097152 bytes, linear)
> [    0.000000] Inode-cache hash table entries: 131072 (order: 8, 1048576 bytes, linear)
> [    0.000000] Built 1 zonelists, mobility grouping on.  Total pages: 517120
> [    0.000000] mem auto-init: stack:all(zero), heap alloc:off, heap free:off
> [    0.000000] software IO TLB: area num 4.
> [    0.000000] software IO TLB: mapped [mem 0x00000000bbc00000-0x00000000bfc00000] (64MB)
> [    0.000000] Virtual kernel memory layout:
> [    0.000000]       fixmap : 0xffffffc6fea00000 - 0xffffffc6ff000000   (6144 kB)
> [    0.000000]       pci io : 0xffffffc6ff000000 - 0xffffffc700000000   (  16 MB)
> [    0.000000]      vmemmap : 0xffffffc700000000 - 0xffffffc800000000   (4096 MB)
> [    0.000000]      vmalloc : 0xffffffc800000000 - 0xffffffd800000000   (  64 GB)
> [    0.000000]      modules : 0xffffffff025d4000 - 0xffffffff80000000   (2010 MB)
> [    0.000000]       lowmem : 0xffffffd800000000 - 0xffffffe800000000   (  64 GB)
> [    0.000000]       kernel : 0xffffffff80000000 - 0xffffffffffffffff   (2047 MB)
> [    0.000000] Memory: 1913560K/2097152K available (11570K kernel code, 5054K rwdata, 4096K rodata, 6069K init, 11226K
>  bss, 183592K reserved, 0K cma-reserved)
> [    0.000000] SLUB: HWalign=64, Order=0-3, MinObjects=0, CPUs=4, Nodes=1
> [    0.000000] trace event string verifier disabled
> [    0.000000] Running RCU self tests
> [    0.000000] Running RCU synchronous self tests
> [    0.000000] rcu: Preemptible hierarchical RCU implementation.
> [    0.000000] rcu:     RCU lockdep checking is enabled.
> [    0.000000] rcu:     RCU restricting CPUs from NR_CPUS=64 to nr_cpu_ids=4.
> [    0.000000] rcu:     RCU debug extended QS entry/exit.
> [    0.000000]  Trampoline variant of Tasks RCU enabled.
> [    0.000000]  Tracing variant of Tasks RCU enabled.
> [    0.000000] rcu: RCU calculated value of scheduler-enlistment delay is 25 jiffies.
> [    0.000000] rcu: Adjusting geometry for rcu_fanout_leaf=16, nr_cpu_ids=4
> [    0.000000] Running RCU synchronous self tests
> [    0.000000] NR_IRQS: 64, nr_irqs: 64, preallocated irqs: 0
> [    0.000000] riscv-intc: unable to find hart id for /cpus/cpu@0/interrupt-controller
> [    0.000000] riscv-intc: 64 local interrupts mapped
> [    0.000000] plic: interrupt-controller@c000000: mapped 186 interrupts with 4 handlers for 9 contexts.
> [    0.000000] riscv: providing IPIs using SBI IPI extension
> [    0.000000] rcu: srcu_init: Setting srcu_struct sizes based on contention.
> [    0.000000] clocksource: riscv_clocksource: mask: 0xffffffffffffffff max_cycles: 0x1d854df40, max_idle_ns: 3526361616960 ns
> [    0.000003] sched_clock: 64 bits at 1000kHz, resolution 1000ns, wraps every 2199023255500ns
> [    0.010570] Console: colour dummy device 80x25
> [    0.015820] printk: console [tty0] enabled
> [    0.020660] Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar
> [    0.029527] ... MAX_LOCKDEP_SUBCLASSES:  8
> [    0.034262] ... MAX_LOCK_DEPTH:          48
> [    0.039096] ... MAX_LOCKDEP_KEYS:        8192
> [    0.044121] ... CLASSHASH_SIZE:          4096
> [    0.049148] ... MAX_LOCKDEP_ENTRIES:     32768
> [    0.054277] ... MAX_LOCKDEP_CHAINS:      65536
> [    0.059407] ... CHAINHASH_SIZE:          32768
> [    0.064539]  memory used by lock dependency info: 6493 kB
> [    0.070744]  memory used for stack traces: 4224 kB
> [    0.076263]  per task-struct memory footprint: 1920 bytes
> [    0.082662] Calibrating delay loop (skipped), value calculated using timer frequency.. 2.00 BogoMIPS (lpj=4000)
> [    0.094225] pid_max: default: 32768 minimum: 301
> [    0.100802] Mount-cache hash table entries: 4096 (order: 3, 32768 bytes, linear)
> [    0.109469] Mountpoint-cache hash table entries: 4096 (order: 3, 32768 bytes, linear)
> [    0.125003] Running RCU synchronous self tests
> [    0.130214] Running RCU synchronous self tests
> [    0.137709] CPU node for /cpus/cpu@0 exist but the possible cpu range is :0-3
> [    0.155633] RCU Tasks: Setting shift to 2 and lim to 1 rcu_task_cb_adjust=1.
> [    0.164714] RCU Tasks Trace: Setting shift to 2 and lim to 1 rcu_task_cb_adjust=1.
> [    0.174340] Running RCU-tasks wait API self tests
> [    0.286157] riscv: ELF compat mode unsupported
> [    0.286208] ASID allocator disabled (0 bits)
> [    0.296532] Callback from call_rcu_tasks_trace() invoked.
> [    0.304124] rcu: Hierarchical SRCU implementation.
> [    0.309791] rcu:     Max phase no-delay instances is 1000.
> [    0.321671] EFI services will not be available.
> [    0.329990] smp: Bringing up secondary CPUs ...
> [    0.354947] smp: Brought up 1 node, 4 CPUs
> [    0.365578] devtmpfs: initialized
> [    0.410073] Running RCU synchronous self tests
> [    0.416094] Running RCU synchronous self tests
> [    0.421843] clocksource: jiffies: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 7645041785100000 ns
> [    0.433116] futex hash table entries: 1024 (order: 5, 131072 bytes, linear)
> [    0.442918] pinctrl core: initialized pinctrl subsystem
> [    0.456687] NET: Registered PF_NETLINK/PF_ROUTE protocol family
> [    0.468413] DMA: preallocated 256 KiB GFP_KERNEL pool for atomic allocations
> [    0.476749] DMA: preallocated 256 KiB GFP_KERNEL|GFP_DMA32 pool for atomic allocations
> [    0.492183] cpuidle: using governor menu
> [    0.558914] Callback from call_rcu_tasks() invoked.
> [    0.613189] HugeTLB: registered 2.00 MiB page size, pre-allocated 0 pages
> [    0.621137] HugeTLB: 0 KiB vmemmap can be freed for a 2.00 MiB page
> [    0.635532] ACPI: Interpreter disabled.
> [    0.659634] SCSI subsystem initialized
> [    0.667627] usbcore: registered new interface driver usbfs
> [    0.674440] usbcore: registered new interface driver hub
> [    0.680969] usbcore: registered new device driver usb
> [    0.690490] FPGA manager framework
> [    0.703508] vgaarb: loaded
> [    0.708569] clocksource: Switched to clocksource riscv_clocksource
> [    0.723150] pnp: PnP ACPI: disabled
> [    0.809949] NET: Registered PF_INET protocol family
> [    0.818618] IP idents hash table entries: 32768 (order: 6, 262144 bytes, linear)
> [    0.839293] tcp_listen_portaddr_hash hash table entries: 1024 (order: 4, 73728 bytes, linear)
> [    0.850714] Table-perturb hash table entries: 65536 (order: 6, 262144 bytes, linear)
> [    0.859721] TCP established hash table entries: 16384 (order: 5, 131072 bytes, linear)
> [    0.872260] TCP bind hash table entries: 16384 (order: 9, 2359296 bytes, linear)
> [    0.902395] TCP: Hash tables configured (established 16384 bind 16384)
> [    0.911840] UDP hash table entries: 1024 (order: 5, 163840 bytes, linear)
> [    0.921420] UDP-Lite hash table entries: 1024 (order: 5, 163840 bytes, linear)
> [    0.932233] NET: Registered PF_UNIX/PF_LOCAL protocol family
> [    0.943727] RPC: Registered named UNIX socket transport module.
> [    0.950879] RPC: Registered udp transport module.
> [    0.956374] RPC: Registered tcp transport module.
> [    0.961926] RPC: Registered tcp-with-tls transport module.
> [    0.968707] RPC: Registered tcp NFSv4.1 backchannel transport module.
> [    0.976198] PCI: CLS 0 bytes, default 64
> [    0.985222] Unpacking initramfs...
> [    1.000909] workingset: timestamp_bits=62 max_order=19 bucket_order=0
> [    1.016047] NFS: Registering the id_resolver key type
> [    1.022604] Key type id_resolver registered
> [    1.027625] Key type id_legacy registered
> [    1.032619] nfs4filelayout_init: NFSv4 File Layout Driver Registering...
> [    1.040698] nfs4flexfilelayout_init: NFSv4 Flexfile Layout Driver Registering...
> [    1.050868] 9p: Installing v9fs 9p2000 file system support
> [    1.059351] NET: Registered PF_ALG protocol family
> [    1.065759] Block layer SCSI generic (bsg) driver version 0.4 loaded (major 246)
> [    1.074615] io scheduler mq-deadline registered
> [    1.079954] io scheduler kyber registered
> [    1.084855] io scheduler bfq registered
> [    2.691196] String selftests succeeded
> [    2.695696] test_string_helpers: Running tests...
> [    2.735759] microchip-pcie 3000000000.pcie: host bridge /pcie@3000000000 ranges:
> [    2.744646] microchip-pcie 3000000000.pcie:      MEM 0x3008000000..0x3087ffffff -> 0x0008000000
> [    2.754854] microchip-pcie 3000000000.pcie:   IB MEM 0x0000000000..0x00ffffffff -> 0x0000000000
> [    2.793603] microchip-pcie 3000000000.pcie: ECAM at [mem 0x3000000000-0x3007ffffff] for [bus 00-7f]
> [    2.806599] microchip-pcie 3000000000.pcie: PCI host bridge to bus 0000:00
> [    2.814622] pci_bus 0000:00: root bus resource [bus 00-7f]
> [    2.821060] pci_bus 0000:00: root bus resource [mem 0x3008000000-0x3087ffffff] (bus address [0x08000000-0x87ffffff]
> )
> [    2.833923] pci 0000:00:00.0: [11aa:1556] type 01 class 0x000604
> [    2.841032] pci 0000:00:00.0: reg 0x10: [mem 0x00000000-0x7fffffff 64bit pref]
> [    2.850230] pci 0000:00:00.0: supports D1 D2
> [    2.855282] pci 0000:00:00.0: PME# supported from D0 D1 D2 D3hot D3cold
> [    2.918454] pci_bus 0000:01: busn_res: [bus 01-7f] end is updated to 01
> [    2.926316] pci 0000:00:00.0: BAR 0: no space for [mem size 0x80000000 64bit pref]
> [    2.935124] pci 0000:00:00.0: BAR 0: failed to assign [mem size 0x80000000 64bit pref]
> [    2.944337] pci 0000:00:00.0: BAR 8: assigned [mem 0x3008000000-0x30081fffff]
> [    2.952729] pci 0000:00:00.0: BAR 9: assigned [mem 0x3008200000-0x30083fffff 64bit pref]
> [    2.962291] pci 0000:00:00.0: BAR 7: no space for [io  size 0x1000]
> [    2.969658] pci 0000:00:00.0: BAR 7: failed to assign [io  size 0x1000]
> [    2.977402] pci 0000:00:00.0: PCI bridge to [bus 01]
> [    2.983258] pci 0000:00:00.0:   bridge window [mem 0x3008000000-0x30081fffff]
> [    2.991555] pci 0000:00:00.0:   bridge window [mem 0x3008200000-0x30083fffff 64bit pref]
> [    3.012105] CCACHE: 4 banks, 16 ways, sets/bank=512, bytes/block=64
> [    3.019534] CCACHE: Index of the largest way enabled: 11
> [    3.553216] Serial: 8250/16550 driver, 4 ports, IRQ sharing disabled
> [    3.599227] 20100000.serial: ttyS1 at MMIO 0x20100000 (irq = 53, base_baud = 9375000) is a 16550A
> [    3.610933] printk: console [ttyS1] enabled
> [    3.610933] printk: console [ttyS1] enabled
> [    3.635869] 20102000.serial: ttyS2 at MMIO 0x20102000 (irq = 54, base_baud = 9375000) is a 16550A
> [    3.670527] 20104000.serial: ttyS3 at MMIO 0x20104000 (irq = 55, base_baud = 9375000) is a 16550A
> [    3.706515] 20106000.serial: ttyS0 at MMIO 0x20106000 (irq = 56, base_baud = 9375000) is a 16550A
> [    3.823007] loop: module loaded
> [    3.839186] zram: Added device: zram0
> [    3.867557] microchip-corespi 20108000.spi: Registered SPI controller 0
> [    3.887509] microchip-corespi 20109000.spi: Registered SPI controller 1
> [    5.062117] macb 20110000.ethernet eth0: Cadence GEM rev 0x0107010c at 0x20110000 irq 60 (00:04:a3:bf:d2:70)
> [    6.106111] Freeing initrd memory: 25480K
> [    6.124243]
> [    6.127470] =====================================
> [    6.137414] WARNING: bad unlock balance detected!
> [    6.147364] 6.5.0-rc3-next-20230725 #1 Not tainted
> [    6.157500] -------------------------------------
> [    6.167447] modprobe/58 is trying to release lock (&vma->vm_lock->lock) at:
> [    6.182194] [<ffffffff8000dcfa>] vma_end_read+0x60/0xb8
> [    6.193279] but there are no more locks to release!
> [    6.203590]
> [    6.203590] other info that might help us debug this:
> [    6.217392] 1 lock held by modprobe/58:
> [    6.225516]  #0: ffffffff8169daa0 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire+0x0/0x2e
> [    6.243441]
> [    6.243441] stack backtrace:
> [    6.252662] CPU: 3 PID: 58 Comm: modprobe Not tainted 6.5.0-rc3-next-20230725 #1
> [    6.268319] Hardware name: Microchip PolarFire-SoC Icicle Kit (DT)
> [    6.281383] Call Trace:
> [    6.286563] [<ffffffff80006b48>] show_stack+0x2c/0x38
> [    6.297274] [<ffffffff80b49bb2>] dump_stack_lvl+0x60/0x82
> [    6.308723] [<ffffffff80b49be8>] dump_stack+0x14/0x1c
> [    6.319417] [<ffffffff80089d5a>] print_unlock_imbalance_bug+0x1cc/0x1d6
> [    6.333417] [<ffffffff80085e4a>] lock_release+0x236/0x3ae
> [    6.344867] [<ffffffff8007e464>] up_read+0x16/0x26
> [    6.355013] [<ffffffff8000dcfa>] vma_end_read+0x60/0xb8
> [    6.366072] [<ffffffff8000d9ca>] handle_page_fault+0x19e/0x3a4
> [    6.378415] [<ffffffff80b4a18c>] do_page_fault+0x20/0x56
> [    6.389656] [<ffffffff80004434>] ret_from_exception+0x0/0x64
> [    6.401771] ------------[ cut here ]------------
> [    6.411655] DEBUG_RWSEMS_WARN_ON(tmp < 0): count = 0xffffffffffffff00, magic = 0xffffffe7c63c1558, owner = 0x1, curr 0xffffffe7c0add600, list empty
> [    6.439789] WARNING: CPU: 3 PID: 58 at kernel/locking/rwsem.c:1348 __up_read+0x1c2/0x224
> [    6.439840] Modules linked in:
> [    6.439855] CPU: 3 PID: 58 Comm: modprobe Not tainted 6.5.0-rc3-next-20230725 #1
> [    6.439884] Hardware name: Microchip PolarFire-SoC Icicle Kit (DT)
> [    6.439896] epc : __up_read+0x1c2/0x224
> [    6.439923]  ra : __up_read+0x1c2/0x224
> [    6.439947] epc : ffffffff8007e636 ra : ffffffff8007e636 sp : ffffffc8002b3df0
> [    6.439966]  gp : ffffffff818ad0f0 tp : ffffffe7c0add600 t0 : ffffffc8002b3ab8
> [    6.439986]  t1 : 0000000000000044 t2 : 5357525f47554245 s0 : ffffffc8002b3e20
> [    6.440004]  s1 : ffffffffffffff00 a0 : b32cfaf25517f300 a1 : b32cfaf25517f300
> [    6.440023]  a2 : b32cfaf25517f300 a3 : c0000000ffffefff a4 : 00000fff00000000
> [    6.440042]  a5 : 0000000000000004 a6 : ffffffff81643e90 a7 : 0000000000000038
> [    6.440060]  s2 : ffffffe7c63c1560 s3 : ffffffff81ae3578 s4 : ffffffe7c63c1558
> [    6.440079]  s5 : ffffffc8002b3ee0 s6 : 0000000000000254 s7 : ffffffe7c63c05a0
> [    6.440098]  s8 : 0000003f97687780 s9 : ffffffffeffffef5 s10: 000000006474e553
> [    6.440117]  s11: 0000000000000000 t3 : ffffffff8259430f t4 : ffffffff8259430f
> [    6.440136]  t5 : ffffffff82594310 t6 : ffffffff8259430f
> [    6.440151] status: 0000000200000120 badaddr: 0000000000000000 cause: 0000000000000003
> [    6.440169] [<ffffffff8007e636>] __up_read+0x1c2/0x224
> [    6.440201] [<ffffffff8007e46a>] up_read+0x1c/0x26
> [    6.440229] [<ffffffff8000dcfa>] vma_end_read+0x60/0xb8
> [    6.440255] [<ffffffff8000d9ca>] handle_page_fault+0x19e/0x3a4
> [    6.440282] [<ffffffff80b4a18c>] do_page_fault+0x20/0x56
> [    6.440309] [<ffffffff80004434>] ret_from_exception+0x0/0x64
> [    6.440339] irq event stamp: 371
> [    6.440347] hardirqs last  enabled at (371): [<ffffffff8000d862>] handle_page_fault+0x36/0x3a4
> [    6.440377] hardirqs last disabled at (370): [<ffffffff80b4aef6>] irqentry_enter+0x16/0x4c
> [    6.440407] softirqs last  enabled at (180): [<ffffffff80b56566>] __do_softirq+0x57e/0x66e
> [    6.440453] softirqs last disabled at (173): [<ffffffff8001f288>] __irq_exit_rcu+0x8c/0x14c
> [    6.440494] ---[ end trace 0000000000000000 ]---
Matthew Wilcox July 25, 2023, 2:31 p.m. UTC | #3
On Tue, Jul 25, 2023 at 07:15:08AM -0700, Suren Baghdasaryan wrote:
> On Tue, Jul 25, 2023 at 5:58 AM Conor Dooley <conor.dooley@microchip.com> wrote:
> >
> > Hey,
> >
> > On Mon, Jul 24, 2023 at 07:54:02PM +0100, Matthew Wilcox (Oracle) wrote:
> > > Remove the TCP layering violation by allowing per-VMA locks on all VMAs.
> > > The fault path will immediately fail in handle_mm_fault().  There may be
> > > a small performance reduction from this patch as a little unnecessary work
> > > will be done on each page fault.  See later patches for the improvement.
> > >
> > > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > > Reviewed-by: Suren Baghdasaryan <surenb@google.com>
> > > Cc: Arjun Roy <arjunroy@google.com>
> > > Cc: Eric Dumazet <edumazet@google.com>
> >
> > Unless my bisection has gone awry, this is causing boot failures for me
> > in today's linux-next w/ a splat like so.
> 
> This patch requires [1] to work correctly. It follows the rule
> introduced in [1] that anyone returning VM_FAULT_RETRY should also do
> vma_end_read(). [1] is merged into mm-unstable but has not reached
> linux-next yet, it seems.

No, it's in linux-next, but you didn't fix riscv ...

Andrew, can you add this fix to Suren's patch?
"mm: drop per-VMA lock when returning VM_FAULT_RETRY or VM_FAULT_COMPLETED"

diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index 046732fcb48c..6115d7514972 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -296,7 +296,8 @@ void handle_page_fault(struct pt_regs *regs)
 	}
 
 	fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs);
-	vma_end_read(vma);
+	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
+		vma_end_read(vma);
 
 	if (!(fault & VM_FAULT_RETRY)) {
 		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index 220566cd8da8..f5f5134219b4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14874,7 +14874,6 @@  NETWORKING [TCP]
 M:	Eric Dumazet <edumazet@google.com>
 L:	netdev@vger.kernel.org
 S:	Maintained
-F:	include/linux/net_mm.h
 F:	include/linux/tcp.h
 F:	include/net/tcp.h
 F:	include/trace/events/tcp.h
diff --git a/include/linux/net_mm.h b/include/linux/net_mm.h
deleted file mode 100644
index b298998bd5a0..000000000000
--- a/include/linux/net_mm.h
+++ /dev/null
@@ -1,17 +0,0 @@ 
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-#ifdef CONFIG_MMU
-
-#ifdef CONFIG_INET
-extern const struct vm_operations_struct tcp_vm_ops;
-static inline bool vma_is_tcp(const struct vm_area_struct *vma)
-{
-	return vma->vm_ops == &tcp_vm_ops;
-}
-#else
-static inline bool vma_is_tcp(const struct vm_area_struct *vma)
-{
-	return false;
-}
-#endif /* CONFIG_INET*/
-
-#endif /* CONFIG_MMU */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d17cb8ab4c48..9d1d312c90c2 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -45,7 +45,6 @@ 
 #include <linux/memcontrol.h>
 #include <linux/bpf-cgroup.h>
 #include <linux/siphash.h>
-#include <linux/net_mm.h>
 
 extern struct inet_hashinfo tcp_hashinfo;
 
diff --git a/mm/memory.c b/mm/memory.c
index 7ff73a197cce..c7ad754dd8ed 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -77,7 +77,6 @@ 
 #include <linux/ptrace.h>
 #include <linux/vmalloc.h>
 #include <linux/sched/sysctl.h>
-#include <linux/net_mm.h>
 
 #include <trace/events/kmem.h>
 
@@ -5362,6 +5361,11 @@  vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 		goto out;
 	}
 
+	if ((flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vma)) {
+		vma_end_read(vma);
+		return VM_FAULT_RETRY;
+	}
+
 	/*
 	 * Enable the memcg OOM handling for faults triggered in user
 	 * space.  Kernel faults are handled more gracefully.
@@ -5533,12 +5537,8 @@  struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
 	if (!vma)
 		goto inval;
 
-	/* Only anonymous and tcp vmas are supported for now */
-	if (!vma_is_anonymous(vma) && !vma_is_tcp(vma))
-		goto inval;
-
 	/* find_mergeable_anon_vma uses adjacent vmas which are not locked */
-	if (!vma->anon_vma && !vma_is_tcp(vma))
+	if (vma_is_anonymous(vma) && !vma->anon_vma)
 		goto inval;
 
 	if (!vma_start_read(vma))
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index aca5620cf3ba..f7391d017c4c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1740,7 +1740,7 @@  void tcp_update_recv_tstamps(struct sk_buff *skb,
 }
 
 #ifdef CONFIG_MMU
-const struct vm_operations_struct tcp_vm_ops = {
+static const struct vm_operations_struct tcp_vm_ops = {
 };
 
 int tcp_mmap(struct file *file, struct socket *sock,
@@ -2043,13 +2043,10 @@  static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm,
 					   unsigned long address,
 					   bool *mmap_locked)
 {
-	struct vm_area_struct *vma = NULL;
+	struct vm_area_struct *vma = lock_vma_under_rcu(mm, address);
 
-#ifdef CONFIG_PER_VMA_LOCK
-	vma = lock_vma_under_rcu(mm, address);
-#endif
 	if (vma) {
-		if (!vma_is_tcp(vma)) {
+		if (vma->vm_ops != &tcp_vm_ops) {
 			vma_end_read(vma);
 			return NULL;
 		}
@@ -2059,7 +2056,7 @@  static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm,
 
 	mmap_read_lock(mm);
 	vma = vma_lookup(mm, address);
-	if (!vma || !vma_is_tcp(vma)) {
+	if (!vma || vma->vm_ops != &tcp_vm_ops) {
 		mmap_read_unlock(mm);
 		return NULL;
 	}