diff mbox series

[v5] parisc: Fix spinlock barriers

Message ID b38abd19-0a61-8968-b98d-9b8b2efb6747@bell.net (mailing list archive)
State Accepted, archived
Headers show
Series [v5] parisc: Fix spinlock barriers | expand

Commit Message

John David Anglin July 21, 2020, 11:36 a.m. UTC
Stalls are quite frequent with recent kernels.  I enabled CONFIG_SOFTLOCKUP_DETECTOR and I caught the following stall:

watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [cc1:22803]
Modules linked in: dm_mod dax binfmt_misc ext4 crc16 jbd2 ext2 mbcache sg ipmi_watchdog ipmi_si ipmi_poweroff ipmi_devintf ipmi_msghandler nfsd
ip_tables x_tables ipv6 autofs4 xfs libcrc32c crc32c_generic raid10 raid1 raid0 multipath linear md_mod ses enclosure sd_mod scsi_transport_sas
t10_pi sr_mod cdrom ata_generic uas usb_storage pata_cmd64x libata ohci_pci ehci_pci ohci_hcd sym53c8xx ehci_hcd scsi_transport_spi tg3 usbcore
scsi_mod usb_common
CPU: 0 PID: 22803 Comm: cc1 Not tainted 5.6.17+ #3
Hardware name: 9000/800/rp3440

     YZrvWESTHLNXBCVMcbcbcbcbOGFRQPDI
PSW: 00001000000001001111111100001111 Not tainted
r00-03  000000ff0804ff0f 0000000040891dc0 000000004037d1c4 000000006d5e8890
r04-07  000000004086fdc0 0000000040ab31ac 000000000004e99a 0000000000001f20
r08-11  0000000040b24710 000000006d5e8488 0000000040a1d280 000000006d5e89b0
r12-15  000000006d5e88c4 00000001802c2cb8 000000003c812825 0000004122eb4d18
r16-19  0000000040b26630 000000006d5e8898 000000000001d330 000000006d5e88c0
r20-23  000000000800000f 0000000a0ad24270 b6683633143fce3c 0000004122eb4d54
r24-27  000000006d5e88c4 000000006d5e8488 00000001802c2cb8 000000004086fdc0
r28-31  0000004122d57b69 000000006d5e89b0 000000006d5e89e0 000000006d5e8000
sr00-03  000000000c749000 0000000000000000 0000000000000000 000000000c749000
sr04-07  0000000000000000 0000000000000000 0000000000000000 0000000000000000

IASQ: 0000000000000000 0000000000000000 IAOQ: 000000004037d414 000000004037d418
 IIR: 0e0010dc    ISR: 00000041042d63f0  IOR: 000000004086fdc0
 CPU:        0   CR30: 000000006d5e8000 CR31: ffffffffffffefff
 ORIG_R28: 000000004086fdc0
 IAOQ[0]: d_alloc_parallel+0x384/0x688
 IAOQ[1]: d_alloc_parallel+0x388/0x688
 RP(r2): d_alloc_parallel+0x134/0x688
Backtrace:
 [<000000004036974c>] __lookup_slow+0xa4/0x200
 [<0000000040369fc8>] walk_component+0x288/0x458
 [<000000004036a9a0>] path_lookupat+0x88/0x198
 [<000000004036e748>] filename_lookup+0xa0/0x168
 [<000000004036e95c>] user_path_at_empty+0x64/0x80
 [<000000004035d93c>] vfs_statx+0x104/0x158
 [<000000004035dfcc>] __do_sys_lstat64+0x44/0x80
 [<000000004035e5a0>] sys_lstat64+0x20/0x38
 [<0000000040180054>] syscall_exit+0x0/0x14

The code was stuck in this loop in d_alloc_parallel:

    4037d414:   0e 00 10 dc     ldd 0(r16),ret0
    4037d418:   c7 fc 5f ed     bb,< ret0,1f,4037d414 <d_alloc_parallel+0x384>
    4037d41c:   08 00 02 40     nop

This is the inner loop of bit_spin_lock which is called by hlist_bl_unlock in d_alloc_parallel:

/*
 * Acquire the lock bit 'bitnum' in the word at 'addr', spinning until
 * it can be taken.  Preemption is disabled while the lock is held.
 * (Quoted here from the kernel for reference; this is the loop the
 * stalled CPU was stuck in.)
 */
static inline void bit_spin_lock(int bitnum, unsigned long *addr)
{
        /*
         * Assuming the lock is uncontended, this never enters
         * the body of the outer loop. If it is contended, then
         * within the inner loop a non-atomic test is used to
         * busywait with less bus contention for a good time to
         * attempt to acquire the lock bit.
         */
        preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
        while (unlikely(test_and_set_bit_lock(bitnum, addr))) {
                /* Lost the race: re-enable preemption while spin-waiting. */
                preempt_enable();
                do {
                        cpu_relax();
                        /* Non-atomic read-only poll until the bit clears;
                         * relies on the unlocker's release store becoming
                         * visible — the store this thread alleges is lost. */
                } while (test_bit(bitnum, addr));
                preempt_disable();
        }
#endif
        /* Static-analysis (sparse) annotation: bitlock is now held. */
        __acquire(bitlock);
}

After consideration, I realized that we must be losing bit unlocks.  Then, I noticed that we missed
defining atomic64_set_release().  Adding this define fixes the stalls in bit operations.

Signed-off-by: Dave Anglin <dave.anglin@bell.net>
---

Comments

Rolf Eike Beer Aug. 31, 2020, 4:21 p.m. UTC | #1
These things are in 5.8.4 AFAICT, and the lockups are still there:

[320616.602705] watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [hppa2.0-unknown:29093]
[320616.602705] Modules linked in: 8021q ipmi_poweroff ipmi_si ipmi_devintf sata_via ipmi_msghandler cbc dm_zero dm_snapshot dm_mirror dm_region_hash dm_log dm_crypt dm_bufio pata_sil680 libata
[320616.602705] CPU: 0 PID: 29093 Comm: hppa2.0-unknown Not tainted 5.8.4-gentoo-parisc64 #1
[320616.602705] Hardware name: 9000/785/C8000
[320616.602705]
[320616.602705]      YZrvWESTHLNXBCVMcbcbcbcbOGFRQPDI
[320616.602705] PSW: 00001000000001101111111100001111 Not tainted
[320616.602705] r00-03  000000ff0806ff0f 0000000040558f08 00000000402706a8 00000040838787e0
[320616.602705] r04-07  0000000040a303e0 00000000409eafa0 0000000040c5b690 0000000041fce0d0
[320616.602705] r08-11  00000040838787e0 0000000000000001 00000000409fbf80 0000000040c5b740
[320616.602705] r12-15  0000000041fce0d8 00000000409eafa0 0000000040187140 0000000040a56be0
[320616.602705] r16-19  0000000040a56be0 0000000000000004 0000000001e92000 0000000000000002
[320616.602705] r20-23  0000000000000001 0000000000000000 0000000000000000 0000000000000002
[320616.602705] r24-27  0000000000000004 0000000000000000 0000000000000002 0000000040a303e0
[320616.602705] r28-31  0000000000000011 00000040838788e0 0000004083878810 0000000042001140
[320616.602705] sr00-03  00000000072a0800 0000000007296800 0000000000000000 00000000072a0800
[320616.602705] sr04-07  0000000000000000 0000000000000000 0000000000000000 0000000000000000
[320616.602705]
[320616.602705] IASQ: 0000000000000000 0000000000000000 IAOQ: 00000000402706d0 00000000402706d4
[320616.602705]  IIR: 0ff0109c    ISR: 000000005836f8a0  IOR: 0000000000000001
[320616.602705]  CPU:        0   CR30: 0000004083878000 CR31: ffffffffffffffff
[320616.602705]  ORIG_R28: 0000000000000801
[320616.602705]  IAOQ[0]: smp_call_function_many_cond+0x490/0x500
[320616.602705]  IAOQ[1]: smp_call_function_many_cond+0x494/0x500
[320616.602705]  RP(r2): smp_call_function_many_cond+0x468/0x500
[320616.602705] Backtrace:
[320616.602705]  [<0000000040270824>] on_each_cpu+0x5c/0x98
[320616.602705]  [<0000000040186a0c>] flush_tlb_all+0x204/0x228
[320616.602705]  [<00000000402ef1f8>] tlb_finish_mmu+0x1d8/0x210
[320616.602705]  [<00000000402eb820>] exit_mmap+0x1d8/0x370
[320616.602705]  [<00000000401b5ec0>] mmput+0xe8/0x260
[320616.602705]  [<00000000401c1690>] do_exit+0x558/0x12e8
[320616.602705]  [<00000000401c3f18>] do_group_exit+0x50/0x118
[320616.602705]  [<00000000401c4000>] sys_exit_group+0x20/0x28
[320616.602705]  [<0000000040192018>] syscall_exit+0x0/0x14
[320616.602705]
[320619.854705] watchdog: BUG: soft lockup - CPU#1 stuck for 22s! [sh:29109]
[320619.854705] Modules linked in: 8021q ipmi_poweroff ipmi_si ipmi_devintf sata_via ipmi_msghandler cbc dm_zero dm_snapshot dm_mirror dm_region_hash dm_log dm_crypt dm_bufio pata_sil680 libata
[320619.854705] CPU: 1 PID: 29109 Comm: sh Tainted: G             L    5.8.4-gentoo-parisc64 #1
[320619.854705] Hardware name: 9000/785/C8000
[320619.854705]
[320619.854705]      YZrvWESTHLNXBCVMcbcbcbcbOGFRQPDI
[320619.854705] PSW: 00001000000001001111111100001111 Tainted: G             L
[320619.854705] r00-03  000000ff0804ff0f 0000000040558f08 00000000402706a8 0000004083b089e0
[320619.854705] r04-07  0000000040a303e0 00000000409eafa0 0000000040c5b690 0000000041fe60d0
[320619.854705] r08-11  0000004083b089e0 0000000000000001 00000000409fbf80 0000000040c5b740
[320619.854705] r12-15  0000000041fe60d8 00000000409eafa0 0000000040187ce0 0000000040a56be0
[320619.854705] r16-19  0000000040a56be0 0000000000000004 0000000001e92000 0000000000000002
[320619.854705] r20-23  0000000000000001 0000000000000000 0000000000000000 0000000000000002
[320619.854705] r24-27  0000000000000004 0000000000000000 0000000000000002 0000000040a303e0
[320619.854705] r28-31  0000000000000011 0000004083b08ae0 0000004083b08a10 0000000042001ce0
[320619.854705] sr00-03  00000000072a4800 00000000072a4800 0000000000000000 00000000072a5800
[320619.854705] sr04-07  0000000000000000 0000000000000000 0000000000000000 0000000000000000
[320619.854705]
[320619.854705] IASQ: 0000000000000000 0000000000000000 IAOQ: 00000000402706d0 00000000402706d4
[320619.854705]  IIR: 0ff0109c    ISR: 0000000000000000  IOR: 0000000000000001
[320619.854705]  CPU:        1   CR30: 0000004083b08000 CR31: ffffffffffffffff
[320619.854705]  ORIG_R28: 0000000000000801
[320619.854705]  IAOQ[0]: smp_call_function_many_cond+0x490/0x500
[320619.854705]  IAOQ[1]: smp_call_function_many_cond+0x494/0x500
[320619.854705]  RP(r2): smp_call_function_many_cond+0x468/0x500
[320619.854705] Backtrace:
[320619.854705]  [<0000000040270824>] on_each_cpu+0x5c/0x98
[320619.854705]  [<0000000040186a0c>] flush_tlb_all+0x204/0x228
[320619.854705]  [<00000000402ef1f8>] tlb_finish_mmu+0x1d8/0x210
[320619.854705]  [<00000000402eb820>] exit_mmap+0x1d8/0x370
[320619.854705]  [<00000000401b5ec0>] mmput+0xe8/0x260
[320619.854705]  [<0000000040343254>] begin_new_exec+0x684/0x1140
[320619.854705]  [<00000000403e5790>] load_elf_binary+0xc00/0x18e0
[320619.854705]  [<000000004034237c>] __do_execve_file.isra.0+0xaa4/0x10d0
[320619.854705]  [<0000000040343eb8>] compat_sys_execve+0x70/0x88
[320619.854705]  [<0000000040192018>] syscall_exit+0x0/0x14
[320619.854705]
[320656.138705] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
[320656.138705]         (detected by 0, t=15002 jiffies, g=29568801, q=103)
[320656.138705] rcu: All QSes seen, last rcu_sched kthread activity 15000 (4375056330-4375041330), jiffies_till_next_fqs=1, root ->qsmask 0x0
[320656.138705] rcu: rcu_sched kthread starved for 15000 jiffies! g29568801 f0x2 RCU_GP_WAIT_FQS(5) ->state=0x200 ->cpu=3
[320656.138705] rcu:    Unless rcu_sched kthread gets sufficient CPU time, OOM is now expected behavior.
[320656.138705] rcu: RCU grace-period kthread stack dump:
[320656.138705] rcu_sched       R    0    10      2 0x00000000
[320656.138705] Backtrace:
[320656.138705]
[320656.922716] BUG: workqueue lockup - pool cpus=0 node=0 flags=0x0 nice=0 stuck for 60s!
[320657.026717] BUG: workqueue lockup - pool cpus=1 node=0 flags=0x0 nice=0 stuck for 60s!
[320657.130716] Showing busy workqueues and worker pools:
[320657.198716] workqueue events: flags=0x0
[320657.250717]   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256 refcnt=2
[320657.254705]     pending: vmstat_shepherd
[320657.394718] workqueue events_power_efficient: flags=0x80
[320657.466715]   pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/256 refcnt=2
[320657.470705]     pending: fb_flashcursor
[320657.610716]   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=2/256 refcnt=3
[320657.614705]     pending: neigh_periodic_work, neigh_periodic_work
[320657.782719] workqueue mm_percpu_wq: flags=0x8
[320657.842715]   pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/256 refcnt=2
[320657.846705]     pending: vmstat_update
[320657.982713]   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256 refcnt=2
[320657.986705]     pending: vmstat_update
[320658.126726] workqueue mpt_poll_1: flags=0x8
[320658.182716]   pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=1/256 refcnt=2
[320658.186705]     pending: mpt_fault_reset_work
[320684.602705] watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [hppa2.0-unknown:29093]
[320684.602705] Modules linked in: 8021q ipmi_poweroff ipmi_si ipmi_devintf sata_via ipmi_msghandler cbc dm_zero dm_snapshot dm_mirror dm_region_hash dm_log dm_crypt dm_bufio pata_sil680 libata
[320684.602705] CPU: 0 PID: 29093 Comm: hppa2.0-unknown Tainted: G             L    5.8.4-gentoo-parisc64 #1
[320684.602705] Hardware name: 9000/785/C8000
[320684.602705]
[320684.602705]      YZrvWESTHLNXBCVMcbcbcbcbOGFRQPDI
[320684.602705] PSW: 00001000000001101111111100001111 Tainted: G             L
[320684.602705] r00-03  000000ff0806ff0f 0000000040558f08 00000000402706a8 00000040838787e0
[320684.602705] r04-07  0000000040a303e0 00000000409eafa0 0000000040c5b690 0000000041fce0d0
[320684.602705] r08-11  00000040838787e0 0000000000000001 00000000409fbf80 0000000040c5b740
[320684.602705] r12-15  0000000041fce0d8 00000000409eafa0 0000000040187140 0000000040a56be0
[320684.602705] r16-19  0000000040a56be0 0000000000000004 0000000001e92000 0000000000000002
[320684.602705] r20-23  0000000000000001 0000000000000000 0000000000000000 0000000000000002
[320684.602705] r24-27  0000000000000004 0000000000000000 0000000000000002 0000000040a303e0
[320684.602705] r28-31  0000000000000011 00000040838788e0 0000004083878810 0000000042001140
[320684.602705] sr00-03  00000000072a0800 0000000007296800 0000000000000000 00000000072a0800
[320684.602705] sr04-07  0000000000000000 0000000000000000 0000000000000000 0000000000000000
[320684.602705]
[320684.602705] IASQ: 0000000000000000 0000000000000000 IAOQ: 00000000402706d0 00000000402706d4
[320684.602705]  IIR: 0ff0109c    ISR: 000000005836f8a0  IOR: 0000000000000001
[320684.602705]  CPU:        0   CR30: 0000004083878000 CR31: ffffffffffffffff
[320684.602705]  ORIG_R28: 0000000000000801
[320684.602705]  IAOQ[0]: smp_call_function_many_cond+0x490/0x500
[320684.602705]  IAOQ[1]: smp_call_function_many_cond+0x494/0x500
[320684.602705]  RP(r2): smp_call_function_many_cond+0x468/0x500
[320684.602705] Backtrace:
[320684.602705]  [<0000000040270824>] on_each_cpu+0x5c/0x98
[320684.602705]  [<0000000040186a0c>] flush_tlb_all+0x204/0x228
[320684.602705]  [<00000000402ef1f8>] tlb_finish_mmu+0x1d8/0x210
[320684.602705]  [<00000000402eb820>] exit_mmap+0x1d8/0x370
[320684.602705]  [<00000000401b5ec0>] mmput+0xe8/0x260
[320684.602705]  [<00000000401c1690>] do_exit+0x558/0x12e8
[320684.602705]  [<00000000401c3f18>] do_group_exit+0x50/0x118
[320684.602705]  [<00000000401c4000>] sys_exit_group+0x20/0x28
[320684.602705]  [<0000000040192018>] syscall_exit+0x0/0x14
[320684.602705]
[320836.158705] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
[320836.158705]         (detected by 0, t=60007 jiffies, g=29568801, q=103)
[320836.158705] rcu: All QSes seen, last rcu_sched kthread activity 60005 (4375101335-4375041330), jiffies_till_next_fqs=1, root ->qsmask 0x0
[320836.158705] rcu: rcu_sched kthread starved for 60005 jiffies! g29568801 f0x2 RCU_GP_WAIT_FQS(5) ->state=0x200 ->cpu=3
[320836.158705] rcu:    Unless rcu_sched kthread gets sufficient CPU time, OOM is now expected behavior.
[320836.158705] rcu: RCU grace-period kthread stack dump:
[320836.158705] rcu_sched       R    0    10      2 0x00000000
[320836.158705] Backtrace:
[320836.158705]

And then the machine rebooted.
Helge Deller Aug. 31, 2020, 8:06 p.m. UTC | #2
Hi Rolf,

On 31.08.20 18:21, Rolf Eike Beer wrote:
> These things are in 5.8.4 AFAICT, and the lockups are still there:

Thanks for testing!
 
> [320616.602705] watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [hppa2.0-unknown:29093]
> [320616.602705] Modules linked in: 8021q ipmi_poweroff ipmi_si ipmi_devintf sata_via ipmi_msghandler cbc dm_zero dm_snapshot dm_mirror dm_region_hash dm_log dm_crypt dm_bufio pata_sil680 libata
> [320616.602705] CPU: 0 PID: 29093 Comm: hppa2.0-unknown Not tainted 5.8.4-gentoo-parisc64 #1
> [320616.602705] Hardware name: 9000/785/C8000
>...
> [320616.602705] IASQ: 0000000000000000 0000000000000000 IAOQ: 00000000402706d0 00000000402706d4
> [320616.602705]  IIR: 0ff0109c    ISR: 000000005836f8a0  IOR: 0000000000000001
> [320616.602705]  CPU:        0   CR30: 0000004083878000 CR31: ffffffffffffffff
> [320616.602705]  ORIG_R28: 0000000000000801
> [320616.602705]  IAOQ[0]: smp_call_function_many_cond+0x490/0x500
> [320616.602705]  IAOQ[1]: smp_call_function_many_cond+0x494/0x500
> [320616.602705]  RP(r2): smp_call_function_many_cond+0x468/0x500
> [320616.602705] Backtrace:
> [320616.602705]  [<0000000040270824>] on_each_cpu+0x5c/0x98
> [320616.602705]  [<0000000040186a0c>] flush_tlb_all+0x204/0x228
> [320616.602705]  [<00000000402ef1f8>] tlb_finish_mmu+0x1d8/0x210
> [320616.602705]  [<00000000402eb820>] exit_mmap+0x1d8/0x370
> [320616.602705]  [<00000000401b5ec0>] mmput+0xe8/0x260
> [320616.602705]  [<00000000401c1690>] do_exit+0x558/0x12e8
> [320616.602705]  [<00000000401c3f18>] do_group_exit+0x50/0x118
> [320616.602705]  [<00000000401c4000>] sys_exit_group+0x20/0x28
> [320616.602705]  [<0000000040192018>] syscall_exit+0x0/0x14

I agree. I have seen the same stall too.
I think we should try to analyze how the stall in smp_call_function_many_cond()
can happen. The trace seems always to point to do_exit().

I think those patches from Linus helped for the "old kind of stalls" which we have had in the last months/years:
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=c6fe44d96fc1536af5b11cd859686453d1b7bfd1 and 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=2a9127fcf2296674d58024f83981f40b128fffea
Those old stalls were something like this and didn't point to do_exit():

[111395.307021] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
[111395.311001] rcu:    3-...0: (1 GPs behind) idle=04e/1/0x4000000000000000 softirq=13650053/13650054 fqs=2625
[111395.311001]         (detected by 0, t=5252 jiffies, g=25258025, q=1240)
[111395.311001] Task dump for CPU 3:
[111395.311001] init            R  running task        0     1      0 0x00000016
[111395.311001] Backtrace:
[111395.311001]  [<0000000040416110>] hrtimer_try_to_cancel+0x13c/0x1f8
[111395.311001]  [<0000000040e65a18>] schedule_hrtimeout_range_clock+0x10c/0x1b8
[111395.311001]  [<0000000040e65af4>] schedule_hrtimeout_range+0x30/0x60
[111395.311001]  [<0000000040e5ebbc>] _cond_resched+0x40/0xb8
[111395.311001]  [<00000000403617a4>] get_signal+0x348/0xf00
[111395.311001]  [<000000004031cec0>] do_signal+0x54/0x230
[111395.311001]  [<000000004031d0f8>] do_notify_resume+0x5c/0x164
[111395.311001]  [<0000000040310110>] syscall_do_signal+0x54/0xa0
[111395.311001]  [<000000004030f074>] intr_return+0x0/0xc
[111395.311001]
[111458.330562] rcu: INFO: rcu_sched detected stalls on CPUs/tasks:
[111458.330562] rcu:    3-...0: (1 GPs behind) idle=04e/1/0x4000000000000000 softirq=13650053/13650054 fqs=4653
[111458.330562]         (detected by 0, t=21007 jiffies, g=25258025, q=1361)
[111458.330562] Task dump for CPU 3:
[111458.330562] init            R  running task        0     1      0 0x00000016
...

Helge
diff mbox series

Patch

diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index 118953d41763..6dd4171c9530 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -212,6 +212,8 @@  atomic64_set(atomic64_t *v, s64 i)
 	_atomic_spin_unlock_irqrestore(v, flags);
 }

+#define atomic64_set_release(v, i)	atomic64_set((v), (i))
+
 static __inline__ s64
 atomic64_read(const atomic64_t *v)
 {