[v4,35/35] mm, slub: convert kmem_cpu_slab protection to local_lock

Message ID 20210805152000.12817-36-vbabka@suse.cz (mailing list archive)
State New
Series: SLUB: reduce irq disabled scope and make it RT compatible

Commit Message

Vlastimil Babka Aug. 5, 2021, 3:20 p.m. UTC
Embed local_lock into struct kmem_cache_cpu and use the irq-safe versions of
local_lock instead of plain local_irq_save/restore. On !PREEMPT_RT that's
equivalent, with better lockdep visibility. On PREEMPT_RT that means better
preemption.

However, the cost on PREEMPT_RT is the loss of lockless fast paths, which only
work with the cpu freelist. Those are designed to detect and recover from being
preempted by other conflicting operations (both fast and slow path), but the
slow path operations assume they cannot be preempted by a fast path operation,
which is naturally guaranteed with disabled irqs. With local locks on
PREEMPT_RT, the fast paths now also need to take the local lock to avoid races.

In the allocation fastpath slab_alloc_node() we can just defer to the slowpath
__slab_alloc(), which also works with the cpu freelist, but under the local
lock. In the free fastpath do_slab_free() we have to add a new local-lock
protected version of freeing to the cpu freelist, as the existing slowpath only
works with the page freelist.

Also update the comment about locking scheme in SLUB to reflect changes done
by this series.

[ Mike Galbraith <efault@gmx.de>: use local_lock() without irq in PREEMPT_RT
  scope; debugging of RT crashes resulting in put_cpu_partial() locking changes ]
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slub_def.h |   2 +
 mm/slub.c                | 146 ++++++++++++++++++++++++++++++---------
 2 files changed, 115 insertions(+), 33 deletions(-)
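
The conversion follows the standard local_lock pattern throughout: the per-cpu
structure gains an embedded local_lock_t, and plain irq toggling around its
manipulation becomes an irq-safe local_lock operation. A minimal schematic of
that pattern (an illustration only, not the patch itself; see the full diff
below for the real hunks):

    #include <linux/local_lock.h>

    struct kmem_cache_cpu {
    	local_lock_t lock;	/* Protects the fields below except stat */
    	void **freelist;	/* Pointer to next available object */
    	unsigned long tid;	/* Globally unique transaction id */
    	/* ... */
    };

    	/* each cpu's lock is set up once via local_lock_init() */

    	/* was: local_irq_save(flags); */
    	local_lock_irqsave(&s->cpu_slab->lock, flags);
    	/* ... manipulate this cpu's freelist/tid/page ... */
    	/* was: local_irq_restore(flags); */
    	local_unlock_irqrestore(&s->cpu_slab->lock, flags);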

Comments

Sven Eckelmann Aug. 15, 2021, 12:27 p.m. UTC | #1
> mm, slub: convert kmem_cpu_slab protection to local_lock
> 
> Embed local_lock into struct kmem_cache_cpu and use the irq-safe versions
> of local_lock instead of plain local_irq_save/restore.  On !PREEMPT_RT
> that's equivalent, with better lockdep visibility.  On PREEMPT_RT that
> means better preemption.
[...]

Looks like this change breaks booting when 64BIT+LOCK_STAT is enabled on 
x86_64:

    general protection fault, maybe for address 0xffff888007fcf1c8: 0000 [#1] NOPTI
    CPU: 0 PID: 0 Comm: swapper Not tainted 5.14.0-rc5+ #7
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
    RIP: 0010:kmem_cache_alloc+0x81/0x180
    Code: 79 48 00 4c 8b 41 38 0f 84 89 00 00 00 4d 85 c0 0f 84 80 00 00 00 41 8b 44 24 28 49 8b 3c 24 48 8d 4a 01 49 8b 1c 00 4c 89 c0 <48> 0f c7 4f 38 0f 943
    RSP: 0000:ffffffff81803c10 EFLAGS: 00000286
    RAX: ffff88800244e7c0 RBX: ffff88800244e800 RCX: 0000000000000024
    RDX: 0000000000000023 RSI: 0000000000000100 RDI: ffff888007fcf190
    RBP: ffffffff81803c38 R08: ffff88800244e7c0 R09: 0000000000000dc0
    R10: 0000000000004000 R11: 0000000000000000 R12: ffff8880024413c0
    R13: ffffffff810d18f4 R14: 0000000000000dc0 R15: 0000000000000100
    FS:  0000000000000000(0000) GS:ffffffff81840000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: ffff888002001000 CR3: 0000000001824000 CR4: 00000000000006b0
    Call Trace:
     __get_vm_area_node.constprop.0.isra.0+0x74/0x150
     __vmalloc_node_range+0x5a/0x2b0
     ? kernel_clone+0x88/0x390
     ? copy_process+0x1ac/0x17e0
     copy_process+0x768/0x17e0
     ? kernel_clone+0x88/0x390
     kernel_clone+0x88/0x390
     ? _vm_unmap_aliases.part.0+0xe9/0x110
     ? change_page_attr_set_clr+0x10d/0x180
     kernel_thread+0x43/0x50
     ? rest_init+0x100/0x100
     rest_init+0x1e/0x100
     arch_call_rest_init+0x9/0xc
     start_kernel+0x481/0x493
     x86_64_start_reservations+0x24/0x26
     x86_64_start_kernel+0x80/0x84
     secondary_startup_64_no_verify+0xc2/0xcb
    random: get_random_bytes called from oops_exit+0x34/0x60 with crng_init=0
    ---[ end trace 2cac18ac38f640c1 ]---
    RIP: 0010:kmem_cache_alloc+0x81/0x180
    Code: 79 48 00 4c 8b 41 38 0f 84 89 00 00 00 4d 85 c0 0f 84 80 00 00 00 41 8b 44 24 28 49 8b 3c 24 48 8d 4a 01 49 8b 1c 00 4c 89 c0 <48> 0f c7 4f 38 0f 943
    RSP: 0000:ffffffff81803c10 EFLAGS: 00000286
    RAX: ffff88800244e7c0 RBX: ffff88800244e800 RCX: 0000000000000024
    RDX: 0000000000000023 RSI: 0000000000000100 RDI: ffff888007fcf190
    RBP: ffffffff81803c38 R08: ffff88800244e7c0 R09: 0000000000000dc0
    R10: 0000000000004000 R11: 0000000000000000 R12: ffff8880024413c0
    R13: ffffffff810d18f4 R14: 0000000000000dc0 R15: 0000000000000100
    FS:  0000000000000000(0000) GS:ffffffff81840000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: ffff888002001000 CR3: 0000000001824000 CR4: 00000000000006b0
    Kernel panic - not syncing: Attempted to kill the idle task!
    ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]---


This was tested in qemu-system-x86_64 (from Debian Bullseye) with
    
    cat > .config << "EOF"
    # CONFIG_LOCALVERSION_AUTO is not set
    # CONFIG_SWAP is not set
    # CONFIG_CROSS_MEMORY_ATTACH is not set
    # CONFIG_UTS_NS is not set
    # CONFIG_TIME_NS is not set
    # CONFIG_PID_NS is not set
    # CONFIG_COMPAT_BRK is not set
    # CONFIG_SLAB_MERGE_DEFAULT is not set
    # CONFIG_RETPOLINE is not set
    # CONFIG_X86_EXTENDED_PLATFORM is not set
    # CONFIG_SCHED_OMIT_FRAME_POINTER is not set
    # CONFIG_X86_MCE is not set
    # CONFIG_X86_IOPL_IOPERM is not set
    # CONFIG_MICROCODE is not set
    # CONFIG_MTRR_SANITIZER is not set
    # CONFIG_RELOCATABLE is not set
    # CONFIG_SUSPEND is not set
    # CONFIG_ACPI is not set
    # CONFIG_DMIID is not set
    # CONFIG_VIRTUALIZATION is not set
    # CONFIG_SECCOMP is not set
    # CONFIG_STACKPROTECTOR is not set
    # CONFIG_BLK_DEV_BSG is not set
    # CONFIG_MQ_IOSCHED_DEADLINE is not set
    # CONFIG_MQ_IOSCHED_KYBER is not set
    # CONFIG_BINFMT_ELF is not set
    # CONFIG_BINFMT_SCRIPT is not set
    # CONFIG_COMPACTION is not set
    # CONFIG_STANDALONE is not set
    # CONFIG_PREVENT_FIRMWARE_BUILD is not set
    # CONFIG_BLK_DEV is not set
    # CONFIG_INPUT_KEYBOARD is not set
    # CONFIG_INPUT_MOUSE is not set
    # CONFIG_SERIO is not set
    # CONFIG_LEGACY_PTYS is not set
    # CONFIG_LDISC_AUTOLOAD is not set
    CONFIG_SERIAL_8250=y
    CONFIG_SERIAL_8250_CONSOLE=y
    # CONFIG_HW_RANDOM is not set
    # CONFIG_DEVMEM is not set
    # CONFIG_HWMON is not set
    # CONFIG_HID is not set
    # CONFIG_USB_SUPPORT is not set
    # CONFIG_VIRTIO_MENU is not set
    # CONFIG_VHOST_MENU is not set
    # CONFIG_X86_PLATFORM_DEVICES is not set
    # CONFIG_IOMMU_SUPPORT is not set
    # CONFIG_MANDATORY_FILE_LOCKING is not set
    # CONFIG_DNOTIFY is not set
    # CONFIG_INOTIFY_USER is not set
    # CONFIG_MISC_FILESYSTEMS is not set
    # CONFIG_SYMBOLIC_ERRNAME is not set
    CONFIG_FRAME_WARN=1024
    # CONFIG_SECTION_MISMATCH_WARN_ONLY is not set
    CONFIG_DEBUG_KERNEL=y
    CONFIG_LOCK_STAT=y
    # CONFIG_FTRACE is not set
    # CONFIG_X86_VERBOSE_BOOTUP is not set
    CONFIG_UNWINDER_FRAME_POINTER=y
    # CONFIG_RUNTIME_TESTING_MENU is not set
    CONFIG_64BIT=y
    EOF
    
    make ARCH=x86_64 olddefconfig
    make ARCH=x86_64 all
    
    qemu-system-x86_64 -nographic -kernel arch/x86/boot/bzImage -append console=ttyS0

Here is the bisect log:

    # bad: [4b358aabb93a2c654cd1dcab1a25a589f6e2b153] Add linux-next specific files for 20210813
    # good: [36a21d51725af2ce0700c6ebcb6b9594aac658a6] Linux 5.14-rc5
    git bisect start 'HEAD' 'v5.14-rc5'
    # good: [204808b2ca750e27cbad3455f7cb4368c4f5b260] Merge remote-tracking branch 'crypto/master'
    git bisect good 204808b2ca750e27cbad3455f7cb4368c4f5b260
    # good: [2201162fca73b487152bcff2ebb0f85c1dde8479] Merge remote-tracking branch 'tip/auto-latest'
    git bisect good 2201162fca73b487152bcff2ebb0f85c1dde8479
    # good: [41f97b6df1c8fd9fa828967a687693454c4ce888] Merge remote-tracking branch 'staging/staging-next'
    git bisect good 41f97b6df1c8fd9fa828967a687693454c4ce888
    # good: [797896d32d52af43fc9b0099a198ef29c2ca0138] Merge remote-tracking branch 'userns/for-next'
    git bisect good 797896d32d52af43fc9b0099a198ef29c2ca0138
    # bad: [5c7e12cc3d39b5cfc0d1a470139e4e89f3b147ed] arch: Kconfig: fix spelling mistake "seperate" -> "separate"
    git bisect bad 5c7e12cc3d39b5cfc0d1a470139e4e89f3b147ed
    # bad: [3535cf93a31ecd8595744881dbbda666cf61be48] add-mmap_assert_locked-annotations-to-find_vma-fix
    git bisect bad 3535cf93a31ecd8595744881dbbda666cf61be48
    # bad: [ac90b3dacf327cecda9f3dabc0051c7332770224] mm/debug_vm_pgtable: use struct pgtable_debug_args in PGD and P4D modifying tests
    git bisect bad ac90b3dacf327cecda9f3dabc0051c7332770224
    # good: [3a7ac8f97abde2d6ec973a00a449c45b9642a15a] mm, slub: do initial checks in ___slab_alloc() with irqs enabled
    git bisect good 3a7ac8f97abde2d6ec973a00a449c45b9642a15a
    # good: [1c84f3c916405dc3d62cfca55fb2a84de9d7f31e] mm, slub: fix memory and cpu hotplug related lock ordering issues
    git bisect good 1c84f3c916405dc3d62cfca55fb2a84de9d7f31e
    # bad: [6ac9c394652dbba1181268cb09513edbd733685c] mm/debug_vm_pgtable: introduce struct pgtable_debug_args
    git bisect bad 6ac9c394652dbba1181268cb09513edbd733685c
    # good: [35a6f4bcf4ad9c8c0208ea48044539a952859b3a] mm, slub: make slab_lock() disable irqs with PREEMPT_RT
    git bisect good 35a6f4bcf4ad9c8c0208ea48044539a952859b3a
    # good: [03e736e3ca2c0a48822609a89ffa0329f4eb5aae] mm, slub: use migrate_disable() on PREEMPT_RT
    git bisect good 03e736e3ca2c0a48822609a89ffa0329f4eb5aae
    # bad: [3f57fd12e8b7eb77412623c347566fb83ec5e764] mm, slub: convert kmem_cpu_slab protection to local_lock
    git bisect bad 3f57fd12e8b7eb77412623c347566fb83ec5e764
    # first bad commit: [3f57fd12e8b7eb77412623c347566fb83ec5e764] mm, slub: convert kmem_cpu_slab protection to local_lock

I haven't checked what part of the change is actually causing the problem.

Kind regards,
	Sven
Vlastimil Babka Aug. 17, 2021, 8:37 a.m. UTC | #2
On 8/15/21 2:27 PM, Sven Eckelmann wrote:
>> mm, slub: convert kmem_cpu_slab protection to local_lock
>>
>> Embed local_lock into struct kmem_cache_cpu and use the irq-safe versions
>> of local_lock instead of plain local_irq_save/restore.  On !PREEMPT_RT
>> that's equivalent, with better lockdep visibility.  On PREEMPT_RT that
>> means better preemption.
> [...]
> 
> Looks like this change breaks booting when 64BIT+LOCK_STAT is enabled on 
> x86_64:

OK reproduced. Thanks, will investigate.

> [...]
Sebastian Andrzej Siewior Aug. 17, 2021, 9:12 a.m. UTC | #3
On 2021-08-17 10:37:48 [+0200], Vlastimil Babka wrote:
> OK reproduced. Thanks, will investigate.

With the local_lock at the top, the alignment needed for the double
cmpxchg gets broken. On RT it was working ;)

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index b5bcac29b979c..cd14aa1f9bc3c 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -42,9 +42,9 @@ enum stat_item {
 	NR_SLUB_STAT_ITEMS };
 
 struct kmem_cache_cpu {
-	local_lock_t lock;	/* Protects the fields below except stat */
 	void **freelist;	/* Pointer to next available object */
 	unsigned long tid;	/* Globally unique transaction id */
+	local_lock_t lock;	/* Protects the fields below except stat */
 	struct page *page;	/* The slab from which we are allocating */
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 	struct page *partial;	/* Partially allocated frozen slabs */

Sebastian
Vlastimil Babka Aug. 17, 2021, 9:13 a.m. UTC | #4
On 8/15/21 2:27 PM, Sven Eckelmann wrote:
>> mm, slub: convert kmem_cpu_slab protection to local_lock
>>
>> Embed local_lock into struct kmem_cache_cpu and use the irq-safe versions
>> of local_lock instead of plain local_irq_save/restore.  On !PREEMPT_RT
>> that's equivalent, with better lockdep visibility.  On PREEMPT_RT that
>> means better preemption.
> [...]
> 
> Looks like this change breaks booting when 64BIT+LOCK_STAT is enabled on 
> x86_64:
> 
>     general protection fault, maybe for address 0xffff888007fcf1c8: 0000 [#1] NOPTI
>     CPU: 0 PID: 0 Comm: swapper Not tainted 5.14.0-rc5+ #7

faddr2line points to slab_alloc_node(), namely this:

                if (unlikely(!this_cpu_cmpxchg_double(
                                s->cpu_slab->freelist, s->cpu_slab->tid,
                                object, tid,
                                next_object, next_tid(tid)))) {

pahole looks like this:

struct kmem_cache_cpu {
        local_lock_t               lock;                 /*     0    56 */
        void * *                   freelist;             /*    56     8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        long unsigned int          tid;                  /*    64     8 */
        struct page *              page;                 /*    72     8 */
        struct page *              partial;              /*    80     8 */

        /* size: 88, cachelines: 2, members: 5 */
        /* last cacheline: 24 bytes */
};

I had a hunch and added padding between lock and freelist,
which indeed made the problem go away. Now I don't know if LOCK_STAT
has some bug that makes it write past the local_lock (guess not) or if
it's some result of false sharing, which I would have expected to be
only a perf issue, not a correctness issue.
Anyway, it's a good idea to have the data in the same cache line, so I guess
I'll enforce a cacheline boundary between lock and the data?
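
For illustration, a small userspace program (with mock types whose sizes are
taken from the pahole output above, not the real kernel structures) shows how
moving the lock below the data, as the fixup later in this thread does,
restores the 16-byte alignment that the freelist/tid double cmpxchg needs:

    #include <stdio.h>
    #include <stddef.h>

    /* 56 bytes, like local_lock_t with LOCK_STAT in this config */
    typedef struct { char pad[56]; } mock_local_lock_t;

    struct kcc_broken {			/* v4 layout: lock first */
    	mock_local_lock_t lock;
    	void **freelist;		/* offset 56: only 8-byte aligned */
    	unsigned long tid;		/* offset 64 */
    };

    struct kcc_fixed {			/* fixup layout: lock moved below */
    	void **freelist;		/* offset 0: 16-byte aligned */
    	unsigned long tid;		/* offset 8 */
    	mock_local_lock_t lock;
    };

    int main(void)
    {
    	/* this_cpu_cmpxchg_double() needs the pair 16-byte aligned */
    	printf("broken: freelist %% 16 = %zu\n",
    	       offsetof(struct kcc_broken, freelist) % 16);
    	printf("fixed:  freelist %% 16 = %zu\n",
    	       offsetof(struct kcc_fixed, freelist) % 16);
    	return 0;
    }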

> [...]
Vlastimil Babka Aug. 17, 2021, 9:17 a.m. UTC | #5
On 8/17/21 11:12 AM, Sebastian Andrzej Siewior wrote:
> On 2021-08-17 10:37:48 [+0200], Vlastimil Babka wrote:
>> OK reproduced. Thanks, will investigate.
> 
> With the local_lock at the top, the needed alignment gets broken for dbl
> cmpxchg. On RT it was working ;)

I'd rather have page and partial in the same cacheline as well. Is it OK
to just move the lock last and not care about whether it straddles
cachelines or not? (With CONFIG_SLUB_CPU_PARTIAL it will naturally start
with the next cacheline.)

> diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
> index b5bcac29b979c..cd14aa1f9bc3c 100644
> --- a/include/linux/slub_def.h
> +++ b/include/linux/slub_def.h
> @@ -42,9 +42,9 @@ enum stat_item {
>  	NR_SLUB_STAT_ITEMS };
>  
>  struct kmem_cache_cpu {
> -	local_lock_t lock;	/* Protects the fields below except stat */
>  	void **freelist;	/* Pointer to next available object */
>  	unsigned long tid;	/* Globally unique transaction id */
> +	local_lock_t lock;	/* Protects the fields below except stat */
>  	struct page *page;	/* The slab from which we are allocating */
>  #ifdef CONFIG_SLUB_CPU_PARTIAL
>  	struct page *partial;	/* Partially allocated frozen slabs */
> 
> Sebastian
>
Sebastian Andrzej Siewior Aug. 17, 2021, 9:31 a.m. UTC | #6
On 2021-08-17 11:17:18 [+0200], Vlastimil Babka wrote:
> On 8/17/21 11:12 AM, Sebastian Andrzej Siewior wrote:
> > On 2021-08-17 10:37:48 [+0200], Vlastimil Babka wrote:
> >> OK reproduced. Thanks, will investigate.
> > 
> > With the local_lock at the top, the needed alignment gets broken for dbl
> > cmpxchg. On RT it was working ;)
> 
> I'd rather have page and partial in the same cacheline as well, is it ok
> to just move the lock as last and not care about whether it straddles
> cachelines or not? (with CONFIG_SLUB_CPU_PARTIAL it will naturally start
> with the next cacheline).

Moving it like you suggested appears to be more efficient and saves a bit
of memory:

RT+ debug:
struct kmem_cache_cpu {
        void * *                   freelist;             /*     0     8 */
        long unsigned int          tid;                  /*     8     8 */
        struct page *              page;                 /*    16     8 */
        struct page *              partial;              /*    24     8 */
        local_lock_t               lock;                 /*    32   144 */

        /* size: 176, cachelines: 3, members: 5 */
        /* last cacheline: 48 bytes */
};

RT no debug:
struct kmem_cache_cpu {
        void * *                   freelist;             /*     0     8 */
        long unsigned int          tid;                  /*     8     8 */
        struct page *              page;                 /*    16     8 */
        struct page *              partial;              /*    24     8 */
        local_lock_t               lock;                 /*    32    32 */

        /* size: 64, cachelines: 1, members: 5 */
};

no-RT, no-debug:
struct kmem_cache_cpu {
        void * *                   freelist;             /*     0     8 */
        long unsigned int          tid;                  /*     8     8 */
        struct page *              page;                 /*    16     8 */
        struct page *              partial;              /*    24     8 */
        local_lock_t               lock;                 /*    32     0 */

        /* size: 32, cachelines: 1, members: 5 */
        /* last cacheline: 32 bytes */
};

no-RT, debug:
struct kmem_cache_cpu {
        void * *                   freelist;             /*     0     8 */
        long unsigned int          tid;                  /*     8     8 */
        struct page *              page;                 /*    16     8 */
        struct page *              partial;              /*    24     8 */
        local_lock_t               lock;                 /*    32    56 */

        /* size: 88, cachelines: 2, members: 5 */
        /* last cacheline: 24 bytes */
};

Sebastian
Vlastimil Babka Aug. 17, 2021, 9:31 a.m. UTC | #7
On 8/17/21 11:12 AM, Sebastian Andrzej Siewior wrote:
> On 2021-08-17 10:37:48 [+0200], Vlastimil Babka wrote:
>> OK reproduced. Thanks, will investigate.
> 
> With the local_lock at the top, the needed alignment gets broken for dbl
> cmpxchg.

Right. I wondered why the checks in __pcpu_double_call_return_bool()
didn't trigger. They are VM_BUG_ON(), so they did trigger after enabling
DEBUG_VM.
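
For reference, those checks amount to the following assertions (paraphrased
from the __pcpu_double_call_return_bool() macro in
include/linux/percpu-defs.h); being VM_BUG_ON(), they compile away unless
CONFIG_DEBUG_VM is enabled:

    	/* the pair must be aligned to twice the word size... */
    	VM_BUG_ON((unsigned long)(&(pcp1)) % (2 * sizeof(pcp1)));
    	/* ...and contiguous, with pcp2 placed directly after pcp1 */
    	VM_BUG_ON((unsigned long)(&(pcp2)) !=
    		  (unsigned long)(&(pcp1)) + sizeof(pcp1));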

> 
> diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
> index b5bcac29b979c..cd14aa1f9bc3c 100644
> --- a/include/linux/slub_def.h
> +++ b/include/linux/slub_def.h
> @@ -42,9 +42,9 @@ enum stat_item {
>  	NR_SLUB_STAT_ITEMS };
>  
>  struct kmem_cache_cpu {
> -	local_lock_t lock;	/* Protects the fields below except stat */
>  	void **freelist;	/* Pointer to next available object */
>  	unsigned long tid;	/* Globally unique transaction id */
> +	local_lock_t lock;	/* Protects the fields below except stat */
>  	struct page *page;	/* The slab from which we are allocating */
>  #ifdef CONFIG_SLUB_CPU_PARTIAL
>  	struct page *partial;	/* Partially allocated frozen slabs */
> 
> Sebastian
>
Sebastian Andrzej Siewior Aug. 17, 2021, 9:34 a.m. UTC | #8
On 2021-08-17 11:31:56 [+0200], Vlastimil Babka wrote:
> On 8/17/21 11:12 AM, Sebastian Andrzej Siewior wrote:
> > On 2021-08-17 10:37:48 [+0200], Vlastimil Babka wrote:
> >> OK reproduced. Thanks, will investigate.
> > 
> > With the local_lock at the top, the needed alignment gets broken for dbl
> > cmpxchg.
> 
> Right. I wondered why the checks in __pcpu_double_call_return_bool
> didn't trigger. They are VM_BUG_ON() so they did trigger after enabling
> DEBUG_VM.

Without the right debugging enabled

| typedef struct {
| #ifdef CONFIG_DEBUG_LOCK_ALLOC
|         struct lockdep_map      dep_map;
|         struct task_struct      *owner;
| #endif
| } local_lock_t;

the struct is just empty.
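
A quick userspace approximation of why that hides the problem: without the
debug members the struct has size zero (a GNU C extension, matching the
zero-size lock member in the no-RT, no-debug pahole output above), so the
lock cannot misalign the fields that follow it:

    #include <stdio.h>

    typedef struct { } local_lock_t;	/* the !DEBUG_LOCK_ALLOC variant */

    struct kcc {
    	local_lock_t lock;	/* size 0: freelist stays at offset 0 */
    	void **freelist;
    	unsigned long tid;
    };

    int main(void)
    {
    	printf("sizeof(local_lock_t) = %zu\n", sizeof(local_lock_t));
    	printf("sizeof(struct kcc)   = %zu\n", sizeof(struct kcc));
    	return 0;
    }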

Sebastian
Vlastimil Babka Aug. 17, 2021, 10:14 a.m. UTC | #9
On 8/5/21 5:20 PM, Vlastimil Babka wrote:
> Embed local_lock into struct kmem_cache_cpu and use the irq-safe versions of
> local_lock instead of plain local_irq_save/restore. On !PREEMPT_RT that's
> equivalent, with better lockdep visibility. On PREEMPT_RT that means better
> preemption.
> 
> [...]

Another fixup. Is it too many and should we replace it all with a v5?
----8<----
From b13291ca13effc2b22a55619aada688ad5defa4b Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 17 Aug 2021 11:47:16 +0200
Subject: [PATCH] mm, slub: fix kmem_cache_cpu fields alignment for double
 cmpxchg

Sven Eckelmann reports [1] that the addition of local_lock to kmem_cache_cpu
breaks a config with 64BIT+LOCK_STAT:

    general protection fault, maybe for address 0xffff888007fcf1c8: 0000 [#1] NOPTI
    CPU: 0 PID: 0 Comm: swapper Not tainted 5.14.0-rc5+ #7
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
    RIP: 0010:kmem_cache_alloc+0x81/0x180
    Code: 79 48 00 4c 8b 41 38 0f 84 89 00 00 00 4d 85 c0 0f 84 80 00 00 00 41 8b 44 24 28 49 8b 3c 24 48 8d 4a 01 49 8b 1c 00 4c 89 c0 <48> 0f c7 4f 38 0f 943
    RSP: 0000:ffffffff81803c10 EFLAGS: 00000286
    RAX: ffff88800244e7c0 RBX: ffff88800244e800 RCX: 0000000000000024
    RDX: 0000000000000023 RSI: 0000000000000100 RDI: ffff888007fcf190
    RBP: ffffffff81803c38 R08: ffff88800244e7c0 R09: 0000000000000dc0
    R10: 0000000000004000 R11: 0000000000000000 R12: ffff8880024413c0
    R13: ffffffff810d18f4 R14: 0000000000000dc0 R15: 0000000000000100
    FS:  0000000000000000(0000) GS:ffffffff81840000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: ffff888002001000 CR3: 0000000001824000 CR4: 00000000000006b0
    Call Trace:
     __get_vm_area_node.constprop.0.isra.0+0x74/0x150
     __vmalloc_node_range+0x5a/0x2b0
     ? kernel_clone+0x88/0x390
     ? copy_process+0x1ac/0x17e0
     copy_process+0x768/0x17e0
     ? kernel_clone+0x88/0x390
     kernel_clone+0x88/0x390
     ? _vm_unmap_aliases.part.0+0xe9/0x110
     ? change_page_attr_set_clr+0x10d/0x180
     kernel_thread+0x43/0x50
     ? rest_init+0x100/0x100
     rest_init+0x1e/0x100
     arch_call_rest_init+0x9/0xc
     start_kernel+0x481/0x493
     x86_64_start_reservations+0x24/0x26
     x86_64_start_kernel+0x80/0x84
     secondary_startup_64_no_verify+0xc2/0xcb
    random: get_random_bytes called from oops_exit+0x34/0x60 with crng_init=0
    ---[ end trace 2cac18ac38f640c1 ]---
    RIP: 0010:kmem_cache_alloc+0x81/0x180
    Code: 79 48 00 4c 8b 41 38 0f 84 89 00 00 00 4d 85 c0 0f 84 80 00 00 00 41 8b 44 24 28 49 8b 3c 24 48 8d 4a 01 49 8b 1c 00 4c 89 c0 <48> 0f c7 4f 38 0f 943
    RSP: 0000:ffffffff81803c10 EFLAGS: 00000286
    RAX: ffff88800244e7c0 RBX: ffff88800244e800 RCX: 0000000000000024
    RDX: 0000000000000023 RSI: 0000000000000100 RDI: ffff888007fcf190
    RBP: ffffffff81803c38 R08: ffff88800244e7c0 R09: 0000000000000dc0
    R10: 0000000000004000 R11: 0000000000000000 R12: ffff8880024413c0
    R13: ffffffff810d18f4 R14: 0000000000000dc0 R15: 0000000000000100
    FS:  0000000000000000(0000) GS:ffffffff81840000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: ffff888002001000 CR3: 0000000001824000 CR4: 00000000000006b0
    Kernel panic - not syncing: Attempted to kill the idle task!
    ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]---

Decoding the RIP points to the this_cpu_cmpxchg_double() call in slab_alloc_node().

The problem is the particular size of local_lock_t with LOCK_STAT resulting
in the following layout:

struct kmem_cache_cpu {
        local_lock_t               lock;                 /*     0    56 */
        void * *                   freelist;             /*    56     8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        long unsigned int          tid;                  /*    64     8 */
        struct page *              page;                 /*    72     8 */
        struct page *              partial;              /*    80     8 */

        /* size: 88, cachelines: 2, members: 5 */
        /* last cacheline: 24 bytes */
};

As pointed out by Sebastian Andrzej Siewior, this_cpu_cmpxchg_double()
needs the freelist and tid fields to be aligned to the sum of their sizes
(16 bytes), but they are not in this configuration. This didn't happen
with the non-debug RT and !RT configs, nor with lockdep.

To fix this, move the lock field below the partial field, so that it doesn't
affect the layout.

[1] https://lore.kernel.org/linux-mm/2666777.vCjUEy5FO1@sven-desktop/

This is a fixup for mmotm patch
mm-slub-convert-kmem_cpu_slab-protection-to-local_lock.patch

Reported-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/slub_def.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index b5bcac29b979..85499f0586b0 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -41,14 +41,18 @@ enum stat_item {
 	CPU_PARTIAL_DRAIN,	/* Drain cpu partial to node partial */
 	NR_SLUB_STAT_ITEMS };
 
+/*
+ * When changing the layout, make sure freelist and tid are still compatible
+ * with this_cpu_cmpxchg_double() alignment requirements.
+ */
 struct kmem_cache_cpu {
-	local_lock_t lock;	/* Protects the fields below except stat */
 	void **freelist;	/* Pointer to next available object */
 	unsigned long tid;	/* Globally unique transaction id */
 	struct page *page;	/* The slab from which we are allocating */
 #ifdef CONFIG_SLUB_CPU_PARTIAL
 	struct page *partial;	/* Partially allocated frozen slabs */
 #endif
+	local_lock_t lock;	/* Protects the fields above */
 #ifdef CONFIG_SLUB_STATS
 	unsigned stat[NR_SLUB_STAT_ITEMS];
 #endif
Sebastian Andrzej Siewior Aug. 17, 2021, 3:39 p.m. UTC | #10
On 2021-08-05 17:20:00 [+0200], Vlastimil Babka wrote:
> @@ -2849,7 +2891,11 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>  
>  load_freelist:
>  
> -	lockdep_assert_irqs_disabled();
> +#ifdef CONFIG_PREEMPT_RT
> +	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock.lock));
> +#else
> +	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
> +#endif

Could you please make this hunk only

	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));

i.e. the non-RT version?

>  	/*
>  	 * freelist is pointing to the list of objects to be used.


Sebastian
Vlastimil Babka Aug. 17, 2021, 3:41 p.m. UTC | #11
On 8/17/21 5:39 PM, Sebastian Andrzej Siewior wrote:
> On 2021-08-05 17:20:00 [+0200], Vlastimil Babka wrote:
>> @@ -2849,7 +2891,11 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>>  
>>  load_freelist:
>>  
>> -	lockdep_assert_irqs_disabled();
>> +#ifdef CONFIG_PREEMPT_RT
>> +	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock.lock));
>> +#else
>> +	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
>> +#endif
> 
> Could you please make this hunk only
> 
> 	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
> 
> i.e. the non-RT version?

Does it mean that version works fine on RT now?

>>  	/*
>>  	 * freelist is pointing to the list of objects to be used.
> 
> 
> Sebastian
>
Sebastian Andrzej Siewior Aug. 17, 2021, 3:49 p.m. UTC | #12
On 2021-08-17 17:41:57 [+0200], Vlastimil Babka wrote:
> Does it mean that version works fine on RT now?

Yes. There is no difference now between RT and !RT regarding the
dep_map member.

Sebastian
Vlastimil Babka Aug. 17, 2021, 3:56 p.m. UTC | #13
On 8/5/21 5:20 PM, Vlastimil Babka wrote:
> Embed local_lock into struct kmem_cache_cpu and use the irq-safe versions of
> local_lock instead of plain local_irq_save/restore. On !PREEMPT_RT that's
> equivalent, with better lockdep visibility. On PREEMPT_RT that means better
> preemption.
> 
> [...]

And improvements in the RT land made the following fixup-cleanup
possible.
----8<----
From 8b87e5de5d79a9d3ab4627f5530f1888fa7824f8 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Tue, 17 Aug 2021 17:51:54 +0200
Subject: [PATCH] mm, slub: simplify lockdep_assert_held() in
 ___slab_alloc()

Sebastian reports [1] that the special version of lockdep_assert_held() for a
local lock with PREEMPT_RT is no longer necessary, and we can simplify.

[1] https://lore.kernel.org/linux-mm/20210817153937.hxnuh7mqp6vuiyws@linutronix.de/

This is a fixup for mmotm patch
mm-slub-convert-kmem_cpu_slab-protection-to-local_lock.patch

Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 mm/slub.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/mm/slub.c b/mm/slub.c
index be57687062aa..df1ac8aff86f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2913,11 +2913,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 
 load_freelist:
 
-#ifdef CONFIG_PREEMPT_RT
-	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock.lock));
-#else
 	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
-#endif
 
 	/*
 	 * freelist is pointing to the list of objects to be used.
Andrew Morton Aug. 17, 2021, 7:53 p.m. UTC | #14
On Tue, 17 Aug 2021 12:14:58 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:

> Another fixup. Is it too many and should we replace it all with a v5?

Maybe do a full resend when things have settled down and I can at least
check that we match.

What's your confidence level for a 5.15-rc1 merge?  It isn't terribly
well reviewed?
Vlastimil Babka Aug. 18, 2021, 11:52 a.m. UTC | #15
On 8/17/21 9:53 PM, Andrew Morton wrote:
> On Tue, 17 Aug 2021 12:14:58 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
> 
>> Another fixup. Is it too many and should we replace it all with a v5?
> 
> Maybe do a full resend when things have settled down and I can at least
> check that we match.

OK.

> What's your confidence level for a 5.15-rc1 merge?

I'd say pretty good. It's been part of the RT patchset for a while (since
early July, IIRC) and there has been a lot of testing there. Mike and Mel
also tested under !RT configs, and the bug report from Sven means the mmotm
in -next also gets testing. The fixups were all thanks to that testing, and
they recently shifted to smaller, unusual-config-specific issues that could
be dealt with even during rcX stabilization in case there are more.

> It isn't terribly
> well reviewed?

Yeah, that could be better; the pool of people deeply familiar with SLUB
is not large, unfortunately. I hope folks will still step up!
Thomas Gleixner Aug. 23, 2021, 8:36 p.m. UTC | #16
Andrew,

On Wed, Aug 18 2021 at 13:52, Vlastimil Babka wrote:
> On 8/17/21 9:53 PM, Andrew Morton wrote:
>> On Tue, 17 Aug 2021 12:14:58 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
>
>> What's your confidence level for a 5.15-rc1 merge?
>
> I'd say pretty good. It's part of RT patchset for a while (since early
> July IIRC?) and there has been lot of testing there. Mike and Mel also
> tested under !RT configs, and the bug report from Sven means the mmotm
> in -next also gets testing. The fixups were all thanks to the testing
> and recently shifted to smaller unusual-config-specific issues that
> could be dealt with even during rcX stabilization in case there's
> more.

I can confirm that the series converged nicely from the very beginning,
and Vlastimil was quick to address review feedback and the really
moderate fallout.

Various stress tests on both RT and mainline with the latest patches
applied look rock solid now. There might still be some small dragons
lurking, but I don't think there is a danger of a big fallout.

>> It isn't terribly
>> well reviewed?
>
> Yeah that could be better, the pool of people deeply familiar with SLUB
> is not large, unfortunately. I hope folks will still step up!

I've reviewed the lot several times with my RT hat on. I'm surely not
qualifying as deeply familiar, but I've been dealing with taming SLUB
and the page allocator to play nicely with RT for almost 10 years now...

Thanks,

        tglx

Patch

diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index dcde82a4434c..b5bcac29b979 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -10,6 +10,7 @@ 
 #include <linux/kfence.h>
 #include <linux/kobject.h>
 #include <linux/reciprocal_div.h>
+#include <linux/local_lock.h>
 
 enum stat_item {
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */
@@ -41,6 +42,7 @@  enum stat_item {
 	NR_SLUB_STAT_ITEMS };
 
 struct kmem_cache_cpu {
+	local_lock_t lock;	/* Protects the fields below except stat */
 	void **freelist;	/* Pointer to next available object */
 	unsigned long tid;	/* Globally unique transaction id */
 	struct page *page;	/* The slab from which we are allocating */
diff --git a/mm/slub.c b/mm/slub.c
index 690e762912b7..8052334fcc56 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -46,13 +46,21 @@ 
 /*
  * Lock order:
  *   1. slab_mutex (Global Mutex)
- *   2. node->list_lock
- *   3. slab_lock(page) (Only on some arches and for debugging)
+ *   2. node->list_lock (Spinlock)
+ *   3. kmem_cache->cpu_slab->lock (Local lock)
+ *   4. slab_lock(page) (Only on some arches or for debugging)
+ *   5. object_map_lock (Only for debugging)
  *
  *   slab_mutex
  *
  *   The role of the slab_mutex is to protect the list of all the slabs
  *   and to synchronize major metadata changes to slab cache structures.
+ *   Also synchronizes memory hotplug callbacks.
+ *
+ *   slab_lock
+ *
+ *   The slab_lock is a wrapper around the page lock, thus it is a bit
+ *   spinlock.
  *
  *   The slab_lock is only used for debugging and on arches that do not
  *   have the ability to do a cmpxchg_double. It only protects:
@@ -61,6 +69,8 @@ 
  *	C. page->objects	-> Number of objects in page
  *	D. page->frozen		-> frozen state
  *
+ *   Frozen slabs
+ *
  *   If a slab is frozen then it is exempt from list management. It is not
  *   on any list except per cpu partial list. The processor that froze the
  *   slab is the one who can perform list operations on the page. Other
@@ -68,6 +78,8 @@ 
  *   froze the slab is the only one that can retrieve the objects from the
  *   page's freelist.
  *
+ *   list_lock
+ *
  *   The list_lock protects the partial and full list on each node and
  *   the partial slab counter. If taken then no new slabs may be added or
  *   removed from the lists nor make the number of partial slabs be modified.
@@ -79,10 +91,36 @@ 
  *   slabs, operations can continue without any centralized lock. F.e.
  *   allocating a long series of objects that fill up slabs does not require
  *   the list lock.
- *   Interrupts are disabled during allocation and deallocation in order to
- *   make the slab allocator safe to use in the context of an irq. In addition
- *   interrupts are disabled to ensure that the processor does not change
- *   while handling per_cpu slabs, due to kernel preemption.
+ *
+ *   cpu_slab->lock local lock
+ *
+ *   This lock protects slowpath manipulation of all kmem_cache_cpu fields
+ *   except the stat counters. This is a percpu structure manipulated only by
+ *   the local cpu, so the lock protects against being preempted or interrupted
+ *   by an irq. Fast path operations rely on lockless operations instead.
+ *   On PREEMPT_RT, the local lock does not actually disable irqs (and thus
+ *   prevent the lockless operations), so fastpath operations also need to take
+ *   the lock and are no longer lockless.
+ *
+ *   lockless fastpaths
+ *
+ *   The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
+ *   are fully lockless when satisfied from the percpu slab (and when
+ *   cmpxchg_double is possible to use, otherwise slab_lock is taken).
+ *   They also don't disable preemption or migration or irqs. They rely on
+ *   the transaction id (tid) field to detect being preempted or moved to
+ *   another cpu.
+ *
+ *   irq, preemption, migration considerations
+ *
+ *   Interrupts are disabled as part of list_lock or local_lock operations, or
+ *   around the slab_lock operation, in order to make the slab allocator safe
+ *   to use in the context of an irq.
+ *
+ *   In addition, preemption (or migration on PREEMPT_RT) is disabled in the
+ *   allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
+ *   local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
+ *   doesn't have to be revalidated in each section protected by the local lock.
  *
  * SLUB assigns one slab for allocation to each processor.
  * Allocations only occur from these slabs called cpu slabs.
@@ -2228,9 +2266,13 @@  static inline void note_cmpxchg_failure(const char *n,
 static void init_kmem_cache_cpus(struct kmem_cache *s)
 {
 	int cpu;
+	struct kmem_cache_cpu *c;
 
-	for_each_possible_cpu(cpu)
-		per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+	for_each_possible_cpu(cpu) {
+		c = per_cpu_ptr(s->cpu_slab, cpu);
+		local_lock_init(&c->lock);
+		c->tid = init_tid(cpu);
+	}
 }
 
 /*
@@ -2441,10 +2483,10 @@  static void unfreeze_partials(struct kmem_cache *s)
 	struct page *partial_page;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&s->cpu_slab->lock, flags);
 	partial_page = this_cpu_read(s->cpu_slab->partial);
 	this_cpu_write(s->cpu_slab->partial, NULL);
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 
 	if (partial_page)
 		__unfreeze_partials(s, partial_page);
@@ -2477,7 +2519,7 @@  static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 	int pages = 0;
 	int pobjects = 0;
 
-	local_irq_save(flags);
+	local_lock_irqsave(&s->cpu_slab->lock, flags);
 
 	oldpage = this_cpu_read(s->cpu_slab->partial);
 
@@ -2505,7 +2547,7 @@  static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 
 	this_cpu_write(s->cpu_slab->partial, page);
 
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 
 	if (page_to_unfreeze) {
 		__unfreeze_partials(s, page_to_unfreeze);
@@ -2529,7 +2571,7 @@  static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c,
 	struct page *page;
 
 	if (lock)
-		local_irq_save(flags);
+		local_lock_irqsave(&s->cpu_slab->lock, flags);
 
 	freelist = c->freelist;
 	page = c->page;
@@ -2539,7 +2581,7 @@  static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c,
 	c->tid = next_tid(c->tid);
 
 	if (lock)
-		local_irq_restore(flags);
+		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 
 	if (page)
 		deactivate_slab(s, page, freelist);
@@ -2827,9 +2869,9 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 		goto deactivate_slab;
 
 	/* must check again c->page in case we got preempted and it changed */
-	local_irq_save(flags);
+	local_lock_irqsave(&s->cpu_slab->lock, flags);
 	if (unlikely(page != c->page)) {
-		local_irq_restore(flags);
+		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 		goto reread_page;
 	}
 	freelist = c->freelist;
@@ -2840,7 +2882,7 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 
 	if (!freelist) {
 		c->page = NULL;
-		local_irq_restore(flags);
+		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 		stat(s, DEACTIVATE_BYPASS);
 		goto new_slab;
 	}
@@ -2849,7 +2891,11 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 
 load_freelist:
 
-	lockdep_assert_irqs_disabled();
+#ifdef CONFIG_PREEMPT_RT
+	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock.lock));
+#else
+	lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
+#endif
 
 	/*
 	 * freelist is pointing to the list of objects to be used.
@@ -2859,39 +2905,39 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 	VM_BUG_ON(!c->page->frozen);
 	c->freelist = get_freepointer(s, freelist);
 	c->tid = next_tid(c->tid);
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 	return freelist;
 
 deactivate_slab:
 
-	local_irq_save(flags);
+	local_lock_irqsave(&s->cpu_slab->lock, flags);
 	if (page != c->page) {
-		local_irq_restore(flags);
+		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 		goto reread_page;
 	}
 	freelist = c->freelist;
 	c->page = NULL;
 	c->freelist = NULL;
-	local_irq_restore(flags);
+	local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 	deactivate_slab(s, page, freelist);
 
 new_slab:
 
 	if (slub_percpu_partial(c)) {
-		local_irq_save(flags);
+		local_lock_irqsave(&s->cpu_slab->lock, flags);
 		if (unlikely(c->page)) {
-			local_irq_restore(flags);
+			local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 			goto reread_page;
 		}
 		if (unlikely(!slub_percpu_partial(c))) {
-			local_irq_restore(flags);
+			local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 			/* we were preempted and partial list got empty */
 			goto new_objects;
 		}
 
 		page = c->page = slub_percpu_partial(c);
 		slub_set_percpu_partial(c, page);
-		local_irq_restore(flags);
+		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 		stat(s, CPU_PARTIAL_ALLOC);
 		goto redo;
 	}
@@ -2944,7 +2990,7 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 
 retry_load_page:
 
-	local_irq_save(flags);
+	local_lock_irqsave(&s->cpu_slab->lock, flags);
 	if (unlikely(c->page)) {
 		void *flush_freelist = c->freelist;
 		struct page *flush_page = c->page;
@@ -2953,7 +2999,7 @@  static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 		c->freelist = NULL;
 		c->tid = next_tid(c->tid);
 
-		local_irq_restore(flags);
+		local_unlock_irqrestore(&s->cpu_slab->lock, flags);
 
 		deactivate_slab(s, flush_page, flush_freelist);
 
@@ -3072,7 +3118,15 @@  static __always_inline void *slab_alloc_node(struct kmem_cache *s,
 
 	object = c->freelist;
 	page = c->page;
-	if (unlikely(!object || !page || !node_match(page, node))) {
+	/*
+	 * We cannot use the lockless fastpath on PREEMPT_RT because if a
+	 * slowpath has taken the local_lock_irqsave(), it is not protected
+	 * against a fast path operation in an irq handler. So we need to take
+	 * the slow path which uses local_lock. It is still relatively fast if
+	 * there is a suitable cpu freelist.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPT_RT) ||
+	    unlikely(!object || !page || !node_match(page, node))) {
 		object = __slab_alloc(s, gfpflags, node, addr, c);
 	} else {
 		void *next_object = get_freepointer_safe(s, object);
@@ -3332,6 +3386,7 @@  static __always_inline void do_slab_free(struct kmem_cache *s,
 	barrier();
 
 	if (likely(page == c->page)) {
+#ifndef CONFIG_PREEMPT_RT
 		void **freelist = READ_ONCE(c->freelist);
 
 		set_freepointer(s, tail_obj, freelist);
@@ -3344,6 +3399,31 @@  static __always_inline void do_slab_free(struct kmem_cache *s,
 			note_cmpxchg_failure("slab_free", s, tid);
 			goto redo;
 		}
+#else /* CONFIG_PREEMPT_RT */
+		/*
+		 * We cannot use the lockless fastpath on PREEMPT_RT because if
+		 * a slowpath has taken the local_lock_irqsave(), it is not
+		 * protected against a fast path operation in an irq handler. So
+		 * we need to take the local_lock. We shouldn't simply defer to
+		 * __slab_free() as that wouldn't use the cpu freelist at all.
+		 */
+		void **freelist;
+
+		local_lock(&s->cpu_slab->lock);
+		c = this_cpu_ptr(s->cpu_slab);
+		if (unlikely(page != c->page)) {
+			local_unlock(&s->cpu_slab->lock);
+			goto redo;
+		}
+		tid = c->tid;
+		freelist = c->freelist;
+
+		set_freepointer(s, tail_obj, freelist);
+		c->freelist = head;
+		c->tid = next_tid(tid);
+
+		local_unlock(&s->cpu_slab->lock);
+#endif
 		stat(s, FREE_FASTPATH);
 	} else
 		__slab_free(s, page, head, tail_obj, cnt, addr);
@@ -3522,7 +3602,7 @@  int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 	 * handlers invoking normal fastpath.
 	 */
 	c = slub_get_cpu_ptr(s->cpu_slab);
-	local_irq_disable();
+	local_lock_irq(&s->cpu_slab->lock);
 
 	for (i = 0; i < size; i++) {
 		void *object = kfence_alloc(s, s->object_size, flags);
@@ -3543,7 +3623,7 @@  int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 			 */
 			c->tid = next_tid(c->tid);
 
-			local_irq_enable();
+			local_unlock_irq(&s->cpu_slab->lock);
 
 			/*
 			 * Invoking slow path likely have side-effect
@@ -3557,7 +3637,7 @@  int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 			c = this_cpu_ptr(s->cpu_slab);
 			maybe_wipe_obj_freeptr(s, p[i]);
 
-			local_irq_disable();
+			local_lock_irq(&s->cpu_slab->lock);
 
 			continue; /* goto for-loop */
 		}
@@ -3566,7 +3646,7 @@  int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 		maybe_wipe_obj_freeptr(s, p[i]);
 	}
 	c->tid = next_tid(c->tid);
-	local_irq_enable();
+	local_unlock_irq(&s->cpu_slab->lock);
 	slub_put_cpu_ptr(s->cpu_slab);
 
 	/*