[bpf-next,v1,12/22] rqspinlock: Add basic support for CONFIG_PARAVIRT

Message ID 20250107140004.2732830-13-memxor@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series Resilient Queued Spin Lock

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-12 fail Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-7 fail Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 fail Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 fail Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 fail Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-39 fail Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-40 fail Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 fail Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
netdev/series_format fail Series longer than 15 patches
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1 this patch: 1
netdev/build_tools success Errors and warnings before: 0 (+0) this patch: 0 (+0)
netdev/cc_maintainers warning 11 maintainers not CCed: longman@redhat.com will@kernel.org arnd@arndb.de boqun.feng@gmail.com bp@alien8.de tglx@linutronix.de mingo@redhat.com hpa@zytor.com dave.hansen@linux.intel.com linux-arch@vger.kernel.org x86@kernel.org
netdev/build_clang success Errors and warnings before: 32 this patch: 32
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 5 this patch: 5
netdev/checkpatch warning WARNING: added, moved or deleted file(s), does MAINTAINERS need updating? WARNING: line length of 106 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns WARNING: please, no spaces at the start of a line
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 2 this patch: 2
netdev/source_inline fail Was 0 now: 1

Commit Message

Kumar Kartikeya Dwivedi Jan. 7, 2025, 1:59 p.m. UTC
We ripped out PV and virtualization-related bits from rqspinlock in an
earlier commit; however, a fair lock performs poorly within a virtual
machine when the lock holder is preempted. As such, retain the
virt_spin_lock fallback to a test-and-set lock, but with timeout and
deadlock detection.

We don't integrate support for CONFIG_PARAVIRT_SPINLOCKS yet, as that
requires more involved algorithmic changes and introduces more
complexity. It can be done when the need arises in the future.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
---
 arch/x86/include/asm/rqspinlock.h | 20 ++++++++++++++++
 include/asm-generic/rqspinlock.h  |  7 ++++++
 kernel/locking/rqspinlock.c       | 38 +++++++++++++++++++++++++++++++
 3 files changed, 65 insertions(+)
 create mode 100644 arch/x86/include/asm/rqspinlock.h

Comments

Waiman Long Jan. 8, 2025, 4:27 p.m. UTC | #1
On 1/7/25 8:59 AM, Kumar Kartikeya Dwivedi wrote:
> We ripped out PV and virtualization related bits from rqspinlock in an
> earlier commit, however, a fair lock performs poorly within a virtual
> machine when the lock holder is preempted. As such, retain the
> virt_spin_lock fallback to test and set lock, but with timeout and
> deadlock detection.
>
> We don't integrate support for CONFIG_PARAVIRT_SPINLOCKS yet, as that
> requires more involved algorithmic changes and introduces more
> complexity. It can be done when the need arises in the future.

virt_spin_lock() doesn't scale well. It is for hypervisors that don't 
support PV qspinlock yet. Now rqspinlock() will be in this category.

I wonder if we should provide an option to disable rqspinlock and fall 
back to the regular qspinlock with strict BPF locking semantics.

Another question that I have is about PREEMPT_RT kernel which cannot 
tolerate any locking stall. That will probably require disabling 
rqspinlock if CONFIG_PREEMPT_RT is enabled.

Cheers,
Longman
Kumar Kartikeya Dwivedi Jan. 8, 2025, 8:32 p.m. UTC | #2
On Wed, 8 Jan 2025 at 21:57, Waiman Long <llong@redhat.com> wrote:
>
> On 1/7/25 8:59 AM, Kumar Kartikeya Dwivedi wrote:
> > We ripped out PV and virtualization related bits from rqspinlock in an
> > earlier commit, however, a fair lock performs poorly within a virtual
> > machine when the lock holder is preempted. As such, retain the
> > virt_spin_lock fallback to test and set lock, but with timeout and
> > deadlock detection.
> >
> > We don't integrate support for CONFIG_PARAVIRT_SPINLOCKS yet, as that
> > requires more involved algorithmic changes and introduces more
> > complexity. It can be done when the need arises in the future.
>
> virt_spin_lock() doesn't scale well. It is for hypervisors that don't
> support PV qspinlock yet. Now rqspinlock() will be in this category.

We would need to make algorithmic changes to paravirt versions, which
would be too much for this series, so I didn't go there.

>
> I wonder if we should provide an option to disable rqspinlock and fall
> back to the regular qspinlock with strict BPF locking semantics.

That unfortunately won't work, because rqspinlock operates essentially
like a trylock, where it is allowed to fail and callers must handle
errors accordingly. Some of the users in BPF (e.g. in patch 17) remove
their per-cpu nesting counts to rely on AA deadlock detection of
rqspinlock, which would cause a deadlock if we transparently replace
it with qspinlock as a fallback.
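
To make that concrete, the intended caller pattern looks roughly like
the sketch below (a minimal illustration only; the res_spin_lock() /
res_spin_unlock() names, the error codes, and the bucket structure are
assumptions here, the real conversions happen in patches 17 and 18):

struct my_bucket {
	struct qspinlock lock;	/* hypothetical map bucket protected by rqspinlock */
	/* ... data ... */
};

/*
 * Illustrative only: every acquisition can fail, so the caller checks
 * the return value and propagates the error instead of assuming the
 * lock was taken.
 */
static int update_elem(struct my_bucket *b)
{
	int ret;

	ret = res_spin_lock(&b->lock);	/* may fail, e.g. on timeout or AA deadlock */
	if (ret)
		return ret;		/* handle the failure, no unbounded spinning */

	/* ... critical section ... */

	res_spin_unlock(&b->lock);
	return 0;
}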

>
> Another question that I have is about PREEMPT_RT kernel which cannot
> tolerate any locking stall. That will probably require disabling
> rqspinlock if CONFIG_PREEMPT_RT is enabled.

I think rqspinlock better maps to the raw spin lock variants, which
stay spinlocks on RT kernels, and as you see in patches 17 and 18,
BPF maps were already using the raw spin lock variants. To avoid
stalling, we perform deadlock checks immediately when we enter the
slow path, so for the cases where we rely upon rqspinlock to diagnose
and report an error, we'll recover quickly. If we still hit the
timeout, it is probably a different problem / bug anyway (and would
have caused a kernel hang otherwise).
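
As a rough sketch of what the early deadlock check means for the AA
case (illustrative only; the per-CPU held-locks table is the one
maintained by grab_held_lock_entry()/release_held_lock_entry() in the
patch below, and the structure and field names here are assumptions):

struct held_locks {
	int cnt;
	void *locks[4];		/* illustrative: task, softirq, hardirq, nmi */
};
static DEFINE_PER_CPU(struct held_locks, held_lock_table);

/*
 * Sketch: scan this CPU's table of currently held rqspinlocks on
 * slow-path entry and fail fast with -EDEADLK instead of spinning
 * until the timeout expires.
 */
static int check_aa_deadlock(struct qspinlock *lock)
{
	struct held_locks *held = this_cpu_ptr(&held_lock_table);
	int i;

	for (i = 0; i < held->cnt; i++) {
		if (held->locks[i] == lock)
			return -EDEADLK;
	}
	return 0;
}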

>
> Cheers,
> Longman
>
Waiman Long Jan. 9, 2025, 12:48 a.m. UTC | #3
On 1/8/25 3:32 PM, Kumar Kartikeya Dwivedi wrote:
> On Wed, 8 Jan 2025 at 21:57, Waiman Long <llong@redhat.com> wrote:
>> On 1/7/25 8:59 AM, Kumar Kartikeya Dwivedi wrote:
>>> We ripped out PV and virtualization related bits from rqspinlock in an
>>> earlier commit, however, a fair lock performs poorly within a virtual
>>> machine when the lock holder is preempted. As such, retain the
>>> virt_spin_lock fallback to test and set lock, but with timeout and
>>> deadlock detection.
>>>
>>> We don't integrate support for CONFIG_PARAVIRT_SPINLOCKS yet, as that
>>> requires more involved algorithmic changes and introduces more
>>> complexity. It can be done when the need arises in the future.
>> virt_spin_lock() doesn't scale well. It is for hypervisors that don't
>> support PV qspinlock yet. Now rqspinlock() will be in this category.
> We would need to make algorithmic changes to paravirt versions, which
> would be too much for this series, so I didn't go there.
I know. The paravirt part is the most difficult. It took me over a year 
to work on the paravirt part of qspinlock to get it right and merged 
upstream.
>
>> I wonder if we should provide an option to disable rqspinlock and fall
>> back to the regular qspinlock with strict BPF locking semantics.
> That unfortunately won't work, because rqspinlock operates essentially
> like a trylock, where it is allowed to fail and callers must handle
> errors accordingly. Some of the users in BPF (e.g. in patch 17) remove
> their per-cpu nesting counts to rely on AA deadlock detection of
> rqspinlock, which would cause a deadlock if we transparently replace
> it with qspinlock as a fallback.

I see. This information should be documented somewhere.


>> Another question that I have is about PREEMPT_RT kernel which cannot
>> tolerate any locking stall. That will probably require disabling
>> rqspinlock if CONFIG_PREEMPT_RT is enabled.
> I think rqspinlock better maps to the raw spin lock variants, which
> stays as a spin lock on RT kernels, and as you see in patch 17 and 18,
> BPF maps were already using the raw spin lock variants. To avoid
> stalling, we perform deadlock checks immediately when we enter the
> slow path, so for the cases where we rely upon rqspinlock to diagnose
> and report an error, we'll recover quickly. If we still hit the
> timeout it is probably a different problem / bug anyway (and would
> have caused a kernel hang otherwise).

Is the intention to only replace raw_spinlock_t by rqspinlock but never 
spinlock_t? Again, this information needs to be documented. Looking at 
the PDF file, it looks like the rqspinlock usage will be extended over time.

As for the locking semantics allowed by the BPF verifier, is it possible 
to enforce the strict locking rules for a PREEMPT_RT kernel and use the 
relaxed semantics for a non-PREEMPT_RT kernel? We don't want the loading 
of an arbitrary BPF program to break the latency guarantee of a 
PREEMPT_RT kernel.

Cheers,
Longman
Alexei Starovoitov Jan. 9, 2025, 2:42 a.m. UTC | #4
On Wed, Jan 8, 2025 at 4:48 PM Waiman Long <llong@redhat.com> wrote:
>
> Is the intention to only replace raw_spinlock_t by rqspinlock but never
> spinlock_t?

Correct. We brainstormed whether we can introduce resilient mutex
for sleepable context, but it's way out of scope and PI
considerations are too complex to think through.
rqspinlock is a spinning lock, so it's a replacement for raw_spin_lock
and really only for bpf use cases.

We considered placing rqspinlock.c in kernel/bpf/ directory
to discourage any other use beyond bpf,
but decided to keep it in kernel/locking/ only because
it's using mcs_spinlock.h and qspinlock_stat.h
and doing #include "../locking/mcs_spinlock.h"
is kinda ugly.

Patch 16 does:
+++ b/kernel/locking/Makefile
@@ -24,6 +24,9 @@  obj-$(CONFIG_SMP) += spinlock.o
 obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
+ifeq ($(CONFIG_BPF_SYSCALL),y)
+obj-$(CONFIG_QUEUED_SPINLOCKS) += rqspinlock.o
+endif

so that should give enough of a hint that it's for bpf usage.

> As for the locking semantics allowed by the BPF verifier, is it possible
> to enforce the strict locking rules for PREEMPT_RT kernel and use the
> relaxed semantics for non-PREEMPT_RT kernel. We don't want the loading
> of an arbitrary BPF program to break the latency guarantee of a
> PREEMPT_RT kernel.

Not really.
root can load silly bpf progs that take a significant amount of time
without abusing spinlocks.
Like 100k integer divides or a sequence of thousands of calls to map_update.
Long runtime of broken progs is a known issue.
We're working on a runtime termination check/watchdog that
will detect long running progs and will terminate them.
Safe termination is tricky, as you can imagine.
Waiman Long Jan. 9, 2025, 2:58 a.m. UTC | #5
On 1/8/25 9:42 PM, Alexei Starovoitov wrote:
> On Wed, Jan 8, 2025 at 4:48 PM Waiman Long <llong@redhat.com> wrote:
>> Is the intention to only replace raw_spinlock_t by rqspinlock but never
>> spinlock_t?
> Correct. We brainstormed whether we can introduce resilient mutex
> for sleepable context, but it's way out of scope and PI
> considerations are too complex to think through.
> rqspinlock is a spinning lock, so it's a replacement for raw_spin_lock
> and really only for bpf use cases.
Thanks for the confirmation. I think we should document the fact that 
rqspinlock is a replacement for raw_spin_lock only in the rqspinlock.c 
file to prevent possible abuse in the future.
>
> We considered placing rqspinlock.c in kernel/bpf/ directory
> to discourage any other use beyond bpf,
> but decided to keep in kernel/locking/ only because
> it's using mcs_spinlock.h and qspinlock_stat.h
> and doing #include "../locking/mcs_spinlock.h"
> is kinda ugly.
>
> Patch 16 does:
> +++ b/kernel/locking/Makefile
> @@ -24,6 +24,9 @@  obj-$(CONFIG_SMP) += spinlock.o
>   obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
>   obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
>   obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
> +ifeq ($(CONFIG_BPF_SYSCALL),y)
> +obj-$(CONFIG_QUEUED_SPINLOCKS) += rqspinlock.o
> +endif
>
> so that should give enough of a hint that it's for bpf usage.
>
>> As for the locking semantics allowed by the BPF verifier, is it possible
>> to enforce the strict locking rules for PREEMPT_RT kernel and use the
>> relaxed semantics for non-PREEMPT_RT kernel. We don't want the loading
>> of an arbitrary BPF program to break the latency guarantee of a
>> PREEMPT_RT kernel.
> Not really.
> root can load silly bpf progs that take significant
> amount time without abusing spinlocks.
> Like 100k integer divides or a sequence of thousands of calls to map_update.
> Long runtime of broken progs is a known issue.
> We're working on a runtime termination check/watchdog that
> will detect long running progs and will terminate them.
> Safe termination is tricky, as you can imagine.

Right.

In that case, we just have to warn users that they load BPF progs at 
their own risk and that a PREEMPT_RT kernel may break its latency guarantee.

Thanks,
Longman
Alexei Starovoitov Jan. 9, 2025, 3:37 a.m. UTC | #6
On Wed, Jan 8, 2025 at 6:58 PM Waiman Long <llong@redhat.com> wrote:
>
>
> On 1/8/25 9:42 PM, Alexei Starovoitov wrote:
> > On Wed, Jan 8, 2025 at 4:48 PM Waiman Long <llong@redhat.com> wrote:
> >> Is the intention to only replace raw_spinlock_t by rqspinlock but never
> >> spinlock_t?
> > Correct. We brainstormed whether we can introduce resilient mutex
> > for sleepable context, but it's way out of scope and PI
> > considerations are too complex to think through.
> > rqspinlock is a spinning lock, so it's a replacement for raw_spin_lock
> > and really only for bpf use cases.
> Thank for the confirmation. I think we should document the fact that
> rqspinlock is a replacement for raw_spin_lock only in the rqspinlock.c
> file to prevent possible abuse in the future.

Agreed.

> >
> > We considered placing rqspinlock.c in kernel/bpf/ directory
> > to discourage any other use beyond bpf,
> > but decided to keep in kernel/locking/ only because
> > it's using mcs_spinlock.h and qspinlock_stat.h
> > and doing #include "../locking/mcs_spinlock.h"
> > is kinda ugly.
> >
> > Patch 16 does:
> > +++ b/kernel/locking/Makefile
> > @@ -24,6 +24,9 @@  obj-$(CONFIG_SMP) += spinlock.o
> >   obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
> >   obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
> >   obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
> > +ifeq ($(CONFIG_BPF_SYSCALL),y)
> > +obj-$(CONFIG_QUEUED_SPINLOCKS) += rqspinlock.o
> > +endif
> >
> > so that should give enough of a hint that it's for bpf usage.
> >
> >> As for the locking semantics allowed by the BPF verifier, is it possible
> >> to enforce the strict locking rules for PREEMPT_RT kernel and use the
> >> relaxed semantics for non-PREEMPT_RT kernel. We don't want the loading
> >> of an arbitrary BPF program to break the latency guarantee of a
> >> PREEMPT_RT kernel.
> > Not really.
> > root can load silly bpf progs that take significant
> > amount time without abusing spinlocks.
> > Like 100k integer divides or a sequence of thousands of calls to map_update.
> > Long runtime of broken progs is a known issue.
> > We're working on a runtime termination check/watchdog that
> > will detect long running progs and will terminate them.
> > Safe termination is tricky, as you can imagine.
>
> Right.
>
> In that case, we just have to warn users that they can load BPF prog at
> their own risk and PREEMPT_RT kernel may break its latency guarantee.

Let's not open this can of worms.
There will be a proper watchdog eventually.
If we start to warn, when do we warn? On any bpf program loaded?
How about classic BPF? tcpdump and seccomp? They are limited
to 4k instructions, but folks can abuse that too.
Waiman Long Jan. 9, 2025, 3:46 a.m. UTC | #7
On 1/8/25 10:37 PM, Alexei Starovoitov wrote:
> On Wed, Jan 8, 2025 at 6:58 PM Waiman Long <llong@redhat.com> wrote:
>>
>> On 1/8/25 9:42 PM, Alexei Starovoitov wrote:
>>> On Wed, Jan 8, 2025 at 4:48 PM Waiman Long <llong@redhat.com> wrote:
>>>> Is the intention to only replace raw_spinlock_t by rqspinlock but never
>>>> spinlock_t?
>>> Correct. We brainstormed whether we can introduce resilient mutex
>>> for sleepable context, but it's way out of scope and PI
>>> considerations are too complex to think through.
>>> rqspinlock is a spinning lock, so it's a replacement for raw_spin_lock
>>> and really only for bpf use cases.
>> Thank for the confirmation. I think we should document the fact that
>> rqspinlock is a replacement for raw_spin_lock only in the rqspinlock.c
>> file to prevent possible abuse in the future.
> Agreed.
>
>>> We considered placing rqspinlock.c in kernel/bpf/ directory
>>> to discourage any other use beyond bpf,
>>> but decided to keep in kernel/locking/ only because
>>> it's using mcs_spinlock.h and qspinlock_stat.h
>>> and doing #include "../locking/mcs_spinlock.h"
>>> is kinda ugly.
>>>
>>> Patch 16 does:
>>> +++ b/kernel/locking/Makefile
>>> @@ -24,6 +24,9 @@  obj-$(CONFIG_SMP) += spinlock.o
>>>    obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
>>>    obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
>>>    obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
>>> +ifeq ($(CONFIG_BPF_SYSCALL),y)
>>> +obj-$(CONFIG_QUEUED_SPINLOCKS) += rqspinlock.o
>>> +endif
>>>
>>> so that should give enough of a hint that it's for bpf usage.
>>>
>>>> As for the locking semantics allowed by the BPF verifier, is it possible
>>>> to enforce the strict locking rules for PREEMPT_RT kernel and use the
>>>> relaxed semantics for non-PREEMPT_RT kernel. We don't want the loading
>>>> of an arbitrary BPF program to break the latency guarantee of a
>>>> PREEMPT_RT kernel.
>>> Not really.
>>> root can load silly bpf progs that take significant
>>> amount time without abusing spinlocks.
>>> Like 100k integer divides or a sequence of thousands of calls to map_update.
>>> Long runtime of broken progs is a known issue.
>>> We're working on a runtime termination check/watchdog that
>>> will detect long running progs and will terminate them.
>>> Safe termination is tricky, as you can imagine.
>> Right.
>>
>> In that case, we just have to warn users that they can load BPF prog at
>> their own risk and PREEMPT_RT kernel may break its latency guarantee.
> Let's not open this can of worms.
> There will be a proper watchdog eventually.
> If we start to warn, when do we warn? On any bpf program loaded?
> How about classic BPF ? tcpdump and seccomp ? They are limited
> to 4k instructions, but folks can abuse that too.

My intention is to document this somewhere, not to print out a warning 
in the kernel dmesg log.

Cheers,
Longman
Alexei Starovoitov Jan. 9, 2025, 3:53 a.m. UTC | #8
On Wed, Jan 8, 2025 at 7:46 PM Waiman Long <llong@redhat.com> wrote:
>
> >>>> As for the locking semantics allowed by the BPF verifier, is it possible
> >>>> to enforce the strict locking rules for PREEMPT_RT kernel and use the
> >>>> relaxed semantics for non-PREEMPT_RT kernel. We don't want the loading
> >>>> of an arbitrary BPF program to break the latency guarantee of a
> >>>> PREEMPT_RT kernel.
> >>> Not really.
> >>> root can load silly bpf progs that take significant
> >>> amount time without abusing spinlocks.
> >>> Like 100k integer divides or a sequence of thousands of calls to map_update.
> >>> Long runtime of broken progs is a known issue.
> >>> We're working on a runtime termination check/watchdog that
> >>> will detect long running progs and will terminate them.
> >>> Safe termination is tricky, as you can imagine.
> >> Right.
> >>
> >> In that case, we just have to warn users that they can load BPF prog at
> >> their own risk and PREEMPT_RT kernel may break its latency guarantee.
> > Let's not open this can of worms.
> > There will be a proper watchdog eventually.
> > If we start to warn, when do we warn? On any bpf program loaded?
> > How about classic BPF ? tcpdump and seccomp ? They are limited
> > to 4k instructions, but folks can abuse that too.
>
> My intention is to document this somewhere, not to print out a warning
> in the kernel dmesg log.

Document what exactly?
"Loading an arbitrary BPF program may break the latency guarantee of PREEMPT_RT"?
That's not helpful to anyone.
Especially since it undermines the giant effort we did together
with the RT folks to make bpf behave well on RT.
For a long time bpf was the only user of migrate_disable().
Some of the XDP bits became RT-friendly only in the last release. Etc.
Waiman Long Jan. 9, 2025, 3:58 a.m. UTC | #9
On 1/8/25 10:53 PM, Alexei Starovoitov wrote:
> On Wed, Jan 8, 2025 at 7:46 PM Waiman Long <llong@redhat.com> wrote:
>>>>>> As for the locking semantics allowed by the BPF verifier, is it possible
>>>>>> to enforce the strict locking rules for PREEMPT_RT kernel and use the
>>>>>> relaxed semantics for non-PREEMPT_RT kernel. We don't want the loading
>>>>>> of an arbitrary BPF program to break the latency guarantee of a
>>>>>> PREEMPT_RT kernel.
>>>>> Not really.
>>>>> root can load silly bpf progs that take significant
>>>>> amount time without abusing spinlocks.
>>>>> Like 100k integer divides or a sequence of thousands of calls to map_update.
>>>>> Long runtime of broken progs is a known issue.
>>>>> We're working on a runtime termination check/watchdog that
>>>>> will detect long running progs and will terminate them.
>>>>> Safe termination is tricky, as you can imagine.
>>>> Right.
>>>>
>>>> In that case, we just have to warn users that they can load BPF prog at
>>>> their own risk and PREEMPT_RT kernel may break its latency guarantee.
>>> Let's not open this can of worms.
>>> There will be a proper watchdog eventually.
>>> If we start to warn, when do we warn? On any bpf program loaded?
>>> How about classic BPF ? tcpdump and seccomp ? They are limited
>>> to 4k instructions, but folks can abuse that too.
>> My intention is to document this somewhere, not to print out a warning
>> in the kernel dmesg log.
> Document what exactly?
> "Loading arbitrary BPF program may break the latency guarantee of PREEMPT_RT"
> ?
> That's not helpful to anyone.
> Especially it undermines the giant effort we did together
> with RT folks to make bpf behave well on RT.
> For a long time bpf was the only user of migrate_disable().
> Some of XDP bits got friendly to RT only in the last release. Etc.

OK, it is just a suggestion. If you don't think that is necessary, I am 
not going to insist. Anyway, users should thoroughly test their BPF 
programs before deploying them on production systems.

Cheers,
Longman

Patch

diff --git a/arch/x86/include/asm/rqspinlock.h b/arch/x86/include/asm/rqspinlock.h
new file mode 100644
index 000000000000..ecfb7dfe6370
--- /dev/null
+++ b/arch/x86/include/asm/rqspinlock.h
@@ -0,0 +1,20 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_RQSPINLOCK_H
+#define _ASM_X86_RQSPINLOCK_H
+
+#include <asm/paravirt.h>
+
+#ifdef CONFIG_PARAVIRT
+DECLARE_STATIC_KEY_FALSE(virt_spin_lock_key);
+
+#define resilient_virt_spin_lock_enabled resilient_virt_spin_lock_enabled
+static __always_inline bool resilient_virt_spin_lock_enabled(void)
+{
+       return static_branch_likely(&virt_spin_lock_key);
+}
+
+#endif /* CONFIG_PARAVIRT */
+
+#include <asm-generic/rqspinlock.h>
+
+#endif /* _ASM_X86_RQSPINLOCK_H */
diff --git a/include/asm-generic/rqspinlock.h b/include/asm-generic/rqspinlock.h
index c7e33ccc57a6..dc436ab01471 100644
--- a/include/asm-generic/rqspinlock.h
+++ b/include/asm-generic/rqspinlock.h
@@ -17,6 +17,13 @@  struct qspinlock;
 
 extern int resilient_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val, u64 timeout);
 
+#ifndef resilient_virt_spin_lock_enabled
+static __always_inline bool resilient_virt_spin_lock_enabled(void)
+{
+	return false;
+}
+#endif
+
 /*
  * Default timeout for waiting loops is 0.5 seconds
  */
diff --git a/kernel/locking/rqspinlock.c b/kernel/locking/rqspinlock.c
index b7c86127d288..e397f91ebcf6 100644
--- a/kernel/locking/rqspinlock.c
+++ b/kernel/locking/rqspinlock.c
@@ -247,6 +247,41 @@  static noinline int check_timeout(struct qspinlock *lock, u32 mask,
  */
 #define RES_RESET_TIMEOUT(ts) ({ (ts).timeout_end = 0; })
 
+#ifdef CONFIG_PARAVIRT
+
+static inline int resilient_virt_spin_lock(struct qspinlock *lock, struct rqspinlock_timeout *ts)
+{
+	int val, ret = 0;
+
+	RES_RESET_TIMEOUT(*ts);
+	grab_held_lock_entry(lock);
+retry:
+	val = atomic_read(&lock->val);
+
+	if (val || !atomic_try_cmpxchg(&lock->val, &val, _Q_LOCKED_VAL)) {
+		if (RES_CHECK_TIMEOUT(*ts, ret, ~0u)) {
+			lockevent_inc(rqspinlock_lock_timeout);
+			goto timeout;
+		}
+		cpu_relax();
+		goto retry;
+	}
+
+	return 0;
+timeout:
+	release_held_lock_entry();
+	return ret;
+}
+
+#else
+
+static __always_inline int resilient_virt_spin_lock(struct qspinlock *lock, struct rqspinlock_timeout *ts)
+{
+	return 0;
+}
+
+#endif /* CONFIG_PARAVIRT */
+
 /*
  * Per-CPU queue node structures; we can never have more than 4 nested
  * contexts: task, softirq, hardirq, nmi.
@@ -287,6 +322,9 @@  int __lockfunc resilient_queued_spin_lock_slowpath(struct qspinlock *lock, u32 v
 
 	RES_INIT_TIMEOUT(ts, timeout);
 
+	if (resilient_virt_spin_lock_enabled())
+		return resilient_virt_spin_lock(lock, &ts);
+
 	/*
 	 * Wait for in-progress pending->locked hand-overs with a bounded
 	 * number of spins so that we guarantee forward progress.