Message ID | Y67NrWVLaL7B/EuO@casper.infradead.org (mailing list archive)
--- | ---
State | New, archived
Series | [RFC] Catch dwmw2's deadlock
On Fri, 2022-12-30 at 11:38 +0000, Matthew Wilcox wrote:
> 
> <dwmw2> why doesn't lockdep catch us calling synchronize_srcu() with
> a lock held, and elsewhere obtaining that lock within an srcu critical
> region ("read lock") ?
> 
> Because synchronize_srcu() doesn't acquire the lock, merely checks that
> it isn't held.
> 
> Would this work? Not even compile tested.
> 
> You can put my SoB on this if it works.

Thanks. Not sure if this addresses the issue I was talking about.

There exists a completely unrelated mutex, let's call it kvm->lock.

One code path *holds* kvm->lock while calling synchronize_srcu().
Another code path is in a read-side section and attempts to obtain the
same kvm->lock.

Described here:

https://lore.kernel.org/kvm/20221229211737.138861-1-mhal@rbox.co/T/#m594c1068d7a659f0a1aab3b64b0903836919bb0a

> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> index ca4b5dcec675..e9c2ab8369c0 100644
> --- a/kernel/rcu/srcutree.c
> +++ b/kernel/rcu/srcutree.c
> @@ -1267,11 +1267,11 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
>  {
>  	struct rcu_synchronize rcu;
>  
> -	RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
> -			 lock_is_held(&rcu_bh_lock_map) ||
> +	rcu_lock_acquire(&ssp->dep_map);
> +	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
>  			 lock_is_held(&rcu_lock_map) ||
>  			 lock_is_held(&rcu_sched_lock_map),
> -			"Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
> +			"Illegal synchronize_srcu() in RCU read-side critical section");
>  
>  	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
>  		return;
> @@ -1282,6 +1282,7 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
>  	__call_srcu(ssp, &rcu.head, wakeme_after_rcu, do_norm);
>  	wait_for_completion(&rcu.completion);
>  	destroy_rcu_head_on_stack(&rcu.head);
> +	rcu_lock_release(&ssp->dep_map);
>  
>  	/*
>  	 * Make sure that later code is ordered after the SRCU grace
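For reference, the inversion being described reduces to the following sketch. The struct kvm field names (kvm->lock, kvm->srcu) are the real ones; the two functions are hypothetical stand-ins for the code paths in question:

/* Path A: holds kvm->lock across a synchronize_srcu() on kvm->srcu. */
static void path_a(struct kvm *kvm)
{
	mutex_lock(&kvm->lock);
	synchronize_srcu(&kvm->srcu);	/* waits for all readers to finish */
	mutex_unlock(&kvm->lock);
}

/* Path B: takes kvm->lock inside an SRCU read-side critical section. */
static void path_b(struct kvm *kvm)
{
	int idx = srcu_read_lock(&kvm->srcu);

	mutex_lock(&kvm->lock);		/* blocks behind path A, while ... */
	/* ... path A's synchronize_srcu() waits for this reader: deadlock. */
	mutex_unlock(&kvm->lock);
	srcu_read_unlock(&kvm->srcu, idx);
}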
> On Dec 30, 2022, at 7:28 AM, David Woodhouse <dwmw2@infradead.org> wrote:
> 
> On Fri, 2022-12-30 at 11:38 +0000, Matthew Wilcox wrote:
>> 
>> <dwmw2> why doesn't lockdep catch us calling synchronize_srcu() with
>> a lock held, and elsewhere obtaining that lock within an srcu critical
>> region ("read lock") ?
>> 
>> Because synchronize_srcu() doesn't acquire the lock, merely checks that
>> it isn't held.
>> 
>> Would this work? Not even compile tested.
>> 
>> You can put my SoB on this if it works.
> 
> Thanks. Not sure if this addresses the issue I was talking about.
> 
> There exists a completely unrelated mutex, let's call it kvm->lock.
> 
> One code path *holds* kvm->lock while calling synchronize_srcu().
> Another code path is in a read-side section and attempts to obtain the
> same kvm->lock.
> 
> Described here:
> 
> https://lore.kernel.org/kvm/20221229211737.138861-1-mhal@rbox.co/T/#m594c1068d7a659f0a1aab3b64b0903836919bb0a

I think the patch from Matthew Wilcox will address it because the
read-side section already acquires the dep_map. So the lockdep subsystem
should be able to nail the dependency. One comment on that below:

>> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
>> index ca4b5dcec675..e9c2ab8369c0 100644
>> --- a/kernel/rcu/srcutree.c
>> +++ b/kernel/rcu/srcutree.c
>> @@ -1267,11 +1267,11 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
>>  {
>>  	struct rcu_synchronize rcu;
>>  
>> -	RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
>> -			 lock_is_held(&rcu_bh_lock_map) ||
>> +	rcu_lock_acquire(&ssp->dep_map);
>> +	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
>>  			 lock_is_held(&rcu_lock_map) ||
>>  			 lock_is_held(&rcu_sched_lock_map),
>> -			"Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
>> +			"Illegal synchronize_srcu() in RCU read-side critical section");
>>  
>>  	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
>>  		return;

Should you not release here?

Thanks,

- Joel

>> @@ -1282,6 +1282,7 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
>>  	__call_srcu(ssp, &rcu.head, wakeme_after_rcu, do_norm);
>>  	wait_for_completion(&rcu.completion);
>>  	destroy_rcu_head_on_stack(&rcu.head);
>> +	rcu_lock_release(&ssp->dep_map);
>>  
>>  	/*
>>  	 * Make sure that later code is ordered after the SRCU grace
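The read-side acquisition Joel refers to is in srcu_read_lock() itself; simplified from include/linux/srcu.h of that era (a paraphrase, exact details vary by tree):

static inline int srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp)
{
	int retval;

	retval = __srcu_read_lock(ssp);
	rcu_lock_acquire(&ssp->dep_map);	/* recursive-read on the dep_map */
	return retval;
}

So every srcu_read_lock() already tells lockdep the dep_map is held for read; what the patch adds is the matching acquire/release around the grace-period wait, which is what would let lockdep connect the two code paths.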
On Fri, Dec 30, 2022 at 11:38:21AM +0000, Matthew Wilcox wrote:
> 
> <dwmw2> why doesn't lockdep catch us calling synchronize_srcu() with
> a lock held, and elsewhere obtaining that lock within an srcu critical
> region ("read lock") ?
> 
> Because synchronize_srcu() doesn't acquire the lock, merely checks that
> it isn't held.
> 
> Would this work? Not even compile tested.
> 
> You can put my SoB on this if it works.
> 
> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> index ca4b5dcec675..e9c2ab8369c0 100644
> --- a/kernel/rcu/srcutree.c
> +++ b/kernel/rcu/srcutree.c
> @@ -1267,11 +1267,11 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
>  {
>  	struct rcu_synchronize rcu;
>  
> -	RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
> -			 lock_is_held(&rcu_bh_lock_map) ||
> +	rcu_lock_acquire(&ssp->dep_map);

If this does do anything, why doesn't lockdep complain about things
like this?

	i1 = srcu_read_lock(&my_srcu);
	i2 = srcu_read_lock(&my_srcu);
	srcu_read_unlock(&my_srcu, i2);
	srcu_read_unlock(&my_srcu, i1);

Or does it in fact complain and I have somehow managed to confuse things
such that rcutorture does not detect such complaints? I would expect
rcutorture_extend_mask_max() to make this happen, but then again, RCU
has confused me in the past.

							Thanx, Paul

> +	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
>  			 lock_is_held(&rcu_lock_map) ||
>  			 lock_is_held(&rcu_sched_lock_map),
> -			"Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
> +			"Illegal synchronize_srcu() in RCU read-side critical section");
>  
>  	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
>  		return;
> @@ -1282,6 +1282,7 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
>  	__call_srcu(ssp, &rcu.head, wakeme_after_rcu, do_norm);
>  	wait_for_completion(&rcu.completion);
>  	destroy_rcu_head_on_stack(&rcu.head);
> +	rcu_lock_release(&ssp->dep_map);
>  
>  	/*
>  	 * Make sure that later code is ordered after the SRCU grace
On Fri, Dec 30, 2022 at 12:30:12PM -0800, Paul E. McKenney wrote:
> On Fri, Dec 30, 2022 at 11:38:21AM +0000, Matthew Wilcox wrote:
> > 
> > <dwmw2> why doesn't lockdep catch us calling synchronize_srcu() with
> > a lock held, and elsewhere obtaining that lock within an srcu critical
> > region ("read lock") ?
> > 
> > Because synchronize_srcu() doesn't acquire the lock, merely checks that
> > it isn't held.
> > 
> > Would this work? Not even compile tested.
> > 
> > You can put my SoB on this if it works.
> > 
> > diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> > index ca4b5dcec675..e9c2ab8369c0 100644
> > --- a/kernel/rcu/srcutree.c
> > +++ b/kernel/rcu/srcutree.c
> > @@ -1267,11 +1267,11 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
> >  {
> >  	struct rcu_synchronize rcu;
> >  
> > -	RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
> > -			 lock_is_held(&rcu_bh_lock_map) ||
> > +	rcu_lock_acquire(&ssp->dep_map);
> 
> If this does do anything, why doesn't lockdep complain about things
> like this?
> 
> 	i1 = srcu_read_lock(&my_srcu);
> 	i2 = srcu_read_lock(&my_srcu);
> 	srcu_read_unlock(&my_srcu, i2);
> 	srcu_read_unlock(&my_srcu, i1);
> 
> Or does it in fact complain and I have somehow managed to confuse things
> such that rcutorture does not detect such complaints? I would expect
> rcutorture_extend_mask_max() to make this happen, but then again, RCU
> has confused me in the past.

static inline void rcu_lock_acquire(struct lockdep_map *map)
{
	lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_);
}

That's clear as mud, of course. The fourth argument, which has the value
'2', is documented as 'int read', and that has the meaning:

 * Values for "read":
 *
 *   0: exclusive (write) acquire
 *   1: read-acquire (no recursion allowed)
 *   2: read-acquire with same-instance recursion allowed

But that's enough to establish the locking hierarchy. ie if you do:

	read-lock-A
	write-lock-B

in one thread and in the other thread do:

	write-lock-B
	read-lock-A

lockdep is still going to detect that as a deadly embrace. Isn't it?

In any case, I was just trying to point out the general outlines of
something I saw as a potential solution. If we need to turn that into
lock_acquire(map, 0, 0, 0, 0, NULL, _THIS_IP_), that's fine. And Joel
is right, there needs to be a release in that other path.

But I am not an expert in RCU, lockdep, or working while travelling,
so I was rather hoping someone like dwmw2 would pick up the sketch,
make it compile, and then test it.
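Mapping that abstract example onto the SRCU case (a sketch, not from the thread; my_srcu and my_mutex are made-up names):

static DEFINE_MUTEX(my_mutex);		/* plays the role of lock B */
DEFINE_STATIC_SRCU(my_srcu);		/* plays the role of lock A */

static void thread_one(void)
{
	int idx = srcu_read_lock(&my_srcu);	/* read-lock A (read == 2) */

	mutex_lock(&my_mutex);			/* write-lock B: records A -> B */
	mutex_unlock(&my_mutex);
	srcu_read_unlock(&my_srcu, idx);
}

static void thread_two(void)
{
	mutex_lock(&my_mutex);			/* write-lock B */
	synchronize_srcu(&my_srcu);		/* with the patch: acquires A's
						 * dep_map, recording B -> A */
	mutex_unlock(&my_mutex);
}

If lockdep treats the recursive-read acquisition as participating in the dependency graph, seeing both orderings should produce a circular-dependency splat.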
On Fri, Dec 30, 2022 at 09:58:23PM +0000, Matthew Wilcox wrote:
> On Fri, Dec 30, 2022 at 12:30:12PM -0800, Paul E. McKenney wrote:
> > On Fri, Dec 30, 2022 at 11:38:21AM +0000, Matthew Wilcox wrote:
> > > 
> > > <dwmw2> why doesn't lockdep catch us calling synchronize_srcu() with
> > > a lock held, and elsewhere obtaining that lock within an srcu critical
> > > region ("read lock") ?
> > > 
> > > Because synchronize_srcu() doesn't acquire the lock, merely checks that
> > > it isn't held.
> > > 
> > > Would this work? Not even compile tested.
> > > 
> > > You can put my SoB on this if it works.
> > > 
> > > diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> > > index ca4b5dcec675..e9c2ab8369c0 100644
> > > --- a/kernel/rcu/srcutree.c
> > > +++ b/kernel/rcu/srcutree.c
> > > @@ -1267,11 +1267,11 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
> > >  {
> > >  	struct rcu_synchronize rcu;
> > >  
> > > -	RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
> > > -			 lock_is_held(&rcu_bh_lock_map) ||
> > > +	rcu_lock_acquire(&ssp->dep_map);
> > 
> > If this does do anything, why doesn't lockdep complain about things
> > like this?
> > 
> > 	i1 = srcu_read_lock(&my_srcu);
> > 	i2 = srcu_read_lock(&my_srcu);
> > 	srcu_read_unlock(&my_srcu, i2);
> > 	srcu_read_unlock(&my_srcu, i1);
> > 
> > Or does it in fact complain and I have somehow managed to confuse things
> > such that rcutorture does not detect such complaints? I would expect
> > rcutorture_extend_mask_max() to make this happen, but then again, RCU
> > has confused me in the past.
> 
> static inline void rcu_lock_acquire(struct lockdep_map *map)
> {
> 	lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_);
> }
> 
> That's clear as mud, of course. The fourth argument, which has the value
> '2', is documented as 'int read', and that has the meaning:
> 
>  * Values for "read":
>  *
>  *   0: exclusive (write) acquire
>  *   1: read-acquire (no recursion allowed)
>  *   2: read-acquire with same-instance recursion allowed
> 
> But that's enough to establish the locking hierarchy. ie if you do:
> 
> 	read-lock-A
> 	write-lock-B
> 
> in one thread and in the other thread do:
> 
> 	write-lock-B
> 	read-lock-A
> 
> lockdep is still going to detect that as a deadly embrace. Isn't it?

Maybe?

> In any case, I was just trying to point out the general outlines of
> something I saw as a potential solution. If we need to turn that into
> lock_acquire(map, 0, 0, 0, 0, NULL, _THIS_IP_), that's fine. And Joel
> is right, there needs to be a release in that other path.
> 
> But I am not an expert in RCU, lockdep, or working while travelling,
> so I was rather hoping someone like dwmw2 would pick up the sketch,
> make it compile, and then test it.

Sounds like a plan to me!

							Thanx, Paul
On 12/30/22 14:18, Joel Fernandes wrote:
> I think the patch from Matthew Wilcox will address it because the
> read-side section already acquires the dep_map. So the lockdep subsystem
> should be able to nail the dependency.
(...)

Perhaps it's something misconfigured on my side, but I still don't see
any lockdep splats, just the usual task hang warning after 120s.

If that's any help, here's a crude selftest (actually a severed version
of xen_shinfo_test). Under current mainline 6.2.0-rc1 it results in
exactly the type of deadlocks described by David.

Michal

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 1750f91dd936..0f02a4fe9374 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -61,6 +61,7 @@ TEST_PROGS_x86_64 += x86_64/nx_huge_pages_test.sh
 # Compiled test targets
 TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test
 TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
+TEST_GEN_PROGS_x86_64 += x86_64/deadlocks_test
 TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
 TEST_GEN_PROGS_x86_64 += x86_64/exit_on_emulation_failure_test
 TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test
diff --git a/tools/testing/selftests/kvm/x86_64/deadlocks_test.c b/tools/testing/selftests/kvm/x86_64/deadlocks_test.c
new file mode 100644
index 000000000000..e6150a624905
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/deadlocks_test.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+
+#define RACE_ITERATIONS 0x1000
+
+#define EVTCHN_PORT 1
+#define XEN_HYPERCALL_MSR 0x40000000
+
+#define EVTCHNSTAT_interdomain 2
+#define __HYPERVISOR_event_channel_op 32
+#define EVTCHNOP_send 4
+
+#define VCPU_INFO_REGION_GPA 0xc0000000ULL
+#define VCPU_INFO_REGION_SLOT 10
+
+struct evtchn_send {
+	u32 port;
+};
+
+static void evtchn_assign(struct kvm_vm *vm)
+{
+	struct kvm_xen_hvm_attr assign = {
+		.type = KVM_XEN_ATTR_TYPE_EVTCHN,
+		.u.evtchn = {
+			.flags = 0,
+			.send_port = EVTCHN_PORT,
+			.type = EVTCHNSTAT_interdomain,
+			.deliver.port = {
+				.port = EVTCHN_PORT,
+				.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
+				.vcpu = 0
+			}
+		}
+	};
+
+	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &assign);
+}
+
+static void *set_msr_filter(void *arg)
+{
+	struct kvm_vm *vm = (struct kvm_vm *)arg;
+
+	struct kvm_msr_filter filter = {
+		.flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
+		.ranges = {}
+	};
+
+	for (;;) {
+		vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter);
+		pthread_testcancel();
+	}
+
+	return NULL;
+}
+
+static void *set_pmu_filter(void *arg)
+{
+	struct kvm_vm *vm = (struct kvm_vm *)arg;
+
+	struct kvm_pmu_event_filter filter = {
+		.action = KVM_PMU_EVENT_ALLOW,
+		.flags = 0,
+		.nevents = 0
+	};
+
+	for (;;) {
+		vm_ioctl(vm, KVM_SET_PMU_EVENT_FILTER, &filter);
+		pthread_testcancel();
+	}
+
+	return NULL;
+}
+
+static void guest_code(void)
+{
+	struct evtchn_send s = { .port = EVTCHN_PORT };
+
+	for (;;) {
+		asm volatile("vmcall"
+			     :
+			     : "a" (__HYPERVISOR_event_channel_op),
+			       "D" (EVTCHNOP_send),
+			       "S" (&s)
+			     : "memory");
+		GUEST_SYNC(0);
+	}
+}
+
+static void race_against(struct kvm_vcpu *vcpu, void *(*func)(void *))
+{
+	pthread_t thread;
+	int i, ret;
+
+	ret = pthread_create(&thread, NULL, func, (void *)vcpu->vm);
+	TEST_ASSERT(ret == 0, "pthread_create() failed: %s", strerror(ret));
+
+	for (i = 0; i < RACE_ITERATIONS; ++i) {
+		fprintf(stderr, ".");
+		vcpu_run(vcpu);
+	}
+	printf("\n");
+
+	ret = pthread_cancel(thread);
+	TEST_ASSERT(ret == 0, "pthread_cancel() failed: %s", strerror(ret));
+
+	ret = pthread_join(thread, 0);
+	TEST_ASSERT(ret == 0, "pthread_join() failed: %s", strerror(ret));
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int xen_caps;
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+
+	xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM);
+	TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL);
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+	vcpu_set_hv_cpuid(vcpu);
+
+	struct kvm_xen_hvm_attr ha = {
+		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
+		.u.shared_info.gfn = KVM_XEN_INVALID_GFN,
+	};
+	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha);
+
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+				    VCPU_INFO_REGION_GPA, VCPU_INFO_REGION_SLOT, 1, 0);
+
+	struct kvm_xen_vcpu_attr vi = {
+		.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
+		.u.gpa = VCPU_INFO_REGION_GPA,
+	};
+	vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &vi);
+
+	struct kvm_xen_hvm_config hvmc = {
+		.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
+		.msr = XEN_HYPERCALL_MSR
+	};
+	vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc);
+
+	evtchn_assign(vm);
+
+	race_against(vcpu, set_msr_filter);
+	race_against(vcpu, set_pmu_filter);
+
+	kvm_vm_free(vm);
+	return 0;
+}
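Assuming the usual KVM selftests workflow, the reproducer above would be built and run with something like the following (an educated guess from the Makefile hunk, not from the thread; the exact invocation may differ by tree):

	make -C tools/testing/selftests TARGETS=kvm
	./tools/testing/selftests/kvm/x86_64/deadlocks_test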
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index ca4b5dcec675..e9c2ab8369c0 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1267,11 +1267,11 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
 {
 	struct rcu_synchronize rcu;
 
-	RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
-			 lock_is_held(&rcu_bh_lock_map) ||
+	rcu_lock_acquire(&ssp->dep_map);
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
 			 lock_is_held(&rcu_lock_map) ||
 			 lock_is_held(&rcu_sched_lock_map),
-			"Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
+			"Illegal synchronize_srcu() in RCU read-side critical section");
 
 	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
 		return;
@@ -1282,6 +1282,7 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
 	__call_srcu(ssp, &rcu.head, wakeme_after_rcu, do_norm);
 	wait_for_completion(&rcu.completion);
 	destroy_rcu_head_on_stack(&rcu.head);
+	rcu_lock_release(&ssp->dep_map);
 
 	/*
 	 * Make sure that later code is ordered after the SRCU grace
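For completeness, the release Joel asked about on the early-return path would look something like this (a sketch on top of the patch above, not posted in the thread):

	rcu_lock_acquire(&ssp->dep_map);
	...
	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE) {
		rcu_lock_release(&ssp->dep_map);	/* balance the acquire */
		return;
	}

Without this, the RCU_SCHEDULER_INACTIVE path would leave the dep_map marked as held, so lockdep state would be unbalanced for early-boot callers.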