diff mbox series

[RFC] Catch dwmw2's deadlock

Message ID Y67NrWVLaL7B/EuO@casper.infradead.org (mailing list archive)
State New, archived
Headers show
Series [RFC] Catch dwmw2's deadlock | expand

Commit Message

Matthew Wilcox (Oracle) Dec. 30, 2022, 11:38 a.m. UTC
<dwmw2> why doesn't lockdep catch us calling synchronize_srcu() with
   a lock held, and elsewhere obtaining that lock within an srcu critical
   region ("read lock") ?

Because synchronize_srcu() doesn't acquire the lock, merely checks that
it isn't held.

Would this work?  Not even compile tested.

You can put my SoB on this if it works.

Comments

David Woodhouse Dec. 30, 2022, 12:28 p.m. UTC | #1
On Fri, 2022-12-30 at 11:38 +0000, Matthew Wilcox wrote:
> 
> <dwmw2> why doesn't lockdep catch us calling synchronize_srcu() with
>    a lock held, and elsewhere obtaining that lock within an srcu critical
>    region ("read lock") ?
> 
> Because synchronize_srcu() doesn't acquire the lock, merely checks that
> it isn't held.
> 
> Would this work?  Not even compile tested.
> 
> You can put my SoB on this if it works.
> 

Thanks. Not sure if this addresses the issue I was talking about.

There exists a completely unrelated mutex, let's call it kvm->lock.

One code path *holds* kvm->lock while calling synchronize_srcu().
Another code path is in a read-side section and attempts to obtain the
same kvm->lock.

Described here:

https://lore.kernel.org/kvm/20221229211737.138861-1-mhal@rbox.co/T/#m594c1068d7a659f0a1aab3b64b0903836919bb0a

> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> index ca4b5dcec675..e9c2ab8369c0 100644
> --- a/kernel/rcu/srcutree.c
> +++ b/kernel/rcu/srcutree.c
> @@ -1267,11 +1267,11 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
>  {
>         struct rcu_synchronize rcu;
>  
> -       RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
> -                        lock_is_held(&rcu_bh_lock_map) ||
> +       rcu_lock_acquire(&ssp->dep_map);
> +       RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
>                          lock_is_held(&rcu_lock_map) ||
>                          lock_is_held(&rcu_sched_lock_map),
> -                        "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
> +                        "Illegal synchronize_srcu() in RCU read-side critical section");
>  
>         if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
>                 return;
> @@ -1282,6 +1282,7 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
>         __call_srcu(ssp, &rcu.head, wakeme_after_rcu, do_norm);
>         wait_for_completion(&rcu.completion);
>         destroy_rcu_head_on_stack(&rcu.head);
> +       rcu_lock_release(&ssp->dep_map);
>  
>         /*
>          * Make sure that later code is ordered after the SRCU grace
Joel Fernandes Dec. 30, 2022, 1:18 p.m. UTC | #2
> On Dec 30, 2022, at 7:28 AM, David Woodhouse <dwmw2@infradead.org> wrote:
> 
> On Fri, 2022-12-30 at 11:38 +0000, Matthew Wilcox wrote:
>> 
>> <dwmw2> why doesn't lockdep catch us calling synchronize_srcu() with
>>    a lock held, and elsewhere obtaining that lock within an srcu critical
>>    region ("read lock") ?
>> 
>> Because synchronize_srcu() doesn't acquire the lock, merely checks that
>> it isn't held.
>> 
>> Would this work?  Not even compile tested.
>> 
>> You can put my SoB on this if it works.
>> 
> 
> Thanks. Not sure if this addresses the issue I was talking about.
> 
> There exists a completely unrelated mutex, let's call it kvm->lock.
> 
> One code path *holds* kvm->lock while calling synchronize_srcu().
> Another code path is in a read-side section and attempts to obtain the
> same kvm->lock.
> 
> Described here:
> 
> https://lore.kernel.org/kvm/20221229211737.138861-1-mhal@rbox.co/T/#m594c1068d7a659f0a1aab3b64b0903836919bb0a

I think the patch from Matthew Wilcox will address it because the read side section already acquires the dep_map. So lockdep subsystem should be able to nail the dependency. One comment on that below:

> 
>> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
>> index ca4b5dcec675..e9c2ab8369c0 100644
>> --- a/kernel/rcu/srcutree.c
>> +++ b/kernel/rcu/srcutree.c
>> @@ -1267,11 +1267,11 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
>>  {
>>         struct rcu_synchronize rcu;
>>  
>> -       RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
>> -                        lock_is_held(&rcu_bh_lock_map) ||
>> +       rcu_lock_acquire(&ssp->dep_map);
>> +       RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
>>                          lock_is_held(&rcu_lock_map) ||
>>                          lock_is_held(&rcu_sched_lock_map),
>> -                        "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
>> +                        "Illegal synchronize_srcu() in RCU read-side critical section");
>>  
>>         if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
>>                 return;

Should you not release here?

Thanks,

- Joel 


>> @@ -1282,6 +1282,7 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
>>         __call_srcu(ssp, &rcu.head, wakeme_after_rcu, do_norm);
>>         wait_for_completion(&rcu.completion);
>>         destroy_rcu_head_on_stack(&rcu.head);
>> +       rcu_lock_release(&ssp->dep_map);
>>  
>>         /*
>>          * Make sure that later code is ordered after the SRCU grace
>
Paul E. McKenney Dec. 30, 2022, 8:30 p.m. UTC | #3
On Fri, Dec 30, 2022 at 11:38:21AM +0000, Matthew Wilcox wrote:
> 
> <dwmw2> why doesn't lockdep catch us calling synchronize_srcu() with
>    a lock held, and elsewhere obtaining that lock within an srcu critical
>    region ("read lock") ?
> 
> Because synchronize_srcu() doesn't acquire the lock, merely checks that
> it isn't held.
> 
> Would this work?  Not even compile tested.
> 
> You can put my SoB on this if it works.
> 
> diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> index ca4b5dcec675..e9c2ab8369c0 100644
> --- a/kernel/rcu/srcutree.c
> +++ b/kernel/rcu/srcutree.c
> @@ -1267,11 +1267,11 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
>  {
>  	struct rcu_synchronize rcu;
>  
> -	RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
> -			 lock_is_held(&rcu_bh_lock_map) ||
> +	rcu_lock_acquire(&ssp->dep_map);

If this does do anything, why doesn't lockdep complain about things
like this?

	i1 = srcu_read_lock(&my_srcu);
	i2 = srcu_read_lock(&my_srcu);
	srcu_read_unlock(&my_srcu, i2);
	srcu_read_unlock(&my_srcu, i1);

Or does it in fact complain and I have somehow managed to confuse things
such that rcutorture does not detect such complaints?  I would expect
rcutorture_extend_mask_max() to make this happen, but then again, RCU
has confused me in the past.

							Thanx, Paul

> +	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
>  			 lock_is_held(&rcu_lock_map) ||
>  			 lock_is_held(&rcu_sched_lock_map),
> -			 "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
> +			 "Illegal synchronize_srcu() in RCU read-side critical section");
>  
>  	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
>  		return;
> @@ -1282,6 +1282,7 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
>  	__call_srcu(ssp, &rcu.head, wakeme_after_rcu, do_norm);
>  	wait_for_completion(&rcu.completion);
>  	destroy_rcu_head_on_stack(&rcu.head);
> +	rcu_lock_release(&ssp->dep_map);
>  
>  	/*
>  	 * Make sure that later code is ordered after the SRCU grace
Matthew Wilcox (Oracle) Dec. 30, 2022, 9:58 p.m. UTC | #4
On Fri, Dec 30, 2022 at 12:30:12PM -0800, Paul E. McKenney wrote:
> On Fri, Dec 30, 2022 at 11:38:21AM +0000, Matthew Wilcox wrote:
> > 
> > <dwmw2> why doesn't lockdep catch us calling synchronize_srcu() with
> >    a lock held, and elsewhere obtaining that lock within an srcu critical
> >    region ("read lock") ?
> > 
> > Because synchronize_srcu() doesn't acquire the lock, merely checks that
> > it isn't held.
> > 
> > Would this work?  Not even compile tested.
> > 
> > You can put my SoB on this if it works.
> > 
> > diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> > index ca4b5dcec675..e9c2ab8369c0 100644
> > --- a/kernel/rcu/srcutree.c
> > +++ b/kernel/rcu/srcutree.c
> > @@ -1267,11 +1267,11 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
> >  {
> >  	struct rcu_synchronize rcu;
> >  
> > -	RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
> > -			 lock_is_held(&rcu_bh_lock_map) ||
> > +	rcu_lock_acquire(&ssp->dep_map);
> 
> If this does do anything, why doesn't lockdep complain about things
> like this?
> 
> 	i1 = srcu_read_lock(&my_srcu);
> 	i2 = srcu_read_lock(&my_srcu);
> 	srcu_read_unlock(&my_srcu, i2);
> 	srcu_read_unlock(&my_srcu, i1);
> 
> Or does it in fact complain and I have somehow managed to confuse things
> such that rcutorture does not detect such complaints?  I would expect
> rcutorture_extend_mask_max() to make this happen, but then again, RCU
> has confused me in the past.

static inline void rcu_lock_acquire(struct lockdep_map *map)
{
        lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_);
}

That's clear as mud, of course.  The fourth argument, which has value
'2' is documented as 'int read', and that has the meaning:

 * Values for "read":
 *
 *   0: exclusive (write) acquire
 *   1: read-acquire (no recursion allowed)
 *   2: read-acquire with same-instance recursion allowed

But that's enough to establish the locking hierarchy.  ie if you do:

read-lock-A
write-lock-B

in one thread and in the other thread do:

write-lock-B
read-lock-A

lockdep is still going to detect that as a deadly embrace.  Isn't it?

In any case, I was just trying to point out the general outlines of
something I saw as a potential solution.  If we need to turn that into
lock_acquire(map, 0, 0, 0, 0, NULL, _THIS_IP_), that's fine.  And Joel
is right, there needs to be a release in that other path.

But I am not an expert in RCU, lockdep, or working while travelling,
so I was rather hoping someone like dwmw2 would pick up the sketch,
make it compile, and then test it.
Paul E. McKenney Dec. 30, 2022, 11:24 p.m. UTC | #5
On Fri, Dec 30, 2022 at 09:58:23PM +0000, Matthew Wilcox wrote:
> On Fri, Dec 30, 2022 at 12:30:12PM -0800, Paul E. McKenney wrote:
> > On Fri, Dec 30, 2022 at 11:38:21AM +0000, Matthew Wilcox wrote:
> > > 
> > > <dwmw2> why doesn't lockdep catch us calling synchronize_srcu() with
> > >    a lock held, and elsewhere obtaining that lock within an srcu critical
> > >    region ("read lock") ?
> > > 
> > > Because synchronize_srcu() doesn't acquire the lock, merely checks that
> > > it isn't held.
> > > 
> > > Would this work?  Not even compile tested.
> > > 
> > > You can put my SoB on this if it works.
> > > 
> > > diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
> > > index ca4b5dcec675..e9c2ab8369c0 100644
> > > --- a/kernel/rcu/srcutree.c
> > > +++ b/kernel/rcu/srcutree.c
> > > @@ -1267,11 +1267,11 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
> > >  {
> > >  	struct rcu_synchronize rcu;
> > >  
> > > -	RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
> > > -			 lock_is_held(&rcu_bh_lock_map) ||
> > > +	rcu_lock_acquire(&ssp->dep_map);
> > 
> > If this does do anything, why doesn't lockdep complain about things
> > like this?
> > 
> > 	i1 = srcu_read_lock(&my_srcu);
> > 	i2 = srcu_read_lock(&my_srcu);
> > 	srcu_read_unlock(&my_srcu, i2);
> > 	srcu_read_unlock(&my_srcu, i1);
> > 
> > Or does it in fact complain and I have somehow managed to confuse things
> > such that rcutorture does not detect such complaints?  I would expect
> > rcutorture_extend_mask_max() to make this happen, but then again, RCU
> > has confused me in the past.
> 
> static inline void rcu_lock_acquire(struct lockdep_map *map)
> {
>         lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_);
> }
> 
> That's clear as mud, of course.  The fourth argument, which has value
> '2' is documented as 'int read', and that has the meaning:
> 
>  * Values for "read":
>  *
>  *   0: exclusive (write) acquire
>  *   1: read-acquire (no recursion allowed)
>  *   2: read-acquire with same-instance recursion allowed
> 
> But that's enough to establish the locking hierarchy.  ie if you do:
> 
> read-lock-A
> write-lock-B
> 
> in one thread and in the other thread do:
> 
> write-lock-B
> read-lock-A
> 
> lockdep is still going to detect that as a deadly embrace.  Isn't it?

Maybe?

> In any case, I was just trying to point out the general outlines of
> something I saw as a potential solution.  If we need to turn that into
> lock_acquire(map, 0, 0, 0, 0, NULL, _THIS_IP), that's fine.  And Joel
> is right, there needs to be a release in that other path.
> 
> But I am not an expert in RCU, lockdep, or working while travelling,
> so I was rather hoping someone like dwmw2 would pick up the sketch,
> make it compile, and then test it.

Sounds like a plan to me!

							Thanx, Paul
Michal Luczaj Dec. 31, 2022, 1:26 a.m. UTC | #6
On 12/30/22 14:18, Joel Fernandes wrote:
> I think the patch from Matthew Wilcox will address it because the
> read side section already acquires the dep_map. So lockdep subsystem
> should be able to nail the dependency. (...)

Perhaps it's something misconfigured on my side, but I still don't see any
lockdep splats, just the usual task hang warning after 120s.

If that's any help, here's a crude selftest (actually a severed version of
xen_shinfo_test). Under current mainline 6.2.0-rc1 it results in exactly
the type of deadlocks described by David.

Michal

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 1750f91dd936..0f02a4fe9374 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -61,6 +61,7 @@ TEST_PROGS_x86_64 += x86_64/nx_huge_pages_test.sh
 # Compiled test targets
 TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test
 TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test
+TEST_GEN_PROGS_x86_64 += x86_64/deadlocks_test
 TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features
 TEST_GEN_PROGS_x86_64 += x86_64/exit_on_emulation_failure_test
 TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test
diff --git a/tools/testing/selftests/kvm/x86_64/deadlocks_test.c b/tools/testing/selftests/kvm/x86_64/deadlocks_test.c
new file mode 100644
index 000000000000..e6150a624905
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/deadlocks_test.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+
+#define RACE_ITERATIONS			0x1000
+
+#define EVTCHN_PORT			1
+#define XEN_HYPERCALL_MSR		0x40000000
+
+#define EVTCHNSTAT_interdomain		2
+#define __HYPERVISOR_event_channel_op	32
+#define EVTCHNOP_send			4
+
+#define VCPU_INFO_REGION_GPA		0xc0000000ULL
+#define VCPU_INFO_REGION_SLOT		10
+
+struct evtchn_send {
+	u32 port;
+};
+
+static void evtchn_assign(struct kvm_vm *vm)
+{
+	struct kvm_xen_hvm_attr assign = {
+		.type = KVM_XEN_ATTR_TYPE_EVTCHN,
+		.u.evtchn = {
+			.flags = 0,
+			.send_port = EVTCHN_PORT,
+			.type = EVTCHNSTAT_interdomain,
+			.deliver.port = {
+				.port = EVTCHN_PORT,
+				.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL,
+				.vcpu = 0
+			}
+		}
+	};
+
+	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &assign);
+}
+
+static void *set_msr_filter(void *arg)
+{
+	struct kvm_vm *vm = (struct kvm_vm *)arg;
+
+	struct kvm_msr_filter filter = {
+		.flags = KVM_MSR_FILTER_DEFAULT_ALLOW,
+		.ranges = {}
+	};
+
+	for (;;) {
+		vm_ioctl(vm, KVM_X86_SET_MSR_FILTER, &filter);
+		pthread_testcancel();
+	}
+
+	return NULL;
+}
+
+static void *set_pmu_filter(void *arg)
+{
+	struct kvm_vm *vm = (struct kvm_vm *)arg;
+
+	struct kvm_pmu_event_filter filter = {
+		.action = KVM_PMU_EVENT_ALLOW,
+		.flags = 0,
+		.nevents = 0
+	};
+
+	for (;;) {
+		vm_ioctl(vm, KVM_SET_PMU_EVENT_FILTER, &filter);
+		pthread_testcancel();
+	}
+
+	return NULL;
+}
+
+static void guest_code(void)
+{
+	struct evtchn_send s = { .port = EVTCHN_PORT };
+
+	for (;;) {
+		asm volatile("vmcall"
+			     :
+			     : "a" (__HYPERVISOR_event_channel_op),
+			       "D" (EVTCHNOP_send),
+			       "S" (&s)
+			     : "memory");
+		GUEST_SYNC(0);
+	}
+}
+
+static void race_against(struct kvm_vcpu *vcpu, void *(*func)(void *))
+{
+	pthread_t thread;
+	int i, ret;
+
+	ret = pthread_create(&thread, NULL, func, (void *)vcpu->vm);
+	TEST_ASSERT(ret == 0, "pthread_create() failed: %s", strerror(ret));
+
+	for (i = 0; i < RACE_ITERATIONS; ++i) {
+		fprintf(stderr, ".");
+		vcpu_run(vcpu);
+	}
+	printf("\n");
+
+	ret = pthread_cancel(thread);
+	TEST_ASSERT(ret == 0, "pthread_cancel() failed: %s", strerror(ret));
+
+	ret = pthread_join(thread, 0);
+	TEST_ASSERT(ret == 0, "pthread_join() failed: %s", strerror(ret));
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned int xen_caps;
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+
+	xen_caps = kvm_check_cap(KVM_CAP_XEN_HVM);
+	TEST_REQUIRE(xen_caps & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL);
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code);
+	vcpu_set_hv_cpuid(vcpu);
+
+	struct kvm_xen_hvm_attr ha = {
+		.type = KVM_XEN_ATTR_TYPE_SHARED_INFO,
+		.u.shared_info.gfn = KVM_XEN_INVALID_GFN,
+	};
+	vm_ioctl(vm, KVM_XEN_HVM_SET_ATTR, &ha);
+
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+				    VCPU_INFO_REGION_GPA, VCPU_INFO_REGION_SLOT, 1, 0);
+
+	struct kvm_xen_vcpu_attr vi = {
+		.type = KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO,
+		.u.gpa = VCPU_INFO_REGION_GPA,
+	};
+	vcpu_ioctl(vcpu, KVM_XEN_VCPU_SET_ATTR, &vi);
+
+	struct kvm_xen_hvm_config hvmc = {
+		.flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL,
+		.msr = XEN_HYPERCALL_MSR
+	};
+	vm_ioctl(vm, KVM_XEN_HVM_CONFIG, &hvmc);
+
+	evtchn_assign(vm);
+
+	race_against(vcpu, set_msr_filter);
+	race_against(vcpu, set_pmu_filter);
+
+	kvm_vm_free(vm);
+	return 0;
+}
diff mbox series

Patch

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index ca4b5dcec675..e9c2ab8369c0 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -1267,11 +1267,11 @@  static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
 {
 	struct rcu_synchronize rcu;
 
-	RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
-			 lock_is_held(&rcu_bh_lock_map) ||
+	rcu_lock_acquire(&ssp->dep_map);
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
 			 lock_is_held(&rcu_lock_map) ||
 			 lock_is_held(&rcu_sched_lock_map),
-			 "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
+			 "Illegal synchronize_srcu() in RCU read-side critical section");
 
 	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
 		return;
@@ -1282,6 +1282,7 @@  static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
 	__call_srcu(ssp, &rcu.head, wakeme_after_rcu, do_norm);
 	wait_for_completion(&rcu.completion);
 	destroy_rcu_head_on_stack(&rcu.head);
+	rcu_lock_release(&ssp->dep_map);
 
 	/*
 	 * Make sure that later code is ordered after the SRCU grace