diff mbox series

[v4,07/10] mmap locking API: add mmap_read_trylock_non_owner()

Message ID 20200415004353.130248-8-walken@google.com (mailing list archive)
State New, archived
Headers show
Series Add a new mmap locking API wrapping mmap_sem calls | expand

Commit Message

Michel Lespinasse April 15, 2020, 12:43 a.m. UTC
Add a couple of APIs used only by kernel/bpf/stackmap.c:
- mmap_read_trylock_non_owner()
- mmap_read_unlock_non_owner() (may be called from a work queue).

It's still not ideal that bpf/stackmap subverts the lock ownership
in this way. Thanks to Peter Zijlstra for suggesting this API as the
least-ugly way of addressing this in the short term.

Signed-off-by: Michel Lespinasse <walken@google.com>
---
 include/linux/mmap_lock.h | 14 ++++++++++++++
 kernel/bpf/stackmap.c     | 17 +++++------------
 2 files changed, 19 insertions(+), 12 deletions(-)

Comments

Daniel Jordan April 20, 2020, 6:22 p.m. UTC | #1
On Tue, Apr 14, 2020 at 05:43:50PM -0700, Michel Lespinasse wrote:
> diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> index 11d41f0c7005..998968659892 100644
> --- a/kernel/bpf/stackmap.c
> +++ b/kernel/bpf/stackmap.c
> @@ -317,7 +316,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
>  	 * with build_id.
>  	 */
>  	if (!user || !current || !current->mm || irq_work_busy ||
> -	    mmap_read_trylock(current->mm) == 0) {
> +	    !mmap_read_trylock_non_owner(current->mm)) {
>  		/* cannot access current->mm, fall back to ips */
>  		for (i = 0; i < trace_nr; i++) {
>  			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
> @@ -342,16 +341,10 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
>  	}
>  
>  	if (!work) {
> -		mmap_read_unlock(current->mm);
> +		mmap_read_unlock_non_owner(current->mm);

These 'non_owner' calls are not intuitive because current _is_ the owner, so the
v3 version seems better, even if it adds a special wrapper for rwsem_release.

Though it makes some sense if you think, "we're consistently using the
non_owner APIs because there's a legitimate use somewhere else," so I'm fine
either way.

Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Matthew Wilcox April 20, 2020, 7:23 p.m. UTC | #2
On Mon, Apr 20, 2020 at 02:22:11PM -0400, Daniel Jordan wrote:
> On Tue, Apr 14, 2020 at 05:43:50PM -0700, Michel Lespinasse wrote:
> > diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> > index 11d41f0c7005..998968659892 100644
> > --- a/kernel/bpf/stackmap.c
> > +++ b/kernel/bpf/stackmap.c
> > @@ -317,7 +316,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
> >  	 * with build_id.
> >  	 */
> >  	if (!user || !current || !current->mm || irq_work_busy ||
> > -	    mmap_read_trylock(current->mm) == 0) {
> > +	    !mmap_read_trylock_non_owner(current->mm)) {
> >  		/* cannot access current->mm, fall back to ips */
> >  		for (i = 0; i < trace_nr; i++) {
> >  			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
> > @@ -342,16 +341,10 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
> >  	}
> >  
> >  	if (!work) {
> > -		mmap_read_unlock(current->mm);
> > +		mmap_read_unlock_non_owner(current->mm);
> 
> These 'non_owner' calls are not intuitive because current _is_ the owner, so the
> v3 version seems better, even if it adds a special wrapper for rwsem_release.
> 
> Though it makes some sense if you think, "we're consistently using the
> non_owner APIs because there's a legitimate use somewhere else," so I'm fine
> either way.

I'm not really a big fan of v3 nor v4.  What I'd like to see is a
'transfer of ownership' API.  This could be to a different task, IRQ work,
RCU, softirq, timer, ...

That would let us track locking dependencies across complex flows, eg this
wouldn't be warned about right now:

rcu_work():
	lock(C)
	kfree(B)
	unlock(A)
	unlock(C)

thread 1:
	lock(A)
	call_rcu(B)

thread 2:
	lock(C)
	synchronize_rcu()
	unlock(C)

but if we had an API that transferred ownership of A to RCU, then we'd
see the C->RCU->A->C cycle.

This is perhaps a bit much work to require of Laurent in order to get
this patchset merged, but something to think about.
Michel Lespinasse April 21, 2020, 12:55 a.m. UTC | #3
On Mon, Apr 20, 2020 at 12:23 PM Matthew Wilcox <willy@infradead.org> wrote:
> On Mon, Apr 20, 2020 at 02:22:11PM -0400, Daniel Jordan wrote:
> > On Tue, Apr 14, 2020 at 05:43:50PM -0700, Michel Lespinasse wrote:
> > > diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
> > > index 11d41f0c7005..998968659892 100644
> > > --- a/kernel/bpf/stackmap.c
> > > +++ b/kernel/bpf/stackmap.c
> > > @@ -317,7 +316,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
> > >      * with build_id.
> > >      */
> > >     if (!user || !current || !current->mm || irq_work_busy ||
> > > -       mmap_read_trylock(current->mm) == 0) {
> > > +       !mmap_read_trylock_non_owner(current->mm)) {
> > >             /* cannot access current->mm, fall back to ips */
> > >             for (i = 0; i < trace_nr; i++) {
> > >                     id_offs[i].status = BPF_STACK_BUILD_ID_IP;
> > > @@ -342,16 +341,10 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
> > >     }
> > >
> > >     if (!work) {
> > > -           mmap_read_unlock(current->mm);
> > > +           mmap_read_unlock_non_owner(current->mm);
> >
> > These 'non_owner' calls are not intuitive because current _is_ the owner, so the
> > v3 version seems better, even if it adds a special wrapper for rwsem_release.
> >
> > Though it makes some sense if you think, "we're consistently using the
> > non_owner APIs because there's a legitimate use somewhere else," so I'm fine
> > either way.
>
> I'm not really a big fan of v3 nor v4.  What I'd like to see is a
> 'transfer of ownership' API.  This could be to a different task, IRQ work,
> RCU, softirq, timer, ...
>
> That would let us track locking dependencies across complex flows, eg this
> wouldn't be warned about right now:
>
> rcu_work():
>         lock(C)
>         kfree(B)
>         unlock(A)
>         unlock(C)
>
> thread 1:
>         lock(A)
>         call_rcu(B)
>
> thread 2:
>         lock(C)
>         synchronize_rcu()
>         unlock(C)
>
> but if we had an API that transferred ownership of A to RCU, then we'd
> see the C->RCU->A->C cycle.
>
> This is perhaps a bit much work to require of Laurent in order to get
> this patchset merged, but something to think about.

I think fundamentally, lockdep is better suited to handling locks that
are owned by a given task. I think extending lockdep just for the bpf
stacktrace use case would be way overkill?

But yes, I agree that declining ownership as we do here leaves us open
to having lock dependency issues that lockdep won't diagnose.
diff mbox series

Patch

diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 9d34b0690403..1050257361aa 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -56,4 +56,18 @@  static inline void mmap_read_unlock(struct mm_struct *mm)
 	up_read(&mm->mmap_sem);
 }
 
+static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm)
+{
+	if (down_read_trylock(&mm->mmap_sem)) {
+		rwsem_release(&mm->mmap_sem.dep_map, _RET_IP_);
+		return true;
+	}
+	return false;
+}
+
+static inline void mmap_read_unlock_non_owner(struct mm_struct *mm)
+{
+	up_read_non_owner(&mm->mmap_sem);
+}
+
 #endif /* _LINUX_MMAP_LOCK_H */
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 11d41f0c7005..998968659892 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -33,7 +33,7 @@  struct bpf_stack_map {
 /* irq_work to run up_read() for build_id lookup in nmi context */
 struct stack_map_irq_work {
 	struct irq_work irq_work;
-	struct rw_semaphore *sem;
+	struct mm_struct *mm;
 };
 
 static void do_up_read(struct irq_work *entry)
@@ -44,8 +44,7 @@  static void do_up_read(struct irq_work *entry)
 		return;
 
 	work = container_of(entry, struct stack_map_irq_work, irq_work);
-	up_read_non_owner(work->sem);
-	work->sem = NULL;
+	mmap_read_unlock_non_owner(work->mm);
 }
 
 static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
@@ -317,7 +316,7 @@  static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 	 * with build_id.
 	 */
 	if (!user || !current || !current->mm || irq_work_busy ||
-	    mmap_read_trylock(current->mm) == 0) {
+	    !mmap_read_trylock_non_owner(current->mm)) {
 		/* cannot access current->mm, fall back to ips */
 		for (i = 0; i < trace_nr; i++) {
 			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
@@ -342,16 +341,10 @@  static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 	}
 
 	if (!work) {
-		mmap_read_unlock(current->mm);
+		mmap_read_unlock_non_owner(current->mm);
 	} else {
-		work->sem = &current->mm->mmap_sem;
+		work->mm = current->mm;
 		irq_work_queue(&work->irq_work);
-		/*
-		 * The irq_work will release the mmap_sem with
-		 * up_read_non_owner(). The rwsem_release() is called
-		 * here to release the lock from lockdep's perspective.
-		 */
-		rwsem_release(&current->mm->mmap_sem.dep_map, _RET_IP_);
 	}
 }