[v3,bpf-next,1/2] bpf: Patch to Fix deadlocks in queue and stack maps

Message ID 20240514124052.1240266-2-sidchintamaneni@gmail.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series [v3,bpf-next,1/2] bpf: Patch to Fix deadlocks in queue and stack maps

Checks

Context Check Description
bpf/vmtest-bpf-next-PR fail PR summary
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 928 this patch: 928
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers fail 1 blamed authors not CCed: song@kernel.org; 8 maintainers not CCed: song@kernel.org sdf@google.com john.fastabend@gmail.com martin.lau@linux.dev kpsingh@kernel.org haoluo@google.com eddyz87@gmail.com jolsa@kernel.org
netdev/build_clang success Errors and warnings before: 937 this patch: 937
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 939 this patch: 939
netdev/checkpatch warning CHECK: Alignment should match open parenthesis CHECK: Please don't use multiple blank lines CHECK: Unnecessary parentheses around 'qs->map_locked' CHECK: Unnecessary parentheses around qs->map_locked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline fail Was 0 now: 2
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_maps, false, 360) / test_maps on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 fail Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 fail Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17

Commit Message

Siddharth Chintamaneni May 14, 2024, 12:40 p.m. UTC
This patch is a revised version that addresses a possible deadlock in the
queue and stack map types.

A deadlock can occur when a nested BPF program tries to acquire the same
lock already held by its parent BPF program while performing a write
operation on the same map. This bug was also reported by syzbot.

Link: https://lore.kernel.org/lkml/0000000000004c3fc90615f37756@google.com/
Reported-by: syzbot+8bdfc2c53fb2b63e1871@syzkaller.appspotmail.com
Fixes: f1a2e44a3aec ("bpf: add queue and stack maps")
Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
---
 kernel/bpf/queue_stack_maps.c | 76 +++++++++++++++++++++++++++++++++--
 1 file changed, 73 insertions(+), 3 deletions(-)
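
For illustration, a minimal sketch of the kind of nesting described above.
The map, program name, and attach point are hypothetical assumptions, not
the actual syzbot reproducer:

/* reproducer.bpf.c -- illustrative sketch only */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

struct {
	__uint(type, BPF_MAP_TYPE_QUEUE);
	__uint(max_entries, 16);
	__type(value, __u64);
} q SEC(".maps");

/* Assumed attach point: any function reached while qs->lock is held. */
SEC("fentry/function_reached_with_qs_lock_held")
int BPF_PROG(nested_push)
{
	__u64 v = 1;

	/* Without the per-CPU guard this re-enters the locked push path and
	 * spins on the already-held raw spinlock; with the guard it returns
	 * -EBUSY instead.
	 */
	bpf_map_push_elem(&q, &v, BPF_ANY);
	return 0;
}

With this patch applied, such a nested push on the same CPU fails fast with
-EBUSY instead of deadlocking on qs->lock.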

Comments

Kumar Kartikeya Dwivedi May 15, 2024, 5:32 p.m. UTC | #1
On Tue, 14 May 2024 at 14:41, Siddharth Chintamaneni
<sidchintamaneni@gmail.com> wrote:
>
> This patch is a revised version which addresses a possible deadlock issue in
> queue and stack map types.
>
> Deadlock could happen when a nested BPF program acquires the same lock
> as the parent BPF program to perform a write operation on the same map
> as the first one. This bug is also reported by syzbot.
>
> Link: https://lore.kernel.org/lkml/0000000000004c3fc90615f37756@google.com/
> Reported-by: syzbot+8bdfc2c53fb2b63e1871@syzkaller.appspotmail.com
> Fixes: f1a2e44a3aec ("bpf: add queue and stack maps")
> Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
> ---

Acked-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>

There are a couple of extra newlines; it's minor, but you could also fix
them if you respin.
Barret Rhoden May 16, 2024, 2:04 p.m. UTC | #2
On 5/14/24 08:40, Siddharth Chintamaneni wrote:
[...]
> +static inline int map_lock_inc(struct bpf_queue_stack *qs)
> +{
> +	unsigned long flags;
> +
> +	preempt_disable();
> +	local_irq_save(flags);
> +	if (unlikely(__this_cpu_inc_return(*(qs->map_locked)) != 1)) {
> +		__this_cpu_dec(*(qs->map_locked));
> +		local_irq_restore(flags);
> +		preempt_enable();
> +		return -EBUSY;
> +	}
> +
> +	local_irq_restore(flags);
> +	preempt_enable();

It looks like you're taking the approach from kernel/bpf/hashtab.c of
using a per-cpu lock before grabbing the real lock.  But in the success
case here (where you incremented the percpu counter), you're enabling
IRQs and preemption.

What happens if you get preempted right after this?  You've left the
per-cpu bit set, but then you run on another CPU.

A possible alternative: instead of splitting the overall lock into "grab
percpu lock, then grab real lock", have a single function for both,
similar to htab_lock_bucket(), and keep IRQs and preemption off from
the moment you start attempting the overall lock until you completely
unlock.

barret


> +
> +	return 0;
> +}
> +
> +static inline void map_unlock_dec(struct bpf_queue_stack *qs)
> +{
> +	unsigned long flags;
> +
> +	preempt_disable();
> +	local_irq_save(flags);
> +	__this_cpu_dec(*(qs->map_locked));
> +	local_irq_restore(flags);
> +	preempt_enable();
> +}
> +
>   static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
>   {
>   	struct bpf_queue_stack *qs = bpf_queue_stack(map);
>   	unsigned long flags;
>   	int err = 0;
>   	void *ptr;
> +	int ret;
> +
> +	ret = map_lock_inc(qs);
> +	if (ret)
> +		return ret;
>   
>   	if (in_nmi()) {
> -		if (!raw_spin_trylock_irqsave(&qs->lock, flags))
> +		if (!raw_spin_trylock_irqsave(&qs->lock, flags)) {
> +			map_unlock_dec(qs);
>   			return -EBUSY;
> +		}
>   	} else {
>   		raw_spin_lock_irqsave(&qs->lock, flags);
>   	}
> @@ -121,6 +170,8 @@ static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
>   
>   out:
>   	raw_spin_unlock_irqrestore(&qs->lock, flags);
> +	map_unlock_dec(qs);
> +
>   	return err;
>   }
>   
> @@ -132,10 +183,17 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
>   	int err = 0;
>   	void *ptr;
>   	u32 index;
> +	int ret;
> +
> +	ret = map_lock_inc(qs);
> +	if (ret)
> +		return ret;
>   
>   	if (in_nmi()) {
> -		if (!raw_spin_trylock_irqsave(&qs->lock, flags))
> +		if (!raw_spin_trylock_irqsave(&qs->lock, flags)) {
> +			map_unlock_dec(qs);
>   			return -EBUSY;
> +		}
>   	} else {
>   		raw_spin_lock_irqsave(&qs->lock, flags);
>   	}
> @@ -158,6 +216,8 @@ static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
>   
>   out:
>   	raw_spin_unlock_irqrestore(&qs->lock, flags);
> +	map_unlock_dec(qs);
> +
>   	return err;
>   }
>   
> @@ -193,6 +253,7 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
>   	unsigned long irq_flags;
>   	int err = 0;
>   	void *dst;
> +	int ret;
>   
>   	/* BPF_EXIST is used to force making room for a new element in case the
>   	 * map is full
> @@ -203,9 +264,16 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
>   	if (flags & BPF_NOEXIST || flags > BPF_EXIST)
>   		return -EINVAL;
>   
> +
> +	ret = map_lock_inc(qs);
> +	if (ret)
> +		return ret;
> +
>   	if (in_nmi()) {
> -		if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags))
> +		if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags)) {
> +			map_unlock_dec(qs);
>   			return -EBUSY;
> +		}
>   	} else {
>   		raw_spin_lock_irqsave(&qs->lock, irq_flags);
>   	}
> @@ -228,6 +296,8 @@ static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
>   
>   out:
>   	raw_spin_unlock_irqrestore(&qs->lock, irq_flags);
> +	map_unlock_dec(qs);
> +
>   	return err;
>   }
>
Kumar Kartikeya Dwivedi May 16, 2024, 2:34 p.m. UTC | #3
On Thu, 16 May 2024 at 16:05, Barret Rhoden <brho@google.com> wrote:
>
> On 5/14/24 08:40, Siddharth Chintamaneni wrote:
> [...]
> > +static inline int map_lock_inc(struct bpf_queue_stack *qs)
> > +{
> > +     unsigned long flags;
> > +
> > +     preempt_disable();
> > +     local_irq_save(flags);
> > +     if (unlikely(__this_cpu_inc_return(*(qs->map_locked)) != 1)) {
> > +             __this_cpu_dec(*(qs->map_locked));
> > +             local_irq_restore(flags);
> > +             preempt_enable();
> > +             return -EBUSY;
> > +     }
> > +
> > +     local_irq_restore(flags);
> > +     preempt_enable();
>
> it looks like you're taking the approach from kernel/bpf/hashtab.c to
> use a per-cpu lock before grabbing the real lock.  but in the success
> case here (where you incremented the percpu counter), you're enabling
> irqs and preemption.
>
> what happens if you get preempted right after this?  you've left the
> per-cpu bit set, but then you run on another cpu.

Great catch, that's a bug. It's not a problem when BPF programs call
this, as migration is disabled for them (though it's questionable whether
we should keep preemption enabled between map_inc/dec, which increases
the chances of conflicts on the same CPU), but it's certainly a problem
from the syscall path.

>
> possible alternative: instead of splitting the overall lock into "grab
> percpu lock, then grab real lock", have a single function for both,
> similar to htab_lock_bucket().  and keep irqs and preemption off from
> the moment you start attempting the overall lock until you completely
> unlock.

+1.

>
> barret
>
Siddharth Chintamaneni May 16, 2024, 3:31 p.m. UTC | #4
On Thu, 16 May 2024 at 10:34, Kumar Kartikeya Dwivedi <memxor@gmail.com> wrote:
>
> On Thu, 16 May 2024 at 16:05, Barret Rhoden <brho@google.com> wrote:
> >
> > On 5/14/24 08:40, Siddharth Chintamaneni wrote:
> > [...]
> > > +static inline int map_lock_inc(struct bpf_queue_stack *qs)
> > > +{
> > > +     unsigned long flags;
> > > +
> > > +     preempt_disable();
> > > +     local_irq_save(flags);
> > > +     if (unlikely(__this_cpu_inc_return(*(qs->map_locked)) != 1)) {
> > > +             __this_cpu_dec(*(qs->map_locked));
> > > +             local_irq_restore(flags);
> > > +             preempt_enable();
> > > +             return -EBUSY;
> > > +     }
> > > +
> > > +     local_irq_restore(flags);
> > > +     preempt_enable();
> >
> > it looks like you're taking the approach from kernel/bpf/hashtab.c to
> > use a per-cpu lock before grabbing the real lock.  but in the success
> > case here (where you incremented the percpu counter), you're enabling
> > irqs and preemption.
> >
> > what happens if you get preempted right after this?  you've left the
> > per-cpu bit set, but then you run on another cpu.
>
> Great catch, that's a bug. It's not a problem when BPF programs call
> this, as migration is disabled for them (but it's questionable whether
> we should keep preemption enabled between map_inc/dec increasing the
> chances of conflicts on the same CPU), but it's certainly a problem
> from the syscall path.
>

I was also thinking from the BPF program's perspective, as migration is
disabled for them. I will fix this.

> >
> > possible alternative: instead of splitting the overall lock into "grab
> > percpu lock, then grab real lock", have a single function for both,
> > similar to htab_lock_bucket().  and keep irqs and preemption off from
> > the moment you start attempting the overall lock until you completely
> > unlock.
>
> +1.
>
> >
> > barret
> >
Hou Tao May 17, 2024, 1:53 a.m. UTC | #5
Hi,

On 5/14/2024 8:40 PM, Siddharth Chintamaneni wrote:
> This patch is a revised version which addresses a possible deadlock issue in
> queue and stack map types.
>
> Deadlock could happen when a nested BPF program acquires the same lock
> as the parent BPF program to perform a write operation on the same map
> as the first one. This bug is also reported by syzbot.
>
> Link: https://lore.kernel.org/lkml/0000000000004c3fc90615f37756@google.com/
> Reported-by: syzbot+8bdfc2c53fb2b63e1871@syzkaller.appspotmail.com
> Fixes: f1a2e44a3aec ("bpf: add queue and stack maps")
> Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
> ---
>  kernel/bpf/queue_stack_maps.c | 76 +++++++++++++++++++++++++++++++++--
>  1 file changed, 73 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
> index d869f51ea93a..b5ed76c9ddd7 100644
> --- a/kernel/bpf/queue_stack_maps.c
> +++ b/kernel/bpf/queue_stack_maps.c
> @@ -13,11 +13,13 @@
>  #define QUEUE_STACK_CREATE_FLAG_MASK \
>  	(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
>  
> +
>  struct bpf_queue_stack {
>  	struct bpf_map map;
>  	raw_spinlock_t lock;
>  	u32 head, tail;
>  	u32 size; /* max_entries + 1 */
> +	int __percpu *map_locked;
>  
>  	char elements[] __aligned(8);
>  };
> @@ -78,6 +80,15 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
>  
>  	qs->size = size;
>  
> +	qs->map_locked = bpf_map_alloc_percpu(&qs->map,
> +						sizeof(int),
> +						sizeof(int),
> +						GFP_USER | __GFP_NOWARN);
> +	if (!qs->map_locked) {
> +		bpf_map_area_free(qs);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
>  	raw_spin_lock_init(&qs->lock);
>  
>  	return &qs->map;
> @@ -88,19 +99,57 @@ static void queue_stack_map_free(struct bpf_map *map)
>  {
>  	struct bpf_queue_stack *qs = bpf_queue_stack(map);
>  
> +	free_percpu(qs->map_locked);
>  	bpf_map_area_free(qs);
>  }
>  
> +static inline int map_lock_inc(struct bpf_queue_stack *qs)
> +{
> +	unsigned long flags;
> +
> +	preempt_disable();
> +	local_irq_save(flags);
> +	if (unlikely(__this_cpu_inc_return(*(qs->map_locked)) != 1)) {
> +		__this_cpu_dec(*(qs->map_locked));
> +		local_irq_restore(flags);
> +		preempt_enable();
> +		return -EBUSY;
> +	}
> +
> +	local_irq_restore(flags);
> +	preempt_enable();
> +
> +	return 0;
> +}
> +
> +static inline void map_unlock_dec(struct bpf_queue_stack *qs)
> +{
> +	unsigned long flags;
> +
> +	preempt_disable();
> +	local_irq_save(flags);
> +	__this_cpu_dec(*(qs->map_locked));
> +	local_irq_restore(flags);
> +	preempt_enable();
> +}
> +
>  static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
>  {
>  	struct bpf_queue_stack *qs = bpf_queue_stack(map);
>  	unsigned long flags;
>  	int err = 0;
>  	void *ptr;
> +	int ret;
> +
> +	ret = map_lock_inc(qs);
> +	if (ret)
> +		return ret;
>  
>  	if (in_nmi()) {
> -		if (!raw_spin_trylock_irqsave(&qs->lock, flags))
> +		if (!raw_spin_trylock_irqsave(&qs->lock, flags)) {
> +			map_unlock_dec(qs);
>  			return -EBUSY;
> +		}

With the percpu map_locked in place, I think the in_nmi() check could
also be removed. When BPF program X, which has already acquired the
lock, is interrupted by an NMI and the BPF program Y running in the NMI
also tries to acquire the same lock, Y will find map_locked is 1 and
return early.
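
For illustration only, assuming a combined qs_lock()/qs_unlock() helper as
suggested earlier in the thread (the helper names are assumptions),
__queue_map_get() could then drop the in_nmi() special case entirely; the
dequeue body below follows the existing helper:

static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
{
	struct bpf_queue_stack *qs = bpf_queue_stack(map);
	unsigned long flags;
	int err = 0;
	void *ptr;

	/* The per-CPU guard already rejects same-CPU re-entry, including
	 * from NMI, so no in_nmi()/trylock branch is needed.
	 */
	err = qs_lock(qs, &flags);
	if (err)
		return err;

	if (queue_stack_map_is_empty(qs)) {
		memset(value, 0, qs->map.value_size);
		err = -ENOENT;
		goto out;
	}

	ptr = &qs->elements[qs->tail * qs->map.value_size];
	memcpy(value, ptr, qs->map.value_size);

	if (delete) {
		if (unlikely(++qs->tail >= qs->size))
			qs->tail = 0;
	}

out:
	qs_unlock(qs, flags);
	return err;
}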
Siddharth Chintamaneni May 17, 2024, 3:32 a.m. UTC | #6
On Thu, 16 May 2024 at 21:53, Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi,
>
> On 5/14/2024 8:40 PM, Siddharth Chintamaneni wrote:
> > This patch is a revised version which addresses a possible deadlock issue in
> > queue and stack map types.
> >
> > Deadlock could happen when a nested BPF program acquires the same lock
> > as the parent BPF program to perform a write operation on the same map
> > as the first one. This bug is also reported by syzbot.
> >
> > Link: https://lore.kernel.org/lkml/0000000000004c3fc90615f37756@google.com/
> > Reported-by: syzbot+8bdfc2c53fb2b63e1871@syzkaller.appspotmail.com
> > Fixes: f1a2e44a3aec ("bpf: add queue and stack maps")
> > Signed-off-by: Siddharth Chintamaneni <sidchintamaneni@gmail.com>
> > ---
> >  kernel/bpf/queue_stack_maps.c | 76 +++++++++++++++++++++++++++++++++--
> >  1 file changed, 73 insertions(+), 3 deletions(-)
> >
> > diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
> > index d869f51ea93a..b5ed76c9ddd7 100644
> > --- a/kernel/bpf/queue_stack_maps.c
> > +++ b/kernel/bpf/queue_stack_maps.c
> > @@ -13,11 +13,13 @@
> >  #define QUEUE_STACK_CREATE_FLAG_MASK \
> >       (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
> >
> > +
> >  struct bpf_queue_stack {
> >       struct bpf_map map;
> >       raw_spinlock_t lock;
> >       u32 head, tail;
> >       u32 size; /* max_entries + 1 */
> > +     int __percpu *map_locked;
> >
> >       char elements[] __aligned(8);
> >  };
> > @@ -78,6 +80,15 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
> >
> >       qs->size = size;
> >
> > +     qs->map_locked = bpf_map_alloc_percpu(&qs->map,
> > +                                             sizeof(int),
> > +                                             sizeof(int),
> > +                                             GFP_USER | __GFP_NOWARN);
> > +     if (!qs->map_locked) {
> > +             bpf_map_area_free(qs);
> > +             return ERR_PTR(-ENOMEM);
> > +     }
> > +
> >       raw_spin_lock_init(&qs->lock);
> >
> >       return &qs->map;
> > @@ -88,19 +99,57 @@ static void queue_stack_map_free(struct bpf_map *map)
> >  {
> >       struct bpf_queue_stack *qs = bpf_queue_stack(map);
> >
> > +     free_percpu(qs->map_locked);
> >       bpf_map_area_free(qs);
> >  }
> >
> > +static inline int map_lock_inc(struct bpf_queue_stack *qs)
> > +{
> > +     unsigned long flags;
> > +
> > +     preempt_disable();
> > +     local_irq_save(flags);
> > +     if (unlikely(__this_cpu_inc_return(*(qs->map_locked)) != 1)) {
> > +             __this_cpu_dec(*(qs->map_locked));
> > +             local_irq_restore(flags);
> > +             preempt_enable();
> > +             return -EBUSY;
> > +     }
> > +
> > +     local_irq_restore(flags);
> > +     preempt_enable();
> > +
> > +     return 0;
> > +}
> > +
> > +static inline void map_unlock_dec(struct bpf_queue_stack *qs)
> > +{
> > +     unsigned long flags;
> > +
> > +     preempt_disable();
> > +     local_irq_save(flags);
> > +     __this_cpu_dec(*(qs->map_locked));
> > +     local_irq_restore(flags);
> > +     preempt_enable();
> > +}
> > +
> >  static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
> >  {
> >       struct bpf_queue_stack *qs = bpf_queue_stack(map);
> >       unsigned long flags;
> >       int err = 0;
> >       void *ptr;
> > +     int ret;
> > +
> > +     ret = map_lock_inc(qs);
> > +     if (ret)
> > +             return ret;
> >
> >       if (in_nmi()) {
> > -             if (!raw_spin_trylock_irqsave(&qs->lock, flags))
> > +             if (!raw_spin_trylock_irqsave(&qs->lock, flags)) {
> > +                     map_unlock_dec(qs);
> >                       return -EBUSY;
> > +             }
>
> With the percpu map_locked in place, I think the in_nmi() check could
> also be removed. When BPF program X, which has already acquired the
> lock, is interrupted by an NMI and the BPF program Y running in the NMI
> also tries to acquire the same lock, Y will find map_locked is 1 and
> return early.

Agreed. The same thing could be done for the ringbuf as well. I will fix
this in the revision of both patches.

>
>

Patch

diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index d869f51ea93a..b5ed76c9ddd7 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -13,11 +13,13 @@ 
 #define QUEUE_STACK_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK)
 
+
 struct bpf_queue_stack {
 	struct bpf_map map;
 	raw_spinlock_t lock;
 	u32 head, tail;
 	u32 size; /* max_entries + 1 */
+	int __percpu *map_locked;
 
 	char elements[] __aligned(8);
 };
@@ -78,6 +80,15 @@  static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
 
 	qs->size = size;
 
+	qs->map_locked = bpf_map_alloc_percpu(&qs->map,
+						sizeof(int),
+						sizeof(int),
+						GFP_USER | __GFP_NOWARN);
+	if (!qs->map_locked) {
+		bpf_map_area_free(qs);
+		return ERR_PTR(-ENOMEM);
+	}
+
 	raw_spin_lock_init(&qs->lock);
 
 	return &qs->map;
@@ -88,19 +99,57 @@  static void queue_stack_map_free(struct bpf_map *map)
 {
 	struct bpf_queue_stack *qs = bpf_queue_stack(map);
 
+	free_percpu(qs->map_locked);
 	bpf_map_area_free(qs);
 }
 
+static inline int map_lock_inc(struct bpf_queue_stack *qs)
+{
+	unsigned long flags;
+
+	preempt_disable();
+	local_irq_save(flags);
+	if (unlikely(__this_cpu_inc_return(*(qs->map_locked)) != 1)) {
+		__this_cpu_dec(*(qs->map_locked));
+		local_irq_restore(flags);
+		preempt_enable();
+		return -EBUSY;
+	}
+
+	local_irq_restore(flags);
+	preempt_enable();
+
+	return 0;
+}
+
+static inline void map_unlock_dec(struct bpf_queue_stack *qs)
+{
+	unsigned long flags;
+
+	preempt_disable();
+	local_irq_save(flags);
+	__this_cpu_dec(*(qs->map_locked));
+	local_irq_restore(flags);
+	preempt_enable();
+}
+
 static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
 {
 	struct bpf_queue_stack *qs = bpf_queue_stack(map);
 	unsigned long flags;
 	int err = 0;
 	void *ptr;
+	int ret;
+
+	ret = map_lock_inc(qs);
+	if (ret)
+		return ret;
 
 	if (in_nmi()) {
-		if (!raw_spin_trylock_irqsave(&qs->lock, flags))
+		if (!raw_spin_trylock_irqsave(&qs->lock, flags)) {
+			map_unlock_dec(qs);
 			return -EBUSY;
+		}
 	} else {
 		raw_spin_lock_irqsave(&qs->lock, flags);
 	}
@@ -121,6 +170,8 @@  static long __queue_map_get(struct bpf_map *map, void *value, bool delete)
 
 out:
 	raw_spin_unlock_irqrestore(&qs->lock, flags);
+	map_unlock_dec(qs);
+
 	return err;
 }
 
@@ -132,10 +183,17 @@  static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
 	int err = 0;
 	void *ptr;
 	u32 index;
+	int ret;
+
+	ret = map_lock_inc(qs);
+	if (ret)
+		return ret;
 
 	if (in_nmi()) {
-		if (!raw_spin_trylock_irqsave(&qs->lock, flags))
+		if (!raw_spin_trylock_irqsave(&qs->lock, flags)) {
+			map_unlock_dec(qs);
 			return -EBUSY;
+		}
 	} else {
 		raw_spin_lock_irqsave(&qs->lock, flags);
 	}
@@ -158,6 +216,8 @@  static long __stack_map_get(struct bpf_map *map, void *value, bool delete)
 
 out:
 	raw_spin_unlock_irqrestore(&qs->lock, flags);
+	map_unlock_dec(qs);
+
 	return err;
 }
 
@@ -193,6 +253,7 @@  static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
 	unsigned long irq_flags;
 	int err = 0;
 	void *dst;
+	int ret;
 
 	/* BPF_EXIST is used to force making room for a new element in case the
 	 * map is full
@@ -203,9 +264,16 @@  static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
 	if (flags & BPF_NOEXIST || flags > BPF_EXIST)
 		return -EINVAL;
 
+
+	ret = map_lock_inc(qs);
+	if (ret)
+		return ret;
+
 	if (in_nmi()) {
-		if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags))
+		if (!raw_spin_trylock_irqsave(&qs->lock, irq_flags)) {
+			map_unlock_dec(qs);
 			return -EBUSY;
+		}
 	} else {
 		raw_spin_lock_irqsave(&qs->lock, irq_flags);
 	}
@@ -228,6 +296,8 @@  static long queue_stack_map_push_elem(struct bpf_map *map, void *value,
 
 out:
 	raw_spin_unlock_irqrestore(&qs->lock, irq_flags);
+	map_unlock_dec(qs);
+
 	return err;
 }