
[bpf-next,v1,08/15] bpf: Adapt copy_map_value for multiple offset case

Message ID: 20220220134813.3411982-9-memxor@gmail.com (mailing list archive)
State: Changes Requested
Delegated to: BPF
Series: Introduce typed pointer support in BPF maps

Checks

Context Check Description
netdev/tree_selection success Clearly marked for bpf-next
netdev/fixes_present success Fixes tag not required for -next series
netdev/subject_prefix success Link
netdev/cover_letter success Series has a cover letter
netdev/patch_count success Link
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1450 this patch: 1450
netdev/cc_maintainers warning 5 maintainers not CCed: kpsingh@kernel.org john.fastabend@gmail.com kafai@fb.com songliubraving@fb.com yhs@fb.com
netdev/build_clang success Errors and warnings before: 196 this patch: 196
netdev/module_param success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1467 this patch: 1467
netdev/checkpatch warning WARNING: line length of 85 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next success VM_Test

Commit Message

Kumar Kartikeya Dwivedi Feb. 20, 2022, 1:48 p.m. UTC
The changes in this patch deserve a closer look, so they have been split
into their own independent patch. While earlier we only had to skip at most
two objects while copying into and out of the map value, we now potentially
have many objects (at most 8 + 2 = 10, due to the BPF_MAP_VALUE_OFF_MAX
limit).

Hence, divide the copy_map_value function into an inlined fast path and a
function call to a slow path. The slow path handles the case of more than 3
offsets, while the most common cases (0, 1, 2, or 3 offsets) are handled in
the inline function itself.

In copy_map_value_slow, we use 11 offsets so that the for loop copying the
value has no edge case for the last offset: map->value_size acts as a final
sentinel offset from which the remaining area to copy is computed.

Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
---
 include/linux/bpf.h  | 43 +++++++++++++++++++++++++++++++---
 kernel/bpf/syscall.c | 55 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 3 deletions(-)
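
The sentinel technique described above, in a minimal standalone sketch
(user-space C; the function and type names are invented for illustration and
are not part of the patch):

/* Copy a value while skipping 'cnt' regions sorted by offset.
 * r[cnt] is a sentinel entry { .off = value_size, .sz = 0 }, so the loop
 * needs no special handling for the tail after the last skipped region.
 */
#include <string.h>

struct skip_region { unsigned int off, sz; };

static void copy_skipping_regions(unsigned char *dst, const unsigned char *src,
				  const struct skip_region *r, int cnt)
{
	unsigned int cur = 0;
	int i;

	for (i = 0; i <= cnt; i++) {
		memcpy(dst + cur, src + cur, r[i].off - cur);
		cur = r[i].off + r[i].sz;
	}
}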

Comments

Alexei Starovoitov Feb. 22, 2022, 7:04 a.m. UTC | #1
On Sun, Feb 20, 2022 at 07:18:06PM +0530, Kumar Kartikeya Dwivedi wrote:
> The changes in this patch deserve closer look, so it has been split into
> its own independent patch. While earlier we just had to skip two objects
> at most while copying in and out of map, now we have potentially many
> objects (at most 8 + 2 = 10, due to the BPF_MAP_VALUE_OFF_MAX limit).
> 
> Hence, divide the copy_map_value function into an inlined fast path and
> function call to slowpath. The slowpath handles the case of > 3 offsets,
> while we handle the most common cases (0, 1, 2, or 3 offsets) in the
> inline function itself.
> 
> In copy_map_value_slow, we use 11 offsets, just to make the for loop
> that copies the value free of edge cases for the last offset, by using
> map->value_size as final offset to subtract remaining area to copy from.
> 
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> ---
>  include/linux/bpf.h  | 43 +++++++++++++++++++++++++++++++---
>  kernel/bpf/syscall.c | 55 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 95 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index ae599aaf8d4c..5d845ca02eba 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -253,12 +253,22 @@ static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
>  		memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock));
>  	if (unlikely(map_value_has_timer(map)))
>  		memset(dst + map->timer_off, 0, sizeof(struct bpf_timer));
> +	if (unlikely(map_value_has_ptr_to_btf_id(map))) {
> +		struct bpf_map_value_off *tab = map->ptr_off_tab;
> +		int i;
> +
> +		for (i = 0; i < tab->nr_off; i++)
> +			*(u64 *)(dst + tab->off[i].offset) = 0;
> +	}
>  }
>  
> +void copy_map_value_slow(struct bpf_map *map, void *dst, void *src, u32 s_off,
> +			 u32 s_sz, u32 t_off, u32 t_sz);
> +
>  /* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */
>  static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
>  {
> -	u32 s_off = 0, s_sz = 0, t_off = 0, t_sz = 0;
> +	u32 s_off = 0, s_sz = 0, t_off = 0, t_sz = 0, p_off = 0, p_sz = 0;
>  
>  	if (unlikely(map_value_has_spin_lock(map))) {
>  		s_off = map->spin_lock_off;
> @@ -268,13 +278,40 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
>  		t_off = map->timer_off;
>  		t_sz = sizeof(struct bpf_timer);
>  	}
> +	/* Multiple offset case is slow, offload to function */
> +	if (unlikely(map_value_has_ptr_to_btf_id(map))) {
> +		struct bpf_map_value_off *tab = map->ptr_off_tab;
> +
> +		/* Inline the likely common case */
> +		if (likely(tab->nr_off == 1)) {
> +			p_off = tab->off[0].offset;
> +			p_sz = sizeof(u64);
> +		} else {
> +			copy_map_value_slow(map, dst, src, s_off, s_sz, t_off, t_sz);
> +			return;
> +		}
> +	}
> +
> +	if (unlikely(s_sz || t_sz || p_sz)) {
> +		/* The order is p_off, t_off, s_off, use insertion sort */
>  
> -	if (unlikely(s_sz || t_sz)) {
> +		if (t_off < p_off || !t_sz) {
> +			swap(t_off, p_off);
> +			swap(t_sz, p_sz);
> +		}
>  		if (s_off < t_off || !s_sz) {
>  			swap(s_off, t_off);
>  			swap(s_sz, t_sz);
> +			if (t_off < p_off || !t_sz) {
> +				swap(t_off, p_off);
> +				swap(t_sz, p_sz);
> +			}
>  		}
> -		memcpy(dst, src, t_off);
> +
> +		memcpy(dst, src, p_off);
> +		memcpy(dst + p_off + p_sz,
> +		       src + p_off + p_sz,
> +		       t_off - p_off - p_sz);
>  		memcpy(dst + t_off + t_sz,
>  		       src + t_off + t_sz,
>  		       s_off - t_off - t_sz);
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index beb96866f34d..83d71d6912f5 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -30,6 +30,7 @@
>  #include <linux/pgtable.h>
>  #include <linux/bpf_lsm.h>
>  #include <linux/poll.h>
> +#include <linux/sort.h>
>  #include <linux/bpf-netns.h>
>  #include <linux/rcupdate_trace.h>
>  #include <linux/memcontrol.h>
> @@ -230,6 +231,60 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
>  	return err;
>  }
>  
> +static int copy_map_value_cmp(const void *_a, const void *_b)
> +{
> +	const u32 a = *(const u32 *)_a;
> +	const u32 b = *(const u32 *)_b;
> +
> +	/* We only need to sort based on offset */
> +	if (a < b)
> +		return -1;
> +	else if (a > b)
> +		return 1;
> +	return 0;
> +}
> +
> +void copy_map_value_slow(struct bpf_map *map, void *dst, void *src, u32 s_off,
> +			 u32 s_sz, u32 t_off, u32 t_sz)
> +{
> +	struct bpf_map_value_off *tab = map->ptr_off_tab; /* already set to non-NULL */
> +	/* 3 = 2 for bpf_timer, bpf_spin_lock, 1 for map->value_size sentinel */
> +	struct {
> +		u32 off;
> +		u32 sz;
> +	} off_arr[BPF_MAP_VALUE_OFF_MAX + 3];
> +	int i, cnt = 0;
> +
> +	/* Reconsider stack usage when bumping BPF_MAP_VALUE_OFF_MAX */
> +	BUILD_BUG_ON(sizeof(off_arr) != 88);
> +
> +	for (i = 0; i < tab->nr_off; i++) {
> +		off_arr[cnt].off = tab->off[i].offset;
> +		off_arr[cnt++].sz = sizeof(u64);
> +	}
> +	if (s_sz) {
> +		off_arr[cnt].off = s_off;
> +		off_arr[cnt++].sz = s_sz;
> +	}
> +	if (t_sz) {
> +		off_arr[cnt].off = t_off;
> +		off_arr[cnt++].sz = t_sz;
> +	}
> +	off_arr[cnt].off = map->value_size;
> +
> +	sort(off_arr, cnt, sizeof(off_arr[0]), copy_map_value_cmp, NULL);

Ouch. sort every time we need to copy map value?
sort it once please. 88 bytes in a map are worth it.
Especially since "slow" version will trigger with just 2 kptrs.
(if I understand this correctly).
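
A rough sketch of the "sort it once" direction: build and sort the table a
single time when the map is set up, so copies only walk an already sorted
array. The off_arr and off_arr_cnt fields below are hypothetical, added to
struct bpf_map purely for this sketch:

/* Sketch only: not part of this patch. Intended to run once, e.g. after
 * the map's BTF has been parsed, so copy_map_value() never has to sort.
 */
static void bpf_map_init_off_arr(struct bpf_map *map)
{
	struct bpf_map_value_off *tab = map->ptr_off_tab;
	u32 i, cnt = 0;

	if (map_value_has_ptr_to_btf_id(map)) {
		for (i = 0; i < tab->nr_off; i++) {
			map->off_arr[cnt].off = tab->off[i].offset;
			map->off_arr[cnt++].sz = sizeof(u64);
		}
	}
	if (map_value_has_spin_lock(map)) {
		map->off_arr[cnt].off = map->spin_lock_off;
		map->off_arr[cnt++].sz = sizeof(struct bpf_spin_lock);
	}
	if (map_value_has_timer(map)) {
		map->off_arr[cnt].off = map->timer_off;
		map->off_arr[cnt++].sz = sizeof(struct bpf_timer);
	}
	/* Sentinel entry, so the copy loop has no tail special case */
	map->off_arr[cnt].off = map->value_size;
	map->off_arr[cnt].sz = 0;
	map->off_arr_cnt = cnt;

	sort(map->off_arr, cnt, sizeof(map->off_arr[0]),
	     copy_map_value_cmp, NULL);
}
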
Kumar Kartikeya Dwivedi Feb. 23, 2022, 3:13 a.m. UTC | #2
On Tue, Feb 22, 2022 at 12:34:05PM IST, Alexei Starovoitov wrote:
> On Sun, Feb 20, 2022 at 07:18:06PM +0530, Kumar Kartikeya Dwivedi wrote:
> > [...]
> > +	sort(off_arr, cnt, sizeof(off_arr[0]), copy_map_value_cmp, NULL);
>
> Ouch. sort every time we need to copy map value?
> sort it once please. 88 bytes in a map are worth it.
> Especially since "slow" version will trigger with just 2 kptrs.
> (if I understand this correctly).

Ok, I also think we can reduce the size of the 88 bytes down to 55 bytes
(32-bit off + 8-bit size) and embed it in struct bpf_map. Then the shuffling
needed for timer and spin lock should also be gone.

--
Kartikeya
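
A sketch of the more compact per-map layout proposed here (names invented for
illustration): splitting offsets and sizes into parallel arrays brings the
table down to 11 * 4 + 11 * 1 = 55 bytes:

/* Hypothetical layout, not part of this patch: 8 kptrs + timer + spin_lock
 * + 1 sentinel = 11 entries. All region sizes (8, 4 or 16 bytes) fit in a u8.
 */
struct bpf_map_off_arr {
	u32 field_off[BPF_MAP_VALUE_OFF_MAX + 3];	/* 11 * 4 = 44 bytes */
	u8  field_sz[BPF_MAP_VALUE_OFF_MAX + 3];	/* 11 * 1 = 11 bytes */
};

With such a table embedded in struct bpf_map and sorted once, the insertion
sort of p_off/t_off/s_off in the inline copy_map_value() would no longer be
needed either.
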
Alexei Starovoitov Feb. 23, 2022, 9:41 p.m. UTC | #3
On Tue, Feb 22, 2022 at 7:13 PM Kumar Kartikeya Dwivedi
<memxor@gmail.com> wrote:
>
> On Tue, Feb 22, 2022 at 12:34:05PM IST, Alexei Starovoitov wrote:
> > On Sun, Feb 20, 2022 at 07:18:06PM +0530, Kumar Kartikeya Dwivedi wrote:
> > > [...]
> > > +   sort(off_arr, cnt, sizeof(off_arr[0]), copy_map_value_cmp, NULL);
> >
> > Ouch. sort every time we need to copy map value?
> > sort it once please. 88 bytes in a map are worth it.
> > Especially since "slow" version will trigger with just 2 kptrs.
> > (if I understand this correctly).
>
> Ok, also think we can reduce the size of the 88 bytes down to 55 bytes (32-bit
> off + 8-bit size), and embed it in struct map. Then the shuffling needed for
> timer and spin lock should also be gone.

We can probably make copy_map_value_slow the only function.
I suspect
+       /* There is always at least one element */
+       memcpy(dst, src, off_arr[0].off);
+       /* Copy the rest, while skipping other regions */
+       for (i = 1; i < cnt; i++) {
+               u32 curr_off = off_arr[i - 1].off + off_arr[i - 1].sz;
+               u32 next_off = off_arr[i].off;
+
+               memcpy(dst + curr_off, src + curr_off, next_off - curr_off);
+       }
is faster than the dance we currently do in copy_map_value with a bunch of if-s.
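
Combining both suggestions, the copy could indeed reduce to a single loop over
a presorted per-map table. A sketch, reusing the hypothetical {off, sz} table
from the earlier "sort it once" sketch:

/* Sketch only: assumes map->off_arr was built and sorted once at map
 * creation time and ends with a { map->value_size, 0 } sentinel entry.
 */
static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
{
	u32 cur = 0, i;

	for (i = 0; i <= map->off_arr_cnt; i++) {
		u32 next_off = map->off_arr[i].off;

		memcpy(dst + cur, src + cur, next_off - cur);
		cur = next_off + map->off_arr[i].sz;
	}
}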

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index ae599aaf8d4c..5d845ca02eba 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -253,12 +253,22 @@  static inline void check_and_init_map_value(struct bpf_map *map, void *dst)
 		memset(dst + map->spin_lock_off, 0, sizeof(struct bpf_spin_lock));
 	if (unlikely(map_value_has_timer(map)))
 		memset(dst + map->timer_off, 0, sizeof(struct bpf_timer));
+	if (unlikely(map_value_has_ptr_to_btf_id(map))) {
+		struct bpf_map_value_off *tab = map->ptr_off_tab;
+		int i;
+
+		for (i = 0; i < tab->nr_off; i++)
+			*(u64 *)(dst + tab->off[i].offset) = 0;
+	}
 }
 
+void copy_map_value_slow(struct bpf_map *map, void *dst, void *src, u32 s_off,
+			 u32 s_sz, u32 t_off, u32 t_sz);
+
 /* copy everything but bpf_spin_lock and bpf_timer. There could be one of each. */
 static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
 {
-	u32 s_off = 0, s_sz = 0, t_off = 0, t_sz = 0;
+	u32 s_off = 0, s_sz = 0, t_off = 0, t_sz = 0, p_off = 0, p_sz = 0;
 
 	if (unlikely(map_value_has_spin_lock(map))) {
 		s_off = map->spin_lock_off;
@@ -268,13 +278,40 @@  static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
 		t_off = map->timer_off;
 		t_sz = sizeof(struct bpf_timer);
 	}
+	/* Multiple offset case is slow, offload to function */
+	if (unlikely(map_value_has_ptr_to_btf_id(map))) {
+		struct bpf_map_value_off *tab = map->ptr_off_tab;
+
+		/* Inline the likely common case */
+		if (likely(tab->nr_off == 1)) {
+			p_off = tab->off[0].offset;
+			p_sz = sizeof(u64);
+		} else {
+			copy_map_value_slow(map, dst, src, s_off, s_sz, t_off, t_sz);
+			return;
+		}
+	}
+
+	if (unlikely(s_sz || t_sz || p_sz)) {
+		/* The order is p_off, t_off, s_off, use insertion sort */
 
-	if (unlikely(s_sz || t_sz)) {
+		if (t_off < p_off || !t_sz) {
+			swap(t_off, p_off);
+			swap(t_sz, p_sz);
+		}
 		if (s_off < t_off || !s_sz) {
 			swap(s_off, t_off);
 			swap(s_sz, t_sz);
+			if (t_off < p_off || !t_sz) {
+				swap(t_off, p_off);
+				swap(t_sz, p_sz);
+			}
 		}
-		memcpy(dst, src, t_off);
+
+		memcpy(dst, src, p_off);
+		memcpy(dst + p_off + p_sz,
+		       src + p_off + p_sz,
+		       t_off - p_off - p_sz);
 		memcpy(dst + t_off + t_sz,
 		       src + t_off + t_sz,
 		       s_off - t_off - t_sz);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index beb96866f34d..83d71d6912f5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -30,6 +30,7 @@ 
 #include <linux/pgtable.h>
 #include <linux/bpf_lsm.h>
 #include <linux/poll.h>
+#include <linux/sort.h>
 #include <linux/bpf-netns.h>
 #include <linux/rcupdate_trace.h>
 #include <linux/memcontrol.h>
@@ -230,6 +231,60 @@  static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
 	return err;
 }
 
+static int copy_map_value_cmp(const void *_a, const void *_b)
+{
+	const u32 a = *(const u32 *)_a;
+	const u32 b = *(const u32 *)_b;
+
+	/* We only need to sort based on offset */
+	if (a < b)
+		return -1;
+	else if (a > b)
+		return 1;
+	return 0;
+}
+
+void copy_map_value_slow(struct bpf_map *map, void *dst, void *src, u32 s_off,
+			 u32 s_sz, u32 t_off, u32 t_sz)
+{
+	struct bpf_map_value_off *tab = map->ptr_off_tab; /* already set to non-NULL */
+	/* 3 = 2 for bpf_timer, bpf_spin_lock, 1 for map->value_size sentinel */
+	struct {
+		u32 off;
+		u32 sz;
+	} off_arr[BPF_MAP_VALUE_OFF_MAX + 3];
+	int i, cnt = 0;
+
+	/* Reconsider stack usage when bumping BPF_MAP_VALUE_OFF_MAX */
+	BUILD_BUG_ON(sizeof(off_arr) != 88);
+
+	for (i = 0; i < tab->nr_off; i++) {
+		off_arr[cnt].off = tab->off[i].offset;
+		off_arr[cnt++].sz = sizeof(u64);
+	}
+	if (s_sz) {
+		off_arr[cnt].off = s_off;
+		off_arr[cnt++].sz = s_sz;
+	}
+	if (t_sz) {
+		off_arr[cnt].off = t_off;
+		off_arr[cnt++].sz = t_sz;
+	}
+	off_arr[cnt].off = map->value_size;
+
+	sort(off_arr, cnt, sizeof(off_arr[0]), copy_map_value_cmp, NULL);
+
+	/* There is always at least one element */
+	memcpy(dst, src, off_arr[0].off);
+	/* Copy the rest, while skipping other regions */
+	for (i = 1; i < cnt; i++) {
+		u32 curr_off = off_arr[i - 1].off + off_arr[i - 1].sz;
+		u32 next_off = off_arr[i].off;
+
+		memcpy(dst + curr_off, src + curr_off, next_off - curr_off);
+	}
+}
+
 static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
 			      __u64 flags)
 {