
[bpf,1/2] bpf/memalloc: Non-atomically allocate freelist during prefill

Message ID d47f7d1c80b0eabfee89a0fc9ef75bbe3d1eced7.1689885610.git.zhuyifei@google.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Series bpf/memalloc: Allow non-atomic alloc_bulk

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf
netdev/fixes_present success Fixes tag present in non-next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1347 this patch: 1347
netdev/cc_maintainers fail 1 blamed authors not CCed: memxor@gmail.com; 7 maintainers not CCed: yhs@fb.com kpsingh@kernel.org memxor@gmail.com john.fastabend@gmail.com song@kernel.org jolsa@kernel.org haoluo@google.com
netdev/build_clang success Errors and warnings before: 1365 this patch: 1365
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success Fixes tag looks correct
netdev/build_allmodconfig_warn success Errors and warnings before: 1370 this patch: 1370
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 42 lines checked
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-PR success PR summary
bpf/vmtest-bpf-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-VM_Test-2 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-4 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-5 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-6 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-3 success Logs for build for s390x with gcc
bpf/vmtest-bpf-VM_Test-7 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-9 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-10 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-11 success Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-13 success Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-14 success Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-15 success Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-17 success Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-18 success Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-19 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-20 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-21 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-22 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-23 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-24 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-25 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-27 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-28 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-29 success Logs for veristat
bpf/vmtest-bpf-VM_Test-26 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-VM_Test-12 success Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-VM_Test-16 success Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-VM_Test-8 success Logs for test_maps on s390x with gcc

Commit Message

YiFei Zhu July 20, 2023, 8:44 p.m. UTC
Sometimes during prefill all percpu chunks are full and the atomic
__alloc_percpu_gfp would not allocate new chunks. This will cause
-ENOMEM immediately upon the next unit_alloc.

The prefill phase does not actually run in atomic context, so we can
use this fact to allocate non-atomically with GFP_KERNEL instead
of GFP_NOWAIT. This avoids the immediate -ENOMEM. Unfortunately
unit_alloc runs in atomic context, even from map item allocation in
syscalls, due to rcu_read_lock, so we can't do non-atomic
workarounds in unit_alloc.

Fixes: 4ab67149f3c6 ("bpf: Add percpu allocation support to bpf_mem_alloc.")
Signed-off-by: YiFei Zhu <zhuyifei@google.com>
---
 kernel/bpf/memalloc.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)
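
For context, the change amounts to choosing the GFP flags per alloc_bulk()
call site. Below is a simplified sketch of that decision (illustrative only,
not the actual kernel code; bulk_gfp() is a hypothetical helper named here
just for the illustration):

/* Simplified sketch of the flag selection alloc_bulk() ends up with after
 * this patch.  Illustrative only -- bulk_gfp() does not exist in the kernel.
 */
#include <linux/types.h>
#include <linux/gfp.h>

static gfp_t bulk_gfp(bool atomic)
{
	/* Both call sites keep __GFP_NOWARN (no allocation-failure splat)
	 * and __GFP_ACCOUNT (charge the allocation to the map's memcg).
	 */
	gfp_t gfp = __GFP_NOWARN | __GFP_ACCOUNT;

	/* bpf_mem_refill() runs from irq_work and must not sleep, so the
	 * refill path keeps GFP_NOWAIT (no direct reclaim).  The prefill
	 * path at map creation runs in process context and may sleep, so
	 * it can use GFP_KERNEL and let reclaim make forward progress.
	 */
	return gfp | (atomic ? GFP_NOWAIT : GFP_KERNEL);
}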

Comments

Hou Tao July 21, 2023, 1:44 a.m. UTC | #1
On 7/21/2023 4:44 AM, YiFei Zhu wrote:
> Sometimes during prefill all percpu chunks are full and the atomic
> __alloc_percpu_gfp would not allocate new chunks. This will cause
> -ENOMEM immediately upon the next unit_alloc.
>
> The prefill phase does not actually run in atomic context, so we can
> use this fact to allocate non-atomically with GFP_KERNEL instead
> of GFP_NOWAIT. This avoids the immediate -ENOMEM. Unfortunately
> unit_alloc runs in atomic context, even from map item allocation in
> syscalls, due to rcu_read_lock, so we can't do non-atomic
> workarounds in unit_alloc.
>
> Fixes: 4ab67149f3c6 ("bpf: Add percpu allocation support to bpf_mem_alloc.")
> Signed-off-by: YiFei Zhu <zhuyifei@google.com>

Make sense to me, so

Acked-by: Hou Tao <houtao1@huawei.com>

But I don't know whether or not it is suitable for bpf tree.
> ---
>  kernel/bpf/memalloc.c | 12 ++++++++----
>  1 file changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> index 0668bcd7c926..016249672b43 100644
> --- a/kernel/bpf/memalloc.c
> +++ b/kernel/bpf/memalloc.c
> @@ -154,13 +154,17 @@ static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
>  }
>  
>  /* Mostly runs from irq_work except __init phase. */
> -static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
> +static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
>  {
>  	struct mem_cgroup *memcg = NULL, *old_memcg;
>  	unsigned long flags;
> +	gfp_t gfp;
>  	void *obj;
>  	int i;
>  
> +	gfp = __GFP_NOWARN | __GFP_ACCOUNT;
> +	gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;
> +
>  	memcg = get_memcg(c);
>  	old_memcg = set_active_memcg(memcg);
>  	for (i = 0; i < cnt; i++) {
> @@ -183,7 +187,7 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
>  			 * will allocate from the current numa node which is what we
>  			 * want here.
>  			 */
> -			obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT);
> +			obj = __alloc(c, node, gfp);
>  			if (!obj)
>  				break;
>  		}
> @@ -321,7 +325,7 @@ static void bpf_mem_refill(struct irq_work *work)
>  		/* irq_work runs on this cpu and kmalloc will allocate
>  		 * from the current numa node which is what we want here.
>  		 */
> -		alloc_bulk(c, c->batch, NUMA_NO_NODE);
> +		alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
>  	else if (cnt > c->high_watermark)
>  		free_bulk(c);
>  }
> @@ -367,7 +371,7 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
>  	 * prog won't be doing more than 4 map_update_elem from
>  	 * irq disabled region
>  	 */
> -	alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu));
> +	alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false);
>  }
>  
>  /* When size != 0 bpf_mem_cache for each cpu.
YiFei Zhu July 21, 2023, 2:31 a.m. UTC | #2
On Thu, Jul 20, 2023 at 6:45 PM Hou Tao <houtao@huaweicloud.com> wrote:
> On 7/21/2023 4:44 AM, YiFei Zhu wrote:
> > Sometimes during prefill all percpu chunks are full and the atomic
> > __alloc_percpu_gfp would not allocate new chunks. This will cause
> > -ENOMEM immediately upon the next unit_alloc.
> >
> > The prefill phase does not actually run in atomic context, so we can
> > use this fact to allocate non-atomically with GFP_KERNEL instead
> > of GFP_NOWAIT. This avoids the immediate -ENOMEM. Unfortunately
> > unit_alloc runs in atomic context, even from map item allocation in
> > syscalls, due to rcu_read_lock, so we can't do non-atomic
> > workarounds in unit_alloc.
> >
> > Fixes: 4ab67149f3c6 ("bpf: Add percpu allocation support to bpf_mem_alloc.")
> > Signed-off-by: YiFei Zhu <zhuyifei@google.com>
>
> Make sense to me, so
>
> Acked-by: Hou Tao <houtao1@huawei.com>
>
> But I don't know whether or not it is suitable for bpf tree.

I don't mind either way :) If changing to bpf-next requires a resend I
can do that too.

YiFei Zhu

> > ---
> >  kernel/bpf/memalloc.c | 12 ++++++++----
> >  1 file changed, 8 insertions(+), 4 deletions(-)
> >
> > diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
> > index 0668bcd7c926..016249672b43 100644
> > --- a/kernel/bpf/memalloc.c
> > +++ b/kernel/bpf/memalloc.c
> > @@ -154,13 +154,17 @@ static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
> >  }
> >
> >  /* Mostly runs from irq_work except __init phase. */
> > -static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
> > +static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
> >  {
> >       struct mem_cgroup *memcg = NULL, *old_memcg;
> >       unsigned long flags;
> > +     gfp_t gfp;
> >       void *obj;
> >       int i;
> >
> > +     gfp = __GFP_NOWARN | __GFP_ACCOUNT;
> > +     gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;
> > +
> >       memcg = get_memcg(c);
> >       old_memcg = set_active_memcg(memcg);
> >       for (i = 0; i < cnt; i++) {
> > @@ -183,7 +187,7 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
> >                        * will allocate from the current numa node which is what we
> >                        * want here.
> >                        */
> > -                     obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT);
> > +                     obj = __alloc(c, node, gfp);
> >                       if (!obj)
> >                               break;
> >               }
> > @@ -321,7 +325,7 @@ static void bpf_mem_refill(struct irq_work *work)
> >               /* irq_work runs on this cpu and kmalloc will allocate
> >                * from the current numa node which is what we want here.
> >                */
> > -             alloc_bulk(c, c->batch, NUMA_NO_NODE);
> > +             alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
> >       else if (cnt > c->high_watermark)
> >               free_bulk(c);
> >  }
> > @@ -367,7 +371,7 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
> >        * prog won't be doing more than 4 map_update_elem from
> >        * irq disabled region
> >        */
> > -     alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu));
> > +     alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false);
> >  }
> >
> >  /* When size != 0 bpf_mem_cache for each cpu.
>
Hou Tao July 26, 2023, 11:38 a.m. UTC | #3
Hi,

On 7/21/2023 10:31 AM, YiFei Zhu wrote:
> On Thu, Jul 20, 2023 at 6:45 PM Hou Tao <houtao@huaweicloud.com> wrote:
>> On 7/21/2023 4:44 AM, YiFei Zhu wrote:
>>> Sometimes during prefill all percpu chunks are full and the atomic
>>> __alloc_percpu_gfp would not allocate new chunks. This will cause
>>> -ENOMEM immediately upon the next unit_alloc.
>>>
>>> The prefill phase does not actually run in atomic context, so we can
>>> use this fact to allocate non-atomically with GFP_KERNEL instead
>>> of GFP_NOWAIT. This avoids the immediate -ENOMEM. Unfortunately
>>> unit_alloc runs in atomic context, even from map item allocation in
>>> syscalls, due to rcu_read_lock, so we can't do non-atomic
>>> workarounds in unit_alloc.
>>>
>>> Fixes: 4ab67149f3c6 ("bpf: Add percpu allocation support to bpf_mem_alloc.")
>>> Signed-off-by: YiFei Zhu <zhuyifei@google.com>
>> Make sense to me, so
>>
>> Acked-by: Hou Tao <houtao1@huawei.com>
>>
>> But I don't know whether or not it is suitable for bpf tree.
> I don't mind either way :) If changing to bpf-next requires a resend I
> can do that too.

Please resend and rebase the patch against the bpf-next tree.
YiFei Zhu July 26, 2023, 6:44 p.m. UTC | #4
On Wed, Jul 26, 2023 at 4:38 AM Hou Tao <houtao@huaweicloud.com> wrote:
>
> Hi,
>
> On 7/21/2023 10:31 AM, YiFei Zhu wrote:
> > On Thu, Jul 20, 2023 at 6:45 PM Hou Tao <houtao@huaweicloud.com> wrote:
> >> On 7/21/2023 4:44 AM, YiFei Zhu wrote:
> >>> Sometimes during prefill all percpu chunks are full and the atomic
> >>> __alloc_percpu_gfp would not allocate new chunks. This will cause
> >>> -ENOMEM immediately upon the next unit_alloc.
> >>>
> >>> The prefill phase does not actually run in atomic context, so we can
> >>> use this fact to allocate non-atomically with GFP_KERNEL instead
> >>> of GFP_NOWAIT. This avoids the immediate -ENOMEM. Unfortunately
> >>> unit_alloc runs in atomic context, even from map item allocation in
> >>> syscalls, due to rcu_read_lock, so we can't do non-atomic
> >>> workarounds in unit_alloc.
> >>>
> >>> Fixes: 4ab67149f3c6 ("bpf: Add percpu allocation support to bpf_mem_alloc.")
> >>> Signed-off-by: YiFei Zhu <zhuyifei@google.com>
> >> Make sense to me, so
> >>
> >> Acked-by: Hou Tao <houtao1@huawei.com>
> >>
> >> But I don't know whether or not it is suitable for bpf tree.
> > I don't mind either way :) If changing to bpf-next requires a resend I
> > can do that too.
>
> Please resend and rebase the patch against the bpf-next tree.
>

Will do. Should I drop the Fixes tag then?

YiFei Zhu
Hou Tao July 27, 2023, 1:21 a.m. UTC | #5
Hi,

On 7/27/2023 2:44 AM, YiFei Zhu wrote:
> On Wed, Jul 26, 2023 at 4:38 AM Hou Tao <houtao@huaweicloud.com> wrote:
>> Hi,
>>
>> On 7/21/2023 10:31 AM, YiFei Zhu wrote:
>>> On Thu, Jul 20, 2023 at 6:45 PM Hou Tao <houtao@huaweicloud.com> wrote:
>>>> On 7/21/2023 4:44 AM, YiFei Zhu wrote:
>>>>> Sometimes during prefill all percpu chunks are full and the atomic
>>>>> __alloc_percpu_gfp would not allocate new chunks. This will cause
>>>>> -ENOMEM immediately upon the next unit_alloc.
>>>>>
>>>>> The prefill phase does not actually run in atomic context, so we can
>>>>> use this fact to allocate non-atomically with GFP_KERNEL instead
>>>>> of GFP_NOWAIT. This avoids the immediate -ENOMEM. Unfortunately
>>>>> unit_alloc runs in atomic context, even from map item allocation in
>>>>> syscalls, due to rcu_read_lock, so we can't do non-atomic
>>>>> workarounds in unit_alloc.
>>>>>
>>>>> Fixes: 4ab67149f3c6 ("bpf: Add percpu allocation support to bpf_mem_alloc.")
>>>>> Signed-off-by: YiFei Zhu <zhuyifei@google.com>
>>>> Make sense to me, so
>>>>
>>>> Acked-by: Hou Tao <houtao1@huawei.com>
>>>>
>>>> But I don't know whether or not it is suitable for bpf tree.
>>> I don't mind either way :) If changing to bpf-next requires a resend I
>>> can do that too.
>> Please resend and rebase the patch against the bpf-next tree.
>>
> Will do. Should I drop the Fixes tag then?

Before the introduction of the BPF memory allocator, the allocation flag
for per-cpu memory allocation in the hash map was GFP_NOWAIT. The BPF
memory allocator doesn't change that, so I think we could drop the Fixes tag.
>
> YiFei Zhu
>

Patch

diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c
index 0668bcd7c926..016249672b43 100644
--- a/kernel/bpf/memalloc.c
+++ b/kernel/bpf/memalloc.c
@@ -154,13 +154,17 @@  static struct mem_cgroup *get_memcg(const struct bpf_mem_cache *c)
 }
 
 /* Mostly runs from irq_work except __init phase. */
-static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
+static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node, bool atomic)
 {
 	struct mem_cgroup *memcg = NULL, *old_memcg;
 	unsigned long flags;
+	gfp_t gfp;
 	void *obj;
 	int i;
 
+	gfp = __GFP_NOWARN | __GFP_ACCOUNT;
+	gfp |= atomic ? GFP_NOWAIT : GFP_KERNEL;
+
 	memcg = get_memcg(c);
 	old_memcg = set_active_memcg(memcg);
 	for (i = 0; i < cnt; i++) {
@@ -183,7 +187,7 @@  static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
 			 * will allocate from the current numa node which is what we
 			 * want here.
 			 */
-			obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT);
+			obj = __alloc(c, node, gfp);
 			if (!obj)
 				break;
 		}
@@ -321,7 +325,7 @@  static void bpf_mem_refill(struct irq_work *work)
 		/* irq_work runs on this cpu and kmalloc will allocate
 		 * from the current numa node which is what we want here.
 		 */
-		alloc_bulk(c, c->batch, NUMA_NO_NODE);
+		alloc_bulk(c, c->batch, NUMA_NO_NODE, true);
 	else if (cnt > c->high_watermark)
 		free_bulk(c);
 }
@@ -367,7 +371,7 @@  static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
 	 * prog won't be doing more than 4 map_update_elem from
 	 * irq disabled region
 	 */
-	alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu));
+	alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false);
 }
 
 /* When size != 0 bpf_mem_cache for each cpu.
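
To see how the original failure can surface from userspace, here is a minimal
sketch using libbpf (not part of this series; the percpu hash map with
BPF_F_NO_PREALLOC is an assumed way to reach the bpf_mem_alloc prefill, and
whether -ENOMEM is actually observed depends on percpu memory pressure at map
creation time):

/* Hypothetical reproducer sketch -- requires CAP_BPF/root. */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

int main(void)
{
	LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_NO_PREALLOC);
	int nr_cpus = libbpf_num_possible_cpus();
	__u32 key = 0;
	__u64 *vals;
	int map_fd, err;

	if (nr_cpus <= 0)
		return 1;

	/* Creating the map triggers the bpf_mem_alloc prefill for its
	 * element caches.
	 */
	map_fd = bpf_map_create(BPF_MAP_TYPE_PERCPU_HASH, "percpu_demo",
				sizeof(key), sizeof(__u64), 128, &opts);
	if (map_fd < 0) {
		perror("bpf_map_create");
		return 1;
	}

	/* Percpu maps take one value slot per possible CPU. */
	vals = calloc(nr_cpus, sizeof(__u64));
	if (!vals)
		return 1;

	/* If the atomic percpu prefill failed, this first update may return
	 * -ENOMEM even though memory could have been reclaimed.
	 */
	err = bpf_map_update_elem(map_fd, &key, vals, BPF_ANY);
	if (err)
		fprintf(stderr, "update failed: %s\n", strerror(errno));

	free(vals);
	return err ? 1 : 0;
}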