diff mbox series

[RFC,bpf-next,10/15] bpf: Use bpf_map_pages_alloc in ringbuf

Message ID 20220729152316.58205-11-laoar.shao@gmail.com (mailing list archive)
State New
Headers show
Series bpf: Introduce selectable memcg for bpf map | expand

Commit Message

Yafang Shao July 29, 2022, 3:23 p.m. UTC
Introduce new helper bpf_map_pages_alloc() for this memory allocation.

Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
---
 include/linux/bpf.h  |  4 ++++
 kernel/bpf/ringbuf.c | 27 +++++++++------------------
 kernel/bpf/syscall.c | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 18 deletions(-)

Comments

Andrii Nakryiko Aug. 1, 2022, 11:16 p.m. UTC | #1
On Fri, Jul 29, 2022 at 8:23 AM Yafang Shao <laoar.shao@gmail.com> wrote:
>
> Introduce new helper bpf_map_pages_alloc() for this memory allocation.
>
> Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> ---
>  include/linux/bpf.h  |  4 ++++
>  kernel/bpf/ringbuf.c | 27 +++++++++------------------
>  kernel/bpf/syscall.c | 41 +++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 54 insertions(+), 18 deletions(-)
>

[...]

>         /* Each data page is mapped twice to allow "virtual"
>          * continuous read of samples wrapping around the end of ring
> @@ -95,16 +95,10 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(struct bpf_map *map,
>         if (!pages)
>                 return NULL;
>
> -       for (i = 0; i < nr_pages; i++) {
> -               page = alloc_pages_node(numa_node, flags, 0);
> -               if (!page) {
> -                       nr_pages = i;
> -                       goto err_free_pages;
> -               }
> -               pages[i] = page;
> -               if (i >= nr_meta_pages)
> -                       pages[nr_data_pages + i] = page;
> -       }
> +       ptr = bpf_map_pages_alloc(map, pages, nr_meta_pages, nr_data_pages,
> +                                 numa_node, flags, 0);
> +       if (!ptr)

bpf_map_pages_alloc() has some weird and confusing interface. It fills
out pages (second argument) and also returns pages as void *. Why not
just return int error (0 or -ENOMEM)? You are discarding this ptr
anyways.


But also thinking some more, bpf_map_pages_alloc() is very ringbuf
specific (which other map will have exactly the same meaning for
nr_meta_pages and nr_data_pages, where we also allocate 2 *
nr_data_pages, etc).

I don't think it makes sense to expose it as a generic internal API.
Why not keep all that inside kernel/bpf/ringbuf.c instead?

> +               goto err_free_pages;
>
>         rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
>                   VM_MAP | VM_USERMAP, PAGE_KERNEL);

[...]
Yafang Shao Aug. 2, 2022, 1:31 p.m. UTC | #2
On Tue, Aug 2, 2022 at 7:17 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Fri, Jul 29, 2022 at 8:23 AM Yafang Shao <laoar.shao@gmail.com> wrote:
> >
> > Introduce new helper bpf_map_pages_alloc() for this memory allocation.
> >
> > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > ---
> >  include/linux/bpf.h  |  4 ++++
> >  kernel/bpf/ringbuf.c | 27 +++++++++------------------
> >  kernel/bpf/syscall.c | 41 +++++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 54 insertions(+), 18 deletions(-)
> >
>
> [...]
>
> >         /* Each data page is mapped twice to allow "virtual"
> >          * continuous read of samples wrapping around the end of ring
> > @@ -95,16 +95,10 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(struct bpf_map *map,
> >         if (!pages)
> >                 return NULL;
> >
> > -       for (i = 0; i < nr_pages; i++) {
> > -               page = alloc_pages_node(numa_node, flags, 0);
> > -               if (!page) {
> > -                       nr_pages = i;
> > -                       goto err_free_pages;
> > -               }
> > -               pages[i] = page;
> > -               if (i >= nr_meta_pages)
> > -                       pages[nr_data_pages + i] = page;
> > -       }
> > +       ptr = bpf_map_pages_alloc(map, pages, nr_meta_pages, nr_data_pages,
> > +                                 numa_node, flags, 0);
> > +       if (!ptr)
>
> bpf_map_pages_alloc() has some weird and confusing interface. It fills
> out pages (second argument) and also returns pages as void *. Why not
> just return int error (0 or -ENOMEM)? You are discarding this ptr
> anyways.
>

I will change it.

>
> But also thinking some more, bpf_map_pages_alloc() is very ringbuf
> specific (which other map will have exactly the same meaning for
> nr_meta_pages and nr_data_pages, where we also allocate 2 *
> nr_data_pages, etc).
>
> I don't think it makes sense to expose it as a generic internal API.
> Why not keep all that inside kernel/bpf/ringbuf.c instead?
>

Right, it is used in ringbuf.c only currently. I will keep it inside ringbuf.c.
Andrii Nakryiko Aug. 2, 2022, 6 p.m. UTC | #3
On Tue, Aug 2, 2022 at 6:31 AM Yafang Shao <laoar.shao@gmail.com> wrote:
>
> On Tue, Aug 2, 2022 at 7:17 AM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Fri, Jul 29, 2022 at 8:23 AM Yafang Shao <laoar.shao@gmail.com> wrote:
> > >
> > > Introduce new helper bpf_map_pages_alloc() for this memory allocation.
> > >
> > > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > > ---
> > >  include/linux/bpf.h  |  4 ++++
> > >  kernel/bpf/ringbuf.c | 27 +++++++++------------------
> > >  kernel/bpf/syscall.c | 41 +++++++++++++++++++++++++++++++++++++++++
> > >  3 files changed, 54 insertions(+), 18 deletions(-)
> > >
> >
> > [...]
> >
> > >         /* Each data page is mapped twice to allow "virtual"
> > >          * continuous read of samples wrapping around the end of ring
> > > @@ -95,16 +95,10 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(struct bpf_map *map,
> > >         if (!pages)
> > >                 return NULL;
> > >
> > > -       for (i = 0; i < nr_pages; i++) {
> > > -               page = alloc_pages_node(numa_node, flags, 0);
> > > -               if (!page) {
> > > -                       nr_pages = i;
> > > -                       goto err_free_pages;
> > > -               }
> > > -               pages[i] = page;
> > > -               if (i >= nr_meta_pages)
> > > -                       pages[nr_data_pages + i] = page;
> > > -       }
> > > +       ptr = bpf_map_pages_alloc(map, pages, nr_meta_pages, nr_data_pages,
> > > +                                 numa_node, flags, 0);
> > > +       if (!ptr)
> >
> > bpf_map_pages_alloc() has some weird and confusing interface. It fills
> > out pages (second argument) and also returns pages as void *. Why not
> > just return int error (0 or -ENOMEM)? You are discarding this ptr
> > anyways.
> >
>
> I will change it.
>
> >
> > But also thinking some more, bpf_map_pages_alloc() is very ringbuf
> > specific (which other map will have exactly the same meaning for
> > nr_meta_pages and nr_data_pages, where we also allocate 2 *
> > nr_data_pages, etc).
> >
> > I don't think it makes sense to expose it as a generic internal API.
> > Why not keep all that inside kernel/bpf/ringbuf.c instead?
> >
>
> Right, it is used in ringbuf.c only currently. I will keep it inside ringbuf.c.
>

In such case you might as well put pages = bpf_map_area_alloc(); part
into this function and return struct page ** as a result, so that
everything related to pages is handled as a single unit. And then
bpf_map_pages_free() will free not just each individual page, but also
struct page*[] array.

Also please call it something ringbuf specific, e.g.,
bpf_ringbuf_pages_{alloc,free}()?

>
> --
> Regards
> Yafang
Yafang Shao Aug. 3, 2022, 1:27 p.m. UTC | #4
On Wed, Aug 3, 2022 at 2:00 AM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Tue, Aug 2, 2022 at 6:31 AM Yafang Shao <laoar.shao@gmail.com> wrote:
> >
> > On Tue, Aug 2, 2022 at 7:17 AM Andrii Nakryiko
> > <andrii.nakryiko@gmail.com> wrote:
> > >
> > > On Fri, Jul 29, 2022 at 8:23 AM Yafang Shao <laoar.shao@gmail.com> wrote:
> > > >
> > > > Introduce new helper bpf_map_pages_alloc() for this memory allocation.
> > > >
> > > > Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
> > > > ---
> > > >  include/linux/bpf.h  |  4 ++++
> > > >  kernel/bpf/ringbuf.c | 27 +++++++++------------------
> > > >  kernel/bpf/syscall.c | 41 +++++++++++++++++++++++++++++++++++++++++
> > > >  3 files changed, 54 insertions(+), 18 deletions(-)
> > > >
> > >
> > > [...]
> > >
> > > >         /* Each data page is mapped twice to allow "virtual"
> > > >          * continuous read of samples wrapping around the end of ring
> > > > @@ -95,16 +95,10 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(struct bpf_map *map,
> > > >         if (!pages)
> > > >                 return NULL;
> > > >
> > > > -       for (i = 0; i < nr_pages; i++) {
> > > > -               page = alloc_pages_node(numa_node, flags, 0);
> > > > -               if (!page) {
> > > > -                       nr_pages = i;
> > > > -                       goto err_free_pages;
> > > > -               }
> > > > -               pages[i] = page;
> > > > -               if (i >= nr_meta_pages)
> > > > -                       pages[nr_data_pages + i] = page;
> > > > -       }
> > > > +       ptr = bpf_map_pages_alloc(map, pages, nr_meta_pages, nr_data_pages,
> > > > +                                 numa_node, flags, 0);
> > > > +       if (!ptr)
> > >
> > > bpf_map_pages_alloc() has some weird and confusing interface. It fills
> > > out pages (second argument) and also returns pages as void *. Why not
> > > just return int error (0 or -ENOMEM)? You are discarding this ptr
> > > anyways.
> > >
> >
> > I will change it.
> >
> > >
> > > But also thinking some more, bpf_map_pages_alloc() is very ringbuf
> > > specific (which other map will have exactly the same meaning for
> > > nr_meta_pages and nr_data_pages, where we also allocate 2 *
> > > nr_data_pages, etc).
> > >
> > > I don't think it makes sense to expose it as a generic internal API.
> > > Why not keep all that inside kernel/bpf/ringbuf.c instead?
> > >
> >
> > Right, it is used in ringbuf.c only currently. I will keep it inside ringbuf.c.
> >
>
> In such case you might as well put pages = bpf_map_area_alloc(); part
> into this function and return struct page ** as a result, so that
> everything related to pages is handled as a single unit. And then
> bpf_map_pages_free() will free not just each individual page, but also
> struct page*[] array.
>

Good suggestion. I will do it.

> Also please call it something ringbuf specific, e.g.,
> bpf_ringbuf_pages_{alloc,free}()?
>

It Makes sense to me.
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 711d9b1829d4..4af72d2b6d73 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1638,8 +1638,12 @@  void *bpf_map_container_alloc(u64 size, int numa_node);
 void *bpf_map_container_mmapable_alloc(u64 size, int numa_node,
 				       u32 align, u32 offset);
 void *bpf_map_area_alloc(struct bpf_map *map, u64 size, int numa_node);
+void *bpf_map_pages_alloc(struct bpf_map *map, struct page **pages,
+			  int nr_meta_pages, int nr_data_pages, int nid,
+			  gfp_t flags, unsigned int order);
 void bpf_map_area_free(void *base);
 void bpf_map_container_free(void *base);
+void bpf_map_pages_free(struct page **pages, int nr_pages);
 bool bpf_map_write_active(const struct bpf_map *map);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
 int  generic_map_lookup_batch(struct bpf_map *map,
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index 7c875d4d5b2f..25973cab251d 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -63,15 +63,15 @@  static struct bpf_ringbuf *bpf_ringbuf_area_alloc(struct bpf_map *map,
 						  size_t data_sz,
 						  int numa_node)
 {
-	const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
+	const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL |
 			    __GFP_NOWARN | __GFP_ZERO;
 	int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES;
 	int nr_data_pages = data_sz >> PAGE_SHIFT;
 	int nr_pages = nr_meta_pages + nr_data_pages;
-	struct page **pages, *page;
 	struct bpf_ringbuf *rb;
+	struct page **pages;
 	size_t array_size;
-	int i;
+	void *ptr;
 
 	/* Each data page is mapped twice to allow "virtual"
 	 * continuous read of samples wrapping around the end of ring
@@ -95,16 +95,10 @@  static struct bpf_ringbuf *bpf_ringbuf_area_alloc(struct bpf_map *map,
 	if (!pages)
 		return NULL;
 
-	for (i = 0; i < nr_pages; i++) {
-		page = alloc_pages_node(numa_node, flags, 0);
-		if (!page) {
-			nr_pages = i;
-			goto err_free_pages;
-		}
-		pages[i] = page;
-		if (i >= nr_meta_pages)
-			pages[nr_data_pages + i] = page;
-	}
+	ptr = bpf_map_pages_alloc(map, pages, nr_meta_pages, nr_data_pages,
+				  numa_node, flags, 0);
+	if (!ptr)
+		goto err_free_pages;
 
 	rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
 		  VM_MAP | VM_USERMAP, PAGE_KERNEL);
@@ -116,8 +110,6 @@  static struct bpf_ringbuf *bpf_ringbuf_area_alloc(struct bpf_map *map,
 	}
 
 err_free_pages:
-	for (i = 0; i < nr_pages; i++)
-		__free_page(pages[i]);
 	bpf_map_area_free(pages);
 	return NULL;
 }
@@ -189,11 +181,10 @@  static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
 	 * to unmap rb itself with vunmap() below
 	 */
 	struct page **pages = rb->pages;
-	int i, nr_pages = rb->nr_pages;
+	int nr_pages = rb->nr_pages;
 
 	vunmap(rb);
-	for (i = 0; i < nr_pages; i++)
-		__free_page(pages[i]);
+	bpf_map_pages_free(pages, nr_pages);
 	bpf_map_area_free(pages);
 }
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4f893d2ac4fd..5c13782839f3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -559,6 +559,47 @@  void bpf_map_container_free(void *container)
 	kvfree(container);
 }
 
+void *bpf_map_pages_alloc(struct bpf_map *map, struct page **pages,
+			  int nr_meta_pages, int nr_data_pages, int nid,
+			  gfp_t flags, unsigned int order)
+{
+	int nr_pages = nr_meta_pages + nr_data_pages;
+	struct mem_cgroup *memcg, *old_memcg;
+	struct page *page;
+	int i;
+
+	memcg = bpf_map_get_memcg(map);
+	old_memcg = set_active_memcg(memcg);
+	for (i = 0; i < nr_pages; i++) {
+		page = alloc_pages_node(nid, flags | __GFP_ACCOUNT, order);
+		if (!page) {
+			nr_pages = i;
+			set_active_memcg(old_memcg);
+			goto err_free_pages;
+		}
+		pages[i] = page;
+		if (i >= nr_meta_pages)
+			pages[nr_data_pages + i] = page;
+	}
+	set_active_memcg(old_memcg);
+
+	return pages;
+
+err_free_pages:
+	for (i = 0; i < nr_pages; i++)
+		__free_page(pages[i]);
+
+	return NULL;
+}
+
+void bpf_map_pages_free(struct page **pages, int nr_pages)
+{
+	int i;
+
+	for (i = 0; i < nr_pages; i++)
+		__free_page(pages[i]);
+}
+
 static int bpf_map_kptr_off_cmp(const void *a, const void *b)
 {
 	const struct bpf_map_value_off_desc *off_desc1 = a, *off_desc2 = b;