diff mbox series

[v2,bpf-next,5/9] libbpf: use stable map placeholder FDs

Message ID 20240102190055.1602698-6-andrii@kernel.org (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series Libbpf-side __arg_ctx fallback support | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next
netdev/ynl success SINGLE THREAD; Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 8 this patch: 8
netdev/cc_maintainers warning 7 maintainers not CCed: sdf@google.com kpsingh@kernel.org yonghong.song@linux.dev haoluo@google.com martin.lau@linux.dev song@kernel.org john.fastabend@gmail.com
netdev/build_clang success Errors and warnings before: 8 this patch: 8
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 8 this patch: 8
netdev/checkpatch warning WARNING: line length of 91 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-17 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-18 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-13 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-15 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18 and -O2 optimization
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17 and -O2 optimization

Commit Message

Andrii Nakryiko Jan. 2, 2024, 7 p.m. UTC
Move map creation to later during BPF object loading by pre-creating
stable placeholder FDs (initially pointing to /dev/null). Use dup2()
syscall to then atomically make those placeholder FDs point to real
kernel BPF map objects.

This change allows to delay BPF map creation to after all the BPF
program relocations. That, in turn, allows to delay BTF finalization and
loading into kernel to after all the relocations as well. We'll take
advantage of the latter in subsequent patches to allow libbpf to adjust
BTF in a way that helps with BPF global function usage.

Clean up a few places where we close map->fd, which now shouldn't
happen, because map->fd should be a valid FD regardless of whether map
was created or not. Surprisingly and nicely it simplifies a bunch of
error handling code. If this change doesn't backfire, I'm tempted to
pre-create such stable FDs for other entities (progs, maybe even BTF).
We previously did some manipulations to make gen_loader work with fake
map FDs, with stable map FDs this hack is not necessary for maps (we
still have it for BTF, but I left it as is for now).

Acked-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
---
 tools/lib/bpf/libbpf.c          | 91 +++++++++++++++++++--------------
 tools/lib/bpf/libbpf_internal.h | 24 +++++++++
 2 files changed, 77 insertions(+), 38 deletions(-)

Comments

Eduard Zingerman Jan. 3, 2024, 8:57 p.m. UTC | #1
Tbh, it looks like calls to zclose(map->fd) were unnecessary
regardless of this patch, as all maps are closed at the end of
bpf_object_load() in case of an error.

[...]

> diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> index f29cfb344f80..e0085aef17d7 100644
> --- a/tools/lib/bpf/libbpf.c
> +++ b/tools/lib/bpf/libbpf.c

[...]

> @@ -5275,13 +5289,11 @@ static int bpf_object_create_map(struct bpf_object *obj, struct bpf_map *map, bo
>  		create_attr.btf_value_type_id = 0;
>  		map->btf_key_type_id = 0;
>  		map->btf_value_type_id = 0;
> -		map->fd = bpf_map_create(def->type, map_name,
> -					 def->key_size, def->value_size,
> -					 def->max_entries, &create_attr);
> +		map_fd = bpf_map_create(def->type, map_name,
> +					def->key_size, def->value_size,
> +					def->max_entries, &create_attr);
>  	}
>  
> -	err = map->fd < 0 ? -errno : 0;
> -
>  	if (bpf_map_type_is_map_in_map(def->type) && map->inner_map) {
>  		if (obj->gen_loader)
>  			map->inner_map->fd = -1;
> @@ -5289,7 +5301,19 @@ static int bpf_object_create_map(struct bpf_object *obj, struct bpf_map *map, bo
>  		zfree(&map->inner_map);
>  	}
>  
> -	return err;
> +	if (map_fd < 0)
> +		return -errno;

Nit: this check is now placed after call to bpf_map_destroy(),
     which might call munmap(), which might overwrite "errno",
     set by some of the previous calls to bpf_map_create().

[...]

> diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
> index b5d334754e5d..662a3df1e29f 100644
> --- a/tools/lib/bpf/libbpf_internal.h
> +++ b/tools/lib/bpf/libbpf_internal.h
> @@ -555,6 +555,30 @@ static inline int ensure_good_fd(int fd)
>  	return fd;
>  }
>  
> +static inline int create_placeholder_fd(void)
> +{
> +	int fd;
> +
> +	fd = ensure_good_fd(open("/dev/null", O_WRONLY | O_CLOEXEC));

Stupid question: is it ok to assume that /dev is always mounted?
Googling says that kernel chooses if to mount it automatically
depending on the value of CONFIG_DEVTMPFS_MOUNT option.
Another option might be memfd_create().

> +	if (fd < 0)
> +		return -errno;
> +	return fd;
> +}

[...]
Andrii Nakryiko Jan. 3, 2024, 10:46 p.m. UTC | #2
On Wed, Jan 3, 2024 at 12:57 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> Tbh, it looks like calls to zclose(map->fd) were unnecessary
> regardless of this patch, as all maps are closed at the end of
> bpf_object_load() in case of an error.
>

yep, agreed

> [...]
>
> > diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
> > index f29cfb344f80..e0085aef17d7 100644
> > --- a/tools/lib/bpf/libbpf.c
> > +++ b/tools/lib/bpf/libbpf.c
>
> [...]
>
> > @@ -5275,13 +5289,11 @@ static int bpf_object_create_map(struct bpf_object *obj, struct bpf_map *map, bo
> >               create_attr.btf_value_type_id = 0;
> >               map->btf_key_type_id = 0;
> >               map->btf_value_type_id = 0;
> > -             map->fd = bpf_map_create(def->type, map_name,
> > -                                      def->key_size, def->value_size,
> > -                                      def->max_entries, &create_attr);
> > +             map_fd = bpf_map_create(def->type, map_name,
> > +                                     def->key_size, def->value_size,
> > +                                     def->max_entries, &create_attr);
> >       }
> >
> > -     err = map->fd < 0 ? -errno : 0;
> > -
> >       if (bpf_map_type_is_map_in_map(def->type) && map->inner_map) {
> >               if (obj->gen_loader)
> >                       map->inner_map->fd = -1;
> > @@ -5289,7 +5301,19 @@ static int bpf_object_create_map(struct bpf_object *obj, struct bpf_map *map, bo
> >               zfree(&map->inner_map);
> >       }
> >
> > -     return err;
> > +     if (map_fd < 0)
> > +             return -errno;
>
> Nit: this check is now placed after call to bpf_map_destroy(),
>      which might call munmap(), which might overwrite "errno",
>      set by some of the previous calls to bpf_map_create().
>

this should be `return map_fd;`, no need for errno, good catch, fixed

> [...]
>
> > diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
> > index b5d334754e5d..662a3df1e29f 100644
> > --- a/tools/lib/bpf/libbpf_internal.h
> > +++ b/tools/lib/bpf/libbpf_internal.h
> > @@ -555,6 +555,30 @@ static inline int ensure_good_fd(int fd)
> >       return fd;
> >  }
> >
> > +static inline int create_placeholder_fd(void)
> > +{
> > +     int fd;
> > +
> > +     fd = ensure_good_fd(open("/dev/null", O_WRONLY | O_CLOEXEC));
>
> Stupid question: is it ok to assume that /dev is always mounted?
> Googling says that kernel chooses if to mount it automatically
> depending on the value of CONFIG_DEVTMPFS_MOUNT option.
> Another option might be memfd_create().

Yeah, good point, I actually don't know how reliable is /dev/null.
memfd_create() is not a bad idea, Linux 3.17+, should be fine. I'll
switch.

>
> > +     if (fd < 0)
> > +             return -errno;
> > +     return fd;
> > +}
>
> [...]
>
>
diff mbox series

Patch

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index f29cfb344f80..e0085aef17d7 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -1510,7 +1510,21 @@  static struct bpf_map *bpf_object_add_map(struct bpf_object *obj)
 
 	map = &obj->maps[obj->nr_maps++];
 	map->obj = obj;
-	map->fd = -1;
+	/* Preallocate map FD without actually creating BPF map just yet.
+	 * These map FD "placeholders" will be reused later without changing
+	 * FD value when map is actually created in the kernel.
+	 *
+	 * This is useful to be able to perform BPF program relocations
+	 * without having to create BPF maps before that step. This allows us
+	 * to finalize and load BTF very late in BPF object's loading phase,
+	 * right before BPF maps have to be created and BPF programs have to
+	 * be loaded. By having these map FD placeholders we can perform all
+	 * the sanitizations, relocations, and any other adjustments before we
+	 * start creating actual BPF kernel objects (BTF, maps, progs).
+	 */
+	map->fd = create_placeholder_fd();
+	if (map->fd < 0)
+		return ERR_PTR(map->fd);
 	map->inner_map_fd = -1;
 	map->autocreate = true;
 
@@ -2600,7 +2614,9 @@  static int bpf_object_init_user_btf_map(struct bpf_object *obj,
 		map->inner_map = calloc(1, sizeof(*map->inner_map));
 		if (!map->inner_map)
 			return -ENOMEM;
-		map->inner_map->fd = -1;
+		map->inner_map->fd = create_placeholder_fd();
+		if (map->inner_map->fd < 0)
+			return map->inner_map->fd;
 		map->inner_map->sec_idx = sec_idx;
 		map->inner_map->name = malloc(strlen(map_name) + sizeof(".inner") + 1);
 		if (!map->inner_map->name)
@@ -4536,14 +4552,12 @@  int bpf_map__reuse_fd(struct bpf_map *map, int fd)
 		goto err_free_new_name;
 	}
 
-	err = zclose(map->fd);
-	if (err) {
-		err = -errno;
-		goto err_close_new_fd;
-	}
+	err = reuse_fd(map->fd, new_fd);
+	if (err)
+		goto err_free_new_name;
+
 	free(map->name);
 
-	map->fd = new_fd;
 	map->name = new_name;
 	map->def.type = info.type;
 	map->def.key_size = info.key_size;
@@ -4557,8 +4571,6 @@  int bpf_map__reuse_fd(struct bpf_map *map, int fd)
 
 	return 0;
 
-err_close_new_fd:
-	close(new_fd);
 err_free_new_name:
 	free(new_name);
 	return libbpf_err(err);
@@ -5194,7 +5206,7 @@  static int bpf_object_create_map(struct bpf_object *obj, struct bpf_map *map, bo
 	LIBBPF_OPTS(bpf_map_create_opts, create_attr);
 	struct bpf_map_def *def = &map->def;
 	const char *map_name = NULL;
-	int err = 0;
+	int err = 0, map_fd;
 
 	if (kernel_supports(obj, FEAT_PROG_NAME))
 		map_name = map->name;
@@ -5253,17 +5265,19 @@  static int bpf_object_create_map(struct bpf_object *obj, struct bpf_map *map, bo
 		bpf_gen__map_create(obj->gen_loader, def->type, map_name,
 				    def->key_size, def->value_size, def->max_entries,
 				    &create_attr, is_inner ? -1 : map - obj->maps);
-		/* Pretend to have valid FD to pass various fd >= 0 checks.
-		 * This fd == 0 will not be used with any syscall and will be reset to -1 eventually.
+		/* We keep pretenting we have valid FD to pass various fd >= 0
+		 * checks by just keeping original placeholder FDs in place.
+		 * See bpf_object_prepare_maps() comments.
+		 * This placeholder fd will not be used with any syscall and
+		 * will be reset to -1 eventually.
 		 */
-		map->fd = 0;
+		map_fd = map->fd;
 	} else {
-		map->fd = bpf_map_create(def->type, map_name,
-					 def->key_size, def->value_size,
-					 def->max_entries, &create_attr);
+		map_fd = bpf_map_create(def->type, map_name,
+					def->key_size, def->value_size,
+					def->max_entries, &create_attr);
 	}
-	if (map->fd < 0 && (create_attr.btf_key_type_id ||
-			    create_attr.btf_value_type_id)) {
+	if (map_fd < 0 && (create_attr.btf_key_type_id || create_attr.btf_value_type_id)) {
 		char *cp, errmsg[STRERR_BUFSIZE];
 
 		err = -errno;
@@ -5275,13 +5289,11 @@  static int bpf_object_create_map(struct bpf_object *obj, struct bpf_map *map, bo
 		create_attr.btf_value_type_id = 0;
 		map->btf_key_type_id = 0;
 		map->btf_value_type_id = 0;
-		map->fd = bpf_map_create(def->type, map_name,
-					 def->key_size, def->value_size,
-					 def->max_entries, &create_attr);
+		map_fd = bpf_map_create(def->type, map_name,
+					def->key_size, def->value_size,
+					def->max_entries, &create_attr);
 	}
 
-	err = map->fd < 0 ? -errno : 0;
-
 	if (bpf_map_type_is_map_in_map(def->type) && map->inner_map) {
 		if (obj->gen_loader)
 			map->inner_map->fd = -1;
@@ -5289,7 +5301,19 @@  static int bpf_object_create_map(struct bpf_object *obj, struct bpf_map *map, bo
 		zfree(&map->inner_map);
 	}
 
-	return err;
+	if (map_fd < 0)
+		return -errno;
+
+	/* obj->gen_loader case, prevent reuse_fd() from closing map_fd */
+	if (map->fd == map_fd)
+		return 0;
+
+	/* Keep placeholder FD value but now point it to the BPF map object.
+	 * This way everything that relied on this map's FD (e.g., relocated
+	 * ldimm64 instructions) will stay valid and won't need adjustments.
+	 * map->fd stays valid but now point to what map_fd points to.
+	 */
+	return reuse_fd(map->fd, map_fd);
 }
 
 static int init_map_in_map_slots(struct bpf_object *obj, struct bpf_map *map)
@@ -5373,10 +5397,8 @@  static int bpf_object_init_prog_arrays(struct bpf_object *obj)
 			continue;
 
 		err = init_prog_array_slots(obj, map);
-		if (err < 0) {
-			zclose(map->fd);
+		if (err < 0)
 			return err;
-		}
 	}
 	return 0;
 }
@@ -5466,25 +5488,20 @@  static int bpf_object_create_maps(struct bpf_object *obj)
 
 			if (bpf_map__is_internal(map)) {
 				err = bpf_object_populate_internal_map(obj, map);
-				if (err < 0) {
-					zclose(map->fd);
+				if (err < 0)
 					goto err_out;
-				}
 			}
 
 			if (map->init_slots_sz && map->def.type != BPF_MAP_TYPE_PROG_ARRAY) {
 				err = init_map_in_map_slots(obj, map);
-				if (err < 0) {
-					zclose(map->fd);
+				if (err < 0)
 					goto err_out;
-				}
 			}
 		}
 
 		if (map->pin_path && !map->pinned) {
 			err = bpf_map__pin(map, NULL);
 			if (err) {
-				zclose(map->fd);
 				if (!retried && err == -EEXIST) {
 					retried = true;
 					goto retry;
@@ -8049,8 +8066,8 @@  static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch
 	err = err ? : bpf_object_sanitize_and_load_btf(obj);
 	err = err ? : bpf_object_sanitize_maps(obj);
 	err = err ? : bpf_object_init_kern_struct_ops_maps(obj);
-	err = err ? : bpf_object_create_maps(obj);
 	err = err ? : bpf_object_relocate(obj, obj->btf_custom_path ? : target_btf_path);
+	err = err ? : bpf_object_create_maps(obj);
 	err = err ? : bpf_object_load_progs(obj, extra_log_level);
 	err = err ? : bpf_object_init_prog_arrays(obj);
 	err = err ? : bpf_object_prepare_struct_ops(obj);
@@ -8059,8 +8076,6 @@  static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch
 		/* reset FDs */
 		if (obj->btf)
 			btf__set_fd(obj->btf, -1);
-		for (i = 0; i < obj->nr_maps; i++)
-			obj->maps[i].fd = -1;
 		if (!err)
 			err = bpf_gen__finish(obj->gen_loader, obj->nr_programs, obj->nr_maps);
 	}
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index b5d334754e5d..662a3df1e29f 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -555,6 +555,30 @@  static inline int ensure_good_fd(int fd)
 	return fd;
 }
 
+static inline int create_placeholder_fd(void)
+{
+	int fd;
+
+	fd = ensure_good_fd(open("/dev/null", O_WRONLY | O_CLOEXEC));
+	if (fd < 0)
+		return -errno;
+	return fd;
+}
+
+/* Point *fixed_fd* to the same file that *tmp_fd* points to.
+ * Regardless of success, *tmp_fd* is closed.
+ * Whatever *fixed_fd* pointed to is closed silently.
+ */
+static inline int reuse_fd(int fixed_fd, int tmp_fd)
+{
+	int err;
+
+	err = dup2(tmp_fd, fixed_fd);
+	err = err < 0 ? -errno : 0;
+	close(tmp_fd); /* clean up temporary FD */
+	return err;
+}
+
 /* The following two functions are exposed to bpftool */
 int bpf_core_add_cands(struct bpf_core_cand *local_cand,
 		       size_t local_essent_len,