diff mbox series

[bpf-next,2/2] bpftool: support loading flow dissector

Message ID 20181107193552.77894-3-sdf@google.com (mailing list archive)
State New
Headers show
Series bpftool: support loading flow dissector | expand

Commit Message

Stanislav Fomichev Nov. 7, 2018, 7:35 p.m. UTC
This commit adds support for loading/attaching/detaching flow
dissector program. The structure of the flow dissector program is
assumed to be the same as in the selftests:

* flow_dissector section with the main entry point
* a bunch of tail call progs
* a jmp_table map that is populated with the tail call progs

When `bpftool load` is called with a flow_dissector prog (i.e. when the
first section is flow_dissector of 'type flow_dissector' argument is
passed), we load and pin all the programs and build the jump table.

The last argument of `bpftool attach` is made optional for this use
case.

Example:
bpftool prog load tools/testing/selftests/bpf/bpf_flow.o \
	/sys/fs/bpf/flow type flow_dissector
bpftool prog attach pinned /sys/fs/bpf/flow/flow_dissector/0 flow_dissector

Tested by using the above two lines to load the prog in
the test_flow_dissector.sh selftest.

Signed-off-by: Stanislav Fomichev <sdf@google.com>
---
 .../bpftool/Documentation/bpftool-prog.rst    |  16 ++-
 tools/bpf/bpftool/common.c                    |  32 +++--
 tools/bpf/bpftool/main.h                      |   1 +
 tools/bpf/bpftool/prog.c                      | 135 +++++++++++++++---
 4 files changed, 141 insertions(+), 43 deletions(-)

Comments

Jakub Kicinski Nov. 7, 2018, 8:03 p.m. UTC | #1
On Wed,  7 Nov 2018 11:35:52 -0800, Stanislav Fomichev wrote:
> This commit adds support for loading/attaching/detaching flow
> dissector program. The structure of the flow dissector program is
> assumed to be the same as in the selftests:
> 
> * flow_dissector section with the main entry point
> * a bunch of tail call progs
> * a jmp_table map that is populated with the tail call progs
> 
> When `bpftool load` is called with a flow_dissector prog (i.e. when the
> first section is flow_dissector of 'type flow_dissector' argument is
> passed), we load and pin all the programs and build the jump table.
> 
> The last argument of `bpftool attach` is made optional for this use
> case.
> 
> Example:
> bpftool prog load tools/testing/selftests/bpf/bpf_flow.o \
> 	/sys/fs/bpf/flow type flow_dissector
> bpftool prog attach pinned /sys/fs/bpf/flow/flow_dissector/0 flow_dissector
> 
> Tested by using the above two lines to load the prog in
> the test_flow_dissector.sh selftest.
> 
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
>  .../bpftool/Documentation/bpftool-prog.rst    |  16 ++-
>  tools/bpf/bpftool/common.c                    |  32 +++--
>  tools/bpf/bpftool/main.h                      |   1 +
>  tools/bpf/bpftool/prog.c                      | 135 +++++++++++++++---

Please add the new attach type to bash completions.

>  4 files changed, 141 insertions(+), 43 deletions(-)
> 
> diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> index ac4e904b10fb..3caa9153435b 100644
> --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> @@ -25,8 +25,8 @@ MAP COMMANDS
>  |	**bpftool** **prog dump jited**  *PROG* [{**file** *FILE* | **opcodes**}]
>  |	**bpftool** **prog pin** *PROG* *FILE*
>  |	**bpftool** **prog load** *OBJ* *FILE* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*]
> -|       **bpftool** **prog attach** *PROG* *ATTACH_TYPE* *MAP*
> -|       **bpftool** **prog detach** *PROG* *ATTACH_TYPE* *MAP*
> +|       **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
> +|       **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
>  |	**bpftool** **prog help**
>  |
>  |	*MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
> @@ -39,7 +39,9 @@ MAP COMMANDS
>  |		**cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** |
>  |		**cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6**
>  |	}
> -|       *ATTACH_TYPE* := { **msg_verdict** | **skb_verdict** | **skb_parse** }
> +|       *ATTACH_TYPE* := {
> +|	|	**msg_verdict** | **skb_verdict** | **skb_parse** | **flow_dissector**
> +|	}
>  
>  
>  DESCRIPTION
> @@ -97,13 +99,13 @@ DESCRIPTION
>  		  contain a dot character ('.'), which is reserved for future
>  		  extensions of *bpffs*.
>  
> -        **bpftool prog attach** *PROG* *ATTACH_TYPE* *MAP*
> +        **bpftool prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
>                    Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*)
> -                  to the map *MAP*.
> +                  to the optional map *MAP*.

Perhaps we can do better on help?  Attach BPF program *PROG* (with type
specified by *ATTACH_TYPE*).  Most *ATTACH_TYPEs* require a *MAP*
parameter, with the exception of *flow_dissector* which is attached to
current networking name space.

> -        **bpftool prog detach** *PROG* *ATTACH_TYPE* *MAP*
> +        **bpftool prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
>                    Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*)
> -                  from the map *MAP*.
> +                  from the optional map *MAP*.
>  
>  	**bpftool prog help**
>  		  Print short help message.
> diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
> index 25af85304ebe..963881142dfb 100644
> --- a/tools/bpf/bpftool/common.c
> +++ b/tools/bpf/bpftool/common.c

> @@ -204,10 +194,22 @@ int do_pin_fd(int fd, const char *name)
>  
>  out_free:
>  	free(file);
> -out:
>  	return err;
>  }
>  
> +int do_pin_fd(int fd, const char *name)
> +{
> +	int err = mount_bpffs_for_pin(name);

Please don't initialize the error variable with a non-trivial function
call.

> +	if (err) {
> +		p_err("can't mount bpffs for pin %s: %s",
> +		      name, strerror(errno));

I think mount_bpffs_for_pin() will already print an error.  We can't
print two errors, because it will break JSON output.

> +		return err;
> +	}
> +
> +	return bpf_obj_pin(fd, name);
> +}
> +
>  int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32))
>  {
>  	unsigned int id;

> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
> index 5302ee282409..f3a07ec3a444 100644
> --- a/tools/bpf/bpftool/prog.c
> +++ b/tools/bpf/bpftool/prog.c
> @@ -81,6 +81,7 @@ static const char * const attach_type_strings[] = {
>  	[BPF_SK_SKB_STREAM_PARSER] = "stream_parser",
>  	[BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict",
>  	[BPF_SK_MSG_VERDICT] = "msg_verdict",
> +	[BPF_FLOW_DISSECTOR] = "flow_dissector",
>  	[__MAX_BPF_ATTACH_TYPE] = NULL,
>  };
>  
> @@ -724,9 +725,10 @@ int map_replace_compar(const void *p1, const void *p2)
>  static int do_attach(int argc, char **argv)
>  {
>  	enum bpf_attach_type attach_type;
> -	int err, mapfd, progfd;
> +	int err, progfd;
> +	int mapfd = 0;
>  
> -	if (!REQ_ARGS(5)) {
> +	if (!REQ_ARGS(3)) {
>  		p_err("too few parameters for map attach");
>  		return -EINVAL;
>  	}
> @@ -741,10 +743,11 @@ static int do_attach(int argc, char **argv)
>  		return -EINVAL;
>  	}
>  	NEXT_ARG();
> -
> -	mapfd = map_parse_fd(&argc, &argv);
> -	if (mapfd < 0)
> -		return mapfd;
> +	if (argc > 0) {

Flow dissector can't need a map right?  I think explicitly checking for
the correct number of arguments once attach type is known would be good.

> +		mapfd = map_parse_fd(&argc, &argv);
> +		if (mapfd < 0)
> +			return mapfd;
> +	}
>  
>  	err = bpf_prog_attach(progfd, mapfd, attach_type, 0);
>  	if (err) {
> @@ -760,9 +763,10 @@ static int do_attach(int argc, char **argv)
>  static int do_detach(int argc, char **argv)
>  {
>  	enum bpf_attach_type attach_type;
> -	int err, mapfd, progfd;
> +	int err, progfd;
> +	int mapfd = 0;
>  
> -	if (!REQ_ARGS(5)) {
> +	if (!REQ_ARGS(3)) {
>  		p_err("too few parameters for map detach");
>  		return -EINVAL;
>  	}
> @@ -777,10 +781,11 @@ static int do_detach(int argc, char **argv)
>  		return -EINVAL;
>  	}
>  	NEXT_ARG();
> -
> -	mapfd = map_parse_fd(&argc, &argv);
> -	if (mapfd < 0)
> -		return mapfd;
> +	if (argc > 0) {
> +		mapfd = map_parse_fd(&argc, &argv);
> +		if (mapfd < 0)
> +			return mapfd;
> +	}
>  
>  	err = bpf_prog_detach2(progfd, mapfd, attach_type);
>  	if (err) {
> @@ -792,6 +797,56 @@ static int do_detach(int argc, char **argv)
>  		jsonw_null(json_wtr);
>  	return 0;
>  }
> +
> +/* Flow dissector consists of a main program and a jump table for each
> + * supported protocol. The assumption here is that the first prog is the main
> + * one and the other progs are used in the tail calls. In this routine we
> + * build the jump table for the non-main progs.
> + */
> +static int build_flow_dissector_jmp_table(struct bpf_object *obj,
> +					  struct bpf_program *prog,
> +					  const char *jmp_table_map)
> +{
> +	struct bpf_map *jmp_table;
> +	struct bpf_program *pos;
> +	int i = 0;
> +	int prog_fd, jmp_table_fd, fd;

Please order variables longest to shortest.

> +	prog_fd = bpf_program__fd(prog);
> +	if (prog_fd < 0) {
> +		p_err("failed to get fd of main prog");
> +		return prog_fd;
> +	}
> +
> +	jmp_table = bpf_object__find_map_by_name(obj, jmp_table_map);
> +	if (jmp_table == NULL) {

nit: !jmp_table

> +		p_err("failed to find '%s' map", jmp_table_map);
> +		return -1;
> +	}
> +
> +	jmp_table_fd = bpf_map__fd(jmp_table);
> +	if (jmp_table_fd < 0) {
> +		p_err("failed to get fd of jmp_table");
> +		return jmp_table_fd;
> +	}
> +
> +	bpf_object__for_each_program(pos, obj) {
> +		fd = bpf_program__fd(pos);
> +		if (fd < 0) {
> +			p_err("failed to get fd of '%s'",
> +			      bpf_program__title(pos, false));
> +			return fd;
> +		}
> +
> +		if (fd != prog_fd) {
> +			bpf_map_update_elem(jmp_table_fd, &i, &fd, BPF_ANY);
> +			++i;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static int do_load(int argc, char **argv)
>  {
>  	enum bpf_attach_type expected_attach_type;
> @@ -800,7 +855,7 @@ static int do_load(int argc, char **argv)
>  	};
>  	struct map_replace *map_replace = NULL;
>  	unsigned int old_map_fds = 0;
> -	struct bpf_program *prog;
> +	struct bpf_program *prog, *pos;

variable order

>  	struct bpf_object *obj;
>  	struct bpf_map *map;
>  	const char *pinfile;
> @@ -918,13 +973,19 @@ static int do_load(int argc, char **argv)
>  		goto err_free_reuse_maps;
>  	}
>  
> -	prog = bpf_program__next(NULL, obj);
> +	if (attr.prog_type == BPF_PROG_TYPE_FLOW_DISSECTOR) {
> +		/* for the flow dissector type, the entry point is in the
> +		 * section flow_dissector; other progs are tail calls
> +		 */
> +		prog = bpf_object__find_program_by_title(obj, "flow_dissector");
> +	} else {
> +		prog = bpf_program__next(NULL, obj);
> +	}
>  	if (!prog) {
>  		p_err("object file doesn't contain any bpf program");
>  		goto err_close_obj;
>  	}
>  
> -	bpf_program__set_ifindex(prog, ifindex);
>  	if (attr.prog_type == BPF_PROG_TYPE_UNSPEC) {
>  		const char *sec_name = bpf_program__title(prog, false);
>  
> @@ -936,8 +997,13 @@ static int do_load(int argc, char **argv)
>  			goto err_close_obj;
>  		}
>  	}
> -	bpf_program__set_type(prog, attr.prog_type);
> -	bpf_program__set_expected_attach_type(prog, expected_attach_type);
> +
> +	bpf_object__for_each_program(pos, obj) {
> +		bpf_program__set_ifindex(pos, ifindex);
> +		bpf_program__set_type(pos, attr.prog_type);
> +		bpf_program__set_expected_attach_type(pos,
> +						      expected_attach_type);
> +	}
>  
>  	qsort(map_replace, old_map_fds, sizeof(*map_replace),
>  	      map_replace_compar);
> @@ -1001,8 +1067,34 @@ static int do_load(int argc, char **argv)
>  		goto err_close_obj;
>  	}
>  
> -	if (do_pin_fd(bpf_program__fd(prog), pinfile))
> +	err = mount_bpffs_for_pin(pinfile);
> +	if (err) {
> +		p_err("failed to mount bpffs for pin '%s'", pinfile);

Probably would be a duplicated error again?

>  		goto err_close_obj;
> +	}
> +
> +	if (attr.prog_type == BPF_PROG_TYPE_FLOW_DISSECTOR) {
> +		err = build_flow_dissector_jmp_table(obj, prog, "jmp_table");
> +		if (err) {
> +			p_err("failed to build flow dissector jump table");
> +			goto err_close_obj;
> +		}
> +		/* flow dissector consist of multiple programs,
> +		 * we want to pin them all

Why pin them all shouldn't the main program be the only one pinned?

> +		 */
> +		err = bpf_object__pin(obj, pinfile);
> +		if (err) {
> +			p_err("failed to pin flow dissector object");
> +			goto err_close_obj;
> +		}
> +	} else {
> +		err = bpf_obj_pin(bpf_program__fd(prog), pinfile);
> +		if (err) {
> +			p_err("failed to pin program %s",
> +			      bpf_program__title(prog, false));
> +			goto err_close_obj;
> +		}
> +	}
Quentin Monnet Nov. 7, 2018, 8:08 p.m. UTC | #2
Hi Stanislav,

2018-11-07 11:35 UTC-0800 ~ Stanislav Fomichev <sdf@google.com>
> This commit adds support for loading/attaching/detaching flow
> dissector program. The structure of the flow dissector program is
> assumed to be the same as in the selftests:
> 
> * flow_dissector section with the main entry point
> * a bunch of tail call progs
> * a jmp_table map that is populated with the tail call progs
> 
> When `bpftool load` is called with a flow_dissector prog (i.e. when the
> first section is flow_dissector of 'type flow_dissector' argument is
> passed), we load and pin all the programs and build the jump table.
> 
> The last argument of `bpftool attach` is made optional for this use
> case.
> 
> Example:
> bpftool prog load tools/testing/selftests/bpf/bpf_flow.o \
> 	/sys/fs/bpf/flow type flow_dissector
> bpftool prog attach pinned /sys/fs/bpf/flow/flow_dissector/0 flow_dissector
> 
> Tested by using the above two lines to load the prog in
> the test_flow_dissector.sh selftest.
> 
> Signed-off-by: Stanislav Fomichev <sdf@google.com>
> ---
>  .../bpftool/Documentation/bpftool-prog.rst    |  16 ++-
>  tools/bpf/bpftool/common.c                    |  32 +++--
>  tools/bpf/bpftool/main.h                      |   1 +
>  tools/bpf/bpftool/prog.c                      | 135 +++++++++++++++---
>  4 files changed, 141 insertions(+), 43 deletions(-)
> 
> diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> index ac4e904b10fb..3caa9153435b 100644
> --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> @@ -25,8 +25,8 @@ MAP COMMANDS
>  |	**bpftool** **prog dump jited**  *PROG* [{**file** *FILE* | **opcodes**}]
>  |	**bpftool** **prog pin** *PROG* *FILE*
>  |	**bpftool** **prog load** *OBJ* *FILE* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*]
> -|       **bpftool** **prog attach** *PROG* *ATTACH_TYPE* *MAP*
> -|       **bpftool** **prog detach** *PROG* *ATTACH_TYPE* *MAP*
> +|       **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
> +|       **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
>  |	**bpftool** **prog help**
>  |
>  |	*MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
> @@ -39,7 +39,9 @@ MAP COMMANDS
>  |		**cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** |
>  |		**cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6**
>  |	}
> -|       *ATTACH_TYPE* := { **msg_verdict** | **skb_verdict** | **skb_parse** }
> +|       *ATTACH_TYPE* := {
> +|	|	**msg_verdict** | **skb_verdict** | **skb_parse** | **flow_dissector**

        ^
Nitpick: Could you please remove the above pipe?

> +|	}
>  
>  
>  DESCRIPTION
> @@ -97,13 +99,13 @@ DESCRIPTION
>  		  contain a dot character ('.'), which is reserved for future
>  		  extensions of *bpffs*.
>  
> -        **bpftool prog attach** *PROG* *ATTACH_TYPE* *MAP*
> +        **bpftool prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
>                    Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*)
> -                  to the map *MAP*.
> +                  to the optional map *MAP*.
>  
> -        **bpftool prog detach** *PROG* *ATTACH_TYPE* *MAP*
> +        **bpftool prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
>                    Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*)
> -                  from the map *MAP*.
> +                  from the optional map *MAP*.
>  
>  	**bpftool prog help**
>  		  Print short help message.
> diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
> index 25af85304ebe..963881142dfb 100644
> --- a/tools/bpf/bpftool/common.c
> +++ b/tools/bpf/bpftool/common.c
> @@ -169,34 +169,24 @@ int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type)
>  	return fd;
>  }
>  
> -int do_pin_fd(int fd, const char *name)
> +int mount_bpffs_for_pin(const char *name)
>  {
>  	char err_str[ERR_MAX_LEN];
>  	char *file;
>  	char *dir;
>  	int err = 0;
>  
> -	err = bpf_obj_pin(fd, name);
> -	if (!err)
> -		goto out;
> -
>  	file = malloc(strlen(name) + 1);
>  	strcpy(file, name);
>  	dir = dirname(file);
>  
> -	if (errno != EPERM || is_bpffs(dir)) {
> -		p_err("can't pin the object (%s): %s", name, strerror(errno));
> +	if (is_bpffs(dir)) {
> +		/* nothing to do if already mounted */
>  		goto out_free;
>  	}
>  
> -	/* Attempt to mount bpffs, then retry pinning. */
>  	err = mnt_bpffs(dir, err_str, ERR_MAX_LEN);
> -	if (!err) {
> -		err = bpf_obj_pin(fd, name);
> -		if (err)
> -			p_err("can't pin the object (%s): %s", name,
> -			      strerror(errno));
> -	} else {
> +	if (err) {
>  		err_str[ERR_MAX_LEN - 1] = '\0';
>  		p_err("can't mount BPF file system to pin the object (%s): %s",
>  		      name, err_str);
> @@ -204,10 +194,22 @@ int do_pin_fd(int fd, const char *name)
>  
>  out_free:
>  	free(file);
> -out:
>  	return err;
>  }
>  
> +int do_pin_fd(int fd, const char *name)
> +{
> +	int err = mount_bpffs_for_pin(name);
> +
> +	if (err) {
> +		p_err("can't mount bpffs for pin %s: %s",
> +		      name, strerror(errno));
> +		return err;
> +	}
> +
> +	return bpf_obj_pin(fd, name);
> +}
> +
>  int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32))
>  {
>  	unsigned int id;
> diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
> index 28322ace2856..1383824c9baf 100644
> --- a/tools/bpf/bpftool/main.h
> +++ b/tools/bpf/bpftool/main.h
> @@ -129,6 +129,7 @@ const char *get_fd_type_name(enum bpf_obj_type type);
>  char *get_fdinfo(int fd, const char *key);
>  int open_obj_pinned(char *path);
>  int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type);
> +int mount_bpffs_for_pin(const char *name);
>  int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32));
>  int do_pin_fd(int fd, const char *name);
>  
> diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
> index 5302ee282409..f3a07ec3a444 100644
> --- a/tools/bpf/bpftool/prog.c
> +++ b/tools/bpf/bpftool/prog.c
> @@ -81,6 +81,7 @@ static const char * const attach_type_strings[] = {
>  	[BPF_SK_SKB_STREAM_PARSER] = "stream_parser",
>  	[BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict",
>  	[BPF_SK_MSG_VERDICT] = "msg_verdict",
> +	[BPF_FLOW_DISSECTOR] = "flow_dissector",
>  	[__MAX_BPF_ATTACH_TYPE] = NULL,
>  };
>  
> @@ -724,9 +725,10 @@ int map_replace_compar(const void *p1, const void *p2)
>  static int do_attach(int argc, char **argv)
>  {
>  	enum bpf_attach_type attach_type;
> -	int err, mapfd, progfd;
> +	int err, progfd;
> +	int mapfd = 0;
>  
> -	if (!REQ_ARGS(5)) {
> +	if (!REQ_ARGS(3)) {
>  		p_err("too few parameters for map attach");
>  		return -EINVAL;
>  	}
> @@ -741,10 +743,11 @@ static int do_attach(int argc, char **argv)
>  		return -EINVAL;
>  	}
>  	NEXT_ARG();
> -
> -	mapfd = map_parse_fd(&argc, &argv);
> -	if (mapfd < 0)
> -		return mapfd;
> +	if (argc > 0) {
> +		mapfd = map_parse_fd(&argc, &argv);
> +		if (mapfd < 0)
> +			return mapfd;
> +	}
>  
>  	err = bpf_prog_attach(progfd, mapfd, attach_type, 0);
>  	if (err) {
> @@ -760,9 +763,10 @@ static int do_attach(int argc, char **argv)
>  static int do_detach(int argc, char **argv)
>  {
>  	enum bpf_attach_type attach_type;
> -	int err, mapfd, progfd;
> +	int err, progfd;
> +	int mapfd = 0;
>  
> -	if (!REQ_ARGS(5)) {
> +	if (!REQ_ARGS(3)) {
>  		p_err("too few parameters for map detach");
>  		return -EINVAL;
>  	}
> @@ -777,10 +781,11 @@ static int do_detach(int argc, char **argv)
>  		return -EINVAL;
>  	}
>  	NEXT_ARG();
> -
> -	mapfd = map_parse_fd(&argc, &argv);
> -	if (mapfd < 0)
> -		return mapfd;
> +	if (argc > 0) {
> +		mapfd = map_parse_fd(&argc, &argv);
> +		if (mapfd < 0)
> +			return mapfd;
> +	}
>  
>  	err = bpf_prog_detach2(progfd, mapfd, attach_type);
>  	if (err) {
> @@ -792,6 +797,56 @@ static int do_detach(int argc, char **argv)
>  		jsonw_null(json_wtr);
>  	return 0;
>  }
> +
> +/* Flow dissector consists of a main program and a jump table for each
> + * supported protocol. The assumption here is that the first prog is the main
> + * one and the other progs are used in the tail calls. In this routine we
> + * build the jump table for the non-main progs.
> + */
> +static int build_flow_dissector_jmp_table(struct bpf_object *obj,
> +					  struct bpf_program *prog,
> +					  const char *jmp_table_map)
> +{
> +	struct bpf_map *jmp_table;
> +	struct bpf_program *pos;
> +	int i = 0;
> +	int prog_fd, jmp_table_fd, fd;
> +
> +	prog_fd = bpf_program__fd(prog);
> +	if (prog_fd < 0) {
> +		p_err("failed to get fd of main prog");
> +		return prog_fd;
> +	}
> +
> +	jmp_table = bpf_object__find_map_by_name(obj, jmp_table_map);
> +	if (jmp_table == NULL) {
> +		p_err("failed to find '%s' map", jmp_table_map);
> +		return -1;
> +	}
> +
> +	jmp_table_fd = bpf_map__fd(jmp_table);
> +	if (jmp_table_fd < 0) {
> +		p_err("failed to get fd of jmp_table");
> +		return jmp_table_fd;
> +	}
> +
> +	bpf_object__for_each_program(pos, obj) {
> +		fd = bpf_program__fd(pos);
> +		if (fd < 0) {
> +			p_err("failed to get fd of '%s'",
> +			      bpf_program__title(pos, false));
> +			return fd;
> +		}
> +
> +		if (fd != prog_fd) {
> +			bpf_map_update_elem(jmp_table_fd, &i, &fd, BPF_ANY);
> +			++i;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
>  static int do_load(int argc, char **argv)
>  {
>  	enum bpf_attach_type expected_attach_type;
> @@ -800,7 +855,7 @@ static int do_load(int argc, char **argv)
>  	};
>  	struct map_replace *map_replace = NULL;
>  	unsigned int old_map_fds = 0;
> -	struct bpf_program *prog;
> +	struct bpf_program *prog, *pos;
>  	struct bpf_object *obj;
>  	struct bpf_map *map;
>  	const char *pinfile;
> @@ -918,13 +973,19 @@ static int do_load(int argc, char **argv)
>  		goto err_free_reuse_maps;
>  	}
>  
> -	prog = bpf_program__next(NULL, obj);
> +	if (attr.prog_type == BPF_PROG_TYPE_FLOW_DISSECTOR) {
> +		/* for the flow dissector type, the entry point is in the
> +		 * section flow_dissector; other progs are tail calls
> +		 */
> +		prog = bpf_object__find_program_by_title(obj, "flow_dissector");

Is the section always called like this? Or could we use instead the same
assumption as above, i.e. the main program is the first one?

> +	} else {
> +		prog = bpf_program__next(NULL, obj);
> +	}
>  	if (!prog) {
>  		p_err("object file doesn't contain any bpf program");
>  		goto err_close_obj;
>  	}
>  
> -	bpf_program__set_ifindex(prog, ifindex);
>  	if (attr.prog_type == BPF_PROG_TYPE_UNSPEC) {
>  		const char *sec_name = bpf_program__title(prog, false);
>  
> @@ -936,8 +997,13 @@ static int do_load(int argc, char **argv)
>  			goto err_close_obj;
>  		}
>  	}
> -	bpf_program__set_type(prog, attr.prog_type);
> -	bpf_program__set_expected_attach_type(prog, expected_attach_type);
> +
> +	bpf_object__for_each_program(pos, obj) {
> +		bpf_program__set_ifindex(pos, ifindex);
> +		bpf_program__set_type(pos, attr.prog_type);
> +		bpf_program__set_expected_attach_type(pos,
> +						      expected_attach_type);
> +	}

I believe it is possible to load program of several types or attach
types from a same object file? If so, we would need to get individual
prog types and attach types for each program with
libbpf_prog_type_by_name().

>  
>  	qsort(map_replace, old_map_fds, sizeof(*map_replace),
>  	      map_replace_compar);
> @@ -1001,8 +1067,34 @@ static int do_load(int argc, char **argv)
>  		goto err_close_obj;
>  	}
>  
> -	if (do_pin_fd(bpf_program__fd(prog), pinfile))
> +	err = mount_bpffs_for_pin(pinfile);
> +	if (err) {
> +		p_err("failed to mount bpffs for pin '%s'", pinfile);
>  		goto err_close_obj;
> +	}
> +
> +	if (attr.prog_type == BPF_PROG_TYPE_FLOW_DISSECTOR) {
> +		err = build_flow_dissector_jmp_table(obj, prog, "jmp_table");
> +		if (err) {
> +			p_err("failed to build flow dissector jump table");
> +			goto err_close_obj;
> +		}
> +		/* flow dissector consist of multiple programs,
> +		 * we want to pin them all
> +		 */
> +		err = bpf_object__pin(obj, pinfile);
> +		if (err) {
> +			p_err("failed to pin flow dissector object");
> +			goto err_close_obj;
> +		}

What happens for the programs previously loaded and pinned in one of the
program fails? Do they remain loaded and pinned even if all programs
were not successfully loaded? Or should we consider removing all links
we added instead and go back to a "clean" state?

> +	} else {
> +		err = bpf_obj_pin(bpf_program__fd(prog), pinfile);
> +		if (err) {
> +			p_err("failed to pin program %s",
> +			      bpf_program__title(prog, false));
> +			goto err_close_obj;
> +		}

I don't have the same opinion as Jakub for pinning :). I was hoping we
could also load additional programs (for tail calls) for
non-flow_dissector programs. Could this be an occasion to update the
code in that direction?

> +	}
>  
>  	if (json_output)
>  		jsonw_null(json_wtr);
> @@ -1037,8 +1129,8 @@ static int do_help(int argc, char **argv)
>  		"       %s %s pin   PROG FILE\n"
>  		"       %s %s load  OBJ  FILE [type TYPE] [dev NAME] \\\n"
>  		"                         [map { idx IDX | name NAME } MAP]\n"
> -		"       %s %s attach PROG ATTACH_TYPE MAP\n"
> -		"       %s %s detach PROG ATTACH_TYPE MAP\n"
> +		"       %s %s attach PROG ATTACH_TYPE [MAP]\n"
> +		"       %s %s detach PROG ATTACH_TYPE [MAP]\n"
>  		"       %s %s help\n"
>  		"\n"
>  		"       " HELP_SPEC_MAP "\n"
> @@ -1050,7 +1142,8 @@ static int do_help(int argc, char **argv)
>  		"                 cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n"
>  		"                 cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n"
>  		"                 cgroup/sendmsg4 | cgroup/sendmsg6 }\n"
> -		"       ATTACH_TYPE := { msg_verdict | skb_verdict | skb_parse }\n"
> +		"       ATTACH_TYPE := { msg_verdict | skb_verdict | skb_parse |\n"
> +		"                        flow_dissector }\n"
>  		"       " HELP_SPEC_OPTIONS "\n"
>  		"",
>  		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
> 

Thanks a lot for the set!
Quentin
Jakub Kicinski Nov. 7, 2018, 8:32 p.m. UTC | #3
On Wed, 7 Nov 2018 20:08:53 +0000, Quentin Monnet wrote:
> > +		err = bpf_obj_pin(bpf_program__fd(prog), pinfile);
> > +		if (err) {
> > +			p_err("failed to pin program %s",
> > +			      bpf_program__title(prog, false));
> > +			goto err_close_obj;
> > +		}  
> 
> I don't have the same opinion as Jakub for pinning :). I was hoping we
> could also load additional programs (for tail calls) for
> non-flow_dissector programs. Could this be an occasion to update the
> code in that direction?

Do you mean having the bpftool construct an array for tail calling
automatically when loading an object?  Or do a "mass pin" of all
programs in an object file?

I'm not convinced about this strategy of auto assembling a tail call
array by assuming that a flow dissector object carries programs for
protocols in order (apart from the main program which doesn't have to
be first, for some reason).
Stanislav Fomichev Nov. 7, 2018, 9 p.m. UTC | #4
On 11/07, Jakub Kicinski wrote:
> On Wed,  7 Nov 2018 11:35:52 -0800, Stanislav Fomichev wrote:
> > This commit adds support for loading/attaching/detaching flow
> > dissector program. The structure of the flow dissector program is
> > assumed to be the same as in the selftests:
> > 
> > * flow_dissector section with the main entry point
> > * a bunch of tail call progs
> > * a jmp_table map that is populated with the tail call progs
> > 
> > When `bpftool load` is called with a flow_dissector prog (i.e. when the
> > first section is flow_dissector of 'type flow_dissector' argument is
> > passed), we load and pin all the programs and build the jump table.
> > 
> > The last argument of `bpftool attach` is made optional for this use
> > case.
> > 
> > Example:
> > bpftool prog load tools/testing/selftests/bpf/bpf_flow.o \
> > 	/sys/fs/bpf/flow type flow_dissector
> > bpftool prog attach pinned /sys/fs/bpf/flow/flow_dissector/0 flow_dissector
> > 
> > Tested by using the above two lines to load the prog in
> > the test_flow_dissector.sh selftest.
> > 
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > ---
> >  .../bpftool/Documentation/bpftool-prog.rst    |  16 ++-
> >  tools/bpf/bpftool/common.c                    |  32 +++--
> >  tools/bpf/bpftool/main.h                      |   1 +
> >  tools/bpf/bpftool/prog.c                      | 135 +++++++++++++++---
> 
> Please add the new attach type to bash completions.
Thanks for a quick review! Will address everything in the v2.
Answered some of your questions below.

> >  4 files changed, 141 insertions(+), 43 deletions(-)
> > 
> > diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> > index ac4e904b10fb..3caa9153435b 100644
> > --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> > +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> > @@ -25,8 +25,8 @@ MAP COMMANDS
> >  |	**bpftool** **prog dump jited**  *PROG* [{**file** *FILE* | **opcodes**}]
> >  |	**bpftool** **prog pin** *PROG* *FILE*
> >  |	**bpftool** **prog load** *OBJ* *FILE* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*]
> > -|       **bpftool** **prog attach** *PROG* *ATTACH_TYPE* *MAP*
> > -|       **bpftool** **prog detach** *PROG* *ATTACH_TYPE* *MAP*
> > +|       **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
> > +|       **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
> >  |	**bpftool** **prog help**
> >  |
> >  |	*MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
> > @@ -39,7 +39,9 @@ MAP COMMANDS
> >  |		**cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** |
> >  |		**cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6**
> >  |	}
> > -|       *ATTACH_TYPE* := { **msg_verdict** | **skb_verdict** | **skb_parse** }
> > +|       *ATTACH_TYPE* := {
> > +|	|	**msg_verdict** | **skb_verdict** | **skb_parse** | **flow_dissector**
> > +|	}
> >  
> >  
> >  DESCRIPTION
> > @@ -97,13 +99,13 @@ DESCRIPTION
> >  		  contain a dot character ('.'), which is reserved for future
> >  		  extensions of *bpffs*.
> >  
> > -        **bpftool prog attach** *PROG* *ATTACH_TYPE* *MAP*
> > +        **bpftool prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
> >                    Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*)
> > -                  to the map *MAP*.
> > +                  to the optional map *MAP*.
> 
> Perhaps we can do better on help?  Attach BPF program *PROG* (with type
> specified by *ATTACH_TYPE*).  Most *ATTACH_TYPEs* require a *MAP*
> parameter, with the exception of *flow_dissector* which is attached to
> current networking name space.
> 
> > -        **bpftool prog detach** *PROG* *ATTACH_TYPE* *MAP*
> > +        **bpftool prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
> >                    Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*)
> > -                  from the map *MAP*.
> > +                  from the optional map *MAP*.
> >  
> >  	**bpftool prog help**
> >  		  Print short help message.
> > diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
> > index 25af85304ebe..963881142dfb 100644
> > --- a/tools/bpf/bpftool/common.c
> > +++ b/tools/bpf/bpftool/common.c
> 
> > @@ -204,10 +194,22 @@ int do_pin_fd(int fd, const char *name)
> >  
> >  out_free:
> >  	free(file);
> > -out:
> >  	return err;
> >  }
> >  
> > +int do_pin_fd(int fd, const char *name)
> > +{
> > +	int err = mount_bpffs_for_pin(name);
> 
> Please don't initialize the error variable with a non-trivial function
> call.
> 
> > +	if (err) {
> > +		p_err("can't mount bpffs for pin %s: %s",
> > +		      name, strerror(errno));
> 
> I think mount_bpffs_for_pin() will already print an error.  We can't
> print two errors, because it will break JSON output.
> 
> > +		return err;
> > +	}
> > +
> > +	return bpf_obj_pin(fd, name);
> > +}
> > +
> >  int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32))
> >  {
> >  	unsigned int id;
> 
> > diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
> > index 5302ee282409..f3a07ec3a444 100644
> > --- a/tools/bpf/bpftool/prog.c
> > +++ b/tools/bpf/bpftool/prog.c
> > @@ -81,6 +81,7 @@ static const char * const attach_type_strings[] = {
> >  	[BPF_SK_SKB_STREAM_PARSER] = "stream_parser",
> >  	[BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict",
> >  	[BPF_SK_MSG_VERDICT] = "msg_verdict",
> > +	[BPF_FLOW_DISSECTOR] = "flow_dissector",
> >  	[__MAX_BPF_ATTACH_TYPE] = NULL,
> >  };
> >  
> > @@ -724,9 +725,10 @@ int map_replace_compar(const void *p1, const void *p2)
> >  static int do_attach(int argc, char **argv)
> >  {
> >  	enum bpf_attach_type attach_type;
> > -	int err, mapfd, progfd;
> > +	int err, progfd;
> > +	int mapfd = 0;
> >  
> > -	if (!REQ_ARGS(5)) {
> > +	if (!REQ_ARGS(3)) {
> >  		p_err("too few parameters for map attach");
> >  		return -EINVAL;
> >  	}
> > @@ -741,10 +743,11 @@ static int do_attach(int argc, char **argv)
> >  		return -EINVAL;
> >  	}
> >  	NEXT_ARG();
> > -
> > -	mapfd = map_parse_fd(&argc, &argv);
> > -	if (mapfd < 0)
> > -		return mapfd;
> > +	if (argc > 0) {
> 
> Flow dissector can't need a map right?  I think explicitly checking for
> the correct number of arguments once attach type is known would be good.
Makes sense. I initially didn't want to depend on the attach_type too
much, but it might be more readable actually. Will chain in v2.

> > +		mapfd = map_parse_fd(&argc, &argv);
> > +		if (mapfd < 0)
> > +			return mapfd;
> > +	}
> >  
> >  	err = bpf_prog_attach(progfd, mapfd, attach_type, 0);
> >  	if (err) {
> > @@ -760,9 +763,10 @@ static int do_attach(int argc, char **argv)
> >  static int do_detach(int argc, char **argv)
> >  {
> >  	enum bpf_attach_type attach_type;
> > -	int err, mapfd, progfd;
> > +	int err, progfd;
> > +	int mapfd = 0;
> >  
> > -	if (!REQ_ARGS(5)) {
> > +	if (!REQ_ARGS(3)) {
> >  		p_err("too few parameters for map detach");
> >  		return -EINVAL;
> >  	}
> > @@ -777,10 +781,11 @@ static int do_detach(int argc, char **argv)
> >  		return -EINVAL;
> >  	}
> >  	NEXT_ARG();
> > -
> > -	mapfd = map_parse_fd(&argc, &argv);
> > -	if (mapfd < 0)
> > -		return mapfd;
> > +	if (argc > 0) {
> > +		mapfd = map_parse_fd(&argc, &argv);
> > +		if (mapfd < 0)
> > +			return mapfd;
> > +	}
> >  
> >  	err = bpf_prog_detach2(progfd, mapfd, attach_type);
> >  	if (err) {
> > @@ -792,6 +797,56 @@ static int do_detach(int argc, char **argv)
> >  		jsonw_null(json_wtr);
> >  	return 0;
> >  }
> > +
> > +/* Flow dissector consists of a main program and a jump table for each
> > + * supported protocol. The assumption here is that the first prog is the main
> > + * one and the other progs are used in the tail calls. In this routine we
> > + * build the jump table for the non-main progs.
> > + */
> > +static int build_flow_dissector_jmp_table(struct bpf_object *obj,
> > +					  struct bpf_program *prog,
> > +					  const char *jmp_table_map)
> > +{
> > +	struct bpf_map *jmp_table;
> > +	struct bpf_program *pos;
> > +	int i = 0;
> > +	int prog_fd, jmp_table_fd, fd;
> 
> Please order variables longest to shortest.
> 
> > +	prog_fd = bpf_program__fd(prog);
> > +	if (prog_fd < 0) {
> > +		p_err("failed to get fd of main prog");
> > +		return prog_fd;
> > +	}
> > +
> > +	jmp_table = bpf_object__find_map_by_name(obj, jmp_table_map);
> > +	if (jmp_table == NULL) {
> 
> nit: !jmp_table
> 
> > +		p_err("failed to find '%s' map", jmp_table_map);
> > +		return -1;
> > +	}
> > +
> > +	jmp_table_fd = bpf_map__fd(jmp_table);
> > +	if (jmp_table_fd < 0) {
> > +		p_err("failed to get fd of jmp_table");
> > +		return jmp_table_fd;
> > +	}
> > +
> > +	bpf_object__for_each_program(pos, obj) {
> > +		fd = bpf_program__fd(pos);
> > +		if (fd < 0) {
> > +			p_err("failed to get fd of '%s'",
> > +			      bpf_program__title(pos, false));
> > +			return fd;
> > +		}
> > +
> > +		if (fd != prog_fd) {
> > +			bpf_map_update_elem(jmp_table_fd, &i, &fd, BPF_ANY);
> > +			++i;
> > +		}
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> >  static int do_load(int argc, char **argv)
> >  {
> >  	enum bpf_attach_type expected_attach_type;
> > @@ -800,7 +855,7 @@ static int do_load(int argc, char **argv)
> >  	};
> >  	struct map_replace *map_replace = NULL;
> >  	unsigned int old_map_fds = 0;
> > -	struct bpf_program *prog;
> > +	struct bpf_program *prog, *pos;
> 
> variable order
> 
> >  	struct bpf_object *obj;
> >  	struct bpf_map *map;
> >  	const char *pinfile;
> > @@ -918,13 +973,19 @@ static int do_load(int argc, char **argv)
> >  		goto err_free_reuse_maps;
> >  	}
> >  
> > -	prog = bpf_program__next(NULL, obj);
> > +	if (attr.prog_type == BPF_PROG_TYPE_FLOW_DISSECTOR) {
> > +		/* for the flow dissector type, the entry point is in the
> > +		 * section flow_dissector; other progs are tail calls
> > +		 */
> > +		prog = bpf_object__find_program_by_title(obj, "flow_dissector");
> > +	} else {
> > +		prog = bpf_program__next(NULL, obj);
> > +	}
> >  	if (!prog) {
> >  		p_err("object file doesn't contain any bpf program");
> >  		goto err_close_obj;
> >  	}
> >  
> > -	bpf_program__set_ifindex(prog, ifindex);
> >  	if (attr.prog_type == BPF_PROG_TYPE_UNSPEC) {
> >  		const char *sec_name = bpf_program__title(prog, false);
> >  
> > @@ -936,8 +997,13 @@ static int do_load(int argc, char **argv)
> >  			goto err_close_obj;
> >  		}
> >  	}
> > -	bpf_program__set_type(prog, attr.prog_type);
> > -	bpf_program__set_expected_attach_type(prog, expected_attach_type);
> > +
> > +	bpf_object__for_each_program(pos, obj) {
> > +		bpf_program__set_ifindex(pos, ifindex);
> > +		bpf_program__set_type(pos, attr.prog_type);
> > +		bpf_program__set_expected_attach_type(pos,
> > +						      expected_attach_type);
> > +	}
> >  
> >  	qsort(map_replace, old_map_fds, sizeof(*map_replace),
> >  	      map_replace_compar);
> > @@ -1001,8 +1067,34 @@ static int do_load(int argc, char **argv)
> >  		goto err_close_obj;
> >  	}
> >  
> > -	if (do_pin_fd(bpf_program__fd(prog), pinfile))
> > +	err = mount_bpffs_for_pin(pinfile);
> > +	if (err) {
> > +		p_err("failed to mount bpffs for pin '%s'", pinfile);
> 
> Probably would be a duplicated error again?
> 
> >  		goto err_close_obj;
> > +	}
> > +
> > +	if (attr.prog_type == BPF_PROG_TYPE_FLOW_DISSECTOR) {
> > +		err = build_flow_dissector_jmp_table(obj, prog, "jmp_table");
> > +		if (err) {
> > +			p_err("failed to build flow dissector jump table");
> > +			goto err_close_obj;
> > +		}
> > +		/* flow dissector consist of multiple programs,
> > +		 * we want to pin them all
> 
> Why pin them all shouldn't the main program be the only one pinned?
If I pin only the main program, the tail ones disappear from the jmp_table map
when bpftool exits.
Am I missing something?
Should BPF_MAP_TYPE_PROG_ARRAY hold the referenced progs?

> > +		 */
> > +		err = bpf_object__pin(obj, pinfile);
> > +		if (err) {
> > +			p_err("failed to pin flow dissector object");
> > +			goto err_close_obj;
> > +		}
> > +	} else {
> > +		err = bpf_obj_pin(bpf_program__fd(prog), pinfile);
> > +		if (err) {
> > +			p_err("failed to pin program %s",
> > +			      bpf_program__title(prog, false));
> > +			goto err_close_obj;
> > +		}
> > +	}
>
Jakub Kicinski Nov. 7, 2018, 9:12 p.m. UTC | #5
On Wed, 7 Nov 2018 13:00:09 -0800, Stanislav Fomichev wrote:
> > > +	}
> > > +
> > > +	if (attr.prog_type == BPF_PROG_TYPE_FLOW_DISSECTOR) {
> > > +		err = build_flow_dissector_jmp_table(obj, prog, "jmp_table");
> > > +		if (err) {
> > > +			p_err("failed to build flow dissector jump table");
> > > +			goto err_close_obj;
> > > +		}
> > > +		/* flow dissector consist of multiple programs,
> > > +		 * we want to pin them all  
> > 
> > Why pin them all shouldn't the main program be the only one pinned?  
> If I pin only the main program, the tail ones disappear from the jmp_table map
> when bpftool exits.
> Am I missing something?
> Should BPF_MAP_TYPE_PROG_ARRAY hold the referenced progs?

It does.

# bpftool map create /sys/fs/bpf/map \
	type prog_array key 4 value 4 entries 4 \
	name tail_call_map
# bpftool prog load perf_event_output_stack.o /sys/fs/bpf/prog \
	 type xdp 
# bpftool map update pinned /sys/fs/bpf/map \
	key 0 0 0 0 \
	value pinned /sys/fs/bpf/prog
# bpftool prog 
11: xdp  name xdp_prog1  tag 6f1b482b27443edf  gpl
	loaded_at 2018-11-07T13:09:20-0800  uid 0
	xlated 144B  jited 130B  memlock 4096B  map_ids 14
# rm /sys/fs/bpf/prog
# bpftool prog 
11: xdp  name xdp_prog1  tag 6f1b482b27443edf  gpl
	loaded_at 2018-11-07T13:09:20-0800  uid 0
	xlated 144B  jited 130B  memlock 4096B  map_ids 14
# rm /sys/fs/bpf/map 
# bpftool prog 
# bpftool prog show id 11
Error: get by id (11): No such file or directory


I think we should remove all this auto tail call construction unless
we have solid annotations in the elf file which can clearly guide the
loader.
Stanislav Fomichev Nov. 7, 2018, 9:17 p.m. UTC | #6
On 11/07, Quentin Monnet wrote:
> Hi Stanislav,
> 
> 2018-11-07 11:35 UTC-0800 ~ Stanislav Fomichev <sdf@google.com>
> > This commit adds support for loading/attaching/detaching flow
> > dissector program. The structure of the flow dissector program is
> > assumed to be the same as in the selftests:
> > 
> > * flow_dissector section with the main entry point
> > * a bunch of tail call progs
> > * a jmp_table map that is populated with the tail call progs
> > 
> > When `bpftool load` is called with a flow_dissector prog (i.e. when the
> > first section is flow_dissector of 'type flow_dissector' argument is
> > passed), we load and pin all the programs and build the jump table.
> > 
> > The last argument of `bpftool attach` is made optional for this use
> > case.
> > 
> > Example:
> > bpftool prog load tools/testing/selftests/bpf/bpf_flow.o \
> > 	/sys/fs/bpf/flow type flow_dissector
> > bpftool prog attach pinned /sys/fs/bpf/flow/flow_dissector/0 flow_dissector
> > 
> > Tested by using the above two lines to load the prog in
> > the test_flow_dissector.sh selftest.
> > 
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > ---
> >  .../bpftool/Documentation/bpftool-prog.rst    |  16 ++-
> >  tools/bpf/bpftool/common.c                    |  32 +++--
> >  tools/bpf/bpftool/main.h                      |   1 +
> >  tools/bpf/bpftool/prog.c                      | 135 +++++++++++++++---
> >  4 files changed, 141 insertions(+), 43 deletions(-)
> > 
> > diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> > index ac4e904b10fb..3caa9153435b 100644
> > --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> > +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
> > @@ -25,8 +25,8 @@ MAP COMMANDS
> >  |	**bpftool** **prog dump jited**  *PROG* [{**file** *FILE* | **opcodes**}]
> >  |	**bpftool** **prog pin** *PROG* *FILE*
> >  |	**bpftool** **prog load** *OBJ* *FILE* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*]
> > -|       **bpftool** **prog attach** *PROG* *ATTACH_TYPE* *MAP*
> > -|       **bpftool** **prog detach** *PROG* *ATTACH_TYPE* *MAP*
> > +|       **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
> > +|       **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
> >  |	**bpftool** **prog help**
> >  |
> >  |	*MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
> > @@ -39,7 +39,9 @@ MAP COMMANDS
> >  |		**cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** |
> >  |		**cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6**
> >  |	}
> > -|       *ATTACH_TYPE* := { **msg_verdict** | **skb_verdict** | **skb_parse** }
> > +|       *ATTACH_TYPE* := {
> > +|	|	**msg_verdict** | **skb_verdict** | **skb_parse** | **flow_dissector**
> 
>         ^
> Nitpick: Could you please remove the above pipe?
Ah, I missed that one, my bad, thanks.

> > +|	}
> >  
> >  
> >  DESCRIPTION
> > @@ -97,13 +99,13 @@ DESCRIPTION
> >  		  contain a dot character ('.'), which is reserved for future
> >  		  extensions of *bpffs*.
> >  
> > -        **bpftool prog attach** *PROG* *ATTACH_TYPE* *MAP*
> > +        **bpftool prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
> >                    Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*)
> > -                  to the map *MAP*.
> > +                  to the optional map *MAP*.
> >  
> > -        **bpftool prog detach** *PROG* *ATTACH_TYPE* *MAP*
> > +        **bpftool prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
> >                    Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*)
> > -                  from the map *MAP*.
> > +                  from the optional map *MAP*.
> >  
> >  	**bpftool prog help**
> >  		  Print short help message.
> > diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
> > index 25af85304ebe..963881142dfb 100644
> > --- a/tools/bpf/bpftool/common.c
> > +++ b/tools/bpf/bpftool/common.c
> > @@ -169,34 +169,24 @@ int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type)
> >  	return fd;
> >  }
> >  
> > -int do_pin_fd(int fd, const char *name)
> > +int mount_bpffs_for_pin(const char *name)
> >  {
> >  	char err_str[ERR_MAX_LEN];
> >  	char *file;
> >  	char *dir;
> >  	int err = 0;
> >  
> > -	err = bpf_obj_pin(fd, name);
> > -	if (!err)
> > -		goto out;
> > -
> >  	file = malloc(strlen(name) + 1);
> >  	strcpy(file, name);
> >  	dir = dirname(file);
> >  
> > -	if (errno != EPERM || is_bpffs(dir)) {
> > -		p_err("can't pin the object (%s): %s", name, strerror(errno));
> > +	if (is_bpffs(dir)) {
> > +		/* nothing to do if already mounted */
> >  		goto out_free;
> >  	}
> >  
> > -	/* Attempt to mount bpffs, then retry pinning. */
> >  	err = mnt_bpffs(dir, err_str, ERR_MAX_LEN);
> > -	if (!err) {
> > -		err = bpf_obj_pin(fd, name);
> > -		if (err)
> > -			p_err("can't pin the object (%s): %s", name,
> > -			      strerror(errno));
> > -	} else {
> > +	if (err) {
> >  		err_str[ERR_MAX_LEN - 1] = '\0';
> >  		p_err("can't mount BPF file system to pin the object (%s): %s",
> >  		      name, err_str);
> > @@ -204,10 +194,22 @@ int do_pin_fd(int fd, const char *name)
> >  
> >  out_free:
> >  	free(file);
> > -out:
> >  	return err;
> >  }
> >  
> > +int do_pin_fd(int fd, const char *name)
> > +{
> > +	int err = mount_bpffs_for_pin(name);
> > +
> > +	if (err) {
> > +		p_err("can't mount bpffs for pin %s: %s",
> > +		      name, strerror(errno));
> > +		return err;
> > +	}
> > +
> > +	return bpf_obj_pin(fd, name);
> > +}
> > +
> >  int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32))
> >  {
> >  	unsigned int id;
> > diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
> > index 28322ace2856..1383824c9baf 100644
> > --- a/tools/bpf/bpftool/main.h
> > +++ b/tools/bpf/bpftool/main.h
> > @@ -129,6 +129,7 @@ const char *get_fd_type_name(enum bpf_obj_type type);
> >  char *get_fdinfo(int fd, const char *key);
> >  int open_obj_pinned(char *path);
> >  int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type);
> > +int mount_bpffs_for_pin(const char *name);
> >  int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32));
> >  int do_pin_fd(int fd, const char *name);
> >  
> > diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
> > index 5302ee282409..f3a07ec3a444 100644
> > --- a/tools/bpf/bpftool/prog.c
> > +++ b/tools/bpf/bpftool/prog.c
> > @@ -81,6 +81,7 @@ static const char * const attach_type_strings[] = {
> >  	[BPF_SK_SKB_STREAM_PARSER] = "stream_parser",
> >  	[BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict",
> >  	[BPF_SK_MSG_VERDICT] = "msg_verdict",
> > +	[BPF_FLOW_DISSECTOR] = "flow_dissector",
> >  	[__MAX_BPF_ATTACH_TYPE] = NULL,
> >  };
> >  
> > @@ -724,9 +725,10 @@ int map_replace_compar(const void *p1, const void *p2)
> >  static int do_attach(int argc, char **argv)
> >  {
> >  	enum bpf_attach_type attach_type;
> > -	int err, mapfd, progfd;
> > +	int err, progfd;
> > +	int mapfd = 0;
> >  
> > -	if (!REQ_ARGS(5)) {
> > +	if (!REQ_ARGS(3)) {
> >  		p_err("too few parameters for map attach");
> >  		return -EINVAL;
> >  	}
> > @@ -741,10 +743,11 @@ static int do_attach(int argc, char **argv)
> >  		return -EINVAL;
> >  	}
> >  	NEXT_ARG();
> > -
> > -	mapfd = map_parse_fd(&argc, &argv);
> > -	if (mapfd < 0)
> > -		return mapfd;
> > +	if (argc > 0) {
> > +		mapfd = map_parse_fd(&argc, &argv);
> > +		if (mapfd < 0)
> > +			return mapfd;
> > +	}
> >  
> >  	err = bpf_prog_attach(progfd, mapfd, attach_type, 0);
> >  	if (err) {
> > @@ -760,9 +763,10 @@ static int do_attach(int argc, char **argv)
> >  static int do_detach(int argc, char **argv)
> >  {
> >  	enum bpf_attach_type attach_type;
> > -	int err, mapfd, progfd;
> > +	int err, progfd;
> > +	int mapfd = 0;
> >  
> > -	if (!REQ_ARGS(5)) {
> > +	if (!REQ_ARGS(3)) {
> >  		p_err("too few parameters for map detach");
> >  		return -EINVAL;
> >  	}
> > @@ -777,10 +781,11 @@ static int do_detach(int argc, char **argv)
> >  		return -EINVAL;
> >  	}
> >  	NEXT_ARG();
> > -
> > -	mapfd = map_parse_fd(&argc, &argv);
> > -	if (mapfd < 0)
> > -		return mapfd;
> > +	if (argc > 0) {
> > +		mapfd = map_parse_fd(&argc, &argv);
> > +		if (mapfd < 0)
> > +			return mapfd;
> > +	}
> >  
> >  	err = bpf_prog_detach2(progfd, mapfd, attach_type);
> >  	if (err) {
> > @@ -792,6 +797,56 @@ static int do_detach(int argc, char **argv)
> >  		jsonw_null(json_wtr);
> >  	return 0;
> >  }
> > +
> > +/* Flow dissector consists of a main program and a jump table for each
> > + * supported protocol. The assumption here is that the first prog is the main
> > + * one and the other progs are used in the tail calls. In this routine we
> > + * build the jump table for the non-main progs.
> > + */
> > +static int build_flow_dissector_jmp_table(struct bpf_object *obj,
> > +					  struct bpf_program *prog,
> > +					  const char *jmp_table_map)
> > +{
> > +	struct bpf_map *jmp_table;
> > +	struct bpf_program *pos;
> > +	int i = 0;
> > +	int prog_fd, jmp_table_fd, fd;
> > +
> > +	prog_fd = bpf_program__fd(prog);
> > +	if (prog_fd < 0) {
> > +		p_err("failed to get fd of main prog");
> > +		return prog_fd;
> > +	}
> > +
> > +	jmp_table = bpf_object__find_map_by_name(obj, jmp_table_map);
> > +	if (jmp_table == NULL) {
> > +		p_err("failed to find '%s' map", jmp_table_map);
> > +		return -1;
> > +	}
> > +
> > +	jmp_table_fd = bpf_map__fd(jmp_table);
> > +	if (jmp_table_fd < 0) {
> > +		p_err("failed to get fd of jmp_table");
> > +		return jmp_table_fd;
> > +	}
> > +
> > +	bpf_object__for_each_program(pos, obj) {
> > +		fd = bpf_program__fd(pos);
> > +		if (fd < 0) {
> > +			p_err("failed to get fd of '%s'",
> > +			      bpf_program__title(pos, false));
> > +			return fd;
> > +		}
> > +
> > +		if (fd != prog_fd) {
> > +			bpf_map_update_elem(jmp_table_fd, &i, &fd, BPF_ANY);
> > +			++i;
> > +		}
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> >  static int do_load(int argc, char **argv)
> >  {
> >  	enum bpf_attach_type expected_attach_type;
> > @@ -800,7 +855,7 @@ static int do_load(int argc, char **argv)
> >  	};
> >  	struct map_replace *map_replace = NULL;
> >  	unsigned int old_map_fds = 0;
> > -	struct bpf_program *prog;
> > +	struct bpf_program *prog, *pos;
> >  	struct bpf_object *obj;
> >  	struct bpf_map *map;
> >  	const char *pinfile;
> > @@ -918,13 +973,19 @@ static int do_load(int argc, char **argv)
> >  		goto err_free_reuse_maps;
> >  	}
> >  
> > -	prog = bpf_program__next(NULL, obj);
> > +	if (attr.prog_type == BPF_PROG_TYPE_FLOW_DISSECTOR) {
> > +		/* for the flow dissector type, the entry point is in the
> > +		 * section flow_dissector; other progs are tail calls
> > +		 */
> > +		prog = bpf_object__find_program_by_title(obj, "flow_dissector");
> 
> Is the section always called like this? Or could we use instead the same
> assumption as above, i.e. the main program is the first one?
Ideally yes, the main section should always be called
flow_dissector (to not break libbpf_prog_type_by_name assumptions).

We can call bpftool with and without the ATTACH_TYPE argument:

1. when 'type flow_dissector' is specified, we try to find section called
  'flow_dissector' and assume, that's the main prog (libbpf_prog_type_by_name)
2. when 'type flow_dissector' is _not_ specified, we try to detect
  program type from the first section

This check here is for the cases where first section is not
flow_dissector (but some tail call) and we want to force proper type.


> > +	} else {
> > +		prog = bpf_program__next(NULL, obj);
> > +	}
> >  	if (!prog) {
> >  		p_err("object file doesn't contain any bpf program");
> >  		goto err_close_obj;
> >  	}
> >  
> > -	bpf_program__set_ifindex(prog, ifindex);
> >  	if (attr.prog_type == BPF_PROG_TYPE_UNSPEC) {
> >  		const char *sec_name = bpf_program__title(prog, false);
> >  
> > @@ -936,8 +997,13 @@ static int do_load(int argc, char **argv)
> >  			goto err_close_obj;
> >  		}
> >  	}
> > -	bpf_program__set_type(prog, attr.prog_type);
> > -	bpf_program__set_expected_attach_type(prog, expected_attach_type);
> > +
> > +	bpf_object__for_each_program(pos, obj) {
> > +		bpf_program__set_ifindex(pos, ifindex);
> > +		bpf_program__set_type(pos, attr.prog_type);
> > +		bpf_program__set_expected_attach_type(pos,
> > +						      expected_attach_type);
> > +	}
> 
> I believe it is possible to load program of several types or attach
> types from a same object file? If so, we would need to get individual
> prog types and attach types for each program with
> libbpf_prog_type_by_name().
I don't think it works actually. I also assumed that initially, but as
you can see from the patch, I had to loop over the programs and set
correct type. Current implementation will fail if you have two progs
in the same obj AFAICT.

> >  	qsort(map_replace, old_map_fds, sizeof(*map_replace),
> >  	      map_replace_compar);
> > @@ -1001,8 +1067,34 @@ static int do_load(int argc, char **argv)
> >  		goto err_close_obj;
> >  	}
> >  
> > -	if (do_pin_fd(bpf_program__fd(prog), pinfile))
> > +	err = mount_bpffs_for_pin(pinfile);
> > +	if (err) {
> > +		p_err("failed to mount bpffs for pin '%s'", pinfile);
> >  		goto err_close_obj;
> > +	}
> > +
> > +	if (attr.prog_type == BPF_PROG_TYPE_FLOW_DISSECTOR) {
> > +		err = build_flow_dissector_jmp_table(obj, prog, "jmp_table");
> > +		if (err) {
> > +			p_err("failed to build flow dissector jump table");
> > +			goto err_close_obj;
> > +		}
> > +		/* flow dissector consist of multiple programs,
> > +		 * we want to pin them all
> > +		 */
> > +		err = bpf_object__pin(obj, pinfile);
> > +		if (err) {
> > +			p_err("failed to pin flow dissector object");
> > +			goto err_close_obj;
> > +		}
> 
> What happens for the programs previously loaded and pinned in one of the
> program fails? Do they remain loaded and pinned even if all programs
> were not successfully loaded? Or should we consider removing all links
> we added instead and go back to a "clean" state?
They remain loaded. I agree, we should go back to the clean state;
that probably should be achieved by adding proper cleanup to libbpf's
bpf_object__pin when it fails. I can look into that for v2.

> > +	} else {
> > +		err = bpf_obj_pin(bpf_program__fd(prog), pinfile);
> > +		if (err) {
> > +			p_err("failed to pin program %s",
> > +			      bpf_program__title(prog, false));
> > +			goto err_close_obj;
> > +		}
> 
> I don't have the same opinion as Jakub for pinning :). I was hoping we
> could also load additional programs (for tail calls) for
> non-flow_dissector programs. Could this be an occasion to update the
> code in that direction?
(See above for loading multiple progs from the same object).

If we want to be able to load non-flow_dissector programs from the same file,
we should have some way to distinguish between flow_dissector and
non-flow_dissector ones; I don't see an easy way to do that.

> > +	}
> >  
> >  	if (json_output)
> >  		jsonw_null(json_wtr);
> > @@ -1037,8 +1129,8 @@ static int do_help(int argc, char **argv)
> >  		"       %s %s pin   PROG FILE\n"
> >  		"       %s %s load  OBJ  FILE [type TYPE] [dev NAME] \\\n"
> >  		"                         [map { idx IDX | name NAME } MAP]\n"
> > -		"       %s %s attach PROG ATTACH_TYPE MAP\n"
> > -		"       %s %s detach PROG ATTACH_TYPE MAP\n"
> > +		"       %s %s attach PROG ATTACH_TYPE [MAP]\n"
> > +		"       %s %s detach PROG ATTACH_TYPE [MAP]\n"
> >  		"       %s %s help\n"
> >  		"\n"
> >  		"       " HELP_SPEC_MAP "\n"
> > @@ -1050,7 +1142,8 @@ static int do_help(int argc, char **argv)
> >  		"                 cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n"
> >  		"                 cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n"
> >  		"                 cgroup/sendmsg4 | cgroup/sendmsg6 }\n"
> > -		"       ATTACH_TYPE := { msg_verdict | skb_verdict | skb_parse }\n"
> > +		"       ATTACH_TYPE := { msg_verdict | skb_verdict | skb_parse |\n"
> > +		"                        flow_dissector }\n"
> >  		"       " HELP_SPEC_OPTIONS "\n"
> >  		"",
> >  		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],
> > 
> 
> Thanks a lot for the set!
Thank you for a review as well!
> Quentin
Quentin Monnet Nov. 7, 2018, 9:34 p.m. UTC | #7
2018-11-07 12:32 UTC-0800 ~ Jakub Kicinski <jakub.kicinski@netronome.com>
> On Wed, 7 Nov 2018 20:08:53 +0000, Quentin Monnet wrote:
>>> +		err = bpf_obj_pin(bpf_program__fd(prog), pinfile);
>>> +		if (err) {
>>> +			p_err("failed to pin program %s",
>>> +			      bpf_program__title(prog, false));
>>> +			goto err_close_obj;
>>> +		}
>>
>> I don't have the same opinion as Jakub for pinning :). I was hoping we
>> could also load additional programs (for tail calls) for
>> non-flow_dissector programs. Could this be an occasion to update the
>> code in that direction?
> 
> Do you mean having the bpftool construct an array for tail calling
> automatically when loading an object?  Or do a "mass pin" of all
> programs in an object file?
> 
> I'm not convinced about this strategy of auto assembling a tail call
> array by assuming that a flow dissector object carries programs for
> protocols in order (apart from the main program which doesn't have to
> be first, for some reason).

Not constructing the prog array, I don't think this should be the role 
of bpftool either. Much more a "mass pin", so that you have a link to 
each program loaded from the object file and can later add them to a 
prog array map with subsequent calls to bpftool.
Stanislav Fomichev Nov. 7, 2018, 10:15 p.m. UTC | #8
On 11/07, Quentin Monnet wrote:
> 2018-11-07 12:32 UTC-0800 ~ Jakub Kicinski <jakub.kicinski@netronome.com>
> > On Wed, 7 Nov 2018 20:08:53 +0000, Quentin Monnet wrote:
> > > > +		err = bpf_obj_pin(bpf_program__fd(prog), pinfile);
> > > > +		if (err) {
> > > > +			p_err("failed to pin program %s",
> > > > +			      bpf_program__title(prog, false));
> > > > +			goto err_close_obj;
> > > > +		}
> > > 
> > > I don't have the same opinion as Jakub for pinning :). I was hoping we
> > > could also load additional programs (for tail calls) for
> > > non-flow_dissector programs. Could this be an occasion to update the
> > > code in that direction?
> > 
> > Do you mean having the bpftool construct an array for tail calling
> > automatically when loading an object?  Or do a "mass pin" of all
> > programs in an object file?
> > 
> > I'm not convinced about this strategy of auto assembling a tail call
> > array by assuming that a flow dissector object carries programs for
> > protocols in order (apart from the main program which doesn't have to
> > be first, for some reason).
> 
> Not constructing the prog array, I don't think this should be the role of
> bpftool either. Much more a "mass pin", so that you have a link to each
> program loaded from the object file and can later add them to a prog array
> map with subsequent calls to bpftool.
I agree, constructing the jmp_table is a bit fragile with all the
dependencies on the order of the progs. I'll drop that and will send a
v2 that pins all the programs from the obj file instead and offloads
jmp_table construction to the user. So the supposed use case would be
something like the following:

bpftool prog load bpf_flow.o /sys/fs/bpf/flow type flow_dissector
bpftool map update pinned /sys/fs/bpf/flow/jmp_table \
	key ... value pinned /sys/fs/bpf/flow/<PROTO>/0
bpftool map update ...
bpftool prog attach pinned /sys/fs/bpf/flow/flow_dissector/0 flow_dissector
Jakub Kicinski Nov. 7, 2018, 10:51 p.m. UTC | #9
On Wed, 7 Nov 2018 14:15:06 -0800, Stanislav Fomichev wrote:
> On 11/07, Quentin Monnet wrote:
> > 2018-11-07 12:32 UTC-0800 ~ Jakub Kicinski <jakub.kicinski@netronome.com>  
> > > On Wed, 7 Nov 2018 20:08:53 +0000, Quentin Monnet wrote:  
> > > > > +		err = bpf_obj_pin(bpf_program__fd(prog), pinfile);
> > > > > +		if (err) {
> > > > > +			p_err("failed to pin program %s",
> > > > > +			      bpf_program__title(prog, false));
> > > > > +			goto err_close_obj;
> > > > > +		}  
> > > > 
> > > > I don't have the same opinion as Jakub for pinning :). I was hoping we
> > > > could also load additional programs (for tail calls) for
> > > > non-flow_dissector programs. Could this be an occasion to update the
> > > > code in that direction?  
> > > 
> > > Do you mean having the bpftool construct an array for tail calling
> > > automatically when loading an object?  Or do a "mass pin" of all
> > > programs in an object file?
> > > 
> > > I'm not convinced about this strategy of auto assembling a tail call
> > > array by assuming that a flow dissector object carries programs for
> > > protocols in order (apart from the main program which doesn't have to
> > > be first, for some reason).  
> > 
> > Not constructing the prog array, I don't think this should be the role of
> > bpftool either. Much more a "mass pin", so that you have a link to each
> > program loaded from the object file and can later add them to a prog array
> > map with subsequent calls to bpftool.  

Makes sense, specific files named after index or program name/title?
Program name may be nicer?

> I agree, constructing the jmp_table is a bit fragile with all the
> dependencies on the order of the progs. I'll drop that and will send a
> v2 that pins all the programs from the obj file instead and offloads
> jmp_table construction to the user. So the supposed use case would be
> something like the following:
> 
> bpftool prog load bpf_flow.o /sys/fs/bpf/flow type flow_dissector

Okay.  One more thing - how do we differentiate between mass pin and the
existing pin first behaviour?  Should we perhaps add a loadall command
or some flag?

> bpftool map update pinned /sys/fs/bpf/flow/jmp_table \
> 	key ... value pinned /sys/fs/bpf/flow/<PROTO>/0

Interesting, why $dir/$title/0 ?  Perhaps $dir/$title is sufficient?

> bpftool map update ...
> bpftool prog attach pinned /sys/fs/bpf/flow/flow_dissector/0 flow_dissector
Stanislav Fomichev Nov. 7, 2018, 11:12 p.m. UTC | #10
On 11/07, Jakub Kicinski wrote:
> On Wed, 7 Nov 2018 14:15:06 -0800, Stanislav Fomichev wrote:
> > On 11/07, Quentin Monnet wrote:
> > > 2018-11-07 12:32 UTC-0800 ~ Jakub Kicinski <jakub.kicinski@netronome.com>  
> > > > On Wed, 7 Nov 2018 20:08:53 +0000, Quentin Monnet wrote:  
> > > > > > +		err = bpf_obj_pin(bpf_program__fd(prog), pinfile);
> > > > > > +		if (err) {
> > > > > > +			p_err("failed to pin program %s",
> > > > > > +			      bpf_program__title(prog, false));
> > > > > > +			goto err_close_obj;
> > > > > > +		}  
> > > > > 
> > > > > I don't have the same opinion as Jakub for pinning :). I was hoping we
> > > > > could also load additional programs (for tail calls) for
> > > > > non-flow_dissector programs. Could this be an occasion to update the
> > > > > code in that direction?  
> > > > 
> > > > Do you mean having the bpftool construct an array for tail calling
> > > > automatically when loading an object?  Or do a "mass pin" of all
> > > > programs in an object file?
> > > > 
> > > > I'm not convinced about this strategy of auto assembling a tail call
> > > > array by assuming that a flow dissector object carries programs for
> > > > protocols in order (apart from the main program which doesn't have to
> > > > be first, for some reason).  
> > > 
> > > Not constructing the prog array, I don't think this should be the role of
> > > bpftool either. Much more a "mass pin", so that you have a link to each
> > > program loaded from the object file and can later add them to a prog array
> > > map with subsequent calls to bpftool.  
> 
> Makes sense, specific files named after index or program name/title?
> Program name may be nicer?
> 
> > I agree, constructing the jmp_table is a bit fragile with all the
> > dependencies on the order of the progs. I'll drop that and will send a
> > v2 that pins all the programs from the obj file instead and offloads
> > jmp_table construction to the user. So the supposed use case would be
> > something like the following:
> > 
> > bpftool prog load bpf_flow.o /sys/fs/bpf/flow type flow_dissector
> 
> Okay.  One more thing - how do we differentiate between mass pin and the
> existing pin first behaviour?  Should we perhaps add a loadall command
> or some flag?
In v2 I did by program type:
* flow_dissector -> pin all
* not flow_dissector -> pin first?

But we can have loadall or something like:
load OBJ [pinfirst|pinall] FILE|DIR [type TYPE]

If we want to add user control, I'd go with loadall command,
adding more optional flags in between is a mess..

> > bpftool map update pinned /sys/fs/bpf/flow/jmp_table \
> > 	key ... value pinned /sys/fs/bpf/flow/<PROTO>/0
> 
> Interesting, why $dir/$title/0 ?  Perhaps $dir/$title is sufficient?
That's how bpf_program__pin does the pinning, for each program instance.
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/bpf/libbpf.c#n1744

> 
> > bpftool map update ...
> > bpftool prog attach pinned /sys/fs/bpf/flow/flow_dissector/0 flow_dissector
>
Jakub Kicinski Nov. 7, 2018, 11:30 p.m. UTC | #11
On Wed, 7 Nov 2018 15:12:07 -0800, Stanislav Fomichev wrote:
> > > I agree, constructing the jmp_table is a bit fragile with all the
> > > dependencies on the order of the progs. I'll drop that and will send a
> > > v2 that pins all the programs from the obj file instead and offloads
> > > jmp_table construction to the user. So the supposed use case would be
> > > something like the following:
> > > 
> > > bpftool prog load bpf_flow.o /sys/fs/bpf/flow type flow_dissector  
> > 
> > Okay.  One more thing - how do we differentiate between mass pin and the
> > existing pin first behaviour?  Should we perhaps add a loadall command
> > or some flag?  
> In v2 I did by program type:
> * flow_dissector -> pin all
> * not flow_dissector -> pin first?
> 
> But we can have loadall or something like:
> load OBJ [pinfirst|pinall] FILE|DIR [type TYPE]
> 
> If we want to add user control, I'd go with loadall command,
> adding more optional flags in between is a mess..

I think user control would be good.  Agreed on loadall being better.
diff mbox series

Patch

diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
index ac4e904b10fb..3caa9153435b 100644
--- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst
+++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst
@@ -25,8 +25,8 @@  MAP COMMANDS
 |	**bpftool** **prog dump jited**  *PROG* [{**file** *FILE* | **opcodes**}]
 |	**bpftool** **prog pin** *PROG* *FILE*
 |	**bpftool** **prog load** *OBJ* *FILE* [**type** *TYPE*] [**map** {**idx** *IDX* | **name** *NAME*} *MAP*] [**dev** *NAME*]
-|       **bpftool** **prog attach** *PROG* *ATTACH_TYPE* *MAP*
-|       **bpftool** **prog detach** *PROG* *ATTACH_TYPE* *MAP*
+|       **bpftool** **prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
+|       **bpftool** **prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
 |	**bpftool** **prog help**
 |
 |	*MAP* := { **id** *MAP_ID* | **pinned** *FILE* }
@@ -39,7 +39,9 @@  MAP COMMANDS
 |		**cgroup/bind4** | **cgroup/bind6** | **cgroup/post_bind4** | **cgroup/post_bind6** |
 |		**cgroup/connect4** | **cgroup/connect6** | **cgroup/sendmsg4** | **cgroup/sendmsg6**
 |	}
-|       *ATTACH_TYPE* := { **msg_verdict** | **skb_verdict** | **skb_parse** }
+|       *ATTACH_TYPE* := {
+|	|	**msg_verdict** | **skb_verdict** | **skb_parse** | **flow_dissector**
+|	}
 
 
 DESCRIPTION
@@ -97,13 +99,13 @@  DESCRIPTION
 		  contain a dot character ('.'), which is reserved for future
 		  extensions of *bpffs*.
 
-        **bpftool prog attach** *PROG* *ATTACH_TYPE* *MAP*
+        **bpftool prog attach** *PROG* *ATTACH_TYPE* [*MAP*]
                   Attach bpf program *PROG* (with type specified by *ATTACH_TYPE*)
-                  to the map *MAP*.
+                  to the optional map *MAP*.
 
-        **bpftool prog detach** *PROG* *ATTACH_TYPE* *MAP*
+        **bpftool prog detach** *PROG* *ATTACH_TYPE* [*MAP*]
                   Detach bpf program *PROG* (with type specified by *ATTACH_TYPE*)
-                  from the map *MAP*.
+                  from the optional map *MAP*.
 
 	**bpftool prog help**
 		  Print short help message.
diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c
index 25af85304ebe..963881142dfb 100644
--- a/tools/bpf/bpftool/common.c
+++ b/tools/bpf/bpftool/common.c
@@ -169,34 +169,24 @@  int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type)
 	return fd;
 }
 
-int do_pin_fd(int fd, const char *name)
+int mount_bpffs_for_pin(const char *name)
 {
 	char err_str[ERR_MAX_LEN];
 	char *file;
 	char *dir;
 	int err = 0;
 
-	err = bpf_obj_pin(fd, name);
-	if (!err)
-		goto out;
-
 	file = malloc(strlen(name) + 1);
 	strcpy(file, name);
 	dir = dirname(file);
 
-	if (errno != EPERM || is_bpffs(dir)) {
-		p_err("can't pin the object (%s): %s", name, strerror(errno));
+	if (is_bpffs(dir)) {
+		/* nothing to do if already mounted */
 		goto out_free;
 	}
 
-	/* Attempt to mount bpffs, then retry pinning. */
 	err = mnt_bpffs(dir, err_str, ERR_MAX_LEN);
-	if (!err) {
-		err = bpf_obj_pin(fd, name);
-		if (err)
-			p_err("can't pin the object (%s): %s", name,
-			      strerror(errno));
-	} else {
+	if (err) {
 		err_str[ERR_MAX_LEN - 1] = '\0';
 		p_err("can't mount BPF file system to pin the object (%s): %s",
 		      name, err_str);
@@ -204,10 +194,22 @@  int do_pin_fd(int fd, const char *name)
 
 out_free:
 	free(file);
-out:
 	return err;
 }
 
+int do_pin_fd(int fd, const char *name)
+{
+	int err = mount_bpffs_for_pin(name);
+
+	if (err) {
+		p_err("can't mount bpffs for pin %s: %s",
+		      name, strerror(errno));
+		return err;
+	}
+
+	return bpf_obj_pin(fd, name);
+}
+
 int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32))
 {
 	unsigned int id;
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 28322ace2856..1383824c9baf 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -129,6 +129,7 @@  const char *get_fd_type_name(enum bpf_obj_type type);
 char *get_fdinfo(int fd, const char *key);
 int open_obj_pinned(char *path);
 int open_obj_pinned_any(char *path, enum bpf_obj_type exp_type);
+int mount_bpffs_for_pin(const char *name);
 int do_pin_any(int argc, char **argv, int (*get_fd_by_id)(__u32));
 int do_pin_fd(int fd, const char *name);
 
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 5302ee282409..f3a07ec3a444 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -81,6 +81,7 @@  static const char * const attach_type_strings[] = {
 	[BPF_SK_SKB_STREAM_PARSER] = "stream_parser",
 	[BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict",
 	[BPF_SK_MSG_VERDICT] = "msg_verdict",
+	[BPF_FLOW_DISSECTOR] = "flow_dissector",
 	[__MAX_BPF_ATTACH_TYPE] = NULL,
 };
 
@@ -724,9 +725,10 @@  int map_replace_compar(const void *p1, const void *p2)
 static int do_attach(int argc, char **argv)
 {
 	enum bpf_attach_type attach_type;
-	int err, mapfd, progfd;
+	int err, progfd;
+	int mapfd = 0;
 
-	if (!REQ_ARGS(5)) {
+	if (!REQ_ARGS(3)) {
 		p_err("too few parameters for map attach");
 		return -EINVAL;
 	}
@@ -741,10 +743,11 @@  static int do_attach(int argc, char **argv)
 		return -EINVAL;
 	}
 	NEXT_ARG();
-
-	mapfd = map_parse_fd(&argc, &argv);
-	if (mapfd < 0)
-		return mapfd;
+	if (argc > 0) {
+		mapfd = map_parse_fd(&argc, &argv);
+		if (mapfd < 0)
+			return mapfd;
+	}
 
 	err = bpf_prog_attach(progfd, mapfd, attach_type, 0);
 	if (err) {
@@ -760,9 +763,10 @@  static int do_attach(int argc, char **argv)
 static int do_detach(int argc, char **argv)
 {
 	enum bpf_attach_type attach_type;
-	int err, mapfd, progfd;
+	int err, progfd;
+	int mapfd = 0;
 
-	if (!REQ_ARGS(5)) {
+	if (!REQ_ARGS(3)) {
 		p_err("too few parameters for map detach");
 		return -EINVAL;
 	}
@@ -777,10 +781,11 @@  static int do_detach(int argc, char **argv)
 		return -EINVAL;
 	}
 	NEXT_ARG();
-
-	mapfd = map_parse_fd(&argc, &argv);
-	if (mapfd < 0)
-		return mapfd;
+	if (argc > 0) {
+		mapfd = map_parse_fd(&argc, &argv);
+		if (mapfd < 0)
+			return mapfd;
+	}
 
 	err = bpf_prog_detach2(progfd, mapfd, attach_type);
 	if (err) {
@@ -792,6 +797,56 @@  static int do_detach(int argc, char **argv)
 		jsonw_null(json_wtr);
 	return 0;
 }
+
+/* Flow dissector consists of a main program and a jump table for each
+ * supported protocol. The assumption here is that the first prog is the main
+ * one and the other progs are used in the tail calls. In this routine we
+ * build the jump table for the non-main progs.
+ */
+static int build_flow_dissector_jmp_table(struct bpf_object *obj,
+					  struct bpf_program *prog,
+					  const char *jmp_table_map)
+{
+	struct bpf_map *jmp_table;
+	struct bpf_program *pos;
+	int i = 0;
+	int prog_fd, jmp_table_fd, fd;
+
+	prog_fd = bpf_program__fd(prog);
+	if (prog_fd < 0) {
+		p_err("failed to get fd of main prog");
+		return prog_fd;
+	}
+
+	jmp_table = bpf_object__find_map_by_name(obj, jmp_table_map);
+	if (jmp_table == NULL) {
+		p_err("failed to find '%s' map", jmp_table_map);
+		return -1;
+	}
+
+	jmp_table_fd = bpf_map__fd(jmp_table);
+	if (jmp_table_fd < 0) {
+		p_err("failed to get fd of jmp_table");
+		return jmp_table_fd;
+	}
+
+	bpf_object__for_each_program(pos, obj) {
+		fd = bpf_program__fd(pos);
+		if (fd < 0) {
+			p_err("failed to get fd of '%s'",
+			      bpf_program__title(pos, false));
+			return fd;
+		}
+
+		if (fd != prog_fd) {
+			bpf_map_update_elem(jmp_table_fd, &i, &fd, BPF_ANY);
+			++i;
+		}
+	}
+
+	return 0;
+}
+
 static int do_load(int argc, char **argv)
 {
 	enum bpf_attach_type expected_attach_type;
@@ -800,7 +855,7 @@  static int do_load(int argc, char **argv)
 	};
 	struct map_replace *map_replace = NULL;
 	unsigned int old_map_fds = 0;
-	struct bpf_program *prog;
+	struct bpf_program *prog, *pos;
 	struct bpf_object *obj;
 	struct bpf_map *map;
 	const char *pinfile;
@@ -918,13 +973,19 @@  static int do_load(int argc, char **argv)
 		goto err_free_reuse_maps;
 	}
 
-	prog = bpf_program__next(NULL, obj);
+	if (attr.prog_type == BPF_PROG_TYPE_FLOW_DISSECTOR) {
+		/* for the flow dissector type, the entry point is in the
+		 * section flow_dissector; other progs are tail calls
+		 */
+		prog = bpf_object__find_program_by_title(obj, "flow_dissector");
+	} else {
+		prog = bpf_program__next(NULL, obj);
+	}
 	if (!prog) {
 		p_err("object file doesn't contain any bpf program");
 		goto err_close_obj;
 	}
 
-	bpf_program__set_ifindex(prog, ifindex);
 	if (attr.prog_type == BPF_PROG_TYPE_UNSPEC) {
 		const char *sec_name = bpf_program__title(prog, false);
 
@@ -936,8 +997,13 @@  static int do_load(int argc, char **argv)
 			goto err_close_obj;
 		}
 	}
-	bpf_program__set_type(prog, attr.prog_type);
-	bpf_program__set_expected_attach_type(prog, expected_attach_type);
+
+	bpf_object__for_each_program(pos, obj) {
+		bpf_program__set_ifindex(pos, ifindex);
+		bpf_program__set_type(pos, attr.prog_type);
+		bpf_program__set_expected_attach_type(pos,
+						      expected_attach_type);
+	}
 
 	qsort(map_replace, old_map_fds, sizeof(*map_replace),
 	      map_replace_compar);
@@ -1001,8 +1067,34 @@  static int do_load(int argc, char **argv)
 		goto err_close_obj;
 	}
 
-	if (do_pin_fd(bpf_program__fd(prog), pinfile))
+	err = mount_bpffs_for_pin(pinfile);
+	if (err) {
+		p_err("failed to mount bpffs for pin '%s'", pinfile);
 		goto err_close_obj;
+	}
+
+	if (attr.prog_type == BPF_PROG_TYPE_FLOW_DISSECTOR) {
+		err = build_flow_dissector_jmp_table(obj, prog, "jmp_table");
+		if (err) {
+			p_err("failed to build flow dissector jump table");
+			goto err_close_obj;
+		}
+		/* flow dissector consist of multiple programs,
+		 * we want to pin them all
+		 */
+		err = bpf_object__pin(obj, pinfile);
+		if (err) {
+			p_err("failed to pin flow dissector object");
+			goto err_close_obj;
+		}
+	} else {
+		err = bpf_obj_pin(bpf_program__fd(prog), pinfile);
+		if (err) {
+			p_err("failed to pin program %s",
+			      bpf_program__title(prog, false));
+			goto err_close_obj;
+		}
+	}
 
 	if (json_output)
 		jsonw_null(json_wtr);
@@ -1037,8 +1129,8 @@  static int do_help(int argc, char **argv)
 		"       %s %s pin   PROG FILE\n"
 		"       %s %s load  OBJ  FILE [type TYPE] [dev NAME] \\\n"
 		"                         [map { idx IDX | name NAME } MAP]\n"
-		"       %s %s attach PROG ATTACH_TYPE MAP\n"
-		"       %s %s detach PROG ATTACH_TYPE MAP\n"
+		"       %s %s attach PROG ATTACH_TYPE [MAP]\n"
+		"       %s %s detach PROG ATTACH_TYPE [MAP]\n"
 		"       %s %s help\n"
 		"\n"
 		"       " HELP_SPEC_MAP "\n"
@@ -1050,7 +1142,8 @@  static int do_help(int argc, char **argv)
 		"                 cgroup/bind4 | cgroup/bind6 | cgroup/post_bind4 |\n"
 		"                 cgroup/post_bind6 | cgroup/connect4 | cgroup/connect6 |\n"
 		"                 cgroup/sendmsg4 | cgroup/sendmsg6 }\n"
-		"       ATTACH_TYPE := { msg_verdict | skb_verdict | skb_parse }\n"
+		"       ATTACH_TYPE := { msg_verdict | skb_verdict | skb_parse |\n"
+		"                        flow_dissector }\n"
 		"       " HELP_SPEC_OPTIONS "\n"
 		"",
 		bin_name, argv[-2], bin_name, argv[-2], bin_name, argv[-2],