diff mbox series

[bpf-next,v2,3/4] libbpf: add low level TC-BPF API

Message ID 20210419121811.117400-4-memxor@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series Add TC-BPF API | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/cc_maintainers success CCed 10 of 10 maintainers
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 0 this patch: 0
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch fail CHECK: Please don't use multiple blank lines ERROR: space prohibited before that ':' (ctx:WxV) WARNING: Block comments use a trailing */ on a separate line WARNING: line length of 81 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 0 this patch: 0
netdev/header_inline success Link

Commit Message

Kumar Kartikeya Dwivedi April 19, 2021, 12:18 p.m. UTC
This adds functions that wrap the netlink API used for adding,
manipulating, and removing traffic control filters. These functions
operate directly on the loaded prog's fd, and return a handle to the
filter using an out parameter named id.

The basic featureset is covered to allow for attaching, manipulation of
properties, and removal of filters. Some additional features like
TCA_BPF_POLICE and TCA_RATE for tc_cls have been omitted. These can
added on top later by extending the bpf_tc_cls_opts struct.

Support for binding actions directly to a classifier by passing them in
during filter creation has also been omitted for now. These actions have
an auto clean up property because their lifetime is bound to the filter
they are attached to. This can be added later, but was omitted for now
as direct action mode is a better alternative to it, which is enabled by
default.

An API summary:

bpf_tc_act_{attach, change, replace} may be used to attach, change, and
replace SCHED_CLS bpf classifier. The protocol field can be set as 0, in
which case it is subsitituted as ETH_P_ALL by default.

The behavior of the three functions is as follows:

attach = create filter if it does not exist, fail otherwise
change = change properties of the classifier of existing filter
replace = create filter, and replace any existing filter

bpf_tc_cls_detach may be used to detach existing SCHED_CLS
filter. The bpf_tc_cls_attach_id object filled in during attach,
change, or replace must be passed in to the detach functions for them to
remove the filter and its attached classififer correctly.

bpf_tc_cls_get_info is a helper that can be used to obtain attributes
for the filter and classififer. The opts structure may be used to
choose the granularity of search, such that info for a specific filter
corresponding to the same loaded bpf program can be obtained. By
default, the first match is returned to the user.

Examples:

	struct bpf_tc_cls_attach_id id = {};
	struct bpf_object *obj;
	struct bpf_program *p;
	int fd, r;

	obj = bpf_object_open("foo.o");
	if (IS_ERR_OR_NULL(obj))
		return PTR_ERR(obj);

	p = bpf_object__find_program_by_title(obj, "classifier");
	if (IS_ERR_OR_NULL(p))
		return PTR_ERR(p);

	if (bpf_object__load(obj) < 0)
		return -1;

	fd = bpf_program__fd(p);

	r = bpf_tc_cls_attach(fd, if_nametoindex("lo"),
			      BPF_TC_CLSACT_INGRESS,
			      NULL, &id);
	if (r < 0)
		return r;

... which is roughly equivalent to (after clsact qdisc setup):
  # tc filter add dev lo ingress bpf obj foo.o sec classifier da

... as direct action mode is always enabled.

If a user wishes to modify existing options on an attached classifier,
bpf_tc_cls_change API may be used.

Only parameters class_id can be modified, the rest are filled in to
identify the correct filter. protocol can be left out if it was not
chosen explicitly (defaulting to ETH_P_ALL).

Example:

	/* Optional parameters necessary to select the right filter */
	DECLARE_LIBBPF_OPTS(bpf_tc_cls_opts, opts,
			    .handle = id.handle,
			    .priority = id.priority,
			    .chain_index = id.chain_index)
	opts.class_id = TC_H_MAKE(1UL << 16, 12);
	r = bpf_tc_cls_change(fd, if_nametoindex("lo"),
			      BPF_TC_CLSACT_INGRESS,
			      &opts, &id);
	if (r < 0)
		return r;

	struct bpf_tc_cls_info info = {};
	r = bpf_tc_cls_get_info(fd, if_nametoindex("lo"),
			        BPF_TC_CLSACT_INGRESS,
				&opts, &info);
	if (r < 0)
		return r;

	assert(info.class_id == TC_H_MAKE(1UL << 16, 12));

This would be roughly equivalent to doing:
  # tc filter change dev lo egress prio <p> handle <h> bpf obj foo.o sec \
    classifier classid 1:12

... except a new bpf program will be loaded and replace existing one.

Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
---
 tools/lib/bpf/libbpf.h   |  52 ++++++
 tools/lib/bpf/libbpf.map |   5 +
 tools/lib/bpf/netlink.c  | 377 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 428 insertions(+), 6 deletions(-)

Comments

Daniel Borkmann April 19, 2021, 9 p.m. UTC | #1
On 4/19/21 2:18 PM, Kumar Kartikeya Dwivedi wrote:
> This adds functions that wrap the netlink API used for adding,
> manipulating, and removing traffic control filters. These functions
> operate directly on the loaded prog's fd, and return a handle to the
> filter using an out parameter named id.
> 
> The basic featureset is covered to allow for attaching, manipulation of
> properties, and removal of filters. Some additional features like
> TCA_BPF_POLICE and TCA_RATE for tc_cls have been omitted. These can
> added on top later by extending the bpf_tc_cls_opts struct.
> 
> Support for binding actions directly to a classifier by passing them in
> during filter creation has also been omitted for now. These actions have
> an auto clean up property because their lifetime is bound to the filter
> they are attached to. This can be added later, but was omitted for now
> as direct action mode is a better alternative to it, which is enabled by
> default.
> 
> An API summary:
> 
> bpf_tc_act_{attach, change, replace} may be used to attach, change, and

typo on bpf_tc_act_{...} ?
                ^^^
> replace SCHED_CLS bpf classifier. The protocol field can be set as 0, in
> which case it is subsitituted as ETH_P_ALL by default.

Do you have an actual user that needs anything other than ETH_P_ALL? Why is it
even needed? Why not stick to just ETH_P_ALL?

> The behavior of the three functions is as follows:
> 
> attach = create filter if it does not exist, fail otherwise
> change = change properties of the classifier of existing filter
> replace = create filter, and replace any existing filter

This touches on tc oddities quite a bit. Why do we need to expose them? Can't we
simplify/abstract this e.g. i) create or update instance, ii) delete instance,
iii) get instance ? What concrete use case do you have that you need those three
above?

> bpf_tc_cls_detach may be used to detach existing SCHED_CLS
> filter. The bpf_tc_cls_attach_id object filled in during attach,
> change, or replace must be passed in to the detach functions for them to
> remove the filter and its attached classififer correctly.
> 
> bpf_tc_cls_get_info is a helper that can be used to obtain attributes
> for the filter and classififer. The opts structure may be used to
> choose the granularity of search, such that info for a specific filter
> corresponding to the same loaded bpf program can be obtained. By
> default, the first match is returned to the user.
> 
> Examples:
> 
> 	struct bpf_tc_cls_attach_id id = {};
> 	struct bpf_object *obj;
> 	struct bpf_program *p;
> 	int fd, r;
> 
> 	obj = bpf_object_open("foo.o");
> 	if (IS_ERR_OR_NULL(obj))
> 		return PTR_ERR(obj);
> 
> 	p = bpf_object__find_program_by_title(obj, "classifier");
> 	if (IS_ERR_OR_NULL(p))
> 		return PTR_ERR(p);
> 
> 	if (bpf_object__load(obj) < 0)
> 		return -1;
> 
> 	fd = bpf_program__fd(p);
> 
> 	r = bpf_tc_cls_attach(fd, if_nametoindex("lo"),
> 			      BPF_TC_CLSACT_INGRESS,
> 			      NULL, &id);
> 	if (r < 0)
> 		return r;
> 
> ... which is roughly equivalent to (after clsact qdisc setup):
>    # tc filter add dev lo ingress bpf obj foo.o sec classifier da
> 
> ... as direct action mode is always enabled.
> 
> If a user wishes to modify existing options on an attached classifier,
> bpf_tc_cls_change API may be used.
> 
> Only parameters class_id can be modified, the rest are filled in to
> identify the correct filter. protocol can be left out if it was not
> chosen explicitly (defaulting to ETH_P_ALL).
> 
> Example:
> 
> 	/* Optional parameters necessary to select the right filter */
> 	DECLARE_LIBBPF_OPTS(bpf_tc_cls_opts, opts,
> 			    .handle = id.handle,
> 			    .priority = id.priority,
> 			    .chain_index = id.chain_index)

Why do we need chain_index as part of the basic API?

> 	opts.class_id = TC_H_MAKE(1UL << 16, 12);
> 	r = bpf_tc_cls_change(fd, if_nametoindex("lo"),
> 			      BPF_TC_CLSACT_INGRESS,
> 			      &opts, &id);

Also, I'm not sure whether the prefix should even be named  bpf_tc_cls_*() tbh,
yes, despite being "low level" api. I think in the context of bpf we should stop
regarding this as 'classifier' and 'action' objects since it's really just a
single entity and not separate ones. It's weird enough to explain this concept
to new users and if a libbpf based api could cleanly abstract it, I would be all
for it. I don't think we need to map 1:1 the old tc legacy even in the low level
api, tbh, as it feels backwards. I think the 'handle' & 'priority' bits are okay,
but I would remove the others.

> 	if (r < 0)
> 		return r;
> 
> 	struct bpf_tc_cls_info info = {};
> 	r = bpf_tc_cls_get_info(fd, if_nametoindex("lo"),
> 			        BPF_TC_CLSACT_INGRESS,
> 				&opts, &info);
> 	if (r < 0)
> 		return r;
> 
> 	assert(info.class_id == TC_H_MAKE(1UL << 16, 12));
> 
> This would be roughly equivalent to doing:
>    # tc filter change dev lo egress prio <p> handle <h> bpf obj foo.o sec \
>      classifier classid 1:12

Why even bother to support !da mode, what are you trying to solve with it? I
don't think official libbpf api should support something that doesn't scale.

> ... except a new bpf program will be loaded and replace existing one.
> 
> Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Toke Høiland-Jørgensen April 19, 2021, 9:43 p.m. UTC | #2
Daniel Borkmann <daniel@iogearbox.net> writes:

> On 4/19/21 2:18 PM, Kumar Kartikeya Dwivedi wrote:
>> This adds functions that wrap the netlink API used for adding,
>> manipulating, and removing traffic control filters. These functions
>> operate directly on the loaded prog's fd, and return a handle to the
>> filter using an out parameter named id.
>> 
>> The basic featureset is covered to allow for attaching, manipulation of
>> properties, and removal of filters. Some additional features like
>> TCA_BPF_POLICE and TCA_RATE for tc_cls have been omitted. These can
>> added on top later by extending the bpf_tc_cls_opts struct.
>> 
>> Support for binding actions directly to a classifier by passing them in
>> during filter creation has also been omitted for now. These actions have
>> an auto clean up property because their lifetime is bound to the filter
>> they are attached to. This can be added later, but was omitted for now
>> as direct action mode is a better alternative to it, which is enabled by
>> default.
>> 
>> An API summary:
>> 
>> bpf_tc_act_{attach, change, replace} may be used to attach, change, and
>
> typo on bpf_tc_act_{...} ?
>                 ^^^
>> replace SCHED_CLS bpf classifier. The protocol field can be set as 0, in
>> which case it is subsitituted as ETH_P_ALL by default.
>
> Do you have an actual user that needs anything other than ETH_P_ALL? Why is it
> even needed? Why not stick to just ETH_P_ALL?
>
>> The behavior of the three functions is as follows:
>> 
>> attach = create filter if it does not exist, fail otherwise
>> change = change properties of the classifier of existing filter
>> replace = create filter, and replace any existing filter
>
> This touches on tc oddities quite a bit. Why do we need to expose them? Can't we
> simplify/abstract this e.g. i) create or update instance, ii) delete instance,
> iii) get instance ? What concrete use case do you have that you need those three
> above?
>
>> bpf_tc_cls_detach may be used to detach existing SCHED_CLS
>> filter. The bpf_tc_cls_attach_id object filled in during attach,
>> change, or replace must be passed in to the detach functions for them to
>> remove the filter and its attached classififer correctly.
>> 
>> bpf_tc_cls_get_info is a helper that can be used to obtain attributes
>> for the filter and classififer. The opts structure may be used to
>> choose the granularity of search, such that info for a specific filter
>> corresponding to the same loaded bpf program can be obtained. By
>> default, the first match is returned to the user.
>> 
>> Examples:
>> 
>> 	struct bpf_tc_cls_attach_id id = {};
>> 	struct bpf_object *obj;
>> 	struct bpf_program *p;
>> 	int fd, r;
>> 
>> 	obj = bpf_object_open("foo.o");
>> 	if (IS_ERR_OR_NULL(obj))
>> 		return PTR_ERR(obj);
>> 
>> 	p = bpf_object__find_program_by_title(obj, "classifier");
>> 	if (IS_ERR_OR_NULL(p))
>> 		return PTR_ERR(p);
>> 
>> 	if (bpf_object__load(obj) < 0)
>> 		return -1;
>> 
>> 	fd = bpf_program__fd(p);
>> 
>> 	r = bpf_tc_cls_attach(fd, if_nametoindex("lo"),
>> 			      BPF_TC_CLSACT_INGRESS,
>> 			      NULL, &id);
>> 	if (r < 0)
>> 		return r;
>> 
>> ... which is roughly equivalent to (after clsact qdisc setup):
>>    # tc filter add dev lo ingress bpf obj foo.o sec classifier da
>> 
>> ... as direct action mode is always enabled.
>> 
>> If a user wishes to modify existing options on an attached classifier,
>> bpf_tc_cls_change API may be used.
>> 
>> Only parameters class_id can be modified, the rest are filled in to
>> identify the correct filter. protocol can be left out if it was not
>> chosen explicitly (defaulting to ETH_P_ALL).
>> 
>> Example:
>> 
>> 	/* Optional parameters necessary to select the right filter */
>> 	DECLARE_LIBBPF_OPTS(bpf_tc_cls_opts, opts,
>> 			    .handle = id.handle,
>> 			    .priority = id.priority,
>> 			    .chain_index = id.chain_index)
>
> Why do we need chain_index as part of the basic API?
>
>> 	opts.class_id = TC_H_MAKE(1UL << 16, 12);
>> 	r = bpf_tc_cls_change(fd, if_nametoindex("lo"),
>> 			      BPF_TC_CLSACT_INGRESS,
>> 			      &opts, &id);
>
> Also, I'm not sure whether the prefix should even be named  bpf_tc_cls_*() tbh,
> yes, despite being "low level" api. I think in the context of bpf we should stop
> regarding this as 'classifier' and 'action' objects since it's really just a
> single entity and not separate ones. It's weird enough to explain this concept
> to new users and if a libbpf based api could cleanly abstract it, I would be all
> for it. I don't think we need to map 1:1 the old tc legacy even in the low level
> api, tbh, as it feels backwards. I think the 'handle' & 'priority' bits are okay,
> but I would remove the others.

Hmm, I'm OK with dropping the TC oddities (including the cls_ in the
name), but I think we should be documenting it so that users that do
come from TC will not be completely lost :)

-Toke
Kumar Kartikeya Dwivedi April 19, 2021, 9:45 p.m. UTC | #3
On Tue, Apr 20, 2021 at 02:30:44AM IST, Daniel Borkmann wrote:
> On 4/19/21 2:18 PM, Kumar Kartikeya Dwivedi wrote:
> > This adds functions that wrap the netlink API used for adding,
> > manipulating, and removing traffic control filters. These functions
> > operate directly on the loaded prog's fd, and return a handle to the
> > filter using an out parameter named id.
> >
> > The basic featureset is covered to allow for attaching, manipulation of
> > properties, and removal of filters. Some additional features like
> > TCA_BPF_POLICE and TCA_RATE for tc_cls have been omitted. These can
> > added on top later by extending the bpf_tc_cls_opts struct.
> >
> > Support for binding actions directly to a classifier by passing them in
> > during filter creation has also been omitted for now. These actions have
> > an auto clean up property because their lifetime is bound to the filter
> > they are attached to. This can be added later, but was omitted for now
> > as direct action mode is a better alternative to it, which is enabled by
> > default.
> >
> > An API summary:
> >
> > bpf_tc_act_{attach, change, replace} may be used to attach, change, and
>
> typo on bpf_tc_act_{...} ?
>

Oops, yes. Should be bpf_tc_cls_...

> > replace SCHED_CLS bpf classifier. The protocol field can be set as 0, in
> > which case it is subsitituted as ETH_P_ALL by default.
>
> Do you have an actual user that needs anything other than ETH_P_ALL? Why is it
> even needed? Why not stick to just ETH_P_ALL?
>

Mostly because it was little to no effort to expose this. Though if you feel
strongly about it I can drop the protocol option, and just bake in ETH_P_ALL. It
can always be added later ofcourse, if the need arises in the future.

> > The behavior of the three functions is as follows:
> >
> > attach = create filter if it does not exist, fail otherwise
> > change = change properties of the classifier of existing filter
> > replace = create filter, and replace any existing filter
>
> This touches on tc oddities quite a bit. Why do we need to expose them? Can't we
> simplify/abstract this e.g. i) create or update instance, ii) delete instance,
> iii) get instance ? What concrete use case do you have that you need those three
> above?
>

'change' is relevant for modifying classifier specific options, and given it's
a lot less useful now as per the current state of this patch, I am fine with
removing it. This is also where the distinction becomes visible to the user, so
removing it should hide the filter/classifier separation.

> > bpf_tc_cls_detach may be used to detach existing SCHED_CLS
> > filter. The bpf_tc_cls_attach_id object filled in during attach,
> > change, or replace must be passed in to the detach functions for them to
> > remove the filter and its attached classififer correctly.
> >
> > bpf_tc_cls_get_info is a helper that can be used to obtain attributes
> > for the filter and classififer. The opts structure may be used to
> > choose the granularity of search, such that info for a specific filter
> > corresponding to the same loaded bpf program can be obtained. By
> > default, the first match is returned to the user.
> >
> > Examples:
> >
> > 	struct bpf_tc_cls_attach_id id = {};
> > 	struct bpf_object *obj;
> > 	struct bpf_program *p;
> > 	int fd, r;
> >
> > 	obj = bpf_object_open("foo.o");
> > 	if (IS_ERR_OR_NULL(obj))
> > 		return PTR_ERR(obj);
> >
> > 	p = bpf_object__find_program_by_title(obj, "classifier");
> > 	if (IS_ERR_OR_NULL(p))
> > 		return PTR_ERR(p);
> >
> > 	if (bpf_object__load(obj) < 0)
> > 		return -1;
> >
> > 	fd = bpf_program__fd(p);
> >
> > 	r = bpf_tc_cls_attach(fd, if_nametoindex("lo"),
> > 			      BPF_TC_CLSACT_INGRESS,
> > 			      NULL, &id);
> > 	if (r < 0)
> > 		return r;
> >
> > ... which is roughly equivalent to (after clsact qdisc setup):
> >    # tc filter add dev lo ingress bpf obj foo.o sec classifier da
> >
> > ... as direct action mode is always enabled.
> >
> > If a user wishes to modify existing options on an attached classifier,
> > bpf_tc_cls_change API may be used.
> >
> > Only parameters class_id can be modified, the rest are filled in to
> > identify the correct filter. protocol can be left out if it was not
> > chosen explicitly (defaulting to ETH_P_ALL).
> >
> > Example:
> >
> > 	/* Optional parameters necessary to select the right filter */
> > 	DECLARE_LIBBPF_OPTS(bpf_tc_cls_opts, opts,
> > 			    .handle = id.handle,
> > 			    .priority = id.priority,
> > 			    .chain_index = id.chain_index)
>
> Why do we need chain_index as part of the basic API?
>

It would be relevant when TC_ACT_GOTO_CHAIN is used. Other than that, I guess
it's not very useful.

> > 	opts.class_id = TC_H_MAKE(1UL << 16, 12);
> > 	r = bpf_tc_cls_change(fd, if_nametoindex("lo"),
> > 			      BPF_TC_CLSACT_INGRESS,
> > 			      &opts, &id);
>
> Also, I'm not sure whether the prefix should even be named  bpf_tc_cls_*() tbh,
> yes, despite being "low level" api. I think in the context of bpf we should stop
> regarding this as 'classifier' and 'action' objects since it's really just a
> single entity and not separate ones. It's weird enough to explain this concept
> to new users and if a libbpf based api could cleanly abstract it, I would be all
> for it. I don't think we need to map 1:1 the old tc legacy even in the low level
> api, tbh, as it feels backwards. I think the 'handle' & 'priority' bits are okay,
> but I would remove the others.
>

Ok, would dropping _cls from the name be better?
bpf_tc_attach
bpf_tc_replace
bpf_tc_get_info
bpf_tc_detach

As for options, I'll drop protocol, if you feel strongly about chain_index I can
drop that one too.

> > 	if (r < 0)
> > 		return r;
> >
> > 	struct bpf_tc_cls_info info = {};
> > 	r = bpf_tc_cls_get_info(fd, if_nametoindex("lo"),
> > 			        BPF_TC_CLSACT_INGRESS,
> > 				&opts, &info);
> > 	if (r < 0)
> > 		return r;
> >
> > 	assert(info.class_id == TC_H_MAKE(1UL << 16, 12));
> >
> > This would be roughly equivalent to doing:
> >    # tc filter change dev lo egress prio <p> handle <h> bpf obj foo.o sec \
> >      classifier classid 1:12
>
> Why even bother to support !da mode, what are you trying to solve with it? I
> don't think official libbpf api should support something that doesn't scale.
>

da is default now, this is yet another typo/oversight...

--
Kartikeya
Daniel Borkmann April 19, 2021, 9:57 p.m. UTC | #4
On 4/19/21 11:43 PM, Toke Høiland-Jørgensen wrote:
> Daniel Borkmann <daniel@iogearbox.net> writes:
>> On 4/19/21 2:18 PM, Kumar Kartikeya Dwivedi wrote:
>>> This adds functions that wrap the netlink API used for adding,
>>> manipulating, and removing traffic control filters. These functions
>>> operate directly on the loaded prog's fd, and return a handle to the
>>> filter using an out parameter named id.
>>>
>>> The basic featureset is covered to allow for attaching, manipulation of
>>> properties, and removal of filters. Some additional features like
>>> TCA_BPF_POLICE and TCA_RATE for tc_cls have been omitted. These can
>>> added on top later by extending the bpf_tc_cls_opts struct.
>>>
>>> Support for binding actions directly to a classifier by passing them in
>>> during filter creation has also been omitted for now. These actions have
>>> an auto clean up property because their lifetime is bound to the filter
>>> they are attached to. This can be added later, but was omitted for now
>>> as direct action mode is a better alternative to it, which is enabled by
>>> default.
>>>
>>> An API summary:
>>>
>>> bpf_tc_act_{attach, change, replace} may be used to attach, change, and
>>
>> typo on bpf_tc_act_{...} ?
>>                  ^^^
>>> replace SCHED_CLS bpf classifier. The protocol field can be set as 0, in
>>> which case it is subsitituted as ETH_P_ALL by default.
>>
>> Do you have an actual user that needs anything other than ETH_P_ALL? Why is it
>> even needed? Why not stick to just ETH_P_ALL?
>>
>>> The behavior of the three functions is as follows:
>>>
>>> attach = create filter if it does not exist, fail otherwise
>>> change = change properties of the classifier of existing filter
>>> replace = create filter, and replace any existing filter
>>
>> This touches on tc oddities quite a bit. Why do we need to expose them? Can't we
>> simplify/abstract this e.g. i) create or update instance, ii) delete instance,
>> iii) get instance ? What concrete use case do you have that you need those three
>> above?
>>
>>> bpf_tc_cls_detach may be used to detach existing SCHED_CLS
>>> filter. The bpf_tc_cls_attach_id object filled in during attach,
>>> change, or replace must be passed in to the detach functions for them to
>>> remove the filter and its attached classififer correctly.
>>>
>>> bpf_tc_cls_get_info is a helper that can be used to obtain attributes
>>> for the filter and classififer. The opts structure may be used to
>>> choose the granularity of search, such that info for a specific filter
>>> corresponding to the same loaded bpf program can be obtained. By
>>> default, the first match is returned to the user.
>>>
>>> Examples:
>>>
>>> 	struct bpf_tc_cls_attach_id id = {};
>>> 	struct bpf_object *obj;
>>> 	struct bpf_program *p;
>>> 	int fd, r;
>>>
>>> 	obj = bpf_object_open("foo.o");
>>> 	if (IS_ERR_OR_NULL(obj))
>>> 		return PTR_ERR(obj);
>>>
>>> 	p = bpf_object__find_program_by_title(obj, "classifier");
>>> 	if (IS_ERR_OR_NULL(p))
>>> 		return PTR_ERR(p);
>>>
>>> 	if (bpf_object__load(obj) < 0)
>>> 		return -1;
>>>
>>> 	fd = bpf_program__fd(p);
>>>
>>> 	r = bpf_tc_cls_attach(fd, if_nametoindex("lo"),
>>> 			      BPF_TC_CLSACT_INGRESS,
>>> 			      NULL, &id);
>>> 	if (r < 0)
>>> 		return r;
>>>
>>> ... which is roughly equivalent to (after clsact qdisc setup):
>>>     # tc filter add dev lo ingress bpf obj foo.o sec classifier da
>>>
>>> ... as direct action mode is always enabled.
>>>
>>> If a user wishes to modify existing options on an attached classifier,
>>> bpf_tc_cls_change API may be used.
>>>
>>> Only parameters class_id can be modified, the rest are filled in to
>>> identify the correct filter. protocol can be left out if it was not
>>> chosen explicitly (defaulting to ETH_P_ALL).
>>>
>>> Example:
>>>
>>> 	/* Optional parameters necessary to select the right filter */
>>> 	DECLARE_LIBBPF_OPTS(bpf_tc_cls_opts, opts,
>>> 			    .handle = id.handle,
>>> 			    .priority = id.priority,
>>> 			    .chain_index = id.chain_index)
>>
>> Why do we need chain_index as part of the basic API?
>>
>>> 	opts.class_id = TC_H_MAKE(1UL << 16, 12);
>>> 	r = bpf_tc_cls_change(fd, if_nametoindex("lo"),
>>> 			      BPF_TC_CLSACT_INGRESS,
>>> 			      &opts, &id);
>>
>> Also, I'm not sure whether the prefix should even be named  bpf_tc_cls_*() tbh,
>> yes, despite being "low level" api. I think in the context of bpf we should stop
>> regarding this as 'classifier' and 'action' objects since it's really just a
>> single entity and not separate ones. It's weird enough to explain this concept
>> to new users and if a libbpf based api could cleanly abstract it, I would be all
>> for it. I don't think we need to map 1:1 the old tc legacy even in the low level
>> api, tbh, as it feels backwards. I think the 'handle' & 'priority' bits are okay,
>> but I would remove the others.
> 
> Hmm, I'm OK with dropping the TC oddities (including the cls_ in the
> name), but I think we should be documenting it so that users that do
> come from TC will not be completely lost :)

Yeah, that sounds good to me. :) All I'm trying to say is that /we/ are used to the
terminology and quirks that come with it, but I'm hoping that such API will be used
by *new* folks who have zero context on the underlying details, and they also really
shouldn't have to care. Even if on the lower level we expose handle/priority at least,
the API should be designed with a mindset of no prior tc experience required. Simple
and easy to use. I think the handle/priority concept can be explained/documented fairly
straight forward. There is a chance to hide all the nasty historic/legacy details in
something clean, easy to use and scaleable (implied by the da mode), we should use this
opportunity. ;)

Thanks,
Daniel
diff mbox series

Patch

diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index bec4e6a6e31d..2f4a2036cb74 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -16,6 +16,9 @@ 
 #include <stdbool.h>
 #include <sys/types.h>  // for size_t
 #include <linux/bpf.h>
+#include <linux/pkt_cls.h>
+#include <linux/pkt_sched.h>
+#include <linux/tc_act/tc_bpf.h>
 
 #include "libbpf_common.h"
 
@@ -775,6 +778,55 @@  LIBBPF_API int bpf_linker__add_file(struct bpf_linker *linker, const char *filen
 LIBBPF_API int bpf_linker__finalize(struct bpf_linker *linker);
 LIBBPF_API void bpf_linker__free(struct bpf_linker *linker);
 
+/* Convenience macros for the clsact attach hooks */
+#define BPF_TC_CLSACT_INGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_INGRESS)
+#define BPF_TC_CLSACT_EGRESS TC_H_MAKE(TC_H_CLSACT, TC_H_MIN_EGRESS)
+
+struct bpf_tc_cls_opts {
+	size_t sz;
+	__u32 protocol;
+	__u32 handle;
+	__u32 chain_index;
+	__u32 priority;
+	__u32 class_id;
+	size_t :0;
+};
+
+#define bpf_tc_cls_opts__last_field class_id
+
+/* Acts as a handle for an attached filter */
+struct bpf_tc_cls_attach_id {
+	__u32 protocol;
+	__u32 chain_index;
+	__u32 handle;
+	__u32 priority;
+};
+
+struct bpf_tc_cls_info {
+	struct bpf_tc_cls_attach_id id;
+	__u32 prog_id;
+	__u8 tag[BPF_TAG_SIZE];
+	__u32 class_id;
+	__u32 bpf_flags;
+	__u32 bpf_flags_gen;
+};
+
+/* id is out parameter that will be written to, it must not be NULL */
+LIBBPF_API int bpf_tc_cls_attach(int fd, __u32 ifindex, __u32 parent_id,
+				 const struct bpf_tc_cls_opts *opts,
+				 struct bpf_tc_cls_attach_id *id);
+LIBBPF_API int bpf_tc_cls_change(int fd, __u32 ifindex, __u32 parent_id,
+				 const struct bpf_tc_cls_opts *opts,
+				 struct bpf_tc_cls_attach_id *id);
+LIBBPF_API int bpf_tc_cls_replace(int fd, __u32 ifindex, __u32 parent_id,
+				  const struct bpf_tc_cls_opts *opts,
+				  struct bpf_tc_cls_attach_id *id);
+LIBBPF_API int bpf_tc_cls_detach(__u32 ifindex, __u32 parent_id,
+				 const struct bpf_tc_cls_attach_id *id);
+LIBBPF_API int bpf_tc_cls_get_info(int fd, __u32 ifindex, __u32 parent_id,
+				   const struct bpf_tc_cls_opts *opts,
+				   struct bpf_tc_cls_info *info);
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index b9b29baf1df8..52e5de1e82ea 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -361,4 +361,9 @@  LIBBPF_0.4.0 {
 		bpf_linker__new;
 		bpf_map__inner_map;
 		bpf_object__set_kversion;
+		bpf_tc_cls_attach;
+		bpf_tc_cls_change;
+		bpf_tc_cls_detach;
+		bpf_tc_cls_replace;
+		bpf_tc_cls_get_info;
 } LIBBPF_0.3.0;
diff --git a/tools/lib/bpf/netlink.c b/tools/lib/bpf/netlink.c
index c79e30484e81..93cc1e027065 100644
--- a/tools/lib/bpf/netlink.c
+++ b/tools/lib/bpf/netlink.c
@@ -4,7 +4,11 @@ 
 #include <stdlib.h>
 #include <memory.h>
 #include <unistd.h>
+#include <inttypes.h>
+#include <arpa/inet.h>
 #include <linux/bpf.h>
+#include <linux/pkt_cls.h>
+#include <linux/if_ether.h>
 #include <linux/rtnetlink.h>
 #include <sys/socket.h>
 #include <errno.h>
@@ -131,6 +135,41 @@  static int bpf_netlink_recv(int sock, __u32 nl_pid, int seq,
 	return ret;
 }
 
+static int tc_setup_clsact_excl(int sock, __u32 nl_pid, __u32 ifindex)
+{
+	int seq = 0, ret = 0;
+	struct {
+		struct nlmsghdr nh;
+		struct tcmsg t;
+		char buf[256];
+	} req;
+
+	memset(&req, 0, sizeof(req));
+	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+	req.nh.nlmsg_flags =
+		NLM_F_REQUEST | NLM_F_CREATE | NLM_F_ACK | NLM_F_EXCL;
+	req.nh.nlmsg_type = RTM_NEWQDISC;
+	req.nh.nlmsg_pid = 0;
+	req.nh.nlmsg_seq = ++seq;
+	req.t.tcm_family = AF_UNSPEC;
+	req.t.tcm_ifindex = ifindex;
+	req.t.tcm_parent = TC_H_CLSACT;
+	req.t.tcm_handle = TC_H_MAKE(TC_H_CLSACT, 0);
+
+	ret = nlattr_add(&req.nh, sizeof(req), TCA_KIND, "clsact",
+			 sizeof("clsact"));
+	if (ret < 0)
+		return ret;
+
+	ret = send(sock, &req.nh, req.nh.nlmsg_len, 0);
+	if (ret < 0)
+		return ret;
+
+	ret = bpf_netlink_recv(sock, nl_pid, seq, NULL, NULL, NULL);
+
+	return ret;
+}
+
 static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd,
 					 __u32 flags)
 {
@@ -344,6 +383,20 @@  int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags)
 	return ret;
 }
 
+static int bpf_nl_get_ext(struct nlmsghdr *nh, int sock, unsigned int nl_pid,
+			  __dump_nlmsg_t dump_link_nlmsg_p,
+			  libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie)
+{
+	int seq = time(NULL);
+
+	nh->nlmsg_seq = seq;
+	if (send(sock, nh, nh->nlmsg_len, 0) < 0)
+		return -errno;
+
+	return bpf_netlink_recv(sock, nl_pid, seq, dump_link_nlmsg_p,
+				dump_link_nlmsg, cookie);
+}
+
 int libbpf_nl_get_link(int sock, unsigned int nl_pid,
 		       libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie)
 {
@@ -356,12 +409,324 @@  int libbpf_nl_get_link(int sock, unsigned int nl_pid,
 		.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST,
 		.ifm.ifi_family = AF_PACKET,
 	};
-	int seq = time(NULL);
 
-	req.nlh.nlmsg_seq = seq;
-	if (send(sock, &req, req.nlh.nlmsg_len, 0) < 0)
-		return -errno;
+	return bpf_nl_get_ext(&req.nlh, sock, nl_pid, __dump_link_nlmsg,
+			      dump_link_nlmsg, cookie);
+}
 
-	return bpf_netlink_recv(sock, nl_pid, seq, __dump_link_nlmsg,
-				dump_link_nlmsg, cookie);
+static int tc_bpf_add_fd_and_name(struct nlmsghdr *nh, size_t maxsz, int fd)
+{
+	struct bpf_prog_info info = {};
+	__u32 info_len = sizeof(info);
+	char name[256] = {};
+	int len, ret;
+
+	ret = bpf_obj_get_info_by_fd(fd, &info, &info_len);
+	if (ret < 0)
+		return ret;
+
+	ret = nlattr_add(nh, maxsz, TCA_BPF_FD, &fd, sizeof(fd));
+	if (ret < 0)
+		return ret;
+
+	len = snprintf(name, sizeof(name), "%s:[%" PRIu32 "]", info.name,
+		       info.id);
+	if (len < 0 || len >= sizeof(name))
+		return len < 0 ? -EINVAL : -ENAMETOOLONG;
+
+	return nlattr_add(nh, maxsz, TCA_BPF_NAME, name, len + 1);
+}
+
+struct pass_info {
+	struct bpf_tc_cls_info *info;
+	__u32 prog_id;
+};
+
+static int cls_get_info(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn,
+			void *cookie);
+
+static int tc_cls_bpf_modify(int fd, int cmd, unsigned int flags, __u32 ifindex,
+			     __u32 parent_id, const struct bpf_tc_cls_opts *opts,
+			     __dump_nlmsg_t fn, struct bpf_tc_cls_attach_id *id)
+{
+	struct bpf_tc_cls_info info = {};
+	unsigned int bpf_flags = 0;
+	__u32 nl_pid = 0, protocol;
+	int sock, seq = 0, ret;
+	struct nlattr *nla;
+	struct {
+		struct nlmsghdr nh;
+		struct tcmsg t;
+		char buf[256];
+	} req;
+
+	if (OPTS_GET(opts, priority, 0) > 0xFFFF)
+		return -EINVAL;
+
+	sock = libbpf_netlink_open(&nl_pid);
+	if (sock < 0)
+		return sock;
+
+	if ((parent_id == BPF_TC_CLSACT_INGRESS ||
+	     parent_id == BPF_TC_CLSACT_EGRESS) &&
+	    flags & NLM_F_CREATE) {
+		ret = tc_setup_clsact_excl(sock, nl_pid, ifindex);
+		/* attachment can still fail if ingress qdisc is installed, and
+		 * we're trying attach on egress as parent */
+		if (ret < 0 && ret != -EEXIST)
+			goto end;
+	}
+
+	protocol = OPTS_GET(opts, protocol, 0) ?: ETH_P_ALL;
+
+	memset(&req, 0, sizeof(req));
+	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | flags;
+	req.nh.nlmsg_type = cmd;
+	req.nh.nlmsg_pid = 0;
+	req.nh.nlmsg_seq = ++seq;
+	req.t.tcm_family = AF_UNSPEC;
+	req.t.tcm_handle = OPTS_GET(opts, handle, 0);
+	req.t.tcm_parent = parent_id;
+	req.t.tcm_ifindex = ifindex;
+	req.t.tcm_info =
+		TC_H_MAKE(OPTS_GET(opts, priority, 0UL) << 16, htons(protocol));
+
+	if (OPTS_HAS(opts, chain_index)) {
+		ret = nlattr_add(&req.nh, sizeof(req), TCA_CHAIN,
+				 &opts->chain_index, sizeof(opts->chain_index));
+		if (ret < 0)
+			goto end;
+	}
+
+	ret = nlattr_add(&req.nh, sizeof(req), TCA_KIND, "bpf", sizeof("bpf"));
+	if (ret < 0)
+		goto end;
+
+	nla = nlattr_begin_nested(&req.nh, sizeof(req), TCA_OPTIONS);
+	if (!nla) {
+		ret = -EMSGSIZE;
+		goto end;
+	}
+
+	if (OPTS_GET(opts, class_id, TC_H_UNSPEC)) {
+		ret = nlattr_add(&req.nh, sizeof(req), TCA_BPF_CLASSID,
+				 &opts->class_id, sizeof(opts->class_id));
+		if (ret < 0)
+			goto end;
+	}
+
+	if (cmd != RTM_DELTFILTER) {
+		ret = tc_bpf_add_fd_and_name(&req.nh, sizeof(req), fd);
+		if (ret < 0)
+			goto end;
+
+		/* direct action is always set */
+		bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT;
+		ret = nlattr_add(&req.nh, sizeof(req), TCA_BPF_FLAGS,
+				 &bpf_flags, sizeof(bpf_flags));
+		if (ret < 0)
+			goto end;
+	}
+
+	nlattr_end_nested(&req.nh, nla);
+
+	ret = send(sock, &req.nh, req.nh.nlmsg_len, 0);
+	if (ret < 0)
+		goto end;
+
+	ret = bpf_netlink_recv(sock, nl_pid, seq, fn, NULL,
+			       &(struct pass_info){ &info, 0 });
+
+	if (fn)
+		*id = info.id;
+
+end:
+	close(sock);
+	return ret;
+}
+
+int bpf_tc_cls_attach(int fd, __u32 ifindex, __u32 parent_id,
+		      const struct bpf_tc_cls_opts *opts,
+		      struct bpf_tc_cls_attach_id *id)
+{
+	if (fd < 0 || !OPTS_VALID(opts, bpf_tc_cls_opts) || !id)
+		return -EINVAL;
+
+	return tc_cls_bpf_modify(fd, RTM_NEWTFILTER,
+				 NLM_F_ECHO | NLM_F_EXCL | NLM_F_CREATE,
+				 ifindex, parent_id, opts, cls_get_info, id);
+}
+
+int bpf_tc_cls_change(int fd, __u32 ifindex, __u32 parent_id,
+		      const struct bpf_tc_cls_opts *opts,
+		      struct bpf_tc_cls_attach_id *id)
+{
+	if (fd < 0 || !OPTS_VALID(opts, bpf_tc_cls_opts) || !id)
+		return -EINVAL;
+
+	return tc_cls_bpf_modify(fd, RTM_NEWTFILTER, NLM_F_ECHO, ifindex,
+				 parent_id, opts, cls_get_info, id);
+}
+
+int bpf_tc_cls_replace(int fd, __u32 ifindex, __u32 parent_id,
+		       const struct bpf_tc_cls_opts *opts,
+		       struct bpf_tc_cls_attach_id *id)
+{
+	if (fd < 0 || !OPTS_VALID(opts, bpf_tc_cls_opts) || !id)
+		return -EINVAL;
+
+	return tc_cls_bpf_modify(fd, RTM_NEWTFILTER, NLM_F_ECHO | NLM_F_CREATE,
+				 ifindex, parent_id, opts, cls_get_info, id);
+}
+
+int bpf_tc_cls_detach(__u32 ifindex, __u32 parent_id,
+		      const struct bpf_tc_cls_attach_id *id)
+{
+	DECLARE_LIBBPF_OPTS(bpf_tc_cls_opts, opts, 0);
+
+	if (!id)
+		return -EINVAL;
+
+	opts.protocol = id->protocol;
+	opts.chain_index = id->chain_index;
+	opts.handle = id->handle;
+	opts.priority = id->priority;
+
+	return tc_cls_bpf_modify(-1, RTM_DELTFILTER, 0, ifindex, parent_id,
+				 &opts, NULL, NULL);
+}
+
+static int __cls_get_info(void *cookie, void *msg, struct nlattr **tb)
+{
+	struct nlattr *tbb[TCA_BPF_MAX + 1];
+	struct pass_info *cinfo = cookie;
+	struct bpf_tc_cls_info *info;
+	struct tcmsg *t = msg;
+	__u32 prog_id;
+
+	info = cinfo->info;
+
+	if (!tb[TCA_OPTIONS])
+		return 0;
+
+	libbpf_nla_parse_nested(tbb, TCA_BPF_MAX, tb[TCA_OPTIONS], NULL);
+	if (!tbb[TCA_BPF_ID])
+		return 0;
+
+	prog_id = libbpf_nla_getattr_u32(tbb[TCA_BPF_ID]);
+	if (cinfo->prog_id && cinfo->prog_id != prog_id)
+		return 0;
+
+	info->id.protocol = ntohs(TC_H_MIN(t->tcm_info));
+	info->id.priority = TC_H_MAJ(t->tcm_info) >> 16;
+	info->id.handle = t->tcm_handle;
+
+	if (tb[TCA_CHAIN])
+		info->id.chain_index = libbpf_nla_getattr_u32(tb[TCA_CHAIN]);
+	else
+		info->id.chain_index = 0;
+
+	if (tbb[TCA_BPF_FLAGS])
+		info->bpf_flags = libbpf_nla_getattr_u32(tbb[TCA_BPF_FLAGS]);
+
+	if (tbb[TCA_BPF_FLAGS_GEN])
+		info->bpf_flags_gen =
+			libbpf_nla_getattr_u32(tbb[TCA_BPF_FLAGS_GEN]);
+
+	if (tbb[TCA_BPF_ID])
+		info->prog_id = libbpf_nla_getattr_u32(tbb[TCA_BPF_ID]);
+
+	if (tbb[TCA_BPF_TAG])
+		memcpy(info->tag, libbpf_nla_getattr_str(tbb[TCA_BPF_TAG]),
+		       sizeof(info->tag));
+
+	if (tbb[TCA_BPF_CLASSID])
+		info->class_id = libbpf_nla_getattr_u32(tbb[TCA_BPF_CLASSID]);
+
+	return 1;
+}
+
+static int cls_get_info(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn,
+			void *cookie)
+{
+	struct tcmsg *t = NLMSG_DATA(nh);
+	struct nlattr *tb[TCA_MAX + 1];
+
+	libbpf_nla_parse(tb, TCA_MAX,
+			 (struct nlattr *)((char *)t + NLMSG_ALIGN(sizeof(*t))),
+			 NLMSG_PAYLOAD(nh, sizeof(*t)), NULL);
+	if (!tb[TCA_KIND])
+		return -EINVAL;
+
+	return __cls_get_info(cookie, t, tb);
+}
+
+static int tc_cls_get_info(int fd, __u32 ifindex, __u32 parent_id,
+			   const struct bpf_tc_cls_opts *opts,
+			   struct bpf_tc_cls_info *info)
+{
+	__u32 nl_pid = 0, protocol, info_len = sizeof(struct bpf_prog_info);
+	struct bpf_prog_info prog_info = {};
+	int sock, ret;
+	struct {
+		struct nlmsghdr nh;
+		struct tcmsg t;
+		char buf[256];
+	} req = {
+		.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)),
+		.nh.nlmsg_type = RTM_GETTFILTER,
+		.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
+		.t.tcm_family = AF_UNSPEC,
+	};
+
+
+	if (!OPTS_VALID(opts, bpf_tc_cls_opts))
+		return -EINVAL;
+
+	protocol = OPTS_GET(opts, protocol, 0) ?: ETH_P_ALL;
+
+	req.t.tcm_parent = parent_id;
+	req.t.tcm_ifindex = ifindex;
+	req.t.tcm_handle = OPTS_GET(opts, handle, 0);
+	req.t.tcm_info =
+		TC_H_MAKE(OPTS_GET(opts, priority, 0UL) << 16, htons(protocol));
+
+	ret = bpf_obj_get_info_by_fd(fd, &prog_info, &info_len);
+	if (ret < 0)
+		return ret;
+
+	sock = libbpf_netlink_open(&nl_pid);
+	if (sock < 0)
+		return sock;
+
+	ret = nlattr_add(&req.nh, sizeof(req), TCA_KIND, "bpf", sizeof("bpf"));
+	if (ret < 0)
+		goto end;
+
+	if (OPTS_HAS(opts, chain_index)) {
+		ret = nlattr_add(&req.nh, sizeof(req), TCA_CHAIN,
+				 &opts->chain_index, sizeof(opts->chain_index));
+		if (ret < 0)
+			goto end;
+	}
+
+	req.nh.nlmsg_seq = time(NULL);
+
+	ret = bpf_nl_get_ext(&req.nh, sock, nl_pid, cls_get_info, NULL,
+			     &(struct pass_info){ info, prog_info.id });
+	if (ret < 0)
+		goto end;
+	/* 1 denotes a match */
+	ret = ret == 1 ? 0 : -ESRCH;
+end:
+	close(sock);
+	return ret;
+}
+
+int bpf_tc_cls_get_info(int fd, __u32 ifindex, __u32 parent_id,
+			const struct bpf_tc_cls_opts *opts,
+			struct bpf_tc_cls_info *info)
+{
+	return tc_cls_get_info(fd, ifindex, parent_id, opts, info);
 }