diff mbox series

Bpf Helper Function BPF_FUNC_skb_change_dsfield

Message ID 20211220204034.24443-1-quic_twear@quicinc.com (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series Bpf Helper Function BPF_FUNC_skb_change_dsfield | expand

Checks

Context Check Description
bpf/vmtest-bpf-PR fail merge-conflict
netdev/tree_selection success Guessing tree name failed - patch did not apply, async

Commit Message

Tyler Wear (QUIC) Dec. 20, 2021, 8:40 p.m. UTC
New bpf helper function BPF_FUNC_skb_change_dsfield
"int bpf_skb_change_dsfield(struct sk_buff *skb, u8 mask, u8 value)".
BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can
be attached to the ingress and egress path. The helper is needed
because this type of bpf_prog cannot modify the skb directly.

Used by a bpf_prog to specify DS field values on egress or
ingress.

Signed-off-by: Tyler Wear <quic_twear@quicinc.com>
---
 include/uapi/linux/bpf.h |  9 ++++++++
 net/core/filter.c        | 46 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)

Comments

Yonghong Song Dec. 21, 2021, 3:18 a.m. UTC | #1
On 12/20/21 12:40 PM, Tyler Wear wrote:
> New bpf helper function BPF_FUNC_skb_change_dsfield
> "int bpf_skb_change_dsfield(struct sk_buff *skb, u8 mask, u8 value)".
> BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can
> be attached to the ingress and egress path. The helper is needed
> because this type of bpf_prog cannot modify the skb directly.
> 
> Used by a bpf_prog to specify DS field values on egress or
> ingress.

Maybe you can expand a little bit here for your use case?
I know DS field might help but a description of your actual
use case will make adding this helper more compelling.

> 
> Signed-off-by: Tyler Wear <quic_twear@quicinc.com>
> ---
>   include/uapi/linux/bpf.h |  9 ++++++++
>   net/core/filter.c        | 46 ++++++++++++++++++++++++++++++++++++++++
>   2 files changed, 55 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 556216dc9703..742cea7dcf8c 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3742,6 +3742,14 @@ union bpf_attr {
>    * 	Return
>    * 		The helper returns **TC_ACT_REDIRECT** on success or
>    * 		**TC_ACT_SHOT** on error.
> + *
> + * long bpf_skb_change_dsfield(struct sk_buff *skb, u8 mask, u8 value)
> + *	Description
> + *		Set DS field of IP header to the specified *value*. The *value*
> + *		is masked with the provided *mask* when ds field is updated.

This description probably not precise. If I understand correctly, the 
*mask* is used to mask the current skb iph->tos value which is then 
'or'ed with *value*.

I am also debating the helper name, bpf_skb_change_dsfield vs.
bpf_skb_update_dsfield vs. bpf_skb_update_ds. Here, we are actually
doing an update instead of completely overwrite the original value.
Maybe "update" is better than "change". Maybe we can just do
"bpf_skb_update_ds"? We have an existing helper bpf_skb_ecn_set_ce
to update ecn. We don't have any helper with suffix "field".

> + *		Works with IPv6 and IPv4.
> + *	Return
> + *		1 if the DS field is set, 0 if it is not set.
>    */
>   #define __BPF_FUNC_MAPPER(FN)		\
>   	FN(unspec),			\
> @@ -3900,6 +3908,7 @@ union bpf_attr {
>   	FN(per_cpu_ptr),		\
>   	FN(this_cpu_ptr),		\
>   	FN(redirect_peer),		\
> +	FN(skb_change_dsfield),		\
>   	/* */
>   
>   /* integer value in 'imm' field of BPF_CALL instruction selects which helper
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 035d66227ae2..71ea943c8059 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -6402,6 +6402,50 @@ BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
>   	return INET_ECN_set_ce(skb);
>   }
>   
> +BPF_CALL_3(bpf_skb_change_dsfield, struct sk_buff *, skb, u8, mask, u8, value)
> +{
> +	unsigned int iphdr_len;
> +
> +	switch (skb_protocol(skb, true)) {
> +	case cpu_to_be16(ETH_P_IP):
> +		iphdr_len = sizeof(struct iphdr);
> +		break;
> +	case cpu_to_be16(ETH_P_IPV6):
> +		iphdr_len = sizeof(struct ipv6hdr);
> +		break;
> +	default:
> +		return 0;
> +	}
> +
> +	if (skb_headlen(skb) < iphdr_len)
> +		return 0;
> +
> +	if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len))
> +		return 0;
> +
> +	switch (skb_protocol(skb, true)) {
> +	case cpu_to_be16(ETH_P_IP):
> +		ipv4_change_dsfield(ipip_hdr(skb), mask, value);
> +		break;
> +	case cpu_to_be16(ETH_P_IPV6):
> +		ipv6_change_dsfield(ipv6_hdr(skb), mask, value);
> +		break;
> +	default:
> +		return 0;
> +	}

There are some repetition here. For example, in the above, 'default' is 
not possible at all. Is it possible to remove the second 'switch' 
statement with simple
	if (...)
		ipv4_change_dsfield(ipip_hdr(skb), mask, value);
	else
		ipv6_change_dsfield(ipv6_hdr(skb), mask, value);
?

> +
> +	return 1;
> +}
> +
> +static const struct bpf_func_proto bpf_skb_change_dsfield_proto = {
> +	.func           = bpf_skb_change_dsfield,
> +	.gpl_only       = false,
> +	.ret_type       = RET_INTEGER,
> +	.arg1_type      = ARG_PTR_TO_CTX,
> +	.arg2_type      = ARG_ANYTHING,
> +	.arg3_type      = ARG_ANYTHING,
> +};
> +
>   bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
>   				  struct bpf_insn_access_aux *info)
>   {
> @@ -7057,6 +7101,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
>   		return &bpf_get_listener_sock_proto;
>   	case BPF_FUNC_skb_ecn_set_ce:
>   		return &bpf_skb_ecn_set_ce_proto;
> +	case BPF_FUNC_skb_change_dsfield:
> +		return &bpf_skb_change_dsfield_proto;

We do need a self test to exercise this helper.

>   #endif
>   	default:
>   		return sk_filter_func_proto(func_id, prog);
Martin KaFai Lau Dec. 21, 2021, 6:16 a.m. UTC | #2
On Mon, Dec 20, 2021 at 07:18:42PM -0800, Yonghong Song wrote:
> 
> 
> On 12/20/21 12:40 PM, Tyler Wear wrote:
> > New bpf helper function BPF_FUNC_skb_change_dsfield
> > "int bpf_skb_change_dsfield(struct sk_buff *skb, u8 mask, u8 value)".
> > BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can
> > be attached to the ingress and egress path. The helper is needed
> > because this type of bpf_prog cannot modify the skb directly.
> > 
> > Used by a bpf_prog to specify DS field values on egress or
> > ingress.
> 
> Maybe you can expand a little bit here for your use case?
> I know DS field might help but a description of your actual
> use case will make adding this helper more compelling.
+1.  More details on the use case is needed.
Also, having an individual helper for each particular header field
is too specific.

For egress, there is bpf_setsockopt() for IP_TOS and IPV6_TCLASS
and it can be called in other cgroup hooks. e.g.
BPF_PROG_TYPE_SOCK_OPS during tcp ESTABLISHED event.
There is an example in tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c.
Is it enough for egress?
Tyler Wear (QUIC) Dec. 21, 2021, 7:16 p.m. UTC | #3
> On Mon, Dec 20, 2021 at 07:18:42PM -0800, Yonghong Song wrote:
> >
> >
> > On 12/20/21 12:40 PM, Tyler Wear wrote:
> > > New bpf helper function BPF_FUNC_skb_change_dsfield "int 
> > > bpf_skb_change_dsfield(struct sk_buff *skb, u8 mask, u8 value)".
> > > BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can be 
> > > attached to the ingress and egress path. The helper is needed 
> > > because this type of bpf_prog cannot modify the skb directly.
> > >
> > > Used by a bpf_prog to specify DS field values on egress or ingress.
> >
> > Maybe you can expand a little bit here for your use case?
> > I know DS field might help but a description of your actual use case 
> > will make adding this helper more compelling.
> +1.  More details on the use case is needed.
> Also, having an individual helper for each particular header field is too specific.
>
> For egress, there is bpf_setsockopt() for IP_TOS and IPV6_TCLASS and it can be called in other cgroup hooks. e.g.
> BPF_PROG_TYPE_SOCK_OPS during tcp ESTABLISHED event.
> There is an example in tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c.
> Is it enough for egress?

Using bpf_setsockopt() has 2 issues: 1) it changes the userspace visible state 2) won't work with udp sendmsg cmsg
Maciej Żenczykowski Dec. 21, 2021, 7:46 p.m. UTC | #4
On Tue, Dec 21, 2021 at 11:16 AM Tyler Wear (QUIC)
<quic_twear@quicinc.com> wrote:
> > On Mon, Dec 20, 2021 at 07:18:42PM -0800, Yonghong Song wrote:
> > > On 12/20/21 12:40 PM, Tyler Wear wrote:
> > > > New bpf helper function BPF_FUNC_skb_change_dsfield "int
> > > > bpf_skb_change_dsfield(struct sk_buff *skb, u8 mask, u8 value)".
> > > > BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can be
> > > > attached to the ingress and egress path. The helper is needed
> > > > because this type of bpf_prog cannot modify the skb directly.
> > > >
> > > > Used by a bpf_prog to specify DS field values on egress or ingress.
> > >
> > > Maybe you can expand a little bit here for your use case?
> > > I know DS field might help but a description of your actual use case
> > > will make adding this helper more compelling.
> > +1.  More details on the use case is needed.
> > Also, having an individual helper for each particular header field is too specific.
> >
> > For egress, there is bpf_setsockopt() for IP_TOS and IPV6_TCLASS and it can be called in other cgroup hooks. e.g.
> > BPF_PROG_TYPE_SOCK_OPS during tcp ESTABLISHED event.
> > There is an example in tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c.
> > Is it enough for egress?
>
> Using bpf_setsockopt() has 2 issues: 1) it changes the userspace visible state 2) won't work with udp sendmsg cmsg

Right, so to clarify since I've been working with Tyler on a project
of which this patch is a small component.
Note, I may be wrong here, I don't fully understand how all of this
works... but:

ad 1) AFAIK if bpf calls bpf_setsockopt on the socket in question,
then userspace's view of the socket settings via
getsockopt(IP_TOS/IPV6_TCLASS) will also be affected - this may be
undesirable (it's technically userspace visible change in behaviour
and could, as unlikely as it is, lead to application misbehaviour).
This can be worked around via also overriding getsockopt/setsockopt
with bpf, but then you need to store the value to return to userspace
somewhere... AFAICT it all ends up being pretty ugly and very complex.

I wouldn't be worried about needing to override each individual field,
as the only other field that looks likely to be potentially beneficial
to override would be the ipv6 flowlabel.

ad 2) I don't think the bpf_setsockopt(IP_TOS/IPV6_TCLASS) approach
works for packets generated via udp sendmsg where cmsg is being used
to set tos.

3) I also think the bpf_setsockopt(IP_TOS/IPV6_TCLASS) might be too
late, since it would be in response to an already built packet, and
would thus presumably only take effect on the next packet, and not for
this one, no?

Technically this could be done by attaching the programs to tc egress
instead of the cgroup hook, but then it's per interface, which is
potentially the wrong granularity...

As for what is driving this?  Upcoming wifi standard to allow access
points to inform client devices how to dscp mark individual flows.

As for the patch itself, I wonder if the return value shouldn't be
reversed, currently '1 if the DS field is set, 0 if it is not set.'
But I think returning 0 on success and an error on failure is more in
line with what other bpf helpers do?
OTOH, it does match bpf_skb_ecn_set_ce() returning 0 on failure...

- Maciej
Martin KaFai Lau Dec. 21, 2021, 9:52 p.m. UTC | #5
On Tue, Dec 21, 2021 at 11:46:40AM -0800, Maciej Żenczykowski wrote:
> On Tue, Dec 21, 2021 at 11:16 AM Tyler Wear (QUIC)
> <quic_twear@quicinc.com> wrote:
> > > On Mon, Dec 20, 2021 at 07:18:42PM -0800, Yonghong Song wrote:
> > > > On 12/20/21 12:40 PM, Tyler Wear wrote:
> > > > > New bpf helper function BPF_FUNC_skb_change_dsfield "int
> > > > > bpf_skb_change_dsfield(struct sk_buff *skb, u8 mask, u8 value)".
> > > > > BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can be
> > > > > attached to the ingress and egress path. The helper is needed
> > > > > because this type of bpf_prog cannot modify the skb directly.
> > > > >
> > > > > Used by a bpf_prog to specify DS field values on egress or ingress.
> > > >
> > > > Maybe you can expand a little bit here for your use case?
> > > > I know DS field might help but a description of your actual use case
> > > > will make adding this helper more compelling.
> > > +1.  More details on the use case is needed.
> > > Also, having an individual helper for each particular header field is too specific.
> > >
> > > For egress, there is bpf_setsockopt() for IP_TOS and IPV6_TCLASS and it can be called in other cgroup hooks. e.g.
> > > BPF_PROG_TYPE_SOCK_OPS during tcp ESTABLISHED event.
> > > There is an example in tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c.
> > > Is it enough for egress?
> >
> > Using bpf_setsockopt() has 2 issues: 1) it changes the userspace visible state 2) won't work with udp sendmsg cmsg
> 
> Right, so to clarify since I've been working with Tyler on a project
> of which this patch is a small component.
> Note, I may be wrong here, I don't fully understand how all of this
> works... but:
> 
> ad 1) AFAIK if bpf calls bpf_setsockopt on the socket in question,
> then userspace's view of the socket settings via
> getsockopt(IP_TOS/IPV6_TCLASS) will also be affected - this may be
> undesirable (it's technically userspace visible change in behaviour
> and could, as unlikely as it is, lead to application misbehaviour).
> This can be worked around via also overriding getsockopt/setsockopt
> with bpf, but then you need to store the value to return to userspace
> somewhere... AFAICT it all ends up being pretty ugly and very complex.
CGROUP_(SET|GET)SOCKOPT is created for that.
The user's value can be stored in bpf_sk_storage.

> 
> I wouldn't be worried about needing to override each individual field,
> as the only other field that looks likely to be potentially beneficial
> to override would be the ipv6 flowlabel.
> 
> ad 2) I don't think the bpf_setsockopt(IP_TOS/IPV6_TCLASS) approach
> works for packets generated via udp sendmsg where cmsg is being used
> to set tos.
There is CGROUP_UDP[4|6]_SENDMSG.  Right now, it can only change the addr.
tos/tclass support could be added.

> 3) I also think the bpf_setsockopt(IP_TOS/IPV6_TCLASS) might be too
> late, since it would be in response to an already built packet, and
> would thus presumably only take effect on the next packet, and not for
> this one, no?
The bpf_setsockopt can be called in bind and connect.
Is it not early enough?

> Technically this could be done by attaching the programs to tc egress
> instead of the cgroup hook, but then it's per interface, which is
> potentially the wrong granularity...
Right, there is advantage to do it at higher layer,
and earlier also.

If the tos/tclass value can be changed early on, the correct
ip[6] header can be written at the beginning instead
of redoing it later and need to worry about the skb_clone_writable(),
rewriting it, do iph->check..etc.

> As for what is driving this?  Upcoming wifi standard to allow access
> points to inform client devices how to dscp mark individual flows.
Interesting.

How does the sending host get this dscp value from wifi
and then affect the dscp of a particular flow?  Is the dscp
going to be stored in a bpf map for the bpf prog to use?

Are you testing on TCP also?

> As for the patch itself, I wonder if the return value shouldn't be
> reversed, currently '1 if the DS field is set, 0 if it is not set.'
> But I think returning 0 on success and an error on failure is more in
> line with what other bpf helpers do?
> OTOH, it does match bpf_skb_ecn_set_ce() returning 0 on failure...
If adding a helper , how about making the bpf_skb_store_bytes()
available to BPF_PROG_TYPE_CGROUP_SKB?  Then it will be
more flexible to change other fields in the future in
the network and transport header.
Maciej Żenczykowski Dec. 21, 2021, 10:13 p.m. UTC | #6
On Tue, Dec 21, 2021 at 1:52 PM Martin KaFai Lau <kafai@fb.com> wrote:
> On Tue, Dec 21, 2021 at 11:46:40AM -0800, Maciej Żenczykowski wrote:
> > On Tue, Dec 21, 2021 at 11:16 AM Tyler Wear (QUIC)
> > <quic_twear@quicinc.com> wrote:
> > > > On Mon, Dec 20, 2021 at 07:18:42PM -0800, Yonghong Song wrote:
> > > > > On 12/20/21 12:40 PM, Tyler Wear wrote:
> > > > > > New bpf helper function BPF_FUNC_skb_change_dsfield "int
> > > > > > bpf_skb_change_dsfield(struct sk_buff *skb, u8 mask, u8 value)".
> > > > > > BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can be
> > > > > > attached to the ingress and egress path. The helper is needed
> > > > > > because this type of bpf_prog cannot modify the skb directly.
> > > > > >
> > > > > > Used by a bpf_prog to specify DS field values on egress or ingress.
> > > > >
> > > > > Maybe you can expand a little bit here for your use case?
> > > > > I know DS field might help but a description of your actual use case
> > > > > will make adding this helper more compelling.
> > > > +1.  More details on the use case is needed.
> > > > Also, having an individual helper for each particular header field is too specific.
> > > >
> > > > For egress, there is bpf_setsockopt() for IP_TOS and IPV6_TCLASS and it can be called in other cgroup hooks. e.g.
> > > > BPF_PROG_TYPE_SOCK_OPS during tcp ESTABLISHED event.
> > > > There is an example in tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c.
> > > > Is it enough for egress?
> > >
> > > Using bpf_setsockopt() has 2 issues: 1) it changes the userspace visible state 2) won't work with udp sendmsg cmsg
> >
> > Right, so to clarify since I've been working with Tyler on a project
> > of which this patch is a small component.
> > Note, I may be wrong here, I don't fully understand how all of this
> > works... but:
> >
> > ad 1) AFAIK if bpf calls bpf_setsockopt on the socket in question,
> > then userspace's view of the socket settings via
> > getsockopt(IP_TOS/IPV6_TCLASS) will also be affected - this may be
> > undesirable (it's technically userspace visible change in behaviour
> > and could, as unlikely as it is, lead to application misbehaviour).
> > This can be worked around via also overriding getsockopt/setsockopt
> > with bpf, but then you need to store the value to return to userspace
> > somewhere... AFAICT it all ends up being pretty ugly and very complex.
> CGROUP_(SET|GET)SOCKOPT is created for that.
> The user's value can be stored in bpf_sk_storage.

Yes, it can be done, it's very complex to do so.

The policy can change during run time (indeed that's probably a
relatively likely situation,
network gear notices a new high bandwidth connection and provides out
of band feedback
that it should be using a different dscp code point - we probably
don't want the full policy to
be present in the device because it might be a huge number of entries,
with wildcards).

> > I wouldn't be worried about needing to override each individual field,
> > as the only other field that looks likely to be potentially beneficial
> > to override would be the ipv6 flowlabel.
> >
> > ad 2) I don't think the bpf_setsockopt(IP_TOS/IPV6_TCLASS) approach
> > works for packets generated via udp sendmsg where cmsg is being used
> > to set tos.
> There is CGROUP_UDP[4|6]_SENDMSG.  Right now, it can only change the addr.
> tos/tclass support could be added.

It could, that doesn't seem easier to do than this approach though.

> > 3) I also think the bpf_setsockopt(IP_TOS/IPV6_TCLASS) might be too
> > late, since it would be in response to an already built packet, and
> > would thus presumably only take effect on the next packet, and not for
> > this one, no?
> The bpf_setsockopt can be called in bind and connect.
> Is it not early enough?

It depends, it's actually too early in some cases.  The information
may come post connect.

Worst case the actual dscp marking to use could potentially even be
dynamic, for example tcp pure acks
should use one value, while tcp data packets another (not saying this
is a good idea,
but I've seen hw implementations of pure ack prioritization...)

> > Technically this could be done by attaching the programs to tc egress
> > instead of the cgroup hook, but then it's per interface, which is
> > potentially the wrong granularity...

> Right, there is advantage to do it at higher layer,
> and earlier also.
>
> If the tos/tclass value can be changed early on, the correct
> ip[6] header can be written at the beginning instead
> of redoing it later and need to worry about the skb_clone_writable(),
> rewriting it, do iph->check..etc.

I would indeed like it if we could decouple what userspace wants,
from what the kernel/network actually uses.  There would need to be
some sort of bpf hook,
that takes a socket/flow and returns the tos/dscp to actually use
(based on 5-tuple and other information).

But again, this would be *much* more complex.

> > As for what is driving this?  Upcoming wifi standard to allow access
> > points to inform client devices how to dscp mark individual flows.
> Interesting.
>
> How does the sending host get this dscp value from wifi
> and then affect the dscp of a particular flow?  Is the dscp
> going to be stored in a bpf map for the bpf prog to use?

It gets it out of band via some wifi signaling mechanism.
Tyler probably knows the details.

Storing flow match information to dscp mapping in a bpf map is indeed the plan.

> Are you testing on TCP also?
>
> > As for the patch itself, I wonder if the return value shouldn't be
> > reversed, currently '1 if the DS field is set, 0 if it is not set.'
> > But I think returning 0 on success and an error on failure is more in
> > line with what other bpf helpers do?
> > OTOH, it does match bpf_skb_ecn_set_ce() returning 0 on failure...

> If adding a helper , how about making the bpf_skb_store_bytes()
> available to BPF_PROG_TYPE_CGROUP_SKB?  Then it will be
> more flexible to change other fields in the future in
> the network and transport header.

I assume there's some reason why it's not available?
Martin KaFai Lau Dec. 21, 2021, 11:07 p.m. UTC | #7
On Tue, Dec 21, 2021 at 02:13:04PM -0800, Maciej Żenczykowski wrote:
> > > ad 1) AFAIK if bpf calls bpf_setsockopt on the socket in question,
> > > then userspace's view of the socket settings via
> > > getsockopt(IP_TOS/IPV6_TCLASS) will also be affected - this may be
> > > undesirable (it's technically userspace visible change in behaviour
> > > and could, as unlikely as it is, lead to application misbehaviour).
> > > This can be worked around via also overriding getsockopt/setsockopt
> > > with bpf, but then you need to store the value to return to userspace
> > > somewhere... AFAICT it all ends up being pretty ugly and very complex.
> > CGROUP_(SET|GET)SOCKOPT is created for that.
> > The user's value can be stored in bpf_sk_storage.
> 
> Yes, it can be done, it's very complex to do so.
> 
> The policy can change during run time (indeed that's probably a
> relatively likely situation,
> network gear notices a new high bandwidth connection and provides out
> of band feedback
> that it should be using a different dscp code point - we probably
> don't want the full policy to
> be present in the device because it might be a huge number of entries,
> with wildcards).
ic. got it. The value is very dynamic, so changing tos/tclass of a sock
in a more static hook like bind won't work well.

Details like this should have been in the commit message.

> > > I wouldn't be worried about needing to override each individual field,
> > > as the only other field that looks likely to be potentially beneficial
> > > to override would be the ipv6 flowlabel.
> > >
> > > ad 2) I don't think the bpf_setsockopt(IP_TOS/IPV6_TCLASS) approach
> > > works for packets generated via udp sendmsg where cmsg is being used
> > > to set tos.
> > There is CGROUP_UDP[4|6]_SENDMSG.  Right now, it can only change the addr.
> > tos/tclass support could be added.
> 
> It could, that doesn't seem easier to do than this approach though.
> 

[ ... ]

> > > Technically this could be done by attaching the programs to tc egress
> > > instead of the cgroup hook, but then it's per interface, which is
> > > potentially the wrong granularity...
> 
> > Right, there is advantage to do it at higher layer,
> > and earlier also.
> >
> > If the tos/tclass value can be changed early on, the correct
> > ip[6] header can be written at the beginning instead
> > of redoing it later and need to worry about the skb_clone_writable(),
> > rewriting it, do iph->check..etc.
> 
> I would indeed like it if we could decouple what userspace wants,
> from what the kernel/network actually uses.  There would need to be
> some sort of bpf hook,
> that takes a socket/flow and returns the tos/dscp to actually use
> (based on 5-tuple and other information).
> 
> But again, this would be *much* more complex.
In terms of the bpf prog doing the dscp-logic, it should be
pretty much the same.  Getting the 5-tuple, lookup from a map
and return the dscp value.

> > > As for what is driving this?  Upcoming wifi standard to allow access
> > > points to inform client devices how to dscp mark individual flows.
> > Interesting.
> >
> > How does the sending host get this dscp value from wifi
> > and then affect the dscp of a particular flow?  Is the dscp
> > going to be stored in a bpf map for the bpf prog to use?
> 
> It gets it out of band via some wifi signaling mechanism.
> Tyler probably knows the details.
> 
> Storing flow match information to dscp mapping in a bpf map is indeed the plan.
> 
> > Are you testing on TCP also?
> >
> > > As for the patch itself, I wonder if the return value shouldn't be
> > > reversed, currently '1 if the DS field is set, 0 if it is not set.'
> > > But I think returning 0 on success and an error on failure is more in
> > > line with what other bpf helpers do?
> > > OTOH, it does match bpf_skb_ecn_set_ce() returning 0 on failure...
> 
> > If adding a helper , how about making the bpf_skb_store_bytes()
> > available to BPF_PROG_TYPE_CGROUP_SKB?  Then it will be
> > more flexible to change other fields in the future in
> > the network and transport header.
> 
> I assume there's some reason why it's not available?
Not that I am aware of.  There was just no use case for it.
The static case is handled in socket bind/creation time.
The more dynamic case is handled in the udp_sendmsg which so
far the use case is changing the address only.

I don't mind adding bpf_skb_store_bytes() which could be useful
for other header's fields.
Tyler Wear Dec. 21, 2021, 11:28 p.m. UTC | #8
> On Tue, Dec 21, 2021 at 02:13:04PM -0800, Maciej Żenczykowski wrote:
> > > > As for what is driving this?  Upcoming wifi standard to allow
> > > > access points to inform client devices how to dscp mark individual flows.
> > > Interesting.
> > >
> > > How does the sending host get this dscp value from wifi and then
> > > affect the dscp of a particular flow?  Is the dscp going to be
> > > stored in a bpf map for the bpf prog to use?
> >
> > It gets it out of band via some wifi signaling mechanism.
> > Tyler probably knows the details.
> >
> > Storing flow match information to dscp mapping in a bpf map is indeed the plan.
> >

This is for an upcoming QoS Wifi Alliance spec. Wifi will get the dscp for a corresponding
flow and as Maze said the current plan is to store this map info in bpf to modify packets.
Toke Høiland-Jørgensen Jan. 3, 2022, 8:59 p.m. UTC | #9
Tyler Wear <twear@quicinc.com> writes:

>> On Tue, Dec 21, 2021 at 02:13:04PM -0800, Maciej Żenczykowski wrote:
>> > > > As for what is driving this?  Upcoming wifi standard to allow
>> > > > access points to inform client devices how to dscp mark individual flows.
>> > > Interesting.
>> > >
>> > > How does the sending host get this dscp value from wifi and then
>> > > affect the dscp of a particular flow?  Is the dscp going to be
>> > > stored in a bpf map for the bpf prog to use?
>> >
>> > It gets it out of band via some wifi signaling mechanism.
>> > Tyler probably knows the details.
>> >
>> > Storing flow match information to dscp mapping in a bpf map is indeed the plan.
>> >
>
> This is for an upcoming QoS Wifi Alliance spec.

Got a link, or some other information about the specifics of this? :)

-Toke
diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 556216dc9703..742cea7dcf8c 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3742,6 +3742,14 @@  union bpf_attr {
  * 	Return
  * 		The helper returns **TC_ACT_REDIRECT** on success or
  * 		**TC_ACT_SHOT** on error.
+ *
+ * long bpf_skb_change_dsfield(struct sk_buff *skb, u8 mask, u8 value)
+ *	Description
+ *		Set DS field of IP header to the specified *value*. The *value*
+ *		is masked with the provided *mask* when ds field is updated.
+ *		Works with IPv6 and IPv4.
+ *	Return
+ *		1 if the DS field is set, 0 if it is not set.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3900,6 +3908,7 @@  union bpf_attr {
 	FN(per_cpu_ptr),		\
 	FN(this_cpu_ptr),		\
 	FN(redirect_peer),		\
+	FN(skb_change_dsfield),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/net/core/filter.c b/net/core/filter.c
index 035d66227ae2..71ea943c8059 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -6402,6 +6402,50 @@  BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
 	return INET_ECN_set_ce(skb);
 }
 
+BPF_CALL_3(bpf_skb_change_dsfield, struct sk_buff *, skb, u8, mask, u8, value)
+{
+	unsigned int iphdr_len;
+
+	switch (skb_protocol(skb, true)) {
+	case cpu_to_be16(ETH_P_IP):
+		iphdr_len = sizeof(struct iphdr);
+		break;
+	case cpu_to_be16(ETH_P_IPV6):
+		iphdr_len = sizeof(struct ipv6hdr);
+		break;
+	default:
+		return 0;
+	}
+
+	if (skb_headlen(skb) < iphdr_len)
+		return 0;
+
+	if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len))
+		return 0;
+
+	switch (skb_protocol(skb, true)) {
+	case cpu_to_be16(ETH_P_IP):
+		ipv4_change_dsfield(ipip_hdr(skb), mask, value);
+		break;
+	case cpu_to_be16(ETH_P_IPV6):
+		ipv6_change_dsfield(ipv6_hdr(skb), mask, value);
+		break;
+	default:
+		return 0;
+	}
+
+	return 1;
+}
+
+static const struct bpf_func_proto bpf_skb_change_dsfield_proto = {
+	.func           = bpf_skb_change_dsfield,
+	.gpl_only       = false,
+	.ret_type       = RET_INTEGER,
+	.arg1_type      = ARG_PTR_TO_CTX,
+	.arg2_type      = ARG_ANYTHING,
+	.arg3_type      = ARG_ANYTHING,
+};
+
 bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
 				  struct bpf_insn_access_aux *info)
 {
@@ -7057,6 +7101,8 @@  cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_listener_sock_proto;
 	case BPF_FUNC_skb_ecn_set_ce:
 		return &bpf_skb_ecn_set_ce_proto;
+	case BPF_FUNC_skb_change_dsfield:
+		return &bpf_skb_change_dsfield_proto;
 #endif
 	default:
 		return sk_filter_func_proto(func_id, prog);