diff mbox series

[RFC,2/3] ipv6: Run a reverse sk_lookup on sendmsg.

Message ID 20240913-reverse-sk-lookup-v1-2-e721ea003d4c@cloudflare.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series Allow sk_lookup UDP return traffic to egress. | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-PR fail PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-8 fail Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-4 fail Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-9 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-10 success Logs for s390x-gcc / test
bpf/vmtest-bpf-next-VM_Test-12 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-15 success Logs for x86_64-gcc / test
bpf/vmtest-bpf-next-VM_Test-16 success Logs for x86_64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-llvm-17 / test
bpf/vmtest-bpf-next-VM_Test-17 fail Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-13 fail Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-18 fail Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-22 fail Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-21 fail Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-14 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-llvm-18 / test
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-llvm-18 / veristat
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Guessed tree name to be net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 16 this patch: 16
netdev/build_tools success No tools touched, skip
netdev/cc_maintainers success CCed 7 of 7 maintainers
netdev/build_clang success Errors and warnings before: 16 this patch: 16
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 16 this patch: 16
netdev/checkpatch warning WARNING: line length of 100 exceeds 80 columns WARNING: line length of 107 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 93 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns WARNING: line length of 95 exceeds 80 columns WARNING: line length of 97 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline fail Was 0 now: 1

Commit Message

Tiago Lam Sept. 13, 2024, 9:39 a.m. UTC
This follows the same rationale provided for the ipv4 counterpart, where
it now runs a reverse socket lookup when source addresses and/or ports
are changed, on sendmsg, to check whether egress traffic should be
allowed to go through or not.

As with ipv4, the ipv6 sendmsg path is also extended here to support the
IPV6_ORIGDSTADDR ancilliary message to be able to specify a source
address/port.

Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Tiago Lam <tiagolam@cloudflare.com>
---
 net/ipv6/datagram.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/udp.c      |  8 ++++--
 2 files changed, 82 insertions(+), 2 deletions(-)

Comments

Martin KaFai Lau Sept. 13, 2024, 6:24 p.m. UTC | #1
On 9/13/24 2:39 AM, Tiago Lam wrote:
> This follows the same rationale provided for the ipv4 counterpart, where
> it now runs a reverse socket lookup when source addresses and/or ports
> are changed, on sendmsg, to check whether egress traffic should be
> allowed to go through or not.
> 
> As with ipv4, the ipv6 sendmsg path is also extended here to support the
> IPV6_ORIGDSTADDR ancilliary message to be able to specify a source
> address/port.
> 
> Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
> Signed-off-by: Tiago Lam <tiagolam@cloudflare.com>
> ---
>   net/ipv6/datagram.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>   net/ipv6/udp.c      |  8 ++++--
>   2 files changed, 82 insertions(+), 2 deletions(-)
> 
> diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
> index fff78496803d..4214dda1c320 100644
> --- a/net/ipv6/datagram.c
> +++ b/net/ipv6/datagram.c
> @@ -756,6 +756,27 @@ void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
>   }
>   EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
>   
> +static inline bool reverse_sk_lookup(struct flowi6 *fl6, struct sock *sk,
> +				     struct in6_addr *saddr, __be16 sport)
> +{
> +	if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
> +	    (saddr && sport) &&
> +	    (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, saddr) || inet_sk(sk)->inet_sport != sport)) {
> +		struct sock *sk_egress;
> +
> +		bpf_sk_lookup_run_v6(sock_net(sk), IPPROTO_UDP, &fl6->daddr, fl6->fl6_dport,
> +				     saddr, ntohs(sport), 0, &sk_egress);

iirc, in the ingress path, the sk could also be selected by a tc bpf prog doing 
bpf_sk_assign. Then this re-run on sk_lookup may give an incorrect result?

In general, is it necessary to rerun any bpf prog if the user space has 
specified the IP[v6]_ORIGDSTADDR.

> +		if (!IS_ERR_OR_NULL(sk_egress) &&
> +		    atomic64_read(&sk_egress->sk_cookie) == atomic64_read(&sk->sk_cookie))
> +			return true;
> +
> +		net_info_ratelimited("No reverse socket lookup match for local addr %pI6:%d remote addr %pI6:%d\n",
> +				     &saddr, ntohs(sport), &fl6->daddr, ntohs(fl6->fl6_dport));
> +	}
> +
> +	return false;
> +}
> +
>   int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
>   			  struct msghdr *msg, struct flowi6 *fl6,
>   			  struct ipcm6_cookie *ipc6)
> @@ -844,7 +865,62 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
>   
>   			break;
>   		    }
> +		case IPV6_ORIGDSTADDR:
> +			{
> +			struct sockaddr_in6 *sockaddr_in;
> +			struct net_device *dev = NULL;
> +
> +			if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct sockaddr_in6))) {
> +				err = -EINVAL;
> +				goto exit_f;
> +			}
> +
> +			sockaddr_in = (struct sockaddr_in6 *)CMSG_DATA(cmsg);
> +
> +			addr_type = __ipv6_addr_type(&sockaddr_in->sin6_addr);
> +
> +			if (addr_type & IPV6_ADDR_LINKLOCAL)
> +				return -EINVAL;
> +
> +			/* If we're egressing with a different source address and/or port, we
> +			 * perform a reverse socket lookup.  The rationale behind this is that we
> +			 * can allow return UDP traffic that has ingressed through sk_lookup to
> +			 * also egress correctly. In case the reverse lookup fails, we
> +			 * continue with the normal path.
> +			 *
> +			 * The lookup is performed if either source address and/or port changed, and
> +			 * neither is "0".
> +			 */
> +			if (reverse_sk_lookup(fl6, sk, &sockaddr_in->sin6_addr,
> +					      sockaddr_in->sin6_port)) {
> +				/* Override the source port and address to use with the one we
> +				 * got in cmsg and bail early.
> +				 */
> +				fl6->saddr = sockaddr_in->sin6_addr;
> +				fl6->fl6_sport = sockaddr_in->sin6_port;
> +				break;
> +			}
>   
> +			if (addr_type != IPV6_ADDR_ANY) {
> +				int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
> +
> +				if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) &&
> +				    !ipv6_chk_addr_and_flags(net,
> +							     &sockaddr_in->sin6_addr,
> +							     dev, !strict, 0,
> +							     IFA_F_TENTATIVE) &&
> +				    !ipv6_chk_acast_addr_src(net, dev,
> +							     &sockaddr_in->sin6_addr))
> +					err = -EINVAL;
> +				else
> +					fl6->saddr = sockaddr_in->sin6_addr;
> +			}
> +
> +			if (err)
> +				goto exit_f;
> +
> +			break;
> +			}
>   		case IPV6_FLOWINFO:
>   			if (cmsg->cmsg_len < CMSG_LEN(4)) {
>   				err = -EINVAL;
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 6602a2e9cdb5..6121cbb71ad3 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -1476,6 +1476,12 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>   
>   	fl6->flowi6_uid = sk->sk_uid;
>   
> +	/* We use fl6's daddr and fl6_sport in the reverse sk_lookup done
> +	 * within ip6_datagram_send_ctl() now.
> +	 */
> +	fl6->daddr = *daddr;
> +	fl6->fl6_sport = inet->inet_sport;
> +
>   	if (msg->msg_controllen) {
>   		opt = &opt_space;
>   		memset(opt, 0, sizeof(struct ipv6_txoptions));
> @@ -1511,10 +1517,8 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>   
>   	fl6->flowi6_proto = sk->sk_protocol;
>   	fl6->flowi6_mark = ipc6.sockc.mark;
> -	fl6->daddr = *daddr;
>   	if (ipv6_addr_any(&fl6->saddr) && !ipv6_addr_any(&np->saddr))
>   		fl6->saddr = np->saddr;
> -	fl6->fl6_sport = inet->inet_sport;
>   
>   	if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
>   		err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
>
Simon Horman Sept. 14, 2024, 8:59 a.m. UTC | #2
On Fri, Sep 13, 2024 at 10:39:20AM +0100, Tiago Lam wrote:
> This follows the same rationale provided for the ipv4 counterpart, where
> it now runs a reverse socket lookup when source addresses and/or ports
> are changed, on sendmsg, to check whether egress traffic should be
> allowed to go through or not.
> 
> As with ipv4, the ipv6 sendmsg path is also extended here to support the
> IPV6_ORIGDSTADDR ancilliary message to be able to specify a source

Hi Tiago Lam,

Some minor nits from my side.

ancilliary -> ancillary

Likewise in patch 3/3.
Flagged by checkpatch.pl --codespell

> address/port.
> 
> Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
> Signed-off-by: Tiago Lam <tiagolam@cloudflare.com>
> ---
>  net/ipv6/datagram.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  net/ipv6/udp.c      |  8 ++++--
>  2 files changed, 82 insertions(+), 2 deletions(-)
> 
> diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
> index fff78496803d..4214dda1c320 100644
> --- a/net/ipv6/datagram.c
> +++ b/net/ipv6/datagram.c
> @@ -756,6 +756,27 @@ void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
>  }
>  EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
>  
> +static inline bool reverse_sk_lookup(struct flowi6 *fl6, struct sock *sk,
> +				     struct in6_addr *saddr, __be16 sport)
> +{
> +	if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
> +	    (saddr && sport) &&
> +	    (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, saddr) || inet_sk(sk)->inet_sport != sport)) {

Please consider, where it can trivially be achieved, limiting Networking
code to 80 columns wide.

Checkpatch can be run with a flag to check for this.

> +		struct sock *sk_egress;
> +
> +		bpf_sk_lookup_run_v6(sock_net(sk), IPPROTO_UDP, &fl6->daddr, fl6->fl6_dport,
> +				     saddr, ntohs(sport), 0, &sk_egress);
> +		if (!IS_ERR_OR_NULL(sk_egress) &&
> +		    atomic64_read(&sk_egress->sk_cookie) == atomic64_read(&sk->sk_cookie))
> +			return true;
> +
> +		net_info_ratelimited("No reverse socket lookup match for local addr %pI6:%d remote addr %pI6:%d\n",
> +				     &saddr, ntohs(sport), &fl6->daddr, ntohs(fl6->fl6_dport));
> +	}
> +
> +	return false;
> +}
> +
>  int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
>  			  struct msghdr *msg, struct flowi6 *fl6,
>  			  struct ipcm6_cookie *ipc6)

...
Eric Dumazet Sept. 14, 2024, 11:40 a.m. UTC | #3
On Fri, Sep 13, 2024 at 11:39 AM Tiago Lam <tiagolam@cloudflare.com> wrote:
>
> This follows the same rationale provided for the ipv4 counterpart, where
> it now runs a reverse socket lookup when source addresses and/or ports
> are changed, on sendmsg, to check whether egress traffic should be
> allowed to go through or not.
>
> As with ipv4, the ipv6 sendmsg path is also extended here to support the
> IPV6_ORIGDSTADDR ancilliary message to be able to specify a source
> address/port.
>
> Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
> Signed-off-by: Tiago Lam <tiagolam@cloudflare.com>
> ---
>  net/ipv6/datagram.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  net/ipv6/udp.c      |  8 ++++--
>  2 files changed, 82 insertions(+), 2 deletions(-)
>
> diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
> index fff78496803d..4214dda1c320 100644
> --- a/net/ipv6/datagram.c
> +++ b/net/ipv6/datagram.c
> @@ -756,6 +756,27 @@ void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
>  }
>  EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
>
> +static inline bool reverse_sk_lookup(struct flowi6 *fl6, struct sock *sk,
> +                                    struct in6_addr *saddr, __be16 sport)
> +{
> +       if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
> +           (saddr && sport) &&
> +           (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, saddr) || inet_sk(sk)->inet_sport != sport)) {
> +               struct sock *sk_egress;
> +
> +               bpf_sk_lookup_run_v6(sock_net(sk), IPPROTO_UDP, &fl6->daddr, fl6->fl6_dport,
> +                                    saddr, ntohs(sport), 0, &sk_egress);
> +               if (!IS_ERR_OR_NULL(sk_egress) &&
> +                   atomic64_read(&sk_egress->sk_cookie) == atomic64_read(&sk->sk_cookie))

I do not understand this.

1) sk_cookie is not always initialized. It is done on demand, when/if
__sock_gen_cookie() was called.

2) if sk1 and sk2 share the same sk_cookie, then sk1 == sk2 ???

So why not simply testing sk_egress == sk ?

> +                       return true;
> +
> +               net_info_ratelimited("No reverse socket lookup match for local addr %pI6:%d remote addr %pI6:%d\n",
> +                                    &saddr, ntohs(sport), &fl6->daddr, ntohs(fl6->fl6_dport));
> +       }
> +
> +       return false;
> +}
> +
>  int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
>                           struct msghdr *msg, struct flowi6 *fl6,
>                           struct ipcm6_cookie *ipc6)
> @@ -844,7 +865,62 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
>
>                         break;
>                     }
> +               case IPV6_ORIGDSTADDR:
> +                       {
> +                       struct sockaddr_in6 *sockaddr_in;
> +                       struct net_device *dev = NULL;
> +
> +                       if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct sockaddr_in6))) {
> +                               err = -EINVAL;
> +                               goto exit_f;
> +                       }
> +
> +                       sockaddr_in = (struct sockaddr_in6 *)CMSG_DATA(cmsg);
> +
> +                       addr_type = __ipv6_addr_type(&sockaddr_in->sin6_addr);
> +
> +                       if (addr_type & IPV6_ADDR_LINKLOCAL)
> +                               return -EINVAL;
> +
> +                       /* If we're egressing with a different source address and/or port, we
> +                        * perform a reverse socket lookup.  The rationale behind this is that we
> +                        * can allow return UDP traffic that has ingressed through sk_lookup to
> +                        * also egress correctly. In case the reverse lookup fails, we
> +                        * continue with the normal path.
> +                        *
> +                        * The lookup is performed if either source address and/or port changed, and
> +                        * neither is "0".
> +                        */
> +                       if (reverse_sk_lookup(fl6, sk, &sockaddr_in->sin6_addr,
> +                                             sockaddr_in->sin6_port)) {
> +                               /* Override the source port and address to use with the one we
> +                                * got in cmsg and bail early.
> +                                */
> +                               fl6->saddr = sockaddr_in->sin6_addr;
> +                               fl6->fl6_sport = sockaddr_in->sin6_port;
> +                               break;
> +                       }
>
> +                       if (addr_type != IPV6_ADDR_ANY) {
> +                               int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
> +
> +                               if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) &&
> +                                   !ipv6_chk_addr_and_flags(net,
> +                                                            &sockaddr_in->sin6_addr,
> +                                                            dev, !strict, 0,
> +                                                            IFA_F_TENTATIVE) &&
> +                                   !ipv6_chk_acast_addr_src(net, dev,
> +                                                            &sockaddr_in->sin6_addr))
> +                                       err = -EINVAL;
> +                               else
> +                                       fl6->saddr = sockaddr_in->sin6_addr;
> +                       }
> +
> +                       if (err)
> +                               goto exit_f;
> +
> +                       break;
> +                       }
>                 case IPV6_FLOWINFO:
>                         if (cmsg->cmsg_len < CMSG_LEN(4)) {
>                                 err = -EINVAL;
> diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> index 6602a2e9cdb5..6121cbb71ad3 100644
> --- a/net/ipv6/udp.c
> +++ b/net/ipv6/udp.c
> @@ -1476,6 +1476,12 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>
>         fl6->flowi6_uid = sk->sk_uid;
>
> +       /* We use fl6's daddr and fl6_sport in the reverse sk_lookup done
> +        * within ip6_datagram_send_ctl() now.
> +        */
> +       fl6->daddr = *daddr;
> +       fl6->fl6_sport = inet->inet_sport;
> +
>         if (msg->msg_controllen) {
>                 opt = &opt_space;
>                 memset(opt, 0, sizeof(struct ipv6_txoptions));
> @@ -1511,10 +1517,8 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
>
>         fl6->flowi6_proto = sk->sk_protocol;
>         fl6->flowi6_mark = ipc6.sockc.mark;
> -       fl6->daddr = *daddr;
>         if (ipv6_addr_any(&fl6->saddr) && !ipv6_addr_any(&np->saddr))
>                 fl6->saddr = np->saddr;
> -       fl6->fl6_sport = inet->inet_sport;
>
>         if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
>                 err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
>
> --
> 2.34.1
>
Tiago Lam Sept. 17, 2024, 4:03 p.m. UTC | #4
On Sat, Sep 14, 2024 at 01:40:25PM +0200, Eric Dumazet wrote:
> On Fri, Sep 13, 2024 at 11:39 AM Tiago Lam <tiagolam@cloudflare.com> wrote:
> >
> > This follows the same rationale provided for the ipv4 counterpart, where
> > it now runs a reverse socket lookup when source addresses and/or ports
> > are changed, on sendmsg, to check whether egress traffic should be
> > allowed to go through or not.
> >
> > As with ipv4, the ipv6 sendmsg path is also extended here to support the
> > IPV6_ORIGDSTADDR ancilliary message to be able to specify a source
> > address/port.
> >
> > Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
> > Signed-off-by: Tiago Lam <tiagolam@cloudflare.com>
> > ---
> >  net/ipv6/datagram.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  net/ipv6/udp.c      |  8 ++++--
> >  2 files changed, 82 insertions(+), 2 deletions(-)
> >
> > diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
> > index fff78496803d..4214dda1c320 100644
> > --- a/net/ipv6/datagram.c
> > +++ b/net/ipv6/datagram.c
> > @@ -756,6 +756,27 @@ void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
> >  }
> >  EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
> >
> > +static inline bool reverse_sk_lookup(struct flowi6 *fl6, struct sock *sk,
> > +                                    struct in6_addr *saddr, __be16 sport)
> > +{
> > +       if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
> > +           (saddr && sport) &&
> > +           (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, saddr) || inet_sk(sk)->inet_sport != sport)) {
> > +               struct sock *sk_egress;
> > +
> > +               bpf_sk_lookup_run_v6(sock_net(sk), IPPROTO_UDP, &fl6->daddr, fl6->fl6_dport,
> > +                                    saddr, ntohs(sport), 0, &sk_egress);
> > +               if (!IS_ERR_OR_NULL(sk_egress) &&
> > +                   atomic64_read(&sk_egress->sk_cookie) == atomic64_read(&sk->sk_cookie))
> 
> I do not understand this.
> 
> 1) sk_cookie is not always initialized. It is done on demand, when/if
> __sock_gen_cookie() was called.
> 
> 2) if sk1 and sk2 share the same sk_cookie, then sk1 == sk2 ???
> 
> So why not simply testing sk_egress == sk ?
> 

Oh, yes, you're right. I'll include this in my next revision, thanks.
Tiago Lam Sept. 17, 2024, 4:06 p.m. UTC | #5
On Sat, Sep 14, 2024 at 09:59:50AM +0100, Simon Horman wrote:
> On Fri, Sep 13, 2024 at 10:39:20AM +0100, Tiago Lam wrote:
> > This follows the same rationale provided for the ipv4 counterpart, where
> > it now runs a reverse socket lookup when source addresses and/or ports
> > are changed, on sendmsg, to check whether egress traffic should be
> > allowed to go through or not.
> > 
> > As with ipv4, the ipv6 sendmsg path is also extended here to support the
> > IPV6_ORIGDSTADDR ancilliary message to be able to specify a source
> 
> Hi Tiago Lam,
> 
> Some minor nits from my side.
> 
> ancilliary -> ancillary
> 
> Likewise in patch 3/3.
> Flagged by checkpatch.pl --codespell
> 
> > address/port.
> > 
> > Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
> > Signed-off-by: Tiago Lam <tiagolam@cloudflare.com>
> > ---
> >  net/ipv6/datagram.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  net/ipv6/udp.c      |  8 ++++--
> >  2 files changed, 82 insertions(+), 2 deletions(-)
> > 
> > diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
> > index fff78496803d..4214dda1c320 100644
> > --- a/net/ipv6/datagram.c
> > +++ b/net/ipv6/datagram.c
> > @@ -756,6 +756,27 @@ void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
> >  }
> >  EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
> >  
> > +static inline bool reverse_sk_lookup(struct flowi6 *fl6, struct sock *sk,
> > +				     struct in6_addr *saddr, __be16 sport)
> > +{
> > +	if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
> > +	    (saddr && sport) &&
> > +	    (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, saddr) || inet_sk(sk)->inet_sport != sport)) {
> 
> Please consider, where it can trivially be achieved, limiting Networking
> code to 80 columns wide.
> 
> Checkpatch can be run with a flag to check for this.
> 

Thanks for the hints here, I've addressed these and will include the
changes into the next revision. I use b4 which takes care of some of
this checks, but I'll make sure I change my settings to use
`--max-line-length=80` as well.

Tiago.
Tiago Lam Sept. 17, 2024, 4:15 p.m. UTC | #6
On Fri, Sep 13, 2024 at 11:24:09AM -0700, Martin KaFai Lau wrote:
> On 9/13/24 2:39 AM, Tiago Lam wrote:
> > This follows the same rationale provided for the ipv4 counterpart, where
> > it now runs a reverse socket lookup when source addresses and/or ports
> > are changed, on sendmsg, to check whether egress traffic should be
> > allowed to go through or not.
> > 
> > As with ipv4, the ipv6 sendmsg path is also extended here to support the
> > IPV6_ORIGDSTADDR ancilliary message to be able to specify a source
> > address/port.
> > 
> > Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
> > Signed-off-by: Tiago Lam <tiagolam@cloudflare.com>
> > ---
> >   net/ipv6/datagram.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> >   net/ipv6/udp.c      |  8 ++++--
> >   2 files changed, 82 insertions(+), 2 deletions(-)
> > 
> > diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
> > index fff78496803d..4214dda1c320 100644
> > --- a/net/ipv6/datagram.c
> > +++ b/net/ipv6/datagram.c
> > @@ -756,6 +756,27 @@ void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
> >   }
> >   EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
> > +static inline bool reverse_sk_lookup(struct flowi6 *fl6, struct sock *sk,
> > +				     struct in6_addr *saddr, __be16 sport)
> > +{
> > +	if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
> > +	    (saddr && sport) &&
> > +	    (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, saddr) || inet_sk(sk)->inet_sport != sport)) {
> > +		struct sock *sk_egress;
> > +
> > +		bpf_sk_lookup_run_v6(sock_net(sk), IPPROTO_UDP, &fl6->daddr, fl6->fl6_dport,
> > +				     saddr, ntohs(sport), 0, &sk_egress);
> 
> iirc, in the ingress path, the sk could also be selected by a tc bpf prog
> doing bpf_sk_assign. Then this re-run on sk_lookup may give an incorrect
> result?
> 

If it does give the incorrect result, we still fallback to the normal
egress path.

> In general, is it necessary to rerun any bpf prog if the user space has
> specified the IP[v6]_ORIGDSTADDR.
> 

More generally, wouldn't that also be the case if someone calls
bpf_sk_assign() in both TC and sk_lookup on ingress? It can lead to some
interference between the two.

It seems like the interesting cases are:
1. Calling bpf_sk_assign() on both TC and sk_lookup ingress: if this
happens sk_lookup on egress should match the correct socket when doing
the reverse lookup;
2. Calling bpf_sk_assign() only on ingress TC: in this case it will
depend if an sk_lookup program is attached or not:
  a. If not, there's no reverse lookup on egress either;
  b. But if yes, although the reverse sk_lookup here won't match the
  initial socket assigned at ingress TC, the packets will still fallback
  to the normal egress path;

You're right in that case 2b above will continue with the same
restrictions as before.

Tiago.
Martin KaFai Lau Sept. 24, 2024, 11:58 p.m. UTC | #7
On 9/17/24 6:15 PM, Tiago Lam wrote:
> On Fri, Sep 13, 2024 at 11:24:09AM -0700, Martin KaFai Lau wrote:
>> On 9/13/24 2:39 AM, Tiago Lam wrote:
>>> This follows the same rationale provided for the ipv4 counterpart, where
>>> it now runs a reverse socket lookup when source addresses and/or ports
>>> are changed, on sendmsg, to check whether egress traffic should be
>>> allowed to go through or not.
>>>
>>> As with ipv4, the ipv6 sendmsg path is also extended here to support the
>>> IPV6_ORIGDSTADDR ancilliary message to be able to specify a source
>>> address/port.
>>>
>>> Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
>>> Signed-off-by: Tiago Lam <tiagolam@cloudflare.com>
>>> ---
>>>    net/ipv6/datagram.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>>>    net/ipv6/udp.c      |  8 ++++--
>>>    2 files changed, 82 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
>>> index fff78496803d..4214dda1c320 100644
>>> --- a/net/ipv6/datagram.c
>>> +++ b/net/ipv6/datagram.c
>>> @@ -756,6 +756,27 @@ void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
>>>    }
>>>    EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
>>> +static inline bool reverse_sk_lookup(struct flowi6 *fl6, struct sock *sk,
>>> +				     struct in6_addr *saddr, __be16 sport)
>>> +{
>>> +	if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
>>> +	    (saddr && sport) &&
>>> +	    (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, saddr) || inet_sk(sk)->inet_sport != sport)) {
>>> +		struct sock *sk_egress;
>>> +
>>> +		bpf_sk_lookup_run_v6(sock_net(sk), IPPROTO_UDP, &fl6->daddr, fl6->fl6_dport,
>>> +				     saddr, ntohs(sport), 0, &sk_egress);
>>
>> iirc, in the ingress path, the sk could also be selected by a tc bpf prog
>> doing bpf_sk_assign. Then this re-run on sk_lookup may give an incorrect
>> result?
>>
> 
> If it does give the incorrect result, we still fallback to the normal
> egress path.
> 
>> In general, is it necessary to rerun any bpf prog if the user space has
>> specified the IP[v6]_ORIGDSTADDR.
>>
> 
> More generally, wouldn't that also be the case if someone calls
> bpf_sk_assign() in both TC and sk_lookup on ingress? It can lead to some
> interference between the two.
> 
> It seems like the interesting cases are:
> 1. Calling bpf_sk_assign() on both TC and sk_lookup ingress: if this
> happens sk_lookup on egress should match the correct socket when doing
> the reverse lookup;
> 2. Calling bpf_sk_assign() only on ingress TC: in this case it will
> depend if an sk_lookup program is attached or not:
>    a. If not, there's no reverse lookup on egress either;
>    b. But if yes, although the reverse sk_lookup here won't match the
>    initial socket assigned at ingress TC, the packets will still fallback
>    to the normal egress path;
> 
> You're right in that case 2b above will continue with the same
> restrictions as before.

imo, all these cases you described above is a good signal that neither the TC 
nor the BPF_PROG_TYPE_SK_LOOKUP program type is the right bpf prog to run here 
_if_ a bpf prog was indeed useful here.

I only followed some of the other discussion in v1 and v2. For now, I still 
don't see running a bpf prog is useful here to process the IP[V6]_ORIGDSTADDR. 
Jakub Sitnicki and I had discussed a similar point during the LPC.

If a bpf prog was indeed needed to process a cmsg, this should work closer to 
what Jakub Sitnicki had proposed for getting the meta data during LPC (but I 
believe the verdict there is also that a bpf prog is not needed). It should be a 
bpf prog that can work in a more generic way to process any BPF specific cmsg 
and can do other operations in the future using kfunc (e.g. route lookup or 
something). Saying yes/no to a particular local IP and port could be one of 
things that the bpf prog can do when processing the cmsg.
Tiago Lam Oct. 11, 2024, 11:21 a.m. UTC | #8
On Tue, Sep 24, 2024 at 04:58:19PM -0700, Martin KaFai Lau wrote:
> On 9/17/24 6:15 PM, Tiago Lam wrote:
> > On Fri, Sep 13, 2024 at 11:24:09AM -0700, Martin KaFai Lau wrote:
> > > On 9/13/24 2:39 AM, Tiago Lam wrote:
> > > > This follows the same rationale provided for the ipv4 counterpart, where
> > > > it now runs a reverse socket lookup when source addresses and/or ports
> > > > are changed, on sendmsg, to check whether egress traffic should be
> > > > allowed to go through or not.
> > > > 
> > > > As with ipv4, the ipv6 sendmsg path is also extended here to support the
> > > > IPV6_ORIGDSTADDR ancilliary message to be able to specify a source
> > > > address/port.
> > > > 
> > > > Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
> > > > Signed-off-by: Tiago Lam <tiagolam@cloudflare.com>
> > > > ---
> > > >    net/ipv6/datagram.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> > > >    net/ipv6/udp.c      |  8 ++++--
> > > >    2 files changed, 82 insertions(+), 2 deletions(-)
> > > > 
> > > > diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
> > > > index fff78496803d..4214dda1c320 100644
> > > > --- a/net/ipv6/datagram.c
> > > > +++ b/net/ipv6/datagram.c
> > > > @@ -756,6 +756,27 @@ void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
> > > >    }
> > > >    EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
> > > > +static inline bool reverse_sk_lookup(struct flowi6 *fl6, struct sock *sk,
> > > > +				     struct in6_addr *saddr, __be16 sport)
> > > > +{
> > > > +	if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
> > > > +	    (saddr && sport) &&
> > > > +	    (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, saddr) || inet_sk(sk)->inet_sport != sport)) {
> > > > +		struct sock *sk_egress;
> > > > +
> > > > +		bpf_sk_lookup_run_v6(sock_net(sk), IPPROTO_UDP, &fl6->daddr, fl6->fl6_dport,
> > > > +				     saddr, ntohs(sport), 0, &sk_egress);
> > > 
> > > iirc, in the ingress path, the sk could also be selected by a tc bpf prog
> > > doing bpf_sk_assign. Then this re-run on sk_lookup may give an incorrect
> > > result?
> > > 
> > 
> > If it does give the incorrect result, we still fallback to the normal
> > egress path.
> > 
> > > In general, is it necessary to rerun any bpf prog if the user space has
> > > specified the IP[v6]_ORIGDSTADDR.
> > > 
> > 
> > More generally, wouldn't that also be the case if someone calls
> > bpf_sk_assign() in both TC and sk_lookup on ingress? It can lead to some
> > interference between the two.
> > 
> > It seems like the interesting cases are:
> > 1. Calling bpf_sk_assign() on both TC and sk_lookup ingress: if this
> > happens sk_lookup on egress should match the correct socket when doing
> > the reverse lookup;
> > 2. Calling bpf_sk_assign() only on ingress TC: in this case it will
> > depend if an sk_lookup program is attached or not:
> >    a. If not, there's no reverse lookup on egress either;
> >    b. But if yes, although the reverse sk_lookup here won't match the
> >    initial socket assigned at ingress TC, the packets will still fallback
> >    to the normal egress path;
> > 
> > You're right in that case 2b above will continue with the same
> > restrictions as before.
> 
> imo, all these cases you described above is a good signal that neither the
> TC nor the BPF_PROG_TYPE_SK_LOOKUP program type is the right bpf prog to run
> here _if_ a bpf prog was indeed useful here.
> 
> I only followed some of the other discussion in v1 and v2. For now, I still
> don't see running a bpf prog is useful here to process the
> IP[V6]_ORIGDSTADDR. Jakub Sitnicki and I had discussed a similar point
> during the LPC.
> 
> If a bpf prog was indeed needed to process a cmsg, this should work closer
> to what Jakub Sitnicki had proposed for getting the meta data during LPC
> (but I believe the verdict there is also that a bpf prog is not needed). It
> should be a bpf prog that can work in a more generic way to process any BPF
> specific cmsg and can do other operations in the future using kfunc (e.g.
> route lookup or something). Saying yes/no to a particular local IP and port
> could be one of things that the bpf prog can do when processing the cmsg.

Thanks for the feeback here, Martin. And apologies for the delay in
respoding to this.

I think you do have a point, and after syncing up some more with Jakub
about your discussion during the LPC, the argument that applications can
already bind to a specific address + port to send their traffic from
makes sense to me. However, I think we could introduce a new cmsg in
sendmsg to allow apps to set the source port to egress from, extending
what they can already do with IP_PKTINFO, i.e. setting the source IP.
We'd need to take care with priviledged and reserved ports, but this
would avoid applications having to do an extra bind. Do you have any
thoughts on this?

Tiago.
diff mbox series

Patch

diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index fff78496803d..4214dda1c320 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -756,6 +756,27 @@  void ip6_datagram_recv_ctl(struct sock *sk, struct msghdr *msg,
 }
 EXPORT_SYMBOL_GPL(ip6_datagram_recv_ctl);
 
+static inline bool reverse_sk_lookup(struct flowi6 *fl6, struct sock *sk,
+				     struct in6_addr *saddr, __be16 sport)
+{
+	if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
+	    (saddr && sport) &&
+	    (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, saddr) || inet_sk(sk)->inet_sport != sport)) {
+		struct sock *sk_egress;
+
+		bpf_sk_lookup_run_v6(sock_net(sk), IPPROTO_UDP, &fl6->daddr, fl6->fl6_dport,
+				     saddr, ntohs(sport), 0, &sk_egress);
+		if (!IS_ERR_OR_NULL(sk_egress) &&
+		    atomic64_read(&sk_egress->sk_cookie) == atomic64_read(&sk->sk_cookie))
+			return true;
+
+		net_info_ratelimited("No reverse socket lookup match for local addr %pI6:%d remote addr %pI6:%d\n",
+				     &saddr, ntohs(sport), &fl6->daddr, ntohs(fl6->fl6_dport));
+	}
+
+	return false;
+}
+
 int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
 			  struct msghdr *msg, struct flowi6 *fl6,
 			  struct ipcm6_cookie *ipc6)
@@ -844,7 +865,62 @@  int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
 
 			break;
 		    }
+		case IPV6_ORIGDSTADDR:
+			{
+			struct sockaddr_in6 *sockaddr_in;
+			struct net_device *dev = NULL;
+
+			if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct sockaddr_in6))) {
+				err = -EINVAL;
+				goto exit_f;
+			}
+
+			sockaddr_in = (struct sockaddr_in6 *)CMSG_DATA(cmsg);
+
+			addr_type = __ipv6_addr_type(&sockaddr_in->sin6_addr);
+
+			if (addr_type & IPV6_ADDR_LINKLOCAL)
+				return -EINVAL;
+
+			/* If we're egressing with a different source address and/or port, we
+			 * perform a reverse socket lookup.  The rationale behind this is that we
+			 * can allow return UDP traffic that has ingressed through sk_lookup to
+			 * also egress correctly. In case the reverse lookup fails, we
+			 * continue with the normal path.
+			 *
+			 * The lookup is performed if either source address and/or port changed, and
+			 * neither is "0".
+			 */
+			if (reverse_sk_lookup(fl6, sk, &sockaddr_in->sin6_addr,
+					      sockaddr_in->sin6_port)) {
+				/* Override the source port and address to use with the one we
+				 * got in cmsg and bail early.
+				 */
+				fl6->saddr = sockaddr_in->sin6_addr;
+				fl6->fl6_sport = sockaddr_in->sin6_port;
+				break;
+			}
 
+			if (addr_type != IPV6_ADDR_ANY) {
+				int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
+
+				if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) &&
+				    !ipv6_chk_addr_and_flags(net,
+							     &sockaddr_in->sin6_addr,
+							     dev, !strict, 0,
+							     IFA_F_TENTATIVE) &&
+				    !ipv6_chk_acast_addr_src(net, dev,
+							     &sockaddr_in->sin6_addr))
+					err = -EINVAL;
+				else
+					fl6->saddr = sockaddr_in->sin6_addr;
+			}
+
+			if (err)
+				goto exit_f;
+
+			break;
+			}
 		case IPV6_FLOWINFO:
 			if (cmsg->cmsg_len < CMSG_LEN(4)) {
 				err = -EINVAL;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 6602a2e9cdb5..6121cbb71ad3 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1476,6 +1476,12 @@  int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	fl6->flowi6_uid = sk->sk_uid;
 
+	/* We use fl6's daddr and fl6_sport in the reverse sk_lookup done
+	 * within ip6_datagram_send_ctl() now.
+	 */
+	fl6->daddr = *daddr;
+	fl6->fl6_sport = inet->inet_sport;
+
 	if (msg->msg_controllen) {
 		opt = &opt_space;
 		memset(opt, 0, sizeof(struct ipv6_txoptions));
@@ -1511,10 +1517,8 @@  int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	fl6->flowi6_proto = sk->sk_protocol;
 	fl6->flowi6_mark = ipc6.sockc.mark;
-	fl6->daddr = *daddr;
 	if (ipv6_addr_any(&fl6->saddr) && !ipv6_addr_any(&np->saddr))
 		fl6->saddr = np->saddr;
-	fl6->fl6_sport = inet->inet_sport;
 
 	if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
 		err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,