diff mbox series

[RFC,v2,1/3] ipv4: Support setting src port in sendmsg().

Message ID 20240920-reverse-sk-lookup-v2-1-916a48c47d56@cloudflare.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series Allow sk_lookup UDP return traffic to egress when setting src port/address. | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Guessed tree name to be net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 25 this patch: 25
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers success CCed 7 of 7 maintainers
netdev/build_clang success Errors and warnings before: 37 this patch: 37
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1757 this patch: 1757
netdev/checkpatch warning WARNING: line length of 83 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 3 this patch: 3
netdev/source_inline success Was 0 now: 0

Commit Message

Tiago Lam Sept. 20, 2024, 5:02 p.m. UTC
sendmsg() doesn't currently allow users to set the src port from which
egress traffic should be sent. This is already possible for the src
address: with the IP_PKTINFO ancillary message, a user is currently able
to specify a source address to egress from when calling sendmsg().
However, this still requires the user to set the IP_TRANSPARENT flag
using setsockopt(), which happens to require special privileges in the
case of IPv4.

To support users setting the src port for egress traffic when using
sendmsg(), this patch extends the ancillary messages supported by
sendmsg() to support the IP_ORIGDSTADDR ancillary message, reusing the
same cmsg and struct used in recvmsg() - which already supports
specifying a port.

Additionally, to avoid having to have special configurations, such as
IP_TRANSPARENT, this patch allows egress traffic that's been configured
using (the newly added) IP_ORIGDSTADDR to proceed if there's an ingress
socket lookup (sk_lookup) that matches that traffic - by performing a
reverse sk_lookup. Thus, if the reverse sk_lookup call returns a socket
that matches the egress socket, we also let the egress traffic through -
following the principle of allowing return traffic to proceed if
ingress traffic is allowed in. In case no match is found in the reverse
sk_lookup, traffic falls back to the regular egress path.

This reverse lookup is only performed in case an sk_lookup ebpf program
is attached and the source address and/or port for the return traffic
have been modified using the (newly added) IP_ORIGDSTADDR in sendmsg.

Suggested-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Tiago Lam <tiagolam@cloudflare.com>
---
 include/net/ip.h       |  1 +
 net/ipv4/ip_sockglue.c | 11 +++++++++++
 net/ipv4/udp.c         | 35 ++++++++++++++++++++++++++++++++++-
 3 files changed, 46 insertions(+), 1 deletion(-)
diff mbox series

Patch

diff --git a/include/net/ip.h b/include/net/ip.h
index c5606cadb1a5..e5753abd7247 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -75,6 +75,7 @@  static inline unsigned int ip_hdrlen(const struct sk_buff *skb)
 struct ipcm_cookie {
 	struct sockcm_cookie	sockc;
 	__be32			addr;
+	__be16			port;
 	int			oif;
 	struct ip_options_rcu	*opt;
 	__u8			protocol;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index cf377377b52d..6e55bd25b5f7 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -297,6 +297,17 @@  int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
 			ipc->addr = info->ipi_spec_dst.s_addr;
 			break;
 		}
+		case IP_ORIGDSTADDR:
+		{
+			struct sockaddr_in *dst_addr;
+
+			if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct sockaddr_in)))
+				return -EINVAL;
+			dst_addr = (struct sockaddr_in *)CMSG_DATA(cmsg);
+			ipc->port = dst_addr->sin_port;
+			ipc->addr = dst_addr->sin_addr.s_addr;
+			break;
+		}
 		case IP_TTL:
 			if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
 				return -EINVAL;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 49c622e743e8..208cee40c0ec 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1060,6 +1060,7 @@  int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 	DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
 	struct flowi4 fl4_stack;
 	struct flowi4 *fl4;
+	__u8 flow_flags = inet_sk_flowi_flags(sk);
 	int ulen = len;
 	struct ipcm_cookie ipc;
 	struct rtable *rt = NULL;
@@ -1179,6 +1180,39 @@  int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 		}
 	}
 
+	/* If we're egressing with a different source address and/or port, we
+	 * perform a reverse socket lookup.  The rationale behind this is that
+	 * we can allow return UDP traffic that has ingressed through sk_lookup
+	 * to also egress correctly. Otherwise the regular egress path is used.
+	 *
+	 * The lookup is performed if either source address and/or port
+	 * changed, and neither is "0".
+	 */
+	if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
+	    !connected &&
+	    (ipc.port && ipc.addr) &&
+	    (inet->inet_saddr != ipc.addr || inet->inet_sport != ipc.port)) {
+		struct sock *sk_egress;
+
+		bpf_sk_lookup_run_v4(sock_net(sk), IPPROTO_UDP,
+				     daddr, dport, ipc.addr, ntohs(ipc.port),
+				     1, &sk_egress);
+		if (IS_ERR_OR_NULL(sk_egress) || sk_egress != sk) {
+			net_info_ratelimited("No reverse socket lookup match for local addr %pI4:%d remote addr %pI4:%d\n",
+					     &ipc.addr, ntohs(ipc.port), &daddr,
+					     ntohs(dport));
+		} else {
+			/* Override the source port to use with the one we got
+			 * in cmsg, and tell routing to let us use a non-local
+			 * address. Otherwise route lookups will fail with
+			 * non-local source address when IP_TRANSPARENT isn't
+			 * set.
+			 */
+			inet->inet_sport = ipc.port;
+			flow_flags |= FLOWI_FLAG_ANYSRC;
+		}
+	}
+
 	saddr = ipc.addr;
 	ipc.addr = faddr = daddr;
 
@@ -1223,7 +1257,6 @@  int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
 
 	if (!rt) {
 		struct net *net = sock_net(sk);
-		__u8 flow_flags = inet_sk_flowi_flags(sk);
 
 		fl4 = &fl4_stack;