diff mbox series

[bpf,v8,08/13] bpf: sockmap, incorrectly handling copied_seq

Message ID 20230517052244.294755-9-john.fastabend@gmail.com (mailing list archive)
State Superseded
Delegated to: BPF
Headers show
Series bpf sockmap fixes | expand

Checks

Context Check Description
bpf/vmtest-bpf-PR fail merge-conflict
netdev/tree_selection success Clearly marked for bpf, async
netdev/apply fail Patch does not apply to bpf
bpf/vmtest-bpf-VM_Test-1 success Logs for ${{ matrix.test }} on ${{ matrix.arch }} with ${{ matrix.toolchain_full }}
bpf/vmtest-bpf-VM_Test-2 success Logs for ShellCheck
bpf/vmtest-bpf-VM_Test-3 success Logs for ShellCheck
bpf/vmtest-bpf-VM_Test-4 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-5 success Logs for build for aarch64 with gcc
bpf/vmtest-bpf-VM_Test-6 fail Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-VM_Test-7 success Logs for build for aarch64 with llvm-16
bpf/vmtest-bpf-VM_Test-8 success Logs for build for s390x with gcc
bpf/vmtest-bpf-VM_Test-9 success Logs for build for s390x with gcc
bpf/vmtest-bpf-VM_Test-10 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-11 success Logs for build for x86_64 with gcc
bpf/vmtest-bpf-VM_Test-12 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-13 success Logs for build for x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-14 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-15 success Logs for set-matrix
bpf/vmtest-bpf-VM_Test-16 success Logs for test_maps on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-17 success Logs for test_maps on aarch64 with llvm-16
bpf/vmtest-bpf-VM_Test-18 success Logs for test_maps on s390x with gcc
bpf/vmtest-bpf-VM_Test-19 success Logs for test_maps on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-20 success Logs for test_maps on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-21 fail Logs for test_progs on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-22 fail Logs for test_progs on aarch64 with llvm-16
bpf/vmtest-bpf-VM_Test-23 fail Logs for test_progs on s390x with gcc
bpf/vmtest-bpf-VM_Test-24 fail Logs for test_progs on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-25 fail Logs for test_progs on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-26 fail Logs for test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-27 fail Logs for test_progs_no_alu32 on aarch64 with llvm-16
bpf/vmtest-bpf-VM_Test-28 fail Logs for test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-VM_Test-29 fail Logs for test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-30 fail Logs for test_progs_no_alu32 on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-31 success Logs for test_progs_no_alu32_parallel on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-32 success Logs for test_progs_no_alu32_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-VM_Test-33 success Logs for test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-34 success Logs for test_progs_no_alu32_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-35 success Logs for test_progs_parallel on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-36 success Logs for test_progs_parallel on aarch64 with llvm-16
bpf/vmtest-bpf-VM_Test-37 success Logs for test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-38 success Logs for test_progs_parallel on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-39 success Logs for test_verifier on aarch64 with gcc
bpf/vmtest-bpf-VM_Test-40 success Logs for test_verifier on aarch64 with llvm-16
bpf/vmtest-bpf-VM_Test-41 success Logs for test_verifier on s390x with gcc
bpf/vmtest-bpf-VM_Test-42 success Logs for test_verifier on x86_64 with gcc
bpf/vmtest-bpf-VM_Test-43 success Logs for test_verifier on x86_64 with llvm-16
bpf/vmtest-bpf-VM_Test-44 success Logs for veristat
bpf/vmtest-bpf-VM_Test-45 success Logs for veristat

Commit Message

John Fastabend May 17, 2023, 5:22 a.m. UTC
The read_skb() logic is incrementing the tcp->copied_seq which is used for
among other things calculating how many outstanding bytes can be read by
the application. This results in application errors, if the application
does an ioctl(FIONREAD) we return zero because this is calculated from
the copied_seq value.

To fix this we move tcp->copied_seq accounting into the recv handler so
that we update these when the recvmsg() hook is called and data is in
fact copied into user buffers. This gives an accurate FIONREAD value
as expected and improves ACK handling. Before we were calling the
tcp_rcv_space_adjust() which would update 'number of bytes copied to
user in last RTT' which is wrong for programs returning SK_PASS. The
bytes are only copied to the user when recvmsg is handled.

Doing the fix for recvmsg is straightforward, but fixing redirect and
SK_DROP pkts is a bit tricker. Build a tcp_psock_eat() helper and then
call this from skmsg handlers. This fixes another issue where a broken
socket with a BPF program doing a resubmit could hang the receiver. This
happened because although read_skb() consumed the skb through sock_drop()
it did not update the copied_seq. Now if a single reccv socket is
redirecting to many sockets (for example for lb) the receiver sk will be
hung even though we might expect it to continue. The hang comes from
not updating the copied_seq numbers and memory pressure resulting from
that.

We have a slight layer problem of calling tcp_eat_skb even if its not
a TCP socket. To fix we could refactor and create per type receiver
handlers. I decided this is more work than we want in the fix and we
already have some small tweaks depending on caller that use the
helper skb_bpf_strparser(). So we extend that a bit and always set
the strparser bit when it is in use and then we can gate the
seq_copied updates on this.

Fixes: 04919bed948dc ("tcp: Introduce tcp_read_skb()")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
---
 include/net/tcp.h  | 10 ++++++++++
 net/core/skmsg.c   | 15 +++++++--------
 net/ipv4/tcp.c     | 10 +---------
 net/ipv4/tcp_bpf.c | 28 +++++++++++++++++++++++++++-
 4 files changed, 45 insertions(+), 18 deletions(-)
diff mbox series

Patch

diff --git a/include/net/tcp.h b/include/net/tcp.h
index db9f828e9d1e..76bf0a11bdc7 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1467,6 +1467,8 @@  static inline void tcp_adjust_rcv_ssthresh(struct sock *sk)
 }
 
 void tcp_cleanup_rbuf(struct sock *sk, int copied);
+void __tcp_cleanup_rbuf(struct sock *sk, int copied);
+
 
 /* We provision sk_rcvbuf around 200% of sk_rcvlowat.
  * If 87.5 % (7/8) of the space has been consumed, we want to override
@@ -2323,6 +2325,14 @@  int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
 void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
 #endif /* CONFIG_BPF_SYSCALL */
 
+#ifdef CONFIG_INET
+void tcp_eat_skb(struct sock *sk, struct sk_buff *skb);
+#else
+static inline void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
+{
+}
+#endif
+
 int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress,
 			  struct sk_msg *msg, u32 bytes, int flags);
 #endif /* CONFIG_NET_SOCK_MSG */
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 08be5f409fb8..a9060e1f0e43 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -979,10 +979,8 @@  static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
 		err = -EIO;
 		sk_other = psock->sk;
 		if (sock_flag(sk_other, SOCK_DEAD) ||
-		    !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
-			skb_bpf_redirect_clear(skb);
+		    !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
 			goto out_free;
-		}
 
 		skb_bpf_set_ingress(skb);
 
@@ -1011,18 +1009,19 @@  static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
 				err = 0;
 			}
 			spin_unlock_bh(&psock->ingress_lock);
-			if (err < 0) {
-				skb_bpf_redirect_clear(skb);
+			if (err < 0)
 				goto out_free;
-			}
 		}
 		break;
 	case __SK_REDIRECT:
+		tcp_eat_skb(psock->sk, skb);
 		err = sk_psock_skb_redirect(psock, skb);
 		break;
 	case __SK_DROP:
 	default:
 out_free:
+		skb_bpf_redirect_clear(skb);
+		tcp_eat_skb(psock->sk, skb);
 		sock_drop(psock->sk, skb);
 	}
 
@@ -1067,8 +1066,7 @@  static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb)
 		skb_dst_drop(skb);
 		skb_bpf_redirect_clear(skb);
 		ret = bpf_prog_run_pin_on_cpu(prog, skb);
-		if (ret == SK_PASS)
-			skb_bpf_set_strparser(skb);
+		skb_bpf_set_strparser(skb);
 		ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb));
 		skb->sk = NULL;
 	}
@@ -1176,6 +1174,7 @@  static int sk_psock_verdict_recv(struct sock *sk, struct sk_buff *skb)
 	psock = sk_psock(sk);
 	if (unlikely(!psock)) {
 		len = 0;
+		tcp_eat_skb(sk, skb);
 		sock_drop(sk, skb);
 		goto out;
 	}
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 1be305e3d3c7..5610f8341b38 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1568,7 +1568,7 @@  static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
  * calculation of whether or not we must ACK for the sake of
  * a window update.
  */
-static void __tcp_cleanup_rbuf(struct sock *sk, int copied)
+void __tcp_cleanup_rbuf(struct sock *sk, int copied)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	bool time_to_ack = false;
@@ -1783,14 +1783,6 @@  int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
 			break;
 		}
 	}
-	WRITE_ONCE(tp->copied_seq, seq);
-
-	tcp_rcv_space_adjust(sk);
-
-	/* Clean up data we have read: This will do ACK frames. */
-	if (copied > 0)
-		__tcp_cleanup_rbuf(sk, copied);
-
 	return copied;
 }
 EXPORT_SYMBOL(tcp_read_skb);
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 404857ab14cc..0f8b09553dc1 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -11,6 +11,24 @@ 
 #include <net/inet_common.h>
 #include <net/tls.h>
 
+void tcp_eat_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tcp;
+	int copied;
+
+	if (!skb || !skb->len || !sk_is_tcp(sk))
+		return;
+
+	if (skb_bpf_strparser(skb))
+		return;
+
+	tcp = tcp_sk(sk);
+	copied = tcp->copied_seq + skb->len;
+	WRITE_ONCE(tcp->copied_seq, copied);
+	tcp_rcv_space_adjust(sk);
+	__tcp_cleanup_rbuf(sk, skb->len);
+}
+
 static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
 			   struct sk_msg *msg, u32 apply_bytes, int flags)
 {
@@ -198,8 +216,10 @@  static int tcp_bpf_recvmsg_parser(struct sock *sk,
 				  int flags,
 				  int *addr_len)
 {
+	struct tcp_sock *tcp = tcp_sk(sk);
+	u32 seq = tcp->copied_seq;
 	struct sk_psock *psock;
-	int copied;
+	int copied = 0;
 
 	if (unlikely(flags & MSG_ERRQUEUE))
 		return inet_recv_error(sk, msg, len, addr_len);
@@ -244,9 +264,11 @@  static int tcp_bpf_recvmsg_parser(struct sock *sk,
 
 		if (is_fin) {
 			copied = 0;
+			seq++;
 			goto out;
 		}
 	}
+	seq += copied;
 	if (!copied) {
 		long timeo;
 		int data;
@@ -284,6 +306,10 @@  static int tcp_bpf_recvmsg_parser(struct sock *sk,
 		copied = -EAGAIN;
 	}
 out:
+	WRITE_ONCE(tcp->copied_seq, seq);
+	tcp_rcv_space_adjust(sk);
+	if (copied > 0)
+		__tcp_cleanup_rbuf(sk, copied);
 	release_sock(sk);
 	sk_psock_put(sk, psock);
 	return copied;