diff mbox series

[RFC,bpf-next,2/3] bpf: tcp: Avoid socket skips during iteration

Message ID 20250313233615.2329869-3-jrife@google.com (mailing list archive)
State RFC
Delegated to: BPF
Headers show
Series Avoid skipping sockets with socket iterators | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-8 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-10 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-11 success Logs for aarch64-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 pending Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for aarch64-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / GCC BPF
bpf/vmtest-bpf-next-VM_Test-17 pending Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-18 pending Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-16 pending Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for s390x-gcc / veristat-meta
bpf/vmtest-bpf-next-VM_Test-19 success Logs for s390x-gcc / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-21 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-25 pending Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-26 pending Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-27 pending Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 pending Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-gcc / veristat-kernel / x86_64-gcc veristat_kernel
bpf/vmtest-bpf-next-VM_Test-30 pending Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-29 pending Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 pending Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-37 pending Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 pending Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-gcc / veristat-meta / x86_64-gcc veristat_meta
bpf/vmtest-bpf-next-VM_Test-39 pending Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-17 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-42 success Logs for x86_64-llvm-18 / GCC BPF / GCC BPF
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-17 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-43 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-44 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-46 pending Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-47 pending Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-48 pending Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-51 success Logs for x86_64-llvm-18 / veristat-meta
bpf/vmtest-bpf-next-VM_Test-50 success Logs for x86_64-llvm-18 / veristat-kernel
bpf/vmtest-bpf-next-VM_Test-45 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-49 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for bpf-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 9 this patch: 9
netdev/build_tools success Errors and warnings before: 26 (+0) this patch: 26 (+0)
netdev/cc_maintainers warning 7 maintainers not CCed: edumazet@google.com pabeni@redhat.com kuniyu@amazon.com dsahern@kernel.org ncardwell@google.com horms@kernel.org kuba@kernel.org
netdev/build_clang success Errors and warnings before: 12 this patch: 12
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 918 this patch: 918
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 182 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 2 this patch: 2
netdev/source_inline fail Was 0 now: 1

Commit Message

Jordan Rife March 13, 2025, 11:35 p.m. UTC
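Replace the offset-based approach for tracking progress through a bucket
in the TCP table with one based on unique index numbers assigned to each
socket when it is inserted into a bucket. Indexes are drawn from a
per-hashinfo atomic counter and are negated for head insertions, so they
increase monotonically from the head of a bucket to its tail. An iterator
records the index of the last socket it visited and, on resume, skips
sockets whose index is less than or equal to that value. Unlike an
offset, this position stays meaningful when sockets are added to or
removed from the bucket between reads, which avoids skipping sockets.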

Signed-off-by: Jordan Rife <jrife@google.com>
---
 include/net/inet_hashtables.h |  2 ++
 include/net/tcp.h             |  3 ++-
 net/ipv4/inet_hashtables.c    | 18 +++++++++++++++---
 net/ipv4/tcp.c                |  1 +
 net/ipv4/tcp_ipv4.c           | 29 ++++++++++++++++-------------
 5 files changed, 36 insertions(+), 17 deletions(-)
diff mbox series

Patch

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 5eea47f135a4..c95d3b1da199 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -172,6 +172,8 @@  struct inet_hashinfo {
 	struct inet_listen_hashbucket	*lhash2;
 
 	bool				pernet;
+
+	atomic64_t			ver;
 } ____cacheline_aligned_in_smp;
 
 static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2d08473a6dc0..499acd6da35f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2202,7 +2202,8 @@  struct tcp_iter_state {
 	struct seq_net_private	p;
 	enum tcp_seq_states	state;
 	struct sock		*syn_wait_sk;
-	int			bucket, offset, sbucket, num;
+	int			bucket, sbucket, num;
+	__s64			prev_idx;
 	loff_t			last_pos;
 };
 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 9bfcfd016e18..bc9f58172790 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -534,6 +534,12 @@  struct sock *__inet_lookup_established(const struct net *net,
 }
 EXPORT_SYMBOL_GPL(__inet_lookup_established);
 
+/* Bump hinfo->ver and return the new value, negated unless @pos is set. */
+static __s64 inet_hashinfo_next_idx(struct inet_hashinfo *hinfo, bool pos)
+{
+	return (pos ? 1 : -1) * atomic64_inc_return(&hinfo->ver);
+}
+
 /* called with local bh disabled */
 static int __inet_check_established(struct inet_timewait_death_row *death_row,
 				    struct sock *sk, __u16 lport,
@@ -581,6 +587,7 @@  static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	sk->sk_hash = hash;
 	WARN_ON(!sk_unhashed(sk));
 	__sk_nulls_add_node_rcu(sk, &head->chain);
+	sk->sk_idx = inet_hashinfo_next_idx(hinfo, false);
 	if (tw) {
 		sk_nulls_del_node_init_rcu((struct sock *)tw);
 		__NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
@@ -678,8 +685,10 @@  bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
 			ret = false;
 	}
 
-	if (ret)
+	if (ret) {
 		__sk_nulls_add_node_rcu(sk, list);
+		sk->sk_idx = inet_hashinfo_next_idx(hashinfo, false);
+	}
 
 	spin_unlock(lock);
 
@@ -729,6 +738,7 @@  int __inet_hash(struct sock *sk, struct sock *osk)
 {
 	struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
 	struct inet_listen_hashbucket *ilb2;
+	bool add_tail;
 	int err = 0;
 
 	if (sk->sk_state != TCP_LISTEN) {
@@ -747,11 +757,13 @@  int __inet_hash(struct sock *sk, struct sock *osk)
 			goto unlock;
 	}
 	sock_set_flag(sk, SOCK_RCU_FREE);
-	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
-		sk->sk_family == AF_INET6)
+	add_tail = IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
+		   sk->sk_family == AF_INET6;
+	if (add_tail)
 		__sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
 	else
 		__sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
+	sk->sk_idx = inet_hashinfo_next_idx(hashinfo, add_tail);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 unlock:
 	spin_unlock(&ilb2->lock);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 285678d8ce07..63693af0c05c 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -5147,6 +5147,7 @@  void __init tcp_init(void)
 
 	cnt = tcp_hashinfo.ehash_mask + 1;
 	sysctl_tcp_max_orphans = cnt / 2;
+	atomic64_set(&tcp_hashinfo.ver, 0);
 
 	tcp_init_mem();
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2632844d2c35..d0ddb307e2a1 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2602,7 +2602,7 @@  static void *listening_get_first(struct seq_file *seq)
 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct tcp_iter_state *st = seq->private;
 
-	st->offset = 0;
+	st->prev_idx = 0;
 	for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
 		struct inet_listen_hashbucket *ilb2;
 		struct hlist_nulls_node *node;
@@ -2637,7 +2637,7 @@  static void *listening_get_next(struct seq_file *seq, void *cur)
 	struct sock *sk = cur;
 
 	++st->num;
-	++st->offset;
+	st->prev_idx = sk->sk_idx;
 
 	sk = sk_nulls_next(sk);
 	sk_nulls_for_each_from(sk, node) {
@@ -2658,7 +2658,6 @@  static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
 	void *rc;
 
 	st->bucket = 0;
-	st->offset = 0;
 	rc = listening_get_first(seq);
 
 	while (rc && *pos) {
@@ -2683,7 +2682,7 @@  static void *established_get_first(struct seq_file *seq)
 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct tcp_iter_state *st = seq->private;
 
-	st->offset = 0;
+	st->prev_idx = 0;
 	for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
 		struct sock *sk;
 		struct hlist_nulls_node *node;
@@ -2714,7 +2713,7 @@  static void *established_get_next(struct seq_file *seq, void *cur)
 	struct sock *sk = cur;
 
 	++st->num;
-	++st->offset;
+	st->prev_idx = sk->sk_idx;
 
 	sk = sk_nulls_next(sk);
 
@@ -2763,8 +2762,8 @@  static void *tcp_seek_last_pos(struct seq_file *seq)
 {
 	struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
 	struct tcp_iter_state *st = seq->private;
+	__s64 prev_idx = st->prev_idx;
 	int bucket = st->bucket;
-	int offset = st->offset;
 	int orig_num = st->num;
 	void *rc = NULL;
 
@@ -2773,18 +2772,21 @@  static void *tcp_seek_last_pos(struct seq_file *seq)
 		if (st->bucket > hinfo->lhash2_mask)
 			break;
 		rc = listening_get_first(seq);
-		while (offset-- && rc && bucket == st->bucket)
+		while (rc && bucket == st->bucket && prev_idx &&
+		       ((struct sock *)rc)->sk_idx <= prev_idx)
 			rc = listening_get_next(seq, rc);
 		if (rc)
 			break;
 		st->bucket = 0;
+		prev_idx = 0;
 		st->state = TCP_SEQ_STATE_ESTABLISHED;
 		fallthrough;
 	case TCP_SEQ_STATE_ESTABLISHED:
 		if (st->bucket > hinfo->ehash_mask)
 			break;
 		rc = established_get_first(seq);
-		while (offset-- && rc && bucket == st->bucket)
+		while (rc && bucket == st->bucket && prev_idx &&
+		       ((struct sock *)rc)->sk_idx <= prev_idx)
 			rc = established_get_next(seq, rc);
 	}
 
@@ -2807,7 +2809,7 @@  void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
 	st->state = TCP_SEQ_STATE_LISTENING;
 	st->num = 0;
 	st->bucket = 0;
-	st->offset = 0;
+	st->prev_idx = 0;
 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 
 out:
@@ -2832,7 +2834,7 @@  void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		if (!rc) {
 			st->state = TCP_SEQ_STATE_ESTABLISHED;
 			st->bucket = 0;
-			st->offset = 0;
+			st->prev_idx = 0;
 			rc	  = established_get_first(seq);
 		}
 		break;
@@ -3124,7 +3126,7 @@  static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
 	 * it has to advance to the next bucket.
 	 */
 	if (iter->st_bucket_done) {
-		st->offset = 0;
+		st->prev_idx = 0;
 		st->bucket++;
 		if (st->state == TCP_SEQ_STATE_LISTENING &&
 		    st->bucket > hinfo->lhash2_mask) {
@@ -3192,8 +3194,9 @@  static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		 * the future start() will resume at st->offset in
 		 * st->bucket.  See tcp_seek_last_pos().
 		 */
-		st->offset++;
-		sock_gen_put(iter->batch[iter->cur_sk++]);
+		sk = iter->batch[iter->cur_sk++];
+		st->prev_idx = sk->sk_idx;
+		sock_gen_put(sk);
 	}
 
 	if (iter->cur_sk < iter->end_sk)