diff mbox series

[v2,bpf-next,5/8] tcp: seq_file: Replace listening_hash with lhash2

Message ID 20210701200606.1035783-1-kafai@fb.com (mailing list archive)
State Accepted
Delegated to: BPF
Headers show
Series bpf: Allow bpf tcp iter to do bpf_(get|set)sockopt | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 8 maintainers not CCed: dsahern@kernel.org kpsingh@kernel.org yoshfuji@linux-ipv6.org andrii@kernel.org john.fastabend@gmail.com songliubraving@fb.com davem@davemloft.net kuba@kernel.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 1092 this patch: 1092
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 87 lines checked
netdev/build_allmodconfig_warn success Errors and warnings before: 1095 this patch: 1095
netdev/header_inline success Link

Commit Message

Martin KaFai Lau July 1, 2021, 8:06 p.m. UTC
This patch moves the tcp seq_file iteration on listeners
from the port only listening_hash to the port+addr lhash2.

When iterating from the bpf iter, the next patch will need to
lock the socket such that the bpf iter can call setsockopt (e.g. to
change the TCP_CONGESTION).  To avoid locking the bucket and then locking
the sock, the bpf iter will first batch some sockets from the same bucket
and then unlock the bucket.  If the bucket size is small (which
usually is), it is easier to batch the whole bucket such that it is less
likely to miss a setsockopt on a socket due to changes in the bucket.

However, the port only listening_hash could have many listeners
hashed to a bucket (e.g. many individual VIP(s):443 and also
multiple by the number of SO_REUSEPORT).  We have seen bucket size in
tens of thousands range.  Also, the chance of having changes
in some popular port buckets (e.g. 443) is also high.

The port+addr lhash2 was introduced to solve this large listener bucket
issue.  Also, the listening_hash usage has already been replaced with
lhash2 in the fast path inet[6]_lookup_listener().  This patch follows
the same direction on moving to lhash2 and iterates the lhash2
instead of listening_hash.

Acked-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 include/net/inet_hashtables.h |  6 ++++++
 net/ipv4/tcp_ipv4.c           | 35 ++++++++++++++++++-----------------
 2 files changed, 24 insertions(+), 17 deletions(-)
diff mbox series

Patch

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index ca6a3ea9057e..f72ec113ae56 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -160,6 +160,12 @@  struct inet_hashinfo {
 					____cacheline_aligned_in_smp;
 };
 
+#define inet_lhash2_for_each_icsk_continue(__icsk) \
+	hlist_for_each_entry_continue(__icsk, icsk_listen_portaddr_node)
+
+#define inet_lhash2_for_each_icsk(__icsk, list) \
+	hlist_for_each_entry(__icsk, list, icsk_listen_portaddr_node)
+
 #define inet_lhash2_for_each_icsk_rcu(__icsk, list) \
 	hlist_for_each_entry_rcu(__icsk, list, icsk_listen_portaddr_node)
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index fc2c2ecd10e1..6b3c26be539b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2296,21 +2296,22 @@  static void *listening_get_first(struct seq_file *seq)
 	struct tcp_iter_state *st = seq->private;
 
 	st->offset = 0;
-	for (; st->bucket < INET_LHTABLE_SIZE; st->bucket++) {
-		struct inet_listen_hashbucket *ilb;
-		struct hlist_nulls_node *node;
+	for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
+		struct inet_listen_hashbucket *ilb2;
+		struct inet_connection_sock *icsk;
 		struct sock *sk;
 
-		ilb = &tcp_hashinfo.listening_hash[st->bucket];
-		if (hlist_nulls_empty(&ilb->nulls_head))
+		ilb2 = &tcp_hashinfo.lhash2[st->bucket];
+		if (hlist_empty(&ilb2->head))
 			continue;
 
-		spin_lock(&ilb->lock);
-		sk_nulls_for_each(sk, node, &ilb->nulls_head) {
+		spin_lock(&ilb2->lock);
+		inet_lhash2_for_each_icsk(icsk, &ilb2->head) {
+			sk = (struct sock *)icsk;
 			if (seq_sk_match(seq, sk))
 				return sk;
 		}
-		spin_unlock(&ilb->lock);
+		spin_unlock(&ilb2->lock);
 	}
 
 	return NULL;
@@ -2324,22 +2325,22 @@  static void *listening_get_first(struct seq_file *seq)
 static void *listening_get_next(struct seq_file *seq, void *cur)
 {
 	struct tcp_iter_state *st = seq->private;
-	struct inet_listen_hashbucket *ilb;
-	struct hlist_nulls_node *node;
+	struct inet_listen_hashbucket *ilb2;
+	struct inet_connection_sock *icsk;
 	struct sock *sk = cur;
 
 	++st->num;
 	++st->offset;
 
-	sk = sk_nulls_next(sk);
-
-	sk_nulls_for_each_from(sk, node) {
+	icsk = inet_csk(sk);
+	inet_lhash2_for_each_icsk_continue(icsk) {
+		sk = (struct sock *)icsk;
 		if (seq_sk_match(seq, sk))
 			return sk;
 	}
 
-	ilb = &tcp_hashinfo.listening_hash[st->bucket];
-	spin_unlock(&ilb->lock);
+	ilb2 = &tcp_hashinfo.lhash2[st->bucket];
+	spin_unlock(&ilb2->lock);
 	++st->bucket;
 	return listening_get_first(seq);
 }
@@ -2456,7 +2457,7 @@  static void *tcp_seek_last_pos(struct seq_file *seq)
 
 	switch (st->state) {
 	case TCP_SEQ_STATE_LISTENING:
-		if (st->bucket >= INET_LHTABLE_SIZE)
+		if (st->bucket > tcp_hashinfo.lhash2_mask)
 			break;
 		st->state = TCP_SEQ_STATE_LISTENING;
 		rc = listening_get_first(seq);
@@ -2541,7 +2542,7 @@  void tcp_seq_stop(struct seq_file *seq, void *v)
 	switch (st->state) {
 	case TCP_SEQ_STATE_LISTENING:
 		if (v != SEQ_START_TOKEN)
-			spin_unlock(&tcp_hashinfo.listening_hash[st->bucket].lock);
+			spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
 		break;
 	case TCP_SEQ_STATE_ESTABLISHED:
 		if (v)