diff mbox series

[v2,bpf-next,12/13] bpf: Call bpf_run_sk_reuseport() for socket migration.

Message ID 20201207132456.65472-13-kuniyu@amazon.co.jp (mailing list archive)
State Changes Requested
Delegated to: BPF
Headers show
Series Socket migration for SO_REUSEPORT. | expand

Checks

Context Check Description
netdev/cover_letter success Link
netdev/fixes_present success Link
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for bpf-next
netdev/subject_prefix success Link
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Link
netdev/module_param success Was 0 now: 0
netdev/build_32bit success Errors and warnings before: 45 this patch: 45
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/verify_fixes success Link
netdev/checkpatch warning WARNING: ENOTSUPP is not a SUSV4 error code, prefer EOPNOTSUPP WARNING: line length of 83 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 99 exceeds 80 columns
netdev/build_allmodconfig_warn success Errors and warnings before: 45 this patch: 45
netdev/header_inline success Link
netdev/stable success Stable not CCed

Commit Message

Iwashima, Kuniyuki Dec. 7, 2020, 1:24 p.m. UTC
This patch supports socket migration by eBPF. If the attached type is
BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, we can select a new listener by
BPF_FUNC_sk_select_reuseport(). Also, we can cancel migration by returning
SK_DROP. This feature is useful when listeners have different settings at
the socket API level or when we want to free resources as soon as possible.

There are two noteworthy points. The first is that we select a listening
socket in reuseport_detach_sock() and __reuseport_select_sock(), but we do
not have struct skb at closing a listener or retransmitting a SYN+ACK.
However, some helper functions do not expect skb is NULL (e.g.
skb_header_pointer() in BPF_FUNC_skb_load_bytes(), skb_tail_pointer() in
BPF_FUNC_skb_load_bytes_relative()). So we allocate an empty skb
temporarily before running the eBPF program. The second is that we do not
have struct request_sock in unhash path, and the sk_hash of the listener is
always zero. So we pass zero as hash to bpf_run_sk_reuseport().

Reviewed-by: Benjamin Herrenschmidt <benh@amazon.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
---
 net/core/filter.c          | 19 +++++++++++++++++++
 net/core/sock_reuseport.c  | 21 +++++++++++----------
 net/ipv4/inet_hashtables.c |  2 +-
 3 files changed, 31 insertions(+), 11 deletions(-)
diff mbox series

Patch

diff --git a/net/core/filter.c b/net/core/filter.c
index 9f7018e3f545..53fa3bcbf00f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -9890,10 +9890,29 @@  struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk,
 {
 	struct sk_reuseport_kern reuse_kern;
 	enum sk_action action;
+	bool allocated = false;
+
+	if (migration) {
+		/* cancel migration for possibly incapable eBPF program */
+		if (prog->expected_attach_type != BPF_SK_REUSEPORT_SELECT_OR_MIGRATE)
+			return ERR_PTR(-ENOTSUPP);
+
+		if (!skb) {
+			allocated = true;
+			skb = alloc_skb(0, GFP_ATOMIC);
+			if (!skb)
+				return ERR_PTR(-ENOMEM);
+		}
+	} else if (!skb) {
+		return NULL; /* fall back to select by hash */
+	}
 
 	bpf_init_reuseport_kern(&reuse_kern, reuse, sk, skb, hash, migration);
 	action = BPF_PROG_RUN(prog, &reuse_kern);
 
+	if (allocated)
+		kfree_skb(skb);
+
 	if (action == SK_PASS)
 		return reuse_kern.selected_sk;
 	else
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index b877c8e552d2..2358e8896199 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -221,8 +221,15 @@  struct sock *reuseport_detach_sock(struct sock *sk)
 						 lockdep_is_held(&reuseport_lock));
 
 		if (sk->sk_protocol == IPPROTO_TCP) {
-			if (reuse->num_socks && !prog)
-				nsk = i == reuse->num_socks ? reuse->socks[i - 1] : reuse->socks[i];
+			if (reuse->num_socks) {
+				if (prog)
+					nsk = bpf_run_sk_reuseport(reuse, sk, prog, NULL, 0,
+								   BPF_SK_REUSEPORT_MIGRATE_QUEUE);
+
+				if (!nsk)
+					nsk = i == reuse->num_socks ?
+						reuse->socks[i - 1] : reuse->socks[i];
+			}
 
 			reuse->num_closed_socks++;
 		} else {
@@ -306,15 +313,9 @@  static struct sock *__reuseport_select_sock(struct sock *sk, u32 hash,
 		if (!prog)
 			goto select_by_hash;
 
-		if (migration)
-			goto out;
-
-		if (!skb)
-			goto select_by_hash;
-
 		if (prog->type == BPF_PROG_TYPE_SK_REUSEPORT)
 			sk2 = bpf_run_sk_reuseport(reuse, sk, prog, skb, hash, migration);
-		else
+		else if (!skb)
 			sk2 = run_bpf_filter(reuse, socks, prog, skb, hdr_len);
 
 select_by_hash:
@@ -352,7 +353,7 @@  struct sock *reuseport_select_migrated_sock(struct sock *sk, u32 hash,
 	struct sock *nsk;
 
 	nsk = __reuseport_select_sock(sk, hash, skb, 0, BPF_SK_REUSEPORT_MIGRATE_REQUEST);
-	if (nsk && likely(refcount_inc_not_zero(&nsk->sk_refcnt)))
+	if (!IS_ERR_OR_NULL(nsk) && likely(refcount_inc_not_zero(&nsk->sk_refcnt)))
 		return nsk;
 
 	return NULL;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 545538a6bfac..59f58740c20d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -699,7 +699,7 @@  void inet_unhash(struct sock *sk)
 
 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
 		nsk = reuseport_detach_sock(sk);
-		if (nsk)
+		if (!IS_ERR_OR_NULL(nsk))
 			inet_csk_reqsk_queue_migrate(sk, nsk);
 	}