diff mbox series

[1/2] bpf: Introduce cpu affinity for sockmap

Message ID 20241101023832.32404-1-mrpre@163.com (mailing list archive)
State New
Delegated to: BPF
Headers show
Series [1/2] bpf: Introduce cpu affinity for sockmap | expand

Checks

Context Check Description
bpf/vmtest-bpf-next-PR success PR summary
bpf/vmtest-bpf-next-VM_Test-0 success Logs for Lint
bpf/vmtest-bpf-next-VM_Test-1 success Logs for ShellCheck
bpf/vmtest-bpf-next-VM_Test-2 success Logs for Unittests
bpf/vmtest-bpf-next-VM_Test-3 success Logs for Validate matrix.py
bpf/vmtest-bpf-next-VM_Test-5 success Logs for aarch64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-4 success Logs for aarch64-gcc / build / build for aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-9 success Logs for aarch64-gcc / test (test_verifier, false, 360) / test_verifier on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-10 success Logs for aarch64-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-11 success Logs for s390x-gcc / build / build for s390x with gcc
bpf/vmtest-bpf-next-VM_Test-12 success Logs for s390x-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-15 success Logs for s390x-gcc / test (test_verifier, false, 360) / test_verifier on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-16 success Logs for s390x-gcc / veristat
bpf/vmtest-bpf-next-VM_Test-17 success Logs for set-matrix
bpf/vmtest-bpf-next-VM_Test-19 success Logs for x86_64-gcc / build-release
bpf/vmtest-bpf-next-VM_Test-18 success Logs for x86_64-gcc / build / build for x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-20 success Logs for x86_64-gcc / test (test_maps, false, 360) / test_maps on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-23 success Logs for x86_64-gcc / test (test_progs_no_alu32_parallel, true, 30) / test_progs_no_alu32_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-24 success Logs for x86_64-gcc / test (test_progs_parallel, true, 30) / test_progs_parallel on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-25 success Logs for x86_64-gcc / test (test_verifier, false, 360) / test_verifier on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-28 success Logs for x86_64-llvm-17 / build-release / build for x86_64 with llvm-17-O2
bpf/vmtest-bpf-next-VM_Test-27 success Logs for x86_64-llvm-17 / build / build for x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-29 success Logs for x86_64-llvm-17 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-33 success Logs for x86_64-llvm-17 / veristat
bpf/vmtest-bpf-next-VM_Test-34 success Logs for x86_64-llvm-18 / build / build for x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-32 success Logs for x86_64-llvm-17 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-36 success Logs for x86_64-llvm-18 / test (test_maps, false, 360) / test_maps on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-35 success Logs for x86_64-llvm-18 / build-release / build for x86_64 with llvm-18-O2
bpf/vmtest-bpf-next-VM_Test-41 success Logs for x86_64-llvm-18 / veristat
bpf/vmtest-bpf-next-VM_Test-40 success Logs for x86_64-llvm-18 / test (test_verifier, false, 360) / test_verifier on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-6 success Logs for aarch64-gcc / test (test_maps, false, 360) / test_maps on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-7 success Logs for aarch64-gcc / test (test_progs, false, 360) / test_progs on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-8 success Logs for aarch64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on aarch64 with gcc
bpf/vmtest-bpf-next-VM_Test-21 success Logs for x86_64-gcc / test (test_progs, false, 360) / test_progs on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-22 success Logs for x86_64-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-26 success Logs for x86_64-gcc / veristat / veristat on x86_64 with gcc
bpf/vmtest-bpf-next-VM_Test-30 success Logs for x86_64-llvm-17 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-31 success Logs for x86_64-llvm-17 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-17
bpf/vmtest-bpf-next-VM_Test-38 success Logs for x86_64-llvm-18 / test (test_progs_cpuv4, false, 360) / test_progs_cpuv4 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-39 success Logs for x86_64-llvm-18 / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-37 success Logs for x86_64-llvm-18 / test (test_progs, false, 360) / test_progs on x86_64 with llvm-18
bpf/vmtest-bpf-next-VM_Test-14 success Logs for s390x-gcc / test (test_progs_no_alu32, false, 360) / test_progs_no_alu32 on s390x with gcc
bpf/vmtest-bpf-next-VM_Test-13 success Logs for s390x-gcc / test (test_progs, false, 360) / test_progs on s390x with gcc
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Guessed tree name to be net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 202 this patch: 202
netdev/build_tools success Errors and warnings before: 2 (+0) this patch: 2 (+0)
netdev/cc_maintainers warning 11 maintainers not CCed: horms@kernel.org daniel@iogearbox.net ast@kernel.org kpsingh@kernel.org sdf@fomichev.me eddyz87@gmail.com martin.lau@linux.dev song@kernel.org haoluo@google.com andrii@kernel.org jolsa@kernel.org
netdev/build_clang success Errors and warnings before: 235 this patch: 235
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 6932 this patch: 6932
netdev/checkpatch warning WARNING: From:/Signed-off-by: email name mismatch: 'From: mrpre <mrpre@163.com>' != 'Signed-off-by: Jiayuan Chen <mrpre@163.com>' WARNING: line length of 102 exceeds 80 columns WARNING: line length of 84 exceeds 80 columns WARNING: line length of 85 exceeds 80 columns WARNING: line length of 86 exceeds 80 columns WARNING: line length of 90 exceeds 80 columns WARNING: line length of 92 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns WARNING: line length of 96 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 6 this patch: 6
netdev/source_inline success Was 0 now: 0

Commit Message

mrpre Nov. 1, 2024, 2:38 a.m. UTC
Why we need cpu affinity:
Mainstream data planes, like Nginx and HAProxy, utilize CPU affinity
by binding user processes to specific CPUs. This avoids interference
between processes and prevents impact from other processes.

Sockmap, as an optimization to accelerate such proxy programs,
currently lacks the ability to specify CPU affinity. The current
implementation of sockmap handling backlog is based on workqueue,
which operates by calling 'schedule_delayed_work()'. It's current
implementation prefers to schedule on the local CPU, i.e., the CPU
that handled the packet under softirq.

For extremely high traffic with large numbers of packets,
'sk_psock_backlog' becomes a large loop.

For multi-threaded programs with only one map, we expect different
sockets to run on different CPUs. It is important to note that this
feature is not a general performance optimization. Instead, it
provides users with the ability to bind to specific CPU, allowing
them to enhance overall operating system utilization based on their
own system environments.

Implementation:
1.When updating the sockmap, support passing a CPU parameter and
save it to the psock.
2.When scheduling psock, determine which CPU to run on using the
psock's CPU information.
3.For thoes sockmap without CPU affinity, keep original logic by using
'schedule_delayed_work()'.

Performance Testing:
'client <-> sockmap proxy <-> server'

Using 'iperf3' tests, with the iperf server bound to CPU5 and the iperf
client bound to CPU6, performance without using CPU affinity is
around 34 Gbits/s, and CPU usage is concentrated on CPU5 and CPU6.
'''
[  5] local 127.0.0.1 port 57144 connected to 127.0.0.1 port 10000
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-1.00   sec  3.95 GBytes  33.9 Gbits/sec
[  5]   1.00-2.00   sec  3.95 GBytes  34.0 Gbits/sec
......
'''

With using CPU affinity, the performnce is close to direct connection
(without any proxy).
'''
[  5] local 127.0.0.1 port 56518 connected to 127.0.0.1 port 10000
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-1.00   sec  7.76 GBytes  66.6 Gbits/sec
[  5]   1.00-2.00   sec  7.76 GBytes  66.7 Gbits/sec
......
'''

Signed-off-by: Jiayuan Chen <mrpre@163.com>
---
 include/linux/bpf.h      |  3 ++-
 include/linux/skmsg.h    |  8 ++++++++
 include/uapi/linux/bpf.h |  4 ++++
 kernel/bpf/syscall.c     | 23 +++++++++++++++++------
 net/core/skmsg.c         | 11 +++++++----
 net/core/sock_map.c      | 12 +++++++-----
 6 files changed, 45 insertions(+), 16 deletions(-)
diff mbox series

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c3ba4d475174..a56028c389e7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -3080,7 +3080,8 @@  int bpf_prog_test_run_syscall(struct bpf_prog *prog,
 
 int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
 int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
-int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
+int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags,
+			     s32 target_cpu);
 int sock_map_bpf_prog_query(const union bpf_attr *attr,
 			    union bpf_attr __user *uattr);
 int sock_map_link_create(const union bpf_attr *attr, struct bpf_prog *prog);
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index d9b03e0746e7..919425a92adf 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -117,6 +117,7 @@  struct sk_psock {
 	struct delayed_work		work;
 	struct sock			*sk_pair;
 	struct rcu_work			rwork;
+	s32				target_cpu;
 };
 
 int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
@@ -514,6 +515,13 @@  static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
 	return !!psock->saved_data_ready;
 }
 
+static inline int sk_psock_strp_get_cpu(struct sk_psock *psock)
+{
+	if (psock->target_cpu != -1)
+		return psock->target_cpu;
+	return WORK_CPU_UNBOUND;
+}
+
 #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
 
 #define BPF_F_STRPARSER	(1UL << 1)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index f28b6527e815..2019a87b5d4a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1509,6 +1509,10 @@  union bpf_attr {
 			__aligned_u64 next_key;
 		};
 		__u64		flags;
+		union {
+			/* specify the CPU where the sockmap job run on */
+			__aligned_u64 target_cpu;
+		};
 	};
 
 	struct { /* struct used by BPF_MAP_*_BATCH commands */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8254b2973157..95f719b9c3f3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -239,10 +239,9 @@  static int bpf_obj_pin_uptrs(struct btf_record *rec, void *obj)
 }
 
 static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
-				void *key, void *value, __u64 flags)
+				void *key, void *value, __u64 flags, s32 target_cpu)
 {
 	int err;
-
 	/* Need to create a kthread, thus must support schedule */
 	if (bpf_map_is_offloaded(map)) {
 		return bpf_map_offload_update_elem(map, key, value, flags);
@@ -252,7 +251,7 @@  static int bpf_map_update_value(struct bpf_map *map, struct file *map_file,
 		return map->ops->map_update_elem(map, key, value, flags);
 	} else if (map->map_type == BPF_MAP_TYPE_SOCKHASH ||
 		   map->map_type == BPF_MAP_TYPE_SOCKMAP) {
-		return sock_map_update_elem_sys(map, key, value, flags);
+		return sock_map_update_elem_sys(map, key, value, flags, target_cpu);
 	} else if (IS_FD_PROG_ARRAY(map)) {
 		return bpf_fd_array_map_update_elem(map, map_file, key, value,
 						    flags);
@@ -1680,12 +1679,14 @@  static int map_lookup_elem(union bpf_attr *attr)
 }
 
 
-#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD target_cpu
 
 static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
 {
 	bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
 	bpfptr_t uvalue = make_bpfptr(attr->value, uattr.is_kernel);
+	bpfptr_t utarget_cpu = make_bpfptr(attr->target_cpu, uattr.is_kernel);
+	s64 target_cpu = 0;
 	struct bpf_map *map;
 	void *key, *value;
 	u32 value_size;
@@ -1723,7 +1724,17 @@  static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
 		goto free_key;
 	}
 
-	err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags);
+	if (map->map_type == BPF_MAP_TYPE_SOCKMAP &&
+	    !bpfptr_is_null(utarget_cpu)) {
+		if (copy_from_bpfptr(&target_cpu, utarget_cpu, sizeof(target_cpu)) ||
+		    target_cpu > nr_cpu_ids) {
+			err = -EINVAL;
+			goto err_put;
+		}
+	} else {
+		target_cpu = -1;
+	}
+	err = bpf_map_update_value(map, fd_file(f), key, value, attr->flags, (s32)target_cpu);
 	if (!err)
 		maybe_wait_bpf_programs(map);
 
@@ -1947,7 +1958,7 @@  int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
 			break;
 
 		err = bpf_map_update_value(map, map_file, key, value,
-					   attr->batch.elem_flags);
+					   attr->batch.elem_flags, -1);
 
 		if (err)
 			break;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index b1dcbd3be89e..d3b6f2468dab 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -679,7 +679,8 @@  static void sk_psock_backlog(struct work_struct *work)
 					 * other work that might be here.
 					 */
 					if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
-						schedule_delayed_work(&psock->work, 1);
+						schedule_delayed_work_on(sk_psock_strp_get_cpu(psock),
+									 &psock->work, 1);
 					goto end;
 				}
 				/* Hard errors break pipe and stop xmit. */
@@ -729,6 +730,7 @@  struct sk_psock *sk_psock_init(struct sock *sk, int node)
 	psock->saved_destroy = prot->destroy;
 	psock->saved_close = prot->close;
 	psock->saved_write_space = sk->sk_write_space;
+	psock->target_cpu = -1;
 
 	INIT_LIST_HEAD(&psock->link);
 	spin_lock_init(&psock->link_lock);
@@ -934,7 +936,7 @@  static int sk_psock_skb_redirect(struct sk_psock *from, struct sk_buff *skb)
 	}
 
 	skb_queue_tail(&psock_other->ingress_skb, skb);
-	schedule_delayed_work(&psock_other->work, 0);
+	schedule_delayed_work_on(sk_psock_strp_get_cpu(psock_other), &psock_other->work, 0);
 	spin_unlock_bh(&psock_other->ingress_lock);
 	return 0;
 }
@@ -1012,7 +1014,8 @@  static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
 			spin_lock_bh(&psock->ingress_lock);
 			if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
 				skb_queue_tail(&psock->ingress_skb, skb);
-				schedule_delayed_work(&psock->work, 0);
+				schedule_delayed_work_on(sk_psock_strp_get_cpu(psock),
+							 &psock->work, 0);
 				err = 0;
 			}
 			spin_unlock_bh(&psock->ingress_lock);
@@ -1044,7 +1047,7 @@  static void sk_psock_write_space(struct sock *sk)
 	psock = sk_psock(sk);
 	if (likely(psock)) {
 		if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
-			schedule_delayed_work(&psock->work, 0);
+			schedule_delayed_work_on(sk_psock_strp_get_cpu(psock), &psock->work, 0);
 		write_space = psock->saved_write_space;
 	}
 	rcu_read_unlock();
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 07d6aa4e39ef..36e9787c60de 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -465,7 +465,7 @@  static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next)
 }
 
 static int sock_map_update_common(struct bpf_map *map, u32 idx,
-				  struct sock *sk, u64 flags)
+				  struct sock *sk, u64 flags, s32 target_cpu)
 {
 	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
 	struct sk_psock_link *link;
@@ -490,6 +490,8 @@  static int sock_map_update_common(struct bpf_map *map, u32 idx,
 	psock = sk_psock(sk);
 	WARN_ON_ONCE(!psock);
 
+	psock->target_cpu = target_cpu;
+
 	spin_lock_bh(&stab->lock);
 	osk = stab->sks[idx];
 	if (osk && flags == BPF_NOEXIST) {
@@ -548,7 +550,7 @@  static int sock_hash_update_common(struct bpf_map *map, void *key,
 				   struct sock *sk, u64 flags);
 
 int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
-			     u64 flags)
+			     u64 flags, s32 target_cpu)
 {
 	struct socket *sock;
 	struct sock *sk;
@@ -579,7 +581,7 @@  int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value,
 	if (!sock_map_sk_state_allowed(sk))
 		ret = -EOPNOTSUPP;
 	else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
-		ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
+		ret = sock_map_update_common(map, *(u32 *)key, sk, flags, target_cpu);
 	else
 		ret = sock_hash_update_common(map, key, sk, flags);
 	sock_map_sk_release(sk);
@@ -605,7 +607,7 @@  static long sock_map_update_elem(struct bpf_map *map, void *key,
 	if (!sock_map_sk_state_allowed(sk))
 		ret = -EOPNOTSUPP;
 	else if (map->map_type == BPF_MAP_TYPE_SOCKMAP)
-		ret = sock_map_update_common(map, *(u32 *)key, sk, flags);
+		ret = sock_map_update_common(map, *(u32 *)key, sk, flags, -1);
 	else
 		ret = sock_hash_update_common(map, key, sk, flags);
 	bh_unlock_sock(sk);
@@ -621,7 +623,7 @@  BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, sops,
 	if (likely(sock_map_sk_is_suitable(sops->sk) &&
 		   sock_map_op_okay(sops)))
 		return sock_map_update_common(map, *(u32 *)key, sops->sk,
-					      flags);
+					      flags, -1);
 	return -EOPNOTSUPP;
 }