diff mbox series

[RFC,net-next] net/smc: Introduce a hook to modify syn_smc at runtime

Message ID 1726654204-61655-1-git-send-email-alibuda@linux.alibaba.com (mailing list archive)
State RFC
Delegated to: Netdev Maintainers
Headers show
Series [RFC,net-next] net/smc: Introduce a hook to modify syn_smc at runtime | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 27 this patch: 27
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 1 maintainers not CCed: dsahern@kernel.org
netdev/build_clang success Errors and warnings before: 49 this patch: 49
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 2086 this patch: 2087
netdev/checkpatch warning CHECK: Alignment should match open parenthesis WARNING: line length of 82 exceeds 80 columns WARNING: line length of 89 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 3 this patch: 3
netdev/source_inline success Was 0 now: 0

Commit Message

D. Wythe Sept. 18, 2024, 10:10 a.m. UTC
From: "D. Wythe" <alibuda@linux.alibaba.com>

The introduction of IPPROTO_SMC enables eBPF programs to determine
whether to use SMC based on the context of socket creation, such as
network namespaces, PID and comm name, etc.

As a subsequent enhancement, this patch introduces a new hook for eBPF
programs that allows decisions on whether to use SMC or not at runtime,
including but not limited to local/remote IP address or ports. In
simpler words, this feature allows modifications to syn_smc through eBPF
programs before the TCP three-way handshake got established.

Thanks to kfunc for making it easier for us to implement this feature in
SMC.

Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
---
 include/linux/tcp.h  |  4 ++-
 net/ipv4/tcp_input.c |  4 +--
 net/smc/af_smc.c     | 69 ++++++++++++++++++++++++++++++++++++++++++++++------
 3 files changed, 66 insertions(+), 11 deletions(-)

Comments

Guangguan Wang Sept. 19, 2024, 12:36 p.m. UTC | #1
On 2024/9/18 18:10, D. Wythe wrote:
> From: "D. Wythe" <alibuda@linux.alibaba.com>
> 
> The introduction of IPPROTO_SMC enables eBPF programs to determine
> whether to use SMC based on the context of socket creation, such as
> network namespaces, PID and comm name, etc.
> 
> As a subsequent enhancement, this patch introduces a new hook for eBPF
> programs that allows decisions on whether to use SMC or not at runtime,
> including but not limited to local/remote IP address or ports. In
> simpler words, this feature allows modifications to syn_smc through eBPF
> programs before the TCP three-way handshake got established.
> 
> Thanks to kfunc for making it easier for us to implement this feature in
> SMC.
> 
> Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>

Hi D. Wythe, 

I think it is a good feature to have for more flexible using of SMC.

It is also a good solution for the problem we met before:
Some services are not correctly handled TCP syn packet with SMC experimental option in head.
The TCP connections to such services can not be successfully established through SMC. Thus, a
program can not using SMC and accessing the services mentioned above in the same time.
With this feature, by filter the port to the services metioned above, it is possible for
programes both using SMC and accessing the services metioned above.

> ---
>  include/linux/tcp.h  |  4 ++-
>  net/ipv4/tcp_input.c |  4 +--
>  net/smc/af_smc.c     | 69 ++++++++++++++++++++++++++++++++++++++++++++++------
>  3 files changed, 66 insertions(+), 11 deletions(-)
> 
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index 6a5e08b..d028d76 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -478,7 +478,9 @@ struct tcp_sock {
>  #endif
>  #if IS_ENABLED(CONFIG_SMC)
>  	bool	syn_smc;	/* SYN includes SMC */
> -	bool	(*smc_hs_congested)(const struct sock *sk);
> +	void	(*smc_openreq_init)(struct request_sock *req,
> +			     const struct tcp_options_received *rx_opt,
> +			     struct sk_buff *skb, const struct sock *sk);
>  #endif
>  
>  #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index e37488d..e33e2a0 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -7029,8 +7029,8 @@ static void tcp_openreq_init(struct request_sock *req,
>  	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
>  	ireq->ir_mark = inet_request_mark(sk, skb);
>  #if IS_ENABLED(CONFIG_SMC)
> -	ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested &&
> -			tcp_sk(sk)->smc_hs_congested(sk));
> +	if (ireq->smc_ok && tcp_sk(sk)->smc_openreq_init)Should be rx_opt->smc_ok?

> +		tcp_sk(sk)->smc_openreq_init(req, rx_opt, skb, sk);
>  #endif
>  }
>  
> diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
> index 0316217..003b2ac 100644
> --- a/net/smc/af_smc.c
> +++ b/net/smc/af_smc.c
> @@ -70,6 +70,15 @@
>  static void smc_tcp_listen_work(struct work_struct *);
>  static void smc_connect_work(struct work_struct *);
>  
> +__bpf_hook_start();
> +
> +__weak noinline int select_syn_smc(const struct sock *sk, struct sockaddr *peer)
> +{
> +	return 1;
> +}
> +
> +__bpf_hook_end();
> +
>  int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb)
>  {
>  	struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
> @@ -156,19 +165,41 @@ static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk,
>  	return NULL;
>  }
>  
> -static bool smc_hs_congested(const struct sock *sk)
> +static void smc_openreq_init(struct request_sock *req,
> +			     const struct tcp_options_received *rx_opt,
> +			     struct sk_buff *skb, const struct sock *sk)
>  {
> +	struct inet_request_sock *ireq = inet_rsk(req);
> +	struct sockaddr_storage rmt_sockaddr = {0};
>  	const struct smc_sock *smc;
>  
>  	smc = smc_clcsock_user_data(sk);
>  
>  	if (!smc)
> -		return true;
> +		return;
It is better goto out_no_smc rather than return to explicitly set ireq->smc_ok to 0.

>  
> -	if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq))
> -		return true;
> +	if (smc->limit_smc_hs && workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq))
> +		goto out_no_smc;
>  
> -	return false;
> +	rmt_sockaddr.ss_family = sk->sk_family;
> +
> +	if (rmt_sockaddr.ss_family == AF_INET) {
> +		struct sockaddr_in *rmt4_sockaddr =  (struct sockaddr_in *)&rmt_sockaddr;
> +
> +		rmt4_sockaddr->sin_addr.s_addr = ireq->ir_rmt_addr;
> +		rmt4_sockaddr->sin_port	= ireq->ir_rmt_port;
> +	} else {
> +		struct sockaddr_in6 *rmt6_sockaddr =  (struct sockaddr_in6 *)&rmt_sockaddr;
> +
> +		rmt6_sockaddr->sin6_addr = ireq->ir_v6_rmt_addr;
> +		rmt6_sockaddr->sin6_port = ireq->ir_rmt_port;
> +	}
> +
> +	ireq->smc_ok = select_syn_smc(sk, (struct sockaddr *)&rmt_sockaddr);
> +	return;
> +out_no_smc:
> +	ireq->smc_ok = 0;
> +	return;
>  }
>  
>  struct smc_hashinfo smc_v4_hashinfo = {
> @@ -1671,7 +1702,7 @@ int smc_connect(struct socket *sock, struct sockaddr *addr,
>  	}
>  
>  	smc_copy_sock_settings_to_clc(smc);
> -	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
> +	tcp_sk(smc->clcsock->sk)->syn_smc = select_syn_smc(sk, addr);
>  	if (smc->connect_nonblock) {
>  		rc = -EALREADY;
>  		goto out;
> @@ -2650,8 +2681,7 @@ int smc_listen(struct socket *sock, int backlog)
>  
>  	inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops;
>  
> -	if (smc->limit_smc_hs)
> -		tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;
> +	tcp_sk(smc->clcsock->sk)->smc_openreq_init = smc_openreq_init;
>  
>  	rc = kernel_listen(smc->clcsock, backlog);
>  	if (rc) {
> @@ -3475,6 +3505,20 @@ static void __net_exit smc_net_stat_exit(struct net *net)
>  	.exit = smc_net_stat_exit,
>  };
>  
> +BTF_SET8_START(bpf_smc_fmodret_ids)
> +BTF_ID_FLAGS(func, select_syn_smc)
> +BTF_SET8_END(bpf_smc_fmodret_ids)
> +
> +static const struct btf_kfunc_id_set bpf_smc_fmodret_set = {
> +	.owner = THIS_MODULE,
> +	.set   = &bpf_smc_fmodret_ids,
> +};
> +
> +static int __init bpf_smc_kfunc_init(void)
> +{
> +	return register_btf_fmodret_id_set(&bpf_smc_fmodret_set);
> +}
Does it have unregister function? Is it OK for repeate register when reload the smc module?

Thanks,
Guangguan Wang
> +
>  static int __init smc_init(void)
>  {
>  	int rc;
> @@ -3574,8 +3618,17 @@ static int __init smc_init(void)
>  		pr_err("%s: smc_inet_init fails with %d\n", __func__, rc);
>  		goto out_ulp;
>  	}
> +
> +	rc = bpf_smc_kfunc_init();
> +	if (rc) {
> +		pr_err("%s: bpf_smc_kfunc_init fails with %d\n", __func__, rc);
> +		goto out_inet;
> +	}
> +
>  	static_branch_enable(&tcp_have_smc);
>  	return 0;
> +out_inet:
> +	smc_inet_exit();
>  out_ulp:
>  	tcp_unregister_ulp(&smc_ulp_ops);
>  out_lo:
diff mbox series

Patch

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 6a5e08b..d028d76 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -478,7 +478,9 @@  struct tcp_sock {
 #endif
 #if IS_ENABLED(CONFIG_SMC)
 	bool	syn_smc;	/* SYN includes SMC */
-	bool	(*smc_hs_congested)(const struct sock *sk);
+	void	(*smc_openreq_init)(struct request_sock *req,
+			     const struct tcp_options_received *rx_opt,
+			     struct sk_buff *skb, const struct sock *sk);
 #endif
 
 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e37488d..e33e2a0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -7029,8 +7029,8 @@  static void tcp_openreq_init(struct request_sock *req,
 	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
 	ireq->ir_mark = inet_request_mark(sk, skb);
 #if IS_ENABLED(CONFIG_SMC)
-	ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested &&
-			tcp_sk(sk)->smc_hs_congested(sk));
+	if (ireq->smc_ok && tcp_sk(sk)->smc_openreq_init)
+		tcp_sk(sk)->smc_openreq_init(req, rx_opt, skb, sk);
 #endif
 }
 
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 0316217..003b2ac 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -70,6 +70,15 @@ 
 static void smc_tcp_listen_work(struct work_struct *);
 static void smc_connect_work(struct work_struct *);
 
+__bpf_hook_start();
+
+__weak noinline int select_syn_smc(const struct sock *sk, struct sockaddr *peer)
+{
+	return 1;
+}
+
+__bpf_hook_end();
+
 int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
@@ -156,19 +165,41 @@  static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk,
 	return NULL;
 }
 
-static bool smc_hs_congested(const struct sock *sk)
+static void smc_openreq_init(struct request_sock *req,
+			     const struct tcp_options_received *rx_opt,
+			     struct sk_buff *skb, const struct sock *sk)
 {
+	struct inet_request_sock *ireq = inet_rsk(req);
+	struct sockaddr_storage rmt_sockaddr = {0};
 	const struct smc_sock *smc;
 
 	smc = smc_clcsock_user_data(sk);
 
 	if (!smc)
-		return true;
+		return;
 
-	if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq))
-		return true;
+	if (smc->limit_smc_hs && workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq))
+		goto out_no_smc;
 
-	return false;
+	rmt_sockaddr.ss_family = sk->sk_family;
+
+	if (rmt_sockaddr.ss_family == AF_INET) {
+		struct sockaddr_in *rmt4_sockaddr =  (struct sockaddr_in *)&rmt_sockaddr;
+
+		rmt4_sockaddr->sin_addr.s_addr = ireq->ir_rmt_addr;
+		rmt4_sockaddr->sin_port	= ireq->ir_rmt_port;
+	} else {
+		struct sockaddr_in6 *rmt6_sockaddr =  (struct sockaddr_in6 *)&rmt_sockaddr;
+
+		rmt6_sockaddr->sin6_addr = ireq->ir_v6_rmt_addr;
+		rmt6_sockaddr->sin6_port = ireq->ir_rmt_port;
+	}
+
+	ireq->smc_ok = select_syn_smc(sk, (struct sockaddr *)&rmt_sockaddr);
+	return;
+out_no_smc:
+	ireq->smc_ok = 0;
+	return;
 }
 
 struct smc_hashinfo smc_v4_hashinfo = {
@@ -1671,7 +1702,7 @@  int smc_connect(struct socket *sock, struct sockaddr *addr,
 	}
 
 	smc_copy_sock_settings_to_clc(smc);
-	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
+	tcp_sk(smc->clcsock->sk)->syn_smc = select_syn_smc(sk, addr);
 	if (smc->connect_nonblock) {
 		rc = -EALREADY;
 		goto out;
@@ -2650,8 +2681,7 @@  int smc_listen(struct socket *sock, int backlog)
 
 	inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops;
 
-	if (smc->limit_smc_hs)
-		tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;
+	tcp_sk(smc->clcsock->sk)->smc_openreq_init = smc_openreq_init;
 
 	rc = kernel_listen(smc->clcsock, backlog);
 	if (rc) {
@@ -3475,6 +3505,20 @@  static void __net_exit smc_net_stat_exit(struct net *net)
 	.exit = smc_net_stat_exit,
 };
 
+BTF_SET8_START(bpf_smc_fmodret_ids)
+BTF_ID_FLAGS(func, select_syn_smc)
+BTF_SET8_END(bpf_smc_fmodret_ids)
+
+static const struct btf_kfunc_id_set bpf_smc_fmodret_set = {
+	.owner = THIS_MODULE,
+	.set   = &bpf_smc_fmodret_ids,
+};
+
+static int __init bpf_smc_kfunc_init(void)
+{
+	return register_btf_fmodret_id_set(&bpf_smc_fmodret_set);
+}
+
 static int __init smc_init(void)
 {
 	int rc;
@@ -3574,8 +3618,17 @@  static int __init smc_init(void)
 		pr_err("%s: smc_inet_init fails with %d\n", __func__, rc);
 		goto out_ulp;
 	}
+
+	rc = bpf_smc_kfunc_init();
+	if (rc) {
+		pr_err("%s: bpf_smc_kfunc_init fails with %d\n", __func__, rc);
+		goto out_inet;
+	}
+
 	static_branch_enable(&tcp_have_smc);
 	return 0;
+out_inet:
+	smc_inet_exit();
 out_ulp:
 	tcp_unregister_ulp(&smc_ulp_ops);
 out_lo: