diff mbox series

[net-next,3/4] net/smc: Introduce smc_bpf_ops

Message ID 1729737768-124596-4-git-send-email-alibuda@linux.alibaba.com (mailing list archive)
State Not Applicable
Headers show
Series net/smc: Introduce smc_bpf_ops | expand

Commit Message

D. Wythe Oct. 24, 2024, 2:42 a.m. UTC
From: "D. Wythe" <alibuda@linux.alibaba.com>

The introduction of IPPROTO_SMC enables eBPF programs to determine
whether to use SMC based on the context of socket creation, such as
network namespaces, PID and comm name, etc.

As a subsequent enhancement, this patch introduces a new hook for eBPF
programs that allows deciding at runtime whether to use SMC, based on
criteria including but not limited to local/remote IP addresses or ports.
In simpler words, this feature allows eBPF programs to modify syn_smc
before the TCP three-way handshake is established.

Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
---
 include/linux/tcp.h   |   2 +-
 include/net/smc.h     |  47 +++++++++++
 include/net/tcp.h     |   6 ++
 net/ipv4/tcp_input.c  |   3 +-
 net/ipv4/tcp_output.c |  14 +++-
 net/smc/Kconfig       |  12 +++
 net/smc/Makefile      |   1 +
 net/smc/af_smc.c      |  38 ++++++---
 net/smc/smc.h         |   4 +
 net/smc/smc_bpf.c     | 212 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/smc/smc_bpf.h     |  34 ++++++++
 11 files changed, 357 insertions(+), 16 deletions(-)
 create mode 100644 net/smc/smc_bpf.c
 create mode 100644 net/smc/smc_bpf.h

Comments

Martin KaFai Lau Oct. 25, 2024, 12:26 a.m. UTC | #1
On 10/23/24 7:42 PM, D. Wythe wrote:
> From: "D. Wythe" <alibuda@linux.alibaba.com>
> 
> The introduction of IPPROTO_SMC enables eBPF programs to determine
> whether to use SMC based on the context of socket creation, such as
> network namespaces, PID and comm name, etc.
> 
> As a subsequent enhancement, this patch introduces a new hook for eBPF
> programs that allows decisions on whether to use SMC or not at runtime,
> including but not limited to local/remote IP address or ports. In
> simpler words, this feature allows modifications to syn_smc through eBPF
> programs before the TCP three-way handshake got established.
> 
> Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
> ---
>   include/linux/tcp.h   |   2 +-
>   include/net/smc.h     |  47 +++++++++++
>   include/net/tcp.h     |   6 ++
>   net/ipv4/tcp_input.c  |   3 +-
>   net/ipv4/tcp_output.c |  14 +++-
>   net/smc/Kconfig       |  12 +++
>   net/smc/Makefile      |   1 +
>   net/smc/af_smc.c      |  38 ++++++---
>   net/smc/smc.h         |   4 +
>   net/smc/smc_bpf.c     | 212 ++++++++++++++++++++++++++++++++++++++++++++++++++
>   net/smc/smc_bpf.h     |  34 ++++++++
>   11 files changed, 357 insertions(+), 16 deletions(-)
>   create mode 100644 net/smc/smc_bpf.c
>   create mode 100644 net/smc/smc_bpf.h
> 
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index 6a5e08b..4ef160a 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -478,7 +478,7 @@ struct tcp_sock {
>   #endif
>   #if IS_ENABLED(CONFIG_SMC)
>   	bool	syn_smc;	/* SYN includes SMC */
> -	bool	(*smc_hs_congested)(const struct sock *sk);
> +	struct tcpsmc_ctx *smc;
>   #endif
>   
>   #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
> diff --git a/include/net/smc.h b/include/net/smc.h
> index db84e4e..34ab2c6 100644
> --- a/include/net/smc.h
> +++ b/include/net/smc.h
> @@ -18,6 +18,8 @@
>   #include "linux/ism.h"
>   
>   struct sock;
> +struct tcp_sock;
> +struct inet_request_sock;
>   
>   #define SMC_MAX_PNETID_LEN	16	/* Max. length of PNET id */
>   
> @@ -97,4 +99,49 @@ struct smcd_dev {
>   	u8 going_away : 1;
>   };
>   
> +/*
> + * This structure is used to store the parameters passed to the member of struct_ops.
> + * Due to the BPF verifier cannot restrict the writing of bit fields, such as limiting
> + * it to only write ireq->smc_ok. Using kfunc can solve this issue, but we don't want
> + * to introduce a kfunc with such a narrow function.

imo, adding kfunc is fine.

> + *
> + * Moreover, using this structure for unified parameters also addresses another
> + * potential issue. Currently, kfunc cannot recognize the calling context
> + * through BPF's existing structure. In the future, we can solve this problem
> + * by passing this ctx to kfunc.

This part I don't understand. How is it different from the "tcp_cubic_kfunc_set" 
allowed in tcp_congestion_ops?

> + */
> +struct smc_bpf_ops_ctx {
> +	struct {
> +		struct tcp_sock *tp;
> +	} set_option;
> +	struct {
> +		const struct tcp_sock *tp;
> +		struct inet_request_sock *ireq;
> +		int smc_ok;
> +	} set_option_cond;
> +};

There is no need to create one single ctx for struct_ops prog. struct_ops prog 
can take >1 args and different ops can take different args.

> +
> +struct smc_bpf_ops {
> +	/* priavte */
> +
> +	struct list_head	list;
> +
> +	/* public */
> +
> +	/* Invoked before computing SMC option for SYN packets.
> +	 * We can control whether to set SMC options by modifying
> +	 * ctx->set_option->tp->syn_smc.
> +	 * This's also the only member that can be modified now.
> +	 * Only member in ctx->set_option is valid for this callback.
> +	 */
> +	void (*set_option)(struct smc_bpf_ops_ctx *ctx);
> +
> +	/* Invoked before Set up SMC options for SYN-ACK packets
> +	 * We can control whether to respond SMC options by modifying
> +	 * ctx->set_option_cond.smc_ok.
> +	 * Only member in ctx->set_option_cond is valid for this callback.
> +	 */
> +	void (*set_option_cond)(struct smc_bpf_ops_ctx *ctx);

The struct smc_bpf_ops already has set_option and set_option_cond, but...

> +};
> +
>   #endif	/* _SMC_H */
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 739a9fb..c322443 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -2730,6 +2730,12 @@ static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt)
>   
>   #if IS_ENABLED(CONFIG_SMC)
>   extern struct static_key_false tcp_have_smc;
> +struct tcpsmc_ctx {
> +	/* Invoked before computing SMC option for SYN packets. */
> +	void (*set_option)(struct tcp_sock *tp);
> +	/* Invoked before Set up SMC options for SYN-ACK packets */
> +	void (*set_option_cond)(const struct tcp_sock *tp, struct inet_request_sock *ireq);
> +};

Another new struct tcpsmc_ctx has exactly the same functions (at least the same
names) but different arguments. I don't understand why this is duplicated. Is it
because of the need to prepare the "struct smc_bpf_ops_ctx"?

The "struct tcpsmc_ctx" should be the "struct smc_bpf_ops" itself.

[ ... ]

> +static int smc_bpf_ops_btf_struct_access(struct bpf_verifier_log *log,
> +					 const struct bpf_reg_state *reg,
> +					 const struct bpf_prog *prog,
> +					 int off, int size)
> +{
> +	const struct btf_member *member;
> +	const char *mname;
> +	int member_idx;
> +
> +	member_idx = prog->expected_attach_type;
> +	if (member_idx >= btf_type_vlen(smc_bpf_ops_type))
> +		goto out_err;
> +
> +	member = &btf_type_member(smc_bpf_ops_type)[member_idx];
> +	mname = btf_str_by_offset(saved_btf, member->name_off);
> +
> +	if (!strcmp(mname, "set_option")) {

btf_member_bit_offset can be used instead of strcmp. Take a look at bpf_tcp_ca.c 
and kernel/sched/ext.c

> +		/* only support to modify tcp_sock->syn_smc */
> +		if (reg->btf_id == tcp_sock_id &&
> +		    off == offsetof(struct tcp_sock, syn_smc) &&
> +		    off + size == offsetofend(struct tcp_sock, syn_smc))
> +			return 0;
> +	} else if (!strcmp(mname, "set_option_cond")) {
> +		/* only support to modify smc_bpf_ops_ctx->smc_ok */
> +		if (reg->btf_id == smc_bpf_ops_ctx_id &&
> +		    off == offsetof(struct smc_bpf_ops_ctx, set_option_cond.smc_ok) &&
> +		    off + size == offsetofend(struct smc_bpf_ops_ctx, set_option_cond.smc_ok))
> +			return 0;
> +	}
> +
> +out_err:
> +	return -EACCES;
> +}
> +
> +static const struct bpf_verifier_ops smc_bpf_verifier_ops = {
> +	.get_func_proto = bpf_base_func_proto,
> +	.is_valid_access = bpf_tracing_btf_ctx_access,
> +	.btf_struct_access = smc_bpf_ops_btf_struct_access,
> +};
> +
> +static struct bpf_struct_ops bpf_smc_bpf_ops = {
> +	.init = smc_bpf_ops_init,
> +	.name = "smc_bpf_ops",
> +	.reg = smc_bpf_ops_reg,
> +	.unreg = smc_bpf_ops_unreg,
> +	.cfi_stubs = &__bpf_smc_bpf_ops,
> +	.verifier_ops = &smc_bpf_verifier_ops,
> +	.init_member = smc_bpf_ops_init_member,
> +	.check_member = smc_bpf_ops_check_member,
> +	.owner = THIS_MODULE,
> +};
> +
> +int smc_bpf_struct_ops_init(void)
> +{
> +	return register_bpf_struct_ops(&bpf_smc_bpf_ops, smc_bpf_ops);
> +}
> +
> +void bpf_smc_set_tcp_option(struct tcp_sock *tp)
> +{
> +	struct smc_bpf_ops_ctx ops_ctx = {};
> +	struct smc_bpf_ops *ops;
> +
> +	ops_ctx.set_option.tp = tp;

All this initialization should be unnecessary. Directly pass tp instead.

> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(ops, &smc_bpf_ops_list, list) {

Does it need to have a list (meaning >1) of smc_bpf_ops to act on a sock? The 
ordering expectation is hard to manage.

> +		ops->set_option(&ops_ctx);

A dumb question. This will only affect AF_SMC (or AF_INET[6]/IPPROTO_SMC) 
socket but not the AF_INET[6]/IPPROTO_{TCP,UDP} socket?

pw-bot: cr

> +	}
> +	rcu_read_unlock();
> +}
D. Wythe Oct. 25, 2024, 11:05 a.m. UTC | #2
On 10/25/24 8:26 AM, Martin KaFai Lau wrote:
> On 10/23/24 7:42 PM, D. Wythe wrote:
>> From: "D. Wythe" <alibuda@linux.alibaba.com>
>>
>> The introduction of IPPROTO_SMC enables eBPF programs to determine
>> whether to use SMC based on the context of socket creation, such as
>> network namespaces, PID and comm name, etc.
>>
>> As a subsequent enhancement, this patch introduces a new hook for eBPF
>> programs that allows decisions on whether to use SMC or not at runtime,
>> including but not limited to local/remote IP address or ports. In
>> simpler words, this feature allows modifications to syn_smc through eBPF
>> programs before the TCP three-way handshake got established.
>>
>> Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
>> ---
>>   include/linux/tcp.h   |   2 +-
>>   include/net/smc.h     |  47 +++++++++++
>>   include/net/tcp.h     |   6 ++
>>   net/ipv4/tcp_input.c  |   3 +-
>>   net/ipv4/tcp_output.c |  14 +++-
>>   net/smc/Kconfig       |  12 +++
>>   net/smc/Makefile      |   1 +
>>   net/smc/af_smc.c      |  38 ++++++---
>>   net/smc/smc.h         |   4 +
>>   net/smc/smc_bpf.c     | 212 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>   net/smc/smc_bpf.h     |  34 ++++++++
>>   11 files changed, 357 insertions(+), 16 deletions(-)
>>   create mode 100644 net/smc/smc_bpf.c
>>   create mode 100644 net/smc/smc_bpf.h
>>
>> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
>> index 6a5e08b..4ef160a 100644
>> --- a/include/linux/tcp.h
>> +++ b/include/linux/tcp.h
>> @@ -478,7 +478,7 @@ struct tcp_sock {
>>   #endif
>>   #if IS_ENABLED(CONFIG_SMC)
>>       bool    syn_smc;    /* SYN includes SMC */
>> -    bool    (*smc_hs_congested)(const struct sock *sk);
>> +    struct tcpsmc_ctx *smc;
>>   #endif
>>   #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
>> diff --git a/include/net/smc.h b/include/net/smc.h
>> index db84e4e..34ab2c6 100644
>> --- a/include/net/smc.h
>> +++ b/include/net/smc.h
>> @@ -18,6 +18,8 @@
>>   #include "linux/ism.h"
>>   struct sock;
>> +struct tcp_sock;
>> +struct inet_request_sock;
>>   #define SMC_MAX_PNETID_LEN    16    /* Max. length of PNET id */
>> @@ -97,4 +99,49 @@ struct smcd_dev {
>>       u8 going_away : 1;
>>   };
>> +/*
>> + * This structure is used to store the parameters passed to the member of struct_ops.
>> + * Due to the BPF verifier cannot restrict the writing of bit fields, such as limiting
>> + * it to only write ireq->smc_ok. Using kfunc can solve this issue, but we don't want
>> + * to introduce a kfunc with such a narrow function.
> 
> imo, adding kfunc is fine.
> 
>> + *
>> + * Moreover, using this structure for unified parameters also addresses another
>> + * potential issue. Currently, kfunc cannot recognize the calling context
>> + * through BPF's existing structure. In the future, we can solve this problem
>> + * by passing this ctx to kfunc.
> 
> This part I don't understand. How is it different from the "tcp_cubic_kfunc_set" allowed in 
> tcp_congestion_ops?

Hi Martin,

Yes, creating an independent kfunc for each callback and filtering via expected_attach_type can 
indeed solve the problem.

Our main concern is to avoid introducing kfuncs as much as possible. For our subsystem, we might
need to maintain them the way a uapi is maintained, as we certainly have user applications
depending on it.

This is also why we needed to create a separate ctx: there is no way to restrict bit-field writes,
so we created ctx->smc_ok, which the program is allowed to write.

This is also why we had to create a separate structure, tcpsmc_ctx ...

However, I now realize that compromising to avoid introducing kfuncs has gone too far, affecting the 
readability of the code. I will try to use kfuncs in the next version to solve those issues.


> 
>> + */
>> +struct smc_bpf_ops_ctx {
>> +    struct {
>> +        struct tcp_sock *tp;
>> +    } set_option;
>> +    struct {
>> +        const struct tcp_sock *tp;
>> +        struct inet_request_sock *ireq;
>> +        int smc_ok;
>> +    } set_option_cond;
>> +};
> 
> There is no need to create one single ctx for struct_ops prog. struct_ops prog can take >1 args and 
> different ops can take different args.
> 

Same reason with concern on kfunc. I'll change it in next version.


>> +
>> +struct smc_bpf_ops {
>> +    /* priavte */
>> +
>> +    struct list_head    list;
>> +
>> +    /* public */
>> +
>> +    /* Invoked before computing SMC option for SYN packets.
>> +     * We can control whether to set SMC options by modifying
>> +     * ctx->set_option->tp->syn_smc.
>> +     * This's also the only member that can be modified now.
>> +     * Only member in ctx->set_option is valid for this callback.
>> +     */
>> +    void (*set_option)(struct smc_bpf_ops_ctx *ctx);
>> +
>> +    /* Invoked before Set up SMC options for SYN-ACK packets
>> +     * We can control whether to respond SMC options by modifying
>> +     * ctx->set_option_cond.smc_ok.
>> +     * Only member in ctx->set_option_cond is valid for this callback.
>> +     */
>> +    void (*set_option_cond)(struct smc_bpf_ops_ctx *ctx);
> 
> The struct smc_bpf_ops already has set_option and set_option_cnd, but...
> 
>> +};
>> +
>>   #endif    /* _SMC_H */
>> diff --git a/include/net/tcp.h b/include/net/tcp.h
>> index 739a9fb..c322443 100644
>> --- a/include/net/tcp.h
>> +++ b/include/net/tcp.h
>> @@ -2730,6 +2730,12 @@ static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt)
>>   #if IS_ENABLED(CONFIG_SMC)
>>   extern struct static_key_false tcp_have_smc;
>> +struct tcpsmc_ctx {
>> +    /* Invoked before computing SMC option for SYN packets. */
>> +    void (*set_option)(struct tcp_sock *tp);
>> +    /* Invoked before Set up SMC options for SYN-ACK packets */
>> +    void (*set_option_cond)(const struct tcp_sock *tp, struct inet_request_sock *ireq);
>> +};
> 
> another new struct tcpsmc_ctx has exactly the same functions (at least the same name) but different 
> arguments. I don't understand why this duplicate, is it because the need to prepare the "struct 
> smc_bpf_ops_ctx"?

Yes, same reason with concern on kfunc. I'll change it in next version.

> 
> The "struct tcpsmc_ctx" should be the "struct smc_bpf_ops" itself.
> 
> [ ... ]
> 
>> +static int smc_bpf_ops_btf_struct_access(struct bpf_verifier_log *log,
>> +                     const struct bpf_reg_state *reg,
>> +                     const struct bpf_prog *prog,
>> +                     int off, int size)
>> +{
>> +    const struct btf_member *member;
>> +    const char *mname;
>> +    int member_idx;
>> +
>> +    member_idx = prog->expected_attach_type;
>> +    if (member_idx >= btf_type_vlen(smc_bpf_ops_type))
>> +        goto out_err;
>> +
>> +    member = &btf_type_member(smc_bpf_ops_type)[member_idx];
>> +    mname = btf_str_by_offset(saved_btf, member->name_off);
>> +
>> +    if (!strcmp(mname, "set_option")) {
> 
> btf_member_bit_offset can be used instead of strcmp. Take a look at bpf_tcp_ca.c and kernel/sched/ext.c
> 

Got it, thanks for that.

Besides, it seems that with this approach we no longer need to export btf_str_by_offset.
I'll remove it in the next version.


>> +        /* only support to modify tcp_sock->syn_smc */
>> +        if (reg->btf_id == tcp_sock_id &&
>> +            off == offsetof(struct tcp_sock, syn_smc) &&
>> +            off + size == offsetofend(struct tcp_sock, syn_smc))
>> +            return 0;
>> +    } else if (!strcmp(mname, "set_option_cond")) {
>> +        /* only support to modify smc_bpf_ops_ctx->smc_ok */
>> +        if (reg->btf_id == smc_bpf_ops_ctx_id &&
>> +            off == offsetof(struct smc_bpf_ops_ctx, set_option_cond.smc_ok) &&
>> +            off + size == offsetofend(struct smc_bpf_ops_ctx, set_option_cond.smc_ok))
>> +            return 0;
>> +    }
>> +
>> +out_err:
>> +    return -EACCES;
>> +}
>> +
>> +static const struct bpf_verifier_ops smc_bpf_verifier_ops = {
>> +    .get_func_proto = bpf_base_func_proto,
>> +    .is_valid_access = bpf_tracing_btf_ctx_access,
>> +    .btf_struct_access = smc_bpf_ops_btf_struct_access,
>> +};
>> +
>> +static struct bpf_struct_ops bpf_smc_bpf_ops = {
>> +    .init = smc_bpf_ops_init,
>> +    .name = "smc_bpf_ops",
>> +    .reg = smc_bpf_ops_reg,
>> +    .unreg = smc_bpf_ops_unreg,
>> +    .cfi_stubs = &__bpf_smc_bpf_ops,
>> +    .verifier_ops = &smc_bpf_verifier_ops,
>> +    .init_member = smc_bpf_ops_init_member,
>> +    .check_member = smc_bpf_ops_check_member,
>> +    .owner = THIS_MODULE,
>> +};
>> +
>> +int smc_bpf_struct_ops_init(void)
>> +{
>> +    return register_bpf_struct_ops(&bpf_smc_bpf_ops, smc_bpf_ops);
>> +}
>> +
>> +void bpf_smc_set_tcp_option(struct tcp_sock *tp)
>> +{
>> +    struct smc_bpf_ops_ctx ops_ctx = {};
>> +    struct smc_bpf_ops *ops;
>> +
>> +    ops_ctx.set_option.tp = tp;
> 
> All this initialization should be unnecessary. Directly pass tp instead.
> 

Same reason with kfunc concern. I'll change it in next version.

>> +
>> +    rcu_read_lock();
>> +    list_for_each_entry_rcu(ops, &smc_bpf_ops_list, list) {
> 
> Does it need to have a list (meaning >1) of smc_bpf_ops to act on a sock? The ordering expectation 
> is hard to manage.
> 

Considering that the SMC module also has its own ops that need to be registered on it (the logic
of limit_smc_hs), and that all of them need to be executed, perhaps a list is a more suitable choice.


>> +        ops->set_option(&ops_ctx);
> 
> A dumb question. This will only affect AF_SMC (or AF_INET[6]/IPPROTO_SMC) socket but not the 
> AF_INET[6]/IPPROTO_{TCP,UDP} socket?
> 

Yes, it only affects AF_SMC, AF_SMC6, or IPPROTO_SMC sockets, because only SMC sockets set
tp->syn_smc, and we check it before calling the ops.

Best wishes,
D.

> pw-bot: cr
> 
>> +    }
>> +    rcu_read_unlock();
>> +}
Martin KaFai Lau Oct. 25, 2024, 6:30 p.m. UTC | #3
On 10/25/24 4:05 AM, D. Wythe wrote:
> Our main concern is to avoid introducing kfuncs as much as possible. For our 
> subsystem, we might need to maintain it in a way that maintains a uapi, as we 
> certainly have user applications depending on it.

The smc_bpf_ops can read/write the tp and ireq. In patch 4, there is 
'tp->syn_smc = 1'. I assume the real bpf prog will read something from the tp to 
make the decision also. Note that tp/ireq is also not in the uapi but the CO-RE 
can help in case the tp->syn_smc bool is moved around.

From looking at the selftest in patch 4 again, I think all it needs is for the
bpf prog (i.e. the ops) to return a bool instead of allowing the bpf prog to
write or call a kfunc to change the tp/ireq.
D. Wythe Oct. 29, 2024, 8:53 a.m. UTC | #4
On 10/26/24 2:30 AM, Martin KaFai Lau wrote:
> On 10/25/24 4:05 AM, D. Wythe wrote:
>> Our main concern is to avoid introducing kfuncs as much as possible. For our subsystem, we might 
>> need to maintain it in a way that maintains a uapi, as we certainly have user applications 
>> depending on it.
> 
> The smc_bpf_ops can read/write the tp and ireq. In patch 4, there is 'tp->syn_smc = 1'. I assume the 
> real bpf prog will read something from the tp to make the decision also. Note that tp/ireq is also 
> not in the uapi but the CO-RE can help in case the tp->syn_smc bool is moved around.
> 
>  From looking at the selftest in patch 4 again, I think all it needs is for the bpf prog (i.e. the 
> ops) to return a bool instead of allowing the bpf prog to write or call a kfunc to change the tp/ireq.
> 

Hi Martin,

At the beginning, I did modify it by returning values, but later I wanted to make these ops more
universal, so I considered influencing the behavior by modifying the tp without returning any value.
But considering that we currently do not have any other needs, perhaps modifying it by returning a
value would be more appropriate.

And if that's the case, we won't need to add new prog parameters to struct_access anymore. I'll
try this in the next series.

Thanks,
D. Wythe
diff mbox series

Patch

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 6a5e08b..4ef160a 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -478,7 +478,7 @@  struct tcp_sock {
 #endif
 #if IS_ENABLED(CONFIG_SMC)
 	bool	syn_smc;	/* SYN includes SMC */
-	bool	(*smc_hs_congested)(const struct sock *sk);
+	struct tcpsmc_ctx *smc;
 #endif
 
 #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
diff --git a/include/net/smc.h b/include/net/smc.h
index db84e4e..34ab2c6 100644
--- a/include/net/smc.h
+++ b/include/net/smc.h
@@ -18,6 +18,8 @@ 
 #include "linux/ism.h"
 
 struct sock;
+struct tcp_sock;
+struct inet_request_sock;
 
 #define SMC_MAX_PNETID_LEN	16	/* Max. length of PNET id */
 
@@ -97,4 +99,49 @@  struct smcd_dev {
 	u8 going_away : 1;
 };
 
+/*
+ * This structure is used to store the parameters passed to the members of struct_ops.
+ * It is needed because the BPF verifier cannot restrict writes to bit fields, e.g.
+ * limit them to only ireq->smc_ok. Using a kfunc can solve this issue, but we don't
+ * want to introduce a kfunc with such a narrow function.
+ *
+ * Moreover, using this structure for unified parameters also addresses another
+ * potential issue. Currently, kfunc cannot recognize the calling context
+ * through BPF's existing structure. In the future, we can solve this problem
+ * by passing this ctx to kfunc.
+ */
+struct smc_bpf_ops_ctx {
+	struct {
+		struct tcp_sock *tp;
+	} set_option;
+	struct {
+		const struct tcp_sock *tp;
+		struct inet_request_sock *ireq;
+		int smc_ok;
+	} set_option_cond;
+};
+
+struct smc_bpf_ops {
+	/* private */
+
+	struct list_head	list;
+
+	/* public */
+
+	/* Invoked before computing the SMC option for SYN packets.
+	 * We can control whether to set SMC options by modifying
+	 * ctx->set_option.tp->syn_smc.
+	 * This is also the only member that can be modified now.
+	 * Only the members in ctx->set_option are valid for this callback.
+	 */
+	void (*set_option)(struct smc_bpf_ops_ctx *ctx);
+
+	/* Invoked before setting up SMC options for SYN-ACK packets.
+	 * We can control whether to respond with SMC options by modifying
+	 * ctx->set_option_cond.smc_ok.
+	 * Only the members in ctx->set_option_cond are valid for this callback.
+	 */
+	void (*set_option_cond)(struct smc_bpf_ops_ctx *ctx);
+};
+
 #endif	/* _SMC_H */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 739a9fb..c322443 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2730,6 +2730,12 @@  static inline void tcp_bpf_rtt(struct sock *sk, long mrtt, u32 srtt)
 
 #if IS_ENABLED(CONFIG_SMC)
 extern struct static_key_false tcp_have_smc;
+struct tcpsmc_ctx {
+	/* Invoked before computing SMC option for SYN packets. */
+	void (*set_option)(struct tcp_sock *tp);
+	/* Invoked before setting up SMC options for SYN-ACK packets. */
+	void (*set_option_cond)(const struct tcp_sock *tp, struct inet_request_sock *ireq);
+};
 #endif
 
 #if IS_ENABLED(CONFIG_TLS_DEVICE)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 2d844e1..8ebd529 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -7070,8 +7070,7 @@  static void tcp_openreq_init(struct request_sock *req,
 	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
 	ireq->ir_mark = inet_request_mark(sk, skb);
 #if IS_ENABLED(CONFIG_SMC)
-	ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested &&
-			tcp_sk(sk)->smc_hs_congested(sk));
+	ireq->smc_ok = rx_opt->smc_ok;
 #endif
 }
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 054244ce..5ab47dd 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -759,14 +759,17 @@  static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
 	mptcp_options_write(th, ptr, tp, opts);
 }
 
-static void smc_set_option(const struct tcp_sock *tp,
+static void smc_set_option(struct tcp_sock *tp,
 			   struct tcp_out_options *opts,
 			   unsigned int *remaining)
 {
 #if IS_ENABLED(CONFIG_SMC)
 	if (static_branch_unlikely(&tcp_have_smc)) {
 		if (tp->syn_smc) {
-			if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+			if (tp->smc && tp->smc->set_option)
+				tp->smc->set_option(tp);
+			/* set_option may modify syn_smc, so it needs to be checked again */
+			if (tp->syn_smc && *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
 				opts->options |= OPTION_SMC;
 				*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
 			}
@@ -776,14 +779,17 @@  static void smc_set_option(const struct tcp_sock *tp,
 }
 
 static void smc_set_option_cond(const struct tcp_sock *tp,
-				const struct inet_request_sock *ireq,
+				struct inet_request_sock *ireq,
 				struct tcp_out_options *opts,
 				unsigned int *remaining)
 {
 #if IS_ENABLED(CONFIG_SMC)
 	if (static_branch_unlikely(&tcp_have_smc)) {
 		if (tp->syn_smc && ireq->smc_ok) {
-			if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+			if (tp->smc && tp->smc->set_option_cond)
+				tp->smc->set_option_cond(tp, ireq);
+			/* set_option_cond may modify smc_ok, so it needs to be checked again */
+			if (ireq->smc_ok && *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
 				opts->options |= OPTION_SMC;
 				*remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
 			}
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
index ba5e6a2..1eca835 100644
--- a/net/smc/Kconfig
+++ b/net/smc/Kconfig
@@ -33,3 +33,15 @@  config SMC_LO
 	  of architecture or hardware.
 
 	  if unsure, say N.
+
+config SMC_BPF
+	bool "eBPF support for SMC subsystem"
+	depends on SMC && BPF_SYSCALL
+	default n
+	help
+	  This option enables support for eBPF programs for SMC
+	  subsystem. eBPF programs offer much greater flexibility
+	  in modifying the behavior of the SMC protocol stack compared
+	  to a complete kernel-based approach.
+
+	  if unsure, say N.
diff --git a/net/smc/Makefile b/net/smc/Makefile
index 60f1c87..1c04906 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -7,3 +7,4 @@  smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_sta
 smc-y += smc_tracepoint.o smc_inet.o
 smc-$(CONFIG_SYSCTL) += smc_sysctl.o
 smc-$(CONFIG_SMC_LO) += smc_loopback.o
+smc-$(CONFIG_SMC_BPF) += smc_bpf.o
\ No newline at end of file
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 0316217..316c8a1 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -55,6 +55,7 @@ 
 #include "smc_sysctl.h"
 #include "smc_loopback.h"
 #include "smc_inet.h"
+#include "smc_bpf.h"
 
 static DEFINE_MUTEX(smc_server_lgr_pending);	/* serialize link group
 						 * creation on server
@@ -156,19 +157,25 @@  static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk,
 	return NULL;
 }
 
-static bool smc_hs_congested(const struct sock *sk)
+static void smc_set_tcp_option_cond(const struct tcp_sock *tp, struct inet_request_sock *ireq)
 {
 	const struct smc_sock *smc;
 
-	smc = smc_clcsock_user_data(sk);
+	smc = smc_clcsock_user_data(&tp->inet_conn.icsk_inet.sk);
 
 	if (!smc)
-		return true;
+		goto no_smc;
 
-	if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq))
-		return true;
+	if (smc->limit_smc_hs && workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq))
+		goto no_smc;
 
-	return false;
+#if IS_ENABLED(CONFIG_SMC_BPF)
+	bpf_smc_set_tcp_option_cond(tp, ireq);
+#endif /* CONFIG_SMC_BPF */
+
+	return;
+no_smc:
+	ireq->smc_ok = 0;
 }
 
 struct smc_hashinfo smc_v4_hashinfo = {
@@ -2650,9 +2657,6 @@  int smc_listen(struct socket *sock, int backlog)
 
 	inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops;
 
-	if (smc->limit_smc_hs)
-		tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;
-
 	rc = kernel_listen(smc->clcsock, backlog);
 	if (rc) {
 		write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
@@ -3324,6 +3328,13 @@  int smc_create_clcsk(struct net *net, struct sock *sk, int family)
 	sk->sk_net_refcnt = 1;
 	get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
 	sock_inuse_add(net, 1);
+
+	/* init tcp_smc_ctx */
+#if IS_ENABLED(CONFIG_SMC_BPF)
+	smc->tcp_smc_ctx.set_option = bpf_smc_set_tcp_option;
+#endif /* CONFIG_SMC_BPF */
+	smc->tcp_smc_ctx.set_option_cond = smc_set_tcp_option_cond;
+	tcp_sk(sk)->smc = &smc->tcp_smc_ctx;
 	return 0;
 }
 
@@ -3574,8 +3585,17 @@  static int __init smc_init(void)
 		pr_err("%s: smc_inet_init fails with %d\n", __func__, rc);
 		goto out_ulp;
 	}
+
+	rc = smc_bpf_struct_ops_init();
+	if (rc) {
+		pr_err("%s: smc_bpf_struct_ops_init fails with %d\n", __func__, rc);
+		goto out_inet;
+	}
+
 	static_branch_enable(&tcp_have_smc);
 	return 0;
+out_inet:
+	smc_inet_exit();
 out_ulp:
 	tcp_unregister_ulp(&smc_ulp_ops);
 out_lo:
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 78ae10d..a9794fb 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -16,6 +16,7 @@ 
 #include <linux/compiler.h> /* __aligned */
 #include <net/genetlink.h>
 #include <net/sock.h>
+#include <net/tcp.h>
 
 #include "smc_ib.h"
 
@@ -328,6 +329,9 @@  struct smc_sock {				/* smc sock container */
 						/* protects clcsock of a listen
 						 * socket
 						 * */
+
+	/* smc context for tcp stack */
+	struct tcpsmc_ctx	tcp_smc_ctx;
 };
 
 #define smc_sk(ptr) container_of_const(ptr, struct smc_sock, sk)
diff --git a/net/smc/smc_bpf.c b/net/smc/smc_bpf.c
new file mode 100644
index 00000000..fa90406
--- /dev/null
+++ b/net/smc/smc_bpf.c
@@ -0,0 +1,212 @@ 
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  support for eBPF programs in SMC subsystem.
+ *
+ *  Copyright IBM Corp. 2016
+ *  Copyright (c) 2024, Alibaba Inc.
+ *
+ *  Author: D. Wythe <alibuda@linux.alibaba.com>
+ */
+
+#include <linux/bpf_verifier.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <net/smc.h>
+
+#include "smc_bpf.h"
+
+/* Serializes updates of smc_bpf_ops_list; readers walk the list under RCU. */
+static DEFINE_SPINLOCK(smc_bpf_ops_list_lock);
+/* All smc_bpf_ops instances that are currently registered. */
+static LIST_HEAD(smc_bpf_ops_list);
+
+/* BTF handles cached by smc_bpf_ops_init() for the verifier callback. */
+static const struct btf *saved_btf;
+static const struct btf_type *smc_bpf_ops_type;
+static u32 tcp_sock_id;
+static u32 smc_bpf_ops_ctx_id;
+
+/*
+ * struct_ops ->init callback: resolve the BTF types used by the verifier
+ * callback and cache them in the file-scope variables.
+ *
+ * Returns 0 on success, or -EINVAL when one of the structs "tcp_sock",
+ * "smc_bpf_ops_ctx" or "smc_bpf_ops" cannot be found in @btf.
+ */
+static int smc_bpf_ops_init(struct btf *btf)
+{
+	s32 id;
+
+	id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT);
+	if (id < 0)
+		goto not_found;
+	tcp_sock_id = id;
+
+	id = btf_find_by_name_kind(btf, "smc_bpf_ops_ctx", BTF_KIND_STRUCT);
+	if (id < 0)
+		goto not_found;
+	smc_bpf_ops_ctx_id = id;
+
+	id = btf_find_by_name_kind(btf, "smc_bpf_ops", BTF_KIND_STRUCT);
+	if (id < 0)
+		goto not_found;
+	smc_bpf_ops_type = btf_type_by_id(btf, id);
+
+	saved_btf = btf;
+	return 0;
+
+not_found:
+	return -EINVAL;
+}
+
+/*
+ * struct_ops ->init_member callback.
+ *
+ * Only the embedded list node is handled here: it is initialised to an
+ * empty list and 1 is returned. Every other member yields 0.
+ * NOTE(review): a positive return presumably tells the struct_ops core that
+ * the member has been taken care of - confirm against the core.
+ */
+static int smc_bpf_ops_init_member(const struct btf_type *t,
+				   const struct btf_member *member,
+				   void *kdata, const void *udata)
+{
+	struct smc_bpf_ops *k_ops = kdata;
+	u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+	if (moff != offsetof(struct smc_bpf_ops, list))
+		return 0;
+
+	INIT_LIST_HEAD(&k_ops->list);
+	return 1;
+}
+
+/*
+ * struct_ops ->check_member callback.
+ *
+ * Accepts @member only when it is set_option or set_option_cond; any other
+ * member offset is rejected with -EINVAL.
+ */
+static int smc_bpf_ops_check_member(const struct btf_type *t,
+				    const struct btf_member *member,
+				    const struct bpf_prog *prog)
+{
+	u32 moff = __btf_member_bit_offset(t, member) / 8;
+
+	if (moff == offsetof(struct smc_bpf_ops, set_option) ||
+	    moff == offsetof(struct smc_bpf_ops, set_option_cond))
+		return 0;
+
+	return -EINVAL;
+}
+
+/*
+ * struct_ops ->reg callback: append @kdata to smc_bpf_ops_list.
+ *
+ * Returns 0 on success, or -EINVAL when the list node of @kdata is not
+ * empty, i.e. the ops look like they are linked already.
+ */
+static int smc_bpf_ops_reg(void *kdata, struct bpf_link *link)
+{
+	struct smc_bpf_ops *k_ops = kdata;
+
+	/* Only a node that is not linked anywhere may be added. */
+	if (list_empty(&k_ops->list)) {
+		spin_lock(&smc_bpf_ops_list_lock);
+		list_add_tail_rcu(&k_ops->list, &smc_bpf_ops_list);
+		spin_unlock(&smc_bpf_ops_list_lock);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * struct_ops ->unreg callback: unlink @kdata from smc_bpf_ops_list and wait
+ * until no RCU reader can still be using it.
+ */
+static void smc_bpf_ops_unreg(void *kdata, struct bpf_link *link)
+{
+	struct smc_bpf_ops *ops = kdata;
+
+	spin_lock(&smc_bpf_ops_list_lock);
+	list_del_rcu(&ops->list);
+	spin_unlock(&smc_bpf_ops_list_lock);
+
+	/* Wait for all readers that may still see @ops to finish. */
+	synchronize_rcu();
+
+	/*
+	 * list_del_rcu() leaves ->next pointing into the list, so
+	 * list_empty(&ops->list) would stay false and smc_bpf_ops_reg() would
+	 * reject these ops with -EINVAL forever. After the grace period no
+	 * reader can reach the node, so reset it to allow re-registration.
+	 */
+	INIT_LIST_HEAD(&ops->list);
+}
+
+/* Empty stand-ins for the two callbacks; referenced through .cfi_stubs. */
+static void __bpf_smc_stub_set_tcp_option(struct smc_bpf_ops_ctx *ops_ctx)
+{
+}
+
+static void __bpf_smc_stub_set_tcp_option_cond(struct smc_bpf_ops_ctx *ops_ctx)
+{
+}
+
+static struct smc_bpf_ops __bpf_smc_bpf_ops = {
+	.set_option_cond = __bpf_smc_stub_set_tcp_option_cond,
+	.set_option = __bpf_smc_stub_set_tcp_option,
+};
+
+/*
+ * Verifier ->btf_struct_access callback for smc_bpf_ops programs.
+ *
+ * prog->expected_attach_type is used as the index of the smc_bpf_ops member
+ * the program implements. Depending on that member exactly one access is
+ * let through:
+ *   set_option:      tcp_sock->syn_smc
+ *   set_option_cond: smc_bpf_ops_ctx->set_option_cond.smc_ok
+ * and the access has to cover precisely that field.
+ *
+ * Returns 0 when the access is allowed, -EACCES otherwise.
+ */
+static int smc_bpf_ops_btf_struct_access(struct bpf_verifier_log *log,
+					 const struct bpf_reg_state *reg,
+					 const struct bpf_prog *prog,
+					 int off, int size)
+{
+	int idx = prog->expected_attach_type;
+	const struct btf_member *m;
+	const char *name;
+
+	if (idx >= btf_type_vlen(smc_bpf_ops_type))
+		return -EACCES;
+
+	m = &btf_type_member(smc_bpf_ops_type)[idx];
+	name = btf_str_by_offset(saved_btf, m->name_off);
+
+	if (strcmp(name, "set_option") == 0) {
+		/* tcp_sock->syn_smc is the only field that may be touched */
+		if (reg->btf_id != tcp_sock_id)
+			return -EACCES;
+		if (off != offsetof(struct tcp_sock, syn_smc))
+			return -EACCES;
+		if (off + size != offsetofend(struct tcp_sock, syn_smc))
+			return -EACCES;
+		return 0;
+	}
+
+	if (strcmp(name, "set_option_cond") == 0) {
+		/* smc_bpf_ops_ctx->smc_ok is the only field that may be touched */
+		if (reg->btf_id != smc_bpf_ops_ctx_id)
+			return -EACCES;
+		if (off != offsetof(struct smc_bpf_ops_ctx, set_option_cond.smc_ok))
+			return -EACCES;
+		if (off + size != offsetofend(struct smc_bpf_ops_ctx, set_option_cond.smc_ok))
+			return -EACCES;
+		return 0;
+	}
+
+	return -EACCES;
+}
+
+/* Verifier hooks applied to programs attached to smc_bpf_ops. */
+static const struct bpf_verifier_ops smc_bpf_verifier_ops = {
+	.btf_struct_access = smc_bpf_ops_btf_struct_access,
+	.is_valid_access = bpf_tracing_btf_ctx_access,
+	.get_func_proto = bpf_base_func_proto,
+};
+
+/* struct_ops descriptor tying the callbacks above to "smc_bpf_ops". */
+static struct bpf_struct_ops bpf_smc_bpf_ops = {
+	.name = "smc_bpf_ops",
+	.owner = THIS_MODULE,
+	.verifier_ops = &smc_bpf_verifier_ops,
+	.cfi_stubs = &__bpf_smc_bpf_ops,
+	.init = smc_bpf_ops_init,
+	.init_member = smc_bpf_ops_init_member,
+	.check_member = smc_bpf_ops_check_member,
+	.reg = smc_bpf_ops_reg,
+	.unreg = smc_bpf_ops_unreg,
+};
+
+/*
+ * Register the smc_bpf_ops struct_ops type.
+ *
+ * Returns whatever register_bpf_struct_ops() returns.
+ */
+int smc_bpf_struct_ops_init(void)
+{
+	int rc;
+
+	rc = register_bpf_struct_ops(&bpf_smc_bpf_ops, smc_bpf_ops);
+	return rc;
+}
+
+/*
+ * Call the set_option callback of every registered smc_bpf_ops, in list
+ * order, with a context that carries @tp.
+ */
+void bpf_smc_set_tcp_option(struct tcp_sock *tp)
+{
+	struct smc_bpf_ops_ctx ctx = {};
+	struct smc_bpf_ops *pos;
+
+	ctx.set_option.tp = tp;
+
+	/* The list is only read here, so RCU protection is sufficient. */
+	rcu_read_lock();
+	list_for_each_entry_rcu(pos, &smc_bpf_ops_list, list)
+		pos->set_option(&ctx);
+	rcu_read_unlock();
+}
+
+/*
+ * Call the set_option_cond callback of every registered smc_bpf_ops, in
+ * list order. All callbacks share one context, which is seeded with @tp,
+ * @ireq and the current ireq->smc_ok; the smc_ok value left in the context
+ * afterwards is written back to @ireq.
+ */
+void bpf_smc_set_tcp_option_cond(const struct tcp_sock *tp, struct inet_request_sock *ireq)
+{
+	struct smc_bpf_ops_ctx ctx = {};
+	struct smc_bpf_ops *pos;
+
+	ctx.set_option_cond.smc_ok = ireq->smc_ok;
+	ctx.set_option_cond.ireq = ireq;
+	ctx.set_option_cond.tp = tp;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(pos, &smc_bpf_ops_list, list)
+		pos->set_option_cond(&ctx);
+	rcu_read_unlock();
+
+	/* Publish the outcome of the callbacks. */
+	ireq->smc_ok = ctx.set_option_cond.smc_ok;
+}
diff --git a/net/smc/smc_bpf.h b/net/smc/smc_bpf.h
new file mode 100644
index 00000000..a5ed0fc
--- /dev/null
+++ b/net/smc/smc_bpf.h
@@ -0,0 +1,34 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  support for eBPF programs in SMC subsystem.
+ *
+ *  Copyright IBM Corp. 2016
+ *  Copyright (c) 2024, Alibaba Inc.
+ *
+ *  Author: D. Wythe <alibuda@linux.alibaba.com>
+ */
+#ifndef __SMC_BPF
+#define __SMC_BPF
+
+#include <linux/types.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+
+#if IS_ENABLED(CONFIG_SMC_BPF)
+
+/* Initialize struct_ops registration. It will automatically unload
+ * when module is unloaded.
+ * @return 0 on success
+ */
+int smc_bpf_struct_ops_init(void);
+
+/* Run the set_option callback of every registered smc_bpf_ops for @tp. */
+void bpf_smc_set_tcp_option(struct tcp_sock *tp);
+
+/* Run the set_option_cond callback of every registered smc_bpf_ops for
+ * @tp and @ireq; the resulting smc_ok value is stored in @ireq.
+ */
+void bpf_smc_set_tcp_option_cond(const struct tcp_sock *tp, struct inet_request_sock *ireq);
+
+#else
+/* Without CONFIG_SMC_BPF there is nothing to register; report success. */
+static inline int smc_bpf_struct_ops_init(void) { return 0; }
+#endif /* CONFIG_SMC_BPF */
+
+#endif /* __SMC_BPF */