From patchwork Tue Mar 1 09:43:56 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Dust Li X-Patchwork-Id: 12764421 X-Patchwork-Delegate: kuba@kernel.org Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 1494DC433EF for ; Tue, 1 Mar 2022 09:44:12 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S233518AbiCAJou (ORCPT ); Tue, 1 Mar 2022 04:44:50 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55814 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232098AbiCAJot (ORCPT ); Tue, 1 Mar 2022 04:44:49 -0500 Received: from out30-42.freemail.mail.aliyun.com (out30-42.freemail.mail.aliyun.com [115.124.30.42]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id BFAA73B01E; Tue, 1 Mar 2022 01:44:07 -0800 (PST) X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R821e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=e01e04423;MF=dust.li@linux.alibaba.com;NM=1;PH=DS;RN=8;SR=0;TI=SMTPD_---0V5vuqVq_1646127844; Received: from localhost(mailfrom:dust.li@linux.alibaba.com fp:SMTPD_---0V5vuqVq_1646127844) by smtp.aliyun-inc.com(127.0.0.1); Tue, 01 Mar 2022 17:44:05 +0800 From: Dust Li To: Karsten Graul , Tony Lu , Guangguan Wang Cc: davem@davemloft.net, kuba@kernel.org, netdev@vger.kernel.org, linux-s390@vger.kernel.org, linux-rdma@vger.kernel.org Subject: [PATCH net-next 1/7] net/smc: add sysctl interface for SMC Date: Tue, 1 Mar 2022 17:43:56 +0800 Message-Id: <20220301094402.14992-2-dust.li@linux.alibaba.com> X-Mailer: git-send-email 2.19.1.3.ge56e4f7 In-Reply-To: <20220301094402.14992-1-dust.li@linux.alibaba.com> References: <20220301094402.14992-1-dust.li@linux.alibaba.com> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org X-Patchwork-Delegate: kuba@kernel.org This patch add sysctl interface to support container environment for SMC as we talk in the mail list. Link: https://lore.kernel.org/netdev/20220224020253.GF5443@linux.alibaba.com Co-developed-by: Tony Lu Signed-off-by: Tony Lu Signed-off-by: Dust Li --- include/net/netns/smc.h | 3 ++ net/smc/Makefile | 2 +- net/smc/af_smc.c | 10 ++++++ net/smc/smc_sysctl.c | 70 +++++++++++++++++++++++++++++++++++++++++ net/smc/smc_sysctl.h | 32 +++++++++++++++++++ 5 files changed, 116 insertions(+), 1 deletion(-) create mode 100644 net/smc/smc_sysctl.c create mode 100644 net/smc/smc_sysctl.h diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 47b166684fd8..1682eae50579 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -14,5 +14,8 @@ struct netns_smc { struct smc_stats_rsn *fback_rsn; bool limit_smc_hs; /* constraint on handshake */ +#ifdef CONFIG_SYSCTL + struct ctl_table_header *smc_hdr; +#endif }; #endif diff --git a/net/smc/Makefile b/net/smc/Makefile index 196fb6f01b14..640af9a39f9c 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_SMC) += smc.o obj-$(CONFIG_SMC_DIAG) += smc_diag.o smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o -smc-y += smc_tracepoint.o +smc-y += smc_tracepoint.o smc_sysctl.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 30acc31b2c45..19b3066cf7af 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -51,6 +51,7 @@ #include "smc_close.h" #include "smc_stats.h" #include "smc_tracepoint.h" +#include "smc_sysctl.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -3273,9 +3274,17 @@ static int __init smc_init(void) goto out_sock; } + rc = smc_sysctl_init(); + if (rc) { + pr_err("%s: sysctl_init fails with %d\n", __func__, rc); + goto out_ulp; + } + static_branch_enable(&tcp_have_smc); return 0; +out_ulp: + tcp_unregister_ulp(&smc_ulp_ops); out_sock: sock_unregister(PF_SMC); out_proto6: @@ -3303,6 +3312,7 @@ static int __init smc_init(void) static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); + smc_sysctl_exit(); tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c new file mode 100644 index 000000000000..8a3a8e145976 --- /dev/null +++ b/net/smc/smc_sysctl.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.c: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. + * + * Author: Tony Lu + * + */ + +#include +#include +#include + +#include "smc_sysctl.h" + +static struct ctl_table smc_table[] = { + { } +}; + +static __net_init int smc_sysctl_init_net(struct net *net) +{ + struct ctl_table *table; + + table = smc_table; + if (!net_eq(net, &init_net)) { + int i; + + table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); + if (!table) + goto err_alloc; + + for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++) + table[i].data += (void *)net - (void *)&init_net; + } + + net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table); + if (!net->smc.smc_hdr) + goto err_reg; + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static __net_exit void smc_sysctl_exit_net(struct net *net) +{ + unregister_net_sysctl_table(net->smc.smc_hdr); +} + +static struct pernet_operations smc_sysctl_ops __net_initdata = { + .init = smc_sysctl_init_net, + .exit = smc_sysctl_exit_net, +}; + +int __init smc_sysctl_init(void) +{ + return register_pernet_subsys(&smc_sysctl_ops); +} + +void smc_sysctl_exit(void) +{ + unregister_pernet_subsys(&smc_sysctl_ops); +} diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h new file mode 100644 index 000000000000..49553ac236b6 --- /dev/null +++ b/net/smc/smc_sysctl.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * smc_sysctl.c: sysctl interface to SMC subsystem. + * + * Copyright (c) 2022, Alibaba Inc. + * + * Author: Tony Lu + * + */ + +#ifndef _SMC_SYSCTL_H +#define _SMC_SYSCTL_H + +#ifdef CONFIG_SYSCTL + +int smc_sysctl_init(void); +void smc_sysctl_exit(void); + +#else + +int smc_sysctl_init(void) +{ + return 0; +} + +void smc_sysctl_exit(void) { } + +#endif /* CONFIG_SYSCTL */ + +#endif /* _SMC_SYSCTL_H */ From patchwork Tue Mar 1 09:43:57 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Dust Li X-Patchwork-Id: 12764423 X-Patchwork-Delegate: kuba@kernel.org Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id C0134C433EF for ; Tue, 1 Mar 2022 09:44:15 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S233968AbiCAJow (ORCPT ); Tue, 1 Mar 2022 04:44:52 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55848 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229607AbiCAJou (ORCPT ); Tue, 1 Mar 2022 04:44:50 -0500 Received: from out30-57.freemail.mail.aliyun.com (out30-57.freemail.mail.aliyun.com [115.124.30.57]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 96F2E3B3C5; Tue, 1 Mar 2022 01:44:08 -0800 (PST) X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R821e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=e01e04394;MF=dust.li@linux.alibaba.com;NM=1;PH=DS;RN=8;SR=0;TI=SMTPD_---0V5wm1Jv_1646127845; Received: from localhost(mailfrom:dust.li@linux.alibaba.com fp:SMTPD_---0V5wm1Jv_1646127845) by smtp.aliyun-inc.com(127.0.0.1); Tue, 01 Mar 2022 17:44:06 +0800 From: Dust Li To: Karsten Graul , Tony Lu , Guangguan Wang Cc: davem@davemloft.net, kuba@kernel.org, netdev@vger.kernel.org, linux-s390@vger.kernel.org, linux-rdma@vger.kernel.org Subject: [PATCH net-next 2/7] net/smc: add autocorking support Date: Tue, 1 Mar 2022 17:43:57 +0800 Message-Id: <20220301094402.14992-3-dust.li@linux.alibaba.com> X-Mailer: git-send-email 2.19.1.3.ge56e4f7 In-Reply-To: <20220301094402.14992-1-dust.li@linux.alibaba.com> References: <20220301094402.14992-1-dust.li@linux.alibaba.com> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org X-Patchwork-Delegate: kuba@kernel.org This patch adds autocorking support for SMC which could improve throughput for small message by x3+. The main idea is borrowed from TCP autocorking with some RDMA specific modification: 1. The first message should never cork to make sure we won't bring extra latency 2. If we have posted any Tx WRs to the NIC that have not completed, cork the new messages until: a) Receive CQE for the last Tx WR b) We have corked enough message on the connection 3. Try to push the corked data out when we receive CQE of the last Tx WR to prevent the corked messages hang in the send queue. Both SMC autocorking and TCP autocorking check the TX completion to decide whether we should cork or not. The difference is when we got a SMC Tx WR completion, the data have been confirmed by the RNIC while TCP TX completion just tells us the data have been sent out by the local NIC. Add an atomic variable tx_pushing in smc_connection to make sure only one can send to let it cork more and save CDC slot. SMC autocorking should not bring extra latency since the first message will always been sent out immediately. The qperf tcp_bw test shows more than x4 increase under small message size with Mellanox connectX4-Lx, same result with other throughput benchmarks like sockperf/netperf. The qperf tcp_lat test shows SMC autocorking has not increase any ping-pong latency. Test command: client: smc_run taskset -c 1 qperf smc-server -oo msg_size:1:64K:*2 \ -t 30 -vu tcp_{bw|lat} server: smc_run taskset -c 1 qperf === Bandwidth ==== MsgSize(Bytes) SMC-NoCork TCP SMC-AutoCorking 1 0.578 MB/s 2.392 MB/s(313.57%) 2.647 MB/s(357.72%) 2 1.159 MB/s 4.780 MB/s(312.53%) 5.153 MB/s(344.71%) 4 2.283 MB/s 10.266 MB/s(349.77%) 10.363 MB/s(354.02%) 8 4.668 MB/s 19.040 MB/s(307.86%) 21.215 MB/s(354.45%) 16 9.147 MB/s 38.904 MB/s(325.31%) 41.740 MB/s(356.32%) 32 18.369 MB/s 79.587 MB/s(333.25%) 82.392 MB/s(348.52%) 64 36.562 MB/s 148.668 MB/s(306.61%) 161.564 MB/s(341.89%) 128 72.961 MB/s 274.913 MB/s(276.80%) 325.363 MB/s(345.94%) 256 144.705 MB/s 512.059 MB/s(253.86%) 633.743 MB/s(337.96%) 512 288.873 MB/s 884.977 MB/s(206.35%) 1250.681 MB/s(332.95%) 1024 574.180 MB/s 1337.736 MB/s(132.98%) 2246.121 MB/s(291.19%) 2048 1095.192 MB/s 1865.952 MB/s( 70.38%) 2057.767 MB/s( 87.89%) 4096 2066.157 MB/s 2380.337 MB/s( 15.21%) 2173.983 MB/s( 5.22%) 8192 3717.198 MB/s 2733.073 MB/s(-26.47%) 3491.223 MB/s( -6.08%) 16384 4742.221 MB/s 2958.693 MB/s(-37.61%) 4637.692 MB/s( -2.20%) 32768 5349.550 MB/s 3061.285 MB/s(-42.77%) 5385.796 MB/s( 0.68%) 65536 5162.919 MB/s 3731.408 MB/s(-27.73%) 5223.890 MB/s( 1.18%) ==== Latency ==== MsgSize(Bytes) SMC-NoCork TCP SMC-AutoCorking 1 10.540 us 11.938 us( 13.26%) 10.573 us( 0.31%) 2 10.996 us 11.992 us( 9.06%) 10.269 us( -6.61%) 4 10.229 us 11.687 us( 14.25%) 10.240 us( 0.11%) 8 10.203 us 11.653 us( 14.21%) 10.402 us( 1.95%) 16 10.530 us 11.313 us( 7.44%) 10.599 us( 0.66%) 32 10.241 us 11.586 us( 13.13%) 10.223 us( -0.18%) 64 10.693 us 11.652 us( 8.97%) 10.251 us( -4.13%) 128 10.597 us 11.579 us( 9.27%) 10.494 us( -0.97%) 256 10.409 us 11.957 us( 14.87%) 10.710 us( 2.89%) 512 11.088 us 12.505 us( 12.78%) 10.547 us( -4.88%) 1024 11.240 us 12.255 us( 9.03%) 10.787 us( -4.03%) 2048 11.485 us 16.970 us( 47.76%) 11.256 us( -1.99%) 4096 12.077 us 13.948 us( 15.49%) 12.230 us( 1.27%) 8192 13.683 us 16.693 us( 22.00%) 13.786 us( 0.75%) 16384 16.470 us 23.615 us( 43.38%) 16.459 us( -0.07%) 32768 22.540 us 40.966 us( 81.75%) 23.284 us( 3.30%) 65536 34.192 us 73.003 us(113.51%) 34.233 us( 0.12%) With SMC autocorking support, we can archive better throughput than TCP in most message sizes without any latency trade-off. Signed-off-by: Dust Li --- v2: 1. Remove empty line in smc_connection 2. use Reverse Christmas tree style for local variable. 3. remove redundant container_of v3: 1. use hex instead of decimal 2. Remove unintented removal of new line 3. Rename autocork to autocorking to be compliant with TCP 4. re-test the data, use SMC NoCork as baseline --- net/smc/smc.h | 2 + net/smc/smc_cdc.c | 11 +++-- net/smc/smc_tx.c | 107 ++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 105 insertions(+), 15 deletions(-) diff --git a/net/smc/smc.h b/net/smc/smc.h index a096d8af21a0..e266b04b7585 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -29,6 +29,7 @@ #define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM * devices */ +#define SMC_AUTOCORKING_DEFAULT_SIZE 0x10000 /* 64K by default */ extern struct proto smc_proto; extern struct proto smc_proto6; @@ -192,6 +193,7 @@ struct smc_connection { * - dec on polled tx cqe */ wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ + atomic_t tx_pushing; /* nr_threads trying tx push */ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 9d5a97168969..2b37bec90824 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -48,9 +48,14 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; } - if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) && - unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) - wake_up(&conn->cdc_pend_tx_wq); + if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) { + /* If this is the last pending WR complete, we must push to + * prevent hang when autocork enabled. + */ + smc_tx_sndbuf_nonempty(conn); + if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) + wake_up(&conn->cdc_pend_tx_wq); + } WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0); smc_tx_sndbuf_nonfull(smc); diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 436ac836f363..062c6b1535e3 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -131,6 +131,51 @@ static bool smc_tx_is_corked(struct smc_sock *smc) return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; } +/* If we have pending CDC messages, do not send: + * Because CQE of this CDC message will happen shortly, it gives + * a chance to coalesce future sendmsg() payload in to one RDMA Write, + * without need for a timer, and with no latency trade off. + * Algorithm here: + * 1. First message should never cork + * 2. If we have pending Tx CDC messages, wait for the first CDC + * message's completion + * 3. Don't cork to much data in a single RDMA Write to prevent burst + * traffic, total corked message should not exceed sendbuf/2 + */ +static bool smc_should_autocork(struct smc_sock *smc) +{ + struct smc_connection *conn = &smc->conn; + int corking_size; + + corking_size = min(SMC_AUTOCORKING_DEFAULT_SIZE, + conn->sndbuf_desc->len >> 1); + + if (atomic_read(&conn->cdc_pend_tx_wr) == 0 || + smc_tx_prepared_sends(conn) > corking_size) + return false; + return true; +} + +static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg) +{ + struct smc_connection *conn = &smc->conn; + + if (smc_should_autocork(smc)) + return true; + + /* for a corked socket defer the RDMA writes if + * sndbuf_space is still available. The applications + * should known how/when to uncork it. + */ + if ((msg->msg_flags & MSG_MORE || + smc_tx_is_corked(smc) || + msg->msg_flags & MSG_SENDPAGE_NOTLAST) && + atomic_read(&conn->sndbuf_space)) + return true; + + return false; +} + /* sndbuf producer: main API called by socket layer. * called under sock lock. */ @@ -235,13 +280,10 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) */ if ((msg->msg_flags & MSG_OOB) && !send_remaining) conn->urg_tx_pend = true; - /* for a corked socket defer the RDMA writes if - * sndbuf_space is still available. The applications - * should known how/when to uncork it. + /* If we need to cork, do nothing and wait for the next + * sendmsg() call or push on tx completion */ - if (!((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) || - msg->msg_flags & MSG_SENDPAGE_NOTLAST) && - atomic_read(&conn->sndbuf_space))) + if (!smc_tx_should_cork(smc, msg)) smc_tx_sndbuf_nonempty(conn); trace_smc_tx_sendmsg(smc, copylen); @@ -589,13 +631,26 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) return rc; } -int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +static int __smc_tx_sndbuf_nonempty(struct smc_connection *conn) { - int rc; + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + int rc = 0; + + /* No data in the send queue */ + if (unlikely(smc_tx_prepared_sends(conn) <= 0)) + goto out; + + /* Peer don't have RMBE space */ + if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) { + SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk); + goto out; + } if (conn->killed || - conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) - return -EPIPE; /* connection being aborted */ + conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { + rc = -EPIPE; /* connection being aborted */ + goto out; + } if (conn->lgr->is_smcd) rc = smcd_tx_sndbuf_nonempty(conn); else @@ -603,10 +658,38 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) if (!rc) { /* trigger socket release if connection is closing */ - struct smc_sock *smc = container_of(conn, struct smc_sock, - conn); smc_close_wake_tx_prepared(smc); } + +out: + return rc; +} + +int smc_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + int rc; + + /* This make sure only one can send simultaneously to prevent wasting + * of CPU and CDC slot. + * Record whether someone has tried to push while we are pushing. + */ + if (atomic_inc_return(&conn->tx_pushing) > 1) + return 0; + +again: + atomic_set(&conn->tx_pushing, 1); + smp_wmb(); /* Make sure tx_pushing is 1 before real send */ + rc = __smc_tx_sndbuf_nonempty(conn); + + /* We need to check whether someone else have added some data into + * the send queue and tried to push but failed after the atomic_set() + * when we are pushing. + * If so, we need to push again to prevent those data hang in the send + * queue. + */ + if (unlikely(!atomic_dec_and_test(&conn->tx_pushing))) + goto again; + return rc; } From patchwork Tue Mar 1 09:43:58 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Dust Li X-Patchwork-Id: 12764424 X-Patchwork-Delegate: kuba@kernel.org Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 396B4C433FE for ; Tue, 1 Mar 2022 09:44:17 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S233862AbiCAJoz (ORCPT ); Tue, 1 Mar 2022 04:44:55 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55870 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S233859AbiCAJou (ORCPT ); Tue, 1 Mar 2022 04:44:50 -0500 Received: from out30-45.freemail.mail.aliyun.com (out30-45.freemail.mail.aliyun.com [115.124.30.45]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id D2C2C3B55E; Tue, 1 Mar 2022 01:44:09 -0800 (PST) X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R111e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=e01e04357;MF=dust.li@linux.alibaba.com;NM=1;PH=DS;RN=8;SR=0;TI=SMTPD_---0V5wm1KG_1646127846; Received: from localhost(mailfrom:dust.li@linux.alibaba.com fp:SMTPD_---0V5wm1KG_1646127846) by smtp.aliyun-inc.com(127.0.0.1); Tue, 01 Mar 2022 17:44:07 +0800 From: Dust Li To: Karsten Graul , Tony Lu , Guangguan Wang Cc: davem@davemloft.net, kuba@kernel.org, netdev@vger.kernel.org, linux-s390@vger.kernel.org, linux-rdma@vger.kernel.org Subject: [PATCH net-next 3/7] net/smc: add sysctl for autocorking Date: Tue, 1 Mar 2022 17:43:58 +0800 Message-Id: <20220301094402.14992-4-dust.li@linux.alibaba.com> X-Mailer: git-send-email 2.19.1.3.ge56e4f7 In-Reply-To: <20220301094402.14992-1-dust.li@linux.alibaba.com> References: <20220301094402.14992-1-dust.li@linux.alibaba.com> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org X-Patchwork-Delegate: kuba@kernel.org This add a new sysctl: net.smc.autocorking_size We can dynamically change the behaviour of autocorking by change the value of autocorking_size. Setting to 0 disables autocorking in SMC Signed-off-by: Dust Li --- Documentation/networking/smc-sysctl.rst | 23 +++++++++++++++++++++++ include/net/netns/smc.h | 1 + net/smc/smc_sysctl.c | 10 ++++++++++ net/smc/smc_tx.c | 2 +- 4 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 Documentation/networking/smc-sysctl.rst diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst new file mode 100644 index 000000000000..c53f8c61c9e4 --- /dev/null +++ b/Documentation/networking/smc-sysctl.rst @@ -0,0 +1,23 @@ +.. SPDX-License-Identifier: GPL-2.0 + +========= +SMC Sysctl +========= + +/proc/sys/net/smc/* Variables +============================== + +autocorking_size - INTEGER + Setting SMC auto corking size: + SMC auto corking is like TCP auto corking from the application's + perspective of view. When applications do consecutive small + write()/sendmsg() system calls, we try to coalesce these small writes + as much as possible, to lower total amount of CDC and RDMA Write been + sent. + autocorking_size limits the maximum corked bytes that can be sent to + the under device in 1 single sending. If set to 0, the SMC auto corking + is disabled. + Applications can still use TCP_CORK for optimal behavior when they + know how/when to uncork their sockets. + + Default: 64K diff --git a/include/net/netns/smc.h b/include/net/netns/smc.h index 1682eae50579..e5389eeaf8bd 100644 --- a/include/net/netns/smc.h +++ b/include/net/netns/smc.h @@ -17,5 +17,6 @@ struct netns_smc { #ifdef CONFIG_SYSCTL struct ctl_table_header *smc_hdr; #endif + unsigned int sysctl_autocorking_size; }; #endif diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 8a3a8e145976..3b59876aaac9 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -14,9 +14,17 @@ #include #include +#include "smc.h" #include "smc_sysctl.h" static struct ctl_table smc_table[] = { + { + .procname = "autocorking_size", + .data = &init_net.smc.sysctl_autocorking_size, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec, + }, { } }; @@ -40,6 +48,8 @@ static __net_init int smc_sysctl_init_net(struct net *net) if (!net->smc.smc_hdr) goto err_reg; + net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; + return 0; err_reg: diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 062c6b1535e3..257dc0d0aeb1 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -147,7 +147,7 @@ static bool smc_should_autocork(struct smc_sock *smc) struct smc_connection *conn = &smc->conn; int corking_size; - corking_size = min(SMC_AUTOCORKING_DEFAULT_SIZE, + corking_size = min(sock_net(&smc->sk)->smc.sysctl_autocorking_size, conn->sndbuf_desc->len >> 1); if (atomic_read(&conn->cdc_pend_tx_wr) == 0 || From patchwork Tue Mar 1 09:43:59 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Dust Li X-Patchwork-Id: 12764427 X-Patchwork-Delegate: kuba@kernel.org Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 7D72BC433EF for ; Tue, 1 Mar 2022 09:44:30 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234017AbiCAJpI (ORCPT ); Tue, 1 Mar 2022 04:45:08 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56830 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S233974AbiCAJpE (ORCPT ); Tue, 1 Mar 2022 04:45:04 -0500 Received: from out199-18.us.a.mail.aliyun.com (out199-18.us.a.mail.aliyun.com [47.90.199.18]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 313623BF93; Tue, 1 Mar 2022 01:44:12 -0800 (PST) X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R621e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=e01e04400;MF=dust.li@linux.alibaba.com;NM=1;PH=DS;RN=8;SR=0;TI=SMTPD_---0V5x4ASj_1646127847; Received: from localhost(mailfrom:dust.li@linux.alibaba.com fp:SMTPD_---0V5x4ASj_1646127847) by smtp.aliyun-inc.com(127.0.0.1); Tue, 01 Mar 2022 17:44:08 +0800 From: Dust Li To: Karsten Graul , Tony Lu , Guangguan Wang Cc: davem@davemloft.net, kuba@kernel.org, netdev@vger.kernel.org, linux-s390@vger.kernel.org, linux-rdma@vger.kernel.org Subject: [PATCH net-next 4/7] net/smc: send directly on setting TCP_NODELAY Date: Tue, 1 Mar 2022 17:43:59 +0800 Message-Id: <20220301094402.14992-5-dust.li@linux.alibaba.com> X-Mailer: git-send-email 2.19.1.3.ge56e4f7 In-Reply-To: <20220301094402.14992-1-dust.li@linux.alibaba.com> References: <20220301094402.14992-1-dust.li@linux.alibaba.com> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org X-Patchwork-Delegate: kuba@kernel.org In commit ea785a1a573b("net/smc: Send directly when TCP_CORK is cleared"), we don't use delayed work to implement cork. This patch use the same algorithm, removes the delayed work when setting TCP_NODELAY and send directly in setsockopt(). This also makes the TCP_NODELAY the same as TCP. Cc: Tony Lu Signed-off-by: Dust Li --- net/smc/af_smc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 19b3066cf7af..e661b3747945 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2796,8 +2796,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, sk->sk_state != SMC_CLOSED) { if (val) { SMC_STAT_INC(smc, ndly_cnt); - mod_delayed_work(smc->conn.lgr->tx_wq, - &smc->conn.tx_work, 0); + smc_tx_pending(&smc->conn); + cancel_delayed_work(&smc->conn.tx_work); } } break; From patchwork Tue Mar 1 09:44:00 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Dust Li X-Patchwork-Id: 12764425 X-Patchwork-Delegate: kuba@kernel.org Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 7EC97C4332F for ; Tue, 1 Mar 2022 09:44:27 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S233994AbiCAJpF (ORCPT ); Tue, 1 Mar 2022 04:45:05 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55976 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S233969AbiCAJox (ORCPT ); Tue, 1 Mar 2022 04:44:53 -0500 Received: from out30-130.freemail.mail.aliyun.com (out30-130.freemail.mail.aliyun.com [115.124.30.130]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 22F563BBF3; Tue, 1 Mar 2022 01:44:11 -0800 (PST) X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R131e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=e01e01424;MF=dust.li@linux.alibaba.com;NM=1;PH=DS;RN=8;SR=0;TI=SMTPD_---0V5vTpxp_1646127849; Received: from localhost(mailfrom:dust.li@linux.alibaba.com fp:SMTPD_---0V5vTpxp_1646127849) by smtp.aliyun-inc.com(127.0.0.1); Tue, 01 Mar 2022 17:44:09 +0800 From: Dust Li To: Karsten Graul , Tony Lu , Guangguan Wang Cc: davem@davemloft.net, kuba@kernel.org, netdev@vger.kernel.org, linux-s390@vger.kernel.org, linux-rdma@vger.kernel.org Subject: [PATCH net-next 5/7] net/smc: correct settings of RMB window update limit Date: Tue, 1 Mar 2022 17:44:00 +0800 Message-Id: <20220301094402.14992-6-dust.li@linux.alibaba.com> X-Mailer: git-send-email 2.19.1.3.ge56e4f7 In-Reply-To: <20220301094402.14992-1-dust.li@linux.alibaba.com> References: <20220301094402.14992-1-dust.li@linux.alibaba.com> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org X-Patchwork-Delegate: kuba@kernel.org rmbe_update_limit is used to limit announcing receive window updating too frequently. RFC7609 request a minimal increase in the window size of 10% of the receive buffer space. But current implementation used: min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2) and SOCK_MIN_SNDBUF / 2 == 2304 Bytes, which is almost always less then 10% of the receive buffer space. This causes the receiver always sending CDC message to update its consumer cursor when it consumes more then 2K of data. And as a result, we may encounter something like "TCP silly window syndrome" when sending 2.5~8K message. This patch fixes this using max(rmbe_size / 10, SOCK_MIN_SNDBUF / 2). With this patch and SMC autocorking enabled, qperf 2K/4K/8K tcp_bw test shows 45%/75%/40% increase in throughput respectively. Signed-off-by: Dust Li --- net/smc/smc_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 29525d03b253..1674b2549f8b 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1988,7 +1988,7 @@ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, */ static inline int smc_rmb_wnd_update_limit(int rmbe_size) { - return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); + return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } /* map an rmb buf to a link */ From patchwork Tue Mar 1 09:44:01 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Dust Li X-Patchwork-Id: 12764426 X-Patchwork-Delegate: kuba@kernel.org Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id B7EECC4167B for ; Tue, 1 Mar 2022 09:44:28 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234007AbiCAJpG (ORCPT ); Tue, 1 Mar 2022 04:45:06 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56828 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S233975AbiCAJpE (ORCPT ); Tue, 1 Mar 2022 04:45:04 -0500 Received: from out30-44.freemail.mail.aliyun.com (out30-44.freemail.mail.aliyun.com [115.124.30.44]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 82AB63BFA8; Tue, 1 Mar 2022 01:44:13 -0800 (PST) X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R151e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=e01e04394;MF=dust.li@linux.alibaba.com;NM=1;PH=DS;RN=8;SR=0;TI=SMTPD_---0V5wm1LL_1646127850; Received: from localhost(mailfrom:dust.li@linux.alibaba.com fp:SMTPD_---0V5wm1LL_1646127850) by smtp.aliyun-inc.com(127.0.0.1); Tue, 01 Mar 2022 17:44:11 +0800 From: Dust Li To: Karsten Graul , Tony Lu , Guangguan Wang Cc: davem@davemloft.net, kuba@kernel.org, netdev@vger.kernel.org, linux-s390@vger.kernel.org, linux-rdma@vger.kernel.org Subject: [PATCH net-next 6/7] net/smc: don't req_notify until all CQEs drained Date: Tue, 1 Mar 2022 17:44:01 +0800 Message-Id: <20220301094402.14992-7-dust.li@linux.alibaba.com> X-Mailer: git-send-email 2.19.1.3.ge56e4f7 In-Reply-To: <20220301094402.14992-1-dust.li@linux.alibaba.com> References: <20220301094402.14992-1-dust.li@linux.alibaba.com> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org X-Patchwork-Delegate: kuba@kernel.org When we are handling softirq workload, enable hardirq may again interrupt the current routine of softirq, and then try to raise softirq again. This only wastes CPU cycles and won't have any real gain. Since IB_CQ_REPORT_MISSED_EVENTS already make sure if ib_req_notify_cq() returns 0, it is safe to wait for the next event, with no need to poll the CQ again in this case. This patch disables hardirq during the processing of softirq, and re-arm the CQ after softirq is done. Somehow like NAPI. Co-developed-by: Guangguan Wang Signed-off-by: Guangguan Wang Signed-off-by: Dust Li --- net/smc/smc_wr.c | 49 +++++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 24be1d03fef9..34d616406d51 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -137,25 +137,28 @@ static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) { struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int i = 0, rc; - int polled = 0; + int i, rc; again: - polled++; do { memset(&wc, 0, sizeof(wc)); rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc); - if (polled == 1) { - ib_req_notify_cq(dev->roce_cq_send, - IB_CQ_NEXT_COMP | - IB_CQ_REPORT_MISSED_EVENTS); - } - if (!rc) - break; for (i = 0; i < rc; i++) smc_wr_tx_process_cqe(&wc[i]); + if (rc < SMC_WR_MAX_POLL_CQE) + /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been + * drained, no need to poll again. --Guangguan Wang + */ + break; } while (rc > 0); - if (polled == 1) + + /* IB_CQ_REPORT_MISSED_EVENTS make sure if ib_req_notify_cq() returns + * 0, it is safe to wait for the next event. + * Else we must poll the CQ again to make sure we won't miss any event + */ + if (ib_req_notify_cq(dev->roce_cq_send, + IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS)) goto again; } @@ -478,24 +481,28 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) { struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet); struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; - int polled = 0; int rc; again: - polled++; do { memset(&wc, 0, sizeof(wc)); rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc); - if (polled == 1) { - ib_req_notify_cq(dev->roce_cq_recv, - IB_CQ_SOLICITED_MASK - | IB_CQ_REPORT_MISSED_EVENTS); - } - if (!rc) + if (rc > 0) + smc_wr_rx_process_cqes(&wc[0], rc); + if (rc < SMC_WR_MAX_POLL_CQE) + /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been + * drained, no need to poll again. --Guangguan Wang + */ break; - smc_wr_rx_process_cqes(&wc[0], rc); } while (rc > 0); - if (polled == 1) + + /* IB_CQ_REPORT_MISSED_EVENTS make sure if ib_req_notify_cq() returns + * 0, it is safe to wait for the next event. + * Else we must poll the CQ again to make sure we won't miss any event + */ + if (ib_req_notify_cq(dev->roce_cq_recv, + IB_CQ_SOLICITED_MASK | + IB_CQ_REPORT_MISSED_EVENTS)) goto again; } From patchwork Tue Mar 1 09:44:02 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Dust Li X-Patchwork-Id: 12764428 X-Patchwork-Delegate: kuba@kernel.org Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 5DFD1C4332F for ; Tue, 1 Mar 2022 09:44:33 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S233995AbiCAJpJ (ORCPT ); Tue, 1 Mar 2022 04:45:09 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:56886 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S233990AbiCAJpF (ORCPT ); Tue, 1 Mar 2022 04:45:05 -0500 Received: from out30-133.freemail.mail.aliyun.com (out30-133.freemail.mail.aliyun.com [115.124.30.133]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id A24B047AC0; Tue, 1 Mar 2022 01:44:14 -0800 (PST) X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R381e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=e01e01424;MF=dust.li@linux.alibaba.com;NM=1;PH=DS;RN=8;SR=0;TI=SMTPD_---0V5wm1Le_1646127851; Received: from localhost(mailfrom:dust.li@linux.alibaba.com fp:SMTPD_---0V5wm1Le_1646127851) by smtp.aliyun-inc.com(127.0.0.1); Tue, 01 Mar 2022 17:44:12 +0800 From: Dust Li To: Karsten Graul , Tony Lu , Guangguan Wang Cc: davem@davemloft.net, kuba@kernel.org, netdev@vger.kernel.org, linux-s390@vger.kernel.org, linux-rdma@vger.kernel.org Subject: [PATCH net-next 7/7] net/smc: don't send in the BH context if sock_owned_by_user Date: Tue, 1 Mar 2022 17:44:02 +0800 Message-Id: <20220301094402.14992-8-dust.li@linux.alibaba.com> X-Mailer: git-send-email 2.19.1.3.ge56e4f7 In-Reply-To: <20220301094402.14992-1-dust.li@linux.alibaba.com> References: <20220301094402.14992-1-dust.li@linux.alibaba.com> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org X-Patchwork-Delegate: kuba@kernel.org Send data all the way down to the RDMA device is a time consuming operation(get a new slot, maybe do RDMA Write and send a CDC, etc). Moving those operations from BH to user context is good for performance. If the sock_lock is hold by user, we don't try to send data out in the BH context, but just mark we should send. Since the user will release the sock_lock soon, we can do the sending there. Add smc_release_cb() which will be called in release_sock() and try send in the callback if needed. This patch moves the sending part out from BH if sock lock is hold by user. In my testing environment, this saves about 20% softirq in the qperf 4K tcp_bw test in the sender side with no noticeable throughput drop. Signed-off-by: Dust Li --- net/smc/af_smc.c | 16 ++++++++++++++++ net/smc/smc.h | 4 ++++ net/smc/smc_cdc.c | 19 ++++++++++++++----- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e661b3747945..6447607675fa 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -193,12 +193,27 @@ void smc_unhash_sk(struct sock *sk) } EXPORT_SYMBOL_GPL(smc_unhash_sk); +/* This will be called before user really release sock_lock. So do the + * work which we didn't do because of user hold the sock_lock in the + * BH context + */ +static void smc_release_cb(struct sock *sk) +{ + struct smc_sock *smc = smc_sk(sk); + + if (smc->conn.tx_in_release_sock) { + smc_tx_pending(&smc->conn); + smc->conn.tx_in_release_sock = false; + } +} + struct proto smc_proto = { .name = "SMC", .owner = THIS_MODULE, .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, @@ -211,6 +226,7 @@ struct proto smc_proto6 = { .keepalive = smc_set_keepalive, .hash = smc_hash_sk, .unhash = smc_unhash_sk, + .release_cb = smc_release_cb, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v6_hashinfo, .slab_flags = SLAB_TYPESAFE_BY_RCU, diff --git a/net/smc/smc.h b/net/smc/smc.h index e266b04b7585..ea0620529ebe 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -213,6 +213,10 @@ struct smc_connection { * data still pending */ char urg_rx_byte; /* urgent byte */ + bool tx_in_release_sock; + /* flush pending tx data in + * sock release_cb() + */ atomic_t bytes_to_rcv; /* arrived data, * not yet received */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 2b37bec90824..5c731f27996e 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -49,10 +49,15 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, } if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) { - /* If this is the last pending WR complete, we must push to - * prevent hang when autocork enabled. + /* If user owns the sock_lock, mark the connection need sending. + * User context will later try to send when it release sock_lock + * in smc_release_cb() */ - smc_tx_sndbuf_nonempty(conn); + if (sock_owned_by_user(&smc->sk)) + conn->tx_in_release_sock = true; + else + smc_tx_pending(conn); + if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq))) wake_up(&conn->cdc_pend_tx_wq); } @@ -355,8 +360,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ if ((diff_cons && smc_tx_prepared_sends(conn)) || conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || - conn->local_rx_ctrl.prod_flags.urg_data_pending) - smc_tx_sndbuf_nonempty(conn); + conn->local_rx_ctrl.prod_flags.urg_data_pending) { + if (!sock_owned_by_user(&smc->sk)) + smc_tx_pending(conn); + else + conn->tx_in_release_sock = true; + } if (diff_cons && conn->urg_tx_pend && atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) {