diff mbox series

[net-next] tcp: switch orphan_count to bare per-cpu counters

Message ID 20211014022723.3477478-1-eric.dumazet@gmail.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [net-next] tcp: switch orphan_count to bare per-cpu counters | expand

Checks

Context Check Description
netdev/cover_letter success Single patches do not need cover letters
netdev/fixes_present success Fixes tag not required for -next series
netdev/patch_count success Link
netdev/tree_selection success Clearly marked for net-next
netdev/subject_prefix success Link
netdev/cc_maintainers warning 6 maintainers not CCed: dccp@vger.kernel.org yoshfuji@linux-ipv6.org aahringo@redhat.com vvs@virtuozzo.com rdunlap@infradead.org dsahern@kernel.org
netdev/source_inline success Was 0 now: 0
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/module_param success Was 0 now: 0
netdev/build_32bit fail Errors and warnings before: 3059 this patch: 2806
netdev/kdoc success Errors and warnings before: 4 this patch: 4
netdev/verify_fixes success No Fixes tag
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 192 lines checked
netdev/build_allmodconfig_warn fail Errors and warnings before: 3143 this patch: 2096
netdev/header_inline success No static functions without inline keyword in header files

Commit Message

Eric Dumazet Oct. 14, 2021, 2:27 a.m. UTC
From: Eric Dumazet <edumazet@google.com>

Use of percpu_counter structure to track count of orphaned
sockets is causing problems on modern hosts with 256 cpus
or more.

Stefan Bach reported a serious spinlock contention in real workloads,
that I was able to reproduce with a netfilter rule dropping
incoming FIN packets.

    53.56%  server  [kernel.kallsyms]      [k] queued_spin_lock_slowpath
            |
            ---queued_spin_lock_slowpath
               |
                --53.51%--_raw_spin_lock_irqsave
                          |
                           --53.51%--__percpu_counter_sum
                                     tcp_check_oom
                                     |
                                     |--39.03%--__tcp_close
                                     |          tcp_close
                                     |          inet_release
                                     |          inet6_release
                                     |          sock_close
                                     |          __fput
                                     |          ____fput
                                     |          task_work_run
                                     |          exit_to_usermode_loop
                                     |          do_syscall_64
                                     |
                                      --14.48%--tcp_out_of_resources
                                                tcp_write_timeout
                                                tcp_retransmit_timer
                                                tcp_write_timer_handler
                                                tcp_write_timer
                                                call_timer_fn
                                                expire_timers
                                                __run_timers
                                                run_timer_softirq
                                                __softirqentry_text_start

As explained in commit cf86a086a180 ("net/dst: use a smaller percpu_counter
batch for dst entries accounting"), default batch size is too big
for the default value of tcp_max_orphans (262144).

But even if we reduce batch sizes, there would still be cases
where the estimated count of orphans is beyond the limit,
and where tcp_too_many_orphans() has to call the expensive
percpu_counter_sum_positive().

One solution is to use plain per-cpu counters, and have
a timer to periodically refresh this cache.

Updating this cache every 100ms seems about right, tcp pressure
state is not radically changing over shorter periods.

percpu_counter was nice 15 years ago while hosts had less
than 16 cpus, not anymore by current standards.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Stefan Bach <sfb@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
---
 include/net/inet_connection_sock.h |  2 +-
 include/net/sock.h                 |  2 +-
 include/net/tcp.h                  | 17 +++-----------
 net/dccp/dccp.h                    |  2 +-
 net/dccp/proto.c                   | 14 ++++--------
 net/ipv4/inet_connection_sock.c    |  4 ++--
 net/ipv4/inet_hashtables.c         |  2 +-
 net/ipv4/proc.c                    |  2 +-
 net/ipv4/tcp.c                     | 36 ++++++++++++++++++++++++++----
 9 files changed, 46 insertions(+), 35 deletions(-)

Comments

kernel test robot Oct. 14, 2021, 8:59 a.m. UTC | #1
Hi Eric,

I love your patch! Yet something to improve:

[auto build test ERROR on net-next/master]

url:    https://github.com/0day-ci/linux/commits/Eric-Dumazet/tcp-switch-orphan_count-to-bare-per-cpu-counters/20211014-102939
base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 39e222bfd7f37e7a98069869375b903d7096c113
config: microblaze-buildonly-randconfig-r004-20211013 (attached as .config)
compiler: microblaze-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/0day-ci/linux/commit/890c055a0e11b7505283aa06a63f04cbf7e115f0
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review Eric-Dumazet/tcp-switch-orphan_count-to-bare-per-cpu-counters/20211014-102939
        git checkout 890c055a0e11b7505283aa06a63f04cbf7e115f0
        # save the attached .config to linux build tree
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross ARCH=microblaze 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   In file included from drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c:33:
   drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c: In function 'reset_listen_child':
>> drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h:98:62: error: passing argument 1 of 'percpu_counter_inc' from incompatible pointer type [-Werror=incompatible-pointer-types]
      98 | #define INC_ORPHAN_COUNT(sk) percpu_counter_inc((sk)->sk_prot->orphan_count)
         |                                                 ~~~~~~~~~~~~~^~~~~~~~~~~~~~
         |                                                              |
         |                                                              unsigned int *
   drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c:508:9: note: in expansion of macro 'INC_ORPHAN_COUNT'
     508 |         INC_ORPHAN_COUNT(child);
         |         ^~~~~~~~~~~~~~~~
   In file included from include/linux/sched/user.h:7,
                    from include/linux/cred.h:17,
                    from include/linux/sched/signal.h:10,
                    from include/linux/rcuwait.h:6,
                    from include/linux/percpu-rwsem.h:7,
                    from include/linux/fs.h:33,
                    from include/linux/highmem.h:5,
                    from include/linux/bvec.h:10,
                    from include/linux/skbuff.h:17,
                    from drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c:11:
   include/linux/percpu_counter.h:181:62: note: expected 'struct percpu_counter *' but argument is of type 'unsigned int *'
     181 | static inline void percpu_counter_inc(struct percpu_counter *fbc)
         |                                       ~~~~~~~~~~~~~~~~~~~~~~~^~~
   drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c: In function 'do_abort_syn_rcv':
>> drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c:873:52: error: passing argument 1 of 'percpu_counter_inc' from incompatible pointer type [-Werror=incompatible-pointer-types]
     873 |                 percpu_counter_inc((child)->sk_prot->orphan_count);
         |                                    ~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~
         |                                                    |
         |                                                    unsigned int *
   In file included from include/linux/sched/user.h:7,
                    from include/linux/cred.h:17,
                    from include/linux/sched/signal.h:10,
                    from include/linux/rcuwait.h:6,
                    from include/linux/percpu-rwsem.h:7,
                    from include/linux/fs.h:33,
                    from include/linux/highmem.h:5,
                    from include/linux/bvec.h:10,
                    from include/linux/skbuff.h:17,
                    from drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.c:11:
   include/linux/percpu_counter.h:181:62: note: expected 'struct percpu_counter *' but argument is of type 'unsigned int *'
     181 | static inline void percpu_counter_inc(struct percpu_counter *fbc)
         |                                       ~~~~~~~~~~~~~~~~~~~~~~~^~~
   cc1: some warnings being treated as errors


vim +/percpu_counter_inc +98 drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls_cm.h

a6779341a173aa drivers/crypto/chelsio/chtls/chtls_cm.h Atul Gupta 2018-03-31  90  
a6779341a173aa drivers/crypto/chelsio/chtls/chtls_cm.h Atul Gupta 2018-03-31  91  #define SND_WSCALE(tp) ((tp)->rx_opt.snd_wscale)
a6779341a173aa drivers/crypto/chelsio/chtls/chtls_cm.h Atul Gupta 2018-03-31  92  #define RCV_WSCALE(tp) ((tp)->rx_opt.rcv_wscale)
a6779341a173aa drivers/crypto/chelsio/chtls/chtls_cm.h Atul Gupta 2018-03-31  93  #define USER_MSS(tp) ((tp)->rx_opt.user_mss)
a6779341a173aa drivers/crypto/chelsio/chtls/chtls_cm.h Atul Gupta 2018-03-31  94  #define TS_RECENT_STAMP(tp) ((tp)->rx_opt.ts_recent_stamp)
a6779341a173aa drivers/crypto/chelsio/chtls/chtls_cm.h Atul Gupta 2018-03-31  95  #define WSCALE_OK(tp) ((tp)->rx_opt.wscale_ok)
a6779341a173aa drivers/crypto/chelsio/chtls/chtls_cm.h Atul Gupta 2018-03-31  96  #define TSTAMP_OK(tp) ((tp)->rx_opt.tstamp_ok)
a6779341a173aa drivers/crypto/chelsio/chtls/chtls_cm.h Atul Gupta 2018-03-31  97  #define SACK_OK(tp) ((tp)->rx_opt.sack_ok)
a6779341a173aa drivers/crypto/chelsio/chtls/chtls_cm.h Atul Gupta 2018-03-31 @98  #define INC_ORPHAN_COUNT(sk) percpu_counter_inc((sk)->sk_prot->orphan_count)
a6779341a173aa drivers/crypto/chelsio/chtls/chtls_cm.h Atul Gupta 2018-03-31  99  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
diff mbox series

Patch

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index b06c2d02ec84e96c6222ac608473d7eaf71e5590..fa6a87246a7b85d3358b6ec66e6029445fe3b066 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -289,7 +289,7 @@  static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk)
 {
 	/* The below has to be done to allow calling inet_csk_destroy_sock */
 	sock_set_flag(sk, SOCK_DEAD);
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	this_cpu_inc(*sk->sk_prot->orphan_count);
 }
 
 void inet_csk_destroy_sock(struct sock *sk);
diff --git a/include/net/sock.h b/include/net/sock.h
index d08ab55fa4a05f403d7591eff9752145ea73e008..596ba85611bc786affed2bf2b18e455b015f3774 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1235,7 +1235,7 @@  struct proto {
 	unsigned int		useroffset;	/* Usercopy region offset */
 	unsigned int		usersize;	/* Usercopy region size */
 
-	struct percpu_counter	*orphan_count;
+	unsigned int __percpu	*orphan_count;
 
 	struct request_sock_ops	*rsk_prot;
 	struct timewait_sock_ops *twsk_prot;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4c2898ac65698a097ccaedcc59542041f15f4f70..af77e6453b1b461780df2d6e2fb94ab1baab688e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -48,7 +48,9 @@ 
 
 extern struct inet_hashinfo tcp_hashinfo;
 
-extern struct percpu_counter tcp_orphan_count;
+DECLARE_PER_CPU(unsigned int, tcp_orphan_count);
+int tcp_orphan_count_sum(void);
+
 void tcp_time_wait(struct sock *sk, int state, int timeo);
 
 #define MAX_TCP_HEADER	L1_CACHE_ALIGN(128 + MAX_HEADER)
@@ -290,19 +292,6 @@  static inline bool tcp_out_of_memory(struct sock *sk)
 
 void sk_forced_mem_schedule(struct sock *sk, int size);
 
-static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
-{
-	struct percpu_counter *ocp = sk->sk_prot->orphan_count;
-	int orphans = percpu_counter_read_positive(ocp);
-
-	if (orphans << shift > sysctl_tcp_max_orphans) {
-		orphans = percpu_counter_sum_positive(ocp);
-		if (orphans << shift > sysctl_tcp_max_orphans)
-			return true;
-	}
-	return false;
-}
-
 bool tcp_check_oom(struct sock *sk, int shift);
 
 
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index c5c1d2b8045e8efd9bf32a2db1e679c13cbf1852..5183e627468d8901f4f80ed8d74a655aa2a6557f 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -48,7 +48,7 @@  extern bool dccp_debug;
 
 extern struct inet_hashinfo dccp_hashinfo;
 
-extern struct percpu_counter dccp_orphan_count;
+DECLARE_PER_CPU(unsigned int, dccp_orphan_count);
 
 void dccp_time_wait(struct sock *sk, int state, int timeo);
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index abb5c596a81763b3a571c48137e98d61d21207bf..fc44dadc778bbef72b2cade8b453f012831f592d 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -42,8 +42,8 @@  DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
 
 EXPORT_SYMBOL_GPL(dccp_statistics);
 
-struct percpu_counter dccp_orphan_count;
-EXPORT_SYMBOL_GPL(dccp_orphan_count);
+DEFINE_PER_CPU(unsigned int, dccp_orphan_count);
+EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count);
 
 struct inet_hashinfo dccp_hashinfo;
 EXPORT_SYMBOL_GPL(dccp_hashinfo);
@@ -1055,7 +1055,7 @@  void dccp_close(struct sock *sk, long timeout)
 	bh_lock_sock(sk);
 	WARN_ON(sock_owned_by_user(sk));
 
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	this_cpu_inc(dccp_orphan_count);
 
 	/* Have we already been destroyed by a softirq or backlog? */
 	if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
@@ -1115,13 +1115,10 @@  static int __init dccp_init(void)
 
 	BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
 		     sizeof_field(struct sk_buff, cb));
-	rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL);
-	if (rc)
-		goto out_fail;
 	inet_hashinfo_init(&dccp_hashinfo);
 	rc = inet_hashinfo2_init_mod(&dccp_hashinfo);
 	if (rc)
-		goto out_free_percpu;
+		goto out_fail;
 	rc = -ENOBUFS;
 	dccp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("dccp_bind_bucket",
@@ -1226,8 +1223,6 @@  static int __init dccp_init(void)
 	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
 out_free_hashinfo2:
 	inet_hashinfo2_free_mod(&dccp_hashinfo);
-out_free_percpu:
-	percpu_counter_destroy(&dccp_orphan_count);
 out_fail:
 	dccp_hashinfo.bhash = NULL;
 	dccp_hashinfo.ehash = NULL;
@@ -1250,7 +1245,6 @@  static void __exit dccp_fini(void)
 	dccp_ackvec_exit();
 	dccp_sysctl_exit();
 	inet_hashinfo2_free_mod(&dccp_hashinfo);
-	percpu_counter_destroy(&dccp_orphan_count);
 }
 
 module_init(dccp_init);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index f25d02ad4a8af41790261a0c79188111ed408efc..f7fea3a7c5e64b92ca9c6b56293628923649e58c 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1015,7 +1015,7 @@  void inet_csk_destroy_sock(struct sock *sk)
 
 	sk_refcnt_debug_release(sk);
 
-	percpu_counter_dec(sk->sk_prot->orphan_count);
+	this_cpu_dec(*sk->sk_prot->orphan_count);
 
 	sock_put(sk);
 }
@@ -1074,7 +1074,7 @@  static void inet_child_forget(struct sock *sk, struct request_sock *req,
 
 	sock_orphan(child);
 
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	this_cpu_inc(*sk->sk_prot->orphan_count);
 
 	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
 		BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index bfb522e513461a92cbd19c0c2c14b2dda33bb4f7..75737267746f85a5be82fbe04bbfb429914334c1 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -598,7 +598,7 @@  bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
 	if (ok) {
 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 	} else {
-		percpu_counter_inc(sk->sk_prot->orphan_count);
+		this_cpu_inc(*sk->sk_prot->orphan_count);
 		inet_sk_set_state(sk, TCP_CLOSE);
 		sock_set_flag(sk, SOCK_DEAD);
 		inet_csk_destroy_sock(sk);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index b0d3a09dc84e7a1126bb45b0f8d4ff1d36131d25..f30273afb5399ddf0122e46e36da2ddae720a1c3 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -53,7 +53,7 @@  static int sockstat_seq_show(struct seq_file *seq, void *v)
 	struct net *net = seq->private;
 	int orphans, sockets;
 
-	orphans = percpu_counter_sum_positive(&tcp_orphan_count);
+	orphans = tcp_orphan_count_sum();
 	sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
 
 	socket_seq_show(seq);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 414c179c28e0dd5b91194456d34b46faf2b122e4..a4481748cdf08809bbb600a8559647db83624c43 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -287,8 +287,8 @@  enum {
 	TCP_CMSG_TS = 2
 };
 
-struct percpu_counter tcp_orphan_count;
-EXPORT_SYMBOL_GPL(tcp_orphan_count);
+DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
 
 long sysctl_tcp_mem[3] __read_mostly;
 EXPORT_SYMBOL(sysctl_tcp_mem);
@@ -2673,6 +2673,31 @@  void tcp_shutdown(struct sock *sk, int how)
 }
 EXPORT_SYMBOL(tcp_shutdown);
 
+int tcp_orphan_count_sum(void)
+{
+	int i, total = 0;
+
+	for_each_possible_cpu(i)
+		total += per_cpu(tcp_orphan_count, i);
+
+	return max(total, 0);
+}
+
+static int tcp_orphan_cache;
+static struct timer_list tcp_orphan_timer;
+#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)
+
+static void tcp_orphan_update(struct timer_list *unused)
+{
+	WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
+	mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
+}
+
+static bool tcp_too_many_orphans(struct sock *sk, int shift)
+{
+	return READ_ONCE(tcp_orphan_cache) << shift > sysctl_tcp_max_orphans;
+}
+
 bool tcp_check_oom(struct sock *sk, int shift)
 {
 	bool too_many_orphans, out_of_socket_memory;
@@ -2786,7 +2811,7 @@  void __tcp_close(struct sock *sk, long timeout)
 	/* remove backlog if any, without releasing ownership. */
 	__release_sock(sk);
 
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	this_cpu_inc(tcp_orphan_count);
 
 	/* Have we already been destroyed by a softirq or backlog? */
 	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
@@ -4479,7 +4504,10 @@  void __init tcp_init(void)
 		     sizeof_field(struct sk_buff, cb));
 
 	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
-	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
+
+	timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
+	mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
+
 	inet_hashinfo_init(&tcp_hashinfo);
 	inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
 			    thash_entries, 21,  /* one slot per 2 MB*/