[net-next,4/4] tcp: use RCU lookup in __inet_hash_connect()

Message ID 20250302124237.3913746-5-edumazet@google.com (mailing list archive)
State Accepted
Commit 86c2bc293b8130aec9fa504e953531a84a6eb9a6
Delegated to: Netdev Maintainers
Series tcp: scale connect() under pressure

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 7 this patch: 7
netdev/build_tools success Errors and warnings before: 26 (+1) this patch: 26 (+1)
netdev/cc_maintainers warning 1 maintainers not CCed: dsahern@kernel.org
netdev/build_clang success Errors and warnings before: 10 this patch: 10
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 917 this patch: 917
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 145 lines checked
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 0 this patch: 0
netdev/source_inline success Was 0 now: 0
netdev/contest success net-next-2025-03-04--12-00 (tests: 893)

Commit Message

Eric Dumazet March 2, 2025, 12:42 p.m. UTC
When __inet_hash_connect() has to try many 4-tuples before
finding an available one, we see a high spinlock cost caused by
the many spin_lock_bh(&head->lock) calls performed in its loop.

This patch adds an RCU lookup to avoid most of this spinlock cost.
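
The idea, condensed from the patch at the bottom of this page, is to
scan the bind bucket under rcu_read_lock() first, and only fall back
to spin_lock_bh(&head->lock) for ports that still look usable.
A simplified sketch (not the literal hunk):

	rcu_read_lock();
	hlist_for_each_entry_rcu(tb, &head->chain, node) {
		if (!inet_bind_bucket_match(tb, net, port, l3mdev))
			continue;
		if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) {
			/* Port not usable by connect(): skip it
			 * without ever taking head->lock.
			 */
			rcu_read_unlock();
			goto next_port;
		}
		if (!check_established(death_row, sk, port, &tw, true))
			break;	/* looks available: recheck under the lock */
		rcu_read_unlock();
		goto next_port;	/* 4-tuple already in use, no lock taken */
	}
	rcu_read_unlock();

	spin_lock_bh(&head->lock);
	/* ... original locked walk, now reached far less often ... */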

check_established() gets a new @rcu_lookup argument.
The first reason is to not make any changes while head->lock
is not held.
The second reason is to not perform this RCU lookup a second time
after the spinlock has been acquired.
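
Condensed from the __inet_check_established() hunk below, the two
modes of the callback look roughly like this (simplified sketch; the
IPv6 variant only differs in using inet6_match()):

	if (rcu_lookup) {
		/* Caller holds rcu_read_lock() but not head->lock:
		 * only report whether the 4-tuple looks usable,
		 * do not modify anything.
		 */
		sk_nulls_for_each(sk2, node, &head->chain) {
			if (sk2->sk_hash != hash ||
			    !inet_match(net, sk2, acookie, ports, dif, sdif))
				continue;
			if (sk2->sk_state == TCP_TIME_WAIT)
				break;
			return -EADDRNOTAVAIL;
		}
		return 0;
	}

	/* Not an RCU-only pass: do not repeat the lookup above, go
	 * straight to the locked check (TIME_WAIT reuse, insertion).
	 */
	lock = inet_ehash_lockp(hinfo, hash);
	spin_lock(lock);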

Tested:

Server:

ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog

Client:

ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog -c -H server

Before series:

  utime_start=0.288582
  utime_end=1.548707
  stime_start=20.637138
  stime_end=2002.489845
  num_transactions=484453
  latency_min=0.156279245
  latency_max=20.922042756
  latency_mean=1.546521274
  latency_stddev=3.936005194
  num_samples=312537
  throughput=47426.00

perf top on the client:

 49.54%  [kernel]       [k] _raw_spin_lock
 25.87%  [kernel]       [k] _raw_spin_lock_bh
  5.97%  [kernel]       [k] queued_spin_lock_slowpath
  5.67%  [kernel]       [k] __inet_hash_connect
  3.53%  [kernel]       [k] __inet6_check_established
  3.48%  [kernel]       [k] inet6_ehashfn
  0.64%  [kernel]       [k] rcu_all_qs

After this series:

  utime_start=0.271607
  utime_end=3.847111
  stime_start=18.407684
  stime_end=1997.485557
  num_transactions=1350742
  latency_min=0.014131929
  latency_max=17.895073144
  latency_mean=0.505675853  # Nice reduction of latency metrics
  latency_stddev=2.125164772
  num_samples=307884
  throughput=139866.80      # 190 % increase

perf top on the client:

 56.86%  [kernel]       [k] __inet6_check_established
 17.96%  [kernel]       [k] __inet_hash_connect
 13.88%  [kernel]       [k] inet6_ehashfn
  2.52%  [kernel]       [k] rcu_all_qs
  2.01%  [kernel]       [k] __cond_resched
  0.41%  [kernel]       [k] _raw_spin_lock

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/inet_hashtables.h |  3 +-
 net/ipv4/inet_hashtables.c    | 52 +++++++++++++++++++++++------------
 net/ipv6/inet6_hashtables.c   | 24 ++++++++--------
 3 files changed, 50 insertions(+), 29 deletions(-)

Comments

Jason Xing March 3, 2025, 1:07 a.m. UTC | #1
On Sun, Mar 2, 2025 at 8:42 PM Eric Dumazet <edumazet@google.com> wrote:
>
> [...]

Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Tested-by: Jason Xing <kerneljasonxing@gmail.com>

I tested this only on my virtual machine (with 64 CPUs) and got
around a 100% performance increase, which is really good. I also
noticed that the spin lock hotspot is gone :)

Thanks for working on this!!!
Eric Dumazet March 3, 2025, 10:25 a.m. UTC | #2
On Mon, Mar 3, 2025 at 2:08 AM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> [...]
>
> Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
> Tested-by: Jason Xing <kerneljasonxing@gmail.com>
>
> I tested this only on my virtual machine (with 64 CPUs) and got
> around a 100% performance increase, which is really good. I also
> noticed that the spin lock hotspot is gone :)
>
> Thanks for working on this!!!

Hold your breath, I have two additional patches bringing the throughput to:

local_throughput=353891          #   646 % improvement

I will wait for this first series to be merged before sending these.
Jason Xing March 3, 2025, 10:39 a.m. UTC | #3
On Mon, Mar 3, 2025 at 6:25 PM Eric Dumazet <edumazet@google.com> wrote:
>
> [...]
>
> Hold your breath, I have two additional patches bringing the throughput to:
>
> local_throughput=353891          #   646 % improvement
>
> I will wait for this first series to be merged before sending these.

OMG, I'm really shocked... It would be super cool :D

Thanks,
Jason
Kuniyuki Iwashima March 4, 2025, 12:51 a.m. UTC | #4
From: Eric Dumazet <edumazet@google.com>
Date: Sun,  2 Mar 2025 12:42:37 +0000
> [...]

Thanks for the great optimisation!

Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
kernel test robot March 10, 2025, 2:03 p.m. UTC | #5
Hello,

kernel test robot noticed a 6.9% improvement of stress-ng.sockmany.ops_per_sec on:


commit: ba6c94b99d772f431fd589dd2cd606b59063557b ("[PATCH net-next 4/4] tcp: use RCU lookup in __inet_hash_connect()")
url: https://github.com/intel-lab-lkp/linux/commits/Eric-Dumazet/tcp-use-RCU-in-__inet-6-_check_established/20250302-204711
base: https://git.kernel.org/cgit/linux/kernel/git/davem/net-next.git f77f12010f67259bd0e1ad18877ed27c721b627a
patch link: https://lore.kernel.org/all/20250302124237.3913746-5-edumazet@google.com/
patch subject: [PATCH net-next 4/4] tcp: use RCU lookup in __inet_hash_connect()

testcase: stress-ng
config: x86_64-rhel-9.4
compiler: gcc-12
test machine: 224 threads 2 sockets Intel(R) Xeon(R) Platinum 8480CTDX (Sapphire Rapids) with 256G memory
parameters:

	nr_threads: 100%
	testtime: 60s
	test: sockmany
	cpufreq_governor: performance

Details are as below:
-------------------------------------------------------------------------------------------------->


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20250310/202503102159.5f78c207-lkp@intel.com

=========================================================================================
compiler/cpufreq_governor/kconfig/nr_threads/rootfs/tbox_group/test/testcase/testtime:
  gcc-12/performance/x86_64-rhel-9.4/100%/debian-12-x86_64-20240206.cgz/lkp-spr-r02/sockmany/stress-ng/60s

commit: 
  4f97f75a5b ("tcp: add RCU management to inet_bind_bucket")
  ba6c94b99d ("tcp: use RCU lookup in __inet_hash_connect()")

4f97f75a5bfa79ba ba6c94b99d772f431fd589dd2cd 
---------------- --------------------------- 
         %stddev     %change         %stddev
             \          |                \  
   1742139 ± 89%     -91.6%     146373 ± 56%  numa-meminfo.node1.Unevictable
      0.61 ±  3%      +0.1        0.71 ±  3%  mpstat.cpu.all.irq%
      0.42            +0.0        0.46 ±  2%  mpstat.cpu.all.usr%
    435534 ± 89%     -91.6%      36593 ± 56%  numa-vmstat.node1.nr_unevictable
    435534 ± 89%     -91.6%      36593 ± 56%  numa-vmstat.node1.nr_zone_unevictable
   4057584            +7.0%    4340521        stress-ng.sockmany.ops
     67264            +6.9%      71933        stress-ng.sockmany.ops_per_sec
    604900           +12.3%     679404 ±  4%  perf-c2c.DRAM.local
     42998 ±  2%     -55.7%      19034 ±  3%  perf-c2c.HITM.local
     13764 ±  4%     -95.2%     663.67 ± 13%  perf-c2c.HITM.remote
     56762 ±  2%     -65.3%      19698 ±  4%  perf-c2c.HITM.total
   7422009           +13.2%    8403980 ±  2%  sched_debug.cfs_rq:/.avg_vruntime.max
    195564 ±  5%     +62.7%     318178 ± 10%  sched_debug.cfs_rq:/.avg_vruntime.stddev
      0.23 ±  7%     +25.4%       0.29 ±  4%  sched_debug.cfs_rq:/.h_nr_queued.stddev
     39935 ±  4%     +27.0%      50726 ± 29%  sched_debug.cfs_rq:/.load_avg.max
   7422009           +13.2%    8403980 ±  2%  sched_debug.cfs_rq:/.min_vruntime.max
    195564 ±  5%     +62.7%     318178 ± 10%  sched_debug.cfs_rq:/.min_vruntime.stddev
      0.23 ±  6%     +26.6%       0.29 ±  4%  sched_debug.cpu.nr_running.stddev
    387640            +5.9%     410501 ±  9%  proc-vmstat.nr_active_anon
    109911 ±  2%      +8.5%     119206 ±  2%  proc-vmstat.nr_mapped
    200627            +1.9%     204454        proc-vmstat.nr_shmem
    895041            +4.9%     939289        proc-vmstat.nr_slab_reclaimable
   2982921            +5.0%    3131084        proc-vmstat.nr_slab_unreclaimable
    387640            +5.9%     410501 ±  9%  proc-vmstat.nr_zone_active_anon
   2071760            +2.0%    2112591        proc-vmstat.numa_hit
   1839824            +2.2%    1880606        proc-vmstat.numa_local
   5905025            +5.2%    6210697        proc-vmstat.pgalloc_normal
   5291411 ± 12%     +11.9%    5921072        proc-vmstat.pgfree
      0.82 ± 13%     -29.0%       0.58 ±  6%  perf-sched.sch_delay.avg.ms.__cond_resched.__inet_hash_connect.tcp_v4_connect.__inet_stream_connect.inet_stream_connect
      4.50 ± 16%     +29.5%       5.83 ± 15%  perf-sched.sch_delay.max.ms.__cond_resched.generic_perform_write.shmem_file_write_iter.vfs_write.ksys_write
      0.03 ± 56%     -88.8%       0.00 ±223%  perf-sched.sch_delay.max.ms.__cond_resched.stop_one_cpu.migrate_task_to.task_numa_migrate.isra
      0.07 ±125%   +3754.0%       2.67 ± 71%  perf-sched.sch_delay.max.ms.__cond_resched.ww_mutex_lock.drm_gem_vunmap_unlocked.drm_gem_fb_vunmap.drm_atomic_helper_commit_planes
     19.83           -22.3%      15.41        perf-sched.total_wait_and_delay.average.ms
    177991           +32.7%     236147        perf-sched.total_wait_and_delay.count.ms
     19.76           -22.3%      15.35        perf-sched.total_wait_time.average.ms
      1.64 ± 12%     -28.9%       1.17 ±  6%  perf-sched.wait_and_delay.avg.ms.__cond_resched.__inet_hash_connect.tcp_v4_connect.__inet_stream_connect.inet_stream_connect
     13.69           -26.2%      10.10        perf-sched.wait_and_delay.avg.ms.schedule_timeout.inet_csk_accept.inet_accept.do_accept
      6844           +11.8%       7651 ±  3%  perf-sched.wait_and_delay.count.__cond_resched.__inet_hash_connect.tcp_v4_connect.__inet_stream_connect.inet_stream_connect
     78701           +33.6%     105168        perf-sched.wait_and_delay.count.__cond_resched.__release_sock.release_sock.__inet_stream_connect.inet_stream_connect
     81026           +35.2%     109539        perf-sched.wait_and_delay.count.schedule_timeout.inet_csk_accept.inet_accept.do_accept
      2268 ± 14%     +90.6%       4325 ±  6%  perf-sched.wait_and_delay.count.schedule_timeout.wait_woken.sk_wait_data.tcp_recvmsg_locked
      0.82 ± 12%     -28.6%       0.59 ±  6%  perf-sched.wait_time.avg.ms.__cond_resched.__inet_hash_connect.tcp_v4_connect.__inet_stream_connect.inet_stream_connect
     13.49           -26.5%       9.91        perf-sched.wait_time.avg.ms.__cond_resched.__release_sock.release_sock.tcp_sendmsg.__sys_sendto
      3.05 ±  3%     +16.5%       3.55 ±  3%  perf-sched.wait_time.avg.ms.__cond_resched.__wait_for_common.affine_move_task.__set_cpus_allowed_ptr.__sched_setaffinity
     30.10 ± 20%     -64.4%      10.72 ±113%  perf-sched.wait_time.avg.ms.do_task_dead.do_exit.do_group_exit.__x64_sys_exit_group.x64_sys_call
      1.14 ±  9%     +22.2%       1.40 ±  7%  perf-sched.wait_time.avg.ms.schedule_timeout.__wait_for_common.wait_for_completion_state.kernel_clone
     13.67           -26.3%      10.08        perf-sched.wait_time.avg.ms.schedule_timeout.inet_csk_accept.inet_accept.do_accept
      7.36 ± 57%    +103.9%      15.01 ± 27%  perf-sched.wait_time.avg.ms.syscall_exit_to_user_mode.do_syscall_64.entry_SYSCALL_64_after_hwframe.[unknown]
      0.03 ± 56%     -88.8%       0.00 ±223%  perf-sched.wait_time.max.ms.__cond_resched.stop_one_cpu.migrate_task_to.task_numa_migrate.isra
      0.07 ±125%    +4e+05%     275.31 ±115%  perf-sched.wait_time.max.ms.__cond_resched.ww_mutex_lock.drm_gem_vunmap_unlocked.drm_gem_fb_vunmap.drm_atomic_helper_commit_planes
     35.70           +15.3%      41.18        perf-stat.i.MPKI
 1.368e+10            +4.6%  1.431e+10        perf-stat.i.branch-instructions
      2.15            +0.1        2.27        perf-stat.i.branch-miss-rate%
 2.884e+08           +10.7%  3.192e+08        perf-stat.i.branch-misses
     71.62            +5.5       77.09        perf-stat.i.cache-miss-rate%
 2.377e+09           +26.3%  3.003e+09        perf-stat.i.cache-misses
 3.264e+09           +17.4%  3.832e+09        perf-stat.i.cache-references
      9.40            -8.1%       8.64        perf-stat.i.cpi
    292.27           -18.0%     239.70        perf-stat.i.cycles-between-cache-misses
 6.963e+10            +9.8%  7.645e+10        perf-stat.i.instructions
      0.12 ±  2%      +7.3%       0.13        perf-stat.i.ipc
     34.12           +15.0%      39.25        perf-stat.overall.MPKI
      2.11            +0.1        2.23        perf-stat.overall.branch-miss-rate%
     72.81            +5.5       78.36        perf-stat.overall.cache-miss-rate%
      9.07            -8.4%       8.31        perf-stat.overall.cpi
    265.92           -20.4%     211.72        perf-stat.overall.cycles-between-cache-misses
      0.11            +9.2%       0.12        perf-stat.overall.ipc
 1.345e+10            +4.6%  1.408e+10        perf-stat.ps.branch-instructions
 2.835e+08           +10.7%  3.139e+08        perf-stat.ps.branch-misses
 2.337e+09           +26.3%  2.952e+09        perf-stat.ps.cache-misses
 3.209e+09           +17.4%  3.768e+09        perf-stat.ps.cache-references
 6.849e+10            +9.8%  7.521e+10        perf-stat.ps.instructions
 4.236e+12            +9.1%  4.621e+12        perf-stat.total.instructions

Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.

Patch

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 73c0e4087fd1a6d0d2a40ab0394165e07b08ed6d..b12797f13c9a3d66fab99c877d059f9c29c30d11 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -529,7 +529,8 @@  int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 			struct sock *sk, u64 port_offset,
 			int (*check_established)(struct inet_timewait_death_row *,
 						 struct sock *, __u16,
-						 struct inet_timewait_sock **));
+						 struct inet_timewait_sock **,
+						 bool rcu_lookup));
 
 int inet_hash_connect(struct inet_timewait_death_row *death_row,
 		      struct sock *sk);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index b737e13f8459c53428980221355344327c4bc8dd..d1b5f45ee718410fdf3e78c113c7ebd4a1ddba40 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -537,7 +537,8 @@  EXPORT_SYMBOL_GPL(__inet_lookup_established);
 /* called with local bh disabled */
 static int __inet_check_established(struct inet_timewait_death_row *death_row,
 				    struct sock *sk, __u16 lport,
-				    struct inet_timewait_sock **twp)
+				    struct inet_timewait_sock **twp,
+				    bool rcu_lookup)
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
 	struct inet_sock *inet = inet_sk(sk);
@@ -556,17 +557,17 @@  static int __inet_check_established(struct inet_timewait_death_row *death_row,
 	struct sock *sk2;
 	spinlock_t *lock;
 
-	rcu_read_lock();
-	sk_nulls_for_each(sk2, node, &head->chain) {
-		if (sk2->sk_hash != hash ||
-		    !inet_match(net, sk2, acookie, ports, dif, sdif))
-			continue;
-		if (sk2->sk_state == TCP_TIME_WAIT)
-			break;
-		rcu_read_unlock();
-		return -EADDRNOTAVAIL;
+	if (rcu_lookup) {
+		sk_nulls_for_each(sk2, node, &head->chain) {
+			if (sk2->sk_hash != hash ||
+			    !inet_match(net, sk2, acookie, ports, dif, sdif))
+				continue;
+			if (sk2->sk_state == TCP_TIME_WAIT)
+				break;
+			return -EADDRNOTAVAIL;
+		}
+		return 0;
 	}
-	rcu_read_unlock();
 
 	lock = inet_ehash_lockp(hinfo, hash);
 	spin_lock(lock);
@@ -1007,7 +1008,8 @@  static u32 *table_perturb;
 int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		struct sock *sk, u64 port_offset,
 		int (*check_established)(struct inet_timewait_death_row *,
-			struct sock *, __u16, struct inet_timewait_sock **))
+			struct sock *, __u16, struct inet_timewait_sock **,
+			bool rcu_lookup))
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
 	struct inet_bind_hashbucket *head, *head2;
@@ -1025,7 +1027,7 @@  int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 
 	if (port) {
 		local_bh_disable();
-		ret = check_established(death_row, sk, port, NULL);
+		ret = check_established(death_row, sk, port, NULL, false);
 		local_bh_enable();
 		return ret;
 	}
@@ -1061,6 +1063,21 @@  int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 			continue;
 		head = &hinfo->bhash[inet_bhashfn(net, port,
 						  hinfo->bhash_size)];
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(tb, &head->chain, node) {
+			if (!inet_bind_bucket_match(tb, net, port, l3mdev))
+				continue;
+			if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) {
+				rcu_read_unlock();
+				goto next_port;
+			}
+			if (!check_established(death_row, sk, port, &tw, true))
+				break;
+			rcu_read_unlock();
+			goto next_port;
+		}
+		rcu_read_unlock();
+
 		spin_lock_bh(&head->lock);
 
 		/* Does not bother with rcv_saddr checks, because
@@ -1070,12 +1087,12 @@  int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 			if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
 				if (tb->fastreuse >= 0 ||
 				    tb->fastreuseport >= 0)
-					goto next_port;
+					goto next_port_unlock;
 				WARN_ON(hlist_empty(&tb->bhash2));
 				if (!check_established(death_row, sk,
-						       port, &tw))
+						       port, &tw, false))
 					goto ok;
-				goto next_port;
+				goto next_port_unlock;
 			}
 		}
 
@@ -1089,8 +1106,9 @@  int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		tb->fastreuse = -1;
 		tb->fastreuseport = -1;
 		goto ok;
-next_port:
+next_port_unlock:
 		spin_unlock_bh(&head->lock);
+next_port:
 		cond_resched();
 	}
 
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 3604a5cae5d29a25d24f9513308334ff8e64b083..9be315496459fcb391123a07ac887e2f59d27360 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -263,7 +263,8 @@  EXPORT_SYMBOL_GPL(inet6_lookup);
 
 static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 				     struct sock *sk, const __u16 lport,
-				     struct inet_timewait_sock **twp)
+				     struct inet_timewait_sock **twp,
+				     bool rcu_lookup)
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
 	struct inet_sock *inet = inet_sk(sk);
@@ -281,17 +282,18 @@  static int __inet6_check_established(struct inet_timewait_death_row *death_row,
 	struct sock *sk2;
 	spinlock_t *lock;
 
-	rcu_read_lock();
-	sk_nulls_for_each(sk2, node, &head->chain) {
-		if (sk2->sk_hash != hash ||
-		    !inet6_match(net, sk2, saddr, daddr, ports, dif, sdif))
-			continue;
-		if (sk2->sk_state == TCP_TIME_WAIT)
-			break;
-		rcu_read_unlock();
-		return -EADDRNOTAVAIL;
+	if (rcu_lookup) {
+		sk_nulls_for_each(sk2, node, &head->chain) {
+			if (sk2->sk_hash != hash ||
+			    !inet6_match(net, sk2, saddr, daddr,
+					 ports, dif, sdif))
+				continue;
+			if (sk2->sk_state == TCP_TIME_WAIT)
+				break;
+			return -EADDRNOTAVAIL;
+		}
+		return 0;
 	}
-	rcu_read_unlock();
 
 	lock = inet_ehash_lockp(hinfo, hash);
 	spin_lock(lock);