diff mbox series

[net-next] tcp: Use per-vma locking for receive zerocopy

Message ID 20230615185516.3738855-2-arjunroy.kdev@gmail.com (mailing list archive)
State Superseded
Delegated to: Netdev Maintainers
Headers show
Series [net-next] tcp: Use per-vma locking for receive zerocopy | expand

Checks

Context Check Description
netdev/series_format success Single patches do not need cover letters
netdev/tree_selection success Clearly marked for net-next, async
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit success Errors and warnings before: 1005 this patch: 1005
netdev/cc_maintainers warning 6 maintainers not CCed: kuba@kernel.org akpm@linux-foundation.org dsahern@kernel.org davem@davemloft.net linux-mm@kvack.org pabeni@redhat.com
netdev/build_clang success Errors and warnings before: 129 this patch: 129
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn success Errors and warnings before: 1012 this patch: 1012
netdev/checkpatch success total: 0 errors, 0 warnings, 0 checks, 120 lines checked
netdev/kdoc success Errors and warnings before: 2 this patch: 2
netdev/source_inline success Was 0 now: 0

Commit Message

Arjun Roy June 15, 2023, 6:55 p.m. UTC
From: Arjun Roy <arjunroy@google.com>

Per-VMA locking allows us to lock a struct vm_area_struct without
taking the process-wide mmap lock in read mode.

Consider a process workload where the mmap lock is taken constantly in
write mode. In this scenario, all zerocopy receives are periodically
blocked during that period of time - though in principle, the memory
ranges being used by TCP are not touched by the operations that need
the mmap write lock. This results in performance degradation.

Now consider another workload where the mmap lock is never taken in
write mode, but there are many TCP connections using receive zerocopy
that are concurrently receiving. These connections all take the mmap
lock in read mode, but this does induce a lot of contention and atomic
ops for this process-wide lock. This results in additional CPU
overhead caused by contending on the cache line for this lock.

However, with per-vma locking, both of these problems can be avoided.

As a test, I ran an RPC-style request/response workload with 4KB
payloads and receive zerocopy enabled, with 100 simultaneous TCP
connections. I measured perf cycles within the
find_tcp_vma/mmap_read_lock/mmap_read_unlock codepath, with and
without per-vma locking enabled.

When using process-wide mmap semaphore read locking, about 1% of
measured perf cycles were within this path. With per-VMA locking, this
value dropped to about 0.45%.

Signed-off-by: Arjun Roy <arjunroy@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
---
 MAINTAINERS            |  1 +
 include/linux/net_mm.h |  8 ++++++++
 include/net/tcp.h      |  1 +
 mm/memory.c            |  7 ++++---
 net/ipv4/tcp.c         | 45 ++++++++++++++++++++++++++++++++++--------
 5 files changed, 51 insertions(+), 11 deletions(-)
 create mode 100644 include/linux/net_mm.h

Comments

kernel test robot June 16, 2023, 1:25 a.m. UTC | #1
Hi Arjun,

kernel test robot noticed the following build errors:

[auto build test ERROR on net-next/main]

url:    https://github.com/intel-lab-lkp/linux/commits/Arjun-Roy/tcp-Use-per-vma-locking-for-receive-zerocopy/20230616-025823
base:   net-next/main
patch link:    https://lore.kernel.org/r/20230615185516.3738855-2-arjunroy.kdev%40gmail.com
patch subject: [net-next] tcp: Use per-vma locking for receive zerocopy
config: s390-randconfig-r026-20230615 (https://download.01.org/0day-ci/archive/20230616/202306160941.CgtiNISL-lkp@intel.com/config)
compiler: clang version 17.0.0 (https://github.com/llvm/llvm-project.git 4a5ac14ee968ff0ad5d2cc1ffa0299048db4c88a)
reproduce (this is a W=1 build):
        mkdir -p ~/bin
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # install s390 cross compiling tool for clang build
        # apt-get install binutils-s390x-linux-gnu
        git remote add net-next https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git
        git fetch net-next main
        git checkout net-next/main
        b4 shazam https://lore.kernel.org/r/20230615185516.3738855-2-arjunroy.kdev@gmail.com
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang ~/bin/make.cross W=1 O=build_dir ARCH=s390 olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang ~/bin/make.cross W=1 O=build_dir ARCH=s390 SHELL=/bin/bash

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202306160941.CgtiNISL-lkp@intel.com/

All errors (new ones prefixed by >>):

   s390x-linux-ld: mm/memory.o: in function `lock_vma_under_rcu':
>> memory.c:(.text+0x9784): undefined reference to `tcp_vm_ops'
   s390x-linux-ld: drivers/dma/qcom/hidma.o: in function `hidma_probe':
   hidma.c:(.text+0x3e): undefined reference to `devm_ioremap_resource'
   s390x-linux-ld: hidma.c:(.text+0x96): undefined reference to `devm_ioremap_resource'
   s390x-linux-ld: drivers/net/ethernet/altera/altera_tse_main.o: in function `altera_tse_probe':
   altera_tse_main.c:(.text+0x12c): undefined reference to `devm_ioremap'
   s390x-linux-ld: altera_tse_main.c:(.text+0x28a): undefined reference to `devm_ioremap'
   s390x-linux-ld: altera_tse_main.c:(.text+0x316): undefined reference to `devm_ioremap'
   s390x-linux-ld: drivers/net/ethernet/altera/altera_tse_main.o: in function `request_and_map':
   altera_tse_main.c:(.text+0x688): undefined reference to `devm_ioremap'
diff mbox series

Patch

diff --git a/MAINTAINERS b/MAINTAINERS
index c6fa6ed454f4..a7c495e3323b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14727,6 +14727,7 @@  NETWORKING [TCP]
 M:	Eric Dumazet <edumazet@google.com>
 L:	netdev@vger.kernel.org
 S:	Maintained
+F:	include/linux/net_mm.h
 F:	include/linux/tcp.h
 F:	include/net/tcp.h
 F:	include/trace/events/tcp.h
diff --git a/include/linux/net_mm.h b/include/linux/net_mm.h
new file mode 100644
index 000000000000..a4a3301e05be
--- /dev/null
+++ b/include/linux/net_mm.h
@@ -0,0 +1,8 @@ 
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifdef CONFIG_MMU
+extern const struct vm_operations_struct tcp_vm_ops;
+static inline bool vma_is_tcp(const struct vm_area_struct *vma)
+{
+	return vma->vm_ops == &tcp_vm_ops;
+}
+#endif
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 5066e4586cf0..bfa5e27205ba 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -45,6 +45,7 @@ 
 #include <linux/memcontrol.h>
 #include <linux/bpf-cgroup.h>
 #include <linux/siphash.h>
+#include <linux/net_mm.h>
 
 extern struct inet_hashinfo tcp_hashinfo;
 
diff --git a/mm/memory.c b/mm/memory.c
index f69fbc251198..3e46b4d881dc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -77,6 +77,7 @@ 
 #include <linux/ptrace.h>
 #include <linux/vmalloc.h>
 #include <linux/sched/sysctl.h>
+#include <linux/net_mm.h>
 
 #include <trace/events/kmem.h>
 
@@ -5280,12 +5281,12 @@  struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
 	if (!vma)
 		goto inval;
 
-	/* Only anonymous vmas are supported for now */
-	if (!vma_is_anonymous(vma))
+	/* Only anonymous and tcp vmas are supported for now */
+	if (!vma_is_anonymous(vma) && !vma_is_tcp(vma))
 		goto inval;
 
 	/* find_mergeable_anon_vma uses adjacent vmas which are not locked */
-	if (!vma->anon_vma)
+	if (!vma->anon_vma && !vma_is_tcp(vma))
 		goto inval;
 
 	if (!vma_start_read(vma))
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8d20d9221238..6240d81476b8 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1877,7 +1877,7 @@  void tcp_update_recv_tstamps(struct sk_buff *skb,
 }
 
 #ifdef CONFIG_MMU
-static const struct vm_operations_struct tcp_vm_ops = {
+const struct vm_operations_struct tcp_vm_ops = {
 };
 
 int tcp_mmap(struct file *file, struct socket *sock,
@@ -2176,6 +2176,34 @@  static void tcp_zc_finalize_rx_tstamp(struct sock *sk,
 	}
 }
 
+static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm,
+					   unsigned long address,
+					   bool *mmap_locked)
+{
+	struct vm_area_struct *vma = NULL;
+
+#ifdef CONFIG_PER_VMA_LOCK
+	vma = lock_vma_under_rcu(mm, address);
+#endif
+	if (vma) {
+		if (!vma_is_tcp(vma)) {
+			vma_end_read(vma);
+			return NULL;
+		}
+		*mmap_locked = false;
+		return vma;
+	}
+
+	mmap_read_lock(mm);
+	vma = vma_lookup(mm, address);
+	if (!vma || !vma_is_tcp(vma)) {
+		mmap_read_unlock(mm);
+		return NULL;
+	}
+	*mmap_locked = true;
+	return vma;
+}
+
 #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
 static int tcp_zerocopy_receive(struct sock *sk,
 				struct tcp_zerocopy_receive *zc,
@@ -2193,6 +2221,7 @@  static int tcp_zerocopy_receive(struct sock *sk,
 	u32 seq = tp->copied_seq;
 	u32 total_bytes_to_map;
 	int inq = tcp_inq(sk);
+	bool mmap_locked;
 	int ret;
 
 	zc->copybuf_len = 0;
@@ -2217,13 +2246,10 @@  static int tcp_zerocopy_receive(struct sock *sk,
 		return 0;
 	}
 
-	mmap_read_lock(current->mm);
-
-	vma = vma_lookup(current->mm, address);
-	if (!vma || vma->vm_ops != &tcp_vm_ops) {
-		mmap_read_unlock(current->mm);
+	vma = find_tcp_vma(current->mm, address, &mmap_locked);
+	if (!vma)
 		return -EINVAL;
-	}
+
 	vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
 	avail_len = min_t(u32, vma_len, inq);
 	total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
@@ -2297,7 +2323,10 @@  static int tcp_zerocopy_receive(struct sock *sk,
 						   zc, total_bytes_to_map);
 	}
 out:
-	mmap_read_unlock(current->mm);
+	if (mmap_locked)
+		mmap_read_unlock(current->mm);
+	else
+		vma_end_read(vma);
 	/* Try to copy straggler data. */
 	if (!ret)
 		copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);