diff mbox series

[net-next,v8,2/3] sock: add MSG_ZEROCOPY notification mechanism based on msg_control

Message ID 20240730184120.4089835-3-zijianzhang@bytedance.com (mailing list archive)
State New
Headers show
Series net: A lightweight zero-copy notification mechanism for MSG_ZEROCOPY | expand

Commit Message

Zijian Zhang July 30, 2024, 6:41 p.m. UTC
From: Zijian Zhang <zijianzhang@bytedance.com>

The MSG_ZEROCOPY flag enables copy avoidance for socket send calls.
However, zerocopy is not a free lunch. Apart from the management of user
pages, the combination of poll + recvmsg to receive notifications incurs
non-negligible overhead in applications. We try to mitigate this overhead
with a new notification mechanism based on msg_control. Leveraging the
general framework for copying cmsgs to user space, we copy zerocopy
notifications to the user when sendmsg returns.

Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
Signed-off-by: Xiaochun Lu <xiaochun.lu@bytedance.com>
---
 arch/alpha/include/uapi/asm/socket.h  |  2 +
 arch/mips/include/uapi/asm/socket.h   |  2 +
 arch/parisc/include/uapi/asm/socket.h |  2 +
 arch/sparc/include/uapi/asm/socket.h  |  2 +
 include/linux/socket.h                |  2 +-
 include/uapi/asm-generic/socket.h     |  2 +
 include/uapi/linux/socket.h           | 23 +++++++++
 net/core/sock.c                       | 72 +++++++++++++++++++++++++--
 8 files changed, 102 insertions(+), 5 deletions(-)

Comments

Willem de Bruijn July 31, 2024, 10:20 p.m. UTC | #1
zijianzhang@ wrote:
> From: Zijian Zhang <zijianzhang@bytedance.com>
> 
> The MSG_ZEROCOPY flag enables copy avoidance for socket send calls.
> However, zerocopy is not a free lunch. Apart from the management of user
> pages, the combination of poll + recvmsg to receive notifications incurs
> unignorable overhead in the applications. We try to mitigate this overhead
> with a new notification mechanism based on msg_control. Leveraging the
> general framework to copy cmsgs to the user space, we copy zerocopy
> notifications to the user upon returning of sendmsgs.

May want to

- Explicitly state that receiving notifications on sendmsg is
  optional and existing recvmsg MSG_ERRQUEUE continues to work

- Include a very brief example of how this interface is used.
  Probably pseudo-code, as msghdr setup and CMSG processing are
  verbose operations

Btw patchwork shows red for patch 1/3 due to a new error or warning.
Not sure if it's a false positive, but take a look.
 
> Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
> Signed-off-by: Xiaochun Lu <xiaochun.lu@bytedance.com>

> +/*
> + * zc_info is the struct used for the SCM_ZC_NOTIFICATION control message.
> + */
> +struct zc_info {
> +	__u32 size; /* size of the zc_info_elem arr */

Size is ambiguous, could mean byte size. Perhaps length, or number of
elements in arr[].

> +	struct zc_info_elem arr[];
> +};
Jakub Kicinski Aug. 1, 2024, 1:29 a.m. UTC | #2
On Wed, 31 Jul 2024 18:20:35 -0400 Willem de Bruijn wrote:
> Btw patchwork shows red for patch 1/3 due to a new error or warning.
> Not sure if it's a false positive, but take a look.

Patchwork is not for contributors, I keep repeating this :|
Were you not in the room at netdev when I was talking about NIPA
or am I this shit at communicating?

Next person pointing someone to patchwork will get a task to fix
something in NIPA.
Willem de Bruijn Aug. 1, 2024, 5:52 p.m. UTC | #3
On Wed, Jul 31, 2024 at 9:29 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Wed, 31 Jul 2024 18:20:35 -0400 Willem de Bruijn wrote:
> > Btw patchwork shows red for patch 1/3 due to a new error or warning.
> > Not sure if it's a false positive, but take a look.
>
> Patchwork is not for contributors, I keep repeating this :|
> Were you not in the room at netdev when I was talking about NIPA
> or am I this shit at communicating?
>
> Next person pointing someone to patchwork will get a task to fix
> something in NIPA.

:-)

It's a super informative tool. I did miss the point about the intended
audience, use cases and known limitations (such as false positives).
Got it now!

Looking forward to the netdev talks and slides online soon.
diff mbox series

Patch

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index e94f621903fe..7c32d9dbe47f 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -140,6 +140,8 @@ 
 #define SO_PASSPIDFD		76
 #define SO_PEERPIDFD		77
 
+#define SCM_ZC_NOTIFICATION	78
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 60ebaed28a4c..3f7fade998cb 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -151,6 +151,8 @@ 
 #define SO_PASSPIDFD		76
 #define SO_PEERPIDFD		77
 
+#define SCM_ZC_NOTIFICATION	78
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index be264c2b1a11..77f5bee0fdc9 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -132,6 +132,8 @@ 
 #define SO_PASSPIDFD		0x404A
 #define SO_PEERPIDFD		0x404B
 
+#define SCM_ZC_NOTIFICATION	0x404C
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 682da3714686..eb44fc515b45 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -133,6 +133,8 @@ 
 #define SO_PASSPIDFD             0x0055
 #define SO_PEERPIDFD             0x0056
 
+#define SCM_ZC_NOTIFICATION      0x0057
+
 #if !defined(__KERNEL__)
 
 
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 40173c919d0f..71e3c6ebfed5 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -171,7 +171,7 @@  static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr
 
 static inline bool cmsg_copy_to_user(struct cmsghdr *__cmsg)
 {
-	return 0;
+	return __cmsg->cmsg_type == SCM_ZC_NOTIFICATION;
 }
 
 static inline size_t msg_data_left(struct msghdr *msg)
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 8ce8a39a1e5f..02e9159c7944 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -135,6 +135,8 @@ 
 #define SO_PASSPIDFD		76
 #define SO_PEERPIDFD		77
 
+#define SCM_ZC_NOTIFICATION	78
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/include/uapi/linux/socket.h b/include/uapi/linux/socket.h
index d3fcd3b5ec53..b5b5fa9febb1 100644
--- a/include/uapi/linux/socket.h
+++ b/include/uapi/linux/socket.h
@@ -2,6 +2,8 @@ 
 #ifndef _UAPI_LINUX_SOCKET_H
 #define _UAPI_LINUX_SOCKET_H
 
+#include <linux/types.h>
+
 /*
  * Desired design of maximum size and alignment (see RFC2553)
  */
@@ -35,4 +37,25 @@  struct __kernel_sockaddr_storage {
 #define SOCK_TXREHASH_DISABLED	0
 #define SOCK_TXREHASH_ENABLED	1
 
+#define ZC_NOTIFICATION_MAX	16
+
+/*
+ * A zc_info_elem represents a completion notification for the sendmsg calls
+ * in the range lo to hi; zerocopy indicates whether the underlying
+ * transmission was zerocopy or not.
+ */
+struct zc_info_elem {
+	__u32 lo;
+	__u32 hi;
+	__u8 zerocopy;
+};
+
+/*
+ * zc_info is the struct used for the SCM_ZC_NOTIFICATION control message.
+ */
+struct zc_info {
+	__u32 size; /* number of zc_info_elem entries in arr[], not a byte size */
+	struct zc_info_elem arr[];
+};
+
 #endif /* _UAPI_LINUX_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index b2cbe753af1d..37b1b12623ee 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1481,10 +1481,12 @@  int sk_setsockopt(struct sock *sk, int level, int optname,
 			ret = -EOPNOTSUPP;
 		}
 		if (!ret) {
-			if (val < 0 || val > 1)
+			if (val < 0 || val > 1) {
 				ret = -EINVAL;
-			else
+			} else {
 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
+				static_branch_enable(&tx_copy_cmsg_to_user_key);
+			}
 		}
 		break;
 
@@ -2826,8 +2828,8 @@  struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
 }
 EXPORT_SYMBOL(sock_alloc_send_pskb);
 
-int __sock_cmsg_send(struct sock *sk, struct msghdr *msg __always_unused,
-		     struct cmsghdr *cmsg, struct sockcm_cookie *sockc)
+int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
+		     struct sockcm_cookie *sockc)
 {
 	u32 tsflags;
 
@@ -2863,6 +2865,68 @@  int __sock_cmsg_send(struct sock *sk, struct msghdr *msg __always_unused,
 	case SCM_RIGHTS:
 	case SCM_CREDENTIALS:
 		break;
+	case SCM_ZC_NOTIFICATION: {
+		struct zc_info *zc = CMSG_DATA(cmsg);
+		struct sk_buff_head *q, local_q;
+		int cmsg_data_len, i = 0;
+		unsigned long flags;
+		struct sk_buff *skb;
+
+		if (!sock_flag(sk, SOCK_ZEROCOPY) || sk->sk_family == PF_RDS)
+			return -EINVAL;
+
+		cmsg_data_len = cmsg->cmsg_len - sizeof(struct cmsghdr);
+		if (cmsg_data_len < sizeof(struct zc_info))
+			return -EINVAL;
+
+		if (zc->size > ZC_NOTIFICATION_MAX ||
+		    (cmsg_data_len - sizeof(struct zc_info)) !=
+		    (zc->size * sizeof(struct zc_info_elem)))
+			return -EINVAL;
+
+		q = &sk->sk_error_queue;
+		skb_queue_head_init(&local_q);
+
+		/* Get zerocopy error messages from sk_error_queue, and add them
+		 * to a local queue for later processing. This minimizes the
+		 * code while the spinlock is held and irq is disabled.
+		 */
+		spin_lock_irqsave(&q->lock, flags);
+		skb = skb_peek(q);
+		while (skb && i < zc->size) {
+			struct sk_buff *skb_next = skb_peek_next(skb, q);
+			struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+
+			if (serr->ee.ee_errno != 0 ||
+			    serr->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY) {
+				skb = skb_next;
+				continue;
+			}
+
+			__skb_unlink(skb, q);
+			__skb_queue_tail(&local_q, skb);
+			skb = skb_next;
+			i++;
+		}
+		spin_unlock_irqrestore(&q->lock, flags);
+
+		i = 0;
+		while ((skb = skb_peek(&local_q)) != NULL) {
+			struct sock_exterr_skb *serr = SKB_EXT_ERR(skb);
+
+			zc->arr[i].hi = serr->ee.ee_data;
+			zc->arr[i].lo = serr->ee.ee_info;
+			zc->arr[i].zerocopy = !(serr->ee.ee_code
+						& SO_EE_CODE_ZEROCOPY_COPIED);
+			__skb_unlink(skb, &local_q);
+			consume_skb(skb);
+			i++;
+		}
+
+		zc->size = i;
+		msg->msg_control_copy_to_user = true;
+		break;
+	}
 	default:
 		return -EINVAL;
 	}