diff mbox series

[net-next,v2,2/3] sock: add MSG_ZEROCOPY notification mechanism based on msg_control

Message ID 20240419214819.671536-3-zijianzhang@bytedance.com (mailing list archive)
State Changes Requested
Delegated to: Netdev Maintainers
Headers show
Series net: A lightweight zero-copy notification | expand

Checks

Context Check Description
netdev/series_format success Posting correctly formatted
netdev/tree_selection success Clearly marked for net-next, async
netdev/ynl success Generated files up to date; no warnings/errors; no diff in generated;
netdev/fixes_present success Fixes tag not required for -next series
netdev/header_inline success No static functions without inline keyword in header files
netdev/build_32bit fail Errors and warnings before: 6344 this patch: 6345
netdev/build_tools success Errors and warnings before: 0 this patch: 0
netdev/cc_maintainers warning 16 maintainers not CCed: linux-arch@vger.kernel.org mattst88@gmail.com richard.henderson@linaro.org arnd@arndb.de brauner@kernel.org linux-mips@vger.kernel.org linux-parisc@vger.kernel.org tsbogend@alpha.franken.de deller@gmx.de sparclinux@vger.kernel.org alexander@mihalicyn.com pabeni@redhat.com James.Bottomley@HansenPartnership.com linux-alpha@vger.kernel.org andreas@gaisler.com ink@jurassic.park.msu.ru
netdev/build_clang success Errors and warnings before: 2064 this patch: 2064
netdev/verify_signedoff success Signed-off-by tag matches author and committer
netdev/deprecated_api success None detected
netdev/check_selftest success No net selftest shell script
netdev/verify_fixes success No Fixes tag
netdev/build_allmodconfig_warn fail Errors and warnings before: 16177 this patch: 16178
netdev/checkpatch warning WARNING: line length of 85 exceeds 80 columns WARNING: line length of 87 exceeds 80 columns WARNING: line length of 91 exceeds 80 columns WARNING: line length of 94 exceeds 80 columns
netdev/build_clang_rust success No Rust files in patch. Skipping build
netdev/kdoc success Errors and warnings before: 9 this patch: 9
netdev/source_inline success Was 0 now: 0

Commit Message

Zijian Zhang April 19, 2024, 9:48 p.m. UTC
From: Zijian Zhang <zijianzhang@bytedance.com>

The MSG_ZEROCOPY flag enables copy avoidance for socket send calls.
However, zerocopy is not a free lunch. Apart from the management of user
pages, the combination of poll + recvmsg to receive notifications incurs
unignorable overhead in the applications. The overhead of such sometimes
might be more than the CPU savings from zerocopy. We try to solve this
problem with a new notification mechanism based on msgcontrol.
This new mechanism aims to reduce the overhead associated with receiving
notifications by embedding them directly into user arguments passed with
each sendmsg control message. By doing so, we can significantly reduce
the complexity and overhead for managing notifications. In an ideal
pattern, the user will keep calling sendmsg with SO_ZC_NOTIFICATION
msg_control, and the notification will be delivered as soon as possible.

Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
Signed-off-by: Xiaochun Lu <xiaochun.lu@bytedance.com>
---
 arch/alpha/include/uapi/asm/socket.h  |  2 +
 arch/mips/include/uapi/asm/socket.h   |  2 +
 arch/parisc/include/uapi/asm/socket.h |  2 +
 arch/sparc/include/uapi/asm/socket.h  |  2 +
 include/uapi/asm-generic/socket.h     |  2 +
 include/uapi/linux/socket.h           | 16 ++++++
 net/core/sock.c                       | 70 +++++++++++++++++++++++++++
 7 files changed, 96 insertions(+)

Comments

Willem de Bruijn April 21, 2024, 3:45 p.m. UTC | #1
zijianzhang@ wrote:
> From: Zijian Zhang <zijianzhang@bytedance.com>
> 
> The MSG_ZEROCOPY flag enables copy avoidance for socket send calls.
> However, zerocopy is not a free lunch. Apart from the management of user
> pages, the combination of poll + recvmsg to receive notifications incurs
> unignorable overhead in the applications. The overhead of such sometimes
> might be more than the CPU savings from zerocopy. We try to solve this
> problem with a new notification mechanism based on msgcontrol.
> This new mechanism aims to reduce the overhead associated with receiving
> notifications by embedding them directly into user arguments passed with
> each sendmsg control message. By doing so, we can significantly reduce
> the complexity and overhead for managing notifications. In an ideal
> pattern, the user will keep calling sendmsg with SO_ZC_NOTIFICATION
> msg_control, and the notification will be delivered as soon as possible.
> 
> Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
> Signed-off-by: Xiaochun Lu <xiaochun.lu@bytedance.com>
> ---
>  arch/alpha/include/uapi/asm/socket.h  |  2 +
>  arch/mips/include/uapi/asm/socket.h   |  2 +
>  arch/parisc/include/uapi/asm/socket.h |  2 +
>  arch/sparc/include/uapi/asm/socket.h  |  2 +
>  include/uapi/asm-generic/socket.h     |  2 +
>  include/uapi/linux/socket.h           | 16 ++++++
>  net/core/sock.c                       | 70 +++++++++++++++++++++++++++
>  7 files changed, 96 insertions(+)
> 
> diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
> index e94f621903fe..b24622a9cd47 100644
> --- a/arch/alpha/include/uapi/asm/socket.h
> +++ b/arch/alpha/include/uapi/asm/socket.h
> @@ -140,6 +140,8 @@
>  #define SO_PASSPIDFD		76
>  #define SO_PEERPIDFD		77
>  
> +#define SO_ZC_NOTIFICATION 78
> +

SCM_ for cmsgs

>  /*
>   * Desired design of maximum size and alignment (see RFC2553)
>   */
> @@ -35,4 +37,18 @@ struct __kernel_sockaddr_storage {
>  #define SOCK_TXREHASH_DISABLED	0
>  #define SOCK_TXREHASH_ENABLED	1
>  
> +#define SOCK_ZC_INFO_MAX 256
> +
> +struct zc_info_elem {
> +	__u32 lo;
> +	__u32 hi;
> +	__u8 zerocopy;
> +};
> +
> +struct zc_info_usr {
> +	__u64 usr_addr;
> +	unsigned int length;
> +	struct zc_info_elem info[];
> +};
> +

Don't pass a pointer to user memory, just have msg_control point to an
array of zc_info_elem.

>  #endif /* _UAPI_LINUX_SOCKET_H */
> diff --git a/net/core/sock.c b/net/core/sock.c
> index fe9195186c13..13f06480f2d8 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -2809,6 +2809,13 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
>  		     struct sockcm_cookie *sockc)
>  {
>  	u32 tsflags;
> +	int ret, zc_info_size, i = 0;
> +	unsigned long flags;
> +	struct sk_buff_head *q, local_q;
> +	struct sk_buff *skb, *tmp;
> +	struct sock_exterr_skb *serr;
> +	struct zc_info_usr *zc_info_usr_p, *zc_info_kern_p;
> +	void __user	*usr_addr;

Please wrap the case in parentheses and define variables in that scope
(Since there are so many variables for this case only.)

>  
>  	switch (cmsg->cmsg_type) {
>  	case SO_MARK:
> @@ -2842,6 +2849,69 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
>  	case SCM_RIGHTS:
>  	case SCM_CREDENTIALS:
>  		break;
> +	case SO_ZC_NOTIFICATION:
> +		if (!sock_flag(sk, SOCK_ZEROCOPY) || sk->sk_family == PF_RDS)
> +			return -EINVAL;
> +

Why allow PF_RDS without the sock flag set?

> +		zc_info_usr_p = (struct zc_info_usr *)CMSG_DATA(cmsg);
> +		if (zc_info_usr_p->length <= 0 || zc_info_usr_p->length > SOCK_ZC_INFO_MAX)
> +			return -EINVAL;
> +
> +		zc_info_size = struct_size(zc_info_usr_p, info, zc_info_usr_p->length);
> +		if (cmsg->cmsg_len != CMSG_LEN(zc_info_size))
> +			return -EINVAL;

By passing a straightforward array, the array len can be inferred from
cmsg_len, simplifying all these checks.

See for instance how SO_DEVMEM_DONTNEED returns an array of tokens to
the kernel.

> +
> +		usr_addr = (void *)(uintptr_t)(zc_info_usr_p->usr_addr);
> +		if (!access_ok(usr_addr, zc_info_size))
> +			return -EFAULT;
> +
> +		zc_info_kern_p = kmalloc(zc_info_size, GFP_KERNEL);
> +		if (!zc_info_kern_p)
> +			return -ENOMEM;
> +
> +		q = &sk->sk_error_queue;
> +		skb_queue_head_init(&local_q);
> +		spin_lock_irqsave(&q->lock, flags);
> +		skb = skb_peek(q);
> +		while (skb && i < zc_info_usr_p->length) {
> +			struct sk_buff *skb_next = skb_peek_next(skb, q);
> +
> +			serr = SKB_EXT_ERR(skb);
> +			if (serr->ee.ee_errno == 0 &&
> +			    serr->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY) {
> +				zc_info_kern_p->info[i].hi = serr->ee.ee_data;
> +				zc_info_kern_p->info[i].lo = serr->ee.ee_info;
> +				zc_info_kern_p->info[i].zerocopy = !(serr->ee.ee_code
> +								& SO_EE_CODE_ZEROCOPY_COPIED);
> +				__skb_unlink(skb, q);
> +				__skb_queue_tail(&local_q, skb);
> +				i++;
> +			}
> +			skb = skb_next;
> +		}
> +		spin_unlock_irqrestore(&q->lock, flags);

In almost all sane cases, all outstanding notifications can be passed
to userspace.

It may be interesting to experiment with briefly taking the lock to
move to a private list. See for instance net_rx_action.

Then if userspace cannot handle all notifications, the rest have to be
spliced back. This can reorder notifications. But rare reordering is
not a correctness issue.

I would choose the more complex splice approach only if it shows
benefit, i.e., if taking the lock does contend with error enqueue
events.

> +
> +		zc_info_kern_p->usr_addr = zc_info_usr_p->usr_addr;
> +		zc_info_kern_p->length = i;
> +
> +		ret = copy_to_user(usr_addr,
> +				   zc_info_kern_p,
> +					struct_size(zc_info_kern_p, info, i));

You'll still need to support the gnarly MSG_CMSG_COMPAT version too.

Wait, is this the reason to pass a usr_addr explicitly? To get around
any compat issues?

Or even the entire issue of having to copy msg_sys->msg_control to
user if !msg_control_is_user.

I suppose this simplifies a lot in terms of development. If making the
user interface uglier.

IMHO the sane interface should be used eventually. There may also be
other uses of passing msg_control data up to userspace from sendmsg.

But this approach works for now for evaluation and discussion.
kernel test robot April 23, 2024, 3:16 a.m. UTC | #2
Hi,

kernel test robot noticed the following build warnings:

[auto build test WARNING on net-next/main]

url:    https://github.com/intel-lab-lkp/linux/commits/zijianzhang-bytedance-com/selftests-fix-OOM-problem-in-msg_zerocopy-selftest/20240420-055035
base:   net-next/main
patch link:    https://lore.kernel.org/r/20240419214819.671536-3-zijianzhang%40bytedance.com
patch subject: [PATCH net-next v2 2/3] sock: add MSG_ZEROCOPY notification mechanism based on msg_control
config: i386-randconfig-061-20240423 (https://download.01.org/0day-ci/archive/20240423/202404231145.7NfmE7Hn-lkp@intel.com/config)
compiler: gcc-13 (Ubuntu 13.2.0-4ubuntu3) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240423/202404231145.7NfmE7Hn-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202404231145.7NfmE7Hn-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
>> net/core/sock.c:2864:26: sparse: sparse: incorrect type in assignment (different address spaces) @@     expected void [noderef] __user *usr_addr @@     got void * @@
   net/core/sock.c:2864:26: sparse:     expected void [noderef] __user *usr_addr
   net/core/sock.c:2864:26: sparse:     got void *
   net/core/sock.c:2393:9: sparse: sparse: context imbalance in 'sk_clone_lock' - wrong count at exit
   net/core/sock.c:2397:6: sparse: sparse: context imbalance in 'sk_free_unlock_clone' - unexpected unlock
   net/core/sock.c:4103:13: sparse: sparse: context imbalance in 'proto_seq_start' - wrong count at exit
   net/core/sock.c:4115:13: sparse: sparse: context imbalance in 'proto_seq_stop' - wrong count at exit

vim +2864 net/core/sock.c

  2807	
  2808	int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
  2809			     struct sockcm_cookie *sockc)
  2810	{
  2811		u32 tsflags;
  2812		int ret, zc_info_size, i = 0;
  2813		unsigned long flags;
  2814		struct sk_buff_head *q, local_q;
  2815		struct sk_buff *skb, *tmp;
  2816		struct sock_exterr_skb *serr;
  2817		struct zc_info_usr *zc_info_usr_p, *zc_info_kern_p;
  2818		void __user	*usr_addr;
  2819	
  2820		switch (cmsg->cmsg_type) {
  2821		case SO_MARK:
  2822			if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
  2823			    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
  2824				return -EPERM;
  2825			if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
  2826				return -EINVAL;
  2827			sockc->mark = *(u32 *)CMSG_DATA(cmsg);
  2828			break;
  2829		case SO_TIMESTAMPING_OLD:
  2830		case SO_TIMESTAMPING_NEW:
  2831			if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
  2832				return -EINVAL;
  2833	
  2834			tsflags = *(u32 *)CMSG_DATA(cmsg);
  2835			if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
  2836				return -EINVAL;
  2837	
  2838			sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
  2839			sockc->tsflags |= tsflags;
  2840			break;
  2841		case SCM_TXTIME:
  2842			if (!sock_flag(sk, SOCK_TXTIME))
  2843				return -EINVAL;
  2844			if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
  2845				return -EINVAL;
  2846			sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
  2847			break;
  2848		/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
  2849		case SCM_RIGHTS:
  2850		case SCM_CREDENTIALS:
  2851			break;
  2852		case SO_ZC_NOTIFICATION:
  2853			if (!sock_flag(sk, SOCK_ZEROCOPY) || sk->sk_family == PF_RDS)
  2854				return -EINVAL;
  2855	
  2856			zc_info_usr_p = (struct zc_info_usr *)CMSG_DATA(cmsg);
  2857			if (zc_info_usr_p->length <= 0 || zc_info_usr_p->length > SOCK_ZC_INFO_MAX)
  2858				return -EINVAL;
  2859	
  2860			zc_info_size = struct_size(zc_info_usr_p, info, zc_info_usr_p->length);
  2861			if (cmsg->cmsg_len != CMSG_LEN(zc_info_size))
  2862				return -EINVAL;
  2863	
> 2864			usr_addr = (void *)(uintptr_t)(zc_info_usr_p->usr_addr);
  2865			if (!access_ok(usr_addr, zc_info_size))
  2866				return -EFAULT;
  2867	
  2868			zc_info_kern_p = kmalloc(zc_info_size, GFP_KERNEL);
  2869			if (!zc_info_kern_p)
  2870				return -ENOMEM;
  2871	
  2872			q = &sk->sk_error_queue;
  2873			skb_queue_head_init(&local_q);
  2874			spin_lock_irqsave(&q->lock, flags);
  2875			skb = skb_peek(q);
  2876			while (skb && i < zc_info_usr_p->length) {
  2877				struct sk_buff *skb_next = skb_peek_next(skb, q);
  2878	
  2879				serr = SKB_EXT_ERR(skb);
  2880				if (serr->ee.ee_errno == 0 &&
  2881				    serr->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY) {
  2882					zc_info_kern_p->info[i].hi = serr->ee.ee_data;
  2883					zc_info_kern_p->info[i].lo = serr->ee.ee_info;
  2884					zc_info_kern_p->info[i].zerocopy = !(serr->ee.ee_code
  2885									& SO_EE_CODE_ZEROCOPY_COPIED);
  2886					__skb_unlink(skb, q);
  2887					__skb_queue_tail(&local_q, skb);
  2888					i++;
  2889				}
  2890				skb = skb_next;
  2891			}
  2892			spin_unlock_irqrestore(&q->lock, flags);
  2893	
  2894			zc_info_kern_p->usr_addr = zc_info_usr_p->usr_addr;
  2895			zc_info_kern_p->length = i;
  2896	
  2897			ret = copy_to_user(usr_addr,
  2898					   zc_info_kern_p,
  2899						struct_size(zc_info_kern_p, info, i));
  2900			kfree(zc_info_kern_p);
  2901	
  2902			if (unlikely(ret)) {
  2903				spin_lock_irqsave(&q->lock, flags);
  2904				skb_queue_reverse_walk_safe(&local_q, skb, tmp) {
  2905					__skb_unlink(skb, &local_q);
  2906					__skb_queue_head(q, skb);
  2907				}
  2908				spin_unlock_irqrestore(&q->lock, flags);
  2909				return -EFAULT;
  2910			}
  2911	
  2912			while ((skb = __skb_dequeue(&local_q)))
  2913				consume_skb(skb);
  2914			break;
  2915		default:
  2916			return -EINVAL;
  2917		}
  2918		return 0;
  2919	}
  2920	EXPORT_SYMBOL(__sock_cmsg_send);
  2921
Zijian Zhang April 23, 2024, 7:51 p.m. UTC | #3
On 4/21/24 8:45 AM, Willem de Bruijn wrote:
> zijianzhang@ wrote:
>> From: Zijian Zhang <zijianzhang@bytedance.com>
>>
>> The MSG_ZEROCOPY flag enables copy avoidance for socket send calls.
>> However, zerocopy is not a free lunch. Apart from the management of user
>> pages, the combination of poll + recvmsg to receive notifications incurs
>> unignorable overhead in the applications. The overhead of such sometimes
>> might be more than the CPU savings from zerocopy. We try to solve this
>> problem with a new notification mechanism based on msgcontrol.
>> This new mechanism aims to reduce the overhead associated with receiving
>> notifications by embedding them directly into user arguments passed with
>> each sendmsg control message. By doing so, we can significantly reduce
>> the complexity and overhead for managing notifications. In an ideal
>> pattern, the user will keep calling sendmsg with SO_ZC_NOTIFICATION
>> msg_control, and the notification will be delivered as soon as possible.
>>
>> Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
>> Signed-off-by: Xiaochun Lu <xiaochun.lu@bytedance.com>
>> ---
>>   arch/alpha/include/uapi/asm/socket.h  |  2 +
>>   arch/mips/include/uapi/asm/socket.h   |  2 +
>>   arch/parisc/include/uapi/asm/socket.h |  2 +
>>   arch/sparc/include/uapi/asm/socket.h  |  2 +
>>   include/uapi/asm-generic/socket.h     |  2 +
>>   include/uapi/linux/socket.h           | 16 ++++++
>>   net/core/sock.c                       | 70 +++++++++++++++++++++++++++
>>   7 files changed, 96 insertions(+)
>>
>> diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
>> index e94f621903fe..b24622a9cd47 100644
>> --- a/arch/alpha/include/uapi/asm/socket.h
>> +++ b/arch/alpha/include/uapi/asm/socket.h
>> @@ -140,6 +140,8 @@
>>   #define SO_PASSPIDFD		76
>>   #define SO_PEERPIDFD		77
>>   
>> +#define SO_ZC_NOTIFICATION 78
>> +
> 
> SCM_ for cmsgs
> 

Ack.

>>   /*
>>    * Desired design of maximum size and alignment (see RFC2553)
>>    */
>> @@ -35,4 +37,18 @@ struct __kernel_sockaddr_storage {
>>   #define SOCK_TXREHASH_DISABLED	0
>>   #define SOCK_TXREHASH_ENABLED	1
>>   
>> +#define SOCK_ZC_INFO_MAX 256
>> +
>> +struct zc_info_elem {
>> +	__u32 lo;
>> +	__u32 hi;
>> +	__u8 zerocopy;
>> +};
>> +
>> +struct zc_info_usr {
>> +	__u64 usr_addr;
>> +	unsigned int length;
>> +	struct zc_info_elem info[];
>> +};
>> +
> 
> Don't pass a pointer to user memory, just have msg_control point to an
> array of zc_info_elem.
> 

Ack.

>>   #endif /* _UAPI_LINUX_SOCKET_H */
>> diff --git a/net/core/sock.c b/net/core/sock.c
>> index fe9195186c13..13f06480f2d8 100644
>> --- a/net/core/sock.c
>> +++ b/net/core/sock.c
>> @@ -2809,6 +2809,13 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
>>   		     struct sockcm_cookie *sockc)
>>   {
>>   	u32 tsflags;
>> +	int ret, zc_info_size, i = 0;
>> +	unsigned long flags;
>> +	struct sk_buff_head *q, local_q;
>> +	struct sk_buff *skb, *tmp;
>> +	struct sock_exterr_skb *serr;
>> +	struct zc_info_usr *zc_info_usr_p, *zc_info_kern_p;
>> +	void __user	*usr_addr;
> 
> Please wrap the case in parentheses and define variables in that scope
> (Since there are so many variables for this case only.)
> 
>>   
>>   	switch (cmsg->cmsg_type) {
>>   	case SO_MARK:
>> @@ -2842,6 +2849,69 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
>>   	case SCM_RIGHTS:
>>   	case SCM_CREDENTIALS:
>>   		break;
>> +	case SO_ZC_NOTIFICATION:
>> +		if (!sock_flag(sk, SOCK_ZEROCOPY) || sk->sk_family == PF_RDS)
>> +			return -EINVAL;
>> +
> 
> Why allow PF_RDS without the sock flag set?
> 

PF_RDS uses POLLIN instead of POLLERR, thus this mechanism cannot work
for PF_RDS. I am rejecting any PF_RDS socket here.

>> +		zc_info_usr_p = (struct zc_info_usr *)CMSG_DATA(cmsg);
>> +		if (zc_info_usr_p->length <= 0 || zc_info_usr_p->length > SOCK_ZC_INFO_MAX)
>> +			return -EINVAL;
>> +
>> +		zc_info_size = struct_size(zc_info_usr_p, info, zc_info_usr_p->length);
>> +		if (cmsg->cmsg_len != CMSG_LEN(zc_info_size))
>> +			return -EINVAL;
> 
> By passing a straightforward array, the array len can be inferred from
> cmsg_len, simplifying all these checks.
> 
> See for instance how SO_DEVMEM_DONTNEED returns an array of tokens to
> the kernel.
> 

Ack.

>> +
>> +		usr_addr = (void *)(uintptr_t)(zc_info_usr_p->usr_addr);
>> +		if (!access_ok(usr_addr, zc_info_size))
>> +			return -EFAULT;
>> +
>> +		zc_info_kern_p = kmalloc(zc_info_size, GFP_KERNEL);
>> +		if (!zc_info_kern_p)
>> +			return -ENOMEM;
>> +
>> +		q = &sk->sk_error_queue;
>> +		skb_queue_head_init(&local_q);
>> +		spin_lock_irqsave(&q->lock, flags);
>> +		skb = skb_peek(q);
>> +		while (skb && i < zc_info_usr_p->length) {
>> +			struct sk_buff *skb_next = skb_peek_next(skb, q);
>> +
>> +			serr = SKB_EXT_ERR(skb);
>> +			if (serr->ee.ee_errno == 0 &&
>> +			    serr->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY) {
>> +				zc_info_kern_p->info[i].hi = serr->ee.ee_data;
>> +				zc_info_kern_p->info[i].lo = serr->ee.ee_info;
>> +				zc_info_kern_p->info[i].zerocopy = !(serr->ee.ee_code
>> +								& SO_EE_CODE_ZEROCOPY_COPIED);
>> +				__skb_unlink(skb, q);
>> +				__skb_queue_tail(&local_q, skb);
>> +				i++;
>> +			}
>> +			skb = skb_next;
>> +		}
>> +		spin_unlock_irqrestore(&q->lock, flags);
> 
> In almost all sane cases, all outstanding notifications can be passed
> to userspace.
> 
> It may be interesting to experiment with briefly taking the lock to
> move to a private list. See for instance net_rx_action.
> 

Nice catch.

> Then if userspace cannot handle all notifications, the rest have to be
> spliced back. This can reorder notifications. But rare reordering is
> not a correctness issue.
> 
> I would choose the more complex splice approach only if it shows
> benefit, i.e., if taking the lock does contend with error enqueue
> events.
> 

Maybe when the network is very busy, it will contend with
__msg_zerocopy_callback(where new notifications are added)?
I think splice is a better idea.

>> +
>> +		zc_info_kern_p->usr_addr = zc_info_usr_p->usr_addr;
>> +		zc_info_kern_p->length = i;
>> +
>> +		ret = copy_to_user(usr_addr,
>> +				   zc_info_kern_p,
>> +					struct_size(zc_info_kern_p, info, i));
> 
> You'll still need to support the gnarly MSG_CMSG_COMPAT version too.
> 

Assume users pass in zc_info_elem array pointer here, I may use
in_compat_syscall function to check if it's compat. If so, I can
use compat_ptr to convert it. Is it correct?

> Wait, is this the reason to pass a usr_addr explicitly? To get around
> any compat issues?
> 

Yes, I try to make it u64 regardless of 32-bit or 64-bit program, but
it is indeed ugly though.

> Or even the entire issue of having to copy msg_sys->msg_control to
> user if !msg_control_is_user.
> 
> I suppose this simplifies a lot in terms of development. If making the
> user interface uglier.
> 
> IMHO the sane interface should be used eventually. There may also be
> other uses of passing msg_control data up to userspace from sendmsg.
> 
> But this approach works for now for evaluation and discussion.
> 
>
diff mbox series

Patch

diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index e94f621903fe..b24622a9cd47 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -140,6 +140,8 @@ 
 #define SO_PASSPIDFD		76
 #define SO_PEERPIDFD		77
 
+#define SO_ZC_NOTIFICATION 78
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 60ebaed28a4c..638a4ebbffa7 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -151,6 +151,8 @@ 
 #define SO_PASSPIDFD		76
 #define SO_PEERPIDFD		77
 
+#define SO_ZC_NOTIFICATION 78
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index be264c2b1a11..393f1a6e9562 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -132,6 +132,8 @@ 
 #define SO_PASSPIDFD		0x404A
 #define SO_PEERPIDFD		0x404B
 
+#define SO_ZC_NOTIFICATION 0x404C
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 682da3714686..4fe127b0682b 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -133,6 +133,8 @@ 
 #define SO_PASSPIDFD             0x0055
 #define SO_PEERPIDFD             0x0056
 
+#define SO_ZC_NOTIFICATION 0x0057
+
 #if !defined(__KERNEL__)
 
 
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 8ce8a39a1e5f..acbbbe7ac06a 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -135,6 +135,8 @@ 
 #define SO_PASSPIDFD		76
 #define SO_PEERPIDFD		77
 
+#define SO_ZC_NOTIFICATION 78
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/include/uapi/linux/socket.h b/include/uapi/linux/socket.h
index d3fcd3b5ec53..60e4db759d49 100644
--- a/include/uapi/linux/socket.h
+++ b/include/uapi/linux/socket.h
@@ -2,6 +2,8 @@ 
 #ifndef _UAPI_LINUX_SOCKET_H
 #define _UAPI_LINUX_SOCKET_H
 
+#include <linux/types.h>
+
 /*
  * Desired design of maximum size and alignment (see RFC2553)
  */
@@ -35,4 +37,18 @@  struct __kernel_sockaddr_storage {
 #define SOCK_TXREHASH_DISABLED	0
 #define SOCK_TXREHASH_ENABLED	1
 
+#define SOCK_ZC_INFO_MAX 256
+
+struct zc_info_elem {
+	__u32 lo;
+	__u32 hi;
+	__u8 zerocopy;
+};
+
+struct zc_info_usr {
+	__u64 usr_addr;
+	unsigned int length;
+	struct zc_info_elem info[];
+};
+
 #endif /* _UAPI_LINUX_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index fe9195186c13..13f06480f2d8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2809,6 +2809,13 @@  int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
 		     struct sockcm_cookie *sockc)
 {
 	u32 tsflags;
+	int ret, zc_info_size, i = 0;
+	unsigned long flags;
+	struct sk_buff_head *q, local_q;
+	struct sk_buff *skb, *tmp;
+	struct sock_exterr_skb *serr;
+	struct zc_info_usr *zc_info_usr_p, *zc_info_kern_p;
+	void __user	*usr_addr;
 
 	switch (cmsg->cmsg_type) {
 	case SO_MARK:
@@ -2842,6 +2849,69 @@  int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
 	case SCM_RIGHTS:
 	case SCM_CREDENTIALS:
 		break;
+	case SO_ZC_NOTIFICATION:
+		if (!sock_flag(sk, SOCK_ZEROCOPY) || sk->sk_family == PF_RDS)
+			return -EINVAL;
+
+		zc_info_usr_p = (struct zc_info_usr *)CMSG_DATA(cmsg);
+		if (zc_info_usr_p->length <= 0 || zc_info_usr_p->length > SOCK_ZC_INFO_MAX)
+			return -EINVAL;
+
+		zc_info_size = struct_size(zc_info_usr_p, info, zc_info_usr_p->length);
+		if (cmsg->cmsg_len != CMSG_LEN(zc_info_size))
+			return -EINVAL;
+
+		usr_addr = (void *)(uintptr_t)(zc_info_usr_p->usr_addr);
+		if (!access_ok(usr_addr, zc_info_size))
+			return -EFAULT;
+
+		zc_info_kern_p = kmalloc(zc_info_size, GFP_KERNEL);
+		if (!zc_info_kern_p)
+			return -ENOMEM;
+
+		q = &sk->sk_error_queue;
+		skb_queue_head_init(&local_q);
+		spin_lock_irqsave(&q->lock, flags);
+		skb = skb_peek(q);
+		while (skb && i < zc_info_usr_p->length) {
+			struct sk_buff *skb_next = skb_peek_next(skb, q);
+
+			serr = SKB_EXT_ERR(skb);
+			if (serr->ee.ee_errno == 0 &&
+			    serr->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY) {
+				zc_info_kern_p->info[i].hi = serr->ee.ee_data;
+				zc_info_kern_p->info[i].lo = serr->ee.ee_info;
+				zc_info_kern_p->info[i].zerocopy = !(serr->ee.ee_code
+								& SO_EE_CODE_ZEROCOPY_COPIED);
+				__skb_unlink(skb, q);
+				__skb_queue_tail(&local_q, skb);
+				i++;
+			}
+			skb = skb_next;
+		}
+		spin_unlock_irqrestore(&q->lock, flags);
+
+		zc_info_kern_p->usr_addr = zc_info_usr_p->usr_addr;
+		zc_info_kern_p->length = i;
+
+		ret = copy_to_user(usr_addr,
+				   zc_info_kern_p,
+					struct_size(zc_info_kern_p, info, i));
+		kfree(zc_info_kern_p);
+
+		if (unlikely(ret)) {
+			spin_lock_irqsave(&q->lock, flags);
+			skb_queue_reverse_walk_safe(&local_q, skb, tmp) {
+				__skb_unlink(skb, &local_q);
+				__skb_queue_head(q, skb);
+			}
+			spin_unlock_irqrestore(&q->lock, flags);
+			return -EFAULT;
+		}
+
+		while ((skb = __skb_dequeue(&local_q)))
+			consume_skb(skb);
+		break;
 	default:
 		return -EINVAL;
 	}