Message ID | 20240626193403.3854451-3-zijianzhang@bytedance.com (mailing list archive) |
---|---|
State | Changes Requested |
Delegated to: | Netdev Maintainers |
Headers | show |
Series | net: A lightweight zero-copy notification | expand |
On Wed, 26 Jun 2024 19:34:01 +0000 zijianzhang@bytedance.com wrote: > From: Zijian Zhang <zijianzhang@bytedance.com> > > Since ____sys_sendmsg creates a kernel copy of msg_control and passes > that to the callees, put_cmsg will write into this kernel buffer. If > people want to piggyback some information like timestamps upon returning > of sendmsg. ____sys_sendmsg will have to copy_to_user to the original buf, > which is not supported. As a result, users typically have to call recvmsg > on the ERRMSG_QUEUE of the socket, incurring extra system call overhead. > > This commit supports copying cmsg to userspace in TX path by introducing > a flag MSG_CMSG_COPY_TO_USER in struct msghdr to guide the copy logic > upon returning of ___sys_sendmsg. sparse complains about the annotations: net/socket.c:2635:30: warning: incorrect type in assignment (different address spaces) net/socket.c:2635:30: expected void *msg_control net/socket.c:2635:30: got void [noderef] __user *[noderef] __user msg_control net/socket.c:2629:49: warning: dereference of noderef expression
Hi, kernel test robot noticed the following build warnings: [auto build test WARNING on net-next/main] url: https://github.com/intel-lab-lkp/linux/commits/zijianzhang-bytedance-com/selftests-fix-OOM-problem-in-msg_zerocopy-selftest/20240627-150801 base: net-next/main patch link: https://lore.kernel.org/r/20240626193403.3854451-3-zijianzhang%40bytedance.com patch subject: [PATCH net-next v6 2/4] sock: support copy cmsg to userspace in TX path config: i386-randconfig-062-20240628 (https://download.01.org/0day-ci/archive/20240628/202406281051.NAiS477l-lkp@intel.com/config) compiler: gcc-13 (Ubuntu 13.2.0-4ubuntu3) 13.2.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240628/202406281051.NAiS477l-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202406281051.NAiS477l-lkp@intel.com/ sparse warnings: (new ones prefixed by >>) >> net/socket.c:2635:30: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected void *msg_control @@ got void [noderef] __user *[noderef] __user msg_control @@ net/socket.c:2635:30: sparse: expected void *msg_control net/socket.c:2635:30: sparse: got void [noderef] __user *[noderef] __user msg_control >> net/socket.c:2629:49: sparse: sparse: dereference of noderef expression vim +2635 net/socket.c 2623 2624 static int sendmsg_copy_cmsg_to_user(struct msghdr *msg_sys, 2625 struct user_msghdr __user *umsg) 2626 { 2627 struct compat_msghdr __user *umsg_compat = 2628 (struct compat_msghdr __user *)umsg; > 2629 unsigned long cmsg_ptr = (unsigned long)umsg->msg_control; 2630 unsigned int flags = msg_sys->msg_flags; 2631 struct msghdr msg_user = *msg_sys; 2632 struct cmsghdr *cmsg; 2633 int err; 2634 > 2635 msg_user.msg_control = umsg->msg_control; 2636 msg_user.msg_control_is_user = true; 2637 for_each_cmsghdr(cmsg, msg_sys) { 2638 if (!CMSG_OK(msg_sys, cmsg)) 2639 break; 2640 if (cmsg_copy_to_user(cmsg)) 2641 put_cmsg(&msg_user, cmsg->cmsg_level, cmsg->cmsg_type, 2642 cmsg->cmsg_len - sizeof(*cmsg), CMSG_DATA(cmsg)); 2643 } 2644 2645 err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT), COMPAT_FLAGS(umsg)); 2646 if (err) 2647 return err; 2648 if (MSG_CMSG_COMPAT & flags) 2649 err = __put_user((unsigned long)msg_user.msg_control - cmsg_ptr, 2650 &umsg_compat->msg_controllen); 2651 else 2652 err = __put_user((unsigned long)msg_user.msg_control - cmsg_ptr, 2653 &umsg->msg_controllen); 2654 return err; 2655 } 2656
zijianzhang@ wrote: > From: Zijian Zhang <zijianzhang@bytedance.com> > > Since ____sys_sendmsg creates a kernel copy of msg_control and passes > that to the callees, put_cmsg will write into this kernel buffer. If > people want to piggyback some information like timestamps upon returning > of sendmsg. ____sys_sendmsg will have to copy_to_user to the original buf, > which is not supported. As a result, users typically have to call recvmsg > on the ERRMSG_QUEUE of the socket, incurring extra system call overhead. > > This commit supports copying cmsg to userspace in TX path by introducing > a flag MSG_CMSG_COPY_TO_USER in struct msghdr to guide the copy logic > upon returning of ___sys_sendmsg. > > Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com> > Signed-off-by: Xiaochun Lu <xiaochun.lu@bytedance.com> > --- > include/linux/socket.h | 6 ++++++ > net/core/sock.c | 2 ++ > net/ipv4/ip_sockglue.c | 2 ++ > net/ipv6/datagram.c | 3 +++ > net/socket.c | 45 ++++++++++++++++++++++++++++++++++++++++++ > 5 files changed, 58 insertions(+) > > diff --git a/include/linux/socket.h b/include/linux/socket.h > index 89d16b90370b..35adc30c9db6 100644 > --- a/include/linux/socket.h > +++ b/include/linux/socket.h > @@ -168,6 +168,11 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr > return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg); > } > > +static inline bool cmsg_copy_to_user(struct cmsghdr *__cmsg) > +{ > + return 0; > +} > + > static inline size_t msg_data_left(struct msghdr *msg) > { > return iov_iter_count(&msg->msg_iter); > @@ -329,6 +334,7 @@ struct ucred { > > #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ > #define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */ > +#define MSG_CMSG_COPY_TO_USER 0x10000000 /* Copy cmsg to user space */ Careful that userspace must not be able to set this bit. See also MSG_INTERNAL_SENDMSG_FLAGS. Perhaps better to define a bit like msg_control_is_user. > #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ > #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file > descriptor received through > diff --git a/net/core/sock.c b/net/core/sock.c > index 9abc4fe25953..4a766a91ff5c 100644 > --- a/net/core/sock.c > +++ b/net/core/sock.c > @@ -2879,6 +2879,8 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, > for_each_cmsghdr(cmsg, msg) { > if (!CMSG_OK(msg, cmsg)) > return -EINVAL; > + if (cmsg_copy_to_user(cmsg)) > + msg->msg_flags |= MSG_CMSG_COPY_TO_USER; Probably better to pass msg to __sock_cmsg_send and only set this field in the specific cmsg handler that uses it. > if (cmsg->cmsg_level != SOL_SOCKET) > continue; > ret = __sock_cmsg_send(sk, cmsg, sockc); > diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c > index cf377377b52d..464d08b27fa8 100644 > --- a/net/ipv4/ip_sockglue.c > +++ b/net/ipv4/ip_sockglue.c > @@ -249,6 +249,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, > for_each_cmsghdr(cmsg, msg) { > if (!CMSG_OK(msg, cmsg)) > return -EINVAL; > + if (cmsg_copy_to_user(cmsg)) > + msg->msg_flags |= MSG_CMSG_COPY_TO_USER; > #if IS_ENABLED(CONFIG_IPV6) > if (allow_ipv6 && > cmsg->cmsg_level == SOL_IPV6 && > diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c > index fff78496803d..b0341faf7f83 100644 > --- a/net/ipv6/datagram.c > +++ b/net/ipv6/datagram.c > @@ -776,6 +776,9 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, > goto exit_f; > } > > + if (cmsg_copy_to_user(cmsg)) > + msg->msg_flags |= MSG_CMSG_COPY_TO_USER; > + > if (cmsg->cmsg_level == SOL_SOCKET) { > err = __sock_cmsg_send(sk, cmsg, &ipc6->sockc); > if (err) > diff --git a/net/socket.c b/net/socket.c > index e416920e9399..6523cf5a7f32 100644 > --- a/net/socket.c > +++ b/net/socket.c > @@ -2621,6 +2621,39 @@ static int sendmsg_copy_msghdr(struct msghdr *msg, > return 0; > } > > +static int sendmsg_copy_cmsg_to_user(struct msghdr *msg_sys, > + struct user_msghdr __user *umsg) > +{ > + struct compat_msghdr __user *umsg_compat = > + (struct compat_msghdr __user *)umsg; > + unsigned long cmsg_ptr = (unsigned long)umsg->msg_control; > + unsigned int flags = msg_sys->msg_flags; > + struct msghdr msg_user = *msg_sys; > + struct cmsghdr *cmsg; > + int err; > + > + msg_user.msg_control = umsg->msg_control; > + msg_user.msg_control_is_user = true; > + for_each_cmsghdr(cmsg, msg_sys) { > + if (!CMSG_OK(msg_sys, cmsg)) > + break; > + if (cmsg_copy_to_user(cmsg)) > + put_cmsg(&msg_user, cmsg->cmsg_level, cmsg->cmsg_type, > + cmsg->cmsg_len - sizeof(*cmsg), CMSG_DATA(cmsg)); > + } Alternatively just copy the entire msg_control if any cmsg wants to be copied back. The others will be unmodified. No need to iterate then. > + > + err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT), COMPAT_FLAGS(umsg)); > + if (err) > + return err; Does this value need to be written? > + if (MSG_CMSG_COMPAT & flags) > + err = __put_user((unsigned long)msg_user.msg_control - cmsg_ptr, > + &umsg_compat->msg_controllen); > + else > + err = __put_user((unsigned long)msg_user.msg_control - cmsg_ptr, > + &umsg->msg_controllen); > + return err; > +} > + > static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, > struct msghdr *msg_sys, unsigned int flags, > struct used_address *used_address, > @@ -2638,6 +2671,18 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, > > err = ____sys_sendmsg(sock, msg_sys, flags, used_address, > allowed_msghdr_flags); > + if (err < 0) > + goto out; > + > + if (msg_sys->msg_flags & MSG_CMSG_COPY_TO_USER) { > + ssize_t len = err; > + > + err = sendmsg_copy_cmsg_to_user(msg_sys, msg); > + if (err) > + goto out; > + err = len; > + } > +out: > kfree(iov); > return err; > } > -- > 2.20.1 >
On 6/30/24 7:43 AM, Willem de Bruijn wrote: > zijianzhang@ wrote: >> From: Zijian Zhang <zijianzhang@bytedance.com> >> >> Since ____sys_sendmsg creates a kernel copy of msg_control and passes >> that to the callees, put_cmsg will write into this kernel buffer. If >> people want to piggyback some information like timestamps upon returning >> of sendmsg. ____sys_sendmsg will have to copy_to_user to the original buf, >> which is not supported. As a result, users typically have to call recvmsg >> on the ERRMSG_QUEUE of the socket, incurring extra system call overhead. >> >> This commit supports copying cmsg to userspace in TX path by introducing >> a flag MSG_CMSG_COPY_TO_USER in struct msghdr to guide the copy logic >> upon returning of ___sys_sendmsg. >> >> Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com> >> Signed-off-by: Xiaochun Lu <xiaochun.lu@bytedance.com> >> --- >> include/linux/socket.h | 6 ++++++ >> net/core/sock.c | 2 ++ >> net/ipv4/ip_sockglue.c | 2 ++ >> net/ipv6/datagram.c | 3 +++ >> net/socket.c | 45 ++++++++++++++++++++++++++++++++++++++++++ >> 5 files changed, 58 insertions(+) >> >> diff --git a/include/linux/socket.h b/include/linux/socket.h >> index 89d16b90370b..35adc30c9db6 100644 >> --- a/include/linux/socket.h >> +++ b/include/linux/socket.h >> @@ -168,6 +168,11 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr >> return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg); >> } >> >> +static inline bool cmsg_copy_to_user(struct cmsghdr *__cmsg) >> +{ >> + return 0; >> +} >> + >> static inline size_t msg_data_left(struct msghdr *msg) >> { >> return iov_iter_count(&msg->msg_iter); >> @@ -329,6 +334,7 @@ struct ucred { >> >> #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ >> #define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */ >> +#define MSG_CMSG_COPY_TO_USER 0x10000000 /* Copy cmsg to user space */ > > Careful that userspace must not be able to set this bit. See also > MSG_INTERNAL_SENDMSG_FLAGS. > > Perhaps better to define a bit like msg_control_is_user. > >> #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ >> #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file >> descriptor received through >> diff --git a/net/core/sock.c b/net/core/sock.c >> index 9abc4fe25953..4a766a91ff5c 100644 >> --- a/net/core/sock.c >> +++ b/net/core/sock.c >> @@ -2879,6 +2879,8 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, >> for_each_cmsghdr(cmsg, msg) { >> if (!CMSG_OK(msg, cmsg)) >> return -EINVAL; >> + if (cmsg_copy_to_user(cmsg)) >> + msg->msg_flags |= MSG_CMSG_COPY_TO_USER; > > Probably better to pass msg to __sock_cmsg_send and only set this > field in the specific cmsg handler that uses it. > Thanks for the above suggestions! >> if (cmsg->cmsg_level != SOL_SOCKET) >> continue; >> ret = __sock_cmsg_send(sk, cmsg, sockc); ... >> +static int sendmsg_copy_cmsg_to_user(struct msghdr *msg_sys, >> + struct user_msghdr __user *umsg) >> +{ >> + struct compat_msghdr __user *umsg_compat = >> + (struct compat_msghdr __user *)umsg; >> + unsigned long cmsg_ptr = (unsigned long)umsg->msg_control; >> + unsigned int flags = msg_sys->msg_flags; >> + struct msghdr msg_user = *msg_sys; >> + struct cmsghdr *cmsg; >> + int err; >> + >> + msg_user.msg_control = umsg->msg_control; >> + msg_user.msg_control_is_user = true; >> + for_each_cmsghdr(cmsg, msg_sys) { >> + if (!CMSG_OK(msg_sys, cmsg)) >> + break; >> + if (cmsg_copy_to_user(cmsg)) >> + put_cmsg(&msg_user, cmsg->cmsg_level, cmsg->cmsg_type, >> + cmsg->cmsg_len - sizeof(*cmsg), CMSG_DATA(cmsg)); >> + } > > Alternatively just copy the entire msg_control if any cmsg wants to > be copied back. The others will be unmodified. No need to iterate > then. > Copy the entire msg_control via copy_to_user does not take MSG_CMSG_COMPAT into account. I may have to use put_cmsg to deal with the compat version, and thus have to keep the for loop? If so, I may keep the function cmsg_copy_to_user to avoid extra copy? >> + >> + err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT), COMPAT_FLAGS(umsg)); >> + if (err) >> + return err; > > Does this value need to be written? > I did this according to ____sys_recvmsg, maybe it's useful to export flag like MSG_CTRUNC to users? >> + if (MSG_CMSG_COMPAT & flags) >> + err = __put_user((unsigned long)msg_user.msg_control - cmsg_ptr, >> + &umsg_compat->msg_controllen); >> + else >> + err = __put_user((unsigned long)msg_user.msg_control - cmsg_ptr, >> + &umsg->msg_controllen); >> + return err; >> +} >> + >> static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, >> struct msghdr *msg_sys, unsigned int flags, >> struct used_address *used_address, >> @@ -2638,6 +2671,18 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, >> >> err = ____sys_sendmsg(sock, msg_sys, flags, used_address, >> allowed_msghdr_flags); >> + if (err < 0) >> + goto out; >> + >> + if (msg_sys->msg_flags & MSG_CMSG_COPY_TO_USER) { >> + ssize_t len = err; >> + >> + err = sendmsg_copy_cmsg_to_user(msg_sys, msg); >> + if (err) >> + goto out; >> + err = len; >> + } >> +out: >> kfree(iov); >> return err; >> } >> -- >> 2.20.1 >> > >
diff --git a/include/linux/socket.h b/include/linux/socket.h index 89d16b90370b..35adc30c9db6 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -168,6 +168,11 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr return __cmsg_nxthdr(__msg->msg_control, __msg->msg_controllen, __cmsg); } +static inline bool cmsg_copy_to_user(struct cmsghdr *__cmsg) +{ + return 0; +} + static inline size_t msg_data_left(struct msghdr *msg) { return iov_iter_count(&msg->msg_iter); @@ -329,6 +334,7 @@ struct ucred { #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */ +#define MSG_CMSG_COPY_TO_USER 0x10000000 /* Copy cmsg to user space */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file descriptor received through diff --git a/net/core/sock.c b/net/core/sock.c index 9abc4fe25953..4a766a91ff5c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2879,6 +2879,8 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; + if (cmsg_copy_to_user(cmsg)) + msg->msg_flags |= MSG_CMSG_COPY_TO_USER; if (cmsg->cmsg_level != SOL_SOCKET) continue; ret = __sock_cmsg_send(sk, cmsg, sockc); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index cf377377b52d..464d08b27fa8 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -249,6 +249,8 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; + if (cmsg_copy_to_user(cmsg)) + msg->msg_flags |= MSG_CMSG_COPY_TO_USER; #if IS_ENABLED(CONFIG_IPV6) if (allow_ipv6 && cmsg->cmsg_level == SOL_IPV6 && diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index fff78496803d..b0341faf7f83 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -776,6 +776,9 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, goto exit_f; } + if (cmsg_copy_to_user(cmsg)) + msg->msg_flags |= MSG_CMSG_COPY_TO_USER; + if (cmsg->cmsg_level == SOL_SOCKET) { err = __sock_cmsg_send(sk, cmsg, &ipc6->sockc); if (err) diff --git a/net/socket.c b/net/socket.c index e416920e9399..6523cf5a7f32 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2621,6 +2621,39 @@ static int sendmsg_copy_msghdr(struct msghdr *msg, return 0; } +static int sendmsg_copy_cmsg_to_user(struct msghdr *msg_sys, + struct user_msghdr __user *umsg) +{ + struct compat_msghdr __user *umsg_compat = + (struct compat_msghdr __user *)umsg; + unsigned long cmsg_ptr = (unsigned long)umsg->msg_control; + unsigned int flags = msg_sys->msg_flags; + struct msghdr msg_user = *msg_sys; + struct cmsghdr *cmsg; + int err; + + msg_user.msg_control = umsg->msg_control; + msg_user.msg_control_is_user = true; + for_each_cmsghdr(cmsg, msg_sys) { + if (!CMSG_OK(msg_sys, cmsg)) + break; + if (cmsg_copy_to_user(cmsg)) + put_cmsg(&msg_user, cmsg->cmsg_level, cmsg->cmsg_type, + cmsg->cmsg_len - sizeof(*cmsg), CMSG_DATA(cmsg)); + } + + err = __put_user((msg_sys->msg_flags & ~MSG_CMSG_COMPAT), COMPAT_FLAGS(umsg)); + if (err) + return err; + if (MSG_CMSG_COMPAT & flags) + err = __put_user((unsigned long)msg_user.msg_control - cmsg_ptr, + &umsg_compat->msg_controllen); + else + err = __put_user((unsigned long)msg_user.msg_control - cmsg_ptr, + &umsg->msg_controllen); + return err; +} + static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, struct msghdr *msg_sys, unsigned int flags, struct used_address *used_address, @@ -2638,6 +2671,18 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, err = ____sys_sendmsg(sock, msg_sys, flags, used_address, allowed_msghdr_flags); + if (err < 0) + goto out; + + if (msg_sys->msg_flags & MSG_CMSG_COPY_TO_USER) { + ssize_t len = err; + + err = sendmsg_copy_cmsg_to_user(msg_sys, msg); + if (err) + goto out; + err = len; + } +out: kfree(iov); return err; }