From patchwork Mon Jan 24 09:43:18 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Hao Xu X-Patchwork-Id: 12721826 X-Patchwork-Delegate: kuba@kernel.org Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 3DF5CC433FE for ; Mon, 24 Jan 2022 09:43:38 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S234919AbiAXJnf (ORCPT ); Mon, 24 Jan 2022 04:43:35 -0500 Received: from out4436.biz.mail.alibaba.com ([47.88.44.36]:26492 "EHLO out4436.biz.mail.alibaba.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231812AbiAXJnc (ORCPT ); Mon, 24 Jan 2022 04:43:32 -0500 X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R231e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=e01e04423;MF=haoxu@linux.alibaba.com;NM=1;PH=DS;RN=10;SR=0;TI=SMTPD_---0V2iRAzW_1643017409; Received: from hao-A29R.hz.ali.com(mailfrom:haoxu@linux.alibaba.com fp:SMTPD_---0V2iRAzW_1643017409) by smtp.aliyun-inc.com(127.0.0.1); Mon, 24 Jan 2022 17:43:30 +0800 From: Hao Xu To: netdev@vger.kernel.org, io-uring@vger.kernel.org Cc: Jens Axboe , Pavel Begunkov , Eric Dumazet , "David S . Miller" , Jakub Kicinski , Hideaki YOSHIFUJI , David Ahern , Joseph Qi Subject: [PATCH 1/3] net-zerocopy: split zerocopy receive to several parts Date: Mon, 24 Jan 2022 17:43:18 +0800 Message-Id: <20220124094320.900713-2-haoxu@linux.alibaba.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220124094320.900713-1-haoxu@linux.alibaba.com> References: <20220124094320.900713-1-haoxu@linux.alibaba.com> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org X-Patchwork-Delegate: kuba@kernel.org Split the zerocopy receive code into several parts so that they can be reused easily in other places such as io_uring. 
Signed-off-by: Hao Xu Reported-by: kernel test robot --- include/net/tcp.h | 5 ++ net/ipv4/tcp.c | 128 +++++++++++++++++++++++++++------------------- 2 files changed, 80 insertions(+), 53 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 44e442bf23f9..ba0e7957bdfb 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -419,6 +419,11 @@ void tcp_data_ready(struct sock *sk); #ifdef CONFIG_MMU int tcp_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); +int zc_receive_check(struct tcp_zerocopy_receive *zc, int *lenp, + char __user *optval, int __user *optlen); +int zc_receive_update(struct sock *sk, struct tcp_zerocopy_receive *zc, int len, + char __user *optval, struct scm_timestamping_internal *tss, + int err); #endif void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3b75836db19b..d47e3ccf7cdb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3936,6 +3936,76 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, return stats; } +int zc_receive_check(struct tcp_zerocopy_receive *zc, int *lenp, + char __user *optval, int __user *optlen) +{ + int len = *lenp, err; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0 || + len < offsetofend(struct tcp_zerocopy_receive, length)) + return -EINVAL; + if (unlikely(len > sizeof(*zc))) { + err = check_zeroed_user(optval + sizeof(*zc), + len - sizeof(*zc)); + if (err < 1) + return err == 0 ? 
-EINVAL : err; + len = sizeof(*zc); + if (put_user(len, optlen)) + return -EFAULT; + } + if (copy_from_user(zc, optval, len)) + return -EFAULT; + + if (zc->reserved) + return -EINVAL; + if (zc->msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS)) + return -EINVAL; + + *lenp = len; + return 0; +} + +int zc_receive_update(struct sock *sk, struct tcp_zerocopy_receive *zc, int len, + char __user *optval, struct scm_timestamping_internal *tss, + int err) +{ + sk_defer_free_flush(sk); + if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) + goto zerocopy_rcv_cmsg; + switch (len) { + case offsetofend(struct tcp_zerocopy_receive, msg_flags): + goto zerocopy_rcv_cmsg; + case offsetofend(struct tcp_zerocopy_receive, msg_controllen): + case offsetofend(struct tcp_zerocopy_receive, msg_control): + case offsetofend(struct tcp_zerocopy_receive, flags): + case offsetofend(struct tcp_zerocopy_receive, copybuf_len): + case offsetofend(struct tcp_zerocopy_receive, copybuf_address): + case offsetofend(struct tcp_zerocopy_receive, err): + goto zerocopy_rcv_sk_err; + case offsetofend(struct tcp_zerocopy_receive, inq): + goto zerocopy_rcv_inq; + case offsetofend(struct tcp_zerocopy_receive, length): + default: + goto zerocopy_rcv_out; + } +zerocopy_rcv_cmsg: + if (zc->msg_flags & TCP_CMSG_TS) + tcp_zc_finalize_rx_tstamp(sk, zc, tss); + else + zc->msg_flags = 0; +zerocopy_rcv_sk_err: + if (!err) + zc->err = sock_error(sk); +zerocopy_rcv_inq: + zc->inq = tcp_inq_hint(sk); +zerocopy_rcv_out: + if (!err && copy_to_user(optval, zc, len)) + err = -EFAULT; + return err; +} + static int do_tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { @@ -4192,64 +4262,16 @@ static int do_tcp_getsockopt(struct sock *sk, int level, struct tcp_zerocopy_receive zc = {}; int err; - if (get_user(len, optlen)) - return -EFAULT; - if (len < 0 || - len < offsetofend(struct tcp_zerocopy_receive, length)) - return -EINVAL; - if (unlikely(len > sizeof(zc))) { - err = 
check_zeroed_user(optval + sizeof(zc), - len - sizeof(zc)); - if (err < 1) - return err == 0 ? -EINVAL : err; - len = sizeof(zc); - if (put_user(len, optlen)) - return -EFAULT; - } - if (copy_from_user(&zc, optval, len)) - return -EFAULT; - if (zc.reserved) - return -EINVAL; - if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS)) - return -EINVAL; + err = zc_receive_check(&zc, &len, optval, optlen); + if (err) + return err; + lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc, &tss); err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, &zc, &len, err); release_sock(sk); - sk_defer_free_flush(sk); - if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) - goto zerocopy_rcv_cmsg; - switch (len) { - case offsetofend(struct tcp_zerocopy_receive, msg_flags): - goto zerocopy_rcv_cmsg; - case offsetofend(struct tcp_zerocopy_receive, msg_controllen): - case offsetofend(struct tcp_zerocopy_receive, msg_control): - case offsetofend(struct tcp_zerocopy_receive, flags): - case offsetofend(struct tcp_zerocopy_receive, copybuf_len): - case offsetofend(struct tcp_zerocopy_receive, copybuf_address): - case offsetofend(struct tcp_zerocopy_receive, err): - goto zerocopy_rcv_sk_err; - case offsetofend(struct tcp_zerocopy_receive, inq): - goto zerocopy_rcv_inq; - case offsetofend(struct tcp_zerocopy_receive, length): - default: - goto zerocopy_rcv_out; - } -zerocopy_rcv_cmsg: - if (zc.msg_flags & TCP_CMSG_TS) - tcp_zc_finalize_rx_tstamp(sk, &zc, &tss); - else - zc.msg_flags = 0; -zerocopy_rcv_sk_err: - if (!err) - zc.err = sock_error(sk); -zerocopy_rcv_inq: - zc.inq = tcp_inq_hint(sk); -zerocopy_rcv_out: - if (!err && copy_to_user(optval, &zc, len)) - err = -EFAULT; - return err; + return zc_receive_update(sk, &zc, len, optval, &tss, err); } #endif default: From patchwork Mon Jan 24 09:43:19 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Hao Xu X-Patchwork-Id: 12721827 X-Patchwork-Delegate: 
kuba@kernel.org Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 81F7DC433EF for ; Mon, 24 Jan 2022 09:43:38 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231812AbiAXJng (ORCPT ); Mon, 24 Jan 2022 04:43:36 -0500 Received: from out30-57.freemail.mail.aliyun.com ([115.124.30.57]:47637 "EHLO out30-57.freemail.mail.aliyun.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229823AbiAXJnd (ORCPT ); Mon, 24 Jan 2022 04:43:33 -0500 X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R121e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=e01e04407;MF=haoxu@linux.alibaba.com;NM=1;PH=DS;RN=10;SR=0;TI=SMTPD_---0V2iRAzl_1643017410; Received: from hao-A29R.hz.ali.com(mailfrom:haoxu@linux.alibaba.com fp:SMTPD_---0V2iRAzl_1643017410) by smtp.aliyun-inc.com(127.0.0.1); Mon, 24 Jan 2022 17:43:31 +0800 From: Hao Xu To: netdev@vger.kernel.org, io-uring@vger.kernel.org Cc: Jens Axboe , Pavel Begunkov , Eric Dumazet , "David S . Miller" , Jakub Kicinski , Hideaki YOSHIFUJI , David Ahern , Joseph Qi Subject: [PATCH 2/3] net-zerocopy: remove static for tcp_zerocopy_receive() Date: Mon, 24 Jan 2022 17:43:19 +0800 Message-Id: <20220124094320.900713-3-haoxu@linux.alibaba.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220124094320.900713-1-haoxu@linux.alibaba.com> References: <20220124094320.900713-1-haoxu@linux.alibaba.com> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org X-Patchwork-Delegate: kuba@kernel.org Remove the static qualifier from tcp_zerocopy_receive() since we are going to reference it from io_uring. 
Signed-off-by: Hao Xu --- include/net/tcp.h | 3 +++ net/ipv4/tcp.c | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index ba0e7957bdfb..f4108dea6a82 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -424,6 +424,9 @@ int zc_receive_check(struct tcp_zerocopy_receive *zc, int *lenp, int zc_receive_update(struct sock *sk, struct tcp_zerocopy_receive *zc, int len, char __user *optval, struct scm_timestamping_internal *tss, int err); +int tcp_zerocopy_receive(struct sock *sk, + struct tcp_zerocopy_receive *zc, + struct scm_timestamping_internal *tss); #endif void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d47e3ccf7cdb..b08a04f58b42 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2066,9 +2066,9 @@ static void tcp_zc_finalize_rx_tstamp(struct sock *sk, } #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32 -static int tcp_zerocopy_receive(struct sock *sk, - struct tcp_zerocopy_receive *zc, - struct scm_timestamping_internal *tss) +int tcp_zerocopy_receive(struct sock *sk, + struct tcp_zerocopy_receive *zc, + struct scm_timestamping_internal *tss) { u32 length = 0, offset, vma_len, avail_len, copylen = 0; unsigned long address = (unsigned long)zc->address; From patchwork Mon Jan 24 09:43:20 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Hao Xu X-Patchwork-Id: 12721828 X-Patchwork-Delegate: kuba@kernel.org Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 0289DC433F5 for ; Mon, 24 Jan 2022 09:43:41 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S236506AbiAXJnj (ORCPT ); Mon, 24 Jan 2022 04:43:39 -0500 Received: from 
out30-42.freemail.mail.aliyun.com ([115.124.30.42]:60036 "EHLO out30-42.freemail.mail.aliyun.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234683AbiAXJni (ORCPT ); Mon, 24 Jan 2022 04:43:38 -0500 X-Alimail-AntiSpam: AC=PASS;BC=-1|-1;BR=01201311R531e4;CH=green;DM=||false|;DS=||;FP=0|-1|-1|-1|0|-1|-1|-1;HT=e01e04400;MF=haoxu@linux.alibaba.com;NM=1;PH=DS;RN=10;SR=0;TI=SMTPD_---0V2iRB-._1643017411; Received: from hao-A29R.hz.ali.com(mailfrom:haoxu@linux.alibaba.com fp:SMTPD_---0V2iRB-._1643017411) by smtp.aliyun-inc.com(127.0.0.1); Mon, 24 Jan 2022 17:43:32 +0800 From: Hao Xu To: netdev@vger.kernel.org, io-uring@vger.kernel.org Cc: Jens Axboe , Pavel Begunkov , Eric Dumazet , "David S . Miller" , Jakub Kicinski , Hideaki YOSHIFUJI , David Ahern , Joseph Qi Subject: [PATCH 3/3] io_uring: zerocopy receive Date: Mon, 24 Jan 2022 17:43:20 +0800 Message-Id: <20220124094320.900713-4-haoxu@linux.alibaba.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220124094320.900713-1-haoxu@linux.alibaba.com> References: <20220124094320.900713-1-haoxu@linux.alibaba.com> MIME-Version: 1.0 Precedence: bulk List-ID: X-Mailing-List: netdev@vger.kernel.org Integrate the current zerocopy receive solution into io_uring for easier use. The current calling process is: 1) mmap a range of virtual addresses 2) poll() to wait for data to be ready on the sockfd 3) call getsockopt() to map the addresses in 1) to physical pages 4) access the data. 
By integrating it to io_uring, 2) and 3) can be merged: 1) mmap a range of virtual address 2) prepare a sqe and submit 3) get a cqe which indicates data is ready and mapped 4) access the data which reduce one system call and make users be unaware of 3) Signed-off-by: Hao Xu Reported-by: kernel test robot Reported-by: kernel test robot --- fs/io_uring.c | 72 +++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 1 + 2 files changed, 73 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 422d6de48688..5826d84400f6 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -81,6 +81,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -581,6 +582,12 @@ struct io_sr_msg { size_t len; }; +struct io_recvzc { + struct file *file; + char __user *u_zc; + int __user *u_len; +}; + struct io_open { struct file *file; int dfd; @@ -855,6 +862,7 @@ struct io_kiocb { struct io_mkdir mkdir; struct io_symlink symlink; struct io_hardlink hardlink; + struct io_recvzc recvzc; }; u8 opcode; @@ -1105,6 +1113,12 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_MKDIRAT] = {}, [IORING_OP_SYMLINKAT] = {}, [IORING_OP_LINKAT] = {}, + [IORING_OP_RECVZC] = { + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollin = 1, + .audit_skip = 1, + }, }; /* requests with any of those set should undergo io_disarm_next() */ @@ -5243,6 +5257,59 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static int io_recvzc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_recvzc *recvzc = &req->recvzc; + +#ifndef CONFIG_MMU + return -EOPNOTSUPP; +#endif + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->len || sqe->buf_index) + return -EINVAL; + + recvzc->u_zc = u64_to_user_ptr(READ_ONCE(sqe->addr)); + recvzc->u_len = u64_to_user_ptr(READ_ONCE(sqe->off)); + + return 0; +} + +static int io_recvzc(struct io_kiocb *req, unsigned int issue_flags) +{ + struct 
scm_timestamping_internal tss; + struct io_recvzc *recvzc = &req->recvzc; + struct tcp_zerocopy_receive zc; + char __user *u_zc = recvzc->u_zc; + int __user *u_len = recvzc->u_len; + int len = 0; + struct socket *sock; + struct sock *sk; + int err; + + if (!(req->flags & REQ_F_POLLED)) + return -EAGAIN; + + err = zc_receive_check(&zc, &len, u_zc, u_len); + if (err) + goto out; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + sk = sock->sk; + lock_sock(sk); + err = tcp_zerocopy_receive(sk, &zc, &tss); + release_sock(sk); + err = zc_receive_update(sk, &zc, len, u_zc, &tss, err); + +out: + __io_req_complete(req, issue_flags, err, 0); + + return 0; +} + static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_accept *accept = &req->accept; @@ -6563,6 +6630,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_symlinkat_prep(req, sqe); case IORING_OP_LINKAT: return io_linkat_prep(req, sqe); + case IORING_OP_RECVZC: + return io_recvzc_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6846,6 +6915,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_LINKAT: ret = io_linkat(req, issue_flags); break; + case IORING_OP_RECVZC: + ret = io_recvzc(req, issue_flags); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 787f491f0d2a..79eb43c64da2 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -143,6 +143,7 @@ enum { IORING_OP_MKDIRAT, IORING_OP_SYMLINKAT, IORING_OP_LINKAT, + IORING_OP_RECVZC, /* this goes last, obviously */ IORING_OP_LAST,