Message ID | 20230316152618.711970-28-dhowells@redhat.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | splice, net: Replace sendpage with sendmsg(MSG_SPLICE_PAGES) | expand |
> On Mar 16, 2023, at 11:26, David Howells <dhowells@redhat.com> wrote: > > When transmitting data, call down into TCP using a single sendmsg with > MSG_SPLICE_PAGES to indicate that content should be spliced rather than > performing several sendmsg and sendpage calls to transmit header, data > pages and trailer. > > To make this work, the data is assembled in a bio_vec array and attached to > a BVEC-type iterator. The bio_vec array has two extra slots before the > first for headers and one after the last for a trailer. The headers and > trailer are copied into memory acquired from zcopy_alloc() which just > breaks a page up into small pieces that can be freed with put_page(). > > Signed-off-by: David Howells <dhowells@redhat.com> > cc: Trond Myklebust <trond.myklebust@hammerspace.com> > cc: Anna Schumaker <anna@kernel.org> > cc: Chuck Lever <chuck.lever@oracle.com> > cc: Jeff Layton <jlayton@kernel.org> > cc: "David S. Miller" <davem@davemloft.net> > cc: Eric Dumazet <edumazet@google.com> > cc: Jakub Kicinski <kuba@kernel.org> > cc: Paolo Abeni <pabeni@redhat.com> > cc: Jens Axboe <axboe@kernel.dk> > cc: Matthew Wilcox <willy@infradead.org> > cc: linux-nfs@vger.kernel.org > cc: netdev@vger.kernel.org > --- > net/sunrpc/svcsock.c | 70 ++++++++++++-------------------------------- > net/sunrpc/xdr.c | 24 ++++++++++++--- > 2 files changed, 38 insertions(+), 56 deletions(-) > > diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c > index 03a4f5615086..1fa41ddbc40e 100644 > --- a/net/sunrpc/svcsock.c > +++ b/net/sunrpc/svcsock.c > @@ -36,6 +36,7 @@ > #include <linux/skbuff.h> > #include <linux/file.h> > #include <linux/freezer.h> > +#include <linux/zcopy_alloc.h> > #include <net/sock.h> > #include <net/checksum.h> > #include <net/ip.h> > @@ -1060,16 +1061,8 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) > return 0; /* record not complete */ > } > > -static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec, > - int flags) > -{ > - return kernel_sendpage(sock, virt_to_page(vec->iov_base), > - offset_in_page(vec->iov_base), > - vec->iov_len, flags); > -} > - > /* > - * kernel_sendpage() is used exclusively to reduce the number of > + * MSG_SPLICE_PAGES is used exclusively to reduce the number of > * copy operations in this path. Therefore the caller must ensure > * that the pages backing @xdr are unchanging. > * > @@ -1081,65 +1074,38 @@ static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr, > { > const struct kvec *head = xdr->head; > const struct kvec *tail = xdr->tail; > - struct kvec rm = { > - .iov_base = &marker, > - .iov_len = sizeof(marker), > - }; > struct msghdr msg = { > - .msg_flags = 0, > + .msg_flags = MSG_SPLICE_PAGES, > }; > - int ret; > + int ret, n = xdr_buf_pagecount(xdr), size; > > *sentp = 0; > ret = xdr_alloc_bvec(xdr, GFP_KERNEL); > if (ret < 0) > return ret; > > - ret = kernel_sendmsg(sock, &msg, &rm, 1, rm.iov_len); > + ret = zcopy_memdup(sizeof(marker), &marker, &xdr->bvec[-2], GFP_KERNEL); > if (ret < 0) > return ret; > - *sentp += ret; > - if (ret != rm.iov_len) > - return -EAGAIN; > > - ret = svc_tcp_send_kvec(sock, head, 0); > + ret = zcopy_memdup(head->iov_len, head->iov_base, &xdr->bvec[-1], GFP_KERNEL); > if (ret < 0) > return ret; > - *sentp += ret; > - if (ret != head->iov_len) > - goto out; > > - if (xdr->page_len) { > - unsigned int offset, len, remaining; > - struct bio_vec *bvec; > - > - bvec = xdr->bvec + (xdr->page_base >> PAGE_SHIFT); > - offset = offset_in_page(xdr->page_base); > - remaining = xdr->page_len; > - while (remaining > 0) { > - len = min(remaining, bvec->bv_len - offset); > - ret = kernel_sendpage(sock, bvec->bv_page, > - bvec->bv_offset + offset, > - len, 0); > - if (ret < 0) > - return ret; > - *sentp += ret; > - if (ret != len) > - goto out; > - remaining -= len; > - offset = 0; > - bvec++; > - } > - } > + ret = zcopy_memdup(tail->iov_len, tail->iov_base, &xdr->bvec[n], GFP_KERNEL); > + if (ret < 0) > + return ret; > > - if (tail->iov_len) { > - ret = svc_tcp_send_kvec(sock, tail, 0); > - if (ret < 0) > - return ret; > - *sentp += ret; > - } > + size = sizeof(marker) + head->iov_len + xdr->page_len + tail->iov_len; > + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec - 2, n + 3, size); > > -out: > + ret = sock_sendmsg(sock, &msg); > + if (ret < 0) > + return ret; > + if (ret > 0) > + *sentp = ret; > + if (ret != size) > + return -EAGAIN; > return 0; > } > > diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c > index 36835b2f5446..6dff0b4f17b8 100644 > --- a/net/sunrpc/xdr.c > +++ b/net/sunrpc/xdr.c > @@ -145,14 +145,19 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp) > { > size_t i, n = xdr_buf_pagecount(buf); > > - if (n != 0 && buf->bvec == NULL) { > - buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp); > + if (buf->bvec == NULL) { > + /* Allow for two headers and a trailer to be attached */ > + buf->bvec = kmalloc_array(n + 3, sizeof(buf->bvec[0]), gfp); > if (!buf->bvec) > return -ENOMEM; > + buf->bvec += 2; > + buf->bvec[-2].bv_page = NULL; > + buf->bvec[-1].bv_page = NULL; NACK. > for (i = 0; i < n; i++) { > bvec_set_page(&buf->bvec[i], buf->pages[i], PAGE_SIZE, > 0); > } > + buf->bvec[n].bv_page = NULL; > } > return 0; > } > @@ -160,8 +165,19 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp) > void > xdr_free_bvec(struct xdr_buf *buf) > { > - kfree(buf->bvec); > - buf->bvec = NULL; > + if (buf->bvec) { > + size_t n = xdr_buf_pagecount(buf); > + > + if (buf->bvec[-2].bv_page) > + put_page(buf->bvec[-2].bv_page); > + if (buf->bvec[-1].bv_page) > + put_page(buf->bvec[-1].bv_page); > + if (buf->bvec[n].bv_page) > + put_page(buf->bvec[n].bv_page); > + buf->bvec -= 2; > + kfree(buf->bvec); > + buf->bvec = NULL; > + } > } > > /** >
Trond Myklebust <trondmy@hammerspace.com> wrote: > > + buf->bvec += 2; > > + buf->bvec[-2].bv_page = NULL; > > + buf->bvec[-1].bv_page = NULL; > > NACK. Can you elaborate? Is it that you dislike allocating extra slots for protocol bits? Or just that the bvec[] is offset by 2? Or some other reason? David
Note: this is the first I've seen of this series -- not sure why I never received any of these patches. That means I haven't seen the cover letter and do not have any context for this proposed change. > On Mar 16, 2023, at 12:17 PM, Trond Myklebust <trondmy@hammerspace.com> wrote: > >> On Mar 16, 2023, at 11:26, David Howells <dhowells@redhat.com> wrote: >> >> When transmitting data, call down into TCP using a single sendmsg with >> MSG_SPLICE_PAGES to indicate that content should be spliced rather than >> performing several sendmsg and sendpage calls to transmit header, data >> pages and trailer. We've tried combining the sendpages calls in here before. It results in a significant and measurable performance regression. See: da1661b93bf4 ("SUNRPC: Teach server to use xprt_sock_sendmsg for socket sends") and it's subsequent revert: 4a85a6a3320b ("SUNRPC: Handle TCP socket sends with kernel_sendpage() again") Therefore, this kind of change needs to be accompanied by both benchmark results and some field testing to convince me it won't cause harm. Also, I'd rather see struct xdr_buf changed to /replace/ the head/pagevec/tail arrangement with bvecs before we do this kind of overhaul. And, we have to make certain that this doesn't break operation with kTLS sockets... do they support MSG_SPLICE_PAGES ? >> To make this work, the data is assembled in a bio_vec array and attached to >> a BVEC-type iterator. The bio_vec array has two extra slots before the >> first for headers and one after the last for a trailer. The headers and >> trailer are copied into memory acquired from zcopy_alloc() which just >> breaks a page up into small pieces that can be freed with put_page(). >> >> Signed-off-by: David Howells <dhowells@redhat.com> >> cc: Trond Myklebust <trond.myklebust@hammerspace.com> >> cc: Anna Schumaker <anna@kernel.org> >> cc: Chuck Lever <chuck.lever@oracle.com> >> cc: Jeff Layton <jlayton@kernel.org> >> cc: "David S. Miller" <davem@davemloft.net> >> cc: Eric Dumazet <edumazet@google.com> >> cc: Jakub Kicinski <kuba@kernel.org> >> cc: Paolo Abeni <pabeni@redhat.com> >> cc: Jens Axboe <axboe@kernel.dk> >> cc: Matthew Wilcox <willy@infradead.org> >> cc: linux-nfs@vger.kernel.org >> cc: netdev@vger.kernel.org >> --- >> net/sunrpc/svcsock.c | 70 ++++++++++++-------------------------------- >> net/sunrpc/xdr.c | 24 ++++++++++++--- >> 2 files changed, 38 insertions(+), 56 deletions(-) >> >> diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c >> index 03a4f5615086..1fa41ddbc40e 100644 >> --- a/net/sunrpc/svcsock.c >> +++ b/net/sunrpc/svcsock.c >> @@ -36,6 +36,7 @@ >> #include <linux/skbuff.h> >> #include <linux/file.h> >> #include <linux/freezer.h> >> +#include <linux/zcopy_alloc.h> >> #include <net/sock.h> >> #include <net/checksum.h> >> #include <net/ip.h> >> @@ -1060,16 +1061,8 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) >> return 0; /* record not complete */ >> } >> >> -static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec, >> - int flags) >> -{ >> - return kernel_sendpage(sock, virt_to_page(vec->iov_base), >> - offset_in_page(vec->iov_base), >> - vec->iov_len, flags); >> -} >> - >> /* >> - * kernel_sendpage() is used exclusively to reduce the number of >> + * MSG_SPLICE_PAGES is used exclusively to reduce the number of >> * copy operations in this path. Therefore the caller must ensure >> * that the pages backing @xdr are unchanging. >> * >> @@ -1081,65 +1074,38 @@ static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr, >> { >> const struct kvec *head = xdr->head; >> const struct kvec *tail = xdr->tail; >> - struct kvec rm = { >> - .iov_base = &marker, >> - .iov_len = sizeof(marker), >> - }; >> struct msghdr msg = { >> - .msg_flags = 0, >> + .msg_flags = MSG_SPLICE_PAGES, >> }; >> - int ret; >> + int ret, n = xdr_buf_pagecount(xdr), size; >> >> *sentp = 0; >> ret = xdr_alloc_bvec(xdr, GFP_KERNEL); >> if (ret < 0) >> return ret; >> >> - ret = kernel_sendmsg(sock, &msg, &rm, 1, rm.iov_len); >> + ret = zcopy_memdup(sizeof(marker), &marker, &xdr->bvec[-2], GFP_KERNEL); >> if (ret < 0) >> return ret; >> - *sentp += ret; >> - if (ret != rm.iov_len) >> - return -EAGAIN; >> >> - ret = svc_tcp_send_kvec(sock, head, 0); >> + ret = zcopy_memdup(head->iov_len, head->iov_base, &xdr->bvec[-1], GFP_KERNEL); >> if (ret < 0) >> return ret; >> - *sentp += ret; >> - if (ret != head->iov_len) >> - goto out; >> >> - if (xdr->page_len) { >> - unsigned int offset, len, remaining; >> - struct bio_vec *bvec; >> - >> - bvec = xdr->bvec + (xdr->page_base >> PAGE_SHIFT); >> - offset = offset_in_page(xdr->page_base); >> - remaining = xdr->page_len; >> - while (remaining > 0) { >> - len = min(remaining, bvec->bv_len - offset); >> - ret = kernel_sendpage(sock, bvec->bv_page, >> - bvec->bv_offset + offset, >> - len, 0); >> - if (ret < 0) >> - return ret; >> - *sentp += ret; >> - if (ret != len) >> - goto out; >> - remaining -= len; >> - offset = 0; >> - bvec++; >> - } >> - } >> + ret = zcopy_memdup(tail->iov_len, tail->iov_base, &xdr->bvec[n], GFP_KERNEL); >> + if (ret < 0) >> + return ret; >> >> - if (tail->iov_len) { >> - ret = svc_tcp_send_kvec(sock, tail, 0); >> - if (ret < 0) >> - return ret; >> - *sentp += ret; >> - } >> + size = sizeof(marker) + head->iov_len + xdr->page_len + tail->iov_len; >> + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec - 2, n + 3, size); >> >> -out: >> + ret = sock_sendmsg(sock, &msg); >> + if (ret < 0) >> + return ret; >> + if (ret > 0) >> + *sentp = ret; >> + if (ret != size) >> + return -EAGAIN; >> return 0; >> } >> >> diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c >> index 36835b2f5446..6dff0b4f17b8 100644 >> --- a/net/sunrpc/xdr.c >> +++ b/net/sunrpc/xdr.c >> @@ -145,14 +145,19 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp) >> { >> size_t i, n = xdr_buf_pagecount(buf); >> >> - if (n != 0 && buf->bvec == NULL) { >> - buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp); >> + if (buf->bvec == NULL) { >> + /* Allow for two headers and a trailer to be attached */ >> + buf->bvec = kmalloc_array(n + 3, sizeof(buf->bvec[0]), gfp); >> if (!buf->bvec) >> return -ENOMEM; >> + buf->bvec += 2; >> + buf->bvec[-2].bv_page = NULL; >> + buf->bvec[-1].bv_page = NULL; > > NACK. > >> for (i = 0; i < n; i++) { >> bvec_set_page(&buf->bvec[i], buf->pages[i], PAGE_SIZE, >> 0); >> } >> + buf->bvec[n].bv_page = NULL; >> } >> return 0; >> } >> @@ -160,8 +165,19 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp) >> void >> xdr_free_bvec(struct xdr_buf *buf) >> { >> - kfree(buf->bvec); >> - buf->bvec = NULL; >> + if (buf->bvec) { >> + size_t n = xdr_buf_pagecount(buf); >> + >> + if (buf->bvec[-2].bv_page) >> + put_page(buf->bvec[-2].bv_page); >> + if (buf->bvec[-1].bv_page) >> + put_page(buf->bvec[-1].bv_page); >> + if (buf->bvec[n].bv_page) >> + put_page(buf->bvec[n].bv_page); >> + buf->bvec -= 2; >> + kfree(buf->bvec); >> + buf->bvec = NULL; >> + } >> } >> >> /** >> > -- Chuck Lever
On Mar 16, 2023, at 12:24, David Howells <dhowells@redhat.com> wrote: Trond Myklebust <trondmy@hammerspace.com> wrote: + buf->bvec += 2; + buf->bvec[-2].bv_page = NULL; + buf->bvec[-1].bv_page = NULL; NACK. Can you elaborate? Is it that you dislike allocating extra slots for protocol bits? Or just that the bvec[] is offset by 2? Or some other reason? 1) This is code that is common to the client and the server. Why are we adding unused 3 bvec slots to every client RPC call? 2) It obfuscates the existence of these bvec slots. 3) knfsd may use splice_direct_to_actor() in order to avoid copying the page cache data into private buffers (it just takes a reference to the pages). Using MSG_SPLICE_PAGES will presumably require it to protect those pages against further writes while the socket is referencing them.
Chuck Lever III <chuck.lever@oracle.com> wrote: > That means I haven't seen the cover letter and do not have any > context for this proposed change. https://lore.kernel.org/linux-fsdevel/20230316152618.711970-1-dhowells@redhat.com/ > We've tried combining the sendpages calls in here before. It > results in a significant and measurable performance regression. > See: > > da1661b93bf4 ("SUNRPC: Teach server to use xprt_sock_sendmsg for socket sends") The commit replaced the use of sendpage with sendmsg, but that took away the zerocopy aspect of sendpage. The idea behind MSG_SPLICE_PAGES is that it allows you to do keep that. I'll have to try reapplying this commit and adding the MSG_SPLICE_PAGES flag. > Therefore, this kind of change needs to be accompanied by both > benchmark results and some field testing to convince me it won't > cause harm. Yep. > And, we have to make certain that this doesn't break operation > with kTLS sockets... do they support MSG_SPLICE_PAGES ? I haven't yet tackled AF_TLS, AF_KCM or AF_SMC as they seem significantly more complex than TCP and UDP. I thought I'd get some feedback on what I have before I tried my hand at those. David
> On Mar 16, 2023, at 1:28 PM, David Howells <dhowells@redhat.com> wrote: > > Chuck Lever III <chuck.lever@oracle.com> wrote: > >> That means I haven't seen the cover letter and do not have any >> context for this proposed change. > > https://lore.kernel.org/linux-fsdevel/20230316152618.711970-1-dhowells@redhat.com/ > >> We've tried combining the sendpages calls in here before. It >> results in a significant and measurable performance regression. >> See: >> >> da1661b93bf4 ("SUNRPC: Teach server to use xprt_sock_sendmsg for socket sends") > > The commit replaced the use of sendpage with sendmsg, but that took away the > zerocopy aspect of sendpage. The idea behind MSG_SPLICE_PAGES is that it > allows you to do keep that. I'll have to try reapplying this commit and > adding the MSG_SPLICE_PAGES flag. Note that, as Trond point out, NFSD can handle an NFS READ request with either a splice actor or by copying through a vector, depending on what the underlying filesystem can support and whether we are using a security flavor that requires stable pages. Grep for RQ_SPLICE_OK. Eventually we want to make use of iomaps to ensure that reading areas of a file that are not allocated on disk does not trigger an extent allocation. Anna is working on that, but I have no idea what it will look like. We can talk more at LSF, if you'll both be around. Also... I find I have to put back the use of MSG_MORE and friends in here, otherwise kTLS will split each of these kernel_sendsomething() calls into its own TLS record. This code is likely going to look different after support for RPC-with-TLS goes in. >> Therefore, this kind of change needs to be accompanied by both >> benchmark results and some field testing to convince me it won't >> cause harm. > > Yep. > >> And, we have to make certain that this doesn't break operation >> with kTLS sockets... do they support MSG_SPLICE_PAGES ? > > I haven't yet tackled AF_TLS, AF_KCM or AF_SMC as they seem significantly more > complex than TCP and UDP. I thought I'd get some feedback on what I have > before I tried my hand at those. OK, I didn't mean AF_TLS, I meant the stuff under net/tls, which is AF_INET[6] and TCP, but with a ULP in place. It's got its own sendpage and sendmsg methods that choke when an unrecognized MSG_ flag is present. But OK, you're just asking for feedback, so I'll put my red pencil down. -- Chuck Lever
Trond Myklebust <trondmy@hammerspace.com> wrote: > 1) This is code that is common to the client and the server. Why are we > adding unused 3 bvec slots to every client RPC call? Fair point, but I'm trying to avoid making four+ sendmsg calls in nfsd rather than one. > 2) It obfuscates the existence of these bvec slots. True, it'd be nice to find a better way to do it. Question is, can the client make use of MSG_SPLICE_PAGES also? > 3) knfsd may use splice_direct_to_actor() in order to avoid copying the page > cache data into private buffers (it just takes a reference to the > pages). Using MSG_SPLICE_PAGES will presumably require it to protect those > pages against further writes while the socket is referencing them. Upstream sunrpc is using sendpage with TCP. It already has that issue. MSG_SPLICE_PAGES is a way of doing sendpage through sendmsg. David
> On Mar 16, 2023, at 14:06, David Howells <dhowells@redhat.com> wrote: > > Trond Myklebust <trondmy@hammerspace.com> wrote: > >> 1) This is code that is common to the client and the server. Why are we >> adding unused 3 bvec slots to every client RPC call? > > Fair point, but I'm trying to avoid making four+ sendmsg calls in nfsd rather > than one. Add an enum iter_type for ITER_ITER ? :-) Otherwise, please just split these functions into one for knfsd and a separate one for the client. > >> 2) It obfuscates the existence of these bvec slots. > > True, it'd be nice to find a better way to do it. Question is, can the client > make use of MSG_SPLICE_PAGES also? The requirement for O_DIRECT support means we get the stable write issues with added extra spicy sauce. > >> 3) knfsd may use splice_direct_to_actor() in order to avoid copying the page >> cache data into private buffers (it just takes a reference to the >> pages). Using MSG_SPLICE_PAGES will presumably require it to protect those >> pages against further writes while the socket is referencing them. > > Upstream sunrpc is using sendpage with TCP. It already has that issue. > MSG_SPLICE_PAGES is a way of doing sendpage through sendmsg. Fair enough. I do seem to remember a schism with the knfsd developers over that issue.
Chuck Lever III <chuck.lever@oracle.com> wrote: > Therefore, this kind of change needs to be accompanied by both > benchmark results and some field testing to convince me it won't > cause harm. Btw, what do you use to benchmark NFS performance? David
> On Mar 16, 2023, at 5:21 PM, David Howells <dhowells@redhat.com> wrote: > > Chuck Lever III <chuck.lever@oracle.com> wrote: > >> Therefore, this kind of change needs to be accompanied by both >> benchmark results and some field testing to convince me it won't >> cause harm. > > Btw, what do you use to benchmark NFS performance? It depends on what I'm trying to observe. I have only a small handful of systems in my lab, which is why I was not able to immediately detect the effects of the zero-copy change in my lab. Daire has a large client cohort on a fast network, so is able to see the impact of that kind of change quite readily. A perhaps more interesting question is what kind of tooling would I use to measure the performance of the proposed change. The bottom line is whether or not applications on clients can see a change. NFS client implementations can hide server and network latency improvement from applications, and RPC-on-TCP adds palpable latency as well that reduces the efficacy of server performance optimizations. For that I might use a multi-threaded fio workload with fixed record sizes (2KB, 8KB, 128KB, 1MB) and then look at the throughput numbers and latency distribution for each size. In a single thread qd=1 test, iozone can show changes in READ latency pretty clearly, though most folks believe qd=1 tests are junk. I generally run such tests on 100GbE with a tmpfs or NVMe export to take filesystem latencies out of the equation, although that might matter more for WRITE latency if you can keep your READ workload completely in server memory. To measure server-side behavior without the effects of the network or client, NFSD has a built-in trace point, nfsd:svc_stats_latency, that records the latency in microseconds of each RPC. Run the above workloads and record this tracepoint (perhaps with a capture filter to record only the latency of READ operations). Then you can post-process the raw latencies to get an average latency and deviation, or even look at latency distribution to see if the shape of the outlier curve has changed. I use awk for this. [ Sidebar: you can use this tracepoint to track latency outliers too, but that's another topic. ] Second, I might try a flame graph study to measure changes in instruction path length, and also capture an average cycles- per-byte-read value. Looking at CPU cache misses can often be a rathole, but flame graphs can surface changes there too. And lastly, you might want to visit lock_stats to see if there is any significant change in lock contention. An unexpected increase in lock contention can often rob positive changes made in other areas. My guess is that for the RQ_SPLICE_OK case, the difference would amount to the elimination of the kernel_sendpage calls, which are indirect, but not terribly expensive. Those calls amount to a significant cost only on large I/O. It might not amount to much relative to the other costs in the READ path. So the real purpose here would have to be refactoring to use bvecs instead of the bespoke xdr_buf structure, and I would like to see support for bvecs in all of our transports (looking at you, RDMA) to make this truly worthwhile. I had started this a while back, but lack of a bvec-based RDMA API made it less interesting to me. It isn't clear to me yet whether bvecs or folios should be the replacement for xdr_buf's head/pages/tail, but I'm a paid-in-full member of the uneducated rabble. This might sound like a lot of pushback, but actually I am open to discussing clean-ups in this area, including the one you proposed. Just getting a little more careful about this kind of change as time goes on. And it sounds like you were already aware of the most recent previous attempt at this kind of improvement. -- Chuck Lever
Trond Myklebust <trondmy@hammerspace.com> wrote:
> Add an enum iter_type for ITER_ITER ? :-)
Actually, that might not be such a bad idea, now that I've pondered on it some
more. Give it an array of iterators and add a flag to each iterator to say if
it can be spliced from or not.
Once ITER_PIPE is killed off, advancing and reverting over it should be pretty
straightforward - though each iterator would also need to keep track of how
big it started off as in order that it can be reverted over.
David
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 03a4f5615086..1fa41ddbc40e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -36,6 +36,7 @@ #include <linux/skbuff.h> #include <linux/file.h> #include <linux/freezer.h> +#include <linux/zcopy_alloc.h> #include <net/sock.h> #include <net/checksum.h> #include <net/ip.h> @@ -1060,16 +1061,8 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp) return 0; /* record not complete */ } -static int svc_tcp_send_kvec(struct socket *sock, const struct kvec *vec, - int flags) -{ - return kernel_sendpage(sock, virt_to_page(vec->iov_base), - offset_in_page(vec->iov_base), - vec->iov_len, flags); -} - /* - * kernel_sendpage() is used exclusively to reduce the number of + * MSG_SPLICE_PAGES is used exclusively to reduce the number of * copy operations in this path. Therefore the caller must ensure * that the pages backing @xdr are unchanging. * @@ -1081,65 +1074,38 @@ static int svc_tcp_sendmsg(struct socket *sock, struct xdr_buf *xdr, { const struct kvec *head = xdr->head; const struct kvec *tail = xdr->tail; - struct kvec rm = { - .iov_base = &marker, - .iov_len = sizeof(marker), - }; struct msghdr msg = { - .msg_flags = 0, + .msg_flags = MSG_SPLICE_PAGES, }; - int ret; + int ret, n = xdr_buf_pagecount(xdr), size; *sentp = 0; ret = xdr_alloc_bvec(xdr, GFP_KERNEL); if (ret < 0) return ret; - ret = kernel_sendmsg(sock, &msg, &rm, 1, rm.iov_len); + ret = zcopy_memdup(sizeof(marker), &marker, &xdr->bvec[-2], GFP_KERNEL); if (ret < 0) return ret; - *sentp += ret; - if (ret != rm.iov_len) - return -EAGAIN; - ret = svc_tcp_send_kvec(sock, head, 0); + ret = zcopy_memdup(head->iov_len, head->iov_base, &xdr->bvec[-1], GFP_KERNEL); if (ret < 0) return ret; - *sentp += ret; - if (ret != head->iov_len) - goto out; - if (xdr->page_len) { - unsigned int offset, len, remaining; - struct bio_vec *bvec; - - bvec = xdr->bvec + (xdr->page_base >> PAGE_SHIFT); - offset = offset_in_page(xdr->page_base); - remaining = xdr->page_len; - while (remaining > 0) { - len = min(remaining, bvec->bv_len - offset); - ret = kernel_sendpage(sock, bvec->bv_page, - bvec->bv_offset + offset, - len, 0); - if (ret < 0) - return ret; - *sentp += ret; - if (ret != len) - goto out; - remaining -= len; - offset = 0; - bvec++; - } - } + ret = zcopy_memdup(tail->iov_len, tail->iov_base, &xdr->bvec[n], GFP_KERNEL); + if (ret < 0) + return ret; - if (tail->iov_len) { - ret = svc_tcp_send_kvec(sock, tail, 0); - if (ret < 0) - return ret; - *sentp += ret; - } + size = sizeof(marker) + head->iov_len + xdr->page_len + tail->iov_len; + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, xdr->bvec - 2, n + 3, size); -out: + ret = sock_sendmsg(sock, &msg); + if (ret < 0) + return ret; + if (ret > 0) + *sentp = ret; + if (ret != size) + return -EAGAIN; return 0; } diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 36835b2f5446..6dff0b4f17b8 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -145,14 +145,19 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp) { size_t i, n = xdr_buf_pagecount(buf); - if (n != 0 && buf->bvec == NULL) { - buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp); + if (buf->bvec == NULL) { + /* Allow for two headers and a trailer to be attached */ + buf->bvec = kmalloc_array(n + 3, sizeof(buf->bvec[0]), gfp); if (!buf->bvec) return -ENOMEM; + buf->bvec += 2; + buf->bvec[-2].bv_page = NULL; + buf->bvec[-1].bv_page = NULL; for (i = 0; i < n; i++) { bvec_set_page(&buf->bvec[i], buf->pages[i], PAGE_SIZE, 0); } + buf->bvec[n].bv_page = NULL; } return 0; } @@ -160,8 +165,19 @@ xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp) void xdr_free_bvec(struct xdr_buf *buf) { - kfree(buf->bvec); - buf->bvec = NULL; + if (buf->bvec) { + size_t n = xdr_buf_pagecount(buf); + + if (buf->bvec[-2].bv_page) + put_page(buf->bvec[-2].bv_page); + if (buf->bvec[-1].bv_page) + put_page(buf->bvec[-1].bv_page); + if (buf->bvec[n].bv_page) + put_page(buf->bvec[n].bv_page); + buf->bvec -= 2; + kfree(buf->bvec); + buf->bvec = NULL; + } } /**
When transmitting data, call down into TCP using a single sendmsg with MSG_SPLICE_PAGES to indicate that content should be spliced rather than performing several sendmsg and sendpage calls to transmit header, data pages and trailer. To make this work, the data is assembled in a bio_vec array and attached to a BVEC-type iterator. The bio_vec array has two extra slots before the first for headers and one after the last for a trailer. The headers and trailer are copied into memory acquired from zcopy_alloc() which just breaks a page up into small pieces that can be freed with put_page(). Signed-off-by: David Howells <dhowells@redhat.com> cc: Trond Myklebust <trond.myklebust@hammerspace.com> cc: Anna Schumaker <anna@kernel.org> cc: Chuck Lever <chuck.lever@oracle.com> cc: Jeff Layton <jlayton@kernel.org> cc: "David S. Miller" <davem@davemloft.net> cc: Eric Dumazet <edumazet@google.com> cc: Jakub Kicinski <kuba@kernel.org> cc: Paolo Abeni <pabeni@redhat.com> cc: Jens Axboe <axboe@kernel.dk> cc: Matthew Wilcox <willy@infradead.org> cc: linux-nfs@vger.kernel.org cc: netdev@vger.kernel.org --- net/sunrpc/svcsock.c | 70 ++++++++++++-------------------------------- net/sunrpc/xdr.c | 24 ++++++++++++--- 2 files changed, 38 insertions(+), 56 deletions(-)