Message ID | 20191017183152.2517.67599.stgit@oracle-102.nfsv4bat.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Sometimes pull up the Send buffer | expand |
On 10/17/2019 2:31 PM, Chuck Lever wrote: > On some platforms, DMA mapping part of a page is more costly than > copying bytes. Restore the pull-up code and use that when we > think it's going to be faster. The heuristic for now is to pull-up > when the size of the RPC message body fits in the buffer underlying > the head iovec. > > Indeed, not involving the I/O MMU can help the RPC/RDMA transport > scale better for tiny I/Os across more RDMA devices. This is because > interaction with the I/O MMU is eliminated, as is handling a Send > completion, for each of these small I/Os. Without the explicit > unmapping, the NIC no longer needs to do a costly internal TLB shoot > down for buffers that are just a handful of bytes. This is good stuff. Do you have any performance data for the new strategy, especially latencies and local CPU cycles per byte? Tom. > Signed-off-by: Chuck Lever <chuck.lever@oracle.com> > --- > net/sunrpc/xprtrdma/backchannel.c | 2 - > net/sunrpc/xprtrdma/rpc_rdma.c | 82 +++++++++++++++++++++++++++++++++++-- > net/sunrpc/xprtrdma/verbs.c | 2 - > net/sunrpc/xprtrdma/xprt_rdma.h | 2 + > 4 files changed, 81 insertions(+), 7 deletions(-) > > diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h > index 16572ae7..336a65d 100644 > --- a/include/trace/events/rpcrdma.h > +++ b/include/trace/events/rpcrdma.h > @@ -532,6 +532,8 @@ > DEFINE_WRCH_EVENT(reply); > > TRACE_DEFINE_ENUM(rpcrdma_noch); > +TRACE_DEFINE_ENUM(rpcrdma_noch_pullup); > +TRACE_DEFINE_ENUM(rpcrdma_noch_mapped); > TRACE_DEFINE_ENUM(rpcrdma_readch); > TRACE_DEFINE_ENUM(rpcrdma_areadch); > TRACE_DEFINE_ENUM(rpcrdma_writech); > @@ -540,6 +542,8 @@ > #define xprtrdma_show_chunktype(x) \ > __print_symbolic(x, \ > { rpcrdma_noch, "inline" }, \ > + { rpcrdma_noch_pullup, "pullup" }, \ > + { rpcrdma_noch_mapped, "mapped" }, \ > { rpcrdma_readch, "read list" }, \ > { rpcrdma_areadch, "*read list" }, \ > { rpcrdma_writech, "write list" }, \ > diff --git 
a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c > index 50e075f..1168524 100644 > --- a/net/sunrpc/xprtrdma/backchannel.c > +++ b/net/sunrpc/xprtrdma/backchannel.c > @@ -79,7 +79,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) > *p = xdr_zero; > > if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN, > - &rqst->rq_snd_buf, rpcrdma_noch)) > + &rqst->rq_snd_buf, rpcrdma_noch_pullup)) > return -EIO; > > trace_xprtrdma_cb_reply(rqst); > diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c > index a441dbf..4ad8889 100644 > --- a/net/sunrpc/xprtrdma/rpc_rdma.c > +++ b/net/sunrpc/xprtrdma/rpc_rdma.c > @@ -392,7 +392,7 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, > unsigned int pos; > int nsegs; > > - if (rtype == rpcrdma_noch) > + if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped) > goto done; > > pos = rqst->rq_snd_buf.head[0].iov_len; > @@ -691,6 +691,72 @@ static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req, > return false; > } > > +/* Copy the tail to the end of the head buffer. > + */ > +static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt, > + struct rpcrdma_req *req, > + struct xdr_buf *xdr) > +{ > + unsigned char *dst; > + > + dst = (unsigned char *)xdr->head[0].iov_base; > + dst += xdr->head[0].iov_len + xdr->page_len; > + memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len); > + r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len; > +} > + > +/* Copy pagelist content into the head buffer. 
> + */ > +static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt, > + struct rpcrdma_req *req, > + struct xdr_buf *xdr) > +{ > + unsigned int len, page_base, remaining; > + struct page **ppages; > + unsigned char *src, *dst; > + > + dst = (unsigned char *)xdr->head[0].iov_base; > + dst += xdr->head[0].iov_len; > + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); > + page_base = offset_in_page(xdr->page_base); > + remaining = xdr->page_len; > + while (remaining) { > + src = page_address(*ppages); > + src += page_base; > + len = min_t(unsigned int, PAGE_SIZE - page_base, remaining); > + memcpy(dst, src, len); > + r_xprt->rx_stats.pullup_copy_count += len; > + > + ppages++; > + dst += len; > + remaining -= len; > + page_base = 0; > + } > +} > + > +/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it. > + * When the head, pagelist, and tail are small, a pull-up copy > + * is considerably less costly than DMA mapping the components > + * of @xdr. > + * > + * Assumptions: > + * - the caller has already verified that the total length > + * of the RPC Call body will fit into @rl_sendbuf. 
> + */ > +static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt, > + struct rpcrdma_req *req, > + struct xdr_buf *xdr) > +{ > + if (unlikely(xdr->tail[0].iov_len)) > + rpcrdma_pullup_tail_iov(r_xprt, req, xdr); > + > + if (unlikely(xdr->page_len)) > + rpcrdma_pullup_pagelist(r_xprt, req, xdr); > + > + /* The whole RPC message resides in the head iovec now */ > + return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len); > +} > + > static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt, > struct rpcrdma_req *req, > struct xdr_buf *xdr) > @@ -779,7 +845,11 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, > goto out_unmap; > > switch (rtype) { > - case rpcrdma_noch: > + case rpcrdma_noch_pullup: > + if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr)) > + goto out_unmap; > + break; > + case rpcrdma_noch_mapped: > if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr)) > goto out_unmap; > break; > @@ -827,6 +897,7 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, > struct rpcrdma_req *req = rpcr_to_rdmar(rqst); > struct xdr_stream *xdr = &req->rl_stream; > enum rpcrdma_chunktype rtype, wtype; > + struct xdr_buf *buf = &rqst->rq_snd_buf; > bool ddp_allowed; > __be32 *p; > int ret; > @@ -884,8 +955,9 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, > */ > if (rpcrdma_args_inline(r_xprt, rqst)) { > *p++ = rdma_msg; > - rtype = rpcrdma_noch; > - } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { > + rtype = buf->len < rdmab_length(req->rl_sendbuf) ? 
> + rpcrdma_noch_pullup : rpcrdma_noch_mapped; > + } else if (ddp_allowed && buf->flags & XDRBUF_WRITE) { > *p++ = rdma_msg; > rtype = rpcrdma_readch; > } else { > @@ -927,7 +999,7 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, > goto out_err; > > ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len, > - &rqst->rq_snd_buf, rtype); > + buf, rtype); > if (ret) > goto out_err; > > diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c > index 2f46582..a514e2c 100644 > --- a/net/sunrpc/xprtrdma/verbs.c > +++ b/net/sunrpc/xprtrdma/verbs.c > @@ -1165,7 +1165,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) > for (i = 0; i < buf->rb_max_requests; i++) { > struct rpcrdma_req *req; > > - req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE, > + req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2, > GFP_KERNEL); > if (!req) > goto out; > diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h > index cdd6a3d..5d15140 100644 > --- a/net/sunrpc/xprtrdma/xprt_rdma.h > +++ b/net/sunrpc/xprtrdma/xprt_rdma.h > @@ -554,6 +554,8 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, > > enum rpcrdma_chunktype { > rpcrdma_noch = 0, > + rpcrdma_noch_pullup, > + rpcrdma_noch_mapped, > rpcrdma_readch, > rpcrdma_areadch, > rpcrdma_writech, > > >
Hi Tom- > On Oct 18, 2019, at 4:17 PM, Tom Talpey <tom@talpey.com> wrote: > > On 10/17/2019 2:31 PM, Chuck Lever wrote: >> On some platforms, DMA mapping part of a page is more costly than >> copying bytes. Restore the pull-up code and use that when we >> think it's going to be faster. The heuristic for now is to pull-up >> when the size of the RPC message body fits in the buffer underlying >> the head iovec. >> Indeed, not involving the I/O MMU can help the RPC/RDMA transport >> scale better for tiny I/Os across more RDMA devices. This is because >> interaction with the I/O MMU is eliminated, as is handling a Send >> completion, for each of these small I/Os. Without the explicit >> unmapping, the NIC no longer needs to do a costly internal TLB shoot >> down for buffers that are just a handful of bytes. > > This is good stuff. Do you have any performance data for the new > strategy, especially latencies and local CPU cycles per byte? Saves almost a microsecond of RT latency on my NFS client that uses a real Intel IOMMU. On my other NFS client, the DMA map operations are always a no-op. This savings applies only to NFS WRITE, of course. I don't have a good benchmark for cycles per byte. Do you have any suggestions? Not sure how I would account for cycles spent handling Send completions, for example. > Tom. 
> >> Signed-off-by: Chuck Lever <chuck.lever@oracle.com> >> --- >> net/sunrpc/xprtrdma/backchannel.c | 2 - >> net/sunrpc/xprtrdma/rpc_rdma.c | 82 +++++++++++++++++++++++++++++++++++-- >> net/sunrpc/xprtrdma/verbs.c | 2 - >> net/sunrpc/xprtrdma/xprt_rdma.h | 2 + >> 4 files changed, 81 insertions(+), 7 deletions(-) >> diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h >> index 16572ae7..336a65d 100644 >> --- a/include/trace/events/rpcrdma.h >> +++ b/include/trace/events/rpcrdma.h >> @@ -532,6 +532,8 @@ >> DEFINE_WRCH_EVENT(reply); >> TRACE_DEFINE_ENUM(rpcrdma_noch); >> +TRACE_DEFINE_ENUM(rpcrdma_noch_pullup); >> +TRACE_DEFINE_ENUM(rpcrdma_noch_mapped); >> TRACE_DEFINE_ENUM(rpcrdma_readch); >> TRACE_DEFINE_ENUM(rpcrdma_areadch); >> TRACE_DEFINE_ENUM(rpcrdma_writech); >> @@ -540,6 +542,8 @@ >> #define xprtrdma_show_chunktype(x) \ >> __print_symbolic(x, \ >> { rpcrdma_noch, "inline" }, \ >> + { rpcrdma_noch_pullup, "pullup" }, \ >> + { rpcrdma_noch_mapped, "mapped" }, \ >> { rpcrdma_readch, "read list" }, \ >> { rpcrdma_areadch, "*read list" }, \ >> { rpcrdma_writech, "write list" }, \ >> diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c >> index 50e075f..1168524 100644 >> --- a/net/sunrpc/xprtrdma/backchannel.c >> +++ b/net/sunrpc/xprtrdma/backchannel.c >> @@ -79,7 +79,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) >> *p = xdr_zero; >> if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN, >> - &rqst->rq_snd_buf, rpcrdma_noch)) >> + &rqst->rq_snd_buf, rpcrdma_noch_pullup)) >> return -EIO; >> trace_xprtrdma_cb_reply(rqst); >> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c >> index a441dbf..4ad8889 100644 >> --- a/net/sunrpc/xprtrdma/rpc_rdma.c >> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c >> @@ -392,7 +392,7 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, >> unsigned int pos; >> int nsegs; >> - if (rtype == rpcrdma_noch) >> + if (rtype == 
rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped) >> goto done; >> pos = rqst->rq_snd_buf.head[0].iov_len; >> @@ -691,6 +691,72 @@ static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req, >> return false; >> } >> +/* Copy the tail to the end of the head buffer. >> + */ >> +static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt, >> + struct rpcrdma_req *req, >> + struct xdr_buf *xdr) >> +{ >> + unsigned char *dst; >> + >> + dst = (unsigned char *)xdr->head[0].iov_base; >> + dst += xdr->head[0].iov_len + xdr->page_len; >> + memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len); >> + r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len; >> +} >> + >> +/* Copy pagelist content into the head buffer. >> + */ >> +static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt, >> + struct rpcrdma_req *req, >> + struct xdr_buf *xdr) >> +{ >> + unsigned int len, page_base, remaining; >> + struct page **ppages; >> + unsigned char *src, *dst; >> + >> + dst = (unsigned char *)xdr->head[0].iov_base; >> + dst += xdr->head[0].iov_len; >> + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); >> + page_base = offset_in_page(xdr->page_base); >> + remaining = xdr->page_len; >> + while (remaining) { >> + src = page_address(*ppages); >> + src += page_base; >> + len = min_t(unsigned int, PAGE_SIZE - page_base, remaining); >> + memcpy(dst, src, len); >> + r_xprt->rx_stats.pullup_copy_count += len; >> + >> + ppages++; >> + dst += len; >> + remaining -= len; >> + page_base = 0; >> + } >> +} >> + >> +/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it. >> + * When the head, pagelist, and tail are small, a pull-up copy >> + * is considerably less costly than DMA mapping the components >> + * of @xdr. >> + * >> + * Assumptions: >> + * - the caller has already verified that the total length >> + * of the RPC Call body will fit into @rl_sendbuf. 
>> + */ >> +static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt, >> + struct rpcrdma_req *req, >> + struct xdr_buf *xdr) >> +{ >> + if (unlikely(xdr->tail[0].iov_len)) >> + rpcrdma_pullup_tail_iov(r_xprt, req, xdr); >> + >> + if (unlikely(xdr->page_len)) >> + rpcrdma_pullup_pagelist(r_xprt, req, xdr); >> + >> + /* The whole RPC message resides in the head iovec now */ >> + return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len); >> +} >> + >> static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt, >> struct rpcrdma_req *req, >> struct xdr_buf *xdr) >> @@ -779,7 +845,11 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, >> goto out_unmap; >> switch (rtype) { >> - case rpcrdma_noch: >> + case rpcrdma_noch_pullup: >> + if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr)) >> + goto out_unmap; >> + break; >> + case rpcrdma_noch_mapped: >> if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr)) >> goto out_unmap; >> break; >> @@ -827,6 +897,7 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, >> struct rpcrdma_req *req = rpcr_to_rdmar(rqst); >> struct xdr_stream *xdr = &req->rl_stream; >> enum rpcrdma_chunktype rtype, wtype; >> + struct xdr_buf *buf = &rqst->rq_snd_buf; >> bool ddp_allowed; >> __be32 *p; >> int ret; >> @@ -884,8 +955,9 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, >> */ >> if (rpcrdma_args_inline(r_xprt, rqst)) { >> *p++ = rdma_msg; >> - rtype = rpcrdma_noch; >> - } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { >> + rtype = buf->len < rdmab_length(req->rl_sendbuf) ? 
>> + rpcrdma_noch_pullup : rpcrdma_noch_mapped; >> + } else if (ddp_allowed && buf->flags & XDRBUF_WRITE) { >> *p++ = rdma_msg; >> rtype = rpcrdma_readch; >> } else { >> @@ -927,7 +999,7 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, >> goto out_err; >> ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len, >> - &rqst->rq_snd_buf, rtype); >> + buf, rtype); >> if (ret) >> goto out_err; >> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c >> index 2f46582..a514e2c 100644 >> --- a/net/sunrpc/xprtrdma/verbs.c >> +++ b/net/sunrpc/xprtrdma/verbs.c >> @@ -1165,7 +1165,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) >> for (i = 0; i < buf->rb_max_requests; i++) { >> struct rpcrdma_req *req; >> - req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE, >> + req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2, >> GFP_KERNEL); >> if (!req) >> goto out; >> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h >> index cdd6a3d..5d15140 100644 >> --- a/net/sunrpc/xprtrdma/xprt_rdma.h >> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h >> @@ -554,6 +554,8 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, >> enum rpcrdma_chunktype { >> rpcrdma_noch = 0, >> + rpcrdma_noch_pullup, >> + rpcrdma_noch_mapped, >> rpcrdma_readch, >> rpcrdma_areadch, >> rpcrdma_writech, -- Chuck Lever
On 10/18/2019 7:34 PM, Chuck Lever wrote: > Hi Tom- > >> On Oct 18, 2019, at 4:17 PM, Tom Talpey <tom@talpey.com> wrote: >> >> On 10/17/2019 2:31 PM, Chuck Lever wrote: >>> On some platforms, DMA mapping part of a page is more costly than >>> copying bytes. Restore the pull-up code and use that when we >>> think it's going to be faster. The heuristic for now is to pull-up >>> when the size of the RPC message body fits in the buffer underlying >>> the head iovec. >>> Indeed, not involving the I/O MMU can help the RPC/RDMA transport >>> scale better for tiny I/Os across more RDMA devices. This is because >>> interaction with the I/O MMU is eliminated, as is handling a Send >>> completion, for each of these small I/Os. Without the explicit >>> unmapping, the NIC no longer needs to do a costly internal TLB shoot >>> down for buffers that are just a handful of bytes. >> >> This is good stuff. Do you have any performance data for the new >> strategy, especially latencies and local CPU cycles per byte? > > Saves almost a microsecond of RT latency on my NFS client that uses > a real Intel IOMMU. On my other NFS client, the DMA map operations > are always a no-op. This savings applies only to NFS WRITE, of course. > > I don't have a good benchmark for cycles per byte. Do you have any > suggestions? Not sure how I would account for cycles spent handling > Send completions, for example. Cycles per byte is fairly simple but like all performance measurement the trick is in the setup. Because of platform variations, it's best to compare results on the same hardware. The absolute value isn't as meaningful. Here's a rough sketch of one approach. - Configure BIOS and OS to hold CPU frequency constant: - ACPI C-states off - Turbo mode off - Power management off (OS needs this too) - Anything else relevant to clock variation - Hyperthreading off - (hyperthreads don't add work linearly) - Calculate core count X clock frequency - (e.g. 
8 X 3GHz = 24G cycles/sec) Now, use a benchmark which runs the desired workload and reports %CPU. For a given interval, record the total bytes transferred, time spent, and CPU load. (e.g. 100GB, 100 sec, 20%). Finally, compute CpB (the 1/sec terms cancel out): 20% x 24Gcps = 4.8G cps 100GB / 100s = 1G bps 4.8Gcps / 1 GBps = 4.8cpb Like I said, it's rough, but surprisingly telling. A similar metric is cycles per IOP, and since you're focusing on small i/o with this change, it might also be an interesting calculation. Simply replace total bytes/sec with IOPS. Tom. >>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com> >>> --- >>> net/sunrpc/xprtrdma/backchannel.c | 2 - >>> net/sunrpc/xprtrdma/rpc_rdma.c | 82 +++++++++++++++++++++++++++++++++++-- >>> net/sunrpc/xprtrdma/verbs.c | 2 - >>> net/sunrpc/xprtrdma/xprt_rdma.h | 2 + >>> 4 files changed, 81 insertions(+), 7 deletions(-) >>> diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h >>> index 16572ae7..336a65d 100644 >>> --- a/include/trace/events/rpcrdma.h >>> +++ b/include/trace/events/rpcrdma.h >>> @@ -532,6 +532,8 @@ >>> DEFINE_WRCH_EVENT(reply); >>> TRACE_DEFINE_ENUM(rpcrdma_noch); >>> +TRACE_DEFINE_ENUM(rpcrdma_noch_pullup); >>> +TRACE_DEFINE_ENUM(rpcrdma_noch_mapped); >>> TRACE_DEFINE_ENUM(rpcrdma_readch); >>> TRACE_DEFINE_ENUM(rpcrdma_areadch); >>> TRACE_DEFINE_ENUM(rpcrdma_writech); >>> @@ -540,6 +542,8 @@ >>> #define xprtrdma_show_chunktype(x) \ >>> __print_symbolic(x, \ >>> { rpcrdma_noch, "inline" }, \ >>> + { rpcrdma_noch_pullup, "pullup" }, \ >>> + { rpcrdma_noch_mapped, "mapped" }, \ >>> { rpcrdma_readch, "read list" }, \ >>> { rpcrdma_areadch, "*read list" }, \ >>> { rpcrdma_writech, "write list" }, \ >>> diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c >>> index 50e075f..1168524 100644 >>> --- a/net/sunrpc/xprtrdma/backchannel.c >>> +++ b/net/sunrpc/xprtrdma/backchannel.c >>> @@ -79,7 +79,7 @@ static int rpcrdma_bc_marshal_reply(struct 
rpc_rqst *rqst) >>> *p = xdr_zero; >>> if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN, >>> - &rqst->rq_snd_buf, rpcrdma_noch)) >>> + &rqst->rq_snd_buf, rpcrdma_noch_pullup)) >>> return -EIO; >>> trace_xprtrdma_cb_reply(rqst); >>> diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c >>> index a441dbf..4ad8889 100644 >>> --- a/net/sunrpc/xprtrdma/rpc_rdma.c >>> +++ b/net/sunrpc/xprtrdma/rpc_rdma.c >>> @@ -392,7 +392,7 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, >>> unsigned int pos; >>> int nsegs; >>> - if (rtype == rpcrdma_noch) >>> + if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped) >>> goto done; >>> pos = rqst->rq_snd_buf.head[0].iov_len; >>> @@ -691,6 +691,72 @@ static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req, >>> return false; >>> } >>> +/* Copy the tail to the end of the head buffer. >>> + */ >>> +static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt, >>> + struct rpcrdma_req *req, >>> + struct xdr_buf *xdr) >>> +{ >>> + unsigned char *dst; >>> + >>> + dst = (unsigned char *)xdr->head[0].iov_base; >>> + dst += xdr->head[0].iov_len + xdr->page_len; >>> + memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len); >>> + r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len; >>> +} >>> + >>> +/* Copy pagelist content into the head buffer. 
>>> + */ >>> +static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt, >>> + struct rpcrdma_req *req, >>> + struct xdr_buf *xdr) >>> +{ >>> + unsigned int len, page_base, remaining; >>> + struct page **ppages; >>> + unsigned char *src, *dst; >>> + >>> + dst = (unsigned char *)xdr->head[0].iov_base; >>> + dst += xdr->head[0].iov_len; >>> + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); >>> + page_base = offset_in_page(xdr->page_base); >>> + remaining = xdr->page_len; >>> + while (remaining) { >>> + src = page_address(*ppages); >>> + src += page_base; >>> + len = min_t(unsigned int, PAGE_SIZE - page_base, remaining); >>> + memcpy(dst, src, len); >>> + r_xprt->rx_stats.pullup_copy_count += len; >>> + >>> + ppages++; >>> + dst += len; >>> + remaining -= len; >>> + page_base = 0; >>> + } >>> +} >>> + >>> +/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it. >>> + * When the head, pagelist, and tail are small, a pull-up copy >>> + * is considerably less costly than DMA mapping the components >>> + * of @xdr. >>> + * >>> + * Assumptions: >>> + * - the caller has already verified that the total length >>> + * of the RPC Call body will fit into @rl_sendbuf. 
>>> + */ >>> +static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt, >>> + struct rpcrdma_req *req, >>> + struct xdr_buf *xdr) >>> +{ >>> + if (unlikely(xdr->tail[0].iov_len)) >>> + rpcrdma_pullup_tail_iov(r_xprt, req, xdr); >>> + >>> + if (unlikely(xdr->page_len)) >>> + rpcrdma_pullup_pagelist(r_xprt, req, xdr); >>> + >>> + /* The whole RPC message resides in the head iovec now */ >>> + return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len); >>> +} >>> + >>> static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt, >>> struct rpcrdma_req *req, >>> struct xdr_buf *xdr) >>> @@ -779,7 +845,11 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, >>> goto out_unmap; >>> switch (rtype) { >>> - case rpcrdma_noch: >>> + case rpcrdma_noch_pullup: >>> + if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr)) >>> + goto out_unmap; >>> + break; >>> + case rpcrdma_noch_mapped: >>> if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr)) >>> goto out_unmap; >>> break; >>> @@ -827,6 +897,7 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, >>> struct rpcrdma_req *req = rpcr_to_rdmar(rqst); >>> struct xdr_stream *xdr = &req->rl_stream; >>> enum rpcrdma_chunktype rtype, wtype; >>> + struct xdr_buf *buf = &rqst->rq_snd_buf; >>> bool ddp_allowed; >>> __be32 *p; >>> int ret; >>> @@ -884,8 +955,9 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, >>> */ >>> if (rpcrdma_args_inline(r_xprt, rqst)) { >>> *p++ = rdma_msg; >>> - rtype = rpcrdma_noch; >>> - } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { >>> + rtype = buf->len < rdmab_length(req->rl_sendbuf) ? 
>>> + rpcrdma_noch_pullup : rpcrdma_noch_mapped; >>> + } else if (ddp_allowed && buf->flags & XDRBUF_WRITE) { >>> *p++ = rdma_msg; >>> rtype = rpcrdma_readch; >>> } else { >>> @@ -927,7 +999,7 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, >>> goto out_err; >>> ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len, >>> - &rqst->rq_snd_buf, rtype); >>> + buf, rtype); >>> if (ret) >>> goto out_err; >>> diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c >>> index 2f46582..a514e2c 100644 >>> --- a/net/sunrpc/xprtrdma/verbs.c >>> +++ b/net/sunrpc/xprtrdma/verbs.c >>> @@ -1165,7 +1165,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) >>> for (i = 0; i < buf->rb_max_requests; i++) { >>> struct rpcrdma_req *req; >>> - req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE, >>> + req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2, >>> GFP_KERNEL); >>> if (!req) >>> goto out; >>> diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h >>> index cdd6a3d..5d15140 100644 >>> --- a/net/sunrpc/xprtrdma/xprt_rdma.h >>> +++ b/net/sunrpc/xprtrdma/xprt_rdma.h >>> @@ -554,6 +554,8 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, >>> enum rpcrdma_chunktype { >>> rpcrdma_noch = 0, >>> + rpcrdma_noch_pullup, >>> + rpcrdma_noch_mapped, >>> rpcrdma_readch, >>> rpcrdma_areadch, >>> rpcrdma_writech, > > -- > Chuck Lever > > > > >
> On Oct 19, 2019, at 12:36 PM, Tom Talpey <tom@talpey.com> wrote: > > On 10/18/2019 7:34 PM, Chuck Lever wrote: >> Hi Tom- >>> On Oct 18, 2019, at 4:17 PM, Tom Talpey <tom@talpey.com> wrote: >>> >>> On 10/17/2019 2:31 PM, Chuck Lever wrote: >>>> On some platforms, DMA mapping part of a page is more costly than >>>> copying bytes. Restore the pull-up code and use that when we >>>> think it's going to be faster. The heuristic for now is to pull-up >>>> when the size of the RPC message body fits in the buffer underlying >>>> the head iovec. >>>> Indeed, not involving the I/O MMU can help the RPC/RDMA transport >>>> scale better for tiny I/Os across more RDMA devices. This is because >>>> interaction with the I/O MMU is eliminated, as is handling a Send >>>> completion, for each of these small I/Os. Without the explicit >>>> unmapping, the NIC no longer needs to do a costly internal TLB shoot >>>> down for buffers that are just a handful of bytes. >>> >>> This is good stuff. Do you have any performance data for the new >>> strategy, especially latencies and local CPU cycles per byte? >> Saves almost a microsecond of RT latency on my NFS client that uses >> a real Intel IOMMU. On my other NFS client, the DMA map operations >> are always a no-op. This savings applies only to NFS WRITE, of course. >> I don't have a good benchmark for cycles per byte. Do you have any >> suggestions? Not sure how I would account for cycles spent handling >> Send completions, for example. > > Cycles per byte is fairly simple but like all performance measurement > the trick is in the setup. Because of platform variations, it's best > to compare results on the same hardware. The absolute value isn't as > meaningful. Here's a rough sketch of one approach. 
> > - Configure BIOS and OS to hold CPU frequency constant: > - ACPI C-states off > - Turbo mode off > - Power management off (OS needs this too) > - Anything else relevant to clock variation > - Hyperthreading off > - (hyperthreads don't add work linearly) > - Calculate core count X clock frequency > - (e.g. 8 X 3GHz = 24G cycles/sec) > > Now, use a benchmark which runs the desired workload and reports %CPU. > For a given interval, record the total bytes transferred, time spent, > and CPU load. (e.g. 100GB, 100 sec, 20%). > > Finally, compute CpB (the 1/sec terms cancel out): > 20% x 24Gcps = 4.8G cps > 100GB / 100s = 1G bps > 4.8Gcps / 1 GBps = 4.8cpb > > Like I said, it's rough, but surprisingly telling. A similar metric > is cycles per IOP, and since you're focusing on small i/o with this > change, it might also be an interesting calculation. Simply replace > total bytes/sec with IOPS. Systems under test: • 12 Haswell cores x 1.6GHz = 19.2 billion cps • Server is exporting a tmpfs filesystem • Client and server using CX-3 Pro on 56Gbps InfiniBand • Kernel is v5.4-rc4 • iozone -M -+u -i0 -i1 -s1g -r1k -t12 -I The purpose of this test is to compare the two kernels, not to publish an absolute performance value. Both kernels below have a number of CPU-intensive debugging options enabled, which might tend to increase CPU cycles per byte or per I/O, and might also amplify the differences between the two kernels. 
*** With DMA-mapping kernel (confirmed after test - total pull-up was zero bytes): WRITE tests: • Write test: CPU Utilization: Wall time 496.136 CPU time 812.879 CPU utilization 163.84 % • Re-write test: CPU utilization: Wall time 500.266 CPU time 822.810 CPU utilization 164.47 % Final mountstats results: WRITE: 25161863 ops (50%) avg bytes sent per op: 1172 avg bytes received per op: 136 backlog wait: 0.094913 RTT: 0.048245 total execute time: 0.213270 (milliseconds) Based solely on the iozone Write test: 12 threads x 1GB file = 12 GB transferred 12 GB / 496 s = 25973227 Bps 19.2 billion cps / 25973227 Bps = 740 cpB @ 1KB I/O Based on both the iozone Write and Re-write tests: 25161863 ops / 996 s = 25263 IOps 19.2 billion cps / 25263 IOps = 760004 cpIO READ tests: • Read test: CPU utilization: Wall time 451.762 CPU time 826.888 CPU utilization 183.04 % • Re-read test: CPU utilization: Wall time 452.543 CPU time 827.575 CPU utilization 182.87 % Final mountstats results: READ: 25146066 ops (49%) avg bytes sent per op: 140 avg bytes received per op: 1152 backlog wait: 0.092140 RTT: 0.045202 total execute time: 0.205996 (milliseconds) Based solely on the iozone Read test: 12 threads x 1GB file = 12 GB transferred 12 GB / 451 s = 28569627 Bps 19.2 billion cps / 28569627 Bps = 672 cpB @ 1KB I/O Based on both the iozone Read and Re-read tests: 25146066 ops / 903 s = 27847 IOps 19.2 billion cps / 27847 IOps = 689481 cpIO *** With pull-up kernel (confirmed after test - total pull-up was 25763734528 bytes): WRITE tests: • Write test: CPU Utilization: Wall time 453.318 CPU time 839.581 CPU utilization 185.21 % • Re-write test: CPU utilization: Wall time 458.717 CPU time 850.335 CPU utilization 185.37 % Final mountstats results: WRITE: 25159897 ops (50%) avg bytes sent per op: 1172 avg bytes received per op: 136 backlog wait: 0.080036 RTT: 0.049674 total execute time: 0.183426 (milliseconds) Based solely on the iozone Write test: 12 threads x 1GB file = 12 GB transferred 12 
GB / 453 s = 28443492 Bps 19.2 billion cps / 28443492 Bps = 675 cpB @ 1KB I/O Based on both the iozone Write and Re-write tests: 25159897 ops / 911 s = 27617 IOps 19.2 billion cps / 27617 IOps = 695223 cpIO READ tests: • Read test: CPU utilization: Wall time 451.248 CPU time 834.203 CPU utilization 184.87 % • Re-read test: CPU utilization: Wall time 451.113 CPU time 834.302 CPU utilization 184.94 % Final mountstats results: READ: 25149527 ops (49%) avg bytes sent per op: 140 avg bytes received per op: 1152 backlog wait: 0.091011 RTT: 0.045790 total execute time: 0.203793 (milliseconds) Based solely on the iozone Read test: 12 threads x 1GB file = 12 GB transferred 12 GB / 451 s = 28569627 Bps 19.2 billion cps / 28569627 Bps = 672 cpB @ 1KB I/O Based on both the iozone Read and Re-read tests: 25149527 ops / 902 s = 27881 IOps 19.2 billion cps / 27881 IOps = 688641 cpIO *** Analysis: For both kernels, the READ tests are close. This demonstrates that the patch does not have any gross effects on the READ path, as expected. The WRITE tests are more remarkable. • Mean total execute time per WRITE RPC decreases by about 30 microseconds. Almost half of that is decreased backlog wait. • Mean round-trip time increases by a microsecond and a half. My earlier report that RT decreased by a microsecond was based on a QD=1 direct latency measure. • For 1KB WRITE: IOPS, Cycles per byte written and Cycles per I/O are now within spitting distance of the same metrics for 1KB READ. -- Chuck Lever
On 10/22/2019 12:08 PM, Chuck Lever wrote: > > >> On Oct 19, 2019, at 12:36 PM, Tom Talpey <tom@talpey.com> wrote: >> >> On 10/18/2019 7:34 PM, Chuck Lever wrote: >>> Hi Tom- >>>> On Oct 18, 2019, at 4:17 PM, Tom Talpey <tom@talpey.com> wrote: >>>> >>>> On 10/17/2019 2:31 PM, Chuck Lever wrote: >>>>> On some platforms, DMA mapping part of a page is more costly than >>>>> copying bytes. Restore the pull-up code and use that when we >>>>> think it's going to be faster. The heuristic for now is to pull-up >>>>> when the size of the RPC message body fits in the buffer underlying >>>>> the head iovec. >>>>> Indeed, not involving the I/O MMU can help the RPC/RDMA transport >>>>> scale better for tiny I/Os across more RDMA devices. This is because >>>>> interaction with the I/O MMU is eliminated, as is handling a Send >>>>> completion, for each of these small I/Os. Without the explicit >>>>> unmapping, the NIC no longer needs to do a costly internal TLB shoot >>>>> down for buffers that are just a handful of bytes. >>>> >>>> This is good stuff. Do you have any performance data for the new >>>> strategy, especially latencies and local CPU cycles per byte? >>> Saves almost a microsecond of RT latency on my NFS client that uses >>> a real Intel IOMMU. On my other NFS client, the DMA map operations >>> are always a no-op. This savings applies only to NFS WRITE, of course. >>> I don't have a good benchmark for cycles per byte. Do you have any >>> suggestions? Not sure how I would account for cycles spent handling >>> Send completions, for example. >> >> Cycles per byte is fairly simple but like all performance measurement >> the trick is in the setup. Because of platform variations, it's best >> to compare results on the same hardware. The absolute value isn't as >> meaningful. Here's a rough sketch of one approach. 
>> >> - Configure BIOS and OS to hold CPU frequency constant: >> - ACPI C-states off >> - Turbo mode off >> - Power management off (OS needs this too) >> - Anything else relevant to clock variation >> - Hyperthreading off >> - (hyperthreads don't add work linearly) >> - Calculate core count X clock frequency >> - (e.g. 8 X 3GHz = 24G cycles/sec) >> >> Now, use a benchmark which runs the desired workload and reports %CPU. >> For a given interval, record the total bytes transferred, time spent, >> and CPU load. (e.g. 100GB, 100 sec, 20%). >> >> Finally, compute CpB (the 1/sec terms cancel out): >> 20% x 24Gcps = 4.8G cps >> 100GB / 100s = 1 GBps >> 4.8Gcps / 1 GBps = 4.8cpb >> >> Like I said, it's rough, but surprisingly telling. A similar metric >> is cycles per IOP, and since you're focusing on small i/o with this >> change, it might also be an interesting calculation. Simply replace >> total bytes/sec with IOPS. > > Systems under test: > > • 12 Haswell cores x 1.6GHz = 19.2 billion cps > • Server is exporting a tmpfs filesystem > • Client and server using CX-3 Pro on 56Gbps InfiniBand > • Kernel is v5.4-rc4 > • iozone -M -+u -i0 -i1 -s1g -r1k -t12 -I > > The purpose of this test is to compare the two kernels, not to publish an absolute performance value. Both kernels below have a number of CPU-intensive debugging options enabled, which might tend to increase CPU cycles per byte or per I/O, and might also amplify the differences between the two kernels. > > > > *** With DMA-mapping kernel (confirmed after test - total pull-up was zero bytes): > > WRITE tests: > > • Write test: CPU Utilization: Wall time 496.136 CPU time 812.879 CPU utilization 163.84 % > • Re-write test: CPU utilization: Wall time 500.266 CPU time 822.810 CPU utilization 164.47 % Ah, the math I suggested earlier needs a different approach for these absolute utilizations. The >100% numbers indicate these are per-core, so 164% means 1.64 cores' worth of load. 
The math I suggested is for relative, where the core count is corrected to 100% total. For these, ignore the core count, just take the frequency and multiply by the percent. This means your cpb and cpio numbers are 12X the value. It's no big deal since we're just comparing before/after though. > Final mountstats results: > > WRITE: > 25161863 ops (50%) > avg bytes sent per op: 1172 avg bytes received per op: 136 > backlog wait: 0.094913 RTT: 0.048245 total execute time: 0.213270 (milliseconds) > > Based solely on the iozone Write test: > 12 threads x 1GB file = 12 GB transferred > 12 GB / 496 s = 25973227 Bps > 19.2 billion cps / 25973227 Bps = 740 cpB @ 1KB I/O > > Based on both the iozone Write and Re-write tests: > 25161863 ops / 996 s = 25263 IOps > 19.2 billion cps / 25263 IOps = 760004 cpIO So, 62cpb and 63Kcpio, corrected. Seems a bit high, but as I said earlier, it's hard to compare across platforms. > READ tests: > > • Read test: CPU utilization: Wall time 451.762 CPU time 826.888 CPU utilization 183.04 % > • Re-read test: CPU utilization: Wall time 452.543 CPU time 827.575 CPU utilization 182.87 % > > Final mountstats results: > > READ: > 25146066 ops (49%) > avg bytes sent per op: 140 avg bytes received per op: 1152 > backlog wait: 0.092140 RTT: 0.045202 total execute time: 0.205996 (milliseconds) > > Based solely on the iozone Read test: > 12 threads x 1GB file = 12 GB transferred > 12 GB / 451 s = 28569627 Bps > 19.2 billion cps / 28569627 Bps = 672 cpB @ 1KB I/O > > Based on both the iozone Read and Re-read tests: > 25146066 ops / 903 s = 27847 IOps > 19.2 billion cps / 27847 IOps = 689481 cpIO > > > > *** With pull-up kernel (confirmed after test - total pull-up was 25763734528 bytes): > > WRITE tests: > • Write test: CPU Utilization: Wall time 453.318 CPU time 839.581 CPU utilization 185.21 % > • Re-write test: CPU utilization: Wall time 458.717 CPU time 850.335 CPU utilization 185.37 % > > Final mountstats results: > > WRITE: > 25159897 ops 
(50%) > avg bytes sent per op: 1172 avg bytes received per op: 136 > backlog wait: 0.080036 RTT: 0.049674 total execute time: 0.183426 (milliseconds) > > Based solely on the iozone Write test: > 12 threads x 1GB file = 12 GB transferred > 12 GB / 453 s = 28443492 Bps > 19.2 billion cps / 28443492 Bps = 675 cpB @ 1KB I/O > > Based on both the iozone Write and Re-write tests: > 25159897 ops / 911 s = 27617 IOps > 19.2 billion cps / 27617 IOps = 695223 cpIO > > > READ tests: > > • Read test: CPU utilization: Wall time 451.248 CPU time 834.203 CPU utilization 184.87 % > • Re-read test: CPU utilization: Wall time 451.113 CPU time 834.302 CPU utilization 184.94 % > > Final mountstats results: > > READ: > 25149527 ops (49%) > avg bytes sent per op: 140 avg bytes received per op: 1152 > backlog wait: 0.091011 RTT: 0.045790 total execute time: 0.203793 (milliseconds) > > Based solely on the iozone Read test: > 12 threads x 1GB file = 12 GB transferred > 12 GB / 451 s = 28569627 Bps > 19.2 billion cps / 28569627 Bps = 672 cpB @ 1KB I/O > > Based on both the iozone Read and Re-read tests: > 25149527 ops / 902 s = 27881 IOps > 19.2 billion cps / 27881 IOps = 688641 cpIO > > > > *** Analysis: > > For both kernels, the READ tests are close. This demonstrates that the patch does not have any gross effects on the READ path, as expected. Well, close, but noticeably lower in the pullup approach. This bears out your suspicion that the IOMMU is not a trivial cost. It's also likely to be a single-threading point, which will cause queuing as well as overhead. Of course, the cost may be worth it, for security, or scatter/gather optimization, etc. But for small i/o, the pullup is a simple and effective approach. > The WRITE tests are more remarkable. > • Mean total execute time per WRITE RPC decreases by about 30 microseconds. Almost half of that is decreased backlog wait. > • Mean round-trip time increases by a microsecond and a half. 
My earlier report that RT decreased by a microsecond was based on a QD=1 direct latency measure. > • For 1KB WRITE: IOPS, Cycles per byte written and Cycles per I/O are now within spitting distance of the same metrics for 1KB READ. Excellent result. The backlog wait may be in part avoiding the IOMMU programming, and queuing on it before mapping. Bringing the numbers close to READ corroborates this. I assume you are using RoCE or IB, which doesn't require remote registration of the source buffers? The pullup would improve those even more. Curious that the RTT goes up by such a large number. It seems really high. This might become a noticable penalty on single-threaded (metadata) workloads. Tom.
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 16572ae7..336a65d 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -532,6 +532,8 @@ DEFINE_WRCH_EVENT(reply); TRACE_DEFINE_ENUM(rpcrdma_noch); +TRACE_DEFINE_ENUM(rpcrdma_noch_pullup); +TRACE_DEFINE_ENUM(rpcrdma_noch_mapped); TRACE_DEFINE_ENUM(rpcrdma_readch); TRACE_DEFINE_ENUM(rpcrdma_areadch); TRACE_DEFINE_ENUM(rpcrdma_writech); @@ -540,6 +542,8 @@ #define xprtrdma_show_chunktype(x) \ __print_symbolic(x, \ { rpcrdma_noch, "inline" }, \ + { rpcrdma_noch_pullup, "pullup" }, \ + { rpcrdma_noch_mapped, "mapped" }, \ { rpcrdma_readch, "read list" }, \ { rpcrdma_areadch, "*read list" }, \ { rpcrdma_writech, "write list" }, \ diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 50e075f..1168524 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -79,7 +79,7 @@ static int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) *p = xdr_zero; if (rpcrdma_prepare_send_sges(r_xprt, req, RPCRDMA_HDRLEN_MIN, - &rqst->rq_snd_buf, rpcrdma_noch)) + &rqst->rq_snd_buf, rpcrdma_noch_pullup)) return -EIO; trace_xprtrdma_cb_reply(rqst); diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index a441dbf..4ad8889 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -392,7 +392,7 @@ static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt, unsigned int pos; int nsegs; - if (rtype == rpcrdma_noch) + if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped) goto done; pos = rqst->rq_snd_buf.head[0].iov_len; @@ -691,6 +691,72 @@ static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req, return false; } +/* Copy the tail to the end of the head buffer. 
+ */ +static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + unsigned char *dst; + + dst = (unsigned char *)xdr->head[0].iov_base; + dst += xdr->head[0].iov_len + xdr->page_len; + memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len); + r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len; +} + +/* Copy pagelist content into the head buffer. + */ +static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + unsigned int len, page_base, remaining; + struct page **ppages; + unsigned char *src, *dst; + + dst = (unsigned char *)xdr->head[0].iov_base; + dst += xdr->head[0].iov_len; + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + page_base = offset_in_page(xdr->page_base); + remaining = xdr->page_len; + while (remaining) { + src = page_address(*ppages); + src += page_base; + len = min_t(unsigned int, PAGE_SIZE - page_base, remaining); + memcpy(dst, src, len); + r_xprt->rx_stats.pullup_copy_count += len; + + ppages++; + dst += len; + remaining -= len; + page_base = 0; + } +} + +/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it. + * When the head, pagelist, and tail are small, a pull-up copy + * is considerably less costly than DMA mapping the components + * of @xdr. + * + * Assumptions: + * - the caller has already verified that the total length + * of the RPC Call body will fit into @rl_sendbuf. 
+ */ +static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_req *req, + struct xdr_buf *xdr) +{ + if (unlikely(xdr->tail[0].iov_len)) + rpcrdma_pullup_tail_iov(r_xprt, req, xdr); + + if (unlikely(xdr->page_len)) + rpcrdma_pullup_pagelist(r_xprt, req, xdr); + + /* The whole RPC message resides in the head iovec now */ + return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len); +} + static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, struct xdr_buf *xdr) @@ -779,7 +845,11 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, goto out_unmap; switch (rtype) { - case rpcrdma_noch: + case rpcrdma_noch_pullup: + if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr)) + goto out_unmap; + break; + case rpcrdma_noch_mapped: if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr)) goto out_unmap; break; @@ -827,6 +897,7 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct xdr_stream *xdr = &req->rl_stream; enum rpcrdma_chunktype rtype, wtype; + struct xdr_buf *buf = &rqst->rq_snd_buf; bool ddp_allowed; __be32 *p; int ret; @@ -884,8 +955,9 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, */ if (rpcrdma_args_inline(r_xprt, rqst)) { *p++ = rdma_msg; - rtype = rpcrdma_noch; - } else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) { + rtype = buf->len < rdmab_length(req->rl_sendbuf) ? 
+ rpcrdma_noch_pullup : rpcrdma_noch_mapped; + } else if (ddp_allowed && buf->flags & XDRBUF_WRITE) { *p++ = rdma_msg; rtype = rpcrdma_readch; } else { @@ -927,7 +999,7 @@ inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt, goto out_err; ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len, - &rqst->rq_snd_buf, rtype); + buf, rtype); if (ret) goto out_err; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 2f46582..a514e2c 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -1165,7 +1165,7 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; - req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE, + req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2, GFP_KERNEL); if (!req) goto out; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index cdd6a3d..5d15140 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -554,6 +554,8 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, enum rpcrdma_chunktype { rpcrdma_noch = 0, + rpcrdma_noch_pullup, + rpcrdma_noch_mapped, rpcrdma_readch, rpcrdma_areadch, rpcrdma_writech,
On some platforms, DMA mapping part of a page is more costly than copying bytes. Restore the pull-up code and use that when we think it's going to be faster. The heuristic for now is to pull-up when the size of the RPC message body fits in the buffer underlying the head iovec. Indeed, not involving the I/O MMU can help the RPC/RDMA transport scale better for tiny I/Os across more RDMA devices. This is because interaction with the I/O MMU is eliminated, as is handling a Send completion, for each of these small I/Os. Without the explicit unmapping, the NIC no longer needs to do a costly internal TLB shoot down for buffers that are just a handful of bytes. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> --- net/sunrpc/xprtrdma/backchannel.c | 2 - net/sunrpc/xprtrdma/rpc_rdma.c | 82 +++++++++++++++++++++++++++++++++++-- net/sunrpc/xprtrdma/verbs.c | 2 - net/sunrpc/xprtrdma/xprt_rdma.h | 2 + 4 files changed, 81 insertions(+), 7 deletions(-)