@@ -1639,6 +1639,24 @@ TRACE_EVENT(svcrdma_dma_map_rwctx,
)
);
+TRACE_EVENT(svcrdma_send_pullup,
+ TP_PROTO(
+ unsigned int len
+ ),
+
+ TP_ARGS(len),
+
+ TP_STRUCT__entry(
+ __field(unsigned int, len)
+ ),
+
+ TP_fast_assign(
+ __entry->len = len;
+ ),
+
+ TP_printk("len=%u", __entry->len)
+);
+
TRACE_EVENT(svcrdma_send_failed,
TP_PROTO(
const struct svc_rqst *rqst,
@@ -539,6 +539,7 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
/**
* svc_rdma_pull_up_needed - Determine whether to use pull-up
* @rdma: controlling transport
+ * @sctxt: send_ctxt for the Send WR
* @rctxt: Write and Reply chunks provided by client
* @xdr: xdr_buf containing RPC message to transmit
*
@@ -547,11 +548,22 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma,
* %false otherwise
*/
static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma,
+ struct svc_rdma_send_ctxt *sctxt,
const struct svc_rdma_recv_ctxt *rctxt,
struct xdr_buf *xdr)
{
int elements;
+ /* For small messages, copying bytes is cheaper than DMA
+ * mapping.
+ */
+ if (sctxt->sc_hdrbuf.len + xdr->len <
+ RPCRDMA_V1_DEF_INLINE_SIZE >> 1)
+ return true;
+
+ /* Check whether the xdr_buf has more elements than can
+ * fit in a single RDMA Send.
+ */
/* xdr->head */
elements = 1;
@@ -634,6 +646,7 @@ static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma,
memcpy(dst, tailbase, taillen);
sctxt->sc_sges[0].length += xdr->len;
+ trace_svcrdma_send_pullup(sctxt->sc_sges[0].length);
return 0;
}
@@ -667,7 +680,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma,
if (rctxt && rctxt->rc_reply_chunk)
return 0;
- if (svc_rdma_pull_up_needed(rdma, rctxt, xdr))
+ if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr))
return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr);
++sctxt->sc_cur_sge_no;
On some platforms, DMA mapping part of a page is more costly than copying bytes. Indeed, not involving the I/O MMU can help the RPC/RDMA transport scale better for tiny I/Os across more RDMA devices. This is because interaction with the I/O MMU is eliminated for each of these small I/Os. Without the explicit unmapping, the NIC no longer needs to do a costly internal TLB shoot down for buffers that are just a handful of bytes. Since pull-up is now a more frequent operation, I've introduced a trace point in the pull-up path. It can be used for debugging or user-space tools that count pull-up frequency. Signed-off-by: Chuck Lever <chuck.lever@oracle.com> --- include/trace/events/rpcrdma.h | 18 ++++++++++++++++++ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 15 ++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-)