diff mbox

[RFC/PATCH,v2] IPoIB: Leave space in skb linear buffer for IP headers

Message ID 1365109940-22916-1-git-send-email-roland@kernel.org (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Roland Dreier April 4, 2013, 9:12 p.m. UTC
From: Roland Dreier <roland@purestorage.com>

Markus Stockhausen <markus.stockhausen@gmx.de> noticed that IPoIB was
spending significant time doing memcpy() in __pskb_pull_tail().  He
found that this is because his adapter reports a maximum MTU of 4K,
which causes IPoIB datagram mode to receive all the actual data in a
separate page in the fragment list.

We're already allocating extra tailroom for the skb linear part, so we
might as well use it.

Cc: Eric Dumazet <edumazet@google.com>
Reported-by: Markus Stockhausen <markus.stockhausen@gmx.de>
Signed-off-by: Roland Dreier <roland@purestorage.com>
---
v2: Try to handle the case where we get all the data in the linear part
of the skb and don't need the frag part at all.

 drivers/infiniband/ulp/ipoib/ipoib.h    |  3 ++-
 drivers/infiniband/ulp/ipoib/ipoib_ib.c | 17 ++++++-----------
 2 files changed, 8 insertions(+), 12 deletions(-)

Comments

Markus Stockhausen April 5, 2013, 6:42 p.m. UTC | #1
>From: Roland Dreier <roland@purestorage.com>
>
>Markus Stockhausen <markus.stockhausen@gmx.de> noticed that IPoIB was
>spending significant time doing memcpy() in __pskb_pull_tail().  He
>found that this is because his adapter reports a maximum MTU of 4K,
>which causes IPoIB datagram mode to receive all the actual data in a
>separate page in the fragment list.
>
>We're already allocating extra tailroom for the skb linear part, so we
>might as well use it.
>
>Cc: Eric Dumazet <edumazet@google.com>
>Reported-by: Markus Stockhausen <markus.stockhausen@gmx.de>
>Signed-off-by: Roland Dreier <roland@purestorage.com>
>---
>v2: Try to handle the case where we get all the data in the linear part
>of the skb and don't need the frag part at all.
>
> drivers/infiniband/ulp/ipoib/ipoib.h    |  3 ++-
> drivers/infiniband/ulp/ipoib/ipoib_ib.c | 17 ++++++-----------
> 2 files changed, 8 insertions(+), 12 deletions(-)
>
>diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h
>b/drivers/infiniband/ulp/ipoib/ipoib.h
>index eb71aaa..ab2cc4c 100644
>--- a/drivers/infiniband/ulp/ipoib/ipoib.h
>+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
>@@ -64,7 +64,8 @@ enum ipoib_flush_level {
> enum {
>     IPOIB_ENCAP_LEN          = 4,
> 
>-    IPOIB_UD_HEAD_SIZE      = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
>+    /* add 128 bytes of tailroom for IP/TCP headers */
>+    IPOIB_UD_HEAD_SIZE      = IB_GRH_BYTES + IPOIB_ENCAP_LEN + 128,
>     IPOIB_UD_RX_SG          = 2, /* max buffer needed for 4K mtu */
> 
>     IPOIB_CM_MTU          = 0x10000 - 0x10, /* padding to align header
>to 16 */
>diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>index 2cfa76f..ecf4faf 100644
>--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>@@ -109,11 +109,12 @@ static void ipoib_ud_skb_put_frags(struct
>ipoib_dev_priv *priv,
>                    struct sk_buff *skb,
>                    unsigned int length)
> {
>-    if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
>+    if (ipoib_ud_need_sg(priv->max_ib_mtu) &&
>+        length > IPOIB_UD_HEAD_SIZE) {
>         skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
>         unsigned int size;
>         /*
>-         * There is only two buffers needed for max_payload = 4K,
>+         * There are only two buffers needed for max_payload = 4K,
>          * first buf size is IPOIB_UD_HEAD_SIZE
>          */
>         skb->tail += IPOIB_UD_HEAD_SIZE;
>@@ -156,18 +157,12 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct
>net_device *dev, int id)
>     struct ipoib_dev_priv *priv = netdev_priv(dev);
>     struct sk_buff *skb;
>     int buf_size;
>-    int tailroom;
>     u64 *mapping;
> 
>-    if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
>-        buf_size = IPOIB_UD_HEAD_SIZE;
>-        tailroom = 128; /* reserve some tailroom for IP/TCP headers */
>-    } else {
>-        buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
>-        tailroom = 0;
>-    }
>+    buf_size = ipoib_ud_need_sg(priv->max_ib_mtu) ?
>+        IPOIB_UD_HEAD_SIZE : IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
> 
>-    skb = dev_alloc_skb(buf_size + tailroom + 4);
>+    skb = dev_alloc_skb(buf_size + 4);
>     if (unlikely(!skb))
>         return NULL;
> 
>-- 
>1.8.1.2
>

Hello,

this patch is better than the first one. No more lockups of the server.
Nevertheless I'm sorry to tell you that we are not finished yet. After
running some promising netperf/netserver tests I switched over to NFS.
The machine behaves like working with an active handbrake. Throughput
varies and sometimes communication is totally stopped. dmesg shows some
very ugly output.

[39604.701231] RPC: multiple fragments per record not supported
[39604.706444] RPC: multiple fragments per record not supported
[39604.720131] RPC: multiple fragments per record not supported
[39604.720172] rpc-srv/tcp: nfsd: got error -32 when sending 140
               bytes - shutting down socket
[39604.750376] RPC: multiple fragments per record not supported
[39604.767136] RPC: multiple fragments per record not supported
[39604.803122] RPC: multiple fragments per record not supported
[39604.836599] RPC: multiple fragments per record not supported
[39604.846579] RPC: multiple fragments per record not supported
[39604.854730] RPC: multiple fragments per record not supported
[39604.862385] RPC: multiple fragments per record not supported
[39604.911370] rpc-srv/tcp: nfsd: got error -32 when sending 140
               bytes - shutting down socket
[39607.278661] rpc-srv/tcp: nfsd: got error -32 when sending 140
               bytes - shutting down socket
[39629.669290] receive_cb_reply: Got unrecognized reply: calldir
               0x1 xpt_bc_xprt           (null) xid 50916007
[39629.669303] net_ratelimit: 98 callbacks suppressed
[39629.669306] RPC: multiple fragments per record not supported


Maybe you know what problem is left.

Thank you very much.

Markus.

P.S. I did some more comprehensive tests on my Intel L5420 NFS
DDR Infiniband Server with my initial netperf/netserver scenario
and the dirty trick forcing the device max MTU to 3K. With a
powerful client throughput towards the server raised from 5Gbit/s
to 10Gbit/s. NFS large writes (receive operations) raised from
350MByte/s to 510MByte/s.

So if you have any chance to build a patch that uses fragments
depending on the currently set MTU, people with older 2K switches
will be real winners.


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Roland Dreier April 5, 2013, 8:44 p.m. UTC | #2
On Fri, Apr 5, 2013 at 11:42 AM, Markus Stockhausen
<markus.stockhausen@gmx.de> wrote:
> this patch is better than the first one. No more lockups of the server.
> Nevertheless I'm sorry to tell you that we are not finished yet. After
> running some promising netperf/netserver tests I switched over to NFS.
> The machine behaves like working with an active handbrake. Throughput
> varies and sometimes communication is totally stopped. dmesg shows some
> very ugly output.

Thanks.  I guess I'll have to set up a testbed and debug this myself.
I'll try to work on that this weekend.

 - R.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Roland Dreier April 8, 2013, 3:59 p.m. UTC | #3
On Fri, Apr 5, 2013 at 1:44 PM, Roland Dreier <roland@kernel.org> wrote:
> Thanks.  I guess I'll have to set up a testbed and debug this myself.
> I'll try to work on that this weekend.

So I was able to reproduce the problem (or at least a problem) with
netpipe.  Doing "NPtcp -i -2" hit bad data pretty quickly (where the
receiver didn't get the same data as the sender sent).

It turns out that the problem is with packets that don't go into the
fragment page ... with my patch applied, ipoib sends skbs up the stack
that have an empty page in the fragment list, and this apparently
messes something up in the TCP/IP code.

I have a hacky version that clears the frag list if the page isn't
used, and that seems to work fine.  However I want to clean it up; I
should have a good version later today.

 - R.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Roland Dreier April 8, 2013, 4:50 p.m. UTC | #4
On Mon, Apr 8, 2013 at 9:44 AM, Eric Dumazet <edumazet@google.com> wrote:
> An empty page in the frag list ? You mean a frag with a zero length ?

Yep... the code would do

    skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE);

but then later on it did the equivalent of

    skb_frag_size_set(frag, 0);
    skb->truesize += PAGE_SIZE;

and somehow when data is flowing both directions with small packets,
that would lead the RX-side handling to return corrupt data to
userspace.

Not sure if it's worth figuring this out entirely ... the more
efficient way for ipoib to do things is to reuse that page for next
time, not pass it up the stack unused.

 - R.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Markus Stockhausen April 8, 2013, 7:03 p.m. UTC | #5
Am 08.04.13 18:50 schrieb "Roland Dreier" unter <roland@kernel.org>:

>On Mon, Apr 8, 2013 at 9:44 AM, Eric Dumazet <edumazet@google.com> wrote:
>> An empty page in the frag list ? You mean a frag with a zero length ?
>
>Yep... the code would do
>
>    skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE);
>
>but then later on it did the equivalent of
>
>    skb_frag_size_set(frag, 0);
>    skb->truesize += PAGE_SIZE;
>
>and somehow when data is flowing both directions with small packets,
>that would lead the RX-side handling to return corrupt data to
>userspace.
>
>Not sure if it's worth figuring this out entirely ... the more
>efficient way for ipoib to do things is to reuse that page for next
>time, not pass it up the stack unused.
>
> - R.

First I thought perfect memory allocation inside ipoib_alloc_rx_skb()
could be realized by passing the received packet size from function
ipoib_ib_handle_rx_wc(). But I'm unable to estimate the requirements
of the similar call inside ipoib_ib_post_receives().

Nevertheless thank you very much for spending time on this.

Markus


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Roland Dreier April 8, 2013, 10:04 p.m. UTC | #6
On Mon, Apr 8, 2013 at 12:03 PM, Markus Stockhausen
<markus.stockhausen@gmx.de> wrote:
> First I thought perfect memory allocation inside ipoib_alloc_rx_skb()
> could be realized by passing the received packet size from function
> ipoib_ib_handle_rx_wc(). But I'm unable to estimate the requirements
> of the similar call inside ipoib_ib_post_receives().

The problem is that we have to allocate the receive buffer before we
know how big the packet we're going to receive is.

Anyway, see my latest patch.  I'm pretty sure it should work -- and I
hope the performance is reasonable.  Your approach of hacking the max
mtu avoids the fragmented skb completely, whereas I'm just making sure
the TCP/IP headers are in the linear part.

 - R.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index eb71aaa..ab2cc4c 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -64,7 +64,8 @@  enum ipoib_flush_level {
 enum {
 	IPOIB_ENCAP_LEN		  = 4,
 
-	IPOIB_UD_HEAD_SIZE	  = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
+	/* add 128 bytes of tailroom for IP/TCP headers */
+	IPOIB_UD_HEAD_SIZE	  = IB_GRH_BYTES + IPOIB_ENCAP_LEN + 128,
 	IPOIB_UD_RX_SG		  = 2, /* max buffer needed for 4K mtu */
 
 	IPOIB_CM_MTU		  = 0x10000 - 0x10, /* padding to align header to 16 */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 2cfa76f..ecf4faf 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -109,11 +109,12 @@  static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv,
 				   struct sk_buff *skb,
 				   unsigned int length)
 {
-	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+	if (ipoib_ud_need_sg(priv->max_ib_mtu) &&
+	    length > IPOIB_UD_HEAD_SIZE) {
 		skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
 		unsigned int size;
 		/*
-		 * There is only two buffers needed for max_payload = 4K,
+		 * There are only two buffers needed for max_payload = 4K,
 		 * first buf size is IPOIB_UD_HEAD_SIZE
 		 */
 		skb->tail += IPOIB_UD_HEAD_SIZE;
@@ -156,18 +157,12 @@  static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct sk_buff *skb;
 	int buf_size;
-	int tailroom;
 	u64 *mapping;
 
-	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
-		buf_size = IPOIB_UD_HEAD_SIZE;
-		tailroom = 128; /* reserve some tailroom for IP/TCP headers */
-	} else {
-		buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
-		tailroom = 0;
-	}
+	buf_size = ipoib_ud_need_sg(priv->max_ib_mtu) ?
+		IPOIB_UD_HEAD_SIZE : IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
 
-	skb = dev_alloc_skb(buf_size + tailroom + 4);
+	skb = dev_alloc_skb(buf_size + 4);
 	if (unlikely(!skb))
 		return NULL;