diff mbox

ib_ipoib: Scatter-Gather support in connected mode

Message ID 1422357682-8934-1-git-send-email-yuval.shaia@oracle.com (mailing list archive)
State Rejected
Headers show

Commit Message

Yuval Shaia Jan. 27, 2015, 11:21 a.m. UTC
With this fix Scatter-Gather will be supported also in connected mode

Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
---
 drivers/infiniband/ulp/ipoib/ipoib.h      |    8 +--
 drivers/infiniband/ulp/ipoib/ipoib_cm.c   |  107 +++++++++++++++++++++++------
 drivers/infiniband/ulp/ipoib/ipoib_ib.c   |    3 +-
 drivers/infiniband/ulp/ipoib/ipoib_main.c |    3 +-
 4 files changed, 91 insertions(+), 30 deletions(-)

Comments

Yann Droneaud Jan. 28, 2015, 11:36 a.m. UTC | #1
Hi,

Le mardi 27 janvier 2015 à 03:21 -0800, Yuval Shaia a écrit :
> With this fix Scatter-Gather will be supported also in connected mode
> 

Please explain the issue with current kernel and the advantage
of the proposed fix in the commit message. Perhaps benchmarks
against various HCAs would be welcome.

> Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
> ---
>  drivers/infiniband/ulp/ipoib/ipoib.h      |    8 +--
>  drivers/infiniband/ulp/ipoib/ipoib_cm.c   |  107 +++++++++++++++++++++++------
>  drivers/infiniband/ulp/ipoib/ipoib_ib.c   |    3 +-
>  drivers/infiniband/ulp/ipoib/ipoib_main.c |    3 +-
>  4 files changed, 91 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
> index d7562be..fb42834 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib.h
> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> @@ -170,11 +170,6 @@ struct ipoib_tx_buf {
>  	u64		mapping[MAX_SKB_FRAGS + 1];
>  };
>  
> -struct ipoib_cm_tx_buf {
> -	struct sk_buff *skb;
> -	u64		mapping;
> -};
> -
>  struct ib_cm_id;
>  
>  struct ipoib_cm_data {
> @@ -233,7 +228,7 @@ struct ipoib_cm_tx {
>  	struct net_device   *dev;
>  	struct ipoib_neigh  *neigh;
>  	struct ipoib_path   *path;
> -	struct ipoib_cm_tx_buf *tx_ring;
> +	struct ipoib_tx_buf *tx_ring;
>  	unsigned	     tx_head;
>  	unsigned	     tx_tail;
>  	unsigned long	     flags;
> @@ -496,6 +491,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush);
>  
>  void ipoib_mcast_dev_down(struct net_device *dev);
>  void ipoib_mcast_dev_flush(struct net_device *dev);
> +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
>  
>  #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
>  struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> index 933efce..056e4d2 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> @@ -88,6 +88,31 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
>  		ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
>  }
>  
> +static void ipoib_cm_dma_unmap_tx(struct ipoib_dev_priv *priv,
> +				  struct ipoib_tx_buf *tx_req)
> +{
> +	struct sk_buff *skb;
> +	int i, offs;
> +
> +	skb = tx_req->skb;
> +	if (skb_shinfo(skb)->nr_frags) {
> +		offs = 0;
> +		if (skb_headlen(skb)) {
> +			ib_dma_unmap_single(priv->ca, tx_req->mapping[0],
> +					    skb_headlen(skb), DMA_TO_DEVICE);
> +			offs = 1;
> +		}
> +		for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
> +			const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
> +
> +			ib_dma_unmap_single(priv->ca, tx_req->mapping[i + offs],
> +					    skb_frag_size(frag), DMA_TO_DEVICE);
> +		}
> +	} else
> +		ib_dma_unmap_single(priv->ca, tx_req->mapping[0], skb->len,
> +				    DMA_TO_DEVICE);
> +}
> +
>  static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
>  {
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
> @@ -707,11 +732,39 @@ static inline int post_send(struct ipoib_dev_priv *priv,
>  	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
>  }
>  
> +static inline int post_send_sg(struct ipoib_dev_priv *priv,
> +			       struct ipoib_cm_tx *tx,
> +			       unsigned int wr_id,
> +			       struct sk_buff *skb,
> +			       u64 mapping[MAX_SKB_FRAGS + 1])
> +{
> +	struct ib_send_wr *bad_wr;
> +	int i, off;
> +	skb_frag_t *frags = skb_shinfo(skb)->frags;
> +	int nr_frags = skb_shinfo(skb)->nr_frags;
> +
> +	if (skb_headlen(skb)) {
> +		priv->tx_sge[0].addr = mapping[0];
> +		priv->tx_sge[0].length = skb_headlen(skb);
> +		off = 1;
> +	} else
> +		off = 0;
> +
> +	for (i = 0; i < nr_frags; ++i) {
> +		priv->tx_sge[i + off].addr = mapping[i + off];
> +		priv->tx_sge[i + off].length = frags[i].size;
> +	}
> +	priv->tx_wr.num_sge = nr_frags + off;
> +	priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
> +
> +	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
> +}
> +
>  void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
>  {
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
> -	struct ipoib_cm_tx_buf *tx_req;
> -	u64 addr;
> +	struct ipoib_tx_buf *tx_req;
> +	u64 addr = 0;
>  	int rc;
>  
>  	if (unlikely(skb->len > tx->mtu)) {
> @@ -735,24 +788,37 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
>  	 */
>  	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
>  	tx_req->skb = skb;
> -	addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
> -	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
> -		++dev->stats.tx_errors;
> -		dev_kfree_skb_any(skb);
> -		return;
> -	}
>  
> -	tx_req->mapping = addr;
> +	if (skb_shinfo(skb)->nr_frags) {
> +		if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
> +			++dev->stats.tx_errors;
> +			dev_kfree_skb_any(skb);
> +			return;
> +		}
> +		rc = post_send_sg(priv, tx, tx->tx_head &
> +				  (ipoib_sendq_size - 1),
> +				  skb, tx_req->mapping);
> +	} else {
> +		addr = ib_dma_map_single(priv->ca, skb->data, skb->len,
> +					 DMA_TO_DEVICE);
> +		if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
> +			++dev->stats.tx_errors;
> +			dev_kfree_skb_any(skb);
> +			return;
> +		}
> +
> +		tx_req->mapping[0] = addr;
>  
> -	skb_orphan(skb);
> -	skb_dst_drop(skb);
> +		skb_orphan(skb);
> +		skb_dst_drop(skb);
>  
> -	rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
> -		       addr, skb->len);
> +		rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
> +			       addr, skb->len);
> +	}
>  	if (unlikely(rc)) {
>  		ipoib_warn(priv, "post_send failed, error %d\n", rc);
>  		++dev->stats.tx_errors;
> -		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
> +		ipoib_cm_dma_unmap_tx(priv, tx_req);
>  		dev_kfree_skb_any(skb);
>  	} else {
>  		dev->trans_start = jiffies;
> @@ -777,7 +843,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
>  	struct ipoib_cm_tx *tx = wc->qp->qp_context;
>  	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
> -	struct ipoib_cm_tx_buf *tx_req;
> +	struct ipoib_tx_buf *tx_req;
>  	unsigned long flags;
>  
>  	ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
> @@ -791,7 +857,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
>  
>  	tx_req = &tx->tx_ring[wr_id];
>  
> -	ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
> +	ipoib_cm_dma_unmap_tx(priv, tx_req);
>  
>  	/* FIXME: is this right? Shouldn't we only increment on success? */
>  	++dev->stats.tx_packets;
> @@ -1036,6 +1102,9 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_
>  
>  	struct ib_qp *tx_qp;
>  
> +	if (dev->features & NETIF_F_SG)
> +		attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
> +
>  	tx_qp = ib_create_qp(priv->pd, &attr);
>  	if (PTR_ERR(tx_qp) == -EINVAL) {
>  		ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n",
> @@ -1170,7 +1239,7 @@ err_tx:
>  static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
>  {
>  	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
> -	struct ipoib_cm_tx_buf *tx_req;
> +	struct ipoib_tx_buf *tx_req;
>  	unsigned long begin;
>  
>  	ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
> @@ -1197,8 +1266,7 @@ timeout:
>  
>  	while ((int) p->tx_tail - (int) p->tx_head < 0) {
>  		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
> -		ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
> -				    DMA_TO_DEVICE);
> +		ipoib_cm_dma_unmap_tx(priv, tx_req);
>  		dev_kfree_skb_any(tx_req->skb);
>  		++p->tx_tail;
>  		netif_tx_lock_bh(p->dev);
> @@ -1455,7 +1523,6 @@ static void ipoib_cm_stale_task(struct work_struct *work)
>  	spin_unlock_irq(&priv->lock);
>  }
>  
> -

No need to remove this line.

>  static ssize_t show_mode(struct device *d, struct device_attribute *attr,
>  			 char *buf)
>  {
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> index 72626c3..9e11447 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> @@ -312,8 +312,7 @@ repost:
>  			   "for buf %d\n", wr_id);
>  }
>  
> -static int ipoib_dma_map_tx(struct ib_device *ca,
> -			    struct ipoib_tx_buf *tx_req)
> +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
>  {
>  	struct sk_buff *skb = tx_req->skb;
>  	u64 *mapping = tx_req->mapping;
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> index 58b5aa3..f9314c6 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> @@ -190,7 +190,7 @@ static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_featu
>  	struct ipoib_dev_priv *priv = netdev_priv(dev);
>  
>  	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
> -		features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO);
> +		features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
>  
>  	return features;
>  }
> @@ -233,7 +233,6 @@ int ipoib_set_mode(struct net_device *dev, const char *buf)
>  			   "will cause multicast packet drops\n");
>  		netdev_update_features(dev);
>  		rtnl_unlock();
> -		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;

Sure about this removal ? I don't see it related to scatter-gather.

>  
>  		ipoib_flush_paths(dev);
>  		rtnl_lock();

Regards.
Yuval Shaia Feb. 1, 2015, 7:09 a.m. UTC | #2
On Wed, Jan 28, 2015 at 12:36:27PM +0100, Yann Droneaud wrote:
> Hi,
> 
> Le mardi 27 janvier 2015 à 03:21 -0800, Yuval Shaia a écrit :
> > With this fix Scatter-Gather will be supported also in connected mode
> > 
> 
> Please explain the issue with current kernel and the advantage
> of the proposed fix in the commit message. Perhaps benchmarks
> against various HCAs would be welcome.
The benefits of using SG are well known.
I think the question should be the other way around - why not supporting SG in CM?

I will add the following text to commit message:
By default, IPoIB-CM driver uses 64k MTU. Larger MTU gives better performance.
This MTU plus overhead puts the memory allocation for IP based packets at 32 4k pages (order 5), which have to be contiguous.
When the system memory under pressure, it was observed that allocating 128k contiguous physical memory is difficult and causes serious errors (such as system becomes unusable).
This enhancement resolve the issue by removing the physically contiguous memory requirement using Scatter/Gather feature that exists in Linux stack.
> 
> > Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
> > ---
> >  drivers/infiniband/ulp/ipoib/ipoib.h      |    8 +--
> >  drivers/infiniband/ulp/ipoib/ipoib_cm.c   |  107 +++++++++++++++++++++++------
> >  drivers/infiniband/ulp/ipoib/ipoib_ib.c   |    3 +-
> >  drivers/infiniband/ulp/ipoib/ipoib_main.c |    3 +-
> >  4 files changed, 91 insertions(+), 30 deletions(-)
> > 
> > diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
> > index d7562be..fb42834 100644
> > --- a/drivers/infiniband/ulp/ipoib/ipoib.h
> > +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> > @@ -170,11 +170,6 @@ struct ipoib_tx_buf {
> >  	u64		mapping[MAX_SKB_FRAGS + 1];
> >  };
> >  
> > -struct ipoib_cm_tx_buf {
> > -	struct sk_buff *skb;
> > -	u64		mapping;
> > -};
> > -
> >  struct ib_cm_id;
> >  
> >  struct ipoib_cm_data {
> > @@ -233,7 +228,7 @@ struct ipoib_cm_tx {
> >  	struct net_device   *dev;
> >  	struct ipoib_neigh  *neigh;
> >  	struct ipoib_path   *path;
> > -	struct ipoib_cm_tx_buf *tx_ring;
> > +	struct ipoib_tx_buf *tx_ring;
> >  	unsigned	     tx_head;
> >  	unsigned	     tx_tail;
> >  	unsigned long	     flags;
> > @@ -496,6 +491,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush);
> >  
> >  void ipoib_mcast_dev_down(struct net_device *dev);
> >  void ipoib_mcast_dev_flush(struct net_device *dev);
> > +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
> >  
> >  #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
> >  struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
> > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > index 933efce..056e4d2 100644
> > --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > @@ -88,6 +88,31 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
> >  		ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
> >  }
> >  
> > +static void ipoib_cm_dma_unmap_tx(struct ipoib_dev_priv *priv,
> > +				  struct ipoib_tx_buf *tx_req)
> > +{
> > +	struct sk_buff *skb;
> > +	int i, offs;
> > +
> > +	skb = tx_req->skb;
> > +	if (skb_shinfo(skb)->nr_frags) {
> > +		offs = 0;
> > +		if (skb_headlen(skb)) {
> > +			ib_dma_unmap_single(priv->ca, tx_req->mapping[0],
> > +					    skb_headlen(skb), DMA_TO_DEVICE);
> > +			offs = 1;
> > +		}
> > +		for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
> > +			const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
> > +
> > +			ib_dma_unmap_single(priv->ca, tx_req->mapping[i + offs],
> > +					    skb_frag_size(frag), DMA_TO_DEVICE);
> > +		}
> > +	} else
> > +		ib_dma_unmap_single(priv->ca, tx_req->mapping[0], skb->len,
> > +				    DMA_TO_DEVICE);
> > +}
> > +
> >  static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
> >  {
> >  	struct ipoib_dev_priv *priv = netdev_priv(dev);
> > @@ -707,11 +732,39 @@ static inline int post_send(struct ipoib_dev_priv *priv,
> >  	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
> >  }
> >  
> > +static inline int post_send_sg(struct ipoib_dev_priv *priv,
> > +			       struct ipoib_cm_tx *tx,
> > +			       unsigned int wr_id,
> > +			       struct sk_buff *skb,
> > +			       u64 mapping[MAX_SKB_FRAGS + 1])
> > +{
> > +	struct ib_send_wr *bad_wr;
> > +	int i, off;
> > +	skb_frag_t *frags = skb_shinfo(skb)->frags;
> > +	int nr_frags = skb_shinfo(skb)->nr_frags;
> > +
> > +	if (skb_headlen(skb)) {
> > +		priv->tx_sge[0].addr = mapping[0];
> > +		priv->tx_sge[0].length = skb_headlen(skb);
> > +		off = 1;
> > +	} else
> > +		off = 0;
> > +
> > +	for (i = 0; i < nr_frags; ++i) {
> > +		priv->tx_sge[i + off].addr = mapping[i + off];
> > +		priv->tx_sge[i + off].length = frags[i].size;
> > +	}
> > +	priv->tx_wr.num_sge = nr_frags + off;
> > +	priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
> > +
> > +	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
> > +}
> > +
> >  void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
> >  {
> >  	struct ipoib_dev_priv *priv = netdev_priv(dev);
> > -	struct ipoib_cm_tx_buf *tx_req;
> > -	u64 addr;
> > +	struct ipoib_tx_buf *tx_req;
> > +	u64 addr = 0;
> >  	int rc;
> >  
> >  	if (unlikely(skb->len > tx->mtu)) {
> > @@ -735,24 +788,37 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
> >  	 */
> >  	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
> >  	tx_req->skb = skb;
> > -	addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
> > -	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
> > -		++dev->stats.tx_errors;
> > -		dev_kfree_skb_any(skb);
> > -		return;
> > -	}
> >  
> > -	tx_req->mapping = addr;
> > +	if (skb_shinfo(skb)->nr_frags) {
> > +		if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
> > +			++dev->stats.tx_errors;
> > +			dev_kfree_skb_any(skb);
> > +			return;
> > +		}
> > +		rc = post_send_sg(priv, tx, tx->tx_head &
> > +				  (ipoib_sendq_size - 1),
> > +				  skb, tx_req->mapping);
> > +	} else {
> > +		addr = ib_dma_map_single(priv->ca, skb->data, skb->len,
> > +					 DMA_TO_DEVICE);
> > +		if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
> > +			++dev->stats.tx_errors;
> > +			dev_kfree_skb_any(skb);
> > +			return;
> > +		}
> > +
> > +		tx_req->mapping[0] = addr;
> >  
> > -	skb_orphan(skb);
> > -	skb_dst_drop(skb);
> > +		skb_orphan(skb);
> > +		skb_dst_drop(skb);
> >  
> > -	rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
> > -		       addr, skb->len);
> > +		rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
> > +			       addr, skb->len);
> > +	}
> >  	if (unlikely(rc)) {
> >  		ipoib_warn(priv, "post_send failed, error %d\n", rc);
> >  		++dev->stats.tx_errors;
> > -		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
> > +		ipoib_cm_dma_unmap_tx(priv, tx_req);
> >  		dev_kfree_skb_any(skb);
> >  	} else {
> >  		dev->trans_start = jiffies;
> > @@ -777,7 +843,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
> >  	struct ipoib_dev_priv *priv = netdev_priv(dev);
> >  	struct ipoib_cm_tx *tx = wc->qp->qp_context;
> >  	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
> > -	struct ipoib_cm_tx_buf *tx_req;
> > +	struct ipoib_tx_buf *tx_req;
> >  	unsigned long flags;
> >  
> >  	ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
> > @@ -791,7 +857,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
> >  
> >  	tx_req = &tx->tx_ring[wr_id];
> >  
> > -	ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
> > +	ipoib_cm_dma_unmap_tx(priv, tx_req);
> >  
> >  	/* FIXME: is this right? Shouldn't we only increment on success? */
> >  	++dev->stats.tx_packets;
> > @@ -1036,6 +1102,9 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_
> >  
> >  	struct ib_qp *tx_qp;
> >  
> > +	if (dev->features & NETIF_F_SG)
> > +		attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
> > +
> >  	tx_qp = ib_create_qp(priv->pd, &attr);
> >  	if (PTR_ERR(tx_qp) == -EINVAL) {
> >  		ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n",
> > @@ -1170,7 +1239,7 @@ err_tx:
> >  static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
> >  {
> >  	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
> > -	struct ipoib_cm_tx_buf *tx_req;
> > +	struct ipoib_tx_buf *tx_req;
> >  	unsigned long begin;
> >  
> >  	ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
> > @@ -1197,8 +1266,7 @@ timeout:
> >  
> >  	while ((int) p->tx_tail - (int) p->tx_head < 0) {
> >  		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
> > -		ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
> > -				    DMA_TO_DEVICE);
> > +		ipoib_cm_dma_unmap_tx(priv, tx_req);
> >  		dev_kfree_skb_any(tx_req->skb);
> >  		++p->tx_tail;
> >  		netif_tx_lock_bh(p->dev);
> > @@ -1455,7 +1523,6 @@ static void ipoib_cm_stale_task(struct work_struct *work)
> >  	spin_unlock_irq(&priv->lock);
> >  }
> >  
> > -
> 
> No need to remove this line.
> 
> >  static ssize_t show_mode(struct device *d, struct device_attribute *attr,
> >  			 char *buf)
> >  {
> > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> > index 72626c3..9e11447 100644
> > --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> > +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> > @@ -312,8 +312,7 @@ repost:
> >  			   "for buf %d\n", wr_id);
> >  }
> >  
> > -static int ipoib_dma_map_tx(struct ib_device *ca,
> > -			    struct ipoib_tx_buf *tx_req)
> > +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
> >  {
> >  	struct sk_buff *skb = tx_req->skb;
> >  	u64 *mapping = tx_req->mapping;
> > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > index 58b5aa3..f9314c6 100644
> > --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > @@ -190,7 +190,7 @@ static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_featu
> >  	struct ipoib_dev_priv *priv = netdev_priv(dev);
> >  
> >  	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
> > -		features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO);
> > +		features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
> >  
> >  	return features;
> >  }
> > @@ -233,7 +233,6 @@ int ipoib_set_mode(struct net_device *dev, const char *buf)
> >  			   "will cause multicast packet drops\n");
> >  		netdev_update_features(dev);
> >  		rtnl_unlock();
> > -		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
> 
> Sure about this removal ? I don't see it related to scatter-gather.
> 
> >  
> >  		ipoib_flush_paths(dev);
> >  		rtnl_lock();
> 
> Regards.
> 
> -- 
> Yann Droneaud
> OPTEYA
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yuval Shaia March 22, 2015, 9:21 a.m. UTC | #3
On Sun, Mar 15, 2015 at 05:16:16PM +0200, Yuval Shaia wrote:
> Hi,
> I didn't got any further comments on this one.
> Any idea why SG in CM is un-welcome?
By mistake I sent a private mail only.
Cc: Roland Dreier <roland@kernel.org>
Cc: Sean Hefty <sean.hefty@intel.com>
Cc: Hal Rosenstock <hal.rosenstock@gmail.com>

Your advice would be very appreciated.

Yuval
> 
> I have one small fix to this patch but it is 99% ready.
> 
> Yuval
> 
> On Sun, Feb 01, 2015 at 09:09:42AM +0200, Yuval Shaia wrote:
> > On Wed, Jan 28, 2015 at 12:36:27PM +0100, Yann Droneaud wrote:
> > > Hi,
> > > 
> > > Le mardi 27 janvier 2015 à 03:21 -0800, Yuval Shaia a écrit :
> > > > With this fix Scatter-Gather will be supported also in connected mode
> > > > 
> > > 
> > > Please explain the issue with current kernel and the advantage
> > > of the proposed fix in the commit message. Perhaps benchmarks
> > > against various HCAs would be welcome.
> > The benefits of using SG are well known.
> > I think the question should be the other way around - why not supporting SG in CM?
> > 
> > I will add the following text to commit message:
> > By default, IPoIB-CM driver uses 64k MTU. Larger MTU gives better performance.
> > This MTU plus overhead puts the memory allocation for IP based packets at 32 4k pages (order 5), which have to be contiguous.
> > When the system memory under pressure, it was observed that allocating 128k contiguous physical memory is difficult and causes serious errors (such as system becomes unusable).
> > This enhancement resolve the issue by removing the physically contiguous memory requirement using Scatter/Gather feature that exists in Linux stack.
> > > 
> > > > Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
> > > > ---
> > > >  drivers/infiniband/ulp/ipoib/ipoib.h      |    8 +--
> > > >  drivers/infiniband/ulp/ipoib/ipoib_cm.c   |  107 +++++++++++++++++++++++------
> > > >  drivers/infiniband/ulp/ipoib/ipoib_ib.c   |    3 +-
> > > >  drivers/infiniband/ulp/ipoib/ipoib_main.c |    3 +-
> > > >  4 files changed, 91 insertions(+), 30 deletions(-)
> > > > 
> > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
> > > > index d7562be..fb42834 100644
> > > > --- a/drivers/infiniband/ulp/ipoib/ipoib.h
> > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> > > > @@ -170,11 +170,6 @@ struct ipoib_tx_buf {
> > > >  	u64		mapping[MAX_SKB_FRAGS + 1];
> > > >  };
> > > >  
> > > > -struct ipoib_cm_tx_buf {
> > > > -	struct sk_buff *skb;
> > > > -	u64		mapping;
> > > > -};
> > > > -
> > > >  struct ib_cm_id;
> > > >  
> > > >  struct ipoib_cm_data {
> > > > @@ -233,7 +228,7 @@ struct ipoib_cm_tx {
> > > >  	struct net_device   *dev;
> > > >  	struct ipoib_neigh  *neigh;
> > > >  	struct ipoib_path   *path;
> > > > -	struct ipoib_cm_tx_buf *tx_ring;
> > > > +	struct ipoib_tx_buf *tx_ring;
> > > >  	unsigned	     tx_head;
> > > >  	unsigned	     tx_tail;
> > > >  	unsigned long	     flags;
> > > > @@ -496,6 +491,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev, int flush);
> > > >  
> > > >  void ipoib_mcast_dev_down(struct net_device *dev);
> > > >  void ipoib_mcast_dev_flush(struct net_device *dev);
> > > > +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
> > > >  
> > > >  #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
> > > >  struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
> > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > > > index 933efce..056e4d2 100644
> > > > --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
> > > > @@ -88,6 +88,31 @@ static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
> > > >  		ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
> > > >  }
> > > >  
> > > > +static void ipoib_cm_dma_unmap_tx(struct ipoib_dev_priv *priv,
> > > > +				  struct ipoib_tx_buf *tx_req)
> > > > +{
> > > > +	struct sk_buff *skb;
> > > > +	int i, offs;
> > > > +
> > > > +	skb = tx_req->skb;
> > > > +	if (skb_shinfo(skb)->nr_frags) {
> > > > +		offs = 0;
> > > > +		if (skb_headlen(skb)) {
> > > > +			ib_dma_unmap_single(priv->ca, tx_req->mapping[0],
> > > > +					    skb_headlen(skb), DMA_TO_DEVICE);
> > > > +			offs = 1;
> > > > +		}
> > > > +		for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
> > > > +			const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
> > > > +
> > > > +			ib_dma_unmap_single(priv->ca, tx_req->mapping[i + offs],
> > > > +					    skb_frag_size(frag), DMA_TO_DEVICE);
> > > > +		}
> > > > +	} else
> > > > +		ib_dma_unmap_single(priv->ca, tx_req->mapping[0], skb->len,
> > > > +				    DMA_TO_DEVICE);
> > > > +}
> > > > +
> > > >  static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
> > > >  {
> > > >  	struct ipoib_dev_priv *priv = netdev_priv(dev);
> > > > @@ -707,11 +732,39 @@ static inline int post_send(struct ipoib_dev_priv *priv,
> > > >  	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
> > > >  }
> > > >  
> > > > +static inline int post_send_sg(struct ipoib_dev_priv *priv,
> > > > +			       struct ipoib_cm_tx *tx,
> > > > +			       unsigned int wr_id,
> > > > +			       struct sk_buff *skb,
> > > > +			       u64 mapping[MAX_SKB_FRAGS + 1])
> > > > +{
> > > > +	struct ib_send_wr *bad_wr;
> > > > +	int i, off;
> > > > +	skb_frag_t *frags = skb_shinfo(skb)->frags;
> > > > +	int nr_frags = skb_shinfo(skb)->nr_frags;
> > > > +
> > > > +	if (skb_headlen(skb)) {
> > > > +		priv->tx_sge[0].addr = mapping[0];
> > > > +		priv->tx_sge[0].length = skb_headlen(skb);
> > > > +		off = 1;
> > > > +	} else
> > > > +		off = 0;
> > > > +
> > > > +	for (i = 0; i < nr_frags; ++i) {
> > > > +		priv->tx_sge[i + off].addr = mapping[i + off];
> > > > +		priv->tx_sge[i + off].length = frags[i].size;
> > > > +	}
> > > > +	priv->tx_wr.num_sge = nr_frags + off;
> > > > +	priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
> > > > +
> > > > +	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
> > > > +}
> > > > +
> > > >  void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
> > > >  {
> > > >  	struct ipoib_dev_priv *priv = netdev_priv(dev);
> > > > -	struct ipoib_cm_tx_buf *tx_req;
> > > > -	u64 addr;
> > > > +	struct ipoib_tx_buf *tx_req;
> > > > +	u64 addr = 0;
> > > >  	int rc;
> > > >  
> > > >  	if (unlikely(skb->len > tx->mtu)) {
> > > > @@ -735,24 +788,37 @@ void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
> > > >  	 */
> > > >  	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
> > > >  	tx_req->skb = skb;
> > > > -	addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
> > > > -	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
> > > > -		++dev->stats.tx_errors;
> > > > -		dev_kfree_skb_any(skb);
> > > > -		return;
> > > > -	}
> > > >  
> > > > -	tx_req->mapping = addr;
> > > > +	if (skb_shinfo(skb)->nr_frags) {
> > > > +		if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
> > > > +			++dev->stats.tx_errors;
> > > > +			dev_kfree_skb_any(skb);
> > > > +			return;
> > > > +		}
> > > > +		rc = post_send_sg(priv, tx, tx->tx_head &
> > > > +				  (ipoib_sendq_size - 1),
> > > > +				  skb, tx_req->mapping);
> > > > +	} else {
> > > > +		addr = ib_dma_map_single(priv->ca, skb->data, skb->len,
> > > > +					 DMA_TO_DEVICE);
> > > > +		if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
> > > > +			++dev->stats.tx_errors;
> > > > +			dev_kfree_skb_any(skb);
> > > > +			return;
> > > > +		}
> > > > +
> > > > +		tx_req->mapping[0] = addr;
> > > >  
> > > > -	skb_orphan(skb);
> > > > -	skb_dst_drop(skb);
> > > > +		skb_orphan(skb);
> > > > +		skb_dst_drop(skb);
> > > >  
> > > > -	rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
> > > > -		       addr, skb->len);
> > > > +		rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
> > > > +			       addr, skb->len);
> > > > +	}
> > > >  	if (unlikely(rc)) {
> > > >  		ipoib_warn(priv, "post_send failed, error %d\n", rc);
> > > >  		++dev->stats.tx_errors;
> > > > -		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
> > > > +		ipoib_cm_dma_unmap_tx(priv, tx_req);
> > > >  		dev_kfree_skb_any(skb);
> > > >  	} else {
> > > >  		dev->trans_start = jiffies;
> > > > @@ -777,7 +843,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
> > > >  	struct ipoib_dev_priv *priv = netdev_priv(dev);
> > > >  	struct ipoib_cm_tx *tx = wc->qp->qp_context;
> > > >  	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
> > > > -	struct ipoib_cm_tx_buf *tx_req;
> > > > +	struct ipoib_tx_buf *tx_req;
> > > >  	unsigned long flags;
> > > >  
> > > >  	ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
> > > > @@ -791,7 +857,7 @@ void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
> > > >  
> > > >  	tx_req = &tx->tx_ring[wr_id];
> > > >  
> > > > -	ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
> > > > +	ipoib_cm_dma_unmap_tx(priv, tx_req);
> > > >  
> > > >  	/* FIXME: is this right? Shouldn't we only increment on success? */
> > > >  	++dev->stats.tx_packets;
> > > > @@ -1036,6 +1102,9 @@ static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_
> > > >  
> > > >  	struct ib_qp *tx_qp;
> > > >  
> > > > +	if (dev->features & NETIF_F_SG)
> > > > +		attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
> > > > +
> > > >  	tx_qp = ib_create_qp(priv->pd, &attr);
> > > >  	if (PTR_ERR(tx_qp) == -EINVAL) {
> > > >  		ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n",
> > > > @@ -1170,7 +1239,7 @@ err_tx:
> > > >  static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
> > > >  {
> > > >  	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
> > > > -	struct ipoib_cm_tx_buf *tx_req;
> > > > +	struct ipoib_tx_buf *tx_req;
> > > >  	unsigned long begin;
> > > >  
> > > >  	ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
> > > > @@ -1197,8 +1266,7 @@ timeout:
> > > >  
> > > >  	while ((int) p->tx_tail - (int) p->tx_head < 0) {
> > > >  		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
> > > > -		ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
> > > > -				    DMA_TO_DEVICE);
> > > > +		ipoib_cm_dma_unmap_tx(priv, tx_req);
> > > >  		dev_kfree_skb_any(tx_req->skb);
> > > >  		++p->tx_tail;
> > > >  		netif_tx_lock_bh(p->dev);
> > > > @@ -1455,7 +1523,6 @@ static void ipoib_cm_stale_task(struct work_struct *work)
> > > >  	spin_unlock_irq(&priv->lock);
> > > >  }
> > > >  
> > > > -
> > > 
> > > No need to remove this line.
> > > 
> > > >  static ssize_t show_mode(struct device *d, struct device_attribute *attr,
> > > >  			 char *buf)
> > > >  {
> > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> > > > index 72626c3..9e11447 100644
> > > > --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> > > > @@ -312,8 +312,7 @@ repost:
> > > >  			   "for buf %d\n", wr_id);
> > > >  }
> > > >  
> > > > -static int ipoib_dma_map_tx(struct ib_device *ca,
> > > > -			    struct ipoib_tx_buf *tx_req)
> > > > +int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
> > > >  {
> > > >  	struct sk_buff *skb = tx_req->skb;
> > > >  	u64 *mapping = tx_req->mapping;
> > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > index 58b5aa3..f9314c6 100644
> > > > --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > @@ -190,7 +190,7 @@ static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_featu
> > > >  	struct ipoib_dev_priv *priv = netdev_priv(dev);
> > > >  
> > > >  	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
> > > > -		features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO);
> > > > +		features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
> > > >  
> > > >  	return features;
> > > >  }
> > > > @@ -233,7 +233,6 @@ int ipoib_set_mode(struct net_device *dev, const char *buf)
> > > >  			   "will cause multicast packet drops\n");
> > > >  		netdev_update_features(dev);
> > > >  		rtnl_unlock();
> > > > -		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
> > > 
> > > Sure about this removal ? I don't see it related to scatter-gather.
> > > 
> > > >  
> > > >  		ipoib_flush_paths(dev);
> > > >  		rtnl_lock();
> > > 
> > > Regards.
> > > 
> > > -- 
> > > Yann Droneaud
> > > OPTEYA
> > > 
> > > 
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jason Gunthorpe March 23, 2015, 5:17 p.m. UTC | #4
On Sun, Mar 22, 2015 at 11:21:50AM +0200, Yuval Shaia wrote:
> On Sun, Mar 15, 2015 at 05:16:16PM +0200, Yuval Shaia wrote:
> > Hi,
> > I didn't got any further comments on this one.
> > Any idea why SG in CM is un-welcome?
> By mistake I sent a private mail only.
> Cc: Roland Dreier <roland@kernel.org>
> Cc: Sean Hefty <sean.hefty@intel.com>
> Cc: Hal Rosenstock <hal.rosenstock@gmail.com>
> 
> Your advice would be very appreciated.

I haven't looked in detail at the patch, but in principle, using S/G
when ever possible should be the default, even if this creates a
performance regression.

It is well known that high order allocations are problematic in Linux
and should be avoided, and I also have seen systems blow up because of
high order IPoIB allocations.

That said, there may be cases where S/G is not possible, you should
try and get Mellanox to comment if all their offloads work on all
their cards when S/G is used. Work may be required to resolve any of
these constraints. I'd like to belive there is some reason why we've
been doing high order allocations for so many years.

FWIW, I would probably choose to default S/G over any other offload
acceleration.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ira Weiny April 1, 2015, 5:17 p.m. UTC | #5
On Mon, Mar 23, 2015 at 11:17:49AM -0600, Jason Gunthorpe wrote:
> On Sun, Mar 22, 2015 at 11:21:50AM +0200, Yuval Shaia wrote:
> > On Sun, Mar 15, 2015 at 05:16:16PM +0200, Yuval Shaia wrote:
> > > Hi,
> > > I didn't got any further comments on this one.
> > > Any idea why SG in CM is un-welcome?
> > By mistake I sent a private mail only.
> > Cc: Roland Dreier <roland@kernel.org>
> > Cc: Sean Hefty <sean.hefty@intel.com>
> > Cc: Hal Rosenstock <hal.rosenstock@gmail.com>
> > 
> > Your advice would be very appreciated.
> 
> I haven't looked in detail at the patch, but in principle, using S/G
> when ever possible should be the default, even if this creates a
> performance regression.
> 
> It is well known that high order allocations are problematic in Linux
> and should be avoided, and I also have seen systems blow up because of
> high order IPoIB allocations.
> 
> That said, there may be cases where S/G is not possible, you should
> try and get Mellanox to comment if all their offloads work on all
> their cards when S/G is used. Work may be required to resolve any of
> these constraints. I'd like to belive there is some reason why we've
> been doing high order allocations for so many years.
> 
> FWIW, I would probably choose to default S/G over any other offload
> acceleration.

I concur with Jason's assessment.

As Yann asked before:

What hardware have you tested this on?  Do you have any performance
measurements?  Or do you have a reproducer for some of the allocation issues
which have been seen?

I can't comment on how this may affect Mellanox Hardware but it seems like it
will work fine with Qib hardware.

Ira


> 
> Jason
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yuval Shaia April 1, 2015, 8:08 p.m. UTC | #6
On Wed, Apr 01, 2015 at 01:17:19PM -0400, ira.weiny wrote:
> On Mon, Mar 23, 2015 at 11:17:49AM -0600, Jason Gunthorpe wrote:
> > On Sun, Mar 22, 2015 at 11:21:50AM +0200, Yuval Shaia wrote:
> > > On Sun, Mar 15, 2015 at 05:16:16PM +0200, Yuval Shaia wrote:
> > > > Hi,
> > > > I didn't got any further comments on this one.
> > > > Any idea why SG in CM is un-welcome?
> > > By mistake I sent a private mail only.
> > > Cc: Roland Dreier <roland@kernel.org>
> > > Cc: Sean Hefty <sean.hefty@intel.com>
> > > Cc: Hal Rosenstock <hal.rosenstock@gmail.com>
> > > 
> > > Your advice would be very appreciated.
> > 
> > I haven't looked in detail at the patch, but in principle, using S/G
> > when ever possible should be the default, even if this creates a
> > performance regression.
> > 
> > It is well known that high order allocations are problematic in Linux
> > and should be avoided, and I also have seen systems blow up because of
> > high order IPoIB allocations.
> > 
> > That said, there may be cases where S/G is not possible, you should
> > try and get Mellanox to comment if all their offloads work on all
> > their cards when S/G is used. Work may be required to resolve any of
> > these constraints. I'd like to belive there is some reason why we've
> > been doing high order allocations for so many years.
> > 
> > FWIW, I would probably choose to default S/G over any other offload
> > acceleration.
> 
> I concur with Jason's assessment.
> 
> As Yann asked before:
> 
> What hardware have you tested this on?  Do you have any performance
> measurements?  Or do you have a reproducer for some of the allocation issues
> which have been seen?
Tested on Mellanox MT26428. Have here also CX3, will update if there will be an issue.
No impact on performances.
I did not try to reproduce the issue but people that do got this dump:
pr  7 09:33:30 dbnode kernel: Call Trace:
Apr  7 09:33:30 dbnode kernel:  [<ffffffff810ddf74>] __alloc_pages_nodemask+0x524/0x595
Apr  7 09:33:30 dbnode kernel:  [<ffffffff8110da3f>] kmem_getpages+0x4f/0xf4
Apr  7 09:33:30 dbnode kernel:  [<ffffffff8110dc12>] fallback_alloc+0x12e/0x1ce
Apr  7 09:33:30 dbnode kernel:  [<ffffffff8110ddd3>] ____cache_alloc_node+0x121/0x134
Apr  7 09:33:30 dbnode kernel:  [<ffffffff8110e3f3>] kmem_cache_alloc_node_notrace+0x84/0xb9
Apr  7 09:33:30 dbnode kernel:  [<ffffffff8110e46e>] __kmalloc_node+0x46/0x73
Apr  7 09:33:30 dbnode kernel:  [<ffffffff813b9aa8>] ? __alloc_skb+0x72/0x13d
Apr  7 09:33:30 dbnode kernel:  [<ffffffff813b9aa8>] __alloc_skb+0x72/0x13d
Apr  7 09:33:30 dbnode kernel:  [<ffffffff813f2364>] sk_stream_alloc_skb+0x3d/0xaf
Apr  7 09:33:30 dbnode kernel:  [<ffffffff813f35b5>] tcp_sendmsg+0x176/0x6cf
Apr  7 09:33:30 dbnode kernel:  [<ffffffff813b0d5f>] __sock_sendmsg+0x5e/0x67
Apr  7 09:33:30 dbnode kernel:  [<ffffffff813b1644>] sock_sendmsg+0xcc/0xe5
Apr  7 09:33:30 dbnode kernel:  [<ffffffff810b4d09>] ? delayacct_end+0x7d/0x88
Apr  7 09:33:30 dbnode kernel:  [<ffffffff8104a3b0>] ? delayacct_blkio_end+0x26/0x40
Apr  7 09:33:30 dbnode kernel:  [<ffffffff81077030>] ? autoremove_wake_function+0x0/0x3d
Apr  7 09:33:30 dbnode kernel:  [<ffffffff81456f1d>] ? __wait_on_bit+0x6c/0x7c
Apr  7 09:33:30 dbnode kernel:  [<ffffffff810d7b70>] ? sync_page+0x0/0x4d
Apr  7 09:33:30 dbnode kernel:  [<ffffffff8111656e>] ? __pfn_to_section+0x12/0x14
Apr  7 09:33:30 dbnode kernel:  [<ffffffff811165a2>] ? lookup_page_cgroup+0x32/0x48
Apr  7 09:33:30 dbnode kernel:  [<ffffffff81100a61>] ? swap_entry_free+0x7a/0xf3
Apr  7 09:33:30 dbnode kernel:  [<ffffffff8111c239>] ? fget_light+0x34/0x73
Apr  7 09:33:30 dbnode kernel:  [<ffffffff813b0fcb>] ? sockfd_lookup_light+0x20/0x58
Apr  7 09:33:30 dbnode kernel:  [<ffffffff813b22cf>] sys_sendto+0x12f/0x171
Apr  7 09:33:30 dbnode kernel:  [<ffffffff810a9d23>] ? audit_syscall_entry+0x103/0x12f
Apr  7 09:33:30 dbnode kernel:  [<ffffffff81011db2>] system_call_fastpath+0x16/0x1b
> 
> I can't comment on how this may affect Mellanox Hardware but it seems like it
> will work fine with Qib hardware.
> 
> Ira
> 
> 
> > 
> > Jason
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yuval Shaia April 1, 2015, 8:33 p.m. UTC | #7
On Mon, Mar 23, 2015 at 11:17:49AM -0600, Jason Gunthorpe wrote:
> On Sun, Mar 22, 2015 at 11:21:50AM +0200, Yuval Shaia wrote:
> > On Sun, Mar 15, 2015 at 05:16:16PM +0200, Yuval Shaia wrote:
> > > Hi,
> > > I didn't got any further comments on this one.
> > > Any idea why SG in CM is un-welcome?
> > By mistake I sent a private mail only.
> > Cc: Roland Dreier <roland@kernel.org>
> > Cc: Sean Hefty <sean.hefty@intel.com>
> > Cc: Hal Rosenstock <hal.rosenstock@gmail.com>
> > 
> > Your advice would be very appreciated.
> 
> I haven't looked in detail at the patch, but in principle, using S/G
> when ever possible should be the default, even if this creates a
> performance regression.
Per my testings - no impact on performance.
> 
> It is well known that high order allocations are problematic in Linux
> and should be avoided, and I also have seen systems blow up because of
> high order IPoIB allocations.
Yes, snapshot from oops:
Call Trace:
 [<ffffffff810ddf74>] __alloc_pages_nodemask+0x524/0x595
 [<ffffffff8110da3f>] kmem_getpages+0x4f/0xf4
 [<ffffffff8110dc12>] fallback_alloc+0x12e/0x1ce
 [<ffffffff8110ddd3>] ____cache_alloc_node+0x121/0x134
 [<ffffffff8110e3f3>] kmem_cache_alloc_node_notrace+0x84/0xb9
 [<ffffffff8110e46e>] __kmalloc_node+0x46/0x73
 [<ffffffff813b9aa8>] ? __alloc_skb+0x72/0x13d
> 
> That said, there may be cases where S/G is not possible, you should
> try and get Mellanox to comment if all their offloads work on all
> their cards when S/G is used. Work may be required to resolve any of
> these constraints. I'd like to belive there is some reason why we've
> been doing high order allocations for so many years.
Old kernels (2.6.39) restrict the usage of S/G only to H/W which supports CSUM offload.
Mellanox HCA supports CSUM offload only for UD and this is why we can find S/G support for UD but not for CM.
With the introduction of commit ec5f061 this limitation is no longer there so no reason not to add this support.
> 
> FWIW, I would probably choose to default S/G over any other offload
> acceleration.
> 
> Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Yuval Shaia May 5, 2015, 12:23 p.m. UTC | #8
Hi All,
If there are no more comments and no rejections can we take this patch to next stage and merge it?
Who maintain this part?

Yuval

On Wed, Apr 01, 2015 at 11:08:45PM +0300, Yuval Shaia wrote:
> On Wed, Apr 01, 2015 at 01:17:19PM -0400, ira.weiny wrote:
> > On Mon, Mar 23, 2015 at 11:17:49AM -0600, Jason Gunthorpe wrote:
> > > On Sun, Mar 22, 2015 at 11:21:50AM +0200, Yuval Shaia wrote:
> > > > On Sun, Mar 15, 2015 at 05:16:16PM +0200, Yuval Shaia wrote:
> > > > > Hi,
> > > > > I didn't got any further comments on this one.
> > > > > Any idea why SG in CM is un-welcome?
> > > > By mistake I sent a private mail only.
> > > > Cc: Roland Dreier <roland@kernel.org>
> > > > Cc: Sean Hefty <sean.hefty@intel.com>
> > > > Cc: Hal Rosenstock <hal.rosenstock@gmail.com>
> > > > 
> > > > Your advice would be very appreciated.
> > > 
> > > I haven't looked in detail at the patch, but in principle, using S/G
> > > when ever possible should be the default, even if this creates a
> > > performance regression.
> > > 
> > > It is well known that high order allocations are problematic in Linux
> > > and should be avoided, and I also have seen systems blow up because of
> > > high order IPoIB allocations.
> > > 
> > > That said, there may be cases where S/G is not possible, you should
> > > try and get Mellanox to comment if all their offloads work on all
> > > their cards when S/G is used. Work may be required to resolve any of
> > > these constraints. I'd like to belive there is some reason why we've
> > > been doing high order allocations for so many years.
> > > 
> > > FWIW, I would probably choose to default S/G over any other offload
> > > acceleration.
> > 
> > I concur with Jason's assessment.
> > 
> > As Yann asked before:
> > 
> > What hardware have you tested this on?  Do you have any performance
> > measurements?  Or do you have a reproducer for some of the allocation issues
> > which have been seen?
> Tested on Mellanox MT26428. Have here also CX3, will update if there will be an issue.
> No impact on performances.
> I did not try to reproduce the issue but people that do got this dump:
> pr  7 09:33:30 dbnode kernel: Call Trace:
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff810ddf74>] __alloc_pages_nodemask+0x524/0x595
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff8110da3f>] kmem_getpages+0x4f/0xf4
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff8110dc12>] fallback_alloc+0x12e/0x1ce
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff8110ddd3>] ____cache_alloc_node+0x121/0x134
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff8110e3f3>] kmem_cache_alloc_node_notrace+0x84/0xb9
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff8110e46e>] __kmalloc_node+0x46/0x73
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff813b9aa8>] ? __alloc_skb+0x72/0x13d
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff813b9aa8>] __alloc_skb+0x72/0x13d
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff813f2364>] sk_stream_alloc_skb+0x3d/0xaf
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff813f35b5>] tcp_sendmsg+0x176/0x6cf
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff813b0d5f>] __sock_sendmsg+0x5e/0x67
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff813b1644>] sock_sendmsg+0xcc/0xe5
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff810b4d09>] ? delayacct_end+0x7d/0x88
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff8104a3b0>] ? delayacct_blkio_end+0x26/0x40
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff81077030>] ? autoremove_wake_function+0x0/0x3d
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff81456f1d>] ? __wait_on_bit+0x6c/0x7c
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff810d7b70>] ? sync_page+0x0/0x4d
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff8111656e>] ? __pfn_to_section+0x12/0x14
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff811165a2>] ? lookup_page_cgroup+0x32/0x48
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff81100a61>] ? swap_entry_free+0x7a/0xf3
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff8111c239>] ? fget_light+0x34/0x73
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff813b0fcb>] ? sockfd_lookup_light+0x20/0x58
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff813b22cf>] sys_sendto+0x12f/0x171
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff810a9d23>] ? audit_syscall_entry+0x103/0x12f
> Apr  7 09:33:30 dbnode kernel:  [<ffffffff81011db2>] system_call_fastpath+0x16/0x1b
> > 
> > I can't comment on how this may affect Mellanox Hardware but it seems like it
> > will work fine with Qib hardware.
> > 
> > Ira
> > 
> > 
> > > 
> > > Jason
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Doug Ledford May 7, 2015, 2:45 p.m. UTC | #9
On Wed, 2015-04-01 at 23:33 +0300, Yuval Shaia wrote:
> On Mon, Mar 23, 2015 at 11:17:49AM -0600, Jason Gunthorpe wrote:
> > On Sun, Mar 22, 2015 at 11:21:50AM +0200, Yuval Shaia wrote:
> > > On Sun, Mar 15, 2015 at 05:16:16PM +0200, Yuval Shaia wrote:
> > > > Hi,
> > > > I didn't got any further comments on this one.
> > > > Any idea why SG in CM is un-welcome?
> > > By mistake I sent a private mail only.
> > > Cc: Roland Dreier <roland@kernel.org>
> > > Cc: Sean Hefty <sean.hefty@intel.com>
> > > Cc: Hal Rosenstock <hal.rosenstock@gmail.com>
> > > 
> > > Your advice would be very appreciated.
> > 
> > I haven't looked in detail at the patch, but in principle, using S/G
> > when ever possible should be the default, even if this creates a
> > performance regression.
> Per my testings - no impact on performance.
> > 
> > It is well known that high order allocations are problematic in Linux
> > and should be avoided, and I also have seen systems blow up because of
> > high order IPoIB allocations.
> Yes, snapshot from oops:
> Call Trace:
>  [<ffffffff810ddf74>] __alloc_pages_nodemask+0x524/0x595
>  [<ffffffff8110da3f>] kmem_getpages+0x4f/0xf4
>  [<ffffffff8110dc12>] fallback_alloc+0x12e/0x1ce
>  [<ffffffff8110ddd3>] ____cache_alloc_node+0x121/0x134
>  [<ffffffff8110e3f3>] kmem_cache_alloc_node_notrace+0x84/0xb9
>  [<ffffffff8110e46e>] __kmalloc_node+0x46/0x73
>  [<ffffffff813b9aa8>] ? __alloc_skb+0x72/0x13d
> > 
> > That said, there may be cases where S/G is not possible, you should
> > try and get Mellanox to comment if all their offloads work on all
> > their cards when S/G is used. Work may be required to resolve any of
> > these constraints. I'd like to belive there is some reason why we've
> > been doing high order allocations for so many years.
> Old kernels (2.6.39) restrict the usage of S/G only to H/W which supports CSUM offload.
> Mellanox HCA supports CSUM offload only for UD and this is why we can find S/G support for UD but not for CM.
> With the introduction of commit ec5f061 this limitation is no longer there so no reason not to add this support.

Can you repost the patch with a cleaned up commit message that includes
a reference to this prior restriction and the fact that it no longer
exists please?
diff mbox

Patch

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index d7562be..fb42834 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -170,11 +170,6 @@  struct ipoib_tx_buf {
 	u64		mapping[MAX_SKB_FRAGS + 1];
 };
 
-struct ipoib_cm_tx_buf {
-	struct sk_buff *skb;
-	u64		mapping;
-};
-
 struct ib_cm_id;
 
 struct ipoib_cm_data {
@@ -233,7 +228,7 @@  struct ipoib_cm_tx {
 	struct net_device   *dev;
 	struct ipoib_neigh  *neigh;
 	struct ipoib_path   *path;
-	struct ipoib_cm_tx_buf *tx_ring;
+	struct ipoib_tx_buf *tx_ring;
 	unsigned	     tx_head;
 	unsigned	     tx_tail;
 	unsigned long	     flags;
@@ -496,6 +491,7 @@  int ipoib_mcast_stop_thread(struct net_device *dev, int flush);
 
 void ipoib_mcast_dev_down(struct net_device *dev);
 void ipoib_mcast_dev_flush(struct net_device *dev);
+int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index 933efce..056e4d2 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -88,6 +88,31 @@  static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
 		ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
 }
 
+static void ipoib_cm_dma_unmap_tx(struct ipoib_dev_priv *priv,
+				  struct ipoib_tx_buf *tx_req)
+{
+	struct sk_buff *skb;
+	int i, offs;
+
+	skb = tx_req->skb;
+	if (skb_shinfo(skb)->nr_frags) {
+		offs = 0;
+		if (skb_headlen(skb)) {
+			ib_dma_unmap_single(priv->ca, tx_req->mapping[0],
+					    skb_headlen(skb), DMA_TO_DEVICE);
+			offs = 1;
+		}
+		for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+			const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+			ib_dma_unmap_single(priv->ca, tx_req->mapping[i + offs],
+					    skb_frag_size(frag), DMA_TO_DEVICE);
+		}
+	} else
+		ib_dma_unmap_single(priv->ca, tx_req->mapping[0], skb->len,
+				    DMA_TO_DEVICE);
+}
+
 static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -707,11 +732,39 @@  static inline int post_send(struct ipoib_dev_priv *priv,
 	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
 }
 
+static inline int post_send_sg(struct ipoib_dev_priv *priv,
+			       struct ipoib_cm_tx *tx,
+			       unsigned int wr_id,
+			       struct sk_buff *skb,
+			       u64 mapping[MAX_SKB_FRAGS + 1])
+{
+	struct ib_send_wr *bad_wr;
+	int i, off;
+	skb_frag_t *frags = skb_shinfo(skb)->frags;
+	int nr_frags = skb_shinfo(skb)->nr_frags;
+
+	if (skb_headlen(skb)) {
+		priv->tx_sge[0].addr = mapping[0];
+		priv->tx_sge[0].length = skb_headlen(skb);
+		off = 1;
+	} else
+		off = 0;
+
+	for (i = 0; i < nr_frags; ++i) {
+		priv->tx_sge[i + off].addr = mapping[i + off];
+		priv->tx_sge[i + off].length = frags[i].size;
+	}
+	priv->tx_wr.num_sge = nr_frags + off;
+	priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
+
+	return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
+}
+
 void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ipoib_cm_tx_buf *tx_req;
-	u64 addr;
+	struct ipoib_tx_buf *tx_req;
+	u64 addr = 0;
 	int rc;
 
 	if (unlikely(skb->len > tx->mtu)) {
@@ -735,24 +788,37 @@  void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
 	 */
 	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
 	tx_req->skb = skb;
-	addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
-	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
-		++dev->stats.tx_errors;
-		dev_kfree_skb_any(skb);
-		return;
-	}
 
-	tx_req->mapping = addr;
+	if (skb_shinfo(skb)->nr_frags) {
+		if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
+			++dev->stats.tx_errors;
+			dev_kfree_skb_any(skb);
+			return;
+		}
+		rc = post_send_sg(priv, tx, tx->tx_head &
+				  (ipoib_sendq_size - 1),
+				  skb, tx_req->mapping);
+	} else {
+		addr = ib_dma_map_single(priv->ca, skb->data, skb->len,
+					 DMA_TO_DEVICE);
+		if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
+			++dev->stats.tx_errors;
+			dev_kfree_skb_any(skb);
+			return;
+		}
+
+		tx_req->mapping[0] = addr;
 
-	skb_orphan(skb);
-	skb_dst_drop(skb);
+		skb_orphan(skb);
+		skb_dst_drop(skb);
 
-	rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
-		       addr, skb->len);
+		rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
+			       addr, skb->len);
+	}
 	if (unlikely(rc)) {
 		ipoib_warn(priv, "post_send failed, error %d\n", rc);
 		++dev->stats.tx_errors;
-		ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
+		ipoib_cm_dma_unmap_tx(priv, tx_req);
 		dev_kfree_skb_any(skb);
 	} else {
 		dev->trans_start = jiffies;
@@ -777,7 +843,7 @@  void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_cm_tx *tx = wc->qp->qp_context;
 	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
-	struct ipoib_cm_tx_buf *tx_req;
+	struct ipoib_tx_buf *tx_req;
 	unsigned long flags;
 
 	ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
@@ -791,7 +857,7 @@  void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 
 	tx_req = &tx->tx_ring[wr_id];
 
-	ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
+	ipoib_cm_dma_unmap_tx(priv, tx_req);
 
 	/* FIXME: is this right? Shouldn't we only increment on success? */
 	++dev->stats.tx_packets;
@@ -1036,6 +1102,9 @@  static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_
 
 	struct ib_qp *tx_qp;
 
+	if (dev->features & NETIF_F_SG)
+		attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
+
 	tx_qp = ib_create_qp(priv->pd, &attr);
 	if (PTR_ERR(tx_qp) == -EINVAL) {
 		ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n",
@@ -1170,7 +1239,7 @@  err_tx:
 static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(p->dev);
-	struct ipoib_cm_tx_buf *tx_req;
+	struct ipoib_tx_buf *tx_req;
 	unsigned long begin;
 
 	ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
@@ -1197,8 +1266,7 @@  timeout:
 
 	while ((int) p->tx_tail - (int) p->tx_head < 0) {
 		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
-		ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
-				    DMA_TO_DEVICE);
+		ipoib_cm_dma_unmap_tx(priv, tx_req);
 		dev_kfree_skb_any(tx_req->skb);
 		++p->tx_tail;
 		netif_tx_lock_bh(p->dev);
@@ -1455,7 +1523,6 @@  static void ipoib_cm_stale_task(struct work_struct *work)
 	spin_unlock_irq(&priv->lock);
 }
 
-
 static ssize_t show_mode(struct device *d, struct device_attribute *attr,
 			 char *buf)
 {
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 72626c3..9e11447 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -312,8 +312,7 @@  repost:
 			   "for buf %d\n", wr_id);
 }
 
-static int ipoib_dma_map_tx(struct ib_device *ca,
-			    struct ipoib_tx_buf *tx_req)
+int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
 {
 	struct sk_buff *skb = tx_req->skb;
 	u64 *mapping = tx_req->mapping;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 58b5aa3..f9314c6 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -190,7 +190,7 @@  static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_featu
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
 	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
-		features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO);
+		features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
 
 	return features;
 }
@@ -233,7 +233,6 @@  int ipoib_set_mode(struct net_device *dev, const char *buf)
 			   "will cause multicast packet drops\n");
 		netdev_update_features(dev);
 		rtnl_unlock();
-		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
 
 		ipoib_flush_paths(dev);
 		rtnl_lock();