ib_ipoib: CSUM support in connected mode

Message ID 1412423147-8402-1-git-send-email-yuval.shaia@oracle.com (mailing list archive)
State Rejected

Commit Message

Yuval Shaia Oct. 4, 2014, 11:45 a.m. UTC
This enhancement suggests using the IB CRC instead of the IP checksum (CSUM)
in IPoIB CM.
The IPoIB Connected Mode driver uses RC (Reliable Connection), which
guarantees corruption-free delivery of packets.
InfiniBand uses a 32-bit CRC, which provides stronger data integrity
protection than the 16-bit IP checksum.
So there is no added value that the IP checksum provides in the IB world.
The proposal is to tell the network stack that IPoIB-CM supports IP checksum
offload.
This saves Linux the time spent calculating checksums for IPoIB CM packets.
The network stack sends the IP packet without adding the IP checksum to the
header.
On the receive side, the IPoIB driver again tells the network stack that the
IP checksum is good for incoming packets, and the stack skips the IP checksum
calculations.
During connection establishment the driver determines whether the other end
supports IB CRC as checksum.
This is done so the driver can calculate the checksum itself before
transmitting the packet in case the other end does not support this feature.
Support for fragmented skbs is added to the transmit path.

Signed-off-by: Yuval Shaia <yuval.shaia@oracle.com>
---
 drivers/infiniband/ulp/ipoib/ipoib.h      |   25 ++++++
 drivers/infiniband/ulp/ipoib/ipoib_cm.c   |  117 ++++++++++++++++++++++++----
 drivers/infiniband/ulp/ipoib/ipoib_ib.c   |    3 +-
 drivers/infiniband/ulp/ipoib/ipoib_main.c |   19 ++++-
 4 files changed, 143 insertions(+), 21 deletions(-)
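
For readers skimming the thread, here is a condensed sketch of what the patch
does (the struct and the transmit-side fallback are taken from the diff below;
this is a summary, not a drop-in):

	/* Both the CM REQ and REP carry the extended private data: */
	struct ipoib_cm_data {
		__be32 qpn; /* High byte MUST be ignored on receive */
		__be32 mtu;
		__be16 sig; /* must be IPOIB_CM_PROTO_SIG */
		__be16 caps; /* 4 bits proto ver and 12 bits capabilities */
	};

	/* On transmit, fall back to a software checksum when the peer did
	 * not advertise IBCRC-as-CSUM support:
	 */
	if ((skb->ip_summed != CHECKSUM_NONE) && cm_ibcrc_as_csum &&
	    !(tx->caps & IPOIB_CM_CAPS_IBCRC_AS_CSUM))
		skb_checksum_help(skb);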

Comments

Yuval Shaia Oct. 5, 2014, 2:09 a.m. UTC | #1
Hi all,
Sorry, but this patch will not apply cleanly to the latest kernel.
I'm currently adjusting it to fit v3.17-rc4 and will send a new one soon.

Yuval

Jason Gunthorpe Oct. 5, 2014, 10:35 p.m. UTC | #2
On Sat, Oct 04, 2014 at 04:45:47AM -0700, Yuval Shaia wrote:

> +	if (cm_ibcrc_as_csum)
> +		skb->ip_summed = CHECKSUM_UNNECESSARY;
> +

As I said before, I think this scheme should be using CHECKSUM_PARTIAL
so packets can be forwarded correctly.

 *      PARTIAL: identical to the case for output below.  This may occur
 *          on a packet received directly from another Linux OS, e.g.,
 *          a virtualised Linux kernel on the same host.  The packet can
 *          be treated in the same way as UNNECESSARY except that on
 *          output (i.e., forwarding) the checksum must be filled in
 *          by the OS or the hardware.

Which is exactly what we want.
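
As a rough sketch, on the receive side that would look something like the
following (illustrative only -- the offsets assume an already-parsed IPv4
TCP/UDP packet):

	skb->ip_summed = CHECKSUM_PARTIAL;
	skb->csum_start = skb_transport_header(skb) - skb->head;
	skb->csum_offset = ip_hdr(skb)->protocol == IPPROTO_TCP ?
			   offsetof(struct tcphdr, check) :
			   offsetof(struct udphdr, check);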

Jason
Yuval Shaia Oct. 8, 2014, 10:40 a.m. UTC | #3
On Sun, Oct 05, 2014 at 04:35:00PM -0600, Jason Gunthorpe wrote:
> On Sat, Oct 04, 2014 at 04:45:47AM -0700, Yuval Shaia wrote:
> 
> > +	if (cm_ibcrc_as_csum)
> > +		skb->ip_summed = CHECKSUM_UNNECESSARY;
> > +
> 
> As I said before, I think this scheme should be using CHECKSUM_PARTIAL
> so packets can be forwarded correctly.
> 
>  *      PARTIAL: identical to the case for output below.  This may occur
>  *          on a packet received directly from another Linux OS, e.g.,
>  *          a virtualised Linux kernel on the same host.  The packet can
>  *          be treated in the same way as UNNECESSARY except that on
>  *          output (i.e., forwarding) the checksum must be filled in
>  *          by the OS or the hardware.
> 
> Which is exactly what we want.
Will do, thanks.
This is for the case where this code runs on a bridge, right?
Jason Gunthorpe Oct. 8, 2014, 5:14 p.m. UTC | #4
On Wed, Oct 08, 2014 at 01:40:29PM +0300, Yuval Shaia wrote:
> > Which is exactly what we want.

> Will do, thanks.
> This is in case this code runs on bridge, right?

It covers many cases, including the Linux bridging and routing functions,
as well as packet dumping.

I don't know if any special actions are needed to support
CHECKSUM_PARTIAL; you should probably check the other drivers that use
it.
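
For example, virtio_net handles this on receive with the
skb_partial_csum_set() helper, which validates the offsets and marks the skb
CHECKSUM_PARTIAL in one step -- roughly (start/off would have to be derived
by parsing the packet, since IPoIB gets no peer-supplied metadata):

	if (!skb_partial_csum_set(skb, start, off))
		goto drop;	/* offsets out of range */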

Jason
Chuck Lever Oct. 8, 2014, 6:34 p.m. UTC | #5
Hi Yuval-

On Oct 4, 2014, at 7:45 AM, Yuval Shaia <yuval.shaia@oracle.com> wrote:

> This enhancement suggests using the IB CRC instead of the IP checksum
> (CSUM) in IPoIB CM.
> The IPoIB Connected Mode driver uses RC (Reliable Connection), which
> guarantees corruption-free delivery of packets.

Some naïve questions:

Is this interoperable with other IPoIB implementations, such as Solaris?

Are there implications for IPv6, where checksumming is not optional?
Maybe I misunderstood what is being replaced.

What happens if IPoIB uses UD instead of RC?

When IPoIB traffic is routed off the IB fabric, isn’t end-to-end checksumming
lost because the router node has to fabricate an IP checksum that wasn’t
provided by the sender?

--
Chuck Lever
chuck[dot]lever[at]oracle[dot]com
Yuval Shaia Oct. 9, 2014, 8:13 p.m. UTC | #6
On Wed, Oct 08, 2014 at 02:34:07PM -0400, Chuck Lever wrote:
> Hi Yuval-
> 
> On Oct 4, 2014, at 7:45 AM, Yuval Shaia <yuval.shaia@oracle.com> wrote:
> 
> > This enhancement suggests using the IB CRC instead of the IP checksum
> > (CSUM) in IPoIB CM.
> > The IPoIB Connected Mode driver uses RC (Reliable Connection), which
> > guarantees corruption-free delivery of packets.
> 
> Some naïve questions:
> 
> Is this interoperable with other IPoIB implementations, such as Solaris?
Yes. Part of the implementation deals with how the driver interacts with a legacy peer, and this covers Solaris as well as legacy Linux.
> 
> Are there implications for IPv6, where checksumming is not optional?
> Maybe I misunderstood what is being replaced.
This is not a removal of checksumming, just its replacement with another (and better) mechanism.
The good part is that no other module is affected, since the driver fakes the checksum result toward the upper networking layers.
> 
> What happens if IPoIB uses UD instead of RC?
You mean CM-UD? 
> 
> When IPoIB traffic is routed off the IB fabric, isn’t end-to-end checksumming
> lost because the router node has to fabricate an IP checksum that wasn’t
> provided by the sender?
Yes, it is lost.
This feature should be disabled in such setups; it can be used only within
the local fabric (LAN).
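For reference, the cm_ibcrc_as_csum module parameter added by this patch
defaults to 0, so such setups are unaffected unless the feature is explicitly
enabled on the peers.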
Patch

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 0ed9aed..6fedf83 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -100,6 +100,7 @@  enum {
 	IPOIB_FLAG_AUTO_MODER     = 13, /*indicates moderation is running*/
 	/*indicates if event handler was registered*/
 	IPOIB_FLAG_EVENTS_REGISTERED    = 14,
+	IPOIB_FLAG_CSUM		  = 15,
 
 	IPOIB_MAX_BACKOFF_SECONDS = 16,
 
@@ -192,9 +193,28 @@  struct ipoib_pmtu_update {
 
 struct ib_cm_id;
 
+#define IPOIB_CM_PROTO_SIG		0x2211
+#define IPOIB_CM_PROTO_VER		(1UL << 12)
+
+static inline int ipoib_cm_check_proto_sig(u16 proto_sig)
+{
+	return proto_sig & IPOIB_CM_PROTO_SIG;
+};
+
+static inline int ipoib_cm_check_proto_ver(u16 caps)
+{
+	return caps & IPOIB_CM_PROTO_VER;
+};
+
+enum ipoib_cm_data_caps {
+	IPOIB_CM_CAPS_IBCRC_AS_CSUM	= 1UL << 0,
+};
+
 struct ipoib_cm_data {
 	__be32 qpn; /* High byte MUST be ignored on receive */
 	__be32 mtu;
+	__be16 sig; /* must be IPOIB_CM_PROTO_SIG */
+	__be16 caps; /* 4 bits proto ver and 12 bits capabilities */
 };
 
 /*
@@ -241,6 +261,7 @@  struct ipoib_cm_rx {
 	int			recv_count;
 	u32			qpn;
 	int index; /* For ring counters */
+	u16			caps;
 };
 
 struct ipoib_cm_tx {
@@ -255,6 +276,7 @@  struct ipoib_cm_tx {
 	unsigned	     tx_tail;
 	unsigned long	     flags;
 	u32		     mtu;
+	u16		     caps;
 };
 
 struct ipoib_cm_rx_buf {
@@ -558,6 +580,8 @@  void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid);
 extern struct workqueue_struct *ipoib_workqueue;
 extern struct workqueue_struct *ipoib_auto_moder_workqueue;
 
+extern int cm_ibcrc_as_csum;
+
 /* functions */
 
 int ipoib_poll(struct napi_struct *napi, int budget);
@@ -613,6 +637,7 @@  int ipoib_mcast_stop_thread(struct net_device *dev, int flush);
 
 void ipoib_mcast_dev_down(struct net_device *dev);
 void ipoib_mcast_dev_flush(struct net_device *dev);
+int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req);
 
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
index db0303e..553569a 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -436,9 +436,16 @@  static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_cm_data data = {};
 	struct ib_cm_rep_param rep = {};
+	u16 caps = 0;
+
+	caps |= IPOIB_CM_PROTO_VER;
+	if (cm_ibcrc_as_csum)
+		caps |= IPOIB_CM_CAPS_IBCRC_AS_CSUM;
 
 	data.qpn = cpu_to_be32(priv->qp->qp_num);
 	data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
+	data.sig = cpu_to_be16(IPOIB_CM_PROTO_SIG);
+	data.caps = cpu_to_be16(caps);
 
 	rep.private_data = &data;
 	rep.private_data_len = sizeof data;
@@ -458,6 +465,7 @@  static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
 	struct ipoib_cm_data *data = event->private_data;
 	unsigned psn;
 	int ret;
+	struct ipoib_cm_data *cm_data;
 
 	ipoib_dbg(priv, "REQ arrived\n");
 	p = kzalloc(sizeof *p, GFP_KERNEL);
@@ -480,6 +488,13 @@  static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
 		goto err_qp;
 	}
 
+	cm_data = (struct ipoib_cm_data *)event->private_data;
+	ipoib_dbg(priv, "Otherend sig=0x%x\n", be16_to_cpu(cm_data->sig));
+	if (ipoib_cm_check_proto_sig(be16_to_cpu(cm_data->sig)) &&
+	    ipoib_cm_check_proto_ver(be16_to_cpu(cm_data->caps)))
+		p->caps = be16_to_cpu(cm_data->caps);
+	ipoib_dbg(priv, "Otherend caps=0x%x\n", p->caps);
+
 	psn = random32() & 0xffffff;
 	ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
 	if (ret)
@@ -696,6 +711,9 @@  copied:
         if (skb->dev->priv_flags & IFF_EIPOIB_VIF)
 		set_skb_oob_cb_data(skb, wc, NULL);
 
+	if (cm_ibcrc_as_csum)
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
 	netif_receive_skb(skb);
 
 repost:
@@ -732,14 +750,43 @@  static inline int post_send(struct ipoib_dev_priv *priv,
 	return ib_post_send(tx->qp, &send_ring->tx_wr, &bad_wr);
 }
 
+static inline int post_send_sg(struct ipoib_dev_priv *priv,
+			       struct ipoib_cm_tx *tx,
+			       unsigned int wr_id,
+			       struct sk_buff *skb,
+			       u64 mapping[MAX_SKB_FRAGS + 1],
+			       struct ipoib_send_ring *send_ring)
+{
+	struct ib_send_wr *bad_wr;
+	int i, off;
+	skb_frag_t *frags = skb_shinfo(skb)->frags;
+	int nr_frags = skb_shinfo(skb)->nr_frags;
+	if (skb_headlen(skb)) {
+		send_ring->tx_sge[0].addr = mapping[0];
+		send_ring->tx_sge[0].length = skb_headlen(skb);
+		off = 1;
+	} else
+		off = 0;
+
+	for (i = 0; i < nr_frags; ++i) {
+		send_ring->tx_sge[i + off].addr = mapping[i + off];
+		send_ring->tx_sge[i + off].length = frags[i].size;
+	}
+	send_ring->tx_wr.num_sge = nr_frags + off;
+	send_ring->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
+
+	return ib_post_send(tx->qp, &send_ring->tx_wr, &bad_wr);
+}
+
 void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_cm_tx_buf *tx_req;
-	u64 addr;
+	u64 addr = 0;
 	int rc;
 	struct ipoib_send_ring *send_ring;
 	u16 queue_index;
+	struct ipoib_tx_buf sg_tx_req;
 
 	queue_index = skb_get_queue_mapping(skb);
 	send_ring = priv->send_ring + queue_index;
@@ -768,26 +815,45 @@  void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_
 	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
 	tx_req->skb = skb;
 
-	if (skb->len < ipoib_inline_thold && !skb_shinfo(skb)->nr_frags) {
-		addr = (u64) skb->data;
-		send_ring->tx_wr.send_flags |= IB_SEND_INLINE;
-		tx_req->is_inline = 1;
-	} else {
-		addr = ib_dma_map_single(priv->ca, skb->data,
-			skb->len, DMA_TO_DEVICE);
-		if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
-			++send_ring->stats.tx_errors;
+	/* Calculate checksum if we support ibcrc_as_csum but peer does not */
+	if ((skb->ip_summed != CHECKSUM_NONE) && cm_ibcrc_as_csum &&
+	    !(tx->caps & IPOIB_CM_CAPS_IBCRC_AS_CSUM))
+		skb_checksum_help(skb);
+
+	if (skb_shinfo(skb)->nr_frags) {
+		sg_tx_req.skb = skb;
+		if (unlikely(ipoib_dma_map_tx(priv->ca, &sg_tx_req))) {
+			++dev->stats.tx_errors;
 			dev_kfree_skb_any(skb);
 			return;
 		}
-
-		tx_req->mapping = addr;
 		tx_req->is_inline = 0;
-		send_ring->tx_wr.send_flags &= ~IB_SEND_INLINE;
-	}
+		rc = post_send_sg(priv, tx, tx->tx_head &
+				  (ipoib_sendq_size - 1),
+				  skb, sg_tx_req.mapping, send_ring);
+	} else {
+		if (skb->len < ipoib_inline_thold &&
+		    !skb_shinfo(skb)->nr_frags) {
+			addr = (u64) skb->data;
+			send_ring->tx_wr.send_flags |= IB_SEND_INLINE;
+			tx_req->is_inline = 1;
+		} else {
+			addr = ib_dma_map_single(priv->ca, skb->data,
+				skb->len, DMA_TO_DEVICE);
+			if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
+				++send_ring->stats.tx_errors;
+				dev_kfree_skb_any(skb);
+				return;
+			}
 
-	rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
-		       addr, skb->len, send_ring);
+			tx_req->mapping = addr;
+			tx_req->is_inline = 0;
+			send_ring->tx_wr.send_flags &= ~IB_SEND_INLINE;
+		}
+
+		rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
+			       addr, skb->len, send_ring);
+	}
 	if (unlikely(rc)) {
 		ipoib_warn(priv, "%s: post_send failed, error %d queue_index: %d skb->len: %d\n",
 			   __func__, rc, queue_index, skb->len);
@@ -1012,6 +1078,7 @@  static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
 	struct ib_qp_attr qp_attr;
 	int qp_attr_mask, ret;
 	struct sk_buff *skb;
+	struct ipoib_cm_data *cm_data;
 
 	p->mtu = be32_to_cpu(data->mtu);
 
@@ -1021,6 +1088,13 @@  static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *even
 		return -EINVAL;
 	}
 
+	cm_data = (struct ipoib_cm_data *)event->private_data;
+	ipoib_dbg(priv, "Otherend sig=0x%x\n", be16_to_cpu(cm_data->sig));
+	if (ipoib_cm_check_proto_sig(be16_to_cpu(cm_data->sig)) &&
+	    ipoib_cm_check_proto_ver(be16_to_cpu(cm_data->caps)))
+		p->caps = be16_to_cpu(cm_data->caps);
+	ipoib_dbg(priv, "Otherend caps=0x%x\n", p->caps);
+
 	qp_attr.qp_state = IB_QPS_RTR;
 	ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
 	if (ret) {
@@ -1105,6 +1179,9 @@  static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_
 
 	spin_unlock_irqrestore(&priv->lock, flags);
 
+	if (dev->features & NETIF_F_SG)
+		attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
+
 	return ib_create_qp(priv->pd, &attr);
 }
 
@@ -1116,9 +1193,16 @@  static int ipoib_cm_send_req(struct net_device *dev,
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct ipoib_cm_data data = {};
 	struct ib_cm_req_param req = {};
+	u16 caps = 0;
+
+	caps |= IPOIB_CM_PROTO_VER;
+	if (cm_ibcrc_as_csum)
+		caps |= IPOIB_CM_CAPS_IBCRC_AS_CSUM;
 
 	data.qpn = cpu_to_be32(priv->qp->qp_num);
 	data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
+	data.sig = cpu_to_be16(IPOIB_CM_PROTO_SIG);
+	data.caps = cpu_to_be16(caps);
 
 	req.primary_path		= pathrec;
 	req.alternate_path		= NULL;
@@ -1594,7 +1678,6 @@  static void ipoib_cm_stale_task(struct work_struct *work)
 	spin_unlock_irq(&priv->lock);
 }
 
-
 static ssize_t show_mode(struct device *d, struct device_attribute *attr,
 			 char *buf)
 {
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 9e7d824..28f36dd 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -349,8 +349,7 @@  repost:
 			   "for buf %d\n", wr_id);
 }
 
-static int ipoib_dma_map_tx(struct ib_device *ca,
-			    struct ipoib_tx_buf *tx_req)
+int ipoib_dma_map_tx(struct ib_device *ca, struct ipoib_tx_buf *tx_req)
 {
 	struct sk_buff *skb = tx_req->skb;
 	u64 *mapping = tx_req->mapping;
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index dcf18df..b17c7ab 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -69,6 +69,12 @@  module_param_named(debug_level, ipoib_debug_level, int, 0644);
 MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default: 0) (0-1)");
 #endif
 
+int cm_ibcrc_as_csum;
+
+module_param_named(cm_ibcrc_as_csum, cm_ibcrc_as_csum, int, 0444);
+MODULE_PARM_DESC(cm_ibcrc_as_csum, "Indicates whether to utilize IB-CRC as "
+		 "CSUM in connected mode,(default: 0)");
+
 struct ipoib_path_iter {
 	struct net_device *dev;
 	struct ipoib_path  path;
@@ -195,7 +201,12 @@  static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_featu
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 
 	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
-		features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO);
+		if (cm_ibcrc_as_csum && (test_bit(IPOIB_FLAG_CSUM,
+						  &priv->flags)))
+			dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG;
+		else
+			features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM |
+				      NETIF_F_TSO);
 
 	return features;
 }
@@ -249,7 +260,11 @@  int ipoib_set_mode(struct net_device *dev, const char *buf)
 
 		send_ring = priv->send_ring;
 		for (i = 0; i < priv->num_tx_queues; i++) {
-			send_ring->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
+			if (cm_ibcrc_as_csum && (test_bit(IPOIB_FLAG_CSUM,
+							  &priv->flags)))
+				send_ring->tx_wr.send_flags |= IB_SEND_IP_CSUM;
+			else
+				send_ring->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
 			send_ring++;
 		}