Message ID | 20180114223603.19961-4-bmt@zurich.ibm.com (mailing list archive) |
---|---|
State | RFC |
Headers | show |
> Subject: [PATCH v3 03/13] Attach/detach SoftiWarp to/from network and RDMA > subsystem > > Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com> > --- > drivers/infiniband/sw/siw/siw_main.c | 816 > +++++++++++++++++++++++++++++++++++ > 1 file changed, 816 insertions(+) > create mode 100644 drivers/infiniband/sw/siw/siw_main.c > > diff --git a/drivers/infiniband/sw/siw/siw_main.c > b/drivers/infiniband/sw/siw/siw_main.c > new file mode 100644 > index 000000000000..1b7fc58d4eb9 > --- /dev/null > +++ b/drivers/infiniband/sw/siw/siw_main.c > @@ -0,0 +1,816 @@ > +/* > + * Software iWARP device driver > + * > + * Authors: Bernard Metzler <bmt@zurich.ibm.com> > + * > + * Copyright (c) 2008-2017, IBM Corporation > + * > + * This software is available to you under a choice of one of two > + * licenses. You may choose to be licensed under the terms of the GNU > + * General Public License (GPL) Version 2, available from the file > + * COPYING in the main directory of this source tree, or the > + * BSD license below: > + * > + * Redistribution and use in source and binary forms, with or > + * without modification, are permitted provided that the following > + * conditions are met: > + * > + * - Redistributions of source code must retain the above copyright notice, > + * this list of conditions and the following disclaimer. > + * > + * - Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in the > + * documentation and/or other materials provided with the distribution. > + * > + * - Neither the name of IBM nor the names of its contributors may be > + * used to endorse or promote products derived from this software without > + * specific prior written permission. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, > + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES > OF > + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND > + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT > HOLDERS > + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN > + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR > IN > + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN > THE > + * SOFTWARE. > + */ > + > +#include <linux/init.h> > +#include <linux/errno.h> > +#include <linux/netdevice.h> > +#include <linux/inetdevice.h> > +#include <net/net_namespace.h> > +#include <linux/rtnetlink.h> > +#include <linux/if_arp.h> > +#include <linux/list.h> > +#include <linux/kernel.h> > +#include <linux/dma-mapping.h> > + > +#include <rdma/ib_verbs.h> > +#include <rdma/ib_smi.h> > +#include <rdma/ib_user_verbs.h> > + > +#include "siw.h" > +#include "siw_obj.h" > +#include "siw_cm.h" > +#include "siw_verbs.h" > +#include <linux/kthread.h> > + > +MODULE_AUTHOR("Bernard Metzler"); > +MODULE_DESCRIPTION("Software iWARP Driver"); > +MODULE_LICENSE("Dual BSD/GPL"); > +MODULE_VERSION("0.2"); > + > +/* transmit from user buffer, if possible */ > +const bool zcopy_tx; > + > +/* Restrict usage of GSO, if hardware peer iwarp is unable to process > + * large packets. gso_seg_limit = 1 lets siw send only packets up to > + * one real MTU in size, but severly limits maximum bandwidth. > + * gso_seg_limit = 0 makes use of GSO (and more than doubles throughput > + * for large transfers). > + */ > +const int gso_seg_limit; > + The GSO configuration needs to default to enable interoperation with all vendors (and comply with the RFCs). So make it 1 please. Jason, would configfs be a reasonable way to allow tweaking these globals? > +/* Attach siw also with loopback devices */ > +const bool loopback_enabled = true; > + I think I asked this before. Why have a knob to enable/disable loopback? 
> +/* We try to negotiate CRC on, if true */ > +const bool mpa_crc_required; > + > +/* MPA CRC on/off enforced */ > +const bool mpa_crc_strict; > + > +/* Set TCP_NODELAY, and push messages asap */ > +const bool siw_lowdelay = true; > +/* Set TCP_QUICKACK */ > +const bool tcp_quickack; > + > +/* Select MPA version to be used during connection setup */ > +u_char mpa_version = MPA_REVISION_2; > + > +/* Selects MPA P2P mode (additional handshake during connection > + * setup, if true > + */ > +const bool peer_to_peer; > + > +static LIST_HEAD(siw_devlist); > + > +struct task_struct *siw_tx_thread[NR_CPUS]; > +struct crypto_shash *siw_crypto_shash; > + > +static ssize_t show_sw_version(struct device *dev, > + struct device_attribute *attr, char *buf) > +{ > + struct siw_device *sdev = container_of(dev, struct siw_device, > + base_dev.dev); > + > + return sprintf(buf, "%x\n", sdev->attrs.version); > +} > + > +static DEVICE_ATTR(sw_version, 0444, show_sw_version, NULL); > + > +static struct device_attribute *siw_dev_attributes[] = { > + &dev_attr_sw_version > +}; > + > +static int siw_modify_port(struct ib_device *base_dev, u8 port, int mask, > + struct ib_port_modify *props) > +{ > + return -EOPNOTSUPP; > +} > + > +static int siw_device_register(struct siw_device *sdev) > +{ > + struct ib_device *base_dev = &sdev->base_dev; > + int rv, i; > + static int dev_id = 1; > + > + rv = ib_register_device(base_dev, NULL); > + if (rv) { > + pr_warn("siw: %s: registration error %d\n", > + base_dev->name, rv); > + return rv; > + } > + > + for (i = 0; i < ARRAY_SIZE(siw_dev_attributes); ++i) { > + rv = device_create_file(&base_dev->dev, siw_dev_attributes[i]); > + if (rv) { > + pr_warn("siw: %s: create file error: rv=%d\n", > + base_dev->name, rv); > + ib_unregister_device(base_dev); > + return rv; > + } > + } > + siw_debugfs_add_device(sdev); > + > + sdev->attrs.vendor_part_id = dev_id++; > + > + siw_dbg(sdev, "HWaddr=%02x.%02x.%02x.%02x.%02x.%02x\n", > + *(u8 
*)sdev->netdev->dev_addr, > + *((u8 *)sdev->netdev->dev_addr + 1), > + *((u8 *)sdev->netdev->dev_addr + 2), > + *((u8 *)sdev->netdev->dev_addr + 3), > + *((u8 *)sdev->netdev->dev_addr + 4), > + *((u8 *)sdev->netdev->dev_addr + 5)); > + > + sdev->is_registered = 1; > + > + return 0; > +} > + > +static void siw_device_deregister(struct siw_device *sdev) > +{ > + int i; > + > + siw_debugfs_del_device(sdev); > + > + if (sdev->is_registered) { > + > + siw_dbg(sdev, "deregister\n"); > + > + for (i = 0; i < ARRAY_SIZE(siw_dev_attributes); ++i) > + device_remove_file(&sdev->base_dev.dev, > + siw_dev_attributes[i]); > + > + ib_unregister_device(&sdev->base_dev); > + } > + if (atomic_read(&sdev->num_ctx) || atomic_read(&sdev->num_srq) || > + atomic_read(&sdev->num_mr) || atomic_read(&sdev->num_cep) || > + atomic_read(&sdev->num_qp) || atomic_read(&sdev->num_cq) || > + atomic_read(&sdev->num_pd)) { > + pr_warn("siw at %s: orphaned resources!\n", > + sdev->netdev->name); > + pr_warn(" CTX %d, SRQ %d, QP %d, CQ %d, MEM %d, CEP > %d, PD %d\n", > + atomic_read(&sdev->num_ctx), > + atomic_read(&sdev->num_srq), > + atomic_read(&sdev->num_qp), > + atomic_read(&sdev->num_cq), > + atomic_read(&sdev->num_mr), > + atomic_read(&sdev->num_cep), > + atomic_read(&sdev->num_pd)); > + } > + > + while (!list_empty(&sdev->cep_list)) { > + struct siw_cep *cep = list_entry(sdev->cep_list.next, > + struct siw_cep, devq); > + list_del(&cep->devq); > + pr_warn("siw: at %s: free orphaned CEP 0x%p, state %d\n", > + sdev->base_dev.name, cep, cep->state); > + kfree(cep); > + } > + sdev->is_registered = 0; > +} > + > +static void siw_device_destroy(struct siw_device *sdev) > +{ > + siw_dbg(sdev, "destroy device\n"); > + siw_idr_release(sdev); > + > + kfree(sdev->base_dev.iwcm); > + dev_put(sdev->netdev); > + > + ib_dealloc_device(&sdev->base_dev); > +} > + > +static struct siw_device *siw_dev_from_netdev(struct net_device *dev) > +{ > + if (!list_empty(&siw_devlist)) { > + struct list_head *pos; > + > + 
list_for_each(pos, &siw_devlist) { > + struct siw_device *sdev = > + list_entry(pos, struct siw_device, list); > + if (sdev->netdev == dev) > + return sdev; > + } > + } > + return NULL; > +} > + > +static int siw_create_tx_threads(void) > +{ > + int cpu, rv, assigned = 0; > + > + for_each_online_cpu(cpu) { > + /* Skip HT cores */ > + if (cpu % cpumask_weight(topology_sibling_cpumask(cpu))) { > + siw_tx_thread[cpu] = NULL; > + continue; > + } > + siw_tx_thread[cpu] = kthread_create(siw_run_sq, > + (unsigned long *)(long)cpu, > + "siw_tx/%d", cpu); > + if (IS_ERR(siw_tx_thread[cpu])) { > + rv = PTR_ERR(siw_tx_thread[cpu]); > + siw_tx_thread[cpu] = NULL; > + pr_info("Creating TX thread for CPU %d failed", cpu); > + continue; > + } > + kthread_bind(siw_tx_thread[cpu], cpu); > + > + wake_up_process(siw_tx_thread[cpu]); > + assigned++; > + } > + return assigned; > +} > + I know in v2 review, you discussed the TX threads. And you mentioned you had tried workq threads [1], but the introduced lots of delay. Have you re-looked at the workq implementation? If your analysis is several years old, workq threads might provide what you need nowadays... [1] https://www.spinics.net/lists/linux-rdma/msg55646.html Steve. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 23, 2018 at 10:33:48AM -0600, Steve Wise wrote: > > +/* Restrict usage of GSO, if hardware peer iwarp is unable to process > > + * large packets. gso_seg_limit = 1 lets siw send only packets up to > > + * one real MTU in size, but severly limits maximum bandwidth. > > + * gso_seg_limit = 0 makes use of GSO (and more than doubles throughput > > + * for large transfers). > > + */ > > +const int gso_seg_limit; > > + > > The GSO configuration needs to default to enable interoperation with all > vendors (and comply with the RFCs). So make it 1 please. The thing we call GSO in the netstack should be totally transparent on the wire, so at the very least this needs some additional elaboration. > Jason, would configfs be a reasonable way to allow tweaking these globals? Why would we ever even bother to support a mode that is non-conformant on the wire? Just remove it.. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> > On Tue, Jan 23, 2018 at 10:33:48AM -0600, Steve Wise wrote: > > > > +/* Restrict usage of GSO, if hardware peer iwarp is unable to process > > > + * large packets. gso_seg_limit = 1 lets siw send only packets up to > > > + * one real MTU in size, but severly limits maximum bandwidth. > > > + * gso_seg_limit = 0 makes use of GSO (and more than doubles throughput > > > + * for large transfers). > > > + */ > > > +const int gso_seg_limit; > > > + > > > > The GSO configuration needs to default to enable interoperation with all > > vendors (and comply with the RFCs). So make it 1 please. > > The thing we call GSO in the netstack should be totally transparent on > the wire, so at the very least this needs some additional elaboration. > From: https://tools.ietf.org/html/rfc5041#section-5.2 "At the Data Source, the DDP layer MUST segment the data contained in a ULP message into a series of DDP Segments, where each DDP Segment contains a DDP Header and ULP Payload, and MUST be no larger than the MULPDU value Advertised by the LLP." Where MULDPDU is the maximum ULP PDU that will fit in the TCP MSS... > > Jason, would configfs be a reasonable way to allow tweaking these globals? > > Why would we ever even bother to support a mode that is non-conformant > on the wire? Just remove it.. For soft iwarp, the throughput is greatly increased with allowing these iWARP protocol PDUs to span many TCP segments. So there could be an argument for leaving it as a knob for soft iwarp <-> soft iwarp configurations. But IMO it needs to default to RFC compliance and interoperability with hw implementations. Steve. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 23, 2018 at 10:58:01AM -0600, Steve Wise wrote: > From: https://tools.ietf.org/html/rfc5041#section-5.2 > > "At the Data Source, the DDP layer MUST segment the data contained in > a ULP message into a series of DDP Segments, where each DDP Segment > contains a DDP Header and ULP Payload, and MUST be no larger than the > MULPDU value Advertised by the LLP." > > Where MULDPDU is the maximum ULP PDU that will fit in the TCP MSS... But exceeding the MULPDU has nothing to do with the netstack GSO function.. right? GSO is entirely a local node optimization that should not be detectable on the wire. > > > Jason, would configfs be a reasonable way to allow tweaking these > globals? > > > > Why would we ever even bother to support a mode that is non-conformant > > on the wire? Just remove it.. > > For soft iwarp, the throughput is greatly increased with allowing these > iWARP protocol PDUs to span many TCP segments. So there could be an > argument for leaving it as a knob for soft iwarp <-> soft iwarp > configurations. But IMO it needs to default to RFC compliance and > interoperability with hw implementations. IMHO the purpose of things like rxe and siw is not maximum raw performance (certainly not during the initial kernel accept phase), so it should not include any non-conformant cruft.. Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
-----Jason Gunthorpe <jgg@ziepe.ca> wrote: ----- >To: Steve Wise <swise@opengridcomputing.com> >From: Jason Gunthorpe <jgg@ziepe.ca> >Date: 01/23/2018 05:43PM >Cc: "'Bernard Metzler'" <bmt@zurich.ibm.com>, >linux-rdma@vger.kernel.org >Subject: Re: [PATCH v3 03/13] Attach/detach SoftiWarp to/from network >and RDMA subsystem > >On Tue, Jan 23, 2018 at 10:33:48AM -0600, Steve Wise wrote: > >> > +/* Restrict usage of GSO, if hardware peer iwarp is unable to >process >> > + * large packets. gso_seg_limit = 1 lets siw send only packets >up to >> > + * one real MTU in size, but severly limits maximum bandwidth. >> > + * gso_seg_limit = 0 makes use of GSO (and more than doubles >throughput >> > + * for large transfers). >> > + */ >> > +const int gso_seg_limit; >> > + >> >> The GSO configuration needs to default to enable interoperation >with all >> vendors (and comply with the RFCs). So make it 1 please. > >The thing we call GSO in the netstack should be totally transparent >on >the wire, so at the very least this needs some additional >elaboration. > >> Jason, would configfs be a reasonable way to allow tweaking these >globals? > >Why would we ever even bother to support a mode that is >non-conformant >on the wire? Just remove it.. > siw as a software RDMA provider benefits from GSO alot. Building 8 frames with 8 headers and trailers brings max throughput down to some 3.5GB/s. Using GSO, and shipping 64K we see 7.8GB/s or so. That's what TCP on the socket API has to offer. Strictly speaking, siw reads the current segment size as it gets it from TCP. Older Chelsio adapters were able to process large packets, the latest HW is not it seems. I was thinking of leaving frame size as one MTU, but check if the peer (siw?) can handle more. There are sufficient spare bits in the MPA req/rep header to negotiate that, which shall be ignored by hardware ;) siw can never fully comply with what a hardware iwarp wants to get - it sits on top of a TCP stream. 
Under load, we see lots of frame fragmentation somewhere in the middle - sometimes just the header makes it glued after the trailer of the previous frame in one TCP segment, sometimes the wire-frame breaks in the middle of the data part, sometimes all data of a DDP frame are there, but the trailer checksum is still missing...the TCP send window is something siw cannot influence. Thanks, Bernard. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
--- Bernard Metzler, PhD Tech. Leader High Performance I/O, Principal Research Staff IBM Zurich Research Laboratory Saeumerstrasse 4 CH-8803 Rueschlikon, Switzerland +41 44 724 8605 -----"Steve Wise" <swise@opengridcomputing.com> wrote: ----- >To: "'Jason Gunthorpe'" <jgg@ziepe.ca> >From: "Steve Wise" <swise@opengridcomputing.com> >Date: 01/23/2018 05:58PM >Cc: "'Bernard Metzler'" <bmt@zurich.ibm.com>, ><linux-rdma@vger.kernel.org> >Subject: RE: [PATCH v3 03/13] Attach/detach SoftiWarp to/from network >and RDMA subsystem > >> >> On Tue, Jan 23, 2018 at 10:33:48AM -0600, Steve Wise wrote: >> >> > > +/* Restrict usage of GSO, if hardware peer iwarp is unable to >process >> > > + * large packets. gso_seg_limit = 1 lets siw send only packets >up to >> > > + * one real MTU in size, but severly limits maximum bandwidth. >> > > + * gso_seg_limit = 0 makes use of GSO (and more than doubles >throughput >> > > + * for large transfers). >> > > + */ >> > > +const int gso_seg_limit; >> > > + >> > >> > The GSO configuration needs to default to enable interoperation >with all >> > vendors (and comply with the RFCs). So make it 1 please. >> >> The thing we call GSO in the netstack should be totally transparent >on >> the wire, so at the very least this needs some additional >elaboration. >> > >From: >https://urldefense.proofpoint.com/v2/url?u=https-3A__tools.ietf.org_h >tml_rfc5041-23section-2D5.2&d=DwICAg&c=jf_iaSHvJObTbx-siA1ZOg&r=2TaYX >Q0T-r8ZO1PP1alNwU_QJcRRLfmYTAgd3QCvqSc&m=maNYrMpuQjDoXG0NADT5M-LhIwmL >KrmFzWtWh4J3sAc&s=ZDYAQqhyaTuRtFqQpA95k0oT6ildygWkkmJ3fqIWNzU&e= > >"At the Data Source, the DDP layer MUST segment the data contained in > a ULP message into a series of DDP Segments, where each DDP >Segment > contains a DDP Header and ULP Payload, and MUST be no larger than >the > MULPDU value Advertised by the LLP." 
TCP advertises 64k on the kernel socket ;) -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> > On Tue, Jan 23, 2018 at 10:58:01AM -0600, Steve Wise wrote: > > > From: https://tools.ietf.org/html/rfc5041#section-5.2 > > > > "At the Data Source, the DDP layer MUST segment the data contained in > > a ULP message into a series of DDP Segments, where each DDP Segment > > contains a DDP Header and ULP Payload, and MUST be no larger than the > > MULPDU value Advertised by the LLP." > > > > Where MULDPDU is the maximum ULP PDU that will fit in the TCP MSS... > > But exceeding the MULPDU has nothing to do with the netstack GSO > function.. right? GSO is entirely a local node optimization that > should not be detectable on the wire. It is not detectable by TCP on the wire; however, the iWARP protocols that impose message boundaries, among other things, require that the iWARP PDU fits in a single TCP segment. Since softiwarp is building the iwarp PDU, if it builds one based on a 64K GSO advertised MSS, then the resulting wire packets will have many TCP segments all containing parts of a single iWARP PDU, which violates the spec I quoted. > > > > > Jason, would configfs be a reasonable way to allow tweaking these > > globals? > > > > > > Why would we ever even bother to support a mode that is non-conformant > > > on the wire? Just remove it.. > > > > For soft iwarp, the throughput is greatly increased with allowing these > > iWARP protocol PDUs to span many TCP segments. So there could be an > > argument for leaving it as a knob for soft iwarp <-> soft iwarp > > configurations. But IMO it needs to default to RFC compliance and > > interoperability with hw implementations. > > IMHO the purpose of things like rxe and siw is not maximum raw > performance (certainly not during the initial kernel accept phase), I agree. > so > it should not include any non-conformant cruft.. > IMO leaving it as a non-default-enabled knob is ok in this case.
Steve -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 23, 2018 at 11:24:24AM -0600, Steve Wise wrote: > > > > On Tue, Jan 23, 2018 at 10:58:01AM -0600, Steve Wise wrote: > > > > > From: https://tools.ietf.org/html/rfc5041#section-5.2 > > > > > > "At the Data Source, the DDP layer MUST segment the data contained in > > > a ULP message into a series of DDP Segments, where each DDP Segment > > > contains a DDP Header and ULP Payload, and MUST be no larger than the > > > MULPDU value Advertised by the LLP." > > > > > > Where MULDPDU is the maximum ULP PDU that will fit in the TCP MSS... > > > > But exceeding the MULPDU has nothing to do with the netstack GSO > > function.. right? GSO is entirely a local node optimization that > > should not be detectable on the wire. > > It is not detectable by TCP on the wire, however the iWARP protocols that > impose message boundaries, among other things, require that the iWARP PDU > fits in a single TCP segment. Since softiwarp is building the iwarp PDU, if > it builds one based on a 64K GSO advertised MSS, then the resulting wire > packets will have man TCP segments all containing parts of a single iWARP > PDU, which violates the spec I quoted. But that still has nothing to do with GSO, can't you GSS up to MULPDU? Isn't the issue here more that, as Bernard says, siw is totally broken since it can't control the TCP layer segmentation boundaries? :( Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
-----Jason Gunthorpe <jgg@ziepe.ca> wrote: ----- >To: Steve Wise <swise@opengridcomputing.com> >From: Jason Gunthorpe <jgg@ziepe.ca> >Date: 01/23/2018 06:28PM >Cc: "'Bernard Metzler'" <bmt@zurich.ibm.com>, >linux-rdma@vger.kernel.org >Subject: Re: [PATCH v3 03/13] Attach/detach SoftiWarp to/from network >and RDMA subsystem > >On Tue, Jan 23, 2018 at 11:24:24AM -0600, Steve Wise wrote: >> > >> > On Tue, Jan 23, 2018 at 10:58:01AM -0600, Steve Wise wrote: >> > >> > > From: >https://urldefense.proofpoint.com/v2/url?u=https-3A__tools.ietf.org_h >tml_rfc5041-23section-2D5.2&d=DwIBAg&c=jf_iaSHvJObTbx-siA1ZOg&r=2TaYX >Q0T-r8ZO1PP1alNwU_QJcRRLfmYTAgd3QCvqSc&m=bZXyEf2ir9_OhkryeDGP_iwh7sDL >DExD5C6xOEAgGAE&s=N-AUE48HDPRCcFKzdGiM59z8OSmoZKD9AkS1WBpnfLY&e= >> > > >> > > "At the Data Source, the DDP layer MUST segment the data >contained in >> > > a ULP message into a series of DDP Segments, where each DDP >Segment >> > > contains a DDP Header and ULP Payload, and MUST be no larger >than the >> > > MULPDU value Advertised by the LLP." >> > > >> > > Where MULDPDU is the maximum ULP PDU that will fit in the TCP >MSS... >> > >> > But exceeding the MULPDU has nothing to do with the netstack GSO >> > function.. right? GSO is entirely a local node optimization that >> > should not be detectable on the wire. >> >> It is not detectable by TCP on the wire, however the iWARP >protocols that >> impose message boundaries, among other things, require that the >iWARP PDU >> fits in a single TCP segment. Since softiwarp is building the >iwarp PDU, if >> it builds one based on a 64K GSO advertised MSS, then the resulting >wire >> packets will have man TCP segments all containing parts of a single >iWARP >> PDU, which violates the spec I quoted. > >But that still has nothing to do with GSO, can't you GSS up to >MULPDU? > >Isn't the issue here more that, as Bernard says, siw is totally >broken >since it can't control the TCP layer segmentation boundaries? :( > totally broken. hmm. 
Not integrating siw with the TCP kernel code makes it impossible to avoid these things. Can kernel iSCSI avoid fragmentation and mis-alignment? No. I wanted to avoid tweaking kernel TCP code. Tweaking it would make siw unacceptable. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> > On Tue, Jan 23, 2018 at 11:24:24AM -0600, Steve Wise wrote: > > > > > > On Tue, Jan 23, 2018 at 10:58:01AM -0600, Steve Wise wrote: > > > > > > > From: https://tools.ietf.org/html/rfc5041#section-5.2 > > > > > > > > "At the Data Source, the DDP layer MUST segment the data contained in > > > > a ULP message into a series of DDP Segments, where each DDP Segment > > > > contains a DDP Header and ULP Payload, and MUST be no larger than the > > > > MULPDU value Advertised by the LLP." > > > > > > > > Where MULDPDU is the maximum ULP PDU that will fit in the TCP MSS... > > > > > > But exceeding the MULPDU has nothing to do with the netstack GSO > > > function.. right? GSO is entirely a local node optimization that > > > should not be detectable on the wire. > > > > It is not detectable by TCP on the wire, however the iWARP protocols that > > impose message boundaries, among other things, require that the iWARP PDU > > fits in a single TCP segment. Since softiwarp is building the iwarp PDU, if > > it builds one based on a 64K GSO advertised MSS, then the resulting wire > > packets will have man TCP segments all containing parts of a single iWARP > > PDU, which violates the spec I quoted. > > But that still has nothing to do with GSO, can't you GSS up to MULPDU? > I don't understand your question. > Isn't the issue here more that, as Bernard says, siw is totally broken > since it can't control the TCP layer segmentation boundaries? :( > Since the iwarp protocols run on top of TCP/IP, there is always the case that some middle box resegments tcp segments differently, so a good iwarp HW implementation should deal with funny alignments, partial iWARP PDUs arriving, etc. But the RFCs, as I read them, want implementations to try "really hard" to avoid spanning an iWARP PDU across many TCP segments. And I think siw should do the same, by default. 
-- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> >"At the Data Source, the DDP layer MUST segment the data contained in > > a ULP message into a series of DDP Segments, where each DDP > >Segment > > contains a DDP Header and ULP Payload, and MUST be no larger than > >the > > MULPDU value Advertised by the LLP." > > TCP advertises 64k on the kernel socket ;) Yes, but it also advertises the actual wire MSS, right? -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Jan 23, 2018 at 11:42:20AM -0600, Steve Wise wrote: > Since the iwarp protocols run on top of TCP/IP, there is always the case > that some middle box resegments tcp segments differently, so a good iwarp HW > implementation should deal with funny alignments, partial iWARP PDUs > arriving, etc. But the RFCs, as I read them, want implementations to try > "really hard" to avoid spanning an iWARP PDU across many TCP segments. And > I think siw should do the same, by default. But Bernard just said siw doesn't interoperate in certain cases because of this - so that sounds like more than 'try really hard' ?? Or is that an overstatement and it just makes the rx side slower if segmentation is not optimal? Jason -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> > On Tue, Jan 23, 2018 at 11:42:20AM -0600, Steve Wise wrote: > > > Since the iwarp protocols run on top of TCP/IP, there is always the case > > that some middle box resegments tcp segments differently, so a good iwarp > HW > > implementation should deal with funny alignments, partial iWARP PDUs > > arriving, etc. But the RFCs, as I read them, want implementations to try > > "really hard" to avoid spanning an iWARP PDU across many TCP segments. > And > > I think siw should do the same, by default. > > But Bernard just said siw doesn't interoperate in certain cases > because of this - so that sounds like more than 'try really hard' ?? > > Or is that an overstatement and it just makes the rx side slower if > segmentation is not optimal? Creating 64K iWARP PDUs causes interoperability problems. If siw builds iWARP PDUs that fit within the TCP wire MSS, then these problems are avoided, and siw becomes more spec compliant. If the iWARP PDUs are built this way, then nothing the tcp stack does will cause problems other than slow down things. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
-----"Steve Wise" <swise@opengridcomputing.com> wrote: ----- >To: "'Jason Gunthorpe'" <jgg@ziepe.ca> >From: "Steve Wise" <swise@opengridcomputing.com> >Date: 01/23/2018 06:55PM >Cc: "'Bernard Metzler'" <bmt@zurich.ibm.com>, ><linux-rdma@vger.kernel.org> >Subject: RE: [PATCH v3 03/13] Attach/detach SoftiWarp to/from network >and RDMA subsystem > >> >> On Tue, Jan 23, 2018 at 11:42:20AM -0600, Steve Wise wrote: >> >> > Since the iwarp protocols run on top of TCP/IP, there is always >the case >> > that some middle box resegments tcp segments differently, so a >good >iwarp >> HW >> > implementation should deal with funny alignments, partial iWARP >PDUs >> > arriving, etc. But the RFCs, as I read them, want >implementations to >try >> > "really hard" to avoid spanning an iWARP PDU across many TCP >segments. >> And >> > I think siw should do the same, by default. >> >> But Bernard just said siw doesn't interoperate in certain cases >> because of this - so that sounds like more than 'try really hard' >?? >> >> Or is that an overstatement and it just makes the rx side slower if >> segmentation is not optimal? > >Creating 64K iWARP PDUs causes interoperability problems. If siw >builds >iWARP PDUs that fit within the TCP wire MSS, then these problems are >avoided, and siw becomes more spec compliant. If the iWARP PDUs are >built >this way, then nothing the tcp stack does will cause problems other >than >slow down things. > Right. It's just the slowdown what hurts me. But it may improve the need to buy real iWarp HW for good performance ;) See, I came for the other side looking at that - how to make best use of kernel services, if I anyway cannot completely control the wire shape of the frames I push. But I understand that some HW cannot deal with it. Same is true for small messages. if siw puts one 8 byte payload packet in one TCP frame, we end up with 350k IOPs. 
If we allow siw to pack several of them into one frame whenever the send queue has more work to do, the rate goes much higher, and we avoid much extra delay. I recently removed that optimization, just to avoid discussion about it. -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c new file mode 100644 index 000000000000..1b7fc58d4eb9 --- /dev/null +++ b/drivers/infiniband/sw/siw/siw_main.c @@ -0,0 +1,816 @@ +/* + * Software iWARP device driver + * + * Authors: Bernard Metzler <bmt@zurich.ibm.com> + * + * Copyright (c) 2008-2017, IBM Corporation + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * - Neither the name of IBM nor the names of its contributors may be + * used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/netdevice.h> +#include <linux/inetdevice.h> +#include <net/net_namespace.h> +#include <linux/rtnetlink.h> +#include <linux/if_arp.h> +#include <linux/list.h> +#include <linux/kernel.h> +#include <linux/dma-mapping.h> + +#include <rdma/ib_verbs.h> +#include <rdma/ib_smi.h> +#include <rdma/ib_user_verbs.h> + +#include "siw.h" +#include "siw_obj.h" +#include "siw_cm.h" +#include "siw_verbs.h" +#include <linux/kthread.h> + +MODULE_AUTHOR("Bernard Metzler"); +MODULE_DESCRIPTION("Software iWARP Driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_VERSION("0.2"); + +/* transmit from user buffer, if possible */ +const bool zcopy_tx; + +/* Restrict usage of GSO, if hardware peer iwarp is unable to process + * large packets. gso_seg_limit = 1 lets siw send only packets up to + * one real MTU in size, but severely limits maximum bandwidth. + * gso_seg_limit = 0 makes use of GSO (and more than doubles throughput + * for large transfers). 
+ */ +const int gso_seg_limit; + +/* Attach siw also with loopback devices */ +const bool loopback_enabled = true; + +/* We try to negotiate CRC on, if true */ +const bool mpa_crc_required; + +/* MPA CRC on/off enforced */ +const bool mpa_crc_strict; + +/* Set TCP_NODELAY, and push messages asap */ +const bool siw_lowdelay = true; +/* Set TCP_QUICKACK */ +const bool tcp_quickack; + +/* Select MPA version to be used during connection setup */ +u_char mpa_version = MPA_REVISION_2; + +/* Selects MPA P2P mode (additional handshake during connection + * setup, if true + */ +const bool peer_to_peer; + +static LIST_HEAD(siw_devlist); + +struct task_struct *siw_tx_thread[NR_CPUS]; +struct crypto_shash *siw_crypto_shash; + +static ssize_t show_sw_version(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct siw_device *sdev = container_of(dev, struct siw_device, + base_dev.dev); + + return sprintf(buf, "%x\n", sdev->attrs.version); +} + +static DEVICE_ATTR(sw_version, 0444, show_sw_version, NULL); + +static struct device_attribute *siw_dev_attributes[] = { + &dev_attr_sw_version +}; + +static int siw_modify_port(struct ib_device *base_dev, u8 port, int mask, + struct ib_port_modify *props) +{ + return -EOPNOTSUPP; +} + +static int siw_device_register(struct siw_device *sdev) +{ + struct ib_device *base_dev = &sdev->base_dev; + int rv, i; + static int dev_id = 1; + + rv = ib_register_device(base_dev, NULL); + if (rv) { + pr_warn("siw: %s: registration error %d\n", + base_dev->name, rv); + return rv; + } + + for (i = 0; i < ARRAY_SIZE(siw_dev_attributes); ++i) { + rv = device_create_file(&base_dev->dev, siw_dev_attributes[i]); + if (rv) { + pr_warn("siw: %s: create file error: rv=%d\n", + base_dev->name, rv); + ib_unregister_device(base_dev); + return rv; + } + } + siw_debugfs_add_device(sdev); + + sdev->attrs.vendor_part_id = dev_id++; + + siw_dbg(sdev, "HWaddr=%02x.%02x.%02x.%02x.%02x.%02x\n", + *(u8 *)sdev->netdev->dev_addr, + *((u8 
*)sdev->netdev->dev_addr + 1), + *((u8 *)sdev->netdev->dev_addr + 2), + *((u8 *)sdev->netdev->dev_addr + 3), + *((u8 *)sdev->netdev->dev_addr + 4), + *((u8 *)sdev->netdev->dev_addr + 5)); + + sdev->is_registered = 1; + + return 0; +} + +static void siw_device_deregister(struct siw_device *sdev) +{ + int i; + + siw_debugfs_del_device(sdev); + + if (sdev->is_registered) { + + siw_dbg(sdev, "deregister\n"); + + for (i = 0; i < ARRAY_SIZE(siw_dev_attributes); ++i) + device_remove_file(&sdev->base_dev.dev, + siw_dev_attributes[i]); + + ib_unregister_device(&sdev->base_dev); + } + if (atomic_read(&sdev->num_ctx) || atomic_read(&sdev->num_srq) || + atomic_read(&sdev->num_mr) || atomic_read(&sdev->num_cep) || + atomic_read(&sdev->num_qp) || atomic_read(&sdev->num_cq) || + atomic_read(&sdev->num_pd)) { + pr_warn("siw at %s: orphaned resources!\n", + sdev->netdev->name); + pr_warn(" CTX %d, SRQ %d, QP %d, CQ %d, MEM %d, CEP %d, PD %d\n", + atomic_read(&sdev->num_ctx), + atomic_read(&sdev->num_srq), + atomic_read(&sdev->num_qp), + atomic_read(&sdev->num_cq), + atomic_read(&sdev->num_mr), + atomic_read(&sdev->num_cep), + atomic_read(&sdev->num_pd)); + } + + while (!list_empty(&sdev->cep_list)) { + struct siw_cep *cep = list_entry(sdev->cep_list.next, + struct siw_cep, devq); + list_del(&cep->devq); + pr_warn("siw: at %s: free orphaned CEP 0x%p, state %d\n", + sdev->base_dev.name, cep, cep->state); + kfree(cep); + } + sdev->is_registered = 0; +} + +static void siw_device_destroy(struct siw_device *sdev) +{ + siw_dbg(sdev, "destroy device\n"); + siw_idr_release(sdev); + + kfree(sdev->base_dev.iwcm); + dev_put(sdev->netdev); + + ib_dealloc_device(&sdev->base_dev); +} + +static struct siw_device *siw_dev_from_netdev(struct net_device *dev) +{ + if (!list_empty(&siw_devlist)) { + struct list_head *pos; + + list_for_each(pos, &siw_devlist) { + struct siw_device *sdev = + list_entry(pos, struct siw_device, list); + if (sdev->netdev == dev) + return sdev; + } + } + return NULL; +} + 
+static int siw_create_tx_threads(void) +{ + int cpu, rv, assigned = 0; + + for_each_online_cpu(cpu) { + /* Skip HT cores */ + if (cpu % cpumask_weight(topology_sibling_cpumask(cpu))) { + siw_tx_thread[cpu] = NULL; + continue; + } + siw_tx_thread[cpu] = kthread_create(siw_run_sq, + (unsigned long *)(long)cpu, + "siw_tx/%d", cpu); + if (IS_ERR(siw_tx_thread[cpu])) { + rv = PTR_ERR(siw_tx_thread[cpu]); + siw_tx_thread[cpu] = NULL; + pr_info("Creating TX thread for CPU %d failed", cpu); + continue; + } + kthread_bind(siw_tx_thread[cpu], cpu); + + wake_up_process(siw_tx_thread[cpu]); + assigned++; + } + return assigned; +} + +static int siw_dev_qualified(struct net_device *netdev) +{ + /* + * Additional hardware support can be added here + * (e.g. ARPHRD_FDDI, ARPHRD_ATM, ...) - see + * <linux/if_arp.h> for type identifiers. + */ + if (netdev->type == ARPHRD_ETHER || + netdev->type == ARPHRD_IEEE802 || + (netdev->type == ARPHRD_LOOPBACK && loopback_enabled)) + return 1; + + return 0; +} + +static DEFINE_PER_CPU(atomic_t, use_cnt = ATOMIC_INIT(0)); + +static struct { + struct cpumask **tx_valid_cpus; + int num_nodes; +} siw_cpu_info; + +static int siw_init_cpulist(void) +{ + int i, num_nodes; + + num_nodes = num_possible_nodes(); + siw_cpu_info.num_nodes = num_nodes; + + siw_cpu_info.tx_valid_cpus = kcalloc(num_nodes, sizeof(void *), + GFP_KERNEL); + if (!siw_cpu_info.tx_valid_cpus) { + siw_cpu_info.num_nodes = 0; + return -ENOMEM; + } + + for (i = 0; i < siw_cpu_info.num_nodes; i++) { + siw_cpu_info.tx_valid_cpus[i] = kzalloc(sizeof(struct cpumask), + GFP_KERNEL); + if (!siw_cpu_info.tx_valid_cpus[i]) + goto out_err; + + cpumask_clear(siw_cpu_info.tx_valid_cpus[i]); + } + for_each_possible_cpu(i) + cpumask_set_cpu(i, siw_cpu_info.tx_valid_cpus[cpu_to_node(i)]); + + return 0; + +out_err: + siw_cpu_info.num_nodes = 0; + while (i) { + kfree(siw_cpu_info.tx_valid_cpus[i]); + siw_cpu_info.tx_valid_cpus[i--] = NULL; + } + kfree(siw_cpu_info.tx_valid_cpus); + 
siw_cpu_info.tx_valid_cpus = NULL; + + return -ENOMEM; +} + +static void siw_destroy_cpulist(void) +{ + int i = 0; + + while (i < siw_cpu_info.num_nodes) + kfree(siw_cpu_info.tx_valid_cpus[i++]); + + kfree(siw_cpu_info.tx_valid_cpus); +} + +/* + * Choose CPU with least number of active QP's from NUMA node of + * TX interface. + */ +int siw_get_tx_cpu(struct siw_device *sdev) +{ + const struct cpumask *tx_cpumask; + int i, num_cpus, cpu, tx_cpu = -1, min_use, + node = sdev->numa_node; + + if (node < 0) + tx_cpumask = cpu_online_mask; + else + tx_cpumask = siw_cpu_info.tx_valid_cpus[node]; + + num_cpus = cpumask_weight(tx_cpumask); + if (!num_cpus) { + /* no CPU on this NUMA node */ + tx_cpumask = cpu_online_mask; + num_cpus = cpumask_weight(tx_cpumask); + } + if (!num_cpus) { + pr_warn("siw: no tx cpu found\n"); + return tx_cpu; + } + cpu = cpumask_first(tx_cpumask); + + for (i = 0, min_use = SIW_MAX_QP; i < num_cpus; + i++, cpu = cpumask_next(cpu, tx_cpumask)) { + int usage; + + /* Skip any cores which have no TX thread */ + if (!siw_tx_thread[cpu]) + continue; + + usage = atomic_inc_return(&per_cpu(use_cnt, cpu)); + + if (usage < min_use) { + min_use = usage; + tx_cpu = cpu; + } else { + atomic_dec_return(&per_cpu(use_cnt, cpu)); + } + if (min_use == 1) + break; + } + siw_dbg(sdev, "tx cpu %d, node %d, %d qp's\n", + cpu, node, min_use); + + return tx_cpu; +} + +void siw_put_tx_cpu(int cpu) +{ + atomic_dec(&per_cpu(use_cnt, cpu)); +} + +static void siw_verbs_sq_flush(struct ib_qp *base_qp) +{ + struct siw_qp *qp = siw_qp_base2siw(base_qp); + + down_write(&qp->state_lock); + siw_sq_flush(qp); + up_write(&qp->state_lock); +} + +static void siw_verbs_rq_flush(struct ib_qp *base_qp) +{ + struct siw_qp *qp = siw_qp_base2siw(base_qp); + + down_write(&qp->state_lock); + siw_rq_flush(qp); + up_write(&qp->state_lock); +} + +static struct ib_ah *siw_create_ah(struct ib_pd *pd, struct rdma_ah_attr *attr, + struct ib_udata *udata) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static 
int siw_destroy_ah(struct ib_ah *ah) +{ + return -EOPNOTSUPP; +} + +static struct siw_device *siw_device_create(struct net_device *netdev) +{ + struct siw_device *sdev; + struct ib_device *base_dev; + struct device *parent = netdev->dev.parent; + + sdev = (struct siw_device *)ib_alloc_device(sizeof(*sdev)); + if (!sdev) + goto out; + + base_dev = &sdev->base_dev; + + if (!parent) { + /* + * The loopback device has no parent device, + * so it appears as a top-level device. To support + * loopback device connectivity, take this device + * as the parent device. Skip all other devices + * w/o parent device. + */ + if (netdev->type != ARPHRD_LOOPBACK) { + pr_warn("siw: device %s skipped (no parent dev)\n", + netdev->name); + ib_dealloc_device(base_dev); + sdev = NULL; + goto out; + } + parent = &netdev->dev; + } + base_dev->iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL); + if (!base_dev->iwcm) { + ib_dealloc_device(base_dev); + sdev = NULL; + goto out; + } + + sdev->netdev = netdev; + list_add_tail(&sdev->list, &siw_devlist); + + strcpy(base_dev->name, SIW_IBDEV_PREFIX); + strlcpy(base_dev->name + strlen(SIW_IBDEV_PREFIX), netdev->name, + IB_DEVICE_NAME_MAX - strlen(SIW_IBDEV_PREFIX)); + + memset(&base_dev->node_guid, 0, sizeof(base_dev->node_guid)); + + if (netdev->type != ARPHRD_LOOPBACK) { + memcpy(&base_dev->node_guid, netdev->dev_addr, 6); + } else { + /* + * The loopback device does not have a HW address, + * but connection mangagement lib expects gid != 0 + */ + size_t gidlen = min_t(size_t, strlen(base_dev->name), 6); + + memcpy(&base_dev->node_guid, base_dev->name, gidlen); + } + base_dev->owner = THIS_MODULE; + + base_dev->uverbs_cmd_mask = + (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | + (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) | + (1ull << IB_USER_VERBS_CMD_QUERY_PORT) | + (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << 
IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | + (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | + (1ull << IB_USER_VERBS_CMD_POLL_CQ) | + (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) | + (1ull << IB_USER_VERBS_CMD_CREATE_QP) | + (1ull << IB_USER_VERBS_CMD_QUERY_QP) | + (1ull << IB_USER_VERBS_CMD_MODIFY_QP) | + (1ull << IB_USER_VERBS_CMD_DESTROY_QP) | + (1ull << IB_USER_VERBS_CMD_POST_SEND) | + (1ull << IB_USER_VERBS_CMD_POST_RECV) | + (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) | + (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) | + (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) | + (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) | + (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_DEREG_MR) | + (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV); + + base_dev->node_type = RDMA_NODE_RNIC; + memcpy(base_dev->node_desc, SIW_NODE_DESC_COMMON, + sizeof(SIW_NODE_DESC_COMMON)); + + /* + * Current model (one-to-one device association): + * One Softiwarp device per net_device or, equivalently, + * per physical port. 
+ */ + base_dev->phys_port_cnt = 1; + + base_dev->dev.parent = parent; + base_dev->dev.dma_ops = &dma_virt_ops; + + base_dev->num_comp_vectors = num_possible_cpus(); + base_dev->query_device = siw_query_device; + base_dev->query_port = siw_query_port; + base_dev->get_port_immutable = siw_get_port_immutable; + base_dev->query_qp = siw_query_qp; + base_dev->modify_port = siw_modify_port; + base_dev->query_pkey = siw_query_pkey; + base_dev->query_gid = siw_query_gid; + base_dev->alloc_ucontext = siw_alloc_ucontext; + base_dev->dealloc_ucontext = siw_dealloc_ucontext; + base_dev->mmap = siw_mmap; + base_dev->alloc_pd = siw_alloc_pd; + base_dev->dealloc_pd = siw_dealloc_pd; + base_dev->create_ah = siw_create_ah; + base_dev->destroy_ah = siw_destroy_ah; + base_dev->create_qp = siw_create_qp; + base_dev->modify_qp = siw_verbs_modify_qp; + base_dev->destroy_qp = siw_destroy_qp; + base_dev->create_cq = siw_create_cq; + base_dev->destroy_cq = siw_destroy_cq; + base_dev->resize_cq = NULL; + base_dev->poll_cq = siw_poll_cq; + base_dev->get_dma_mr = siw_get_dma_mr; + base_dev->reg_user_mr = siw_reg_user_mr; + base_dev->dereg_mr = siw_dereg_mr; + base_dev->alloc_mr = siw_alloc_mr; + base_dev->map_mr_sg = siw_map_mr_sg; + base_dev->dealloc_mw = NULL; + + base_dev->create_srq = siw_create_srq; + base_dev->modify_srq = siw_modify_srq; + base_dev->query_srq = siw_query_srq; + base_dev->destroy_srq = siw_destroy_srq; + base_dev->post_srq_recv = siw_post_srq_recv; + + base_dev->attach_mcast = NULL; + base_dev->detach_mcast = NULL; + base_dev->process_mad = siw_no_mad; + + base_dev->req_notify_cq = siw_req_notify_cq; + base_dev->post_send = siw_post_send; + base_dev->post_recv = siw_post_receive; + + base_dev->drain_sq = siw_verbs_sq_flush; + base_dev->drain_rq = siw_verbs_rq_flush; + + base_dev->iwcm->connect = siw_connect; + base_dev->iwcm->accept = siw_accept; + base_dev->iwcm->reject = siw_reject; + base_dev->iwcm->create_listen = siw_create_listen; + base_dev->iwcm->destroy_listen 
= siw_destroy_listen; + base_dev->iwcm->add_ref = siw_qp_get_ref; + base_dev->iwcm->rem_ref = siw_qp_put_ref; + base_dev->iwcm->get_qp = siw_get_base_qp; + + sdev->attrs.version = VERSION_ID_SOFTIWARP; + sdev->attrs.vendor_id = SIW_VENDOR_ID; + sdev->attrs.sw_version = VERSION_ID_SOFTIWARP; + sdev->attrs.max_qp = SIW_MAX_QP; + sdev->attrs.max_qp_wr = SIW_MAX_QP_WR; + sdev->attrs.max_ord = SIW_MAX_ORD_QP; + sdev->attrs.max_ird = SIW_MAX_IRD_QP; + sdev->attrs.cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS; + sdev->attrs.max_sge = SIW_MAX_SGE; + sdev->attrs.max_sge_rd = SIW_MAX_SGE_RD; + sdev->attrs.max_cq = SIW_MAX_CQ; + sdev->attrs.max_cqe = SIW_MAX_CQE; + sdev->attrs.max_mr = SIW_MAX_MR; + sdev->attrs.max_mr_size = rlimit(RLIMIT_MEMLOCK); + sdev->attrs.max_pd = SIW_MAX_PD; + sdev->attrs.max_mw = SIW_MAX_MW; + sdev->attrs.max_fmr = SIW_MAX_FMR; + sdev->attrs.max_srq = SIW_MAX_SRQ; + sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR; + sdev->attrs.max_srq_sge = SIW_MAX_SGE; + + siw_idr_init(sdev); + INIT_LIST_HEAD(&sdev->cep_list); + INIT_LIST_HEAD(&sdev->qp_list); + INIT_LIST_HEAD(&sdev->mr_list); + + atomic_set(&sdev->num_ctx, 0); + atomic_set(&sdev->num_srq, 0); + atomic_set(&sdev->num_qp, 0); + atomic_set(&sdev->num_cq, 0); + atomic_set(&sdev->num_mr, 0); + atomic_set(&sdev->num_pd, 0); + atomic_set(&sdev->num_cep, 0); + + sdev->numa_node = dev_to_node(parent); + + sdev->is_registered = 0; +out: + if (sdev) + dev_hold(netdev); + + return sdev; +} + +static int siw_netdev_event(struct notifier_block *nb, unsigned long event, + void *arg) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(arg); + struct in_device *in_dev; + struct siw_device *sdev; + + dev_dbg(&netdev->dev, "siw: event %lu\n", event); + + if (dev_net(netdev) != &init_net) + goto done; + + sdev = siw_dev_from_netdev(netdev); + + switch (event) { + + case NETDEV_UP: + if (!sdev) + break; + + if (sdev->is_registered) { + sdev->state = IB_PORT_ACTIVE; + siw_port_event(sdev, 1, IB_EVENT_PORT_ACTIVE); + break; 
+ } + in_dev = in_dev_get(netdev); + if (!in_dev) { + dev_dbg(&netdev->dev, "siw: no in_device\n"); + sdev->state = IB_PORT_INIT; + break; + } + if (in_dev->ifa_list) { + sdev->state = IB_PORT_ACTIVE; + if (siw_device_register(sdev)) + sdev->state = IB_PORT_INIT; + } else { + dev_dbg(&netdev->dev, "siw: no ifa_list\n"); + sdev->state = IB_PORT_INIT; + } + in_dev_put(in_dev); + + break; + + case NETDEV_DOWN: + if (sdev && sdev->is_registered) { + sdev->state = IB_PORT_DOWN; + siw_port_event(sdev, 1, IB_EVENT_PORT_ERR); + break; + } + break; + + case NETDEV_REGISTER: + if (!sdev) { + if (!siw_dev_qualified(netdev)) + break; + + sdev = siw_device_create(netdev); + if (sdev) { + sdev->state = IB_PORT_INIT; + dev_dbg(&netdev->dev, "siw: new device\n"); + } + } + break; + + case NETDEV_UNREGISTER: + if (sdev) { + if (sdev->is_registered) + siw_device_deregister(sdev); + list_del(&sdev->list); + siw_device_destroy(sdev); + } + break; + + case NETDEV_CHANGEADDR: + if (sdev->is_registered) + siw_port_event(sdev, 1, IB_EVENT_LID_CHANGE); + + break; + /* + * Todo: Below netdev events are currently not handled. + */ + case NETDEV_CHANGEMTU: + case NETDEV_GOING_DOWN: + case NETDEV_CHANGE: + + break; + + default: + break; + } +done: + return NOTIFY_OK; +} + +static struct notifier_block siw_netdev_nb = { + .notifier_call = siw_netdev_event, +}; + +/* + * siw_init_module - Initialize Softiwarp module and register with netdev + * subsystem to create Softiwarp devices per net_device + */ +static __init int siw_init_module(void) +{ + int rv; + int nr_cpu; + + if (SENDPAGE_THRESH < SIW_MAX_INLINE) { + pr_info("siw: sendpage threshold too small: %u\n", + (int)SENDPAGE_THRESH); + rv = EINVAL; + goto out_error; + } + rv = siw_init_cpulist(); + if (rv) + goto out_error; + + rv = siw_cm_init(); + if (rv) + goto out_error; + + siw_debug_init(); + + /* + * Allocate CRC SHASH object. 
Fail loading siw only, if CRC is + * required by kernel module + */ + siw_crypto_shash = crypto_alloc_shash("crc32c", 0, 0); + if (IS_ERR(siw_crypto_shash)) { + pr_info("siw: Loading CRC32c failed: %ld\n", + PTR_ERR(siw_crypto_shash)); + siw_crypto_shash = NULL; + if (mpa_crc_required == true) + goto out_error; + } + rv = register_netdevice_notifier(&siw_netdev_nb); + if (rv) { + siw_debugfs_delete(); + goto out_error; + } + for (nr_cpu = 0; nr_cpu < nr_cpu_ids; nr_cpu++) + siw_tx_thread[nr_cpu] = NULL; + + if (!siw_create_tx_threads()) { + pr_info("siw: Could not start any TX thread\n"); + unregister_netdevice_notifier(&siw_netdev_nb); + goto out_error; + } + pr_info("SoftiWARP attached\n"); + return 0; + +out_error: + for (nr_cpu = 0; nr_cpu < nr_cpu_ids; nr_cpu++) { + if (siw_tx_thread[nr_cpu]) { + siw_stop_tx_thread(nr_cpu); + siw_tx_thread[nr_cpu] = NULL; + } + } + if (siw_crypto_shash) + crypto_free_shash(siw_crypto_shash); + + pr_info("SoftIWARP attach failed. Error: %d\n", rv); + + siw_cm_exit(); + siw_destroy_cpulist(); + + return rv; +} + +static void __exit siw_exit_module(void) +{ + int nr_cpu; + + for (nr_cpu = 0; nr_cpu < nr_cpu_ids; nr_cpu++) { + if (siw_tx_thread[nr_cpu]) { + siw_stop_tx_thread(nr_cpu); + siw_tx_thread[nr_cpu] = NULL; + } + } + unregister_netdevice_notifier(&siw_netdev_nb); + + siw_cm_exit(); + + while (!list_empty(&siw_devlist)) { + struct siw_device *sdev = + list_entry(siw_devlist.next, struct siw_device, list); + list_del(&sdev->list); + if (sdev->is_registered) + siw_device_deregister(sdev); + + siw_device_destroy(sdev); + } + if (siw_crypto_shash) + crypto_free_shash(siw_crypto_shash); + + siw_debugfs_delete(); + siw_destroy_cpulist(); + + pr_info("SoftiWARP detached\n"); +} + +module_init(siw_init_module); +module_exit(siw_exit_module);
Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com> --- drivers/infiniband/sw/siw/siw_main.c | 816 +++++++++++++++++++++++++++++++++++ 1 file changed, 816 insertions(+) create mode 100644 drivers/infiniband/sw/siw/siw_main.c