@@ -107,6 +107,7 @@ source "drivers/infiniband/hw/hfi1/Kconfig"
source "drivers/infiniband/hw/qedr/Kconfig"
source "drivers/infiniband/sw/rdmavt/Kconfig"
source "drivers/infiniband/sw/rxe/Kconfig"
+source "drivers/infiniband/sw/loopback/Kconfig"
endif
source "drivers/infiniband/ulp/ipoib/Kconfig"
@@ -1,2 +1,3 @@
obj-$(CONFIG_INFINIBAND_RDMAVT) += rdmavt/
obj-$(CONFIG_RDMA_RXE) += rxe/
+obj-$(CONFIG_RDMA_LOOPBACK) += loopback/
new file mode 100644
@@ -0,0 +1,14 @@
+config RDMA_LOOPBACK
+ tristate "loopback (RoCE) driver"
+ depends on INET && INFINIBAND
+ depends on ARCH_DMA_ADDR_T_64BIT
+ select DMA_VIRT_OPS
+ help
+ This driver implements the InfiniBand RDMA transport over
+ the Linux network lo netdevice. It enables a system to
+ use the standard lo (loopback) netdevice to emulate a purely
+ software RDMA device. It doesn't implement any transport
+ layers. It implements only a data copier and the verbs layer,
+ as it works only on top of the local lo device. It follows the
+ InfiniBand specification version 1.3 and the RoCE annex. This
+ is a zero-configuration driver.
new file mode 100644
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_RDMA_LOOPBACK) += rdma_loopback.o
+
+rdma_loopback-y := loopback.o helper.o
new file mode 100644
@@ -0,0 +1,1603 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright (c) 2019 Mellanox Technologies. All rights reserved.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <net/addrconf.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_mad.h>
+
+#include "loopback_helper.h"
+
+/*
+ * Upper bounds on the number of objects of each kind. They are reported
+ * to consumers by loopback_query_device(); the MR, QP, CQ and AH limits
+ * are also handed to init_table() when the id tables are set up.
+ */
+enum {
+ LOOPBACK_MAX_MR = (1 << 24) - 1,
+ LOOPBACK_MAX_QP = (1 << 24) - 1,
+ LOOPBACK_MAX_CQ = LOOPBACK_MAX_QP,
+ LOOPBACK_MAX_AH = INT_MAX,
+ LOOPBACK_MAX_PD = INT_MAX,
+};
+
+/* Driver view of a user context; it only wraps the core ib_ucontext. */
+struct loopback_uctx {
+ struct ib_ucontext ibuctx;
+};
+
+/* Driver view of a protection domain; it only wraps the core ib_pd. */
+struct loopback_pd {
+ struct ib_pd ibpd;
+};
+
+/*
+ * Completion queue.
+ * @res:   entry in the device's cq_tbl.
+ * @ibcq:  core CQ object handed back to the verbs layer.
+ * @cqes:  fifo of struct loopback_cqe waiting to be returned by
+ *         loopback_poll_cq().
+ * @armed: last value passed to loopback_req_notify_cq(); non-zero means
+ *         the completion handler may be invoked.
+ */
+struct loopback_cq {
+ /* resource entry must be first */
+ struct loopback_resource res;
+ struct ib_cq ibcq;
+ struct loopback_fifo cqes;
+ enum ib_cq_notify_flags armed;
+};
+
+/*
+ * Kinds of memory region, stored in loopback_mr.type. get_mr_va() uses
+ * the type to choose between direct addressing (PHY, DMA) and page-table
+ * translation (USER, FRMR).
+ */
+enum {
+ LOOPBACK_MR_TYPE_PHY = 0, /* access physical sges of kernel */
+ LOOPBACK_MR_TYPE_USER = 1, /* userspace MR */
+ LOOPBACK_MR_TYPE_DMA = 2, /* DMA MR of kernel */
+ LOOPBACK_MR_TYPE_FRMR = 3, /* FRMR */
+};
+
+/*
+ * State specific to userspace MRs (LOOPBACK_MR_TYPE_USER).
+ * @umem:             umem obtained from ib_umem_get(); released in free_mr().
+ * @pages_alloc_size: byte size of the page table allocation made in
+ *                    loopback_reg_user_mr().
+ */
+struct loopback_usr_mr {
+ struct ib_umem *umem;
+ /* array of pages for this mr to access in datapath */
+ struct page *pages;
+ size_t pages_alloc_size;
+};
+
+/*
+ * State specific to fast-registration MRs (LOOPBACK_MR_TYPE_FRMR).
+ * NOTE(review): pg_iter is not used in the visible part of the file;
+ * presumably it is the next free slot in pg_tbl while pages are being
+ * mapped - confirm against the map_mr_sg implementation.
+ */
+struct loopback_frmr {
+ int pg_iter;
+};
+
+/*
+ * Memory region.
+ * @res:        entry in the device's mr_tbl; its id is turned into the
+ *              lkey/rkey by mr_id_to_mkey().
+ * @ibmr:       core MR object (iova, length, lkey, rkey).
+ * @type:       one of LOOPBACK_MR_TYPE_*.
+ * @access:     access flags given at registration time.
+ * @fbo:        first byte offset of the region inside its first page.
+ * @pg_tbl:     per-page kernel virtual addresses used by get_virt_mr_va().
+ * @page_shift: log2 of the page size used to index pg_tbl.
+ * @u:          type specific state.
+ */
+struct loopback_mr {
+ struct loopback_resource res;
+ struct ib_mr ibmr;
+ int type;
+ int access;
+
+ u32 fbo;
+ u64 *pg_tbl;
+ /* we store the page shift to make common use for frmr and user mr */
+ u32 page_shift;
+ union {
+ struct loopback_usr_mr umr;
+ struct loopback_frmr frmr;
+ } u;
+};
+
+/* Build the lkey/rkey of an MR: the table id sits above the low 8 bits. */
+static u32 mr_id_to_mkey(u32 id)
+{
+	const u32 mkey = id << 8;
+
+	return mkey;
+}
+
+/*
+ * Queue pair.
+ * @res:             entry in the device's qp_tbl; its id is the QP number.
+ * @ibqp:            core QP object.
+ * @state:           current QP state, written by loopback_modify_qp().
+ * @rqes:            fifo of posted receive work requests (loopback_rqe).
+ * @cqe_alloc_flags: gfp flags used when allocating completions for this QP.
+ * @rqe_alloc_flags: gfp flags chosen at creation time for receive entries.
+ * @user_qp:         true when the QP was created with udata (userspace).
+ * @attr:            attributes recorded by loopback_modify_qp().
+ * @init_attr:       copy of the attributes given at creation.
+ */
+struct loopback_qp {
+ struct loopback_resource res;
+ struct ib_qp ibqp;
+ enum ib_qp_state state;
+ struct loopback_fifo rqes;
+
+ gfp_t cqe_alloc_flags;
+ gfp_t rqe_alloc_flags;
+ bool user_qp;
+ struct ib_qp_attr attr;
+ struct ib_qp_init_attr init_attr;
+
+};
+
+/* Address handle: table entry, core AH object and its attributes. */
+struct loopback_ah {
+ struct loopback_resource res;
+ struct ib_ah ibah;
+ struct rdma_ah_attr attr;
+};
+
+/*
+ * The loopback RDMA device.
+ * @dev:       core ib_device; ib_to_loopdev() converts back from it.
+ * @mr_tbl, @cq_tbl, @qp_tbl, @ah_tbl: id tables for each object kind,
+ *             set up by init_loopdev_tables().
+ * @port_attr: template copied out by loopback_query_port().
+ * @zero_mr:   NOTE(review): not referenced in the visible part of the
+ *             file; purpose to be confirmed.
+ */
+struct rdma_loopdev {
+ struct ib_device dev;
+
+ struct loopback_resource_table mr_tbl;
+ struct loopback_resource_table cq_tbl;
+ struct loopback_resource_table qp_tbl;
+ struct loopback_resource_table ah_tbl;
+
+ struct ib_port_attr port_attr;
+ struct loopback_mr zero_mr;
+};
+
+/* Map a core ib_device back to the loopback device embedding it. */
+static inline struct rdma_loopdev *ib_to_loopdev(struct ib_device *dev)
+{
+	struct rdma_loopdev *ld = container_of(dev, struct rdma_loopdev, dev);
+
+	return ld;
+}
+
+/* Map a core ib_mr back to the loopback MR embedding it. */
+static inline struct loopback_mr *ib_to_loop_mr(struct ib_mr *ibmr)
+{
+	struct loopback_mr *mr = container_of(ibmr, struct loopback_mr, ibmr);
+
+	return mr;
+}
+
+/* Map a core ib_cq back to the loopback CQ embedding it. */
+static inline struct loopback_cq *ib_to_loop_cq(struct ib_cq *ibcq)
+{
+	struct loopback_cq *cq = container_of(ibcq, struct loopback_cq, ibcq);
+
+	return cq;
+}
+
+/* Map a core ib_ah back to the loopback AH embedding it. */
+static inline struct loopback_ah *ib_to_loop_ah(struct ib_ah *ibah)
+{
+	struct loopback_ah *ah = container_of(ibah, struct loopback_ah, ibah);
+
+	return ah;
+}
+
+/* Map a core ib_qp back to the loopback QP embedding it. */
+static inline struct loopback_qp *ib_to_loop_qp(struct ib_qp *ibqp)
+{
+	struct loopback_qp *qp = container_of(ibqp, struct loopback_qp, ibqp);
+
+	return qp;
+}
+
+/*
+ * One queued completion: the fifo linkage plus the work completion that
+ * loopback_poll_cq() copies out to the consumer before freeing the entry.
+ */
+struct loopback_cqe {
+ struct list_head list;
+ struct ib_wc wc;
+};
+
+/*
+ * One posted receive work request, queued on loopback_qp.rqes.
+ * @list: fifo linkage.
+ * @wr:   copy of the receive work request.
+ * @sges: trailing array holding wr.num_sge scatter/gather entries. It is
+ *        declared as a C99 flexible array member rather than a
+ *        zero-length array so compilers can bounds-check accesses.
+ */
+struct loopback_rqe {
+	struct list_head list;
+
+	/* copy of recv wr */
+	struct ib_recv_wr wr;
+	struct ib_sge sges[];
+};
+
+/* Module-wide loopback RDMA device instance. */
+static struct rdma_loopdev *loopdev;
+/* Network device backing the port; its MTU and flags feed query_port. */
+static struct net_device *lo;
+
+/*
+ * Set up the four id tables of the device.
+ * NOTE(review): the meaning of the init_table() arguments is defined in
+ * the helper; from the values here they appear to be (table, minimum id,
+ * maximum id, id mask, id shift) - the MR table uses mask 0xffffff00 and
+ * shift 8, which matches mr_id_to_mkey(), and the QP table starts at 1.
+ * Confirm against loopback_helper.h.
+ */
+static void init_loopdev_tables(struct rdma_loopdev *ld)
+{
+ init_table(&ld->ah_tbl, 0, LOOPBACK_MAX_AH, UINT_MAX, 0);
+ init_table(&ld->cq_tbl, 0, LOOPBACK_MAX_CQ, UINT_MAX, 0);
+ init_table(&ld->mr_tbl, 0, LOOPBACK_MAX_MR, 0xffffff00, 8);
+ init_table(&ld->qp_tbl, 1, LOOPBACK_MAX_QP, UINT_MAX, 0);
+}
+
+/*
+ * loopback_query_device - report the fixed capabilities of the device.
+ * @dev:  device being queried; its node GUID doubles as system image GUID.
+ * @attr: output, fully overwritten.
+ * @uhw:  unused.
+ *
+ * Always returns 0.
+ */
+static int loopback_query_device(struct ib_device *dev,
+				 struct ib_device_attr *attr,
+				 struct ib_udata *uhw)
+{
+	memset(attr, 0, sizeof(*attr));
+
+	/* Identity */
+	attr->sys_image_guid = dev->node_guid;
+	attr->vendor_id = 0;
+	attr->vendor_part_id = 0;
+	attr->hw_ver = 0;
+	attr->device_cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS;
+	attr->local_ca_ack_delay = 16;
+	attr->max_pkeys = 1;
+
+	/* Queue pairs and work requests */
+	attr->max_qp = LOOPBACK_MAX_QP;
+	attr->max_qp_wr = 65536;
+	attr->max_send_sge = 16;
+	attr->max_recv_sge = 16;
+	attr->max_sge_rd = 16;
+
+	/* Completion queues */
+	attr->max_cq = LOOPBACK_MAX_CQ;
+	attr->max_cqe = 65536;
+
+	/* Memory registration */
+	attr->max_mr_size = ULONG_MAX;
+	attr->page_size_cap = 0;
+	attr->max_mr = LOOPBACK_MAX_MR;
+	attr->max_mw = 0;
+	attr->max_fast_reg_page_list_len = 4;
+
+	/* Protection domains and address handles */
+	attr->max_pd = LOOPBACK_MAX_PD;
+	attr->max_ah = LOOPBACK_MAX_AH;
+
+	/* RDMA read / atomic resources */
+	attr->max_qp_rd_atom = 64;
+	attr->max_ee_rd_atom = 0;
+	attr->max_res_rd_atom = 64;
+	attr->max_qp_init_rd_atom = 64;
+	attr->max_ee_init_rd_atom = 64;
+	attr->atomic_cap = 0;
+	attr->masked_atomic_cap = 0;
+
+	/* Features reported as absent */
+	attr->max_ee = 0;
+	attr->max_rdd = 0;
+	attr->max_raw_ipv6_qp = 0;
+	attr->max_raw_ethy_qp = 0;
+	attr->max_mcast_grp = 0;
+	attr->max_mcast_qp_attach = 0;
+	attr->max_total_mcast_qp_attach = 0;
+	attr->max_srq = 0;
+	attr->max_srq_wr = 0;
+	attr->max_srq_sge = 0;
+	attr->sig_prot_cap = 0;
+	attr->sig_guard_cap = 0;
+	attr->timestamp_mask = 0;
+	attr->hca_core_clock = 0; /* TODO */
+
+	return 0;
+}
+
+/*
+ * loopback_query_port - report port attributes.
+ *
+ * Starts from the device's stored port_attr template, then overrides the
+ * link parameters. The active MTU is derived from the lo netdevice MTU
+ * and the port state follows the IFF_UP flag of lo. Always returns 0.
+ */
+static int loopback_query_port(struct ib_device *dev, u8 port_num,
+			       struct ib_port_attr *attr)
+{
+	const struct rdma_loopdev *ld = ib_to_loopdev(dev);
+
+	*attr = ld->port_attr;
+
+	attr->max_mtu = IB_MTU_4096;
+	attr->active_mtu = ib_mtu_int_to_enum(lo->mtu);
+	attr->max_msg_sz = 1 << 24;
+	attr->max_vl_num = 1;
+
+	attr->active_width = 2;
+	attr->active_speed = IB_SPEED_HDR;
+	attr->phys_state = 5; /* TODO */
+
+	attr->ip_gids = 1;
+	attr->port_cap_flags = IB_PORT_CM_SUP;
+
+	attr->state = (dev_get_flags(lo) & IFF_UP) ? IB_PORT_ACTIVE :
+						      IB_PORT_DOWN;
+	return 0;
+}
+
+/*
+ * Return the netdevice behind the port. A reference is taken with
+ * dev_hold() before returning, so the caller owns one reference on lo.
+ */
+static struct net_device *
+loopback_get_netdev(struct ib_device *device, u8 port_num)
+{
+ dev_hold(lo);
+ return lo;
+}
+
+/*
+ * Report the P_Key at @index. The same value, 0xffff, is returned for
+ * every port and index; the index is not range-checked here.
+ */
+static int loopback_query_pkey(struct ib_device *device,
+ u8 port_num, u16 index, u16 *pkey)
+{
+ *pkey = 0xffff;
+ return 0;
+}
+
+/* Every port of this device reports an Ethernet link layer. */
+static enum rdma_link_layer
+loopback_get_link_layer(struct ib_device *dev, u8 port_num)
+{
+ return IB_LINK_LAYER_ETHERNET;
+}
+
+/*
+ * Fill in the immutable port data: RoCE v2 (UDP encapsulation) core
+ * capabilities, table lengths taken from the stored port attributes and
+ * the standard MAD size. Always returns 0.
+ */
+static int loopback_port_immutable(struct ib_device *dev, u8 port_num,
+				   struct ib_port_immutable *immutable)
+{
+	const struct ib_port_attr *pa = &ib_to_loopdev(dev)->port_attr;
+
+	immutable->max_mad_size = IB_MGMT_MAD_SIZE;
+	immutable->gid_tbl_len = pa->gid_tbl_len;
+	immutable->pkey_tbl_len = pa->pkey_tbl_len;
+	immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
+
+	return 0;
+}
+
+/* No driver-private user context state to set up; always succeeds. */
+static int
+loopback_alloc_ucontext(struct ib_ucontext *ibuctx, struct ib_udata *udata)
+{
+ return 0;
+}
+
+/* Nothing to tear down: alloc_ucontext acquires no resources. */
+static void loopback_dealloc_ucontext(struct ib_ucontext *ibuctx)
+{
+}
+
+/* No driver-private protection domain state to set up; always succeeds. */
+static int
+loopback_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ /* Yes, we can further enhance core to not need null routines. */
+ return 0;
+}
+
+/* Nothing to tear down: alloc_pd acquires no resources. */
+static void loopback_dealloc_pd(struct ib_pd *ibpd)
+{
+}
+
+/*
+ * loopback_poll_cq - move queued completions to the caller's array.
+ * @ibcq:        completion queue to drain.
+ * @num_entries: capacity of @wc.
+ * @wc:          output array.
+ *
+ * Each completion copied out is freed. Returns the number of entries
+ * written, which is less than @num_entries when the fifo runs dry.
+ */
+static int loopback_poll_cq(struct ib_cq *ibcq, int num_entries,
+			    struct ib_wc *wc)
+{
+	struct loopback_cq *cq = ib_to_loop_cq(ibcq);
+	int polled;
+
+	for (polled = 0; num_entries; num_entries--, polled++) {
+		struct list_head *node = pop_from_fifo(&cq->cqes);
+		struct loopback_cqe *done;
+
+		if (!node)
+			break;
+
+		done = container_of(node, struct loopback_cqe, list);
+		memcpy(wc + polled, &done->wc, sizeof(done->wc));
+		kfree(done);
+	}
+	return polled;
+}
+
+/*
+ * Invoke the CQ completion handler if the CQ is armed, has at least one
+ * queued completion and a handler is installed. The check and the call
+ * both happen with the fifo lock held and interrupts disabled.
+ */
+static void attempt_notify_cq(struct loopback_cq *cq)
+{
+	struct ib_cq *ibcq = &cq->ibcq;
+	unsigned long irq_flags;
+	u64 pending;
+
+	spin_lock_irqsave(&cq->cqes.lock, irq_flags);
+	pending = get_fifo_entries(&cq->cqes);
+	if (cq->armed && pending && ibcq->comp_handler)
+		ibcq->comp_handler(ibcq, ibcq->cq_context);
+	spin_unlock_irqrestore(&cq->cqes.lock, irq_flags);
+}
+
+/*
+ * loopback_req_notify_cq - arm the CQ.
+ *
+ * Records @arm and, still under the fifo lock, fires the completion
+ * handler right away when completions are already queued. Always
+ * returns 0.
+ */
+static int loopback_req_notify_cq(struct ib_cq *ibcq,
+				  enum ib_cq_notify_flags arm)
+{
+	struct loopback_cq *cq = ib_to_loop_cq(ibcq);
+	unsigned long irq_flags;
+	u64 pending;
+
+	spin_lock_irqsave(&cq->cqes.lock, irq_flags);
+
+	cq->armed = arm;
+	pending = get_fifo_entries(&cq->cqes);
+	if (cq->armed && pending && cq->ibcq.comp_handler)
+		cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+
+	spin_unlock_irqrestore(&cq->cqes.lock, irq_flags);
+
+	return 0;
+}
+
+/*
+ * loopback_create_cq - allocate a CQ and register it in the CQ table.
+ *
+ * Returns the embedded ib_cq on success, ERR_PTR(-ENOMEM) when the
+ * allocation fails, or the error from attach_table_id().
+ */
+static struct ib_cq *
+loopback_create_cq(struct ib_device *dev,
+		   const struct ib_cq_init_attr *attr,
+		   struct ib_ucontext *context,
+		   struct ib_udata *udata)
+{
+	struct rdma_loopdev *ld = ib_to_loopdev(dev);
+	struct loopback_cq *cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+	int err;
+
+	if (!cq)
+		return ERR_PTR(-ENOMEM);
+
+	init_fifo(&cq->cqes);
+
+	err = attach_table_id(&ld->cq_tbl, &cq->res);
+	if (!err)
+		return &cq->ibcq;
+
+	kfree(cq);
+	return ERR_PTR(err);
+}
+
+/*
+ * loopback_destroy_cq - unregister and free a CQ.
+ *
+ * The CQ is first removed from the CQ table. Completions that were
+ * generated but never polled are still owned by the fifo at this point;
+ * they are freed here so that destroying a non-empty CQ does not leak
+ * them. Always returns 0.
+ */
+static int loopback_destroy_cq(struct ib_cq *ibcq)
+{
+	struct loopback_cq *cq = ib_to_loop_cq(ibcq);
+	struct rdma_loopdev *ld = ib_to_loopdev(ibcq->device);
+	struct list_head *entry;
+
+	detach_table_id(&ld->cq_tbl, &cq->res);
+
+	/* Drop completions nobody polled. */
+	while ((entry = pop_from_fifo(&cq->cqes)) != NULL)
+		kfree(container_of(entry, struct loopback_cqe, list));
+
+	kfree(cq);
+	return 0;
+}
+
+/*
+ * loopback_get_dma_mr - create a kernel DMA MR.
+ *
+ * The MR has type LOOPBACK_MR_TYPE_DMA and length ULONG_MAX; its lkey
+ * and rkey are both derived from the table id. Returns the embedded
+ * ib_mr, ERR_PTR(-ENOMEM), or the error from attach_table_id().
+ */
+static struct ib_mr *loopback_get_dma_mr(struct ib_pd *ibpd, int access)
+{
+	struct rdma_loopdev *ld = ib_to_loopdev(ibpd->device);
+	struct loopback_mr *mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	u32 mkey;
+	int err;
+
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	mr->access = access;
+	mr->ibmr.length = ULONG_MAX;
+	mr->type = LOOPBACK_MR_TYPE_DMA;
+
+	err = attach_table_id(&ld->mr_tbl, &mr->res);
+	if (err) {
+		kfree(mr);
+		return ERR_PTR(err);
+	}
+
+	mkey = mr_id_to_mkey(mr->res.id);
+	mr->ibmr.lkey = mkey;
+	mr->ibmr.rkey = mkey;
+	return &mr->ibmr;
+}
+
+/*
+ * Byte size of the page table needed for @umem: one u64 slot per page,
+ * matching the element type of loopback_mr.pg_tbl that fill_pg_table()
+ * writes. Sizing by pointer width would under-allocate on builds where
+ * pointers are narrower than 64 bits.
+ */
+static size_t mr_pages_store_size(struct ib_umem *umem)
+{
+	return ib_umem_page_count(umem) * sizeof(u64);
+}
+
+/*
+ * Populate mr->pg_tbl with one kernel virtual address per page of @umem.
+ * Every scatterlist entry is split into pages of the umem page size,
+ * starting at page_address() of the entry's first page.
+ */
+static void fill_pg_table(struct loopback_mr *mr, struct ib_umem *umem)
+{
+	const int pg_bytes = BIT(mr->u.umr.umem->page_shift);
+	struct scatterlist *sg;
+	int slot = 0;
+	int i;
+
+	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
+		const int npages = sg_dma_len(sg) >> mr->u.umr.umem->page_shift;
+		void *base = page_address(sg_page(sg));
+		int k;
+
+		for (k = 0; k < npages; k++) {
+			mr->pg_tbl[slot] = (u64)base + (k * pg_bytes);
+			slot++;
+		}
+	}
+}
+
+/*
+ * Release everything owned by @mr: the page table, the umem of a user
+ * MR (when one was obtained) and finally the MR itself.
+ */
+static void free_mr(struct loopback_mr *mr)
+{
+	const bool has_umem = mr->type == LOOPBACK_MR_TYPE_USER &&
+			      mr->u.umr.umem;
+
+	kfree(mr->pg_tbl);
+	if (has_umem)
+		ib_umem_release(mr->u.umr.umem);
+	kfree(mr);
+}
+
+/*
+ * loopback_dereg_mr - unregister and free an MR.
+ *
+ * The MR is removed from the MR table before its memory is released.
+ * Always returns 0.
+ */
+static int loopback_dereg_mr(struct ib_mr *ibmr)
+{
+ struct rdma_loopdev *ld = ib_to_loopdev(ibmr->device);
+ struct loopback_mr *mr = ib_to_loop_mr(ibmr);
+
+ /* First we must drop the reference, so nothing new starts on this
+ * mr, followed by wait for any ongoing operations.
+ * after that free the umem etc. This is done through table callback.
+ */
+ /* NOTE(review): the wait for in-flight users is presumably done
+ * inside detach_table_id(); the free itself is the direct call below.
+ */
+ detach_table_id(&ld->mr_tbl, &mr->res);
+ free_mr(mr);
+ return 0;
+}
+
+/*
+ * loopback_reg_user_mr - register a userspace memory region.
+ * @ibpd:   protection domain (used to find the device).
+ * @start:  user virtual address of the region.
+ * @length: length of the region in bytes.
+ * @iova:   address consumers will use in work requests.
+ * @access: access flags.
+ * @udata:  user data passed through to ib_umem_get().
+ *
+ * Obtains the umem, builds a table with the kernel address of each page
+ * and registers the MR in the MR table. Returns the embedded ib_mr on
+ * success or an ERR_PTR: the error reported by ib_umem_get(), -ENOMEM
+ * when the page table cannot be allocated, or the error from
+ * attach_table_id().
+ */
+static struct ib_mr *
+loopback_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 iova,
+		     int access, struct ib_udata *udata)
+{
+	struct rdma_loopdev *ld = ib_to_loopdev(ibpd->device);
+	struct loopback_mr *mr;
+	struct ib_umem *umem;
+	size_t alloc_size;
+	int ret;
+
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+	mr->type = LOOPBACK_MR_TYPE_USER;
+
+	/* ib_umem_get() reports failure through ERR_PTR(), not NULL. */
+	umem = ib_umem_get(udata, start, length, access, 0);
+	if (IS_ERR_OR_NULL(umem)) {
+		ret = umem ? PTR_ERR(umem) : -ENOMEM;
+		goto err;
+	}
+	mr->u.umr.umem = umem;
+
+	alloc_size = mr_pages_store_size(umem);
+	mr->pg_tbl = kzalloc(alloc_size, GFP_KERNEL);
+	if (!mr->pg_tbl) {
+		/* TODO: Remove 2GB registration limit */
+		pr_err("%s alloc_size = %zu, page cnt = %d\n", __func__,
+		       alloc_size, ib_umem_page_count(umem));
+		ret = -ENOMEM;
+		goto err;
+	}
+	/* u.umr.pages stays NULL (from kzalloc); the datapath uses pg_tbl. */
+	mr->u.umr.pages_alloc_size = alloc_size;
+	mr->fbo = ib_umem_offset(umem);
+	mr->page_shift = umem->page_shift;
+	mr->access = access;
+	mr->ibmr.iova = iova;
+	mr->ibmr.length = length;
+	fill_pg_table(mr, umem);
+
+	ret = attach_table_id(&ld->mr_tbl, &mr->res);
+	if (ret)
+		goto err;
+	mr->ibmr.lkey = mr_id_to_mkey(mr->res.id);
+	mr->ibmr.rkey = mr->ibmr.lkey;
+	pr_debug("%s mr=0x%x fbo=0x%x, len=%llu pg_sz=%u pg_cnt=%d\n",
+		 __func__, mr->ibmr.lkey, mr->fbo,
+		 mr->ibmr.length, mr->ibmr.page_size,
+		 ib_umem_page_count(umem));
+	return &mr->ibmr;
+
+err:
+	/* free_mr() copes with a missing page table and a missing umem. */
+	free_mr(mr);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Address translation for PHY and DMA MRs: @va is already a kernel
+ * address, so it is returned unchanged and the whole @cpy_len is
+ * reported as contiguous through @ret_len.
+ */
+static void *get_dma_mr_va(const struct loopback_mr *mr, u64 va,
+			   u32 cpy_len, u32 *ret_len)
+{
+	void *kva = (void *)(uintptr_t)va;
+
+	*ret_len = cpy_len;
+	return kva;
+}
+
+/*
+ * Address translation for USER and FRMR MRs.
+ * @mr:      region being accessed.
+ * @va:      address inside the region (same space as mr->ibmr.iova).
+ * @cpy_len: number of bytes the caller would like to copy.
+ * @ret_len: output; number of bytes that are contiguous from the
+ *           returned address, i.e. at most up to the end of the page.
+ *
+ * The offset from the start of the first page is the distance from iova
+ * plus the first byte offset; it is split into an index into pg_tbl and
+ * an offset inside that page. Returns the kernel address for @va.
+ */
+static void *get_virt_mr_va(const struct loopback_mr *mr, u64 va,
+			    u32 cpy_len, u32 *ret_len)
+{
+	/* zero_based_offset accounts for fbo; due to which it can be
+	 * offset by one page.
+	 */
+	const u64 zero_based_offset = (va - mr->ibmr.iova) + mr->fbo;
+	const int pg_idx = zero_based_offset / BIT(mr->page_shift);
+	u32 in_pg_offset;
+	u64 pg_addr;
+
+	/* What is left after removing the whole pages before pg_idx. */
+	in_pg_offset = (u32)(zero_based_offset -
+			     BIT(mr->page_shift) * pg_idx);
+
+	pg_addr = mr->pg_tbl[pg_idx] + in_pg_offset;
+
+	*ret_len = min_t(u32, BIT(mr->page_shift) - in_pg_offset, cpy_len);
+	return (void *)(uintptr_t)pg_addr;
+}
+
+/*
+ * Translate @va inside @mr to a kernel address, dispatching on the MR
+ * type. @ret_len receives the number of contiguous bytes available.
+ * Returns NULL for an unknown MR type.
+ */
+static void *get_mr_va(const struct loopback_mr *mr, u64 va,
+		       u32 cpy_len, u32 *ret_len)
+{
+	const int type = mr->type;
+
+	if (type == LOOPBACK_MR_TYPE_PHY || type == LOOPBACK_MR_TYPE_DMA)
+		return get_dma_mr_va(mr, va, cpy_len, ret_len);
+
+	if (type == LOOPBACK_MR_TYPE_USER || type == LOOPBACK_MR_TYPE_FRMR)
+		return get_virt_mr_va(mr, va, cpy_len, ret_len);
+
+	return NULL;
+}
+
+/*
+ * Queue a receive completion for an RC receive on @dqp and try to
+ * notify its receive CQ.
+ * @rqe:        receive entry being completed (supplies wr_cqe).
+ * @recv_len:   number of bytes received.
+ * @rqe_status: completion status.
+ * @inv_key:    rkey reported as invalidated, if any.
+ * @wc_flags:   work completion flags.
+ *
+ * Best effort: when the completion cannot be allocated nothing is
+ * queued and no error is reported.
+ */
+static void generate_rc_rq_cqe(struct loopback_qp *dqp,
+			       struct loopback_rqe *rqe,
+			       u32 recv_len,
+			       enum ib_wc_status rqe_status,
+			       u32 inv_key, u32 wc_flags)
+{
+	struct loopback_cq *rcq = ib_to_loop_cq(dqp->ibqp.recv_cq);
+	struct loopback_cqe *entry;
+	struct ib_wc *wc;
+
+	entry = kzalloc(sizeof(*entry), dqp->cqe_alloc_flags);
+	if (!entry)
+		return;
+
+	wc = &entry->wc;
+	wc->qp = &dqp->ibqp;
+	wc->wr_cqe = rqe->wr.wr_cqe;
+	wc->opcode = IB_WC_RECV;
+	wc->status = rqe_status;
+	wc->byte_len = recv_len;
+	wc->port_num = 1;
+	wc->wc_flags = wc_flags;
+	wc->ex.invalidate_rkey = inv_key;
+
+	push_to_fifo(&rcq->cqes, &entry->list);
+	attempt_notify_cq(rcq);
+}
+
+/*
+ * Queue a receive completion for a UD receive on @dqp and try to notify
+ * its receive CQ. In addition to the RC fields the completion carries
+ * the sender's QP number and the network header type.
+ *
+ * Best effort: when the completion cannot be allocated nothing is
+ * queued and no error is reported.
+ */
+static void generate_ud_rq_cqe(const struct loopback_qp *sqp,
+			       struct loopback_qp *dqp,
+			       struct loopback_rqe *rqe,
+			       u32 recv_len,
+			       u8 network_hdr_type, int wc_flags,
+			       enum ib_wc_status rqe_status)
+{
+	struct loopback_cq *rcq = ib_to_loop_cq(dqp->ibqp.recv_cq);
+	struct loopback_cqe *entry;
+	struct ib_wc *wc;
+
+	entry = kzalloc(sizeof(*entry), dqp->cqe_alloc_flags);
+	if (!entry)
+		return;
+
+	wc = &entry->wc;
+	wc->qp = &dqp->ibqp;
+	wc->src_qp = sqp->ibqp.qp_num;
+	wc->wr_cqe = rqe->wr.wr_cqe;
+	wc->opcode = IB_WC_RECV;
+	wc->status = rqe_status;
+	wc->byte_len = recv_len;
+	wc->port_num = 1;
+	wc->network_hdr_type = network_hdr_type;
+	wc->wc_flags = wc_flags;
+
+	push_to_fifo(&rcq->cqes, &entry->list);
+	attempt_notify_cq(rcq);
+}
+
+/* Only GSI and RC queue pairs can be created on this device. */
+static bool is_qp_supported(enum ib_qp_type type)
+{
+	switch (type) {
+	case IB_QPT_GSI:
+	case IB_QPT_RC:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * loopback_create_qp - create a GSI or RC queue pair.
+ *
+ * A GSI QP is registered under the fixed id 1; any other QP gets the
+ * next free id. The id becomes the QP number. QPs created with @udata
+ * allocate completions and receive entries with GFP_KERNEL, kernel QPs
+ * with GFP_ATOMIC.
+ *
+ * Returns the embedded ib_qp, ERR_PTR(-EINVAL) for an unsupported QP
+ * type, ERR_PTR(-ENOMEM), or the error from the table attach.
+ */
+static struct ib_qp *
+loopback_create_qp(struct ib_pd *ibpd, struct ib_qp_init_attr *attr,
+		   struct ib_udata *udata)
+{
+	struct rdma_loopdev *ld = ib_to_loopdev(ibpd->device);
+	const gfp_t gfp = udata ? GFP_KERNEL : GFP_ATOMIC;
+	struct loopback_qp *qp;
+	int err;
+
+	if (!is_qp_supported(attr->qp_type))
+		return ERR_PTR(-EINVAL);
+
+	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+	if (!qp)
+		return ERR_PTR(-ENOMEM);
+
+	qp->init_attr = *attr;
+	/* Since we generate cqes under rcu read lock,
+	 * cqe allocations are atomic.
+	 */
+	qp->cqe_alloc_flags = gfp;
+	qp->rqe_alloc_flags = gfp;
+	qp->user_qp = udata ? true : false;
+	init_fifo(&qp->rqes);
+
+	err = (attr->qp_type == IB_QPT_GSI) ?
+		attach_table_id_for_id(&ld->qp_tbl, &qp->res, 1) :
+		attach_table_id(&ld->qp_tbl, &qp->res);
+	if (err) {
+		kfree(qp);
+		return ERR_PTR(err);
+	}
+
+	qp->ibqp.qp_num = qp->res.id;
+	return &qp->ibqp;
+}
+
+/*
+ * Empty the receive queue of @qp, freeing every posted receive entry.
+ * When @gen_cqe is set, each entry is first completed with
+ * IB_WC_WR_FLUSH_ERR and zero length.
+ */
+static void loopbak_flush_rq(struct loopback_qp *qp, bool gen_cqe)
+{
+	struct list_head *node;
+
+	while ((node = pop_from_fifo(&qp->rqes)) != NULL) {
+		struct loopback_rqe *rqe =
+			container_of(node, struct loopback_rqe, list);
+
+		if (gen_cqe)
+			generate_rc_rq_cqe(qp, rqe, 0, IB_WC_WR_FLUSH_ERR,
+					   0, 0);
+		kfree(rqe);
+	}
+}
+
+/*
+ * loopback_modify_qp - apply the attributes selected by @mask.
+ *
+ * Only the access flags, the destination QP number and the state are
+ * acted upon. A state change publishes the new state, drops the table
+ * reference and waits on the resource completion, flushes the receive
+ * queue with flush-error completions when the new state is ERR or RESET,
+ * and finally sets the reference count back to 1. Always returns 0.
+ */
+static int loopback_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int mask, struct ib_udata *udata)
+{
+ struct loopback_qp *qp = ib_to_loop_qp(ibqp);
+
+ if (mask & IB_QP_ACCESS_FLAGS)
+ qp->attr.qp_access_flags = attr->qp_access_flags;
+ if (mask & IB_QP_DEST_QPN)
+ qp->attr.dest_qp_num = attr->dest_qp_num;
+ if (mask & IB_QP_STATE) {
+ WRITE_ONCE(qp->state, attr->qp_state);
+
+ put_table_entry(&qp->res);
+ /* Wait for all datapath operations to stop */
+ /* NOTE(review): the completion is not re-initialised in this
+ * function; whether a second state change waits correctly depends
+ * on how put_table_entry() signals it - confirm in the helper.
+ */
+ wait_for_completion(&qp->res.completion);
+
+ if (attr->qp_state == IB_QPS_ERR ||
+ attr->qp_state == IB_QPS_RESET) {
+ loopbak_flush_rq(qp, true);
+ }
+ /* Reinit the refcount so that new data path ops can start
+ * after a new state. There is extremely rare corner case
+ * where RTR->RTS transition time, sender is sending the
+ * data, which this driver doesn't support currently.
+ */
+ refcount_set(&qp->res.refcount, 1);
+ }
+ return 0;
+}
+
+/*
+ * loopback_query_qp - report the attributes of a QP.
+ *
+ * Copies out the attributes recorded by loopback_modify_qp() and the
+ * attributes given at creation. The QP state lives in qp->state rather
+ * than in qp->attr, so it is filled in explicitly; otherwise the caller
+ * would always see the zero-initialised state. Always returns 0.
+ */
+static int loopback_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+			     int mask, struct ib_qp_init_attr *init_attr)
+{
+	struct loopback_qp *qp = ib_to_loop_qp(ibqp);
+
+	*attr = qp->attr;
+	attr->qp_state = READ_ONCE(qp->state);
+	attr->cur_qp_state = attr->qp_state;
+	*init_attr = qp->init_attr;
+	return 0;
+}
+
+/*
+ * loopback_destroy_qp - unregister and free a QP.
+ *
+ * The QP is removed from the QP table, its remaining receive entries
+ * are freed without generating completions, and the QP is released.
+ * Always returns 0.
+ */
+static int loopback_destroy_qp(struct ib_qp *ibqp)
+{
+	struct loopback_qp *qp = ib_to_loop_qp(ibqp);
+	struct rdma_loopdev *ld = ib_to_loopdev(ibqp->device);
+
+	detach_table_id(&ld->qp_tbl, &qp->res);
+
+	/* Silent flush: nobody is left to poll flush completions. */
+	loopbak_flush_rq(qp, false);
+
+	kfree(qp);
+	return 0;
+}
+
+/*
+ * Map a send work request opcode to the opcode reported in its work
+ * completion. The values returned are enum ib_wc_opcode constants, so
+ * the function is typed accordingly rather than as enum ib_wc_status.
+ * Opcodes without a dedicated mapping are reported as IB_WC_SEND.
+ */
+static enum ib_wc_opcode sq_opcode_to_wc_opcode(enum ib_wr_opcode opcode)
+{
+	switch (opcode) {
+	case IB_WR_RDMA_WRITE:
+	case IB_WR_RDMA_WRITE_WITH_IMM:
+		return IB_WC_RDMA_WRITE;
+	case IB_WR_SEND:
+	case IB_WR_SEND_WITH_IMM:
+	case IB_WR_SEND_WITH_INV:
+		return IB_WC_SEND;
+	case IB_WR_RDMA_READ:
+		return IB_WC_RDMA_READ;
+	case IB_WR_ATOMIC_CMP_AND_SWP:
+		return IB_WC_COMP_SWAP;
+	case IB_WR_ATOMIC_FETCH_AND_ADD:
+		return IB_WC_FETCH_ADD;
+	case IB_WR_LSO:
+		return IB_WC_LSO;
+	case IB_WR_LOCAL_INV:
+		return IB_WC_LOCAL_INV;
+	case IB_WR_REG_MR:
+	case IB_WR_REG_SIG_MR:
+		return IB_WC_REG_MR;
+	case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+		return IB_WC_MASKED_COMP_SWAP;
+	case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
+		return IB_WC_MASKED_FETCH_ADD;
+	default:
+		/* TODO: no better default value */
+		return IB_WC_SEND;
+	}
+}
+
+/* Sum of the lengths of the first @num_sges entries of @sg_list. */
+static u64 get_sges_len(const struct ib_sge *sg_list, int num_sges)
+{
+	const struct ib_sge *sge = sg_list;
+	u64 total = 0;
+
+	while (num_sges-- > 0) {
+		total += sge->length;
+		sge++;
+	}
+	return total;
+}
+
+/*
+ * Payload length of a send work request: the sum of its SGE lengths for
+ * send, RDMA read and RDMA write opcodes, and 0 for every other opcode.
+ */
+static u64 get_send_wqe_len(const struct ib_send_wr *wr)
+{
+	switch (wr->opcode) {
+	case IB_WR_SEND:
+	case IB_WR_SEND_WITH_INV:
+	case IB_WR_SEND_WITH_IMM:
+	case IB_WR_RDMA_READ:
+	case IB_WR_RDMA_WRITE:
+		return get_sges_len(wr->sg_list, wr->num_sge);
+	default:
+		return 0;
+	}
+}
+
+/* Total buffer space described by the SGEs of a receive entry. */
+static u64 get_rqe_len(struct loopback_rqe *rqe)
+{
+	const int nsge = rqe->wr.num_sge;
+
+	return get_sges_len(rqe->sges, nsge);
+}
+
+/* Drop a reference on @mr; NULL and ERR_PTR values are ignored. */
+static void put_mr(struct loopback_mr *mr)
+{
+	if (IS_ERR_OR_NULL(mr))
+		return;
+
+	put_table_entry(&mr->res);
+}
+
+/*
+ * Look up the MR registered under @key in the MR table.
+ * Returns the MR, or ERR_PTR(-EINVAL) when the lookup finds nothing.
+ */
+static struct loopback_mr *get_mr_for_key(struct rdma_loopdev *ld, u32 key)
+{
+	struct loopback_resource *res = get_table_entry_by_id(&ld->mr_tbl, key);
+
+	if (res)
+		return container_of(res, struct loopback_mr, res);
+
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * validate_mr_access - check that @qp may access [@addr, @addr + @len)
+ * of @mr with opcode @opc.
+ *
+ * Rejected with -EACCES:
+ *  - a length larger than the MR;
+ *  - a user QP using anything but a user MR;
+ *  - an RDMA or atomic opcode when the QP access flags and the MR
+ *    access flags have no bit in common;
+ *  - for USER and FRMR MRs, a range that is not fully inside
+ *    [iova, iova + length].
+ *
+ * The range test works on the offset from iova so that neither
+ * "addr + len" nor "iova + length" is ever computed; both sums can wrap
+ * for values supplied by the consumer.
+ *
+ * Returns 0 when the access is allowed.
+ */
+static int validate_mr_access(const struct loopback_mr *mr,
+			      const struct loopback_qp *qp,
+			      enum ib_wr_opcode opc, u64 addr, u32 len)
+{
+	u64 offset;
+
+	if (len > mr->ibmr.length ||
+	    (qp->user_qp && mr->type != LOOPBACK_MR_TYPE_USER))
+		return -EACCES;
+
+	if ((opc == IB_WR_RDMA_WRITE ||
+	     opc == IB_WR_RDMA_WRITE_WITH_IMM ||
+	     opc == IB_WR_RDMA_READ ||
+	     opc == IB_WR_ATOMIC_CMP_AND_SWP ||
+	     opc == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+	    ((qp->attr.qp_access_flags & mr->access) == 0))
+		return -EACCES;
+
+	/* MR length and iova checks are applicable to FRMR and user type */
+	if (mr->type == LOOPBACK_MR_TYPE_PHY ||
+	    mr->type == LOOPBACK_MR_TYPE_DMA)
+		return 0;
+
+	/* address within range check, written to be wrap-around safe */
+	if (addr < mr->ibmr.iova)
+		return -EACCES;
+	offset = addr - mr->ibmr.iova;
+	if (offset > mr->ibmr.length || len > mr->ibmr.length - offset)
+		return -EACCES;
+	return 0;
+}
+
+/*
+ * Look up the MR named by the lkey of @sge and validate that @qp may
+ * access the SGE's address range with opcode @opc.
+ * Returns the MR with a reference held, or an ERR_PTR; on a validation
+ * failure the reference taken by the lookup is dropped again.
+ */
+static struct loopback_mr *
+get_mr_for_wr_sge(struct rdma_loopdev *ld, const struct loopback_qp *qp,
+		  enum ib_wr_opcode opc, const struct ib_sge *sge)
+{
+	struct loopback_mr *mr = get_mr_for_key(ld, sge->lkey);
+	int err;
+
+	if (IS_ERR(mr))
+		return mr;
+
+	err = validate_mr_access(mr, qp, opc, sge->addr, sge->length);
+	if (!err)
+		return mr;
+
+	put_mr(mr);
+	return ERR_PTR(err);
+}
+
+/*
+ * Look up the MR named by @mkey and validate that @qp may access
+ * @len bytes at @addr with opcode @opc.
+ * Returns the MR with a reference held, or an ERR_PTR; on a validation
+ * failure the reference taken by the lookup is dropped again.
+ */
+static struct loopback_mr *
+get_mr_for_rkey(struct rdma_loopdev *ld, struct loopback_qp *qp,
+		enum ib_wr_opcode opc, u64 addr, u32 len, u32 mkey)
+{
+	struct loopback_mr *mr = get_mr_for_key(ld, mkey);
+	int err;
+
+	if (IS_ERR(mr))
+		return mr;
+
+	err = validate_mr_access(mr, qp, opc, addr, len);
+	if (!err)
+		return mr;
+
+	put_mr(mr);
+	return ERR_PTR(err);
+}
+
+/*
+ * copy_data_wqe_to_rqe - copy the payload of a send work request into
+ * the buffers of a receive entry.
+ * @ld:       device.
+ * @sqp:      sending QP; owns the SGEs of @wr.
+ * @dqp:      receiving QP; owns the SGEs of @rqe.
+ * @wr:       send work request.
+ * @rqe:      receive entry popped from @dqp.
+ * @send_len: number of bytes to copy.
+ * @dst_fbo:  number of bytes to skip at the start of the first receive
+ *            SGE (used to leave room for a header).
+ *
+ * Walks both SGE lists in parallel, copying the largest chunk that is
+ * contiguous on both sides. Source SGEs are validated against @sqp and
+ * destination SGEs against @dqp. The skipped @dst_fbo bytes are taken
+ * off the usable length of the first destination SGE so the copy cannot
+ * run past its end.
+ *
+ * Returns 0 on success, the MR lookup/validation error, or -EINVAL when
+ * the SGE lists are exhausted before @send_len bytes were copied, when
+ * @dst_fbo exceeds the first destination SGE, or when an address cannot
+ * be translated.
+ */
+static int
+copy_data_wqe_to_rqe(struct rdma_loopdev *ld, struct loopback_qp *sqp,
+		     struct loopback_qp *dqp,
+		     const struct ib_send_wr *wr, struct loopback_rqe *rqe,
+		     u32 send_len, u32 dst_fbo)
+{
+	struct loopback_mr *src_mr = NULL;
+	struct loopback_mr *dst_mr = NULL;
+	u32 src_sge_len = 0;
+	u32 dst_sge_len = 0;
+	u64 src_sge_va = 0;
+	u64 dst_sge_va = 0;
+	u32 ret_len = 0;
+	u32 cpy_len = 0;
+	void *src_addr;
+	void *dst_addr;
+	int s_idx = 0;
+	int d_idx = 0;
+	int ret = 0;
+
+	while (send_len) {
+		if (!src_mr) {
+			if (s_idx >= wr->num_sge) {
+				ret = -EINVAL;
+				goto err;
+			}
+			src_mr = get_mr_for_wr_sge(ld, sqp, wr->opcode,
+						   &wr->sg_list[s_idx]);
+			if (IS_ERR(src_mr)) {
+				ret = PTR_ERR(src_mr);
+				goto err;
+			}
+			src_sge_len = wr->sg_list[s_idx].length;
+			src_sge_va = wr->sg_list[s_idx].addr;
+		}
+		if (!dst_mr) {
+			if (d_idx >= rqe->wr.num_sge ||
+			    dst_fbo > rqe->sges[d_idx].length) {
+				ret = -EINVAL;
+				goto err;
+			}
+			/* The receive buffer belongs to the destination QP. */
+			dst_mr = get_mr_for_wr_sge(ld, dqp, wr->opcode,
+						   &rqe->sges[d_idx]);
+			if (IS_ERR(dst_mr)) {
+				ret = PTR_ERR(dst_mr);
+				goto err;
+			}
+			dst_sge_len = rqe->sges[d_idx].length - dst_fbo;
+			dst_sge_va = rqe->sges[d_idx].addr + dst_fbo;
+			/* The offset applies to the first SGE only. */
+			dst_fbo = 0;
+		}
+
+		/* copy data of minimum length between src and dst sge */
+		cpy_len = min_t(u32, src_sge_len, dst_sge_len);
+
+		src_addr = get_mr_va(src_mr, src_sge_va, cpy_len, &ret_len);
+		cpy_len = min_t(u32, cpy_len, ret_len);
+
+		dst_addr = get_mr_va(dst_mr, dst_sge_va, cpy_len, &ret_len);
+		cpy_len = min_t(u32, cpy_len, ret_len);
+
+		if (!src_addr || !dst_addr) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		memcpy(dst_addr, src_addr, cpy_len);
+
+		src_sge_len -= cpy_len;
+		dst_sge_len -= cpy_len;
+		src_sge_va += cpy_len;
+		dst_sge_va += cpy_len;
+		send_len -= cpy_len;
+
+		if (!src_sge_len) {
+			s_idx++;
+			put_mr(src_mr);
+			src_mr = NULL;
+		}
+		if (!dst_sge_len) {
+			d_idx++;
+			put_mr(dst_mr);
+			dst_mr = NULL;
+		}
+	}
+err:
+	/* put_mr() ignores NULL and ERR_PTR values. */
+	put_mr(dst_mr);
+	put_mr(src_mr);
+	return ret;
+}
+
+/*
+ * copy_data_wqe_rkey - perform the data movement of an RDMA read or
+ * write work request.
+ *
+ * The remote side is the MR named by the rkey of @wr, validated against
+ * @dqp for the whole transfer length; the local side is the SGE list of
+ * @wr, each entry validated against @sqp. For IB_WR_RDMA_READ data is
+ * copied from the remote MR into the local SGEs, for every other opcode
+ * from the local SGEs into the remote MR.
+ *
+ * Returns 0 on success or the MR lookup/validation error.
+ */
+static int copy_data_wqe_rkey(struct rdma_loopdev *ld, struct loopback_qp *sqp,
+ struct loopback_qp *dqp,
+ const struct ib_send_wr *wr)
+{
+ const struct ib_rdma_wr *rdmawr = rdma_wr(wr);
+ u64 rdma_len = get_send_wqe_len(wr);
+ struct loopback_mr *wr_mr = NULL;
+ struct loopback_mr *rkey_mr;
+ u32 wr_sge_len = 0;
+ u64 wr_sge_va = 0;
+ void *wr_sge_addr;
+ u64 rkey_va = 0;
+ u32 ret_len = 0;
+ u32 cpy_len = 0;
+ void *rkey_addr;
+ int sge_idx = 0;
+ int ret = 0;
+
+ rkey_va = rdmawr->remote_addr;
+ /* NOTE(review): rdma_len is u64 but the length parameter of
+ * get_mr_for_rkey() is u32, so a total above 4 GiB is truncated for
+ * the validation.
+ */
+ rkey_mr = get_mr_for_rkey(ld, dqp, wr->opcode, rkey_va,
+ rdma_len, rdmawr->rkey);
+ if (IS_ERR(rkey_mr))
+ return PTR_ERR(rkey_mr);
+
+ while (rdma_len) {
+ /* Acquire the MR of the next local SGE when the previous one
+ * has been consumed.
+ */
+ if (!wr_mr) {
+ wr_mr = get_mr_for_wr_sge(ld, sqp, wr->opcode,
+ &wr->sg_list[sge_idx]);
+ if (IS_ERR(wr_mr)) {
+ ret = PTR_ERR(wr_mr);
+ goto err;
+ }
+ wr_sge_len = wr->sg_list[sge_idx].length;
+ wr_sge_va = wr->sg_list[sge_idx].addr;
+ }
+
+ cpy_len = wr_sge_len;
+
+ /* Shrink the chunk to what is contiguous on both sides. */
+ wr_sge_addr = get_mr_va(wr_mr, wr_sge_va, cpy_len, &ret_len);
+ cpy_len = min_t(u32, cpy_len, ret_len);
+
+ rkey_addr = get_mr_va(rkey_mr, rkey_va, cpy_len, &ret_len);
+ cpy_len = min_t(u32, cpy_len, ret_len);
+
+ if (wr->opcode == IB_WR_RDMA_READ) {
+ /* rdma read => read from remote rkey to local sges */
+ memcpy(wr_sge_addr, rkey_addr, cpy_len);
+ } else {
+ /* rdma write => local sges to remote rkey */
+ memcpy(rkey_addr, wr_sge_addr, cpy_len);
+ }
+
+ wr_sge_len -= cpy_len;
+ wr_sge_va += cpy_len;
+ rkey_va += cpy_len;
+ rdma_len -= cpy_len;
+
+ if (!wr_sge_len) {
+ sge_idx++;
+ put_mr(wr_mr);
+ wr_mr = NULL;
+ }
+ }
+err:
+ /* put_mr() ignores NULL and ERR_PTR values. */
+ put_mr(wr_mr);
+ put_mr(rkey_mr);
+ return ret;
+}
+
+/*
+ * write_ud_grh_hdr - write the network header that precedes the payload
+ * of a UD receive.
+ *
+ * The header is written at the start of the first SGE of @rqe, which
+ * must be at least sizeof(union rdma_network_hdr) long and must provide
+ * that many contiguous bytes. Source and destination are both set to
+ * 127.0.0.1 with a TTL of 1.
+ *
+ * Returns 0 on success, -EINVAL when the first SGE is too short or not
+ * contiguous enough, or the MR lookup/validation error.
+ */
+static int
+write_ud_grh_hdr(struct rdma_loopdev *ld, struct loopback_qp *dqp,
+ const struct ib_send_wr *wr, struct loopback_rqe *rqe)
+{
+ union rdma_network_hdr hdr = {};
+ struct loopback_mr *dst_mr;
+ u32 dst_sge_len;
+ void *dst_addr;
+ u64 dst_sge_va;
+ u32 ret_len = 0;
+ int ret = 0;
+
+ /* Even though spec allows to split first 40 bytes header in 40 sges,
+ * there isn't good usecase, so this expect minimum 40 bytes sge.
+ */
+ dst_sge_va = rqe->sges[0].addr;
+ dst_sge_len = rqe->sges[0].length;
+ if (dst_sge_len < sizeof(hdr))
+ return -EINVAL;
+
+ dst_mr = get_mr_for_wr_sge(ld, dqp, wr->opcode, &rqe->sges[0]);
+ if (IS_ERR(dst_mr))
+ return PTR_ERR(dst_mr);
+ dst_addr = get_mr_va(dst_mr, dst_sge_va, sizeof(hdr), &ret_len);
+ /* The header must not straddle a page boundary of the MR. */
+ if (ret_len != sizeof(hdr)) {
+ ret = -EINVAL;
+ goto done;
+ }
+ /* TODO: consider ipv6 */
+ hdr.roce4grh.saddr = htonl(0x7f000001);
+ hdr.roce4grh.daddr = htonl(0x7f000001);
+ hdr.roce4grh.ttl = 1;
+ memcpy(dst_addr, &hdr, sizeof(hdr));
+done:
+ put_mr(dst_mr);
+ return ret;
+}
+
+/*
+ * post_one_ud_send_wqe - deliver one UD send to @dqp.
+ *
+ * Takes the next receive entry of @dqp. Without one, the send completes
+ * with IB_WC_GENERAL_ERR and nothing else happens. Otherwise the network
+ * header and the payload are written into the receive buffers, provided
+ * they fit; the send and the receive complete with the same status,
+ * IB_WC_SUCCESS or IB_WC_GENERAL_ERR. The receive entry is consumed in
+ * either case.
+ */
+static void
+post_one_ud_send_wqe(struct rdma_loopdev *ld, struct loopback_qp *qp,
+		     struct loopback_qp *dqp,
+		     const struct ib_send_wr *wr,
+		     struct loopback_cqe *wq_cqe)
+{
+	enum ib_wc_status status = IB_WC_GENERAL_ERR;
+	u32 send_len = get_send_wqe_len(wr);
+	struct loopback_rqe *rqe;
+	struct list_head *node;
+	u32 recv_len;
+
+	node = pop_from_fifo(&dqp->rqes);
+	if (!node) {
+		wq_cqe->wc.status = IB_WC_GENERAL_ERR;
+		return;
+	}
+	rqe = container_of(node, struct loopback_rqe, list);
+	recv_len = get_rqe_len(rqe);
+
+	if (send_len + sizeof(union rdma_network_hdr) > recv_len) {
+		/* Receive buffer too small: nothing is delivered. */
+		recv_len = 0;
+	} else {
+		/* copy minimum data + grh of what is sent and rqe size */
+		recv_len = min_t(u32, send_len +
+				 sizeof(union rdma_network_hdr), recv_len);
+
+		if (!write_ud_grh_hdr(ld, dqp, wr, rqe) &&
+		    !copy_data_wqe_to_rqe(ld, qp, dqp, wr, rqe, send_len,
+					  sizeof(union rdma_network_hdr)))
+			status = IB_WC_SUCCESS;
+	}
+
+	wq_cqe->wc.status = status;
+	generate_ud_rq_cqe(qp, dqp, rqe, recv_len, RDMA_NETWORK_IPV4,
+			   IB_WC_WITH_NETWORK_HDR_TYPE |
+			   IB_WC_GRH, status);
+	kfree(rqe);
+}
+
+/*
+ * Look up the QP registered under number @qpn in the QP table.
+ * Returns the QP, or ERR_PTR(-EINVAL) when the lookup finds nothing.
+ */
+static struct loopback_qp *
+get_qp_by_qpn(struct rdma_loopdev *ld, u32 qpn)
+{
+	struct loopback_resource *res = get_table_entry_by_id(&ld->qp_tbl, qpn);
+
+	if (res)
+		return container_of(res, struct loopback_qp, res);
+
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Take a reference on @qp unless its reference count already dropped to
+ * zero. Returns @qp on success and ERR_PTR(-EINVAL) otherwise.
+ */
+static struct loopback_qp *get_qp(struct loopback_qp *qp)
+{
+	if (!refcount_inc_not_zero(&qp->res.refcount))
+		return ERR_PTR(-EINVAL);
+
+	return qp;
+}
+
+/* Drop a reference on @qp; NULL and ERR_PTR values are ignored. */
+static void put_qp(struct loopback_qp *qp)
+{
+	if (IS_ERR_OR_NULL(qp))
+		return;
+
+	put_table_entry(&qp->res);
+}
+
+/*
+ * post_one_ud_wqe - process one work request posted on a UD-style QP.
+ *
+ * The destination QP is taken from the work request. Only IB_WR_SEND is
+ * supported; any other opcode completes with IB_WC_GENERAL_ERR. When the
+ * destination QP does not exist the completion status is left as the
+ * caller initialised it. The source QP number is always recorded.
+ */
+static void
+post_one_ud_wqe(struct rdma_loopdev *ld, struct loopback_qp *qp,
+		const struct ib_send_wr *wr,
+		struct loopback_cqe *wq_cqe)
+{
+	struct loopback_qp *dqp = get_qp_by_qpn(ld, ud_wr(wr)->remote_qpn);
+
+	if (!IS_ERR(dqp)) {
+		if (wr->opcode == IB_WR_SEND)
+			post_one_ud_send_wqe(ld, qp, dqp, wr, wq_cqe);
+		else
+			wq_cqe->wc.status = IB_WC_GENERAL_ERR;
+	}
+
+	wq_cqe->wc.src_qp = qp->ibqp.qp_num;
+	/* put_qp() ignores the ERR_PTR of a failed lookup. */
+	put_qp(dqp);
+}
+
+/*
+ * invalidate_rkey - invalidate the MR registered under @inv_key.
+ *
+ * The MR is looked up, which takes a reference. Invalidation is refused
+ * with -EINVAL while anyone other than the table and this lookup holds a
+ * reference; otherwise the VALID mark of the table entry is cleared.
+ * The lookup reference is dropped on every path.
+ *
+ * Returns 0 on success, -EINVAL when the MR is in use, or the lookup
+ * error.
+ */
+static int invalidate_rkey(struct rdma_loopdev *ld, u32 inv_key)
+{
+	struct loopback_mr *mr;
+	int ret = 0;
+
+	mr = get_mr_for_key(ld, inv_key);
+	if (IS_ERR(mr))
+		return PTR_ERR(mr);
+	/* Fail invalidation if there are active users for now, as this is
+	 * extremely rare scenario and not well known use case.
+	 */
+	if (refcount_read(&mr->res.refcount) > 2) {
+		ret = -EINVAL;
+		goto out;
+	}
+	xa_clear_mark(&ld->mr_tbl.ids, mr->res.id,
+		      LOOPBACK_RESOURCE_STATE_VALID);
+out:
+	put_mr(mr);
+	return ret;
+}
+
+/*
+ * Process a local-invalidate work request: invalidate the key named in
+ * @wr and complete with IB_WC_SUCCESS, or IB_WC_GENERAL_ERR on failure.
+ */
+static void
+process_one_rc_linv(struct rdma_loopdev *ld, struct loopback_qp *qp,
+		    const struct ib_send_wr *wr,
+		    struct loopback_cqe *wq_cqe)
+{
+	const int err = invalidate_rkey(ld, wr->ex.invalidate_rkey);
+
+	wq_cqe->wc.status = err ? IB_WC_GENERAL_ERR : IB_WC_SUCCESS;
+}
+
+/*
+ * process_one_rc_send_wqe - deliver one RC send (optionally with
+ * invalidate) to @dqp.
+ *
+ * Takes the next receive entry of @dqp; without one the send completes
+ * with IB_WC_RNR_RETRY_EXC_ERR. A payload larger than the receive
+ * buffers fails both sides without copying. Otherwise the payload is
+ * copied and, for IB_WR_SEND_WITH_INV, the named rkey is invalidated
+ * once the copy succeeded. The receive entry is consumed and a receive
+ * completion is generated whenever an entry was taken.
+ *
+ * This function does not touch RCU state; every exit path leaves
+ * reference dropping to the caller.
+ */
+static void
+process_one_rc_send_wqe(struct rdma_loopdev *ld, struct loopback_qp *qp,
+			struct loopback_qp *dqp,
+			const struct ib_send_wr *wr,
+			struct loopback_cqe *wq_cqe)
+{
+	enum ib_wc_status recv_status;
+	struct loopback_rqe *rqe;
+	struct list_head *entry;
+	u32 rqe_wc_flags = 0;
+	u32 inv_key = 0;
+	u32 send_len;
+	u32 recv_len;
+	int ret = 0;
+
+	entry = pop_from_fifo(&dqp->rqes);
+	if (!entry) {
+		wq_cqe->wc.status = IB_WC_RNR_RETRY_EXC_ERR;
+		return;
+	}
+	rqe = container_of(entry, struct loopback_rqe, list);
+	send_len = get_send_wqe_len(wr);
+	recv_len = get_rqe_len(rqe);
+	if (send_len > recv_len) {
+		/* Mark the failure so the invalidate step is skipped. */
+		ret = -EINVAL;
+		recv_len = 0;
+		recv_status = IB_WC_GENERAL_ERR;
+		wq_cqe->wc.status = IB_WC_REM_INV_REQ_ERR;
+	} else {
+		/* copy minimum data of what is sent and rqe size */
+		recv_len = min_t(u32, send_len, recv_len);
+		ret = copy_data_wqe_to_rqe(ld, qp, dqp, wr,
+					   rqe, send_len, 0);
+		if (ret) {
+			recv_status = IB_WC_LOC_LEN_ERR;
+			wq_cqe->wc.status = IB_WC_REM_INV_REQ_ERR;
+		} else {
+			recv_status = IB_WC_SUCCESS;
+			wq_cqe->wc.status = IB_WC_SUCCESS;
+		}
+	}
+	if (!ret && wr->opcode == IB_WR_SEND_WITH_INV) {
+		ret = invalidate_rkey(ld, wr->ex.invalidate_rkey);
+		if (ret) {
+			recv_status = IB_WC_LOC_LEN_ERR;
+			wq_cqe->wc.status = IB_WC_REM_INV_REQ_ERR;
+		} else {
+			inv_key = wr->ex.invalidate_rkey;
+			rqe_wc_flags = IB_WC_WITH_INVALIDATE;
+			recv_status = IB_WC_SUCCESS;
+			wq_cqe->wc.status = IB_WC_SUCCESS;
+		}
+	}
+
+	generate_rc_rq_cqe(dqp, rqe, recv_len, recv_status,
+			   inv_key, rqe_wc_flags);
+	kfree(rqe);
+}
+
+ /*
+  * Execute an RDMA READ / RDMA WRITE work request.
+  *
+  * Delegates the transfer to copy_data_wqe_rkey() and maps its result onto
+  * the send completion in @wq_cqe: IB_WC_SUCCESS on a zero return,
+  * IB_WC_REM_ACCESS_ERR on any non-zero return.
+  */
+ static void
+ process_one_rc_rw_wqe(struct rdma_loopdev *ld, struct loopback_qp *qp,
+ 		struct loopback_qp *dqp,
+ 		const struct ib_send_wr *wr,
+ 		struct loopback_cqe *wq_cqe)
+ {
+ 	int err = copy_data_wqe_rkey(ld, qp, dqp, wr);
+
+ 	wq_cqe->wc.status = err ? IB_WC_REM_ACCESS_ERR : IB_WC_SUCCESS;
+ }
+
+ /*
+  * Dispatch one RC work request to the handler for its opcode.
+  *
+  * SEND and SEND_WITH_INV go to the send handler, RDMA READ/WRITE to the
+  * read/write handler, LOCAL_INV to the local-invalidate handler. Any other
+  * opcode completes with IB_WC_GENERAL_ERR.
+  */
+ static void
+ process_one_rc_wqe(struct rdma_loopdev *ld, struct loopback_qp *sqp,
+ 		struct loopback_qp *dqp,
+ 		const struct ib_send_wr *wr,
+ 		struct loopback_cqe *wq_cqe)
+ {
+ 	enum ib_wr_opcode op = wr->opcode;
+
+ 	if (op == IB_WR_SEND || op == IB_WR_SEND_WITH_INV)
+ 		process_one_rc_send_wqe(ld, sqp, dqp, wr, wq_cqe);
+ 	else if (op == IB_WR_RDMA_READ || op == IB_WR_RDMA_WRITE)
+ 		process_one_rc_rw_wqe(ld, sqp, dqp, wr, wq_cqe);
+ 	else if (op == IB_WR_LOCAL_INV)
+ 		process_one_rc_linv(ld, sqp, wr, wq_cqe);
+ 	else
+ 		wq_cqe->wc.status = IB_WC_GENERAL_ERR;
+ }
+
+ /*
+  * Execute one RC work request posted on @sqp.
+  *
+  * Looks up the destination QP from @sqp's dest_qp_num. If the lookup fails
+  * the send completion gets IB_WC_RETRY_EXC_ERR; otherwise the request is
+  * processed against the destination and the reference taken by the lookup
+  * is dropped. In both cases the completion's src_qp is set to @sqp's QP
+  * number.
+  */
+ static void
+ post_one_rc_wqe(struct rdma_loopdev *ld, struct loopback_qp *sqp,
+ 		const struct ib_send_wr *wr, struct loopback_cqe *wq_cqe)
+ {
+ 	struct loopback_qp *dqp;
+
+ 	dqp = get_qp_by_qpn(ld, sqp->attr.dest_qp_num);
+ 	if (IS_ERR(dqp)) {
+ 		/* Nothing was acquired, so there is no reference to put */
+ 		wq_cqe->wc.status = IB_WC_RETRY_EXC_ERR;
+ 	} else {
+ 		process_one_rc_wqe(ld, sqp, dqp, wr, wq_cqe);
+ 		put_qp(dqp);
+ 	}
+
+ 	wq_cqe->wc.src_qp = sqp->ibqp.qp_num;
+ }
+
+ /*
+  * Execute a single send work request on @qp and queue its completion.
+  *
+  * A reference on the QP is held for the duration of the call. If the QP is
+  * not in RTS the request completes with IB_WC_WR_FLUSH_ERR; otherwise it is
+  * handed to the GSI or RC handler according to the QP type (other QP types
+  * fall through with the zero-initialised completion).
+  *
+  * The completion is pushed to the send CQ when the request is signaled, the
+  * QP signals all work requests, or the status is not IB_WC_SUCCESS; in any
+  * other case it is freed without being reported.
+  *
+  * Returns 0 on success, -EINVAL if the QP reference cannot be taken, or
+  * -ENOMEM if the completion entry cannot be allocated.
+  */
+ static int post_one_send(struct loopback_qp *qp, const struct ib_send_wr *wr)
+ {
+ 	struct rdma_loopdev *ld = ib_to_loopdev(qp->ibqp.device);
+ 	struct loopback_cq *send_cq = ib_to_loop_cq(qp->ibqp.send_cq);
+ 	struct loopback_cqe *wc_entry;
+ 	struct loopback_qp *held;
+ 	bool report;
+ 	int rc = 0;
+
+ 	held = get_qp(qp);
+ 	if (IS_ERR(held))
+ 		return -EINVAL;
+
+ 	wc_entry = kzalloc(sizeof(*wc_entry), qp->cqe_alloc_flags);
+ 	if (!wc_entry) {
+ 		rc = -ENOMEM;
+ 		goto out_put;
+ 	}
+
+ 	if (qp->state != IB_QPS_RTS) {
+ 		wc_entry->wc.status = IB_WC_WR_FLUSH_ERR;
+ 	} else {
+ 		switch (qp->ibqp.qp_type) {
+ 		case IB_QPT_GSI:
+ 			post_one_ud_wqe(ld, qp, wr, wc_entry);
+ 			break;
+ 		case IB_QPT_RC:
+ 			post_one_rc_wqe(ld, qp, wr, wc_entry);
+ 			break;
+ 		default:
+ 			break;
+ 		}
+ 	}
+
+ 	wc_entry->wc.opcode = sq_opcode_to_wc_opcode(wr->opcode);
+ 	wc_entry->wc.wr_cqe = wr->wr_cqe;
+ 	wc_entry->wc.qp = &held->ibqp;
+ 	wc_entry->wc.port_num = 1;
+
+ 	/* Failed requests are always reported, even when unsignaled */
+ 	report = (wr->send_flags & IB_SEND_SIGNALED) ||
+ 		qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR ||
+ 		wc_entry->wc.status != IB_WC_SUCCESS;
+ 	if (report) {
+ 		push_to_fifo(&send_cq->cqes, &wc_entry->list);
+ 		attempt_notify_cq(send_cq);
+ 	} else {
+ 		kfree(wc_entry);
+ 	}
+
+ out_put:
+ 	put_qp(held);
+ 	return rc;
+ }
+
+ /*
+  * post_send verb: walk the chain of send work requests starting at @wr and
+  * execute each one in order.
+  *
+  * Stops at the first request that fails, stores it in @bad_wr and returns
+  * its error code. Returns 0 when every request was posted (or the chain is
+  * empty), in which case @bad_wr is left untouched.
+  */
+ static int loopback_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
+ 		const struct ib_send_wr **bad_wr)
+ {
+ 	struct loopback_qp *qp = ib_to_loop_qp(ibqp);
+ 	int rc = 0;
+
+ 	for (; wr; wr = wr->next) {
+ 		rc = post_one_send(qp, wr);
+ 		if (unlikely(rc)) {
+ 			*bad_wr = wr;
+ 			break;
+ 		}
+ 	}
+ 	return rc;
+ }
+
+ /*
+  * Post a single receive work request on @qp.
+  *
+  * Allocates a receive entry sized for @wr's scatter list, copies the work
+  * request and its SGEs into it, and queues it on the QP's receive fifo when
+  * the QP is in INIT, RTR or RTS. In any other state the entry is completed
+  * immediately with IB_WC_WR_FLUSH_ERR and freed; the call still returns 0.
+  *
+  * Returns 0 on success, -EINVAL if the QP reference cannot be taken, or
+  * -ENOMEM if the receive entry cannot be allocated.
+  *
+  * NOTE(review): the struct copy of *wr also copies wr->sg_list and
+  * wr->next, which point at caller-owned memory - confirm nothing
+  * dereferences them through the stored entry later.
+  */
+ static int post_one_recv(struct loopback_qp *qp, const struct ib_recv_wr *wr)
+ {
+ 	struct loopback_qp *held;
+ 	struct loopback_rqe *rqe;
+ 	int rc = 0;
+
+ 	held = get_qp(qp);
+ 	if (IS_ERR(held))
+ 		return -EINVAL;
+
+ 	rqe = kzalloc(struct_size(rqe, sges, wr->num_sge), qp->rqe_alloc_flags);
+ 	if (!rqe) {
+ 		rc = -ENOMEM;
+ 		goto out_put;
+ 	}
+
+ 	rqe->wr = *wr;
+ 	memcpy(&rqe->sges[0], wr->sg_list, wr->num_sge * sizeof(rqe->sges[0]));
+
+ 	switch (qp->state) {
+ 	case IB_QPS_INIT:
+ 	case IB_QPS_RTR:
+ 	case IB_QPS_RTS:
+ 		push_to_fifo(&qp->rqes, &rqe->list);
+ 		break;
+ 	default:
+ 		/* QP cannot receive: flush the request right away */
+ 		generate_rc_rq_cqe(qp, rqe, 0, IB_WC_WR_FLUSH_ERR, 0, 0);
+ 		kfree(rqe);
+ 		break;
+ 	}
+
+ out_put:
+ 	put_qp(held);
+ 	return rc;
+ }
+
+ /*
+  * post_recv verb: walk the chain of receive work requests starting at @wr
+  * and post each one in order.
+  *
+  * Stops at the first request that fails, stores it in @bad_wr and returns
+  * its error code. Returns 0 when every request was posted (or the chain is
+  * empty), in which case @bad_wr is left untouched.
+  */
+ static int loopback_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
+ 		const struct ib_recv_wr **bad_wr)
+ {
+ 	struct loopback_qp *qp = ib_to_loop_qp(ibqp);
+ 	int rc = 0;
+
+ 	for (; wr; wr = wr->next) {
+ 		rc = post_one_recv(qp, wr);
+ 		if (unlikely(rc)) {
+ 			*bad_wr = wr;
+ 			break;
+ 		}
+ 	}
+ 	return rc;
+ }
+
+ /*
+  * create_ah verb: allocate an address handle, store a copy of @attr in it
+  * and register it in the device's AH table.
+  *
+  * Returns the embedded ib_ah on success, ERR_PTR(-ENOMEM) if allocation
+  * fails, or ERR_PTR() of the attach_table_id() error. @flags and @udata are
+  * not used.
+  */
+ static struct ib_ah *
+ loopback_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr,
+ 		u32 flags, struct ib_udata *udata)
+ {
+ 	struct rdma_loopdev *ld = ib_to_loopdev(ibpd->device);
+ 	struct loopback_ah *ah;
+ 	int err;
+
+ 	ah = kzalloc(sizeof(*ah), GFP_KERNEL);
+ 	if (!ah)
+ 		return ERR_PTR(-ENOMEM);
+
+ 	ah->attr = *attr;
+ 	err = attach_table_id(&ld->ah_tbl, &ah->res);
+ 	if (!err)
+ 		return &ah->ibah;
+
+ 	kfree(ah);
+ 	return ERR_PTR(err);
+ }
+
+ /*
+  * query_ah verb: report the attributes stored in the address handle.
+  *
+  * @attr is cleared, filled from the copy saved in the handle, and its type
+  * is then overridden with the type recorded on @ibah. Always returns 0.
+  */
+ static int loopback_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr)
+ {
+ 	struct loopback_ah *lah = ib_to_loop_ah(ibah);
+
+ 	memset(attr, 0, sizeof(*attr));
+ 	*attr = lah->attr;
+ 	attr->type = ibah->type;
+
+ 	return 0;
+ }
+
+ /*
+  * destroy_ah verb: remove the address handle from the device's AH table
+  * and free it. @flags is not used. Always returns 0.
+  */
+ static int loopback_destroy_ah(struct ib_ah *ibah, u32 flags)
+ {
+ 	struct loopback_ah *lah = ib_to_loop_ah(ibah);
+ 	struct rdma_loopdev *ld = ib_to_loopdev(ibah->device);
+
+ 	detach_table_id(&ld->ah_tbl, &lah->res);
+ 	kfree(lah);
+
+ 	return 0;
+ }
+
+ /*
+  * Verbs entry points implemented by this driver. The table is installed on
+  * the device with ib_set_device_ops() in init_rdma_loopdev().
+  *
+  * The INIT_RDMA_OBJ_SIZE() entries name the driver structures
+  * (loopback_pd, loopback_uctx) that embed ib_pd and ib_ucontext.
+  */
+ static const struct ib_device_ops rdma_loopdev_ops = {
+ .alloc_pd = loopback_alloc_pd,
+ .alloc_ucontext = loopback_alloc_ucontext,
+ .create_ah = loopback_create_ah,
+ .create_cq = loopback_create_cq,
+ .create_qp = loopback_create_qp,
+ .dealloc_pd = loopback_dealloc_pd,
+ .dealloc_ucontext = loopback_dealloc_ucontext,
+ .dereg_mr = loopback_dereg_mr,
+ .destroy_ah = loopback_destroy_ah,
+ .destroy_cq = loopback_destroy_cq,
+ .destroy_qp = loopback_destroy_qp,
+ .get_dma_mr = loopback_get_dma_mr,
+ .get_link_layer = loopback_get_link_layer,
+ .get_netdev = loopback_get_netdev,
+ .get_port_immutable = loopback_port_immutable,
+ .modify_qp = loopback_modify_qp,
+ .poll_cq = loopback_poll_cq,
+ .post_recv = loopback_post_recv,
+ .post_send = loopback_post_send,
+ .query_ah = loopback_query_ah,
+ .query_device = loopback_query_device,
+ .query_pkey = loopback_query_pkey,
+ .query_port = loopback_query_port,
+ .query_qp = loopback_query_qp,
+ .reg_user_mr = loopback_reg_user_mr,
+ .req_notify_cq = loopback_req_notify_cq,
+ INIT_RDMA_OBJ_SIZE(ib_pd, loopback_pd, ibpd),
+ INIT_RDMA_OBJ_SIZE(ib_ucontext, loopback_uctx, ibuctx),
+ };
+
+ /*
+  * Populate the ib_device embedded in @ld before it is registered.
+  *
+  * Sets the node description, owner, node type, port count, completion
+  * vector count, parent device, DMA ops and mask, node GUID, port table
+  * sizes, the uverbs ABI version and command mask, installs the verbs ops
+  * table, sets the driver id and finally initialises the resource tables.
+  */
+ static void init_rdma_loopdev(struct rdma_loopdev *ld)
+ {
+ 	struct ib_device *ibdev = &ld->dev;
+
+ 	strlcpy(ibdev->node_desc, "lo", sizeof(ibdev->node_desc));
+
+ 	ibdev->owner = THIS_MODULE;
+ 	ibdev->node_type = RDMA_NODE_IB_CA;
+ 	ibdev->phys_port_cnt = 1;
+ 	ibdev->num_comp_vectors = num_possible_cpus();
+ 	ibdev->dev.parent = &lo->dev;
+ 	ibdev->local_dma_lkey = 0;
+ 	ibdev->dev.dma_ops = &dma_virt_ops;
+ 	ibdev->node_guid = 0x7f0001;
+ 	dma_coerce_mask_and_coherent(&ibdev->dev,
+ 			dma_get_required_mask(&ibdev->dev));
+
+ 	ld->port_attr.pkey_tbl_len = 1;
+ 	/* default, 127.0.0.1 and ::1 */
+ 	ld->port_attr.gid_tbl_len = 3;
+
+ 	ibdev->uverbs_abi_ver = 2;
+ 	ibdev->uverbs_cmd_mask =
+ 		BIT_ULL(IB_USER_VERBS_CMD_GET_CONTEXT) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_QUERY_PORT) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_ALLOC_PD) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_DEALLOC_PD) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_CREATE_QP) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_MODIFY_QP) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_QUERY_QP) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_DESTROY_QP) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_POST_RECV) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_CREATE_CQ) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_DESTROY_CQ) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_POLL_CQ) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_REG_MR) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_DEREG_MR) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_CREATE_AH) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_MODIFY_AH) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_QUERY_AH) |
+ 		BIT_ULL(IB_USER_VERBS_CMD_DESTROY_AH);
+
+ 	ib_set_device_ops(ibdev, &rdma_loopdev_ops);
+
+ 	ibdev->driver_id = RDMA_DRIVER_LOOPBACK;
+
+ 	init_loopdev_tables(ld);
+ }
+
+ /*
+  * Undo init_zero_lkey_mr(): remove the device's built-in zero_mr from the
+  * MR table.
+  */
+ static void cleanup_zero_lkey_mr(struct rdma_loopdev *ld)
+ {
+ detach_table_id(&ld->mr_tbl, &ld->zero_mr.res);
+ }
+
+ /*
+  * Set up the device's built-in zero_mr and register it in the MR table
+  * under id 0.
+  *
+  * The MR is typed LOOPBACK_MR_TYPE_PHY, has length ULONG_MAX and grants
+  * IB_ACCESS_LOCAL_WRITE. Returns the result of attach_table_id_for_id().
+  */
+ static int init_zero_lkey_mr(struct rdma_loopdev *ld)
+ {
+ 	int err;
+
+ 	ld->zero_mr.access = IB_ACCESS_LOCAL_WRITE;
+ 	ld->zero_mr.ibmr.length = ULONG_MAX;
+ 	ld->zero_mr.type = LOOPBACK_MR_TYPE_PHY;
+
+ 	err = attach_table_id_for_id(&ld->mr_tbl, &ld->zero_mr.res, 0);
+ 	return err;
+ }
+
+ /*
+  * Module entry point.
+  *
+  * Takes a reference on the "lo" netdevice of init_net, allocates and
+  * initialises the loopback ib_device, registers the zero-lkey MR and then
+  * registers the device under the name "lo". Each failure unwinds whatever
+  * was set up before it.
+  *
+  * Returns 0 on success, -ENODEV if "lo" is not found, -ENOMEM if the device
+  * cannot be allocated, or the error from the MR setup / device registration.
+  */
+ static int loopback_init(void)
+ {
+ 	int err;
+
+ 	lo = dev_get_by_name(&init_net, "lo");
+ 	if (!lo)
+ 		return -ENODEV;
+
+ 	loopdev = ib_alloc_device(rdma_loopdev, dev);
+ 	if (!loopdev) {
+ 		err = -ENOMEM;
+ 		goto out_put_netdev;
+ 	}
+ 	init_rdma_loopdev(loopdev);
+
+ 	err = init_zero_lkey_mr(loopdev);
+ 	if (err)
+ 		goto out_dealloc;
+
+ 	err = ib_register_device(&loopdev->dev, "lo");
+ 	if (!err)
+ 		return 0;
+
+ 	/* Registration failed: unwind in reverse order of setup */
+ 	cleanup_zero_lkey_mr(loopdev);
+ out_dealloc:
+ 	ib_dealloc_device(&loopdev->dev);
+ out_put_netdev:
+ 	dev_put(lo);
+ 	return err;
+ }
+
+ /*
+  * Module exit point: tear down everything loopback_init() set up, in the
+  * reverse order - unregister the device, remove the zero-lkey MR, free the
+  * device and drop the reference on the "lo" netdevice.
+  */
+ static void loopback_cleanup(void)
+ {
+ ib_unregister_device(&loopdev->dev);
+ cleanup_zero_lkey_mr(loopdev);
+ ib_dealloc_device(&loopdev->dev);
+ dev_put(lo);
+ }
+
+ /* Module load/unload hooks and license declaration. */
+ module_init(loopback_init);
+ module_exit(loopback_cleanup);
+ /*
+  * NOTE(review): the SPDX header of this file says
+  * "GPL-2.0 OR Linux-OpenIB" - confirm whether the module license string
+  * should advertise the dual license as well.
+  */
+ MODULE_LICENSE("GPL");
@@ -102,6 +102,7 @@ enum rdma_driver_id {
RDMA_DRIVER_RXE,
RDMA_DRIVER_HFI1,
RDMA_DRIVER_QIB,
+ RDMA_DRIVER_LOOPBACK,
};
#endif
This is the simplest rdma (RoCE) loopback driver. It implements an rdma device on top of the 'lo' netdevice. Since data doesn't leave the system, it doesn't emulate any transport, network or link layers. It implements a fully functional verbs layer, supported by a data copy engine. Signed-off-by: Parav Pandit <parav@mellanox.com> --- drivers/infiniband/Kconfig | 1 + drivers/infiniband/sw/Makefile | 1 + drivers/infiniband/sw/loopback/Kconfig | 14 + drivers/infiniband/sw/loopback/Makefile | 4 + drivers/infiniband/sw/loopback/loopback.c | 1603 +++++++++++++++++++++++++++++ include/uapi/rdma/rdma_user_ioctl_cmds.h | 1 + 6 files changed, 1624 insertions(+) create mode 100644 drivers/infiniband/sw/loopback/Kconfig create mode 100644 drivers/infiniband/sw/loopback/Makefile create mode 100644 drivers/infiniband/sw/loopback/loopback.c