--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -579,6 +579,7 @@ if (HAVE_COHERENT_DMA)
add_subdirectory(providers/bnxt_re)
add_subdirectory(providers/cxgb3) # NO SPARSE
add_subdirectory(providers/cxgb4) # NO SPARSE
+add_subdirectory(providers/efa)
add_subdirectory(providers/hns)
add_subdirectory(providers/i40iw) # NO SPARSE
add_subdirectory(providers/mlx4)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -61,6 +61,11 @@ M: Steve Wise <swise@opengridcomputing.com>
S: Supported
F: providers/cxgb4/
+EFA USERSPACE PROVIDER (for efa.ko)
+M: Gal Pressman <galpress@amazon.com>
+S: Supported
+F: providers/efa/
+
HFI1 USERSPACE PROVIDER (for hfi1.ko)
M: Mike Marciniszyn <mike.marciniszyn@intel.com>
M: Dennis Dalessandro <dennis.dalessandro@intel.com>
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@ is included:
- qedr.ko
- rdma_rxe.ko
- vmw_pvrdma.ko
+ - efa.ko
Additional service daemons are provided for:
- srp_daemon (ib_srp.ko)
--- a/debian/control
+++ b/debian/control
@@ -64,19 +64,21 @@ Package: ibverbs-providers
Architecture: linux-any
Multi-Arch: same
Depends: ${misc:Depends}, ${shlibs:Depends}
-Provides: libcxgb3-1, libipathverbs1, libmlx4-1, libmlx5-1, libmthca1, libnes1
+Provides: libcxgb3-1, libipathverbs1, libmlx4-1, libmlx5-1, libmthca1, libnes1, libefa-1
Replaces: libcxgb3-1 (<< 15),
libipathverbs1 (<< 15),
libmlx4-1 (<< 15),
libmlx5-1 (<< 15),
libmthca1 (<< 15),
- libnes1 (<< 15)
+ libnes1 (<< 15),
+ libefa-1 (<< 15)
Breaks: libcxgb3-1 (<< 15),
libipathverbs1 (<< 15),
libmlx4-1 (<< 15),
libmlx5-1 (<< 15),
libmthca1 (<< 15),
- libnes1 (<< 15)
+ libnes1 (<< 15),
+ libefa-1 (<< 15)
Description: User space provider drivers for libibverbs
libibverbs is a library that allows userspace processes to use RDMA
"verbs" as described in the InfiniBand Architecture Specification and
@@ -105,6 +107,7 @@ Description: User space provider drivers for libibverbs
- qedr: QLogic QL4xxx RoCE HCAs
- rxe: A software implementation of the RoCE protocol
- vmw_pvrdma: VMware paravirtual RDMA device
+ - efa: Amazon Elastic Fabric Adapter
Package: ibverbs-utils
Architecture: linux-any
--- a/debian/copyright
+++ b/debian/copyright
@@ -154,6 +154,10 @@ Files: providers/cxgb3/*
Copyright: 2003-2016, Chelsio Communications, Inc.
License: BSD-MIT or GPL-2
+Files: providers/efa/*
+Copyright: 2019 Amazon.com, Inc. or its affiliates.
+License: BSD-2-clause or GPL-2
+
Files: providers/hfi1verbs/*
Copyright: 2005 PathScale, Inc.
2006-2009 QLogic Corporation
--- a/kernel-headers/CMakeLists.txt
+++ b/kernel-headers/CMakeLists.txt
@@ -2,6 +2,7 @@ publish_internal_headers(rdma
rdma/bnxt_re-abi.h
rdma/cxgb3-abi.h
rdma/cxgb4-abi.h
+ rdma/efa-abi.h
rdma/hns-abi.h
rdma/i40iw-abi.h
rdma/ib_user_cm.h
@@ -60,6 +61,7 @@ rdma_kernel_provider_abi(
rdma/bnxt_re-abi.h
rdma/cxgb3-abi.h
rdma/cxgb4-abi.h
+ rdma/efa-abi.h
rdma/hns-abi.h
rdma/i40iw-abi.h
rdma/ib_user_verbs.h
diff --git a/kernel-headers/rdma/efa-abi.h b/kernel-headers/rdma/efa-abi.h
new file mode 100644
--- /dev/null
+++ b/kernel-headers/rdma/efa-abi.h
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef EFA_ABI_USER_H
+#define EFA_ABI_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define EFA_UVERBS_ABI_VERSION 1
+
+/*
+ * Keep structs aligned to 8 bytes.
+ * Keep reserved fields as arrays of __u8 named reserved_XXX where XXX is the
+ * hex bit offset of the field.
+ */
+
+enum efa_ibv_user_cmds_supp_udata {
+ EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE = 1 << 0,
+ EFA_USER_CMDS_SUPP_UDATA_CREATE_AH = 1 << 1,
+};
+
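+/*
+ * cmds_supp_udata_mask below is a bitmap of the values above; a set bit
+ * means the kernel accepts driver-specific udata for that verb.
+ */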
+struct efa_ibv_alloc_ucontext_resp {
+ __u32 comp_mask;
+ __u32 cmds_supp_udata_mask;
+ __u16 sub_cqs_per_cq;
+ __u16 inline_buf_size;
+ __u32 max_llq_size; /* bytes */
+};
+
+struct efa_ibv_alloc_pd_resp {
+ __u32 comp_mask;
+ __u16 pdn;
+ __u8 reserved_30[0x2];
+};
+
+struct efa_ibv_create_cq {
+ __u32 comp_mask;
+ __u32 cq_entry_size;
+ __u16 num_sub_cqs;
+ __u8 reserved_50[0x6];
+};
+
+struct efa_ibv_create_cq_resp {
+ __u32 comp_mask;
+ __u8 reserved_20[0x4];
+ __aligned_u64 q_mmap_key;
+ __aligned_u64 q_mmap_size;
+ __u16 cq_idx;
+ __u8 reserved_d0[0x6];
+};
+
+enum {
+ EFA_QP_DRIVER_TYPE_SRD = 0,
+};
+
+struct efa_ibv_create_qp {
+ __u32 comp_mask;
+ __u32 rq_ring_size; /* bytes */
+ __u32 sq_ring_size; /* bytes */
+ __u32 driver_qp_type;
+};
+
+struct efa_ibv_create_qp_resp {
+ __u32 comp_mask;
+ /* the offset inside the page of the rq db */
+ __u32 rq_db_offset;
+ /* the offset inside the page of the sq db */
+ __u32 sq_db_offset;
+ /* the offset inside the page of descriptors buffer */
+ __u32 llq_desc_offset;
+ __aligned_u64 rq_mmap_key;
+ __aligned_u64 rq_mmap_size;
+ __aligned_u64 rq_db_mmap_key;
+ __aligned_u64 sq_db_mmap_key;
+ __aligned_u64 llq_desc_mmap_key;
+ __u16 send_sub_cq_idx;
+ __u16 recv_sub_cq_idx;
+ __u8 reserved_1e0[0x4];
+};
+
+struct efa_ibv_create_ah_resp {
+ __u32 comp_mask;
+ __u16 efa_address_handle;
+ __u8 reserved_30[0x2];
+};
+
+struct efa_ibv_ex_query_device_resp {
+ __u32 comp_mask;
+ __u32 max_sq_wr;
+ __u32 max_rq_wr;
+ __u16 max_sq_sge;
+ __u16 max_rq_sge;
+};
+
+#endif /* EFA_ABI_USER_H */
--- a/kernel-headers/rdma/rdma_user_ioctl_cmds.h
+++ b/kernel-headers/rdma/rdma_user_ioctl_cmds.h
@@ -102,6 +102,7 @@ enum rdma_driver_id {
RDMA_DRIVER_RXE,
RDMA_DRIVER_HFI1,
RDMA_DRIVER_QIB,
+ RDMA_DRIVER_EFA,
};
#endif
@@ -1960,6 +1960,7 @@ extern const struct verbs_device_ops verbs_provider_ocrdma;
extern const struct verbs_device_ops verbs_provider_qedr;
extern const struct verbs_device_ops verbs_provider_rxe;
extern const struct verbs_device_ops verbs_provider_vmw_pvrdma;
+extern const struct verbs_device_ops verbs_provider_efa;
extern const struct verbs_device_ops verbs_provider_all;
extern const struct verbs_device_ops verbs_provider_none;
void ibv_static_providers(void *unused, ...);
diff --git a/providers/efa/CMakeLists.txt b/providers/efa/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/providers/efa/CMakeLists.txt
@@ -0,0 +1,4 @@
+rdma_provider(efa
+ efa.c
+ verbs.c
+)
diff --git a/providers/efa/efa-abi.h b/providers/efa/efa-abi.h
new file mode 100644
--- /dev/null
+++ b/providers/efa/efa-abi.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __EFA_ABI_H__
+#define __EFA_ABI_H__
+
+#include <infiniband/kern-abi.h>
+#include <kernel-abi/efa-abi.h>
+#include <rdma/efa-abi.h>
+
+#define EFA_ABI_VERSION 1
+
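+/*
+ * DECLARE_DRV_CMD combines the provider-specific command/response structs
+ * from the kernel ABI with the common ibverbs command headers; "empty"
+ * marks verbs that carry no driver-specific request payload.
+ */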
+DECLARE_DRV_CMD(efa_alloc_ucontext, IB_USER_VERBS_CMD_GET_CONTEXT, empty,
+ efa_ibv_alloc_ucontext_resp);
+DECLARE_DRV_CMD(efa_alloc_pd, IB_USER_VERBS_CMD_ALLOC_PD, empty,
+ efa_ibv_alloc_pd_resp);
+DECLARE_DRV_CMD(efa_create_cq, IB_USER_VERBS_CMD_CREATE_CQ, efa_ibv_create_cq,
+ efa_ibv_create_cq_resp);
+DECLARE_DRV_CMD(efa_create_qp, IB_USER_VERBS_CMD_CREATE_QP, efa_ibv_create_qp,
+ efa_ibv_create_qp_resp);
+DECLARE_DRV_CMD(efa_create_ah, IB_USER_VERBS_CMD_CREATE_AH, empty,
+ efa_ibv_create_ah_resp);
+DECLARE_DRV_CMD(efa_query_device_ex, IB_USER_VERBS_EX_CMD_QUERY_DEVICE, empty,
+ efa_ibv_ex_query_device_resp);
+
+#endif /* __EFA_ABI_H__ */
diff --git a/providers/efa/efa.c b/providers/efa/efa.c
new file mode 100644
--- /dev/null
+++ b/providers/efa/efa.c
@@ -0,0 +1,155 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+/*
+ * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "efa.h"
+#include "verbs.h"
+
+#define PCI_VENDOR_ID_AMAZON 0x1d0f
+
+static const struct verbs_match_ent efa_table[] = {
+ VERBS_PCI_MATCH(PCI_VENDOR_ID_AMAZON, 0xefa0, NULL),
+	{}, /* sentinel; rdma-core match tables are zero-terminated */
+};
+
+static const struct verbs_context_ops efa_ctx_ops = {
+ .alloc_pd = efa_alloc_pd,
+ .create_ah = efa_create_ah,
+ .create_cq = efa_create_cq,
+ .create_qp = efa_create_qp,
+ .dealloc_pd = efa_dealloc_pd,
+ .dereg_mr = efa_dereg_mr,
+ .destroy_ah = efa_destroy_ah,
+ .destroy_cq = efa_destroy_cq,
+ .destroy_qp = efa_destroy_qp,
+ .modify_qp = efa_modify_qp,
+ .poll_cq = efa_poll_cq,
+ .post_recv = efa_post_recv,
+ .post_send = efa_post_send,
+ .query_device = efa_query_device,
+ .query_device_ex = efa_query_device_ex,
+ .query_port = efa_query_port,
+ .query_qp = efa_query_qp,
+ .reg_mr = efa_reg_mr,
+};
+
+static struct verbs_context *efa_alloc_context(struct ibv_device *vdev,
+ int cmd_fd,
+ void *private_data)
+{
+ struct efa_alloc_ucontext_resp resp;
+ struct ibv_device_attr_ex attr;
+ struct ibv_get_context cmd;
+ struct efa_context *ctx;
+ int err;
+
+ ctx = verbs_init_and_alloc_context(vdev, cmd_fd, ctx, ibvctx,
+ RDMA_DRIVER_EFA);
+ if (!ctx)
+ return NULL;
+
+ memset(&resp, 0, sizeof(resp));
+ if (ibv_cmd_get_context(&ctx->ibvctx, &cmd, sizeof(cmd),
+ &resp.ibv_resp, sizeof(resp)))
+ goto failed;
+
+ ctx->sub_cqs_per_cq = resp.sub_cqs_per_cq;
+ ctx->cmds_supp_udata_mask = resp.cmds_supp_udata_mask;
+ ctx->cqe_size = sizeof(struct efa_io_rx_cdesc);
+ ctx->inline_buf_size = resp.inline_buf_size;
+ ctx->max_llq_size = resp.max_llq_size;
+ pthread_spin_init(&ctx->qp_table_lock, PTHREAD_PROCESS_PRIVATE);
+
+ verbs_set_ops(&ctx->ibvctx, &efa_ctx_ops);
+
+ err = efa_query_device_ex(&ctx->ibvctx.context, NULL, &attr,
+ sizeof(attr));
+ if (err)
+ goto failed;
+
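+	/* qp_table maps a QP number to its efa_qp, for completion processing. */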
+ ctx->qp_table = calloc(attr.orig_attr.max_qp, sizeof(*ctx->qp_table));
+ if (!ctx->qp_table)
+ goto failed;
+
+ return &ctx->ibvctx;
+
+failed:
+ verbs_uninit_context(&ctx->ibvctx);
+ free(ctx);
+ return NULL;
+}
+
+static void efa_free_context(struct ibv_context *ibvctx)
+{
+ struct efa_context *ctx = to_efa_context(ibvctx);
+
+ pthread_spin_destroy(&ctx->qp_table_lock);
+ verbs_uninit_context(&ctx->ibvctx);
+ free(ctx);
+}
+
+static struct verbs_device *efa_device_alloc(struct verbs_sysfs_dev *sysfs_dev)
+{
+ struct efa_dev *dev;
+
+ dev = calloc(1, sizeof(*dev));
+ if (!dev)
+ return NULL;
+
+ dev->pg_sz = sysconf(_SC_PAGESIZE);
+
+ return &dev->vdev;
+}
+
+static void efa_uninit_device(struct verbs_device *verbs_device)
+{
+ struct efa_dev *dev = to_efa_dev(&verbs_device->device);
+
+ free(dev);
+}
+
+static const struct verbs_device_ops efa_dev_ops = {
+ .name = "efa",
+ .match_min_abi_version = EFA_ABI_VERSION,
+ .match_max_abi_version = EFA_ABI_VERSION,
+ .match_table = efa_table,
+ .alloc_device = efa_device_alloc,
+ .uninit_device = efa_uninit_device,
+ .alloc_context = efa_alloc_context,
+ .free_context = efa_free_context,
+};
+PROVIDER_DRIVER(efa, efa_dev_ops);
diff --git a/providers/efa/efa.h b/providers/efa/efa.h
new file mode 100644
--- /dev/null
+++ b/providers/efa/efa.h
@@ -0,0 +1,200 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __EFA_H__
+#define __EFA_H__
+
+#include <inttypes.h>
+#include <pthread.h>
+#include <stddef.h>
+
+#include <infiniband/driver.h>
+#include <util/udma_barrier.h>
+
+#include "efa-abi.h"
+#include "efa_io_defs.h"
+
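+/* Round val up to the next multiple of align; align must be a power of two. */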
+static inline unsigned long align(unsigned long val, unsigned long align)
+{
+ return (val + align - 1) & ~(align - 1);
+}
+
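+/*
+ * Round a queue size up to the next power of two by smearing the highest
+ * set bit into all lower bits, then adding one.
+ */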
+static inline uint32_t align_up_queue_size(uint32_t req)
+{
+ req--;
+ req |= req >> 1;
+ req |= req >> 2;
+ req |= req >> 4;
+ req |= req >> 8;
+ req |= req >> 16;
+ req++;
+ return req;
+}
+
+struct efa_context {
+ struct verbs_context ibvctx;
+ uint32_t cmds_supp_udata_mask;
+ uint16_t sub_cqs_per_cq;
+ uint16_t inline_buf_size;
+ uint32_t max_llq_size;
+ size_t cqe_size;
+ struct efa_qp **qp_table;
+ pthread_spinlock_t qp_table_lock;
+};
+
+struct efa_pd {
+ struct ibv_pd ibvpd;
+ struct efa_context *context;
+ uint16_t pdn;
+};
+
+struct efa_sub_cq {
+ uint16_t consumed_cnt;
+ int phase;
+ uint8_t *buf;
+ int qmask;
+ int cqe_size;
+ uint32_t ref_cnt;
+};
+
+struct efa_cq {
+ struct ibv_cq ibvcq;
+ uint32_t cqn;
+ size_t cqe_size;
+ uint8_t *buf;
+ size_t buf_size;
+ struct efa_sub_cq *sub_cq_arr;
+ uint16_t num_sub_cqs;
+	/* Index of the next sub CQ to poll, used to guarantee fairness
+	 * between sub CQs.
+	 */
+ uint16_t next_poll_idx;
+ pthread_spinlock_t lock;
+};
+
+struct efa_wq {
+ uint64_t *wrid;
+	/* wrid_idx_pool: pool of free indexes into the wrid array, used to
+	 * select the wrid entry that holds the next packet's context.
+	 * At init time entry N holds value N; as out-of-order completions
+	 * arrive, the value stored in a given entry may no longer equal the
+	 * entry's index.
+	 */
+ uint32_t *wrid_idx_pool;
+ uint32_t wqe_cnt;
+ uint32_t wqe_posted;
+ uint32_t wqe_completed;
+ uint16_t desc_idx;
+ uint16_t desc_mask;
+ /* wrid_idx_pool_next: Index of the next entry to use in wrid_idx_pool. */
+ uint16_t wrid_idx_pool_next;
+ int max_sge;
+ int phase;
+ pthread_spinlock_t wqlock;
+};
+
+struct efa_rq {
+ struct efa_wq wq;
+ uint32_t *db;
+ uint8_t *buf;
+ size_t buf_size;
+ uint16_t sub_cq_idx;
+};
+
+struct efa_sq {
+ struct efa_wq wq;
+ uint32_t *db;
+ uint8_t *desc;
+ uint32_t desc_offset;
+ size_t desc_ring_mmap_size;
+ size_t max_inline_data;
+ uint16_t sub_cq_idx;
+};
+
+struct efa_qp {
+ struct ibv_qp ibvqp;
+ struct efa_sq sq;
+ struct efa_rq rq;
+ int page_size;
+ struct efa_cq *rcq;
+ struct efa_cq *scq;
+ int sq_sig_all;
+};
+
+struct efa_mr {
+ struct verbs_mr vmr;
+};
+
+struct efa_ah {
+ struct ibv_ah ibvah;
+ uint16_t efa_ah;
+};
+
+struct efa_dev {
+ struct verbs_device vdev;
+ uint8_t abi_version;
+ uint32_t pg_sz;
+ uint32_t max_sq_wr;
+ uint32_t max_rq_wr;
+ uint16_t max_sq_sge;
+ uint16_t max_rq_sge;
+};
+
+static inline struct efa_dev *to_efa_dev(struct ibv_device *ibvdev)
+{
+ return container_of(ibvdev, struct efa_dev, vdev.device);
+}
+
+static inline struct efa_context *to_efa_context(struct ibv_context *ibvctx)
+{
+ return container_of(ibvctx, struct efa_context, ibvctx.context);
+}
+
+static inline struct efa_pd *to_efa_pd(struct ibv_pd *ibvpd)
+{
+ return container_of(ibvpd, struct efa_pd, ibvpd);
+}
+
+static inline struct efa_cq *to_efa_cq(struct ibv_cq *ibvcq)
+{
+ return container_of(ibvcq, struct efa_cq, ibvcq);
+}
+
+static inline struct efa_qp *to_efa_qp(struct ibv_qp *ibvqp)
+{
+ return container_of(ibvqp, struct efa_qp, ibvqp);
+}
+
+static inline struct efa_ah *to_efa_ah(struct ibv_ah *ibvah)
+{
+ return container_of(ibvah, struct efa_ah, ibvah);
+}
+
+#endif /* __EFA_H__ */
diff --git a/providers/efa/efa_io_defs.h b/providers/efa/efa_io_defs.h
new file mode 100644
--- /dev/null
+++ b/providers/efa/efa_io_defs.h
@@ -0,0 +1,679 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _EFA_IO_H_
+#define _EFA_IO_H_
+
+#define BIT(nr) (1UL << (nr))
+#define GENMASK(h, l) (((1U << ((h) - (l) + 1)) - 1) << (l))
+
+#define EFA_IO_TX_DESC_NUM_BUFS 2
+#define EFA_IO_TX_DESC_INLINE_MAX_SIZE 32
+#define EFA_IO_TX_DESC_IMM_DATA_SIZE 4
+
+enum efa_io_queue_type {
+ /* send queue (of a QP) */
+ EFA_IO_SEND_QUEUE = 1,
+ /* recv queue (of a QP) */
+ EFA_IO_RECV_QUEUE = 2,
+};
+
+enum efa_io_send_op_type {
+ /* invalid op */
+ EFA_IO_INVALID_OP = 0,
+ /* send message */
+ EFA_IO_SEND = 1,
+	/* RDMA read (reserved; not yet supported) */
+ EFA_IO_RDMA_READ = 2,
+	/* RDMA write (reserved; not yet supported) */
+ EFA_IO_RDMA_WRITE = 3,
+};
+
+enum efa_io_comp_status {
+ /* Successful completion */
+ EFA_IO_COMP_STATUS_OK = 0,
+ /* Flushed during QP destroy */
+ EFA_IO_COMP_STATUS_FLUSHED = 1,
+ /* Internal QP error */
+ EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR = 2,
+ /* Bad operation type */
+ EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_OP_TYPE = 3,
+ /* Bad AH */
+ EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH = 4,
+ /* LKEY not registered or does not match IOVA */
+ EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY = 5,
+ /* Message too long */
+ EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH = 6,
+ /* Destination ENI is down or does not run EFA */
+ EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS = 7,
+ /* Connection was reset by remote side */
+ EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT = 8,
+ /* Bad dest QP number (QP does not exist or is in error state) */
+ EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN = 9,
+ /* Destination resource not ready (no WQEs posted on RQ) */
+ EFA_IO_COMP_STATUS_REMOTE_ERROR_RNR = 10,
+ /* Receiver SGL too short */
+ EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH = 11,
+ /* Unexpected status returned by responder */
+ EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS = 12,
+};
+
+/* Tx Meta descriptor. */
+struct efa_io_tx_meta_desc {
+ /* Verbs-generated Request ID */
+ uint16_t req_id;
+
+ /*
+ * control flags
+ * 3:0 : op_type - operation type: send/rdma/fast mem
+ * ops/etc
+ * 4 : has_imm - immediate_data field carries valid
+ * data.
+ * 5 : inline_msg - inline mode - inline message data
+ * follows this descriptor (no buffer descriptors).
+ * Note that it is different from immediate data
+ * 6 : meta_extension - Extended metadata. MBZ
+ * 7 : meta_desc - Indicates metadata descriptor.
+ * Must be set.
+ */
+ uint8_t ctrl1;
+
+ /*
+ * control flags
+ * 0 : phase - phase bit.
+ * 1 : reserved25 - MBZ
+ * 2 : first - Indicates first descriptor in
+ * transaction. Must be set.
+ * 3 : last - Indicates last descriptor in
+ * transaction. Must be set.
+ * 4 : comp_req - Indicates whether completion should
+ * be posted, after packet is transmitted. Valid only
+ * for the first descriptor
+ * 7:5 : reserved29 - MBZ
+ */
+ uint8_t ctrl2;
+
+ /* destination QP number */
+ uint16_t dest_qp_num;
+
+ /*
+ * If inline_msg bit is set, length of inline message in bytes,
+ * otherwise length of SGL (number of buffers).
+ */
+ uint16_t len;
+
+ /*
+	 * immediate data: if has_imm is set, this field is included in the
+	 * Tx message and reported in the remote Rx completion.
+ */
+ uint32_t immediate_data;
+
+ /* Address handle */
+ uint16_t ah;
+
+ uint16_t reserved;
+};
+
+/*
+ * Tx buffer descriptor, for any transport type. Preceded by metadata
+ * descriptor.
+ */
+struct efa_io_tx_buf_desc {
+ /* length in bytes */
+ uint16_t length;
+
+ /*
+ * control flags
+ * 6:0 : reserved16
+ * 7 : meta_desc - MBZ
+ */
+ uint8_t ctrl1;
+
+ /*
+ * control flags
+ * 0 : phase - phase bit
+ * 1 : reserved25 - MBZ
+ * 2 : first - Indicates first descriptor in
+ * transaction. MBZ
+ * 3 : last - Indicates last descriptor in transaction
+ * 7:4 : reserved28 - MBZ
+ */
+ uint8_t ctrl;
+
+ /* memory translation key */
+ uint32_t lkey;
+
+ /* Buffer address bits[31:0] */
+ uint32_t buf_addr_lo;
+
+ /*
+ * 15:0 : buf_addr_hi - Buffer Pointer[47:32]
+ * 31:16 : reserved - Reserved
+ */
+ uint32_t buf_addr_hi;
+};
+
+/* Tx meta descriptor for UD */
+struct efa_io_tx_ud_meta {
+ /* Queue key */
+ uint32_t qkey;
+
+ uint8_t reserved[12];
+};
+
+/* Remote memory address */
+struct efa_io_remote_mem_addr {
+ /* length in bytes */
+ uint16_t length;
+
+ /*
+ * control flags
+ * 5:0 : reserved16
+ * 6 : meta_extension - Must be set
+ * 7 : meta_desc - Must be set
+ */
+ uint8_t ctrl1;
+
+ /*
+ * control flags
+ * 0 : phase - phase bit
+ * 1 : reserved25 - MBZ
+ * 2 : first - Indicates first descriptor in
+ * transaction. MBZ
+ * 3 : last - Indicates last descriptor in transaction
+ * 7:4 : reserved28 - MBZ
+ */
+ uint8_t ctrl;
+
+ /* remote memory translation key */
+ uint32_t rkey;
+
+ /* Buffer address bits[31:0] */
+ uint32_t buf_addr_lo;
+
+ /* Buffer address bits[63:32] */
+ uint32_t buf_addr_hi;
+};
+
+/*
+ * Tx WQE, composed of tx meta descriptors followed by either tx buffer
+ * descriptors or inline data
+ */
+struct efa_io_tx_wqe {
+ /* TX meta */
+ struct efa_io_tx_meta_desc common;
+
+ union {
+ /* Tx meta for UD */
+ struct efa_io_tx_ud_meta ud;
+
+ /* Reserved Tx meta for SRD */
+ uint8_t srd_padding[16];
+
+ /* RDMA memory address */
+ struct efa_io_remote_mem_addr rdma_mem_addr;
+ } u;
+
+ union {
+ /* buffer descriptors */
+ struct efa_io_tx_buf_desc sgl[2];
+
+ /* inline data */
+ uint8_t inline_data[32];
+ } data;
+};
+
+/*
+ * Rx buffer descriptor; RX WQE is composed of one or more RX buffer
+ * descriptors.
+ */
+struct efa_io_rx_desc {
+ /* Buffer address bits[31:0] */
+ uint32_t buf_addr_lo;
+
+ /* Buffer Pointer[63:32] */
+ uint32_t buf_addr_hi;
+
+ /* Verbs-generated request id. */
+ uint16_t req_id;
+
+ /* Length in bytes. */
+ uint16_t length;
+
+ /*
+ * LKey and control flags
+ * 23:0 : lkey
+ * 29:24 : reserved - MBZ
+ * 30 : first - Indicates first descriptor in WQE
+ * 31 : last - Indicates last descriptor in WQE
+ */
+ uint32_t lkey_ctrl;
+};
+
+/* Common IO completion descriptor */
+struct efa_io_cdesc_common {
+ /*
+ * verbs-generated request ID, as provided in the completed tx or rx
+ * descriptor.
+ */
+ uint16_t req_id;
+
+ /* status */
+ uint8_t status;
+
+ /*
+ * flags
+ * 0 : phase - Phase bit
+ * 2:1 : q_type - enum efa_io_queue_type: send/recv
+ * 3 : has_imm - indicates that immediate data is
+ * present - for RX completions only
+ * 4 : wide_completion - indicates that wide
+ * completion format is used
+ * 7:5 : reserved29
+ */
+ uint8_t flags;
+
+ /* local QP number */
+ uint16_t qp_num;
+
+ /* Transferred length */
+ uint16_t length;
+};
+
+/* Tx completion descriptor */
+struct efa_io_tx_cdesc {
+ /* Common completion info */
+ struct efa_io_cdesc_common common;
+};
+
+/* Rx Completion Descriptor */
+struct efa_io_rx_cdesc {
+ /* Common completion info */
+ struct efa_io_cdesc_common common;
+
+ /* Remote Address Handle FW index, 0xFFFF indicates invalid ah */
+ uint16_t ah;
+
+ /* Source QP number */
+ uint16_t src_qp_num;
+
+ /* Immediate data */
+ uint32_t imm;
+};
+
+/* Extended Rx Completion Descriptor */
+struct efa_io_rx_cdesc_wide {
+ /* Base RX completion info */
+ struct efa_io_rx_cdesc rx_cdesc_base;
+
+ /*
+ * Word 0 of remote (source) address, needed only for in-band
+ * ad-hoc AH support
+ */
+ uint32_t src_addr_0;
+
+ /*
+ * Word 1 of remote (source) address, needed only for in-band
+ * ad-hoc AH support
+ */
+ uint32_t src_addr_1;
+
+ /*
+ * Word 2 of remote (source) address, needed only for in-band
+ * ad-hoc AH support
+ */
+ uint32_t src_addr_2;
+
+ /*
+ * Word 3 of remote (source) address, needed only for in-band
+ * ad-hoc AH support
+ */
+ uint32_t src_addr_3;
+};
+
+/* tx_meta_desc */
+#define EFA_IO_TX_META_DESC_OP_TYPE_MASK GENMASK(3, 0)
+#define EFA_IO_TX_META_DESC_HAS_IMM_SHIFT 4
+#define EFA_IO_TX_META_DESC_HAS_IMM_MASK BIT(4)
+#define EFA_IO_TX_META_DESC_INLINE_MSG_SHIFT 5
+#define EFA_IO_TX_META_DESC_INLINE_MSG_MASK BIT(5)
+#define EFA_IO_TX_META_DESC_META_EXTENSION_SHIFT 6
+#define EFA_IO_TX_META_DESC_META_EXTENSION_MASK BIT(6)
+#define EFA_IO_TX_META_DESC_META_DESC_SHIFT 7
+#define EFA_IO_TX_META_DESC_META_DESC_MASK BIT(7)
+#define EFA_IO_TX_META_DESC_PHASE_MASK BIT(0)
+#define EFA_IO_TX_META_DESC_FIRST_SHIFT 2
+#define EFA_IO_TX_META_DESC_FIRST_MASK BIT(2)
+#define EFA_IO_TX_META_DESC_LAST_SHIFT 3
+#define EFA_IO_TX_META_DESC_LAST_MASK BIT(3)
+#define EFA_IO_TX_META_DESC_COMP_REQ_SHIFT 4
+#define EFA_IO_TX_META_DESC_COMP_REQ_MASK BIT(4)
+
+/* tx_buf_desc */
+#define EFA_IO_TX_BUF_DESC_META_DESC_SHIFT 7
+#define EFA_IO_TX_BUF_DESC_META_DESC_MASK BIT(7)
+#define EFA_IO_TX_BUF_DESC_PHASE_MASK BIT(0)
+#define EFA_IO_TX_BUF_DESC_FIRST_SHIFT 2
+#define EFA_IO_TX_BUF_DESC_FIRST_MASK BIT(2)
+#define EFA_IO_TX_BUF_DESC_LAST_SHIFT 3
+#define EFA_IO_TX_BUF_DESC_LAST_MASK BIT(3)
+#define EFA_IO_TX_BUF_DESC_BUF_ADDR_HI_MASK GENMASK(15, 0)
+
+/* remote_mem_addr */
+#define EFA_IO_REMOTE_MEM_ADDR_META_EXTENSION_SHIFT 6
+#define EFA_IO_REMOTE_MEM_ADDR_META_EXTENSION_MASK BIT(6)
+#define EFA_IO_REMOTE_MEM_ADDR_META_DESC_SHIFT 7
+#define EFA_IO_REMOTE_MEM_ADDR_META_DESC_MASK BIT(7)
+#define EFA_IO_REMOTE_MEM_ADDR_PHASE_MASK BIT(0)
+#define EFA_IO_REMOTE_MEM_ADDR_FIRST_SHIFT 2
+#define EFA_IO_REMOTE_MEM_ADDR_FIRST_MASK BIT(2)
+#define EFA_IO_REMOTE_MEM_ADDR_LAST_SHIFT 3
+#define EFA_IO_REMOTE_MEM_ADDR_LAST_MASK BIT(3)
+
+/* rx_desc */
+#define EFA_IO_RX_DESC_LKEY_MASK GENMASK(23, 0)
+#define EFA_IO_RX_DESC_FIRST_SHIFT 30
+#define EFA_IO_RX_DESC_FIRST_MASK BIT(30)
+#define EFA_IO_RX_DESC_LAST_SHIFT 31
+#define EFA_IO_RX_DESC_LAST_MASK BIT(31)
+
+/* cdesc_common */
+#define EFA_IO_CDESC_COMMON_PHASE_MASK BIT(0)
+#define EFA_IO_CDESC_COMMON_Q_TYPE_SHIFT 1
+#define EFA_IO_CDESC_COMMON_Q_TYPE_MASK GENMASK(2, 1)
+#define EFA_IO_CDESC_COMMON_HAS_IMM_SHIFT 3
+#define EFA_IO_CDESC_COMMON_HAS_IMM_MASK BIT(3)
+#define EFA_IO_CDESC_COMMON_WIDE_COMPLETION_SHIFT 4
+#define EFA_IO_CDESC_COMMON_WIDE_COMPLETION_MASK BIT(4)
+
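+/*
+ * Accessors for the sub-byte fields above. Note that the setters OR the
+ * value in, so they assume the target field is currently zero.
+ */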
+static inline uint8_t get_efa_io_tx_meta_desc_op_type(const struct efa_io_tx_meta_desc *p)
+{
+ return p->ctrl1 & EFA_IO_TX_META_DESC_OP_TYPE_MASK;
+}
+
+static inline void set_efa_io_tx_meta_desc_op_type(struct efa_io_tx_meta_desc *p, uint8_t val)
+{
+ p->ctrl1 |= val & EFA_IO_TX_META_DESC_OP_TYPE_MASK;
+}
+
+static inline uint8_t get_efa_io_tx_meta_desc_has_imm(const struct efa_io_tx_meta_desc *p)
+{
+ return (p->ctrl1 & EFA_IO_TX_META_DESC_HAS_IMM_MASK) >> EFA_IO_TX_META_DESC_HAS_IMM_SHIFT;
+}
+
+static inline void set_efa_io_tx_meta_desc_has_imm(struct efa_io_tx_meta_desc *p, uint8_t val)
+{
+ p->ctrl1 |= (val << EFA_IO_TX_META_DESC_HAS_IMM_SHIFT) & EFA_IO_TX_META_DESC_HAS_IMM_MASK;
+}
+
+static inline uint8_t get_efa_io_tx_meta_desc_inline_msg(const struct efa_io_tx_meta_desc *p)
+{
+ return (p->ctrl1 & EFA_IO_TX_META_DESC_INLINE_MSG_MASK) >> EFA_IO_TX_META_DESC_INLINE_MSG_SHIFT;
+}
+
+static inline void set_efa_io_tx_meta_desc_inline_msg(struct efa_io_tx_meta_desc *p, uint8_t val)
+{
+ p->ctrl1 |= (val << EFA_IO_TX_META_DESC_INLINE_MSG_SHIFT) & EFA_IO_TX_META_DESC_INLINE_MSG_MASK;
+}
+
+static inline uint8_t get_efa_io_tx_meta_desc_meta_extension(const struct efa_io_tx_meta_desc *p)
+{
+ return (p->ctrl1 & EFA_IO_TX_META_DESC_META_EXTENSION_MASK) >> EFA_IO_TX_META_DESC_META_EXTENSION_SHIFT;
+}
+
+static inline void set_efa_io_tx_meta_desc_meta_extension(struct efa_io_tx_meta_desc *p, uint8_t val)
+{
+ p->ctrl1 |= (val << EFA_IO_TX_META_DESC_META_EXTENSION_SHIFT) & EFA_IO_TX_META_DESC_META_EXTENSION_MASK;
+}
+
+static inline uint8_t get_efa_io_tx_meta_desc_meta_desc(const struct efa_io_tx_meta_desc *p)
+{
+ return (p->ctrl1 & EFA_IO_TX_META_DESC_META_DESC_MASK) >> EFA_IO_TX_META_DESC_META_DESC_SHIFT;
+}
+
+static inline void set_efa_io_tx_meta_desc_meta_desc(struct efa_io_tx_meta_desc *p, uint8_t val)
+{
+ p->ctrl1 |= (val << EFA_IO_TX_META_DESC_META_DESC_SHIFT) & EFA_IO_TX_META_DESC_META_DESC_MASK;
+}
+
+static inline uint8_t get_efa_io_tx_meta_desc_phase(const struct efa_io_tx_meta_desc *p)
+{
+ return p->ctrl2 & EFA_IO_TX_META_DESC_PHASE_MASK;
+}
+
+static inline void set_efa_io_tx_meta_desc_phase(struct efa_io_tx_meta_desc *p, uint8_t val)
+{
+ p->ctrl2 |= val & EFA_IO_TX_META_DESC_PHASE_MASK;
+}
+
+static inline uint8_t get_efa_io_tx_meta_desc_first(const struct efa_io_tx_meta_desc *p)
+{
+ return (p->ctrl2 & EFA_IO_TX_META_DESC_FIRST_MASK) >> EFA_IO_TX_META_DESC_FIRST_SHIFT;
+}
+
+static inline void set_efa_io_tx_meta_desc_first(struct efa_io_tx_meta_desc *p, uint8_t val)
+{
+ p->ctrl2 |= (val << EFA_IO_TX_META_DESC_FIRST_SHIFT) & EFA_IO_TX_META_DESC_FIRST_MASK;
+}
+
+static inline uint8_t get_efa_io_tx_meta_desc_last(const struct efa_io_tx_meta_desc *p)
+{
+ return (p->ctrl2 & EFA_IO_TX_META_DESC_LAST_MASK) >> EFA_IO_TX_META_DESC_LAST_SHIFT;
+}
+
+static inline void set_efa_io_tx_meta_desc_last(struct efa_io_tx_meta_desc *p, uint8_t val)
+{
+ p->ctrl2 |= (val << EFA_IO_TX_META_DESC_LAST_SHIFT) & EFA_IO_TX_META_DESC_LAST_MASK;
+}
+
+static inline uint8_t get_efa_io_tx_meta_desc_comp_req(const struct efa_io_tx_meta_desc *p)
+{
+ return (p->ctrl2 & EFA_IO_TX_META_DESC_COMP_REQ_MASK) >> EFA_IO_TX_META_DESC_COMP_REQ_SHIFT;
+}
+
+static inline void set_efa_io_tx_meta_desc_comp_req(struct efa_io_tx_meta_desc *p, uint8_t val)
+{
+ p->ctrl2 |= (val << EFA_IO_TX_META_DESC_COMP_REQ_SHIFT) & EFA_IO_TX_META_DESC_COMP_REQ_MASK;
+}
+
+static inline uint8_t get_efa_io_tx_buf_desc_meta_desc(const struct efa_io_tx_buf_desc *p)
+{
+ return (p->ctrl1 & EFA_IO_TX_BUF_DESC_META_DESC_MASK) >> EFA_IO_TX_BUF_DESC_META_DESC_SHIFT;
+}
+
+static inline void set_efa_io_tx_buf_desc_meta_desc(struct efa_io_tx_buf_desc *p, uint8_t val)
+{
+ p->ctrl1 |= (val << EFA_IO_TX_BUF_DESC_META_DESC_SHIFT) & EFA_IO_TX_BUF_DESC_META_DESC_MASK;
+}
+
+static inline uint8_t get_efa_io_tx_buf_desc_phase(const struct efa_io_tx_buf_desc *p)
+{
+ return p->ctrl & EFA_IO_TX_BUF_DESC_PHASE_MASK;
+}
+
+static inline void set_efa_io_tx_buf_desc_phase(struct efa_io_tx_buf_desc *p, uint8_t val)
+{
+ p->ctrl |= val & EFA_IO_TX_BUF_DESC_PHASE_MASK;
+}
+
+static inline uint8_t get_efa_io_tx_buf_desc_first(const struct efa_io_tx_buf_desc *p)
+{
+ return (p->ctrl & EFA_IO_TX_BUF_DESC_FIRST_MASK) >> EFA_IO_TX_BUF_DESC_FIRST_SHIFT;
+}
+
+static inline void set_efa_io_tx_buf_desc_first(struct efa_io_tx_buf_desc *p, uint8_t val)
+{
+ p->ctrl |= (val << EFA_IO_TX_BUF_DESC_FIRST_SHIFT) & EFA_IO_TX_BUF_DESC_FIRST_MASK;
+}
+
+static inline uint8_t get_efa_io_tx_buf_desc_last(const struct efa_io_tx_buf_desc *p)
+{
+ return (p->ctrl & EFA_IO_TX_BUF_DESC_LAST_MASK) >> EFA_IO_TX_BUF_DESC_LAST_SHIFT;
+}
+
+static inline void set_efa_io_tx_buf_desc_last(struct efa_io_tx_buf_desc *p, uint8_t val)
+{
+ p->ctrl |= (val << EFA_IO_TX_BUF_DESC_LAST_SHIFT) & EFA_IO_TX_BUF_DESC_LAST_MASK;
+}
+
+static inline uint32_t get_efa_io_tx_buf_desc_buf_addr_hi(const struct efa_io_tx_buf_desc *p)
+{
+ return p->buf_addr_hi & EFA_IO_TX_BUF_DESC_BUF_ADDR_HI_MASK;
+}
+
+static inline void set_efa_io_tx_buf_desc_buf_addr_hi(struct efa_io_tx_buf_desc *p, uint32_t val)
+{
+ p->buf_addr_hi |= val & EFA_IO_TX_BUF_DESC_BUF_ADDR_HI_MASK;
+}
+
+static inline uint8_t get_efa_io_remote_mem_addr_meta_extension(const struct efa_io_remote_mem_addr *p)
+{
+ return (p->ctrl1 & EFA_IO_REMOTE_MEM_ADDR_META_EXTENSION_MASK) >> EFA_IO_REMOTE_MEM_ADDR_META_EXTENSION_SHIFT;
+}
+
+static inline void set_efa_io_remote_mem_addr_meta_extension(struct efa_io_remote_mem_addr *p, uint8_t val)
+{
+ p->ctrl1 |= (val << EFA_IO_REMOTE_MEM_ADDR_META_EXTENSION_SHIFT) & EFA_IO_REMOTE_MEM_ADDR_META_EXTENSION_MASK;
+}
+
+static inline uint8_t get_efa_io_remote_mem_addr_meta_desc(const struct efa_io_remote_mem_addr *p)
+{
+ return (p->ctrl1 & EFA_IO_REMOTE_MEM_ADDR_META_DESC_MASK) >> EFA_IO_REMOTE_MEM_ADDR_META_DESC_SHIFT;
+}
+
+static inline void set_efa_io_remote_mem_addr_meta_desc(struct efa_io_remote_mem_addr *p, uint8_t val)
+{
+ p->ctrl1 |= (val << EFA_IO_REMOTE_MEM_ADDR_META_DESC_SHIFT) & EFA_IO_REMOTE_MEM_ADDR_META_DESC_MASK;
+}
+
+static inline uint8_t get_efa_io_remote_mem_addr_phase(const struct efa_io_remote_mem_addr *p)
+{
+ return p->ctrl & EFA_IO_REMOTE_MEM_ADDR_PHASE_MASK;
+}
+
+static inline void set_efa_io_remote_mem_addr_phase(struct efa_io_remote_mem_addr *p, uint8_t val)
+{
+ p->ctrl |= val & EFA_IO_REMOTE_MEM_ADDR_PHASE_MASK;
+}
+
+static inline uint8_t get_efa_io_remote_mem_addr_first(const struct efa_io_remote_mem_addr *p)
+{
+ return (p->ctrl & EFA_IO_REMOTE_MEM_ADDR_FIRST_MASK) >> EFA_IO_REMOTE_MEM_ADDR_FIRST_SHIFT;
+}
+
+static inline void set_efa_io_remote_mem_addr_first(struct efa_io_remote_mem_addr *p, uint8_t val)
+{
+ p->ctrl |= (val << EFA_IO_REMOTE_MEM_ADDR_FIRST_SHIFT) & EFA_IO_REMOTE_MEM_ADDR_FIRST_MASK;
+}
+
+static inline uint8_t get_efa_io_remote_mem_addr_last(const struct efa_io_remote_mem_addr *p)
+{
+ return (p->ctrl & EFA_IO_REMOTE_MEM_ADDR_LAST_MASK) >> EFA_IO_REMOTE_MEM_ADDR_LAST_SHIFT;
+}
+
+static inline void set_efa_io_remote_mem_addr_last(struct efa_io_remote_mem_addr *p, uint8_t val)
+{
+ p->ctrl |= (val << EFA_IO_REMOTE_MEM_ADDR_LAST_SHIFT) & EFA_IO_REMOTE_MEM_ADDR_LAST_MASK;
+}
+
+static inline uint32_t get_efa_io_rx_desc_lkey(const struct efa_io_rx_desc *p)
+{
+ return p->lkey_ctrl & EFA_IO_RX_DESC_LKEY_MASK;
+}
+
+static inline void set_efa_io_rx_desc_lkey(struct efa_io_rx_desc *p, uint32_t val)
+{
+ p->lkey_ctrl |= val & EFA_IO_RX_DESC_LKEY_MASK;
+}
+
+static inline uint32_t get_efa_io_rx_desc_first(const struct efa_io_rx_desc *p)
+{
+ return (p->lkey_ctrl & EFA_IO_RX_DESC_FIRST_MASK) >> EFA_IO_RX_DESC_FIRST_SHIFT;
+}
+
+static inline void set_efa_io_rx_desc_first(struct efa_io_rx_desc *p, uint32_t val)
+{
+ p->lkey_ctrl |= (val << EFA_IO_RX_DESC_FIRST_SHIFT) & EFA_IO_RX_DESC_FIRST_MASK;
+}
+
+static inline uint32_t get_efa_io_rx_desc_last(const struct efa_io_rx_desc *p)
+{
+ return (p->lkey_ctrl & EFA_IO_RX_DESC_LAST_MASK) >> EFA_IO_RX_DESC_LAST_SHIFT;
+}
+
+static inline void set_efa_io_rx_desc_last(struct efa_io_rx_desc *p, uint32_t val)
+{
+ p->lkey_ctrl |= (val << EFA_IO_RX_DESC_LAST_SHIFT) & EFA_IO_RX_DESC_LAST_MASK;
+}
+
+static inline uint8_t get_efa_io_cdesc_common_phase(const struct efa_io_cdesc_common *p)
+{
+ return p->flags & EFA_IO_CDESC_COMMON_PHASE_MASK;
+}
+
+static inline void set_efa_io_cdesc_common_phase(struct efa_io_cdesc_common *p, uint8_t val)
+{
+ p->flags |= val & EFA_IO_CDESC_COMMON_PHASE_MASK;
+}
+
+static inline uint8_t get_efa_io_cdesc_common_q_type(const struct efa_io_cdesc_common *p)
+{
+ return (p->flags & EFA_IO_CDESC_COMMON_Q_TYPE_MASK) >> EFA_IO_CDESC_COMMON_Q_TYPE_SHIFT;
+}
+
+static inline void set_efa_io_cdesc_common_q_type(struct efa_io_cdesc_common *p, uint8_t val)
+{
+ p->flags |= (val << EFA_IO_CDESC_COMMON_Q_TYPE_SHIFT) & EFA_IO_CDESC_COMMON_Q_TYPE_MASK;
+}
+
+static inline uint8_t get_efa_io_cdesc_common_has_imm(const struct efa_io_cdesc_common *p)
+{
+ return (p->flags & EFA_IO_CDESC_COMMON_HAS_IMM_MASK) >> EFA_IO_CDESC_COMMON_HAS_IMM_SHIFT;
+}
+
+static inline void set_efa_io_cdesc_common_has_imm(struct efa_io_cdesc_common *p, uint8_t val)
+{
+ p->flags |= (val << EFA_IO_CDESC_COMMON_HAS_IMM_SHIFT) & EFA_IO_CDESC_COMMON_HAS_IMM_MASK;
+}
+
+static inline uint8_t get_efa_io_cdesc_common_wide_completion(const struct efa_io_cdesc_common *p)
+{
+ return (p->flags & EFA_IO_CDESC_COMMON_WIDE_COMPLETION_MASK) >> EFA_IO_CDESC_COMMON_WIDE_COMPLETION_SHIFT;
+}
+
+static inline void set_efa_io_cdesc_common_wide_completion(struct efa_io_cdesc_common *p, uint8_t val)
+{
+ p->flags |= (val << EFA_IO_CDESC_COMMON_WIDE_COMPLETION_SHIFT) & EFA_IO_CDESC_COMMON_WIDE_COMPLETION_MASK;
+}
+
+#endif /* _EFA_IO_H_ */
diff --git a/providers/efa/verbs.c b/providers/efa/verbs.c
new file mode 100644
--- /dev/null
+++ b/providers/efa/verbs.c
@@ -0,0 +1,1142 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+/*
+ * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <malloc.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <ccan/minmax.h>
+
+#include <util/compiler.h>
+#include <util/mmio.h>
+
+#include "efa.h"
+#include "verbs.h"
+
+int efa_query_device(struct ibv_context *ibvctx,
+ struct ibv_device_attr *dev_attr)
+{
+ struct efa_context *ctx = to_efa_context(ibvctx);
+ struct ibv_query_device cmd;
+ uint8_t fw_ver[8];
+ int err;
+
+ memset(dev_attr, 0, sizeof(struct ibv_device_attr));
+ err = ibv_cmd_query_device(ibvctx, dev_attr, (uint64_t *)&fw_ver,
+ &cmd, sizeof(cmd));
+ if (err)
+ return err;
+
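+	/* A send WQE must fit in the device's low-latency queue; cap the
+	 * advertised max_qp_wr accordingly.
+	 */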
+ dev_attr->max_qp_wr = min_t(int, dev_attr->max_qp_wr,
+ ctx->max_llq_size / sizeof(struct efa_io_tx_wqe));
+ snprintf(dev_attr->fw_ver, sizeof(dev_attr->fw_ver), "%u.%u.%u.%u",
+ fw_ver[0], fw_ver[1], fw_ver[2], fw_ver[3]);
+
+ return 0;
+}
+
+int efa_query_port(struct ibv_context *ibvctx, uint8_t port,
+ struct ibv_port_attr *port_attr)
+{
+ struct ibv_query_port cmd;
+
+ memset(port_attr, 0, sizeof(struct ibv_port_attr));
+ return ibv_cmd_query_port(ibvctx, port, port_attr, &cmd, sizeof(cmd));
+}
+
+int efa_query_device_ex(struct ibv_context *context,
+ const struct ibv_query_device_ex_input *input,
+ struct ibv_device_attr_ex *attr,
+ size_t attr_size)
+{
+ struct efa_context *ctx = to_efa_context(context);
+ struct efa_dev *dev = to_efa_dev(context->device);
+ int cmd_supp_uhw = ctx->cmds_supp_udata_mask &
+ EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE;
+ struct efa_query_device_ex_resp resp;
+ struct ibv_query_device_ex cmd;
+ struct ibv_device_attr *a;
+ uint8_t fw_ver[8];
+ int err;
+
+ memset(&resp, 0, sizeof(resp));
+ err = ibv_cmd_query_device_ex(
+ context, input, attr, attr_size, (uint64_t *)&fw_ver, &cmd,
+ sizeof(cmd), &resp.ibv_resp,
+ cmd_supp_uhw ? sizeof(resp) : sizeof(resp.ibv_resp));
+ if (err)
+ return err;
+
+ dev->max_sq_wr = resp.max_sq_wr;
+ dev->max_rq_wr = resp.max_rq_wr;
+ dev->max_sq_sge = resp.max_sq_sge;
+ dev->max_rq_sge = resp.max_rq_sge;
+
+ a = &attr->orig_attr;
+ a->max_qp_wr = min_t(int, a->max_qp_wr,
+ ctx->max_llq_size / sizeof(struct efa_io_tx_wqe));
+ snprintf(a->fw_ver, sizeof(a->fw_ver), "%u.%u.%u.%u",
+ fw_ver[0], fw_ver[1], fw_ver[2], fw_ver[3]);
+
+ return 0;
+}
+
+struct ibv_pd *efa_alloc_pd(struct ibv_context *ibvctx)
+{
+ struct efa_alloc_pd_resp resp = {};
+ struct ibv_alloc_pd cmd;
+ struct efa_pd *pd;
+
+ pd = calloc(1, sizeof(*pd));
+ if (!pd)
+ return NULL;
+
+ if (ibv_cmd_alloc_pd(ibvctx, &pd->ibvpd, &cmd, sizeof(cmd),
+ &resp.ibv_resp, sizeof(resp)))
+ goto out;
+
+ pd->context = to_efa_context(ibvctx);
+ pd->pdn = resp.pdn;
+
+ return &pd->ibvpd;
+
+out:
+ free(pd);
+ return NULL;
+}
+
+int efa_dealloc_pd(struct ibv_pd *ibvpd)
+{
+ struct efa_pd *pd = to_efa_pd(ibvpd);
+ int err;
+
+ err = ibv_cmd_dealloc_pd(ibvpd);
+ if (err)
+ return err;
+ free(pd);
+
+ return 0;
+}
+
+struct ibv_mr *efa_reg_mr(struct ibv_pd *ibvpd, void *sva, size_t len,
+ int access)
+{
+ struct ib_uverbs_reg_mr_resp resp;
+ struct ibv_reg_mr cmd;
+ struct efa_mr *mr;
+
+ mr = calloc(1, sizeof(*mr));
+ if (!mr)
+ return NULL;
+
+ if (ibv_cmd_reg_mr(ibvpd, sva, len, (uintptr_t)sva, access, &mr->vmr,
+ &cmd, sizeof(cmd), &resp, sizeof(resp))) {
+ free(mr);
+ return NULL;
+ }
+
+ return &mr->vmr.ibv_mr;
+}
+
+int efa_dereg_mr(struct verbs_mr *vmr)
+{
+ struct efa_mr *mr = container_of(vmr, struct efa_mr, vmr);
+ int err;
+
+ err = ibv_cmd_dereg_mr(vmr);
+ if (err)
+ return err;
+ free(mr);
+
+ return 0;
+}
+
+static uint32_t efa_sub_cq_get_current_index(struct efa_sub_cq *sub_cq)
+{
+ return sub_cq->consumed_cnt & sub_cq->qmask;
+}
+
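+/*
+ * A CQE is valid for the current lap of the ring when its phase bit
+ * matches the sub CQ's expected phase, which flips on every wrap-around.
+ */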
+static int efa_cqe_is_pending(struct efa_io_cdesc_common *cqe_common,
+ int phase)
+{
+ return (cqe_common->flags & EFA_IO_CDESC_COMMON_PHASE_MASK) == phase;
+}
+
+static struct efa_io_cdesc_common *
+efa_sub_cq_get_cqe(struct efa_sub_cq *sub_cq, int entry)
+{
+ return (struct efa_io_cdesc_common *)(sub_cq->buf +
+ (entry * sub_cq->cqe_size));
+}
+
+static void efa_sub_cq_initialize(struct efa_sub_cq *sub_cq, uint8_t *buf,
+ int sub_cq_size, int cqe_size)
+{
+ sub_cq->consumed_cnt = 0;
+ sub_cq->phase = 1;
+ sub_cq->buf = buf;
+ sub_cq->qmask = sub_cq_size - 1;
+ sub_cq->cqe_size = cqe_size;
+ sub_cq->ref_cnt = 0;
+}
+
+struct ibv_cq *efa_create_cq(struct ibv_context *ibvctx, int ncqe,
+ struct ibv_comp_channel *channel, int vec)
+{
+ struct efa_context *ctx = to_efa_context(ibvctx);
+ struct efa_create_cq_resp resp = {};
+ struct efa_create_cq cmd = {};
+ uint16_t num_sub_cqs;
+ struct efa_cq *cq;
+ int sub_buf_size;
+ int sub_cq_size;
+ uint8_t *buf;
+ int i;
+
+ cq = calloc(1, sizeof(*cq));
+ if (!cq)
+ return NULL;
+
+ num_sub_cqs = ctx->sub_cqs_per_cq;
+ cmd.num_sub_cqs = num_sub_cqs;
+ cmd.cq_entry_size = ctx->cqe_size;
+
+ ncqe = align_up_queue_size(ncqe);
+ if (ibv_cmd_create_cq(ibvctx, ncqe, channel, vec,
+ &cq->ibvcq, &cmd.ibv_cmd, sizeof(cmd),
+ &resp.ibv_resp, sizeof(resp)))
+ goto err_free_cq;
+
+ sub_cq_size = cq->ibvcq.cqe;
+ cq->cqn = resp.cq_idx;
+ cq->buf_size = resp.q_mmap_size;
+ cq->num_sub_cqs = num_sub_cqs;
+ cq->cqe_size = ctx->cqe_size;
+
+ pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE);
+
+ cq->buf = mmap(NULL, cq->buf_size, PROT_READ, MAP_SHARED,
+ ibvctx->cmd_fd, resp.q_mmap_key);
+ if (cq->buf == MAP_FAILED)
+ goto err_destroy_spinlock;
+
+ cq->sub_cq_arr = calloc(cq->num_sub_cqs, sizeof(*cq->sub_cq_arr));
+ if (!cq->sub_cq_arr)
+ goto err_unmap;
+
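+	/* Slice the single kernel CQ mapping into per-sub-CQ rings. */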
+ buf = cq->buf;
+ sub_buf_size = cq->cqe_size * sub_cq_size;
+ for (i = 0; i < num_sub_cqs; i++) {
+ efa_sub_cq_initialize(&cq->sub_cq_arr[i], buf, sub_cq_size,
+ cq->cqe_size);
+ buf += sub_buf_size;
+ }
+
+ return &cq->ibvcq;
+
+err_unmap:
+ munmap(cq->buf, cq->buf_size);
+err_destroy_spinlock:
+ pthread_spin_destroy(&cq->lock);
+ ibv_cmd_destroy_cq(&cq->ibvcq);
+err_free_cq:
+ free(cq);
+ return NULL;
+}
+
+int efa_destroy_cq(struct ibv_cq *ibvcq)
+{
+ struct efa_cq *cq = to_efa_cq(ibvcq);
+ int err;
+
+ free(cq->sub_cq_arr);
+ munmap(cq->buf, cq->buf_size);
+
+ pthread_spin_destroy(&cq->lock);
+
+ err = ibv_cmd_destroy_cq(ibvcq);
+ if (err)
+ return err;
+
+ free(cq);
+
+ return 0;
+}
+
+static struct efa_io_cdesc_common *
+cq_next_sub_cqe_get(struct efa_sub_cq *sub_cq)
+{
+ struct efa_io_cdesc_common *cqe;
+ uint32_t current_index;
+
+ current_index = efa_sub_cq_get_current_index(sub_cq);
+ cqe = efa_sub_cq_get_cqe(sub_cq, current_index);
+ if (efa_cqe_is_pending(cqe, sub_cq->phase)) {
+ /* Do not read the rest of the completion entry before the
+ * phase bit has been validated.
+ */
+ udma_from_device_barrier();
+ sub_cq->consumed_cnt++;
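+		/* If the consumer index wrapped around, flip the expected
+		 * phase bit.
+		 */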
+ if (!efa_sub_cq_get_current_index(sub_cq))
+ sub_cq->phase = 1 - sub_cq->phase;
+ return cqe;
+ }
+
+ return NULL;
+}
+
+static enum ibv_wc_status to_ibv_status(enum efa_io_comp_status status)
+{
+ switch (status) {
+ case EFA_IO_COMP_STATUS_OK:
+ return IBV_WC_SUCCESS;
+ case EFA_IO_COMP_STATUS_FLUSHED:
+ return IBV_WC_WR_FLUSH_ERR;
+ case EFA_IO_COMP_STATUS_LOCAL_ERROR_QP_INTERNAL_ERROR:
+ case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_OP_TYPE:
+ case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_AH:
+ return IBV_WC_LOC_QP_OP_ERR;
+ case EFA_IO_COMP_STATUS_LOCAL_ERROR_INVALID_LKEY:
+ return IBV_WC_LOC_PROT_ERR;
+ case EFA_IO_COMP_STATUS_LOCAL_ERROR_BAD_LENGTH:
+ return IBV_WC_LOC_LEN_ERR;
+ case EFA_IO_COMP_STATUS_REMOTE_ERROR_ABORT:
+ return IBV_WC_REM_ABORT_ERR;
+ case EFA_IO_COMP_STATUS_REMOTE_ERROR_RNR:
+ return IBV_WC_RNR_RETRY_EXC_ERR;
+ case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_DEST_QPN:
+ return IBV_WC_REM_INV_RD_REQ_ERR;
+ case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_STATUS:
+ return IBV_WC_BAD_RESP_ERR;
+ case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_LENGTH:
+ return IBV_WC_REM_INV_REQ_ERR;
+ case EFA_IO_COMP_STATUS_REMOTE_ERROR_BAD_ADDRESS:
+ default:
+ return IBV_WC_GENERAL_ERR;
+ }
+}
+
+static int efa_poll_sub_cq(struct efa_cq *cq, struct efa_sub_cq *sub_cq,
+ struct efa_qp **cur_qp, struct ibv_wc *wc)
+{
+ struct efa_context *ctx = to_efa_context(cq->ibvcq.context);
+ struct efa_io_cdesc_common *cqe;
+ uint32_t qpn, wrid_idx;
+ struct efa_wq *wq;
+
+ cqe = cq_next_sub_cqe_get(sub_cq);
+ if (!cqe)
+ return ENOMEM;
+
+ qpn = cqe->qp_num;
+ if (!*cur_qp || qpn != (*cur_qp)->ibvqp.qp_num) {
+ /* We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ *cur_qp = ctx->qp_table[qpn];
+ if (!*cur_qp)
+ return EINVAL;
+ }
+
+ wrid_idx = cqe->req_id;
+ wc->status = to_ibv_status(cqe->status);
+ wc->vendor_err = cqe->status;
+ if (get_efa_io_cdesc_common_q_type(cqe) == EFA_IO_SEND_QUEUE) {
+ wq = &(*cur_qp)->sq.wq;
+ wc->opcode = IBV_WC_SEND;
+ } else {
+ struct efa_io_rx_cdesc *rcqe =
+ container_of(cqe, struct efa_io_rx_cdesc, common);
+
+ wq = &(*cur_qp)->rq.wq;
+
+ wc->byte_len = cqe->length;
+ wc->opcode = IBV_WC_RECV;
+ wc->src_qp = rcqe->src_qp_num;
+ wc->sl = 0;
+ wc->slid = 0;
+ }
+
+ wc->wc_flags = 0;
+ wc->qp_num = qpn;
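+	/* Return the completed wrid index to the free pool (LIFO order). */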
+ wq->wrid_idx_pool_next--;
+ wq->wrid_idx_pool[wq->wrid_idx_pool_next] = wrid_idx;
+ wc->wr_id = wq->wrid[wrid_idx];
+ wq->wqe_completed++;
+
+ return 0;
+}
+
+static int efa_poll_sub_cqs(struct efa_cq *cq, struct ibv_wc *wc)
+{
+ uint16_t num_sub_cqs = cq->num_sub_cqs;
+ struct efa_sub_cq *sub_cq;
+ struct efa_qp *qp = NULL;
+ uint16_t sub_cq_idx;
+ int err = ENOMEM;
+
+ for (sub_cq_idx = 0; sub_cq_idx < num_sub_cqs; sub_cq_idx++) {
+ sub_cq = &cq->sub_cq_arr[cq->next_poll_idx++];
+ cq->next_poll_idx %= num_sub_cqs;
+
+ if (!sub_cq->ref_cnt)
+ continue;
+
+ err = efa_poll_sub_cq(cq, sub_cq, &qp, wc);
+ if (err != ENOMEM)
+ break;
+ }
+
+ return err;
+}
+
+int efa_poll_cq(struct ibv_cq *ibvcq, int nwc, struct ibv_wc *wc)
+{
+ struct efa_cq *cq = to_efa_cq(ibvcq);
+ ssize_t ret = 0;
+ int i;
+
+ pthread_spin_lock(&cq->lock);
+ for (i = 0; i < nwc; i++) {
+ ret = efa_poll_sub_cqs(cq, &wc[i]);
+ if (ret) {
+ if (ret == ENOMEM)
+ ret = 0;
+ break;
+ }
+ }
+ pthread_spin_unlock(&cq->lock);
+
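+	/* Return the number of polled completions; when none were polled,
+	 * return 0 for an empty CQ or a negative errno on failure.
+	 */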
+ return i ?: -ret;
+}
+
+static void efa_cq_inc_ref_cnt(struct efa_cq *cq, uint8_t sub_cq_idx)
+{
+ cq->sub_cq_arr[sub_cq_idx].ref_cnt++;
+}
+
+static void efa_cq_dec_ref_cnt(struct efa_cq *cq, uint8_t sub_cq_idx)
+{
+ cq->sub_cq_arr[sub_cq_idx].ref_cnt--;
+}
+
+static void efa_wq_terminate(struct efa_wq *wq)
+{
+ pthread_spin_destroy(&wq->wqlock);
+ free(wq->wrid_idx_pool);
+ free(wq->wrid);
+}
+
+static int efa_wq_initialize(struct efa_wq *wq)
+{
+ int err;
+ int i;
+
+ wq->wrid = malloc(wq->wqe_cnt * sizeof(*wq->wrid));
+ if (!wq->wrid)
+ return ENOMEM;
+
+ wq->wrid_idx_pool = malloc(wq->wqe_cnt * sizeof(uint32_t));
+ if (!wq->wrid_idx_pool) {
+ err = ENOMEM;
+ goto err_free_wrid;
+ }
+
+ /* Initialize the wrid free indexes pool. */
+ for (i = 0; i < wq->wqe_cnt; i++)
+ wq->wrid_idx_pool[i] = i;
+
+ pthread_spin_init(&wq->wqlock, PTHREAD_PROCESS_PRIVATE);
+
+ return 0;
+
+err_free_wrid:
+ free(wq->wrid);
+
+ return err;
+}
+
+static void efa_sq_terminate(struct efa_qp *qp)
+{
+ void *db_aligned;
+
+ if (!qp->sq.wq.wrid)
+ return;
+
+ db_aligned = (void *)((uintptr_t)qp->sq.db & ~(qp->page_size - 1));
+ munmap(db_aligned, qp->page_size);
+ munmap(qp->sq.desc - qp->sq.desc_offset, qp->sq.desc_ring_mmap_size);
+
+ efa_wq_terminate(&qp->sq.wq);
+}
+
+static int efa_sq_initialize(struct efa_qp *qp, struct efa_create_qp_resp *resp)
+{
+ size_t desc_ring_size;
+ uint8_t *db_base;
+ int err;
+
+ if (!qp->sq.wq.wqe_cnt)
+ return 0;
+
+ err = efa_wq_initialize(&qp->sq.wq);
+ if (err)
+ return err;
+
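+	/* The LLQ descriptor ring is not necessarily page aligned; the kernel
+	 * reports its offset within the first mapped page.
+	 */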
+ qp->sq.desc_offset = resp->llq_desc_offset;
+ desc_ring_size = qp->sq.wq.wqe_cnt * sizeof(struct efa_io_tx_wqe);
+ qp->sq.desc_ring_mmap_size = align(desc_ring_size + qp->sq.desc_offset,
+ qp->page_size);
+ qp->sq.max_inline_data = resp->ibv_resp.max_inline_data;
+
+ qp->sq.desc = mmap(NULL, qp->sq.desc_ring_mmap_size, PROT_WRITE,
+ MAP_SHARED, qp->ibvqp.context->cmd_fd,
+ resp->llq_desc_mmap_key);
+ if (qp->sq.desc == MAP_FAILED)
+ goto err_terminate_wq;
+
+ qp->sq.desc += qp->sq.desc_offset;
+
+ db_base = mmap(NULL, qp->page_size, PROT_WRITE, MAP_SHARED,
+ qp->ibvqp.context->cmd_fd, resp->sq_db_mmap_key);
+ if (db_base == MAP_FAILED)
+ goto err_unmap_desc_ring;
+
+ qp->sq.db = (uint32_t *)(db_base + resp->sq_db_offset);
+ qp->sq.sub_cq_idx = resp->send_sub_cq_idx;
+
+ return 0;
+
+err_unmap_desc_ring:
+ munmap(qp->sq.desc - qp->sq.desc_offset, qp->sq.desc_ring_mmap_size);
+err_terminate_wq:
+ efa_wq_terminate(&qp->sq.wq);
+ return EINVAL;
+}
+
+static void efa_rq_terminate(struct efa_qp *qp)
+{
+ void *db_aligned;
+
+ if (!qp->rq.wq.wrid)
+ return;
+
+ db_aligned = (void *)((uintptr_t)qp->rq.db & ~(qp->page_size - 1));
+ munmap(db_aligned, qp->page_size);
+ munmap(qp->rq.buf, qp->rq.buf_size);
+
+ efa_wq_terminate(&qp->rq.wq);
+}
+
+static int efa_rq_initialize(struct efa_qp *qp, struct efa_create_qp_resp *resp)
+{
+ uint8_t *db_base;
+ int err;
+
+ if (!qp->rq.wq.wqe_cnt)
+ return 0;
+
+ err = efa_wq_initialize(&qp->rq.wq);
+ if (err)
+ return err;
+
+ qp->rq.buf_size = resp->rq_mmap_size;
+ qp->rq.buf = mmap(NULL, qp->rq.buf_size, PROT_WRITE, MAP_SHARED,
+ qp->ibvqp.context->cmd_fd, resp->rq_mmap_key);
+ if (qp->rq.buf == MAP_FAILED)
+ goto err_terminate_wq;
+
+ db_base = mmap(NULL, qp->page_size, PROT_WRITE, MAP_SHARED,
+ qp->ibvqp.context->cmd_fd, resp->rq_db_mmap_key);
+ if (db_base == MAP_FAILED)
+ goto err_unmap_rq_buf;
+
+ qp->rq.db = (uint32_t *)(db_base + resp->rq_db_offset);
+ qp->rq.sub_cq_idx = resp->recv_sub_cq_idx;
+
+ return 0;
+
+err_unmap_rq_buf:
+ munmap(qp->rq.buf, qp->rq.buf_size);
+err_terminate_wq:
+ efa_wq_terminate(&qp->rq.wq);
+ return EINVAL;
+}
+
+static void efa_qp_init_indices(struct efa_qp *qp)
+{
+ qp->sq.wq.wqe_posted = 0;
+ qp->sq.wq.wqe_completed = 0;
+ qp->sq.wq.desc_idx = 0;
+ qp->sq.wq.wrid_idx_pool_next = 0;
+
+ qp->rq.wq.wqe_posted = 0;
+ qp->rq.wq.wqe_completed = 0;
+ qp->rq.wq.desc_idx = 0;
+ qp->rq.wq.wrid_idx_pool_next = 0;
+}
+
+static void efa_setup_qp(struct efa_qp *qp,
+ struct ibv_qp_cap *cap,
+ size_t page_size)
+{
+ uint16_t rq_desc_cnt;
+
+ efa_qp_init_indices(qp);
+
+ qp->sq.wq.wqe_cnt = align_up_queue_size(cap->max_send_wr);
+ qp->sq.wq.max_sge = cap->max_send_sge;
+ qp->sq.wq.desc_mask = qp->sq.wq.wqe_cnt - 1;
+
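+	/* The RQ ring holds one descriptor per SGE, so the WQE count is the
+	 * (power of two) descriptor count divided by the SGEs per WQE.
+	 */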
+ qp->rq.wq.max_sge = cap->max_recv_sge;
+ rq_desc_cnt = align_up_queue_size(cap->max_recv_sge * cap->max_recv_wr);
+ qp->rq.wq.desc_mask = rq_desc_cnt - 1;
+ qp->rq.wq.wqe_cnt = rq_desc_cnt / qp->rq.wq.max_sge;
+
+ qp->page_size = page_size;
+}
+
+static void efa_lock_cqs(struct ibv_qp *ibvqp)
+{
+ struct efa_cq *send_cq = to_efa_cq(ibvqp->send_cq);
+ struct efa_cq *recv_cq = to_efa_cq(ibvqp->recv_cq);
+
+ if (recv_cq == send_cq && recv_cq) {
+ pthread_spin_lock(&recv_cq->lock);
+ } else {
+ if (recv_cq)
+ pthread_spin_lock(&recv_cq->lock);
+ if (send_cq)
+ pthread_spin_lock(&send_cq->lock);
+ }
+}
+
+static void efa_unlock_cqs(struct ibv_qp *ibvqp)
+{
+ struct efa_cq *send_cq = to_efa_cq(ibvqp->send_cq);
+ struct efa_cq *recv_cq = to_efa_cq(ibvqp->recv_cq);
+
+ if (recv_cq == send_cq && recv_cq) {
+ pthread_spin_unlock(&recv_cq->lock);
+ } else {
+ if (recv_cq)
+ pthread_spin_unlock(&recv_cq->lock);
+ if (send_cq)
+ pthread_spin_unlock(&send_cq->lock);
+ }
+}
+
+static int efa_check_qp_attr(struct efa_dev *dev,
+ struct ibv_qp_init_attr *attr)
+{
+ if (!attr->recv_cq || !attr->send_cq)
+ return EINVAL;
+
+ if (attr->srq)
+ return EINVAL;
+
+ return 0;
+}
+
+static int efa_check_qp_limits(struct efa_dev *dev,
+ struct ibv_qp_init_attr *attr)
+{
+ if (attr->cap.max_send_sge > dev->max_sq_sge)
+ return EINVAL;
+
+ if (attr->cap.max_recv_sge > dev->max_rq_sge)
+ return EINVAL;
+
+ if (attr->cap.max_send_wr > dev->max_sq_wr)
+ return EINVAL;
+
+ if (attr->cap.max_recv_wr > dev->max_rq_wr)
+ return EINVAL;
+
+ return 0;
+}
+
+static struct ibv_qp *create_qp(struct ibv_pd *ibvpd,
+ struct ibv_qp_init_attr *attr,
+ uint32_t driver_qp_type)
+{
+ struct efa_context *ctx = to_efa_context(ibvpd->context);
+ struct efa_dev *dev = to_efa_dev(ibvpd->context->device);
+ struct efa_create_qp_resp resp = {};
+ struct efa_create_qp req = {};
+ struct efa_cq *send_cq;
+ struct efa_cq *recv_cq;
+ struct efa_qp *qp;
+ int err;
+
+ err = efa_check_qp_attr(dev, attr);
+ if (err)
+ return NULL;
+
+ err = efa_check_qp_limits(dev, attr);
+ if (err)
+ return NULL;
+
+ qp = calloc(1, sizeof(*qp));
+ if (!qp)
+ return NULL;
+
+ efa_setup_qp(qp, &attr->cap, dev->pg_sz);
+
+ attr->cap.max_send_wr = qp->sq.wq.wqe_cnt;
+ attr->cap.max_recv_wr = qp->rq.wq.wqe_cnt;
+
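+	/* Ring sizes are communicated to the kernel in bytes */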
+ req.rq_ring_size = (qp->rq.wq.desc_mask + 1) *
+ sizeof(struct efa_io_rx_desc);
+	req.sq_ring_size = attr->cap.max_send_wr *
+ sizeof(struct efa_io_tx_wqe);
+ if (attr->qp_type == IBV_QPT_DRIVER)
+ req.driver_qp_type = driver_qp_type;
+
+ if (ibv_cmd_create_qp(ibvpd, &qp->ibvqp, attr, &req.ibv_cmd,
+ sizeof(req), &resp.ibv_resp, sizeof(resp)))
+ goto err_free_qp;
+
+ qp->ibvqp.state = IBV_QPS_RESET;
+ qp->sq_sig_all = attr->sq_sig_all;
+
+ err = efa_rq_initialize(qp, &resp);
+ if (err)
+ goto err_destroy_qp;
+
+ err = efa_sq_initialize(qp, &resp);
+ if (err)
+ goto err_terminate_rq;
+
+ pthread_spin_lock(&ctx->qp_table_lock);
+ ctx->qp_table[qp->ibvqp.qp_num] = qp;
+ pthread_spin_unlock(&ctx->qp_table_lock);
+
+ if (attr->send_cq) {
+ send_cq = to_efa_cq(attr->send_cq);
+ qp->scq = send_cq;
+ pthread_spin_lock(&send_cq->lock);
+ efa_cq_inc_ref_cnt(send_cq, resp.send_sub_cq_idx);
+ pthread_spin_unlock(&send_cq->lock);
+ }
+
+ if (attr->recv_cq) {
+ recv_cq = to_efa_cq(attr->recv_cq);
+ qp->rcq = recv_cq;
+ pthread_spin_lock(&recv_cq->lock);
+ efa_cq_inc_ref_cnt(recv_cq, resp.recv_sub_cq_idx);
+ pthread_spin_unlock(&recv_cq->lock);
+ }
+
+ return &qp->ibvqp;
+
+err_terminate_rq:
+ efa_rq_terminate(qp);
+err_destroy_qp:
+ ibv_cmd_destroy_qp(&qp->ibvqp);
+err_free_qp:
+ free(qp);
+ return NULL;
+}
+
+struct ibv_qp *efa_create_qp(struct ibv_pd *ibvpd,
+ struct ibv_qp_init_attr *attr)
+{
+ if (attr->qp_type != IBV_QPT_UD)
+ return NULL;
+
+ return create_qp(ibvpd, attr, 0);
+}
+
+int efa_modify_qp(struct ibv_qp *ibvqp, struct ibv_qp_attr *attr,
+ int attr_mask)
+{
+ struct efa_qp *qp = to_efa_qp(ibvqp);
+ struct ibv_modify_qp cmd;
+ int err;
+
+ err = ibv_cmd_modify_qp(ibvqp, attr, attr_mask, &cmd, sizeof(cmd));
+ if (err)
+ return err;
+
+ if (attr_mask & IBV_QP_STATE) {
+ qp->ibvqp.state = attr->qp_state;
+ /* transition to reset */
+ if (qp->ibvqp.state == IBV_QPS_RESET)
+ efa_qp_init_indices(qp);
+ }
+
+ return 0;
+}
+
+int efa_query_qp(struct ibv_qp *ibvqp, struct ibv_qp_attr *attr,
+ int attr_mask, struct ibv_qp_init_attr *init_attr)
+{
+ struct ibv_query_qp cmd;
+
+ return ibv_cmd_query_qp(ibvqp, attr, attr_mask, init_attr,
+ &cmd, sizeof(cmd));
+}
+
+int efa_destroy_qp(struct ibv_qp *ibvqp)
+{
+ struct efa_context *ctx = to_efa_context(ibvqp->context);
+ struct efa_qp *qp = to_efa_qp(ibvqp);
+ int err;
+
+ pthread_spin_lock(&ctx->qp_table_lock);
+ efa_lock_cqs(ibvqp);
+
+ if (ibvqp->send_cq)
+ efa_cq_dec_ref_cnt(to_efa_cq(ibvqp->send_cq),
+ qp->sq.sub_cq_idx);
+
+ if (ibvqp->recv_cq)
+ efa_cq_dec_ref_cnt(to_efa_cq(ibvqp->recv_cq),
+ qp->rq.sub_cq_idx);
+
+ ctx->qp_table[ibvqp->qp_num] = NULL;
+
+ efa_unlock_cqs(ibvqp);
+ pthread_spin_unlock(&ctx->qp_table_lock);
+
+ efa_sq_terminate(qp);
+ efa_rq_terminate(qp);
+
+ err = ibv_cmd_destroy_qp(ibvqp);
+ if (err)
+ return err;
+
+ free(qp);
+ return 0;
+}
+
+static void efa_post_send_sgl(struct ibv_send_wr *wr,
+ struct efa_io_tx_wqe *tx_wqe,
+ int *desc_size)
+{
+ struct efa_io_tx_buf_desc *tx_buf;
+ struct ibv_sge *sge;
+ uintptr_t addr;
+ size_t i;
+
+ for (i = 0; i < wr->num_sge; i++) {
+ sge = &wr->sg_list[i];
+ tx_buf = &tx_wqe->data.sgl[i];
+ addr = sge->addr;
+
+ /* Set TX buffer desc from SGE */
+ tx_buf->length = sge->length;
+ tx_buf->lkey = sge->lkey;
+ tx_buf->buf_addr_lo = addr & 0xffffffff;
+ set_efa_io_tx_buf_desc_buf_addr_hi(tx_buf,
+ (uint64_t)addr >> 32);
+ }
+
+ *desc_size += sizeof(*tx_buf) * wr->num_sge;
+}
+
+static void efa_post_send_inline_data(const struct ibv_send_wr *wr,
+ struct efa_io_tx_wqe *tx_wqe,
+ int *desc_size)
+{
+ const struct ibv_sge *sgl = wr->sg_list;
+ uint32_t total_length = 0;
+ uint32_t length;
+ size_t i;
+
+ for (i = 0; i < wr->num_sge; i++) {
+ length = sgl[i].length;
+
+ memcpy(tx_wqe->data.inline_data + total_length,
+ (void *)(uintptr_t)sgl[i].addr, length);
+ total_length += length;
+ }
+
+ *desc_size += total_length;
+
+ set_efa_io_tx_meta_desc_inline_msg(&tx_wqe->common, 1);
+ tx_wqe->common.len = total_length;
+}
+
+static size_t efa_sge_total_bytes(const struct ibv_send_wr *wr)
+{
+ size_t bytes = 0;
+ size_t i;
+
+ for (i = 0; i < wr->num_sge; i++)
+ bytes += wr->sg_list[i].length;
+
+ return bytes;
+}
+
+static ssize_t efa_post_send_validate(struct efa_qp *qp,
+ const struct ibv_send_wr *wr)
+{
+ if (unlikely(qp->ibvqp.state != IBV_QPS_RTS &&
+ qp->ibvqp.state != IBV_QPS_SQD))
+ return EINVAL;
+
+ if (unlikely(wr->opcode != IBV_WR_SEND))
+ return EINVAL;
+
+ if (unlikely(!qp->scq))
+ return EINVAL;
+
+ if (unlikely(wr->num_sge > qp->sq.wq.max_sge))
+ return EINVAL;
+
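+	/* Every SQ WR must request a completion */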
+ if (unlikely(!(wr->send_flags & IBV_SEND_SIGNALED) && !qp->sq_sig_all))
+ return EINVAL;
+
+ if (unlikely(wr->send_flags & ~(IBV_SEND_SIGNALED | IBV_SEND_INLINE)))
+ return EINVAL;
+
+ if (unlikely(wr->send_flags & IBV_SEND_INLINE &&
+ efa_sge_total_bytes(wr) > qp->sq.max_inline_data))
+ return EINVAL;
+
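+	/* The SQ is full when all wqe_cnt slots hold outstanding WQEs;
+	 * the unsigned subtraction is wraparound-safe.
+	 */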
+ if (unlikely(qp->sq.wq.wqe_posted - qp->sq.wq.wqe_completed ==
+ qp->sq.wq.wqe_cnt))
+ return ENOMEM;
+
+ return 0;
+}
+
+int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr,
+ struct ibv_send_wr **bad)
+{
+ struct efa_io_tx_meta_desc *meta_desc;
+ struct efa_qp *qp = to_efa_qp(ibvqp);
+ uint32_t sq_desc_offset, wrid_idx;
+ struct efa_io_tx_wqe tx_wqe;
+ struct efa_ah *ah;
+ int desc_size;
+ int err = 0;
+
+ pthread_spin_lock(&qp->sq.wq.wqlock);
+ while (wr) {
+ desc_size = sizeof(tx_wqe.common) + sizeof(tx_wqe.u);
+
+ err = efa_post_send_validate(qp, wr);
+ if (err) {
+ *bad = wr;
+ goto ring_db;
+ }
+
+ memset(&tx_wqe, 0, sizeof(tx_wqe));
+ meta_desc = &tx_wqe.common;
+ ah = to_efa_ah(wr->wr.ud.ah);
+
+ if (wr->send_flags & IBV_SEND_INLINE) {
+ efa_post_send_inline_data(wr, &tx_wqe, &desc_size);
+ } else {
+ meta_desc->len = wr->num_sge;
+ efa_post_send_sgl(wr, &tx_wqe, &desc_size);
+ }
+
+ /* Get the next wrid to be used from the index pool */
+ wrid_idx = qp->sq.wq.wrid_idx_pool[qp->sq.wq.wrid_idx_pool_next];
+ qp->sq.wq.wrid[wrid_idx] = wr->wr_id;
+ meta_desc->req_id = wrid_idx;
+ qp->sq.wq.wqe_posted++;
+
+		/* Will never overflow the pool, as efa_post_send_validate() succeeded */
+ qp->sq.wq.wrid_idx_pool_next++;
+ assert(qp->sq.wq.wrid_idx_pool_next <= qp->sq.wq.wqe_cnt);
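+		/* Indices are recycled to the pool by the completion path */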
+
+ /* Set rest of the descriptor fields */
+ set_efa_io_tx_meta_desc_meta_desc(meta_desc, 1);
+ set_efa_io_tx_meta_desc_phase(meta_desc, qp->sq.wq.phase);
+ set_efa_io_tx_meta_desc_first(meta_desc, 1);
+ set_efa_io_tx_meta_desc_last(meta_desc, 1);
+ meta_desc->dest_qp_num = wr->wr.ud.remote_qpn;
+ set_efa_io_tx_meta_desc_comp_req(meta_desc, 1);
+ meta_desc->ah = ah->efa_ah;
+ tx_wqe.u.ud.qkey = wr->wr.ud.remote_qkey;
+
+ /* Copy descriptor */
+ sq_desc_offset = (qp->sq.wq.desc_idx & qp->sq.wq.desc_mask) *
+ sizeof(tx_wqe);
+ memcpy(qp->sq.desc + sq_desc_offset, &tx_wqe, desc_size);
+
+		/* Advance the index; the phase bit flips each time the ring wraps */
+ qp->sq.wq.desc_idx++;
+ if (!(qp->sq.wq.desc_idx & qp->sq.wq.desc_mask))
+ qp->sq.wq.phase++;
+
+ wr = wr->next;
+ }
+
+ring_db:
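+	/* Make the WQE writes visible to the device before ringing the doorbell */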
+ udma_to_device_barrier();
+ mmio_write32(qp->sq.db, qp->sq.wq.desc_idx);
+
+ pthread_spin_unlock(&qp->sq.wq.wqlock);
+ return err;
+}
+
+static ssize_t efa_post_recv_validate(struct efa_qp *qp, struct ibv_recv_wr *wr)
+{
+ if (unlikely(qp->ibvqp.state == IBV_QPS_RESET ||
+ qp->ibvqp.state == IBV_QPS_ERR))
+ return EINVAL;
+
+ if (unlikely(!qp->rcq))
+ return EINVAL;
+
+ if (unlikely(wr->num_sge > qp->rq.wq.max_sge))
+ return EINVAL;
+
+ if (unlikely(qp->rq.wq.wqe_posted - qp->rq.wq.wqe_completed ==
+ qp->rq.wq.wqe_cnt))
+ return ENOMEM;
+
+ return 0;
+}
+
+int efa_post_recv(struct ibv_qp *ibvqp, struct ibv_recv_wr *wr,
+ struct ibv_recv_wr **bad)
+{
+ struct efa_qp *qp = to_efa_qp(ibvqp);
+ uint32_t wqe_index, rq_desc_offset;
+ struct efa_io_rx_desc rx_buf;
+ uintptr_t addr;
+ int err = 0;
+ size_t i;
+
+ pthread_spin_lock(&qp->rq.wq.wqlock);
+ while (wr) {
+ err = efa_post_recv_validate(qp, wr);
+ if (err) {
+ *bad = wr;
+ goto ring_db;
+ }
+
+ memset(&rx_buf, 0, sizeof(rx_buf));
+
+		/* Save the wrid, taking the next free index from the pool */
+ wqe_index = qp->rq.wq.wrid_idx_pool[qp->rq.wq.wrid_idx_pool_next];
+ qp->rq.wq.wrid[wqe_index] = wr->wr_id;
+ rx_buf.req_id = wqe_index;
+ qp->rq.wq.wqe_posted++;
+
+		/* Will never overflow the pool, as efa_post_recv_validate() succeeded */
+ qp->rq.wq.wrid_idx_pool_next++;
+ assert(qp->rq.wq.wrid_idx_pool_next <= qp->rq.wq.wqe_cnt);
+
+ /* Default init of the rx buffer */
+ set_efa_io_rx_desc_first(&rx_buf, 1);
+ set_efa_io_rx_desc_last(&rx_buf, 0);
+
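+		/* Each SGE becomes its own RX descriptor; the first/last
+		 * flags delimit the boundaries of one WQE.
+		 */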
+ for (i = 0; i < wr->num_sge; i++) {
+			/* Set the last indication if needed */
+ if (i == wr->num_sge - 1)
+ set_efa_io_rx_desc_last(&rx_buf, 1);
+
+ addr = wr->sg_list[i].addr;
+
+ /* Set RX buffer desc from SGE */
+ rx_buf.length = wr->sg_list[i].length;
+ set_efa_io_rx_desc_lkey(&rx_buf, wr->sg_list[i].lkey);
+ rx_buf.buf_addr_lo = addr;
+ rx_buf.buf_addr_hi = (uint64_t)addr >> 32;
+
+ /* Copy descriptor to RX ring */
+ rq_desc_offset = (qp->rq.wq.desc_idx & qp->rq.wq.desc_mask) * sizeof(rx_buf);
+ memcpy(qp->rq.buf + rq_desc_offset, &rx_buf, sizeof(rx_buf));
+
+			/* Advance the index; the phase bit flips each time the ring wraps */
+ qp->rq.wq.desc_idx++;
+ if (!(qp->rq.wq.desc_idx & qp->rq.wq.desc_mask))
+ qp->rq.wq.phase++;
+
+			/* Reset the descriptor for the next SGE */
+ memset(&rx_buf, 0, sizeof(rx_buf));
+ }
+ wr = wr->next;
+ }
+
+ring_db:
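+	/* Order the descriptor writes before the doorbell write */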
+ udma_to_device_barrier();
+ mmio_write32(qp->rq.db, qp->rq.wq.desc_idx);
+
+ pthread_spin_unlock(&qp->rq.wq.wqlock);
+ return err;
+}
+
+struct ibv_ah *efa_create_ah(struct ibv_pd *ibvpd, struct ibv_ah_attr *attr)
+{
+ struct efa_create_ah_resp resp = {};
+ struct efa_ah *ah;
+ int err;
+
+ ah = calloc(1, sizeof(*ah));
+ if (!ah)
+ return NULL;
+
+ err = ibv_cmd_create_ah(ibvpd, &ah->ibvah, attr,
+ &resp.ibv_resp, sizeof(resp));
+ if (err) {
+ free(ah);
+ return NULL;
+ }
+
+ ah->efa_ah = resp.efa_address_handle;
+
+ return &ah->ibvah;
+}
+
+int efa_destroy_ah(struct ibv_ah *ibvah)
+{
+ struct efa_ah *ah;
+ int err;
+
+ ah = to_efa_ah(ibvah);
+ err = ibv_cmd_destroy_ah(ibvah);
+ if (err)
+ return err;
+ free(ah);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,72 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __EFA_VERBS_H__
+#define __EFA_VERBS_H__
+
+#include <infiniband/driver.h>
+#include <infiniband/verbs.h>
+
+int efa_query_device(struct ibv_context *uctx, struct ibv_device_attr *attr);
+int efa_query_port(struct ibv_context *uctx, uint8_t port,
+ struct ibv_port_attr *attr);
+int efa_query_device_ex(struct ibv_context *context,
+ const struct ibv_query_device_ex_input *input,
+ struct ibv_device_attr_ex *attr, size_t attr_size);
+struct ibv_pd *efa_alloc_pd(struct ibv_context *uctx);
+int efa_dealloc_pd(struct ibv_pd *ibvpd);
+struct ibv_mr *efa_reg_mr(struct ibv_pd *ibvpd, void *buf, size_t len,
+ int ibv_access_flags);
+int efa_dereg_mr(struct verbs_mr *vmr);
+
+struct ibv_cq *efa_create_cq(struct ibv_context *uctx, int ncqe,
+ struct ibv_comp_channel *ch, int vec);
+int efa_destroy_cq(struct ibv_cq *ibvcq);
+int efa_poll_cq(struct ibv_cq *ibvcq, int nwc, struct ibv_wc *wc);
+
+struct ibv_qp *efa_create_qp(struct ibv_pd *ibvpd,
+ struct ibv_qp_init_attr *attr);
+int efa_modify_qp(struct ibv_qp *ibvqp, struct ibv_qp_attr *attr,
+ int ibv_qp_attr_mask);
+int efa_query_qp(struct ibv_qp *ibvqp, struct ibv_qp_attr *attr, int attr_mask,
+ struct ibv_qp_init_attr *init_attr);
+int efa_destroy_qp(struct ibv_qp *ibvqp);
+int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr,
+ struct ibv_send_wr **bad);
+int efa_post_recv(struct ibv_qp *ibvqp, struct ibv_recv_wr *wr,
+ struct ibv_recv_wr **bad);
+
+struct ibv_ah *efa_create_ah(struct ibv_pd *ibvpd, struct ibv_ah_attr *attr);
+int efa_destroy_ah(struct ibv_ah *ibvah);
+
+#endif /* __EFA_VERBS_H__ */
@@ -119,6 +119,8 @@ Provides: libocrdma = %{version}-%{release}
Obsoletes: libocrdma < %{version}-%{release}
Provides: librxe = %{version}-%{release}
Obsoletes: librxe < %{version}-%{release}
+Provides: libefa = %{version}-%{release}
+Obsoletes: libefa < %{version}-%{release}
%description -n libibverbs
libibverbs is a library that allows userspace processes to use RDMA
@@ -143,6 +145,7 @@ Device-specific plug-in ibverbs userspace drivers are included:
- libqedr: QLogic QL4xxx RoCE HCA
- librxe: A software implementation of the RoCE protocol
- libvmw_pvrdma: VMware paravirtual RDMA device
+- libefa: Amazon Elastic Fabric Adapter
%package -n libibverbs-utils
Summary: Examples for the libibverbs library
@@ -177,6 +177,7 @@ Obsoletes: libmthca-rdmav2 < %{version}-%{release}
Obsoletes: libnes-rdmav2 < %{version}-%{release}
Obsoletes: libocrdma-rdmav2 < %{version}-%{release}
Obsoletes: librxe-rdmav2 < %{version}-%{release}
+Obsoletes: libefa-rdmav2 < %{version}-%{release}
%if 0%{?dma_coherent}
Requires: %{mlx4_lname} = %{version}-%{release}
Requires: %{mlx5_lname} = %{version}-%{release}
@@ -208,6 +209,7 @@ Device-specific plug-in ibverbs userspace drivers are included:
- libqedr: QLogic QL4xxx RoCE HCA
- librxe: A software implementation of the RoCE protocol
- libvmw_pvrdma: VMware paravirtual RDMA device
+- libefa: Amazon Elastic Fabric Adapter
%package -n %verbs_lname
Summary: Ibverbs runtime library
Introduce a provider that exposes EFA devices to user applications
via the verbs interface.

Signed-off-by: Gal Pressman <galpress@amazon.com>
---
 CMakeLists.txt                             |    1 +
 MAINTAINERS                                |    5 +
 README.md                                  |    1 +
 debian/control                             |    9 +-
 debian/copyright                           |    4 +
 kernel-headers/CMakeLists.txt              |    2 +
 kernel-headers/rdma/efa-abi.h              |  129 ++++
 kernel-headers/rdma/rdma_user_ioctl_cmds.h |    1 +
 libibverbs/verbs.h                         |    1 +
 providers/efa/CMakeLists.txt               |    4 +
 providers/efa/efa-abi.h                    |   56 ++
 providers/efa/efa.c                        |  155 ++++
 providers/efa/efa.h                        |  200 +++++
 providers/efa/efa_io_defs.h                |  679 ++++++++++++++++
 providers/efa/verbs.c                      | 1142 ++++++++++++++++++++++++++++
 providers/efa/verbs.h                      |   72 ++
 redhat/rdma-core.spec                      |    3 +
 suse/rdma-core.spec                        |    2 +
 18 files changed, 2463 insertions(+), 3 deletions(-)
 create mode 100644 kernel-headers/rdma/efa-abi.h
 create mode 100644 providers/efa/CMakeLists.txt
 create mode 100644 providers/efa/efa-abi.h
 create mode 100644 providers/efa/efa.c
 create mode 100644 providers/efa/efa.h
 create mode 100644 providers/efa/efa_io_defs.h
 create mode 100644 providers/efa/verbs.c
 create mode 100644 providers/efa/verbs.h
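
For reference, a minimal sketch of how an application reaches these entry
points through plain libibverbs. The device index, queue sizes, and CQ depth
below are illustrative, not requirements, and error handling is mostly elided:

/* Illustrative only: open the first RDMA device (assumed here to be an
 * EFA device) and create a UD QP via the standard verbs API.
 */
#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
	struct ibv_qp_init_attr attr = {};
	struct ibv_device **dev_list;
	struct ibv_context *ctx;
	struct ibv_pd *pd;
	struct ibv_cq *cq;
	struct ibv_qp *qp;

	dev_list = ibv_get_device_list(NULL);
	if (!dev_list || !dev_list[0])
		return 1;

	ctx = ibv_open_device(dev_list[0]);
	pd = ibv_alloc_pd(ctx);
	cq = ibv_create_cq(ctx, 256, NULL, NULL, 0);

	attr.send_cq = cq;		/* both CQs are mandatory, see efa_check_qp_attr() */
	attr.recv_cq = cq;
	attr.cap.max_send_wr = 64;
	attr.cap.max_recv_wr = 64;
	attr.cap.max_send_sge = 1;
	attr.cap.max_recv_sge = 1;
	attr.qp_type = IBV_QPT_UD;	/* the only type efa_create_qp() accepts */
	attr.sq_sig_all = 1;		/* EFA requires completions on all SQ WRs */

	qp = ibv_create_qp(pd, &attr);	/* dispatches to efa_create_qp() */
	printf("qp_num: %u\n", qp ? qp->qp_num : 0);

	if (qp)
		ibv_destroy_qp(qp);
	ibv_destroy_cq(cq);
	ibv_dealloc_pd(pd);
	ibv_close_device(ctx);
	ibv_free_device_list(dev_list);
	return 0;
}

From here, posting traffic would go through ibv_post_send()/ibv_post_recv(),
which land in efa_post_send() and efa_post_recv() above.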