diff mbox series

[rdma-core,11/27] mlx5: Enable interrupt command mode over vfio

Message ID 20210720081647.1980-12-yishaih@nvidia.com (mailing list archive)
State Not Applicable
Headers show
Series Introduce mlx5 user space driver over VFIO | expand

Commit Message

Yishai Hadas July 20, 2021, 8:16 a.m. UTC
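Enable interrupt-driven command completion over VFIO by creating an event
queue (EQ) and the device resources it depends on (a UAR page for the EQ
doorbell and the command-completion MSI-X vector).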

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
---
 providers/mlx5/mlx5_ifc.h  | 150 ++++++++++++++++++
 providers/mlx5/mlx5_vfio.c | 373 ++++++++++++++++++++++++++++++++++++++++++++-
 providers/mlx5/mlx5_vfio.h |  65 ++++++++
 3 files changed, 582 insertions(+), 6 deletions(-)
diff mbox series

Patch

diff --git a/providers/mlx5/mlx5_ifc.h b/providers/mlx5/mlx5_ifc.h
index 4b7a4c2..2129779 100644
--- a/providers/mlx5/mlx5_ifc.h
+++ b/providers/mlx5/mlx5_ifc.h
@@ -51,6 +51,8 @@  enum {
 	MLX5_CMD_OP_QUERY_ISSI = 0x10a,
 	MLX5_CMD_OP_SET_ISSI = 0x10b,
 	MLX5_CMD_OP_CREATE_MKEY = 0x200,
+	MLX5_CMD_OP_CREATE_EQ = 0x301,
+	MLX5_CMD_OP_DESTROY_EQ = 0x302,
 	MLX5_CMD_OP_CREATE_QP = 0x500,
 	MLX5_CMD_OP_RST2INIT_QP = 0x502,
 	MLX5_CMD_OP_INIT2RTR_QP = 0x503,
@@ -65,6 +67,8 @@  enum {
 	MLX5_CMD_OP_QUERY_NIC_VPORT_CONTEXT = 0x754,
 	MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT = 0x755,
 	MLX5_CMD_OP_QUERY_ROCE_ADDRESS = 0x760,
+	MLX5_CMD_OP_ALLOC_UAR = 0x802,
+	MLX5_CMD_OP_DEALLOC_UAR = 0x803,
 	MLX5_CMD_OP_ACCESS_REG = 0x805,
 	MLX5_CMD_OP_QUERY_LAG = 0x842,
 	MLX5_CMD_OP_CREATE_TIR = 0x900,
@@ -118,6 +122,15 @@  enum {
 	MLX5_CAP_PORT_TYPE_ETH = 0x1,
 };
 
+enum mlx5_event { /* event type codes compared against the type field of an EQE */
+	MLX5_EVENT_TYPE_CMD = 0x0a, /* command completion; also used as a bit index in the EQ event mask */
+	MLX5_EVENT_TYPE_PAGE_REQUEST = 0xb, /* no handler in the code shown in this patch */
+};
+
+enum {
+	MLX5_EQ_DOORBEL_OFFSET = 0x40, /* byte offset added to the EQ UAR address to locate the EQ doorbell */
+};
+
 struct mlx5_ifc_atomic_caps_bits {
 	u8         reserved_at_0[0x40];
 
@@ -4434,4 +4447,141 @@  struct mlx5_ifc_set_hca_cap_in_bits {
 	union mlx5_ifc_hca_cap_union_bits capability;
 };
 
+struct mlx5_ifc_alloc_uar_out_bits { /* ALLOC_UAR output; each array length is a field width in bits */
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x8];
+	u8         uar[0x18]; /* index of the allocated UAR, read back by mlx5_vfio_alloc_uar() */
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_alloc_uar_in_bits { /* ALLOC_UAR input; each array length is a field width in bits */
+	u8         opcode[0x10]; /* set to MLX5_CMD_OP_ALLOC_UAR by the caller */
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_dealloc_uar_out_bits { /* DEALLOC_UAR output; each array length is a field width in bits */
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_dealloc_uar_in_bits { /* DEALLOC_UAR input; each array length is a field width in bits */
+	u8         opcode[0x10]; /* set to MLX5_CMD_OP_DEALLOC_UAR by the caller */
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x8];
+	u8         uar[0x18]; /* index of the UAR to release */
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_eqc_bits { /* EQ context embedded in CREATE_EQ; widths are in bits and sum to 0x200 */
+	u8         status[0x4];
+	u8         reserved_at_4[0x9];
+	u8         ec[0x1];
+	u8         oi[0x1];
+	u8         reserved_at_f[0x5];
+	u8         st[0x4];
+	u8         reserved_at_18[0x8];
+
+	u8         reserved_at_20[0x20];
+
+	u8         reserved_at_40[0x14];
+	u8         page_offset[0x6];
+	u8         reserved_at_5a[0x6];
+
+	u8         reserved_at_60[0x3];
+	u8         log_eq_size[0x5]; /* create_map_eq() stores ilog32(nent - 1) here */
+	u8         uar_page[0x18]; /* UAR index whose page holds this EQ's doorbell */
+
+	u8         reserved_at_80[0x20];
+
+	u8         reserved_at_a0[0x18];
+	u8         intr[0x8]; /* interrupt vector index taken from mlx5_eq_param.irq_index */
+
+	u8         reserved_at_c0[0x3];
+	u8         log_page_size[0x5]; /* create_map_eq() stores ilog32(iova_size - 1) - MLX5_ADAPTER_PAGE_SHIFT */
+	u8         reserved_at_c8[0x18];
+
+	u8         reserved_at_e0[0x60];
+
+	u8         reserved_at_140[0x8];
+	u8         consumer_counter[0x18];
+
+	u8         reserved_at_160[0x8];
+	u8         producer_counter[0x18];
+
+	u8         reserved_at_180[0x80];
+};
+
+struct mlx5_ifc_create_eq_out_bits { /* CREATE_EQ output; each array length is a field width in bits */
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x18];
+	u8         eq_number[0x8]; /* EQ number assigned to the new queue; saved in mlx5_eq.eqn */
+
+	u8         reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_create_eq_in_bits { /* CREATE_EQ input; each array length is a field width in bits */
+	u8         opcode[0x10]; /* set to MLX5_CMD_OP_CREATE_EQ by the caller */
+	u8         uid[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x40];
+
+	struct mlx5_ifc_eqc_bits eq_context_entry;
+
+	u8         reserved_at_280[0x40];
+
+	u8         event_bitmask[4][0x40]; /* four 64-bit words filled from mlx5_eq_param.mask[] */
+
+	u8         reserved_at_3c0[0x4c0];
+
+	u8         pas[][0x40]; /* trailing 64-bit page addresses; create_map_eq() supplies one entry, the EQ buffer IOVA */
+};
+
+struct mlx5_ifc_destroy_eq_out_bits { /* DESTROY_EQ output; each array length is a field width in bits */
+	u8         status[0x8];
+	u8         reserved_at_8[0x18];
+
+	u8         syndrome[0x20];
+
+	u8         reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_destroy_eq_in_bits { /* DESTROY_EQ input; each array length is a field width in bits */
+	u8         opcode[0x10]; /* set to MLX5_CMD_OP_DESTROY_EQ by the caller */
+	u8         reserved_at_10[0x10];
+
+	u8         reserved_at_20[0x10];
+	u8         op_mod[0x10];
+
+	u8         reserved_at_40[0x18];
+	u8         eq_number[0x8]; /* number of the EQ to destroy (mlx5_eq.eqn) */
+
+	u8         reserved_at_60[0x20];
+};
+
 #endif /* MLX5_IFC_H */
diff --git a/providers/mlx5/mlx5_vfio.c b/providers/mlx5/mlx5_vfio.c
index 97d3ce6..dbb9858 100644
--- a/providers/mlx5/mlx5_vfio.c
+++ b/providers/mlx5/mlx5_vfio.c
@@ -19,6 +19,7 @@ 
 #include <linux/vfio.h>
 #include <sys/eventfd.h>
 #include <sys/ioctl.h>
+#include <poll.h>
 #include <util/mmio.h>
 
 #include "mlx5dv.h"
@@ -26,6 +27,10 @@ 
 #include "mlx5.h"
 #include "mlx5_ifc.h"
 
+enum {
+	MLX5_VFIO_CMD_VEC_IDX, /* index of the interrupt vector (and eventfd slot) used for command completions */
+};
+
 static void mlx5_vfio_free_cmd_msg(struct mlx5_vfio_context *ctx,
 				   struct mlx5_cmd_msg *msg);
 
@@ -223,6 +228,37 @@  static const char *cmd_status_str(uint8_t status)
 	}
 }
 
+/* Return a pointer to entry number @entry of the EQ buffer at eq->vaddr. */
+static struct mlx5_eqe *get_eqe(struct mlx5_eq *eq, uint32_t entry)
+{
+	struct mlx5_eqe *first = eq->vaddr;
+
+	return &first[entry];
+}
+
+/*
+ * Look at the EQE that lies @cc entries past the current consumer index.
+ *
+ * The entry is returned only when the low bit of its owner byte equals the
+ * "wrap" bit of the index (index & nent); otherwise NULL is returned.
+ * A read barrier is issued before a valid entry is handed to the caller.
+ */
+static struct mlx5_eqe *mlx5_eq_get_eqe(struct mlx5_eq *eq, uint32_t cc)
+{
+	uint32_t ci = eq->cons_index + cc;
+	struct mlx5_eqe *eqe = get_eqe(eq, ci & (eq->nent - 1));
+	int owner_bit = eqe->owner & 1;
+	int wrap_bit = !!(ci & eq->nent);
+
+	if (owner_bit != wrap_bit)
+		return NULL;
+
+	udma_from_device_barrier();
+	return eqe;
+}
+
+/*
+ * Advance the consumer index of @eq and write it to the EQ doorbell.
+ *
+ * @eq:  event queue to update
+ * @cc:  number of EQEs consumed since the previous update; added to
+ *       eq->cons_index before the doorbell value is built
+ * @arm: non-zero writes the first doorbell word, zero writes the word two
+ *       __be32 slots (8 bytes) after it
+ *       NOTE(review): presumably the first word also re-arms the EQ for the
+ *       next interrupt, as the parameter name suggests - confirm.
+ *
+ * The doorbell value carries the low 24 bits of the consumer index and the
+ * EQ number in the top 8 bits.
+ */
+static void eq_update_ci(struct mlx5_eq *eq, uint32_t cc, int arm)
+{
+	__be32 *addr = eq->doorbell + (arm ? 0 : 2);
+	uint32_t val;
+
+	eq->cons_index += cc;
+	/*
+	 * eq->eqn is a uint8_t, so it is promoted to a signed int; shifting an
+	 * EQ number >= 0x80 left by 24 would overflow that int (undefined
+	 * behaviour).  Widen to uint32_t before shifting.
+	 */
+	val = (eq->cons_index & 0xffffff) | ((uint32_t)eq->eqn << 24);
+
+	mmio_write32_be(addr, htobe32(val));
+	udma_to_device_barrier();
+}
+
 static void mlx5_cmd_mbox_status(void *out, uint8_t *status, uint32_t *syndrome)
 {
 	*status = DEVX_GET(mbox_out, out, status);
@@ -315,6 +351,85 @@  static int mlx5_copy_to_msg(struct mlx5_cmd_msg *to, void *from, int size,
 	return 0;
 }
 
+/*
+ * Keep the device informed of our progress while draining an EQ.
+ *
+ * EQs are created with MLX5_NUM_SPARE_EQE entries on top of the requested
+ * size, so the consumer index has to be published at least once per that
+ * many consumed entries or the HCA will consider the queue overflowed.
+ *
+ * Returns the number of consumed-but-unpublished entries: @cc unchanged while
+ * it is below the threshold, 0 after the index has been written (not armed).
+ */
+static inline uint32_t mlx5_eq_update_cc(struct mlx5_eq *eq, uint32_t cc)
+{
+	if (unlikely(cc >= MLX5_NUM_SPARE_EQE)) {
+		eq_update_ci(eq, cc, 0);
+		return 0;
+	}
+
+	return cc;
+}
+
+/*
+ * Default per-slot completion callback: signal the slot's completion
+ * eventfd by writing the 64-bit value 1 to it.
+ *
+ * Returns 0 when all eight bytes were written, -1 otherwise.
+ */
+static int mlx5_vfio_cmd_comp(struct mlx5_vfio_context *ctx, unsigned long slot)
+{
+	int fd = ctx->cmd.cmds[slot].completion_event_fd;
+	uint64_t one = 1;
+	ssize_t written = write(fd, &one, sizeof(one));
+
+	return written == sizeof(one) ? 0 : -1;
+}
+
+/*
+ * Handle a command-completion EQE.
+ *
+ * The EQE carries a big-endian 32-bit vector with one bit per command slot.
+ * For every set bit the slot's comp_func callback is invoked.
+ *
+ * @ctx: device context owning the command slots
+ * @eqe: event queue entry whose data is interpreted as struct mlx5_eqe_cmd
+ *
+ * Returns 0 on success, or the first non-zero value returned by a callback
+ * (remaining slots are then left unprocessed).
+ */
+static int mlx5_vfio_process_cmd_eqe(struct mlx5_vfio_context *ctx,
+				     struct mlx5_eqe *eqe)
+{
+	struct mlx5_eqe_cmd *cmd_eqe = &eqe->data.cmd;
+	unsigned long vector = be32toh(cmd_eqe->vector);
+	unsigned long slot;
+	int count = 0;
+	int ret;
+
+	for (slot = 0; slot < MLX5_MAX_COMMANDS; slot++) {
+		/*
+		 * Build the mask as unsigned long: "1 << slot" is a signed
+		 * int shift and is undefined once slot reaches bit 31.
+		 */
+		unsigned long slot_bit = 1UL << slot;
+
+		if (!(vector & slot_bit))
+			continue;
+
+		assert(ctx->cmd.cmds[slot].comp_func);
+		ret = ctx->cmd.cmds[slot].comp_func(ctx, slot);
+		if (ret)
+			return ret;
+
+		vector &= ~slot_bit;
+		count++;
+	}
+
+	/* Every reported bit must map to a slot, and at least one must be set. */
+	assert(!vector && count);
+	return 0;
+}
+
+/*
+ * Drain the async EQ under ctx->eq_lock.
+ *
+ * Each available EQE is consumed in turn; command-completion entries are
+ * passed to mlx5_vfio_process_cmd_eqe(), all other types are only consumed.
+ * Draining stops when no further entry is available or a handler fails.
+ * In both cases the consumer index is published with the arm flag set before
+ * the lock is dropped.
+ *
+ * Returns 0, or the error reported by the failing handler.
+ */
+static int mlx5_vfio_process_async_events(struct mlx5_vfio_context *ctx)
+{
+	struct mlx5_eq *eq = &ctx->async_eq;
+	struct mlx5_eqe *eqe;
+	int consumed = 0;
+	int err = 0;
+
+	pthread_mutex_lock(&ctx->eq_lock);
+	for (;;) {
+		eqe = mlx5_eq_get_eqe(eq, consumed);
+		if (!eqe)
+			break;
+
+		if (eqe->type == MLX5_EVENT_TYPE_CMD)
+			err = mlx5_vfio_process_cmd_eqe(ctx, eqe);
+
+		/* The entry counts as consumed even when its handler failed. */
+		consumed = mlx5_eq_update_cc(eq, consumed + 1);
+		if (err)
+			break;
+	}
+
+	eq_update_ci(eq, consumed, 1);
+	pthread_mutex_unlock(&ctx->eq_lock);
+	return err;
+}
+
 static int mlx5_vfio_enlarge_cmd_msg(struct mlx5_vfio_context *ctx, struct mlx5_cmd_msg *cmd_msg,
 				     struct mlx5_cmd_layout *cmd_lay, uint32_t len, bool is_in)
 {
@@ -333,6 +448,49 @@  static int mlx5_vfio_enlarge_cmd_msg(struct mlx5_vfio_context *ctx, struct mlx5_
 	return 0;
 }
 
+/*
+ * Block until the command posted in @slot has completed (interrupt mode).
+ *
+ * Two descriptors are polled without a timeout:
+ *  - ctx->cmd_comp_fd: when readable, its counter is read and the async EQ
+ *    is drained via mlx5_vfio_process_async_events();
+ *  - the slot's completion_event_fd: when readable, its counter is read and
+ *    the command layout's ownership bit is checked; the wait ends once the
+ *    low bit of status_own is clear.
+ *
+ * @ctx:  device context
+ * @slot: index of the command slot being waited on
+ *
+ * Returns 0 on completion, an errno value if poll()/read() fails, or the
+ * error returned by EQ processing.
+ */
+static int mlx5_vfio_wait_event(struct mlx5_vfio_context *ctx,
+				unsigned int slot)
+{
+	struct mlx5_cmd_layout *cmd_lay = ctx->cmd.cmds[slot].lay;
+	uint64_t u;
+	ssize_t s;
+	int err;
+
+	struct pollfd fds[2] = {
+		{ .fd = ctx->cmd_comp_fd, .events = POLLIN },
+		{ .fd = ctx->cmd.cmds[slot].completion_event_fd, .events = POLLIN }
+		};
+
+	while (true) {
+		err = poll(fds, 2, -1);
+		if (err < 0) {
+			/* Save errno before logging can overwrite it. */
+			err = errno;
+			/*
+			 * A signal or a transient failure is not an error of
+			 * the command; revents is not valid here, so retry
+			 * instead of inspecting it.
+			 */
+			if (err == EINTR || err == EAGAIN)
+				continue;
+			mlx5_err(ctx->dbg_fp, "mlx5_vfio_wait_event, poll failed, errno=%d\n", err);
+			return err;
+		}
+		if (fds[0].revents & POLLIN) {
+			s = read(fds[0].fd, &u, sizeof(uint64_t));
+			if (s < 0 && errno != EAGAIN) {
+				err = errno;
+				mlx5_err(ctx->dbg_fp, "mlx5_vfio_wait_event, read failed, errno=%d\n", err);
+				return err;
+			}
+
+			err = mlx5_vfio_process_async_events(ctx);
+			if (err)
+				return err;
+		}
+		if (fds[1].revents & POLLIN) {
+			s = read(fds[1].fd, &u, sizeof(uint64_t));
+			if (s < 0 && errno != EAGAIN) {
+				err = errno;
+				mlx5_err(ctx->dbg_fp, "mlx5_vfio_wait_event, read failed, slot=%u, errno=%d\n",
+					 slot, err);
+				return err;
+			}
+			if (!(mmio_read8(&cmd_lay->status_own) & 0x1)) {
+				/*
+				 * Match the polling path in
+				 * mlx5_vfio_cmd_exec(), which issues this
+				 * barrier before the output is copied out.
+				 */
+				udma_from_device_barrier();
+				return 0;
+			}
+		}
+	}
+}
+
 /* One minute for the sake of bringup */
 #define MLX5_CMD_TIMEOUT_MSEC (60 * 1000)
 
@@ -430,10 +588,17 @@  static int mlx5_vfio_cmd_exec(struct mlx5_vfio_context *ctx, void *in,
 	udma_to_device_barrier();
 	mmio_write32_be(&init_seg->cmd_dbell, htobe32(0x1 << slot));
 
-	err = mlx5_vfio_poll_timeout(cmd_lay);
-	if (err)
-		goto end;
-	udma_from_device_barrier();
+	if (ctx->have_eq) {
+		err = mlx5_vfio_wait_event(ctx, slot);
+		if (err)
+			goto end;
+	} else {
+		err = mlx5_vfio_poll_timeout(cmd_lay);
+		if (err)
+			goto end;
+		udma_from_device_barrier();
+	}
+
 	err = mlx5_copy_from_msg(out, cmd_out, olen, cmd_lay);
 	if (err)
 		goto end;
@@ -608,6 +773,9 @@  static int mlx5_vfio_setup_cmd_slot(struct mlx5_vfio_context *ctx, int slot)
 		goto err_fd;
 	}
 
+	if (slot != MLX5_MAX_COMMANDS - 1)
+		cmd_slot->comp_func = mlx5_vfio_cmd_comp;
+
 	pthread_mutex_init(&cmd_slot->lock, NULL);
 
 	return 0;
@@ -889,7 +1057,7 @@  mlx5_vfio_enable_msix(struct mlx5_vfio_context *ctx)
 	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
 	irq_set->start = 0;
 	fd_ptr = (int *)&irq_set->data;
-	fd_ptr[0] = ctx->cmd_comp_fd;
+	fd_ptr[MLX5_VFIO_CMD_VEC_IDX] = ctx->cmd_comp_fd;
 
 	return ioctl(ctx->device_fd, VFIO_DEVICE_SET_IRQS, irq_set);
 }
@@ -907,7 +1075,7 @@  static int mlx5_vfio_init_async_fd(struct mlx5_vfio_context *ctx)
 		return -1;
 
 	/* set up an eventfd for command completion interrupts */
-	ctx->cmd_comp_fd = eventfd(0, EFD_CLOEXEC);
+	ctx->cmd_comp_fd = eventfd(0, EFD_CLOEXEC | O_NONBLOCK);
 	if (ctx->cmd_comp_fd < 0)
 		return -1;
 
@@ -988,6 +1156,193 @@  close_cont:
 	return -1;
 }
 
+enum {
+	MLX5_EQE_OWNER_INIT_VAL = 0x1, /* owner byte written to every EQE by init_eq_buf() before the EQ is created */
+};
+
+/* Stamp the owner byte of all eq->nent entries with MLX5_EQE_OWNER_INIT_VAL. */
+static void init_eq_buf(struct mlx5_eq *eq)
+{
+	int idx;
+
+	for (idx = 0; idx < eq->nent; idx++)
+		get_eqe(eq, idx)->owner = MLX5_EQE_OWNER_INIT_VAL;
+}
+
+/*
+ * Return the address of UAR page @index as a 64-bit integer: ctx->bar_map
+ * plus @index pages of MLX5_ADAPTER_PAGE_SIZE bytes.
+ * NOTE(review): despite the name, the result is derived from ctx->bar_map,
+ * which looks like a process mapping rather than an IOVA - confirm.
+ */
+static uint64_t uar2iova(struct mlx5_vfio_context *ctx, uint32_t index)
+{
+	void *uar_base = (void *)ctx->bar_map;
+
+	return (uint64_t)(uar_base + (index * MLX5_ADAPTER_PAGE_SIZE));
+}
+
+/*
+ * Issue ALLOC_UAR and hand the allocated UAR index back through @uarn.
+ *
+ * Returns 0 on success; on failure the command error is returned and
+ * *uarn is left untouched.
+ */
+static int mlx5_vfio_alloc_uar(struct mlx5_vfio_context *ctx, uint32_t *uarn)
+{
+	uint32_t in[DEVX_ST_SZ_DW(alloc_uar_in)] = {};
+	uint32_t out[DEVX_ST_SZ_DW(alloc_uar_out)] = {};
+	int err;
+
+	DEVX_SET(alloc_uar_in, in, opcode, MLX5_CMD_OP_ALLOC_UAR);
+
+	err = mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0);
+	if (err)
+		return err;
+
+	*uarn = DEVX_GET(alloc_uar_out, out, uar);
+	return 0;
+}
+
+/*
+ * Issue DEALLOC_UAR for UAR index @uarn.
+ * Best effort: the result of the command is not reported to the caller.
+ */
+static void mlx5_vfio_dealloc_uar(struct mlx5_vfio_context *ctx, uint32_t uarn)
+{
+	uint32_t in[DEVX_ST_SZ_DW(dealloc_uar_in)] = {};
+	uint32_t out[DEVX_ST_SZ_DW(dealloc_uar_out)] = {};
+
+	DEVX_SET(dealloc_uar_in, in, uar, uarn);
+	DEVX_SET(dealloc_uar_in, in, opcode, MLX5_CMD_OP_DEALLOC_UAR);
+
+	(void)mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0);
+}
+
+/*
+ * Destroy @eq and release everything create_map_eq() acquired for it.
+ *
+ * Order matters: DESTROY_EQ is issued first (its result is ignored), then
+ * the buffer's DMA registration is removed, the IOVA range is given back to
+ * the allocator and finally the host buffer is freed.
+ */
+static void mlx5_vfio_destroy_eq(struct mlx5_vfio_context *ctx, struct mlx5_eq *eq)
+{
+	uint32_t out[DEVX_ST_SZ_DW(destroy_eq_out)] = {};
+	uint32_t in[DEVX_ST_SZ_DW(destroy_eq_in)] = {};
+
+	DEVX_SET(destroy_eq_in, in, eq_number, eq->eqn);
+	DEVX_SET(destroy_eq_in, in, opcode, MLX5_CMD_OP_DESTROY_EQ);
+	(void)mlx5_vfio_cmd_exec(ctx, in, sizeof(in), out, sizeof(out), 0);
+
+	mlx5_vfio_unregister_mem(ctx, eq->iova, eq->iova_size);
+	iset_insert_range(ctx->iova_alloc, eq->iova, eq->iova_size);
+	free(eq->vaddr);
+}
+
+static void destroy_async_eqs(struct mlx5_vfio_context *ctx)
+{ /* undo create_async_eqs(): destroy the async EQ, then release its UAR */
+	ctx->have_eq = false; /* cleared first so mlx5_vfio_cmd_exec() uses its polling path for the commands below */
+	mlx5_vfio_destroy_eq(ctx, &ctx->async_eq);
+	mlx5_vfio_dealloc_uar(ctx, ctx->eqs_uar.uarn);
+}
+
+/*
+ * Allocate the host buffer for @eq, map it for DMA and create the EQ on the
+ * device.
+ *
+ * @ctx:   device context; ctx->eqs_uar must already hold the UAR to use
+ * @eq:    EQ object to fill in (nent, buffer, iova, eqn, vecidx, doorbell)
+ * @param: requested size, interrupt vector index and event mask
+ *
+ * The queue size is param->nent plus MLX5_NUM_SPARE_EQE, rounded up to a
+ * power of two.  The buffer is described to the device as a single page
+ * of eq->iova_size bytes.
+ *
+ * Returns 0 on success or a non-zero error; on failure every resource taken
+ * here is released again.
+ */
+static int
+create_map_eq(struct mlx5_vfio_context *ctx, struct mlx5_eq *eq,
+	      struct mlx5_eq_param *param)
+{
+	uint32_t out[DEVX_ST_SZ_DW(create_eq_out)] = {};
+	uint8_t vecidx = param->irq_index;
+	__be64 *pas;
+	void *eqc;
+	int inlen;
+	uint32_t *in;
+	int err;
+	int i;
+	int alloc_size;
+
+	pthread_mutex_init(&ctx->eq_lock, NULL);
+	eq->nent = roundup_pow_of_two(param->nent + MLX5_NUM_SPARE_EQE);
+	eq->cons_index = 0;
+	alloc_size = eq->nent * MLX5_EQE_SIZE;
+	eq->iova_size = max(roundup_pow_of_two(alloc_size), ctx->iova_min_page_size);
+
+	/* The command carries exactly one physical-address entry. */
+	inlen = DEVX_ST_SZ_BYTES(create_eq_in) +
+		DEVX_FLD_SZ_BYTES(create_eq_in, pas[0]) * 1;
+
+	in = calloc(1, inlen);
+	if (!in)
+		return ENOMEM;
+
+	pas = (__be64 *)DEVX_ADDR_OF(create_eq_in, in, pas);
+
+	/*
+	 * Allocate the full eq->iova_size, not just alloc_size: the whole
+	 * iova_size range is registered for DMA below, and it can exceed the
+	 * EQE array when ctx->iova_min_page_size is the larger value.  Mapping
+	 * past the end of the allocation would expose unrelated heap memory.
+	 */
+	err = posix_memalign(&eq->vaddr, eq->iova_size, eq->iova_size);
+	if (err) {
+		errno = err;
+		goto end;
+	}
+
+	err = iset_alloc_range(ctx->iova_alloc, eq->iova_size, &eq->iova);
+	if (err)
+		goto err_range;
+
+	err = mlx5_vfio_register_mem(ctx, eq->vaddr, eq->iova, eq->iova_size);
+	if (err)
+		goto err_reg;
+
+	pas[0] = htobe64(eq->iova);
+	init_eq_buf(eq);
+	DEVX_SET(create_eq_in, in, opcode, MLX5_CMD_OP_CREATE_EQ);
+
+	for (i = 0; i < 4; i++)
+		DEVX_ARRAY_SET64(create_eq_in, in, event_bitmask, i,
+				 param->mask[i]);
+
+	eqc = DEVX_ADDR_OF(create_eq_in, in, eq_context_entry);
+	DEVX_SET(eqc, eqc, log_eq_size, ilog32(eq->nent - 1));
+	DEVX_SET(eqc, eqc, uar_page, ctx->eqs_uar.uarn);
+	DEVX_SET(eqc, eqc, intr, vecidx);
+	DEVX_SET(eqc, eqc, log_page_size, ilog32(eq->iova_size - 1) - MLX5_ADAPTER_PAGE_SHIFT);
+
+	err = mlx5_vfio_cmd_exec(ctx, in, inlen, out, sizeof(out), 0);
+	if (err)
+		goto err_cmd;
+
+	eq->vecidx = vecidx;
+	eq->eqn = DEVX_GET(create_eq_out, out, eq_number);
+	eq->doorbell = (void *)ctx->eqs_uar.iova + MLX5_EQ_DOORBEL_OFFSET;
+
+	free(in);
+	return 0;
+
+err_cmd:
+	mlx5_vfio_unregister_mem(ctx, eq->iova, eq->iova_size);
+err_reg:
+	iset_insert_range(ctx->iova_alloc, eq->iova, eq->iova_size);
+err_range:
+	free(eq->vaddr);
+	/* Do not leave a dangling buffer pointer in the caller's EQ object. */
+	eq->vaddr = NULL;
+end:
+	free(in);
+	return err;
+}
+
+/*
+ * Create @eq from @param and, on success, publish consumer index 0 with the
+ * arm flag set.  Returns 0 or the error from create_map_eq().
+ */
+static int
+setup_async_eq(struct mlx5_vfio_context *ctx, struct mlx5_eq_param *param,
+	       struct mlx5_eq *eq)
+{
+	int err = create_map_eq(ctx, eq, param);
+
+	if (!err)
+		eq_update_ci(eq, 0, 1);
+
+	return err;
+}
+
+/*
+ * Set up interrupt-driven command completion: allocate a UAR, create the
+ * async EQ on the command vector subscribed to command-completion events,
+ * and mark the context as having an EQ.
+ *
+ * Returns 0 on success; on failure the UAR is released and the error is
+ * returned.
+ */
+static int create_async_eqs(struct mlx5_vfio_context *ctx)
+{
+	struct mlx5_eq_param param = {
+		.irq_index = MLX5_VFIO_CMD_VEC_IDX,
+		.nent = MLX5_NUM_CMD_EQE,
+		.mask = { 1ull << MLX5_EVENT_TYPE_CMD },
+	};
+	int err;
+
+	err = mlx5_vfio_alloc_uar(ctx, &ctx->eqs_uar.uarn);
+	if (err)
+		return err;
+
+	ctx->eqs_uar.iova = uar2iova(ctx, ctx->eqs_uar.uarn);
+
+	err = setup_async_eq(ctx, &param, &ctx->async_eq);
+	if (err) {
+		mlx5_vfio_dealloc_uar(ctx, ctx->eqs_uar.uarn);
+		return err;
+	}
+
+	ctx->have_eq = true;
+	return 0;
+}
+
 static int mlx5_vfio_enable_hca(struct mlx5_vfio_context *ctx)
 {
 	uint32_t in[DEVX_ST_SZ_DW(enable_hca_in)] = {};
@@ -1497,6 +1852,7 @@  static void mlx5_vfio_free_context(struct ibv_context *ibctx)
 {
 	struct mlx5_vfio_context *ctx = to_mvfio_ctx(ibctx);
 
+	destroy_async_eqs(ctx);
 	mlx5_vfio_teardown_hca(ctx);
 	mlx5_vfio_clean_cmd_interface(ctx);
 	mlx5_vfio_clean_device_dma(ctx);
@@ -1541,9 +1897,14 @@  mlx5_vfio_alloc_context(struct ibv_device *ibdev,
 	if (mlx5_vfio_setup_function(mctx))
 		goto clean_cmd;
 
+	if (create_async_eqs(mctx))
+		goto func_teardown;
+
 	verbs_set_ops(&mctx->vctx, &mlx5_vfio_common_ops);
 	return &mctx->vctx;
 
+func_teardown:
+	mlx5_vfio_teardown_hca(mctx);
 clean_cmd:
 	mlx5_vfio_clean_cmd_interface(mctx);
 err_dma:
diff --git a/providers/mlx5/mlx5_vfio.h b/providers/mlx5/mlx5_vfio.h
index 225c1b9..449a5c5 100644
--- a/providers/mlx5/mlx5_vfio.h
+++ b/providers/mlx5/mlx5_vfio.h
@@ -60,6 +60,8 @@  struct mlx5_vfio_device {
 #define MLX5_VFIO_CAP_ROCE_MAX(ctx, cap) \
 	DEVX_GET(roce_cap, ctx->caps.hca_max[MLX5_CAP_ROCE], cap)
 
+struct mlx5_vfio_context;
+
 struct mlx5_reg_host_endianness {
 	uint8_t he;
 	uint8_t rsvd[15];
@@ -149,12 +151,16 @@  struct mlx5_cmd_msg {
 	struct mlx5_cmd_mailbox *next;
 };
 
+typedef int (*vfio_cmd_slot_comp)(struct mlx5_vfio_context *ctx,
+				  unsigned long slot); /* per-slot completion callback; a non-zero return aborts EQE processing */
+
 struct mlx5_vfio_cmd_slot {
 	struct mlx5_cmd_layout *lay;
 	struct mlx5_cmd_msg in;
 	struct mlx5_cmd_msg out;
 	pthread_mutex_t lock;
 	int completion_event_fd;
+	vfio_cmd_slot_comp comp_func; /* called when a command EQE reports this slot; left unset for the last slot */
 };
 
 struct mlx5_vfio_cmd {
@@ -165,6 +171,62 @@  struct mlx5_vfio_cmd {
 	struct mlx5_vfio_cmd_slot cmds[MLX5_MAX_COMMANDS];
 };
 
+struct mlx5_eq_param { /* caller-supplied parameters for creating an EQ */
+	uint8_t irq_index; /* interrupt vector index written to the EQ context */
+	int nent; /* requested entries; MLX5_NUM_SPARE_EQE is added and the sum rounded up to a power of two */
+	uint64_t mask[4]; /* event subscription bitmask, bit N selects event type N */
+};
+
+struct mlx5_eq { /* host-side state of one event queue */
+	__be32 *doorbell; /* EQ UAR address plus MLX5_EQ_DOORBEL_OFFSET */
+	uint32_t cons_index; /* running consumer index; low 24 bits go into the doorbell */
+	unsigned int vecidx; /* interrupt vector index the EQ was created with */
+	uint8_t eqn; /* EQ number returned by CREATE_EQ */
+	int nent; /* number of entries, a power of two */
+	void *vaddr; /* EQE buffer obtained from posix_memalign() */
+	uint64_t iova; /* IOVA the buffer is registered at */
+	uint64_t iova_size; /* size of the registered IOVA range in bytes */
+};
+
+struct mlx5_eqe_cmd { /* EQE payload for MLX5_EVENT_TYPE_CMD */
+	__be32 vector; /* big-endian bitmap of command slots, one bit per slot */
+	__be32 rsvd[6];
+};
+
+struct mlx5_eqe_page_req { /* EQE payload variant reachable as ev_data.req_pages; not read by the code in this patch */
+	__be16 ec_function;
+	__be16 func_id;
+	__be32 num_pages;
+	__be32 rsvd1[5];
+};
+
+union ev_data { /* event-specific payload of an EQE; every member is seven 32-bit words long */
+	__be32 raw[7];
+	struct mlx5_eqe_cmd cmd;
+	struct mlx5_eqe_page_req req_pages;
+};
+
+struct mlx5_eqe { /* one event queue entry as laid out in the EQ buffer */
+	uint8_t rsvd0;
+	uint8_t type; /* event type, compared against enum mlx5_event values */
+	uint8_t rsvd1;
+	uint8_t sub_type;
+	__be32 rsvd2[7];
+	union ev_data data;
+	__be16 rsvd3;
+	uint8_t signature;
+	uint8_t owner; /* low bit is compared with the consumer-index wrap bit to decide whether the entry is valid */
+};
+
+#define MLX5_EQE_SIZE (sizeof(struct mlx5_eqe)) /* stride between entries in the EQ buffer */
+#define MLX5_NUM_CMD_EQE   (32) /* entries requested for the async (command) EQ */
+#define MLX5_NUM_SPARE_EQE (0x80) /* extra entries added to every EQ; also the consumer-index update threshold */
+
+struct mlx5_vfio_eqs_uar { /* the UAR used for EQ doorbells */
+	uint32_t uarn; /* UAR index returned by ALLOC_UAR */
+	uint64_t iova; /* address computed by uar2iova() for that index */
+};
+
 struct mlx5_vfio_context {
 	struct verbs_context vctx;
 	int container_fd;
@@ -183,6 +245,9 @@  struct mlx5_vfio_context {
 		uint32_t hca_cur[MLX5_CAP_NUM][DEVX_UN_SZ_DW(hca_cap_union)];
 		uint32_t hca_max[MLX5_CAP_NUM][DEVX_UN_SZ_DW(hca_cap_union)];
 	} caps;
+	struct mlx5_eq async_eq;
+	struct mlx5_vfio_eqs_uar eqs_uar;
+	pthread_mutex_t eq_lock;
 };
 
 static inline struct mlx5_vfio_device *to_mvfio_dev(struct ibv_device *ibdev)