
[for-next,v3,06/12] RDMA/erdma: Add event queue implementation

Message ID 20220217030116.6324-7-chengyou.xc@alibaba-inc.com (mailing list archive)
State Changes Requested
Series Elastic RDMA Adapter (ERDMA) driver

Commit Message

Cheng Xu Feb. 17, 2022, 3:01 a.m. UTC
From: Cheng Xu <chengyou@linux.alibaba.com>

Event queue (EQ) is the main notification mechanism from the erdma
hardware to its driver. Each erdma device contains two kinds of EQs:
the asynchronous EQ (AEQ) and the completion EQ (CEQ). Each device has
one AEQ, which is used for RDMA async event reporting, and up to 32
CEQs (numbered from CEQ0 to CEQ31). CEQ0 is used for cmdq completion
event reporting, and the rest of the CEQs are used for RDMA completion
event reporting.

Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
---
 drivers/infiniband/hw/erdma/erdma_eq.c | 366 +++++++++++++++++++++++++
 1 file changed, 366 insertions(+)
 create mode 100644 drivers/infiniband/hw/erdma/erdma_eq.c

Comments

Ma Ca Feb. 22, 2022, 7:23 a.m. UTC | #1
On 2022/2/17 11:01, Cheng Xu wrote:
> From: Cheng Xu <chengyou@linux.alibaba.com>
>
> Event queue (EQ) is the main notifcaition way from erdma hardware to its
> driver. Each erdma device contains 2 kinds EQs: asynchronous EQ (AEQ) and
> completion EQ (CEQ). Per device has 1 AEQ, which used for RDMA async event
> report, and max to 32 CEQs (numbered for CEQ0 to CEQ31). CEQ0 is used for
> cmdq completion event report, and the reset CEQs are used for RDMA
> completion event report.
>
> Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
> ---
>   drivers/infiniband/hw/erdma/erdma_eq.c | 366 +++++++++++++++++++++++++
>   1 file changed, 366 insertions(+)
>   create mode 100644 drivers/infiniband/hw/erdma/erdma_eq.c
>
> diff --git a/drivers/infiniband/hw/erdma/erdma_eq.c b/drivers/infiniband/hw/erdma/erdma_eq.c
> new file mode 100644
> index 000000000000..2a2215710e94
> --- /dev/null
> +++ b/drivers/infiniband/hw/erdma/erdma_eq.c
> @@ -0,0 +1,366 @@
> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
> +
> +/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
> +/*          Kai Shen <kaishen@linux.alibaba.com> */
> +/* Copyright (c) 2020-2022, Alibaba Group. */
> +
> +#include <linux/errno.h>
> +#include <linux/types.h>
> +#include <linux/pci.h>
> +
> +#include <rdma/iw_cm.h>
> +#include <rdma/ib_verbs.h>
> +#include <rdma/ib_user_verbs.h>
> +
> +#include "erdma.h"
> +#include "erdma_cm.h"
> +#include "erdma_hw.h"
> +#include "erdma_verbs.h"
> +
> +void notify_eq(struct erdma_eq *eq)
> +{
> +	u64 db_data = FIELD_PREP(ERDMA_EQDB_CI_MASK, eq->ci) |
> +		      FIELD_PREP(ERDMA_EQDB_ARM_MASK, 1);
> +
> +	*eq->db_record = db_data;
> +	writeq(db_data, eq->db_addr);
> +
> +	atomic64_inc(&eq->notify_num);
> +}
> +
> +static void *get_eq_entry(struct erdma_eq *eq, u16 idx)
> +{
> +	idx &= (eq->depth - 1);
> +
> +	return eq->qbuf + (idx << EQE_SHIFT);
> +}
> +
> +static void *get_valid_eqe(struct erdma_eq *eq)
> +{
> +	u64 *eqe = (u64 *)get_eq_entry(eq, eq->ci);
> +	u64 val = READ_ONCE(*eqe);
> +
> +	if (FIELD_GET(ERDMA_CEQE_HDR_O_MASK, val) == eq->owner) {
> +		dma_rmb();
> +		eq->ci++;
> +		if ((eq->ci & (eq->depth - 1)) == 0)
> +			eq->owner = !eq->owner;
> +
> +		atomic64_inc(&eq->event_num);
> +		return eqe;
> +	}
> +
> +	return NULL;
> +}
> +
> +static int erdma_poll_aeq_event(struct erdma_eq *aeq, void *out)
> +{
> +	struct erdma_aeqe *aeqe;
> +
> +	aeqe = (struct erdma_aeqe *)get_valid_eqe(aeq);
> +	if (aeqe && out) {
> +		memcpy(out, aeqe, EQE_SIZE);
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +int erdma_poll_ceq_event(struct erdma_eq *ceq)
> +{
> +	u64 *ceqe;
> +	u64 val;
> +
> +	ceqe = (u64 *)get_valid_eqe(ceq);
> +	if (ceqe) {
> +		val = READ_ONCE(*ceqe);
> +		return FIELD_GET(ERDMA_CEQE_HDR_CQN_MASK, val);
> +	}
> +
> +	return -1;
> +}
> +
> +void erdma_aeq_event_handler(struct erdma_dev *dev)
> +{
> +	struct erdma_aeqe aeqe;
> +	u32 cqn, qpn;
> +	struct erdma_qp *qp;
> +	struct erdma_cq *cq;
> +	struct ib_event event;
> +
> +	memset(&event, 0, sizeof(event));
> +	while (erdma_poll_aeq_event(&dev->aeq, &aeqe)) {
> +		if (FIELD_GET(ERDMA_AEQE_HDR_TYPE_MASK, aeqe.hdr) ==
> +		    ERDMA_AE_TYPE_CQ_ERR) {
> +			cqn = aeqe.event_data0;
> +			cq = find_cq_by_cqn(dev, cqn);
> +			if (!cq)
> +				continue;
> +			event.device = cq->ibcq.device;
> +			event.element.cq = &cq->ibcq;
> +			event.event = IB_EVENT_CQ_ERR;
> +			if (cq->ibcq.event_handler)
> +				cq->ibcq.event_handler(&event,
> +						       cq->ibcq.cq_context);
> +		} else {
> +			qpn = aeqe.event_data0;
> +			qp = find_qp_by_qpn(dev, qpn);
> +			if (!qp)
> +				continue;
> +
> +			event.device = qp->ibqp.device;
> +			event.element.qp = &qp->ibqp;
> +			event.event = IB_EVENT_QP_FATAL;
> +			if (qp->ibqp.event_handler)
> +				qp->ibqp.event_handler(&event,
> +						       qp->ibqp.qp_context);
> +		}
> +	}
> +
> +	notify_eq(&dev->aeq);
> +}
> +
> +int erdma_aeq_init(struct erdma_dev *dev)
> +{
> +	struct erdma_eq *eq = &dev->aeq;
> +	u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
> +
> +	eq->qbuf = dma_alloc_coherent(&dev->pdev->dev,
> +				      WARPPED_BUFSIZE(buf_size),
> +				      &eq->qbuf_dma_addr, GFP_KERNEL);
> +	if (!eq->qbuf)
> +		return -ENOMEM;
> +
> +	memset(eq->qbuf, 0, WARPPED_BUFSIZE(buf_size));
It's better to replace the memset() with GFP_ZERO.
> +
> +	spin_lock_init(&eq->lock);
> +	atomic64_set(&eq->event_num, 0);
> +	atomic64_set(&eq->notify_num, 0);
> +
> +	eq->depth = ERDMA_DEFAULT_EQ_DEPTH;
> +	eq->db_addr = (u64 __iomem *)(dev->func_bar + ERDMA_REGS_AEQ_DB_REG);
> +	eq->db_record = (u64 *)(eq->qbuf + buf_size);
> +	eq->owner = 1;
> +
> +	erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_H_REG,
> +			  upper_32_bits(eq->qbuf_dma_addr));
> +	erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_L_REG,
> +			  lower_32_bits(eq->qbuf_dma_addr));
> +	erdma_reg_write32(dev, ERDMA_REGS_AEQ_DEPTH_REG, eq->depth);
> +	erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG,
> +			  eq->qbuf_dma_addr + buf_size);
> +
> +	return 0;
> +}
> +
> +void erdma_aeq_destroy(struct erdma_dev *dev)
> +{
> +	struct erdma_eq *eq = &dev->aeq;
> +	u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
> +
> +	dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), eq->qbuf,
> +			  eq->qbuf_dma_addr);
> +}
> +
> +#define MAX_POLL_CHUNK_SIZE 16
> +void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb)
> +{
> +	int cqn;
> +	struct erdma_cq *cq;
> +	struct erdma_dev *dev = ceq_cb->dev;
> +	u32 poll_cnt = 0;
> +
> +	if (!ceq_cb->ready)
> +		return;
> +
> +	while ((cqn = erdma_poll_ceq_event(&ceq_cb->eq)) != -1) {
> +		poll_cnt++;
> +		if (cqn == 0)
> +			continue;
> +
> +		cq = find_cq_by_cqn(dev, cqn);
> +		if (!cq)
> +			continue;
> +
> +		if (rdma_is_kernel_res(&cq->ibcq.res))
> +			cq->kern_cq.cmdsn++;
> +
> +		if (cq->ibcq.comp_handler)
> +			cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
> +
> +		if (poll_cnt >= MAX_POLL_CHUNK_SIZE)
> +			break;
> +	}
> +
> +	notify_eq(&ceq_cb->eq);
> +}
> +
> +static irqreturn_t erdma_intr_ceq_handler(int irq, void *data)
> +{
> +	struct erdma_eq_cb *ceq_cb = data;
> +
> +	tasklet_schedule(&ceq_cb->tasklet);
> +
> +	return IRQ_HANDLED;
> +}
> +
> +static void erdma_intr_ceq_task(unsigned long data)
> +{
> +	erdma_ceq_completion_handler((struct erdma_eq_cb *)data);
> +}
> +
> +static int erdma_set_ceq_irq(struct erdma_dev *dev, u16 ceqn)
> +{
> +	struct erdma_eq_cb *eqc = &dev->ceqs[ceqn];
> +	cpumask_t affinity_hint_mask;
> +	u32 cpu;
> +	int err;
> +
> +	snprintf(eqc->irq_name, ERDMA_IRQNAME_SIZE, "erdma-ceq%u@pci:%s",
> +		ceqn, pci_name(dev->pdev));
> +	eqc->msix_vector = pci_irq_vector(dev->pdev, ceqn + 1);
> +
> +	tasklet_init(&dev->ceqs[ceqn].tasklet, erdma_intr_ceq_task,
> +		     (unsigned long)&dev->ceqs[ceqn]);
> +
> +	cpu = cpumask_local_spread(ceqn + 1, dev->attrs.numa_node);
> +	cpumask_set_cpu(cpu, &affinity_hint_mask);
> +
> +	err = request_irq(eqc->msix_vector, erdma_intr_ceq_handler, 0,
> +			  eqc->irq_name, eqc);
> +	if (err) {
> +		dev_err(&dev->pdev->dev, "failed to request_irq(%d)\n", err);
> +		return err;
> +	}
> +
> +	irq_set_affinity_hint(eqc->msix_vector, &affinity_hint_mask);
> +
> +	return 0;
> +}
> +
> +static void erdma_free_ceq_irq(struct erdma_dev *dev, u16 ceqn)
> +{
> +	struct erdma_eq_cb *eqc = &dev->ceqs[ceqn];
> +
> +	irq_set_affinity_hint(eqc->msix_vector, NULL);
> +	free_irq(eqc->msix_vector, eqc);
> +}
> +
> +static int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq)
> +{
> +	struct erdma_cmdq_create_eq_req req;
> +	dma_addr_t db_info_dma_addr;
> +
> +	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON,
> +				CMDQ_OPCODE_CREATE_EQ);
> +	req.eqn = eqn;
> +	req.depth = ilog2(eq->depth);
> +	req.qbuf_addr = eq->qbuf_dma_addr;
> +	req.qtype = 1; /* CEQ */
> +	/* Vector index is the same sa EQN. */
same sa -> same as
> +	req.vector_idx = eqn;
> +	db_info_dma_addr = eq->qbuf_dma_addr + (eq->depth << EQE_SHIFT);
> +	req.db_dma_addr_l = lower_32_bits(db_info_dma_addr);
> +	req.db_dma_addr_h = upper_32_bits(db_info_dma_addr);
> +
> +	return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req,
> +				   sizeof(struct erdma_cmdq_create_eq_req),
> +				   NULL, NULL);
> +}
> +
> +static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn)
> +{
> +	struct erdma_eq *eq = &dev->ceqs[ceqn].eq;
> +	u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
> +	int ret;
> +
> +	eq->qbuf = dma_alloc_coherent(&dev->pdev->dev,
> +				      WARPPED_BUFSIZE(buf_size),
> +				      &eq->qbuf_dma_addr, GFP_KERNEL);
> +	if (!eq->qbuf)
> +		return -ENOMEM;
> +
> +	memset(eq->qbuf, 0, WARPPED_BUFSIZE(buf_size));
ditto
> +
> +	spin_lock_init(&eq->lock);
> +	atomic64_set(&eq->event_num, 0);
> +	atomic64_set(&eq->notify_num, 0);
> +
> +	eq->depth = ERDMA_DEFAULT_EQ_DEPTH;
> +	eq->db_addr = (u64 __iomem *)(dev->func_bar +
> +				      ERDMA_REGS_CEQ_DB_BASE_REG +
> +				      (ceqn + 1) * 8);
> +	eq->db_record = (u64 *)(eq->qbuf + buf_size);
> +	eq->ci = 0;
> +	eq->owner = 1;
> +	dev->ceqs[ceqn].dev = dev;
> +
> +	/* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */
> +	ret = create_eq_cmd(dev, ceqn + 1, eq);
> +	dev->ceqs[ceqn].ready = ret ? false : true;
> +
> +	return ret;
> +}
> +
> +static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn)
> +{
> +	struct erdma_eq *eq = &dev->ceqs[ceqn].eq;
> +	u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
> +	struct erdma_cmdq_destroy_eq_req req;
> +	int err;
> +
> +	dev->ceqs[ceqn].ready = 0;
> +
> +	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON,
> +				CMDQ_OPCODE_DESTROY_EQ);
> +	/* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */
> +	req.eqn = ceqn + 1;
> +	req.qtype = 1;
> +	req.vector_idx = ceqn + 1;
> +
> +	err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL,
> +				  NULL);
> +	if (err)
> +		return;
> +
> +	dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), eq->qbuf,
> +			  eq->qbuf_dma_addr);
A memleak may occur here when the post fails.
> +}
> +
> +int erdma_ceqs_init(struct erdma_dev *dev)
> +{
> +	u32 i, j;
> +	int err = 0;
> +
> +	for (i = 0; i < dev->attrs.irq_num - 1; i++) {
> +		err = erdma_ceq_init_one(dev, i);
> +		if (err)
> +			goto out_err;
> +
> +		err = erdma_set_ceq_irq(dev, i);
> +		if (err) {
> +			erdma_ceq_uninit_one(dev, i);
> +			goto out_err;
> +		}
> +	}
> +
> +	return 0;
> +
> +out_err:
> +	for (j = 0; j < i; j++) {
> +		erdma_free_ceq_irq(dev, j);
> +		erdma_ceq_uninit_one(dev, j);
> +	}
> +
> +	return err;
> +}
> +
> +void erdma_ceqs_uninit(struct erdma_dev *dev)
> +{
> +	u32 i;
> +
> +	for (i = 0; i < dev->attrs.irq_num - 1; i++) {
> +		erdma_free_ceq_irq(dev, i);
> +		erdma_ceq_uninit_one(dev, i);
> +	}
> +}
Dust Li Feb. 22, 2022, 7:53 a.m. UTC | #2
On Thu, Feb 17, 2022 at 11:01:10AM +0800, Cheng Xu wrote:
>From: Cheng Xu <chengyou@linux.alibaba.com>
>
>Event queue (EQ) is the main notifcaition way from erdma hardware to its
>driver. Each erdma device contains 2 kinds EQs: asynchronous EQ (AEQ) and
>completion EQ (CEQ). Per device has 1 AEQ, which used for RDMA async event
>report, and max to 32 CEQs (numbered for CEQ0 to CEQ31). CEQ0 is used for
>cmdq completion event report, and the reset CEQs are used for RDMA

reset --> rest ?

>completion event report.
>
>Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
>---
> drivers/infiniband/hw/erdma/erdma_eq.c | 366 +++++++++++++++++++++++++
> 1 file changed, 366 insertions(+)
> create mode 100644 drivers/infiniband/hw/erdma/erdma_eq.c
>
>diff --git a/drivers/infiniband/hw/erdma/erdma_eq.c b/drivers/infiniband/hw/erdma/erdma_eq.c
>new file mode 100644
>index 000000000000..2a2215710e94
>--- /dev/null
>+++ b/drivers/infiniband/hw/erdma/erdma_eq.c
>@@ -0,0 +1,366 @@
>+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
>+
>+/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
>+/*          Kai Shen <kaishen@linux.alibaba.com> */
>+/* Copyright (c) 2020-2022, Alibaba Group. */
>+
>+#include <linux/errno.h>
>+#include <linux/types.h>
>+#include <linux/pci.h>
>+
>+#include <rdma/iw_cm.h>
>+#include <rdma/ib_verbs.h>
>+#include <rdma/ib_user_verbs.h>
>+
>+#include "erdma.h"
>+#include "erdma_cm.h"
>+#include "erdma_hw.h"
>+#include "erdma_verbs.h"
>+
>+void notify_eq(struct erdma_eq *eq)
>+{
>+	u64 db_data = FIELD_PREP(ERDMA_EQDB_CI_MASK, eq->ci) |
>+		      FIELD_PREP(ERDMA_EQDB_ARM_MASK, 1);
>+
>+	*eq->db_record = db_data;
>+	writeq(db_data, eq->db_addr);
>+
>+	atomic64_inc(&eq->notify_num);
>+}
>+
>+static void *get_eq_entry(struct erdma_eq *eq, u16 idx)
>+{
>+	idx &= (eq->depth - 1);
>+
>+	return eq->qbuf + (idx << EQE_SHIFT);
>+}
>+
>+static void *get_valid_eqe(struct erdma_eq *eq)
>+{
>+	u64 *eqe = (u64 *)get_eq_entry(eq, eq->ci);
>+	u64 val = READ_ONCE(*eqe);
>+
>+	if (FIELD_GET(ERDMA_CEQE_HDR_O_MASK, val) == eq->owner) {
>+		dma_rmb();
>+		eq->ci++;
>+		if ((eq->ci & (eq->depth - 1)) == 0)
>+			eq->owner = !eq->owner;
>+
>+		atomic64_inc(&eq->event_num);
>+		return eqe;
>+	}
>+
>+	return NULL;
>+}
>+
>+static int erdma_poll_aeq_event(struct erdma_eq *aeq, void *out)
>+{
>+	struct erdma_aeqe *aeqe;
>+
>+	aeqe = (struct erdma_aeqe *)get_valid_eqe(aeq);
>+	if (aeqe && out) {
>+		memcpy(out, aeqe, EQE_SIZE);
>+		return 1;
>+	}
>+
>+	return 0;
>+}
>+
>+int erdma_poll_ceq_event(struct erdma_eq *ceq)
>+{
>+	u64 *ceqe;
>+	u64 val;
>+
>+	ceqe = (u64 *)get_valid_eqe(ceq);
>+	if (ceqe) {
>+		val = READ_ONCE(*ceqe);
>+		return FIELD_GET(ERDMA_CEQE_HDR_CQN_MASK, val);
>+	}
>+
>+	return -1;
>+}

Probably not a real issue. Just wondering why the two
erdma_poll_xxx_event()s above don't use the same function format
(i.e. return value, arguments)?

>+
>+void erdma_aeq_event_handler(struct erdma_dev *dev)
>+{
>+	struct erdma_aeqe aeqe;
>+	u32 cqn, qpn;
>+	struct erdma_qp *qp;
>+	struct erdma_cq *cq;
>+	struct ib_event event;
>+
>+	memset(&event, 0, sizeof(event));
>+	while (erdma_poll_aeq_event(&dev->aeq, &aeqe)) {
>+		if (FIELD_GET(ERDMA_AEQE_HDR_TYPE_MASK, aeqe.hdr) ==
>+		    ERDMA_AE_TYPE_CQ_ERR) {
>+			cqn = aeqe.event_data0;
>+			cq = find_cq_by_cqn(dev, cqn);
>+			if (!cq)
>+				continue;
>+			event.device = cq->ibcq.device;
>+			event.element.cq = &cq->ibcq;
>+			event.event = IB_EVENT_CQ_ERR;
>+			if (cq->ibcq.event_handler)
>+				cq->ibcq.event_handler(&event,
>+						       cq->ibcq.cq_context);
>+		} else {
>+			qpn = aeqe.event_data0;
>+			qp = find_qp_by_qpn(dev, qpn);
>+			if (!qp)
>+				continue;
>+
>+			event.device = qp->ibqp.device;
>+			event.element.qp = &qp->ibqp;
>+			event.event = IB_EVENT_QP_FATAL;
>+			if (qp->ibqp.event_handler)
>+				qp->ibqp.event_handler(&event,
>+						       qp->ibqp.qp_context);
>+		}
>+	}
>+
>+	notify_eq(&dev->aeq);
>+}
>+
>+int erdma_aeq_init(struct erdma_dev *dev)
>+{
>+	struct erdma_eq *eq = &dev->aeq;
>+	u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
>+
>+	eq->qbuf = dma_alloc_coherent(&dev->pdev->dev,
>+				      WARPPED_BUFSIZE(buf_size),
>+				      &eq->qbuf_dma_addr, GFP_KERNEL);
>+	if (!eq->qbuf)
>+		return -ENOMEM;
>+
>+	memset(eq->qbuf, 0, WARPPED_BUFSIZE(buf_size));
>+
>+	spin_lock_init(&eq->lock);
>+	atomic64_set(&eq->event_num, 0);
>+	atomic64_set(&eq->notify_num, 0);
>+
>+	eq->depth = ERDMA_DEFAULT_EQ_DEPTH;
>+	eq->db_addr = (u64 __iomem *)(dev->func_bar + ERDMA_REGS_AEQ_DB_REG);
>+	eq->db_record = (u64 *)(eq->qbuf + buf_size);
>+	eq->owner = 1;
>+
>+	erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_H_REG,
>+			  upper_32_bits(eq->qbuf_dma_addr));
>+	erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_L_REG,
>+			  lower_32_bits(eq->qbuf_dma_addr));
>+	erdma_reg_write32(dev, ERDMA_REGS_AEQ_DEPTH_REG, eq->depth);
>+	erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG,
>+			  eq->qbuf_dma_addr + buf_size);
>+
>+	return 0;
>+}
>+
>+void erdma_aeq_destroy(struct erdma_dev *dev)
>+{
>+	struct erdma_eq *eq = &dev->aeq;
>+	u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
>+
>+	dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), eq->qbuf,
>+			  eq->qbuf_dma_addr);
>+}
>+
>+#define MAX_POLL_CHUNK_SIZE 16
>+void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb)
>+{
>+	int cqn;
>+	struct erdma_cq *cq;
>+	struct erdma_dev *dev = ceq_cb->dev;
>+	u32 poll_cnt = 0;
>+
>+	if (!ceq_cb->ready)
>+		return;
>+
>+	while ((cqn = erdma_poll_ceq_event(&ceq_cb->eq)) != -1) {
>+		poll_cnt++;
>+		if (cqn == 0)
>+			continue;
>+
>+		cq = find_cq_by_cqn(dev, cqn);
>+		if (!cq)
>+			continue;
>+
>+		if (rdma_is_kernel_res(&cq->ibcq.res))
>+			cq->kern_cq.cmdsn++;
>+
>+		if (cq->ibcq.comp_handler)
>+			cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
>+
>+		if (poll_cnt >= MAX_POLL_CHUNK_SIZE)
>+			break;
>+	}
>+
>+	notify_eq(&ceq_cb->eq);
>+}
>+
>+static irqreturn_t erdma_intr_ceq_handler(int irq, void *data)
>+{
>+	struct erdma_eq_cb *ceq_cb = data;
>+
>+	tasklet_schedule(&ceq_cb->tasklet);
>+
>+	return IRQ_HANDLED;
>+}
>+
>+static void erdma_intr_ceq_task(unsigned long data)
>+{
>+	erdma_ceq_completion_handler((struct erdma_eq_cb *)data);
>+}
>+
>+static int erdma_set_ceq_irq(struct erdma_dev *dev, u16 ceqn)
>+{
>+	struct erdma_eq_cb *eqc = &dev->ceqs[ceqn];
>+	cpumask_t affinity_hint_mask;
>+	u32 cpu;
>+	int err;
>+
>+	snprintf(eqc->irq_name, ERDMA_IRQNAME_SIZE, "erdma-ceq%u@pci:%s",
>+		ceqn, pci_name(dev->pdev));
>+	eqc->msix_vector = pci_irq_vector(dev->pdev, ceqn + 1);
>+
>+	tasklet_init(&dev->ceqs[ceqn].tasklet, erdma_intr_ceq_task,
>+		     (unsigned long)&dev->ceqs[ceqn]);
>+
>+	cpu = cpumask_local_spread(ceqn + 1, dev->attrs.numa_node);
>+	cpumask_set_cpu(cpu, &affinity_hint_mask);
>+
>+	err = request_irq(eqc->msix_vector, erdma_intr_ceq_handler, 0,
>+			  eqc->irq_name, eqc);
>+	if (err) {
>+		dev_err(&dev->pdev->dev, "failed to request_irq(%d)\n", err);
>+		return err;
>+	}
>+
>+	irq_set_affinity_hint(eqc->msix_vector, &affinity_hint_mask);
>+
>+	return 0;
>+}
>+
>+static void erdma_free_ceq_irq(struct erdma_dev *dev, u16 ceqn)
>+{
>+	struct erdma_eq_cb *eqc = &dev->ceqs[ceqn];
>+
>+	irq_set_affinity_hint(eqc->msix_vector, NULL);
>+	free_irq(eqc->msix_vector, eqc);
>+}
>+
>+static int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq)
>+{
>+	struct erdma_cmdq_create_eq_req req;
>+	dma_addr_t db_info_dma_addr;
>+
>+	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON,
>+				CMDQ_OPCODE_CREATE_EQ);
>+	req.eqn = eqn;
>+	req.depth = ilog2(eq->depth);
>+	req.qbuf_addr = eq->qbuf_dma_addr;
>+	req.qtype = 1; /* CEQ */
>+	/* Vector index is the same sa EQN. */
>+	req.vector_idx = eqn;
>+	db_info_dma_addr = eq->qbuf_dma_addr + (eq->depth << EQE_SHIFT);
>+	req.db_dma_addr_l = lower_32_bits(db_info_dma_addr);
>+	req.db_dma_addr_h = upper_32_bits(db_info_dma_addr);
>+
>+	return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req,
>+				   sizeof(struct erdma_cmdq_create_eq_req),
>+				   NULL, NULL);
>+}
>+
>+static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn)
>+{
>+	struct erdma_eq *eq = &dev->ceqs[ceqn].eq;
>+	u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
>+	int ret;
>+
>+	eq->qbuf = dma_alloc_coherent(&dev->pdev->dev,
>+				      WARPPED_BUFSIZE(buf_size),
>+				      &eq->qbuf_dma_addr, GFP_KERNEL);
>+	if (!eq->qbuf)
>+		return -ENOMEM;
>+
>+	memset(eq->qbuf, 0, WARPPED_BUFSIZE(buf_size));
>+
>+	spin_lock_init(&eq->lock);
>+	atomic64_set(&eq->event_num, 0);
>+	atomic64_set(&eq->notify_num, 0);
>+
>+	eq->depth = ERDMA_DEFAULT_EQ_DEPTH;
>+	eq->db_addr = (u64 __iomem *)(dev->func_bar +
>+				      ERDMA_REGS_CEQ_DB_BASE_REG +
>+				      (ceqn + 1) * 8);
>+	eq->db_record = (u64 *)(eq->qbuf + buf_size);
>+	eq->ci = 0;
>+	eq->owner = 1;
>+	dev->ceqs[ceqn].dev = dev;
>+
>+	/* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */
>+	ret = create_eq_cmd(dev, ceqn + 1, eq);
>+	dev->ceqs[ceqn].ready = ret ? false : true;
>+
>+	return ret;
>+}
>+
>+static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn)
>+{
>+	struct erdma_eq *eq = &dev->ceqs[ceqn].eq;
>+	u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
>+	struct erdma_cmdq_destroy_eq_req req;
>+	int err;
>+
>+	dev->ceqs[ceqn].ready = 0;
>+
>+	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON,
>+				CMDQ_OPCODE_DESTROY_EQ);
>+	/* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */
>+	req.eqn = ceqn + 1;
>+	req.qtype = 1;
>+	req.vector_idx = ceqn + 1;
>+
>+	err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL,
>+				  NULL);
>+	if (err)
>+		return;
>+
>+	dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), eq->qbuf,
>+			  eq->qbuf_dma_addr);
>+}
>+
>+int erdma_ceqs_init(struct erdma_dev *dev)
>+{
>+	u32 i, j;
>+	int err = 0;
>+
>+	for (i = 0; i < dev->attrs.irq_num - 1; i++) {
>+		err = erdma_ceq_init_one(dev, i);
>+		if (err)
>+			goto out_err;
>+
>+		err = erdma_set_ceq_irq(dev, i);
>+		if (err) {
>+			erdma_ceq_uninit_one(dev, i);
>+			goto out_err;
>+		}
>+	}
>+
>+	return 0;
>+
>+out_err:
>+	for (j = 0; j < i; j++) {
>+		erdma_free_ceq_irq(dev, j);
>+		erdma_ceq_uninit_one(dev, j);
>+	}
>+
>+	return err;
>+}
>+
>+void erdma_ceqs_uninit(struct erdma_dev *dev)
>+{
>+	u32 i;
>+
>+	for (i = 0; i < dev->attrs.irq_num - 1; i++) {
>+		erdma_free_ceq_irq(dev, i);
>+		erdma_ceq_uninit_one(dev, i);
>+	}
>+}
>-- 
>2.27.0
Cheng Xu Feb. 23, 2022, 2:24 a.m. UTC | #3
On 2/22/22 3:23 PM, Ma Ca wrote:
> 
> On 2022/2/17 11:01, Cheng Xu wrote:
>> From: Cheng Xu <chengyou@linux.alibaba.com>
>>

<...>

>> +    eq->qbuf = dma_alloc_coherent(&dev->pdev->dev,
>> +                      WARPPED_BUFSIZE(buf_size),
>> +                      &eq->qbuf_dma_addr, GFP_KERNEL);
>> +    if (!eq->qbuf)
>> +        return -ENOMEM;
>> +
>> +    memset(eq->qbuf, 0, WARPPED_BUFSIZE(buf_size));
> It's better replace the memset() by GFP_ZERO.

It is better, I will go through all the code and replace them.
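
For reference, a minimal sketch of that change (the kernel flag is
spelled __GFP_ZERO; on current kernels dma_alloc_coherent() already
returns zeroed memory, so simply dropping the memset() works as well):

	eq->qbuf = dma_alloc_coherent(&dev->pdev->dev,
				      WARPPED_BUFSIZE(buf_size),
				      &eq->qbuf_dma_addr,
				      GFP_KERNEL | __GFP_ZERO);
	if (!eq->qbuf)
		return -ENOMEM;
	/* no explicit memset() needed, the buffer is already zeroed */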

Thanks.

>> +
>> +    spin_lock_init(&eq->lock);
>> +    atomic64_set(&eq->event_num, 0);
>> +    atomic64_set(&eq->notify_num, 0);
>> +
>> +    eq->depth = ERDMA_DEFAULT_EQ_DEPTH;
>> +    eq->db_addr = (u64 __iomem *)(dev->func_bar + 
>> ERDMA_REGS_AEQ_DB_REG);
>> +    eq->db_record = (u64 *)(eq->qbuf + buf_size);
>> +    eq->owner = 1;
>> +

<...>

>> +    erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON,
>> +                CMDQ_OPCODE_CREATE_EQ);
>> +    req.eqn = eqn;
>> +    req.depth = ilog2(eq->depth);
>> +    req.qbuf_addr = eq->qbuf_dma_addr;
>> +    req.qtype = 1; /* CEQ */
>> +    /* Vector index is the same sa EQN. */
> same sa -> same as

Will fix it.

>> +    req.vector_idx = eqn;
>> +    db_info_dma_addr = eq->qbuf_dma_addr + (eq->depth << EQE_SHIFT);
>> +    req.db_dma_addr_l = lower_32_bits(db_info_dma_addr);
>> +    req.db_dma_addr_h = upper_32_bits(db_info_dma_addr);
>> +
>> +    return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req,
>> +                   sizeof(struct erdma_cmdq_create_eq_req),
>> +                   NULL, NULL);
>> +}
>> +
>> +static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn)
>> +{
>> +    struct erdma_eq *eq = &dev->ceqs[ceqn].eq;
>> +    u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
>> +    int ret;
>> +
>> +    eq->qbuf = dma_alloc_coherent(&dev->pdev->dev,
>> +                      WARPPED_BUFSIZE(buf_size),
>> +                      &eq->qbuf_dma_addr, GFP_KERNEL);
>> +    if (!eq->qbuf)
>> +        return -ENOMEM;
>> +
>> +    memset(eq->qbuf, 0, WARPPED_BUFSIZE(buf_size));
> ditto
<...>

>> +static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn)
>> +{
>> +    struct erdma_eq *eq = &dev->ceqs[ceqn].eq;
>> +    u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
>> +    struct erdma_cmdq_destroy_eq_req req;
>> +    int err;
>> +
>> +    dev->ceqs[ceqn].ready = 0;
>> +
>> +    erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON,
>> +                CMDQ_OPCODE_DESTROY_EQ);
>> +    /* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */
>> +    req.eqn = ceqn + 1;
>> +    req.qtype = 1;
>> +    req.vector_idx = ceqn + 1;
>> +
>> +    err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), 
>> NULL,
>> +                  NULL);
>> +    if (err)
>> +        return;
>> +
>> +    dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), 
>> eq->qbuf,
>> +              eq->qbuf_dma_addr);
> memleak maybe occurs at here when post failed.

If the command fails, it means that the EQ is still owned by the
hardware, and the queue buffer may be written by the hardware after it
is freed. I think the current implementation is better than freeing it
when the command fails.
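
As a rough sketch of that policy (the dev_warn() below is illustrative
only, not part of the patch), the buffer is released only after the
hardware has acknowledged DESTROY_EQ, and is otherwise intentionally
kept:

	err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req),
				  NULL, NULL);
	if (err) {
		/* EQ still owned by HW: keep qbuf rather than risk
		 * DMA into freed memory.
		 */
		dev_warn(&dev->pdev->dev,
			 "DESTROY_EQ failed (%d), keeping CEQ buffer\n",
			 err);
		return;
	}

	dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size),
			  eq->qbuf, eq->qbuf_dma_addr);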

Thanks,
Cheng Xu
Cheng Xu Feb. 23, 2022, 2:26 a.m. UTC | #4
On 2/22/22 3:53 PM, dust.li wrote:
> On Thu, Feb 17, 2022 at 11:01:10AM +0800, Cheng Xu wrote:
>> From: Cheng Xu <chengyou@linux.alibaba.com>
>>
>> Event queue (EQ) is the main notifcaition way from erdma hardware to its
>> driver. Each erdma device contains 2 kinds EQs: asynchronous EQ (AEQ) and
>> completion EQ (CEQ). Per device has 1 AEQ, which used for RDMA async event
>> report, and max to 32 CEQs (numbered for CEQ0 to CEQ31). CEQ0 is used for
>> cmdq completion event report, and the reset CEQs are used for RDMA
> 
> reset --> rest ?
> 

Yes, will fix it.

>> completion event report.
>>
>> Signed-off-by: Cheng Xu <chengyou@linux.alibaba.com>
>> ---
>> drivers/infiniband/hw/erdma/erdma_eq.c | 366 +++++++++++++++++++++++++
>> 1 file changed, 366 insertions(+)
>> create mode 100644 drivers/infiniband/hw/erdma/erdma_eq.c
>>
>> diff --git a/drivers/infiniband/hw/erdma/erdma_eq.c b/drivers/infiniband/hw/erdma/erdma_eq.c
>> new file mode 100644
>> index 000000000000..2a2215710e94
>> --- /dev/null
>> +++ b/drivers/infiniband/hw/erdma/erdma_eq.c
>> @@ -0,0 +1,366 @@
>> +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
>> +
>> +/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
>> +/*          Kai Shen <kaishen@linux.alibaba.com> */
>> +/* Copyright (c) 2020-2022, Alibaba Group. */
>> +
>> +#include <linux/errno.h>
>> +#include <linux/types.h>
>> +#include <linux/pci.h>
>> +
>> +#include <rdma/iw_cm.h>
>> +#include <rdma/ib_verbs.h>
>> +#include <rdma/ib_user_verbs.h>
>> +
>> +#include "erdma.h"
>> +#include "erdma_cm.h"
>> +#include "erdma_hw.h"
>> +#include "erdma_verbs.h"
>> +
>> +void notify_eq(struct erdma_eq *eq)
>> +{
>> +	u64 db_data = FIELD_PREP(ERDMA_EQDB_CI_MASK, eq->ci) |
>> +		      FIELD_PREP(ERDMA_EQDB_ARM_MASK, 1);
>> +
>> +	*eq->db_record = db_data;
>> +	writeq(db_data, eq->db_addr);
>> +
>> +	atomic64_inc(&eq->notify_num);
>> +}
>> +
>> +static void *get_eq_entry(struct erdma_eq *eq, u16 idx)
>> +{
>> +	idx &= (eq->depth - 1);
>> +
>> +	return eq->qbuf + (idx << EQE_SHIFT);
>> +}
>> +
>> +static void *get_valid_eqe(struct erdma_eq *eq)
>> +{
>> +	u64 *eqe = (u64 *)get_eq_entry(eq, eq->ci);
>> +	u64 val = READ_ONCE(*eqe);
>> +
>> +	if (FIELD_GET(ERDMA_CEQE_HDR_O_MASK, val) == eq->owner) {
>> +		dma_rmb();
>> +		eq->ci++;
>> +		if ((eq->ci & (eq->depth - 1)) == 0)
>> +			eq->owner = !eq->owner;
>> +
>> +		atomic64_inc(&eq->event_num);
>> +		return eqe;
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
>> +static int erdma_poll_aeq_event(struct erdma_eq *aeq, void *out)
>> +{
>> +	struct erdma_aeqe *aeqe;
>> +
>> +	aeqe = (struct erdma_aeqe *)get_valid_eqe(aeq);
>> +	if (aeqe && out) {
>> +		memcpy(out, aeqe, EQE_SIZE);
>> +		return 1;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +int erdma_poll_ceq_event(struct erdma_eq *ceq)
>> +{
>> +	u64 *ceqe;
>> +	u64 val;
>> +
>> +	ceqe = (u64 *)get_valid_eqe(ceq);
>> +	if (ceqe) {
>> +		val = READ_ONCE(*ceqe);
>> +		return FIELD_GET(ERDMA_CEQE_HDR_CQN_MASK, val);
>> +	}
>> +
>> +	return -1;
>> +}
> 
> Probably not a real issue. Just wonder why not use the same function
> format (i.e. return value, argument) for the above 2 erdma_poll_xxx_event()s ?
> 

Will consider this.
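
One possible shape (just a sketch, not the submitted code) would be to
let both pollers return a status and pass the result out through a
pointer argument:

	/* hypothetical unified form of the two poll helpers */
	static int erdma_poll_aeq_event(struct erdma_eq *aeq,
					struct erdma_aeqe *out)
	{
		struct erdma_aeqe *aeqe = get_valid_eqe(aeq);

		if (!aeqe || !out)
			return -EAGAIN;

		memcpy(out, aeqe, EQE_SIZE);
		return 0;
	}

	static int erdma_poll_ceq_event(struct erdma_eq *ceq, u32 *cqn)
	{
		u64 *ceqe = get_valid_eqe(ceq);

		if (!ceqe || !cqn)
			return -EAGAIN;

		*cqn = FIELD_GET(ERDMA_CEQE_HDR_CQN_MASK, READ_ONCE(*ceqe));
		return 0;
	}

Callers would then check the return value instead of the current -1 / 0
return encodings.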

Thanks,
Cheng Xu

Patch

diff --git a/drivers/infiniband/hw/erdma/erdma_eq.c b/drivers/infiniband/hw/erdma/erdma_eq.c
new file mode 100644
index 000000000000..2a2215710e94
--- /dev/null
+++ b/drivers/infiniband/hw/erdma/erdma_eq.c
@@ -0,0 +1,366 @@ 
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+
+/* Authors: Cheng Xu <chengyou@linux.alibaba.com> */
+/*          Kai Shen <kaishen@linux.alibaba.com> */
+/* Copyright (c) 2020-2022, Alibaba Group. */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "erdma.h"
+#include "erdma_cm.h"
+#include "erdma_hw.h"
+#include "erdma_verbs.h"
+
+void notify_eq(struct erdma_eq *eq)
+{
+	u64 db_data = FIELD_PREP(ERDMA_EQDB_CI_MASK, eq->ci) |
+		      FIELD_PREP(ERDMA_EQDB_ARM_MASK, 1);
+
+	*eq->db_record = db_data;
+	writeq(db_data, eq->db_addr);
+
+	atomic64_inc(&eq->notify_num);
+}
+
+static void *get_eq_entry(struct erdma_eq *eq, u16 idx)
+{
+	idx &= (eq->depth - 1);
+
+	return eq->qbuf + (idx << EQE_SHIFT);
+}
+
+static void *get_valid_eqe(struct erdma_eq *eq)
+{
+	u64 *eqe = (u64 *)get_eq_entry(eq, eq->ci);
+	u64 val = READ_ONCE(*eqe);
+
+	if (FIELD_GET(ERDMA_CEQE_HDR_O_MASK, val) == eq->owner) {
+		dma_rmb();
+		eq->ci++;
+		if ((eq->ci & (eq->depth - 1)) == 0)
+			eq->owner = !eq->owner;
+
+		atomic64_inc(&eq->event_num);
+		return eqe;
+	}
+
+	return NULL;
+}
+
+static int erdma_poll_aeq_event(struct erdma_eq *aeq, void *out)
+{
+	struct erdma_aeqe *aeqe;
+
+	aeqe = (struct erdma_aeqe *)get_valid_eqe(aeq);
+	if (aeqe && out) {
+		memcpy(out, aeqe, EQE_SIZE);
+		return 1;
+	}
+
+	return 0;
+}
+
+int erdma_poll_ceq_event(struct erdma_eq *ceq)
+{
+	u64 *ceqe;
+	u64 val;
+
+	ceqe = (u64 *)get_valid_eqe(ceq);
+	if (ceqe) {
+		val = READ_ONCE(*ceqe);
+		return FIELD_GET(ERDMA_CEQE_HDR_CQN_MASK, val);
+	}
+
+	return -1;
+}
+
+void erdma_aeq_event_handler(struct erdma_dev *dev)
+{
+	struct erdma_aeqe aeqe;
+	u32 cqn, qpn;
+	struct erdma_qp *qp;
+	struct erdma_cq *cq;
+	struct ib_event event;
+
+	memset(&event, 0, sizeof(event));
+	while (erdma_poll_aeq_event(&dev->aeq, &aeqe)) {
+		if (FIELD_GET(ERDMA_AEQE_HDR_TYPE_MASK, aeqe.hdr) ==
+		    ERDMA_AE_TYPE_CQ_ERR) {
+			cqn = aeqe.event_data0;
+			cq = find_cq_by_cqn(dev, cqn);
+			if (!cq)
+				continue;
+			event.device = cq->ibcq.device;
+			event.element.cq = &cq->ibcq;
+			event.event = IB_EVENT_CQ_ERR;
+			if (cq->ibcq.event_handler)
+				cq->ibcq.event_handler(&event,
+						       cq->ibcq.cq_context);
+		} else {
+			qpn = aeqe.event_data0;
+			qp = find_qp_by_qpn(dev, qpn);
+			if (!qp)
+				continue;
+
+			event.device = qp->ibqp.device;
+			event.element.qp = &qp->ibqp;
+			event.event = IB_EVENT_QP_FATAL;
+			if (qp->ibqp.event_handler)
+				qp->ibqp.event_handler(&event,
+						       qp->ibqp.qp_context);
+		}
+	}
+
+	notify_eq(&dev->aeq);
+}
+
+int erdma_aeq_init(struct erdma_dev *dev)
+{
+	struct erdma_eq *eq = &dev->aeq;
+	u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
+
+	eq->qbuf = dma_alloc_coherent(&dev->pdev->dev,
+				      WARPPED_BUFSIZE(buf_size),
+				      &eq->qbuf_dma_addr, GFP_KERNEL);
+	if (!eq->qbuf)
+		return -ENOMEM;
+
+	memset(eq->qbuf, 0, WARPPED_BUFSIZE(buf_size));
+
+	spin_lock_init(&eq->lock);
+	atomic64_set(&eq->event_num, 0);
+	atomic64_set(&eq->notify_num, 0);
+
+	eq->depth = ERDMA_DEFAULT_EQ_DEPTH;
+	eq->db_addr = (u64 __iomem *)(dev->func_bar + ERDMA_REGS_AEQ_DB_REG);
+	eq->db_record = (u64 *)(eq->qbuf + buf_size);
+	eq->owner = 1;
+
+	erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_H_REG,
+			  upper_32_bits(eq->qbuf_dma_addr));
+	erdma_reg_write32(dev, ERDMA_REGS_AEQ_ADDR_L_REG,
+			  lower_32_bits(eq->qbuf_dma_addr));
+	erdma_reg_write32(dev, ERDMA_REGS_AEQ_DEPTH_REG, eq->depth);
+	erdma_reg_write64(dev, ERDMA_AEQ_DB_HOST_ADDR_REG,
+			  eq->qbuf_dma_addr + buf_size);
+
+	return 0;
+}
+
+void erdma_aeq_destroy(struct erdma_dev *dev)
+{
+	struct erdma_eq *eq = &dev->aeq;
+	u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
+
+	dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), eq->qbuf,
+			  eq->qbuf_dma_addr);
+}
+
+#define MAX_POLL_CHUNK_SIZE 16
+void erdma_ceq_completion_handler(struct erdma_eq_cb *ceq_cb)
+{
+	int cqn;
+	struct erdma_cq *cq;
+	struct erdma_dev *dev = ceq_cb->dev;
+	u32 poll_cnt = 0;
+
+	if (!ceq_cb->ready)
+		return;
+
+	while ((cqn = erdma_poll_ceq_event(&ceq_cb->eq)) != -1) {
+		poll_cnt++;
+		if (cqn == 0)
+			continue;
+
+		cq = find_cq_by_cqn(dev, cqn);
+		if (!cq)
+			continue;
+
+		if (rdma_is_kernel_res(&cq->ibcq.res))
+			cq->kern_cq.cmdsn++;
+
+		if (cq->ibcq.comp_handler)
+			cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+
+		if (poll_cnt >= MAX_POLL_CHUNK_SIZE)
+			break;
+	}
+
+	notify_eq(&ceq_cb->eq);
+}
+
+static irqreturn_t erdma_intr_ceq_handler(int irq, void *data)
+{
+	struct erdma_eq_cb *ceq_cb = data;
+
+	tasklet_schedule(&ceq_cb->tasklet);
+
+	return IRQ_HANDLED;
+}
+
+static void erdma_intr_ceq_task(unsigned long data)
+{
+	erdma_ceq_completion_handler((struct erdma_eq_cb *)data);
+}
+
+static int erdma_set_ceq_irq(struct erdma_dev *dev, u16 ceqn)
+{
+	struct erdma_eq_cb *eqc = &dev->ceqs[ceqn];
+	cpumask_t affinity_hint_mask;
+	u32 cpu;
+	int err;
+
+	snprintf(eqc->irq_name, ERDMA_IRQNAME_SIZE, "erdma-ceq%u@pci:%s",
+		ceqn, pci_name(dev->pdev));
+	eqc->msix_vector = pci_irq_vector(dev->pdev, ceqn + 1);
+
+	tasklet_init(&dev->ceqs[ceqn].tasklet, erdma_intr_ceq_task,
+		     (unsigned long)&dev->ceqs[ceqn]);
+
+	cpu = cpumask_local_spread(ceqn + 1, dev->attrs.numa_node);
+	cpumask_set_cpu(cpu, &affinity_hint_mask);
+
+	err = request_irq(eqc->msix_vector, erdma_intr_ceq_handler, 0,
+			  eqc->irq_name, eqc);
+	if (err) {
+		dev_err(&dev->pdev->dev, "failed to request_irq(%d)\n", err);
+		return err;
+	}
+
+	irq_set_affinity_hint(eqc->msix_vector, &affinity_hint_mask);
+
+	return 0;
+}
+
+static void erdma_free_ceq_irq(struct erdma_dev *dev, u16 ceqn)
+{
+	struct erdma_eq_cb *eqc = &dev->ceqs[ceqn];
+
+	irq_set_affinity_hint(eqc->msix_vector, NULL);
+	free_irq(eqc->msix_vector, eqc);
+}
+
+static int create_eq_cmd(struct erdma_dev *dev, u32 eqn, struct erdma_eq *eq)
+{
+	struct erdma_cmdq_create_eq_req req;
+	dma_addr_t db_info_dma_addr;
+
+	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON,
+				CMDQ_OPCODE_CREATE_EQ);
+	req.eqn = eqn;
+	req.depth = ilog2(eq->depth);
+	req.qbuf_addr = eq->qbuf_dma_addr;
+	req.qtype = 1; /* CEQ */
+	/* Vector index is the same sa EQN. */
+	req.vector_idx = eqn;
+	db_info_dma_addr = eq->qbuf_dma_addr + (eq->depth << EQE_SHIFT);
+	req.db_dma_addr_l = lower_32_bits(db_info_dma_addr);
+	req.db_dma_addr_h = upper_32_bits(db_info_dma_addr);
+
+	return erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req,
+				   sizeof(struct erdma_cmdq_create_eq_req),
+				   NULL, NULL);
+}
+
+static int erdma_ceq_init_one(struct erdma_dev *dev, u16 ceqn)
+{
+	struct erdma_eq *eq = &dev->ceqs[ceqn].eq;
+	u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
+	int ret;
+
+	eq->qbuf = dma_alloc_coherent(&dev->pdev->dev,
+				      WARPPED_BUFSIZE(buf_size),
+				      &eq->qbuf_dma_addr, GFP_KERNEL);
+	if (!eq->qbuf)
+		return -ENOMEM;
+
+	memset(eq->qbuf, 0, WARPPED_BUFSIZE(buf_size));
+
+	spin_lock_init(&eq->lock);
+	atomic64_set(&eq->event_num, 0);
+	atomic64_set(&eq->notify_num, 0);
+
+	eq->depth = ERDMA_DEFAULT_EQ_DEPTH;
+	eq->db_addr = (u64 __iomem *)(dev->func_bar +
+				      ERDMA_REGS_CEQ_DB_BASE_REG +
+				      (ceqn + 1) * 8);
+	eq->db_record = (u64 *)(eq->qbuf + buf_size);
+	eq->ci = 0;
+	eq->owner = 1;
+	dev->ceqs[ceqn].dev = dev;
+
+	/* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */
+	ret = create_eq_cmd(dev, ceqn + 1, eq);
+	dev->ceqs[ceqn].ready = ret ? false : true;
+
+	return ret;
+}
+
+static void erdma_ceq_uninit_one(struct erdma_dev *dev, u16 ceqn)
+{
+	struct erdma_eq *eq = &dev->ceqs[ceqn].eq;
+	u32 buf_size = ERDMA_DEFAULT_EQ_DEPTH << EQE_SHIFT;
+	struct erdma_cmdq_destroy_eq_req req;
+	int err;
+
+	dev->ceqs[ceqn].ready = 0;
+
+	erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_COMMON,
+				CMDQ_OPCODE_DESTROY_EQ);
+	/* CEQ indexed from 1, 0 rsvd for CMDQ-EQ. */
+	req.eqn = ceqn + 1;
+	req.qtype = 1;
+	req.vector_idx = ceqn + 1;
+
+	err = erdma_post_cmd_wait(&dev->cmdq, (u64 *)&req, sizeof(req), NULL,
+				  NULL);
+	if (err)
+		return;
+
+	dma_free_coherent(&dev->pdev->dev, WARPPED_BUFSIZE(buf_size), eq->qbuf,
+			  eq->qbuf_dma_addr);
+}
+
+int erdma_ceqs_init(struct erdma_dev *dev)
+{
+	u32 i, j;
+	int err = 0;
+
+	for (i = 0; i < dev->attrs.irq_num - 1; i++) {
+		err = erdma_ceq_init_one(dev, i);
+		if (err)
+			goto out_err;
+
+		err = erdma_set_ceq_irq(dev, i);
+		if (err) {
+			erdma_ceq_uninit_one(dev, i);
+			goto out_err;
+		}
+	}
+
+	return 0;
+
+out_err:
+	for (j = 0; j < i; j++) {
+		erdma_free_ceq_irq(dev, j);
+		erdma_ceq_uninit_one(dev, j);
+	}
+
+	return err;
+}
+
+void erdma_ceqs_uninit(struct erdma_dev *dev)
+{
+	u32 i;
+
+	for (i = 0; i < dev->attrs.irq_num - 1; i++) {
+		erdma_free_ceq_irq(dev, i);
+		erdma_ceq_uninit_one(dev, i);
+	}
+}