
[v2,2/3] nvme-rdma: don't complete requests before a send work request has completed

Message ID 20171108100616.26605-3-sagi@grimberg.me (mailing list archive)
State Not Applicable

Commit Message

Sagi Grimberg Nov. 8, 2017, 10:06 a.m. UTC
In order to guarantee that the HCA will never get an access violation
(either from an invalidated rkey or from the IOMMU) when retrying a send
operation, we must complete a request only when both the send completion
and the NVMe CQE have arrived. We need to set the send/recv completion
flags atomically because we might have more than a single context
accessing the request concurrently (one is the CQ irq-poll context and
the other is the user polling used for IOCB_HIPRI).

Only then is it safe to invalidate the rkey (if needed), unmap
the host buffers, and complete the I/O.
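
Condensed, the completion path added by the diff below looks like this
(send side shown; the recv side mirrors it, setting resp_completed and
checking send_completed):

	spin_lock_irqsave(&req->lock, flags);
	req->send_completed = true;
	end = req->resp_completed;
	spin_unlock_irqrestore(&req->lock, flags);
	if (end)
		nvme_end_request(rq, req->cqe.status, req->cqe.result);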

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/rdma.c | 52 ++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 44 insertions(+), 8 deletions(-)

Comments

Christoph Hellwig Nov. 9, 2017, 9:21 a.m. UTC | #1
> +	}
> +
> +	spin_lock_irqsave(&req->lock, flags);
> +	req->send_completed = true;
> +	end = req->resp_completed;
> +	spin_unlock_irqrestore(&req->lock, flags);

Wouldn't it be better to use atomic bitops (test_and_set_bit and co)
for these flags instead of needing an irqsave lock?
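
For the two flags in this hunk that could look something like the sketch
below (the bit and field names are made up for illustration, and it
assumes an unsigned long flags word is added to struct nvme_rdma_request):
each path sets its own bit, checks the other, and a third bit makes sure
only one context ends the request.

	/* illustrative names, not in the patch */
	enum {
		NVME_RDMA_SEND_COMPLETED,
		NVME_RDMA_RESP_COMPLETED,
		NVME_RDMA_REQ_ENDED,
	};

	static void nvme_rdma_complete_if_done(struct nvme_rdma_request *req,
			struct request *rq, int my_bit, int other_bit)
	{
		set_bit(my_bit, &req->flags);	/* assumes a new req->flags word */
		/* order our set_bit against the other path's test_bit */
		smp_mb__after_atomic();
		if (test_bit(other_bit, &req->flags) &&
		    !test_and_set_bit(NVME_RDMA_REQ_ENDED, &req->flags))
			nvme_end_request(rq, req->cqe.status, req->cqe.result);
	}

The send path would call it with (SEND_COMPLETED, RESP_COMPLETED) and the
recv path with the arguments swapped.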

>  	err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
> -			req->mr->need_inval ? &req->reg_wr.wr : NULL);
> +			req->mr->need_inval ? &req->reg_wr.wr : NULL, true);

Looks like the unsignalled completions just removed in the last patch
are coming back here.
Sagi Grimberg Nov. 9, 2017, 11:14 a.m. UTC | #2
On 11/09/2017 11:21 AM, Christoph Hellwig wrote:
>> +	}
>> +
>> +	spin_lock_irqsave(&req->lock, flags);
>> +	req->send_completed = true;
>> +	end = req->resp_completed;
>> +	spin_unlock_irqrestore(&req->lock, flags);
> 
> Wouldn't it be better to use atomic bitops (test_and_set_bit and co)
> for these flags instead of needing an irqsave lock?

Here it would be better, but in the next patch, where we also need to
check for MR invalidation atomically, I don't know.

The logic is:
1. on RECV: complete if (send completed and MR invalidated)
2. on SEND: complete if (recv completed and MR invalidated)
3. on INV: complete if (recv completed and send completed)

We need the conditions to be atomic because the CQ can be processed
from two contexts (and trust me, it falls apart fast if we don't protect
it). Not sure I understand how I can achieve this with atomic bitops.

We could relax the spinlock to _bh I think as we are only racing with
irq-poll.
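
One lock-free shape that would cover all three orderings (a sketch only,
with made-up names; it assumes a refcount_t ref is added to struct
nvme_rdma_request): take one reference per completion the request still
expects, and end the request from whichever context drops the last one.
When the response indicates a local invalidation is still needed, the
recv path would hand its reference over to the LOCAL_INV completion
instead of dropping it itself.

	static void nvme_rdma_req_ref_init(struct nvme_rdma_request *req)
	{
		/* one ref for the send completion, one for the nvme response */
		refcount_set(&req->ref, 2);
	}

	static void nvme_rdma_req_put(struct nvme_rdma_request *req,
			struct request *rq)
	{
		/* send done, recv done (or the deferred local invalidation done) */
		if (refcount_dec_and_test(&req->ref))
			nvme_end_request(rq, req->cqe.status, req->cqe.result);
	}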

>>   	err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
>> -			req->mr->need_inval ? &req->reg_wr.wr : NULL);
>> +			req->mr->need_inval ? &req->reg_wr.wr : NULL, true);
> 
> Looks like the unsignalled completions just removed in the last patch
> are coming back here.

That's because of the async event send; not sure why it's here though,
I'll check.

Patch

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index c765f1d20141..998f7cb445d9 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -67,6 +67,10 @@  struct nvme_rdma_request {
 	struct nvme_request	req;
 	struct ib_mr		*mr;
 	struct nvme_rdma_qe	sqe;
+	struct nvme_completion	cqe;
+	spinlock_t		lock;
+	bool			send_completed;
+	bool			resp_completed;
 	struct ib_sge		sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
 	u32			num_sge;
 	int			nents;
@@ -332,6 +336,8 @@  static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
 	struct ib_device *ibdev = dev->dev;
 	int ret;
 
+	spin_lock_init(&req->lock);
+
 	ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
 			DMA_TO_DEVICE);
 	if (ret)
@@ -1154,6 +1160,8 @@  static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 	req->num_sge = 1;
 	req->inline_data = false;
 	req->mr->need_inval = false;
+	req->send_completed = false;
+	req->resp_completed = false;
 
 	c->common.flags |= NVME_CMD_SGL_METABUF;
 
@@ -1190,13 +1198,30 @@  static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 
 static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
 {
-	if (unlikely(wc->status != IB_WC_SUCCESS))
+	struct nvme_rdma_qe *qe =
+		container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
+	struct nvme_rdma_request *req =
+		container_of(qe, struct nvme_rdma_request, sqe);
+	struct request *rq = blk_mq_rq_from_pdu(req);
+	unsigned long flags;
+	bool end;
+
+	if (unlikely(wc->status != IB_WC_SUCCESS)) {
 		nvme_rdma_wr_error(cq, wc, "SEND");
+		return;
+	}
+
+	spin_lock_irqsave(&req->lock, flags);
+	req->send_completed = true;
+	end = req->resp_completed;
+	spin_unlock_irqrestore(&req->lock, flags);
+	if (end)
+		nvme_end_request(rq, req->cqe.status, req->cqe.result);
 }
 
 static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
 		struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
-		struct ib_send_wr *first)
+		struct ib_send_wr *first, bool signal)
 {
 	struct ib_send_wr wr, *bad_wr;
 	int ret;
@@ -1212,7 +1237,7 @@  static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
 	wr.sg_list    = sge;
 	wr.num_sge    = num_sge;
 	wr.opcode     = IB_WR_SEND;
-	wr.send_flags = IB_SEND_SIGNALED;
+	wr.send_flags = signal ? IB_SEND_SIGNALED : 0;
 
 	if (first)
 		first->next = &wr;
@@ -1286,7 +1311,7 @@  static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
 	ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
 			DMA_TO_DEVICE);
 
-	ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL);
+	ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false);
 	WARN_ON_ONCE(ret);
 }
 
@@ -1296,6 +1321,8 @@  static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
 	struct request *rq;
 	struct nvme_rdma_request *req;
 	int ret = 0;
+	unsigned long flags;
+	bool end;
 
 	rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id);
 	if (!rq) {
@@ -1307,14 +1334,23 @@  static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
 	}
 	req = blk_mq_rq_to_pdu(rq);
 
-	if (rq->tag == tag)
-		ret = 1;
+	req->cqe.status = cqe->status;
+	req->cqe.result = cqe->result;
 
 	if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) &&
 	    wc->ex.invalidate_rkey == req->mr->rkey)
 		req->mr->need_inval = false;
 
-	nvme_end_request(rq, cqe->status, cqe->result);
+	spin_lock_irqsave(&req->lock, flags);
+	req->resp_completed = true;
+	end = req->send_completed;
+	spin_unlock_irqrestore(&req->lock, flags);
+	if (end) {
+		if (rq->tag == tag)
+			ret = 1;
+		nvme_end_request(rq, req->cqe.status, req->cqe.result);
+	}
+
 	return ret;
 }
 
@@ -1603,7 +1639,7 @@  static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
 			sizeof(struct nvme_command), DMA_TO_DEVICE);
 
 	err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
-			req->mr->need_inval ? &req->reg_wr.wr : NULL);
+			req->mr->need_inval ? &req->reg_wr.wr : NULL, true);
 	if (unlikely(err)) {
 		nvme_rdma_unmap_data(queue, rq);
 		goto err;