
[6/9] nvme: add support for batched completion of polled IO

Message ID 20211013165416.985696-7-axboe@kernel.dk (mailing list archive)
State New, archived
Series Batched completions

Commit Message

Jens Axboe Oct. 13, 2021, 4:54 p.m. UTC
Take advantage of struct io_batch, if passed in to the nvme poll handler.
If it's set, rather than completing each request individually inline, store
the requests in the io_batch list. We only do so for requests that will
complete successfully; anything else is completed inline as before.

Add an mq_ops->complete_batch() handler to do the post-processing of
the io_batch list once polling is complete.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 drivers/nvme/host/core.c | 18 +++++++++++---
 drivers/nvme/host/nvme.h | 17 +++++++++++++
 drivers/nvme/host/pci.c  | 54 +++++++++++++++++++++++++++++++++++-----
 3 files changed, 80 insertions(+), 9 deletions(-)
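
For illustration, a minimal sketch of how a polling caller could drive this
interface (assuming the struct io_batch and rq_list helpers introduced earlier
in the series; q, hctx and should_stop() below are illustrative names, and
whether the caller invokes ->complete_batch() directly is an assumption, not
something this patch defines):

	/*
	 * Illustrative caller-side flow, not code from this series: poll with
	 * a batch on the stack, then hand everything the driver deferred to
	 * the new ->complete_batch() handler in one go.
	 */
	struct io_batch iob = { .req_list = NULL };
	int found;

	do {
		found = q->mq_ops->poll(hctx, &iob);
	} while (!found && !should_stop());

	if (iob.req_list)
		q->mq_ops->complete_batch(&iob);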

Comments

Christoph Hellwig Oct. 14, 2021, 7:43 a.m. UTC | #1
On Wed, Oct 13, 2021 at 10:54:13AM -0600, Jens Axboe wrote:
> +void nvme_complete_batch_req(struct request *req)
> +{
> +	nvme_cleanup_cmd(req);
> +	nvme_end_req_zoned(req);
> +	req->status = BLK_STS_OK;
> +}
> +EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
> +

I'd be tempted to just merge this helper into the only caller.
nvme_cleanup_cmd is exported anyway, so this would just add an export
for nvme_end_req_zoned.
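
Spelled out, the suggested folding would look roughly like this (a sketch of
the suggestion only, not code from the series; nvme_end_req_zoned() would
additionally need an EXPORT_SYMBOL_GPL since nvme_complete_batch() lives in a
header):

	static __always_inline void nvme_complete_batch(struct io_batch *iob,
							void (*fn)(struct request *rq))
	{
		struct request *req;

		req = rq_list_peek(&iob->req_list);
		while (req) {
			fn(req);
			/* former nvme_complete_batch_req(), folded in */
			nvme_cleanup_cmd(req);
			nvme_end_req_zoned(req);
			req->status = BLK_STS_OK;
			req = rq_list_next(req);
		}

		blk_mq_end_request_batch(iob);
	}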

> +static __always_inline void nvme_complete_batch(struct io_batch *iob,
> +						void (*fn)(struct request *rq))
> +{
> +	struct request *req;
> +
> +	req = rq_list_peek(&iob->req_list);
> +	while (req) {
> +		fn(req);
> +		nvme_complete_batch_req(req);
> +		req = rq_list_next(req);
> +	}
> +
> +	blk_mq_end_request_batch(iob);

Can we turn this into a normal for loop?

	for (req = rq_list_peek(&iob->req_list); req; req = rq_list_next(req)) {
		..
	}
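
With that, the walk in nvme_complete_batch() would become (same behaviour,
just restructured; shown here only as a sketch of the suggestion):

	struct request *req;

	for (req = rq_list_peek(&iob->req_list); req; req = rq_list_next(req)) {
		fn(req);
		nvme_complete_batch_req(req);
	}

	blk_mq_end_request_batch(iob);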

> +	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
> +		/*
> +		 * Do normal inline completion if we don't have a batch
> +		 * list, if we have an end_io handler, or if the status of
> +		 * the request isn't just normal success.
> +		 */
> +		if (!iob || req->end_io || nvme_req(req)->status)
> +			nvme_pci_complete_rq(req);
> +		else
> +			rq_list_add_tail(&iob->req_list, req);
> +	}

The check for the conditions where we can or cannot batch complete
really should go into a block layer helper.  Something like the
incremental patch below:

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ce69e9666caac..57bef8229bfab 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1034,17 +1034,9 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
 	}
 
 	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
-	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
-		/*
-		 * Do normal inline completion if we don't have a batch
-		 * list, if we have an end_io handler, or if the status of
-		 * the request isn't just normal success.
-		 */
-		if (!iob || req->end_io || nvme_req(req)->status)
-			nvme_pci_complete_rq(req);
-		else
-			rq_list_add_tail(&iob->req_list, req);
-	}
+	if (!nvme_try_complete_req(req, cqe->status, cqe->result) &&
+	    !blk_mq_add_to_batch(req, iob, nvme_req(req)->status))
+		nvme_pci_complete_rq(req);
 }
 
 static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index aea7d866a34c6..383d887e32f6d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -773,6 +773,19 @@ void blk_mq_end_request(struct request *rq, blk_status_t error);
 void __blk_mq_end_request(struct request *rq, blk_status_t error);
 void blk_mq_end_request_batch(struct io_batch *ib);
 
+/*
+ * Batched completions only work when there is no I/O error and no special
+ * ->end_io handler.
+ */
+static inline bool blk_mq_add_to_batch(struct request *req,
+		 struct io_batch *iob, bool ioerror)
+{
+	if (!iob || req->end_io || ioerror)
+		return false;
+	rq_list_add_tail(&iob->req_list, req);
+	return true;
+}
+
 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
 void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
Jens Axboe Oct. 14, 2021, 3:30 p.m. UTC | #2
On 10/14/21 1:43 AM, Christoph Hellwig wrote:
> On Wed, Oct 13, 2021 at 10:54:13AM -0600, Jens Axboe wrote:
>> +void nvme_complete_batch_req(struct request *req)
>> +{
>> +	nvme_cleanup_cmd(req);
>> +	nvme_end_req_zoned(req);
>> +	req->status = BLK_STS_OK;
>> +}
>> +EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
>> +
> 
> I'd be tempted to just merge this helper into the only caller.
> nvme_cleanup_cmd is exported anyway, so this would just add an export
> for nvme_end_req_zoned.

Sure, I can do that.

>> +static __always_inline void nvme_complete_batch(struct io_batch *iob,
>> +						void (*fn)(struct request *rq))
>> +{
>> +	struct request *req;
>> +
>> +	req = rq_list_peek(&iob->req_list);
>> +	while (req) {
>> +		fn(req);
>> +		nvme_complete_batch_req(req);
>> +		req = rq_list_next(req);
>> +	}
>> +
>> +	blk_mq_end_request_batch(iob);
> 
> Can we turn this into a normal for loop?
> 
> 	for (req = rq_list_peek(&iob->req_list); req; req = rq_list_next(req)) {
> 		..
> 	}

If you prefer it that way for nvme, for me the while () setup is much
easier to read than a really long for line.

>> +	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
>> +		/*
>> +		 * Do normal inline completion if we don't have a batch
>> +		 * list, if we have an end_io handler, or if the status of
>> +		 * the request isn't just normal success.
>> +		 */
>> +		if (!iob || req->end_io || nvme_req(req)->status)
>> +			nvme_pci_complete_rq(req);
>> +		else
>> +			rq_list_add_tail(&iob->req_list, req);
>> +	}
> 
> The check for the conditions where we can or cannot batch complete
> really should go into a block layer helper.  Something like the
> incremental patch below:

That's a good idea, I'll add that.
Jens Axboe Oct. 14, 2021, 3:34 p.m. UTC | #3
On 10/14/21 9:30 AM, Jens Axboe wrote:
> On 10/14/21 1:43 AM, Christoph Hellwig wrote:
>> On Wed, Oct 13, 2021 at 10:54:13AM -0600, Jens Axboe wrote:
>>> +void nvme_complete_batch_req(struct request *req)
>>> +{
>>> +	nvme_cleanup_cmd(req);
>>> +	nvme_end_req_zoned(req);
>>> +	req->status = BLK_STS_OK;
>>> +}
>>> +EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
>>> +
>>
>> I'd be tempted to just merge this helper into the only caller.
>> nvme_cleanup_cmd is exported anyway, so this would just add an export
>> for nvme_end_req_zoned.
> 
> Sure, I can do that.

That'll turn it into two calls from the batch completion though, so I
skipped this change.
Christoph Hellwig Oct. 14, 2021, 4:07 p.m. UTC | #4
On Thu, Oct 14, 2021 at 09:30:57AM -0600, Jens Axboe wrote:
> > Can we turn this into a normal for loop?
> > 
> > 	for (req = rq_list_peek(&iob->req_list); req; req = rq_list_next(req)) {
> > 		..
> > 	}
> 
> If you prefer it that way for nvme, for me the while () setup is much
> easier to read than a really long for line.

I prefer the for loop over the while loop.  My real preference would be
a helper macro and do:

	for_each_rq(req, &iob->req_list) {

as suggested last round.
Jens Axboe Oct. 14, 2021, 4:11 p.m. UTC | #5
On 10/14/21 10:07 AM, Christoph Hellwig wrote:
> On Thu, Oct 14, 2021 at 09:30:57AM -0600, Jens Axboe wrote:
>>> Can we turn this into a normal for loop?
>>>
>>> 	for (req = rq_list_peek(&iob->req_list); req; req = rq_list_next(req)) {
>>> 		..
>>> 	}
>>
>> If you prefer it that way for nvme, for me the while () setup is much
>> easier to read than a really long for line.
> 
> I prefer the for loop over the while loop.  My real preference would be
> a helper macro and do:
> 
> 	for_each_rq(req, &iob->req_list) {
> 
> as suggested last round.

Sure, I can turn it into a helper and use that.
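
One possible shape for that helper, with the macro name taken from the
suggestion above (the definition itself is an assumption, not code from this
series):

	#define for_each_rq(pos, listptr) \
		for (pos = rq_list_peek(listptr); pos; pos = rq_list_next(pos))

	static __always_inline void nvme_complete_batch(struct io_batch *iob,
							void (*fn)(struct request *rq))
	{
		struct request *req;

		for_each_rq(req, &iob->req_list) {
			fn(req);
			nvme_complete_batch_req(req);
		}
		blk_mq_end_request_batch(iob);
	}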

Patch

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c2c2e8545292..4b14258a3bac 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -346,15 +346,19 @@  static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
 	return RETRY;
 }
 
-static inline void nvme_end_req(struct request *req)
+static inline void nvme_end_req_zoned(struct request *req)
 {
-	blk_status_t status = nvme_error_status(nvme_req(req)->status);
-
 	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
 	    req_op(req) == REQ_OP_ZONE_APPEND)
 		req->__sector = nvme_lba_to_sect(req->q->queuedata,
 			le64_to_cpu(nvme_req(req)->result.u64));
+}
+
+static inline void nvme_end_req(struct request *req)
+{
+	blk_status_t status = nvme_error_status(nvme_req(req)->status);
 
+	nvme_end_req_zoned(req);
 	nvme_trace_bio_complete(req);
 	blk_mq_end_request(req, status);
 }
@@ -381,6 +385,14 @@  void nvme_complete_rq(struct request *req)
 }
 EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
+void nvme_complete_batch_req(struct request *req)
+{
+	nvme_cleanup_cmd(req);
+	nvme_end_req_zoned(req);
+	req->status = BLK_STS_OK;
+}
+EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
+
 /*
  * Called to unwind from ->queue_rq on a failed command submission so that the
  * multipathing code gets called to potentially failover to another path.
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ed79a6c7e804..e0c079f704cf 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -638,6 +638,23 @@  static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 }
 
 void nvme_complete_rq(struct request *req);
+void nvme_complete_batch_req(struct request *req);
+
+static __always_inline void nvme_complete_batch(struct io_batch *iob,
+						void (*fn)(struct request *rq))
+{
+	struct request *req;
+
+	req = rq_list_peek(&iob->req_list);
+	while (req) {
+		fn(req);
+		nvme_complete_batch_req(req);
+		req = rq_list_next(req);
+	}
+
+	blk_mq_end_request_batch(iob);
+}
+
 blk_status_t nvme_host_path_error(struct request *req);
 bool nvme_cancel_request(struct request *req, void *data, bool reserved);
 void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9db6e23f41ef..ae253f6f5c80 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -959,7 +959,7 @@  static blk_status_t nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
-static void nvme_pci_complete_rq(struct request *req)
+static __always_inline void nvme_pci_unmap_rq(struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	struct nvme_dev *dev = iod->nvmeq->dev;
@@ -969,9 +969,19 @@  static void nvme_pci_complete_rq(struct request *req)
 			       rq_integrity_vec(req)->bv_len, rq_data_dir(req));
 	if (blk_rq_nr_phys_segments(req))
 		nvme_unmap_data(dev, req);
+}
+
+static void nvme_pci_complete_rq(struct request *req)
+{
+	nvme_pci_unmap_rq(req);
 	nvme_complete_rq(req);
 }
 
+static void nvme_pci_complete_batch(struct io_batch *iob)
+{
+	nvme_complete_batch(iob, nvme_pci_unmap_rq);
+}
+
 /* We read the CQE phase first to check if the rest of the entry is valid */
 static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
 {
@@ -996,7 +1006,8 @@  static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
 	return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
 }
 
-static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
+static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
+				   struct io_batch *iob, u16 idx)
 {
 	struct nvme_completion *cqe = &nvmeq->cqes[idx];
 	__u16 command_id = READ_ONCE(cqe->command_id);
@@ -1023,8 +1034,17 @@  static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
 	}
 
 	trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
-	if (!nvme_try_complete_req(req, cqe->status, cqe->result))
-		nvme_pci_complete_rq(req);
+	if (!nvme_try_complete_req(req, cqe->status, cqe->result)) {
+		/*
+		 * Do normal inline completion if we don't have a batch
+		 * list, if we have an end_io handler, or if the status of
+		 * the request isn't just normal success.
+		 */
+		if (!iob || req->end_io || nvme_req(req)->status)
+			nvme_pci_complete_rq(req);
+		else
+			rq_list_add_tail(&iob->req_list, req);
+	}
 }
 
 static inline void nvme_update_cq_head(struct nvme_queue *nvmeq)
@@ -1050,7 +1070,7 @@  static inline int nvme_process_cq(struct nvme_queue *nvmeq)
 		 * the cqe requires a full read memory barrier
 		 */
 		dma_rmb();
-		nvme_handle_cqe(nvmeq, nvmeq->cq_head);
+		nvme_handle_cqe(nvmeq, NULL, nvmeq->cq_head);
 		nvme_update_cq_head(nvmeq);
 	}
 
@@ -1092,6 +1112,27 @@  static void nvme_poll_irqdisable(struct nvme_queue *nvmeq)
 	enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
 }
 
+static inline int nvme_poll_cq(struct nvme_queue *nvmeq, struct io_batch *iob)
+{
+	int found = 0;
+
+	while (nvme_cqe_pending(nvmeq)) {
+		found++;
+		/*
+		 * load-load control dependency between phase and the rest of
+		 * the cqe requires a full read memory barrier
+		 */
+		dma_rmb();
+		nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head);
+		nvme_update_cq_head(nvmeq);
+	}
+
+	if (found)
+		nvme_ring_cq_doorbell(nvmeq);
+	return found;
+}
+
+
 static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *iob)
 {
 	struct nvme_queue *nvmeq = hctx->driver_data;
@@ -1101,7 +1142,7 @@  static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *iob)
 		return 0;
 
 	spin_lock(&nvmeq->cq_poll_lock);
-	found = nvme_process_cq(nvmeq);
+	found = nvme_poll_cq(nvmeq, iob);
 	spin_unlock(&nvmeq->cq_poll_lock);
 
 	return found;
@@ -1639,6 +1680,7 @@  static const struct blk_mq_ops nvme_mq_admin_ops = {
 static const struct blk_mq_ops nvme_mq_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.complete	= nvme_pci_complete_rq,
+	.complete_batch = nvme_pci_complete_batch,
 	.commit_rqs	= nvme_commit_rqs,
 	.init_hctx	= nvme_init_hctx,
 	.init_request	= nvme_init_request,