Message ID | 20211013165416.985696-10-axboe@kernel.dk (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Batched completions | expand |
Couldn't we do something like the incremental patch below to avoid duplicating the CQ processing? With that this patch could probably go away entirely and we could fold the change into the other nvme patch. diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 061f0b1cb0ec3..ce69e9666caac 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1059,9 +1059,8 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) } } -static inline int nvme_process_cq(struct nvme_queue *nvmeq) +static inline int nvme_poll_cq(struct nvme_queue *nvmeq, struct io_batch *iob) { - DEFINE_IO_BATCH(iob); int found = 0; while (nvme_cqe_pending(nvmeq)) { @@ -1071,15 +1070,23 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq) * the cqe requires a full read memory barrier */ dma_rmb(); - nvme_handle_cqe(nvmeq, &iob, nvmeq->cq_head); + nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head); nvme_update_cq_head(nvmeq); } - if (found) { - if (iob.req_list) - nvme_pci_complete_batch(&iob); + if (found) nvme_ring_cq_doorbell(nvmeq); - } + return found; +} + +static inline int nvme_process_cq(struct nvme_queue *nvmeq) +{ + DEFINE_IO_BATCH(iob); + int found; + + found = nvme_poll_cq(nvmeq, &iob); + if (iob.req_list) + nvme_pci_complete_batch(&iob); return found; } @@ -1116,27 +1123,6 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq) enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); } -static inline int nvme_poll_cq(struct nvme_queue *nvmeq, struct io_batch *iob) -{ - int found = 0; - - while (nvme_cqe_pending(nvmeq)) { - found++; - /* - * load-load control dependency between phase and the rest of - * the cqe requires a full read memory barrier - */ - dma_rmb(); - nvme_handle_cqe(nvmeq, iob, nvmeq->cq_head); - nvme_update_cq_head(nvmeq); - } - - if (found) - nvme_ring_cq_doorbell(nvmeq); - return found; -} - - static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_batch *iob) { struct nvme_queue *nvmeq = hctx->driver_data;
On 10/14/21 1:53 AM, Christoph Hellwig wrote: > Couldn't we do something like the incremental patch below to avoid > duplicating the CQ processing? With that this patch could probably go > away entirely and we could fold the change into the other nvme patch. Yeah should probably go into the first patch in some shape or form, I'll take a look.
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ae253f6f5c80..061f0b1cb0ec 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1061,6 +1061,7 @@ static inline void nvme_update_cq_head(struct nvme_queue *nvmeq) static inline int nvme_process_cq(struct nvme_queue *nvmeq) { + DEFINE_IO_BATCH(iob); int found = 0; while (nvme_cqe_pending(nvmeq)) { @@ -1070,12 +1071,15 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq) * the cqe requires a full read memory barrier */ dma_rmb(); - nvme_handle_cqe(nvmeq, NULL, nvmeq->cq_head); + nvme_handle_cqe(nvmeq, &iob, nvmeq->cq_head); nvme_update_cq_head(nvmeq); } - if (found) + if (found) { + if (iob.req_list) + nvme_pci_complete_batch(&iob); nvme_ring_cq_doorbell(nvmeq); + } return found; }
Trivial to do now, just need our own io_batch on the stack and pass that in to the usual command completion handling. I pondered making this dependent on how many entries we had to process, but even for a single entry there's no discernible difference in performance or latency. Running a sync workload over io_uring: t/io_uring -b512 -d1 -s1 -c1 -p0 -F1 -B1 -n2 /dev/nvme1n1 /dev/nvme2n1 yields the below performance before the patch: IOPS=254820, BW=124MiB/s, IOS/call=1/1, inflight=(1 1) IOPS=251174, BW=122MiB/s, IOS/call=1/1, inflight=(1 1) IOPS=250806, BW=122MiB/s, IOS/call=1/1, inflight=(1 1) and the following after: IOPS=255972, BW=124MiB/s, IOS/call=1/1, inflight=(1 1) IOPS=251920, BW=123MiB/s, IOS/call=1/1, inflight=(1 1) IOPS=251794, BW=122MiB/s, IOS/call=1/1, inflight=(1 1) which definitely isn't slower, about the same if you factor in a bit of variance. For peak performance workloads, benchmarking shows a 2% improvement. Signed-off-by: Jens Axboe <axboe@kernel.dk> --- drivers/nvme/host/pci.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)