From patchwork Fri Aug 5 16:24:44 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Keith Busch X-Patchwork-Id: 12937544 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id BB716C3F6B0 for ; Fri, 5 Aug 2022 16:26:06 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S240978AbiHEQ0D (ORCPT ); Fri, 5 Aug 2022 12:26:03 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:54860 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S241098AbiHEQZq (ORCPT ); Fri, 5 Aug 2022 12:25:46 -0400 Received: from mx0a-00082601.pphosted.com (mx0a-00082601.pphosted.com [67.231.145.42]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 889B974E00 for ; Fri, 5 Aug 2022 09:25:09 -0700 (PDT) Received: from pps.filterd (m0044010.ppops.net [127.0.0.1]) by mx0a-00082601.pphosted.com (8.17.1.5/8.17.1.5) with ESMTP id 275G6r25020204 for ; Fri, 5 Aug 2022 09:25:09 -0700 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=fb.com; h=from : to : cc : subject : date : message-id : in-reply-to : references : mime-version : content-transfer-encoding : content-type; s=facebook; bh=3xuF/pw3bZy187AEV8/oDAsKPl+iDEPi1Nl0uBsulMw=; b=AzyMUl4bfYkTLvrl4Duhf7OBdH/mKXrpUkghGwjk2B+tFAjn7zk/SS8njWGMj8XI+VEu c5S8iTod6jVRB4W4n4UeotyXaPwAdUDBMfrR4LB6xlX1vXo6n7ujfP7bn6Pmr+5m2dCG FsErs/t3pxTnQqtXBQ+49MyPGDISWuUwX5Q= Received: from maileast.thefacebook.com ([163.114.130.16]) by mx0a-00082601.pphosted.com (PPS) with ESMTPS id 3hs0y9t3dm-10 (version=TLSv1.2 cipher=ECDHE-RSA-AES128-GCM-SHA256 bits=128 verify=NOT) for ; Fri, 05 Aug 2022 09:25:09 -0700 Received: from twshared14818.18.frc3.facebook.com (2620:10d:c0a8:1b::d) by mail.thefacebook.com (2620:10d:c0a8:82::f) with Microsoft SMTP 
Server (version=TLS1_2, cipher=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) id 15.1.2375.28; Fri, 5 Aug 2022 09:25:02 -0700 Received: by devbig007.nao1.facebook.com (Postfix, from userid 544533) id 05C69703750F; Fri, 5 Aug 2022 09:24:46 -0700 (PDT) From: Keith Busch To: , , , CC: , , Alexander Viro , Kernel Team , Keith Busch Subject: [PATCHv3 7/7] nvme-pci: implement dma_map support Date: Fri, 5 Aug 2022 09:24:44 -0700 Message-ID: <20220805162444.3985535-8-kbusch@fb.com> X-Mailer: git-send-email 2.30.2 In-Reply-To: <20220805162444.3985535-1-kbusch@fb.com> References: <20220805162444.3985535-1-kbusch@fb.com> MIME-Version: 1.0 X-FB-Internal: Safe X-Proofpoint-ORIG-GUID: oy87CCUz8d8LGKD7W4ZfJ8oLyiD55nC7 X-Proofpoint-GUID: oy87CCUz8d8LGKD7W4ZfJ8oLyiD55nC7 X-Proofpoint-Virus-Version: vendor=baseguard engine=ICAP:2.0.205,Aquarius:18.0.883,Hydra:6.0.517,FMLib:17.11.122.1 definitions=2022-08-05_09,2022-08-05_01,2022-06-22_01 Precedence: bulk List-ID: X-Mailing-List: io-uring@vger.kernel.org From: Keith Busch Implement callbacks to convert a registered bio_vec to a prp list, and use this for each IO that uses the returned tag. This saves repeated IO conversions and dma mapping/unmapping. In many cases, the driver can skip per-IO pool allocations entirely, potentially reducing significant CPU cycles. 
Signed-off-by: Keith Busch
---
 drivers/nvme/host/pci.c | 354 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 343 insertions(+), 11 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 71a4f26ba476..d42b00c6e041 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -104,12 +104,28 @@ static bool noacpi;
 module_param(noacpi, bool, 0444);
 MODULE_PARM_DESC(noacpi, "disable acpi bios quirks");
 
+/* Index of the last __le64 slot in one NVMe controller page. */
+static const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
+
 struct nvme_dev;
 struct nvme_queue;
 
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
 static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
 
+/*
+ * A bio_vec array pre-mapped for DMA by nvme_pci_dma_map(), described as an
+ * array of PRP entries, one per NVMe controller page.
+ */
+struct nvme_dma_mapping {
+	int nr_pages;			/* entries allocated in prps */
+	u16 offset;			/* bv_offset of the first bio_vec */
+	bool needs_sync;		/* some page needs dma syncs */
+	u8 rsvd;
+	dma_addr_t prp_dma_addr;	/* DMA address of the prps array */
+	__le64 *prps;			/* PRP entries (coherent memory) */
+};
+
 /*
  * Represents an NVM Express device. Each nvme_dev is a PCI function.
  */
@@ -544,9 +560,36 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
 	return true;
 }
 
+/*
+ * Sync the pages of a premapped request for CPU access.  Called when a read
+ * completes on a mapping that needs DMA syncs.
+ */
+static void nvme_sync_dma(struct nvme_dev *dev, struct request *req,
+			  struct nvme_dma_mapping *mapping)
+{
+	int offset, i, j, length, nprps;
+
+	offset = blk_rq_dma_offset(req) + mapping->offset;
+	i = offset >> NVME_CTRL_PAGE_SHIFT;
+
+	offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
+	length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset);
+	nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
+
+	/* First page: only the part from the data offset onwards is used. */
+	dma_sync_single_for_cpu(dev->dev,
+		le64_to_cpu(mapping->prps[i++]) + offset,
+		NVME_CTRL_PAGE_SIZE - offset, DMA_FROM_DEVICE);
+	/* nprps counts the pages that follow the first one; sync them all. */
+	for (j = 0; j < nprps; j++) {
+		dma_sync_single_for_cpu(dev->dev,
+			le64_to_cpu(mapping->prps[i++]),
+			NVME_CTRL_PAGE_SIZE, DMA_FROM_DEVICE);
+	}
+}
+
 static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
 {
-	const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	dma_addr_t dma_addr = iod->first_dma;
 	int i;
@@ -576,10 +619,28 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
 	}
 }
 
+/*
+ * Free the PRP/SGL descriptor memory of a request and give iod->sg back to
+ * the iod mempool.  The data itself is not DMA-unmapped here.
+ */
+static void nvme_free_prp_chain(struct nvme_dev *dev, struct request *req,
+				struct nvme_iod *iod)
+{
+	if (iod->npages == 0)
+		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
+			      iod->first_dma);
+	else if (iod->use_sgl)
+		nvme_free_sgls(dev, req);
+	else
+		nvme_free_prps(dev, req);
+	mempool_free(iod->sg, dev->iod_mempool);
+}
+
 static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 
+	WARN_ON_ONCE(!iod->nents);
 	if (is_pci_p2pdma_page(sg_page(iod->sg)))
 		pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
 				    rq_dma_dir(req));
@@ -589,25 +650,25 @@ static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
 
 static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 {
+	struct nvme_dma_mapping *mapping = blk_rq_dma_tag(req);
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 
+	if (mapping) {
+		if (mapping->needs_sync && rq_data_dir(req) == READ)
+			nvme_sync_dma(dev, req, mapping);
+		if (iod->npages >= 0)
+			nvme_free_prp_chain(dev, req, iod);
+		return;
+	}
+
 	if (iod->dma_len) {
 		dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
 			       rq_dma_dir(req));
 		return;
 	}
 
-	WARN_ON_ONCE(!iod->nents);
-
 	nvme_unmap_sg(dev, req);
-	if (iod->npages == 0)
-		dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
-			      iod->first_dma);
-	else if (iod->use_sgl)
-		nvme_free_sgls(dev, req);
-	else
-		nvme_free_prps(dev, req);
-	mempool_free(iod->sg, dev->iod_mempool);
+	nvme_free_prp_chain(dev, req, iod);
 }
 
 static void nvme_print_sgl(struct scatterlist *sgl, int nents)
@@ -835,13 +896,159 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
 	return BLK_STS_OK;
 }
 
+/*
+ * Slow path for premapped requests: copy PRP entries out of the mapping into
+ * PRP list pages allocated from the dma pools, chaining the pages together
+ * when more than one is needed.
+ *
+ * @i:     index in mapping->prps of the first entry to copy
+ * @nprps: number of entries to copy
+ */
+static blk_status_t nvme_premapped_slow(struct nvme_dev *dev,
+				struct request *req, struct nvme_iod *iod,
+				struct nvme_dma_mapping *mapping, int i,
+				int nprps)
+{
+	struct dma_pool *pool;
+	dma_addr_t prp_dma;
+	__le64 *prp_list;
+	void **list;
+
+	iod->sg = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
+	if (!iod->sg)
+		return BLK_STS_RESOURCE;
+
+	if (nprps <= (256 / 8)) {
+		pool = dev->prp_small_pool;
+		iod->npages = 0;
+	} else {
+		pool = dev->prp_page_pool;
+		iod->npages = 1;
+	}
+
+	prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
+	if (!prp_list) {
+		iod->npages = -1;
+		goto out_free_sg;
+	}
+
+	list = nvme_pci_iod_list(req);
+	list[0] = prp_list;
+	iod->first_dma = prp_dma;
+
+	for (;;) {
+		dma_addr_t next_prp_dma;
+		__le64 *next_prp_list;
+
+		if (nprps <= last_prp + 1) {
+			memcpy(prp_list, &mapping->prps[i], nprps * 8);
+			break;
+		}
+
+		memcpy(prp_list, &mapping->prps[i], NVME_CTRL_PAGE_SIZE - 8);
+		nprps -= last_prp;
+		i += last_prp;
+
+		next_prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &next_prp_dma);
+		if (!next_prp_list)
+			goto free_prps;
+
+		prp_list[last_prp] = cpu_to_le64(next_prp_dma);
+		prp_list = next_prp_list;
+		prp_dma = next_prp_dma;
+		list[iod->npages++] = prp_list;
+	}
+	return BLK_STS_OK;
+
+free_prps:
+	nvme_free_prps(dev, req);
+out_free_sg:
+	mempool_free(iod->sg, dev->iod_mempool);
+	return BLK_STS_RESOURCE;
+}
+
+/*
+ * Set up PRP1/PRP2 for a request on a buffer premapped by nvme_pci_dma_map().
+ * When the remaining PRP entries sit within one NVMe page of the mapping's
+ * PRP array they are referenced in place; otherwise a private PRP list is
+ * built by nvme_premapped_slow().
+ */
+static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
+				   struct nvme_dma_mapping *mapping,
+				   struct nvme_rw_command *cmnd,
+				   struct nvme_iod *iod)
+{
+	bool needs_sync = mapping->needs_sync && rq_data_dir(req) == WRITE;
+	dma_addr_t prp_list_start, prp_list_end;
+	int i, offset, j, length, nprps;
+	blk_status_t ret;
+
+	offset = blk_rq_dma_offset(req) + mapping->offset;
+	i = offset >> NVME_CTRL_PAGE_SHIFT;
+	offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
+
+	if (needs_sync)
+		dma_sync_single_for_device(dev->dev,
+			le64_to_cpu(mapping->prps[i]) + offset,
+			NVME_CTRL_PAGE_SIZE - offset, DMA_TO_DEVICE);
+
+	cmnd->dptr.prp1 = cpu_to_le64(le64_to_cpu(mapping->prps[i++]) + offset);
+
+	length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset);
+	if (length <= 0)
+		return BLK_STS_OK;
+
+	if (length <= NVME_CTRL_PAGE_SIZE) {
+		if (needs_sync)
+			dma_sync_single_for_device(dev->dev,
+				le64_to_cpu(mapping->prps[i]),
+				NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE);
+		cmnd->dptr.prp2 = mapping->prps[i];
+		return BLK_STS_OK;
+	}
+
+	nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
+	prp_list_start = mapping->prp_dma_addr + 8 * i;
+	prp_list_end = prp_list_start + 8 * nprps;
+
+	/* Optimization when remaining list fits in one nvme page */
+	if ((prp_list_start >> NVME_CTRL_PAGE_SHIFT) ==
+	    (prp_list_end >> NVME_CTRL_PAGE_SHIFT)) {
+		cmnd->dptr.prp2 = cpu_to_le64(prp_list_start);
+		goto sync;
+	}
+
+	ret = nvme_premapped_slow(dev, req, iod, mapping, i, nprps);
+	if (ret != BLK_STS_OK)
+		return ret;
+
+	cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
+sync:
+	if (!needs_sync)
+		return BLK_STS_OK;
+
+	/* i still indexes the first entry that follows PRP1 */
+	for (j = 0; j < nprps; j++)
+		dma_sync_single_for_device(dev->dev,
+			le64_to_cpu(mapping->prps[i++]),
+			NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE);
+	return BLK_STS_OK;
+}
+
 static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
 		struct nvme_command *cmnd)
 {
+	struct nvme_dma_mapping *mapping = blk_rq_dma_tag(req);
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
 	blk_status_t ret = BLK_STS_RESOURCE;
 	int nr_mapped;
 
+	if (mapping) {
+		iod->dma_len = 0;
+		iod->use_sgl = false;
+		return nvme_premapped(dev, req, mapping, &cmnd->rw, iod);
+	}
+
 	if (blk_rq_nr_phys_segments(req) == 1) {
 		struct bio_vec bv = req_bvec(req);
 
@@ -1732,6 +1939,127 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
 	return result;
 }
 
+#ifdef CONFIG_HAS_DMA
+/*
+ * Map @nr_vecs bio_vecs for DMA and build the matching array of PRP entries.
+ * Returns the new mapping, or an ERR_PTR() on failure.
+ *
+ * Important: bvec must be describing a virtually contiguous buffer.
+ */
+static void *nvme_pci_dma_map(struct request_queue *q,
+			       struct bio_vec *bvec, int nr_vecs)
+{
+	const int nvme_pages = 1 << (PAGE_SHIFT - NVME_CTRL_PAGE_SHIFT);
+	struct nvme_ns *ns = q->queuedata;
+	struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
+	struct nvme_dma_mapping *mapping;
+	int i, j, k, size, ppv, ret = -ENOMEM;
+
+	if (!nr_vecs)
+		return ERR_PTR(-EINVAL);
+
+	mapping = kzalloc(sizeof(*mapping), GFP_KERNEL);
+	if (!mapping)
+		return ERR_PTR(-ENOMEM);
+
+	mapping->nr_pages = nr_vecs * nvme_pages;
+	size = sizeof(*mapping->prps) * mapping->nr_pages;
+	mapping->prps = dma_alloc_coherent(dev->dev, size,
+				&mapping->prp_dma_addr, GFP_KERNEL);
+	if (!mapping->prps)
+		goto free_mapping;
+
+	mapping->needs_sync = false;
+	for (i = 0, k = 0; i < nr_vecs; i++) {
+		struct bio_vec *bv = bvec + i;
+		dma_addr_t dma_addr;
+
+		ppv = nvme_pages;
+		if (i == 0) {
+			mapping->offset = bv->bv_offset;
+			ppv -= mapping->offset >> NVME_CTRL_PAGE_SHIFT;
+		} else if (bv->bv_offset) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		if (bv->bv_offset + bv->bv_len != PAGE_SIZE &&
+		    i < nr_vecs - 1) {
+			ret = -EINVAL;
+			goto err;
+		}
+
+		dma_addr = dma_map_bvec(dev->dev, bv, 0, 0);
+		if (dma_mapping_error(dev->dev, dma_addr)) {
+			ret = -EIO;
+			goto err;
+		}
+
+		if (i == 0)
+			dma_addr -= mapping->offset;
+
+		if (dma_need_sync(dev->dev, dma_addr))
+			mapping->needs_sync = true;
+
+		for (j = 0; j < ppv; j++)
+			mapping->prps[k++] = cpu_to_le64(dma_addr +
+						j * NVME_CTRL_PAGE_SIZE);
+	}
+
+	get_device(dev->dev);
+	return mapping;
+
+err:
+	for (i = 0; i < k; i += ppv) {
+		__u64 dma_addr = le64_to_cpu(mapping->prps[i]);
+		ppv = nvme_pages;
+
+		if (i == 0)
+			ppv -= mapping->offset >> NVME_CTRL_PAGE_SHIFT;
+		dma_unmap_page(dev->dev, dma_addr,
+			       PAGE_SIZE - offset_in_page(dma_addr), 0);
+	}
+
+	dma_free_coherent(dev->dev, size, (void *)mapping->prps,
+			  mapping->prp_dma_addr);
+free_mapping:
+	kfree(mapping);
+	return ERR_PTR(ret);
+}
+
+/*
+ * Undo nvme_pci_dma_map(): unmap every page, free the PRP array and the
+ * mapping, and drop the device reference taken at map time.
+ *
+ * NOTE(review): the first PRP entry had the bvec offset subtracted at map
+ * time, so the address and length passed to dma_unmap_page() differ from the
+ * ones that were mapped when that offset is non-zero -- confirm.
+ */
+static void nvme_pci_dma_unmap(struct request_queue *q, void *dma_tag)
+{
+	const int nvme_pages = 1 << (PAGE_SHIFT - NVME_CTRL_PAGE_SHIFT);
+	struct nvme_ns *ns = q->queuedata;
+	struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
+	struct nvme_dma_mapping *mapping = dma_tag;
+	int i, ppv;
+
+	for (i = 0; i < mapping->nr_pages; i += ppv) {
+		__u64 dma_addr = le64_to_cpu(mapping->prps[i]);
+		ppv = nvme_pages;
+
+		if (i == 0)
+			ppv -= mapping->offset >> NVME_CTRL_PAGE_SHIFT;
+		dma_unmap_page(dev->dev, dma_addr,
+			       PAGE_SIZE - offset_in_page(dma_addr), 0);
+	}
+
+	dma_free_coherent(dev->dev, mapping->nr_pages * sizeof(*mapping->prps),
+			  (void *)mapping->prps, mapping->prp_dma_addr);
+	kfree(mapping);
+	put_device(dev->dev);
+}
+#endif
+
 static const struct blk_mq_ops nvme_mq_admin_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.complete	= nvme_pci_complete_rq,
@@ -1750,6 +2078,10 @@ static const struct blk_mq_ops nvme_mq_ops = {
 	.map_queues	= nvme_pci_map_queues,
 	.timeout	= nvme_timeout,
 	.poll		= nvme_poll,
+#ifdef CONFIG_HAS_DMA
+	.dma_map	= nvme_pci_dma_map,
+	.dma_unmap	= nvme_pci_dma_unmap,
+#endif
 };
 
 static void nvme_dev_remove_admin(struct nvme_dev *dev)