From patchwork Tue Jul 26 17:38:10 2022
X-Patchwork-Submitter: Keith Busch
X-Patchwork-Id: 12929592
X-Mailing-List: io-uring@vger.kernel.org
From: Keith Busch
Subject: [PATCH 1/5] blk-mq: add ops to dma map bvec
Date: Tue, 26 Jul 2022 10:38:10 -0700
Message-ID: <20220726173814.2264573-2-kbusch@fb.com>
In-Reply-To: <20220726173814.2264573-1-kbusch@fb.com>
References: <20220726173814.2264573-1-kbusch@fb.com>

The same buffer may be used for many subsequent IOs. Instead of setting
up the mapping on every IO, provide an interface that allows a buffer to
be premapped just once and referenced again later.
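As a rough sketch of the intended driver side (not part of this patch;
the mydrv_* names and helpers are hypothetical), a blk-mq driver wires
the new callbacks into its blk_mq_ops and hands back an opaque cookie:

    /* Hypothetical driver sketch: premap the bvecs once, return a cookie. */
    static void *mydrv_dma_map(struct request_queue *q,
                               struct bio_vec *bvec, int nr_vecs)
    {
            /*
             * mydrv_build_mapping() is an assumed helper that dma-maps the
             * bvecs and records the addresses in a driver-private structure.
             */
            struct mydrv_mapping *map = mydrv_build_mapping(q, bvec, nr_vecs);

            if (!map)
                    return ERR_PTR(-ENOMEM);
            return map;     /* opaque dma_tag as seen by the block layer */
    }

    static void mydrv_dma_unmap(struct request_queue *q, void *dma_tag)
    {
            mydrv_destroy_mapping(q, dma_tag);      /* assumed helper */
    }

    static const struct blk_mq_ops mydrv_mq_ops = {
            .queue_rq       = mydrv_queue_rq,
    #ifdef CONFIG_HAS_DMA
            .dma_map        = mydrv_dma_map,
            .dma_unmap      = mydrv_dma_unmap,
    #endif
    };

Callers reach these through the block_dma_map()/block_dma_unmap()
wrappers added below rather than touching mq_ops directly.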
Signed-off-by: Keith Busch
---
 block/bdev.c           | 20 ++++++++++++++++++++
 include/linux/blk-mq.h | 13 +++++++++++++
 include/linux/blkdev.h | 16 ++++++++++++++++
 3 files changed, 49 insertions(+)

diff --git a/block/bdev.c b/block/bdev.c
index ce05175e71ce..345176e12bb1 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -1069,3 +1069,23 @@ void sync_bdevs(bool wait)
     spin_unlock(&blockdev_superblock->s_inode_list_lock);
     iput(old_inode);
 }
+
+#ifdef CONFIG_HAS_DMA
+void *block_dma_map(struct block_device *bdev, struct bio_vec *bvec,
+            int nr_vecs)
+{
+    struct request_queue *q = bdev_get_queue(bdev);
+
+    if (q->mq_ops && q->mq_ops->dma_map)
+        return q->mq_ops->dma_map(q, bvec, nr_vecs);
+    return ERR_PTR(-EINVAL);
+}
+
+void block_dma_unmap(struct block_device *bdev, void *dma_tag)
+{
+    struct request_queue *q = bdev_get_queue(bdev);
+
+    if (q->mq_ops && q->mq_ops->dma_unmap)
+        return q->mq_ops->dma_unmap(q, dma_tag);
+}
+#endif
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index effee1dc715a..e10aabb36c2c 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -639,6 +639,19 @@ struct blk_mq_ops {
      */
     void (*show_rq)(struct seq_file *m, struct request *rq);
 #endif
+
+#ifdef CONFIG_HAS_DMA
+    /**
+     * @dma_map: Create a dma mapping. On success, returns an opaque cookie
+     * that can be referenced by the driver in future requests.
+     */
+    void *(*dma_map)(struct request_queue *q, struct bio_vec *bvec, int nr_vecs);
+
+    /**
+     * @dma_unmap: Tear down a previously created dma mapping.
+     */
+    void (*dma_unmap)(struct request_queue *q, void *dma_tag);
+#endif
 };

 enum {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index d04bdf549efa..47f7f6f430d9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1526,4 +1526,20 @@ struct io_comp_batch {

 #define DEFINE_IO_COMP_BATCH(name)	struct io_comp_batch name = { }

+#ifdef CONFIG_HAS_DMA
+void *block_dma_map(struct block_device *bdev, struct bio_vec *bvec,
+        int nr_vecs);
+void block_dma_unmap(struct block_device *bdev, void *dma_tag);
+#else
+static inline void *block_dma_map(struct block_device *bdev,
+        struct bio_vec *bvec, int nr_vecs)
+{
+    return ERR_PTR(-ENOTSUPP);
+}
+
+static inline void block_dma_unmap(struct block_device *bdev, void *dma_tag)
+{
+}
+#endif
+
 #endif /* _LINUX_BLKDEV_H */

From patchwork Tue Jul 26 17:38:11 2022
X-Patchwork-Submitter: Keith Busch
X-Patchwork-Id: 12929593
From: Keith Busch
Subject: [PATCH 2/5] iov_iter: introduce type for preregistered dma tags
Date: Tue, 26 Jul 2022 10:38:11 -0700
Message-ID: <20220726173814.2264573-3-kbusch@fb.com>
In-Reply-To: <20220726173814.2264573-1-kbusch@fb.com>
References: <20220726173814.2264573-1-kbusch@fb.com>

Introduce a new iov_iter type representing a pre-registered DMA address
tag. The tag is an opaque cookie specific to the lower-level driver that
created it, and can be referenced at any arbitrary offset.
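For illustration only (not part of the patch): a caller that already
holds a cookie from a driver's ->dma_map() would build an iterator over
it as below; `tag` is whatever opaque pointer the driver returned, and
the block layer learns to consume such iterators in a later patch in
this series:

    struct iov_iter iter;
    void *tag;                      /* cookie from q->mq_ops->dma_map() */
    size_t len = 64 * 1024;         /* bytes to transfer */

    /* READ: data flows from the device into the premapped buffer. */
    iov_iter_dma_tag(&iter, READ, tag, /* dma_offset */ 0,
                     /* nr_segs */ len >> PAGE_SHIFT, len);

Advancing such an iterator only moves iov_offset and count; there is no
page array or bvec list to walk, which is the point of the new type.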
Signed-off-by: Keith Busch
---
 include/linux/uio.h |  9 +++++++++
 lib/iov_iter.c      | 25 ++++++++++++++++++++++---
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 34ba4a731179..de8af68eacb3 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -26,6 +26,7 @@ enum iter_type {
     ITER_PIPE,
     ITER_XARRAY,
     ITER_DISCARD,
+    ITER_DMA_TAG,
 };

 struct iov_iter_state {
@@ -46,6 +47,7 @@ struct iov_iter {
         const struct bio_vec *bvec;
         struct xarray *xarray;
         struct pipe_inode_info *pipe;
+        void *dma_tag;
     };
     union {
         unsigned long nr_segs;
@@ -85,6 +87,11 @@ static inline bool iov_iter_is_bvec(const struct iov_iter *i)
     return iov_iter_type(i) == ITER_BVEC;
 }

+static inline bool iov_iter_is_dma_tag(const struct iov_iter *i)
+{
+    return iov_iter_type(i) == ITER_DMA_TAG;
+}
+
 static inline bool iov_iter_is_pipe(const struct iov_iter *i)
 {
     return iov_iter_type(i) == ITER_PIPE;
@@ -229,6 +236,8 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec,
         unsigned long nr_segs, size_t count);
 void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec,
         unsigned long nr_segs, size_t count);
+void iov_iter_dma_tag(struct iov_iter *i, unsigned int direction, void *dma_tag,
+        unsigned int dma_offset, unsigned long nr_segs, size_t count);
 void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe,
         size_t count);
 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 507e732ef7cf..e26cb0889820 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1077,6 +1077,9 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
         i->count -= size;
     } else if (iov_iter_is_discard(i)) {
         i->count -= size;
+    } else if (iov_iter_is_dma_tag(i)) {
+        i->iov_offset += size;
+        i->count -= size;
     }
 }
 EXPORT_SYMBOL(iov_iter_advance);
@@ -1201,6 +1204,22 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
 }
 EXPORT_SYMBOL(iov_iter_bvec);

+void iov_iter_dma_tag(struct iov_iter *i, unsigned int direction,
+            void *dma_tag, unsigned int dma_offset,
+            unsigned long nr_segs, size_t count)
+{
+    WARN_ON(direction & ~(READ | WRITE));
+    *i = (struct iov_iter){
+        .iter_type = ITER_DMA_TAG,
+        .data_source = direction,
+        .nr_segs = nr_segs,
+        .dma_tag = dma_tag,
+        .iov_offset = dma_offset,
+        .count = count
+    };
+}
+EXPORT_SYMBOL(iov_iter_dma_tag);
+
 void iov_iter_pipe(struct iov_iter *i, unsigned int direction,
             struct pipe_inode_info *pipe,
             size_t count)
@@ -2124,8 +2143,8 @@ EXPORT_SYMBOL(import_single_range);
  */
 void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
 {
-    if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i)) &&
-             !iov_iter_is_kvec(i))
+    if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
+             !iov_iter_is_dma_tag(i)) && !iov_iter_is_kvec(i))
         return;
     i->iov_offset = state->iov_offset;
     i->count = state->count;
@@ -2141,7 +2160,7 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
     BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
     if (iov_iter_is_bvec(i))
         i->bvec -= state->nr_segs - i->nr_segs;
-    else
+    else if (!iov_iter_is_dma_tag(i))
         i->iov -= state->nr_segs - i->nr_segs;
     i->nr_segs = state->nr_segs;
 }

From patchwork Tue Jul 26 17:38:12 2022
X-Patchwork-Submitter: Keith Busch
X-Patchwork-Id: 12929594
From: Keith Busch
Subject: [PATCH 3/5] block: add dma tag bio type
Date: Tue, 26 Jul 2022 10:38:12 -0700
Message-ID: <20220726173814.2264573-4-kbusch@fb.com>
In-Reply-To: <20220726173814.2264573-1-kbusch@fb.com>
References: <20220726173814.2264573-1-kbusch@fb.com>

Premapped buffers don't require a generic bio_vec since they have
already been dma mapped into the driver-specific data structure.
Repurpose bi_io_vec to carry the driver-specific tag, as the two are
mutually exclusive, and provide the setup and split helpers needed to
support dma tags.

In order to use this, a driver must implement the .dma_map() callback.
Any driver that provides this callback must be aware that a given bio
may be carrying a dma_tag instead of a bio_vec.

Note: this does not yet work with blk_integrity.
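As a sketch of the expected consumer (assumed, not part of the patch;
the mydrv_* names are hypothetical), a driver that implements
.dma_map() distinguishes the two cases with the new flag and helpers:

    /* Inside a driver's request setup path; rq is a struct request. */
    if (blk_rq_dma_tag(rq)) {
            /* bi_io_vec is repurposed: use the premapped cookie. */
            struct mydrv_mapping *map = blk_rq_dma_tag(rq);
            size_t offset = blk_rq_dma_offset(rq);

            mydrv_setup_from_mapping(rq, map, offset);  /* assumed helper */
    } else {
            /* Normal path: walk the bio_vec list and map per-IO. */
            mydrv_map_sg(rq);                           /* assumed helper */
    }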
Signed-off-by: Keith Busch
---
 block/bio.c               | 25 ++++++++++++++++++++++++-
 block/blk-merge.c         | 18 ++++++++++++++++++
 include/linux/bio.h       | 21 ++++++++++++---------
 include/linux/blk-mq.h    | 12 ++++++++++++
 include/linux/blk_types.h |  6 +++++-
 5 files changed, 71 insertions(+), 11 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index ce3bc3578ac4..472bdc4fd419 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -229,7 +229,8 @@ static void bio_free(struct bio *bio)
     WARN_ON_ONCE(!bs);

     bio_uninit(bio);
-    bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);
+    if (!bio_flagged(bio, BIO_DMA_TAGGED))
+        bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);
     mempool_free(p - bs->front_pad, &bs->bio_pool);
 }

@@ -762,6 +763,8 @@ static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
     bio_set_flag(bio, BIO_CLONED);
     if (bio_flagged(bio_src, BIO_THROTTLED))
         bio_set_flag(bio, BIO_THROTTLED);
+    if (bio_flagged(bio_src, BIO_DMA_TAGGED))
+        bio_set_flag(bio, BIO_DMA_TAGGED);
     bio->bi_ioprio = bio_src->bi_ioprio;
     bio->bi_iter = bio_src->bi_iter;

@@ -1151,6 +1154,21 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
     bio_set_flag(bio, BIO_CLONED);
 }

+static void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter)
+{
+    size_t size = iov_iter_count(iter);
+
+    bio->bi_vcnt = iter->nr_segs;
+    bio->bi_dma_tag = iter->dma_tag;
+    bio->bi_iter.bi_bvec_done = iter->iov_offset;
+    bio->bi_iter.bi_size = size;
+    bio->bi_opf |= REQ_NOMERGE;
+    bio_set_flag(bio, BIO_NO_PAGE_REF);
+    bio_set_flag(bio, BIO_DMA_TAGGED);
+
+    iov_iter_advance(iter, bio->bi_iter.bi_size);
+}
+
 static int bio_iov_add_page(struct bio *bio, struct page *page,
         unsigned int len, unsigned int offset)
 {
@@ -1287,6 +1305,11 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
         return 0;
     }

+    if (iov_iter_is_dma_tag(iter)) {
+        bio_iov_dma_tag_set(bio, iter);
+        return 0;
+    }
+
     do {
         ret = __bio_iov_iter_get_pages(bio, iter);
     } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4c8a699754c9..240eae7666d4 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -276,6 +276,24 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
     const unsigned max_bytes = get_max_io_size(q, bio) << 9;
     const unsigned max_segs = queue_max_segments(q);

+    if (bio_flagged(bio, BIO_DMA_TAGGED)) {
+        int offset = offset_in_page(bio->bi_iter.bi_bvec_done);
+
+        nsegs = ALIGN(bio->bi_iter.bi_size + offset, PAGE_SIZE) >> PAGE_SHIFT;
+        if (bio->bi_iter.bi_size > max_bytes) {
+            bytes = max_bytes;
+            nsegs = (bytes + offset) >> PAGE_SHIFT;
+        } else if (nsegs > max_segs) {
+            nsegs = max_segs;
+            bytes = PAGE_SIZE * nsegs - offset;
+        } else {
+            *segs = nsegs;
+            return NULL;
+        }
+
+        goto split;
+    }
+
     bio_for_each_bvec(bv, bio, iter) {
         /*
          * If the queue doesn't support SG gaps and adding this
diff --git a/include/linux/bio.h b/include/linux/bio.h
index ca22b06700a9..649348bc03c2 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -61,11 +61,17 @@ static inline bool bio_has_data(struct bio *bio)
     return false;
 }

+static inline bool bio_flagged(const struct bio *bio, unsigned int bit)
+{
+    return (bio->bi_flags & (1U << bit)) != 0;
+}
+
 static inline bool bio_no_advance_iter(const struct bio *bio)
 {
     return bio_op(bio) == REQ_OP_DISCARD ||
            bio_op(bio) == REQ_OP_SECURE_ERASE ||
-           bio_op(bio) == REQ_OP_WRITE_ZEROES;
+           bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+           bio_flagged(bio, BIO_DMA_TAGGED);
 }

 static inline void *bio_data(struct bio *bio)
@@ -98,9 +104,11 @@ static inline void bio_advance_iter(const struct bio *bio,
 {
     iter->bi_sector += bytes >> 9;

-    if (bio_no_advance_iter(bio))
+    if (bio_no_advance_iter(bio)) {
         iter->bi_size -= bytes;
-    else
+        if (bio_flagged(bio, BIO_DMA_TAGGED))
+            iter->bi_bvec_done += bytes;
+    } else
         bvec_iter_advance(bio->bi_io_vec, iter, bytes);
         /* TODO: It is reasonable to complete bio with error here. */
 }
@@ -225,11 +233,6 @@ static inline void bio_cnt_set(struct bio *bio, unsigned int count)
     atomic_set(&bio->__bi_cnt, count);
 }

-static inline bool bio_flagged(struct bio *bio, unsigned int bit)
-{
-    return (bio->bi_flags & (1U << bit)) != 0;
-}
-
 static inline void bio_set_flag(struct bio *bio, unsigned int bit)
 {
     bio->bi_flags |= (1U << bit);
@@ -447,7 +450,7 @@ static inline void bio_wouldblock_error(struct bio *bio)
  */
 static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs)
 {
-    if (iov_iter_is_bvec(iter))
+    if (iov_iter_is_bvec(iter) || iov_iter_is_dma_tag(iter))
         return 0;
     return iov_iter_npages(iter, max_segs);
 }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index e10aabb36c2c..3eb045a9ba41 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1141,6 +1141,18 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 }
 void blk_dump_rq_flags(struct request *, char *);

+static inline void *blk_rq_dma_tag(struct request *rq)
+{
+    return rq->bio && bio_flagged(rq->bio, BIO_DMA_TAGGED) ?
+        rq->bio->bi_dma_tag : 0;
+}
+
+static inline size_t blk_rq_dma_offset(struct request *rq)
+{
+    return rq->bio && bio_flagged(rq->bio, BIO_DMA_TAGGED) ?
+        rq->bio->bi_iter.bi_bvec_done : 0;
+}
+
 #ifdef CONFIG_BLK_DEV_ZONED
 static inline unsigned int blk_rq_zone_no(struct request *rq)
 {
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 1ef99790f6ed..ea6db439acbe 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -299,7 +299,10 @@ struct bio {

     atomic_t        __bi_cnt;   /* pin count */

-    struct bio_vec      *bi_io_vec; /* the actual vec list */
+    union {
+        struct bio_vec  *bi_io_vec; /* the actual vec list */
+        void        *bi_dma_tag;    /* driver specific tag */
+    };

     struct bio_set      *bi_pool;

@@ -334,6 +337,7 @@ enum {
     BIO_QOS_MERGED,     /* but went through rq_qos merge path */
     BIO_REMAPPED,
     BIO_ZONE_WRITE_LOCKED,  /* Owns a zoned device zone write lock */
+    BIO_DMA_TAGGED,     /* Using premapped dma buffers */
     BIO_FLAG_LAST
 };

From patchwork Tue Jul 26 17:38:13 2022
X-Patchwork-Submitter: Keith Busch
X-Patchwork-Id: 12929591
From: Keith Busch
Subject: [PATCH 4/5] io_uring: add support for dma pre-mapping
Date: Tue, 26 Jul 2022 10:38:13 -0700
Message-ID: <20220726173814.2264573-5-kbusch@fb.com>
In-Reply-To: <20220726173814.2264573-1-kbusch@fb.com>
References: <20220726173814.2264573-1-kbusch@fb.com>

Provide a new register operation that requests pre-mapping a known bvec
through the driver-specific implementation backing the given file
descriptor. If successful, io_uring will use the returned dma tag for
future fixed buffer requests to the same file.
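A minimal user-space sketch of driving the new opcodes (assumptions:
ring_fd came from io_uring_setup(), the buffers were already registered
with IORING_REGISTER_BUFFERS, and the caller has CAP_SYS_ADMIN; error
handling elided):

    int dev_fd = open("/dev/nvme0n1", O_RDWR | O_DIRECT);
    struct io_uring_map_buffers map = {
            .fd        = dev_fd,
            .buf_start = 0,    /* first registered-buffer index */
            .buf_end   = 1,    /* one past the last index to map */
    };

    if (syscall(__NR_io_uring_register, ring_fd,
                IORING_REGISTER_MAP_BUFFERS, &map, 1) < 0)
            /* not supported: fall back to the regular fixed-buffer path */;

    /*
     * ... issue IORING_OP_READ_FIXED / IORING_OP_WRITE_FIXED on dev_fd;
     * the kernel now resolves them through the premapped dma tag ...
     */

    syscall(__NR_io_uring_register, ring_fd,
            IORING_REGISTER_UNMAP_BUFFERS, &map, 1);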
Signed-off-by: Keith Busch
---
 include/uapi/linux/io_uring.h |  12 ++++
 io_uring/io_uring.c           | 129 ++++++++++++++++++++++++++++++++++
 io_uring/net.c                |   2 +-
 io_uring/rsrc.c               |  13 +++-
 io_uring/rsrc.h               |  16 ++++-
 io_uring/rw.c                 |   2 +-
 6 files changed, 166 insertions(+), 8 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1463cfecb56b..daacbe899d1d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -485,6 +485,10 @@ enum {
     IORING_REGISTER_NOTIFIERS       = 26,
     IORING_UNREGISTER_NOTIFIERS     = 27,

+    /* dma map registered buffers */
+    IORING_REGISTER_MAP_BUFFERS     = 28,
+    IORING_REGISTER_UNMAP_BUFFERS       = 29,
+
     /* this goes last */
     IORING_REGISTER_LAST
 };
@@ -661,4 +665,12 @@ struct io_uring_recvmsg_out {
     __u32 flags;
 };

+struct io_uring_map_buffers {
+    __s32 fd;
+    __s32 buf_start;
+    __s32 buf_end;
+    __u32 flags;
+    __u64 rsvd[2];
+};
+
 #endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 1d600a63643b..12f7354e0423 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3704,6 +3704,123 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
     return ret;
 }

+#ifdef CONFIG_BLOCK
+static int get_map_range(struct io_ring_ctx *ctx,
+             struct io_uring_map_buffers *map, void __user *arg)
+{
+    int ret;
+
+    if (copy_from_user(map, arg, sizeof(*map)))
+        return -EFAULT;
+    if (map->flags || map->rsvd[0] || map->rsvd[1])
+        return -EINVAL;
+    if (map->buf_start < 0)
+        return -EINVAL;
+    if (map->buf_start >= ctx->nr_user_bufs)
+        return -EINVAL;
+    if (map->buf_end > ctx->nr_user_bufs)
+        map->buf_end = ctx->nr_user_bufs;
+
+    ret = map->buf_end - map->buf_start;
+    if (ret <= 0)
+        return -EINVAL;
+
+    return ret;
+}
+
+void io_dma_unmap(struct io_mapped_ubuf *imu)
+{
+    if (imu->dma_tag)
+        block_dma_unmap(imu->bdev, imu->dma_tag);
+}
+
+static int io_register_unmap_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+    struct io_uring_map_buffers map;
+    int i, ret;
+
+    if (!capable(CAP_SYS_ADMIN))
+        return -EPERM;
+    ret = get_map_range(ctx, &map, arg);
+    if (ret < 0)
+        return ret;
+
+    for (i = map.buf_start; i < map.buf_end; i++) {
+        struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+        io_dma_unmap(imu);
+    }
+
+    return 0;
+}
+
+static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+    struct io_uring_map_buffers map;
+    struct block_device *bdev;
+    struct file *file;
+    int ret, i;
+
+    if (!capable(CAP_SYS_ADMIN))
+        return -EPERM;
+
+    ret = get_map_range(ctx, &map, arg);
+    if (ret < 0)
+        return ret;
+
+    file = fget(map.fd);
+    if (!file)
+        return -EBADF;
+
+    if (S_ISBLK(file_inode(file)->i_mode))
+        bdev = I_BDEV(file->f_mapping->host);
+    else if (S_ISREG(file_inode(file)->i_mode))
+        bdev = file->f_inode->i_sb->s_bdev;
+    else
+        return -EOPNOTSUPP;
+
+    for (i = map.buf_start; i < map.buf_end; i++) {
+        struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+        void *tag;
+
+        if (imu->dma_tag) {
+            ret = -EBUSY;
+            goto err;
+        }
+
+        tag = block_dma_map(bdev, imu->bvec, imu->nr_bvecs);
+        if (IS_ERR(tag)) {
+            ret = PTR_ERR(tag);
+            goto err;
+        }
+
+        imu->dma_tag = tag;
+        imu->dma_file = file;
+        imu->bdev = bdev;
+    }
+
+    fput(file);
+    return 0;
+err:
+    while (--i >= map.buf_start) {
+        struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+        io_dma_unmap(imu);
+    }
+    fput(file);
+    return ret;
+}
+#else /* CONFIG_BLOCK */
+static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+    return -EOPNOTSUPP;
+}
+static int io_register_unmap_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+    return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BLOCK */
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
                    void __user *arg, unsigned nr_args)
     __releases(ctx->uring_lock)
@@ -3870,6 +3987,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
             break;
         ret = io_notif_unregister(ctx);
         break;
+    case IORING_REGISTER_MAP_BUFFERS:
+        ret = -EINVAL;
+        if (!arg || nr_args != 1)
+            break;
+        ret = io_register_map_buffers(ctx, arg);
+        break;
+    case IORING_REGISTER_UNMAP_BUFFERS:
+        ret = -EINVAL;
+        if (!arg || nr_args != 1)
+            break;
+        ret = io_register_unmap_buffers(ctx, arg);
+        break;
     default:
         ret = -EINVAL;
         break;
diff --git a/io_uring/net.c b/io_uring/net.c
index 8276b9537194..68a996318959 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -977,7 +977,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)

     if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
         ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu,
-                    (u64)(uintptr_t)zc->buf, zc->len);
+                    (u64)(uintptr_t)zc->buf, zc->len, NULL);
         if (unlikely(ret))
             return ret;
     } else {
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 59704b9ac537..1a7a8dedbbd5 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -148,6 +148,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slot)
             unpin_user_page(imu->bvec[i].bv_page);
         if (imu->acct_pages)
             io_unaccount_mem(ctx, imu->acct_pages);
+        io_dma_unmap(imu);
         kvfree(imu);
     }
     *slot = NULL;
@@ -1285,6 +1286,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
     imu->ubuf = (unsigned long) iov->iov_base;
     imu->ubuf_end = imu->ubuf + iov->iov_len;
     imu->nr_bvecs = nr_pages;
+    imu->dma_tag = NULL;
     *pimu = imu;
     ret = 0;
 done:
@@ -1359,9 +1361,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
     return ret;
 }

-int io_import_fixed(int ddir, struct iov_iter *iter,
-            struct io_mapped_ubuf *imu,
-            u64 buf_addr, size_t len)
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+            u64 buf_addr, size_t len, struct file *file)
 {
     u64 buf_end;
     size_t offset;
@@ -1379,6 +1380,12 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
      * and advance us to the beginning.
      */
     offset = buf_addr - imu->ubuf;
+    if (imu->dma_tag && file == imu->dma_file) {
+        unsigned long nr_segs = (buf_addr & (PAGE_SIZE - 1)) +
+                    (len >> PAGE_SHIFT);
+        iov_iter_dma_tag(iter, ddir, imu->dma_tag, offset, nr_segs, len);
+        return 0;
+    }
     iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);

     if (offset) {
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index f3a9a177941f..6e63b7a57b34 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -50,6 +50,11 @@ struct io_mapped_ubuf {
     u64     ubuf_end;
     unsigned int    nr_bvecs;
     unsigned long   acct_pages;
+    void        *dma_tag;
+    struct file *dma_file;
+#ifdef CONFIG_BLOCK
+    struct block_device *bdev;
+#endif
     struct bio_vec  bvec[];
 };

@@ -64,9 +69,14 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
 void io_rsrc_node_switch(struct io_ring_ctx *ctx,
              struct io_rsrc_data *data_to_kill);

-int io_import_fixed(int ddir, struct iov_iter *iter,
-            struct io_mapped_ubuf *imu,
-            u64 buf_addr, size_t len);
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+            u64 buf_addr, size_t len, struct file *file);
+
+#ifdef CONFIG_BLOCK
+void io_dma_unmap(struct io_mapped_ubuf *imu);
+#else
+static inline void io_dma_unmap(struct io_mapped_ubuf *imu) {}
+#endif

 void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 2b784795103c..9e2164d09adb 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -359,7 +359,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
     ssize_t ret;

     if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
-        ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
+        ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len, req->file);
         if (ret)
             return ERR_PTR(ret);
         return NULL;

From patchwork Tue Jul 26 17:38:14 2022
X-Patchwork-Submitter: Keith Busch
X-Patchwork-Id: 12929595
From: Keith Busch
Subject: [PATCH 5/5] nvme-pci: implement dma_map support
Date: Tue, 26 Jul 2022 10:38:14 -0700
Message-ID: <20220726173814.2264573-6-kbusch@fb.com>
In-Reply-To: <20220726173814.2264573-1-kbusch@fb.com>
References: <20220726173814.2264573-1-kbusch@fb.com>

Implement callbacks that convert a registered bio_vec to a prp list,
and use the result for each IO that carries the returned tag. This
saves repeated IO conversions and dma mapping/unmapping. In many cases,
the driver can skip per-IO pool allocations entirely, saving
potentially significant CPU cycles. (An illustrative sketch of the PRP
indexing arithmetic follows the patch.)

Signed-off-by: Keith Busch
---
 drivers/nvme/host/pci.c | 291 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 283 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 644664098ae7..571d955eaef0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -110,6 +110,14 @@ struct nvme_queue;
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
 static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);

+struct nvme_dma_mapping {
+    int nr_pages;
+    u16 offset;
+    u8  rsvd[2];
+    dma_addr_t prp_dma_addr;
+    __le64 *prps;
+};
+
 /*
  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
  */
@@ -544,6 +552,35 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
     return true;
 }

+static void nvme_sync_dma(struct nvme_dev *dev, struct request *req)
+{
+    int index, offset, i, length, nprps;
+    struct nvme_dma_mapping *mapping;
+    bool needs_sync;
+
+    mapping = blk_rq_dma_tag(req);
+    offset = blk_rq_dma_offset(req) + mapping->offset;
+    index = offset >> NVME_CTRL_PAGE_SHIFT;
+    needs_sync = rq_data_dir(req) == READ &&
+         dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[index]));
+
+    if (!needs_sync)
+        return;
+
+    offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
+    length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset);
+    nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
+
+    dma_sync_single_for_cpu(dev->dev,
+        le64_to_cpu(mapping->prps[index++]),
+        NVME_CTRL_PAGE_SIZE - offset, DMA_FROM_DEVICE);
+    for (i = 1; i < nprps; i++) {
+        dma_sync_single_for_cpu(dev->dev,
+            le64_to_cpu(mapping->prps[index++]),
+            NVME_CTRL_PAGE_SIZE, DMA_FROM_DEVICE);
+    }
+}
+
 static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
 {
     const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
@@ -576,6 +613,21 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
     }
 }

+static void nvme_free_prp_chain(struct nvme_dev *dev, struct request *req,
+                struct nvme_iod *iod)
+{
+    if (iod->npages < 0)
+        return;
+
+    if (iod->npages == 0)
+        dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
+                  iod->first_dma);
+    else if (iod->use_sgl)
+        nvme_free_sgls(dev, req);
+    else
+        nvme_free_prps(dev, req);
+}
+
 static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
 {
     struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -595,18 +647,15 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
         dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
                    rq_dma_dir(req));
         return;
+    } else if (blk_rq_dma_tag(req)) {
+        nvme_sync_dma(dev, req);
+        nvme_free_prp_chain(dev, req, iod);
+        return;
     }

     WARN_ON_ONCE(!iod->nents);

-    nvme_unmap_sg(dev, req);
-    if (iod->npages == 0)
-        dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
-                  iod->first_dma);
-    else if (iod->use_sgl)
-        nvme_free_sgls(dev, req);
-    else
-        nvme_free_prps(dev, req);
+    nvme_unmap_sg(dev, req);
+    nvme_free_prp_chain(dev, req, iod);
     mempool_free(iod->sg, dev->iod_mempool);
 }

@@ -835,6 +884,122 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
     return BLK_STS_OK;
 }

+static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
+                   struct nvme_rw_command *cmnd,
+                   struct nvme_iod *iod)
+{
+    static const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
+    dma_addr_t prp_list_start, prp_list_end, prp_dma;
+    int index, offset, i, length, nprps, nprps_left;
+    void **list = nvme_pci_iod_list(req);
+    struct nvme_dma_mapping *mapping;
+    struct dma_pool *pool;
+    __le64 *prp_list;
+    bool needs_sync;
+
+    mapping = blk_rq_dma_tag(req);
+    offset = blk_rq_dma_offset(req) + mapping->offset;
+    index = offset >> NVME_CTRL_PAGE_SHIFT;
+    offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
+    needs_sync = rq_data_dir(req) == WRITE &&
+         dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[index]));
+
+    /*
+     * XXX: For PAGE_SIZE > NVME_CTRL_PAGE_SIZE, is it faster to save the
+     * PRP list implementation and sync multiple partial pages, more
+     * efficient to sync PAGE_SIZE and build the PRP list per-IO from a
+     * host PAGE_SIZE representation, or cleverly sync physically
+     * contiguous regions?
+     */
+    if (needs_sync) {
+        dma_sync_single_for_device(dev->dev,
+            le64_to_cpu(mapping->prps[index]),
+            NVME_CTRL_PAGE_SIZE - offset, DMA_TO_DEVICE);
+    }
+
+    length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset);
+    cmnd->dptr.prp1 = cpu_to_le64(le64_to_cpu(mapping->prps[index++]) + offset);
+
+    if (length <= 0)
+        return BLK_STS_OK;
+
+    if (length <= NVME_CTRL_PAGE_SIZE) {
+        if (needs_sync)
+            dma_sync_single_for_device(dev->dev,
+                le64_to_cpu(mapping->prps[index]),
+                NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE);
+        cmnd->dptr.prp2 = mapping->prps[index];
+        return BLK_STS_OK;
+    }
+
+    nprps = nprps_left = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
+    prp_list_start = mapping->prp_dma_addr + 8 * index;
+    prp_list_end = prp_list_start + 8 * nprps;
+
+    /* Optimization when remaining list fits in one nvme page */
+    if ((prp_list_start >> NVME_CTRL_PAGE_SHIFT) ==
+        (prp_list_end >> NVME_CTRL_PAGE_SHIFT)) {
+        cmnd->dptr.prp2 = cpu_to_le64(prp_list_start);
+        goto sync;
+    }
+
+    if (nprps <= (256 / 8)) {
+        pool = dev->prp_small_pool;
+        iod->npages = 0;
+    } else {
+        pool = dev->prp_page_pool;
+        iod->npages = 1;
+    }
+
+    prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
+    if (!prp_list) {
+        iod->npages = -1;
+        return BLK_STS_RESOURCE;
+    }
+
+    list[0] = prp_list;
+    iod->first_dma = prp_dma;
+    i = 0;
+    for (;;) {
+        dma_addr_t next_prp_dma;
+        __le64 *next_prp_list;
+
+        if (nprps_left <= last_prp + 1) {
+            memcpy(prp_list, &mapping->prps[index], nprps_left * 8);
+            break;
+        }
+
+        memcpy(prp_list, &mapping->prps[index],
+               NVME_CTRL_PAGE_SIZE - 8);
+        nprps_left -= last_prp;
+        index += last_prp;
+
+        next_prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &next_prp_dma);
+        if (!next_prp_list)
+            goto free_prps;
+
+        prp_list[last_prp] = cpu_to_le64(next_prp_dma);
+        prp_list = next_prp_list;
+        prp_dma = next_prp_dma;
+        list[iod->npages++] = prp_list;
+    }
+    cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
+
+sync:
+    if (!needs_sync)
+        return BLK_STS_OK;
+
+    for (i = 0; i < nprps; i++)
+        dma_sync_single_for_device(dev->dev,
+            le64_to_cpu(mapping->prps[index++]),
+            NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE);
+    return BLK_STS_OK;
+
+free_prps:
+    nvme_free_prps(dev, req);
+    return BLK_STS_RESOURCE;
+}
+
 static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
         struct nvme_command *cmnd)
 {
@@ -842,6 +1007,12 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
     blk_status_t ret = BLK_STS_RESOURCE;
     int nr_mapped;

+    if (blk_rq_dma_tag(req)) {
+        iod->dma_len = 0;
+        iod->use_sgl = false;
+        return nvme_premapped(dev, req, &cmnd->rw, iod);
+    }
+
     if (blk_rq_nr_phys_segments(req) == 1) {
         struct bio_vec bv = req_bvec(req);

@@ -1732,6 +1903,106 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
     return result;
 }

+#ifdef CONFIG_HAS_DMA
+/*
+ * Important: bvec must be describing a virtually contiguous buffer.
+ */
+static void *nvme_pci_dma_map(struct request_queue *q,
+                  struct bio_vec *bvec, int nr_vecs)
+{
+    const int nvme_pages = PAGE_SIZE / NVME_CTRL_PAGE_SIZE;
+    struct nvme_ns *ns = q->queuedata;
+    struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
+    struct nvme_dma_mapping *mapping;
+    int i, j, k, size, ret = -ENOMEM;
+
+    if (!nr_vecs)
+        return ERR_PTR(-EINVAL);
+
+    mapping = kzalloc(sizeof(*mapping), GFP_KERNEL);
+    if (!mapping)
+        return ERR_PTR(-ENOMEM);
+
+    mapping->nr_pages = nr_vecs * nvme_pages;
+    size = sizeof(*mapping->prps) * mapping->nr_pages;
+    mapping->prps = dma_alloc_coherent(dev->dev, size,
+                &mapping->prp_dma_addr, GFP_KERNEL);
+    if (!mapping->prps)
+        goto free_mapping;
+
+    for (i = 0, k = 0; i < nr_vecs; i++) {
+        struct bio_vec *bv = bvec + i;
+        int pages_per = nvme_pages;
+        dma_addr_t dma_addr;
+
+        if (i == 0) {
+            mapping->offset = bv->bv_offset;
+            pages_per -= mapping->offset >> NVME_CTRL_PAGE_SHIFT;
+        } else if (bv->bv_offset) {
+            ret = -EINVAL;
+            goto err;
+        }
+
+        if (bv->bv_offset + bv->bv_len != PAGE_SIZE &&
+            i < nr_vecs - 1) {
+            ret = -EINVAL;
+            goto err;
+        }
+
+        dma_addr = dma_map_bvec(dev->dev, bv, 0, 0);
+        if (dma_mapping_error(dev->dev, dma_addr)) {
+            ret = -EIO;
+            goto err;
+        }
+
+        if (i == 0)
+            dma_addr -= mapping->offset;
+
+        for (j = 0; j < nvme_pages; j++)
+            mapping->prps[k++] = cpu_to_le64(dma_addr +
+                        j * NVME_CTRL_PAGE_SIZE);
+    }
+
+    get_device(dev->dev);
+    return mapping;
+
+err:
+    for (i = 0; i < k; i += nvme_pages) {
+        __u64 dma_addr = le64_to_cpu(mapping->prps[i]);
+
+        dma_unmap_page(dev->dev, dma_addr,
+                   PAGE_SIZE - offset_in_page(dma_addr), 0);
+    }
+
+    dma_free_coherent(dev->dev, size, (void *)mapping->prps,
+              mapping->prp_dma_addr);
+free_mapping:
+    kfree(mapping);
+    return ERR_PTR(ret);
+}
+
+static void nvme_pci_dma_unmap(struct request_queue *q, void *dma_tag)
+{
+    const int nvme_pages = PAGE_SIZE / NVME_CTRL_PAGE_SIZE;
+    struct nvme_ns *ns = q->queuedata;
+    struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
+    struct nvme_dma_mapping *mapping = dma_tag;
+    int i;
+
+    for (i = 0; i < mapping->nr_pages; i += nvme_pages) {
+        __u64 dma_addr = le64_to_cpu(mapping->prps[i]);
+
+        dma_unmap_page(dev->dev, dma_addr,
+                   PAGE_SIZE - offset_in_page(dma_addr), 0);
+    }
+
+    dma_free_coherent(dev->dev, mapping->nr_pages * sizeof(*mapping->prps),
+              (void *)mapping->prps, mapping->prp_dma_addr);
+    kfree(mapping);
+    put_device(dev->dev);
+}
+#endif
+
 static const struct blk_mq_ops nvme_mq_admin_ops = {
     .queue_rq   = nvme_queue_rq,
     .complete   = nvme_pci_complete_rq,
@@ -1750,6 +2021,10 @@ static const struct blk_mq_ops nvme_mq_ops = {
     .map_queues = nvme_pci_map_queues,
     .timeout    = nvme_timeout,
     .poll       = nvme_poll,
+#ifdef CONFIG_HAS_DMA
+    .dma_map    = nvme_pci_dma_map,
+    .dma_unmap  = nvme_pci_dma_unmap,
+#endif
 };

 static void nvme_dev_remove_admin(struct nvme_dev *dev)
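To make the PRP bookkeeping above concrete, a stand-alone illustration
(not kernel code; it assumes NVME_CTRL_PAGE_SIZE == 4096, i.e.
NVME_CTRL_PAGE_SHIFT == 12) of how an IO's byte offset into the
premapped buffer selects entries from the precomputed prps[] array,
mirroring nvme_premapped() and nvme_sync_dma():

    unsigned long long buf_offset = 9216;  /* IO offset into the buffer */

    unsigned int index  = buf_offset >> 12;          /* -> prps[2] */
    unsigned int offset = buf_offset & (4096 - 1);   /* -> 1024    */

    /*
     * prp1 = prps[index] + offset; every subsequent NVME_CTRL_PAGE_SIZE
     * chunk is simply the next precomputed prps[] entry, so no per-IO
     * dma_map_sg() walk or address translation is needed.
     */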