From patchwork Tue Mar 5 10:15:11 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13581954 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2A3C355E43; Tue, 5 Mar 2024 10:15:39 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633740; cv=none; b=jvqSDWctdJjE0qN4zt7QND2g3uZgdKgEW91J+r+CKz8uR/hYwCYC8Vrd7yCF3j+OAQA5ljVqX6BzguJRSv8eJtMRIgDpLQH3OfhldK8Ne+f704qjgOKK1IOKpUc3HdHXREj4S9qfnD4CEZ9RNDG8oFbYGfmgWZVt1AYp066f9qY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633740; c=relaxed/simple; bh=EE9BRQu4WZxxi1OZrE2CKyiBDx8STpwb4qcA00ZbW2s=; h=From:To:Cc:Subject:Date:Message-ID:MIME-Version; b=GoGCJl3y/Ot6+7GCni4Jgc3Ia+JNBv8TQpvYYfhRLu7yBiW4W9bLpb7L68cLCxFo6umQBJAUI12pSYGKdZhIE5mwnCyS/GndsSM9/fYIBO6LpYZv32en8WYcmZHXgpXK64I/t+eQqHc68R6W9luue0DwCRKHt934vkfdbDN5lFQ= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=qOqLaLn3; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="qOqLaLn3" Received: by smtp.kernel.org (Postfix) with ESMTPSA id E9E92C433F1; Tue, 5 Mar 2024 10:15:38 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709633739; bh=EE9BRQu4WZxxi1OZrE2CKyiBDx8STpwb4qcA00ZbW2s=; h=From:To:Cc:Subject:Date:From; b=qOqLaLn3iBOQV8Vpk9C7fIIoeo4gt3eJqv3Qk0ac57+rTecyJfbAwk5F9oV2CYlYi iCxE76g9a4VsFGyienbFOKBwslCX8QnHQSkps7BVaKOrdU3YgbTgSp3wgD5vuXtzjH 
Auf9D1w6HpIVDYu45nDqaeQDo5Qxg7SqI71cb4CF5bX33POEIF22qvG6dZC3feFMSL L8/YYU6bMkn9HMPttYWzUIqXiOvpy+fN5ejAW5p5z2cSEsV2UMHJjs77k01LVnvlWp tAQhYEiHcEmzmfYrM9sBM2FOv7/h5MX0oMiIYnb/sIlyxLNKRLomqakayy9734ligO 6YKcLLle3sv0Q== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC 01/16] mm/hmm: let users to tag specific PFNs Date: Tue, 5 Mar 2024 12:15:11 +0200 Message-ID: X-Mailer: git-send-email 2.44.0 Precedence: bulk X-Mailing-List: kvm@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Introduce new sticky flag, which isn't overwritten by HMM range fault. Such flag allows users to tag specific PFNs with extra data in addition to already filled by HMM. Signed-off-by: Leon Romanovsky --- include/linux/hmm.h | 3 +++ mm/hmm.c | 34 +++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 126a36571667..b90902baa593 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -23,6 +23,7 @@ struct mmu_interval_notifier; * HMM_PFN_WRITE - if the page memory can be written to (requires HMM_PFN_VALID) * HMM_PFN_ERROR - accessing the pfn is impossible and the device should * fail. 
ie poisoned memory, special pages, no vma, etc + * HMM_PFN_STICKY - Flag preserved on input-to-output transformation * * On input: * 0 - Return the current state of the page, do not fault it. @@ -36,6 +37,8 @@ enum hmm_pfn_flags { HMM_PFN_VALID = 1UL << (BITS_PER_LONG - 1), HMM_PFN_WRITE = 1UL << (BITS_PER_LONG - 2), HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3), + /* Sticky flag, carried from Input to Output */ + HMM_PFN_STICKY = 1UL << (BITS_PER_LONG - 7), HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 8), /* Input flags */ diff --git a/mm/hmm.c b/mm/hmm.c index 277ddcab4947..9645a72beec0 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -44,8 +44,10 @@ static int hmm_pfns_fill(unsigned long addr, unsigned long end, { unsigned long i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) - range->hmm_pfns[i] = cpu_flags; + for (; addr < end; addr += PAGE_SIZE, i++) { + range->hmm_pfns[i] &= HMM_PFN_STICKY; + range->hmm_pfns[i] |= cpu_flags; + } return 0; } @@ -202,8 +204,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, return hmm_vma_fault(addr, end, required_fault, walk); pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) - hmm_pfns[i] = pfn | cpu_flags; + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { + hmm_pfns[i] &= HMM_PFN_STICKY; + hmm_pfns[i] |= pfn | cpu_flags; + } return 0; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -236,7 +240,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (required_fault) goto fault; - *hmm_pfn = 0; + *hmm_pfn = *hmm_pfn & HMM_PFN_STICKY; return 0; } @@ -253,14 +257,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, cpu_flags = HMM_PFN_VALID; if (is_writable_device_private_entry(entry)) cpu_flags |= HMM_PFN_WRITE; - *hmm_pfn = swp_offset_pfn(entry) | cpu_flags; + *hmm_pfn = (*hmm_pfn & HMM_PFN_STICKY) | 
swp_offset_pfn(entry) | cpu_flags; return 0; } required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (!required_fault) { - *hmm_pfn = 0; + *hmm_pfn = *hmm_pfn & HMM_PFN_STICKY; return 0; } @@ -304,11 +308,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, pte_unmap(ptep); return -EFAULT; } - *hmm_pfn = HMM_PFN_ERROR; + *hmm_pfn = (*hmm_pfn & HMM_PFN_STICKY) | HMM_PFN_ERROR; return 0; } - *hmm_pfn = pte_pfn(pte) | cpu_flags; + *hmm_pfn = (*hmm_pfn & HMM_PFN_STICKY) | pte_pfn(pte) | cpu_flags; return 0; fault: @@ -453,8 +457,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, } pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - for (i = 0; i < npages; ++i, ++pfn) - hmm_pfns[i] = pfn | cpu_flags; + for (i = 0; i < npages; ++i, ++pfn) { + hmm_pfns[i] &= HMM_PFN_STICKY; + hmm_pfns[i] |= pfn | cpu_flags; + } goto out_unlock; } @@ -512,8 +518,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, } pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); - for (; addr < end; addr += PAGE_SIZE, i++, pfn++) - range->hmm_pfns[i] = pfn | cpu_flags; + for (; addr < end; addr += PAGE_SIZE, i++, pfn++) { + range->hmm_pfns[i] &= HMM_PFN_STICKY; + range->hmm_pfns[i] |= pfn | cpu_flags; + } spin_unlock(ptl); return 0; From patchwork Tue Mar 5 10:15:12 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13581955 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 767FB58135; Tue, 5 Mar 2024 10:15:43 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633743; 
cv=none; b=BG1Huex7y7xRRHurd+DylYobaMcvdDTdTzcvEMK7Dd69o9jGT/nFrziCvWTGJAKLeQgk1hX4S4ARC/lTS3Lj1+1XkKAHuqfVGHM1Lou1a4Xay7w01Qfr2dEU0ZMfYtFMJR+pjGVfGGHnl1VOk+p6piI625O47j2KzTJTmRyPSSU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633743; c=relaxed/simple; bh=XGhek5Egl9EZN5GDA4K/P4lgi/cIEFOzTc1LzkqAZ+k=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=hrXAL3IIqzs6tZ01vNaYluN/E9tzJPQq4aVVyKuEIQBC2wRvhlXwBmubpjUiOIg6aaqQbS9Np4pU2ywgCOGGf/y5C1AITn+LXOLa0g5GmLMMZIw6WV268ElA/3iiSAgbBJgouMHgbFOWSec3PluOXPVPm440unAFwxUoxge7GkM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=XCEvQ40z; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="XCEvQ40z" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 5F837C433F1; Tue, 5 Mar 2024 10:15:42 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709633743; bh=XGhek5Egl9EZN5GDA4K/P4lgi/cIEFOzTc1LzkqAZ+k=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=XCEvQ40zftA5CzahxVADDhIsF5HF2PFIaFCKJl08PAjcb8phQRdD1LDvHI07G+6PZ CvBdJkiMC9pln2pklakFZdq+NgxiSqfQtDr0p0vm+4uPAZzhgnykkeCD4vUFgHznRW ZOBma0N0c7UfO8tz8sA3pTWlv0aFd55k7B11BsajPMqaNfTE2HFLWk/aE5awQigsm3 8YGZDR4iQoO0Eh3bVsL7tCcIfm91jjBwAWwMtpH2xHx88UXEnRem5M/qO+QvY/fybd zRSV/V3Bo/1pG1mbTkyZbbDGE1jDM3WdTIBsJpv95ElPKrvNelIborsyuhPgbCuKH7 DHXrXaxWh7khg== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, 
linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC 02/16] dma-mapping: provide an interface to allocate IOVA Date: Tue, 5 Mar 2024 12:15:12 +0200 Message-ID: <54a3554639bfb963c9919c5d7c1f449021bebdb3.1709631413.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: kvm@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Existing .map_page() callback provides two things at the same time: allocates IOVA and links DMA pages. That combination works great for most of the callers who use it in control paths, but is less effective in fast paths. These advanced callers already manage their data in some sort of database and can perform IOVA allocation in advance, leaving only the range linkage operation in the fast path. Provide an interface to allocate/deallocate IOVA; the next patch adds link/unlink of DMA ranges to that specific IOVA. 
Signed-off-by: Leon Romanovsky --- include/linux/dma-map-ops.h | 3 +++ include/linux/dma-mapping.h | 20 ++++++++++++++++++++ kernel/dma/mapping.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 4abc60f04209..bd605b44bb57 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -83,6 +83,9 @@ struct dma_map_ops { size_t (*max_mapping_size)(struct device *dev); size_t (*opt_mapping_size)(void); unsigned long (*get_merge_boundary)(struct device *dev); + + dma_addr_t (*alloc_iova)(struct device *dev, size_t size); + void (*free_iova)(struct device *dev, dma_addr_t dma_addr, size_t size); }; #ifdef CONFIG_DMA_OPS diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 4a658de44ee9..176fb8a86d63 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -91,6 +91,16 @@ static inline void debug_dma_map_single(struct device *dev, const void *addr, } #endif /* CONFIG_DMA_API_DEBUG */ +struct dma_iova_attrs { + /* OUT field */ + dma_addr_t addr; + /* IN fields */ + struct device *dev; + size_t size; + enum dma_data_direction dir; + unsigned long attrs; +}; + #ifdef CONFIG_HAS_DMA static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { @@ -101,6 +111,9 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) return 0; } +int dma_alloc_iova(struct dma_iova_attrs *iova); +void dma_free_iova(struct dma_iova_attrs *iova); + dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs); @@ -159,6 +172,13 @@ void dma_vunmap_noncontiguous(struct device *dev, void *vaddr); int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, size_t size, struct sg_table *sgt); #else /* CONFIG_HAS_DMA */ +static inline int dma_alloc_iova(struct dma_iova_attrs *iova) +{ + return -EOPNOTSUPP; +} 
+static inline void dma_free_iova(struct dma_iova_attrs *iova) +{ +} static inline dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 58db8fd70471..b6b27bab90f3 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -183,6 +183,36 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, } EXPORT_SYMBOL(dma_unmap_page_attrs); +int dma_alloc_iova(struct dma_iova_attrs *iova) +{ + struct device *dev = iova->dev; + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_map_direct(dev, ops) || !ops->alloc_iova) { + iova->addr = 0; + return 0; + } + + iova->addr = ops->alloc_iova(dev, iova->size); + if (dma_mapping_error(dev, iova->addr)) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(dma_alloc_iova); + +void dma_free_iova(struct dma_iova_attrs *iova) +{ + struct device *dev = iova->dev; + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_map_direct(dev, ops) || !ops->free_iova) + return; + + ops->free_iova(dev, iova->addr, iova->size); +} +EXPORT_SYMBOL(dma_free_iova); + static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { From patchwork Tue Mar 5 10:15:13 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13581956 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 735E15A780; Tue, 5 Mar 2024 10:15:47 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633747; 
cv=none; b=D0+2HVFd7R1Mhz+FAL/E4CuZmAJ0YYLpgOVIW2EoXItAqyPGuIVB/Gjtr1+NE7CgfeKgOwXHeTvlI8U5iAhR050pVMg4Ggpb6yEFAesiOW/r3tKwncbotJ0ODtmhs4OvB2bCPQMFzOw4b4SGAPEnXRpLYnDk53Zzqgombg+9nyM= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633747; c=relaxed/simple; bh=qAJEud8zSEtY+UaYfguXl0FuNLhyjwr4jvzS2H85FiM=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=uoCy+Yhx0QNc7qWPEIJCHRt05dD1pEyvBXDw5HdmsHQ0aAx2c8brXTOJBuPzC1Om37bAHhKJI66kT4yhdw2HhjKPxYj3D7aFfAe2GgBntnmxMSVQlFPPMGW/H5KlYNemNVb2A6qly49Xv8GUogh+HaWrM9YvKxnHz2H+l0ZX7zU= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=i6KfRrmN; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="i6KfRrmN" Received: by smtp.kernel.org (Postfix) with ESMTPSA id F2653C43394; Tue, 5 Mar 2024 10:15:45 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709633747; bh=qAJEud8zSEtY+UaYfguXl0FuNLhyjwr4jvzS2H85FiM=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=i6KfRrmN48zwpr58f2jn2e/4KoqDgu11fH84icYWi7KEjDlgMSwa2yskoM0pEdvED lwqbMF5T04iutucWA4erka9aP+EjX7Kjb/imGmt7yI/VQdkXDFZPfbOZatMO67TE1d R1GBVc6uz3XJqv/sD9fRb0932eSAzZZ+OLdul+PniCr2QIaCRf+7ivXT3+1lkcpffl NbvxILp9mDgF+iNO64UjpDjxS33n6TG7Ama1MHUUThE/3v8Ma0XkJPu4Dzh8vf5qTe cSuDCSx3F2rrN4NpXul9RJ+C51ATSFr+VQnbih8yd/sN8rJqiTSxCs6F2KroalPzVA AP3upF62lstFg== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, 
linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC 03/16] dma-mapping: provide callbacks to link/unlink pages to specific IOVA Date: Tue, 5 Mar 2024 12:15:13 +0200 Message-ID: X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: kvm@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Introduce new DMA link/unlink API to provide a way for advanced users to directly map/unmap pages without the need to allocate IOVA on every map call. Signed-off-by: Leon Romanovsky --- include/linux/dma-map-ops.h | 10 +++++++ include/linux/dma-mapping.h | 13 +++++++++ kernel/dma/debug.h | 2 ++ kernel/dma/direct.h | 3 ++ kernel/dma/mapping.c | 57 +++++++++++++++++++++++++++++++++++++ 5 files changed, 85 insertions(+) diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index bd605b44bb57..fd03a080df1e 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -86,6 +86,13 @@ struct dma_map_ops { dma_addr_t (*alloc_iova)(struct device *dev, size_t size); void (*free_iova)(struct device *dev, dma_addr_t dma_addr, size_t size); + dma_addr_t (*link_range)(struct device *dev, struct page *page, + unsigned long offset, dma_addr_t addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs); + void (*unlink_range)(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs); }; #ifdef CONFIG_DMA_OPS @@ -428,6 +435,9 @@ bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg, #define arch_dma_unmap_sg_direct(d, s, n) (false) #endif +#define arch_dma_link_range_direct arch_dma_map_page_direct +#define 
arch_dma_unlink_range_direct arch_dma_unmap_page_direct + #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, bool coherent); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 176fb8a86d63..91cc084adb53 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -113,6 +113,9 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) int dma_alloc_iova(struct dma_iova_attrs *iova); void dma_free_iova(struct dma_iova_attrs *iova); +dma_addr_t dma_link_range(struct page *page, unsigned long offset, + struct dma_iova_attrs *iova, dma_addr_t dma_offset); +void dma_unlink_range(struct dma_iova_attrs *iova, dma_addr_t dma_offset); dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, @@ -179,6 +182,16 @@ static inline int dma_alloc_iova(struct dma_iova_attrs *iova) static inline void dma_free_iova(struct dma_iova_attrs *iova) { } +static inline dma_addr_t dma_link_range(struct page *page, unsigned long offset, + struct dma_iova_attrs *iova, + dma_addr_t dma_offset) +{ + return DMA_MAPPING_ERROR; +} +static inline void dma_unlink_range(struct dma_iova_attrs *iova, + dma_addr_t dma_offset) +{ +} static inline dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs) diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index f525197d3cae..3d529f355c6d 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -127,4 +127,6 @@ static inline void debug_dma_sync_sg_for_device(struct device *dev, { } #endif /* CONFIG_DMA_API_DEBUG */ +#define debug_dma_link_range debug_dma_map_page +#define debug_dma_unlink_range debug_dma_unmap_page #endif /* _KERNEL_DMA_DEBUG_H */ diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index 18d346118fe8..1c30e1cd607a 100644 --- a/kernel/dma/direct.h +++ 
b/kernel/dma/direct.h @@ -125,4 +125,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); } + +#define dma_direct_link_range dma_direct_map_page +#define dma_direct_unlink_range dma_direct_unmap_page #endif /* _KERNEL_DMA_DIRECT_H */ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b6b27bab90f3..f989c64622c2 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -213,6 +213,63 @@ void dma_free_iova(struct dma_iova_attrs *iova) } EXPORT_SYMBOL(dma_free_iova); +/** + * dma_link_range - Link a physical page to DMA address + * @page: The page to be mapped + * @offset: The offset within the page + * @iova: Preallocated IOVA attributes + * @dma_offset: DMA offset from which this page needs to be linked + * + * dma_alloc_iova() allocates IOVA based on the size specified by the user in + * iova->size. Call this function after IOVA allocation to link @page from + * @offset to get the DMA address. Note that the very first call to this function + * will have @dma_offset set to 0 in the IOVA space allocated from + * dma_alloc_iova(). For subsequent calls to this function on same @iova, + * @dma_offset needs to be advanced by the caller with the size of previous + * page that was linked + DMA address returned for the previous page that was + * linked by this function. 
+ */ +dma_addr_t dma_link_range(struct page *page, unsigned long offset, + struct dma_iova_attrs *iova, dma_addr_t dma_offset) +{ + struct device *dev = iova->dev; + size_t size = iova->size; + enum dma_data_direction dir = iova->dir; + unsigned long attrs = iova->attrs; + dma_addr_t addr = iova->addr + dma_offset; + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_map_direct(dev, ops) || + arch_dma_link_range_direct(dev, page_to_phys(page) + offset + size)) + addr = dma_direct_link_range(dev, page, offset, size, dir, attrs); + else if (ops->link_range) + addr = ops->link_range(dev, page, offset, addr, size, dir, attrs); + + kmsan_handle_dma(page, offset, size, dir); + debug_dma_link_range(dev, page, offset, size, dir, addr, attrs); + return addr; +} +EXPORT_SYMBOL(dma_link_range); + +void dma_unlink_range(struct dma_iova_attrs *iova, dma_addr_t dma_offset) +{ + struct device *dev = iova->dev; + size_t size = iova->size; + enum dma_data_direction dir = iova->dir; + unsigned long attrs = iova->attrs; + dma_addr_t addr = iova->addr + dma_offset; + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_map_direct(dev, ops) || + arch_dma_unlink_range_direct(dev, addr + size)) + dma_direct_unlink_range(dev, addr, size, dir, attrs); + else if (ops->unlink_range) + ops->unlink_range(dev, addr, size, dir, attrs); + + debug_dma_unlink_range(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_unlink_range); + static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { From patchwork Tue Mar 5 10:15:14 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13581957 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org 
(Postfix) with ESMTPS id C14465B668; Tue, 5 Mar 2024 10:15:51 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633751; cv=none; b=Nhw2vS0DpxIp0GCQ0e1wnX2qmnc28BY8jm49CuG9GPR78TVK34+ZZbGD+lebjKwLuJIpHQEssl7D9vHd2k7MZBmhu+IVZSwI1+e1TVX9lTgXja9Qdat1G3jp8Nqn6SswZ0iAuBiLU8p7hDDeJ9otMtlfYjzMRnbZqCSHSaQYdKA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633751; c=relaxed/simple; bh=W6hmByMbviZow30AysrbmdCiStk7NPI/+8Qeoey1JY8=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=HokV6wlnf8Mfb4IKZnypS3yOU/npM9PT1ea6yYYXj4VdPz+1WtdyKlTnD/zS9XmBIfHseHTzYgXpLf4SpNS2KS10WFuUJOjpTbmQAgct87+V+yCAOg9VeFIQtOav4pYwwf24GBeecbFpIG7GrRy/4aUHfgVc/2UW/XHOK7rSZOg= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=l7gVIpZC; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="l7gVIpZC" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 99266C433A6; Tue, 5 Mar 2024 10:15:50 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709633751; bh=W6hmByMbviZow30AysrbmdCiStk7NPI/+8Qeoey1JY8=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=l7gVIpZCN4qfruYV+rhZs6OVB7eTnW+2jECepsOGpeOe6WYvjXT9SqiFpBQzTVqwz XZ0+mj9fWXef0KdsccKP98pCRc30F8WZsLr+lNyYrNYr3ah1H9ji1UrOI4hdLXFzf0 CXiKAAqhEBrPGQ+EI22ylfpvgmIPUXyhaTu/hkQYZKAaa92IBeLzIwCbEMlXgEp/+X SCobxhPdH3p7yiwMHielMCXj+TLdArAdfOiZodMpIPkTamBq50gmavt5TvXHw8Qguf gxgKOtzpIwVZVp1iDgIDemvks4MTC3dkgSc2SRsBHlswlYQDO8Faa4CI1/l3SS+j8R M12Dmxecw3+fg== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , 
Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC 04/16] iommu/dma: Provide an interface to allow preallocate IOVA Date: Tue, 5 Mar 2024 12:15:14 +0200 Message-ID: X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: kvm@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Separate IOVA allocation into a dedicated callback so that the IOVA can be cached and reused in fast paths for devices which support the ODP (on-demand paging) mechanism. Signed-off-by: Leon Romanovsky --- drivers/iommu/dma-iommu.c | 50 +++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 50ccc4f1ef81..e55726783501 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -356,7 +356,7 @@ int iommu_dma_init_fq(struct iommu_domain *domain) atomic_set(&cookie->fq_timer_on, 0); /* * Prevent incomplete fq state being observable. 
Pairs with path from - * __iommu_dma_unmap() through iommu_dma_free_iova() to queue_iova() + * __iommu_dma_unmap() through __iommu_dma_free_iova() to queue_iova() */ smp_wmb(); WRITE_ONCE(cookie->fq_domain, domain); @@ -760,7 +760,7 @@ static int dma_info_to_prot(enum dma_data_direction dir, bool coherent, } } -static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, +static dma_addr_t __iommu_dma_alloc_iova(struct iommu_domain *domain, size_t size, u64 dma_limit, struct device *dev) { struct iommu_dma_cookie *cookie = domain->iova_cookie; @@ -806,7 +806,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, return (dma_addr_t)iova << shift; } -static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie, +static void __iommu_dma_free_iova(struct iommu_dma_cookie *cookie, dma_addr_t iova, size_t size, struct iommu_iotlb_gather *gather) { struct iova_domain *iovad = &cookie->iovad; @@ -843,7 +843,7 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, if (!iotlb_gather.queued) iommu_iotlb_sync(domain, &iotlb_gather); - iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather); + __iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather); } static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, @@ -861,12 +861,12 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, size = iova_align(iovad, size + iova_off); - iova = iommu_dma_alloc_iova(domain, size, dma_mask, dev); + iova = __iommu_dma_alloc_iova(domain, size, dma_mask, dev); if (!iova) return DMA_MAPPING_ERROR; if (iommu_map(domain, iova, phys - iova_off, size, prot, GFP_ATOMIC)) { - iommu_dma_free_iova(cookie, iova, size, NULL); + __iommu_dma_free_iova(cookie, iova, size, NULL); return DMA_MAPPING_ERROR; } return iova + iova_off; @@ -970,7 +970,7 @@ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev, return NULL; size = iova_align(iovad, size); - iova = iommu_dma_alloc_iova(domain, size, 
dev->coherent_dma_mask, dev); + iova = __iommu_dma_alloc_iova(domain, size, dev->coherent_dma_mask, dev); if (!iova) goto out_free_pages; @@ -1004,7 +1004,7 @@ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev, out_free_sg: sg_free_table(sgt); out_free_iova: - iommu_dma_free_iova(cookie, iova, size, NULL); + __iommu_dma_free_iova(cookie, iova, size, NULL); out_free_pages: __iommu_dma_free_pages(pages, count); return NULL; @@ -1436,7 +1436,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, if (!iova_len) return __finalise_sg(dev, sg, nents, 0); - iova = iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev); + iova = __iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev); if (!iova) { ret = -ENOMEM; goto out_restore_sg; @@ -1453,7 +1453,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, return __finalise_sg(dev, sg, nents, iova); out_free_iova: - iommu_dma_free_iova(cookie, iova, iova_len, NULL); + __iommu_dma_free_iova(cookie, iova, iova_len, NULL); out_restore_sg: __invalidate_sg(sg, nents); out: @@ -1706,6 +1706,30 @@ static size_t iommu_dma_opt_mapping_size(void) return iova_rcache_range(); } +static dma_addr_t iommu_dma_alloc_iova(struct device *dev, size_t size) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + dma_addr_t dma_mask = dma_get_mask(dev); + + size = iova_align(iovad, size); + return __iommu_dma_alloc_iova(domain, size, dma_mask, dev); +} + +static void iommu_dma_free_iova(struct device *dev, dma_addr_t iova, + size_t size) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + struct iommu_iotlb_gather iotlb_gather; + + size = iova_align(iovad, size); + iommu_iotlb_gather_init(&iotlb_gather); + __iommu_dma_free_iova(cookie, iova, 
size, &iotlb_gather); +} + static const struct dma_map_ops iommu_dma_ops = { .flags = DMA_F_PCI_P2PDMA_SUPPORTED, .alloc = iommu_dma_alloc, @@ -1728,6 +1752,8 @@ static const struct dma_map_ops iommu_dma_ops = { .unmap_resource = iommu_dma_unmap_resource, .get_merge_boundary = iommu_dma_get_merge_boundary, .opt_mapping_size = iommu_dma_opt_mapping_size, + .alloc_iova = iommu_dma_alloc_iova, + .free_iova = iommu_dma_free_iova, }; /* @@ -1776,7 +1802,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, if (!msi_page) return NULL; - iova = iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev); + iova = __iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev); if (!iova) goto out_free_page; @@ -1790,7 +1816,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, return msi_page; out_free_iova: - iommu_dma_free_iova(cookie, iova, size, NULL); + __iommu_dma_free_iova(cookie, iova, size, NULL); out_free_page: kfree(msi_page); return NULL; From patchwork Tue Mar 5 10:15:15 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13581958 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3744B54909; Tue, 5 Mar 2024 10:15:55 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633756; cv=none; b=H+/tjgaxpSrJ4GSdbdu2jIV+M6bo5XWuxMtki5XBOzBlkd03MGJtJsknnnVM164nLiC3ulu+80qYGhvv22OIDlhVJWHJKzSM+LbUKrtZdnmrFcfCtPosIiFEWYXxmgJCVtw1p9c137EZv9tbBiWVZCAt2bKrxw0Fl80FXzrdQvU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633756; c=relaxed/simple; 
bh=fDITfGtiOJR6jcG42lBCR+mHX3ThOxLIN0l1OlLSZWc=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=JD6Vm/HoLDJi+HeY0COWDW4qDEppCHjBIGM1n7MYa4161m4y363+CLxFF52A7Vy00pgxKnh29a8E4kNd1C//nS6eOmuRvmov/qzk4bIma2s946/DcM7Mp6S3DU/qyF65pvmFdy3GH5b4ywg3hc4cpAZRnFUsLmytwwmztszV+u0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=FJmAaEr8; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="FJmAaEr8" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 227D7C43390; Tue, 5 Mar 2024 10:15:55 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709633755; bh=fDITfGtiOJR6jcG42lBCR+mHX3ThOxLIN0l1OlLSZWc=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=FJmAaEr87QFSt+uhHLYl3gzt6kLzOVu3dHN83lz1uM5vdU4CtuiCZSBs2d0P6f7BV GWXZ8G8+rt+JyKRB/va6I/qMbCj10myc6JCPYOQrIVY080OPwdHmgH7PshCI7OdYvx S+8B3TE6kbwnDp1zFlZHyAqOGX540TJFbUp2yzfmgTwEOjHK1bYG4a+pTLK2lshrsJ 0ZL5kYsyPGuWAJhCC+4EVVQJ4aM7aJ+7UClB7aCzOYGySCNA1+YOKR2MpEUzqIGEJc rQ8HJlcOeQfXjUvz+PO9tPYDEZlh6/vLugZCqhD1CyhnnPcovlPVF6ssENSQ+sV3QQ O7+tmhjiv284g== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. 
Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC 05/16] iommu/dma: Prepare map/unmap page functions to receive IOVA Date: Tue, 5 Mar 2024 12:15:15 +0200 Message-ID: <13187a8682ab4f8708ca88cc4363f90e64e14ccc.1709631413.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: kvm@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Extend the existing map_page/unmap_page function implementations to get preallocated IOVA. In such a case, the IOVA allocation needs to be skipped, but the rest of the code stays the same. Signed-off-by: Leon Romanovsky --- drivers/iommu/dma-iommu.c | 68 ++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index e55726783501..dbdd373a609a 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -824,7 +824,7 @@ static void __iommu_dma_free_iova(struct iommu_dma_cookie *cookie, } static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, - size_t size) + size_t size, bool free_iova) { struct iommu_domain *domain = iommu_get_dma_domain(dev); struct iommu_dma_cookie *cookie = domain->iova_cookie; @@ -843,17 +843,19 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, if (!iotlb_gather.queued) iommu_iotlb_sync(domain, &iotlb_gather); - __iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather); + if (free_iova) + __iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather); } static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, - size_t size, int prot, u64 dma_mask) + dma_addr_t iova, size_t size, int prot, + u64 dma_mask) { struct iommu_domain *domain = iommu_get_dma_domain(dev); struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iova_domain *iovad = &cookie->iovad; size_t iova_off = iova_offset(iovad, phys); - dma_addr_t 
iova; + bool no_iova = !iova; if (static_branch_unlikely(&iommu_deferred_attach_enabled) && iommu_deferred_attach(dev, domain)) @@ -861,12 +863,14 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, size = iova_align(iovad, size + iova_off); - iova = __iommu_dma_alloc_iova(domain, size, dma_mask, dev); + if (no_iova) + iova = __iommu_dma_alloc_iova(domain, size, dma_mask, dev); if (!iova) return DMA_MAPPING_ERROR; if (iommu_map(domain, iova, phys - iova_off, size, prot, GFP_ATOMIC)) { - __iommu_dma_free_iova(cookie, iova, size, NULL); + if (no_iova) + __iommu_dma_free_iova(cookie, iova, size, NULL); return DMA_MAPPING_ERROR; } return iova + iova_off; @@ -1031,7 +1035,7 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size, return vaddr; out_unmap: - __iommu_dma_unmap(dev, *dma_handle, size); + __iommu_dma_unmap(dev, *dma_handle, size, true); __iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT); return NULL; } @@ -1060,7 +1064,7 @@ static void iommu_dma_free_noncontiguous(struct device *dev, size_t size, { struct dma_sgt_handle *sh = sgt_handle(sgt); - __iommu_dma_unmap(dev, sgt->sgl->dma_address, size); + __iommu_dma_unmap(dev, sgt->sgl->dma_address, size, true); __iommu_dma_free_pages(sh->pages, PAGE_ALIGN(size) >> PAGE_SHIFT); sg_free_table(&sh->sgt); kfree(sh); @@ -1131,9 +1135,11 @@ static void iommu_dma_sync_sg_for_device(struct device *dev, arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); } -static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) +static dma_addr_t __iommu_dma_map_pages(struct device *dev, struct page *page, + unsigned long offset, dma_addr_t iova, + size_t size, + enum dma_data_direction dir, + unsigned long attrs) { phys_addr_t phys = page_to_phys(page) + offset; bool coherent = dev_is_dma_coherent(dev); @@ -1141,7 +1147,7 @@ static dma_addr_t iommu_dma_map_page(struct device 
*dev, struct page *page, struct iommu_domain *domain = iommu_get_dma_domain(dev); struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iova_domain *iovad = &cookie->iovad; - dma_addr_t iova, dma_mask = dma_get_mask(dev); + dma_addr_t addr, dma_mask = dma_get_mask(dev); /* * If both the physical buffer start address and size are @@ -1182,14 +1188,23 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) arch_sync_dma_for_device(phys, size, dir); - iova = __iommu_dma_map(dev, phys, size, prot, dma_mask); - if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(dev, phys)) + addr = __iommu_dma_map(dev, phys, iova, size, prot, dma_mask); + if (addr == DMA_MAPPING_ERROR && is_swiotlb_buffer(dev, phys)) swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); - return iova; + return addr; } -static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction dir, unsigned long attrs) +static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + return __iommu_dma_map_pages(dev, page, offset, 0, size, dir, attrs); +} + +static void __iommu_dma_unmap_pages(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs, bool free_iova) { struct iommu_domain *domain = iommu_get_dma_domain(dev); phys_addr_t phys; @@ -1201,12 +1216,19 @@ static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu(phys, size, dir); - __iommu_dma_unmap(dev, dma_handle, size); + __iommu_dma_unmap(dev, dma_handle, size, free_iova); if (unlikely(is_swiotlb_buffer(dev, phys))) swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); } +static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, + size_t 
size, enum dma_data_direction dir, + unsigned long attrs) +{ + __iommu_dma_unmap_pages(dev, dma_handle, size, dir, attrs, true); +} + /* * Prepare a successfully-mapped scatterlist to give back to the caller. * @@ -1509,13 +1531,13 @@ static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, } if (end) - __iommu_dma_unmap(dev, start, end - start); + __iommu_dma_unmap(dev, start, end - start, true); } static dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys, size_t size, enum dma_data_direction dir, unsigned long attrs) { - return __iommu_dma_map(dev, phys, size, + return __iommu_dma_map(dev, phys, 0, size, dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO, dma_get_mask(dev)); } @@ -1523,7 +1545,7 @@ static dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys, static void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle, size_t size, enum dma_data_direction dir, unsigned long attrs) { - __iommu_dma_unmap(dev, handle, size); + __iommu_dma_unmap(dev, handle, size, true); } static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr) @@ -1560,7 +1582,7 @@ static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr) static void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle, unsigned long attrs) { - __iommu_dma_unmap(dev, handle, size); + __iommu_dma_unmap(dev, handle, size, true); __iommu_dma_free(dev, size, cpu_addr); } @@ -1626,7 +1648,7 @@ static void *iommu_dma_alloc(struct device *dev, size_t size, if (!cpu_addr) return NULL; - *handle = __iommu_dma_map(dev, page_to_phys(page), size, ioprot, + *handle = __iommu_dma_map(dev, page_to_phys(page), 0, size, ioprot, dev->coherent_dma_mask); if (*handle == DMA_MAPPING_ERROR) { __iommu_dma_free(dev, size, cpu_addr); From patchwork Tue Mar 5 10:15:16 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky 
X-Patchwork-Id: 13581959 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id ECFDE5CDC6; Tue, 5 Mar 2024 10:15:59 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633760; cv=none; b=RDqAoL2nk+vGzssuadn2q+/8XyfHyWyw8CHt1If05Hw/CXiQZmrFGOAyoeOL1jv4qgtlyUf6uNRlghejNI4ERrUw+6Hx6NZpQNwqWHBJZqRVE0QXbMTuHsYqUJ7CJCst/hsoBEqhVeEME6FS3dK9DtAPcqJ6yxzvx8rasCNWBaM= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633760; c=relaxed/simple; bh=buI8QIvB9qQbf6UKyQhoBlduuXAN0B68Jk0a66fCfeA=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=SxBu1etgmH0JqbhpADDnOSQlhqUSllXAzxABfxBe/q+grL+UIVur4p7R1bSxLvZt0whlN1JShScKUOMoX7hIFzwNZUSqGnKWrfYhoKSc/QvFqPAlvgaOzV99QdzUg/qizDdoLbtYkUulsLmeA8x1IzC2vJC8huUY2eMxdP8smqk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=hD0RcuJh; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="hD0RcuJh" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 2967FC433B2; Tue, 5 Mar 2024 10:15:59 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709633759; bh=buI8QIvB9qQbf6UKyQhoBlduuXAN0B68Jk0a66fCfeA=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=hD0RcuJh00bicYhn0Ta6Ju9jPnBJ6gf5WneHPY3WuSGytfQ3Rc+DL/DoEICP3FM75 TX/RbpezM81fbB7g0M+/pUDbb5pBwp/U/cmchQIoNl1PajmoWeXVhqKZvBxR2j5GQ/ Xdi601RNr9cazQkefNIGomp16HunWfV85J/ruCFKJmmEcwPO6ApMTZcJ4UuZr1UsdF IMcAM0aIzs2G0O2qY8xakXVQgmneCmZ8NqSMafyLl2qF/5B13f9WQleGpTBPKI7GT6 
DWWSpgNOxOOcGpvNc8RrhGDXjyzyq5YkBJQZFkgzf+EGBUNHt2uhxv49wDRBLex/sm mq/GhRcT5gfAA== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC 06/16] iommu/dma: Implement link/unlink page callbacks Date: Tue, 5 Mar 2024 12:15:16 +0200 Message-ID: <1d3d26afcdbf95b053a3a44ceff34a4fa5334582.1709631413.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: kvm@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Add an implementation of the link/unlink interface to map/unmap pages in the fast path for pre-allocated IOVA. 
Signed-off-by: Leon Romanovsky --- drivers/iommu/dma-iommu.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index dbdd373a609a..b683c4a4e9f8 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1752,6 +1752,21 @@ static void iommu_dma_free_iova(struct device *dev, dma_addr_t iova, __iommu_dma_free_iova(cookie, iova, size, &iotlb_gather); } +static dma_addr_t iommu_dma_link_range(struct device *dev, struct page *page, + unsigned long offset, dma_addr_t iova, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + return __iommu_dma_map_pages(dev, page, offset, iova, size, dir, attrs); +} + +static void iommu_dma_unlink_range(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + __iommu_dma_unmap_pages(dev, addr, size, dir, attrs, false); +} + static const struct dma_map_ops iommu_dma_ops = { .flags = DMA_F_PCI_P2PDMA_SUPPORTED, .alloc = iommu_dma_alloc, @@ -1776,6 +1791,8 @@ static const struct dma_map_ops iommu_dma_ops = { .opt_mapping_size = iommu_dma_opt_mapping_size, .alloc_iova = iommu_dma_alloc_iova, .free_iova = iommu_dma_free_iova, + .link_range = iommu_dma_link_range, + .unlink_range = iommu_dma_unlink_range, }; /* From patchwork Tue Mar 5 10:15:17 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13581960 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9A2325D726; Tue, 5 Mar 2024 10:16:03 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633763; cv=none; 
b=ti521+51pzwwV6vrv67Tny67VFuoiIWlEAmx7VIHFnj1Jyrv5MeNnn17sqYkN6tI6Pj88BoJ+8e6qIWwmi+g3kaAy9dzbZwe4Qnofpdi93/lNR4YO8XxE3u7i6E8XAHmPkvB1v28TxOhVdYLLP7jPQz4Cwt4VCxl3mp+5Egy4to= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633763; c=relaxed/simple; bh=E8ahaIx2eHfPJk0G81HhbupoEvLGKO0ealmbIxFQ2TY=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=gR88c/9ASjak6+jkyvzJkRvhb0Ojg2e6rC14snhppieGYeshfiFiWIbZ9WNASU7P0QyFmt+jBfpYctUiklKxXgM779lUir04/SDYUNdLipyiOK2kn6N4sX6I41JjSw7IpL7gnoiw0uE4w7BooWDHcayxhkC0lj/sxLGHI9xNkSo= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=Cimcj0rC; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="Cimcj0rC" Received: by smtp.kernel.org (Postfix) with ESMTPSA id A89F2C433C7; Tue, 5 Mar 2024 10:16:02 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709633763; bh=E8ahaIx2eHfPJk0G81HhbupoEvLGKO0ealmbIxFQ2TY=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=Cimcj0rCFoKg3KWHRwTEgnoqc8qrWIhxvFizI1kU7loe/t73myo7KMiGNWQpQmDYK YkCKGSvq63GGBNQ2CLnL24z1RXznU+8F5XkWj/5/ZksOhZ0kBQccN2aT7CgAKP9v9x yl9owxSK+HXI1J5nvkmlDd9aQR0PNfqbHP8QIEKRMOoQAznwZDxj6A2AMZSNfEgi32 fiuIyDdyjU1OE4zsZOG6BbNIThso3SY9ITUaG4nRwQoZTR27ICISHS92MWtSOWycWO /4uENNscCuCDEqeAucc8qKvGe+qf/1JpEGEZlgyvbFkeMhHjGMpx2I6GdKSl26ERxE hhl/ov41ZTT0g== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, 
linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC 07/16] RDMA/umem: Preallocate and cache IOVA for UMEM ODP Date: Tue, 5 Mar 2024 12:15:17 +0200 Message-ID: <47cc27fbaf9f4bd19edbcaac380bdd9684c5d12f.1709631413.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: kvm@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky In preparation for providing a two-step interface to map pages, preallocate IOVA when UMEM is initialized. Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem_odp.c | 16 +++++++++++++++- include/rdma/ib_umem_odp.h | 1 + include/rdma/ib_verbs.h | 18 ++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index e9fa22d31c23..f69d1233dc82 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -50,6 +50,7 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, const struct mmu_interval_notifier_ops *ops) { + struct ib_device *dev = umem_odp->umem.ibdev; int ret; umem_odp->umem.is_odp = 1; @@ -87,15 +88,25 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, goto out_pfn_list; } + umem_odp->iova.dev = dev->dma_device; + umem_odp->iova.size = end - start; + umem_odp->iova.dir = DMA_BIDIRECTIONAL; + ret = ib_dma_alloc_iova(dev, &umem_odp->iova); + if (ret) + goto out_dma_list; + + ret = mmu_interval_notifier_insert(&umem_odp->notifier, umem_odp->umem.owning_mm, start, end - start, ops); if (ret) - goto out_dma_list; + goto out_free_iova; } return 0; +out_free_iova: + ib_dma_free_iova(dev, &umem_odp->iova); 
out_dma_list: kvfree(umem_odp->dma_list); out_pfn_list: @@ -262,6 +273,8 @@ EXPORT_SYMBOL(ib_umem_odp_get); void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { + struct ib_device *dev = umem_odp->umem.ibdev; + /* * Ensure that no more pages are mapped in the umem. * @@ -274,6 +287,7 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) ib_umem_end(umem_odp)); mutex_unlock(&umem_odp->umem_mutex); mmu_interval_notifier_remove(&umem_odp->notifier); + ib_dma_free_iova(dev, &umem_odp->iova); kvfree(umem_odp->dma_list); kvfree(umem_odp->pfn_list); } diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 0844c1d05ac6..bb2d7f2a5b04 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -23,6 +23,7 @@ struct ib_umem_odp { * See ODP_READ_ALLOWED_BIT and ODP_WRITE_ALLOWED_BIT. */ dma_addr_t *dma_list; + struct dma_iova_attrs iova; /* * The umem_mutex protects the page_list and dma_list fields of an ODP * umem, allowing only a single thread to map/unmap pages. 
The mutex diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b7b6b58dd348..e71fa19187cc 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4077,6 +4077,24 @@ static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr) return dma_mapping_error(dev->dma_device, dma_addr); } +static inline int ib_dma_alloc_iova(struct ib_device *dev, + struct dma_iova_attrs *iova) +{ + if (ib_uses_virt_dma(dev)) + return 0; + + return dma_alloc_iova(iova); +} + +static inline void ib_dma_free_iova(struct ib_device *dev, + struct dma_iova_attrs *iova) +{ + if (ib_uses_virt_dma(dev)) + return; + + dma_free_iova(iova); +} + /** * ib_dma_map_single - Map a kernel virtual address to DMA address * @dev: The device for which the dma_addr is to be created From patchwork Tue Mar 5 10:15:18 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13581961 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 22F9E5491F; Tue, 5 Mar 2024 10:16:07 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633768; cv=none; b=cm5N/Cz2l839WbW7yPaWvp00mEZ2odrcX7o2arS3p8cyirlImeE/Lo5VxkFE8cdEXFmK2GZoh0fWvENu3kn+QLnN7VE91qGFU2dNSRb/N0jWF3LgjDmYDRQIeyY2XY53bJxA7oytwUUyc/KNtMZT4nnG26I6dKUKzz+BBzSrqIY= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633768; c=relaxed/simple; bh=aiC62CDa6AOfDDD5eldoWW566/DmWHKZI7ImrxLpIDI=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=T7MyGHzHaR0ecnjfLHZUa8bSvG7UomuHac/7Y7l7WL5G/dsNjMZklml1q0mK7Tz9gfoHvoYuvloe1q4XUWbGCuSiFHPk3vH15tEs33PO4D3qoGGLmicaK6CoHbPXLv/SQYOOoYfySyEbTs3m3ainW/mY4JC/3XJodcPpwWqCZfY= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=odKrLpzw; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="odKrLpzw" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 8AB2BC43390; Tue, 5 Mar 2024 10:16:06 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709633767; bh=aiC62CDa6AOfDDD5eldoWW566/DmWHKZI7ImrxLpIDI=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=odKrLpzwo1+ckRGRNVfwyJTveQktZ8kd/S5yCucQpqZHcu4+kPj9mCgK/xgfYyPv3 I1yxGUXHANUZsFY2/zao6yrw+TC1BO6Ug1tU2XFw43OtQovJo9iqD1Ai4PUuOW2ek4 X/9Elu7Uq5ZyXbu6pqOIjh/WRKRxCRaxho9zx8G+nbRwIw8EWxL/eMZEcymmglY26U WQHazEjuKBpCIJbiC5wZCkJazHnRrQozUZyHySSTthx/1IlEePhg78T3ZBxMiNMYg+ cr2bSbBp/+udUr2p98mwq716WamQdsJgUH8ZChwuFyRcfal6uhhPyUvWkzAuA7xFxs NQnrJCt3MltQw== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. 
Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC 08/16] RDMA/umem: Store ODP access mask information in PFN Date: Tue, 5 Mar 2024 12:15:18 +0200 Message-ID: <88b042d29a28a2866d5bc5ca20bdba4a71bc7aca.1709631413.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: kvm@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky In preparation for removing dma_list, store the access mask in the PFN pointer and not in dma_addr_t. Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem_odp.c | 99 +++++++++++----------------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + drivers/infiniband/hw/mlx5/odp.c | 37 ++++++----- include/rdma/ib_umem_odp.h | 13 ---- 4 files changed, 59 insertions(+), 91 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index f69d1233dc82..3619fb78f786 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -310,22 +310,11 @@ EXPORT_SYMBOL(ib_umem_odp_release); static int ib_umem_odp_map_dma_single_page( struct ib_umem_odp *umem_odp, unsigned int dma_index, - struct page *page, - u64 access_mask) + struct page *page) { struct ib_device *dev = umem_odp->umem.ibdev; dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index]; - if (*dma_addr) { - /* - * If the page is already dma mapped it means it went through - * a non-invalidating trasition, like read-only to writable. - * Resync the flags. 
- */ - *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask; - return 0; - } - *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift, DMA_BIDIRECTIONAL); if (ib_dma_mapping_error(dev, *dma_addr)) { @@ -333,7 +322,6 @@ static int ib_umem_odp_map_dma_single_page( return -EFAULT; } umem_odp->npages++; - *dma_addr |= access_mask; return 0; } @@ -369,9 +357,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, struct hmm_range range = {}; unsigned long timeout; - if (access_mask == 0) - return -EINVAL; - if (user_virt < ib_umem_start(umem_odp) || user_virt + bcnt > ib_umem_end(umem_odp)) return -EFAULT; @@ -397,7 +382,7 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, if (fault) { range.default_flags = HMM_PFN_REQ_FAULT; - if (access_mask & ODP_WRITE_ALLOWED_BIT) + if (access_mask & HMM_PFN_WRITE) range.default_flags |= HMM_PFN_REQ_WRITE; } @@ -429,22 +414,17 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, for (pfn_index = 0; pfn_index < num_pfns; pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) { - if (fault) { - /* - * Since we asked for hmm_range_fault() to populate - * pages it shouldn't return an error entry on success. - */ - WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR); - WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)); - } else { - if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) { - WARN_ON(umem_odp->dma_list[dma_index]); - continue; - } - access_mask = ODP_READ_ALLOWED_BIT; - if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE) - access_mask |= ODP_WRITE_ALLOWED_BIT; - } + /* + * Since we asked for hmm_range_fault() to populate + * pages it shouldn't return an error entry on success. 
+ */ + WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR); + WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)); + if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) + continue; + + if (range.hmm_pfns[pfn_index] & HMM_PFN_STICKY) + continue; hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]); /* If a hugepage was detected and ODP wasn't set for, the umem @@ -459,13 +439,13 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, } ret = ib_umem_odp_map_dma_single_page( - umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]), - access_mask); + umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index])); if (ret < 0) { ibdev_dbg(umem_odp->umem.ibdev, "ib_umem_odp_map_dma_single_page failed with error %d\n", ret); break; } + range.hmm_pfns[pfn_index] |= HMM_PFN_STICKY; } /* upon success lock should stay on hold for the callee */ if (!ret) @@ -485,7 +465,6 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock); void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { - dma_addr_t dma_addr; dma_addr_t dma; int idx; u64 addr; @@ -496,34 +475,34 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, virt = max_t(u64, virt, ib_umem_start(umem_odp)); bound = min_t(u64, bound, ib_umem_end(umem_odp)); for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) { + unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT; + struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]); + idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; dma = umem_odp->dma_list[idx]; - /* The access flags guaranteed a valid DMA address in case was NULL */ - if (dma) { - unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT; - struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]); - - dma_addr = dma & ODP_DMA_ADDR_MASK; - ib_dma_unmap_page(dev, dma_addr, - BIT(umem_odp->page_shift), - DMA_BIDIRECTIONAL); - if (dma & 
ODP_WRITE_ALLOWED_BIT) { - struct page *head_page = compound_head(page); - /* - * set_page_dirty prefers being called with - * the page lock. However, MMU notifiers are - * called sometimes with and sometimes without - * the lock. We rely on the umem_mutex instead - * to prevent other mmu notifiers from - * continuing and allowing the page mapping to - * be removed. - */ - set_page_dirty(head_page); - } - umem_odp->dma_list[idx] = 0; - umem_odp->npages--; + if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_VALID)) + continue; + if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_STICKY)) + continue; + + ib_dma_unmap_page(dev, dma, BIT(umem_odp->page_shift), + DMA_BIDIRECTIONAL); + if (umem_odp->pfn_list[pfn_idx] & HMM_PFN_WRITE) { + struct page *head_page = compound_head(page); + /* + * set_page_dirty prefers being called with + * the page lock. However, MMU notifiers are + * called sometimes with and sometimes without + * the lock. We rely on the umem_mutex instead + * to prevent other mmu notifiers from + * continuing and allowing the page mapping to + * be removed. + */ + set_page_dirty(head_page); } + umem_odp->pfn_list[pfn_idx] &= ~HMM_PFN_STICKY; + umem_odp->npages--; } } EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index bbe79b86c717..4f368242680d 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -334,6 +334,7 @@ struct mlx5_ib_flow_db { #define MLX5_IB_UPD_XLT_PD BIT(4) #define MLX5_IB_UPD_XLT_ACCESS BIT(5) #define MLX5_IB_UPD_XLT_INDIRECT BIT(6) +#define MLX5_IB_UPD_XLT_DOWNGRADE BIT(7) /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. 
* diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 4a04cbc5b78a..5713fe25f4de 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "mlx5_ib.h" #include "cmd.h" @@ -143,22 +144,12 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, } } -static u64 umem_dma_to_mtt(dma_addr_t umem_dma) -{ - u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK; - - if (umem_dma & ODP_READ_ALLOWED_BIT) - mtt_entry |= MLX5_IB_MTT_READ; - if (umem_dma & ODP_WRITE_ALLOWED_BIT) - mtt_entry |= MLX5_IB_MTT_WRITE; - - return mtt_entry; -} - static void populate_mtt(__be64 *pas, size_t idx, size_t nentries, struct mlx5_ib_mr *mr, int flags) { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE; + unsigned long pfn; dma_addr_t pa; size_t i; @@ -166,8 +157,17 @@ static void populate_mtt(__be64 *pas, size_t idx, size_t nentries, return; for (i = 0; i < nentries; i++) { + pfn = odp->pfn_list[idx + i]; + if (!(pfn & HMM_PFN_VALID)) + /* Initial ODP init */ + continue; + pa = odp->dma_list[idx + i]; - pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); + pa |= MLX5_IB_MTT_READ; + if ((pfn & HMM_PFN_WRITE) && !downgrade) + pa |= MLX5_IB_MTT_WRITE; + + pas[i] = cpu_to_be64(pa); } } @@ -268,8 +268,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, * estimate the cost of another UMR vs. the cost of bigger * UMR. 
*/ - if (umem_odp->dma_list[idx] & - (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { + if (umem_odp->pfn_list[idx] & HMM_PFN_VALID) { if (!in_block) { blk_start_idx = idx; in_block = 1; @@ -555,7 +554,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, { int page_shift, ret, np; bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; - u64 access_mask; + u64 access_mask = 0; u64 start_idx; bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT); u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC; @@ -563,12 +562,14 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, if (flags & MLX5_PF_FLAGS_ENABLE) xlt_flags |= MLX5_IB_UPD_XLT_ENABLE; + if (flags & MLX5_PF_FLAGS_DOWNGRADE) + xlt_flags |= MLX5_IB_UPD_XLT_DOWNGRADE; + page_shift = odp->page_shift; start_idx = (user_va - ib_umem_start(odp)) >> page_shift; - access_mask = ODP_READ_ALLOWED_BIT; if (odp->umem.writable && !downgrade) - access_mask |= ODP_WRITE_ALLOWED_BIT; + access_mask |= HMM_PFN_WRITE; np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault); if (np < 0) diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index bb2d7f2a5b04..095b1297cfb1 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -68,19 +68,6 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp) umem_odp->page_shift; } -/* - * The lower 2 bits of the DMA address signal the R/W permissions for - * the entry. To upgrade the permissions, provide the appropriate - * bitmask to the map_dma_pages function. - * - * Be aware that upgrading a mapped address might result in change of - * the DMA address for the page. 
- */ -#define ODP_READ_ALLOWED_BIT (1<<0ULL) -#define ODP_WRITE_ALLOWED_BIT (1<<1ULL) - -#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) - #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_umem_odp * From patchwork Tue Mar 5 10:15:19 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13581962 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 666A85F466; Tue, 5 Mar 2024 10:16:11 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633771; cv=none; b=VRXgAD++odSxnW8JWBHtqH9HG5RZZPAakXy5R75duY2uHuEog73F92tFvM1++Ca48yUw6DG9ZzjWYvd1jQ5FMZt4jYJpA0la58I9WrvyQE+RUxjJisngfh05CKl/SjE2O0yg+/jO9VxOw7Fkk4sHWnvjEwZ12HdQXsES1W4qWVo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633771; c=relaxed/simple; bh=VVD1wxwJfNeGBkq+0kcqb+dcMkI1qAL6P7iY3YcEByk=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=kl1+NLQ4X55smiqwuFCpGbCPXc2IqAcpS/xWCuZVp7RnjmpB2nyy2+UqSWx/nbvMr0nmbNyTFqZhL8RipK+r2e16bRwPgXGbKcd57hTFehi/dYJi3saGVVChtOh0R/F5ArWooWhs0muYaMUDm1AlmTNnbsNih8r8H7ZJShMOVLU= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=Re5/PREc; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="Re5/PREc" Received: by smtp.kernel.org (Postfix) with ESMTPSA id B2030C433F1; Tue, 5 Mar 2024 10:16:10 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; 
t=1709633771; bh=VVD1wxwJfNeGBkq+0kcqb+dcMkI1qAL6P7iY3YcEByk=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=Re5/PREctans6bjAaCScdeo+OGx9Q0gK1V/DUyZejXI/cOEwqo9NbvH0bS0ukSRzY IO/UKo4/zvHoXhFlzBm7rkytV2fXO3sfDTwKk1YF4g7KnjIZJnhC38ThoJ8egOf97m g4a34Fomxt5HbL1QobDIYe5WO49HDJFoBuSUR3sxyehwushCKt9mQaenNr4ah8bZjI iRyOhCdtO8jqtiPOwv6kquAL+SI7/sZiOaVipfxPtHPM9YEoRLtuvc3A7Jy/k5unuZ O5i2rMBduMf6GtLh0Na4vtbnu75fR3CLlqQUL81nhWRM2LWeNit78vQf/3WGAsKjtj dpiN3RI5d/DnQ== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC 09/16] RDMA/core: Separate DMA mapping to caching IOVA and page linkage Date: Tue, 5 Mar 2024 12:15:19 +0200 Message-ID: <22f9bd2e33ca2ec2b3d3bbd4cbac55122991e02f.1709631413.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: kvm@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Reuse newly added DMA API to cache IOVA and only link/unlink pages in fast path. 
Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem_odp.c | 57 ++---------------------------- drivers/infiniband/hw/mlx5/odp.c | 22 +++++++++++- include/rdma/ib_umem_odp.h | 8 +---- include/rdma/ib_verbs.h | 36 +++++++++++++++++++ 4 files changed, 61 insertions(+), 62 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 3619fb78f786..1301009a6b78 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -81,20 +81,13 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, if (!umem_odp->pfn_list) return -ENOMEM; - umem_odp->dma_list = kvcalloc( - ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL); - if (!umem_odp->dma_list) { - ret = -ENOMEM; - goto out_pfn_list; - } umem_odp->iova.dev = dev->dma_device; umem_odp->iova.size = end - start; umem_odp->iova.dir = DMA_BIDIRECTIONAL; ret = ib_dma_alloc_iova(dev, &umem_odp->iova); if (ret) - goto out_dma_list; - + goto out_pfn_list; ret = mmu_interval_notifier_insert(&umem_odp->notifier, umem_odp->umem.owning_mm, @@ -107,8 +100,6 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, out_free_iova: ib_dma_free_iova(dev, &umem_odp->iova); -out_dma_list: - kvfree(umem_odp->dma_list); out_pfn_list: kvfree(umem_odp->pfn_list); return ret; @@ -288,7 +279,6 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) mutex_unlock(&umem_odp->umem_mutex); mmu_interval_notifier_remove(&umem_odp->notifier); ib_dma_free_iova(dev, &umem_odp->iova); - kvfree(umem_odp->dma_list); kvfree(umem_odp->pfn_list); } put_pid(umem_odp->tgid); @@ -296,40 +286,10 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) } EXPORT_SYMBOL(ib_umem_odp_release); -/* - * Map for DMA and insert a single page into the on-demand paging page tables. - * - * @umem: the umem to insert the page to. - * @dma_index: index in the umem to add the dma to. - * @page: the page struct to map and add. 
- * @access_mask: access permissions needed for this page. - * - * The function returns -EFAULT if the DMA mapping operation fails. - * - */ -static int ib_umem_odp_map_dma_single_page( - struct ib_umem_odp *umem_odp, - unsigned int dma_index, - struct page *page) -{ - struct ib_device *dev = umem_odp->umem.ibdev; - dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index]; - - *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift, - DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(dev, *dma_addr)) { - *dma_addr = 0; - return -EFAULT; - } - umem_odp->npages++; - return 0; -} - /** * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it. * * Maps the range passed in the argument to DMA addresses. - * The DMA addresses of the mapped pages is updated in umem_odp->dma_list. * Upon success the ODP MR will be locked to let caller complete its device * page table update. * @@ -437,15 +397,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, __func__, hmm_order, page_shift); break; } - - ret = ib_umem_odp_map_dma_single_page( - umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index])); - if (ret < 0) { - ibdev_dbg(umem_odp->umem.ibdev, - "ib_umem_odp_map_dma_single_page failed with error %d\n", ret); - break; - } - range.hmm_pfns[pfn_index] |= HMM_PFN_STICKY; } /* upon success lock should stay on hold for the callee */ if (!ret) @@ -465,7 +416,6 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock); void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { - dma_addr_t dma; int idx; u64 addr; struct ib_device *dev = umem_odp->umem.ibdev; @@ -479,15 +429,14 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]); idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; - dma = umem_odp->dma_list[idx]; if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_VALID)) continue; if 
(!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_STICKY)) continue; - ib_dma_unmap_page(dev, dma, BIT(umem_odp->page_shift), - DMA_BIDIRECTIONAL); + ib_dma_unlink_range(dev, &umem_odp->iova, + idx * (1 << umem_odp->page_shift)); if (umem_odp->pfn_list[pfn_idx] & HMM_PFN_WRITE) { struct page *head_page = compound_head(page); /* diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 5713fe25f4de..13d61f1ab40b 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -149,6 +149,7 @@ static void populate_mtt(__be64 *pas, size_t idx, size_t nentries, { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE; + struct ib_device *dev = odp->umem.ibdev; unsigned long pfn; dma_addr_t pa; size_t i; @@ -162,12 +163,31 @@ static void populate_mtt(__be64 *pas, size_t idx, size_t nentries, /* Initial ODP init */ continue; - pa = odp->dma_list[idx + i]; + if (pfn & HMM_PFN_STICKY && odp->iova.addr) + /* + * We are in this flow when there is a need to resync flags, + * for example when page was already linked in prefetch call + * with READ flag and now we need to add WRITE flag + * + * This page was already programmed to HW and we don't want/need + * to unlink and link it again just to resync flags. + * + * The DMA address calculation below is based on the fact that + * RDMA UMEM doesn't work with swiotlb. 
+ */ + pa = odp->iova.addr + (idx + i) * (1 << odp->page_shift); + else + pa = ib_dma_link_range(dev, hmm_pfn_to_page(pfn), 0, &odp->iova, + (idx + i) * (1 << odp->page_shift)); + WARN_ON_ONCE(ib_dma_mapping_error(dev, pa)); + pa |= MLX5_IB_MTT_READ; if ((pfn & HMM_PFN_WRITE) && !downgrade) pa |= MLX5_IB_MTT_WRITE; pas[i] = cpu_to_be64(pa); + odp->pfn_list[idx + i] |= HMM_PFN_STICKY; + odp->npages++; } } diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 095b1297cfb1..a786556c65f9 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -17,15 +17,9 @@ struct ib_umem_odp { /* An array of the pfns included in the on-demand paging umem. */ unsigned long *pfn_list; - /* - * An array with DMA addresses mapped for pfns in pfn_list. - * The lower two bits designate access permissions. - * See ODP_READ_ALLOWED_BIT and ODP_WRITE_ALLOWED_BIT. - */ - dma_addr_t *dma_list; struct dma_iova_attrs iova; /* - * The umem_mutex protects the page_list and dma_list fields of an ODP + * The umem_mutex protects the page_list field of an ODP * umem, allowing only a single thread to map/unmap pages. The mutex * also protects access to the mmu notifier counters. 
*/ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e71fa19187cc..c9e2bcd5268a 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4160,6 +4160,42 @@ static inline void ib_dma_unmap_page(struct ib_device *dev, dma_unmap_page(dev->dma_device, addr, size, direction); } +/** + * ib_dma_link_range - Link a physical page to DMA address + * @dev: The device for which the dma_addr is to be created + * @page: The page to be mapped + * @offset: The offset within the page + * @iova: Preallocated IOVA attributes + * @dma_offset: DMA offset + */ +static inline dma_addr_t ib_dma_link_range(struct ib_device *dev, + struct page *page, + unsigned long offset, + struct dma_iova_attrs *iova, + dma_addr_t dma_offset) +{ + if (ib_uses_virt_dma(dev)) + return (uintptr_t)(page_address(page) + offset); + + return dma_link_range(page, offset, iova, dma_offset); +} + +/** + * ib_dma_unlink_range - Unlink a mapping created by ib_dma_link_range() + * @dev: The device for which the DMA address was created + * @iova: DMA IOVA properties + * @dma_offset: DMA offset + */ +static inline void ib_dma_unlink_range(struct ib_device *dev, + struct dma_iova_attrs *iova, + dma_addr_t dma_offset) +{ + if (ib_uses_virt_dma(dev)) + return; + + dma_unlink_range(iova, dma_offset); +} + int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents); static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, From patchwork Tue Mar 5 10:15:20 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13581963 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C5B9255784; Tue, 5 Mar 2024 10:16:15 +0000 (UTC) Authentication-Results: 
smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633775; cv=none; b=YG14tpb/kis4XsYTVL+DC1CKMZ7MaGxohDVXMkSygLs4TikYb/kRyAzGnofvzuU9qMVhyqlbIEmTBsUMPq2RplfTuJC5HRzNQrvBcV162gMd5TZre0IbDu6l0KfSEbhIeT0gOZRC4/nlVoCXbP0A1AY3xuO3Dh0CEvRvTyE0Z0k= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633775; c=relaxed/simple; bh=inT0MN5YzB4Hs5AmOhFFcCz6zSy1Lq0FeGslm64Slg8=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=Jd7lYq4SNcWRO+BcI7f+ZVyYOj0ucEjTbzWhTnalFBOx2fgSdnrj+Q1oJo4lDFqiKFj+L1jP7y2tD8xWHWQGxy3UcJ/C8R/m6VClEChfIAN/kJaVuK5w6CzH2ElNu8/i9RWJuNjaOVKTy7KvT9CGDIj4t4Usyy2Vusz7kmK8Xy0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=fCHRXvyt; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="fCHRXvyt" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 89E64C43394; Tue, 5 Mar 2024 10:16:14 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709633775; bh=inT0MN5YzB4Hs5AmOhFFcCz6zSy1Lq0FeGslm64Slg8=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=fCHRXvytmeqDSxdCD2EfXr+ZdX+jOkx56Bb2+TCYBawnqgSnLqcXlkFgDF5qmcEOp E7PMxA+vsc9YowiUDs+N+vKmKNosJclbR8xYEqnO1rRvVNV4WnPpECqgsHVJpNlMKA lB8rm/cBF860sPiBVcTFadxXT46nj5cyTjjsLn7L1IQPJJauWBLzOauVDxRC7JMriq hqBOuvcngfO3Ag48oQOmKFqqEd24H42kIyQ6pFAEtQwXjfX4+Lv6LdF8+Wba5yxu2m Q4I0UNsudFH3cUZ/4s6hLK+r8y5Wx3e1b6prKFfNnu92fF6dlE3FGGxbKpEWGUnQh+ 39aiIZNMUkzUA== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex 
Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC 10/16] RDMA/umem: Prevent UMEM ODP creation with SWIOTLB Date: Tue, 5 Mar 2024 12:15:20 +0200 Message-ID: <8c6d5e7db2d1a01888cc7b9b9850b05e19c75c64.1709631413.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: kvm@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky RDMA UMEM never supported DMA addresses returned from SWIOTLB, as these addresses are programmed into the hardware, which is not aware that they are bounce buffers and not real ones. Instead of silently leaving a broken system for users who are unaware of this, let's be explicit and return an error to them. Signed-off-by: Leon Romanovsky --- Documentation/core-api/dma-attributes.rst | 7 +++ drivers/infiniband/core/umem_odp.c | 77 +++++++++++------------ include/linux/dma-mapping.h | 6 ++ kernel/dma/direct.h | 4 +- kernel/dma/mapping.c | 4 ++ 5 files changed, 58 insertions(+), 40 deletions(-) diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 1887d92e8e92..b337ec65d506 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -130,3 +130,10 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). 
+ +DMA_ATTR_NO_TRANSLATION +----------------------- + +This attribute is used to indicate to the DMA-mapping subsystem that the +buffer is not subject to any address translation. This is used for devices +that don't need buffer bouncing or fixing DMA addresses. diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 1301009a6b78..57c56000f60e 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -50,51 +50,50 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, const struct mmu_interval_notifier_ops *ops) { + size_t page_size = 1UL << umem_odp->page_shift; struct ib_device *dev = umem_odp->umem.ibdev; + size_t ndmas, npfns; + unsigned long start; + unsigned long end; int ret; umem_odp->umem.is_odp = 1; mutex_init(&umem_odp->umem_mutex); - if (!umem_odp->is_implicit_odp) { - size_t page_size = 1UL << umem_odp->page_shift; - unsigned long start; - unsigned long end; - size_t ndmas, npfns; - - start = ALIGN_DOWN(umem_odp->umem.address, page_size); - if (check_add_overflow(umem_odp->umem.address, - (unsigned long)umem_odp->umem.length, - &end)) - return -EOVERFLOW; - end = ALIGN(end, page_size); - if (unlikely(end < page_size)) - return -EOVERFLOW; - - ndmas = (end - start) >> umem_odp->page_shift; - if (!ndmas) - return -EINVAL; - - npfns = (end - start) >> PAGE_SHIFT; - umem_odp->pfn_list = kvcalloc( - npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL); - if (!umem_odp->pfn_list) - return -ENOMEM; - - - umem_odp->iova.dev = dev->dma_device; - umem_odp->iova.size = end - start; - umem_odp->iova.dir = DMA_BIDIRECTIONAL; - ret = ib_dma_alloc_iova(dev, &umem_odp->iova); - if (ret) - goto out_pfn_list; - - ret = mmu_interval_notifier_insert(&umem_odp->notifier, - umem_odp->umem.owning_mm, - start, end - start, ops); - if (ret) - goto out_free_iova; - } + if (umem_odp->is_implicit_odp) + return 0; + + start = ALIGN_DOWN(umem_odp->umem.address, page_size); + if 
(check_add_overflow(umem_odp->umem.address, + (unsigned long)umem_odp->umem.length, &end)) + return -EOVERFLOW; + end = ALIGN(end, page_size); + if (unlikely(end < page_size)) + return -EOVERFLOW; + + ndmas = (end - start) >> umem_odp->page_shift; + if (!ndmas) + return -EINVAL; + + npfns = (end - start) >> PAGE_SHIFT; + umem_odp->pfn_list = + kvcalloc(npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL); + if (!umem_odp->pfn_list) + return -ENOMEM; + + umem_odp->iova.dev = dev->dma_device; + umem_odp->iova.size = end - start; + umem_odp->iova.dir = DMA_BIDIRECTIONAL; + umem_odp->iova.attrs = DMA_ATTR_NO_TRANSLATION; + ret = ib_dma_alloc_iova(dev, &umem_odp->iova); + if (ret) + goto out_pfn_list; + + ret = mmu_interval_notifier_insert(&umem_odp->notifier, + umem_odp->umem.owning_mm, start, + end - start, ops); + if (ret) + goto out_free_iova; return 0; diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 91cc084adb53..89945e707a9b 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -62,6 +62,12 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) +/* + * DMA_ATTR_NO_TRANSLATION: used to indicate that the buffer should not be mapped + * through address translation. + */ +#define DMA_ATTR_NO_TRANSLATION (1UL << 10) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. 
It is specific to a diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index 1c30e1cd607a..1c9ec204c999 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -92,6 +92,8 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, if (is_swiotlb_force_bounce(dev)) { if (is_pci_p2pdma_page(page)) return DMA_MAPPING_ERROR; + if (attrs & DMA_ATTR_NO_TRANSLATION) + return DMA_MAPPING_ERROR; return swiotlb_map(dev, phys, size, dir, attrs); } @@ -99,7 +101,7 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, dma_kmalloc_needs_bounce(dev, size, dir)) { if (is_pci_p2pdma_page(page)) return DMA_MAPPING_ERROR; - if (is_swiotlb_active(dev)) + if (is_swiotlb_active(dev) && !(attrs & DMA_ATTR_NO_TRANSLATION)) return swiotlb_map(dev, phys, size, dir, attrs); dev_WARN_ONCE(dev, 1, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index f989c64622c2..49b1fde510c5 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -188,6 +188,10 @@ int dma_alloc_iova(struct dma_iova_attrs *iova) struct device *dev = iova->dev; const struct dma_map_ops *ops = get_dma_ops(dev); + if (dma_map_direct(dev, ops) && is_swiotlb_force_bounce(dev) && + iova->attrs & DMA_ATTR_NO_TRANSLATION) + return -EOPNOTSUPP; + if (dma_map_direct(dev, ops) || !ops->alloc_iova) { iova->addr = 0; return 0; From patchwork Tue Mar 5 10:15:21 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13581964 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6B46B604DD; Tue, 5 Mar 2024 10:16:19 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633779; cv=none; 
b=uSiHnTIJuy6D2CjwPrDq/Sfzhwl8D+orpVWu7Q5xKOcHwdad+3B+y4ovuw3FmByeKI12gFLBpFms3EGVHw9jXhuLvnMaQeysfksnlu5ndWhO79lt47lr8TEk3e5TayLKcqzO8jjqQkK3XJYhiUUS1qz5kC8Sv6Ad1ZGnlnXAYPU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633779; c=relaxed/simple; bh=utZ3KdFtdsQAESX2EpyzMrGiuaExMhARHy+JfNBKyD0=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=Gaa0E8D7Q52mHYennzZXIJn1CP28aEIxfqLxXhW2GZz18bcr4QMnUF190Jg78nye98YaqBELxHIvbL98V25PpQPAeprBPvPWwJo/G7Nrf2IFaqIgYNY1aSu77NsBn2460LVCh6qgAWLBUiJxSVAkSI1tZxv3bj0JJJ0m7h1aZNA= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=L2QU2W8h; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="L2QU2W8h" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 6AE5DC433C7; Tue, 5 Mar 2024 10:16:18 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709633779; bh=utZ3KdFtdsQAESX2EpyzMrGiuaExMhARHy+JfNBKyD0=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=L2QU2W8hopktnjqtLh1dUA+1NmamN3jO2hagY+NwOntBW3lcniRVvhjZhygSySvOU zmT1qrJchWcoqJ2R8mvRPMDLWCJ3zSLtuMGhw4NQgQMYE8T+IPVf6ITidiC1VwGRir gtTb1oFKQVhxzVAOxoUh7eF6aVHsgij8QiaxeWlZYGmhQGHOcBXawhmIlI8lY89hFI r3LrJVTUJ9WGUE1lZOaEerkHmLwDIG0ZpwWAZk8UiAYs18llZFSY7SCbVioinjMAVf lBBpzlWMXaUIBxBZqwoyXLVY13am2lZaFJpc9bCZSCgoFvDIi5G4lUMLgNXKm0lxt6 HypLpEtFTk/XQ== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, 
linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC 11/16] vfio/mlx5: Explicitly use number of pages instead of allocated length Date: Tue, 5 Mar 2024 12:15:21 +0200 Message-ID: <01606f62be051034035ef1501b7c721b8a319dcc.1709631413.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: kvm@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky allocated_length is always the page size multiplied by the number of pages, so let's change the functions to accept the number of pages directly. This opens an avenue to combine the receive and send paths and improves code readability. Signed-off-by: Leon Romanovsky --- drivers/vfio/pci/mlx5/cmd.c | 31 ++++++++--------- drivers/vfio/pci/mlx5/cmd.h | 10 +++--- drivers/vfio/pci/mlx5/main.c | 65 +++++++++++++++++++++++------------- 3 files changed, 62 insertions(+), 44 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index efd1d252cdc9..45104e47b7b2 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -305,8 +305,7 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, struct mlx5_vhca_recv_buf *recv_buf, u32 *mkey) { - size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) : - recv_buf->npages; + size_t npages = buf ? 
buf->npages : recv_buf->npages; int err = 0, inlen; __be64 *mtt; void *mkc; @@ -362,7 +361,7 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) if (mvdev->mdev_detach) return -ENOTCONN; - if (buf->dmaed || !buf->allocated_length) + if (buf->dmaed || !buf->npages) return -EINVAL; ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); @@ -403,8 +402,7 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) } struct mlx5_vhca_data_buffer * -mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, - size_t length, +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, enum dma_data_direction dma_dir) { struct mlx5_vhca_data_buffer *buf; @@ -416,9 +414,8 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, buf->dma_dir = dma_dir; buf->migf = migf; - if (length) { - ret = mlx5vf_add_migration_pages(buf, - DIV_ROUND_UP_ULL(length, PAGE_SIZE)); + if (npages) { + ret = mlx5vf_add_migration_pages(buf, npages); if (ret) goto end; @@ -444,8 +441,8 @@ void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) } struct mlx5_vhca_data_buffer * -mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, - size_t length, enum dma_data_direction dma_dir) +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, + enum dma_data_direction dma_dir) { struct mlx5_vhca_data_buffer *buf, *temp_buf; struct list_head free_list; @@ -460,7 +457,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { if (buf->dma_dir == dma_dir) { list_del_init(&buf->buf_elm); - if (buf->allocated_length >= length) { + if (buf->npages >= npages) { spin_unlock_irq(&migf->list_lock); goto found; } @@ -474,7 +471,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, } } spin_unlock_irq(&migf->list_lock); - buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir); + buf = mlx5vf_alloc_data_buffer(migf, npages, dma_dir); 
found: while ((temp_buf = list_first_entry_or_null(&free_list, @@ -645,7 +642,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(save_vhca_state_in, in, op_mod, 0); MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); - MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); + MLX5_SET(save_vhca_state_in, in, size, buf->npages * PAGE_SIZE); MLX5_SET(save_vhca_state_in, in, incremental, inc); MLX5_SET(save_vhca_state_in, in, set_track, track); @@ -668,8 +665,12 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, } if (!header_buf) { - header_buf = mlx5vf_get_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); + u32 npages = DIV_ROUND_UP( + sizeof(struct mlx5_vf_migration_header), + PAGE_SIZE); + + header_buf = + mlx5vf_get_data_buffer(migf, npages, DMA_NONE); if (IS_ERR(header_buf)) { err = PTR_ERR(header_buf); goto err_free; diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index f2c7227fa683..887267ebbd8a 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -60,7 +60,7 @@ struct mlx5_vhca_data_buffer { struct sg_append_table table; loff_t start_pos; u64 length; - u64 allocated_length; + u32 npages; u32 mkey; enum dma_data_direction dma_dir; u8 dmaed:1; @@ -219,12 +219,12 @@ int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf); void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf); void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf); struct mlx5_vhca_data_buffer * -mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, - size_t length, enum dma_data_direction dma_dir); +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, + enum dma_data_direction dma_dir); void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf); struct mlx5_vhca_data_buffer * -mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, - size_t 
length, enum dma_data_direction dma_dir); +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, + enum dma_data_direction dma_dir); void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, unsigned int npages); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index fe09a8c8af95..b11b1c27d284 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -94,7 +94,7 @@ int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, if (ret) goto err; - buf->allocated_length += filled * PAGE_SIZE; + buf->npages += filled; /* clean input for another bulk allocation */ memset(page_list, 0, filled * sizeof(*page_list)); to_fill = min_t(unsigned int, to_alloc, @@ -352,6 +352,7 @@ static struct mlx5_vhca_data_buffer * mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf, u8 index, size_t required_length) { + u32 npages = DIV_ROUND_UP(required_length, PAGE_SIZE); struct mlx5_vhca_data_buffer *buf = migf->buf[index]; u8 chunk_num; @@ -359,12 +360,11 @@ mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf, chunk_num = buf->stop_copy_chunk_num; buf->migf->buf[index] = NULL; /* Checking whether the pre-allocated buffer can fit */ - if (buf->allocated_length >= required_length) + if (buf->npages >= npages) return buf; mlx5vf_put_data_buffer(buf); - buf = mlx5vf_get_data_buffer(buf->migf, required_length, - DMA_FROM_DEVICE); + buf = mlx5vf_get_data_buffer(buf->migf, npages, DMA_FROM_DEVICE); if (IS_ERR(buf)) return buf; @@ -417,7 +417,9 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf, u8 *to_buff; int ret; - header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE); + BUILD_BUG_ON(size > PAGE_SIZE); + header_buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(size, PAGE_SIZE), + DMA_NONE); if (IS_ERR(header_buf)) return PTR_ERR(header_buf); @@ -432,7 +434,7 @@ static int 
mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf, to_buff = kmap_local_page(page); memcpy(to_buff, &header, sizeof(header)); header_buf->length = sizeof(header); - data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length); + data.stop_copy_size = cpu_to_le64(migf->buf[0]->npages * PAGE_SIZE); memcpy(to_buff + sizeof(header), &data, sizeof(data)); header_buf->length += sizeof(data); kunmap_local(to_buff); @@ -481,15 +483,22 @@ static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev, num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1; for (i = 0; i < num_chunks; i++) { - buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE); + buf = mlx5vf_get_data_buffer( + migf, DIV_ROUND_UP(inc_state_size, PAGE_SIZE), + DMA_FROM_DEVICE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto err; } + BUILD_BUG_ON(sizeof(struct mlx5_vf_migration_header) > + PAGE_SIZE); migf->buf[i] = buf; - buf = mlx5vf_get_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); + buf = mlx5vf_get_data_buffer( + migf, + DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header), + PAGE_SIZE), + DMA_NONE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto err; @@ -597,7 +606,8 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, * We finished transferring the current state and the device has a * dirty state, save a new state to be ready for. 
*/ - buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE); + buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE), + DMA_FROM_DEVICE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); mlx5vf_mark_err(migf); @@ -718,8 +728,8 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) if (track) { /* leave the allocated buffer ready for the stop-copy phase */ - buf = mlx5vf_alloc_data_buffer(migf, - migf->buf[0]->allocated_length, DMA_FROM_DEVICE); + buf = mlx5vf_alloc_data_buffer(migf, migf->buf[0]->npages, + DMA_FROM_DEVICE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_pd; @@ -783,16 +793,15 @@ mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf, const char __user **buf, size_t *len, loff_t *pos, ssize_t *done) { + u32 npages = DIV_ROUND_UP(requested_length, PAGE_SIZE); int ret; if (requested_length > MAX_LOAD_SIZE) return -ENOMEM; - if (vhca_buf->allocated_length < requested_length) { - ret = mlx5vf_add_migration_pages( - vhca_buf, - DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, - PAGE_SIZE)); + if (vhca_buf->npages < npages) { + ret = mlx5vf_add_migration_pages(vhca_buf, + npages - vhca_buf->npages); if (ret) return ret; } @@ -992,11 +1001,14 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, goto out_unlock; break; case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA: - if (vhca_buf_header->allocated_length < migf->record_size) { + { + u32 npages = DIV_ROUND_UP(migf->record_size, PAGE_SIZE); + + if (vhca_buf_header->npages < npages) { mlx5vf_free_data_buffer(vhca_buf_header); - migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf, - migf->record_size, DMA_NONE); + migf->buf_header[0] = mlx5vf_alloc_data_buffer( + migf, npages, DMA_NONE); if (IS_ERR(migf->buf_header[0])) { ret = PTR_ERR(migf->buf_header[0]); migf->buf_header[0] = NULL; @@ -1009,6 +1021,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, vhca_buf_header->start_pos = 
migf->max_pos; migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA; break; + } case MLX5_VF_LOAD_STATE_READ_HEADER_DATA: ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header, &buf, &len, pos, &done); @@ -1019,12 +1032,13 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, { u64 size = max(migf->record_size, migf->stop_copy_prep_size); + u32 npages = DIV_ROUND_UP(size, PAGE_SIZE); - if (vhca_buf->allocated_length < size) { + if (vhca_buf->npages < npages) { mlx5vf_free_data_buffer(vhca_buf); migf->buf[0] = mlx5vf_alloc_data_buffer(migf, - size, DMA_TO_DEVICE); + npages, DMA_TO_DEVICE); if (IS_ERR(migf->buf[0])) { ret = PTR_ERR(migf->buf[0]); migf->buf[0] = NULL; @@ -1115,8 +1129,11 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) migf->buf[0] = buf; if (MLX5VF_PRE_COPY_SUPP(mvdev)) { - buf = mlx5vf_alloc_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); + buf = mlx5vf_alloc_data_buffer( + migf, + DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header), + PAGE_SIZE), + DMA_NONE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_buf; From patchwork Tue Mar 5 10:15:22 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13581965 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0D16255E47; Tue, 5 Mar 2024 10:16:24 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633784; cv=none; b=tMV60WY+mhb2u3rAaoDnI4oV0gt2fkct3o/XDGhcTksKqLW75Vnd3gaaB3dYc+Opn9kMxqftLHQ/E/OKHKkMnYGXJ1uEjeZHgZXVkYwPdEP+8EJ7EZgPJJbGTBWElgdOqluh5Knz/JpiRK/t9jGqd3YdFkT+CotSN3oWuyWhWpc= ARC-Message-Signature: 
i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633784; c=relaxed/simple; bh=ktX9dWW2cFJcgdIR5D6ZbOLB18GCIDFrg21/P33AIJM=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=icl/eJw08rMQRVzVt7SstwqNsHDA4iMK1Pmnz3IqInMwxGSNVrokCawtm0iW5xaBJZ4J8c5u2YhEWo/c7fH4HlACnv7n19pjfHRW2lfFPI0tajasWEtUCoWHPEkhm21UscUGLJHgxGQ0av7YDmWV+SUYDj7g2HoFLd/cDbotM6Y= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=qXJ4kuzE; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="qXJ4kuzE" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 2CCB1C433C7; Tue, 5 Mar 2024 10:16:23 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709633783; bh=ktX9dWW2cFJcgdIR5D6ZbOLB18GCIDFrg21/P33AIJM=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=qXJ4kuzEHOpNp6UZHPKJ9BBgqb6aOqWMZjNX8Mr/NyeYNLbWIluFUQs/il5Qm/nbz rKyGKW497bjfduFoZd0wr9iokEFIosoXqEoSp9FYHE5Xp8JwBP0eMkdQRVZkL+6Uww ryPQG/188kpX4HM5ZM0WopQQkatTizmCi7p5dHYxsiDfGbVRvELF0KDSVp5gVbevRn iuQfGnjX1e6Gup0T5r/Y3LWU+2RN3KFhzCkDeTVBPZBoB5S341ai7MgzL+1qV7wvO/ N+O9x3owVU80s6i+zBYKUlhCTukDtgGYcYE4+1ebdg9cXWrQxpzmVk5+1Ra5IET5cB /WlNvWjN00B8g== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , 
"josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC 12/16] vfio/mlx5: Rewrite create mkey flow to allow better code reuse Date: Tue, 5 Mar 2024 12:15:22 +0200 Message-ID: <9366169430357d953e961cd41ae912c5fbd3f568.1709631413.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: kvm@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Change the creation of the mkey to be performed in multiple steps: data allocation, DMA setup and the actual call to HW to create that mkey. In this new flow, the whole input to the MKEY command is saved, which eliminates the need to keep an array of pointers for DMA addresses for the receive list and, in future patches, for the send list too. In addition to reducing memory size and eliminating unnecessary data movements when setting the MKEY input, the code is prepared for future reuse. Signed-off-by: Leon Romanovsky --- drivers/vfio/pci/mlx5/cmd.c | 149 +++++++++++++++++++++--------------- drivers/vfio/pci/mlx5/cmd.h | 3 +- 2 files changed, 88 insertions(+), 64 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 45104e47b7b2..44762980fcb9 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -300,39 +300,21 @@ static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id, return ret; } -static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, - struct mlx5_vhca_data_buffer *buf, - struct mlx5_vhca_recv_buf *recv_buf, - u32 *mkey) +static u32 *alloc_mkey_in(u32 npages, u32 pdn) { - size_t npages = buf ? 
buf->npages : recv_buf->npages; - int err = 0, inlen; - __be64 *mtt; + int inlen; void *mkc; u32 *in; inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + - sizeof(*mtt) * round_up(npages, 2); + sizeof(__be64) * round_up(npages, 2); - in = kvzalloc(inlen, GFP_KERNEL); + in = kvzalloc(inlen, GFP_KERNEL_ACCOUNT); if (!in) - return -ENOMEM; + return NULL; MLX5_SET(create_mkey_in, in, translations_octword_actual_size, DIV_ROUND_UP(npages, 2)); - mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - - if (buf) { - struct sg_dma_page_iter dma_iter; - - for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) - *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); - } else { - int i; - - for (i = 0; i < npages; i++) - *mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]); - } mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); @@ -346,9 +328,30 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2)); MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE); - err = mlx5_core_create_mkey(mdev, mkey, in, inlen); - kvfree(in); - return err; + + return in; +} + +static int create_mkey(struct mlx5_core_dev *mdev, u32 npages, + struct mlx5_vhca_data_buffer *buf, u32 *mkey_in, + u32 *mkey) +{ + __be64 *mtt; + int inlen; + + mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt); + + if (buf) { + struct sg_dma_page_iter dma_iter; + + for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) + *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); + } + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + + sizeof(__be64) * round_up(npages, 2); + + return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen); } static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) @@ -368,13 +371,22 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) if (ret) return ret; - ret 
= _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey); - if (ret) + buf->mkey_in = alloc_mkey_in(buf->npages, buf->migf->pdn); + if (!buf->mkey_in) { + ret = -ENOMEM; goto err; + } + + ret = create_mkey(mdev, buf->npages, buf, buf->mkey_in, &buf->mkey); + if (ret) + goto err_create_mkey; buf->dmaed = true; return 0; + +err_create_mkey: + kvfree(buf->mkey_in); err: dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); return ret; @@ -390,6 +402,7 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) if (buf->dmaed) { mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey); + kvfree(buf->mkey_in); dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt, buf->dma_dir, 0); } @@ -1286,46 +1299,45 @@ static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, return -ENOMEM; } -static int register_dma_recv_pages(struct mlx5_core_dev *mdev, - struct mlx5_vhca_recv_buf *recv_buf) +static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages, + u32 *mkey_in) { - int i, j; + dma_addr_t addr; + __be64 *mtt; + int i; - recv_buf->dma_addrs = kvcalloc(recv_buf->npages, - sizeof(*recv_buf->dma_addrs), - GFP_KERNEL_ACCOUNT); - if (!recv_buf->dma_addrs) - return -ENOMEM; + mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt); - for (i = 0; i < recv_buf->npages; i++) { - recv_buf->dma_addrs[i] = dma_map_page(mdev->device, - recv_buf->page_list[i], - 0, PAGE_SIZE, - DMA_FROM_DEVICE); - if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i])) - goto error; + for (i = npages - 1; i >= 0; i--) { + addr = be64_to_cpu(mtt[i]); + dma_unmap_single(mdev->device, addr, PAGE_SIZE, + DMA_FROM_DEVICE); } - return 0; - -error: - for (j = 0; j < i; j++) - dma_unmap_single(mdev->device, recv_buf->dma_addrs[j], - PAGE_SIZE, DMA_FROM_DEVICE); - - kvfree(recv_buf->dma_addrs); - return -ENOMEM; } -static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev, - struct mlx5_vhca_recv_buf *recv_buf) +static int 
register_dma_pages(struct mlx5_core_dev *mdev, u32 npages, + struct page **page_list, u32 *mkey_in) { + dma_addr_t addr; + __be64 *mtt; int i; - for (i = 0; i < recv_buf->npages; i++) - dma_unmap_single(mdev->device, recv_buf->dma_addrs[i], - PAGE_SIZE, DMA_FROM_DEVICE); + mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt); + + for (i = 0; i < npages; i++) { + addr = dma_map_page(mdev->device, page_list[i], 0, PAGE_SIZE, + DMA_FROM_DEVICE); + if (dma_mapping_error(mdev->device, addr)) + goto error; + + *mtt++ = cpu_to_be64(addr); + } + + return 0; - kvfree(recv_buf->dma_addrs); +error: + unregister_dma_pages(mdev, i, mkey_in); + return -ENOMEM; } static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev, @@ -1334,7 +1346,8 @@ static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev, struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; mlx5_core_destroy_mkey(mdev, recv_buf->mkey); - unregister_dma_recv_pages(mdev, recv_buf); + unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in); + kvfree(recv_buf->mkey_in); free_recv_pages(&qp->recv_buf); } @@ -1350,18 +1363,28 @@ static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev, if (err < 0) return err; - err = register_dma_recv_pages(mdev, recv_buf); - if (err) + recv_buf->mkey_in = alloc_mkey_in(npages, pdn); + if (!recv_buf->mkey_in) { + err = -ENOMEM; goto end; + } + + err = register_dma_pages(mdev, npages, recv_buf->page_list, + recv_buf->mkey_in); + if (err) + goto err_register_dma; - err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey); + err = create_mkey(mdev, npages, NULL, recv_buf->mkey_in, + &recv_buf->mkey); if (err) goto err_create_mkey; return 0; err_create_mkey: - unregister_dma_recv_pages(mdev, recv_buf); + unregister_dma_pages(mdev, npages, recv_buf->mkey_in); +err_register_dma: + kvfree(recv_buf->mkey_in); end: free_recv_pages(recv_buf); return err; diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 
887267ebbd8a..83728c0669e7 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -62,6 +62,7 @@ struct mlx5_vhca_data_buffer { u64 length; u32 npages; u32 mkey; + u32 *mkey_in; enum dma_data_direction dma_dir; u8 dmaed:1; u8 stop_copy_chunk_num; @@ -137,8 +138,8 @@ struct mlx5_vhca_cq { struct mlx5_vhca_recv_buf { u32 npages; struct page **page_list; - dma_addr_t *dma_addrs; u32 next_rq_offset; + u32 *mkey_in; u32 mkey; }; From patchwork Tue Mar 5 10:15:23 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13581966 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id C9C1E612CE; Tue, 5 Mar 2024 10:16:27 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633787; cv=none; b=lcMmmw4AwKBCVhINWy8mrqL0Xq4b23A985fVzfC2Y1HegCcY9R4o9Su77ejkPh+e7LQJLu9T2H+G0ou4+Kqy+ZwZB8erm8q4WXRZoy4B6aa9h28LLBUszG8aeTL2wXsv4a9mcqyafgnfChZ6EI1eWB3BCdXx21HGAH4Rl4qtMO4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633787; c=relaxed/simple; bh=WBfM8ZvY1JfgPv87VMT5XwFqFK1pOKYXqtZNVa4/Kmg=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=kBGMRCC1SvF6QFlNSKno6GlYTnmoJlKsjihuL0L98u71Ua4Bo3K3dBF+yI1FP9eBkEWuRelXdHwlULAeBkCiTawiKLoQEXHsGj4ZAftExSEUSR0eZJnXvJA+K5Nymkefd+HGBJA23sdYu9IN3cJt7Dy4IVk7rrUhpJrENWW6eSo= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=IkRk/fb0; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org 
header.i=@kernel.org header.b="IkRk/fb0" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 03941C43394; Tue, 5 Mar 2024 10:16:27 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709633787; bh=WBfM8ZvY1JfgPv87VMT5XwFqFK1pOKYXqtZNVa4/Kmg=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=IkRk/fb09cNpaq1ao/YIjVQRUd7549ASV5vUrE0loQ3I40W0g4lHqMa2RnlPOC+SL 7O47LLrDs24FE154PCpzD9gxTzqQT73wARI+zqa5pCclRYy+mfUAb3VXN5E67rLGEq 3FJ29W4o1dvJg1oBPDj9nWxWk+GDyXcjXC0TwrN0jPCHmoAcRi2VmERQ0yOXxC4y76 SQTUfIFSI98tR0AmIVTshw3fDy/384A966eKFUmESBoo+LVwjOUtuC+rAvhyqDK1EM ycf5A5dNTFKwMwlUDywAbqNnUHwzdQvu0BxoWJduZUbSgbO83ABzuBc7P68A9/gK4g OYyahJ7w7P7Rw== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC 13/16] vfio/mlx5: Explicitly store page list Date: Tue, 5 Mar 2024 12:15:23 +0200 Message-ID: <1d0ca7408af6e5f0bb09baffd021bc72287e5ed8.1709631413.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: kvm@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky In preparation for removing the scatter-gather table and unifying the receive and send lists, explicitly store the page list. 
Signed-off-by: Leon Romanovsky --- drivers/vfio/pci/mlx5/cmd.c | 1 + drivers/vfio/pci/mlx5/cmd.h | 1 + drivers/vfio/pci/mlx5/main.c | 35 +++++++++++++++++------------------ 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 44762980fcb9..5e2103042d9b 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -411,6 +411,7 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) __free_page(sg_page_iter_page(&sg_iter)); sg_free_append_table(&buf->table); + kvfree(buf->page_list); kfree(buf); } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 83728c0669e7..815fcb54494d 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -57,6 +57,7 @@ struct mlx5_vf_migration_header { }; struct mlx5_vhca_data_buffer { + struct page **page_list; struct sg_append_table table; loff_t start_pos; u64 length; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index b11b1c27d284..7ffe24693a55 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -69,44 +69,43 @@ int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, unsigned int npages) { unsigned int to_alloc = npages; + size_t old_size, new_size; struct page **page_list; unsigned long filled; unsigned int to_fill; int ret; - to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); - page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT); + to_fill = min_t(unsigned int, npages, + PAGE_SIZE / sizeof(*buf->page_list)); + old_size = buf->npages * sizeof(*buf->page_list); + new_size = old_size + to_fill * sizeof(*buf->page_list); + page_list = kvrealloc(buf->page_list, old_size, new_size, + GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!page_list) return -ENOMEM; + buf->page_list = page_list; + do { filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, 
to_fill, - page_list); - if (!filled) { - ret = -ENOMEM; - goto err; - } + buf->page_list + buf->npages); + if (!filled) + return -ENOMEM; + to_alloc -= filled; ret = sg_alloc_append_table_from_pages( - &buf->table, page_list, filled, 0, + &buf->table, buf->page_list + buf->npages, filled, 0, filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, GFP_KERNEL_ACCOUNT); - if (ret) - goto err; + return ret; + buf->npages += filled; - /* clean input for another bulk allocation */ - memset(page_list, 0, filled * sizeof(*page_list)); to_fill = min_t(unsigned int, to_alloc, - PAGE_SIZE / sizeof(*page_list)); + PAGE_SIZE / sizeof(*buf->page_list)); } while (to_alloc > 0); - kvfree(page_list); return 0; - -err: - kvfree(page_list); - return ret; } static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) From patchwork Tue Mar 5 10:15:24 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13581967 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id A645C63106; Tue, 5 Mar 2024 10:16:32 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633792; cv=none; b=jg59iN0F5nu4sOloPdWAahtzrdN5q39Fm2P41l5Sn9umbuN/nIsT6X51khXUf6KTGFAKeJt7uk8WaIl+pd0FpRDkQXWk2draPzI+AW100Vbe6GA/C672RGWmz2NtHdvS1kHKt7/jc3w4r5pbtqLn+QYBytp2feQmBpKBBSfX3AE= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633792; c=relaxed/simple; bh=4IocgUHLj9eV065W5P3anYxI4nhTW2naxln23AtMtis=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=aQYEvOfgSg2qvxEVU2GMoZzwNhVW40tuZcwl+m24404d+1PLRweTzGd8nwfLLcHJytESuc56ilTl77uSkQyVEewaYpsBlseMuRKet1h08GAXK6aaik7t///BWkA+vz8PGUbq15iFnYylp120wE+Ft+ozbbGZnku96sz9AHcDhoI= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=ME6sTK28; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="ME6sTK28" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 3995DC433C7; Tue, 5 Mar 2024 10:16:31 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709633792; bh=4IocgUHLj9eV065W5P3anYxI4nhTW2naxln23AtMtis=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=ME6sTK286uwoXoz+IpcdDC5iiCveCz60c3QC6e2Qoo6RT1vn1QMiK79xbl2Z0iJEQ lF+WIuufy46vobE2X5BkxN4zFsuEFfqCY7nNLgqtqaGEzXhdvxV79SceqYL9IBcm1m TvpaFHgFXCBfyApy9K0LVj6Gp960SddtGc3bQwTBI9C4yeb0N89NQ9nYnZIjPV3lud HWI1R9l5KpNKb/eOURGHv33QqpDtRBoDV6zIP7rZo4UIRb0nFH+r1oENLRXAryoJBH l4+GOIMyR7QDazy9mrVtajxBlb0ezE2lg1MG7ckSp0k65UFJTLImRNCeq2jbANxI8Z 7hpa9NbeUdkHw== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. 
Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC 14/16] vfio/mlx5: Convert vfio to use DMA link API Date: Tue, 5 Mar 2024 12:15:24 +0200 Message-ID: X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: kvm@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Remove the intermediate scatter-gather table, as it is not needed when the DMA link API is used. This conversion drastically reduces the memory used to manage that table. Signed-off-by: Leon Romanovsky --- drivers/vfio/pci/mlx5/cmd.c | 177 ++++++++++++++++------------------- drivers/vfio/pci/mlx5/cmd.h | 8 +- drivers/vfio/pci/mlx5/main.c | 50 ++-------- 3 files changed, 91 insertions(+), 144 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 5e2103042d9b..cfae03f7b7da 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -332,26 +332,60 @@ static u32 *alloc_mkey_in(u32 npages, u32 pdn) return in; } -static int create_mkey(struct mlx5_core_dev *mdev, u32 npages, - struct mlx5_vhca_data_buffer *buf, u32 *mkey_in, +static int create_mkey(struct mlx5_core_dev *mdev, u32 npages, u32 *mkey_in, u32 *mkey) { + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + + sizeof(__be64) * round_up(npages, 2); + + return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen); +} + +static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages, + u32 *mkey_in, struct dma_iova_attrs *iova) +{ + dma_addr_t addr; __be64 *mtt; - int inlen; + int i; mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt); - if (buf) { - struct sg_dma_page_iter dma_iter; + for (i = npages - 1; i >= 0; i--) { + addr = be64_to_cpu(mtt[i]); + dma_unlink_range(iova, addr); + } + dma_free_iova(iova); +} + +static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages, + struct page **page_list, u32 *mkey_in, + struct dma_iova_attrs *iova) +{ + 
dma_addr_t addr; + 
__be64 *mtt; + int i, err; + + iova->dev = mdev->device; + iova->size = npages * PAGE_SIZE; + err = dma_alloc_iova(iova); + if (err) + return err; + + mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt); + + for (i = 0; i < npages; i++) { + addr = dma_link_range(page_list[i], 0, iova, i * PAGE_SIZE); + if (dma_mapping_error(mdev->device, addr)) + goto error; - for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) - *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); + *mtt++ = cpu_to_be64(addr); } - inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + - sizeof(__be64) * round_up(npages, 2); + return 0; - return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen); +error: + unregister_dma_pages(mdev, i, mkey_in, iova); + return -ENOMEM; } static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) @@ -367,17 +401,16 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) if (buf->dmaed || !buf->npages) return -EINVAL; - ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); - if (ret) - return ret; - buf->mkey_in = alloc_mkey_in(buf->npages, buf->migf->pdn); - if (!buf->mkey_in) { - ret = -ENOMEM; - goto err; - } + if (!buf->mkey_in) + return -ENOMEM; + + ret = register_dma_pages(mdev, buf->npages, buf->page_list, + buf->mkey_in, &buf->iova); + if (ret) + goto err_register_dma; - ret = create_mkey(mdev, buf->npages, buf, buf->mkey_in, &buf->mkey); + ret = create_mkey(mdev, buf->npages, buf->mkey_in, &buf->mkey); if (ret) goto err_create_mkey; @@ -386,32 +419,39 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) return 0; err_create_mkey: + unregister_dma_pages(mdev, buf->npages, buf->mkey_in, &buf->iova); +err_register_dma: kvfree(buf->mkey_in); -err: - dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); return ret; } +static void free_page_list(u32 npages, struct page **page_list) +{ + int i; + + /* Undo alloc_pages_bulk_array() */ + for (i = npages - 1; i >= 0; i--) + 
__free_page(page_list[i]); + + kvfree(page_list); +} + void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) { - struct mlx5_vf_migration_file *migf = buf->migf; - struct sg_page_iter sg_iter; + struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev; + struct mlx5_core_dev *mdev = mvdev->mdev; - lockdep_assert_held(&migf->mvdev->state_mutex); - WARN_ON(migf->mvdev->mdev_detach); + lockdep_assert_held(&mvdev->state_mutex); + WARN_ON(mvdev->mdev_detach); if (buf->dmaed) { - mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey); + mlx5_core_destroy_mkey(mdev, buf->mkey); + unregister_dma_pages(mdev, buf->npages, buf->mkey_in, + &buf->iova); kvfree(buf->mkey_in); - dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt, - buf->dma_dir, 0); } - /* Undo alloc_pages_bulk_array() */ - for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) - __free_page(sg_page_iter_page(&sg_iter)); - sg_free_append_table(&buf->table); - kvfree(buf->page_list); + free_page_list(buf->npages, buf->page_list); kfree(buf); } @@ -426,7 +466,7 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, if (!buf) return ERR_PTR(-ENOMEM); - buf->dma_dir = dma_dir; + buf->iova.dir = dma_dir; buf->migf = migf; if (npages) { ret = mlx5vf_add_migration_pages(buf, npages); @@ -469,7 +509,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, spin_lock_irq(&migf->list_lock); list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { - if (buf->dma_dir == dma_dir) { + if (buf->iova.dir == dma_dir) { list_del_init(&buf->buf_elm); if (buf->npages >= npages) { spin_unlock_irq(&migf->list_lock); @@ -1253,17 +1293,6 @@ static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev, kfree(qp); } -static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf) -{ - int i; - - /* Undo alloc_pages_bulk_array() */ - for (i = 0; i < recv_buf->npages; i++) - __free_page(recv_buf->page_list[i]); - - kvfree(recv_buf->page_list); -} - static int 
alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, unsigned int npages) { @@ -1300,56 +1329,16 @@ static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, return -ENOMEM; } -static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages, - u32 *mkey_in) -{ - dma_addr_t addr; - __be64 *mtt; - int i; - - mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt); - - for (i = npages - 1; i >= 0; i--) { - addr = be64_to_cpu(mtt[i]); - dma_unmap_single(mdev->device, addr, PAGE_SIZE, - DMA_FROM_DEVICE); - } -} - -static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages, - struct page **page_list, u32 *mkey_in) -{ - dma_addr_t addr; - __be64 *mtt; - int i; - - mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt); - - for (i = 0; i < npages; i++) { - addr = dma_map_page(mdev->device, page_list[i], 0, PAGE_SIZE, - DMA_FROM_DEVICE); - if (dma_mapping_error(mdev->device, addr)) - goto error; - - *mtt++ = cpu_to_be64(addr); - } - - return 0; - -error: - unregister_dma_pages(mdev, i, mkey_in); - return -ENOMEM; -} - static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev, struct mlx5_vhca_qp *qp) { struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; mlx5_core_destroy_mkey(mdev, recv_buf->mkey); - unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in); + unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in, + &recv_buf->iova); kvfree(recv_buf->mkey_in); - free_recv_pages(&qp->recv_buf); + free_page_list(recv_buf->npages, recv_buf->page_list); } static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev, @@ -1370,24 +1359,24 @@ static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev, goto end; } + recv_buf->iova.dir = DMA_FROM_DEVICE; err = register_dma_pages(mdev, npages, recv_buf->page_list, - recv_buf->mkey_in); + recv_buf->mkey_in, &recv_buf->iova); if (err) goto err_register_dma; - err = create_mkey(mdev, npages, NULL, recv_buf->mkey_in, - &recv_buf->mkey); + err = 
create_mkey(mdev, npages, recv_buf->mkey_in, &recv_buf->mkey); if (err) goto err_create_mkey; return 0; err_create_mkey: - unregister_dma_pages(mdev, npages, recv_buf->mkey_in); + unregister_dma_pages(mdev, npages, recv_buf->mkey_in, &recv_buf->iova); err_register_dma: kvfree(recv_buf->mkey_in); end: - free_recv_pages(recv_buf); + free_page_list(npages, recv_buf->page_list); return err; } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 815fcb54494d..3a046166d9f2 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -57,22 +57,17 @@ struct mlx5_vf_migration_header { }; struct mlx5_vhca_data_buffer { + struct dma_iova_attrs iova; struct page **page_list; - struct sg_append_table table; loff_t start_pos; u64 length; u32 npages; u32 mkey; u32 *mkey_in; - enum dma_data_direction dma_dir; u8 dmaed:1; u8 stop_copy_chunk_num; struct list_head buf_elm; struct mlx5_vf_migration_file *migf; - /* Optimize mlx5vf_get_migration_page() for sequential access */ - struct scatterlist *last_offset_sg; - unsigned int sg_last_entry; - unsigned long last_offset; }; struct mlx5vf_async_data { @@ -137,6 +132,7 @@ struct mlx5_vhca_cq { }; struct mlx5_vhca_recv_buf { + struct dma_iova_attrs iova; u32 npages; struct page **page_list; u32 next_rq_offset; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 7ffe24693a55..668c28bc429c 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -34,35 +34,10 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) core_device); } -struct page * -mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, - unsigned long offset) +struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, + unsigned long offset) { - unsigned long cur_offset = 0; - struct scatterlist *sg; - unsigned int i; - - /* All accesses are sequential */ - if (offset < buf->last_offset || !buf->last_offset_sg) { - buf->last_offset = 0; - 
buf->last_offset_sg = buf->table.sgt.sgl; - buf->sg_last_entry = 0; - } - - cur_offset = buf->last_offset; - - for_each_sg(buf->last_offset_sg, sg, - buf->table.sgt.orig_nents - buf->sg_last_entry, i) { - if (offset < sg->length + cur_offset) { - buf->last_offset_sg = sg; - buf->sg_last_entry += i; - buf->last_offset = cur_offset; - return nth_page(sg_page(sg), - (offset - cur_offset) / PAGE_SIZE); - } - cur_offset += sg->length; - } - return NULL; + return buf->page_list[offset / PAGE_SIZE]; } int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, @@ -72,13 +47,9 @@ int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, size_t old_size, new_size; struct page **page_list; unsigned long filled; - unsigned int to_fill; - int ret; - to_fill = min_t(unsigned int, npages, - PAGE_SIZE / sizeof(*buf->page_list)); old_size = buf->npages * sizeof(*buf->page_list); - new_size = old_size + to_fill * sizeof(*buf->page_list); + new_size = old_size + to_alloc * sizeof(*buf->page_list); page_list = kvrealloc(buf->page_list, old_size, new_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!page_list) @@ -87,22 +58,13 @@ int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, buf->page_list = page_list; do { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, + filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_alloc, buf->page_list + buf->npages); if (!filled) return -ENOMEM; to_alloc -= filled; - ret = sg_alloc_append_table_from_pages( - &buf->table, buf->page_list + buf->npages, filled, 0, - filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, - GFP_KERNEL_ACCOUNT); - if (ret) - return ret; - buf->npages += filled; - to_fill = min_t(unsigned int, to_alloc, - PAGE_SIZE / sizeof(*buf->page_list)); } while (to_alloc > 0); return 0; @@ -164,7 +126,7 @@ static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf) struct mlx5_vf_migration_file *migf = vhca_buf->migf; if (vhca_buf->stop_copy_chunk_num) { - bool is_header = 
vhca_buf->dma_dir == DMA_NONE; + bool is_header = vhca_buf->iova.dir == DMA_NONE; u8 chunk_num = vhca_buf->stop_copy_chunk_num; size_t next_required_umem_size = 0; From patchwork Tue Mar 5 10:15:25 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13581968 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id EC89D56755; Tue, 5 Mar 2024 10:16:35 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633796; cv=none; b=qxdSSQe/FK6N54ni4h38DDdVJM8iyct6rIq1vgJTb5CuFzcJZQ8mGF3CCs8AH6crc7pvBT1TjEkL+kdtKvpFWlOWGsll007fCuMTFg2yldAxcTurZXSFnMFLpe8b4oDVCtLwsEkaoKzSDFzMfEXSlmlfno8cqskWpHRj6h4ocWA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633796; c=relaxed/simple; bh=LoE4Mk2h+x4QygNOYAaQkErlcP1J9OkI1WKvPNtdRq4=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=rV+YQS5O9L4k7WGttCxaa2KYZ5WDE8bgxPl2KdNHur7aY8c08Z0AKq51k4IIvGNQcaa2BRurtIoxHEWpYBG3/Fx6oD2tWc1mBX2FjjymxNflECxBHeTLSZMXPP24Aq1bj1Y+07WH8ihCgMTktBqTJbdycUsQ42HbqlaHf8sFB0o= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=WNEwTJ5c; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="WNEwTJ5c" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 2D045C43394; Tue, 5 Mar 2024 10:16:35 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709633795; bh=LoE4Mk2h+x4QygNOYAaQkErlcP1J9OkI1WKvPNtdRq4=; 
h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=WNEwTJ5cMN9ELwzZS16ppqt8z3E74GD7UClDOZ+K258awlhlHWmthDNESMHRqcqrd FB5rGUzT1HNrfY0k9YRNPH7tpd6eRrBCHBFvW2HuHKRpDRIcieGpESqf5qYSZEu7Ot rJawxgEDFjOWV3aypoQz9qlwzRrIaDARCl71nYqsSh9yKTfAjSlP900AU9rH1yZsqq FSiuklZ0TDjrPQoHx0CLw7CspAdlMh8IJ7b9yEWuOD9m26KZiCU7D2cXRMmTCHbBNI pVkHPjju2ehO/3/QWSJNDh9vuYxqDYLnTUKNJldj71/uHBe1w+Vt8uTvJzHh9hKp99 5/3DH4IEH+yWg== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Chaitanya Kulkarni , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Leon Romanovsky , Zhu Yanjun Subject: [RFC 15/16] block: add dma_link_range() based API Date: Tue, 5 Mar 2024 12:15:25 +0200 Message-ID: <1e52aa392b9c434f55203c9d630dd06fcdb75c32.1709631413.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: kvm@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Chaitanya Kulkarni Add two helper functions that are needed to calculate the total DMA length of the request blk_rq_get_dma_length() and to create DMA mapping blk_rq_dma_map(). blk_rq_get_dma_length() is used to get the total length of the request, when driver is allocating IOVA space for this request with the call to dma_alloc_iova(). 
This length is then assigned to iova->size and passed down the IOVA allocation call chain :- dma_map_ops->alloc_iova() iommu_dma_alloc_iova() alloc_iova_fast() iova_rcache_get() OR alloc_iova() blk_rq_dma_map() iterates through the bvec list and creates a DMA mapping for each page using the iova parameter with the help of dma_link_range(). Note that @iova is allocated & pre-initialized using dma_alloc_iova() by the caller. After creating a mapping for each page, it calls the callback function @cb provided by the driver with the mapped DMA address for this page, the offset into the iova space (needed at the time of unlink), the length of the mapped page, and the index of the page that is mapped in this request. The driver is responsible for using this DMA address to complete the mapping of the underlying protocol-specific data structures, such as NVMe PRPs or NVMe SGLs. This callback approach allows us to iterate the bvec list only once to create the bvec to DMA mapping and use that DMA address in the driver to build the protocol-specific data structure, essentially mapping one bvec page at a time to a DMA address and using that DMA address to create the underlying protocol-specific data structures. Finally, blk_rq_dma_map() returns the number of linked entries.
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Leon Romanovsky --- block/blk-merge.c | 156 +++++++++++++++++++++++++++++++++++++++++ include/linux/blk-mq.h | 9 +++ 2 files changed, 165 insertions(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index 2d470cf2173e..63effc8ac1db 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -583,6 +583,162 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(__blk_rq_map_sg); +static dma_addr_t blk_dma_link_page(struct page *page, unsigned int page_offset, + struct dma_iova_attrs *iova, + dma_addr_t dma_offset) +{ + dma_addr_t dma_addr; + int ret; + + dma_addr = dma_link_range(page, page_offset, iova, dma_offset); + ret = dma_mapping_error(iova->dev, dma_addr); + if (ret) { + pr_err("dma_mapping_err %d dma_addr 0x%llx dma_offset %llu\n", + ret, dma_addr, dma_offset); + /* better way ? */ + dma_addr = 0; + } + return dma_addr; +} + +/** + * blk_rq_dma_map: block layer request to DMA mapping helper. + * + * @req : [in] request to be mapped + * @cb : [in] callback to be called for each mapped bvec into the + * underlying driver. + * @cb_data : [in] callback data to be passed, private to the underlying + * driver. + * @iova : [in] iova to be used to create DMA mapping for this request's + * bvecs. + * Description: + * Iterates through bvec list and creates dma mapping between each bvec page + * using @iova with dma_link_range(). Note that @iova needs to be allocated and + * pre-initialized using dma_alloc_iova() by the caller. After creating + * a mapping for each page, call into the callback function @cb provided by + * driver with mapped dma address for this bvec, offset into iova space, length + * of the mapped page, and bvec number that is mapped in this request. Driver is + * responsible for using this dma address to complete the mapping of underlying + * protocol specific data structure, such as NVMe PRPs or NVMe SGLs.
This + * callback approach allows us to iterate bvec list only once to create bvec to + * DMA mapping & use that dma address in the driver to build the protocol + * specific data structure, essentially mapping one bvec page at a time to DMA + * address and use that DMA address to create underlaying protocol specific + * data structure. + * + * Caller needs to ensure @iova is initialized & allovated with using + * dma_alloc_iova(). + */ +int blk_rq_dma_map(struct request *req, driver_map_cb cb, void *cb_data, + struct dma_iova_attrs *iova) +{ + dma_addr_t curr_dma_offset = 0; + dma_addr_t prev_dma_addr = 0; + dma_addr_t dma_addr; + size_t prev_dma_len = 0; + struct req_iterator iter; + struct bio_vec bv; + int linked_cnt = 0; + + rq_for_each_bvec(bv, req, iter) { + if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) { + curr_dma_offset = prev_dma_addr + prev_dma_len; + + dma_addr = blk_dma_link_page(bv.bv_page, bv.bv_offset, + iova, curr_dma_offset); + if (!dma_addr) + break; + + cb(cb_data, linked_cnt, dma_addr, curr_dma_offset, + bv.bv_len); + + prev_dma_len = bv.bv_len; + prev_dma_addr = dma_addr; + linked_cnt++; + } else { + unsigned nbytes = bv.bv_len; + unsigned total = 0; + unsigned offset, len; + + while (nbytes > 0) { + struct page *page = bv.bv_page; + + offset = bv.bv_offset + total; + len = min(get_max_segment_size(&req->q->limits, + page, offset), + nbytes); + + page += (offset >> PAGE_SHIFT); + offset &= ~PAGE_MASK; + + curr_dma_offset = prev_dma_addr + prev_dma_len; + + dma_addr = blk_dma_link_page(page, offset, + iova, + curr_dma_offset); + if (!dma_addr) + break; + + cb(cb_data, linked_cnt, dma_addr, + curr_dma_offset, len); + + total += len; + nbytes -= len; + + prev_dma_len = len; + prev_dma_addr = dma_addr; + linked_cnt++; + } + } + } + return linked_cnt; +} +EXPORT_SYMBOL_GPL(blk_rq_dma_map); + +/* + * Calculate total DMA length needed to satisfy this request. 
+ */ +size_t blk_rq_get_dma_length(struct request *rq) +{ + struct request_queue *q = rq->q; + struct bio *bio = rq->bio; + unsigned int offset, len; + struct bvec_iter iter; + size_t dma_length = 0; + struct bio_vec bvec; + + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + return rq->special_vec.bv_len; + + if (!rq->bio) + return 0; + + for_each_bio(bio) { + bio_for_each_bvec(bvec, bio, iter) { + unsigned int nbytes = bvec.bv_len; + unsigned int total = 0; + + if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE) { + dma_length += bvec.bv_len; + continue; + } + + while (nbytes > 0) { + offset = bvec.bv_offset + total; + len = min(get_max_segment_size(&q->limits, + bvec.bv_page, + offset), nbytes); + total += len; + nbytes -= len; + dma_length += len; + } + } + } + + return dma_length; +} +EXPORT_SYMBOL(blk_rq_get_dma_length); + static inline unsigned int blk_rq_get_max_sectors(struct request *rq, sector_t offset) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7a8150a5f051..80b9c7f2c3a0 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -8,6 +8,7 @@ #include #include #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -1144,7 +1145,15 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, return __blk_rq_map_sg(q, rq, sglist, &last_sg); } + +typedef void (*driver_map_cb)(void *cb_data, u32 cnt, dma_addr_t dma_addr, + dma_addr_t offset, u32 len); + +int blk_rq_dma_map(struct request *req, driver_map_cb cb, void *cb_data, + struct dma_iova_attrs *iova); + void blk_dump_rq_flags(struct request *, char *); +size_t blk_rq_get_dma_length(struct request *rq); #ifdef CONFIG_BLK_DEV_ZONED static inline unsigned int blk_rq_zone_no(struct request *rq) From patchwork Tue Mar 5 10:15:26 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13581969 Received: from smtp.kernel.org 
(aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id D6C4765BCD; Tue, 5 Mar 2024 10:16:39 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633800; cv=none; b=pm/8+RHCkrDJc4m+/Bv8/azxpY1JZoe8O04FgDsRRmSgJuDCAWCEVvtg00Re3graXm6ZgWrhCdaFAJ1CLO4I9+sdZjikhDT5EdSqHGnmVyM8jBBvI2D1301enhNZ3A8+H6WgMtMQ095lSo4PwQtL3ExPD6cV/TvOmos3HE67igM= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709633800; c=relaxed/simple; bh=POBXmxmfWLR3OdaBK59TZEAySmTXPtp10vl7hcXrrqE=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=YlYmsCcekUZjcloG/5j8XnVfQUhCXfqremZB+4bfACXLF4Rj+FdOSToJRO/hiDmd7cPu6wvq3vLCXOcETxR1vNcDMCSX2t7zWkxbqKiuqVHCZXk4KE/KgGuj+IySrBlBjTgL6IVaqVw97lwZfeSuSn23dfbWRs02IJPMH35kytc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=VTFmI4f2; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="VTFmI4f2" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 0BF3AC43609; Tue, 5 Mar 2024 10:16:39 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709633799; bh=POBXmxmfWLR3OdaBK59TZEAySmTXPtp10vl7hcXrrqE=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=VTFmI4f2DZvwdpBpwsbb8kyrGjVIZbcmQOHHwyDR0biw0twpPzx4NpvjUFpX0Gk7H C+sCQE7uq7qTB93P3sS7irPKFFCb4FaZKw01WlmNm+yE4jyYDqLuTxfuHM30FE2Vui Rmm/rt4Jlyczdj/X3Q5fhaAG/XRTQ6z7sGUWipQF7MSYaWU3wWmjSLDXZNLL2taYuH 7FpJt98diCoAw4a6AcuAWxMCiyTyNswR2gsDQO8ylGptyoFGStdREYbJ8zwj4mRDSA cg7RNSBRvoxHUvBB7jmw+w5OM9WUOmUhppGWLdgrC40bIvxrdiZPOEWuM3g4409p0e 
dytDSDKhAxIYA== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Chaitanya Kulkarni , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Leon Romanovsky , Zhu Yanjun Subject: [RFC 16/16] nvme-pci: use blk_rq_dma_map() for NVMe SGL Date: Tue, 5 Mar 2024 12:15:26 +0200 Message-ID: <016fc02cbfa9be3c156a6f74df38def1e09c08f1.1709631413.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: kvm@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Chaitanya Kulkarni Update nvme_iod structure to hold iova, list of DMA linked addresses and total linked count, first one is needed in the request submission path to create a request to DMA mapping and last two are needed in the request completion path to remove the DMA mapping. In nvme_map_data() initialize iova with device, direction, and iova dma length with the help of blk_rq_get_dma_length(). Allocate iova using dma_alloc_iova(). and call in nvme_pci_setup_sgls(). Call newly added blk_rq_dma_map() to create request to DMA mapping and provide a callback function nvme_pci_sgl_map(). In the callback function initialize NVMe SGL dma addresses. Finally in nvme_unmap_data() unlink the dma address and free iova. 
Full disclosure:- ----------------- This is an RFC to demonstrate the newly added DMA APIs can be used to map/unmap bvecs without the use of sg list, hence I've modified the pci code to only handle SGLs for now. Once we have some agreement on the structure of new DMA API I'll add support for PRPs along with all the optimization that I've removed from the code for this RFC for NVMe SGLs and PRPs. I was able to run fio verification job successfully :- $ fio fio/verify.fio --ioengine=io_uring --filename=/dev/nvme0n1 --loops=10 write-and-verify: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=io_uring, iodepth=16 fio-3.36 Starting 1 process Jobs: 1 (f=1): [V(1)][81.6%][r=12.2MiB/s][r=1559 IOPS][eta 03m:00s] write-and-verify: (groupid=0, jobs=1): err= 0: pid=4435: Mon Mar 4 20:54:48 2024 read: IOPS=2789, BW=21.8MiB/s (22.9MB/s)(6473MiB/297008msec) slat (usec): min=4, max=5124, avg=356.51, stdev=604.30 clat (nsec): min=1593, max=23376k, avg=5377076.99, stdev=2039189.93 lat (usec): min=493, max=23407, avg=5733.58, stdev=2103.22 clat percentiles (usec): | 1.00th=[ 1172], 5.00th=[ 2114], 10.00th=[ 2835], 20.00th=[ 3654], | 30.00th=[ 4228], 40.00th=[ 4752], 50.00th=[ 5276], 60.00th=[ 5800], | 70.00th=[ 6325], 80.00th=[ 7046], 90.00th=[ 8094], 95.00th=[ 8979], | 99.00th=[10421], 99.50th=[11076], 99.90th=[12780], 99.95th=[14222], | 99.99th=[16909] write: IOPS=2608, BW=20.4MiB/s (21.4MB/s)(10.0GiB/502571msec); 0 zone resets slat (usec): min=4, max=5787, avg=382.68, stdev=649.01 clat (nsec): min=521, max=23650k, avg=5751363.17, stdev=2676065.35 lat (usec): min=95, max=23674, avg=6134.04, stdev=2813.48 clat percentiles (usec): | 1.00th=[ 709], 5.00th=[ 1270], 10.00th=[ 1958], 20.00th=[ 3261], | 30.00th=[ 4228], 40.00th=[ 5014], 50.00th=[ 5800], 60.00th=[ 6521], | 70.00th=[ 7373], 80.00th=[ 8225], 90.00th=[ 9241], 95.00th=[ 9896], | 99.00th=[11469], 99.50th=[11863], 99.90th=[13960], 99.95th=[15270], | 99.99th=[17695] bw ( KiB/s): min= 1440, 
max=132496, per=99.28%, avg=20715.88, stdev=13123.13, samples=1013 iops : min= 180, max=16562, avg=2589.34, stdev=1640.39, samples=1013 lat (nsec) : 750=0.01% lat (usec) : 2=0.01%, 4=0.01%, 100=0.01%, 250=0.01%, 500=0.07% lat (usec) : 750=0.79%, 1000=1.22% lat (msec) : 2=5.94%, 4=18.87%, 10=69.53%, 20=3.58%, 50=0.01% cpu : usr=1.01%, sys=98.95%, ctx=1591, majf=0, minf=2286 IO depths : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=100.0%, 32=0.0%, >=64=0.0% submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.1%, 32=0.0%, 64=0.0%, >=64=0.0% issued rwts: total=828524,1310720,0,0 short=0,0,0,0 dropped=0,0,0,0 latency : target=0, window=0, percentile=100.00%, depth=16 Run status group 0 (all jobs): READ: bw=21.8MiB/s (22.9MB/s), 21.8MiB/s-21.8MiB/s (22.9MB/s-22.9MB/s), io=6473MiB (6787MB), run=297008-297008msec WRITE: bw=20.4MiB/s (21.4MB/s), 20.4MiB/s-20.4MiB/s (21.4MB/s-21.4MB/s), io=10.0GiB (10.7GB), run=502571-502571msec Disk stats (read/write): nvme0n1: ios=829189/1310720, sectors=13293416/20971520, merge=0/0, ticks=836561/1340351, in_queue=2176913, util=99.30% Signed-off-by: Chaitanya Kulkarni Signed-off-by: Leon Romanovsky --- drivers/nvme/host/pci.c | 220 +++++++++------------------------------- 1 file changed, 49 insertions(+), 171 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e6267a6aa380..140939228409 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -236,7 +236,9 @@ struct nvme_iod { unsigned int dma_len; /* length of single DMA segment mapping */ dma_addr_t first_dma; dma_addr_t meta_dma; - struct sg_table sgt; + struct dma_iova_attrs iova; + dma_addr_t dma_link_address[128]; + u16 nr_dma_link_address; union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS]; }; @@ -521,25 +523,10 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, return true; } -static void nvme_free_prps(struct nvme_dev *dev, struct request *req) -{ - const 
int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - dma_addr_t dma_addr = iod->first_dma; - int i; - - for (i = 0; i < iod->nr_allocations; i++) { - __le64 *prp_list = iod->list[i].prp_list; - dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); - - dma_pool_free(dev->prp_page_pool, prp_list, dma_addr); - dma_addr = next_dma_addr; - } -} - static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + u16 i; if (iod->dma_len) { dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len, @@ -547,9 +534,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) return; } - WARN_ON_ONCE(!iod->sgt.nents); - - dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); + for (i = 0; i < iod->nr_dma_link_address; i++) + dma_unlink_range(&iod->iova, iod->dma_link_address[i]); if (iod->nr_allocations == 0) dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list, @@ -557,120 +543,15 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) else if (iod->nr_allocations == 1) dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list, iod->first_dma); - else - nvme_free_prps(dev, req); - mempool_free(iod->sgt.sgl, dev->iod_mempool); -} - -static void nvme_print_sgl(struct scatterlist *sgl, int nents) -{ - int i; - struct scatterlist *sg; - - for_each_sg(sgl, sg, nents, i) { - dma_addr_t phys = sg_phys(sg); - pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d " - "dma_address:%pad dma_length:%d\n", - i, &phys, sg->offset, sg->length, &sg_dma_address(sg), - sg_dma_len(sg)); - } -} - -static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, - struct request *req, struct nvme_rw_command *cmnd) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct dma_pool *pool; - int length = blk_rq_payload_bytes(req); - struct scatterlist *sg = iod->sgt.sgl; - int dma_len = sg_dma_len(sg); - u64 dma_addr = sg_dma_address(sg); - int offset = 
dma_addr & (NVME_CTRL_PAGE_SIZE - 1); - __le64 *prp_list; - dma_addr_t prp_dma; - int nprps, i; - - length -= (NVME_CTRL_PAGE_SIZE - offset); - if (length <= 0) { - iod->first_dma = 0; - goto done; - } - - dma_len -= (NVME_CTRL_PAGE_SIZE - offset); - if (dma_len) { - dma_addr += (NVME_CTRL_PAGE_SIZE - offset); - } else { - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - dma_len = sg_dma_len(sg); - } - - if (length <= NVME_CTRL_PAGE_SIZE) { - iod->first_dma = dma_addr; - goto done; - } - - nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); - if (nprps <= (256 / 8)) { - pool = dev->prp_small_pool; - iod->nr_allocations = 0; - } else { - pool = dev->prp_page_pool; - iod->nr_allocations = 1; - } - - prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); - if (!prp_list) { - iod->nr_allocations = -1; - return BLK_STS_RESOURCE; - } - iod->list[0].prp_list = prp_list; - iod->first_dma = prp_dma; - i = 0; - for (;;) { - if (i == NVME_CTRL_PAGE_SIZE >> 3) { - __le64 *old_prp_list = prp_list; - prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); - if (!prp_list) - goto free_prps; - iod->list[iod->nr_allocations++].prp_list = prp_list; - prp_list[0] = old_prp_list[i - 1]; - old_prp_list[i - 1] = cpu_to_le64(prp_dma); - i = 1; - } - prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= NVME_CTRL_PAGE_SIZE; - dma_addr += NVME_CTRL_PAGE_SIZE; - length -= NVME_CTRL_PAGE_SIZE; - if (length <= 0) - break; - if (dma_len > 0) - continue; - if (unlikely(dma_len < 0)) - goto bad_sgl; - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - dma_len = sg_dma_len(sg); - } -done: - cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sgt.sgl)); - cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); - return BLK_STS_OK; -free_prps: - nvme_free_prps(dev, req); - return BLK_STS_RESOURCE; -bad_sgl: - WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents), - "Invalid SGL for payload:%d nents:%d\n", - blk_rq_payload_bytes(req), iod->sgt.nents); - return BLK_STS_IOERR; + dma_free_iova(&iod->iova); 
} static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, - struct scatterlist *sg) + dma_addr_t dma_addr, + unsigned int dma_len) { - sge->addr = cpu_to_le64(sg_dma_address(sg)); - sge->length = cpu_to_le32(sg_dma_len(sg)); + sge->addr = cpu_to_le64(dma_addr); + sge->length = cpu_to_le32(dma_len); sge->type = NVME_SGL_FMT_DATA_DESC << 4; } @@ -682,25 +563,37 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; } +struct nvme_pci_sgl_map_data { + struct nvme_iod *iod; + struct nvme_sgl_desc *sgl_list; +}; + +static void nvme_pci_sgl_map(void *data, u32 cnt, dma_addr_t dma_addr, + dma_addr_t offset, u32 len) +{ + struct nvme_pci_sgl_map_data *d = data; + struct nvme_sgl_desc *sgl_list = d->sgl_list; + struct nvme_iod *iod = d->iod; + + nvme_pci_sgl_set_data(&sgl_list[cnt], dma_addr, len); + iod->dma_link_address[cnt] = offset; + iod->nr_dma_link_address++; +} + static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, struct request *req, struct nvme_rw_command *cmd) { + unsigned int entries = blk_rq_nr_phys_segments(req); struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct dma_pool *pool; struct nvme_sgl_desc *sg_list; - struct scatterlist *sg = iod->sgt.sgl; - unsigned int entries = iod->sgt.nents; + struct dma_pool *pool; dma_addr_t sgl_dma; - int i = 0; + int linked_count; + struct nvme_pci_sgl_map_data data; /* setting the transfer type as SGL */ cmd->flags = NVME_CMD_SGL_METABUF; - if (entries == 1) { - nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg); - return BLK_STS_OK; - } - if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { pool = dev->prp_small_pool; iod->nr_allocations = 0; @@ -718,11 +611,13 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, iod->list[0].sg_list = sg_list; iod->first_dma = sgl_dma; - nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); - do { - nvme_pci_sgl_set_data(&sg_list[i++], sg); - sg = sg_next(sg); - } while (--entries > 0); + data.iod = iod; + 
data.sgl_list = sg_list; + + linked_count = blk_rq_dma_map(req, nvme_pci_sgl_map, &data, + &iod->iova); + + nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, linked_count); return BLK_STS_OK; } @@ -788,36 +683,20 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, &cmnd->rw, &bv); } } - - iod->dma_len = 0; - iod->sgt.sgl = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); - if (!iod->sgt.sgl) + iod->iova.dev = dev->dev; + iod->iova.dir = rq_dma_dir(req); + iod->iova.attrs = DMA_ATTR_NO_WARN; + iod->iova.size = blk_rq_get_dma_length(req); + if (!iod->iova.size) return BLK_STS_RESOURCE; - sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req)); - iod->sgt.orig_nents = blk_rq_map_sg(req->q, req, iod->sgt.sgl); - if (!iod->sgt.orig_nents) - goto out_free_sg; - rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), - DMA_ATTR_NO_WARN); - if (rc) { - if (rc == -EREMOTEIO) - ret = BLK_STS_TARGET; - goto out_free_sg; - } + rc = dma_alloc_iova(&iod->iova); + if (rc) + return BLK_STS_RESOURCE; - if (nvme_pci_use_sgls(dev, req, iod->sgt.nents)) - ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); - else - ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); - if (ret != BLK_STS_OK) - goto out_unmap_sg; - return BLK_STS_OK; + iod->dma_len = 0; -out_unmap_sg: - dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); -out_free_sg: - mempool_free(iod->sgt.sgl, dev->iod_mempool); + ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); return ret; } @@ -841,7 +720,6 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) iod->aborted = false; iod->nr_allocations = -1; - iod->sgt.nents = 0; ret = nvme_setup_cmd(req->q->queuedata, req); if (ret)