From patchwork Tue Mar 5 11:18:32 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13582201 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7F79557870; Tue, 5 Mar 2024 11:18:58 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637538; cv=none; b=NRXSY+LoGSe/9uufMFBlhMO0GvfcYPMFbibOZwRU8qWeyp1vAD5RO2G9Bzp0KbqwEpAuovxzmHXRUHY54VEW2yho6uENaWXRxZmJZx0lIAKaP3iKPGH7rPb309iP+DOGSJ+Y9xE6k4ZKbC1MRpyM9x9Plly4ixlFRb+AcdKNOGQ= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637538; c=relaxed/simple; bh=EE9BRQu4WZxxi1OZrE2CKyiBDx8STpwb4qcA00ZbW2s=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=f5D6mFj+RtC2Sn28dy11Tlym4+zpsq8vFH2hOg0XErmu5LO7vQAsjwJRWx0AECG25CGsZxv0AU2YJEL3otmOSkptmbgQVV1/+4nGyJ844wi/qzyfhFmr7z1dELe5xqiRegIC8MvTbjJ1ADjHWS1NFtiIf0npKSnFM30x8AC3uwg= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=oDVDa6DQ; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="oDVDa6DQ" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 9A2EEC43390; Tue, 5 Mar 2024 11:18:57 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709637538; bh=EE9BRQu4WZxxi1OZrE2CKyiBDx8STpwb4qcA00ZbW2s=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=oDVDa6DQwNTcv3NwnWsgiiMKOIV7/K6NDlVTdgGBiYwnB7EisVZPVobiznADeFy7f 
ZNkpViGakTvQNDN5lcqWymg90055eoDxJGkjcEo0SLPgVGObALAW8gzuwJbHHbIrnQ GzaG57EN9Qa1s6MgxTTglPO3tlZGinvm+yuj5Uw6p0CXEuATlMU768QtIgpbaacA4s QEjkVrnPb4cNOVhk/MGHgtTp878F1cJdcZAkgg9j458WXiok8jtBXFcnRaKQl71Tny m6tk3S6ClgeMTJJuB+n/Eg16neudzyOU+cRnBAYeXVb8c2Fit16B8JJyUe8jG5cqdP Ikg+UfLv0Xv6A== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC RESEND 01/16] mm/hmm: let users to tag specific PFNs Date: Tue, 5 Mar 2024 13:18:32 +0200 Message-ID: X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-rdma@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Introduce new sticky flag, which isn't overwritten by HMM range fault. Such flag allows users to tag specific PFNs with extra data in addition to already filled by HMM. 
Signed-off-by: Leon Romanovsky --- include/linux/hmm.h | 3 +++ mm/hmm.c | 34 +++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 126a36571667..b90902baa593 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -23,6 +23,7 @@ struct mmu_interval_notifier; * HMM_PFN_WRITE - if the page memory can be written to (requires HMM_PFN_VALID) * HMM_PFN_ERROR - accessing the pfn is impossible and the device should * fail. ie poisoned memory, special pages, no vma, etc + * HMM_PFN_STICKY - Flag preserved on input-to-output transformation * * On input: * 0 - Return the current state of the page, do not fault it. @@ -36,6 +37,8 @@ enum hmm_pfn_flags { HMM_PFN_VALID = 1UL << (BITS_PER_LONG - 1), HMM_PFN_WRITE = 1UL << (BITS_PER_LONG - 2), HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3), + /* Sticky flag, carried from Input to Output */ + HMM_PFN_STICKY = 1UL << (BITS_PER_LONG - 7), HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 8), /* Input flags */ diff --git a/mm/hmm.c b/mm/hmm.c index 277ddcab4947..9645a72beec0 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -44,8 +44,10 @@ static int hmm_pfns_fill(unsigned long addr, unsigned long end, { unsigned long i = (addr - range->start) >> PAGE_SHIFT; - for (; addr < end; addr += PAGE_SIZE, i++) - range->hmm_pfns[i] = cpu_flags; + for (; addr < end; addr += PAGE_SIZE, i++) { + range->hmm_pfns[i] &= HMM_PFN_STICKY; + range->hmm_pfns[i] |= cpu_flags; + } return 0; } @@ -202,8 +204,10 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr, return hmm_vma_fault(addr, end, required_fault, walk); pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) - hmm_pfns[i] = pfn | cpu_flags; + for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) { + hmm_pfns[i] &= HMM_PFN_STICKY; + hmm_pfns[i] |= pfn | cpu_flags; + } return 0; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -236,7 +240,7 @@ 
static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (required_fault) goto fault; - *hmm_pfn = 0; + *hmm_pfn = *hmm_pfn & HMM_PFN_STICKY; return 0; } @@ -253,14 +257,14 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, cpu_flags = HMM_PFN_VALID; if (is_writable_device_private_entry(entry)) cpu_flags |= HMM_PFN_WRITE; - *hmm_pfn = swp_offset_pfn(entry) | cpu_flags; + *hmm_pfn = (*hmm_pfn & HMM_PFN_STICKY) | swp_offset_pfn(entry) | cpu_flags; return 0; } required_fault = hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0); if (!required_fault) { - *hmm_pfn = 0; + *hmm_pfn = *hmm_pfn & HMM_PFN_STICKY; return 0; } @@ -304,11 +308,11 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, pte_unmap(ptep); return -EFAULT; } - *hmm_pfn = HMM_PFN_ERROR; + *hmm_pfn = (*hmm_pfn & HMM_PFN_STICKY) | HMM_PFN_ERROR; return 0; } - *hmm_pfn = pte_pfn(pte) | cpu_flags; + *hmm_pfn = (*hmm_pfn & HMM_PFN_STICKY) | pte_pfn(pte) | cpu_flags; return 0; fault: @@ -453,8 +457,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end, } pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - for (i = 0; i < npages; ++i, ++pfn) - hmm_pfns[i] = pfn | cpu_flags; + for (i = 0; i < npages; ++i, ++pfn) { + hmm_pfns[i] &= HMM_PFN_STICKY; + hmm_pfns[i] |= pfn | cpu_flags; + } goto out_unlock; } @@ -512,8 +518,10 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask, } pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT); - for (; addr < end; addr += PAGE_SIZE, i++, pfn++) - range->hmm_pfns[i] = pfn | cpu_flags; + for (; addr < end; addr += PAGE_SIZE, i++, pfn++) { + range->hmm_pfns[i] &= HMM_PFN_STICKY; + range->hmm_pfns[i] |= pfn | cpu_flags; + } spin_unlock(ptl); return 0; From patchwork Tue Mar 5 11:18:33 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon 
Romanovsky X-Patchwork-Id: 13582202 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6289C5917C; Tue, 5 Mar 2024 11:19:02 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637543; cv=none; b=ZdS+pVkwean091/IHIbV9UTHoP5s8qj/NtbA6+KwPap/TXC4hMjgbc4gAipyCRtbxHbtMBr0CnHgB1qRLRFixnSF+IuzRqTW1K7yy4LPyIMczKcS8WC/i2pUNvDgdS+zd2RPSU/YnVzCUdOJgNEbO5/seQzlyGx4OpKNwVD22Bo= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637543; c=relaxed/simple; bh=XGhek5Egl9EZN5GDA4K/P4lgi/cIEFOzTc1LzkqAZ+k=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=FhoMTcmuOkHGGu3RFYsXmK+qthJ6WEpMDgvi35CfLhsQeQjDE6LRxT85z54siRHGk3Pd9tFl8CbRfwL1kgkBTVQsaG9tfjavDMGUl+Cyi7qag6JrJgWWAdeKhqk+Os/asCDeOeTi+/KKQViu5pHQg+6PJn40fxZ+JXyPxFBbFGc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=ExrnqGWc; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="ExrnqGWc" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 337BFC43399; Tue, 5 Mar 2024 11:19:02 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709637542; bh=XGhek5Egl9EZN5GDA4K/P4lgi/cIEFOzTc1LzkqAZ+k=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=ExrnqGWc0DJr7OpJKj54LQl6f6CrjIMS2nso33ODtmHYOG4xoeRiikz9mKhHtlevS OW18Yww55g/DlhzTuNXVBVpBGGcPumW3WdmaZ+BSIXs71LW5gBPDeb7bGZAQYLyvPv eofh7gi2/b9TS7bAWiucQd7AmGomodjGllpcXaLcXsgET3VyyxDkx+LqTWXrUBpjFt XZlLtS7lCvQofgbhGV6A2fJWURQN6Cq8QMD8H14fX9kTepqWmYZdq08iM8zY8rsSnt 
iAcZ0j9KR8OSybyHpCjS9/R4re/Fny883NywNVBcPJBPDkiabjHWIsmcoJpY3J4vjr YJS7GIjPXBQ2A== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC RESEND 02/16] dma-mapping: provide an interface to allocate IOVA Date: Tue, 5 Mar 2024 13:18:33 +0200 Message-ID: <54a3554639bfb963c9919c5d7c1f449021bebdb3.1709635535.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-rdma@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Existing .map_page() callback provides two things at the same time: allocates IOVA and links DMA pages. That combination works great for most of the callers who use it in control paths, but less effective in fast paths. These advanced callers already manage their data in some sort of database and can perform IOVA allocation in advance, leaving range linkage operation to be in fast path. Provide an interface to allocate/deallocate IOVA and next patch link/unlink DMA ranges to that specific IOVA. 
Signed-off-by: Leon Romanovsky --- include/linux/dma-map-ops.h | 3 +++ include/linux/dma-mapping.h | 20 ++++++++++++++++++++ kernel/dma/mapping.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 4abc60f04209..bd605b44bb57 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -83,6 +83,9 @@ struct dma_map_ops { size_t (*max_mapping_size)(struct device *dev); size_t (*opt_mapping_size)(void); unsigned long (*get_merge_boundary)(struct device *dev); + + dma_addr_t (*alloc_iova)(struct device *dev, size_t size); + void (*free_iova)(struct device *dev, dma_addr_t dma_addr, size_t size); }; #ifdef CONFIG_DMA_OPS diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 4a658de44ee9..176fb8a86d63 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -91,6 +91,16 @@ static inline void debug_dma_map_single(struct device *dev, const void *addr, } #endif /* CONFIG_DMA_API_DEBUG */ +struct dma_iova_attrs { + /* OUT field */ + dma_addr_t addr; + /* IN fields */ + struct device *dev; + size_t size; + enum dma_data_direction dir; + unsigned long attrs; +}; + #ifdef CONFIG_HAS_DMA static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { @@ -101,6 +111,9 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) return 0; } +int dma_alloc_iova(struct dma_iova_attrs *iova); +void dma_free_iova(struct dma_iova_attrs *iova); + dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs); @@ -159,6 +172,13 @@ void dma_vunmap_noncontiguous(struct device *dev, void *vaddr); int dma_mmap_noncontiguous(struct device *dev, struct vm_area_struct *vma, size_t size, struct sg_table *sgt); #else /* CONFIG_HAS_DMA */ +static inline int dma_alloc_iova(struct dma_iova_attrs *iova) +{ + return -EOPNOTSUPP; +} 
+static inline void dma_free_iova(struct dma_iova_attrs *iova) +{ +} static inline dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 58db8fd70471..b6b27bab90f3 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -183,6 +183,36 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, } EXPORT_SYMBOL(dma_unmap_page_attrs); +int dma_alloc_iova(struct dma_iova_attrs *iova) +{ + struct device *dev = iova->dev; + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_map_direct(dev, ops) || !ops->alloc_iova) { + iova->addr = 0; + return 0; + } + + iova->addr = ops->alloc_iova(dev, iova->size); + if (dma_mapping_error(dev, iova->addr)) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(dma_alloc_iova); + +void dma_free_iova(struct dma_iova_attrs *iova) +{ + struct device *dev = iova->dev; + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_map_direct(dev, ops) || !ops->free_iova) + return; + + ops->free_iova(dev, iova->addr, iova->size); +} +EXPORT_SYMBOL(dma_free_iova); + static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { From patchwork Tue Mar 5 11:18:34 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13582203 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id B71BB5A4ED; Tue, 5 Mar 2024 11:19:06 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637546; 
cv=none; b=n0CkIn7HjHa4kw6HS8aK3x0hjVvneAhmLQceiXCqTIxrPBaC40L9PLv0SBBwPvRmTmy6v7UvJXMWp/xWM8R8Q1Ej7r1dR6OEWZ+l6lw0MMfJkywEEsqcqlnqFxCAc/T4lT5RXj5GqRGLMbogUjns95rR1iDBEwZJH0G5iwThj1c= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637546; c=relaxed/simple; bh=qAJEud8zSEtY+UaYfguXl0FuNLhyjwr4jvzS2H85FiM=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=uOFeUVCht3moNhOjMhjE0hQfOV+6Uhhv2s9qdqzWtRg0lidkv1t2WSBq0OaeDS7bT3Uc752Y/DuC4XLCFEV87EqRp4Nl4go9mVE4Ocagu+FiXG89dvOODXxaZBy8VJm49ZGEzj+Fz8fU7iB6+VmJa5uensB3NAiPbxaLWAWUlb8= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=KjAV+KIv; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="KjAV+KIv" Received: by smtp.kernel.org (Postfix) with ESMTPSA id C7A1EC433C7; Tue, 5 Mar 2024 11:19:05 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709637546; bh=qAJEud8zSEtY+UaYfguXl0FuNLhyjwr4jvzS2H85FiM=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=KjAV+KIvzMLMIdyG6JQAPl5ekfey4SRoKdyMLevM5fk4VwjTRfuycwOlywUkEPDcy 0DlyyyZ/zaORCQOgnjFjU8rjSTbdyLBgVk6eAf8U/7rgufjmCZV0Se7mNNWv57XAwW 3kcOXGxfOWDGLKRTmHc39BKEs25lC2ncwvbVWh9JRe7ynug0LMSFdUR51UNihoDebH 7Rp6XPyJK6e/PYCkYMmdP+NsRMPF6nlrrpQ8zDLr4mpVlJhfrPCf1T4vHDBeatDyJP OIIkQ912eZthQoJoEx5LjrRYDUM2DluK9KVVx2lxW+TXlMzYxT1ExRjxvcAi/tEEWs yPGH84ns91w6w== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, 
linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC RESEND 03/16] dma-mapping: provide callbacks to link/unlink pages to specific IOVA Date: Tue, 5 Mar 2024 13:18:34 +0200 Message-ID: X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-rdma@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Introduce new DMA link/unlink API to provide a way for advanced users to directly map/unmap pages without need to allocate IOVA on every map call. Signed-off-by: Leon Romanovsky --- include/linux/dma-map-ops.h | 10 +++++++ include/linux/dma-mapping.h | 13 +++++++++ kernel/dma/debug.h | 2 ++ kernel/dma/direct.h | 3 ++ kernel/dma/mapping.c | 57 +++++++++++++++++++++++++++++++++++++ 5 files changed, 85 insertions(+) diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index bd605b44bb57..fd03a080df1e 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -86,6 +86,13 @@ struct dma_map_ops { dma_addr_t (*alloc_iova)(struct device *dev, size_t size); void (*free_iova)(struct device *dev, dma_addr_t dma_addr, size_t size); + dma_addr_t (*link_range)(struct device *dev, struct page *page, + unsigned long offset, dma_addr_t addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs); + void (*unlink_range)(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs); }; #ifdef CONFIG_DMA_OPS @@ -428,6 +435,9 @@ bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg, #define arch_dma_unmap_sg_direct(d, s, n) (false) #endif +#define arch_dma_link_range_direct arch_dma_map_page_direct +#define 
arch_dma_unlink_range_direct arch_dma_unmap_page_direct + #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, bool coherent); diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 176fb8a86d63..91cc084adb53 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -113,6 +113,9 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) int dma_alloc_iova(struct dma_iova_attrs *iova); void dma_free_iova(struct dma_iova_attrs *iova); +dma_addr_t dma_link_range(struct page *page, unsigned long offset, + struct dma_iova_attrs *iova, dma_addr_t dma_offset); +void dma_unlink_range(struct dma_iova_attrs *iova, dma_addr_t dma_offset); dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, @@ -179,6 +182,16 @@ static inline int dma_alloc_iova(struct dma_iova_attrs *iova) static inline void dma_free_iova(struct dma_iova_attrs *iova) { } +static inline dma_addr_t dma_link_range(struct page *page, unsigned long offset, + struct dma_iova_attrs *iova, + dma_addr_t dma_offset) +{ + return DMA_MAPPING_ERROR; +} +static inline void dma_unlink_range(struct dma_iova_attrs *iova, + dma_addr_t dma_offset) +{ +} static inline dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs) diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index f525197d3cae..3d529f355c6d 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -127,4 +127,6 @@ static inline void debug_dma_sync_sg_for_device(struct device *dev, { } #endif /* CONFIG_DMA_API_DEBUG */ +#define debug_dma_link_range debug_dma_map_page +#define debug_dma_unlink_range debug_dma_unmap_page #endif /* _KERNEL_DMA_DEBUG_H */ diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index 18d346118fe8..1c30e1cd607a 100644 --- a/kernel/dma/direct.h +++ 
b/kernel/dma/direct.h @@ -125,4 +125,7 @@ static inline void dma_direct_unmap_page(struct device *dev, dma_addr_t addr, swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); } + +#define dma_direct_link_range dma_direct_map_page +#define dma_direct_unlink_range dma_direct_unmap_page #endif /* _KERNEL_DMA_DIRECT_H */ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index b6b27bab90f3..f989c64622c2 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -213,6 +213,63 @@ void dma_free_iova(struct dma_iova_attrs *iova) } EXPORT_SYMBOL(dma_free_iova); +/** + * dma_link_range - Link a physical page to DMA address + * @page: The page to be mapped + * @offset: The offset within the page + * @iova: Preallocated IOVA attributes + * @dma_offset: DMA offset from which this page needs to be linked + * + * dma_alloc_iova() allocates IOVA based on the size specified by the user in + * iova->size. Call this function after IOVA allocation to link @page from + * @offset to get the DMA address. Note that very first call to this function + * will have @dma_offset set to 0 in the IOVA space allocated from + * dma_alloc_iova(). For subsequent calls to this function on same @iova, + * @dma_offset needs to be advanced by the caller with the size of previous + * page that was linked + DMA address returned for the previous page that was + * linked by this function. 
+ */ +dma_addr_t dma_link_range(struct page *page, unsigned long offset, + struct dma_iova_attrs *iova, dma_addr_t dma_offset) +{ + struct device *dev = iova->dev; + size_t size = iova->size; + enum dma_data_direction dir = iova->dir; + unsigned long attrs = iova->attrs; + dma_addr_t addr = iova->addr + dma_offset; + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_map_direct(dev, ops) || + arch_dma_link_range_direct(dev, page_to_phys(page) + offset + size)) + addr = dma_direct_link_range(dev, page, offset, size, dir, attrs); + else if (ops->link_range) + addr = ops->link_range(dev, page, offset, addr, size, dir, attrs); + + kmsan_handle_dma(page, offset, size, dir); + debug_dma_link_range(dev, page, offset, size, dir, addr, attrs); + return addr; +} +EXPORT_SYMBOL(dma_link_range); + +void dma_unlink_range(struct dma_iova_attrs *iova, dma_addr_t dma_offset) +{ + struct device *dev = iova->dev; + size_t size = iova->size; + enum dma_data_direction dir = iova->dir; + unsigned long attrs = iova->attrs; + dma_addr_t addr = iova->addr + dma_offset; + const struct dma_map_ops *ops = get_dma_ops(dev); + + if (dma_map_direct(dev, ops) || + arch_dma_unlink_range_direct(dev, addr + size)) + dma_direct_unlink_range(dev, addr, size, dir, attrs); + else if (ops->unlink_range) + ops->unlink_range(dev, addr, size, dir, attrs); + + debug_dma_unlink_range(dev, addr, size, dir); +} +EXPORT_SYMBOL(dma_unlink_range); + static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { From patchwork Tue Mar 5 11:18:35 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13582204 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org 
(Postfix) with ESMTPS id B930F5B5A9; Tue, 5 Mar 2024 11:19:10 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637550; cv=none; b=l5YhC6EsRTVx2sDjwR5V/2Jp4rI39dQDBiQS/jHeSEN6U6RDL2IM+BRD5Wy+s4ykPY3BZog1T1xbUzK6fpj2RdJw20inBVKaMpWjNcmJ87nscAlfH3yzENTwVHC7gM0+g71C30idhHB9SDjo50glrQkRqP8flkdd1bSnX1I4I4I= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637550; c=relaxed/simple; bh=W6hmByMbviZow30AysrbmdCiStk7NPI/+8Qeoey1JY8=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=IxrWI2Mg+fqy2BH6nKEc83hC280UkDqbNgfHvNhhMYVibsbZWBHyyDgxCMnGcTCoSp8sH/QTnT93IUpjH6R+cWkhYEv+DOVnXOJHU4RDL+eWr3Ks80khrSjTrTOQ0PWXLGKt45ydLK3f4YbMw7X8b/MnIRXVcP22RYMmOMRB4mk= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=MpdeScnU; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="MpdeScnU" Received: by smtp.kernel.org (Postfix) with ESMTPSA id C71CDC43399; Tue, 5 Mar 2024 11:19:09 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709637550; bh=W6hmByMbviZow30AysrbmdCiStk7NPI/+8Qeoey1JY8=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=MpdeScnUIeYk+dhaM84hI3cY+U/wRl1QwW6ChtfURz2UuXIkV/XmMb1iErLTtmXiG /HjgMX099w475jFo54Xe8Q+caajG6pt9XGe0DUtiQeES0xUT7BVd7CoWZIw8EA+uwo 6dsPHe/M7bBaiSWUKfrDQaBJaxVr4WKQToT6uKCEnE7l4hHS5sYzkV1muh3ik4mCtm eJmV3SQytdzwIJJShgEZZtirWhrLq9A38CmemQNqmmnucs2zB1yVGEf9bDZylrpk3V omibGAuvCU7jmybm4/4tvGmiTBVuJ6HkGBg7ybwdZHbeop+Scpi/j4bhWKRUfyTUEQ 3R/7MjGMtOeCw== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , 
Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC RESEND 04/16] iommu/dma: Provide an interface to allow preallocate IOVA Date: Tue, 5 Mar 2024 13:18:35 +0200 Message-ID: X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-rdma@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Separate IOVA allocation to dedicated callback so it will allow cache of IOVA and reuse it in fast paths for devices which support ODP (on-demand-paging) mechanism. Signed-off-by: Leon Romanovsky --- drivers/iommu/dma-iommu.c | 50 +++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 50ccc4f1ef81..e55726783501 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -356,7 +356,7 @@ int iommu_dma_init_fq(struct iommu_domain *domain) atomic_set(&cookie->fq_timer_on, 0); /* * Prevent incomplete fq state being observable. 
Pairs with path from - * __iommu_dma_unmap() through iommu_dma_free_iova() to queue_iova() + * __iommu_dma_unmap() through __iommu_dma_free_iova() to queue_iova() */ smp_wmb(); WRITE_ONCE(cookie->fq_domain, domain); @@ -760,7 +760,7 @@ static int dma_info_to_prot(enum dma_data_direction dir, bool coherent, } } -static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, +static dma_addr_t __iommu_dma_alloc_iova(struct iommu_domain *domain, size_t size, u64 dma_limit, struct device *dev) { struct iommu_dma_cookie *cookie = domain->iova_cookie; @@ -806,7 +806,7 @@ static dma_addr_t iommu_dma_alloc_iova(struct iommu_domain *domain, return (dma_addr_t)iova << shift; } -static void iommu_dma_free_iova(struct iommu_dma_cookie *cookie, +static void __iommu_dma_free_iova(struct iommu_dma_cookie *cookie, dma_addr_t iova, size_t size, struct iommu_iotlb_gather *gather) { struct iova_domain *iovad = &cookie->iovad; @@ -843,7 +843,7 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, if (!iotlb_gather.queued) iommu_iotlb_sync(domain, &iotlb_gather); - iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather); + __iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather); } static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, @@ -861,12 +861,12 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, size = iova_align(iovad, size + iova_off); - iova = iommu_dma_alloc_iova(domain, size, dma_mask, dev); + iova = __iommu_dma_alloc_iova(domain, size, dma_mask, dev); if (!iova) return DMA_MAPPING_ERROR; if (iommu_map(domain, iova, phys - iova_off, size, prot, GFP_ATOMIC)) { - iommu_dma_free_iova(cookie, iova, size, NULL); + __iommu_dma_free_iova(cookie, iova, size, NULL); return DMA_MAPPING_ERROR; } return iova + iova_off; @@ -970,7 +970,7 @@ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev, return NULL; size = iova_align(iovad, size); - iova = iommu_dma_alloc_iova(domain, size, 
dev->coherent_dma_mask, dev); + iova = __iommu_dma_alloc_iova(domain, size, dev->coherent_dma_mask, dev); if (!iova) goto out_free_pages; @@ -1004,7 +1004,7 @@ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev, out_free_sg: sg_free_table(sgt); out_free_iova: - iommu_dma_free_iova(cookie, iova, size, NULL); + __iommu_dma_free_iova(cookie, iova, size, NULL); out_free_pages: __iommu_dma_free_pages(pages, count); return NULL; @@ -1436,7 +1436,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, if (!iova_len) return __finalise_sg(dev, sg, nents, 0); - iova = iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev); + iova = __iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev); if (!iova) { ret = -ENOMEM; goto out_restore_sg; @@ -1453,7 +1453,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg, return __finalise_sg(dev, sg, nents, iova); out_free_iova: - iommu_dma_free_iova(cookie, iova, iova_len, NULL); + __iommu_dma_free_iova(cookie, iova, iova_len, NULL); out_restore_sg: __invalidate_sg(sg, nents); out: @@ -1706,6 +1706,30 @@ static size_t iommu_dma_opt_mapping_size(void) return iova_rcache_range(); } +static dma_addr_t iommu_dma_alloc_iova(struct device *dev, size_t size) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + dma_addr_t dma_mask = dma_get_mask(dev); + + size = iova_align(iovad, size); + return __iommu_dma_alloc_iova(domain, size, dma_mask, dev); +} + +static void iommu_dma_free_iova(struct device *dev, dma_addr_t iova, + size_t size) +{ + struct iommu_domain *domain = iommu_get_dma_domain(dev); + struct iommu_dma_cookie *cookie = domain->iova_cookie; + struct iova_domain *iovad = &cookie->iovad; + struct iommu_iotlb_gather iotlb_gather; + + size = iova_align(iovad, size); + iommu_iotlb_gather_init(&iotlb_gather); + __iommu_dma_free_iova(cookie, iova, 
size, &iotlb_gather); +} + static const struct dma_map_ops iommu_dma_ops = { .flags = DMA_F_PCI_P2PDMA_SUPPORTED, .alloc = iommu_dma_alloc, @@ -1728,6 +1752,8 @@ static const struct dma_map_ops iommu_dma_ops = { .unmap_resource = iommu_dma_unmap_resource, .get_merge_boundary = iommu_dma_get_merge_boundary, .opt_mapping_size = iommu_dma_opt_mapping_size, + .alloc_iova = iommu_dma_alloc_iova, + .free_iova = iommu_dma_free_iova, }; /* @@ -1776,7 +1802,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, if (!msi_page) return NULL; - iova = iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev); + iova = __iommu_dma_alloc_iova(domain, size, dma_get_mask(dev), dev); if (!iova) goto out_free_page; @@ -1790,7 +1816,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, return msi_page; out_free_iova: - iommu_dma_free_iova(cookie, iova, size, NULL); + __iommu_dma_free_iova(cookie, iova, size, NULL); out_free_page: kfree(msi_page); return NULL; From patchwork Tue Mar 5 11:18:36 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13582205 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id F07475676A; Tue, 5 Mar 2024 11:19:14 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637555; cv=none; b=sfoHu4AomiLMp3L63kf1/xlAUylwmzm35t7Fxe7AKgRh1gfLPkpRyWBjfYNmfRLZm2QK79MKmmn/Kb2G85BGJJnBlS6F7HX+VIP/RdleuHTnsewWAmVCEEMaTNSbhaX5/GYUdGZnlu9XpCbva6u+mqhqLRAJIdpqEmS6lQT+bSk= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637555; c=relaxed/simple; 
bh=fDITfGtiOJR6jcG42lBCR+mHX3ThOxLIN0l1OlLSZWc=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=Z0dGs4oq7SJth9HGvCYLWg3v5rQ/EzMlc8fmfv5XvhkiYP5FQbPzpMhlZLUwZG/T/KZgg7ReLmTZgo+8/qtGeYeCFFzFbIKSZQlx2dd9+ffJbTlkQiVrzOH3IuuCugPsLgaEdh0iH5Ena75xtbldT3lTdi66vfvxG2r3r5y+KTA= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=UtR+RVnD; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="UtR+RVnD" Received: by smtp.kernel.org (Postfix) with ESMTPSA id CF30FC43399; Tue, 5 Mar 2024 11:19:13 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709637554; bh=fDITfGtiOJR6jcG42lBCR+mHX3ThOxLIN0l1OlLSZWc=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=UtR+RVnDuKHTmkad8An+k+08jfF/EbhAR2Ufm0sY+g7oiOpnM3KqMyBeFha9FWL7u b2boG4rlmSWII7lfBNUMjOhEuaYoY23cUWa/HkOBe/t6ai8nTS3MPFMk2LN3IkjZ39 BMLsiBs1EKWHs80tDc5Rxm7i3whu9DKChJtmQbKiYp5rHil6wGXNsTaJJ7p90P7Z28 VxkSTyod4a91q2BlUn5JwOCrTAHYYFefrP6ABgy8IV3ATLOOz7Lpps3mK9vbjNmWWb ve/h7tSzs2BOjn9kxTPFrDD5D+Rg1KdtwI3dCfQbpH++v7TFOZIIYca6Gx4SZqTQtc DbdmkCjQVpg1g== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. 
Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC RESEND 05/16] iommu/dma: Prepare map/unmap page functions to receive IOVA Date: Tue, 5 Mar 2024 13:18:36 +0200 Message-ID: <13187a8682ab4f8708ca88cc4363f90e64e14ccc.1709635535.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-rdma@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Extend the existing map_page/unmap_page function implementations to get preallocated IOVA. In such case, the IOVA allocation needs to be skipped, but rest of the code stays the same. Signed-off-by: Leon Romanovsky --- drivers/iommu/dma-iommu.c | 68 ++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index e55726783501..dbdd373a609a 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -824,7 +824,7 @@ static void __iommu_dma_free_iova(struct iommu_dma_cookie *cookie, } static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, - size_t size) + size_t size, bool free_iova) { struct iommu_domain *domain = iommu_get_dma_domain(dev); struct iommu_dma_cookie *cookie = domain->iova_cookie; @@ -843,17 +843,19 @@ static void __iommu_dma_unmap(struct device *dev, dma_addr_t dma_addr, if (!iotlb_gather.queued) iommu_iotlb_sync(domain, &iotlb_gather); - __iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather); + if (free_iova) + __iommu_dma_free_iova(cookie, dma_addr, size, &iotlb_gather); } static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, - size_t size, int prot, u64 dma_mask) + dma_addr_t iova, size_t size, int prot, + u64 dma_mask) { struct iommu_domain *domain = iommu_get_dma_domain(dev); struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iova_domain *iovad = &cookie->iovad; size_t iova_off = iova_offset(iovad, phys); 
- dma_addr_t iova; + bool no_iova = !iova; if (static_branch_unlikely(&iommu_deferred_attach_enabled) && iommu_deferred_attach(dev, domain)) @@ -861,12 +863,14 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, size = iova_align(iovad, size + iova_off); - iova = __iommu_dma_alloc_iova(domain, size, dma_mask, dev); + if (no_iova) + iova = __iommu_dma_alloc_iova(domain, size, dma_mask, dev); if (!iova) return DMA_MAPPING_ERROR; if (iommu_map(domain, iova, phys - iova_off, size, prot, GFP_ATOMIC)) { - __iommu_dma_free_iova(cookie, iova, size, NULL); + if (no_iova) + __iommu_dma_free_iova(cookie, iova, size, NULL); return DMA_MAPPING_ERROR; } return iova + iova_off; @@ -1031,7 +1035,7 @@ static void *iommu_dma_alloc_remap(struct device *dev, size_t size, return vaddr; out_unmap: - __iommu_dma_unmap(dev, *dma_handle, size); + __iommu_dma_unmap(dev, *dma_handle, size, true); __iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT); return NULL; } @@ -1060,7 +1064,7 @@ static void iommu_dma_free_noncontiguous(struct device *dev, size_t size, { struct dma_sgt_handle *sh = sgt_handle(sgt); - __iommu_dma_unmap(dev, sgt->sgl->dma_address, size); + __iommu_dma_unmap(dev, sgt->sgl->dma_address, size, true); __iommu_dma_free_pages(sh->pages, PAGE_ALIGN(size) >> PAGE_SHIFT); sg_free_table(&sh->sgt); kfree(sh); @@ -1131,9 +1135,11 @@ static void iommu_dma_sync_sg_for_device(struct device *dev, arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); } -static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, enum dma_data_direction dir, - unsigned long attrs) +static dma_addr_t __iommu_dma_map_pages(struct device *dev, struct page *page, + unsigned long offset, dma_addr_t iova, + size_t size, + enum dma_data_direction dir, + unsigned long attrs) { phys_addr_t phys = page_to_phys(page) + offset; bool coherent = dev_is_dma_coherent(dev); @@ -1141,7 +1147,7 @@ static dma_addr_t 
iommu_dma_map_page(struct device *dev, struct page *page, struct iommu_domain *domain = iommu_get_dma_domain(dev); struct iommu_dma_cookie *cookie = domain->iova_cookie; struct iova_domain *iovad = &cookie->iovad; - dma_addr_t iova, dma_mask = dma_get_mask(dev); + dma_addr_t addr, dma_mask = dma_get_mask(dev); /* * If both the physical buffer start address and size are @@ -1182,14 +1188,23 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) arch_sync_dma_for_device(phys, size, dir); - iova = __iommu_dma_map(dev, phys, size, prot, dma_mask); - if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(dev, phys)) + addr = __iommu_dma_map(dev, phys, iova, size, prot, dma_mask); + if (addr == DMA_MAPPING_ERROR && is_swiotlb_buffer(dev, phys)) swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); - return iova; + return addr; } -static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction dir, unsigned long attrs) +static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) +{ + return __iommu_dma_map_pages(dev, page, offset, 0, size, dir, attrs); +} + +static void __iommu_dma_unmap_pages(struct device *dev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs, bool free_iova) { struct iommu_domain *domain = iommu_get_dma_domain(dev); phys_addr_t phys; @@ -1201,12 +1216,19 @@ static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle, if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev)) arch_sync_dma_for_cpu(phys, size, dir); - __iommu_dma_unmap(dev, dma_handle, size); + __iommu_dma_unmap(dev, dma_handle, size, free_iova); if (unlikely(is_swiotlb_buffer(dev, phys))) swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs); } +static void iommu_dma_unmap_page(struct device *dev, 
dma_addr_t dma_handle, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + __iommu_dma_unmap_pages(dev, dma_handle, size, dir, attrs, true); +} + /* * Prepare a successfully-mapped scatterlist to give back to the caller. * @@ -1509,13 +1531,13 @@ static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, } if (end) - __iommu_dma_unmap(dev, start, end - start); + __iommu_dma_unmap(dev, start, end - start, true); } static dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys, size_t size, enum dma_data_direction dir, unsigned long attrs) { - return __iommu_dma_map(dev, phys, size, + return __iommu_dma_map(dev, phys, 0, size, dma_info_to_prot(dir, false, attrs) | IOMMU_MMIO, dma_get_mask(dev)); } @@ -1523,7 +1545,7 @@ static dma_addr_t iommu_dma_map_resource(struct device *dev, phys_addr_t phys, static void iommu_dma_unmap_resource(struct device *dev, dma_addr_t handle, size_t size, enum dma_data_direction dir, unsigned long attrs) { - __iommu_dma_unmap(dev, handle, size); + __iommu_dma_unmap(dev, handle, size, true); } static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr) @@ -1560,7 +1582,7 @@ static void __iommu_dma_free(struct device *dev, size_t size, void *cpu_addr) static void iommu_dma_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t handle, unsigned long attrs) { - __iommu_dma_unmap(dev, handle, size); + __iommu_dma_unmap(dev, handle, size, true); __iommu_dma_free(dev, size, cpu_addr); } @@ -1626,7 +1648,7 @@ static void *iommu_dma_alloc(struct device *dev, size_t size, if (!cpu_addr) return NULL; - *handle = __iommu_dma_map(dev, page_to_phys(page), size, ioprot, + *handle = __iommu_dma_map(dev, page_to_phys(page), 0, size, ioprot, dev->coherent_dma_mask); if (*handle == DMA_MAPPING_ERROR) { __iommu_dma_free(dev, size, cpu_addr); From patchwork Tue Mar 5 11:18:37 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit 
X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13582206 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 834045C608; Tue, 5 Mar 2024 11:19:18 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637558; cv=none; b=QCaKBBmQ5t/mVSWdm/oG9ha1mAx3QZHrIGr3QF3dDlCYcLLSN2FbReEQW/nRvSWkyE4POccuqrz4bxwyhemV5qVeQY2NmIOakTGgdGTqZcayLrpZy/De6ajip8wGS51HT2pMLWdRN44RFUbhdbHPF5TmJ+yVaH+ikS/PHYFUO6M= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637558; c=relaxed/simple; bh=buI8QIvB9qQbf6UKyQhoBlduuXAN0B68Jk0a66fCfeA=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=WDcbHcCFGq1MH+jnWD1fugsxXhL/KTb2RY0235X6kGZB9QipJYHCr/XMmgHGTiGqnuaQSo3HHYM0z/0H/yhhq+6LgSMqOOeDEZPOM4scp+FS2lQDEWu3wzubnL6eh2de1axkk2JdI4A40C9vtaBDYTvMleqtQv/if4QhLH6KnnE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=VA62QsmE; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="VA62QsmE" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 57BD1C433C7; Tue, 5 Mar 2024 11:19:17 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709637558; bh=buI8QIvB9qQbf6UKyQhoBlduuXAN0B68Jk0a66fCfeA=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=VA62QsmEtqhXBGAK4rN6VNEitTXLkb/2XYlSb4WBudevL9kDdBoh4tifdL/VjRFar 1H+/u30Q6gMUDemnHEK81ZwzsQARd67GB/2fvAmpk2CcjeRwvxKaTYdXgZ6GlVbmJJ 8uClIzWnmd+JyzMwtCHXLFF+u11aUDPMDymVmgmB8ncNrM8PGMvr5oAFCmcsFQ1qHg 
1PKhw1asupu8LCj6Rp9jbukxSAp+szjyVb0frvcUjnShx1fXiOT6haE07ec60A46X/ 4KPQMde2Kg01UiKCpNqSAs27L3iCvFAZykPVHDsIOgRFXstkHoDfFwljyvBg+8qvrU PdxEE8VVerc/g== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC RESEND 06/16] iommu/dma: Implement link/unlink page callbacks Date: Tue, 5 Mar 2024 13:18:37 +0200 Message-ID: <1d3d26afcdbf95b053a3a44ceff34a4fa5334582.1709635535.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-rdma@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Add an implementation of link/unlink interface to perform in map/unmap pages in fast patch for pre-allocated IOVA. 
Signed-off-by: Leon Romanovsky --- drivers/iommu/dma-iommu.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index dbdd373a609a..b683c4a4e9f8 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1752,6 +1752,21 @@ static void iommu_dma_free_iova(struct device *dev, dma_addr_t iova, __iommu_dma_free_iova(cookie, iova, size, &iotlb_gather); } +static dma_addr_t iommu_dma_link_range(struct device *dev, struct page *page, + unsigned long offset, dma_addr_t iova, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + return __iommu_dma_map_pages(dev, page, offset, iova, size, dir, attrs); +} + +static void iommu_dma_unlink_range(struct device *dev, dma_addr_t addr, + size_t size, enum dma_data_direction dir, + unsigned long attrs) +{ + __iommu_dma_unmap_pages(dev, addr, size, dir, attrs, false); +} + static const struct dma_map_ops iommu_dma_ops = { .flags = DMA_F_PCI_P2PDMA_SUPPORTED, .alloc = iommu_dma_alloc, @@ -1776,6 +1791,8 @@ static const struct dma_map_ops iommu_dma_ops = { .opt_mapping_size = iommu_dma_opt_mapping_size, .alloc_iova = iommu_dma_alloc_iova, .free_iova = iommu_dma_free_iova, + .link_range = iommu_dma_link_range, + .unlink_range = iommu_dma_unlink_range, }; /* From patchwork Tue Mar 5 11:18:38 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13582207 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 15B4157308; Tue, 5 Mar 2024 11:19:22 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637563; cv=none; 
b=T/LcYuKaCDToooEoiq4b3qqGGwkexnZlp68JYLJEvHdtd86biJRU8FF4/GI9yrQNguatGALSutzdYhqV8a3xp0nNoDJXjGVoHSXVIkLbUbbsnJsC5krhCSR8wF193yHLD5HPAMTGPMA4KGlSyHXbcDoeW+baQkdl6eGzLx/f2PA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637563; c=relaxed/simple; bh=E8ahaIx2eHfPJk0G81HhbupoEvLGKO0ealmbIxFQ2TY=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=gq5u7H2qAj1A3zfeVkXXLRbRwZdL91E0AdU87VAUj2qQH3UAN9a4bTdvziFgyP0KgnbBoGWXqo+aw9+HV3B2HkYdlqBfkPAFcsgicR4SkvzQPUg5ZwINPGKUc7kS21vxIDCm+PqoV783mgxvCyMXvF32fX8joZD3LPafDTEgAYU= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=caYziKGN; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="caYziKGN" Received: by smtp.kernel.org (Postfix) with ESMTPSA id BFF43C433F1; Tue, 5 Mar 2024 11:19:21 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709637562; bh=E8ahaIx2eHfPJk0G81HhbupoEvLGKO0ealmbIxFQ2TY=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=caYziKGN4nQ6kN+9eq9OdBuTlgubm0Pb0jxPUg1nvKbDYZ7pFGIzeYrAmlE6IxeVv 746C+LqlVHPQ4dDcVQlVgnA2EHmFaCDoMqB8C5Q3zAXCesOehVRaHHoJvjmHc8io+k obH2BT5XHaaCbjd0RRi0CnpJ0GsTLZmOM8Vs7UUadTlXpFB9Su+dS65MTkhFtRA1TV uwDMsDHNaOVKiuMmBctORC3dngSDNkvdEz9cizg0F/CI1eZqzIjMskfyTBmd3Z/pDN Z1OLKXLVUURP5juEVvIRb6GwritonO9eAzr+ppm65LjXs9YffyvBUxqeu9hjdFCa+0 wMe/HvMTuqPEQ== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, 
linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC RESEND 07/16] RDMA/umem: Preallocate and cache IOVA for UMEM ODP Date: Tue, 5 Mar 2024 13:18:38 +0200 Message-ID: <47cc27fbaf9f4bd19edbcaac380bdd9684c5d12f.1709635535.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-rdma@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky As a preparation to provide two step interface to map pages, preallocate IOVA when UMEM is initialized. Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem_odp.c | 16 +++++++++++++++- include/rdma/ib_umem_odp.h | 1 + include/rdma/ib_verbs.h | 18 ++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index e9fa22d31c23..f69d1233dc82 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -50,6 +50,7 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, const struct mmu_interval_notifier_ops *ops) { + struct ib_device *dev = umem_odp->umem.ibdev; int ret; umem_odp->umem.is_odp = 1; @@ -87,15 +88,25 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, goto out_pfn_list; } + umem_odp->iova.dev = dev->dma_device; + umem_odp->iova.size = end - start; + umem_odp->iova.dir = DMA_BIDIRECTIONAL; + ret = ib_dma_alloc_iova(dev, &umem_odp->iova); + if (ret) + goto out_dma_list; + + ret = mmu_interval_notifier_insert(&umem_odp->notifier, umem_odp->umem.owning_mm, start, end - start, ops); if (ret) - goto out_dma_list; + goto out_free_iova; } return 0; +out_free_iova: + ib_dma_free_iova(dev, 
&umem_odp->iova); out_dma_list: kvfree(umem_odp->dma_list); out_pfn_list: @@ -262,6 +273,8 @@ EXPORT_SYMBOL(ib_umem_odp_get); void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { + struct ib_device *dev = umem_odp->umem.ibdev; + /* * Ensure that no more pages are mapped in the umem. * @@ -274,6 +287,7 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) ib_umem_end(umem_odp)); mutex_unlock(&umem_odp->umem_mutex); mmu_interval_notifier_remove(&umem_odp->notifier); + ib_dma_free_iova(dev, &umem_odp->iova); kvfree(umem_odp->dma_list); kvfree(umem_odp->pfn_list); } diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 0844c1d05ac6..bb2d7f2a5b04 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -23,6 +23,7 @@ struct ib_umem_odp { * See ODP_READ_ALLOWED_BIT and ODP_WRITE_ALLOWED_BIT. */ dma_addr_t *dma_list; + struct dma_iova_attrs iova; /* * The umem_mutex protects the page_list and dma_list fields of an ODP * umem, allowing only a single thread to map/unmap pages. 
The mutex diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b7b6b58dd348..e71fa19187cc 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4077,6 +4077,24 @@ static inline int ib_dma_mapping_error(struct ib_device *dev, u64 dma_addr) return dma_mapping_error(dev->dma_device, dma_addr); } +static inline int ib_dma_alloc_iova(struct ib_device *dev, + struct dma_iova_attrs *iova) +{ + if (ib_uses_virt_dma(dev)) + return 0; + + return dma_alloc_iova(iova); +} + +static inline void ib_dma_free_iova(struct ib_device *dev, + struct dma_iova_attrs *iova) +{ + if (ib_uses_virt_dma(dev)) + return; + + dma_free_iova(iova); +} + /** * ib_dma_map_single - Map a kernel virtual address to DMA address * @dev: The device for which the dma_addr is to be created From patchwork Tue Mar 5 11:18:39 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13582208 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id CF5465CDF4; Tue, 5 Mar 2024 11:19:26 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637566; cv=none; b=dW65QyXgRmOfj1uuswGUGKbcti//cmf42EkG1IuIx1cJfeWkI3qwaJTJ13hC24ur0blduUsIU1uW6ZS27/WC/iju4DDv0eCioMAQA4bSzm4u9MXj5seD6J9N2Tik8emA+SyAWMrOTVSi3OK2N7WwFCBee8QM155lFyNnwpBbpmQ= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637566; c=relaxed/simple; bh=aiC62CDa6AOfDDD5eldoWW566/DmWHKZI7ImrxLpIDI=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=l+uLtRuw9tgtH/AbSYraeDiourIQgMc/resRbOJpVqkHgoCMQxj75rQ0HrHGNY2b6CpcfScB9IM2mkDVazTnMVVH3jOdn0bm2U6HzXyxpjWR0zLtzrju38EPo+TH8eIybqXDwkniuLwFw924du27CYzKpwX179arhhqEs8GiM/Q= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=sxauiQbK; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="sxauiQbK" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 8EE64C433C7; Tue, 5 Mar 2024 11:19:25 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709637566; bh=aiC62CDa6AOfDDD5eldoWW566/DmWHKZI7ImrxLpIDI=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=sxauiQbK5SOfxP09Kr8WJ3H5SXBJpE9ai2Oak8MuIomPFE2wIIL0DffrE3OFirAe+ fExoc7zYDckQcVg7Q9HxN0Kf2KhjxQCYyjQ/bKj+GWsvyYSbaASlXvX3kSQlb8tDV+ negyeJrhLO1x6ETD4+P5oOjlMVmyJ+QmHypJ5Z885h+uYvD9aCDhj+HjwbkCp/odSk Q8jIW/X0VOqYko89iHFoegY+QPyjbFRWTDw1fP1UKGv8oECfT2OeHeAfGxqwpDTyzl SLo0squsEi3NFppDz5F3pi5JdnQvzwyBtrSs4fweoPh62TKRUaCADKcrKk3/aYdmHf p+g0ttSUwTwag== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. 
Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC RESEND 08/16] RDMA/umem: Store ODP access mask information in PFN Date: Tue, 5 Mar 2024 13:18:39 +0200 Message-ID: <88b042d29a28a2866d5bc5ca20bdba4a71bc7aca.1709635535.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-rdma@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky As a preparation to remove of dma_list, store access mask in PFN pointer and not in dma_addr_t. Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem_odp.c | 99 +++++++++++----------------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + drivers/infiniband/hw/mlx5/odp.c | 37 ++++++----- include/rdma/ib_umem_odp.h | 13 ---- 4 files changed, 59 insertions(+), 91 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index f69d1233dc82..3619fb78f786 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -310,22 +310,11 @@ EXPORT_SYMBOL(ib_umem_odp_release); static int ib_umem_odp_map_dma_single_page( struct ib_umem_odp *umem_odp, unsigned int dma_index, - struct page *page, - u64 access_mask) + struct page *page) { struct ib_device *dev = umem_odp->umem.ibdev; dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index]; - if (*dma_addr) { - /* - * If the page is already dma mapped it means it went through - * a non-invalidating trasition, like read-only to writable. - * Resync the flags. 
- */ - *dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask; - return 0; - } - *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift, DMA_BIDIRECTIONAL); if (ib_dma_mapping_error(dev, *dma_addr)) { @@ -333,7 +322,6 @@ static int ib_umem_odp_map_dma_single_page( return -EFAULT; } umem_odp->npages++; - *dma_addr |= access_mask; return 0; } @@ -369,9 +357,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, struct hmm_range range = {}; unsigned long timeout; - if (access_mask == 0) - return -EINVAL; - if (user_virt < ib_umem_start(umem_odp) || user_virt + bcnt > ib_umem_end(umem_odp)) return -EFAULT; @@ -397,7 +382,7 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, if (fault) { range.default_flags = HMM_PFN_REQ_FAULT; - if (access_mask & ODP_WRITE_ALLOWED_BIT) + if (access_mask & HMM_PFN_WRITE) range.default_flags |= HMM_PFN_REQ_WRITE; } @@ -429,22 +414,17 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, for (pfn_index = 0; pfn_index < num_pfns; pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) { - if (fault) { - /* - * Since we asked for hmm_range_fault() to populate - * pages it shouldn't return an error entry on success. - */ - WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR); - WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)); - } else { - if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) { - WARN_ON(umem_odp->dma_list[dma_index]); - continue; - } - access_mask = ODP_READ_ALLOWED_BIT; - if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE) - access_mask |= ODP_WRITE_ALLOWED_BIT; - } + /* + * Since we asked for hmm_range_fault() to populate + * pages it shouldn't return an error entry on success. 
+ */ + WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR); + WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)); + if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) + continue; + + if (range.hmm_pfns[pfn_index] & HMM_PFN_STICKY) + continue; hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]); /* If a hugepage was detected and ODP wasn't set for, the umem @@ -459,13 +439,13 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, } ret = ib_umem_odp_map_dma_single_page( - umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]), - access_mask); + umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index])); if (ret < 0) { ibdev_dbg(umem_odp->umem.ibdev, "ib_umem_odp_map_dma_single_page failed with error %d\n", ret); break; } + range.hmm_pfns[pfn_index] |= HMM_PFN_STICKY; } /* upon success lock should stay on hold for the callee */ if (!ret) @@ -485,7 +465,6 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock); void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { - dma_addr_t dma_addr; dma_addr_t dma; int idx; u64 addr; @@ -496,34 +475,34 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, virt = max_t(u64, virt, ib_umem_start(umem_odp)); bound = min_t(u64, bound, ib_umem_end(umem_odp)); for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) { + unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT; + struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]); + idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; dma = umem_odp->dma_list[idx]; - /* The access flags guaranteed a valid DMA address in case was NULL */ - if (dma) { - unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT; - struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]); - - dma_addr = dma & ODP_DMA_ADDR_MASK; - ib_dma_unmap_page(dev, dma_addr, - BIT(umem_odp->page_shift), - DMA_BIDIRECTIONAL); - if (dma & 
ODP_WRITE_ALLOWED_BIT) { - struct page *head_page = compound_head(page); - /* - * set_page_dirty prefers being called with - * the page lock. However, MMU notifiers are - * called sometimes with and sometimes without - * the lock. We rely on the umem_mutex instead - * to prevent other mmu notifiers from - * continuing and allowing the page mapping to - * be removed. - */ - set_page_dirty(head_page); - } - umem_odp->dma_list[idx] = 0; - umem_odp->npages--; + if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_VALID)) + continue; + if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_STICKY)) + continue; + + ib_dma_unmap_page(dev, dma, BIT(umem_odp->page_shift), + DMA_BIDIRECTIONAL); + if (umem_odp->pfn_list[pfn_idx] & HMM_PFN_WRITE) { + struct page *head_page = compound_head(page); + /* + * set_page_dirty prefers being called with + * the page lock. However, MMU notifiers are + * called sometimes with and sometimes without + * the lock. We rely on the umem_mutex instead + * to prevent other mmu notifiers from + * continuing and allowing the page mapping to + * be removed. + */ + set_page_dirty(head_page); } + umem_odp->pfn_list[pfn_idx] &= ~HMM_PFN_STICKY; + umem_odp->npages--; } } EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index bbe79b86c717..4f368242680d 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -334,6 +334,7 @@ struct mlx5_ib_flow_db { #define MLX5_IB_UPD_XLT_PD BIT(4) #define MLX5_IB_UPD_XLT_ACCESS BIT(5) #define MLX5_IB_UPD_XLT_INDIRECT BIT(6) +#define MLX5_IB_UPD_XLT_DOWNGRADE BIT(7) /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags. 
* diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 4a04cbc5b78a..5713fe25f4de 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "mlx5_ib.h" #include "cmd.h" @@ -143,22 +144,12 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries, } } -static u64 umem_dma_to_mtt(dma_addr_t umem_dma) -{ - u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK; - - if (umem_dma & ODP_READ_ALLOWED_BIT) - mtt_entry |= MLX5_IB_MTT_READ; - if (umem_dma & ODP_WRITE_ALLOWED_BIT) - mtt_entry |= MLX5_IB_MTT_WRITE; - - return mtt_entry; -} - static void populate_mtt(__be64 *pas, size_t idx, size_t nentries, struct mlx5_ib_mr *mr, int flags) { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); + bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE; + unsigned long pfn; dma_addr_t pa; size_t i; @@ -166,8 +157,17 @@ static void populate_mtt(__be64 *pas, size_t idx, size_t nentries, return; for (i = 0; i < nentries; i++) { + pfn = odp->pfn_list[idx + i]; + if (!(pfn & HMM_PFN_VALID)) + /* Initial ODP init */ + continue; + pa = odp->dma_list[idx + i]; - pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); + pa |= MLX5_IB_MTT_READ; + if ((pfn & HMM_PFN_WRITE) && !downgrade) + pa |= MLX5_IB_MTT_WRITE; + + pas[i] = cpu_to_be64(pa); } } @@ -268,8 +268,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni, * estimate the cost of another UMR vs. the cost of bigger * UMR. 
*/ - if (umem_odp->dma_list[idx] & - (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { + if (umem_odp->pfn_list[idx] & HMM_PFN_VALID) { if (!in_block) { blk_start_idx = idx; in_block = 1; @@ -555,7 +554,7 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, { int page_shift, ret, np; bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE; - u64 access_mask; + u64 access_mask = 0; u64 start_idx; bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT); u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC; @@ -563,12 +562,14 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp, if (flags & MLX5_PF_FLAGS_ENABLE) xlt_flags |= MLX5_IB_UPD_XLT_ENABLE; + if (flags & MLX5_PF_FLAGS_DOWNGRADE) + xlt_flags |= MLX5_IB_UPD_XLT_DOWNGRADE; + page_shift = odp->page_shift; start_idx = (user_va - ib_umem_start(odp)) >> page_shift; - access_mask = ODP_READ_ALLOWED_BIT; if (odp->umem.writable && !downgrade) - access_mask |= ODP_WRITE_ALLOWED_BIT; + access_mask |= HMM_PFN_WRITE; np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault); if (np < 0) diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index bb2d7f2a5b04..095b1297cfb1 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -68,19 +68,6 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp) umem_odp->page_shift; } -/* - * The lower 2 bits of the DMA address signal the R/W permissions for - * the entry. To upgrade the permissions, provide the appropriate - * bitmask to the map_dma_pages function. - * - * Be aware that upgrading a mapped address might result in change of - * the DMA address for the page. 
- */ -#define ODP_READ_ALLOWED_BIT (1<<0ULL) -#define ODP_WRITE_ALLOWED_BIT (1<<1ULL) - -#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) - #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct ib_umem_odp * From patchwork Tue Mar 5 11:18:40 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13582209 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9C2045D492; Tue, 5 Mar 2024 11:19:30 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637571; cv=none; b=F6fevKfLee8kBBByB52ZQ+T7E1VmulJux0ZWKblPdVq775pNNtYr3O7Q1EGZ2PY1VL+k6XaK0cpHIRYTEvVSbeBK7t/XTGPpGglddRyQe6o0gtYnVF6hwGIED8jCNqJFmMEtiSEs6Eme++FD6GBkkB3FiM9suHiOKspvSS23A5s= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637571; c=relaxed/simple; bh=VVD1wxwJfNeGBkq+0kcqb+dcMkI1qAL6P7iY3YcEByk=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=KhZjtuFUCynT7obzgH0LdJwJ6ViHSmVIa3nmikZIv+d0NOxQhxzgrCNkV0pzd34Bk53um05++nXepIUfMBEtNI+b7M8LZ3LOC0cgzCO2lIzNjD0g8fwLBL4FW+lDlfxjgiBViw8BPuENd5neL5ZVFxQXEP2WPGWOtZFVRsj8wog= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=gsVdVJp7; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="gsVdVJp7" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 69F55C433C7; Tue, 5 Mar 2024 11:19:29 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; 
t=1709637570; bh=VVD1wxwJfNeGBkq+0kcqb+dcMkI1qAL6P7iY3YcEByk=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=gsVdVJp7zskmKa3d9GUOqTMHCQEwl4g4RFMzHw0F9Akw/QEvnxId23V4Xv7cqKot0 xRs5/G0U/Z7Cxg2a3sBK2FA0j5+Y6mELzKaeJH7JVfezh3qUANE+OQYzEdn7hfuV+X XJeje9A6U7pXEhmMNYneImOs3HGYtHRYiBpsw7oSs84WxpilNIp3CHJ3HJbi3eE6GJ FPHsYluwmY7jPEyxc/0NB2kONZAnoIeR3xQct8EiJ04Yki++9CJxnbmLNjNM0MUFM0 IYebXXRQLLepDcJStMjfViy39+U4doJWkp04ZNjWAC9PKT4QKxlaX4NwISPQ9mIAhA xPpJYEQf9FUtA== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC RESEND 09/16] RDMA/core: Separate DMA mapping to caching IOVA and page linkage Date: Tue, 5 Mar 2024 13:18:40 +0200 Message-ID: <22f9bd2e33ca2ec2b3d3bbd4cbac55122991e02f.1709635535.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-rdma@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Reuse newly added DMA API to cache IOVA and only link/unlink pages in fast path. 
Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/umem_odp.c | 57 ++---------------------------- drivers/infiniband/hw/mlx5/odp.c | 22 +++++++++++- include/rdma/ib_umem_odp.h | 8 +---- include/rdma/ib_verbs.h | 36 +++++++++++++++++++ 4 files changed, 61 insertions(+), 62 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 3619fb78f786..1301009a6b78 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -81,20 +81,13 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, if (!umem_odp->pfn_list) return -ENOMEM; - umem_odp->dma_list = kvcalloc( - ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL); - if (!umem_odp->dma_list) { - ret = -ENOMEM; - goto out_pfn_list; - } umem_odp->iova.dev = dev->dma_device; umem_odp->iova.size = end - start; umem_odp->iova.dir = DMA_BIDIRECTIONAL; ret = ib_dma_alloc_iova(dev, &umem_odp->iova); if (ret) - goto out_dma_list; - + goto out_pfn_list; ret = mmu_interval_notifier_insert(&umem_odp->notifier, umem_odp->umem.owning_mm, @@ -107,8 +100,6 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, out_free_iova: ib_dma_free_iova(dev, &umem_odp->iova); -out_dma_list: - kvfree(umem_odp->dma_list); out_pfn_list: kvfree(umem_odp->pfn_list); return ret; @@ -288,7 +279,6 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) mutex_unlock(&umem_odp->umem_mutex); mmu_interval_notifier_remove(&umem_odp->notifier); ib_dma_free_iova(dev, &umem_odp->iova); - kvfree(umem_odp->dma_list); kvfree(umem_odp->pfn_list); } put_pid(umem_odp->tgid); @@ -296,40 +286,10 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) } EXPORT_SYMBOL(ib_umem_odp_release); -/* - * Map for DMA and insert a single page into the on-demand paging page tables. - * - * @umem: the umem to insert the page to. - * @dma_index: index in the umem to add the dma to. - * @page: the page struct to map and add. 
- * @access_mask: access permissions needed for this page. - * - * The function returns -EFAULT if the DMA mapping operation fails. - * - */ -static int ib_umem_odp_map_dma_single_page( - struct ib_umem_odp *umem_odp, - unsigned int dma_index, - struct page *page) -{ - struct ib_device *dev = umem_odp->umem.ibdev; - dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index]; - - *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift, - DMA_BIDIRECTIONAL); - if (ib_dma_mapping_error(dev, *dma_addr)) { - *dma_addr = 0; - return -EFAULT; - } - umem_odp->npages++; - return 0; -} - /** * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it. * * Maps the range passed in the argument to DMA addresses. - * The DMA addresses of the mapped pages is updated in umem_odp->dma_list. * Upon success the ODP MR will be locked to let caller complete its device * page table update. * @@ -437,15 +397,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt, __func__, hmm_order, page_shift); break; } - - ret = ib_umem_odp_map_dma_single_page( - umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index])); - if (ret < 0) { - ibdev_dbg(umem_odp->umem.ibdev, - "ib_umem_odp_map_dma_single_page failed with error %d\n", ret); - break; - } - range.hmm_pfns[pfn_index] |= HMM_PFN_STICKY; } /* upon success lock should stay on hold for the callee */ if (!ret) @@ -465,7 +416,6 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock); void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { - dma_addr_t dma; int idx; u64 addr; struct ib_device *dev = umem_odp->umem.ibdev; @@ -479,15 +429,14 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]); idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; - dma = umem_odp->dma_list[idx]; if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_VALID)) continue; if 
(!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_STICKY)) continue; - ib_dma_unmap_page(dev, dma, BIT(umem_odp->page_shift), - DMA_BIDIRECTIONAL); + ib_dma_unlink_range(dev, &umem_odp->iova, + idx * (1 << umem_odp->page_shift)); if (umem_odp->pfn_list[pfn_idx] & HMM_PFN_WRITE) { struct page *head_page = compound_head(page); /* diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 5713fe25f4de..13d61f1ab40b 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -149,6 +149,7 @@ static void populate_mtt(__be64 *pas, size_t idx, size_t nentries, { struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE; + struct ib_device *dev = odp->umem.ibdev; unsigned long pfn; dma_addr_t pa; size_t i; @@ -162,12 +163,31 @@ static void populate_mtt(__be64 *pas, size_t idx, size_t nentries, /* Initial ODP init */ continue; - pa = odp->dma_list[idx + i]; + if (pfn & HMM_PFN_STICKY && odp->iova.addr) + /* + * We are in this flow when there is a need to resync flags, + * for example when page was already linked in prefetch call + * with READ flag and now we need to add WRITE flag + * + * This page was already programmed to HW and we don't want/need + * to unlink and link it again just to resync flags. + * + * The DMA address calculation below is based on the fact that + * RDMA UMEM doesn't work with swiotlb. 
+ */ + pa = odp->iova.addr + (idx + i) * (1 << odp->page_shift); + else + pa = ib_dma_link_range(dev, hmm_pfn_to_page(pfn), 0, &odp->iova, + (idx + i) * (1 << odp->page_shift)); + WARN_ON_ONCE(ib_dma_mapping_error(dev, pa)); + pa |= MLX5_IB_MTT_READ; if ((pfn & HMM_PFN_WRITE) && !downgrade) pa |= MLX5_IB_MTT_WRITE; pas[i] = cpu_to_be64(pa); + odp->pfn_list[idx + i] |= HMM_PFN_STICKY; + odp->npages++; } } diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 095b1297cfb1..a786556c65f9 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -17,15 +17,9 @@ struct ib_umem_odp { /* An array of the pfns included in the on-demand paging umem. */ unsigned long *pfn_list; - /* - * An array with DMA addresses mapped for pfns in pfn_list. - * The lower two bits designate access permissions. - * See ODP_READ_ALLOWED_BIT and ODP_WRITE_ALLOWED_BIT. - */ - dma_addr_t *dma_list; struct dma_iova_attrs iova; /* - * The umem_mutex protects the page_list and dma_list fields of an ODP + * The umem_mutex protects the page_list field of an ODP * umem, allowing only a single thread to map/unmap pages. The mutex * also protects access to the mmu notifier counters. 
*/ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e71fa19187cc..c9e2bcd5268a 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4160,6 +4160,42 @@ static inline void ib_dma_unmap_page(struct ib_device *dev, dma_unmap_page(dev->dma_device, addr, size, direction); } +/** + * ib_dma_link_range - Link a physical page to DMA address + * @dev: The device for which the dma_addr is to be created + * @page: The page to be mapped + * @offset: The offset within the page + * @iova: Preallocated IOVA attributes + * @dma_offset: DMA offset + */ +static inline dma_addr_t ib_dma_link_range(struct ib_device *dev, + struct page *page, + unsigned long offset, + struct dma_iova_attrs *iova, + dma_addr_t dma_offset) +{ + if (ib_uses_virt_dma(dev)) + return (uintptr_t)(page_address(page) + offset); + + return dma_link_range(page, offset, iova, dma_offset); +} + +/** + * ib_dma_unlink_range - Unlink a mapping created by ib_dma_link_page() + * @dev: The device for which the DMA address was created + * @iova: DMA IOVA properties + * @dma_offset: DMA offset + */ +static inline void ib_dma_unlink_range(struct ib_device *dev, + struct dma_iova_attrs *iova, + dma_addr_t dma_offset) +{ + if (ib_uses_virt_dma(dev)) + return; + + dma_unlink_range(iova, dma_offset); +} + int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents); static inline int ib_dma_map_sg_attrs(struct ib_device *dev, struct scatterlist *sg, int nents, From patchwork Tue Mar 5 11:18:41 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13582210 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 68DE75D750; Tue, 5 Mar 2024 11:19:34 +0000 (UTC) Authentication-Results: 
smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637574; cv=none; b=iu9MPhppo5Eay5nG3s1F4AgJXtWuRBdpuRq72SGDcDrBV5CHunlN1iLgFKhraBeotas2b72/l5EGp/4gFJMovfu/CTT6Akd359PPAkaW7EJa2VNeASbkdcHfmWM0+eyvd8vV4miodHrRQr/PC4KRPni0nBE0IbI9Yuo5acWzqNc= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637574; c=relaxed/simple; bh=inT0MN5YzB4Hs5AmOhFFcCz6zSy1Lq0FeGslm64Slg8=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=lsvv95FL3gc+ljIxZv4CwHEyS5bwjMRN0GQJY60yth26NrP9k2VVpWSAstqeKfpX+Sixm0T+YCjwRsahrHc8WXBEnQLYdq6C6L8czprKIW0PaMkM+IbOjLXO8JwJ22upyX5KWvrSOwqSKQh+IN++7o1c2qaDXk3Bxacu8eMYg9I= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=CEpa5Pm+; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="CEpa5Pm+" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 59047C43390; Tue, 5 Mar 2024 11:19:33 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709637574; bh=inT0MN5YzB4Hs5AmOhFFcCz6zSy1Lq0FeGslm64Slg8=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=CEpa5Pm+Qqhj2EfkdTwTrAYyAvXIl0YrVxvx+DWKOg/8mx3vqZQn32kmJlhI1Jn5n GkNRqePnBQ708Sr6/JNHoy038LyclUT7V9ppbAO8kM3l2Gld5oEnVCacbFXZ6lGe2x 8BOHhnFPUHmAF/ZNgkUh9zkKatGZOG1TSdpOKgPf8s6lDWYji4TtdmCdYu2MElxBbE isrwPMzFErClEesG4gO7SMHRYcWtqsTO09CRwC5DnOQrJiJQcEenGg/0iPLenz8go1 a5fPge+AvbZx/ANhEW4V7LdpvOPRHNZyJ32k6pOXqocpU3f+5jaKdRBPtPTL+AfaFI ckvCzL1UiaHcg== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex 
Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC RESEND 10/16] RDMA/umem: Prevent UMEM ODP creation with SWIOTLB Date: Tue, 5 Mar 2024 13:18:41 +0200 Message-ID: <8c6d5e7db2d1a01888cc7b9b9850b05e19c75c64.1709635535.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-rdma@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky RDMA UMEM never supported DMA addresses returned from SWIOTLB, as these addresses should be programmed to the hardware which is not aware that it is bounce buffers and not real ones. Instead of silently leave broken system for the users who didn't know it, let's be explicit and return an error to them. Signed-off-by: Leon Romanovsky --- Documentation/core-api/dma-attributes.rst | 7 +++ drivers/infiniband/core/umem_odp.c | 77 +++++++++++------------ include/linux/dma-mapping.h | 6 ++ kernel/dma/direct.h | 4 +- kernel/dma/mapping.c | 4 ++ 5 files changed, 58 insertions(+), 40 deletions(-) diff --git a/Documentation/core-api/dma-attributes.rst b/Documentation/core-api/dma-attributes.rst index 1887d92e8e92..b337ec65d506 100644 --- a/Documentation/core-api/dma-attributes.rst +++ b/Documentation/core-api/dma-attributes.rst @@ -130,3 +130,10 @@ accesses to DMA buffers in both privileged "supervisor" and unprivileged subsystem that the buffer is fully accessible at the elevated privilege level (and ideally inaccessible or at least read-only at the lesser-privileged levels). 
+ +DMA_ATTR_NO_TRANSLATION +----------------------- + +This attribute is used to indicate to the DMA-mapping subsystem that the +buffer is not subject to any address translation. This is used for devices +that doesn't need buffer bouncing or fixing DMA addresses. diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 1301009a6b78..57c56000f60e 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -50,51 +50,50 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, const struct mmu_interval_notifier_ops *ops) { + size_t page_size = 1UL << umem_odp->page_shift; struct ib_device *dev = umem_odp->umem.ibdev; + size_t ndmas, npfns; + unsigned long start; + unsigned long end; int ret; umem_odp->umem.is_odp = 1; mutex_init(&umem_odp->umem_mutex); - if (!umem_odp->is_implicit_odp) { - size_t page_size = 1UL << umem_odp->page_shift; - unsigned long start; - unsigned long end; - size_t ndmas, npfns; - - start = ALIGN_DOWN(umem_odp->umem.address, page_size); - if (check_add_overflow(umem_odp->umem.address, - (unsigned long)umem_odp->umem.length, - &end)) - return -EOVERFLOW; - end = ALIGN(end, page_size); - if (unlikely(end < page_size)) - return -EOVERFLOW; - - ndmas = (end - start) >> umem_odp->page_shift; - if (!ndmas) - return -EINVAL; - - npfns = (end - start) >> PAGE_SHIFT; - umem_odp->pfn_list = kvcalloc( - npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL); - if (!umem_odp->pfn_list) - return -ENOMEM; - - - umem_odp->iova.dev = dev->dma_device; - umem_odp->iova.size = end - start; - umem_odp->iova.dir = DMA_BIDIRECTIONAL; - ret = ib_dma_alloc_iova(dev, &umem_odp->iova); - if (ret) - goto out_pfn_list; - - ret = mmu_interval_notifier_insert(&umem_odp->notifier, - umem_odp->umem.owning_mm, - start, end - start, ops); - if (ret) - goto out_free_iova; - } + if (umem_odp->is_implicit_odp) + return 0; + + start = ALIGN_DOWN(umem_odp->umem.address, page_size); + if 
(check_add_overflow(umem_odp->umem.address, + (unsigned long)umem_odp->umem.length, &end)) + return -EOVERFLOW; + end = ALIGN(end, page_size); + if (unlikely(end < page_size)) + return -EOVERFLOW; + + ndmas = (end - start) >> umem_odp->page_shift; + if (!ndmas) + return -EINVAL; + + npfns = (end - start) >> PAGE_SHIFT; + umem_odp->pfn_list = + kvcalloc(npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL); + if (!umem_odp->pfn_list) + return -ENOMEM; + + umem_odp->iova.dev = dev->dma_device; + umem_odp->iova.size = end - start; + umem_odp->iova.dir = DMA_BIDIRECTIONAL; + umem_odp->iova.attrs = DMA_ATTR_NO_TRANSLATION; + ret = ib_dma_alloc_iova(dev, &umem_odp->iova); + if (ret) + goto out_pfn_list; + + ret = mmu_interval_notifier_insert(&umem_odp->notifier, + umem_odp->umem.owning_mm, start, + end - start, ops); + if (ret) + goto out_free_iova; return 0; diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 91cc084adb53..89945e707a9b 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -62,6 +62,12 @@ */ #define DMA_ATTR_PRIVILEGED (1UL << 9) +/* + * DMA_ATTR_NO_TRANSLATION: used to indicate that the buffer should not be mapped + * through address translation. + */ +#define DMA_ATTR_NO_TRANSLATION (1UL << 10) + /* * A dma_addr_t can hold any valid DMA or bus address for the platform. It can * be given to a device to use as a DMA source or target. 
It is specific to a diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h index 1c30e1cd607a..1c9ec204c999 100644 --- a/kernel/dma/direct.h +++ b/kernel/dma/direct.h @@ -92,6 +92,8 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, if (is_swiotlb_force_bounce(dev)) { if (is_pci_p2pdma_page(page)) return DMA_MAPPING_ERROR; + if (attrs & DMA_ATTR_NO_TRANSLATION) + return DMA_MAPPING_ERROR; return swiotlb_map(dev, phys, size, dir, attrs); } @@ -99,7 +101,7 @@ static inline dma_addr_t dma_direct_map_page(struct device *dev, dma_kmalloc_needs_bounce(dev, size, dir)) { if (is_pci_p2pdma_page(page)) return DMA_MAPPING_ERROR; - if (is_swiotlb_active(dev)) + if (is_swiotlb_active(dev) && !(attrs & DMA_ATTR_NO_TRANSLATION)) return swiotlb_map(dev, phys, size, dir, attrs); dev_WARN_ONCE(dev, 1, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index f989c64622c2..49b1fde510c5 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -188,6 +188,10 @@ int dma_alloc_iova(struct dma_iova_attrs *iova) struct device *dev = iova->dev; const struct dma_map_ops *ops = get_dma_ops(dev); + if (dma_map_direct(dev, ops) && is_swiotlb_force_bounce(dev) && + iova->attrs & DMA_ATTR_NO_TRANSLATION) + return -EOPNOTSUPP; + if (dma_map_direct(dev, ops) || !ops->alloc_iova) { iova->addr = 0; return 0; From patchwork Tue Mar 5 11:18:42 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13582211 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3E7905D91A; Tue, 5 Mar 2024 11:19:37 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637578; cv=none; 
b=jpqSuCAqrfR4kSD/SPrRntw/XWk7VRpJMHknemHyH/DDak2huOT8Dgc4UTmnOy7CPVv7+t3q7gNcttKY4j3cCd2zaxnPiCbt4ux07cnbQFbbJhUE+WOAAWc4iviQg/ZQPkC7Wz6lyE42xn7I+9xk9p1QG3iLyCfmN1o0CovLQRU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637578; c=relaxed/simple; bh=utZ3KdFtdsQAESX2EpyzMrGiuaExMhARHy+JfNBKyD0=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=mRtwmDdDlMokIElTh545v1WoQh3/QLOJ+Tvrc1C0lV72GPPoisS6fnIuhqqdqfkO2iAGj7M8O5QCp33Q7lrkPH/Oio8m3lNFhTeRa7e8E6KdO6C7N52v8rxyCN7GldMgpRgyHuwZKYnbKr+Bx+FfXv8U0zoZznXibWmL+tnLEqs= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=rNfJT5Xt; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="rNfJT5Xt" Received: by smtp.kernel.org (Postfix) with ESMTPSA id DA7B3C433C7; Tue, 5 Mar 2024 11:19:36 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709637577; bh=utZ3KdFtdsQAESX2EpyzMrGiuaExMhARHy+JfNBKyD0=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=rNfJT5XtN75Ck++lflHTuufNoSuRQRIp5fEXn4ts+Oyu8SUrlnDkaCilewhs793Uf seE+pHE7NxgvG6jj4T/7YKQhN8ggD6/95qQ1mZOA7q2EH4quGPHoJ0qRVrNQYIT4IM yIR0OLKZj6wwKuIRx5nKCtXyVl1ovzh8iN9fmf0ko7bRBKeXi8ot2E/rze4WOP2wUb IuhKVcZ/M87fN7cUnwl5UhnM6nlrGfAtauyon9NCUOEbGuT+2retm0kus5cS9hwKqf Vb/65S273UMCMGjIbh/ae5sF7FnhIeGAY80CvUBGoetbUFmr04EJDkd/xzwWzvuAXh nCXYWmptmS2sQ== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, 
linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC RESEND 11/16] vfio/mlx5: Explicitly use number of pages instead of allocated length Date: Tue, 5 Mar 2024 13:18:42 +0200 Message-ID: <01606f62be051034035ef1501b7c721b8a319dcc.1709635535.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-rdma@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky allocated_length is a multiple of page size and number of pages, so let's change the functions to accept number of pages. It opens us a venue to combine receive and send paths together with code readability improvement. Signed-off-by: Leon Romanovsky --- drivers/vfio/pci/mlx5/cmd.c | 31 ++++++++--------- drivers/vfio/pci/mlx5/cmd.h | 10 +++--- drivers/vfio/pci/mlx5/main.c | 65 +++++++++++++++++++++++------------- 3 files changed, 62 insertions(+), 44 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index efd1d252cdc9..45104e47b7b2 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -305,8 +305,7 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, struct mlx5_vhca_recv_buf *recv_buf, u32 *mkey) { - size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) : - recv_buf->npages; + size_t npages = buf ? 
buf->npages : recv_buf->npages; int err = 0, inlen; __be64 *mtt; void *mkc; @@ -362,7 +361,7 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) if (mvdev->mdev_detach) return -ENOTCONN; - if (buf->dmaed || !buf->allocated_length) + if (buf->dmaed || !buf->npages) return -EINVAL; ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); @@ -403,8 +402,7 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) } struct mlx5_vhca_data_buffer * -mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, - size_t length, +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, enum dma_data_direction dma_dir) { struct mlx5_vhca_data_buffer *buf; @@ -416,9 +414,8 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, buf->dma_dir = dma_dir; buf->migf = migf; - if (length) { - ret = mlx5vf_add_migration_pages(buf, - DIV_ROUND_UP_ULL(length, PAGE_SIZE)); + if (npages) { + ret = mlx5vf_add_migration_pages(buf, npages); if (ret) goto end; @@ -444,8 +441,8 @@ void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf) } struct mlx5_vhca_data_buffer * -mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, - size_t length, enum dma_data_direction dma_dir) +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, + enum dma_data_direction dma_dir) { struct mlx5_vhca_data_buffer *buf, *temp_buf; struct list_head free_list; @@ -460,7 +457,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { if (buf->dma_dir == dma_dir) { list_del_init(&buf->buf_elm); - if (buf->allocated_length >= length) { + if (buf->npages >= npages) { spin_unlock_irq(&migf->list_lock); goto found; } @@ -474,7 +471,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, } } spin_unlock_irq(&migf->list_lock); - buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir); + buf = mlx5vf_alloc_data_buffer(migf, npages, dma_dir); 
found: while ((temp_buf = list_first_entry_or_null(&free_list, @@ -645,7 +642,7 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, MLX5_SET(save_vhca_state_in, in, op_mod, 0); MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id); MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey); - MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length); + MLX5_SET(save_vhca_state_in, in, size, buf->npages * PAGE_SIZE); MLX5_SET(save_vhca_state_in, in, incremental, inc); MLX5_SET(save_vhca_state_in, in, set_track, track); @@ -668,8 +665,12 @@ int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev, } if (!header_buf) { - header_buf = mlx5vf_get_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); + u32 npages = DIV_ROUND_UP( + sizeof(struct mlx5_vf_migration_header), + PAGE_SIZE); + + header_buf = + mlx5vf_get_data_buffer(migf, npages, DMA_NONE); if (IS_ERR(header_buf)) { err = PTR_ERR(header_buf); goto err_free; diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index f2c7227fa683..887267ebbd8a 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -60,7 +60,7 @@ struct mlx5_vhca_data_buffer { struct sg_append_table table; loff_t start_pos; u64 length; - u64 allocated_length; + u32 npages; u32 mkey; enum dma_data_direction dma_dir; u8 dmaed:1; @@ -219,12 +219,12 @@ int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf); void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf); void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf); struct mlx5_vhca_data_buffer * -mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, - size_t length, enum dma_data_direction dma_dir); +mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, + enum dma_data_direction dma_dir); void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf); struct mlx5_vhca_data_buffer * -mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, - size_t 
length, enum dma_data_direction dma_dir); +mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, + enum dma_data_direction dma_dir); void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf); int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, unsigned int npages); diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index fe09a8c8af95..b11b1c27d284 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -94,7 +94,7 @@ int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, if (ret) goto err; - buf->allocated_length += filled * PAGE_SIZE; + buf->npages += filled; /* clean input for another bulk allocation */ memset(page_list, 0, filled * sizeof(*page_list)); to_fill = min_t(unsigned int, to_alloc, @@ -352,6 +352,7 @@ static struct mlx5_vhca_data_buffer * mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf, u8 index, size_t required_length) { + u32 npages = DIV_ROUND_UP(required_length, PAGE_SIZE); struct mlx5_vhca_data_buffer *buf = migf->buf[index]; u8 chunk_num; @@ -359,12 +360,11 @@ mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf, chunk_num = buf->stop_copy_chunk_num; buf->migf->buf[index] = NULL; /* Checking whether the pre-allocated buffer can fit */ - if (buf->allocated_length >= required_length) + if (buf->npages >= npages) return buf; mlx5vf_put_data_buffer(buf); - buf = mlx5vf_get_data_buffer(buf->migf, required_length, - DMA_FROM_DEVICE); + buf = mlx5vf_get_data_buffer(buf->migf, npages, DMA_FROM_DEVICE); if (IS_ERR(buf)) return buf; @@ -417,7 +417,9 @@ static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf, u8 *to_buff; int ret; - header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE); + BUILD_BUG_ON(size > PAGE_SIZE); + header_buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(size, PAGE_SIZE), + DMA_NONE); if (IS_ERR(header_buf)) return PTR_ERR(header_buf); @@ -432,7 +434,7 @@ static int 
mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf, to_buff = kmap_local_page(page); memcpy(to_buff, &header, sizeof(header)); header_buf->length = sizeof(header); - data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length); + data.stop_copy_size = cpu_to_le64(migf->buf[0]->npages * PAGE_SIZE); memcpy(to_buff + sizeof(header), &data, sizeof(data)); header_buf->length += sizeof(data); kunmap_local(to_buff); @@ -481,15 +483,22 @@ static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev, num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1; for (i = 0; i < num_chunks; i++) { - buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE); + buf = mlx5vf_get_data_buffer( + migf, DIV_ROUND_UP(inc_state_size, PAGE_SIZE), + DMA_FROM_DEVICE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto err; } + BUILD_BUG_ON(sizeof(struct mlx5_vf_migration_header) > + PAGE_SIZE); migf->buf[i] = buf; - buf = mlx5vf_get_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); + buf = mlx5vf_get_data_buffer( + migf, + DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header), + PAGE_SIZE), + DMA_NONE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto err; @@ -597,7 +606,8 @@ static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd, * We finished transferring the current state and the device has a * dirty state, save a new state to be ready for. 
*/ - buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE); + buf = mlx5vf_get_data_buffer(migf, DIV_ROUND_UP(inc_length, PAGE_SIZE), + DMA_FROM_DEVICE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); mlx5vf_mark_err(migf); @@ -718,8 +728,8 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) if (track) { /* leave the allocated buffer ready for the stop-copy phase */ - buf = mlx5vf_alloc_data_buffer(migf, - migf->buf[0]->allocated_length, DMA_FROM_DEVICE); + buf = mlx5vf_alloc_data_buffer(migf, migf->buf[0]->npages, + DMA_FROM_DEVICE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_pd; @@ -783,16 +793,15 @@ mlx5vf_resume_read_image_no_header(struct mlx5_vhca_data_buffer *vhca_buf, const char __user **buf, size_t *len, loff_t *pos, ssize_t *done) { + u32 npages = DIV_ROUND_UP(requested_length, PAGE_SIZE); int ret; if (requested_length > MAX_LOAD_SIZE) return -ENOMEM; - if (vhca_buf->allocated_length < requested_length) { - ret = mlx5vf_add_migration_pages( - vhca_buf, - DIV_ROUND_UP(requested_length - vhca_buf->allocated_length, - PAGE_SIZE)); + if (vhca_buf->npages < npages) { + ret = mlx5vf_add_migration_pages(vhca_buf, + npages - vhca_buf->npages); if (ret) return ret; } @@ -992,11 +1001,14 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, goto out_unlock; break; case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA: - if (vhca_buf_header->allocated_length < migf->record_size) { + { + u32 npages = DIV_ROUND_UP(migf->record_size, PAGE_SIZE); + + if (vhca_buf_header->npages < npages) { mlx5vf_free_data_buffer(vhca_buf_header); - migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf, - migf->record_size, DMA_NONE); + migf->buf_header[0] = mlx5vf_alloc_data_buffer( + migf, npages, DMA_NONE); if (IS_ERR(migf->buf_header[0])) { ret = PTR_ERR(migf->buf_header[0]); migf->buf_header[0] = NULL; @@ -1009,6 +1021,7 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, vhca_buf_header->start_pos = 
migf->max_pos; migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA; break; + } case MLX5_VF_LOAD_STATE_READ_HEADER_DATA: ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header, &buf, &len, pos, &done); @@ -1019,12 +1032,13 @@ static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf, { u64 size = max(migf->record_size, migf->stop_copy_prep_size); + u32 npages = DIV_ROUND_UP(size, PAGE_SIZE); - if (vhca_buf->allocated_length < size) { + if (vhca_buf->npages < npages) { mlx5vf_free_data_buffer(vhca_buf); migf->buf[0] = mlx5vf_alloc_data_buffer(migf, - size, DMA_TO_DEVICE); + npages, DMA_TO_DEVICE); if (IS_ERR(migf->buf[0])) { ret = PTR_ERR(migf->buf[0]); migf->buf[0] = NULL; @@ -1115,8 +1129,11 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) migf->buf[0] = buf; if (MLX5VF_PRE_COPY_SUPP(mvdev)) { - buf = mlx5vf_alloc_data_buffer(migf, - sizeof(struct mlx5_vf_migration_header), DMA_NONE); + buf = mlx5vf_alloc_data_buffer( + migf, + DIV_ROUND_UP(sizeof(struct mlx5_vf_migration_header), + PAGE_SIZE), + DMA_NONE); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_buf; From patchwork Tue Mar 5 11:18:43 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13582212 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id BD9A75DF2A; Tue, 5 Mar 2024 11:19:41 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637581; cv=none; b=ifeUGCxWhrQig1z01LsD8ZPH1LW7jLtvxrc9bf9s9PSEvtMwfpAF8mm2UdFZQVjnJzcqyDNhH+EWBVv358U2x1II6btQzhMRk0BGLGDgoFft2i+zEis17ZfPo7y8+RmTBThzhJuREoHkXLuo8EY7Ss67YArkUohetRH5F7Jr5lY= ARC-Message-Signature: 
i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637581; c=relaxed/simple; bh=ktX9dWW2cFJcgdIR5D6ZbOLB18GCIDFrg21/P33AIJM=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=kDpiWwHBm+XgS17mcnK72uAEB3+sxTX6YuDuVEckQep+llyJcwgs5LCI+++E4CB+Wv67piUKh5C+2WyuOGoEGwIKAOQjvFn9iqFpZVghYTJ0y7WXzTZTAnIkVrqXFt7OaOh5pDG5U59ZGZv+SWT6/vPq/GWdbYyCKIloQfbzu/M= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=fey7wsaZ; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="fey7wsaZ" Received: by smtp.kernel.org (Postfix) with ESMTPSA id E2D66C433F1; Tue, 5 Mar 2024 11:19:40 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709637581; bh=ktX9dWW2cFJcgdIR5D6ZbOLB18GCIDFrg21/P33AIJM=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=fey7wsaZdG45ZTpTCA+xNyFGkmbDKdEi/dCaLd8ajsXQPHtK/ULwBj1b093LSAn7Y UEsaBj2FuRckdNQ6Gujpmw48VjHXpNM1fSbQl5oI3g3CDuJYMY6DSFYtuR4RKD1hfJ y3qjLYezmudL7JBwgocAquKub46DL5PHx9MLVg9wRdKrq9yQKpJuvfRze9BmJKfQTE 8vzPYJrKGJZf8WQwAvGqrFw/R15+wLrNi5Vic0VFySi05yEvO7tShLpjnFAaBr0BMF 0782VG0ciDZd2UAETtRfSPOJkUJ8Z2QoO2Lr9sjX0IEekD5gZdbuKEH3QUMAXTi28c v7um/BHiwnWjA== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , 
"josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC RESEND 12/16] vfio/mlx5: Rewrite create mkey flow to allow better code reuse Date: Tue, 5 Mar 2024 13:18:43 +0200 Message-ID: <9366169430357d953e961cd41ae912c5fbd3f568.1709635535.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-rdma@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Change the creation of mkey to be performed in multiple steps: data allocation, DMA setup and actual call to HW to create that mkey. In this new flow, the whole input to MKEY command is saved to eliminate the need to keep array of pointers for DMA addresses for receive list and in the future patches for send list too. In addition to memory size reduce and elimination of unnecessary data movements to set MKEY input, the code is prepared for future reuse. Signed-off-by: Leon Romanovsky --- drivers/vfio/pci/mlx5/cmd.c | 149 +++++++++++++++++++++--------------- drivers/vfio/pci/mlx5/cmd.h | 3 +- 2 files changed, 88 insertions(+), 64 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 45104e47b7b2..44762980fcb9 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -300,39 +300,21 @@ static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id, return ret; } -static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, - struct mlx5_vhca_data_buffer *buf, - struct mlx5_vhca_recv_buf *recv_buf, - u32 *mkey) +static u32 *alloc_mkey_in(u32 npages, u32 pdn) { - size_t npages = buf ? 
buf->npages : recv_buf->npages; - int err = 0, inlen; - __be64 *mtt; + int inlen; void *mkc; u32 *in; inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + - sizeof(*mtt) * round_up(npages, 2); + sizeof(__be64) * round_up(npages, 2); - in = kvzalloc(inlen, GFP_KERNEL); + in = kvzalloc(inlen, GFP_KERNEL_ACCOUNT); if (!in) - return -ENOMEM; + return NULL; MLX5_SET(create_mkey_in, in, translations_octword_actual_size, DIV_ROUND_UP(npages, 2)); - mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); - - if (buf) { - struct sg_dma_page_iter dma_iter; - - for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) - *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); - } else { - int i; - - for (i = 0; i < npages; i++) - *mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]); - } mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); @@ -346,9 +328,30 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn, MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT); MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2)); MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE); - err = mlx5_core_create_mkey(mdev, mkey, in, inlen); - kvfree(in); - return err; + + return in; +} + +static int create_mkey(struct mlx5_core_dev *mdev, u32 npages, + struct mlx5_vhca_data_buffer *buf, u32 *mkey_in, + u32 *mkey) +{ + __be64 *mtt; + int inlen; + + mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt); + + if (buf) { + struct sg_dma_page_iter dma_iter; + + for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) + *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); + } + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + + sizeof(__be64) * round_up(npages, 2); + + return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen); } static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) @@ -368,13 +371,22 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) if (ret) return ret; - ret 
= _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey); - if (ret) + buf->mkey_in = alloc_mkey_in(buf->npages, buf->migf->pdn); + if (!buf->mkey_in) { + ret = -ENOMEM; goto err; + } + + ret = create_mkey(mdev, buf->npages, buf, buf->mkey_in, &buf->mkey); + if (ret) + goto err_create_mkey; buf->dmaed = true; return 0; + +err_create_mkey: + kvfree(buf->mkey_in); err: dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); return ret; @@ -390,6 +402,7 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) if (buf->dmaed) { mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey); + kvfree(buf->mkey_in); dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt, buf->dma_dir, 0); } @@ -1286,46 +1299,45 @@ static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, return -ENOMEM; } -static int register_dma_recv_pages(struct mlx5_core_dev *mdev, - struct mlx5_vhca_recv_buf *recv_buf) +static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages, + u32 *mkey_in) { - int i, j; + dma_addr_t addr; + __be64 *mtt; + int i; - recv_buf->dma_addrs = kvcalloc(recv_buf->npages, - sizeof(*recv_buf->dma_addrs), - GFP_KERNEL_ACCOUNT); - if (!recv_buf->dma_addrs) - return -ENOMEM; + mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt); - for (i = 0; i < recv_buf->npages; i++) { - recv_buf->dma_addrs[i] = dma_map_page(mdev->device, - recv_buf->page_list[i], - 0, PAGE_SIZE, - DMA_FROM_DEVICE); - if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i])) - goto error; + for (i = npages - 1; i >= 0; i--) { + addr = be64_to_cpu(mtt[i]); + dma_unmap_single(mdev->device, addr, PAGE_SIZE, + DMA_FROM_DEVICE); } - return 0; - -error: - for (j = 0; j < i; j++) - dma_unmap_single(mdev->device, recv_buf->dma_addrs[j], - PAGE_SIZE, DMA_FROM_DEVICE); - - kvfree(recv_buf->dma_addrs); - return -ENOMEM; } -static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev, - struct mlx5_vhca_recv_buf *recv_buf) +static int 
register_dma_pages(struct mlx5_core_dev *mdev, u32 npages, + struct page **page_list, u32 *mkey_in) { + dma_addr_t addr; + __be64 *mtt; int i; - for (i = 0; i < recv_buf->npages; i++) - dma_unmap_single(mdev->device, recv_buf->dma_addrs[i], - PAGE_SIZE, DMA_FROM_DEVICE); + mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt); + + for (i = 0; i < npages; i++) { + addr = dma_map_page(mdev->device, page_list[i], 0, PAGE_SIZE, + DMA_FROM_DEVICE); + if (dma_mapping_error(mdev->device, addr)) + goto error; + + *mtt++ = cpu_to_be64(addr); + } + + return 0; - kvfree(recv_buf->dma_addrs); +error: + unregister_dma_pages(mdev, i, mkey_in); + return -ENOMEM; } static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev, @@ -1334,7 +1346,8 @@ static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev, struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; mlx5_core_destroy_mkey(mdev, recv_buf->mkey); - unregister_dma_recv_pages(mdev, recv_buf); + unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in); + kvfree(recv_buf->mkey_in); free_recv_pages(&qp->recv_buf); } @@ -1350,18 +1363,28 @@ static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev, if (err < 0) return err; - err = register_dma_recv_pages(mdev, recv_buf); - if (err) + recv_buf->mkey_in = alloc_mkey_in(npages, pdn); + if (!recv_buf->mkey_in) { + err = -ENOMEM; goto end; + } + + err = register_dma_pages(mdev, npages, recv_buf->page_list, + recv_buf->mkey_in); + if (err) + goto err_register_dma; - err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey); + err = create_mkey(mdev, npages, NULL, recv_buf->mkey_in, + &recv_buf->mkey); if (err) goto err_create_mkey; return 0; err_create_mkey: - unregister_dma_recv_pages(mdev, recv_buf); + unregister_dma_pages(mdev, npages, recv_buf->mkey_in); +err_register_dma: + kvfree(recv_buf->mkey_in); end: free_recv_pages(recv_buf); return err; diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 
887267ebbd8a..83728c0669e7 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -62,6 +62,7 @@ struct mlx5_vhca_data_buffer { u64 length; u32 npages; u32 mkey; + u32 *mkey_in; enum dma_data_direction dma_dir; u8 dmaed:1; u8 stop_copy_chunk_num; @@ -137,8 +138,8 @@ struct mlx5_vhca_cq { struct mlx5_vhca_recv_buf { u32 npages; struct page **page_list; - dma_addr_t *dma_addrs; u32 next_rq_offset; + u32 *mkey_in; u32 mkey; }; From patchwork Tue Mar 5 11:18:44 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13582213 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id CAB6F58AA2; Tue, 5 Mar 2024 11:19:45 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637585; cv=none; b=MKUfLlwwf1wIrYv+h0m3kpuYHlKbrR1i77k9SJt308r0/KKSlctq/rB+aHD+kL4I9spNBPPdv2lYFlkwv/RBh+P2F/FBlRZdt4Tgg4uYmcPVKzf7jtAbLEo5NPIfrqwvceImXvzz3eoymgHT7epZdKprMP8Md6hLvmI+G3mffGU= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637585; c=relaxed/simple; bh=WBfM8ZvY1JfgPv87VMT5XwFqFK1pOKYXqtZNVa4/Kmg=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=eBJJjPfYsjp3Z8vefBJYrFRz2guwBDUxulJ3f+WackvfAiNBWg4PyymCIGqKQXRtuVbU6DYO7LI925Sd/7MgkgDZc7UhQCGMEBPeZjyMQ1XDT3IjWDQWKBkPwsWfVi/bgMmIRqlJQcTpGjXh1L69njfVUmBmY316uzikci0edBM= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=kbsMf9DJ; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org 
header.i=@kernel.org header.b="kbsMf9DJ" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 980C4C433C7; Tue, 5 Mar 2024 11:19:44 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709637585; bh=WBfM8ZvY1JfgPv87VMT5XwFqFK1pOKYXqtZNVa4/Kmg=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=kbsMf9DJdoige6tJLshijtU8Kah4MBtJADzS0kuryTgbhA9Aurzvs2NHr9PC2Sxdk aWMyTQA9cpqJ9BZzF2xzaOYOiC9rT0wRI5JB4o6/Bd19PA6j8h5et4xH1Ybnssusxv ofz4zG1sNo7UQbdSRgvuO4DR++wHk5xf9qd7pP3BUoMIXhUyuiKKBSd7cguagJOd5c 2If7Dk3oa46O/Qqcblo2jX+SAbFRpj9bAAcaayM0w5quJX8gW8FLX3ZCbV05DbU0rN OkYxpZQZxj/lFGRv3G3vvw9fuADM/7YcjA+xBr5fSWKCwCH4P1a8ifmHULe6ES3OET IDThNn5c/6MLg== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC RESEND 13/16] vfio/mlx5: Explicitly store page list Date: Tue, 5 Mar 2024 13:18:44 +0200 Message-ID: <1d0ca7408af6e5f0bb09baffd021bc72287e5ed8.1709635535.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-rdma@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky As a preparation to removal scatter-gather table and unifying receive and send list, explicitly store page list. 
Signed-off-by: Leon Romanovsky --- drivers/vfio/pci/mlx5/cmd.c | 1 + drivers/vfio/pci/mlx5/cmd.h | 1 + drivers/vfio/pci/mlx5/main.c | 35 +++++++++++++++++------------------ 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 44762980fcb9..5e2103042d9b 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -411,6 +411,7 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) __free_page(sg_page_iter_page(&sg_iter)); sg_free_append_table(&buf->table); + kvfree(buf->page_list); kfree(buf); } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 83728c0669e7..815fcb54494d 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -57,6 +57,7 @@ struct mlx5_vf_migration_header { }; struct mlx5_vhca_data_buffer { + struct page **page_list; struct sg_append_table table; loff_t start_pos; u64 length; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index b11b1c27d284..7ffe24693a55 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -69,44 +69,43 @@ int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, unsigned int npages) { unsigned int to_alloc = npages; + size_t old_size, new_size; struct page **page_list; unsigned long filled; unsigned int to_fill; int ret; - to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); - page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT); + to_fill = min_t(unsigned int, npages, + PAGE_SIZE / sizeof(*buf->page_list)); + old_size = buf->npages * sizeof(*buf->page_list); + new_size = old_size + to_fill * sizeof(*buf->page_list); + page_list = kvrealloc(buf->page_list, old_size, new_size, + GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!page_list) return -ENOMEM; + buf->page_list = page_list; + do { filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, 
to_fill, - page_list); - if (!filled) { - ret = -ENOMEM; - goto err; - } + buf->page_list + buf->npages); + if (!filled) + return -ENOMEM; + to_alloc -= filled; ret = sg_alloc_append_table_from_pages( - &buf->table, page_list, filled, 0, + &buf->table, buf->page_list + buf->npages, filled, 0, filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, GFP_KERNEL_ACCOUNT); - if (ret) - goto err; + return ret; + buf->npages += filled; - /* clean input for another bulk allocation */ - memset(page_list, 0, filled * sizeof(*page_list)); to_fill = min_t(unsigned int, to_alloc, - PAGE_SIZE / sizeof(*page_list)); + PAGE_SIZE / sizeof(*buf->page_list)); } while (to_alloc > 0); - kvfree(page_list); return 0; - -err: - kvfree(page_list); - return ret; } static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf) From patchwork Tue Mar 5 11:18:45 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13582214 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 57DAB5F876; Tue, 5 Mar 2024 11:19:49 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637589; cv=none; b=FVUfs9XoC6fBLNGpj9eycwBJ4W+TrKl9yH8quf9eA4BM8s6YEoswnQfPZoGKEOQaNdJIDlWmE+w6Wf70T79x545nKJtCHeIWPI9c6HHCf81WoQjBsel/vDRrIYFR8kD1Ga7M4cRqqIga7PBKSocFDe2k7pkNlKDS3geXN4r6i/k= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637589; c=relaxed/simple; bh=4IocgUHLj9eV065W5P3anYxI4nhTW2naxln23AtMtis=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; 
b=EXi9hYYHLevJhgOcw5jSkQe0zWC8CPTVvHZmH3WKne95G/X6t3ZHHvaL0fgIt0i5dAHsGQ3kIQnWdV+rMPVAyTqpufAxU0rmTH2Ujis4ScLWezPDM6CNFm+XeZfZykaXwrR1Iqo2CGfDGg6cYB/AzsAbKCfTIIG5LOfUsQPBsSE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=Kwf4ZBdi; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="Kwf4ZBdi" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 8560AC433B1; Tue, 5 Mar 2024 11:19:48 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709637589; bh=4IocgUHLj9eV065W5P3anYxI4nhTW2naxln23AtMtis=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=Kwf4ZBdivfDQ4nrqWyYJrdyGu+dTAKFt2w4DqfgV8hY2TF47G51LQsFhZizuBN0nl /LddIKms0IAfT0WHmYNU7cYBQM3+ojvIkcJyjq/EymmNLeyaASJ4pO0/yFwppdyV1J zCp+YcKJpzRFJdxeq2F+TnKDMaF4KgtTel4e936sGzxRr6kW5CAa6z3bp7+MrR9DZ1 67ZVLQoEwvtgJ/syi3B4Z3gpWPdGttjzmmu/HWJLILXzKwcrmmZGwIgAXTLxXjkI1c dXc+I3B9nwh/zRdzrJA3WiEIm3tdPaR6SonuiOpFWbMhZlxlwWuadZ9uQklYcFXI6M WBWzVN7llF2sg== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Leon Romanovsky , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. 
Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Zhu Yanjun Subject: [RFC RESEND 14/16] vfio/mlx5: Convert vfio to use DMA link API Date: Tue, 5 Mar 2024 13:18:45 +0200 Message-ID: X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-rdma@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Leon Romanovsky Remove intermediate scatter-gather table as it is not needed if DMA link API is used. This conversion reduces drastically the memory used to manage that table. Signed-off-by: Leon Romanovsky --- drivers/vfio/pci/mlx5/cmd.c | 177 ++++++++++++++++------------------- drivers/vfio/pci/mlx5/cmd.h | 8 +- drivers/vfio/pci/mlx5/main.c | 50 ++-------- 3 files changed, 91 insertions(+), 144 deletions(-) diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 5e2103042d9b..cfae03f7b7da 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -332,26 +332,60 @@ static u32 *alloc_mkey_in(u32 npages, u32 pdn) return in; } -static int create_mkey(struct mlx5_core_dev *mdev, u32 npages, - struct mlx5_vhca_data_buffer *buf, u32 *mkey_in, +static int create_mkey(struct mlx5_core_dev *mdev, u32 npages, u32 *mkey_in, u32 *mkey) { + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + + sizeof(__be64) * round_up(npages, 2); + + return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen); +} + +static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages, + u32 *mkey_in, struct dma_iova_attrs *iova) +{ + dma_addr_t addr; __be64 *mtt; - int inlen; + int i; mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt); - if (buf) { - struct sg_dma_page_iter dma_iter; + for (i = npages - 1; i >= 0; i--) { + addr = be64_to_cpu(mtt[i]); + dma_unlink_range(iova, addr); + } + dma_free_iova(iova); +} + +static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages, + struct page **page_list, u32 *mkey_in, + struct dma_iova_attrs *iova) +{ + 
dma_addr_t addr; + __be64 *mtt; + int i, err; + + iova->dev = mdev->device; + iova->size = npages * PAGE_SIZE; + err = dma_alloc_iova(iova); + if (err) + return err; + + mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt); + + for (i = 0; i < npages; i++) { + addr = dma_link_range(page_list[i], 0, iova, i * PAGE_SIZE); + if (dma_mapping_error(mdev->device, addr)) + goto error; - for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0) - *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter)); + *mtt++ = cpu_to_be64(addr); } - inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + - sizeof(__be64) * round_up(npages, 2); + return 0; - return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen); +error: + unregister_dma_pages(mdev, i, mkey_in, iova); + return -ENOMEM; } static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) @@ -367,17 +401,16 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) if (buf->dmaed || !buf->npages) return -EINVAL; - ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); - if (ret) - return ret; - buf->mkey_in = alloc_mkey_in(buf->npages, buf->migf->pdn); - if (!buf->mkey_in) { - ret = -ENOMEM; - goto err; - } + if (!buf->mkey_in) + return -ENOMEM; + + ret = register_dma_pages(mdev, buf->npages, buf->page_list, + buf->mkey_in, &buf->iova); + if (ret) + goto err_register_dma; - ret = create_mkey(mdev, buf->npages, buf, buf->mkey_in, &buf->mkey); + ret = create_mkey(mdev, buf->npages, buf->mkey_in, &buf->mkey); if (ret) goto err_create_mkey; @@ -386,32 +419,39 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf) return 0; err_create_mkey: + unregister_dma_pages(mdev, buf->npages, buf->mkey_in, &buf->iova); +err_register_dma: kvfree(buf->mkey_in); -err: - dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0); return ret; } +static void free_page_list(u32 npages, struct page **page_list) +{ + int i; + + /* Undo alloc_pages_bulk_array() */ + for (i = npages - 1; i 
>= 0; i--) + __free_page(page_list[i]); + + kvfree(page_list); +} + void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf) { - struct mlx5_vf_migration_file *migf = buf->migf; - struct sg_page_iter sg_iter; + struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev; + struct mlx5_core_dev *mdev = mvdev->mdev; - lockdep_assert_held(&migf->mvdev->state_mutex); - WARN_ON(migf->mvdev->mdev_detach); + lockdep_assert_held(&mvdev->state_mutex); + WARN_ON(mvdev->mdev_detach); if (buf->dmaed) { - mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey); + mlx5_core_destroy_mkey(mdev, buf->mkey); + unregister_dma_pages(mdev, buf->npages, buf->mkey_in, + &buf->iova); kvfree(buf->mkey_in); - dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt, - buf->dma_dir, 0); } - /* Undo alloc_pages_bulk_array() */ - for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) - __free_page(sg_page_iter_page(&sg_iter)); - sg_free_append_table(&buf->table); - kvfree(buf->page_list); + free_page_list(buf->npages, buf->page_list); kfree(buf); } @@ -426,7 +466,7 @@ mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, if (!buf) return ERR_PTR(-ENOMEM); - buf->dma_dir = dma_dir; + buf->iova.dir = dma_dir; buf->migf = migf; if (npages) { ret = mlx5vf_add_migration_pages(buf, npages); @@ -469,7 +509,7 @@ mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf, u32 npages, spin_lock_irq(&migf->list_lock); list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { - if (buf->dma_dir == dma_dir) { + if (buf->iova.dir == dma_dir) { list_del_init(&buf->buf_elm); if (buf->npages >= npages) { spin_unlock_irq(&migf->list_lock); @@ -1253,17 +1293,6 @@ static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev, kfree(qp); } -static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf) -{ - int i; - - /* Undo alloc_pages_bulk_array() */ - for (i = 0; i < recv_buf->npages; i++) - __free_page(recv_buf->page_list[i]); - - kvfree(recv_buf->page_list); -} - static int 
alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, unsigned int npages) { @@ -1300,56 +1329,16 @@ static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf, return -ENOMEM; } -static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages, - u32 *mkey_in) -{ - dma_addr_t addr; - __be64 *mtt; - int i; - - mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt); - - for (i = npages - 1; i >= 0; i--) { - addr = be64_to_cpu(mtt[i]); - dma_unmap_single(mdev->device, addr, PAGE_SIZE, - DMA_FROM_DEVICE); - } -} - -static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages, - struct page **page_list, u32 *mkey_in) -{ - dma_addr_t addr; - __be64 *mtt; - int i; - - mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt); - - for (i = 0; i < npages; i++) { - addr = dma_map_page(mdev->device, page_list[i], 0, PAGE_SIZE, - DMA_FROM_DEVICE); - if (dma_mapping_error(mdev->device, addr)) - goto error; - - *mtt++ = cpu_to_be64(addr); - } - - return 0; - -error: - unregister_dma_pages(mdev, i, mkey_in); - return -ENOMEM; -} - static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev, struct mlx5_vhca_qp *qp) { struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf; mlx5_core_destroy_mkey(mdev, recv_buf->mkey); - unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in); + unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in, + &recv_buf->iova); kvfree(recv_buf->mkey_in); - free_recv_pages(&qp->recv_buf); + free_page_list(recv_buf->npages, recv_buf->page_list); } static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev, @@ -1370,24 +1359,24 @@ static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev, goto end; } + recv_buf->iova.dir = DMA_FROM_DEVICE; err = register_dma_pages(mdev, npages, recv_buf->page_list, - recv_buf->mkey_in); + recv_buf->mkey_in, &recv_buf->iova); if (err) goto err_register_dma; - err = create_mkey(mdev, npages, NULL, recv_buf->mkey_in, - &recv_buf->mkey); + err = 
create_mkey(mdev, npages, recv_buf->mkey_in, &recv_buf->mkey); if (err) goto err_create_mkey; return 0; err_create_mkey: - unregister_dma_pages(mdev, npages, recv_buf->mkey_in); + unregister_dma_pages(mdev, npages, recv_buf->mkey_in, &recv_buf->iova); err_register_dma: kvfree(recv_buf->mkey_in); end: - free_recv_pages(recv_buf); + free_page_list(npages, recv_buf->page_list); return err; } diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h index 815fcb54494d..3a046166d9f2 100644 --- a/drivers/vfio/pci/mlx5/cmd.h +++ b/drivers/vfio/pci/mlx5/cmd.h @@ -57,22 +57,17 @@ struct mlx5_vf_migration_header { }; struct mlx5_vhca_data_buffer { + struct dma_iova_attrs iova; struct page **page_list; - struct sg_append_table table; loff_t start_pos; u64 length; u32 npages; u32 mkey; u32 *mkey_in; - enum dma_data_direction dma_dir; u8 dmaed:1; u8 stop_copy_chunk_num; struct list_head buf_elm; struct mlx5_vf_migration_file *migf; - /* Optimize mlx5vf_get_migration_page() for sequential access */ - struct scatterlist *last_offset_sg; - unsigned int sg_last_entry; - unsigned long last_offset; }; struct mlx5vf_async_data { @@ -137,6 +132,7 @@ struct mlx5_vhca_cq { }; struct mlx5_vhca_recv_buf { + struct dma_iova_attrs iova; u32 npages; struct page **page_list; u32 next_rq_offset; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 7ffe24693a55..668c28bc429c 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -34,35 +34,10 @@ static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev) core_device); } -struct page * -mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, - unsigned long offset) +struct page *mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf, + unsigned long offset) { - unsigned long cur_offset = 0; - struct scatterlist *sg; - unsigned int i; - - /* All accesses are sequential */ - if (offset < buf->last_offset || !buf->last_offset_sg) { - buf->last_offset = 0; - 
buf->last_offset_sg = buf->table.sgt.sgl; - buf->sg_last_entry = 0; - } - - cur_offset = buf->last_offset; - - for_each_sg(buf->last_offset_sg, sg, - buf->table.sgt.orig_nents - buf->sg_last_entry, i) { - if (offset < sg->length + cur_offset) { - buf->last_offset_sg = sg; - buf->sg_last_entry += i; - buf->last_offset = cur_offset; - return nth_page(sg_page(sg), - (offset - cur_offset) / PAGE_SIZE); - } - cur_offset += sg->length; - } - return NULL; + return buf->page_list[offset / PAGE_SIZE]; } int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, @@ -72,13 +47,9 @@ int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, size_t old_size, new_size; struct page **page_list; unsigned long filled; - unsigned int to_fill; - int ret; - to_fill = min_t(unsigned int, npages, - PAGE_SIZE / sizeof(*buf->page_list)); old_size = buf->npages * sizeof(*buf->page_list); - new_size = old_size + to_fill * sizeof(*buf->page_list); + new_size = old_size + to_alloc * sizeof(*buf->page_list); page_list = kvrealloc(buf->page_list, old_size, new_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!page_list) @@ -87,22 +58,13 @@ int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, buf->page_list = page_list; do { - filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, + filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_alloc, buf->page_list + buf->npages); if (!filled) return -ENOMEM; to_alloc -= filled; - ret = sg_alloc_append_table_from_pages( - &buf->table, buf->page_list + buf->npages, filled, 0, - filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC, - GFP_KERNEL_ACCOUNT); - if (ret) - return ret; - buf->npages += filled; - to_fill = min_t(unsigned int, to_alloc, - PAGE_SIZE / sizeof(*buf->page_list)); } while (to_alloc > 0); return 0; @@ -164,7 +126,7 @@ static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf) struct mlx5_vf_migration_file *migf = vhca_buf->migf; if (vhca_buf->stop_copy_chunk_num) { - bool is_header = 
vhca_buf->dma_dir == DMA_NONE; + bool is_header = vhca_buf->iova.dir == DMA_NONE; u8 chunk_num = vhca_buf->stop_copy_chunk_num; size_t next_required_umem_size = 0; From patchwork Tue Mar 5 11:18:46 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13582215 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 202D95FB9D; Tue, 5 Mar 2024 11:19:52 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637593; cv=none; b=k1WkzQ6/oiggW1rS8slR5Yd/5eJs6MJMt6MmohqN4H2fJ1CeCOAZMEW4G3BHB5c9mr4rhCI689/cck5fTW91LVo7eJf1vbhD81N0lRIKGe+w1CZrWZnKjbtXCoDmmhMTqMPCUO3fBtEHDDH+TWOiv1mUvb8K47SI/bzYMsosxAI= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637593; c=relaxed/simple; bh=LoE4Mk2h+x4QygNOYAaQkErlcP1J9OkI1WKvPNtdRq4=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=bimzl/tNhIQxJMxi+p2dOyci8myzoJWdibLEEMCcSXvK5MpaDPW4FQ4yN+HLxpug4+1COEVpV/i58D4iItKKjocvw2YT6Ld8Ox0qyIajyV4OC173aULGd/naeuxHfbG2l8gvI3eKTT44oiMZqtz3+GPW9fXer5SDD4u2/NZln2Y= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=sE3ROVKU; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="sE3ROVKU" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 0A093C433C7; Tue, 5 Mar 2024 11:19:52 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709637592; bh=LoE4Mk2h+x4QygNOYAaQkErlcP1J9OkI1WKvPNtdRq4=; 
h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=sE3ROVKUx5nlUBmQ1UDC4LListp7vH8VXfLmm9fyCvv8NB4GmDIxkbggaQ4SxXnE0 mvNE5IdOjICDtawe/pkNhuWfWOzOpUOcrElvhYVanrDlwvm48luEZXTHbEFagcL5V6 9PIU6O93gbKt9Az+UNSvmMcWTf09Bbhsp9Guzd3l+1YIn/eXnyBzk37vfgDnODIwcO eQmXyU8tgclJJntRDMkAHcYVxrMFajV5C64PY/XvgQvQ8LDiJIYbBQ0z5sgP59/5jf U1XvMP/wgNcsfu1tDnnqWg7VxucsOm+cnxMiVPHer4OWILzUKVQBtCzGMihNBlkVJx GRjraaS+NWUHA== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Chaitanya Kulkarni , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Leon Romanovsky , Zhu Yanjun Subject: [RFC RESEND 15/16] block: add dma_link_range() based API Date: Tue, 5 Mar 2024 13:18:46 +0200 Message-ID: <1e52aa392b9c434f55203c9d630dd06fcdb75c32.1709635535.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-rdma@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Chaitanya Kulkarni Add two helper functions that are needed to calculate the total DMA length of the request blk_rq_get_dma_length() and to create DMA mapping blk_rq_dma_map(). blk_rq_get_dma_length() is used to get the total length of the request, when driver is allocating IOVA space for this request with the call to dma_alloc_iova(). 
This length is then initialized to the iova->size and passed to allocate iova call chain :- dma_map_ops->alloc_iova() iommu_dma_alloc_iova() alloc_iova_fast() iova_rcache_get() OR alloc_iova() blk_rq_dma_map() iterates through bvec list and creates DMA mapping for each page using iova parameter with the help of dma_link_range(). Note that @iova is allocated & pre-initialized using dma_alloc_iova() by the caller. After creating a mapping for each page, call into the callback function @cb provided by the driver with a mapped DMA address for this page, offset into the iova space (needed at the time of unlink), length of the mapped page, and page number that is mapped in this request. Driver is responsible for using this DMA address to complete the mapping of underlying protocol-specific data structures, such as NVMe PRPs or NVMe SGLs. This callback approach allows us to iterate bvec list only once to create bvec to DMA mapping and use that DMA address in driver to build the protocol-specific data structure, essentially mapping one bvec page at a time to DMA address and using that DMA address to create underlying protocol-specific data structures. Finally, the number of linked pages is returned.
Signed-off-by: Chaitanya Kulkarni Signed-off-by: Leon Romanovsky --- block/blk-merge.c | 156 +++++++++++++++++++++++++++++++++++++++++ include/linux/blk-mq.h | 9 +++ 2 files changed, 165 insertions(+) diff --git a/block/blk-merge.c b/block/blk-merge.c index 2d470cf2173e..63effc8ac1db 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -583,6 +583,162 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(__blk_rq_map_sg); +static dma_addr_t blk_dma_link_page(struct page *page, unsigned int page_offset, + struct dma_iova_attrs *iova, + dma_addr_t dma_offset) +{ + dma_addr_t dma_addr; + int ret; + + dma_addr = dma_link_range(page, page_offset, iova, dma_offset); + ret = dma_mapping_error(iova->dev, dma_addr); + if (ret) { + pr_err("dma_mapping_err %d dma_addr 0x%llx dma_offset %llu\n", + ret, dma_addr, dma_offset); + /* better way ? */ + dma_addr = 0; + } + return dma_addr; +} + +/** + * blk_rq_dma_map: block layer request to DMA mapping helper. + * + * @req : [in] request to be mapped + * @cb : [in] callback to be called for each bvec mapped bvec into + * underlaying driver. + * @cb_data : [in] callback data to be passed, privete to the underlaying + * driver. + * @iova : [in] iova to be used to create DMA mapping for this request's + * bvecs. + * Description: + * Iterates through bvec list and create dma mapping between each bvec page + * using @iova with dma_link_range(). Note that @iova needs to be allocated and + * pre-initialized using dma_alloc_iova() by the caller. After creating + * a mapping for each page, call into the callback function @cb provided by + * driver with mapped dma address for this bvec, offset into iova space, length + * of the mapped page, and bvec number that is mapped in this requets. Driver is + * responsible for using this dma address to complete the mapping of underlaying + * protocol specific data structure, such as NVMe PRPs or NVMe SGLs. 
This + * callback approach allows us to iterate bvec list only once to create bvec to + * DMA mapping & use that dma address in the driver to build the protocol + * specific data structure, essentially mapping one bvec page at a time to DMA + * address and use that DMA address to create underlaying protocol specific + * data structure. + * + * Caller needs to ensure @iova is initialized & allovated with using + * dma_alloc_iova(). + */ +int blk_rq_dma_map(struct request *req, driver_map_cb cb, void *cb_data, + struct dma_iova_attrs *iova) +{ + dma_addr_t curr_dma_offset = 0; + dma_addr_t prev_dma_addr = 0; + dma_addr_t dma_addr; + size_t prev_dma_len = 0; + struct req_iterator iter; + struct bio_vec bv; + int linked_cnt = 0; + + rq_for_each_bvec(bv, req, iter) { + if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) { + curr_dma_offset = prev_dma_addr + prev_dma_len; + + dma_addr = blk_dma_link_page(bv.bv_page, bv.bv_offset, + iova, curr_dma_offset); + if (!dma_addr) + break; + + cb(cb_data, linked_cnt, dma_addr, curr_dma_offset, + bv.bv_len); + + prev_dma_len = bv.bv_len; + prev_dma_addr = dma_addr; + linked_cnt++; + } else { + unsigned nbytes = bv.bv_len; + unsigned total = 0; + unsigned offset, len; + + while (nbytes > 0) { + struct page *page = bv.bv_page; + + offset = bv.bv_offset + total; + len = min(get_max_segment_size(&req->q->limits, + page, offset), + nbytes); + + page += (offset >> PAGE_SHIFT); + offset &= ~PAGE_MASK; + + curr_dma_offset = prev_dma_addr + prev_dma_len; + + dma_addr = blk_dma_link_page(page, offset, + iova, + curr_dma_offset); + if (!dma_addr) + break; + + cb(cb_data, linked_cnt, dma_addr, + curr_dma_offset, len); + + total += len; + nbytes -= len; + + prev_dma_len = len; + prev_dma_addr = dma_addr; + linked_cnt++; + } + } + } + return linked_cnt; +} +EXPORT_SYMBOL_GPL(blk_rq_dma_map); + +/* + * Calculate total DMA length needed to satisfy this request. 
+ */ +size_t blk_rq_get_dma_length(struct request *rq) +{ + struct request_queue *q = rq->q; + struct bio *bio = rq->bio; + unsigned int offset, len; + struct bvec_iter iter; + size_t dma_length = 0; + struct bio_vec bvec; + + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + return rq->special_vec.bv_len; + + if (!rq->bio) + return 0; + + for_each_bio(bio) { + bio_for_each_bvec(bvec, bio, iter) { + unsigned int nbytes = bvec.bv_len; + unsigned int total = 0; + + if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE) { + dma_length += bvec.bv_len; + continue; + } + + while (nbytes > 0) { + offset = bvec.bv_offset + total; + len = min(get_max_segment_size(&q->limits, + bvec.bv_page, + offset), nbytes); + total += len; + nbytes -= len; + dma_length += len; + } + } + } + + return dma_length; +} +EXPORT_SYMBOL(blk_rq_get_dma_length); + static inline unsigned int blk_rq_get_max_sectors(struct request *rq, sector_t offset) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 7a8150a5f051..80b9c7f2c3a0 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -8,6 +8,7 @@ #include #include #include +#include struct blk_mq_tags; struct blk_flush_queue; @@ -1144,7 +1145,15 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, return __blk_rq_map_sg(q, rq, sglist, &last_sg); } + +typedef void (*driver_map_cb)(void *cb_data, u32 cnt, dma_addr_t dma_addr, + dma_addr_t offset, u32 len); + +int blk_rq_dma_map(struct request *req, driver_map_cb cb, void *cb_data, + struct dma_iova_attrs *iova); + void blk_dump_rq_flags(struct request *, char *); +size_t blk_rq_get_dma_length(struct request *rq); #ifdef CONFIG_BLK_DEV_ZONED static inline unsigned int blk_rq_zone_no(struct request *rq) From patchwork Tue Mar 5 11:18:47 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Leon Romanovsky X-Patchwork-Id: 13582216 Received: from smtp.kernel.org 
(aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9B4E2604AD; Tue, 5 Mar 2024 11:19:56 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637596; cv=none; b=RP+rev7L99DO/qknzZpZnlyFS3T+fZfUYUKxeDKPlWqSUj8zrM8wsPoRC4TzGfkAoe68AUl564DwC7ue1NFSaeY89pLQEAmmZ1KctTeA8aRCBBUMvkJOp43olmt2vDRogiYHVej/7Fsjh3akZnbGEIqeO4mlxF8MoPnRnLFSlzc= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1709637596; c=relaxed/simple; bh=POBXmxmfWLR3OdaBK59TZEAySmTXPtp10vl7hcXrrqE=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=XsSK7GpPJNYmEb48xaAG97UC+ojZDU5aQ2fQ/AZrKlAGMp1Y3mzZK/v3zBjb7cWrLZkZh0zU6aPHsrwNK6Wi9Z+rAFtorIY8YJqK4hxhu8cWCOdHtJR04X7Qtci0uVKcXVzixPyQmcYJTaDZfQVZGwMc13/yJamvMbERaI4k+s0= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=iEe5O7nQ; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="iEe5O7nQ" Received: by smtp.kernel.org (Postfix) with ESMTPSA id B7783C433C7; Tue, 5 Mar 2024 11:19:55 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1709637596; bh=POBXmxmfWLR3OdaBK59TZEAySmTXPtp10vl7hcXrrqE=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=iEe5O7nQxOwTtvwCxloOxSNflsI8pgVht7KrqrUoyjvIA//bNfjdwpLXQhmnNUG+7 +sbnJaRnULrDsFLu3aamZmvRLTR6S3KfqMHZuD0nKRYPJJFTHRCFxqOTF7o2sTPcSO yoGUSEImScOHwafMNLeRscZ1fp3ftmKXzkLwOr/kuZN3JjfChMzNnqBzdBsJ7h8MjY x1tKITfCDc3osh+u2Tzd9qFJsXyI/OjxzZaneBb3C8m3vWQSnriFceEKlMm8OyXyos 2quqQv5Nrl5LbcS7FwTmH1rCeNKEDtQnRhtcxBY+E66dZVjlbi5hLqQU3C/3lNeUNl 
GKmbcMTVhgWYA== From: Leon Romanovsky To: Christoph Hellwig , Robin Murphy , Marek Szyprowski , Joerg Roedel , Will Deacon , Jason Gunthorpe , Chaitanya Kulkarni Cc: Chaitanya Kulkarni , Jonathan Corbet , Jens Axboe , Keith Busch , Sagi Grimberg , Yishai Hadas , Shameer Kolothum , Kevin Tian , Alex Williamson , =?utf-8?b?SsOpcsO0bWUgR2xp?= =?utf-8?b?c3Nl?= , Andrew Morton , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, linux-rdma@vger.kernel.org, iommu@lists.linux.dev, linux-nvme@lists.infradead.org, kvm@vger.kernel.org, linux-mm@kvack.org, Bart Van Assche , Damien Le Moal , Amir Goldstein , "josef@toxicpanda.com" , "Martin K. Petersen" , "daniel@iogearbox.net" , Dan Williams , "jack@suse.com" , Leon Romanovsky , Zhu Yanjun Subject: [RFC RESEND 16/16] nvme-pci: use blk_rq_dma_map() for NVMe SGL Date: Tue, 5 Mar 2024 13:18:47 +0200 Message-ID: <016fc02cbfa9be3c156a6f74df38def1e09c08f1.1709635535.git.leon@kernel.org> X-Mailer: git-send-email 2.44.0 In-Reply-To: References: Precedence: bulk X-Mailing-List: linux-rdma@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 From: Chaitanya Kulkarni Update nvme_iod structure to hold iova, list of DMA linked addresses and total linked count, first one is needed in the request submission path to create a request to DMA mapping and last two are needed in the request completion path to remove the DMA mapping. In nvme_map_data() initialize iova with device, direction, and iova dma length with the help of blk_rq_get_dma_length(). Allocate iova using dma_alloc_iova(). and call in nvme_pci_setup_sgls(). Call newly added blk_rq_dma_map() to create request to DMA mapping and provide a callback function nvme_pci_sgl_map(). In the callback function initialize NVMe SGL dma addresses. Finally in nvme_unmap_data() unlink the dma address and free iova. 
Full disclosure:- ----------------- This is an RFC to demonstrate the newly added DMA APIs can be used to map/unmap bvecs without the use of sg list, hence I've modified the pci code to only handle SGLs for now. Once we have some agreement on the structure of new DMA API I'll add support for PRPs along with all the optimization that I've removed from the code for this RFC for NVMe SGLs and PRPs. I was able to run fio verification job successfully :- $ fio fio/verify.fio --ioengine=io_uring --filename=/dev/nvme0n1 --loops=10 write-and-verify: (g=0): rw=randwrite, bs=(R) 8192B-8192B, (W) 8192B-8192B, (T) 8192B-8192B, ioengine=io_uring, iodepth=16 fio-3.36 Starting 1 process Jobs: 1 (f=1): [V(1)][81.6%][r=12.2MiB/s][r=1559 IOPS][eta 03m:00s] write-and-verify: (groupid=0, jobs=1): err= 0: pid=4435: Mon Mar 4 20:54:48 2024 read: IOPS=2789, BW=21.8MiB/s (22.9MB/s)(6473MiB/297008msec) slat (usec): min=4, max=5124, avg=356.51, stdev=604.30 clat (nsec): min=1593, max=23376k, avg=5377076.99, stdev=2039189.93 lat (usec): min=493, max=23407, avg=5733.58, stdev=2103.22 clat percentiles (usec): | 1.00th=[ 1172], 5.00th=[ 2114], 10.00th=[ 2835], 20.00th=[ 3654], | 30.00th=[ 4228], 40.00th=[ 4752], 50.00th=[ 5276], 60.00th=[ 5800], | 70.00th=[ 6325], 80.00th=[ 7046], 90.00th=[ 8094], 95.00th=[ 8979], | 99.00th=[10421], 99.50th=[11076], 99.90th=[12780], 99.95th=[14222], | 99.99th=[16909] write: IOPS=2608, BW=20.4MiB/s (21.4MB/s)(10.0GiB/502571msec); 0 zone resets slat (usec): min=4, max=5787, avg=382.68, stdev=649.01 clat (nsec): min=521, max=23650k, avg=5751363.17, stdev=2676065.35 lat (usec): min=95, max=23674, avg=6134.04, stdev=2813.48 clat percentiles (usec): | 1.00th=[ 709], 5.00th=[ 1270], 10.00th=[ 1958], 20.00th=[ 3261], | 30.00th=[ 4228], 40.00th=[ 5014], 50.00th=[ 5800], 60.00th=[ 6521], | 70.00th=[ 7373], 80.00th=[ 8225], 90.00th=[ 9241], 95.00th=[ 9896], | 99.00th=[11469], 99.50th=[11863], 99.90th=[13960], 99.95th=[15270], | 99.99th=[17695] bw ( KiB/s): min= 1440, 
max=132496, per=99.28%, avg=20715.88, stdev=13123.13, samples=1013 iops : min= 180, max=16562, avg=2589.34, stdev=1640.39, samples=1013 lat (nsec) : 750=0.01% lat (usec) : 2=0.01%, 4=0.01%, 100=0.01%, 250=0.01%, 500=0.07% lat (usec) : 750=0.79%, 1000=1.22% lat (msec) : 2=5.94%, 4=18.87%, 10=69.53%, 20=3.58%, 50=0.01% cpu : usr=1.01%, sys=98.95%, ctx=1591, majf=0, minf=2286 IO depths : 1=0.1%, 2=0.1%, 4=0.1%, 8=0.1%, 16=100.0%, 32=0.0%, >=64=0.0% submit : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0% complete : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.1%, 32=0.0%, 64=0.0%, >=64=0.0% issued rwts: total=828524,1310720,0,0 short=0,0,0,0 dropped=0,0,0,0 latency : target=0, window=0, percentile=100.00%, depth=16 Run status group 0 (all jobs): READ: bw=21.8MiB/s (22.9MB/s), 21.8MiB/s-21.8MiB/s (22.9MB/s-22.9MB/s), io=6473MiB (6787MB), run=297008-297008msec WRITE: bw=20.4MiB/s (21.4MB/s), 20.4MiB/s-20.4MiB/s (21.4MB/s-21.4MB/s), io=10.0GiB (10.7GB), run=502571-502571msec Disk stats (read/write): nvme0n1: ios=829189/1310720, sectors=13293416/20971520, merge=0/0, ticks=836561/1340351, in_queue=2176913, util=99.30% Signed-off-by: Chaitanya Kulkarni Signed-off-by: Leon Romanovsky --- drivers/nvme/host/pci.c | 220 +++++++++------------------------------- 1 file changed, 49 insertions(+), 171 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e6267a6aa380..140939228409 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -236,7 +236,9 @@ struct nvme_iod { unsigned int dma_len; /* length of single DMA segment mapping */ dma_addr_t first_dma; dma_addr_t meta_dma; - struct sg_table sgt; + struct dma_iova_attrs iova; + dma_addr_t dma_link_address[128]; + u16 nr_dma_link_address; union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS]; }; @@ -521,25 +523,10 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req, return true; } -static void nvme_free_prps(struct nvme_dev *dev, struct request *req) -{ - const 
int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1; - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - dma_addr_t dma_addr = iod->first_dma; - int i; - - for (i = 0; i < iod->nr_allocations; i++) { - __le64 *prp_list = iod->list[i].prp_list; - dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); - - dma_pool_free(dev->prp_page_pool, prp_list, dma_addr); - dma_addr = next_dma_addr; - } -} - static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + u16 i; if (iod->dma_len) { dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len, @@ -547,9 +534,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) return; } - WARN_ON_ONCE(!iod->sgt.nents); - - dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); + for (i = 0; i < iod->nr_dma_link_address; i++) + dma_unlink_range(&iod->iova, iod->dma_link_address[i]); if (iod->nr_allocations == 0) dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list, @@ -557,120 +543,15 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) else if (iod->nr_allocations == 1) dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list, iod->first_dma); - else - nvme_free_prps(dev, req); - mempool_free(iod->sgt.sgl, dev->iod_mempool); -} - -static void nvme_print_sgl(struct scatterlist *sgl, int nents) -{ - int i; - struct scatterlist *sg; - - for_each_sg(sgl, sg, nents, i) { - dma_addr_t phys = sg_phys(sg); - pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d " - "dma_address:%pad dma_length:%d\n", - i, &phys, sg->offset, sg->length, &sg_dma_address(sg), - sg_dma_len(sg)); - } -} - -static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, - struct request *req, struct nvme_rw_command *cmnd) -{ - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct dma_pool *pool; - int length = blk_rq_payload_bytes(req); - struct scatterlist *sg = iod->sgt.sgl; - int dma_len = sg_dma_len(sg); - u64 dma_addr = sg_dma_address(sg); - int offset = 
dma_addr & (NVME_CTRL_PAGE_SIZE - 1); - __le64 *prp_list; - dma_addr_t prp_dma; - int nprps, i; - - length -= (NVME_CTRL_PAGE_SIZE - offset); - if (length <= 0) { - iod->first_dma = 0; - goto done; - } - - dma_len -= (NVME_CTRL_PAGE_SIZE - offset); - if (dma_len) { - dma_addr += (NVME_CTRL_PAGE_SIZE - offset); - } else { - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - dma_len = sg_dma_len(sg); - } - - if (length <= NVME_CTRL_PAGE_SIZE) { - iod->first_dma = dma_addr; - goto done; - } - - nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); - if (nprps <= (256 / 8)) { - pool = dev->prp_small_pool; - iod->nr_allocations = 0; - } else { - pool = dev->prp_page_pool; - iod->nr_allocations = 1; - } - - prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); - if (!prp_list) { - iod->nr_allocations = -1; - return BLK_STS_RESOURCE; - } - iod->list[0].prp_list = prp_list; - iod->first_dma = prp_dma; - i = 0; - for (;;) { - if (i == NVME_CTRL_PAGE_SIZE >> 3) { - __le64 *old_prp_list = prp_list; - prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); - if (!prp_list) - goto free_prps; - iod->list[iod->nr_allocations++].prp_list = prp_list; - prp_list[0] = old_prp_list[i - 1]; - old_prp_list[i - 1] = cpu_to_le64(prp_dma); - i = 1; - } - prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= NVME_CTRL_PAGE_SIZE; - dma_addr += NVME_CTRL_PAGE_SIZE; - length -= NVME_CTRL_PAGE_SIZE; - if (length <= 0) - break; - if (dma_len > 0) - continue; - if (unlikely(dma_len < 0)) - goto bad_sgl; - sg = sg_next(sg); - dma_addr = sg_dma_address(sg); - dma_len = sg_dma_len(sg); - } -done: - cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sgt.sgl)); - cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); - return BLK_STS_OK; -free_prps: - nvme_free_prps(dev, req); - return BLK_STS_RESOURCE; -bad_sgl: - WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents), - "Invalid SGL for payload:%d nents:%d\n", - blk_rq_payload_bytes(req), iod->sgt.nents); - return BLK_STS_IOERR; + dma_free_iova(&iod->iova); 
} static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, - struct scatterlist *sg) + dma_addr_t dma_addr, + unsigned int dma_len) { - sge->addr = cpu_to_le64(sg_dma_address(sg)); - sge->length = cpu_to_le32(sg_dma_len(sg)); + sge->addr = cpu_to_le64(dma_addr); + sge->length = cpu_to_le32(dma_len); sge->type = NVME_SGL_FMT_DATA_DESC << 4; } @@ -682,25 +563,37 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; } +struct nvme_pci_sgl_map_data { + struct nvme_iod *iod; + struct nvme_sgl_desc *sgl_list; +}; + +static void nvme_pci_sgl_map(void *data, u32 cnt, dma_addr_t dma_addr, + dma_addr_t offset, u32 len) +{ + struct nvme_pci_sgl_map_data *d = data; + struct nvme_sgl_desc *sgl_list = d->sgl_list; + struct nvme_iod *iod = d->iod; + + nvme_pci_sgl_set_data(&sgl_list[cnt], dma_addr, len); + iod->dma_link_address[cnt] = offset; + iod->nr_dma_link_address++; +} + static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, struct request *req, struct nvme_rw_command *cmd) { + unsigned int entries = blk_rq_nr_phys_segments(req); struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct dma_pool *pool; struct nvme_sgl_desc *sg_list; - struct scatterlist *sg = iod->sgt.sgl; - unsigned int entries = iod->sgt.nents; + struct dma_pool *pool; dma_addr_t sgl_dma; - int i = 0; + int linked_count; + struct nvme_pci_sgl_map_data data; /* setting the transfer type as SGL */ cmd->flags = NVME_CMD_SGL_METABUF; - if (entries == 1) { - nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg); - return BLK_STS_OK; - } - if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { pool = dev->prp_small_pool; iod->nr_allocations = 0; @@ -718,11 +611,13 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, iod->list[0].sg_list = sg_list; iod->first_dma = sgl_dma; - nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); - do { - nvme_pci_sgl_set_data(&sg_list[i++], sg); - sg = sg_next(sg); - } while (--entries > 0); + data.iod = iod; + 
data.sgl_list = sg_list; + + linked_count = blk_rq_dma_map(req, nvme_pci_sgl_map, &data, + &iod->iova); + + nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, linked_count); return BLK_STS_OK; } @@ -788,36 +683,20 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, &cmnd->rw, &bv); } } - - iod->dma_len = 0; - iod->sgt.sgl = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); - if (!iod->sgt.sgl) + iod->iova.dev = dev->dev; + iod->iova.dir = rq_dma_dir(req); + iod->iova.attrs = DMA_ATTR_NO_WARN; + iod->iova.size = blk_rq_get_dma_length(req); + if (!iod->iova.size) return BLK_STS_RESOURCE; - sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req)); - iod->sgt.orig_nents = blk_rq_map_sg(req->q, req, iod->sgt.sgl); - if (!iod->sgt.orig_nents) - goto out_free_sg; - rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), - DMA_ATTR_NO_WARN); - if (rc) { - if (rc == -EREMOTEIO) - ret = BLK_STS_TARGET; - goto out_free_sg; - } + rc = dma_alloc_iova(&iod->iova); + if (rc) + return BLK_STS_RESOURCE; - if (nvme_pci_use_sgls(dev, req, iod->sgt.nents)) - ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); - else - ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); - if (ret != BLK_STS_OK) - goto out_unmap_sg; - return BLK_STS_OK; + iod->dma_len = 0; -out_unmap_sg: - dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); -out_free_sg: - mempool_free(iod->sgt.sgl, dev->iod_mempool); + ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); return ret; } @@ -841,7 +720,6 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) iod->aborted = false; iod->nr_allocations = -1; - iod->sgt.nents = 0; ret = nvme_setup_cmd(req->q->queuedata, req); if (ret)