Message ID | 20230327121317.4081816-21-arnd@kernel.org (mailing list archive) |
---|---|
State | Not Applicable |
Headers | show |
Series | dma-mapping: unify support for cache flushes | expand |
On 2023-03-27 13:13, Arnd Bergmann wrote: > From: Arnd Bergmann <arnd@arndb.de> > > The arm version of the arch_sync_dma_for_cpu() function annotates pages as > PG_dcache_clean after a DMA, but no other architecture does this here. On > ia64, the same thing is done in arch_sync_dma_for_cpu(), so it makes sense > to use the same hook in order to have identical arch_sync_dma_for_cpu() > semantics as all other architectures. > > Splitting this out has multiple effects: > > - for dma-direct, this now gets called after arch_sync_dma_for_cpu() > for DMA_FROM_DEVICE mappings, but not for DMA_BIDIRECTIONAL. While > it would not be harmful to keep doing it for bidirectional mappings, > those are apparently not used in any callers that care about the flag. > > - Since arm has its own dma-iommu abstraction, this now also needs to > call the same function, so the calls are added there to mirror the > dma-direct version. > > - Like dma-direct, the dma-iommu version now marks the dcache clean > for both coherent and noncoherent devices after a DMA, but it only > does this for DMA_FROM_DEVICE, not DMA_BIDIRECTIONAL. > > [ HELP NEEDED: can anyone confirm that it is a correct assumption > on arm that a cache-coherent device writing to a page always results > in it being in a PG_dcache_clean state like on ia64, or can a device > write directly into the dcache?] In AMBA at least, if a snooping write hits in a cache then the data is most likely going to get routed directly into that cache. If it has write-back write-allocate attributes it could also land in any cache along its normal path to RAM; it wouldn't have to go all the way. Hence all the fun we have where treating a coherent device as non-coherent can still be almost as broken as the other way round :) Cheers, Robin. > Signed-off-by: Arnd Bergmann <arnd@arndb.de> > --- > arch/arm/Kconfig | 1 + > arch/arm/mm/dma-mapping.c | 71 +++++++++++++++++++++++---------------- > 2 files changed, 43 insertions(+), 29 deletions(-) > > diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig > index e24a9820e12f..125d58c54ab1 100644 > --- a/arch/arm/Kconfig > +++ b/arch/arm/Kconfig > @@ -7,6 +7,7 @@ config ARM > select ARCH_HAS_BINFMT_FLAT > select ARCH_HAS_CURRENT_STACK_POINTER > select ARCH_HAS_DEBUG_VIRTUAL if MMU > + select ARCH_HAS_DMA_MARK_CLEAN if MMU > select ARCH_HAS_DMA_WRITE_COMBINE if !ARM_DMA_MEM_BUFFERABLE > select ARCH_HAS_ELF_RANDOMIZE > select ARCH_HAS_FORTIFY_SOURCE > diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c > index cc702cb27ae7..b703cb83d27e 100644 > --- a/arch/arm/mm/dma-mapping.c > +++ b/arch/arm/mm/dma-mapping.c > @@ -665,6 +665,28 @@ static void dma_cache_maint(phys_addr_t paddr, > } while (left); > } > > +/* > + * Mark the D-cache clean for these pages to avoid extra flushing. > + */ > +void arch_dma_mark_clean(phys_addr_t paddr, size_t size) > +{ > + unsigned long pfn = PFN_UP(paddr); > + unsigned long off = paddr & (PAGE_SIZE - 1); > + size_t left = size; > + > + if (size < PAGE_SIZE) > + return; > + > + if (off) > + left -= PAGE_SIZE - off; > + > + while (left >= PAGE_SIZE) { > + struct page *page = pfn_to_page(pfn++); > + set_bit(PG_dcache_clean, &page->flags); > + left -= PAGE_SIZE; > + } > +} > + > static bool arch_sync_dma_cpu_needs_post_dma_flush(void) > { > if (IS_ENABLED(CONFIG_CPU_V6) || > @@ -715,24 +737,6 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, > outer_inv_range(paddr, paddr + size); > dma_cache_maint(paddr, size, dmac_inv_range); > } > - > - /* > - * Mark the D-cache clean for these pages to avoid extra flushing. > - */ > - if (dir != DMA_TO_DEVICE && size >= PAGE_SIZE) { > - unsigned long pfn = PFN_UP(paddr); > - unsigned long off = paddr & (PAGE_SIZE - 1); > - size_t left = size; > - > - if (off) > - left -= PAGE_SIZE - off; > - > - while (left >= PAGE_SIZE) { > - struct page *page = pfn_to_page(pfn++); > - set_bit(PG_dcache_clean, &page->flags); > - left -= PAGE_SIZE; > - } > - } > } > > #ifdef CONFIG_ARM_DMA_USE_IOMMU > @@ -1294,6 +1298,17 @@ static int arm_iommu_map_sg(struct device *dev, struct scatterlist *sg, > return -EINVAL; > } > > +static void arm_iommu_sync_dma_for_cpu(phys_addr_t phys, size_t len, > + enum dma_data_direction dir, > + bool dma_coherent) > +{ > + if (!dma_coherent) > + arch_sync_dma_for_cpu(phys, s->length, dir); > + > + if (dir == DMA_FROM_DEVICE) > + arch_dma_mark_clean(phys, s->length); > +} > + > /** > * arm_iommu_unmap_sg - unmap a set of SG buffers mapped by dma_map_sg > * @dev: valid struct device pointer > @@ -1316,8 +1331,9 @@ static void arm_iommu_unmap_sg(struct device *dev, > if (sg_dma_len(s)) > __iommu_remove_mapping(dev, sg_dma_address(s), > sg_dma_len(s)); > - if (!dev->dma_coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) > - arch_sync_dma_for_cpu(sg_phys(s), s->length, dir); > + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) > + arm_iommu_sync_dma_for_cpu(sg_phys(s), s->length, dir, > + dev->dma_coherent); > } > } > > @@ -1335,12 +1351,9 @@ static void arm_iommu_sync_sg_for_cpu(struct device *dev, > struct scatterlist *s; > int i; > > - if (dev->dma_coherent) > - return; > - > for_each_sg(sg, s, nents, i) > - arch_sync_dma_for_cpu(sg_phys(s), s->length, dir); > - > + arm_iommu_sync_dma_for_cpu(sg_phys(s), s->length, dir, > + dev->dma_coherent); > } > > /** > @@ -1425,9 +1438,9 @@ static void arm_iommu_unmap_page(struct device *dev, dma_addr_t handle, > if (!iova) > return; > > - if (!dev->dma_coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { > + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) > phys = iommu_iova_to_phys(mapping->domain, handle); > - arch_sync_dma_for_cpu(phys, size, dir); > + arm_iommu_sync_dma_for_cpu(phys, size, dir, dev->dma_coherent); > } > > iommu_unmap(mapping->domain, iova, len); > @@ -1497,11 +1510,11 @@ static void arm_iommu_sync_single_for_cpu(struct device *dev, > struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(dev); > phys_addr_t phys; > > - if (dev->dma_coherent || !(handle & PAGE_MASK)) > + if (!(handle & PAGE_MASK)) > return; > > phys = iommu_iova_to_phys(mapping->domain, handle); > - arch_sync_dma_for_cpu(phys, size, dir); > + arm_iommu_sync_dma_for_cpu(phys, size, dir, dev->dma_coherent); > } > > static void arm_iommu_sync_single_for_device(struct device *dev,
On Mon, Mar 27, 2023 at 02:13:16PM +0200, Arnd Bergmann wrote: > From: Arnd Bergmann <arnd@arndb.de> > > The arm version of the arch_sync_dma_for_cpu() function annotates pages as > PG_dcache_clean after a DMA, but no other architecture does this here. ... because this is an arm32 specific feature. Generically, it's PG_arch_1, which is a page flag free for architecture use. On arm32 we decided to use this to mark whether we can skip dcache writebacks when establishing a PTE - and thus it was decided to call it PG_dcache_clean to reflect how arm32 decided to use that bit. This isn't just a DMA thing, there are other places that we update the bit, such as flush_dcache_page() and copy_user_highpage(). So thinking that the arm32 PG_dcache_clean is something for DMA is actually wrong. Other architectures are free to do their own other optimisations using that bit, and their implementations may be DMA-centric.
On Mon, Mar 27, 2023, at 14:48, Robin Murphy wrote: > On 2023-03-27 13:13, Arnd Bergmann wrote: >> >> [ HELP NEEDED: can anyone confirm that it is a correct assumption >> on arm that a cache-coherent device writing to a page always results >> in it being in a PG_dcache_clean state like on ia64, or can a device >> write directly into the dcache?] > > In AMBA at least, if a snooping write hits in a cache then the data is > most likely going to get routed directly into that cache. If it has > write-back write-allocate attributes it could also land in any cache > along its normal path to RAM; it wouldn't have to go all the way. > > Hence all the fun we have where treating a coherent device as > non-coherent can still be almost as broken as the other way round :) Ok, thanks for the information. I'm still not sure whether this can result in the situation where PG_dcache_clean is wrong though. Specifically, the question is whether a DMA to a coherent buffer can end up in a dirty L1 dcache of one core and require to write back the dcache before invalidating the icache for that page. On ia64, this is not the case, the optimization here is to only flush the icache after a coherent DMA into an executable user page, while Arm only does this for noncoherent DMA but not coherent DMA. From your explanation it sounds like this might happen, even though that would mean that "coherent" DMA is slightly less coherent than it is elsewhere. To be on the safe side, I'd have to pass a flag into arch_dma_mark_clean() about coherency, to let the arm implementation still require the extra dcache flush for coherent DMA, while ia64 can ignore that flag. Arnd
On Mon, Mar 27, 2023, at 17:01, Russell King (Oracle) wrote: > On Mon, Mar 27, 2023 at 02:13:16PM +0200, Arnd Bergmann wrote: >> From: Arnd Bergmann <arnd@arndb.de> >> >> The arm version of the arch_sync_dma_for_cpu() function annotates pages as >> PG_dcache_clean after a DMA, but no other architecture does this here. > > ... because this is an arm32 specific feature. Generically, it's > PG_arch_1, which is a page flag free for architecture use. On arm32 > we decided to use this to mark whether we can skip dcache writebacks > when establishing a PTE - and thus it was decided to call it > PG_dcache_clean to reflect how arm32 decided to use that bit. > > This isn't just a DMA thing, there are other places that we update > the bit, such as flush_dcache_page() and copy_user_highpage(). > > So thinking that the arm32 PG_dcache_clean is something for DMA is > actually wrong. > > Other architectures are free to do their own other optimisations > using that bit, and their implementations may be DMA-centric. The flag is used the same way on most architectures, though some use the opposite polarity and call it PG_dcache_dirty. The only other architecture that uses it for DMA is ia64, with the difference being that this also marks the page as clean even for coherent DMA, not just when doing a flush as part of noncoherent DMA. Based on Robin's reply it sounds that this is not a valid assumption on Arm, if a coherent DMA can target a dirty dcache line without cleaning it. Arnd
On 31/03/2023 3:00 pm, Arnd Bergmann wrote: > On Mon, Mar 27, 2023, at 14:48, Robin Murphy wrote: >> On 2023-03-27 13:13, Arnd Bergmann wrote: >>> >>> [ HELP NEEDED: can anyone confirm that it is a correct assumption >>> on arm that a cache-coherent device writing to a page always results >>> in it being in a PG_dcache_clean state like on ia64, or can a device >>> write directly into the dcache?] >> >> In AMBA at least, if a snooping write hits in a cache then the data is >> most likely going to get routed directly into that cache. If it has >> write-back write-allocate attributes it could also land in any cache >> along its normal path to RAM; it wouldn't have to go all the way. >> >> Hence all the fun we have where treating a coherent device as >> non-coherent can still be almost as broken as the other way round :) > > Ok, thanks for the information. I'm still not sure whether this can > result in the situation where PG_dcache_clean is wrong though. > > Specifically, the question is whether a DMA to a coherent buffer > can end up in a dirty L1 dcache of one core and require to write > back the dcache before invalidating the icache for that page. > > On ia64, this is not the case, the optimization here is to > only flush the icache after a coherent DMA into an executable > user page, while Arm only does this for noncoherent DMA but not > coherent DMA. > > From your explanation it sounds like this might happen, > even though that would mean that "coherent" DMA is slightly > less coherent than it is elsewhere. > > To be on the safe side, I'd have to pass a flag into > arch_dma_mark_clean() about coherency, to let the arm > implementation still require the extra dcache flush > for coherent DMA, while ia64 can ignore that flag. Coherent DMA on Arm is assumed to be inner-shareable, so a coherent DMA write should be pretty much equivalent to a coherent write by another CPU (or indeed the local CPU itself) - nothing says that it *couldn't* dirty a line in a data cache above the level of unification, so in general the assumption must be that, yes, if coherent DMA is writing data intended to be executable, then it's going to want a Dcache clean to PoU and an Icache invalidate to PoU before trying to execute it. By comparison, a non-coherent DMA transfer will inherently have to invalidate the Dcache all the way to PoC in its dma_unmap, thus cannot leave dirty data above the PoU, so only the Icache maintenance is required in the executable case. (FWIW I believe the Armv8 IDC/DIC features can safely be considered irrelevant to 32-bit kernels) I don't know a great deal about IA-64, but it appears to be using its PG_arch_1 flag in a subtly different manner to Arm, namely to optimise out the *Icache* maintenance. So if anything, it seems IA-64 is the weirdo here (who'd have guessed?) where DMA manages to be *more* coherent than the CPUs themselves :) This is all now making me think we need some careful consideration of whether the benefits of consolidating code outweigh the confusion of conflating multiple different meanings of "clean" together... Thanks, Robin.
On Fri, Mar 31, 2023 at 04:06:37PM +0200, Arnd Bergmann wrote: > On Mon, Mar 27, 2023, at 17:01, Russell King (Oracle) wrote: > > On Mon, Mar 27, 2023 at 02:13:16PM +0200, Arnd Bergmann wrote: > >> From: Arnd Bergmann <arnd@arndb.de> > >> > >> The arm version of the arch_sync_dma_for_cpu() function annotates pages as > >> PG_dcache_clean after a DMA, but no other architecture does this here. > > > > ... because this is an arm32 specific feature. Generically, it's > > PG_arch_1, which is a page flag free for architecture use. On arm32 > > we decided to use this to mark whether we can skip dcache writebacks > > when establishing a PTE - and thus it was decided to call it > > PG_dcache_clean to reflect how arm32 decided to use that bit. > > > > This isn't just a DMA thing, there are other places that we update > > the bit, such as flush_dcache_page() and copy_user_highpage(). > > > > So thinking that the arm32 PG_dcache_clean is something for DMA is > > actually wrong. > > > > Other architectures are free to do their own other optimisations > > using that bit, and their implementations may be DMA-centric. > > The flag is used the same way on most architectures, though some > use the opposite polarity and call it PG_dcache_dirty. The only > other architecture that uses it for DMA is ia64, with the difference > being that this also marks the page as clean even for coherent > DMA, not just when doing a flush as part of noncoherent DMA. > > Based on Robin's reply it sounds that this is not a valid assumption > on Arm, if a coherent DMA can target a dirty dcache line without > cleaning it. The other thing to note here is that PG_dcache_clean doesn't have much meaning on modern CPUs with PIPT caches. For these, cache_is_vipt_nonaliasing() will be true, and cache_ops_need_broadcast() will be false. Firstly, if we're using coherent DMA, then PG_dcache_clean is intentionally not touched, because the data cache isn't cleaned in any way by DMA operations. flush_dcache_page() turns into a no-op apart from clearing PG_dcache_clean if it was set. __sync_icache_dcache() will do nothing for non-executable pages, but will write-back a page that isn't marked PG_dcache_clean to ensure that it is visible to the instruction stream. This is only used to ensure that a the instructions are visible to a newly established executable mapping when e.g. the page has been DMA'd in. The default state of PG_dcache_clean is zero on any new allocation, so this has the effect of causing any executable page to be flushed such that the instruction stream can see the instructions, but only for the first establishment of the mapping. That means that e.g. libc text pages don't keep getting flushed on the start of every program. update_mmu_cache() isn't compiled, so it's use of PG_dcache_clean is irrelevant. v6_copy_user_highpage_aliasing() won't be called because we're not using an aliasing cache. So, for modern ARM systems with DMA-coherent PG_dcache_clean only serves for the __sync_icache_dcache() optimisation. ARMs use of this remains valid in this circumstance.
On Fri, Mar 31, 2023, at 17:12, Robin Murphy wrote: > On 31/03/2023 3:00 pm, Arnd Bergmann wrote: >> On Mon, Mar 27, 2023, at 14:48, Robin Murphy wrote: >> >> To be on the safe side, I'd have to pass a flag into >> arch_dma_mark_clean() about coherency, to let the arm >> implementation still require the extra dcache flush >> for coherent DMA, while ia64 can ignore that flag. > > Coherent DMA on Arm is assumed to be inner-shareable, so a coherent DMA > write should be pretty much equivalent to a coherent write by another > CPU (or indeed the local CPU itself) - nothing says that it *couldn't* > dirty a line in a data cache above the level of unification, so in > general the assumption must be that, yes, if coherent DMA is writing > data intended to be executable, then it's going to want a Dcache clean > to PoU and an Icache invalidate to PoU before trying to execute it. By > comparison, a non-coherent DMA transfer will inherently have to > invalidate the Dcache all the way to PoC in its dma_unmap, thus cannot > leave dirty data above the PoU, so only the Icache maintenance is > required in the executable case. Ok, makes sense. I've already started reworking my patch for it. > (FWIW I believe the Armv8 IDC/DIC features can safely be considered > irrelevant to 32-bit kernels) > > I don't know a great deal about IA-64, but it appears to be using its > PG_arch_1 flag in a subtly different manner to Arm, namely to optimise > out the *Icache* maintenance. So if anything, it seems IA-64 is the > weirdo here (who'd have guessed?) where DMA manages to be *more* > coherent than the CPUs themselves :) I checked this in the ia64 manual, and as far as I can tell, it originally only had one cacheflush instruction that flushes the dcache and invalidates the icache at the same time. So flush_icache_range() actually does both and flush_dcache_page() instead just marks the page as dirty to ensure flush_icache_range() does not get skipped after a writing a page from the kernel. On later Itaniums, there is apparently a separate icache flush instruction that gets used in flush_icache_range(), but that still works for the DMA case that is allowed to skip the flush. > This is all now making me think we need some careful consideration of > whether the benefits of consolidating code outweigh the confusion of > conflating multiple different meanings of "clean" together... The difference in usage of PG_dcache_clean/PG_dcache_dirty/PG_arch_1 across architectures is certainly big enough that we can't just define a a common arch_dma_mark_clean() across architectures, but I think the idea of having a common entry point for arch_dma_mark_clean() to be called from the dma-mapping code to do something architecture specific after a DMA is clean still makes sense, Arnd
Hi Arnd, On Mon, Mar 27, 2023 at 2:16 PM Arnd Bergmann <arnd@kernel.org> wrote: > From: Arnd Bergmann <arnd@arndb.de> > > The arm version of the arch_sync_dma_for_cpu() function annotates pages as > PG_dcache_clean after a DMA, but no other architecture does this here. On > ia64, the same thing is done in arch_sync_dma_for_cpu(), so it makes sense > to use the same hook in order to have identical arch_sync_dma_for_cpu() > semantics as all other architectures. > > Splitting this out has multiple effects: > > - for dma-direct, this now gets called after arch_sync_dma_for_cpu() > for DMA_FROM_DEVICE mappings, but not for DMA_BIDIRECTIONAL. While > it would not be harmful to keep doing it for bidirectional mappings, > those are apparently not used in any callers that care about the flag. > > - Since arm has its own dma-iommu abstraction, this now also needs to > call the same function, so the calls are added there to mirror the > dma-direct version. > > - Like dma-direct, the dma-iommu version now marks the dcache clean > for both coherent and noncoherent devices after a DMA, but it only > does this for DMA_FROM_DEVICE, not DMA_BIDIRECTIONAL. > > [ HELP NEEDED: can anyone confirm that it is a correct assumption > on arm that a cache-coherent device writing to a page always results > in it being in a PG_dcache_clean state like on ia64, or can a device > write directly into the dcache?] > > Signed-off-by: Arnd Bergmann <arnd@arndb.de> Thanks for your patch, which is now commit 322dbe898f82fd8a ("ARM: dma-mapping: split out arch_dma_mark_clean() helper") in esmil/jh7100-dmapool. If CONFIG_ARM_DMA_USE_IOMMU=y, the build fails. > --- a/arch/arm/mm/dma-mapping.c > +++ b/arch/arm/mm/dma-mapping.c > @@ -1294,6 +1298,17 @@ static int arm_iommu_map_sg(struct device *dev, struct scatterlist *sg, > return -EINVAL; > } > > +static void arm_iommu_sync_dma_for_cpu(phys_addr_t phys, size_t len, > + enum dma_data_direction dir, > + bool dma_coherent) > +{ > + if (!dma_coherent) > + arch_sync_dma_for_cpu(phys, s->length, dir); s/s->length/len/ > + > + if (dir == DMA_FROM_DEVICE) > + arch_dma_mark_clean(phys, s->length); Likewise. > +} > + > /** > * arm_iommu_unmap_sg - unmap a set of SG buffers mapped by dma_map_sg > * @dev: valid struct device pointer > @@ -1425,9 +1438,9 @@ static void arm_iommu_unmap_page(struct device *dev, dma_addr_t handle, > if (!iova) > return; > > - if (!dev->dma_coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { > + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) Missing opening curly brace. > phys = iommu_iova_to_phys(mapping->domain, handle); > - arch_sync_dma_for_cpu(phys, size, dir); > + arm_iommu_sync_dma_for_cpu(phys, size, dir, dev->dma_coherent); > } > > iommu_unmap(mapping->domain, iova, len); With the above fixed, it builds and boots fine (on R-Car M2-W). Gr{oetje,eeting}s, Geert
> Thanks for your patch, which is now commit 322dbe898f82fd8a > ("ARM: dma-mapping: split out arch_dma_mark_clean() helper") in > esmil/jh7100-dmapool. Well, something is wrong with that branch then, and this series still needs more work, and should eventually be merged through the dma-mapping tree.
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index e24a9820e12f..125d58c54ab1 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -7,6 +7,7 @@ config ARM select ARCH_HAS_BINFMT_FLAT select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL if MMU + select ARCH_HAS_DMA_MARK_CLEAN if MMU select ARCH_HAS_DMA_WRITE_COMBINE if !ARM_DMA_MEM_BUFFERABLE select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index cc702cb27ae7..b703cb83d27e 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -665,6 +665,28 @@ static void dma_cache_maint(phys_addr_t paddr, } while (left); } +/* + * Mark the D-cache clean for these pages to avoid extra flushing. + */ +void arch_dma_mark_clean(phys_addr_t paddr, size_t size) +{ + unsigned long pfn = PFN_UP(paddr); + unsigned long off = paddr & (PAGE_SIZE - 1); + size_t left = size; + + if (size < PAGE_SIZE) + return; + + if (off) + left -= PAGE_SIZE - off; + + while (left >= PAGE_SIZE) { + struct page *page = pfn_to_page(pfn++); + set_bit(PG_dcache_clean, &page->flags); + left -= PAGE_SIZE; + } +} + static bool arch_sync_dma_cpu_needs_post_dma_flush(void) { if (IS_ENABLED(CONFIG_CPU_V6) || @@ -715,24 +737,6 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, outer_inv_range(paddr, paddr + size); dma_cache_maint(paddr, size, dmac_inv_range); } - - /* - * Mark the D-cache clean for these pages to avoid extra flushing. - */ - if (dir != DMA_TO_DEVICE && size >= PAGE_SIZE) { - unsigned long pfn = PFN_UP(paddr); - unsigned long off = paddr & (PAGE_SIZE - 1); - size_t left = size; - - if (off) - left -= PAGE_SIZE - off; - - while (left >= PAGE_SIZE) { - struct page *page = pfn_to_page(pfn++); - set_bit(PG_dcache_clean, &page->flags); - left -= PAGE_SIZE; - } - } } #ifdef CONFIG_ARM_DMA_USE_IOMMU @@ -1294,6 +1298,17 @@ static int arm_iommu_map_sg(struct device *dev, struct scatterlist *sg, return -EINVAL; } +static void arm_iommu_sync_dma_for_cpu(phys_addr_t phys, size_t len, + enum dma_data_direction dir, + bool dma_coherent) +{ + if (!dma_coherent) + arch_sync_dma_for_cpu(phys, s->length, dir); + + if (dir == DMA_FROM_DEVICE) + arch_dma_mark_clean(phys, s->length); +} + /** * arm_iommu_unmap_sg - unmap a set of SG buffers mapped by dma_map_sg * @dev: valid struct device pointer @@ -1316,8 +1331,9 @@ static void arm_iommu_unmap_sg(struct device *dev, if (sg_dma_len(s)) __iommu_remove_mapping(dev, sg_dma_address(s), sg_dma_len(s)); - if (!dev->dma_coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) - arch_sync_dma_for_cpu(sg_phys(s), s->length, dir); + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arm_iommu_sync_dma_for_cpu(sg_phys(s), s->length, dir, + dev->dma_coherent); } } @@ -1335,12 +1351,9 @@ static void arm_iommu_sync_sg_for_cpu(struct device *dev, struct scatterlist *s; int i; - if (dev->dma_coherent) - return; - for_each_sg(sg, s, nents, i) - arch_sync_dma_for_cpu(sg_phys(s), s->length, dir); - + arm_iommu_sync_dma_for_cpu(sg_phys(s), s->length, dir, + dev->dma_coherent); } /** @@ -1425,9 +1438,9 @@ static void arm_iommu_unmap_page(struct device *dev, dma_addr_t handle, if (!iova) return; - if (!dev->dma_coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC)) phys = iommu_iova_to_phys(mapping->domain, handle); - arch_sync_dma_for_cpu(phys, size, dir); + arm_iommu_sync_dma_for_cpu(phys, size, dir, dev->dma_coherent); } iommu_unmap(mapping->domain, iova, len); @@ -1497,11 +1510,11 @@ static void arm_iommu_sync_single_for_cpu(struct device *dev, struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(dev); phys_addr_t phys; - if (dev->dma_coherent || !(handle & PAGE_MASK)) + if (!(handle & PAGE_MASK)) return; phys = iommu_iova_to_phys(mapping->domain, handle); - arch_sync_dma_for_cpu(phys, size, dir); + arm_iommu_sync_dma_for_cpu(phys, size, dir, dev->dma_coherent); } static void arm_iommu_sync_single_for_device(struct device *dev,