diff mbox

[RFC,v3,3/5] ARM: NOMMU: Introduce dma operations for noMMU

Message ID 1483969669-4636-4-git-send-email-vladimir.murzin@arm.com
State New, archived
Headers show

Commit Message

Vladimir Murzin Jan. 9, 2017, 1:47 p.m. UTC
R/M classes of cpus can have memory covered by MPU which in turn might
configure RAM as Normal i.e. bufferable and cacheable. This breaks
dma_alloc_coherent() and friends, since data can now get stuck in
caches or be buffered.

This patch factors out DMA support for NOMMU configuration into a
separate entity which provides dedicated dma_ops. We have to handle
several cases there:
- configurations with MMU/MPU setup
- configurations without MMU/MPU setup
- special case for M-class, since caches and MPU there are optional

In general we rely on the default DMA area for coherent allocations
and/or per-device memory reserves suitable for coherent DMA, so if such
regions are set, coherent allocations come from there.

In case MMU/MPU was not set up, we fall back to the normal page
allocator for DMA memory allocation.

In case we run M-class cpus, for configurations without cache support
(like Cortex-M3/M4) dma operations are forced to be coherent and wired
with dma-noop (such decision is made based on the cacheid global
variable); however, if caches are detected there and no DMA coherent
region is given (either default or per-device), dma is disallowed even
if the MPU is not set up - this is because M-class implements a system
memory map which defines part of the address space as Normal memory.

Reported-by: Alexandre Torgue <alexandre.torgue@st.com>
Reported-by: Andras Szemzo <sza@esh.hu>
Signed-off-by: Vladimir Murzin <vladimir.murzin@arm.com>
---
 arch/arm/include/asm/dma-mapping.h |   3 +-
 arch/arm/mm/Makefile               |   5 +-
 arch/arm/mm/dma-mapping-nommu.c    | 252 +++++++++++++++++++++++++++++++++++++
 3 files changed, 256 insertions(+), 4 deletions(-)
 create mode 100644 arch/arm/mm/dma-mapping-nommu.c

Comments

Robin Murphy Jan. 9, 2017, 4:43 p.m. UTC | #1
Hi Vladimir,

On 09/01/17 13:47, Vladimir Murzin wrote:
> R/M classes of cpus can have memory covered by MPU which in turn might
> configure RAM as Normal i.e. bufferable and cacheable. It breaks
> dma_alloc_coherent() and friends, since data can stuck in caches now
> or be buffered.
> 
> This patch factors out DMA support for NOMMU configuration into
> separate entity which provides dedicated dma_ops. We have to handle
> there several cases:
> - configurations with MMU/MPU setup
> - configurations without MMU/MPU setup
> - special case for M-class, since caches and MPU there are optional
> 
> In general we rely on default DMA area for coherent allocations or/and
> per-device memory reserves suitable for coherent DMA, so if such
> regions are set coherent allocations go from there.
> 
> In case MPU/MPU was not setup we fallback to normal page allocator for
> DMA memory allocation.
> 
> In case we run M-class cpus, for configuration without cache support
> (like Cortex-M3/M4) dma operations are forced to be coherent and wired
> with dma-noop (such decision is made based on cacheid global
> variable); however, if caches are detected there and no DMA coherent
> region is given (either default or per-device), dma is disallowed even
> MPU is not set - it is because M-class implement system memory map
> which defines part of address space as Normal memory.
> 
> Reported-by: Alexandre Torgue <alexandre.torgue@st.com>
> Reported-by: Andras Szemzo <sza@esh.hu>
> Signed-off-by: Vladimir Murzin <vladimir.murzin@arm.com>
> ---
>  arch/arm/include/asm/dma-mapping.h |   3 +-
>  arch/arm/mm/Makefile               |   5 +-
>  arch/arm/mm/dma-mapping-nommu.c    | 252 +++++++++++++++++++++++++++++++++++++
>  3 files changed, 256 insertions(+), 4 deletions(-)
>  create mode 100644 arch/arm/mm/dma-mapping-nommu.c
> 
> diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
> index bf02dbd..559faad 100644
> --- a/arch/arm/include/asm/dma-mapping.h
> +++ b/arch/arm/include/asm/dma-mapping.h
> @@ -20,7 +20,8 @@ static inline struct dma_map_ops *__generic_dma_ops(struct device *dev)
>  {
>  	if (dev && dev->archdata.dma_ops)
>  		return dev->archdata.dma_ops;
> -	return &arm_dma_ops;
> +
> +	return IS_ENABLED(CONFIG_MMU) ? &arm_dma_ops : &dma_noop_ops;
>  }
>  
>  static inline struct dma_map_ops *get_dma_ops(struct device *dev)
> diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
> index 2ac7988..5796357 100644
> --- a/arch/arm/mm/Makefile
> +++ b/arch/arm/mm/Makefile
> @@ -2,9 +2,8 @@
>  # Makefile for the linux arm-specific parts of the memory manager.
>  #
>  
> -obj-y				:= dma-mapping.o extable.o fault.o init.o \
> -				   iomap.o
> -
> +obj-y				:= extable.o fault.o init.o iomap.o
> +obj-y				+= dma-mapping$(MMUEXT).o
>  obj-$(CONFIG_MMU)		+= fault-armv.o flush.o idmap.o ioremap.o \
>  				   mmap.o pgd.o mmu.o pageattr.o
>  
> diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c
> new file mode 100644
> index 0000000..a5c50fb
> --- /dev/null
> +++ b/arch/arm/mm/dma-mapping-nommu.c
> @@ -0,0 +1,252 @@
> +/*
> + *  Based on linux/arch/arm/mm/dma-mapping.c
> + *
> + *  Copyright (C) 2000-2004 Russell King
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + */
> +
> +#include <linux/export.h>
> +#include <linux/mm.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/scatterlist.h>
> +
> +#include <asm/cachetype.h>
> +#include <asm/cacheflush.h>
> +#include <asm/outercache.h>
> +#include <asm/cp15.h>
> +
> +#include "dma.h"
> +
> +/*
> + *  dma_noop_ops is used if
> + *   - MMU/MPU is off
> + *   - cpu is v7m w/o cache support
> + *   - device is coherent
> + *  otherwise arm_nommu_dma_ops is used.
> + *
> + *  arm_nommu_dma_ops rely on consistent DMA memory (please, refer to
> + *  [1] on how to declare such memory).
> + *
> + *  [1] Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt
> + */
> +
> +static void *arm_nommu_dma_alloc(struct device *dev, size_t size,
> +				 dma_addr_t *dma_handle, gfp_t gfp,
> +				 unsigned long attrs)
> +
> +{
> +	struct dma_map_ops *ops = &dma_noop_ops;
> +
> +	/*
> +	 * We are here because:
> +	 * - no consistent DMA region has been defined, so we can't
> +	 *   continue.
> +	 * - there is no space left in consistent DMA region, so we
> +	 *   only can fallback to generic allocator if we are
> +	 *   advertised that consistency is not required.
> +	 */
> +
> +	if (attrs & DMA_ATTR_NON_CONSISTENT)
> +		return ops->alloc(dev, size, dma_handle, gfp, attrs);
> +
> +	WARN_ON_ONCE(1);
> +	return NULL;
> +}
> +
> +static void arm_nommu_dma_free(struct device *dev, size_t size,
> +			       void *cpu_addr, dma_addr_t dma_addr,
> +			       unsigned long attrs)
> +{
> +	struct dma_map_ops *ops = &dma_noop_ops;
> +
> +	if (attrs & DMA_ATTR_NON_CONSISTENT)
> +		ops->free(dev, size, cpu_addr, dma_addr, attrs);
> +
> +	WARN_ON_ONCE(1);
> +	return;
> +}
> +
> +static int arm_nommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
> +			      void *cpu_addr, dma_addr_t dma_addr, size_t size,
> +			      unsigned long attrs)
> +{
> +	struct dma_map_ops *ops = &dma_noop_ops;
> +	int ret;
> +
> +	if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret))
> +		return ret;
> +
> +	if (attrs & DMA_ATTR_NON_CONSISTENT)
> +		return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
> +
> +	WARN_ON_ONCE(1);
> +	return -ENXIO;
> +}
> +
> +static void __dma_page_cpu_to_dev(dma_addr_t handle, size_t size,
> +				  enum dma_data_direction dir)
> +{
> +	dmac_unmap_area(__va(handle), size, dir);
> +
> +	if (dir == DMA_FROM_DEVICE)
> +		outer_inv_range(handle, handle + size);
> +	else
> +		outer_clean_range(handle, handle + size);
> +}
> +
> +static void __dma_page_dev_to_cpu(dma_addr_t handle, size_t size,
> +				  enum dma_data_direction dir)
> +{
> +	if (dir != DMA_TO_DEVICE) {
> +		outer_inv_range(handle, handle + size);
> +		dmac_unmap_area(__va(handle), size, dir);
> +	}
> +}

Nit: I appreciate that the situation here makes it OK by construction,
but CPU cache maintenance on a DMA address just looks *so* wrong :)
Could we pass either the "virtual" or physical version of the address as
the argument to these helpers so that the code looks less crazy at a glance?

Robin.

> +static dma_addr_t arm_nommu_dma_map_page(struct device *dev, struct page *page,
> +					 unsigned long offset, size_t size,
> +					 enum dma_data_direction dir,
> +					 unsigned long attrs)
> +{
> +	dma_addr_t handle = page_to_phys(page) + offset;
> +
> +	__dma_page_cpu_to_dev(handle, size, dir);
> +
> +	return handle;
> +}
> +
> +static void arm_nommu_dma_unmap_page(struct device *dev, dma_addr_t handle,
> +				     size_t size, enum dma_data_direction dir,
> +				     unsigned long attrs)
> +{
> +	__dma_page_dev_to_cpu(handle, size, dir);
> +}
> +
> +
> +static int arm_nommu_dma_map_sg(struct device *dev, struct scatterlist *sgl,
> +				int nents, enum dma_data_direction dir,
> +				unsigned long attrs)
> +{
> +	int i;
> +	struct scatterlist *sg;
> +
> +	for_each_sg(sgl, sg, nents, i) {
> +		sg_dma_address(sg) = sg_phys(sg);
> +		sg_dma_len(sg) = sg->length;
> +		__dma_page_cpu_to_dev(sg_dma_address(sg), sg_dma_len(sg), dir);
> +	}
> +
> +	return nents;
> +}
> +
> +static void arm_nommu_dma_unmap_sg(struct device *dev, struct scatterlist *sgl,
> +				   int nents, enum dma_data_direction dir,
> +				   unsigned long attrs)
> +{
> +	struct scatterlist *sg;
> +	int i;
> +
> +	for_each_sg(sgl, sg, nents, i)
> +		__dma_page_dev_to_cpu(sg_dma_address(sg), sg_dma_len(sg), dir);
> +}
> +
> +static void arm_nommu_dma_sync_single_for_device(struct device *dev,
> +		dma_addr_t handle, size_t size, enum dma_data_direction dir)
> +{
> +	__dma_page_cpu_to_dev(handle, size, dir);
> +}
> +
> +static void arm_nommu_dma_sync_single_for_cpu(struct device *dev,
> +		dma_addr_t handle, size_t size, enum dma_data_direction dir)
> +{
> +	__dma_page_cpu_to_dev(handle, size, dir);
> +}
> +
> +static void arm_nommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
> +					     int nents, enum dma_data_direction dir)
> +{
> +	struct scatterlist *sg;
> +	int i;
> +
> +	for_each_sg(sgl, sg, nents, i)
> +		__dma_page_cpu_to_dev(sg_dma_address(sg), sg_dma_len(sg), dir);
> +}
> +
> +static void arm_nommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
> +					  int nents, enum dma_data_direction dir)
> +{
> +	struct scatterlist *sg;
> +	int i;
> +
> +	for_each_sg(sgl, sg, nents, i)
> +		__dma_page_dev_to_cpu(sg_dma_address(sg), sg_dma_len(sg), dir);
> +}
> +
> +struct dma_map_ops arm_nommu_dma_ops = {
> +	.alloc			= arm_nommu_dma_alloc,
> +	.free			= arm_nommu_dma_free,
> +	.mmap			= arm_nommu_dma_mmap,
> +	.map_page		= arm_nommu_dma_map_page,
> +	.unmap_page		= arm_nommu_dma_unmap_page,
> +	.map_sg			= arm_nommu_dma_map_sg,
> +	.unmap_sg		= arm_nommu_dma_unmap_sg,
> +	.sync_single_for_device	= arm_nommu_dma_sync_single_for_device,
> +	.sync_single_for_cpu	= arm_nommu_dma_sync_single_for_cpu,
> +	.sync_sg_for_device	= arm_nommu_dma_sync_sg_for_device,
> +	.sync_sg_for_cpu	= arm_nommu_dma_sync_sg_for_cpu,
> +};
> +EXPORT_SYMBOL(arm_nommu_dma_ops);
> +
> +static struct dma_map_ops *arm_nommu_get_dma_map_ops(bool coherent)
> +{
> +	return coherent ? &dma_noop_ops : &arm_nommu_dma_ops;
> +}
> +
> +void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
> +			const struct iommu_ops *iommu, bool coherent)
> +{
> +	struct dma_map_ops *dma_ops;
> +
> +	if (IS_ENABLED(CONFIG_CPU_V7M)) {
> +		/*
> +		 * Cache support for v7m is optional, so can be treated as
> +		 * coherent if no cache has been detected. Note that it is not
> +		 * enough to check if MPU is in use or not since in absense of
> +		 * MPU system memory map is used.
> +		 */
> +		dev->archdata.dma_coherent = (cacheid) ? coherent : true;
> +	} else {
> +		/*
> +		 * Assume coherent DMA in case MMU/MPU has not been set up.
> +		 */
> +		dev->archdata.dma_coherent = (get_cr() & CR_M) ? coherent : true;
> +	}
> +
> +	dma_ops = arm_nommu_get_dma_map_ops(dev->archdata.dma_coherent);
> +
> +	set_dma_ops(dev, dma_ops);
> +}
> +
> +void arch_teardown_dma_ops(struct device *dev)
> +{
> +}
> +
> +int dma_supported(struct device *dev, u64 mask)
> +{
> +	return 1;
> +}
> +
> +EXPORT_SYMBOL(dma_supported);
> +
> +#define PREALLOC_DMA_DEBUG_ENTRIES	4096
> +
> +static int __init dma_debug_do_init(void)
> +{
> +	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
> +	return 0;
> +}
> +core_initcall(dma_debug_do_init);
>
Vladimir Murzin Jan. 9, 2017, 4:51 p.m. UTC | #2
Hi Robin,

On 09/01/17 16:43, Robin Murphy wrote:
> Hi Vladimir,
> 
> On 09/01/17 13:47, Vladimir Murzin wrote:
>> R/M classes of cpus can have memory covered by MPU which in turn might
>> configure RAM as Normal i.e. bufferable and cacheable. It breaks
>> dma_alloc_coherent() and friends, since data can stuck in caches now
>> or be buffered.
>>
>> This patch factors out DMA support for NOMMU configuration into
>> separate entity which provides dedicated dma_ops. We have to handle
>> there several cases:
>> - configurations with MMU/MPU setup
>> - configurations without MMU/MPU setup
>> - special case for M-class, since caches and MPU there are optional
>>
>> In general we rely on default DMA area for coherent allocations or/and
>> per-device memory reserves suitable for coherent DMA, so if such
>> regions are set coherent allocations go from there.
>>
>> In case MPU/MPU was not setup we fallback to normal page allocator for
>> DMA memory allocation.
>>
>> In case we run M-class cpus, for configuration without cache support
>> (like Cortex-M3/M4) dma operations are forced to be coherent and wired
>> with dma-noop (such decision is made based on cacheid global
>> variable); however, if caches are detected there and no DMA coherent
>> region is given (either default or per-device), dma is disallowed even
>> MPU is not set - it is because M-class implement system memory map
>> which defines part of address space as Normal memory.
>>
>> Reported-by: Alexandre Torgue <alexandre.torgue@st.com>
>> Reported-by: Andras Szemzo <sza@esh.hu>
>> Signed-off-by: Vladimir Murzin <vladimir.murzin@arm.com>
>> ---
>>  arch/arm/include/asm/dma-mapping.h |   3 +-
>>  arch/arm/mm/Makefile               |   5 +-
>>  arch/arm/mm/dma-mapping-nommu.c    | 252 +++++++++++++++++++++++++++++++++++++
>>  3 files changed, 256 insertions(+), 4 deletions(-)
>>  create mode 100644 arch/arm/mm/dma-mapping-nommu.c
>>
>> diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
>> index bf02dbd..559faad 100644
>> --- a/arch/arm/include/asm/dma-mapping.h
>> +++ b/arch/arm/include/asm/dma-mapping.h
>> @@ -20,7 +20,8 @@ static inline struct dma_map_ops *__generic_dma_ops(struct device *dev)
>>  {
>>  	if (dev && dev->archdata.dma_ops)
>>  		return dev->archdata.dma_ops;
>> -	return &arm_dma_ops;
>> +
>> +	return IS_ENABLED(CONFIG_MMU) ? &arm_dma_ops : &dma_noop_ops;
>>  }
>>  
>>  static inline struct dma_map_ops *get_dma_ops(struct device *dev)
>> diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
>> index 2ac7988..5796357 100644
>> --- a/arch/arm/mm/Makefile
>> +++ b/arch/arm/mm/Makefile
>> @@ -2,9 +2,8 @@
>>  # Makefile for the linux arm-specific parts of the memory manager.
>>  #
>>  
>> -obj-y				:= dma-mapping.o extable.o fault.o init.o \
>> -				   iomap.o
>> -
>> +obj-y				:= extable.o fault.o init.o iomap.o
>> +obj-y				+= dma-mapping$(MMUEXT).o
>>  obj-$(CONFIG_MMU)		+= fault-armv.o flush.o idmap.o ioremap.o \
>>  				   mmap.o pgd.o mmu.o pageattr.o
>>  
>> diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c
>> new file mode 100644
>> index 0000000..a5c50fb
>> --- /dev/null
>> +++ b/arch/arm/mm/dma-mapping-nommu.c
>> @@ -0,0 +1,252 @@
>> +/*
>> + *  Based on linux/arch/arm/mm/dma-mapping.c
>> + *
>> + *  Copyright (C) 2000-2004 Russell King
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + *
>> + */
>> +
>> +#include <linux/export.h>
>> +#include <linux/mm.h>
>> +#include <linux/dma-mapping.h>
>> +#include <linux/scatterlist.h>
>> +
>> +#include <asm/cachetype.h>
>> +#include <asm/cacheflush.h>
>> +#include <asm/outercache.h>
>> +#include <asm/cp15.h>
>> +
>> +#include "dma.h"
>> +
>> +/*
>> + *  dma_noop_ops is used if
>> + *   - MMU/MPU is off
>> + *   - cpu is v7m w/o cache support
>> + *   - device is coherent
>> + *  otherwise arm_nommu_dma_ops is used.
>> + *
>> + *  arm_nommu_dma_ops rely on consistent DMA memory (please, refer to
>> + *  [1] on how to declare such memory).
>> + *
>> + *  [1] Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt
>> + */
>> +
>> +static void *arm_nommu_dma_alloc(struct device *dev, size_t size,
>> +				 dma_addr_t *dma_handle, gfp_t gfp,
>> +				 unsigned long attrs)
>> +
>> +{
>> +	struct dma_map_ops *ops = &dma_noop_ops;
>> +
>> +	/*
>> +	 * We are here because:
>> +	 * - no consistent DMA region has been defined, so we can't
>> +	 *   continue.
>> +	 * - there is no space left in consistent DMA region, so we
>> +	 *   only can fallback to generic allocator if we are
>> +	 *   advertised that consistency is not required.
>> +	 */
>> +
>> +	if (attrs & DMA_ATTR_NON_CONSISTENT)
>> +		return ops->alloc(dev, size, dma_handle, gfp, attrs);
>> +
>> +	WARN_ON_ONCE(1);
>> +	return NULL;
>> +}
>> +
>> +static void arm_nommu_dma_free(struct device *dev, size_t size,
>> +			       void *cpu_addr, dma_addr_t dma_addr,
>> +			       unsigned long attrs)
>> +{
>> +	struct dma_map_ops *ops = &dma_noop_ops;
>> +
>> +	if (attrs & DMA_ATTR_NON_CONSISTENT)
>> +		ops->free(dev, size, cpu_addr, dma_addr, attrs);
>> +
>> +	WARN_ON_ONCE(1);
>> +	return;
>> +}
>> +
>> +static int arm_nommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
>> +			      void *cpu_addr, dma_addr_t dma_addr, size_t size,
>> +			      unsigned long attrs)
>> +{
>> +	struct dma_map_ops *ops = &dma_noop_ops;
>> +	int ret;
>> +
>> +	if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret))
>> +		return ret;
>> +
>> +	if (attrs & DMA_ATTR_NON_CONSISTENT)
>> +		return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
>> +
>> +	WARN_ON_ONCE(1);
>> +	return -ENXIO;
>> +}
>> +
>> +static void __dma_page_cpu_to_dev(dma_addr_t handle, size_t size,
>> +				  enum dma_data_direction dir)
>> +{
>> +	dmac_unmap_area(__va(handle), size, dir);
>> +
>> +	if (dir == DMA_FROM_DEVICE)
>> +		outer_inv_range(handle, handle + size);
>> +	else
>> +		outer_clean_range(handle, handle + size);
>> +}
>> +
>> +static void __dma_page_dev_to_cpu(dma_addr_t handle, size_t size,
>> +				  enum dma_data_direction dir)
>> +{
>> +	if (dir != DMA_TO_DEVICE) {
>> +		outer_inv_range(handle, handle + size);
>> +		dmac_unmap_area(__va(handle), size, dir);
>> +	}
>> +}
> 
> Nit: I appreciate that the situation here makes it OK by construction,
> but CPU cache maintenance on a DMA address just looks *so* wrong :)
> Could we pass either the "virtual" or physical version of the address as
> the argument to these helpers so that the code looks less crazy at a glance?

Something like below?

static void __dma_page_dev_to_cpu(dma_addr_t paddr, size_t size,
				  enum dma_data_direction dir)
{
	if (dir != DMA_TO_DEVICE) {
		outer_inv_range(paddr, paddr + size);
		dmac_unmap_area(__va(paddr), size, dir);
	}

Btw, thanks for having a look!

Cheers
Vladimir

> 
> Robin.
> 
>> +static dma_addr_t arm_nommu_dma_map_page(struct device *dev, struct page *page,
>> +					 unsigned long offset, size_t size,
>> +					 enum dma_data_direction dir,
>> +					 unsigned long attrs)
>> +{
>> +	dma_addr_t handle = page_to_phys(page) + offset;
>> +
>> +	__dma_page_cpu_to_dev(handle, size, dir);
>> +
>> +	return handle;
>> +}
>> +
>> +static void arm_nommu_dma_unmap_page(struct device *dev, dma_addr_t handle,
>> +				     size_t size, enum dma_data_direction dir,
>> +				     unsigned long attrs)
>> +{
>> +	__dma_page_dev_to_cpu(handle, size, dir);
>> +}
>> +
>> +
>> +static int arm_nommu_dma_map_sg(struct device *dev, struct scatterlist *sgl,
>> +				int nents, enum dma_data_direction dir,
>> +				unsigned long attrs)
>> +{
>> +	int i;
>> +	struct scatterlist *sg;
>> +
>> +	for_each_sg(sgl, sg, nents, i) {
>> +		sg_dma_address(sg) = sg_phys(sg);
>> +		sg_dma_len(sg) = sg->length;
>> +		__dma_page_cpu_to_dev(sg_dma_address(sg), sg_dma_len(sg), dir);
>> +	}
>> +
>> +	return nents;
>> +}
>> +
>> +static void arm_nommu_dma_unmap_sg(struct device *dev, struct scatterlist *sgl,
>> +				   int nents, enum dma_data_direction dir,
>> +				   unsigned long attrs)
>> +{
>> +	struct scatterlist *sg;
>> +	int i;
>> +
>> +	for_each_sg(sgl, sg, nents, i)
>> +		__dma_page_dev_to_cpu(sg_dma_address(sg), sg_dma_len(sg), dir);
>> +}
>> +
>> +static void arm_nommu_dma_sync_single_for_device(struct device *dev,
>> +		dma_addr_t handle, size_t size, enum dma_data_direction dir)
>> +{
>> +	__dma_page_cpu_to_dev(handle, size, dir);
>> +}
>> +
>> +static void arm_nommu_dma_sync_single_for_cpu(struct device *dev,
>> +		dma_addr_t handle, size_t size, enum dma_data_direction dir)
>> +{
>> +	__dma_page_cpu_to_dev(handle, size, dir);
>> +}
>> +
>> +static void arm_nommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
>> +					     int nents, enum dma_data_direction dir)
>> +{
>> +	struct scatterlist *sg;
>> +	int i;
>> +
>> +	for_each_sg(sgl, sg, nents, i)
>> +		__dma_page_cpu_to_dev(sg_dma_address(sg), sg_dma_len(sg), dir);
>> +}
>> +
>> +static void arm_nommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
>> +					  int nents, enum dma_data_direction dir)
>> +{
>> +	struct scatterlist *sg;
>> +	int i;
>> +
>> +	for_each_sg(sgl, sg, nents, i)
>> +		__dma_page_dev_to_cpu(sg_dma_address(sg), sg_dma_len(sg), dir);
>> +}
>> +
>> +struct dma_map_ops arm_nommu_dma_ops = {
>> +	.alloc			= arm_nommu_dma_alloc,
>> +	.free			= arm_nommu_dma_free,
>> +	.mmap			= arm_nommu_dma_mmap,
>> +	.map_page		= arm_nommu_dma_map_page,
>> +	.unmap_page		= arm_nommu_dma_unmap_page,
>> +	.map_sg			= arm_nommu_dma_map_sg,
>> +	.unmap_sg		= arm_nommu_dma_unmap_sg,
>> +	.sync_single_for_device	= arm_nommu_dma_sync_single_for_device,
>> +	.sync_single_for_cpu	= arm_nommu_dma_sync_single_for_cpu,
>> +	.sync_sg_for_device	= arm_nommu_dma_sync_sg_for_device,
>> +	.sync_sg_for_cpu	= arm_nommu_dma_sync_sg_for_cpu,
>> +};
>> +EXPORT_SYMBOL(arm_nommu_dma_ops);
>> +
>> +static struct dma_map_ops *arm_nommu_get_dma_map_ops(bool coherent)
>> +{
>> +	return coherent ? &dma_noop_ops : &arm_nommu_dma_ops;
>> +}
>> +
>> +void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
>> +			const struct iommu_ops *iommu, bool coherent)
>> +{
>> +	struct dma_map_ops *dma_ops;
>> +
>> +	if (IS_ENABLED(CONFIG_CPU_V7M)) {
>> +		/*
>> +		 * Cache support for v7m is optional, so can be treated as
>> +		 * coherent if no cache has been detected. Note that it is not
>> +		 * enough to check if MPU is in use or not since in absense of
>> +		 * MPU system memory map is used.
>> +		 */
>> +		dev->archdata.dma_coherent = (cacheid) ? coherent : true;
>> +	} else {
>> +		/*
>> +		 * Assume coherent DMA in case MMU/MPU has not been set up.
>> +		 */
>> +		dev->archdata.dma_coherent = (get_cr() & CR_M) ? coherent : true;
>> +	}
>> +
>> +	dma_ops = arm_nommu_get_dma_map_ops(dev->archdata.dma_coherent);
>> +
>> +	set_dma_ops(dev, dma_ops);
>> +}
>> +
>> +void arch_teardown_dma_ops(struct device *dev)
>> +{
>> +}
>> +
>> +int dma_supported(struct device *dev, u64 mask)
>> +{
>> +	return 1;
>> +}
>> +
>> +EXPORT_SYMBOL(dma_supported);
>> +
>> +#define PREALLOC_DMA_DEBUG_ENTRIES	4096
>> +
>> +static int __init dma_debug_do_init(void)
>> +{
>> +	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
>> +	return 0;
>> +}
>> +core_initcall(dma_debug_do_init);
>>
> 
>
Robin Murphy Jan. 9, 2017, 4:59 p.m. UTC | #3
On 09/01/17 16:51, Vladimir Murzin wrote:
> Hi Robin,
> 
> On 09/01/17 16:43, Robin Murphy wrote:
>> Hi Vladimir,
>>
>> On 09/01/17 13:47, Vladimir Murzin wrote:
>>> R/M classes of cpus can have memory covered by MPU which in turn might
>>> configure RAM as Normal i.e. bufferable and cacheable. It breaks
>>> dma_alloc_coherent() and friends, since data can stuck in caches now
>>> or be buffered.
>>>
>>> This patch factors out DMA support for NOMMU configuration into
>>> separate entity which provides dedicated dma_ops. We have to handle
>>> there several cases:
>>> - configurations with MMU/MPU setup
>>> - configurations without MMU/MPU setup
>>> - special case for M-class, since caches and MPU there are optional
>>>
>>> In general we rely on default DMA area for coherent allocations or/and
>>> per-device memory reserves suitable for coherent DMA, so if such
>>> regions are set coherent allocations go from there.
>>>
>>> In case MPU/MPU was not setup we fallback to normal page allocator for
>>> DMA memory allocation.
>>>
>>> In case we run M-class cpus, for configuration without cache support
>>> (like Cortex-M3/M4) dma operations are forced to be coherent and wired
>>> with dma-noop (such decision is made based on cacheid global
>>> variable); however, if caches are detected there and no DMA coherent
>>> region is given (either default or per-device), dma is disallowed even
>>> MPU is not set - it is because M-class implement system memory map
>>> which defines part of address space as Normal memory.
>>>
>>> Reported-by: Alexandre Torgue <alexandre.torgue@st.com>
>>> Reported-by: Andras Szemzo <sza@esh.hu>
>>> Signed-off-by: Vladimir Murzin <vladimir.murzin@arm.com>
>>> ---
>>>  arch/arm/include/asm/dma-mapping.h |   3 +-
>>>  arch/arm/mm/Makefile               |   5 +-
>>>  arch/arm/mm/dma-mapping-nommu.c    | 252 +++++++++++++++++++++++++++++++++++++
>>>  3 files changed, 256 insertions(+), 4 deletions(-)
>>>  create mode 100644 arch/arm/mm/dma-mapping-nommu.c
>>>
>>> diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
>>> index bf02dbd..559faad 100644
>>> --- a/arch/arm/include/asm/dma-mapping.h
>>> +++ b/arch/arm/include/asm/dma-mapping.h
>>> @@ -20,7 +20,8 @@ static inline struct dma_map_ops *__generic_dma_ops(struct device *dev)
>>>  {
>>>  	if (dev && dev->archdata.dma_ops)
>>>  		return dev->archdata.dma_ops;
>>> -	return &arm_dma_ops;
>>> +
>>> +	return IS_ENABLED(CONFIG_MMU) ? &arm_dma_ops : &dma_noop_ops;
>>>  }
>>>  
>>>  static inline struct dma_map_ops *get_dma_ops(struct device *dev)
>>> diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
>>> index 2ac7988..5796357 100644
>>> --- a/arch/arm/mm/Makefile
>>> +++ b/arch/arm/mm/Makefile
>>> @@ -2,9 +2,8 @@
>>>  # Makefile for the linux arm-specific parts of the memory manager.
>>>  #
>>>  
>>> -obj-y				:= dma-mapping.o extable.o fault.o init.o \
>>> -				   iomap.o
>>> -
>>> +obj-y				:= extable.o fault.o init.o iomap.o
>>> +obj-y				+= dma-mapping$(MMUEXT).o
>>>  obj-$(CONFIG_MMU)		+= fault-armv.o flush.o idmap.o ioremap.o \
>>>  				   mmap.o pgd.o mmu.o pageattr.o
>>>  
>>> diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c
>>> new file mode 100644
>>> index 0000000..a5c50fb
>>> --- /dev/null
>>> +++ b/arch/arm/mm/dma-mapping-nommu.c
>>> @@ -0,0 +1,252 @@
>>> +/*
>>> + *  Based on linux/arch/arm/mm/dma-mapping.c
>>> + *
>>> + *  Copyright (C) 2000-2004 Russell King
>>> + *
>>> + * This program is free software; you can redistribute it and/or modify
>>> + * it under the terms of the GNU General Public License version 2 as
>>> + * published by the Free Software Foundation.
>>> + *
>>> + */
>>> +
>>> +#include <linux/export.h>
>>> +#include <linux/mm.h>
>>> +#include <linux/dma-mapping.h>
>>> +#include <linux/scatterlist.h>
>>> +
>>> +#include <asm/cachetype.h>
>>> +#include <asm/cacheflush.h>
>>> +#include <asm/outercache.h>
>>> +#include <asm/cp15.h>
>>> +
>>> +#include "dma.h"
>>> +
>>> +/*
>>> + *  dma_noop_ops is used if
>>> + *   - MMU/MPU is off
>>> + *   - cpu is v7m w/o cache support
>>> + *   - device is coherent
>>> + *  otherwise arm_nommu_dma_ops is used.
>>> + *
>>> + *  arm_nommu_dma_ops rely on consistent DMA memory (please, refer to
>>> + *  [1] on how to declare such memory).
>>> + *
>>> + *  [1] Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt
>>> + */
>>> +
>>> +static void *arm_nommu_dma_alloc(struct device *dev, size_t size,
>>> +				 dma_addr_t *dma_handle, gfp_t gfp,
>>> +				 unsigned long attrs)
>>> +
>>> +{
>>> +	struct dma_map_ops *ops = &dma_noop_ops;
>>> +
>>> +	/*
>>> +	 * We are here because:
>>> +	 * - no consistent DMA region has been defined, so we can't
>>> +	 *   continue.
>>> +	 * - there is no space left in consistent DMA region, so we
>>> +	 *   only can fallback to generic allocator if we are
>>> +	 *   advertised that consistency is not required.
>>> +	 */
>>> +
>>> +	if (attrs & DMA_ATTR_NON_CONSISTENT)
>>> +		return ops->alloc(dev, size, dma_handle, gfp, attrs);
>>> +
>>> +	WARN_ON_ONCE(1);
>>> +	return NULL;
>>> +}
>>> +
>>> +static void arm_nommu_dma_free(struct device *dev, size_t size,
>>> +			       void *cpu_addr, dma_addr_t dma_addr,
>>> +			       unsigned long attrs)
>>> +{
>>> +	struct dma_map_ops *ops = &dma_noop_ops;
>>> +
>>> +	if (attrs & DMA_ATTR_NON_CONSISTENT)
>>> +		ops->free(dev, size, cpu_addr, dma_addr, attrs);
>>> +
>>> +	WARN_ON_ONCE(1);
>>> +	return;
>>> +}
>>> +
>>> +static int arm_nommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
>>> +			      void *cpu_addr, dma_addr_t dma_addr, size_t size,
>>> +			      unsigned long attrs)
>>> +{
>>> +	struct dma_map_ops *ops = &dma_noop_ops;
>>> +	int ret;
>>> +
>>> +	if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret))
>>> +		return ret;
>>> +
>>> +	if (attrs & DMA_ATTR_NON_CONSISTENT)
>>> +		return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
>>> +
>>> +	WARN_ON_ONCE(1);
>>> +	return -ENXIO;
>>> +}
>>> +
>>> +static void __dma_page_cpu_to_dev(dma_addr_t handle, size_t size,
>>> +				  enum dma_data_direction dir)
>>> +{
>>> +	dmac_unmap_area(__va(handle), size, dir);
>>> +
>>> +	if (dir == DMA_FROM_DEVICE)
>>> +		outer_inv_range(handle, handle + size);
>>> +	else
>>> +		outer_clean_range(handle, handle + size);
>>> +}
>>> +
>>> +static void __dma_page_dev_to_cpu(dma_addr_t handle, size_t size,
>>> +				  enum dma_data_direction dir)
>>> +{
>>> +	if (dir != DMA_TO_DEVICE) {
>>> +		outer_inv_range(handle, handle + size);
>>> +		dmac_unmap_area(__va(handle), size, dir);
>>> +	}
>>> +}
>>
>> Nit: I appreciate that the situation here makes it OK by construction,
>> but CPU cache maintenance on a DMA address just looks *so* wrong :)
>> Could we pass either the "virtual" or physical version of the address as
>> the argument to these helpers so that the code looks less crazy at a glance?
> 
> Something like bellow?
> 
> static void __dma_page_dev_to_cpu(dma_addr_t paddr, size_t size,
                                    ^
I meant more in terms of this being a const void* or phys_addr_t ;)

> 				  enum dma_data_direction dir)
> {
> 	if (dir != DMA_TO_DEVICE) {
> 		outer_inv_range(paddr, paddr + size);
> 		dmac_unmap_area(__va(paddr), size, dir);
> 	}
> 
> Btw, thanks for having a look!

Otherwise, I think the rest of the series looks OK, thanks for
respinning it.

Robin.

> Cheers
> Vladimir
> 
>>
>> Robin.
>>
>>> +static dma_addr_t arm_nommu_dma_map_page(struct device *dev, struct page *page,
>>> +					 unsigned long offset, size_t size,
>>> +					 enum dma_data_direction dir,
>>> +					 unsigned long attrs)
>>> +{
>>> +	dma_addr_t handle = page_to_phys(page) + offset;
>>> +
>>> +	__dma_page_cpu_to_dev(handle, size, dir);
>>> +
>>> +	return handle;
>>> +}
>>> +
>>> +static void arm_nommu_dma_unmap_page(struct device *dev, dma_addr_t handle,
>>> +				     size_t size, enum dma_data_direction dir,
>>> +				     unsigned long attrs)
>>> +{
>>> +	__dma_page_dev_to_cpu(handle, size, dir);
>>> +}
>>> +
>>> +
>>> +static int arm_nommu_dma_map_sg(struct device *dev, struct scatterlist *sgl,
>>> +				int nents, enum dma_data_direction dir,
>>> +				unsigned long attrs)
>>> +{
>>> +	int i;
>>> +	struct scatterlist *sg;
>>> +
>>> +	for_each_sg(sgl, sg, nents, i) {
>>> +		sg_dma_address(sg) = sg_phys(sg);
>>> +		sg_dma_len(sg) = sg->length;
>>> +		__dma_page_cpu_to_dev(sg_dma_address(sg), sg_dma_len(sg), dir);
>>> +	}
>>> +
>>> +	return nents;
>>> +}
>>> +
>>> +static void arm_nommu_dma_unmap_sg(struct device *dev, struct scatterlist *sgl,
>>> +				   int nents, enum dma_data_direction dir,
>>> +				   unsigned long attrs)
>>> +{
>>> +	struct scatterlist *sg;
>>> +	int i;
>>> +
>>> +	for_each_sg(sgl, sg, nents, i)
>>> +		__dma_page_dev_to_cpu(sg_dma_address(sg), sg_dma_len(sg), dir);
>>> +}
>>> +
>>> +static void arm_nommu_dma_sync_single_for_device(struct device *dev,
>>> +		dma_addr_t handle, size_t size, enum dma_data_direction dir)
>>> +{
>>> +	__dma_page_cpu_to_dev(handle, size, dir);
>>> +}
>>> +
>>> +static void arm_nommu_dma_sync_single_for_cpu(struct device *dev,
>>> +		dma_addr_t handle, size_t size, enum dma_data_direction dir)
>>> +{
>>> +	__dma_page_cpu_to_dev(handle, size, dir);
>>> +}
>>> +
>>> +static void arm_nommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
>>> +					     int nents, enum dma_data_direction dir)
>>> +{
>>> +	struct scatterlist *sg;
>>> +	int i;
>>> +
>>> +	for_each_sg(sgl, sg, nents, i)
>>> +		__dma_page_cpu_to_dev(sg_dma_address(sg), sg_dma_len(sg), dir);
>>> +}
>>> +
>>> +static void arm_nommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
>>> +					  int nents, enum dma_data_direction dir)
>>> +{
>>> +	struct scatterlist *sg;
>>> +	int i;
>>> +
>>> +	for_each_sg(sgl, sg, nents, i)
>>> +		__dma_page_dev_to_cpu(sg_dma_address(sg), sg_dma_len(sg), dir);
>>> +}
>>> +
>>> +struct dma_map_ops arm_nommu_dma_ops = {
>>> +	.alloc			= arm_nommu_dma_alloc,
>>> +	.free			= arm_nommu_dma_free,
>>> +	.mmap			= arm_nommu_dma_mmap,
>>> +	.map_page		= arm_nommu_dma_map_page,
>>> +	.unmap_page		= arm_nommu_dma_unmap_page,
>>> +	.map_sg			= arm_nommu_dma_map_sg,
>>> +	.unmap_sg		= arm_nommu_dma_unmap_sg,
>>> +	.sync_single_for_device	= arm_nommu_dma_sync_single_for_device,
>>> +	.sync_single_for_cpu	= arm_nommu_dma_sync_single_for_cpu,
>>> +	.sync_sg_for_device	= arm_nommu_dma_sync_sg_for_device,
>>> +	.sync_sg_for_cpu	= arm_nommu_dma_sync_sg_for_cpu,
>>> +};
>>> +EXPORT_SYMBOL(arm_nommu_dma_ops);
>>> +
>>> +static struct dma_map_ops *arm_nommu_get_dma_map_ops(bool coherent)
>>> +{
>>> +	return coherent ? &dma_noop_ops : &arm_nommu_dma_ops;
>>> +}
>>> +
>>> +void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
>>> +			const struct iommu_ops *iommu, bool coherent)
>>> +{
>>> +	struct dma_map_ops *dma_ops;
>>> +
>>> +	if (IS_ENABLED(CONFIG_CPU_V7M)) {
>>> +		/*
>>> +		 * Cache support for v7m is optional, so can be treated as
>>> +		 * coherent if no cache has been detected. Note that it is not
>>> +		 * enough to check if MPU is in use or not since in absense of
>>> +		 * MPU system memory map is used.
>>> +		 */
>>> +		dev->archdata.dma_coherent = (cacheid) ? coherent : true;
>>> +	} else {
>>> +		/*
>>> +		 * Assume coherent DMA in case MMU/MPU has not been set up.
>>> +		 */
>>> +		dev->archdata.dma_coherent = (get_cr() & CR_M) ? coherent : true;
>>> +	}
>>> +
>>> +	dma_ops = arm_nommu_get_dma_map_ops(dev->archdata.dma_coherent);
>>> +
>>> +	set_dma_ops(dev, dma_ops);
>>> +}
>>> +
>>> +void arch_teardown_dma_ops(struct device *dev)
>>> +{
>>> +}
>>> +
>>> +int dma_supported(struct device *dev, u64 mask)
>>> +{
>>> +	return 1;
>>> +}
>>> +
>>> +EXPORT_SYMBOL(dma_supported);
>>> +
>>> +#define PREALLOC_DMA_DEBUG_ENTRIES	4096
>>> +
>>> +static int __init dma_debug_do_init(void)
>>> +{
>>> +	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
>>> +	return 0;
>>> +}
>>> +core_initcall(dma_debug_do_init);
>>>
>>
>>
> 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>
Vladimir Murzin Jan. 10, 2017, 11:40 a.m. UTC | #4
On 09/01/17 16:59, Robin Murphy wrote:
> On 09/01/17 16:51, Vladimir Murzin wrote:
>> Hi Robin,
>>
>> On 09/01/17 16:43, Robin Murphy wrote:
>>> Hi Vladimir,
>>>
>>> On 09/01/17 13:47, Vladimir Murzin wrote:
>>>> R/M classes of cpus can have memory covered by MPU which in turn might
>>>> configure RAM as Normal i.e. bufferable and cacheable. It breaks
>>>> dma_alloc_coherent() and friends, since data can get stuck in caches now
>>>> or be buffered.
>>>>
>>>> This patch factors out DMA support for NOMMU configuration into
>>>> separate entity which provides dedicated dma_ops. We have to handle
>>>> there several cases:
>>>> - configurations with MMU/MPU setup
>>>> - configurations without MMU/MPU setup
>>>> - special case for M-class, since caches and MPU there are optional
>>>>
>>>> In general we rely on default DMA area for coherent allocations or/and
>>>> per-device memory reserves suitable for coherent DMA, so if such
>>>> regions are set coherent allocations go from there.
>>>>
>>>> In case MMU/MPU was not set up we fall back to the normal page allocator for
>>>> DMA memory allocation.
>>>>
>>>> In case we run M-class cpus, for configuration without cache support
>>>> (like Cortex-M3/M4) dma operations are forced to be coherent and wired
>>>> with dma-noop (such decision is made based on cacheid global
>>>> variable); however, if caches are detected there and no DMA coherent
>>>> region is given (either default or per-device), dma is disallowed even
>>>> if MPU is not set - this is because M-class implements a system memory map
>>>> which defines part of the address space as Normal memory.
>>>>
>>>> Reported-by: Alexandre Torgue <alexandre.torgue@st.com>
>>>> Reported-by: Andras Szemzo <sza@esh.hu>
>>>> Signed-off-by: Vladimir Murzin <vladimir.murzin@arm.com>
>>>> ---
>>>>  arch/arm/include/asm/dma-mapping.h |   3 +-
>>>>  arch/arm/mm/Makefile               |   5 +-
>>>>  arch/arm/mm/dma-mapping-nommu.c    | 252 +++++++++++++++++++++++++++++++++++++
>>>>  3 files changed, 256 insertions(+), 4 deletions(-)
>>>>  create mode 100644 arch/arm/mm/dma-mapping-nommu.c
>>>>
>>>> diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
>>>> index bf02dbd..559faad 100644
>>>> --- a/arch/arm/include/asm/dma-mapping.h
>>>> +++ b/arch/arm/include/asm/dma-mapping.h
>>>> @@ -20,7 +20,8 @@ static inline struct dma_map_ops *__generic_dma_ops(struct device *dev)
>>>>  {
>>>>  	if (dev && dev->archdata.dma_ops)
>>>>  		return dev->archdata.dma_ops;
>>>> -	return &arm_dma_ops;
>>>> +
>>>> +	return IS_ENABLED(CONFIG_MMU) ? &arm_dma_ops : &dma_noop_ops;
>>>>  }
>>>>  
>>>>  static inline struct dma_map_ops *get_dma_ops(struct device *dev)
>>>> diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
>>>> index 2ac7988..5796357 100644
>>>> --- a/arch/arm/mm/Makefile
>>>> +++ b/arch/arm/mm/Makefile
>>>> @@ -2,9 +2,8 @@
>>>>  # Makefile for the linux arm-specific parts of the memory manager.
>>>>  #
>>>>  
>>>> -obj-y				:= dma-mapping.o extable.o fault.o init.o \
>>>> -				   iomap.o
>>>> -
>>>> +obj-y				:= extable.o fault.o init.o iomap.o
>>>> +obj-y				+= dma-mapping$(MMUEXT).o
>>>>  obj-$(CONFIG_MMU)		+= fault-armv.o flush.o idmap.o ioremap.o \
>>>>  				   mmap.o pgd.o mmu.o pageattr.o
>>>>  
>>>> diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c
>>>> new file mode 100644
>>>> index 0000000..a5c50fb
>>>> --- /dev/null
>>>> +++ b/arch/arm/mm/dma-mapping-nommu.c
>>>> @@ -0,0 +1,252 @@
>>>> +/*
>>>> + *  Based on linux/arch/arm/mm/dma-mapping.c
>>>> + *
>>>> + *  Copyright (C) 2000-2004 Russell King
>>>> + *
>>>> + * This program is free software; you can redistribute it and/or modify
>>>> + * it under the terms of the GNU General Public License version 2 as
>>>> + * published by the Free Software Foundation.
>>>> + *
>>>> + */
>>>> +
>>>> +#include <linux/export.h>
>>>> +#include <linux/mm.h>
>>>> +#include <linux/dma-mapping.h>
>>>> +#include <linux/scatterlist.h>
>>>> +
>>>> +#include <asm/cachetype.h>
>>>> +#include <asm/cacheflush.h>
>>>> +#include <asm/outercache.h>
>>>> +#include <asm/cp15.h>
>>>> +
>>>> +#include "dma.h"
>>>> +
>>>> +/*
>>>> + *  dma_noop_ops is used if
>>>> + *   - MMU/MPU is off
>>>> + *   - cpu is v7m w/o cache support
>>>> + *   - device is coherent
>>>> + *  otherwise arm_nommu_dma_ops is used.
>>>> + *
>>>> + *  arm_nommu_dma_ops rely on consistent DMA memory (please, refer to
>>>> + *  [1] on how to declare such memory).
>>>> + *
>>>> + *  [1] Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt
>>>> + */
>>>> +
>>>> +static void *arm_nommu_dma_alloc(struct device *dev, size_t size,
>>>> +				 dma_addr_t *dma_handle, gfp_t gfp,
>>>> +				 unsigned long attrs)
>>>> +
>>>> +{
>>>> +	struct dma_map_ops *ops = &dma_noop_ops;
>>>> +
>>>> +	/*
>>>> +	 * We are here because:
>>>> +	 * - no consistent DMA region has been defined, so we can't
>>>> +	 *   continue.
>>>> +	 * - there is no space left in consistent DMA region, so we
>>>> +	 *   only can fallback to generic allocator if we are
>>>> +	 *   advertised that consistency is not required.
>>>> +	 */
>>>> +
>>>> +	if (attrs & DMA_ATTR_NON_CONSISTENT)
>>>> +		return ops->alloc(dev, size, dma_handle, gfp, attrs);
>>>> +
>>>> +	WARN_ON_ONCE(1);
>>>> +	return NULL;
>>>> +}
>>>> +
>>>> +static void arm_nommu_dma_free(struct device *dev, size_t size,
>>>> +			       void *cpu_addr, dma_addr_t dma_addr,
>>>> +			       unsigned long attrs)
>>>> +{
>>>> +	struct dma_map_ops *ops = &dma_noop_ops;
>>>> +
>>>> +	if (attrs & DMA_ATTR_NON_CONSISTENT)
>>>> +		ops->free(dev, size, cpu_addr, dma_addr, attrs);
>>>> +
>>>> +	WARN_ON_ONCE(1);
>>>> +	return;
>>>> +}
>>>> +
>>>> +static int arm_nommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
>>>> +			      void *cpu_addr, dma_addr_t dma_addr, size_t size,
>>>> +			      unsigned long attrs)
>>>> +{
>>>> +	struct dma_map_ops *ops = &dma_noop_ops;
>>>> +	int ret;
>>>> +
>>>> +	if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret))
>>>> +		return ret;
>>>> +
>>>> +	if (attrs & DMA_ATTR_NON_CONSISTENT)
>>>> +		return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
>>>> +
>>>> +	WARN_ON_ONCE(1);
>>>> +	return -ENXIO;
>>>> +}
>>>> +
>>>> +static void __dma_page_cpu_to_dev(dma_addr_t handle, size_t size,
>>>> +				  enum dma_data_direction dir)
>>>> +{
>>>> +	dmac_unmap_area(__va(handle), size, dir);
>>>> +
>>>> +	if (dir == DMA_FROM_DEVICE)
>>>> +		outer_inv_range(handle, handle + size);
>>>> +	else
>>>> +		outer_clean_range(handle, handle + size);
>>>> +}
>>>> +
>>>> +static void __dma_page_dev_to_cpu(dma_addr_t handle, size_t size,
>>>> +				  enum dma_data_direction dir)
>>>> +{
>>>> +	if (dir != DMA_TO_DEVICE) {
>>>> +		outer_inv_range(handle, handle + size);
>>>> +		dmac_unmap_area(__va(handle), size, dir);
>>>> +	}
>>>> +}
>>>
>>> Nit: I appreciate that the situation here makes it OK by construction,
>>> but CPU cache maintenance on a DMA address just looks *so* wrong :)
>>> Could we pass either the "virtual" or physical version of the address as
>>> the argument to these helpers so that the code looks less crazy at a glance?
>>
>> Something like below?
>>
>> static void __dma_page_dev_to_cpu(dma_addr_t paddr, size_t size,
>                                     ^
> I meant more in terms of this being a const void* or phys_addr_t ;)
> 

Fixed locally with "phys_addr_t".

>> 				  enum dma_data_direction dir)
>> {
>> 	if (dir != DMA_TO_DEVICE) {
>> 		outer_inv_range(paddr, paddr + size);
>> 		dmac_unmap_area(__va(paddr), size, dir);
>> 	}
>>
>> Btw, thanks for having a look!
> 
> Otherwise, I think the rest of the series looks OK, thanks for
> respinning it.

I'll wait a while for more feedback and testing before submitting an updated
version.

Cheers
Vladimir

> 
> Robin.
> 
>> Cheers
>> Vladimir
>>
>>>
>>> Robin.
>>>
>>>> +static dma_addr_t arm_nommu_dma_map_page(struct device *dev, struct page *page,
>>>> +					 unsigned long offset, size_t size,
>>>> +					 enum dma_data_direction dir,
>>>> +					 unsigned long attrs)
>>>> +{
>>>> +	dma_addr_t handle = page_to_phys(page) + offset;
>>>> +
>>>> +	__dma_page_cpu_to_dev(handle, size, dir);
>>>> +
>>>> +	return handle;
>>>> +}
>>>> +
>>>> +static void arm_nommu_dma_unmap_page(struct device *dev, dma_addr_t handle,
>>>> +				     size_t size, enum dma_data_direction dir,
>>>> +				     unsigned long attrs)
>>>> +{
>>>> +	__dma_page_dev_to_cpu(handle, size, dir);
>>>> +}
>>>> +
>>>> +
>>>> +static int arm_nommu_dma_map_sg(struct device *dev, struct scatterlist *sgl,
>>>> +				int nents, enum dma_data_direction dir,
>>>> +				unsigned long attrs)
>>>> +{
>>>> +	int i;
>>>> +	struct scatterlist *sg;
>>>> +
>>>> +	for_each_sg(sgl, sg, nents, i) {
>>>> +		sg_dma_address(sg) = sg_phys(sg);
>>>> +		sg_dma_len(sg) = sg->length;
>>>> +		__dma_page_cpu_to_dev(sg_dma_address(sg), sg_dma_len(sg), dir);
>>>> +	}
>>>> +
>>>> +	return nents;
>>>> +}
>>>> +
>>>> +static void arm_nommu_dma_unmap_sg(struct device *dev, struct scatterlist *sgl,
>>>> +				   int nents, enum dma_data_direction dir,
>>>> +				   unsigned long attrs)
>>>> +{
>>>> +	struct scatterlist *sg;
>>>> +	int i;
>>>> +
>>>> +	for_each_sg(sgl, sg, nents, i)
>>>> +		__dma_page_dev_to_cpu(sg_dma_address(sg), sg_dma_len(sg), dir);
>>>> +}
>>>> +
>>>> +static void arm_nommu_dma_sync_single_for_device(struct device *dev,
>>>> +		dma_addr_t handle, size_t size, enum dma_data_direction dir)
>>>> +{
>>>> +	__dma_page_cpu_to_dev(handle, size, dir);
>>>> +}
>>>> +
>>>> +static void arm_nommu_dma_sync_single_for_cpu(struct device *dev,
>>>> +		dma_addr_t handle, size_t size, enum dma_data_direction dir)
>>>> +{
>>>> +	__dma_page_cpu_to_dev(handle, size, dir);
>>>> +}
>>>> +
>>>> +static void arm_nommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
>>>> +					     int nents, enum dma_data_direction dir)
>>>> +{
>>>> +	struct scatterlist *sg;
>>>> +	int i;
>>>> +
>>>> +	for_each_sg(sgl, sg, nents, i)
>>>> +		__dma_page_cpu_to_dev(sg_dma_address(sg), sg_dma_len(sg), dir);
>>>> +}
>>>> +
>>>> +static void arm_nommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
>>>> +					  int nents, enum dma_data_direction dir)
>>>> +{
>>>> +	struct scatterlist *sg;
>>>> +	int i;
>>>> +
>>>> +	for_each_sg(sgl, sg, nents, i)
>>>> +		__dma_page_dev_to_cpu(sg_dma_address(sg), sg_dma_len(sg), dir);
>>>> +}
>>>> +
>>>> +struct dma_map_ops arm_nommu_dma_ops = {
>>>> +	.alloc			= arm_nommu_dma_alloc,
>>>> +	.free			= arm_nommu_dma_free,
>>>> +	.mmap			= arm_nommu_dma_mmap,
>>>> +	.map_page		= arm_nommu_dma_map_page,
>>>> +	.unmap_page		= arm_nommu_dma_unmap_page,
>>>> +	.map_sg			= arm_nommu_dma_map_sg,
>>>> +	.unmap_sg		= arm_nommu_dma_unmap_sg,
>>>> +	.sync_single_for_device	= arm_nommu_dma_sync_single_for_device,
>>>> +	.sync_single_for_cpu	= arm_nommu_dma_sync_single_for_cpu,
>>>> +	.sync_sg_for_device	= arm_nommu_dma_sync_sg_for_device,
>>>> +	.sync_sg_for_cpu	= arm_nommu_dma_sync_sg_for_cpu,
>>>> +};
>>>> +EXPORT_SYMBOL(arm_nommu_dma_ops);
>>>> +
>>>> +static struct dma_map_ops *arm_nommu_get_dma_map_ops(bool coherent)
>>>> +{
>>>> +	return coherent ? &dma_noop_ops : &arm_nommu_dma_ops;
>>>> +}
>>>> +
>>>> +void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
>>>> +			const struct iommu_ops *iommu, bool coherent)
>>>> +{
>>>> +	struct dma_map_ops *dma_ops;
>>>> +
>>>> +	if (IS_ENABLED(CONFIG_CPU_V7M)) {
>>>> +		/*
>>>> +		 * Cache support for v7m is optional, so can be treated as
>>>> +		 * coherent if no cache has been detected. Note that it is not
>>>> +		 * enough to check if MPU is in use or not since in absense of
>>>> +		 * MPU system memory map is used.
>>>> +		 */
>>>> +		dev->archdata.dma_coherent = (cacheid) ? coherent : true;
>>>> +	} else {
>>>> +		/*
>>>> +		 * Assume coherent DMA in case MMU/MPU has not been set up.
>>>> +		 */
>>>> +		dev->archdata.dma_coherent = (get_cr() & CR_M) ? coherent : true;
>>>> +	}
>>>> +
>>>> +	dma_ops = arm_nommu_get_dma_map_ops(dev->archdata.dma_coherent);
>>>> +
>>>> +	set_dma_ops(dev, dma_ops);
>>>> +}
>>>> +
>>>> +void arch_teardown_dma_ops(struct device *dev)
>>>> +{
>>>> +}
>>>> +
>>>> +int dma_supported(struct device *dev, u64 mask)
>>>> +{
>>>> +	return 1;
>>>> +}
>>>> +
>>>> +EXPORT_SYMBOL(dma_supported);
>>>> +
>>>> +#define PREALLOC_DMA_DEBUG_ENTRIES	4096
>>>> +
>>>> +static int __init dma_debug_do_init(void)
>>>> +{
>>>> +	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
>>>> +	return 0;
>>>> +}
>>>> +core_initcall(dma_debug_do_init);
>>>>
>>>
>>>
>>
>>
>> _______________________________________________
>> linux-arm-kernel mailing list
>> linux-arm-kernel@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>>
> 
>
diff mbox

Patch

diff --git a/arch/arm/include/asm/dma-mapping.h b/arch/arm/include/asm/dma-mapping.h
index bf02dbd..559faad 100644
--- a/arch/arm/include/asm/dma-mapping.h
+++ b/arch/arm/include/asm/dma-mapping.h
@@ -20,7 +20,8 @@  static inline struct dma_map_ops *__generic_dma_ops(struct device *dev)
 {
 	if (dev && dev->archdata.dma_ops)
 		return dev->archdata.dma_ops;
-	return &arm_dma_ops;
+
+	return IS_ENABLED(CONFIG_MMU) ? &arm_dma_ops : &dma_noop_ops;
 }
 
 static inline struct dma_map_ops *get_dma_ops(struct device *dev)
diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
index 2ac7988..5796357 100644
--- a/arch/arm/mm/Makefile
+++ b/arch/arm/mm/Makefile
@@ -2,9 +2,8 @@ 
 # Makefile for the linux arm-specific parts of the memory manager.
 #
 
-obj-y				:= dma-mapping.o extable.o fault.o init.o \
-				   iomap.o
-
+obj-y				:= extable.o fault.o init.o iomap.o
+obj-y				+= dma-mapping$(MMUEXT).o
 obj-$(CONFIG_MMU)		+= fault-armv.o flush.o idmap.o ioremap.o \
 				   mmap.o pgd.o mmu.o pageattr.o
 
diff --git a/arch/arm/mm/dma-mapping-nommu.c b/arch/arm/mm/dma-mapping-nommu.c
new file mode 100644
index 0000000..a5c50fb
--- /dev/null
+++ b/arch/arm/mm/dma-mapping-nommu.c
@@ -0,0 +1,252 @@ 
+/*
+ *  Based on linux/arch/arm/mm/dma-mapping.c
+ *
+ *  Copyright (C) 2000-2004 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/export.h>
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/scatterlist.h>
+
+#include <asm/cachetype.h>
+#include <asm/cacheflush.h>
+#include <asm/outercache.h>
+#include <asm/cp15.h>
+
+#include "dma.h"
+
+/*
+ *  dma_noop_ops is used if
+ *   - MMU/MPU is off
+ *   - cpu is v7m w/o cache support
+ *   - device is coherent
+ *  otherwise arm_nommu_dma_ops is used.
+ *
+ *  arm_nommu_dma_ops rely on consistent DMA memory (please, refer to
+ *  [1] on how to declare such memory).
+ *
+ *  [1] Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt
+ */
+
+static void *arm_nommu_dma_alloc(struct device *dev, size_t size,
+				 dma_addr_t *dma_handle, gfp_t gfp,
+				 unsigned long attrs)
+
+{
+	struct dma_map_ops *ops = &dma_noop_ops;
+
+	/*
+	 * We are here because:
+	 * - no consistent DMA region has been defined, so we can't
+	 *   continue.
+	 * - there is no space left in consistent DMA region, so we
+	 *   only can fallback to generic allocator if we are
+	 *   advertised that consistency is not required.
+	 */
+
+	if (attrs & DMA_ATTR_NON_CONSISTENT)
+		return ops->alloc(dev, size, dma_handle, gfp, attrs);
+
+	WARN_ON_ONCE(1);
+	return NULL;
+}
+
+static void arm_nommu_dma_free(struct device *dev, size_t size,
+			       void *cpu_addr, dma_addr_t dma_addr,
+			       unsigned long attrs)
+{
+	struct dma_map_ops *ops = &dma_noop_ops;
+
+	if (attrs & DMA_ATTR_NON_CONSISTENT)
+		ops->free(dev, size, cpu_addr, dma_addr, attrs);
+
+	WARN_ON_ONCE(1);
+	return;
+}
+
+static int arm_nommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
+			      void *cpu_addr, dma_addr_t dma_addr, size_t size,
+			      unsigned long attrs)
+{
+	struct dma_map_ops *ops = &dma_noop_ops;
+	int ret;
+
+	if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret))
+		return ret;
+
+	if (attrs & DMA_ATTR_NON_CONSISTENT)
+		return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs);
+
+	WARN_ON_ONCE(1);
+	return -ENXIO;
+}
+
+static void __dma_page_cpu_to_dev(dma_addr_t handle, size_t size,
+				  enum dma_data_direction dir)
+{
+	dmac_unmap_area(__va(handle), size, dir);
+
+	if (dir == DMA_FROM_DEVICE)
+		outer_inv_range(handle, handle + size);
+	else
+		outer_clean_range(handle, handle + size);
+}
+
+static void __dma_page_dev_to_cpu(dma_addr_t handle, size_t size,
+				  enum dma_data_direction dir)
+{
+	if (dir != DMA_TO_DEVICE) {
+		outer_inv_range(handle, handle + size);
+		dmac_unmap_area(__va(handle), size, dir);
+	}
+}
+
+static dma_addr_t arm_nommu_dma_map_page(struct device *dev, struct page *page,
+					 unsigned long offset, size_t size,
+					 enum dma_data_direction dir,
+					 unsigned long attrs)
+{
+	dma_addr_t handle = page_to_phys(page) + offset;
+
+	__dma_page_cpu_to_dev(handle, size, dir);
+
+	return handle;
+}
+
+static void arm_nommu_dma_unmap_page(struct device *dev, dma_addr_t handle,
+				     size_t size, enum dma_data_direction dir,
+				     unsigned long attrs)
+{
+	__dma_page_dev_to_cpu(handle, size, dir);
+}
+
+
+static int arm_nommu_dma_map_sg(struct device *dev, struct scatterlist *sgl,
+				int nents, enum dma_data_direction dir,
+				unsigned long attrs)
+{
+	int i;
+	struct scatterlist *sg;
+
+	for_each_sg(sgl, sg, nents, i) {
+		sg_dma_address(sg) = sg_phys(sg);
+		sg_dma_len(sg) = sg->length;
+		__dma_page_cpu_to_dev(sg_dma_address(sg), sg_dma_len(sg), dir);
+	}
+
+	return nents;
+}
+
+static void arm_nommu_dma_unmap_sg(struct device *dev, struct scatterlist *sgl,
+				   int nents, enum dma_data_direction dir,
+				   unsigned long attrs)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nents, i)
+		__dma_page_dev_to_cpu(sg_dma_address(sg), sg_dma_len(sg), dir);
+}
+
+static void arm_nommu_dma_sync_single_for_device(struct device *dev,
+		dma_addr_t handle, size_t size, enum dma_data_direction dir)
+{
+	__dma_page_cpu_to_dev(handle, size, dir);
+}
+
+static void arm_nommu_dma_sync_single_for_cpu(struct device *dev,
+		dma_addr_t handle, size_t size, enum dma_data_direction dir)
+{
+	__dma_page_cpu_to_dev(handle, size, dir);
+}
+
+static void arm_nommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl,
+					     int nents, enum dma_data_direction dir)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nents, i)
+		__dma_page_cpu_to_dev(sg_dma_address(sg), sg_dma_len(sg), dir);
+}
+
+static void arm_nommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl,
+					  int nents, enum dma_data_direction dir)
+{
+	struct scatterlist *sg;
+	int i;
+
+	for_each_sg(sgl, sg, nents, i)
+		__dma_page_dev_to_cpu(sg_dma_address(sg), sg_dma_len(sg), dir);
+}
+
+struct dma_map_ops arm_nommu_dma_ops = {
+	.alloc			= arm_nommu_dma_alloc,
+	.free			= arm_nommu_dma_free,
+	.mmap			= arm_nommu_dma_mmap,
+	.map_page		= arm_nommu_dma_map_page,
+	.unmap_page		= arm_nommu_dma_unmap_page,
+	.map_sg			= arm_nommu_dma_map_sg,
+	.unmap_sg		= arm_nommu_dma_unmap_sg,
+	.sync_single_for_device	= arm_nommu_dma_sync_single_for_device,
+	.sync_single_for_cpu	= arm_nommu_dma_sync_single_for_cpu,
+	.sync_sg_for_device	= arm_nommu_dma_sync_sg_for_device,
+	.sync_sg_for_cpu	= arm_nommu_dma_sync_sg_for_cpu,
+};
+EXPORT_SYMBOL(arm_nommu_dma_ops);
+
+static struct dma_map_ops *arm_nommu_get_dma_map_ops(bool coherent)
+{
+	return coherent ? &dma_noop_ops : &arm_nommu_dma_ops;
+}
+
+void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
+			const struct iommu_ops *iommu, bool coherent)
+{
+	struct dma_map_ops *dma_ops;
+
+	if (IS_ENABLED(CONFIG_CPU_V7M)) {
+		/*
+		 * Cache support for v7m is optional, so can be treated as
+		 * coherent if no cache has been detected. Note that it is not
+		 * enough to check if MPU is in use or not since in absense of
+		 * MPU system memory map is used.
+		 */
+		dev->archdata.dma_coherent = (cacheid) ? coherent : true;
+	} else {
+		/*
+		 * Assume coherent DMA in case MMU/MPU has not been set up.
+		 */
+		dev->archdata.dma_coherent = (get_cr() & CR_M) ? coherent : true;
+	}
+
+	dma_ops = arm_nommu_get_dma_map_ops(dev->archdata.dma_coherent);
+
+	set_dma_ops(dev, dma_ops);
+}
+
+void arch_teardown_dma_ops(struct device *dev)
+{
+}
+
+int dma_supported(struct device *dev, u64 mask)
+{
+	return 1;
+}
+
+EXPORT_SYMBOL(dma_supported);
+
+#define PREALLOC_DMA_DEBUG_ENTRIES	4096
+
+static int __init dma_debug_do_init(void)
+{
+	dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
+	return 0;
+}
+core_initcall(dma_debug_do_init);