[v5,1/3] iommu: Implement common IOMMU ops for DMA mapping

Message ID 6ce6b501501f611297ae0eae31e07b0d2060eaae.1438362603.git.robin.murphy@arm.com (mailing list archive)
State New, archived

Commit Message

Robin Murphy July 31, 2015, 5:18 p.m. UTC
Taking inspiration from the existing arch/arm code, break out some
generic functions to interface the DMA-API to the IOMMU-API. This will
do the bulk of the heavy lifting for IOMMU-backed dma-mapping.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---
 drivers/iommu/Kconfig     |   7 +
 drivers/iommu/Makefile    |   1 +
 drivers/iommu/dma-iommu.c | 534 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/dma-iommu.h |  84 ++++++++
 include/linux/iommu.h     |   1 +
 5 files changed, 627 insertions(+)
 create mode 100644 drivers/iommu/dma-iommu.c
 create mode 100644 include/linux/dma-iommu.h

Comments

Catalin Marinas Aug. 3, 2015, 5:40 p.m. UTC | #1
On Fri, Jul 31, 2015 at 06:18:27PM +0100, Robin Murphy wrote:
> Taking inspiration from the existing arch/arm code, break out some
> generic functions to interface the DMA-API to the IOMMU-API. This will
> do the bulk of the heavy lifting for IOMMU-backed dma-mapping.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>

Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Will Deacon Aug. 6, 2015, 3:23 p.m. UTC | #2
Joerg,

On Fri, Jul 31, 2015 at 06:18:27PM +0100, Robin Murphy wrote:
> Taking inspiration from the existing arch/arm code, break out some
> generic functions to interface the DMA-API to the IOMMU-API. This will
> do the bulk of the heavy lifting for IOMMU-backed dma-mapping.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
>  drivers/iommu/Kconfig     |   7 +
>  drivers/iommu/Makefile    |   1 +
>  drivers/iommu/dma-iommu.c | 534 ++++++++++++++++++++++++++++++++++++++++++++++
>  include/linux/dma-iommu.h |  84 ++++++++
>  include/linux/iommu.h     |   1 +
>  5 files changed, 627 insertions(+)
>  create mode 100644 drivers/iommu/dma-iommu.c
>  create mode 100644 include/linux/dma-iommu.h

We're quite keen to get this in for arm64, since we're without IOMMU DMA
ops and need to get something upstream. Do you think this is likely to
be merged for 4.3/4.4 or would we be better off doing our own
arch-private implementation instead?

Sorry to pester, but we've got people basing their patches and products
on this and I don't want to end up having to support out-of-tree code.

Cheers,

Will
Joerg Roedel Aug. 6, 2015, 5:54 p.m. UTC | #3
Hi Will,

On Thu, Aug 06, 2015 at 04:23:27PM +0100, Will Deacon wrote:
> We're quite keen to get this in for arm64, since we're without IOMMU DMA
> ops and need to get something upstream. Do you think this is likely to
> be merged for 4.3/4.4 or would we be better off doing our own
> arch-private implementation instead?
> 
> Sorry to pester, but we've got people basing their patches and products
> on this and I don't want to end up having to support out-of-tree code.

I definitely plan to merge it soon, but I'm not sure if it's getting into
v4.3. There are a few things I still have questions about or that need
rework, but I am sure we can work this out.


	Joerg
Joerg Roedel Aug. 7, 2015, 8:42 a.m. UTC | #4
On Fri, Jul 31, 2015 at 06:18:27PM +0100, Robin Murphy wrote:
> +int iommu_get_dma_cookie(struct iommu_domain *domain)
> +{
> +	struct iova_domain *iovad;
> +
> +	if (domain->dma_api_cookie)
> +		return -EEXIST;

Why do you call that dma_api_cookie? It is just a pointer to an iova
allocator, so you could simply name it as such, e.g. domain->iova.

> +static struct iova *__alloc_iova(struct iova_domain *iovad, size_t size,
> +		dma_addr_t dma_limit)

I think you also need a struct device here to take segment boundary and
dma_mask into account.
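
Roughly what I have in mind - just an untested sketch to illustrate the
signature change, not a proposed patch:

	static struct iova *__alloc_iova(struct iova_domain *iovad,
			struct device *dev, size_t size, dma_addr_t dma_limit)
	{
		unsigned long shift = iova_shift(iovad);
		unsigned long length = iova_align(iovad, size) >> shift;

		/* Don't hand out IOVAs the device can't actually address */
		dma_limit = min_t(dma_addr_t, dma_limit, dma_get_mask(dev));

		/*
		 * Size-alignment alone doesn't cover dma_parms; the segment
		 * boundary still needs handling when the sg list is laid out.
		 */
		return alloc_iova(iovad, length, dma_limit >> shift, true);
	}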

> +/* The IOVA allocator knows what we mapped, so just unmap whatever that was */
> +static void __iommu_dma_unmap(struct iommu_domain *domain, dma_addr_t dma_addr)
> +{
> +	struct iova_domain *iovad = domain->dma_api_cookie;
> +	unsigned long shift = iova_shift(iovad);
> +	unsigned long pfn = dma_addr >> shift;
> +	struct iova *iova = find_iova(iovad, pfn);
> +	size_t size = iova_size(iova) << shift;
> +
> +	/* ...and if we can't, then something is horribly, horribly wrong */
> +	BUG_ON(iommu_unmap(domain, pfn << shift, size) < size);

This is a WARN_ON at most, not a BUG_ON condition, especially since this
type of bug is also caught by the DMA API debugging code.
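
I.e. something like this - untested, just to show what I mean:

	if (WARN_ON(iommu_unmap(domain, pfn << shift, size) < size))
		return;
	__free_iova(iovad, iova);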

> +static struct page **__iommu_dma_alloc_pages(unsigned int count, gfp_t gfp)
> +{
> +	struct page **pages;
> +	unsigned int i = 0, array_size = count * sizeof(*pages);
> +
> +	if (array_size <= PAGE_SIZE)
> +		pages = kzalloc(array_size, GFP_KERNEL);
> +	else
> +		pages = vzalloc(array_size);
> +	if (!pages)
> +		return NULL;
> +
> +	/* IOMMU can map any pages, so highmem can also be used here */
> +	gfp |= __GFP_NOWARN | __GFP_HIGHMEM;
> +
> +	while (count) {
> +		struct page *page = NULL;
> +		int j, order = __fls(count);
> +
> +		/*
> +		 * Higher-order allocations are a convenience rather
> +		 * than a necessity, hence using __GFP_NORETRY until
> +		 * falling back to single-page allocations.
> +		 */
> +		for (order = min(order, MAX_ORDER); order > 0; order--) {
> +			page = alloc_pages(gfp | __GFP_NORETRY, order);
> +			if (!page)
> +				continue;
> +			if (PageCompound(page)) {
> +				if (!split_huge_page(page))
> +					break;
> +				__free_pages(page, order);
> +			} else {
> +				split_page(page, order);
> +				break;
> +			}
> +		}
> +		if (!page)
> +			page = alloc_page(gfp);
> +		if (!page) {
> +			__iommu_dma_free_pages(pages, i);
> +			return NULL;
> +		}
> +		j = 1 << order;
> +		count -= j;
> +		while (j--)
> +			pages[i++] = page++;
> +	}
> +	return pages;
> +}

Hmm, most DMA API implementations just try to allocate a big enough
region from the page allocator. Is it implemented differently here to
avoid the use of CMA?


	Joerg
Robin Murphy Aug. 7, 2015, 1:38 p.m. UTC | #5
Hi Joerg,

Thanks for taking a look,

On 07/08/15 09:42, Joerg Roedel wrote:
> On Fri, Jul 31, 2015 at 06:18:27PM +0100, Robin Murphy wrote:
>> +int iommu_get_dma_cookie(struct iommu_domain *domain)
>> +{
>> +	struct iova_domain *iovad;
>> +
>> +	if (domain->dma_api_cookie)
>> +		return -EEXIST;
>
> Why do you call that dma_api_cookie? It is just a pointer to an iova
> allocator, you can just name it as such, like domain->iova.

Sure, it was more the case that since it had to be in the top-level 
generic domain structure, I didn't want it to be too 
implementation-specific. I figured this was a reasonable compromise that 
wouldn't be a waste of space for implementations with different 
per-domain DMA API data - e.g. the AMD IOMMU driver could then make use 
of protection_domain->domain->dma_api_cookie instead of having 
protection_domain->priv, but that's a patch that wouldn't belong in this 
series anyway.

If you really hate that idea, then yeah, let's just call it iova and 
consider if factoring out redundancy is still applicable later.
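
For reference, the intended driver-side hookup amounts to no more than
this (the my_* names are invented purely for illustration):

	struct my_domain {
		/* ...driver-specific page table state, etc... */
		struct iommu_domain domain;
	};

	static struct iommu_domain *my_domain_alloc(unsigned type)
	{
		struct my_domain *md;

		if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA)
			return NULL;

		md = kzalloc(sizeof(*md), GFP_KERNEL);
		if (!md)
			return NULL;

		/* DMA API domains carry the IOVA allocator as their cookie */
		if (type == IOMMU_DOMAIN_DMA &&
		    iommu_get_dma_cookie(&md->domain)) {
			kfree(md);
			return NULL;
		}
		return &md->domain;
	}

	static void my_domain_free(struct iommu_domain *domain)
	{
		struct my_domain *md = container_of(domain, struct my_domain,
						    domain);

		iommu_put_dma_cookie(domain);
		kfree(md);
	}

so from the driver's point of view the cookie really is just opaque
per-domain DMA API state.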

>> +static struct iova *__alloc_iova(struct iova_domain *iovad, size_t size,
>> +		dma_addr_t dma_limit)
>
> I think you also need a struct device here to take segment boundary and
> dma_mask into account.

To the best of my understanding, those limits are only relevant when 
actually handing off a scatterlist to a client device doing hardware 
scatter-gather operations, so it's not so much the IOVA allocation that 
matters, but where the segments lie within it when handling dma_map_sg.

However, you do raise a good point - in the current "map everything 
consecutively" approach, if there is a non-power-of-2-sized segment in 
the middle of a scatterlist, then subsequent segments could possibly end 
up inadvertently straddling a boundary. That needs handling in 
iommu_dma_map_sg; I'll fix it appropriately.
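
Something along these lines, perhaps (untested, and only one way of doing
it): pad the previous segment so that each new segment starts at an IOVA
naturally aligned to its own power-of-two-rounded length, which by
construction stops any segment straddling a boundary mask larger than
itself:

	if (prev) {
		size_t pad_len = roundup_pow_of_two(s_length);

		pad_len = (pad_len - iova_len) & (pad_len - 1);
		prev->length += pad_len;
		iova_len += pad_len;
	}
	iova_len += s_length;
	prev = s;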

>> +/* The IOVA allocator knows what we mapped, so just unmap whatever that was */
>> +static void __iommu_dma_unmap(struct iommu_domain *domain, dma_addr_t dma_addr)
>> +{
>> +	struct iova_domain *iovad = domain->dma_api_cookie;
>> +	unsigned long shift = iova_shift(iovad);
>> +	unsigned long pfn = dma_addr >> shift;
>> +	struct iova *iova = find_iova(iovad, pfn);
>> +	size_t size = iova_size(iova) << shift;
>> +
>> +	/* ...and if we can't, then something is horribly, horribly wrong */
>> +	BUG_ON(iommu_unmap(domain, pfn << shift, size) < size);
>
> This is a WARN_ON at most, not a BUG_ON condition, especially since this
> type of bug is also catched with the dma-api debugging code.

Indeed, DMA_DEBUG will check that a driver is making DMA API calls to 
the arch code in the right way; this is a different check, to catch 
things like the arch code passing the wrong domain into this layer, or 
someone else having messed directly with the domain via the IOMMU API. 
If the iommu_unmap doesn't match the IOVA region we looked up, that 
means the IOMMU page tables have somehow become inconsistent with the 
IOVA allocator, so we are in an unrecoverable situation where we can no 
longer be sure what devices have access to. That's bad.

>> +static struct page **__iommu_dma_alloc_pages(unsigned int count, gfp_t gfp)
>> +{
>> +	struct page **pages;
>> +	unsigned int i = 0, array_size = count * sizeof(*pages);
>> +
>> +	if (array_size <= PAGE_SIZE)
>> +		pages = kzalloc(array_size, GFP_KERNEL);
>> +	else
>> +		pages = vzalloc(array_size);
>> +	if (!pages)
>> +		return NULL;
>> +
>> +	/* IOMMU can map any pages, so himem can also be used here */
>> +	gfp |= __GFP_NOWARN | __GFP_HIGHMEM;
>> +
>> +	while (count) {
>> +		struct page *page = NULL;
>> +		int j, order = __fls(count);
>> +
>> +		/*
>> +		 * Higher-order allocations are a convenience rather
>> +		 * than a necessity, hence using __GFP_NORETRY until
>> +		 * falling back to single-page allocations.
>> +		 */
>> +		for (order = min(order, MAX_ORDER); order > 0; order--) {
>> +			page = alloc_pages(gfp | __GFP_NORETRY, order);
>> +			if (!page)
>> +				continue;
>> +			if (PageCompound(page)) {
>> +				if (!split_huge_page(page))
>> +					break;
>> +				__free_pages(page, order);
>> +			} else {
>> +				split_page(page, order);
>> +				break;
>> +			}
>> +		}
>> +		if (!page)
>> +			page = alloc_page(gfp);
>> +		if (!page) {
>> +			__iommu_dma_free_pages(pages, i);
>> +			return NULL;
>> +		}
>> +		j = 1 << order;
>> +		count -= j;
>> +		while (j--)
>> +			pages[i++] = page++;
>> +	}
>> +	return pages;
>> +}
>
> Hmm, most DMA API implementations just try to allocate a big enough
> region from the page allocator. Is it implemented differently here to
> avoid the use of CMA?

AFAIK, yes (this is just a slight tidyup of the existing code that 
32-bit Exynos/Tegra/Rockchip/etc. devices are already using) - the 
display guys want increasingly massive contiguous allocations for 
framebuffers, layers, etc., so having IOMMU magic deal with that saves 
CMA for non-IOMMU devices that really need it.

Robin.
Joerg Roedel Aug. 11, 2015, 9:37 a.m. UTC | #6
On Fri, Aug 07, 2015 at 02:38:39PM +0100, Robin Murphy wrote:
> Indeed, DMA_DEBUG will check that a driver is making DMA API calls
> to the arch code in the right way; this is a different check, to
> catch things like the arch code passing the wrong domain into this
> layer, or someone else having messed directly with the domain via
> the IOMMU API. If the iommu_unmap doesn't match the IOVA region we
> looked up, that means the IOMMU page tables have somehow become
> inconsistent with the IOVA allocator, so we are in an unrecoverable
> situation where we can no longer be sure what devices have access
> to. That's bad.

Sure, but the BUG_ON would also trigger on things like a double-free,
which is not something that should take the whole machine down. A WARN_ON
for this is sufficient.

> AFAIK, yes (this is just a slight tidyup of the existing code that
> 32-bit Exynos/Tegra/Rockchip/etc. devices are already using) - the
> display guys want increasingly massive contiguous allocations for
> framebuffers, layers, etc., so having IOMMU magic deal with that
> saves CMA for non-IOMMU devices that really need it.

Makes sense, I thought about something similar for x86 too, to avoid the
high-order allocations we currently do. I guess the buffer will later be
mapped into the vmalloc space for the CPU?


	Joerg
Robin Murphy Aug. 11, 2015, 1:31 p.m. UTC | #7
Hi Joerg,

On 11/08/15 10:37, Joerg Roedel wrote:
> On Fri, Aug 07, 2015 at 02:38:39PM +0100, Robin Murphy wrote:
>> Indeed, DMA_DEBUG will check that a driver is making DMA API calls
>> to the arch code in the right way; this is a different check, to
>> catch things like the arch code passing the wrong domain into this
>> layer, or someone else having messed directly with the domain via
>> the IOMMU API. If the iommu_unmap doesn't match the IOVA region we
>> looked up, that means the IOMMU page tables have somehow become
>> inconsistent with the IOVA allocator, so we are in an unrecoverable
>> situation where we can no longer be sure what devices have access
>> to. That's bad.
>
> Sure, but the BUG_ON would also trigger on things like a double-free,
> which is bad to handle as a BUG_ON. A WARN_ON for this is sufficient.

Oh dear, it gets even better than that; in the case of a simple 
double-unmap where the IOVA is already free, we wouldn't even get as far 
as that check because we'd die calling iova_size(NULL). How on Earth did 
I get to v5 without spotting that? :(

Anyway, on reflection I think you're probably right; I've clearly been 
working on this for long enough to start falling into the "my thing is 
obviously more important than all the other things" trap.
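
At minimum that means a guard in __iommu_dma_unmap along these lines
(untested, as ever):

	struct iova *iova = find_iova(iovad, pfn);

	/* Catch a double-unmap or bogus address without oopsing */
	if (WARN_ON(!iova))
		return;

with the size calculation moved after the lookup rather than done in the
declaration.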

>> AFAIK, yes (this is just a slight tidyup of the existing code that
>> 32-bit Exynos/Tegra/Rockchip/etc. devices are already using) - the
>> display guys want increasingly massive contiguous allocations for
>> framebuffers, layers, etc., so having IOMMU magic deal with that
>> saves CMA for non-IOMMU devices that really need it.
>
> Makes sense, I thougt about something similar for x86 too to avoid the
> high-order allocations we currently do. I guess the buffer will later be
> mapped into the vmalloc space for the CPU?

Indeed - for non-coherent devices we have to remap all allocations 
(IOMMU or not) anyway in order to get a non-cacheable CPU mapping of the 
buffer, so having non-contiguous pages is no bother; for coherent 
devices we can just do the same thing but keep the vmalloc mapping 
cacheable. There's also the DMA_ATTR_NO_KERNEL_MAPPING case (e.g. GPU 
just wants a big buffer to render into and read back out again) where we 
wouldn't need a CPU address at all, although on arm64 vmalloc space is 
cheap enough that we've no plans to implement that at the moment.
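
As a rough sketch of what the arch side then does with the page array
(the helper below is made up for illustration; it's not part of this
series):

	static void *remap_dma_pages(struct page **pages, size_t size,
				     bool coherent)
	{
		pgprot_t prot = coherent ? PAGE_KERNEL :
				pgprot_writecombine(PAGE_KERNEL);

		/* Stitch the discontiguous pages into a single CPU mapping */
		return vmap(pages, PAGE_ALIGN(size) >> PAGE_SHIFT,
			    VM_MAP, prot);
	}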

Robin.

Patch

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 8a1bc38..4996dc3 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -48,6 +48,13 @@  config OF_IOMMU
        def_bool y
        depends on OF && IOMMU_API
 
+# IOMMU-agnostic DMA-mapping layer
+config IOMMU_DMA
+	bool
+	depends on NEED_SG_DMA_LENGTH
+	select IOMMU_API
+	select IOMMU_IOVA
+
 config FSL_PAMU
 	bool "Freescale IOMMU support"
 	depends on PPC32
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index dc6f511..45efa2a 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -1,6 +1,7 @@ 
 obj-$(CONFIG_IOMMU_API) += iommu.o
 obj-$(CONFIG_IOMMU_API) += iommu-traces.o
 obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
+obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
 obj-$(CONFIG_IOMMU_IOVA) += iova.o
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
new file mode 100644
index 0000000..f34fd46
--- /dev/null
+++ b/drivers/iommu/dma-iommu.c
@@ -0,0 +1,534 @@ 
+/*
+ * A fairly generic DMA-API to IOMMU-API glue layer.
+ *
+ * Copyright (C) 2014-2015 ARM Ltd.
+ *
+ * based in part on arch/arm/mm/dma-mapping.c:
+ * Copyright (C) 2000-2004 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/device.h>
+#include <linux/dma-iommu.h>
+#include <linux/huge_mm.h>
+#include <linux/iommu.h>
+#include <linux/iova.h>
+#include <linux/mm.h>
+
+int iommu_dma_init(void)
+{
+	return iova_cache_get();
+}
+
+/**
+ * iommu_get_dma_cookie - Acquire DMA-API resources for a domain
+ * @domain: IOMMU domain to prepare for DMA-API usage
+ *
+ * IOMMU drivers should normally call this from their domain_alloc
+ * callback when domain->type == IOMMU_DOMAIN_DMA.
+ */
+int iommu_get_dma_cookie(struct iommu_domain *domain)
+{
+	struct iova_domain *iovad;
+
+	if (domain->dma_api_cookie)
+		return -EEXIST;
+
+	iovad = kzalloc(sizeof(*iovad), GFP_KERNEL);
+	domain->dma_api_cookie = iovad;
+
+	return iovad ? 0 : -ENOMEM;
+}
+EXPORT_SYMBOL(iommu_get_dma_cookie);
+
+/**
+ * iommu_put_dma_cookie - Release a domain's DMA mapping resources
+ * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
+ *
+ * IOMMU drivers should normally call this from their domain_free callback.
+ */
+void iommu_put_dma_cookie(struct iommu_domain *domain)
+{
+	struct iova_domain *iovad = domain->dma_api_cookie;
+
+	if (!iovad)
+		return;
+
+	put_iova_domain(iovad);
+	kfree(iovad);
+	domain->dma_api_cookie = NULL;
+}
+EXPORT_SYMBOL(iommu_put_dma_cookie);
+
+/**
+ * iommu_dma_init_domain - Initialise a DMA mapping domain
+ * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
+ * @base: IOVA at which the mappable address space starts
+ * @size: Size of IOVA space
+ *
+ * @base and @size should be exact multiples of IOMMU page granularity to
+ * avoid rounding surprises. If necessary, we reserve the page at address 0
+ * to ensure it is an invalid IOVA.
+ */
+int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, u64 size)
+{
+	struct iova_domain *iovad = domain->dma_api_cookie;
+	unsigned long order, base_pfn, end_pfn;
+
+	if (!iovad)
+		return -ENODEV;
+
+	/* Use the smallest supported page size for IOVA granularity */
+	order = __ffs(domain->ops->pgsize_bitmap);
+	base_pfn = max_t(unsigned long, 1, base >> order);
+	end_pfn = (base + size - 1) >> order;
+
+	/* Check the domain allows at least some access to the device... */
+	if (domain->geometry.force_aperture) {
+		if (base > domain->geometry.aperture_end ||
+		    base + size <= domain->geometry.aperture_start) {
+			pr_warn("specified DMA range outside IOMMU capability\n");
+			return -EFAULT;
+		}
+		/* ...then finally give it a kicking to make sure it fits */
+		base_pfn = max_t(unsigned long, base_pfn,
+				domain->geometry.aperture_start >> order);
+		end_pfn = min_t(unsigned long, end_pfn,
+				domain->geometry.aperture_end >> order);
+	}
+
+	/* All we can safely do with an existing domain is enlarge it */
+	if (iovad->start_pfn) {
+		if (1UL << order != iovad->granule ||
+		    base_pfn != iovad->start_pfn ||
+		    end_pfn < iovad->dma_32bit_pfn) {
+			pr_warn("Incompatible range for DMA domain\n");
+			return -EFAULT;
+		}
+		iovad->dma_32bit_pfn = end_pfn;
+	} else {
+		init_iova_domain(iovad, 1UL << order, base_pfn, end_pfn);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(iommu_dma_init_domain);
+
+/**
+ * dma_direction_to_prot - Translate DMA API directions to IOMMU API page flags
+ * @dir: Direction of DMA transfer
+ * @coherent: Is the DMA master cache-coherent?
+ *
+ * Return: corresponding IOMMU API page protection flags
+ */
+int dma_direction_to_prot(enum dma_data_direction dir, bool coherent)
+{
+	int prot = coherent ? IOMMU_CACHE : 0;
+
+	switch (dir) {
+	case DMA_BIDIRECTIONAL:
+		return prot | IOMMU_READ | IOMMU_WRITE;
+	case DMA_TO_DEVICE:
+		return prot | IOMMU_READ;
+	case DMA_FROM_DEVICE:
+		return prot | IOMMU_WRITE;
+	default:
+		return 0;
+	}
+}
+
+static struct iova *__alloc_iova(struct iova_domain *iovad, size_t size,
+		dma_addr_t dma_limit)
+{
+	unsigned long shift = iova_shift(iovad);
+	unsigned long length = iova_align(iovad, size) >> shift;
+
+	/*
+	 * Enforce size-alignment to be safe - there should probably be
+	 * an attribute to control this per-device, or at least per-domain...
+	 */
+	return alloc_iova(iovad, length, dma_limit >> shift, true);
+}
+
+/* The IOVA allocator knows what we mapped, so just unmap whatever that was */
+static void __iommu_dma_unmap(struct iommu_domain *domain, dma_addr_t dma_addr)
+{
+	struct iova_domain *iovad = domain->dma_api_cookie;
+	unsigned long shift = iova_shift(iovad);
+	unsigned long pfn = dma_addr >> shift;
+	struct iova *iova = find_iova(iovad, pfn);
+	size_t size = iova_size(iova) << shift;
+
+	/* ...and if we can't, then something is horribly, horribly wrong */
+	BUG_ON(iommu_unmap(domain, pfn << shift, size) < size);
+	__free_iova(iovad, iova);
+}
+
+static void __iommu_dma_free_pages(struct page **pages, int count)
+{
+	while (count--)
+		__free_page(pages[count]);
+	kvfree(pages);
+}
+
+static struct page **__iommu_dma_alloc_pages(unsigned int count, gfp_t gfp)
+{
+	struct page **pages;
+	unsigned int i = 0, array_size = count * sizeof(*pages);
+
+	if (array_size <= PAGE_SIZE)
+		pages = kzalloc(array_size, GFP_KERNEL);
+	else
+		pages = vzalloc(array_size);
+	if (!pages)
+		return NULL;
+
+	/* IOMMU can map any pages, so highmem can also be used here */
+	gfp |= __GFP_NOWARN | __GFP_HIGHMEM;
+
+	while (count) {
+		struct page *page = NULL;
+		int j, order = __fls(count);
+
+		/*
+		 * Higher-order allocations are a convenience rather
+		 * than a necessity, hence using __GFP_NORETRY until
+		 * falling back to single-page allocations.
+		 */
+		for (order = min(order, MAX_ORDER); order > 0; order--) {
+			page = alloc_pages(gfp | __GFP_NORETRY, order);
+			if (!page)
+				continue;
+			if (PageCompound(page)) {
+				if (!split_huge_page(page))
+					break;
+				__free_pages(page, order);
+			} else {
+				split_page(page, order);
+				break;
+			}
+		}
+		if (!page)
+			page = alloc_page(gfp);
+		if (!page) {
+			__iommu_dma_free_pages(pages, i);
+			return NULL;
+		}
+		j = 1 << order;
+		count -= j;
+		while (j--)
+			pages[i++] = page++;
+	}
+	return pages;
+}
+
+/**
+ * iommu_dma_free - Free a buffer allocated by iommu_dma_alloc()
+ * @dev: Device which owns this buffer
+ * @pages: Array of buffer pages as returned by iommu_dma_alloc()
+ * @size: Size of buffer in bytes
+ * @handle: DMA address of buffer
+ *
+ * Frees both the pages associated with the buffer, and the array
+ * describing them
+ */
+void iommu_dma_free(struct device *dev, struct page **pages, size_t size,
+		dma_addr_t *handle)
+{
+	__iommu_dma_unmap(iommu_get_domain_for_dev(dev), *handle);
+	__iommu_dma_free_pages(pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
+	*handle = DMA_ERROR_CODE;
+}
+
+/**
+ * iommu_dma_alloc - Allocate and map a buffer contiguous in IOVA space
+ * @dev: Device to allocate memory for. Must be a real device
+ *	 attached to an iommu_dma_domain
+ * @size: Size of buffer in bytes
+ * @gfp: Allocation flags
+ * @prot: IOMMU mapping flags
+ * @handle: Out argument for allocated DMA handle
+ * @flush_page: Arch callback which must ensure PAGE_SIZE bytes from the
+ *		given VA/PA are visible to the given non-coherent device.
+ *
+ * If @size is less than PAGE_SIZE, then a full CPU page will be allocated,
+ * but an IOMMU which supports smaller pages might not map the whole thing.
+ *
+ * Return: Array of struct page pointers describing the buffer,
+ *	   or NULL on failure.
+ */
+struct page **iommu_dma_alloc(struct device *dev, size_t size,
+		gfp_t gfp, int prot, dma_addr_t *handle,
+		void (*flush_page)(struct device *, const void *, phys_addr_t))
+{
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+	struct iova_domain *iovad = domain->dma_api_cookie;
+	struct iova *iova;
+	struct page **pages;
+	struct sg_table sgt;
+	dma_addr_t dma_addr;
+	unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+	*handle = DMA_ERROR_CODE;
+
+	pages = __iommu_dma_alloc_pages(count, gfp);
+	if (!pages)
+		return NULL;
+
+	iova = __alloc_iova(iovad, size, dev->coherent_dma_mask);
+	if (!iova)
+		goto out_free_pages;
+
+	size = iova_align(iovad, size);
+	if (sg_alloc_table_from_pages(&sgt, pages, count, 0, size, GFP_KERNEL))
+		goto out_free_iova;
+
+	if (!(prot & IOMMU_CACHE)) {
+		struct sg_mapping_iter miter;
+		/*
+		 * The CPU-centric flushing implied by SG_MITER_TO_SG isn't
+		 * sufficient here, so skip it by using the "wrong" direction.
+		 */
+		sg_miter_start(&miter, sgt.sgl, sgt.orig_nents, SG_MITER_FROM_SG);
+		while (sg_miter_next(&miter))
+			flush_page(dev, miter.addr, page_to_phys(miter.page));
+		sg_miter_stop(&miter);
+	}
+
+	dma_addr = iova_dma_addr(iovad, iova);
+	if (iommu_map_sg(domain, dma_addr, sgt.sgl, sgt.orig_nents, prot)
+			< size)
+		goto out_free_sg;
+
+	*handle = dma_addr;
+	sg_free_table(&sgt);
+	return pages;
+
+out_free_sg:
+	sg_free_table(&sgt);
+out_free_iova:
+	__free_iova(iovad, iova);
+out_free_pages:
+	__iommu_dma_free_pages(pages, count);
+	return NULL;
+}
+
+/**
+ * iommu_dma_mmap - Map a buffer into provided user VMA
+ * @pages: Array representing buffer from iommu_dma_alloc()
+ * @size: Size of buffer in bytes
+ * @vma: VMA describing requested userspace mapping
+ *
+ * Maps the pages of the buffer in @pages into @vma. The caller is responsible
+ * for verifying the correct size and protection of @vma beforehand.
+ */
+
+int iommu_dma_mmap(struct page **pages, size_t size, struct vm_area_struct *vma)
+{
+	unsigned long uaddr = vma->vm_start;
+	unsigned int i, count = PAGE_ALIGN(size) >> PAGE_SHIFT;
+	int ret = -ENXIO;
+
+	for (i = vma->vm_pgoff; i < count && uaddr < vma->vm_end; i++) {
+		ret = vm_insert_page(vma, uaddr, pages[i]);
+		if (ret)
+			break;
+		uaddr += PAGE_SIZE;
+	}
+	return ret;
+}
+
+dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
+		unsigned long offset, size_t size, int prot)
+{
+	dma_addr_t dma_addr;
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+	struct iova_domain *iovad = domain->dma_api_cookie;
+	phys_addr_t phys = page_to_phys(page) + offset;
+	size_t iova_off = iova_offset(iovad, phys);
+	size_t len = iova_align(iovad, size + iova_off);
+	struct iova *iova = __alloc_iova(iovad, len, dma_get_mask(dev));
+
+	if (!iova)
+		return DMA_ERROR_CODE;
+
+	dma_addr = iova_dma_addr(iovad, iova);
+	if (iommu_map(domain, dma_addr, phys - iova_off, len, prot)) {
+		__free_iova(iovad, iova);
+		return DMA_ERROR_CODE;
+	}
+	return dma_addr + iova_off;
+}
+
+void iommu_dma_unmap_page(struct device *dev, dma_addr_t handle, size_t size,
+		enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+	__iommu_dma_unmap(iommu_get_domain_for_dev(dev), handle);
+}
+
+/*
+ * Go and look at iommu_dma_map_sg first; It's OK, I'll wait...
+ *
+ * ...right, now that the scatterlist pages are all contiguous from the
+ * device's viewpoint, we can collapse any buffer segments which run
+ * together (subject to the device's segment limitations), filling in
+ * the DMA fields at the same time as we run through the list.
+ */
+static int __finalise_sg(struct device *dev, struct scatterlist *sg, int nents,
+		dma_addr_t dma_addr)
+{
+	struct scatterlist *s, *seg = sg;
+	unsigned long seg_mask = dma_get_seg_boundary(dev);
+	unsigned int max_len = dma_get_max_seg_size(dev);
+	unsigned int seg_len = 0, seg_dma = 0;
+	int i, count = 1;
+
+	for_each_sg(sg, s, nents, i) {
+		/* Un-swizzling the fields here, hence the naming mismatch */
+		unsigned int s_offset = sg_dma_address(s);
+		unsigned int s_length = sg_dma_len(s);
+		unsigned int s_dma_len = s->length;
+
+		s->offset = s_offset;
+		s->length = s_length;
+		sg_dma_address(s) = DMA_ERROR_CODE;
+		sg_dma_len(s) = 0;
+
+		/*
+		 * This ensures any concatenation we do doesn't exceed the
+		 * dma_parms limits, but it also won't fail if any segments
+		 * were out of spec to begin with - they'll just stay as-is.
+		 */
+		if (seg_len && (seg_dma + seg_len == dma_addr + s_offset) &&
+		    (seg_len + s_dma_len <= max_len) &&
+		    ((seg_dma & seg_mask) <= seg_mask - (seg_len + s_length))
+		   ) {
+			sg_dma_len(seg) += s_dma_len;
+		} else {
+			if (seg_len) {
+				seg = sg_next(seg);
+				count++;
+			}
+			sg_dma_len(seg) = s_dma_len - s_offset;
+			sg_dma_address(seg) = dma_addr + s_offset;
+
+			seg_len = s_offset;
+			seg_dma = dma_addr + s_offset;
+		}
+		seg_len += s_length;
+		dma_addr += s_dma_len;
+	}
+	return count;
+}
+
+/*
+ * If mapping failed, then just restore the original list,
+ * but making sure the DMA fields are invalidated.
+ */
+static void __invalidate_sg(struct scatterlist *sg, int nents)
+{
+	struct scatterlist *s;
+	int i;
+
+	for_each_sg(sg, s, nents, i) {
+		if (sg_dma_address(s) != DMA_ERROR_CODE)
+			s->offset = sg_dma_address(s);
+		if (sg_dma_len(s))
+			s->length = sg_dma_len(s);
+		sg_dma_address(s) = DMA_ERROR_CODE;
+		sg_dma_len(s) = 0;
+	}
+}
+
+/*
+ * The DMA API client is passing in a scatterlist which could describe
+ * any old buffer layout, but the IOMMU API requires everything to be
+ * aligned to IOMMU pages. Hence the need for this complicated bit of
+ * impedance-matching, to be able to hand off a suitably-aligned list,
+ * but still preserve the original offsets and sizes for the caller.
+ */
+int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
+		int nents, int prot)
+{
+	struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
+	struct iova_domain *iovad = domain->dma_api_cookie;
+	struct iova *iova;
+	struct scatterlist *s;
+	dma_addr_t dma_addr;
+	size_t iova_len = 0;
+	int i;
+
+	/*
+	 * Work out how much IOVA space we need, and align the segments to
+	 * IOVA granules for the IOMMU driver to handle. With some clever
+	 * trickery we can modify the list in-place, but reversibly, by
+	 * hiding the original data in the as-yet-unused DMA fields.
+	 */
+	for_each_sg(sg, s, nents, i) {
+		size_t s_offset = iova_offset(iovad, s->offset);
+		size_t s_length = s->length;
+
+		sg_dma_address(s) = s->offset;
+		sg_dma_len(s) = s_length;
+		s->offset -= s_offset;
+		s_length = iova_align(iovad, s_length + s_offset);
+		s->length = s_length;
+
+		iova_len += s_length;
+	}
+
+	iova = __alloc_iova(iovad, iova_len, dma_get_mask(dev));
+	if (!iova)
+		goto out_restore_sg;
+
+	/*
+	 * We'll leave any physical concatenation to the IOMMU driver's
+	 * implementation - it knows better than we do.
+	 */
+	dma_addr = iova_dma_addr(iovad, iova);
+	if (iommu_map_sg(domain, dma_addr, sg, nents, prot) < iova_len)
+		goto out_free_iova;
+
+	return __finalise_sg(dev, sg, nents, dma_addr);
+
+out_free_iova:
+	__free_iova(iovad, iova);
+out_restore_sg:
+	__invalidate_sg(sg, nents);
+	return 0;
+}
+
+void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+		enum dma_data_direction dir, struct dma_attrs *attrs)
+{
+	/*
+	 * The scatterlist segments are mapped contiguously
+	 * in IOVA space, so this is incredibly easy.
+	 */
+	__iommu_dma_unmap(iommu_get_domain_for_dev(dev), sg_dma_address(sg));
+}
+
+int iommu_dma_supported(struct device *dev, u64 mask)
+{
+	/*
+	 * 'Special' IOMMUs which don't have the same addressing capability
+	 * as the CPU will have to wait until we have some way to query that
+	 * before they'll be able to use this framework.
+	 */
+	return 1;
+}
+
+int iommu_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+	return dma_addr == DMA_ERROR_CODE;
+}
diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
new file mode 100644
index 0000000..227299f
--- /dev/null
+++ b/include/linux/dma-iommu.h
@@ -0,0 +1,84 @@ 
+/*
+ * Copyright (C) 2014-2015 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __DMA_IOMMU_H
+#define __DMA_IOMMU_H
+
+#ifdef __KERNEL__
+
+#include <linux/iommu.h>
+
+#ifdef CONFIG_IOMMU_DMA
+
+int iommu_dma_init(void);
+
+/* Domain management interface for IOMMU drivers */
+int iommu_get_dma_cookie(struct iommu_domain *domain);
+void iommu_put_dma_cookie(struct iommu_domain *domain);
+
+/* Setup call for arch DMA mapping code */
+int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base, u64 size);
+
+/* General helpers for DMA-API <-> IOMMU-API interaction */
+int dma_direction_to_prot(enum dma_data_direction dir, bool coherent);
+
+/*
+ * These implement the bulk of the relevant DMA mapping callbacks, but require
+ * the arch code to take care of attributes and cache maintenance
+ */
+struct page **iommu_dma_alloc(struct device *dev, size_t size,
+		gfp_t gfp, int prot, dma_addr_t *handle,
+		void (*flush_page)(struct device *, const void *, phys_addr_t));
+void iommu_dma_free(struct device *dev, struct page **pages, size_t size,
+		dma_addr_t *handle);
+
+int iommu_dma_mmap(struct page **pages, size_t size, struct vm_area_struct *vma);
+
+dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page,
+		unsigned long offset, size_t size, int prot);
+int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
+		int nents, int prot);
+
+/*
+ * Arch code with no special attribute handling may use these
+ * directly as DMA mapping callbacks for simplicity
+ */
+void iommu_dma_unmap_page(struct device *dev, dma_addr_t handle, size_t size,
+		enum dma_data_direction dir, struct dma_attrs *attrs);
+void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+		enum dma_data_direction dir, struct dma_attrs *attrs);
+int iommu_dma_supported(struct device *dev, u64 mask);
+int iommu_dma_mapping_error(struct device *dev, dma_addr_t dma_addr);
+
+#else
+
+static inline int iommu_dma_init(void)
+{
+	return 0;
+}
+
+static inline int iommu_get_dma_cookie(struct iommu_domain *domain)
+{
+	return -ENODEV;
+}
+
+static inline void iommu_put_dma_cookie(struct iommu_domain *domain)
+{
+}
+
+#endif	/* CONFIG_IOMMU_DMA */
+
+#endif	/* __KERNEL__ */
+#endif	/* __DMA_IOMMU_H */
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index f9c1b6d..dd176a8 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -81,6 +81,7 @@  struct iommu_domain {
 	iommu_fault_handler_t handler;
 	void *handler_token;
 	struct iommu_domain_geometry geometry;
+	void *dma_api_cookie;
 };
 
 enum iommu_cap {