diff mbox

[9/9,v2] vfio pci: Add vfio iommu implementation for FSL_PAMU

Message ID 1384838233-24847-10-git-send-email-Bharat.Bhushan@freescale.com (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show

Commit Message

Bharat Bhushan Nov. 19, 2013, 5:17 a.m. UTC
This patch adds vfio iommu support for Freescale IOMMU (PAMU -
Peripheral Access Management Unit).

The Freescale PAMU is an aperture-based IOMMU with the following
characteristics.  Each device has an entry in a table in memory
describing the iova->phys mapping. The mapping has:
   -an overall aperture that is power of 2 sized, and has a start iova that
    is naturally aligned
   -has 1 or more windows within the aperture
   -number of windows must be power of 2, max is 256
   -size of each window is determined by aperture size / # of windows
   -iova of each window is determined by aperture start iova / # of windows
   -the mapped region in each window can be different than
    the window size...mapping must power of 2
   -physical address of the mapping must be naturally aligned
    with the mapping size

Some of the code is derived from TYPE1 iommu (driver/vfio/vfio_iommu_type1.c).

Signed-off-by: Bharat Bhushan <bharat.bhushan@freescale.com>
---
v1->v2
 - Use lock around msi-dma list
 - check for overlap between dma and msi-dma pages
 - Some code cleanup as per various comments

 drivers/vfio/Kconfig               |    6 +
 drivers/vfio/Makefile              |    1 +
 drivers/vfio/vfio_iommu_fsl_pamu.c | 1003 ++++++++++++++++++++++++++++++++++++
 include/uapi/linux/vfio.h          |  100 ++++
 4 files changed, 1110 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vfio/vfio_iommu_fsl_pamu.c
diff mbox

Patch

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 26b3d9d..7d1da26 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -8,11 +8,17 @@  config VFIO_IOMMU_SPAPR_TCE
 	depends on VFIO && SPAPR_TCE_IOMMU
 	default n
 
+config VFIO_IOMMU_FSL_PAMU
+	tristate
+	depends on VFIO
+	default n
+
 menuconfig VFIO
 	tristate "VFIO Non-Privileged userspace driver framework"
 	depends on IOMMU_API
 	select VFIO_IOMMU_TYPE1 if X86
 	select VFIO_IOMMU_SPAPR_TCE if (PPC_POWERNV || PPC_PSERIES)
+	select VFIO_IOMMU_FSL_PAMU if FSL_PAMU
 	help
 	  VFIO provides a framework for secure userspace device drivers.
 	  See Documentation/vfio.txt for more details.
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index c5792ec..7461350 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,4 +1,5 @@ 
 obj-$(CONFIG_VFIO) += vfio.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_common.o vfio_iommu_type1.o
 obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_common.o vfio_iommu_spapr_tce.o
+obj-$(CONFIG_VFIO_IOMMU_FSL_PAMU) += vfio_iommu_common.o vfio_iommu_fsl_pamu.o
 obj-$(CONFIG_VFIO_PCI) += pci/
diff --git a/drivers/vfio/vfio_iommu_fsl_pamu.c b/drivers/vfio/vfio_iommu_fsl_pamu.c
new file mode 100644
index 0000000..66efc84
--- /dev/null
+++ b/drivers/vfio/vfio_iommu_fsl_pamu.c
@@ -0,0 +1,1003 @@ 
+/*
+ * VFIO: IOMMU DMA mapping support for FSL PAMU IOMMU
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ *
+ *     Author: Bharat Bhushan <bharat.bhushan@freescale.com>
+ *
+ * This file is derived from driver/vfio/vfio_iommu_type1.c
+ *
+ * The Freescale PAMU is an aperture-based IOMMU with the following
+ * characteristics.  Each device has an entry in a table in memory
+ * describing the iova->phys mapping. The mapping has:
+ *  -an overall aperture that is power of 2 sized, and has a start iova that
+ *   is naturally aligned
+ *  -has 1 or more windows within the aperture
+ *     -number of windows must be power of 2, max is 256
+ *     -size of each window is determined by aperture size / # of windows
+ *     -iova of each window is determined by aperture start iova / # of windows
+ *     -the mapped region in each window can be different than
+ *      the window size...mapping must power of 2
+ *     -physical address of the mapping must be naturally aligned
+ *      with the mapping size
+ */
+
+#include <linux/compat.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/iommu.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/pci.h>		/* pci_bus_type */
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vfio.h>
+#include <linux/hugetlb.h>
+#include <linux/msi.h>
+#include <asm/fsl_pamu_stash.h>
+
+#include "vfio_iommu_common.h"
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "Bharat Bhushan <bharat.bhushan@freescale.com>"
+#define DRIVER_DESC     "FSL PAMU IOMMU driver for VFIO"
+
+struct vfio_iommu {
+	struct iommu_domain	*domain;
+	struct mutex		lock;
+	dma_addr_t		aperture_start;
+	dma_addr_t		aperture_end;
+	dma_addr_t		page_size;	/* Maximum mapped Page size */
+	int			nsubwindows;	/* Number of subwindows */
+	struct rb_root		dma_list;
+	struct list_head	msi_dma_list;
+	struct list_head	group_list;
+};
+
+struct vfio_dma {
+	struct rb_node		node;
+	dma_addr_t		iova;		/* Device address */
+	unsigned long		vaddr;		/* Process virtual addr */
+	size_t			size;		/* Map size (bytes) */
+	int			prot;		/* IOMMU_READ/WRITE */
+};
+
+struct vfio_msi_dma {
+	struct list_head	next;
+	dma_addr_t		iova;		/* Device address */
+	size_t			size;		/* MSI page size */
+	int			bank_id;
+	int			prot;		/* IOMMU_READ/WRITE */
+};
+
+struct vfio_group {
+	struct iommu_group	*iommu_group;
+	struct list_head	next;
+};
+
+static int iova_to_win(struct vfio_iommu *iommu, dma_addr_t iova)
+{
+	u64 offset = iova - iommu->aperture_start;
+	do_div(offset, iommu->page_size);
+	return (int) offset;
+}
+
+static int vfio_disable_iommu_domain(struct vfio_iommu *iommu)
+{
+	int enable = 0;
+	return iommu_domain_set_attr(iommu->domain,
+				     DOMAIN_ATTR_FSL_PAMU_ENABLE, &enable);
+}
+
+static int vfio_enable_iommu_domain(struct vfio_iommu *iommu)
+{
+	int enable = 1;
+	return iommu_domain_set_attr(iommu->domain,
+				     DOMAIN_ATTR_FSL_PAMU_ENABLE, &enable);
+}
+
+/* Unmap DMA region */
+/* This function disable iommu if no dma mapping is set */
+static void vfio_check_and_disable_iommu(struct vfio_iommu *iommu)
+{
+	if (list_empty(&iommu->msi_dma_list) && !rb_first(&iommu->dma_list))
+		vfio_disable_iommu_domain(iommu);
+}
+
+static struct vfio_msi_dma *vfio_find_msi_dma(struct vfio_iommu *iommu,
+					      dma_addr_t start, size_t size)
+{
+	struct vfio_msi_dma *msi_dma;
+
+	/* Check MSI MAP entries */
+	list_for_each_entry(msi_dma, &iommu->msi_dma_list, next) {
+		if ((start + size) <= (msi_dma->iova))
+			continue;
+
+		if ((start >= (msi_dma->iova + msi_dma->size)))
+			continue;
+
+		return msi_dma;
+	}
+
+	return NULL;
+}
+
+static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
+				      dma_addr_t start, size_t size)
+{
+	struct rb_node *node = iommu->dma_list.rb_node;
+
+	/* check DMA MAP entries */
+	while (node) {
+		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
+
+		if (start + size <= dma->iova)
+			node = node->rb_left;
+		else if (start >= dma->iova + dma->size)
+			node = node->rb_right;
+		else
+			return dma;
+	}
+
+	return NULL;
+}
+
+static void vfio_insert_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
+{
+	struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
+	struct vfio_dma *dma;
+
+	while (*link) {
+		parent = *link;
+		dma = rb_entry(parent, struct vfio_dma, node);
+
+		if (new->iova + new->size <= dma->iova)
+			link = &(*link)->rb_left;
+		else
+			link = &(*link)->rb_right;
+	}
+
+	rb_link_node(&new->node, parent, link);
+	rb_insert_color(&new->node, &iommu->dma_list);
+}
+
+static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
+{
+	rb_erase(&old->node, &iommu->dma_list);
+	vfio_check_and_disable_iommu(iommu);
+}
+
+static int vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
+			    dma_addr_t iova, size_t *size)
+{
+	dma_addr_t start = iova;
+	int win, win_start, win_end;
+	long unlocked = 0;
+	unsigned int nr_pages;
+
+	nr_pages = iommu->page_size / PAGE_SIZE;
+	win_start = iova_to_win(iommu, iova);
+	win_end = iova_to_win(iommu, iova + *size - 1);
+
+	/* Release the pinned pages */
+	for (win = win_start; win <= win_end; iova += iommu->page_size, win++) {
+		unsigned long pfn;
+
+		pfn = iommu_iova_to_phys(iommu->domain, iova) >> PAGE_SHIFT;
+		if (!pfn)
+			continue;
+
+		iommu_domain_window_disable(iommu->domain, win);
+
+		unlocked += vfio_unpin_pages(pfn, nr_pages, dma->prot, 1);
+	}
+
+	vfio_lock_acct(-unlocked);
+	*size = iova - start;
+	return 0;
+}
+
+static int vfio_remove_dma_overlap(struct vfio_iommu *iommu, dma_addr_t start,
+				   size_t *size, struct vfio_dma *dma)
+{
+	size_t offset, overlap, tmp;
+	struct vfio_dma *split;
+	int ret;
+
+	if (!*size)
+		return 0;
+
+	/*
+	 * Existing dma region is completely covered, unmap all.  This is
+	 * the likely case since userspace tends to map and unmap buffers
+	 * in one shot rather than multiple mappings within a buffer.
+	 */
+	if (likely(start <= dma->iova &&
+		   start + *size >= dma->iova + dma->size)) {
+		*size = dma->size;
+		ret = vfio_unmap_unpin(iommu, dma, dma->iova, size);
+		if (ret)
+			return ret;
+
+		/*
+		 * Did we remove more than we have?  Should never happen
+		 * since a vfio_dma is contiguous in iova and vaddr.
+		 */
+		WARN_ON(*size != dma->size);
+
+		vfio_remove_dma(iommu, dma);
+		kfree(dma);
+		return 0;
+	}
+
+	/* Overlap low address of existing range */
+	if (start <= dma->iova) {
+		overlap = start + *size - dma->iova;
+		ret = vfio_unmap_unpin(iommu, dma, dma->iova, &overlap);
+		if (ret)
+			return ret;
+
+		vfio_remove_dma(iommu, dma);
+
+		/*
+		 * Check, we may have removed to whole vfio_dma.  If not
+		 * fixup and re-insert.
+		 */
+		if (overlap < dma->size) {
+			dma->iova += overlap;
+			dma->vaddr += overlap;
+			dma->size -= overlap;
+			vfio_insert_dma(iommu, dma);
+		} else
+			kfree(dma);
+
+		*size = overlap;
+		return 0;
+	}
+
+	/* Overlap high address of existing range */
+	if (start + *size >= dma->iova + dma->size) {
+		offset = start - dma->iova;
+		overlap = dma->size - offset;
+
+		ret = vfio_unmap_unpin(iommu, dma, start, &overlap);
+		if (ret)
+			return ret;
+
+		dma->size -= overlap;
+		*size = overlap;
+		return 0;
+	}
+
+	/* Split existing */
+
+	/*
+	 * Allocate our tracking structure early even though it may not
+	 * be used.  An Allocation failure later loses track of pages and
+	 * is more difficult to unwind.
+	 */
+	split = kzalloc(sizeof(*split), GFP_KERNEL);
+	if (!split)
+		return -ENOMEM;
+
+	offset = start - dma->iova;
+
+	ret = vfio_unmap_unpin(iommu, dma, start, size);
+	if (ret || !*size) {
+		kfree(split);
+		return ret;
+	}
+
+	tmp = dma->size;
+
+	/* Resize the lower vfio_dma in place, before the below insert */
+	dma->size = offset;
+
+	/* Insert new for remainder, assuming it didn't all get unmapped */
+	if (likely(offset + *size < tmp)) {
+		split->size = tmp - offset - *size;
+		split->iova = dma->iova + offset + *size;
+		split->vaddr = dma->vaddr + offset + *size;
+		split->prot = dma->prot;
+		vfio_insert_dma(iommu, split);
+	} else
+		kfree(split);
+
+	return 0;
+}
+
+/* Map DMA region */
+static int vfio_dma_map(struct vfio_iommu *iommu, dma_addr_t iova,
+			  unsigned long vaddr, long npage, int prot)
+{
+	int ret = 0, i;
+	size_t size;
+	unsigned int win, nr_subwindows;
+	dma_addr_t iovamap;
+
+	win = iova_to_win(iommu, iova);
+	if (iova != iommu->aperture_start + iommu->page_size * win) {
+		pr_err("%s iova(%llx) unalligned to window size %llx\n",
+			__func__, iova, iommu->page_size);
+		return -EINVAL;
+	}
+
+	/* total size to be mapped */
+	size = npage << PAGE_SHIFT;
+	nr_subwindows = size >> ilog2(iommu->page_size);
+	iovamap = iova;
+
+	for (i = 0; i < nr_subwindows; i++, win++) {
+		unsigned long pfn;
+		unsigned long nr_pages;
+		dma_addr_t mapsize;
+		struct vfio_dma *dma = NULL;
+
+		mapsize = min(iova + size - iovamap, iommu->page_size);
+		nr_pages = mapsize >> PAGE_SHIFT;
+
+		/* Pin a contiguous chunk of memory */
+		ret = vfio_pin_pages(vaddr, nr_pages, prot, &pfn);
+		if (ret != nr_pages) {
+			pr_err("%s unable to pin pages = %lx, pinned(%lx/%lx)\n",
+				__func__, vaddr, npage, nr_pages);
+			ret = -EINVAL;
+			break;
+		}
+
+		ret = iommu_domain_window_enable(iommu->domain, win,
+						 (phys_addr_t)pfn << PAGE_SHIFT,
+						 mapsize, prot);
+		if (ret) {
+			pr_err("%s unable to iommu_map()\n", __func__);
+			ret = -EINVAL;
+			break;
+		}
+
+		/*
+		 * Check if we abut a region below - nothing below 0.
+		 * This is the most likely case when mapping chunks of
+		 * physically contiguous regions within a virtual address
+		 * range.  Update the abutting entry in place since iova
+		 * doesn't change.
+		 */
+		if (likely(iovamap)) {
+			struct vfio_dma *tmp;
+			tmp = vfio_find_dma(iommu, iovamap - 1, 1);
+			if (tmp && tmp->prot == prot &&
+			    tmp->vaddr + tmp->size == vaddr) {
+				tmp->size += mapsize;
+				dma = tmp;
+			}
+		}
+
+		/*
+		 * Check if we abut a region above - nothing above ~0 + 1.
+		 * If we abut above and below, remove and free.  If only
+		 * abut above, remove, modify, reinsert.
+		 */
+		if (likely(iovamap + mapsize)) {
+			struct vfio_dma *tmp;
+			tmp = vfio_find_dma(iommu, iovamap + mapsize, 1);
+			if (tmp && tmp->prot == prot &&
+			    tmp->vaddr == vaddr + mapsize) {
+				vfio_remove_dma(iommu, tmp);
+				if (dma) {
+					dma->size += tmp->size;
+					kfree(tmp);
+				} else {
+					tmp->size += mapsize;
+					tmp->iova = iovamap;
+					tmp->vaddr = vaddr;
+					vfio_insert_dma(iommu, tmp);
+					dma = tmp;
+				}
+			}
+		}
+
+		if (!dma) {
+			dma = kzalloc(sizeof(*dma), GFP_KERNEL);
+			if (!dma) {
+				iommu_unmap(iommu->domain, iovamap, mapsize);
+				vfio_unpin_pages(pfn, npage, prot, true);
+				ret = -ENOMEM;
+				break;
+			}
+
+			dma->size = mapsize;
+			dma->iova = iovamap;
+			dma->vaddr = vaddr;
+			dma->prot = prot;
+			vfio_insert_dma(iommu, dma);
+		}
+
+		iovamap += mapsize;
+		vaddr += mapsize;
+	}
+
+	if (ret) {
+		struct vfio_dma *tmp;
+		while ((tmp = vfio_find_dma(iommu, iova, size))) {
+			int r = vfio_remove_dma_overlap(iommu, iova,
+							&size, tmp);
+			if (WARN_ON(r || !size))
+				break;
+		}
+		return 0;
+	}
+
+	vfio_enable_iommu_domain(iommu);
+	return 0;
+}
+
+static int vfio_dma_do_map(struct vfio_iommu *iommu,
+			   struct vfio_iommu_type1_dma_map *map)
+{
+	dma_addr_t iova = map->iova;
+	size_t size = map->size;
+	unsigned long vaddr = map->vaddr;
+	int ret = 0, prot = 0;
+	long npage;
+
+	/* READ/WRITE from device perspective */
+	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
+		prot |= IOMMU_WRITE;
+	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
+		prot |= IOMMU_READ;
+
+	if (!prot)
+		return -EINVAL; /* No READ/WRITE? */
+
+	/* Don't allow IOVA wrap */
+	if (iova + size && iova + size < iova)
+		return -EINVAL;
+
+	/* Don't allow virtual address wrap */
+	if (vaddr + size && vaddr + size < vaddr)
+		return -EINVAL;
+
+	/*
+	 * FIXME: Currently we only support mapping page-size
+	 * of subwindow-size.
+	 */
+	if (size < iommu->page_size)
+		return -EINVAL;
+
+	npage = size >> PAGE_SHIFT;
+	if (!npage)
+		return -EINVAL;
+
+	mutex_lock(&iommu->lock);
+
+	/* Check for dma maping and msi_dma mapping */
+	if (vfio_find_dma(iommu, iova, size) ||
+	    vfio_find_msi_dma(iommu, iova, size)) {
+		ret = -EEXIST;
+		goto out_lock;
+	}
+
+	ret = vfio_dma_map(iommu, iova, vaddr, npage, prot);
+
+out_lock:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
+			     struct vfio_iommu_type1_dma_unmap *unmap)
+{
+	struct vfio_dma *dma;
+	size_t unmapped = 0, size;
+	int ret = 0;
+
+	mutex_lock(&iommu->lock);
+
+	while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
+		size = unmap->size;
+		ret = vfio_remove_dma_overlap(iommu, unmap->iova, &size, dma);
+		if (ret || !size)
+			break;
+		unmapped += size;
+	}
+
+	mutex_unlock(&iommu->lock);
+
+	/*
+	 * We may unmap more than requested, update the unmap struct so
+	 * userspace can know.
+	 */
+	unmap->size = unmapped;
+
+	return ret;
+}
+
+static int vfio_handle_get_attr(struct vfio_iommu *iommu,
+			 struct vfio_pamu_attr *pamu_attr)
+{
+	int ret = 0;
+
+	switch (pamu_attr->attribute) {
+	case VFIO_ATTR_GEOMETRY: {
+		struct iommu_domain_geometry geom;
+		ret = iommu_domain_get_attr(iommu->domain,
+					  DOMAIN_ATTR_GEOMETRY, &geom);
+		pamu_attr->attr_info.attr.aperture_start = geom.aperture_start;
+		pamu_attr->attr_info.attr.aperture_end = geom.aperture_end;
+		break;
+	}
+	case VFIO_ATTR_WINDOWS: {
+		u32 count;
+		ret = iommu_domain_get_attr(iommu->domain,
+				      DOMAIN_ATTR_WINDOWS, &count);
+		pamu_attr->attr_info.windows = count;
+		break;
+	}
+	case VFIO_ATTR_PAMU_STASH: {
+		struct pamu_stash_attribute stash;
+		ret = iommu_domain_get_attr(iommu->domain,
+				      DOMAIN_ATTR_FSL_PAMU_STASH, &stash);
+		pamu_attr->attr_info.stash.cpu = stash.cpu;
+		pamu_attr->attr_info.stash.cache = stash.cache;
+		break;
+	}
+
+	default:
+		pr_err("%s Error: Invalid attribute (%d)\n",
+			 __func__, pamu_attr->attribute);
+		return -EINVAL;
+	}
+
+	return ret;
+}
+
+static int vfio_handle_set_attr(struct vfio_iommu *iommu,
+			 struct vfio_pamu_attr *pamu_attr)
+{
+	int ret = 0;
+
+	switch (pamu_attr->attribute) {
+	case VFIO_ATTR_GEOMETRY: {
+		struct iommu_domain_geometry geom;
+
+		geom.aperture_start = pamu_attr->attr_info.attr.aperture_start;
+		geom.aperture_end = pamu_attr->attr_info.attr.aperture_end;
+		iommu->aperture_start = geom.aperture_start;
+		iommu->aperture_end = geom.aperture_end;
+		geom.force_aperture = 1;
+		ret = iommu_domain_set_attr(iommu->domain,
+					  DOMAIN_ATTR_GEOMETRY, &geom);
+		break;
+	}
+	case VFIO_ATTR_WINDOWS: {
+		u32 count = pamu_attr->attr_info.windows;
+		u64 size = iommu->aperture_end - iommu->aperture_start + 1;
+
+		ret = iommu_domain_set_attr(iommu->domain,
+				      DOMAIN_ATTR_WINDOWS, &count);
+		if (!ret) {
+			iommu->nsubwindows = pamu_attr->attr_info.windows;
+			iommu->page_size = size >> ilog2(count);
+		}
+
+		break;
+	}
+	case VFIO_ATTR_PAMU_STASH: {
+		struct pamu_stash_attribute stash;
+
+		stash.cpu = pamu_attr->attr_info.stash.cpu;
+		stash.cache = pamu_attr->attr_info.stash.cache;
+		ret = iommu_domain_set_attr(iommu->domain,
+				      DOMAIN_ATTR_FSL_PAMU_STASH, &stash);
+		break;
+	}
+
+	default:
+		pr_err("%s Error: Invalid attribute (%d)\n",
+			 __func__, pamu_attr->attribute);
+		return -EINVAL;
+	}
+
+	return ret;
+}
+
+static int pci_msi_set_device_iova(struct device *dev, void *data)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct vfio_msi_dma *msi_dma = data;
+
+	return msi_set_iova(pdev, msi_dma->bank_id, msi_dma->iova, 1);
+}
+
+static int pci_msi_clear_device_iova(struct device *dev, void *data)
+{
+	struct pci_dev *pdev = to_pci_dev(dev);
+	struct vfio_msi_dma *msi_dma = data;
+
+	return msi_set_iova(pdev, msi_dma->bank_id, msi_dma->iova, 0);
+}
+
+static int vfio_iommu_set_msi_iova(struct vfio_iommu *iommu,
+				   struct vfio_msi_dma *msi_dma)
+{
+	struct vfio_group *group;
+	int ret = 0;
+
+	list_for_each_entry(group, &iommu->group_list, next) {
+		ret = iommu_group_for_each_dev(group->iommu_group, msi_dma,
+					       pci_msi_set_device_iova);
+	}
+
+	return ret;
+}
+
+static int vfio_iommu_clear_msi_iova(struct vfio_iommu *iommu,
+				     struct vfio_msi_dma *msi_dma)
+{
+	struct vfio_group *group;
+	int ret = 0;
+
+	list_for_each_entry(group, &iommu->group_list, next) {
+		ret = iommu_group_for_each_dev(group->iommu_group, msi_dma,
+					       pci_msi_clear_device_iova);
+	}
+
+	return ret;
+}
+
+static int vfio_do_msi_map(struct vfio_iommu *iommu,
+			struct vfio_pamu_msi_bank_map *msi_map)
+{
+	struct msi_region region;
+	struct vfio_msi_dma *msi_dma;
+	int window;
+	int prot = 0;
+	int ret;
+
+	/* READ/WRITE from device perspective */
+	if (msi_map->flags & VFIO_DMA_MAP_FLAG_WRITE)
+		prot |= IOMMU_WRITE;
+	if (msi_map->flags & VFIO_DMA_MAP_FLAG_READ)
+		prot |= IOMMU_READ;
+
+	if (!prot)
+		return -EINVAL; /* No READ/WRITE? */
+
+	ret = msi_get_region(msi_map->msi_bank_index, &region);
+	if (ret) {
+		pr_err("%s MSI region (%d) not found\n", __func__,
+		       msi_map->msi_bank_index);
+		return ret;
+	}
+
+	mutex_lock(&iommu->lock);
+	/* Check for dma maping and msi_dma mapping */
+	if (vfio_find_dma(iommu, msi_map->iova, region.size) ||
+	    vfio_find_msi_dma(iommu, msi_map->iova, region.size)) {
+		ret = -EEXIST;
+		goto out_lock;
+	}
+
+	window = iova_to_win(iommu, msi_map->iova);
+	ret = iommu_domain_window_enable(iommu->domain, window, region.addr,
+					 region.size, prot);
+	if (ret) {
+		pr_err("%s Error: unable to map msi region\n", __func__);
+		goto out_lock;
+	}
+
+	msi_dma = kzalloc(sizeof(*msi_dma), GFP_KERNEL);
+	if (!msi_dma) {
+		ret = -ENOMEM;
+		goto out_lock;
+	}
+
+	msi_dma->iova = msi_map->iova;
+	msi_dma->size = region.size;
+	msi_dma->bank_id = msi_map->msi_bank_index;
+	list_add(&msi_dma->next, &iommu->msi_dma_list);
+
+	/* Set iova for all the device in iommu-group for the given msi-bank */
+	ret = vfio_iommu_set_msi_iova(iommu, msi_dma);
+
+out_lock:
+	mutex_unlock(&iommu->lock);
+	return ret;
+}
+
+static void vfio_msi_unmap(struct vfio_iommu *iommu, dma_addr_t iova)
+{
+	int window;
+	window = iova_to_win(iommu, iova);
+	iommu_domain_window_disable(iommu->domain, window);
+}
+
+static int vfio_do_msi_unmap(struct vfio_iommu *iommu,
+			     struct vfio_pamu_msi_bank_unmap *msi_unmap)
+{
+	struct vfio_msi_dma *mdma, *mdma_tmp;
+
+	mutex_lock(&iommu->lock);
+
+	list_for_each_entry_safe(mdma, mdma_tmp, &iommu->msi_dma_list, next) {
+		if (mdma->iova == msi_unmap->iova) {
+			/* Clear mapping for msi iova page mapping */
+			vfio_iommu_clear_msi_iova(iommu, mdma);
+			/* Unmap in iommu (PAMU) */
+			vfio_msi_unmap(iommu, mdma->iova);
+			list_del(&mdma->next);
+			vfio_check_and_disable_iommu(iommu);
+			kfree(mdma);
+			mutex_unlock(&iommu->lock);
+			return 0;
+		}
+	}
+
+	mutex_unlock(&iommu->lock);
+	return -EINVAL;
+}
+static void *vfio_iommu_fsl_pamu_open(unsigned long arg)
+{
+	struct vfio_iommu *iommu;
+
+	if (arg != VFIO_FSL_PAMU_IOMMU)
+		return ERR_PTR(-EINVAL);
+
+	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
+	if (!iommu)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&iommu->group_list);
+	iommu->dma_list = RB_ROOT;
+	INIT_LIST_HEAD(&iommu->msi_dma_list);
+	mutex_init(&iommu->lock);
+
+	/*
+	 * Wish we didn't have to know about bus_type here.
+	 */
+	iommu->domain = iommu_domain_alloc(&pci_bus_type);
+	if (!iommu->domain) {
+		kfree(iommu);
+		return ERR_PTR(-EIO);
+	}
+
+	return iommu;
+}
+
+static void vfio_iommu_fsl_pamu_release(void *iommu_data)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_group *group, *group_tmp;
+	struct vfio_msi_dma *mdma, *mdma_tmp;
+	struct rb_node *node;
+
+	list_for_each_entry_safe(group, group_tmp, &iommu->group_list, next) {
+		iommu_detach_group(iommu->domain, group->iommu_group);
+		list_del(&group->next);
+		kfree(group);
+	}
+
+	while ((node = rb_first(&iommu->dma_list))) {
+		struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
+		size_t size = dma->size;
+		vfio_remove_dma_overlap(iommu, dma->iova, &size, dma);
+		if (WARN_ON(!size))
+			break;
+	}
+
+	list_for_each_entry_safe(mdma, mdma_tmp, &iommu->msi_dma_list, next) {
+		vfio_msi_unmap(iommu, mdma->iova);
+		list_del(&mdma->next);
+		kfree(mdma);
+	}
+
+	/* Disable the iommu as there is no valid entry */
+	vfio_disable_iommu_domain(iommu);
+
+	iommu_domain_free(iommu->domain);
+	iommu->domain = NULL;
+	kfree(iommu);
+}
+
+static long vfio_iommu_fsl_pamu_ioctl(void *iommu_data,
+				      unsigned int cmd, unsigned long arg)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	unsigned long minsz;
+
+	if (cmd == VFIO_CHECK_EXTENSION) {
+		switch (arg) {
+		case VFIO_FSL_PAMU_IOMMU:
+			return 1;
+		default:
+			return 0;
+		}
+	} else if (cmd == VFIO_IOMMU_MAP_DMA) {
+		struct vfio_iommu_type1_dma_map map;
+		uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
+				VFIO_DMA_MAP_FLAG_WRITE;
+
+		minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
+
+		if (copy_from_user(&map, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (map.argsz < minsz || map.flags & ~mask)
+			return -EINVAL;
+
+		return vfio_dma_do_map(iommu, &map);
+
+	} else if (cmd == VFIO_IOMMU_UNMAP_DMA) {
+		struct vfio_iommu_type1_dma_unmap unmap;
+		long ret;
+
+		minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
+
+		if (copy_from_user(&unmap, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (unmap.argsz < minsz || unmap.flags)
+			return -EINVAL;
+
+		ret = vfio_dma_do_unmap(iommu, &unmap);
+		if (ret)
+			return ret;
+
+		return copy_to_user((void __user *)arg, &unmap, minsz);
+	} else if (cmd == VFIO_IOMMU_PAMU_GET_ATTR) {
+		struct vfio_pamu_attr pamu_attr;
+
+		minsz = offsetofend(struct vfio_pamu_attr, attr_info);
+		if (copy_from_user(&pamu_attr, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (pamu_attr.argsz < minsz)
+			return -EINVAL;
+
+		vfio_handle_get_attr(iommu, &pamu_attr);
+
+		copy_to_user((void __user *)arg, &pamu_attr, minsz);
+		return 0;
+	} else if (cmd == VFIO_IOMMU_PAMU_SET_ATTR) {
+		struct vfio_pamu_attr pamu_attr;
+
+		minsz = offsetofend(struct vfio_pamu_attr, attr_info);
+		if (copy_from_user(&pamu_attr, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (pamu_attr.argsz < minsz)
+			return -EINVAL;
+
+		vfio_handle_set_attr(iommu, &pamu_attr);
+		return 0;
+	} else if (cmd == VFIO_IOMMU_PAMU_GET_MSI_BANK_COUNT) {
+		return msi_get_region_count();
+	} else if (cmd == VFIO_IOMMU_PAMU_MAP_MSI_BANK) {
+		struct vfio_pamu_msi_bank_map msi_map;
+
+		minsz = offsetofend(struct vfio_pamu_msi_bank_map, iova);
+		if (copy_from_user(&msi_map, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (msi_map.argsz < minsz)
+			return -EINVAL;
+
+		vfio_do_msi_map(iommu, &msi_map);
+		return 0;
+	} else if (cmd == VFIO_IOMMU_PAMU_UNMAP_MSI_BANK) {
+		struct vfio_pamu_msi_bank_unmap msi_unmap;
+
+		minsz = offsetofend(struct vfio_pamu_msi_bank_unmap, iova);
+		if (copy_from_user(&msi_unmap, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (msi_unmap.argsz < minsz)
+			return -EINVAL;
+
+		vfio_do_msi_unmap(iommu, &msi_unmap);
+		return 0;
+
+	}
+
+	return -ENOTTY;
+}
+
+static int vfio_iommu_fsl_pamu_attach_group(void *iommu_data,
+					 struct iommu_group *iommu_group)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_group *group, *tmp;
+	int ret;
+
+	group = kzalloc(sizeof(*group), GFP_KERNEL);
+	if (!group)
+		return -ENOMEM;
+
+	mutex_lock(&iommu->lock);
+
+	list_for_each_entry(tmp, &iommu->group_list, next) {
+		if (tmp->iommu_group == iommu_group) {
+			mutex_unlock(&iommu->lock);
+			kfree(group);
+			return -EINVAL;
+		}
+	}
+
+	ret = iommu_attach_group(iommu->domain, iommu_group);
+	if (ret) {
+		mutex_unlock(&iommu->lock);
+		kfree(group);
+		return ret;
+	}
+
+	group->iommu_group = iommu_group;
+	list_add(&group->next, &iommu->group_list);
+
+	mutex_unlock(&iommu->lock);
+
+	return 0;
+}
+
+static void vfio_iommu_fsl_pamu_detach_group(void *iommu_data,
+					  struct iommu_group *iommu_group)
+{
+	struct vfio_iommu *iommu = iommu_data;
+	struct vfio_group *group;
+
+	mutex_lock(&iommu->lock);
+
+	list_for_each_entry(group, &iommu->group_list, next) {
+		if (group->iommu_group == iommu_group) {
+			iommu_detach_group(iommu->domain, iommu_group);
+			list_del(&group->next);
+			kfree(group);
+			break;
+		}
+	}
+
+	mutex_unlock(&iommu->lock);
+}
+
+static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_fsl_pamu = {
+	.name		= "vfio-iommu-fsl_pamu",
+	.owner		= THIS_MODULE,
+	.open		= vfio_iommu_fsl_pamu_open,
+	.release	= vfio_iommu_fsl_pamu_release,
+	.ioctl		= vfio_iommu_fsl_pamu_ioctl,
+	.attach_group	= vfio_iommu_fsl_pamu_attach_group,
+	.detach_group	= vfio_iommu_fsl_pamu_detach_group,
+};
+
+static int __init vfio_iommu_fsl_pamu_init(void)
+{
+	if (!iommu_present(&pci_bus_type))
+		return -ENODEV;
+
+	return vfio_register_iommu_driver(&vfio_iommu_driver_ops_fsl_pamu);
+}
+
+static void __exit vfio_iommu_fsl_pamu_cleanup(void)
+{
+	vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_fsl_pamu);
+}
+
+module_init(vfio_iommu_fsl_pamu_init);
+module_exit(vfio_iommu_fsl_pamu_cleanup);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 0fd47f5..d359055 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -23,6 +23,7 @@ 
 
 #define VFIO_TYPE1_IOMMU		1
 #define VFIO_SPAPR_TCE_IOMMU		2
+#define VFIO_FSL_PAMU_IOMMU		3
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
@@ -451,4 +452,103 @@  struct vfio_iommu_spapr_tce_info {
 
 /* ***************************************************************** */
 
+/*********** APIs for VFIO_PAMU type only ****************/
+/*
+ * VFIO_IOMMU_PAMU_GET_ATTR - _IO(VFIO_TYPE, VFIO_BASE + 17,
+ *				  struct vfio_pamu_attr)
+ *
+ * Gets the iommu attributes for the current vfio container.
+ * Caller sets argsz and attribute.  The ioctl fills in
+ * the provided struct vfio_pamu_attr based on the attribute
+ * value that was set.
+ * Return: 0 on success, -errno on failure
+ */
+struct vfio_pamu_attr {
+	__u32	argsz;
+	__u32	flags;	/* no flags currently */
+#define VFIO_ATTR_GEOMETRY	0
+#define VFIO_ATTR_WINDOWS	1
+#define VFIO_ATTR_PAMU_STASH	2
+	__u32	attribute;
+
+	union {
+		/* VFIO_ATTR_GEOMETRY */
+		struct {
+			/* first addr that can be mapped */
+			__u64 aperture_start;
+			/* last addr that can be mapped */
+			__u64 aperture_end;
+		} attr;
+
+		/* VFIO_ATTR_WINDOWS */
+		__u32 windows;  /* number of windows in the aperture
+				 * initially this will be the max number
+				 * of windows that can be set
+				 */
+		/* VFIO_ATTR_PAMU_STASH */
+		struct {
+			__u32 cpu;	/* CPU number for stashing */
+			__u32 cache;	/* cache ID for stashing */
+		} stash;
+	} attr_info;
+};
+#define VFIO_IOMMU_PAMU_GET_ATTR  _IO(VFIO_TYPE, VFIO_BASE + 17)
+
+/*
+ * VFIO_IOMMU_PAMU_SET_ATTR - _IO(VFIO_TYPE, VFIO_BASE + 18,
+ *				  struct vfio_pamu_attr)
+ *
+ * Sets the iommu attributes for the current vfio container.
+ * Caller sets struct vfio_pamu attr, including argsz and attribute and
+ * setting any fields that are valid for the attribute.
+ * Return: 0 on success, -errno on failure
+ */
+#define VFIO_IOMMU_PAMU_SET_ATTR  _IO(VFIO_TYPE, VFIO_BASE + 18)
+
+/*
+ * VFIO_IOMMU_PAMU_GET_MSI_BANK_COUNT - _IO(VFIO_TYPE, VFIO_BASE + 19, __u32)
+ *
+ * Returns the number of MSI banks for this platform.  This tells user space
+ * how many aperture windows should be reserved for MSI banks when setting
+ * the PAMU geometry and window count.
+ * Return: __u32 bank count on success, -errno on failure
+ */
+#define VFIO_IOMMU_PAMU_GET_MSI_BANK_COUNT _IO(VFIO_TYPE, VFIO_BASE + 19)
+
+/*
+ * VFIO_IOMMU_PAMU_MAP_MSI_BANK - _IO(VFIO_TYPE, VFIO_BASE + 20,
+ *				      struct vfio_pamu_msi_bank_map)
+ *
+ * Maps the MSI bank at the specified index and iova.  User space must
+ * call this ioctl once for each MSI bank (count of banks is returned by
+ * VFIO_IOMMU_PAMU_GET_MSI_BANK_COUNT).
+ * Caller provides struct vfio_pamu_msi_bank_map with all fields set.
+ * Return: 0 on success, -errno on failure
+ */
+
+struct vfio_pamu_msi_bank_map {
+	__u32	argsz;
+	__u32	flags;		/* no flags currently */
+	__u32	msi_bank_index;	/* the index of the MSI bank */
+	__u64	iova;		/* the iova the bank is to be mapped to */
+};
+#define VFIO_IOMMU_PAMU_MAP_MSI_BANK  _IO(VFIO_TYPE, VFIO_BASE + 20)
+
+/*
+ * VFIO_IOMMU_PAMU_UNMAP_MSI_BANK - _IO(VFIO_TYPE, VFIO_BASE + 21,
+ *					struct vfio_pamu_msi_bank_unmap)
+ *
+ * Unmaps the MSI bank at the specified iova.
+ * Caller provides struct vfio_pamu_msi_bank_unmap with all fields set.
+ * Operates on VFIO file descriptor (/dev/vfio/vfio).
+ * Return: 0 on success, -errno on failure
+ */
+
+struct vfio_pamu_msi_bank_unmap {
+	__u32	argsz;
+	__u32	flags;	/* no flags currently */
+	__u64	iova;	/* the iova to be unmapped to */
+};
+#define VFIO_IOMMU_PAMU_UNMAP_MSI_BANK  _IO(VFIO_TYPE, VFIO_BASE + 21)
+
 #endif /* _UAPIVFIO_H */