diff mbox

[2/5] iommu/mediatek: Add mt8173 IOMMU driver

Message ID 1425638900-24989-3-git-send-email-yong.wu@mediatek.com (mailing list archive)
State New, archived
Headers show

Commit Message

Yong Wu (吴勇) March 6, 2015, 10:48 a.m. UTC
From: Yong Wu <yong.wu@mediatek.com>

This patch adds support for mediatek m4u (MultiMedia Memory Management Unit).
Currently this only supports m4u gen 2 with 2 levels of page table on mt8173.

Signed-off-by: Yong Wu <yong.wu@mediatek.com>
---
 drivers/iommu/Kconfig               |  11 +
 drivers/iommu/Makefile              |   1 +
 drivers/iommu/mtk_iommu.c           | 754 ++++++++++++++++++++++++++++++++++++
 drivers/iommu/mtk_iommu.h           |  73 ++++
 drivers/iommu/mtk_iommu_pagetable.c | 439 +++++++++++++++++++++
 drivers/iommu/mtk_iommu_pagetable.h |  49 +++
 6 files changed, 1327 insertions(+)
 create mode 100644 drivers/iommu/mtk_iommu.c
 create mode 100644 drivers/iommu/mtk_iommu.h
 create mode 100644 drivers/iommu/mtk_iommu_pagetable.c
 create mode 100644 drivers/iommu/mtk_iommu_pagetable.h

Comments

Will Deacon March 6, 2015, 10:58 a.m. UTC | #1
On Fri, Mar 06, 2015 at 10:48:17AM +0000, yong.wu@mediatek.com wrote:
> From: Yong Wu <yong.wu@mediatek.com>
> 
> This patch adds support for mediatek m4u (MultiMedia Memory Management Unit).
> Currently this only supports m4u gen 2 with 2 levels of page table on mt8173.

[...]

> diff --git a/drivers/iommu/mtk_iommu_pagetable.c b/drivers/iommu/mtk_iommu_pagetable.c
> new file mode 100644
> index 0000000..5fe9640
> --- /dev/null
> +++ b/drivers/iommu/mtk_iommu_pagetable.c
> @@ -0,0 +1,439 @@
> +/*
> + * Copyright (c) 2014-2015 MediaTek Inc.
> + * Author: Yong Wu <yong.wu@mediatek.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +#include <linux/err.h>
> +#include <linux/mm.h>
> +#include <linux/iommu.h>
> +#include <linux/errno.h>
> +#include "asm/cacheflush.h"
> +
> +#include "mtk_iommu.h"
> +#include "mtk_iommu_pagetable.h"
> +
> +/* 2 level pagetable: pgd -> pte */
> +#define F_PTE_TYPE_GET(regval)  (regval & 0x3)
> +#define F_PTE_TYPE_LARGE         BIT(0)
> +#define F_PTE_TYPE_SMALL         BIT(1)
> +#define F_PTE_B_BIT              BIT(2)
> +#define F_PTE_C_BIT              BIT(3)
> +#define F_PTE_BIT32_BIT          BIT(9)
> +#define F_PTE_S_BIT              BIT(10)
> +#define F_PTE_NG_BIT             BIT(11)
> +#define F_PTE_PA_LARGE_MSK            (~0UL << 16)
> +#define F_PTE_PA_LARGE_GET(regval)    ((regval >> 16) & 0xffff)
> +#define F_PTE_PA_SMALL_MSK            (~0UL << 12)
> +#define F_PTE_PA_SMALL_GET(regval)    ((regval >> 12) & (~0))
> +#define F_PTE_TYPE_IS_LARGE_PAGE(pte) ((imu_pte_val(pte) & 0x3) == \
> +                                       F_PTE_TYPE_LARGE)
> +#define F_PTE_TYPE_IS_SMALL_PAGE(pte) ((imu_pte_val(pte) & 0x3) == \
> +                                       F_PTE_TYPE_SMALL)

This looks like the ARM short-descriptor format to me. Could you please
add a new page table format to the io-pgtable code, so that other IOMMU
drivers can make use of this? I know there was some interest in using
short descriptor for the ARM SMMU, for example.

Cheers,

Will
Mitchel Humpherys March 6, 2015, 5:15 p.m. UTC | #2
On Fri, Mar 06 2015 at 02:48:17 AM, <yong.wu@mediatek.com> wrote:
> From: Yong Wu <yong.wu@mediatek.com>
>
> This patch adds support for mediatek m4u (MultiMedia Memory Management Unit).
> Currently this only supports m4u gen 2 with 2 levels of page table on mt8173.

[...]

> +static int mtk_iommu_invalidate_tlb(const struct mtk_iommu_info *piommu,
> +				    int isinvall, unsigned int iova_start,
> +				    unsigned int iova_end)
> +{
> +	void __iomem *m4u_base = piommu->m4u_base;
> +	u32 val;
> +	u64 start, end;
> +
> +	start = sched_clock();
> +
> +	if (!isinvall) {
> +		iova_start = round_down(iova_start, SZ_4K);
> +		iova_end = round_up(iova_end, SZ_4K);
> +	}
> +
> +	val = F_MMU_INV_EN_L2 | F_MMU_INV_EN_L1;
> +
> +	writel(val, m4u_base + REG_INVLID_SEL);
> +
> +	if (isinvall) {
> +		writel(F_MMU_INV_ALL, m4u_base + REG_MMU_INVLD);
> +	} else {
> +		writel(iova_start, m4u_base + REG_MMU_INVLD_SA);
> +		writel(iova_end, m4u_base + REG_MMU_INVLD_EA);
> +		writel(F_MMU_INV_RANGE, m4u_base + REG_MMU_INVLD);
> +
> +		while (!readl(m4u_base + REG_MMU_CPE_DONE)) {
> +			end = sched_clock();
> +			if (end - start >= 100000000ULL) {
> +				dev_warn(piommu->dev, "invalid don't done\n");
> +				writel(F_MMU_INV_ALL, m4u_base + REG_MMU_INVLD);
> +			}
> +		};

Superfluous `;'.  Also, maybe you should be using readl_poll_timeout?

> +		writel(0, m4u_base + REG_MMU_CPE_DONE);
> +	}
> +
> +	return 0;
> +}



-Mitch
Tomasz Figa March 8, 2015, 4:12 a.m. UTC | #3
Hi Yong Wu,

Thanks for this series. Please see my comments inline.

On Fri, Mar 6, 2015 at 7:48 PM,  <yong.wu@mediatek.com> wrote:
> From: Yong Wu <yong.wu@mediatek.com>
>
> This patch adds support for mediatek m4u (MultiMedia Memory Management Unit).
> Currently this only supports m4u gen 2 with 2 levels of page table on mt8173.

[snip]

> diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
> new file mode 100644
> index 0000000..d62d4ab
> --- /dev/null
> +++ b/drivers/iommu/mtk_iommu.c
> @@ -0,0 +1,754 @@
> +/*
> + * Copyright (c) 2014-2015 MediaTek Inc.
> + * Author: Yong Wu <yong.wu@mediatek.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#define pr_fmt(fmt) "m4u:"fmt

This format is not very useful, especially since the dev_ helpers
already prepend messages with the device name.

> +
> +#include <linux/io.h>
> +#include <linux/interrupt.h>
> +#include <linux/platform_device.h>
> +#include <linux/slab.h>
> +#include <linux/clk.h>
> +#include <linux/err.h>
> +#include <linux/mm.h>
> +#include <linux/iommu.h>
> +#include <linux/errno.h>
> +#include <linux/list.h>
> +#include <linux/memblock.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/dma-iommu.h>
> +#include <linux/of_address.h>
> +#include <linux/of_irq.h>
> +#include <linux/of_platform.h>
> +#include <linux/mtk-smi.h>
> +#include <asm/cacheflush.h>
> +
> +#include "mtk_iommu.h"

CodingStyle: The definitions below do not seem to be aligned
correctly. Please make sure all the values are in the same column and
are aligned using tabs only (remembering that tab width specified by
Linux coding style is 8 characters).

> +
> +#define REG_MMUG_PT_BASE        0x0
> +
> +#define REG_MMU_INVLD           0x20
> +#define F_MMU_INV_ALL           0x2
> +#define F_MMU_INV_RANGE                 0x1
> +
> +#define REG_MMU_INVLD_SA        0x24
> +#define REG_MMU_INVLD_EA         0x28
> +
> +#define REG_MMU_INVLD_SEC         0x2c
> +#define F_MMU_INV_SEC_ALL          0x2
> +#define F_MMU_INV_SEC_RANGE        0x1
> +
> +#define REG_INVLID_SEL          0x38
> +#define F_MMU_INV_EN_L1                 BIT(0)
> +#define F_MMU_INV_EN_L2                 BIT(1)
> +
> +#define REG_MMU_STANDARD_AXI_MODE   0x48
> +#define REG_MMU_DCM_DIS             0x50
> +#define REG_MMU_LEGACY_4KB_MODE     0x60
> +
> +#define REG_MMU_CTRL_REG                 0x110
> +#define F_MMU_CTRL_REROUTE_PFQ_TO_MQ_EN  BIT(4)
> +#define F_MMU_CTRL_TF_PROT_VAL(prot)      (((prot) & 0x3)<<5)

CodingStyle: Operators (e.g. <<) should have spaces on both sides.

> +#define F_MMU_CTRL_COHERE_EN             BIT(8)
> +
> +#define REG_MMU_IVRP_PADDR               0x114
> +#define F_MMU_IVRP_PA_SET(PA)            (PA>>1)

Please make sure that all references to macro arguments in macro
definitions are surrounded by parentheses to avoid issues with
operator precedence. (i.e. ((pa) >> 1))

CodingStyle: Macro arguments should be lowercase.

> +
> +#define REG_MMU_INT_L2_CONTROL      0x120
> +#define F_INT_L2_CLR_BIT            BIT(12)
> +
> +#define REG_MMU_INT_MAIN_CONTROL    0x124
> +#define F_INT_TRANSLATION_FAULT(MMU)           (1<<(0+(((MMU)<<1)|((MMU)<<2))))
> +#define F_INT_MAIN_MULTI_HIT_FAULT(MMU)        (1<<(1+(((MMU)<<1)|((MMU)<<2))))
> +#define F_INT_INVALID_PA_FAULT(MMU)            (1<<(2+(((MMU)<<1)|((MMU)<<2))))
> +#define F_INT_ENTRY_REPLACEMENT_FAULT(MMU)     (1<<(3+(((MMU)<<1)|((MMU)<<2))))
> +#define F_INT_TLB_MISS_FAULT(MMU)              (1<<(4+(((MMU)<<1)|((MMU)<<2))))
> +#define F_INT_PFH_FIFO_ERR(MMU)                (1<<(6+(((MMU)<<1)|((MMU)<<2))))

Multiple coding style issues in these 6 macros, as per my comments above.

> +
> +#define REG_MMU_CPE_DONE              0x12C
> +
> +#define REG_MMU_MAIN_FAULT_ST         0x134
> +
> +#define REG_MMU_FAULT_VA(mmu)         (0x13c+((mmu)<<3))
> +#define F_MMU_FAULT_VA_MSK            ((~0x0)<<12)

Please specify the mask manually to avoid ambiguities with result type.
+ CodingStyle

> +#define F_MMU_FAULT_VA_WRITE_BIT       BIT(1)
> +#define F_MMU_FAULT_VA_LAYER_BIT       BIT(0)
> +
> +#define REG_MMU_INVLD_PA(mmu)         (0x140+((mmu)<<3))
> +#define REG_MMU_INT_ID(mmu)           (0x150+((mmu)<<2))
> +#define F_MMU0_INT_ID_TF_MSK          (~0x3)   /* only for MM iommu. */

Ditto.

> +
> +#define MTK_TFID(larbid, portid) ((larbid << 7) | (portid << 2))

Missing parentheses around macro arguments.

> +
> +static const struct mtk_iommu_port mtk_iommu_mt8173_port[] = {
> +       /* port name                m4uid slaveid larbid portid tfid */
> +       /* larb0 */
> +       {"M4U_PORT_DISP_OVL0",          0,  0,    0,  0, MTK_TFID(0, 0)},
> +       {"M4U_PORT_DISP_RDMA0",         0,  0,    0,  1, MTK_TFID(0, 1)},
> +       {"M4U_PORT_DISP_WDMA0",         0,  0,    0,  2, MTK_TFID(0, 2)},
> +       {"M4U_PORT_DISP_OD_R",          0,  0,    0,  3, MTK_TFID(0, 3)},
> +       {"M4U_PORT_DISP_OD_W",          0,  0,    0,  4, MTK_TFID(0, 4)},
> +       {"M4U_PORT_MDP_RDMA0",          0,  0,    0,  5, MTK_TFID(0, 5)},
> +       {"M4U_PORT_MDP_WDMA",           0,  0,    0,  6, MTK_TFID(0, 6)},
> +       {"M4U_PORT_MDP_WROT0",          0,  0,    0,  7, MTK_TFID(0, 7)},

[snip]

> +       {"M4U_PORT_HSIC_MAS",              1,  0,    6,  12, 0x11},
> +       {"M4U_PORT_HSIC_DEV",              1,  0,    6,  13, 0x19},
> +       {"M4U_PORT_AP_DMA",                1,  0,    6,  14, 0x18},
> +       {"M4U_PORT_HSIC_DMA",              1,  0,    6,  15, 0xc8},
> +       {"M4U_PORT_MSDC0",                 1,  0,    6,  16, 0x0},
> +       {"M4U_PORT_MSDC3",                 1,  0,    6,  17, 0x20},
> +       {"M4U_PORT_UNKNOWN",               1,  0,    6,  18, 0xf},

Why the MTK_TFID() macro is not used for perisys iommu?

> +};
> +

Anyway, is it really necessary to hardcode the SoC-specific topology
data in this driver? Is there really any use for it besides printing the
port name? If not, you could just print the raw values in a way that
lets you look them up quickly in the datasheet, without hardcoding this.
Or even better, you could print which devices are attached to the port.

> +static const struct mtk_iommu_cfg mtk_iommu_mt8173_cfg = {
> +       .larb_nr = 6,
> +       .m4u_port_nr = ARRAY_SIZE(mtk_iommu_mt8173_port),
> +       .pport = mtk_iommu_mt8173_port,
> +};
> +
> +static const char *mtk_iommu_get_port_name(const struct mtk_iommu_info *piommu,
> +                                          unsigned int portid)
> +{
> +       const struct mtk_iommu_port *pcurport = NULL;
> +
> +       pcurport = piommu->imucfg->pport + portid;
> +       if (portid < piommu->imucfg->m4u_port_nr && pcurport)
> +               return pcurport->port_name;
> +       else
> +               return "UNKNOWN_PORT";
> +}

This function seems to be used just for printing the hardcoded port names.

> +
> +static int mtk_iommu_get_port_by_tfid(const struct mtk_iommu_info *pimu,
> +                                     int tf_id)
> +{
> +       const struct mtk_iommu_cfg *pimucfg = pimu->imucfg;
> +       int i;
> +       unsigned int portid = pimucfg->m4u_port_nr;
> +
> +       for (i = 0; i < pimucfg->m4u_port_nr; i++) {
> +               if (pimucfg->pport[i].tf_id == tf_id) {
> +                       portid = i;
> +                       break;
> +               }
> +       }
> +       if (i == pimucfg->m4u_port_nr)
> +               dev_err(pimu->dev, "tf_id find fail, tfid %d\n", tf_id);
> +       return portid;
> +}

This function seems to be used just for finding an index into the
array of hardcoded port names for printing purposes.

> +
> +static irqreturn_t mtk_iommu_isr(int irq, void *dev_id)
> +{
> +       struct iommu_domain *domain = dev_id;
> +       struct mtk_iommu_domain *mtkdomain = domain->priv;
> +       struct mtk_iommu_info *piommu = mtkdomain->piommuinfo;
> +
> +       if (irq == piommu->irq)
> +               report_iommu_fault(domain, piommu->dev, 0, 0);

This doesn't look like a good use of report_iommu_fault(). You should
pass the real values for the iova and flags arguments.

> +       else

This is impossible, no need to check this.

> +               dev_err(piommu->dev, "irq number:%d\n", irq);
> +
> +       return IRQ_HANDLED;
> +}
> +
> +static inline void mtk_iommu_clear_intr(void __iomem *m4u_base)

Please let the compiler decide if this function should be inline or not.

> +{
> +       u32 val;
> +
> +       val = readl(m4u_base + REG_MMU_INT_L2_CONTROL);
> +       val |= F_INT_L2_CLR_BIT;
> +       writel(val, m4u_base + REG_MMU_INT_L2_CONTROL);
> +}
> +
> +static int mtk_iommu_invalidate_tlb(const struct mtk_iommu_info *piommu,
> +                                   int isinvall, unsigned int iova_start,
> +                                   unsigned int iova_end)
> +{
> +       void __iomem *m4u_base = piommu->m4u_base;
> +       u32 val;
> +       u64 start, end;
> +
> +       start = sched_clock();

I don't think this is the preferred way of checking time in kernel
drivers, especially after seeing this comment:
http://lxr.free-electrons.com/source/include/linux/sched.h#L2134

You should use ktime_get() and other ktime_ helpers.

> +
> +       if (!isinvall) {
> +               iova_start = round_down(iova_start, SZ_4K);
> +               iova_end = round_up(iova_end, SZ_4K);
> +       }
> +
> +       val = F_MMU_INV_EN_L2 | F_MMU_INV_EN_L1;
> +
> +       writel(val, m4u_base + REG_INVLID_SEL);
> +
> +       if (isinvall) {
> +               writel(F_MMU_INV_ALL, m4u_base + REG_MMU_INVLD);

Please move the invalidate-all code into a separate function and just
call it wherever the code currently passes true as the invall argument.
That will get rid of two of the ifs in this function.

> +       } else {
> +               writel(iova_start, m4u_base + REG_MMU_INVLD_SA);
> +               writel(iova_end, m4u_base + REG_MMU_INVLD_EA);
> +               writel(F_MMU_INV_RANGE, m4u_base + REG_MMU_INVLD);
> +
> +               while (!readl(m4u_base + REG_MMU_CPE_DONE)) {
> +                       end = sched_clock();
> +                       if (end - start >= 100000000ULL) {

Looks like a very interesting magic number. Please define a macro and
use normal time units along with ktime_to_<unit>() helpers.

> +                               dev_warn(piommu->dev, "invalid don't done\n");
> +                               writel(F_MMU_INV_ALL, m4u_base + REG_MMU_INVLD);

By following my comment above, you could just call the new invalidate
all function here instead of duplicating the same register write.

> +                       }
> +               };

nit: Unnecessary semicolon.

> +               writel(0, m4u_base + REG_MMU_CPE_DONE);
> +       }
> +
> +       return 0;
> +}
> +
> +static int mtk_iommu_fault_handler(struct iommu_domain *imudomain,
> +                                  struct device *dev, unsigned long iova,
> +                                  int m4uindex, void *pimu)
> +{
> +       void __iomem *m4u_base;
> +       u32 int_state, regval;
> +       int m4u_slave_id = 0;

Why 0? Also this variable doesn't seem to be assigned further in the
code. Should it be a constant? Moreover, shouldn't it be unsigned?

> +       unsigned int layer, write, m4u_port;

Shouldn't layer and write be bool?

> +       unsigned int fault_mva, fault_pa;
> +       struct mtk_iommu_info *piommu = pimu;
> +       struct mtk_iommu_domain *mtkdomain = imudomain->priv;
> +
> +       m4u_base = piommu->m4u_base;
> +       int_state = readl(m4u_base + REG_MMU_MAIN_FAULT_ST);
> +
> +       /* read error info from registers */
> +       fault_mva = readl(m4u_base + REG_MMU_FAULT_VA(m4u_slave_id));
> +       layer = !!(fault_mva & F_MMU_FAULT_VA_LAYER_BIT);
> +       write = !!(fault_mva & F_MMU_FAULT_VA_WRITE_BIT);
> +       fault_mva &= F_MMU_FAULT_VA_MSK;

So I assume the IOMMU virtual address space is 32-bit, so fault_mva
can be unsigned int. However it would be better to use an explicitly
sized type for this, i.e. u32.

> +       fault_pa = readl(m4u_base + REG_MMU_INVLD_PA(m4u_slave_id));

It is not clear from any documentation added by this series how wide
is the physical address space of this IOMMU, especially since the SoC
has ARM64 cores. Please make sure that fault_pa variable is using
explicitly sized type which matches width of the register.

End of part 1. Will continue when I find next chunk of time to spend
on review. :)

Best regards,
Tomasz
Daniel Kurtz March 9, 2015, 8:24 a.m. UTC | #4
Hi Yong Wu,

On Fri, Mar 6, 2015 at 6:48 PM,  <yong.wu@mediatek.com> wrote:
> From: Yong Wu <yong.wu@mediatek.com>
>
> This patch adds support for mediatek m4u (MultiMedia Memory Management Unit).
> Currently this only supports m4u gen 2 with 2 levels of page table on mt8173.
>
> Signed-off-by: Yong Wu <yong.wu@mediatek.com>
> ---
>  drivers/iommu/Kconfig               |  11 +
>  drivers/iommu/Makefile              |   1 +
>  drivers/iommu/mtk_iommu.c           | 754 ++++++++++++++++++++++++++++++++++++
>  drivers/iommu/mtk_iommu.h           |  73 ++++
>  drivers/iommu/mtk_iommu_pagetable.c | 439 +++++++++++++++++++++
>  drivers/iommu/mtk_iommu_pagetable.h |  49 +++
>  6 files changed, 1327 insertions(+)
>  create mode 100644 drivers/iommu/mtk_iommu.c
>  create mode 100644 drivers/iommu/mtk_iommu.h
>  create mode 100644 drivers/iommu/mtk_iommu_pagetable.c
>  create mode 100644 drivers/iommu/mtk_iommu_pagetable.h
>
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 19027bb..e63f5b6 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -326,4 +326,15 @@ config ARM_SMMU
>           Say Y here if your SoC includes an IOMMU device implementing
>           the ARM SMMU architecture.
>
> +config MTK_IOMMU
> +       bool "MTK IOMMU Support"
> +       select IOMMU_API
> +       select IOMMU_DMA
> +       select MTK_SMI
> +       help
> +         Support for the IOMMUs on certain Mediatek SOCs.
> +         These IOMMUs allow the multimedia hardware access discontinuous memory.
> +
> +         If unsure, say N here.
> +
>  endif # IOMMU_SUPPORT
> diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
> index 37bfc4e..f2a8027 100644
> --- a/drivers/iommu/Makefile
> +++ b/drivers/iommu/Makefile
> @@ -4,6 +4,7 @@ obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
>  obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
>  obj-$(CONFIG_IOMMU_IOVA) += iova.o
>  obj-$(CONFIG_OF_IOMMU) += of_iommu.o
> +obj-$(CONFIG_MTK_IOMMU) += mtk_iommu.o mtk_iommu_pagetable.o
>  obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o msm_iommu_dev.o
>  obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o
>  obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o
> diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
> new file mode 100644
> index 0000000..d62d4ab
> --- /dev/null
> +++ b/drivers/iommu/mtk_iommu.c
> @@ -0,0 +1,754 @@
> +/*
> + * Copyright (c) 2014-2015 MediaTek Inc.
> + * Author: Yong Wu <yong.wu@mediatek.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#define pr_fmt(fmt) "m4u:"fmt
> +
> +#include <linux/io.h>
> +#include <linux/interrupt.h>
> +#include <linux/platform_device.h>
> +#include <linux/slab.h>
> +#include <linux/clk.h>
> +#include <linux/err.h>
> +#include <linux/mm.h>
> +#include <linux/iommu.h>
> +#include <linux/errno.h>
> +#include <linux/list.h>
> +#include <linux/memblock.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/dma-iommu.h>
> +#include <linux/of_address.h>
> +#include <linux/of_irq.h>
> +#include <linux/of_platform.h>
> +#include <linux/mtk-smi.h>
> +#include <asm/cacheflush.h>
> +
> +#include "mtk_iommu.h"
> +
> +#define REG_MMUG_PT_BASE        0x0

It would be a lot easier to follow the driver if the register names
matched the datasheet:

REG_MMU_PT_BASE_ADDR  0x000
REG_MMU_INT_CONTROL     0x220
REG_MMU_INT_FAULT_ST     0x224
REG_MMU_INVALIDATE          0x5c0
REG_MMU_INVLD_START_A  0x5c4
REG_MMU_INVLD_END_A      0x5c8
REG_MMU_INV_SEL                0x5d8
REG_MMU_DCM                       0x5f0

> +
> +#define REG_MMU_INVLD           0x20
> +#define F_MMU_INV_ALL           0x2
> +#define F_MMU_INV_RANGE                 0x1
> +
> +#define REG_MMU_INVLD_SA        0x24
> +#define REG_MMU_INVLD_EA         0x28
> +
> +#define REG_MMU_INVLD_SEC         0x2c
> +#define F_MMU_INV_SEC_ALL          0x2
> +#define F_MMU_INV_SEC_RANGE        0x1
> +
> +#define REG_INVLID_SEL          0x38
> +#define F_MMU_INV_EN_L1                 BIT(0)
> +#define F_MMU_INV_EN_L2                 BIT(1)
> +
> +#define REG_MMU_STANDARD_AXI_MODE   0x48
> +#define REG_MMU_DCM_DIS             0x50
> +#define REG_MMU_LEGACY_4KB_MODE     0x60
> +
> +#define REG_MMU_CTRL_REG                 0x110
> +#define F_MMU_CTRL_REROUTE_PFQ_TO_MQ_EN  BIT(4)
> +#define F_MMU_CTRL_TF_PROT_VAL(prot)      (((prot) & 0x3)<<5)
> +#define F_MMU_CTRL_COHERE_EN             BIT(8)
> +
> +#define REG_MMU_IVRP_PADDR               0x114
> +#define F_MMU_IVRP_PA_SET(PA)            (PA>>1)
> +
> +#define REG_MMU_INT_L2_CONTROL      0x120
> +#define F_INT_L2_CLR_BIT            BIT(12)
> +
> +#define REG_MMU_INT_MAIN_CONTROL    0x124
> +#define F_INT_TRANSLATION_FAULT(MMU)           (1<<(0+(((MMU)<<1)|((MMU)<<2))))
> +#define F_INT_MAIN_MULTI_HIT_FAULT(MMU)        (1<<(1+(((MMU)<<1)|((MMU)<<2))))
> +#define F_INT_INVALID_PA_FAULT(MMU)            (1<<(2+(((MMU)<<1)|((MMU)<<2))))
> +#define F_INT_ENTRY_REPLACEMENT_FAULT(MMU)     (1<<(3+(((MMU)<<1)|((MMU)<<2))))
> +#define F_INT_TLB_MISS_FAULT(MMU)              (1<<(4+(((MMU)<<1)|((MMU)<<2))))
> +#define F_INT_PFH_FIFO_ERR(MMU)                (1<<(6+(((MMU)<<1)|((MMU)<<2))))

This is not readable.  For one thing, kernel style is to always place
spaces around operators.
Did you run checkpatch?

Assuming MMU is either 0 or 1, this would shift the error bits for
MMU=1 by "(((MMU) << 1) | ((MMU) << 2))" = 6 bits.
( This "MMU" does not look like it could possibly be correct since
F_INT_PFH_FIFO_ERR(0) == F_INT_TRANSLATION_FAULT(1).

Since in the code below, "MMU" is always m4u_slave_id, (a constant 0),
just remove all of this complexity until it is actually used.
For now just define these macros as:

#define F_INT_TRANSLATION_FAULT        BIT(0)
#define F_INT_MAIN_MULTI_HIT_FAULT     BIT(1)
#define F_INT_INVALID_PA_FAULT         BIT(2)
#define F_INT_ENTRY_REPLACEMENT_FAULT  BIT(3)
#define F_INT_TLB_MISS_FAULT           BIT(4)
#define F_INT_PFH_FIFO_ERR             BIT(6)

A later patch that adds support for slave_id != 0 can then fix up
these macros to do what ever it is you are trying to do here.

> +
> +#define REG_MMU_CPE_DONE              0x12C
> +
> +#define REG_MMU_MAIN_FAULT_ST         0x134
> +
> +#define REG_MMU_FAULT_VA(mmu)         (0x13c+((mmu)<<3))
> +#define F_MMU_FAULT_VA_MSK            ((~0x0)<<12)
> +#define F_MMU_FAULT_VA_WRITE_BIT       BIT(1)
> +#define F_MMU_FAULT_VA_LAYER_BIT       BIT(0)
> +
> +#define REG_MMU_INVLD_PA(mmu)         (0x140+((mmu)<<3))
> +#define REG_MMU_INT_ID(mmu)           (0x150+((mmu)<<2))
> +#define F_MMU0_INT_ID_TF_MSK          (~0x3)   /* only for MM iommu. */
> +
> +#define MTK_TFID(larbid, portid) ((larbid << 7) | (portid << 2))
> +
> +static const struct mtk_iommu_port mtk_iommu_mt8173_port[] = {
> +       /* port name                m4uid slaveid larbid portid tfid */
> +       /* larb0 */
> +       {"M4U_PORT_DISP_OVL0",          0,  0,    0,  0, MTK_TFID(0, 0)},
> +       {"M4U_PORT_DISP_RDMA0",         0,  0,    0,  1, MTK_TFID(0, 1)},
> +       {"M4U_PORT_DISP_WDMA0",         0,  0,    0,  2, MTK_TFID(0, 2)},
> +       {"M4U_PORT_DISP_OD_R",          0,  0,    0,  3, MTK_TFID(0, 3)},
> +       {"M4U_PORT_DISP_OD_W",          0,  0,    0,  4, MTK_TFID(0, 4)},
> +       {"M4U_PORT_MDP_RDMA0",          0,  0,    0,  5, MTK_TFID(0, 5)},
> +       {"M4U_PORT_MDP_WDMA",           0,  0,    0,  6, MTK_TFID(0, 6)},
> +       {"M4U_PORT_MDP_WROT0",          0,  0,    0,  7, MTK_TFID(0, 7)},
> +
> +       /* larb1 */
> +       {"M4U_PORT_HW_VDEC_MC_EXT",      0,  0,   1,  0, MTK_TFID(1, 0)},
> +       {"M4U_PORT_HW_VDEC_PP_EXT",      0,  0,   1,  1, MTK_TFID(1, 1)},
> +       {"M4U_PORT_HW_VDEC_UFO_EXT",     0,  0,   1,  2, MTK_TFID(1, 2)},
> +       {"M4U_PORT_HW_VDEC_VLD_EXT",     0,  0,   1,  3, MTK_TFID(1, 3)},
> +       {"M4U_PORT_HW_VDEC_VLD2_EXT",    0,  0,   1,  4, MTK_TFID(1, 4)},
> +       {"M4U_PORT_HW_VDEC_AVC_MV_EXT",  0,  0,   1,  5, MTK_TFID(1, 5)},
> +       {"M4U_PORT_HW_VDEC_PRED_RD_EXT", 0,  0,   1,  6, MTK_TFID(1, 6)},
> +       {"M4U_PORT_HW_VDEC_PRED_WR_EXT", 0,  0,   1,  7, MTK_TFID(1, 7)},
> +       {"M4U_PORT_HW_VDEC_PPWRAP_EXT",  0,  0,   1,  8, MTK_TFID(1, 8)},
> +
> +       /* larb2 */
> +       {"M4U_PORT_IMGO",                0,  0,    2,  0, MTK_TFID(2, 0)},
> +       {"M4U_PORT_RRZO",                0,  0,    2,  1, MTK_TFID(2, 1)},
> +       {"M4U_PORT_AAO",                 0,  0,    2,  2, MTK_TFID(2, 2)},
> +       {"M4U_PORT_LCSO",                0,  0,    2,  3, MTK_TFID(2, 3)},
> +       {"M4U_PORT_ESFKO",               0,  0,    2,  4, MTK_TFID(2, 4)},
> +       {"M4U_PORT_IMGO_D",              0,  0,    2,  5, MTK_TFID(2, 5)},
> +       {"M4U_PORT_LSCI",                0,  0,    2,  6, MTK_TFID(2, 6)},
> +       {"M4U_PORT_LSCI_D",              0,  0,    2,  7, MTK_TFID(2, 7)},
> +       {"M4U_PORT_BPCI",                0,  0,    2,  8, MTK_TFID(2, 8)},
> +       {"M4U_PORT_BPCI_D",              0,  0,    2,  9, MTK_TFID(2, 9)},
> +       {"M4U_PORT_UFDI",                0,  0,    2,  10, MTK_TFID(2, 10)},
> +       {"M4U_PORT_IMGI",                0,  0,    2,  11, MTK_TFID(2, 11)},
> +       {"M4U_PORT_IMG2O",               0,  0,    2,  12, MTK_TFID(2, 12)},
> +       {"M4U_PORT_IMG3O",               0,  0,    2,  13, MTK_TFID(2, 13)},
> +       {"M4U_PORT_VIPI",                0,  0,    2,  14, MTK_TFID(2, 14)},
> +       {"M4U_PORT_VIP2I",               0,  0,    2,  15, MTK_TFID(2, 15)},
> +       {"M4U_PORT_VIP3I",               0,  0,    2,  16, MTK_TFID(2, 16)},
> +       {"M4U_PORT_LCEI",                0,  0,    2,  17, MTK_TFID(2, 17)},
> +       {"M4U_PORT_RB",                  0,  0,    2,  18, MTK_TFID(2, 18)},
> +       {"M4U_PORT_RP",                  0,  0,    2,  19, MTK_TFID(2, 19)},
> +       {"M4U_PORT_WR",                  0,  0,    2,  20, MTK_TFID(2, 20)},
> +
> +       /* larb3 */
> +       {"M4U_PORT_VENC_RCPU",            0,  0,   3,  0, MTK_TFID(3, 0)},
> +       {"M4U_PORT_VENC_REC",             0,  0,   3,  1, MTK_TFID(3, 1)},
> +       {"M4U_PORT_VENC_BSDMA",           0,  0,   3,  2, MTK_TFID(3, 2)},
> +       {"M4U_PORT_VENC_SV_COMV",         0,  0,   3,  3, MTK_TFID(3, 3)},
> +       {"M4U_PORT_VENC_RD_COMV",         0,  0,   3,  4, MTK_TFID(3, 4)},
> +       {"M4U_PORT_JPGENC_RDMA",          0,  0,   3,  5, MTK_TFID(3, 5)},
> +       {"M4U_PORT_JPGENC_BSDMA",         0,  0,   3,  6, MTK_TFID(3, 6)},
> +       {"M4U_PORT_JPGDEC_WDMA",          0,  0,   3,  7, MTK_TFID(3, 7)},
> +       {"M4U_PORT_JPGDEC_BSDMA",         0,  0,   3,  8, MTK_TFID(3, 8)},
> +       {"M4U_PORT_VENC_CUR_LUMA",        0,  0,   3,  9, MTK_TFID(3, 9)},
> +       {"M4U_PORT_VENC_CUR_CHROMA",      0,  0,   3,  10, MTK_TFID(3, 10)},
> +       {"M4U_PORT_VENC_REF_LUMA",        0,  0,   3,  11, MTK_TFID(3, 11)},
> +       {"M4U_PORT_VENC_REF_CHROMA",      0,  0,   3,  12, MTK_TFID(3, 12)},
> +       {"M4U_PORT_VENC_NBM_RDMA",        0,  0,   3,  13, MTK_TFID(3, 13)},
> +       {"M4U_PORT_VENC_NBM_WDMA",        0,  0,   3,  14, MTK_TFID(3, 14)},
> +
> +       /* larb4 */
> +       {"M4U_PORT_DISP_OVL1",             0,  0,   4,  0, MTK_TFID(4, 0)},
> +       {"M4U_PORT_DISP_RDMA1",            0,  0,   4,  1, MTK_TFID(4, 1)},
> +       {"M4U_PORT_DISP_RDMA2",            0,  0,   4,  2, MTK_TFID(4, 2)},
> +       {"M4U_PORT_DISP_WDMA1",            0,  0,   4,  3, MTK_TFID(4, 3)},
> +       {"M4U_PORT_MDP_RDMA1",             0,  0,   4,  4, MTK_TFID(4, 4)},
> +       {"M4U_PORT_MDP_WROT1",             0,  0,   4,  5, MTK_TFID(4, 5)},
> +
> +       /* larb5 */
> +       {"M4U_PORT_VENC_RCPU_SET2",        0,  0,    5,  0, MTK_TFID(5, 0)},
> +       {"M4U_PORT_VENC_REC_FRM_SET2",     0,  0,    5,  1, MTK_TFID(5, 1)},
> +       {"M4U_PORT_VENC_REF_LUMA_SET2",    0,  0,    5,  2, MTK_TFID(5, 2)},
> +       {"M4U_PORT_VENC_REC_CHROMA_SET2",  0,  0,    5,  3, MTK_TFID(5, 3)},
> +       {"M4U_PORT_VENC_BSDMA_SET2",       0,  0,    5,  4, MTK_TFID(5, 4)},
> +       {"M4U_PORT_VENC_CUR_LUMA_SET2",    0,  0,    5,  5, MTK_TFID(5, 5)},
> +       {"M4U_PORT_VENC_CUR_CHROMA_SET2",  0,  0,    5,  6, MTK_TFID(5, 6)},
> +       {"M4U_PORT_VENC_RD_COMA_SET2",     0,  0,    5,  7, MTK_TFID(5, 7)},
> +       {"M4U_PORT_VENC_SV_COMA_SET2",     0,  0,    5,  8, MTK_TFID(5, 8)},
> +
> +       /* perisys iommu */
> +       {"M4U_PORT_RESERVE",               1,  0,    6,  0, 0xff},
> +       {"M4U_PORT_SPM",                   1,  0,    6,  1, 0x50},
> +       {"M4U_PORT_MD32",                  1,  0,    6,  2, 0x90},
> +       {"M4U_PORT_PTP_THERM",             1,  0,    6,  4, 0xd0},
> +       {"M4U_PORT_PWM",                   1,  0,    6,  5, 0x1},
> +       {"M4U_PORT_MSDC1",                 1,  0,    6,  6, 0x21},
> +       {"M4U_PORT_MSDC2",                 1,  0,    6,  7, 0x41},
> +       {"M4U_PORT_NFI",                   1,  0,    6,  8, 0x8},
> +       {"M4U_PORT_AUDIO",                 1,  0,    6,  9, 0x48},
> +       {"M4U_PORT_RESERVED2",             1,  0,    6,  10, 0xfe},
> +       {"M4U_PORT_HSIC_XHCI",             1,  0,    6,  11, 0x9},
> +
> +       {"M4U_PORT_HSIC_MAS",              1,  0,    6,  12, 0x11},
> +       {"M4U_PORT_HSIC_DEV",              1,  0,    6,  13, 0x19},
> +       {"M4U_PORT_AP_DMA",                1,  0,    6,  14, 0x18},
> +       {"M4U_PORT_HSIC_DMA",              1,  0,    6,  15, 0xc8},
> +       {"M4U_PORT_MSDC0",                 1,  0,    6,  16, 0x0},
> +       {"M4U_PORT_MSDC3",                 1,  0,    6,  17, 0x20},
> +       {"M4U_PORT_UNKNOWN",               1,  0,    6,  18, 0xf},
> +};
> +
> +static const struct mtk_iommu_cfg mtk_iommu_mt8173_cfg = {
> +       .larb_nr = 6,
> +       .m4u_port_nr = ARRAY_SIZE(mtk_iommu_mt8173_port),
> +       .pport = mtk_iommu_mt8173_port,
> +};
> +
> +static const char *mtk_iommu_get_port_name(const struct mtk_iommu_info *piommu,
> +                                          unsigned int portid)
> +{
> +       const struct mtk_iommu_port *pcurport = NULL;

No need for NULL init.

> +
> +       pcurport = piommu->imucfg->pport + portid;
> +       if (portid < piommu->imucfg->m4u_port_nr && pcurport)

The pcurport NULL check is a bit superfluous here, right?

> +               return pcurport->port_name;
> +       else
> +               return "UNKNOWN_PORT";
> +}
> +
> +static int mtk_iommu_get_port_by_tfid(const struct mtk_iommu_info *pimu,
> +                                     int tf_id)
> +{
> +       const struct mtk_iommu_cfg *pimucfg = pimu->imucfg;
> +       int i;
> +       unsigned int portid = pimucfg->m4u_port_nr;
> +
> +       for (i = 0; i < pimucfg->m4u_port_nr; i++) {
> +               if (pimucfg->pport[i].tf_id == tf_id) {
> +                       portid = i;
> +                       break;
> +               }
> +       }
> +       if (i == pimucfg->m4u_port_nr)
> +               dev_err(pimu->dev, "tf_id find fail, tfid %d\n", tf_id);
> +       return portid;
> +}
> +
> +static irqreturn_t mtk_iommu_isr(int irq, void *dev_id)
> +{
> +       struct iommu_domain *domain = dev_id;
> +       struct mtk_iommu_domain *mtkdomain = domain->priv;
> +       struct mtk_iommu_info *piommu = mtkdomain->piommuinfo;
> +
> +       if (irq == piommu->irq)
> +               report_iommu_fault(domain, piommu->dev, 0, 0);
> +       else
> +               dev_err(piommu->dev, "irq number:%d\n", irq);

If this isr doesn't handle the irq, return IRQ_NONE, not IRQ_HANDLED.

> +
> +       return IRQ_HANDLED;
> +}
> +

*snip*

Ok, enough for now :-).

Best Regards,
-Dan
Tomasz Figa March 9, 2015, 11:11 a.m. UTC | #5
Hi,

You can find part 2 of my comments inline.

On Fri, Mar 6, 2015 at 7:48 PM,  <yong.wu@mediatek.com> wrote:

[snip]

> +static irqreturn_t mtk_iommu_isr(int irq, void *dev_id)
> +{
> +       struct iommu_domain *domain = dev_id;
> +       struct mtk_iommu_domain *mtkdomain = domain->priv;
> +       struct mtk_iommu_info *piommu = mtkdomain->piommuinfo;
> +
> +       if (irq == piommu->irq)
> +               report_iommu_fault(domain, piommu->dev, 0, 0);

In addition to my previous comment about how this gets called from
this handler, you need to keep in mind that the function called by
report_iommu_fault() might actually be a different function than
mtk_iommu_fault_handler(), because upper layers can provide their own
handlers. This means that you need to perform any operations on
hardware from this handler and only use the iommu fault handler as a
way to tell an upper layer about the fault (including notifying the
user through kernel log if there is no special handler installed and
the generic fallback is used).

> +       else
> +               dev_err(piommu->dev, "irq number:%d\n", irq);
> +
> +       return IRQ_HANDLED;
> +}

[snip]

> +static int mtk_iommu_fault_handler(struct iommu_domain *imudomain,
> +                                  struct device *dev, unsigned long iova,
> +                                  int m4uindex, void *pimu)
> +{
> +       void __iomem *m4u_base;
> +       u32 int_state, regval;
> +       int m4u_slave_id = 0;
> +       unsigned int layer, write, m4u_port;
> +       unsigned int fault_mva, fault_pa;
> +       struct mtk_iommu_info *piommu = pimu;
> +       struct mtk_iommu_domain *mtkdomain = imudomain->priv;
> +
> +       m4u_base = piommu->m4u_base;
> +       int_state = readl(m4u_base + REG_MMU_MAIN_FAULT_ST);
> +
> +       /* read error info from registers */
> +       fault_mva = readl(m4u_base + REG_MMU_FAULT_VA(m4u_slave_id));
> +       layer = !!(fault_mva & F_MMU_FAULT_VA_LAYER_BIT);
> +       write = !!(fault_mva & F_MMU_FAULT_VA_WRITE_BIT);
> +       fault_mva &= F_MMU_FAULT_VA_MSK;
> +       fault_pa = readl(m4u_base + REG_MMU_INVLD_PA(m4u_slave_id));
> +       regval = readl(m4u_base + REG_MMU_INT_ID(m4u_slave_id));
> +       regval &= F_MMU0_INT_ID_TF_MSK;
> +       m4u_port = mtk_iommu_get_port_by_tfid(piommu, regval);
> +
> +       if (int_state & F_INT_TRANSLATION_FAULT(m4u_slave_id)) {
> +               struct m4u_pte_info_t pte;
> +               unsigned long flags;
> +
> +               spin_lock_irqsave(&mtkdomain->pgtlock, flags);
> +               m4u_get_pte_info(mtkdomain, fault_mva, &pte);
> +               spin_unlock_irqrestore(&mtkdomain->pgtlock, flags);
> +
> +               if (pte.size == MMU_SMALL_PAGE_SIZE ||
> +                   pte.size == MMU_LARGE_PAGE_SIZE) {
> +                       dev_err_ratelimited(
> +                               dev,
> +                               "fault:port=%s iova=0x%x pa=0x%x layer=%d %s;"
> +                               "pgd(0x%x)->pte(0x%x)->pa(%pad)sz(0x%x)Valid(%d)\n",
> +                               mtk_iommu_get_port_name(piommu, m4u_port),
> +                               fault_mva, fault_pa, layer,
> +                               write ? "write" : "read",
> +                               imu_pgd_val(*pte.pgd), imu_pte_val(*pte.pte),
> +                               &pte.pa, pte.size, pte.valid);
> +               } else {
> +                       dev_err_ratelimited(
> +                               dev,
> +                               "fault:port=%s iova=0x%x pa=0x%x layer=%d %s;"
> +                               "pgd(0x%x)->pa(%pad)sz(0x%x)Valid(%d)\n",
> +                               mtk_iommu_get_port_name(piommu, m4u_port),
> +                               fault_mva, fault_pa, layer,
> +                               write ? "write" : "read",
> +                               imu_pgd_val(*pte.pgd),
> +                               &pte.pa, pte.size, pte.valid);
> +               }
> +       }
> +
> +       if (int_state & F_INT_MAIN_MULTI_HIT_FAULT(m4u_slave_id))
> +               dev_err_ratelimited(dev, "multi-hit!port=%s iova=0x%x\n",
> +                                   mtk_iommu_get_port_name(piommu, m4u_port),
> +                                   fault_mva);
> +
> +       if (int_state & F_INT_INVALID_PA_FAULT(m4u_slave_id)) {
> +               if (!(int_state & F_INT_TRANSLATION_FAULT(m4u_slave_id)))
> +                       dev_err_ratelimited(dev, "invalid pa!port=%s iova=0x%x\n",
> +                                           mtk_iommu_get_port_name(piommu,
> +                                                                   m4u_port),
> +                                           fault_mva);
> +       }
> +       if (int_state & F_INT_ENTRY_REPLACEMENT_FAULT(m4u_slave_id))
> +               dev_err_ratelimited(dev, "replace-fault!port=%s iova=0x%x\n",
> +                                   mtk_iommu_get_port_name(piommu, m4u_port),
> +                                   fault_mva);
> +
> +       if (int_state & F_INT_TLB_MISS_FAULT(m4u_slave_id))
> +               dev_err_ratelimited(dev, "tlb miss-fault!port=%s iova=0x%x\n",
> +                                   mtk_iommu_get_port_name(piommu, m4u_port),
> +                                   fault_mva);
> +
> +       mtk_iommu_invalidate_tlb(piommu, 1, 0, 0);
> +
> +       mtk_iommu_clear_intr(m4u_base);

As per my comment above, most of what is happening in this function
should be actually done in interrupt handler, excluding printing of
course.

> +
> +       return 0;
> +}
> +
> +static int mtk_iommu_parse_dt(struct platform_device *pdev,
> +                             struct mtk_iommu_info *piommu)
> +{
> +       struct device *piommudev = &pdev->dev;
> +       struct device_node *ofnode;
> +       struct resource *res;
> +       unsigned int mtk_iommu_cell = 0;
> +       unsigned int i;
> +
> +       ofnode = piommudev->of_node;

You could do this assignment on the same line as declaration.

nit: The usual naming convention for such variables are "dev" for
struct device * and "np" for struct device_node *.

> +
> +       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> +       piommu->m4u_base = devm_ioremap_resource(&pdev->dev, res);

nit: There doesn't seem to be any other "bases" involved in this
driver. Could you simply name the field "base", without the obvious
prefix "m4u_"?

> +       if (IS_ERR(piommu->m4u_base)) {
> +               dev_err(piommudev, "m4u_base %p err\n", piommu->m4u_base);
> +               goto iommu_dts_err;
> +       }
> +
> +       piommu->irq = platform_get_irq(pdev, 0);
> +       if (piommu->irq < 0) {
> +               dev_err(piommudev, "irq err %d\n", piommu->irq);

Please keep the messages human readable, e.g. "Failed to get IRQ (%d)\n"

> +               goto iommu_dts_err;
> +       }
> +
> +       piommu->m4u_infra_clk = devm_clk_get(piommudev, "infra_m4u");
> +       if (IS_ERR(piommu->m4u_infra_clk)) {
> +               dev_err(piommudev, "clk err %p\n", piommu->m4u_infra_clk);

"Failed to get clock 'infra_m4u' (%d)\n" (and extract the error code
using the PTR_ERR() helper).

> +               goto iommu_dts_err;
> +       }
> +
> +       of_property_read_u32(ofnode, "#iommu-cells", &mtk_iommu_cell);
> +       if (mtk_iommu_cell != 1) {
> +               dev_err(piommudev, "iommu-cell fail:%d\n", mtk_iommu_cell);
> +               goto iommu_dts_err;
> +       }

I don't think you should be checking this here. The function
performing translation from phandle and specifier should check whether
the correct cell count was provided.

> +
> +       for (i = 0; i < piommu->imucfg->larb_nr; i++) {
> +               struct device_node *larbnode;
> +
> +               larbnode = of_parse_phandle(ofnode, "larb", i);
> +               piommu->larbpdev[i] = of_find_device_by_node(larbnode);
> +               of_node_put(larbnode);

I need to take a look at further changes, but this looks like syscon
should be used here instead of using the device directly.

> +               if (!piommu->larbpdev[i]) {
> +                       dev_err(piommudev, "larb pdev fail@larb%d\n", i);
> +                       goto iommu_dts_err;
> +               }
> +       }
> +
> +       return 0;
> +
> +iommu_dts_err:
> +       return -EINVAL;

Please return the return value that actually brought us here.

> +}
> +
> +static int mtk_iommu_hw_init(const struct mtk_iommu_domain *mtkdomain)
> +{
> +       struct mtk_iommu_info *piommu = mtkdomain->piommuinfo;
> +       void __iomem *gm4ubaseaddr = piommu->m4u_base;

Hmm, gm4ubaseaddr is exactly as long as piommu->base (if you follow my
comment to rename this field). In general,

> +       phys_addr_t protectpa;
> +       u32 regval, protectreg;
> +       int ret = 0;
> +
> +       ret = clk_prepare_enable(piommu->m4u_infra_clk);
> +       if (ret) {
> +               dev_err(piommu->dev, "m4u clk enable error\n");
> +               return -ENODEV;
> +       }
> +
> +       writel((u32)mtkdomain->pgd_pa, gm4ubaseaddr + REG_MMUG_PT_BASE);

Why this has to be casted? Is the type of pgd_pa field correct?

> +
> +       regval = F_MMU_CTRL_REROUTE_PFQ_TO_MQ_EN |
> +               F_MMU_CTRL_TF_PROT_VAL(2) |
> +               F_MMU_CTRL_COHERE_EN;
> +       writel(regval, gm4ubaseaddr + REG_MMU_CTRL_REG);
> +
> +       writel(0x6f, gm4ubaseaddr + REG_MMU_INT_L2_CONTROL);
> +       writel(0xffffffff, gm4ubaseaddr + REG_MMU_INT_MAIN_CONTROL);

Please define all the bitfields and use the definitions here instead
of magic numbers.

> +
> +       /* protect memory,HW will write here while translation fault */
> +       protectpa = __virt_to_phys(piommu->protect_va);

Why the underscore variant? virt_to_phys() should be just fine.

> +       protectpa = ALIGN(protectpa, MTK_PROTECT_PA_ALIGN);

Shouldn't protect_va be already aligned?

> +       protectreg = (u32)F_MMU_IVRP_PA_SET(protectpa);

Again, why is this cast needed?

> +       writel(protectreg, gm4ubaseaddr + REG_MMU_IVRP_PADDR);
> +
> +       writel(0, gm4ubaseaddr + REG_MMU_DCM_DIS);
> +       writel(0, gm4ubaseaddr + REG_MMU_STANDARD_AXI_MODE);
> +
> +       return 0;
> +}
> +
> +static inline void mtk_iommu_config_port(struct mtk_iommu_info *piommu,
> +                                        int portid)

Please let the compiler decide whether to inline this function or not.

> +{
> +       int larb, larb_port;
> +
> +       larb = piommu->imucfg->pport[portid].larb_id;
> +       larb_port = piommu->imucfg->pport[portid].port_id;
> +
> +       mtk_smi_config_port(piommu->larbpdev[larb], larb_port);
> +}
> +
> +/*
> + * pimudev is a global var for dma_alloc_coherent.
> + * It is not accepatable, we will delete it if "domain_alloc" is enabled
> + */
> +static struct device *pimudev;

This is indeed not acceptable. Could you replace dma_alloc_coherent()
with something that doesn't require device pointer, e.g.
alloc_pages()? (Although that would require you to handle cache
maintenance in the driver, due to cached memory allocated.) I need to
think about a better solution for this.

OK. Enough for part 2. Stay tuned for part 3.

Best regards,
Tomasz
Yong Wu (吴勇) March 9, 2015, 12:11 p.m. UTC | #6
Dear Will,

On Fri, 2015-03-06 at 10:58 +0000, Will Deacon wrote:
> On Fri, Mar 06, 2015 at 10:48:17AM +0000, yong.wu@mediatek.com wrote:
> > From: Yong Wu <yong.wu@mediatek.com>
> > 
> > This patch adds support for mediatek m4u (MultiMedia Memory Management Unit).
> > Currently this only supports m4u gen 2 with 2 levels of page table on mt8173.
> 
> [...]
> 
> > diff --git a/drivers/iommu/mtk_iommu_pagetable.c b/drivers/iommu/mtk_iommu_pagetable.c
> > new file mode 100644
> > index 0000000..5fe9640
> > --- /dev/null
> > +++ b/drivers/iommu/mtk_iommu_pagetable.c
> > @@ -0,0 +1,439 @@
> > +/*
> > + * Copyright (c) 2014-2015 MediaTek Inc.
> > + * Author: Yong Wu <yong.wu@mediatek.com>
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + *
> > + * This program is distributed in the hope that it will be useful,
> > + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > + * GNU General Public License for more details.
> > + */
> > +#include <linux/err.h>
> > +#include <linux/mm.h>
> > +#include <linux/iommu.h>
> > +#include <linux/errno.h>
> > +#include "asm/cacheflush.h"
> > +
> > +#include "mtk_iommu.h"
> > +#include "mtk_iommu_pagetable.h"
> > +
> > +/* 2 level pagetable: pgd -> pte */
> > +#define F_PTE_TYPE_GET(regval)  (regval & 0x3)
> > +#define F_PTE_TYPE_LARGE         BIT(0)
> > +#define F_PTE_TYPE_SMALL         BIT(1)
> > +#define F_PTE_B_BIT              BIT(2)
> > +#define F_PTE_C_BIT              BIT(3)
> > +#define F_PTE_BIT32_BIT          BIT(9)
> > +#define F_PTE_S_BIT              BIT(10)
> > +#define F_PTE_NG_BIT             BIT(11)
> > +#define F_PTE_PA_LARGE_MSK            (~0UL << 16)
> > +#define F_PTE_PA_LARGE_GET(regval)    ((regval >> 16) & 0xffff)
> > +#define F_PTE_PA_SMALL_MSK            (~0UL << 12)
> > +#define F_PTE_PA_SMALL_GET(regval)    ((regval >> 12) & (~0))
> > +#define F_PTE_TYPE_IS_LARGE_PAGE(pte) ((imu_pte_val(pte) & 0x3) == \
> > +                                       F_PTE_TYPE_LARGE)
> > +#define F_PTE_TYPE_IS_SMALL_PAGE(pte) ((imu_pte_val(pte) & 0x3) == \
> > +                                       F_PTE_TYPE_SMALL)
> 
> This looks like the ARM short-descriptor format to me. Could you please
> add a new page table format to the io-pgtable code, so that other IOMMU
> drivers can make use of this? I know there was some interest in using
> short descriptor for the ARM SMMU, for example.
    Currently I am not familiar with io-pgtable, so I may need some time
to study it and the ARM short-descriptor format.
    Also, there are some differences between MediaTek's page table and the
standard short-descriptor format, like bit 9, which we use for DRAM over 4GB.
What should we do if there are such differences?
> 
> Cheers,
> 
> Will
Yong Wu (吴勇) March 9, 2015, 12:16 p.m. UTC | #7
Dear Mitchel,
     Thanks very much for your review.

On Fri, 2015-03-06 at 09:15 -0800, Mitchel Humpherys wrote:
> On Fri, Mar 06 2015 at 02:48:17 AM, <yong.wu@mediatek.com> wrote:
> > From: Yong Wu <yong.wu@mediatek.com>
> >
> > This patch adds support for mediatek m4u (MultiMedia Memory Management Unit).
> > Currently this only supports m4u gen 2 with 2 levels of page table on mt8173.
> 
> [...]
> 
> > +static int mtk_iommu_invalidate_tlb(const struct mtk_iommu_info *piommu,
> > +				    int isinvall, unsigned int iova_start,
> > +				    unsigned int iova_end)
> > +{
> > +	void __iomem *m4u_base = piommu->m4u_base;
> > +	u32 val;
> > +	u64 start, end;
> > +
> > +	start = sched_clock();
> > +
> > +	if (!isinvall) {
> > +		iova_start = round_down(iova_start, SZ_4K);
> > +		iova_end = round_up(iova_end, SZ_4K);
> > +	}
> > +
> > +	val = F_MMU_INV_EN_L2 | F_MMU_INV_EN_L1;
> > +
> > +	writel(val, m4u_base + REG_INVLID_SEL);
> > +
> > +	if (isinvall) {
> > +		writel(F_MMU_INV_ALL, m4u_base + REG_MMU_INVLD);
> > +	} else {
> > +		writel(iova_start, m4u_base + REG_MMU_INVLD_SA);
> > +		writel(iova_end, m4u_base + REG_MMU_INVLD_EA);
> > +		writel(F_MMU_INV_RANGE, m4u_base + REG_MMU_INVLD);
> > +
> > +		while (!readl(m4u_base + REG_MMU_CPE_DONE)) {
> > +			end = sched_clock();
> > +			if (end - start >= 100000000ULL) {
> > +				dev_warn(piommu->dev, "invalid don't done\n");
> > +				writel(F_MMU_INV_ALL, m4u_base + REG_MMU_INVLD);
> > +			}
> > +		};
> 
> Superfluous `;'.  Also, maybe you should be using readl_poll_timeout?
   Thanks.
   Regarding "readl_poll_timeout": my base is 3.19-rc7 plus Robin's patches,
which don't have this interface.  I will try to add it in the next version.
> 
> > +		writel(0, m4u_base + REG_MMU_CPE_DONE);
> > +	}
> > +
> > +	return 0;
> > +}
> 
> 
> 
> -Mitch
>
Yingjoe Chen March 9, 2015, 2:46 p.m. UTC | #8
On Mon, 2015-03-09 at 20:11 +0900, Tomasz Figa wrote:
<...>
> > +/*
> > + * pimudev is a global var for dma_alloc_coherent.
> > + * It is not accepatable, we will delete it if "domain_alloc" is enabled
> > + */
> > +static struct device *pimudev;
> 
> This is indeed not acceptable. Could you replace dma_alloc_coherent()
> with something that doesn't require device pointer, e.g.
> alloc_pages()? (Although that would require you to handle cache
> maintenance in the driver, due to cached memory allocated.) I need to
> think about a better solution for this.

Hi,

For 2nd level page table, we use cached memory now. Currently we are
using __dma_flush_range to flush the cache, which is also unacceptable.

For proper cache management, we'll need to use dma_map_single or
dma_sync_*, which still need a device*.

Joe.C
Mitchel Humpherys March 9, 2015, 4:57 p.m. UTC | #9
On Mon, Mar 09 2015 at 05:16:26 AM, Yong Wu <yong.wu@mediatek.com> wrote:
> Dear Mitchel,
>      Thanks very much for your review.
>
> On Fri, 2015-03-06 at 09:15 -0800, Mitchel Humpherys wrote:
>> On Fri, Mar 06 2015 at 02:48:17 AM, <yong.wu@mediatek.com> wrote:
>> > From: Yong Wu <yong.wu@mediatek.com>
>> >
>> > This patch adds support for mediatek m4u (MultiMedia Memory Management Unit).
>> > Currently this only supports m4u gen 2 with 2 levels of page table on mt8173.
>> 
>> [...]
>> 
>> > +static int mtk_iommu_invalidate_tlb(const struct mtk_iommu_info *piommu,
>> > +				    int isinvall, unsigned int iova_start,
>> > +				    unsigned int iova_end)
>> > +{
>> > +	void __iomem *m4u_base = piommu->m4u_base;
>> > +	u32 val;
>> > +	u64 start, end;
>> > +
>> > +	start = sched_clock();
>> > +
>> > +	if (!isinvall) {
>> > +		iova_start = round_down(iova_start, SZ_4K);
>> > +		iova_end = round_up(iova_end, SZ_4K);
>> > +	}
>> > +
>> > +	val = F_MMU_INV_EN_L2 | F_MMU_INV_EN_L1;
>> > +
>> > +	writel(val, m4u_base + REG_INVLID_SEL);
>> > +
>> > +	if (isinvall) {
>> > +		writel(F_MMU_INV_ALL, m4u_base + REG_MMU_INVLD);
>> > +	} else {
>> > +		writel(iova_start, m4u_base + REG_MMU_INVLD_SA);
>> > +		writel(iova_end, m4u_base + REG_MMU_INVLD_EA);
>> > +		writel(F_MMU_INV_RANGE, m4u_base + REG_MMU_INVLD);
>> > +
>> > +		while (!readl(m4u_base + REG_MMU_CPE_DONE)) {
>> > +			end = sched_clock();
>> > +			if (end - start >= 100000000ULL) {
>> > +				dev_warn(piommu->dev, "invalid don't done\n");
>> > +				writel(F_MMU_INV_ALL, m4u_base + REG_MMU_INVLD);
>> > +			}
>> > +		};
>> 
>> Superfluous `;'.  Also, maybe you should be using readl_poll_timeout?
>    Thanks.
>    For the "readl_poll_timeout", My base is 3.19-rc7 and robin's patch.
> it don't have this interface.  I will try to add it in the next version.

Yeah it was merged in v4.0-rc1.

-Mitch
Tomasz Figa March 9, 2015, 5 p.m. UTC | #10
On Mon, Mar 9, 2015 at 11:46 PM, Yingjoe Chen <yingjoe.chen@mediatek.com> wrote:
> On Mon, 2015-03-09 at 20:11 +0900, Tomasz Figa wrote:
> <...>
>> > +/*
>> > + * pimudev is a global var for dma_alloc_coherent.
>> > + * It is not accepatable, we will delete it if "domain_alloc" is enabled
>> > + */
>> > +static struct device *pimudev;
>>
>> This is indeed not acceptable. Could you replace dma_alloc_coherent()
>> with something that doesn't require device pointer, e.g.
>> alloc_pages()? (Although that would require you to handle cache
>> maintenance in the driver, due to cached memory allocated.) I need to
>> think about a better solution for this.
>
> Hi,
>
> For 2nd level page table, we use cached memory now. Currently we are
> using __dma_flush_range to flush the cache, which is also unacceptable.
>
> For proper cache management, we'll need to use dma_map_single or
> dma_sync_*, which still need a deivce*.

Looking at how already mainlined drivers do this, they either use
dmac_flush_range()
(https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/drivers/iommu/msm_iommu.c?id=refs/tags/v4.0-rc3#n80)
or directly __cpuc_flush_dcache_area() and outer_flush_range()
(https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/drivers/iommu/rockchip-iommu.c?id=refs/tags/v4.0-rc3#n93).

Joerg, what's your opinion on this?

Best regards,
Tomasz
Yingjoe Chen March 10, 2015, 3:41 a.m. UTC | #11
On Tue, 2015-03-10 at 02:00 +0900, Tomasz Figa wrote:
> On Mon, Mar 9, 2015 at 11:46 PM, Yingjoe Chen <yingjoe.chen@mediatek.com> wrote:
> > On Mon, 2015-03-09 at 20:11 +0900, Tomasz Figa wrote:
> > <...>
> >> > +/*
> >> > + * pimudev is a global var for dma_alloc_coherent.
> >> > + * It is not accepatable, we will delete it if "domain_alloc" is enabled
> >> > + */
> >> > +static struct device *pimudev;
> >>
> >> This is indeed not acceptable. Could you replace dma_alloc_coherent()
> >> with something that doesn't require device pointer, e.g.
> >> alloc_pages()? (Although that would require you to handle cache
> >> maintenance in the driver, due to cached memory allocated.) I need to
> >> think about a better solution for this.
> >
> > Hi,
> >
> > For 2nd level page table, we use cached memory now. Currently we are
> > using __dma_flush_range to flush the cache, which is also unacceptable.
> >
> > For proper cache management, we'll need to use dma_map_single or
> > dma_sync_*, which still need a deivce*.
> 
> Looking at how already mainlined drivers do this, they either use
> dmac_flush_range()
> (https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/drivers/iommu/msm_iommu.c?id=refs/tags/v4.0-rc3#n80)
> or directly __cpuc_flush_dcache_area() and outer_flush_range()
> (https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/drivers/iommu/rockchip-iommu.c?id=refs/tags/v4.0-rc3#n93).

Hi,

These only exist in arch/arm, not arm64. I think we should avoid using
APIs starting with __ in drivers. Since this driver might be used on both
arm and arm64, I think the only option for us is the DMA APIs.

Actually, I'm thinking that we should change to use coherent memory for
the 2nd level page table as well and skip the cache flush entirely. It seems
dma_pool_create is suitable to replace the kmem_cache we are using right
now. However, it still needs a device*, which we have to fix anyway.

Joe.C
Tomasz Figa March 10, 2015, 4:06 a.m. UTC | #12
On Tue, Mar 10, 2015 at 12:41 PM, Yingjoe Chen
<yingjoe.chen@mediatek.com> wrote:
> On Tue, 2015-03-10 at 02:00 +0900, Tomasz Figa wrote:
>> On Mon, Mar 9, 2015 at 11:46 PM, Yingjoe Chen <yingjoe.chen@mediatek.com> wrote:
>> > On Mon, 2015-03-09 at 20:11 +0900, Tomasz Figa wrote:
>> > <...>
>> >> > +/*
>> >> > + * pimudev is a global var for dma_alloc_coherent.
>> >> > + * It is not accepatable, we will delete it if "domain_alloc" is enabled
>> >> > + */
>> >> > +static struct device *pimudev;
>> >>
>> >> This is indeed not acceptable. Could you replace dma_alloc_coherent()
>> >> with something that doesn't require device pointer, e.g.
>> >> alloc_pages()? (Although that would require you to handle cache
>> >> maintenance in the driver, due to cached memory allocated.) I need to
>> >> think about a better solution for this.
>> >
>> > Hi,
>> >
>> > For 2nd level page table, we use cached memory now. Currently we are
>> > using __dma_flush_range to flush the cache, which is also unacceptable.
>> >
>> > For proper cache management, we'll need to use dma_map_single or
>> > dma_sync_*, which still need a deivce*.
>>
>> Looking at how already mainlined drivers do this, they either use
>> dmac_flush_range()
>> (https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/drivers/iommu/msm_iommu.c?id=refs/tags/v4.0-rc3#n80)
>> or directly __cpuc_flush_dcache_area() and outer_flush_range()
>> (https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/drivers/iommu/rockchip-iommu.c?id=refs/tags/v4.0-rc3#n93).
>
> Hi,
>
> These only exist in arch/arm, not arm64. I think we should avoid using
> API start with __ in drivers. This driver might be used in both
> arm/arm64, I think the only option for us is DMA APIs.
>
> Actually, I'm thinking that we should change to use coherent memory for
> 2nd level page table as well and totally skip the cache flush. It seems
> dma_pool_create is suitable to replace kmem_cache we are using right
> now. However it still need a device*, which we have to fix anyway.

That sounds like a reasonable option, because this is what we have DMA
mapping API for.

Do you expect to have more than one M4U block inside a SoC? Maybe this
static variable actually isn't that bad, with a comment added
explaining that there is always only one such block and that a rework
will be needed if future SoCs will have more.

Best regards,
Tomasz
Tomasz Figa March 11, 2015, 10:53 a.m. UTC | #13
Hi,

Please find next part of my comments inline.

On Fri, Mar 6, 2015 at 7:48 PM,  <yong.wu@mediatek.com> wrote:

[snip]

> +/*
> + * pimudev is a global var for dma_alloc_coherent.
> + * It is not accepatable, we will delete it if "domain_alloc" is enabled

It looks like we indeed need to use dma_alloc_coherent() and we don't
have a good way to pass the device pointer to domain_init callback.

If you don't expect SoCs in the nearest future to have multiple M4U
blocks, then I guess this global variable could stay, after changing
the comment into an explanation why it's correct. Also it should be
moved to the top of the file, below #include directives, as this is
where usually global variables are located.

> + */
> +static struct device *pimudev;
> +
> +static int mtk_iommu_domain_init(struct iommu_domain *domain)
> +{
> +       struct mtk_iommu_domain *priv;
> +
> +       priv = kzalloc(sizeof(*priv), GFP_KERNEL);
> +       if (!priv)
> +               return -ENOMEM;
> +
> +       priv->pgd = dma_alloc_coherent(pimudev, M4U_PGD_SIZE, &priv->pgd_pa,
> +                                      GFP_KERNEL);
> +       if (!priv->pgd) {
> +               pr_err("dma_alloc_coherent pagetable fail\n");
> +               goto err_pgtable;

ret = -ENOMEM;
goto err_free_priv;

> +       }
> +
> +       if (!IS_ALIGNED(priv->pgd_pa, M4U_PGD_SIZE)) {
> +               pr_err("pagetable not aligned pa 0x%pad-0x%p align 0x%x\n",
> +                      &priv->pgd_pa, priv->pgd, M4U_PGD_SIZE);
> +               goto err_pgtable;

ret = -ENOMEM;
goto err_dmafree;

> +       }
> +
> +       memset(priv->pgd, 0, M4U_PGD_SIZE);
> +
> +       spin_lock_init(&priv->pgtlock);
> +       spin_lock_init(&priv->portlock);
> +       domain->priv = priv;
> +
> +       domain->geometry.aperture_start = 0;
> +       domain->geometry.aperture_end   = (unsigned int)~0;

Please replace the cast with ~0U and fix multiple spaces between =.

> +       domain->geometry.force_aperture = true;
> +
> +       return 0;
> +
> +err_pgtable:

err_dma_free:

> +       if (priv->pgd)

Remove this check.

> +               dma_free_coherent(pimudev, M4U_PGD_SIZE, priv->pgd,
> +                                 priv->pgd_pa);

err_free_priv:

> +       kfree(priv);
> +       return -ENOMEM;

return ret;

> +}
> +
> +static void mtk_iommu_domain_destroy(struct iommu_domain *domain)
> +{
> +       struct mtk_iommu_domain *priv = domain->priv;
> +
> +       dma_free_coherent(priv->piommuinfo->dev, M4U_PGD_SIZE,
> +                         priv->pgd, priv->pgd_pa);
> +       kfree(domain->priv);
> +       domain->priv = NULL;
> +}
> +
> +static int mtk_iommu_attach_device(struct iommu_domain *domain,
> +                                  struct device *dev)
> +{
> +       unsigned long flags;
> +       struct mtk_iommu_domain *priv = domain->priv;
> +       struct mtk_iommu_info *piommu = priv->piommuinfo;
> +       struct of_phandle_args out_args = {0};
> +       struct device *imudev;
> +       unsigned int i = 0;
> +
> +       if (!piommu)

Could you explain when this can happen?

> +               goto imudev;

return 0;

> +       else

No else needed.

> +               imudev = piommu->dev;
> +
> +       spin_lock_irqsave(&priv->portlock, flags);

What is protected by this spinlock?

> +
> +       while (!of_parse_phandle_with_args(dev->of_node, "iommus",
> +                                          "#iommu-cells", i, &out_args)) {
> +               if (1 == out_args.args_count) {

Can we be sure that this is actually referring to our IOMMU?

Maybe this should be rewritten to

if (out_args.np != imudev->of_node)
        continue;
if (out_args.args_count != 1) {
        dev_err(imudev, "invalid #iommu-cells property for IOMMU %s\n",

}

> +                       unsigned int portid = out_args.args[0];
> +
> +                       dev_dbg(dev, "iommu add port:%d\n", portid);

imudev should be used here instead of dev.

> +
> +                       mtk_iommu_config_port(piommu, portid);
> +
> +                       if (i == 0)
> +                               dev->archdata.dma_ops =
> +                                       piommu->dev->archdata.dma_ops;

Shouldn't this be set automatically by IOMMU or DMA mapping core?

> +               }
> +               i++;
> +       }
> +
> +       spin_unlock_irqrestore(&priv->portlock, flags);
> +
> +imudev:
> +       return 0;
> +}
> +
> +static void mtk_iommu_detach_device(struct iommu_domain *domain,
> +                                   struct device *dev)
> +{

No hardware (de)configuration or clean-up necessary?

> +}
> +
> +static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova,
> +                        phys_addr_t paddr, size_t size, int prot)
> +{
> +       struct mtk_iommu_domain *priv = domain->priv;
> +       unsigned long flags;
> +       int ret;
> +
> +       spin_lock_irqsave(&priv->pgtlock, flags);
> +       ret = m4u_map(priv, (unsigned int)iova, paddr, size, prot);
> +       mtk_iommu_invalidate_tlb(priv->piommuinfo, 0,
> +                                iova, iova + size - 1);
> +       spin_unlock_irqrestore(&priv->pgtlock, flags);
> +
> +       return ret;
> +}
> +
> +static size_t mtk_iommu_unmap(struct iommu_domain *domain,
> +                             unsigned long iova, size_t size)
> +{
> +       struct mtk_iommu_domain *priv = domain->priv;
> +       unsigned long flags;
> +
> +       spin_lock_irqsave(&priv->pgtlock, flags);
> +       m4u_unmap(priv, (unsigned int)iova, size);
> +       mtk_iommu_invalidate_tlb(priv->piommuinfo, 0,
> +                                iova, iova + size - 1);
> +       spin_unlock_irqrestore(&priv->pgtlock, flags);
> +
> +       return size;
> +}
> +
> +static phys_addr_t mtk_iommu_iova_to_phys(struct iommu_domain *domain,
> +                                         dma_addr_t iova)
> +{
> +       struct mtk_iommu_domain *priv = domain->priv;
> +       unsigned long flags;
> +       struct m4u_pte_info_t pte;
> +
> +       spin_lock_irqsave(&priv->pgtlock, flags);
> +       m4u_get_pte_info(priv, (unsigned int)iova, &pte);
> +       spin_unlock_irqrestore(&priv->pgtlock, flags);
> +
> +       return pte.pa;
> +}
> +
> +static struct iommu_ops mtk_iommu_ops = {
> +       .domain_init = mtk_iommu_domain_init,
> +       .domain_destroy = mtk_iommu_domain_destroy,
> +       .attach_dev = mtk_iommu_attach_device,
> +       .detach_dev = mtk_iommu_detach_device,
> +       .map = mtk_iommu_map,
> +       .unmap = mtk_iommu_unmap,
> +       .map_sg = default_iommu_map_sg,
> +       .iova_to_phys = mtk_iommu_iova_to_phys,
> +       .pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
> +};
> +
> +static const struct of_device_id mtk_iommu_of_ids[] = {
> +       { .compatible = "mediatek,mt8173-iommu",
> +         .data = &mtk_iommu_mt8173_cfg,
> +       },
> +       {}
> +};
> +
> +static int mtk_iommu_probe(struct platform_device *pdev)
> +{
> +       int ret;
> +       struct iommu_domain *domain;
> +       struct mtk_iommu_domain *mtk_domain;
> +       struct mtk_iommu_info *piommu;
> +       struct iommu_dma_domain  *dom;
> +       const struct of_device_id *of_id;
> +
> +       piommu = devm_kzalloc(&pdev->dev, sizeof(struct mtk_iommu_info),

sizeof(*piommu)

> +                             GFP_KERNEL);
> +       if (!piommu)
> +               return -ENOMEM;
> +
> +       pimudev = &pdev->dev;
> +       piommu->dev = &pdev->dev;
> +
> +       of_id = of_match_node(mtk_iommu_of_ids, pdev->dev.of_node);
> +       if (!of_id)
> +               return -ENODEV;

Please print an error message.

> +
> +       piommu->protect_va = devm_kmalloc(piommu->dev, MTK_PROTECT_PA_ALIGN*2,

style: Operators like * should have space on both sides.

> +                                         GFP_KERNEL);

Shouldn't dma_alloc_coherent() be used for this?

> +       if (!piommu->protect_va)
> +               goto protect_err;

Please return -ENOMEM here directly, as there is nothing to clean up
in this case.

> +       memset(piommu->protect_va, 0x55, MTK_PROTECT_PA_ALIGN*2);
> +
> +       piommu->imucfg = (const struct mtk_iommu_cfg *)of_id->data;

Why this cast is needed? Since of_id->data is const void * it should
be fine without a cast.

> +
> +       ret = mtk_iommu_parse_dt(pdev, piommu);
> +       if (ret) {
> +               dev_err(piommu->dev, "iommu dt parse fail\n");
> +               goto protect_err;

Still nothing to clean-up, so you can safely just return ret;

> +       }
> +
> +       /* alloc memcache for level-2 pgt */
> +       piommu->m4u_pte_kmem = kmem_cache_create("m4u_pte", IMU_BYTES_PER_PTE,
> +                                                IMU_BYTES_PER_PTE, 0, NULL);
> +
> +       if (IS_ERR_OR_NULL(piommu->m4u_pte_kmem)) {

Looks like the convention used by kmem_cache_create() is valid ptr or
NULL, no ERR_PTR()s.

> +               dev_err(piommu->dev, "pte cached create fail %p\n",
> +                       piommu->m4u_pte_kmem);
> +               goto protect_err;

Still nothing to clean-up.

> +       }
> +
> +       arch_setup_dma_ops(piommu->dev, 0, (1ULL<<32) - 1, &mtk_iommu_ops, 0);

style: Missing spaces around << operator.

> +
> +       dom = get_dma_domain(piommu->dev);
> +       domain = iommu_dma_raw_domain(dom);
> +
> +       mtk_domain = domain->priv;

domain is already dereferenced here, but NULL pointer check is two lines below.

> +       mtk_domain->piommuinfo = piommu;
> +
> +       if (!domain)
> +               goto pte_err;

Please print error message.

> +
> +       ret = mtk_iommu_hw_init(mtk_domain);
> +       if (ret < 0)
> +               goto hw_err;

Please print error message.

> +
> +       if (devm_request_irq(piommu->dev, piommu->irq,
> +                            mtk_iommu_isr, IRQF_TRIGGER_NONE,
> +                            "mtkiommu", (void *)domain)) {

Please align wrapped lines using tabs only.

Also, what do you mean by IRQF_TRIGGER_NONE? If you don't need any
special flags then 0 is enough.

Also, usually dev_name() is used for interrupt name.

Also, unnecessary cast of domain to void *.

> +               dev_err(piommu->dev, "IRQ request %d failed\n",
> +                       piommu->irq);
> +               goto hw_err;
> +       }
> +
> +       iommu_set_fault_handler(domain, mtk_iommu_fault_handler, piommu);

I don't see any other drivers doing this. Isn't this for upper layers,
so that they can set their own generic fault handlers?

> +
> +       dev_set_drvdata(piommu->dev, piommu);

This should be set before allowing the interrupt to fire. In other
words, the driver should be fully set up at the time of enabling the
IRQ.

> +
> +       return 0;

style: Missing blank line.

> +hw_err:
> +       arch_teardown_dma_ops(piommu->dev);
> +pte_err:
> +       kmem_cache_destroy(piommu->m4u_pte_kmem);
> +protect_err:
> +       dev_err(piommu->dev, "probe error\n");

Please replace this with specific messages for all errors (in case the
called function doesn't already print one like kmalloc and friends).

> +       return 0;

Returning 0, which means success, doesn't look like a good idea for
signalling a failure. Please return the correct error code as received
from function that errors out if possible.

End of part 3.

Best regards,
Tomasz
Yong Wu (吴勇) March 12, 2015, 2:16 p.m. UTC | #14
Dear Tomasz,

      Thanks very much for reviewing in such detail!

      Please check my reply below. The other issues I will fix in the next
version.

      And I have received your comments on [2/5]. Do you have a plan for the
other patches?

On Sun, 2015-03-08 at 13:12 +0900, Tomasz Figa wrote:
> Hi Yong Wu,
> 
> Thanks for this series. Please see my comments inline.
> 
> On Fri, Mar 6, 2015 at 7:48 PM,  <yong.wu@mediatek.com> wrote:
> > From: Yong Wu <yong.wu@mediatek.com>
> >
> > This patch adds support for mediatek m4u (MultiMedia Memory Management Unit).
> > Currently this only supports m4u gen 2 with 2 levels of page table on mt8173.
> 
[snip]
> > +static const struct mtk_iommu_port mtk_iommu_mt8173_port[] = {
> > +       /* port name                m4uid slaveid larbid portid tfid */
> > +       /* larb0 */
> > +       {"M4U_PORT_DISP_OVL0",          0,  0,    0,  0, MTK_TFID(0, 0)},
> > +       {"M4U_PORT_DISP_RDMA0",         0,  0,    0,  1, MTK_TFID(0, 1)},
> > +       {"M4U_PORT_DISP_WDMA0",         0,  0,    0,  2, MTK_TFID(0, 2)},
> > +       {"M4U_PORT_DISP_OD_R",          0,  0,    0,  3, MTK_TFID(0, 3)},
> > +       {"M4U_PORT_DISP_OD_W",          0,  0,    0,  4, MTK_TFID(0, 4)},
> > +       {"M4U_PORT_MDP_RDMA0",          0,  0,    0,  5, MTK_TFID(0, 5)},
> > +       {"M4U_PORT_MDP_WDMA",           0,  0,    0,  6, MTK_TFID(0, 6)},
> > +       {"M4U_PORT_MDP_WROT0",          0,  0,    0,  7, MTK_TFID(0, 7)},
> 
> [snip]
> 
> > +       {"M4U_PORT_HSIC_MAS",              1,  0,    6,  12, 0x11},
> > +       {"M4U_PORT_HSIC_DEV",              1,  0,    6,  13, 0x19},
> > +       {"M4U_PORT_AP_DMA",                1,  0,    6,  14, 0x18},
> > +       {"M4U_PORT_HSIC_DMA",              1,  0,    6,  15, 0xc8},
> > +       {"M4U_PORT_MSDC0",                 1,  0,    6,  16, 0x0},
> > +       {"M4U_PORT_MSDC3",                 1,  0,    6,  17, 0x20},
> > +       {"M4U_PORT_UNKNOWN",               1,  0,    6,  18, 0xf},
> 
> Why the MTK_TFID() macro is not used for perisys iommu?
> 
       The perisys iommu is not connected with SMI and the local arbiter,
so its translation fault id is not MTK_TFID(x, y); it is special.
       This perisys iommu is different from the multimedia iommu, and
we don't support it in this version. We plan to delete the perisys
iommu ports next time.

> > +};
> > +
> 
> Anyway, is it really necessary to hardcode the SoC specific topology
> data in this driver? Is there really any use besides of printing port
> name? If not, you could just print the values in a way letting you
> quickly look up in the datasheet, without hardcoding this. Or even
> better, you could print which devices are attached to the port.
> 
a) Printing the port name is for debugging. We cannot expect every iommu
user to understand the SMI and local arbiter. When an irq occurs, they
would have to look up the iommu's datasheet to find out which port
faulted; if we print it directly, it is easier to debug.

b) In mtk_iommu_config_port, using this hardcoded table we can easily
work out which local arbiter and which port we are about to configure.

c) If we support different SoCs, we can change these arrays easily.

> > +static const struct mtk_iommu_cfg mtk_iommu_mt8173_cfg = {
> > +       .larb_nr = 6,
> > +       .m4u_port_nr = ARRAY_SIZE(mtk_iommu_mt8173_port),
> > +       .pport = mtk_iommu_mt8173_port,
> > +};
> > +
> > +static const char *mtk_iommu_get_port_name(const struct mtk_iommu_info *piommu,
> > +                                          unsigned int portid)
> > +{
> > +       const struct mtk_iommu_port *pcurport = NULL;
> > +
> > +       pcurport = piommu->imucfg->pport + portid;
> > +       if (portid < piommu->imucfg->m4u_port_nr && pcurport)
> > +               return pcurport->port_name;
> > +       else
> > +               return "UNKNOWN_PORT";
> > +}
> 
> This function seems to be used just for printing the hardcoded port names.
> 
> > +
> > +static int mtk_iommu_get_port_by_tfid(const struct mtk_iommu_info *pimu,
> > +                                     int tf_id)
> > +{
> > +       const struct mtk_iommu_cfg *pimucfg = pimu->imucfg;
> > +       int i;
> > +       unsigned int portid = pimucfg->m4u_port_nr;
> > +
> > +       for (i = 0; i < pimucfg->m4u_port_nr; i++) {
> > +               if (pimucfg->pport[i].tf_id == tf_id) {
> > +                       portid = i;
> > +                       break;
> > +               }
> > +       }
> > +       if (i == pimucfg->m4u_port_nr)
> > +               dev_err(pimu->dev, "tf_id find fail, tfid %d\n", tf_id);
> > +       return portid;
> > +}
> 
> This function seems to be used just for finding an index into the
> array of hardcoded port names for printing purposes.
   Yes. "mtk_iommu_get_port_name" and "mtk_iommu_get_port_by_tfid" are
only for finding the right port to print, to improve debugging.
> 
[snip]
> > +
> > +static int mtk_iommu_invalidate_tlb(const struct mtk_iommu_info *piommu,
> > +                                   int isinvall, unsigned int iova_start,
> > +                                   unsigned int iova_end)
> > +{
> > +       void __iomem *m4u_base = piommu->m4u_base;
> > +       u32 val;
> > +       u64 start, end;
> > +
> > +       start = sched_clock();
> 
> I don't think this is the preferred way of checking time in kernel
> drivers, especially after seeing this comment:
> http://lxr.free-electrons.com/source/include/linux/sched.h#L2134
> 
> You should use ktime_get() and other ktime_ helpers.
> 
   I will try to replace this with readl_polling_timeout from Mitchel.
Is it ok?

> > +
> > +       if (!isinvall) {
> > +               iova_start = round_down(iova_start, SZ_4K);
> > +               iova_end = round_up(iova_end, SZ_4K);
> > +       }
> > +
> > +       val = F_MMU_INV_EN_L2 | F_MMU_INV_EN_L1;
> > +
> > +       writel(val, m4u_base + REG_INVLID_SEL);
> > +
> > +       if (isinvall) {
> > +               writel(F_MMU_INV_ALL, m4u_base + REG_MMU_INVLD);
> 
> Please move invalidate all into separate function and just call it
> wherever the code currently passes true as invall argument. You will
> get rid of two of the ifs in this function.
> 
> > +       } else {
> > +               writel(iova_start, m4u_base + REG_MMU_INVLD_SA);
> > +               writel(iova_end, m4u_base + REG_MMU_INVLD_EA);
> > +               writel(F_MMU_INV_RANGE, m4u_base + REG_MMU_INVLD);
> > +
> > +               while (!readl(m4u_base + REG_MMU_CPE_DONE)) {
> > +                       end = sched_clock();
> > +                       if (end - start >= 100000000ULL) {
> 
> Looks like a very interesting magic number. Please define a macro and
> use normal time units along with ktime_to_<unit>() helpers.
> 
> > +                               dev_warn(piommu->dev, "invalid don't done\n");
> > +                               writel(F_MMU_INV_ALL, m4u_base + REG_MMU_INVLD);
> 
> By following my comment above, you could just call the new invalidate
> all function here instead of duplicating the same register write.
[snip]
> Best regards,
> Tomasz
Will Deacon March 17, 2015, 3:14 p.m. UTC | #15
On Mon, Mar 09, 2015 at 12:11:43PM +0000, Yong Wu wrote:
> On Fri, 2015-03-06 at 10:58 +0000, Will Deacon wrote:
> > On Fri, Mar 06, 2015 at 10:48:17AM +0000, yong.wu@mediatek.com wrote:
> > > From: Yong Wu <yong.wu@mediatek.com>
> > > 
> > > This patch adds support for mediatek m4u (MultiMedia Memory Management Unit).
> > > Currently this only supports m4u gen 2 with 2 levels of page table on mt8173.
> > 
> > [...]
> > 
> > > +/* 2 level pagetable: pgd -> pte */
> > > +#define F_PTE_TYPE_GET(regval)  (regval & 0x3)
> > > +#define F_PTE_TYPE_LARGE         BIT(0)
> > > +#define F_PTE_TYPE_SMALL         BIT(1)
> > > +#define F_PTE_B_BIT              BIT(2)
> > > +#define F_PTE_C_BIT              BIT(3)
> > > +#define F_PTE_BIT32_BIT          BIT(9)
> > > +#define F_PTE_S_BIT              BIT(10)
> > > +#define F_PTE_NG_BIT             BIT(11)
> > > +#define F_PTE_PA_LARGE_MSK            (~0UL << 16)
> > > +#define F_PTE_PA_LARGE_GET(regval)    ((regval >> 16) & 0xffff)
> > > +#define F_PTE_PA_SMALL_MSK            (~0UL << 12)
> > > +#define F_PTE_PA_SMALL_GET(regval)    ((regval >> 12) & (~0))
> > > +#define F_PTE_TYPE_IS_LARGE_PAGE(pte) ((imu_pte_val(pte) & 0x3) == \
> > > +                                       F_PTE_TYPE_LARGE)
> > > +#define F_PTE_TYPE_IS_SMALL_PAGE(pte) ((imu_pte_val(pte) & 0x3) == \
> > > +                                       F_PTE_TYPE_SMALL)
> > 
> > This looks like the ARM short-descriptor format to me. Could you please
> > add a new page table format to the io-pgtable code, so that other IOMMU
> > drivers can make use of this? I know there was some interest in using
> > short descriptor for the ARM SMMU, for example.
>     Currently I not familiar with the io-pgtable,I may need some time
> for it and the ARM short-descriptor. 

Well, you can read the LPAE version I wrote in io-pgtable-arm.c for some
inspiration (it's used by arm-smmu.c and ipmmu-vmsa.c).

>     And there are some difference between mediatek's pagetable with the
> standard short-descriptor, like bit 9. we use it for the dram over 4GB.
> Then how should we do if there are some difference. 

That can easily be handled using a quirk (see, for example,
IO_PGTABLE_QUIRK_ARM_NS).

Will
Yong Wu (吴勇) March 18, 2015, 11:22 a.m. UTC | #16
Hi Tomasz,
   Thanks very much for your review. Please check my replies below.
The other issues I will fix in the next version.

Hi Robin,
   There are some places I would like you to have a look at and give me
some suggestions.

On Wed, 2015-03-11 at 19:53 +0900, Tomasz Figa wrote:
> Hi,
> 
> Please find next part of my comments inline.
> 
> On Fri, Mar 6, 2015 at 7:48 PM,  <yong.wu@mediatek.com> wrote:
> 
> [snip]
> 
> > +/*
> > + * pimudev is a global var for dma_alloc_coherent.
> > + * It is not accepatable, we will delete it if "domain_alloc" is enabled
> 
> It looks like we indeed need to use dma_alloc_coherent() and we don't
> have a good way to pass the device pointer to domain_init callback.
> 
> If you don't expect SoCs in the nearest future to have multiple M4U
> blocks, then I guess this global variable could stay, after changing
> the comment into an explanation why it's correct. Also it should be
> moved to the top of the file, below #include directives, as this is
> where usually global variables are located.
@Robin,
     We have merged this patch[0] in order to delete the global var, but
it seems that your "arm64: IOMMU" patch isn't based on it right now,
so it will fail to build.

[0]:http://lists.linuxfoundation.org/pipermail/iommu/2015-January/011939.html

> > + */
> > +static struct device *pimudev;
> > +
[snip]
> > +
> > +static int mtk_iommu_attach_device(struct iommu_domain *domain,
> > +                                  struct device *dev)
> > +{
> > +       unsigned long flags;
> > +       struct mtk_iommu_domain *priv = domain->priv;
> > +       struct mtk_iommu_info *piommu = priv->piommuinfo;
> > +       struct of_phandle_args out_args = {0};
> > +       struct device *imudev;
> > +       unsigned int i = 0;
> > +
> > +       if (!piommu)
> 
> Could you explain when this can happen?
	If we call arch_setup_dma_ops to create an iommu domain,
it will enter iommu_dma_attach_device and then get here. At that time, we
have not yet added the private data to this "struct iommu_domain *".
@Robin, could this be improved?
> 
> > +               goto imudev;
> 
> return 0;
> 
> > +       else
> 
> No else needed.
> 
> > +               imudev = piommu->dev;
> > +
> > +       spin_lock_irqsave(&priv->portlock, flags);
> 
> What is protected by this spinlock?
	We write a register of the local arbiter while configuring a port. If
some modules are in the same local arbiter, it may be overwritten, so I
added the lock here.
> 
> > +
> > +       while (!of_parse_phandle_with_args(dev->of_node, "iommus",
> > +                                          "#iommu-cells", i, &out_args)) {
> > +               if (1 == out_args.args_count) {
> 
> Can we be sure that this is actually referring to our IOMMU?
> 
> Maybe this should be rewritten to
> 
> if (out_args.np != imudev->of_node)
>         continue;
> if (out_args.args_count != 1) {
>         dev_err(imudev, "invalid #iommu-cells property for IOMMU %s\n",
> 
> }
> 
> > +                       unsigned int portid = out_args.args[0];
> > +
> > +                       dev_dbg(dev, "iommu add port:%d\n", portid);
> 
> imudev should be used here instead of dev.
> 
> > +
> > +                       mtk_iommu_config_port(piommu, portid);
> > +
> > +                       if (i == 0)
> > +                               dev->archdata.dma_ops =
> > +                                       piommu->dev->archdata.dma_ops;
> 
> Shouldn't this be set automatically by IOMMU or DMA mapping core?
@Robin, 
     The original "arm_iommu_attach_device" in arm/mm calls
set_dma_ops to add the iommu ops for each iommu device.
But iommu_dma_attach_device doesn't do this, so I have to add it here.
Could this be improved?
> 
> > +               }
> > +               i++;
> > +       }
> > +
> > +       spin_unlock_irqrestore(&priv->portlock, flags);
> > +
> > +imudev:
> > +       return 0;
> > +}
> > +
> > +static void mtk_iommu_detach_device(struct iommu_domain *domain,
> > +                                   struct device *dev)
> > +{
> 
> No hardware (de)configuration or clean-up necessary?
I will add it. Actually we designed it like this: once a device has
attached to an iommu domain, it won't detach from it.
> 
> > +}
> > +
[snip]
> 
> > +
> > +       piommu->protect_va = devm_kmalloc(piommu->dev, MTK_PROTECT_PA_ALIGN*2,
> 
> style: Operators like * should have space on both sides.
> 
> > +                                         GFP_KERNEL);
> 
> Shouldn't dma_alloc_coherent() be used for this?
     We don't care about the data in it, so I think they are the same.
Could you explain why dma_alloc_coherent may be better?
> 
> > +       if (!piommu->protect_va)
> > +               goto protect_err;
> 
> Please return -ENOMEM here directly, as there is nothing to clean up
> in this case.
> 
[snip]
> 
> > +               dev_err(piommu->dev, "IRQ request %d failed\n",
> > +                       piommu->irq);
> > +               goto hw_err;
> > +       }
> > +
> > +       iommu_set_fault_handler(domain, mtk_iommu_fault_handler, piommu);
> 
> I don't see any other drivers doing this. Isn't this for upper layers,
> so that they can set their own generic fault handlers?
     I think that this function is related to the iommu domain, and we
have only one multimedia iommu domain, so I call it after the iommu
domain is created.
> 
> > +
> > +       dev_set_drvdata(piommu->dev, piommu);
> 
> This should be set before allowing the interrupt to fire. In other
> words, the driver should be fully set up at the time of enabling the
> IRQ.
> 
> > +
> > +       return 0;
> 
> style: Missing blank line.
> 
> > +hw_err:
> > +       arch_teardown_dma_ops(piommu->dev);
> > +pte_err:
> > +       kmem_cache_destroy(piommu->m4u_pte_kmem);
> > +protect_err:
> > +       dev_err(piommu->dev, "probe error\n");
> 
> Please replace this with specific messages for all errors (in case the
> called function doesn't already print one like kmalloc and friends).
> 
> > +       return 0;
> 
> Returning 0, which means success, doesn't look like a good idea for
> signalling a failure. Please return the correct error code as received
> from function that errors out if possible.
> 
> End of part 3.
> 
> Best regards,
> Tomasz
Robin Murphy March 20, 2015, 7:14 p.m. UTC | #17
On 18/03/15 11:22, Yong Wu wrote:
> Hi Tomasz,
>     Thanks very much for your review. please help check below.
> The others I will fix in the next version.
>
> Hi Robin,
>     There are some place I would like you can have a look and give me
> some suggestion.
>
> On Wed, 2015-03-11 at 19:53 +0900, Tomasz Figa wrote:
>> Hi,
>>
>> Please find next part of my comments inline.
>>
>> On Fri, Mar 6, 2015 at 7:48 PM,  <yong.wu@mediatek.com> wrote:
>>
>> [snip]
>>
>>> +/*
>>> + * pimudev is a global var for dma_alloc_coherent.
>>> + * It is not accepatable, we will delete it if "domain_alloc" is enabled
>>
>> It looks like we indeed need to use dma_alloc_coherent() and we don't
>> have a good way to pass the device pointer to domain_init callback.
>>
>> If you don't expect SoCs in the nearest future to have multiple M4U
>> blocks, then I guess this global variable could stay, after changing
>> the comment into an explanation why it's correct. Also it should be
>> moved to the top of the file, below #include directives, as this is
>> where usually global variables are located.
> @Robin,
>       We have merged this patch[0] in order to delete the global var, But
> it seems that your patch of "arm64:IOMMU" isn't based on it right row.
> it will build fail.

Yeah, I've not yet managed to try pulling in that series (much as I 
approve of it), partly as I know doing so is going to lean towards a 
not-insignificant rework and I'd rather avoid picking up more unmerged 
dependencies to block getting _something_ in for arm64 (which we can 
then improve).

>
> [0]:http://lists.linuxfoundation.org/pipermail/iommu/2015-January/011939.html
>
>>> + */
>>> +static struct device *pimudev;
>>> +
> [snip]
>>> +
>>> +static int mtk_iommu_attach_device(struct iommu_domain *domain,
>>> +                                  struct device *dev)
>>> +{
>>> +       unsigned long flags;
>>> +       struct mtk_iommu_domain *priv = domain->priv;
>>> +       struct mtk_iommu_info *piommu = priv->piommuinfo;
>>> +       struct of_phandle_args out_args = {0};
>>> +       struct device *imudev;
>>> +       unsigned int i = 0;
>>> +
>>> +       if (!piommu)
>>
>> Could you explain when this can happen?
> 	If we call arch_setup_dma_ops to create a iommu domain,
> it will enter iommu_dma_attach_device, then enter here. At that time, we
> don't add the private data to this "struct iommu_domain *".
> @Robin, Could this be improved?

Calling arch_setup_dma_ops() from the driver looks plain wrong, 
especially given that you apparently attach the IOMMU to itself - if you 
want your own domain you should use iommu_dma_create_domain(). I admit 
that still leaves you having to dance around a bit in order to tear down 
the automatic domains for now, but hopefully we'll get the core code 
sorted out sooner rather than later.

>>
>>> +               goto imudev;
>>
>> return 0;
>>
>>> +       else
>>
>> No else needed.
>>
>>> +               imudev = piommu->dev;
>>> +
>>> +       spin_lock_irqsave(&priv->portlock, flags);
>>
>> What is protected by this spinlock?
> 	We will write a register of the local arbiter while config port. If
> some modules are in the same local arbiter, it may be overwrite. so I
> add it here.
>>
>>> +
>>> +       while (!of_parse_phandle_with_args(dev->of_node, "iommus",
>>> +                                          "#iommu-cells", i, &out_args)) {
>>> +               if (1 == out_args.args_count) {
>>
>> Can we be sure that this is actually referring to our IOMMU?
>>
>> Maybe this should be rewritten to
>>
>> if (out_args.np != imudev->of_node)
>>          continue;
>> if (out_args.args_count != 1) {
>>          dev_err(imudev, "invalid #iommu-cells property for IOMMU %s\n",
>>
>> }
>>
>>> +                       unsigned int portid = out_args.args[0];
>>> +
>>> +                       dev_dbg(dev, "iommu add port:%d\n", portid);
>>
>> imudev should be used here instead of dev.
>>
>>> +
>>> +                       mtk_iommu_config_port(piommu, portid);
>>> +
>>> +                       if (i == 0)
>>> +                               dev->archdata.dma_ops =
>>> +                                       piommu->dev->archdata.dma_ops;
>>
>> Shouldn't this be set automatically by IOMMU or DMA mapping core?
> @Robin,
>       In the original "arm_iommu_attach_device" of arm/mm, it will call
> set_dma_ops to add iommu_ops for each iommu device.
> But iommu_dma_attach_device don't help this, so I have to add it here.
> Could this be improved?

If you implemented a simple of_xlate callback so that the core code 
handles the dma_ops as intended, I think the simplest cheat would be to 
check the client device's domain, either on attachment or when they 
start mapping/unmapping, and move them to your own domain if necessary. 
I'm putting together a v3 of the DMA mapping series, so I'll have a look 
to see if I can squeeze in a way to make that a bit less painful until 
we solve it properly.


Robin.

>>
>>> +               }
>>> +               i++;
>>> +       }
>>> +
>>> +       spin_unlock_irqrestore(&priv->portlock, flags);
>>> +
>>> +imudev:
>>> +       return 0;
>>> +}
>>> +
>>> +static void mtk_iommu_detach_device(struct iommu_domain *domain,
>>> +                                   struct device *dev)
>>> +{
>>
>> No hardware (de)configuration or clean-up necessary?
> I will add it. Actually we design like this:If a device have attached to
> iommu domain, it won't detach from it.
>>
>>> +}
>>> +
> [snip]
>>
>>> +
>>> +       piommu->protect_va = devm_kmalloc(piommu->dev, MTK_PROTECT_PA_ALIGN*2,
>>
>> style: Operators like * should have space on both sides.
>>
>>> +                                         GFP_KERNEL);
>>
>> Shouldn't dma_alloc_coherent() be used for this?
>       We don't care the data in it. I think they are the same. Could you
> help tell me why dma_alloc_coherent may be better.
>>
>>> +       if (!piommu->protect_va)
>>> +               goto protect_err;
>>
>> Please return -ENOMEM here directly, as there is nothing to clean up
>> in this case.
>>
> [snip]
>>
>>> +               dev_err(piommu->dev, "IRQ request %d failed\n",
>>> +                       piommu->irq);
>>> +               goto hw_err;
>>> +       }
>>> +
>>> +       iommu_set_fault_handler(domain, mtk_iommu_fault_handler, piommu);
>>
>> I don't see any other drivers doing this. Isn't this for upper layers,
>> so that they can set their own generic fault handlers?
>       I think that this function is related with the iommu domain, we
> have only one multimedia iommu domain. so I add it after the iommu
> domain are created.
>>
>>> +
>>> +       dev_set_drvdata(piommu->dev, piommu);
>>
>> This should be set before allowing the interrupt to fire. In other
>> words, the driver should be fully set up at the time of enabling the
>> IRQ.
>>
>>> +
>>> +       return 0;
>>
>> style: Missing blank line.
>>
>>> +hw_err:
>>> +       arch_teardown_dma_ops(piommu->dev);
>>> +pte_err:
>>> +       kmem_cache_destroy(piommu->m4u_pte_kmem);
>>> +protect_err:
>>> +       dev_err(piommu->dev, "probe error\n");
>>
>> Please replace this with specific messages for all errors (in case the
>> called function doesn't already print one like kmalloc and friends).
>>
>>> +       return 0;
>>
>> Returning 0, which means success, doesn't look like a good idea for
>> signalling a failure. Please return the correct error code as received
>> from function that errors out if possible.
>>
>> End of part 3.
>>
>> Best regards,
>> Tomasz
>
>
>
Tomasz Figa March 27, 2015, 9:41 a.m. UTC | #18
Hi Yong Wu,

Sorry for long delay, I had to figure out some time to look at this again.

On Wed, Mar 18, 2015 at 8:22 PM, Yong Wu <yong.wu@mediatek.com> wrote:
>>
>> > +               imudev = piommu->dev;
>> > +
>> > +       spin_lock_irqsave(&priv->portlock, flags);
>>
>> What is protected by this spinlock?
>         We will write a register of the local arbiter while config port. If
> some modules are in the same local arbiter, it may be overwrite. so I
> add it here.
>>

OK. Maybe it could be called larb_lock then? It would be good to have
structures or code that should be running under this spinlock
annotated with proper comments. And purpose of the lock documented in
a comment as well (probably in a kerneldoc-style documentation of
priv).

>> > +static void mtk_iommu_detach_device(struct iommu_domain *domain,
>> > +                                   struct device *dev)
>> > +{
>>
>> No hardware (de)configuration or clean-up necessary?
> I will add it. Actually we design like this:If a device have attached to
> iommu domain, it won't detach from it.

Isn't proper clean-up required for module removal? Some drivers might
be required to be loadable modules, which should be unloadable.

>>
>> > +
>> > +       piommu->protect_va = devm_kmalloc(piommu->dev, MTK_PROTECT_PA_ALIGN*2,
>>
>> style: Operators like * should have space on both sides.
>>
>> > +                                         GFP_KERNEL);
>>
>> Shouldn't dma_alloc_coherent() be used for this?
>      We don't care the data in it. I think they are the same. Could you
> help tell me why dma_alloc_coherent may be better.

Can you guarantee that at the time you allocate the memory using
devm_kmalloc() the memory is not dirty (i.e. some write back data are
stored in CPU cache) and is not going to be written back in some time,
overwriting data put there by IOMMU hardware?

>> > +
>> > +       iommu_set_fault_handler(domain, mtk_iommu_fault_handler, piommu);
>>
>> I don't see any other drivers doing this. Isn't this for upper layers,
>> so that they can set their own generic fault handlers?
>      I think that this function is related with the iommu domain, we
> have only one multimedia iommu domain. so I add it after the iommu
> domain are created.

No, this function is for drivers of IOMMU clients (i.e. master IP
blocks) which want to subscribe to page fault to do things like paging
on demand and so on. It shouldn't be called by IOMMU driver. Please
see other IOMMU drivers, for example rockchip-iommmu.c.

Best regards,
Tomasz
Yong Wu (吴勇) April 14, 2015, 6:31 a.m. UTC | #19
Hi Tomasz,

     Thanks very much for you suggestion and explain so detail.
     please help check below.

On Fri, 2015-03-27 at 18:41 +0900, Tomasz Figa wrote:
> Hi Yong Wu,
> 
> Sorry for long delay, I had to figure out some time to look at this again.
> 
> On Wed, Mar 18, 2015 at 8:22 PM, Yong Wu <yong.wu@mediatek.com> wrote:
> >>
> >> > +               imudev = piommu->dev;
> >> > +
> >> > +       spin_lock_irqsave(&priv->portlock, flags);
> >>
> >> What is protected by this spinlock?
> >         We will write a register of the local arbiter while config port. If
> > some modules are in the same local arbiter, it may be overwrite. so I
> > add it here.
> >>
> 
> OK. Maybe it could be called larb_lock then? It would be good to have
> structures or code that should be running under this spinlock
> annotated with proper comments. And purpose of the lock documented in
> a comment as well (probably in a kerneldoc-style documentation of
> priv).
   Thanks. I have moved the spinlock into the SMI driver; it will lock
only the writes to the local arbiter register.
> 
> >> > +static void mtk_iommu_detach_device(struct iommu_domain *domain,
> >> > +                                   struct device *dev)
> >> > +{
> >>
> >> No hardware (de)configuration or clean-up necessary?
> > I will add it. Actually we design like this:If a device have attached to
> > iommu domain, it won't detach from it.
> 
> Isn't proper clean-up required for module removal? Some drivers might
> be required to be loadable modules, which should be unloadable.
> 
> >>
> >> > +
> >> > +       piommu->protect_va = devm_kmalloc(piommu->dev, MTK_PROTECT_PA_ALIGN*2,
> >>
> >> style: Operators like * should have space on both sides.
> >>
> >> > +                                         GFP_KERNEL);
> >>
> >> Shouldn't dma_alloc_coherent() be used for this?
> >      We don't care the data in it. I think they are the same. Could you
> > help tell me why dma_alloc_coherent may be better.
> 
> Can you guarantee that at the time you allocate the memory using
> devm_kmalloc() the memory is not dirty (i.e. some write back data are
> stored in CPU cache) and is not going to be written back in some time,
> overwriting data put there by IOMMU hardware?
> 
As I noted in the function "mtk_iommu_hw_init":

       /* protect memory,HW will write here while translation fault */
       protectpa = __virt_to_phys(piommu->protect_va);

     We don't care about the content of this buffer; it is OK even if its
data is dirty.
    It seems to be a protect memory region. When a translation fault
happens, the IOMMU HW will write here instead of writing to the
faulting physical address, which may be 0 or some random address.

> >> > +
> >> > +       iommu_set_fault_handler(domain, mtk_iommu_fault_handler, piommu);
> >>
> >> I don't see any other drivers doing this. Isn't this for upper layers,
> >> so that they can set their own generic fault handlers?
> >      I think that this function is related with the iommu domain, we
> > have only one multimedia iommu domain. so I add it after the iommu
> > domain are created.
> 
> No, this function is for drivers of IOMMU clients (i.e. master IP
> blocks) which want to subscribe to page fault to do things like paging
> on demand and so on. It shouldn't be called by IOMMU driver. Please
> see other IOMMU drivers, for example rockchip-iommmu.c.
     Thanks. I have read it. I will delete it and print the error info
in the ISR. Also call the report_iommu_fault in the ISR.

> Best regards,
> Tomasz
Yong Wu (吴勇) April 14, 2015, 6:50 a.m. UTC | #20
Hi Robin,
      Thanks very much for your confirm.
      About the v3 of the DMA-mapping, I have some question below.

On Fri, 2015-03-20 at 19:14 +0000, Robin Murphy wrote:
> On 18/03/15 11:22, Yong Wu wrote:
> > Hi Tomasz,
> >     Thanks very much for your review. please help check below.
> > The others I will fix in the next version.
> >
> > Hi Robin,
> >     There are some place I would like you can have a look and give me
> > some suggestion.
> >
> > On Wed, 2015-03-11 at 19:53 +0900, Tomasz Figa wrote:
> >> Hi,
> >>
> >> Please find next part of my comments inline.
> >>
> >>> +/*
> >>> + * pimudev is a global var for dma_alloc_coherent.
> >>> + * It is not accepatable, we will delete it if "domain_alloc" is enabled
> >>
> >> It looks like we indeed need to use dma_alloc_coherent() and we don't
> >> have a good way to pass the device pointer to domain_init callback.
> >>
> >> If you don't expect SoCs in the nearest future to have multiple M4U
> >> blocks, then I guess this global variable could stay, after changing
> >> the comment into an explanation why it's correct. Also it should be
> >> moved to the top of the file, below #include directives, as this is
> >> where usually global variables are located.
> > @Robin,
> >       We have merged this patch[0] in order to delete the global var, But
> > it seems that your patch of "arm64:IOMMU" isn't based on it right row.
> > it will build fail.
> 
> Yeah, I've not yet managed to try pulling in that series (much as I 
> approve of it), partly as I know doing so is going to lean towards a 
> not-insignificant rework and I'd rather avoid picking up more unmerged 
> dependencies to block getting _something_ in for arm64 (which we can 
> then improve).
> 
> >
> > [0]:http://lists.linuxfoundation.org/pipermail/iommu/2015-January/011939.html
> >
[snip]
> 
> Calling arch_setup_dma_ops() from the driver looks plain wrong, 
> especially given that you apparently attach the IOMMU to itself - if you 
> want your own domain you should use iommu_dma_create_domain(). I admit 
> that still leaves you having to dance around a bit in order to tear down 
> the automatic domains for now, but hopefully we'll get the core code 
> sorted out sooner rather than later.
> >>> +
> >>> +                       mtk_iommu_config_port(piommu, portid);
> >>> +
> >>> +                       if (i == 0)
> >>> +                               dev->archdata.dma_ops =
> >>> +                                       piommu->dev->archdata.dma_ops;
> >>
> >> Shouldn't this be set automatically by IOMMU or DMA mapping core?
> > @Robin,
> >       In the original "arm_iommu_attach_device" of arm/mm, it will call
> > set_dma_ops to add iommu_ops for each iommu device.
> > But iommu_dma_attach_device don't help this, so I have to add it here.
> > Could this be improved?
> 
> If you implemented a simple of_xlate callback so that the core code 
> handles the dma_ops as intended, I think the simplest cheat would be to 
> check the client device's domain, either on attachment or when they 
> start mapping/unmapping, and move them to your own domain if necessary. 
> I'm putting together a v3 of the DMA mapping series, so I'll have a look 
> to see if I can squeeze in a way to make that a bit less painful until 
> we solve it properly.
> 
> 
> Robin.
> 
      I have implemented a simple of_xlate, but I can't get the standard
struct dma_map_ops "iommu_dma_ops" to assign it to the client device.
Will the v3 of the DMA mapping series improve this issue?

      And Is the v3 of the DMA-mapping based on 4.0-rc1? because we
expect it could contain will’s io-pagetable.

      And when the v3 will be ready?
> >>
> >>> +               }
> >>> +               i++;
> >>> +       }
> >>> +
> >>> +       spin_unlock_irqrestore(&priv->portlock, flags);
> >>> +
> >>> +imudev:
> >>> +       return 0;
> >>> +}
> >>> +
> >>> +static void mtk_iommu_detach_device(struct iommu_domain *domain,
> >>> +                                   struct device *dev)
> >>> +{
> >>
> >> No hardware (de)configuration or clean-up necessary?
> > I will add it. Actually we design like this:If a device have attached to
> > iommu domain, it won't detach from it.
> >>
> >>> +}
> >>> +
> > [snip]
Tomasz Figa April 15, 2015, 2:20 a.m. UTC | #21
On Tue, Apr 14, 2015 at 3:31 PM, Yong Wu <yong.wu@mediatek.com> wrote:
>> >>
>> >> > +
>> >> > +       piommu->protect_va = devm_kmalloc(piommu->dev, MTK_PROTECT_PA_ALIGN*2,
>> >>
>> >> style: Operators like * should have space on both sides.
>> >>
>> >> > +                                         GFP_KERNEL);
>> >>
>> >> Shouldn't dma_alloc_coherent() be used for this?
>> >      We don't care the data in it. I think they are the same. Could you
>> > help tell me why dma_alloc_coherent may be better.
>>
>> Can you guarantee that at the time you allocate the memory using
>> devm_kmalloc() the memory is not dirty (i.e. some write back data are
>> stored in CPU cache) and is not going to be written back in some time,
>> overwriting data put there by IOMMU hardware?
>>
> As I noted in the function "mtk_iommu_hw_init":
>
>        /* protect memory,HW will write here while translation fault */
>        protectpa = __virt_to_phys(piommu->protect_va);
>
>      We don’t care the content of this buffer, It is ok even though its
> data is dirty.
>     It seem to be a the protect memory. While a translation fault
> happened, The iommu HW will overwrite here instead of writing to the
> fault physical address which may be 0 or some random address.
>

Do you mean that it's just a dummy page for hardware behind the IOMMU
to access when the mapping is not available? How would that work with
potential on demand paging when the hardware needs to be blocked until
the mapping is created?

Best regards,
Tomasz
Yong Wu (吴勇) April 15, 2015, 7:06 a.m. UTC | #22
On Wed, 2015-04-15 at 11:20 +0900, Tomasz Figa wrote:
> On Tue, Apr 14, 2015 at 3:31 PM, Yong Wu <yong.wu@mediatek.com> wrote:
> >> >>
> >> >> > +
> >> >> > +       piommu->protect_va = devm_kmalloc(piommu->dev, MTK_PROTECT_PA_ALIGN*2,
> >> >>
> >> >> style: Operators like * should have space on both sides.
> >> >>
> >> >> > +                                         GFP_KERNEL);
> >> >>
> >> >> Shouldn't dma_alloc_coherent() be used for this?
> >> >      We don't care the data in it. I think they are the same. Could you
> >> > help tell me why dma_alloc_coherent may be better.
> >>
> >> Can you guarantee that at the time you allocate the memory using
> >> devm_kmalloc() the memory is not dirty (i.e. some write back data are
> >> stored in CPU cache) and is not going to be written back in some time,
> >> overwriting data put there by IOMMU hardware?
> >>
> > As I noted in the function "mtk_iommu_hw_init":
> >
> >        /* protect memory,HW will write here while translation fault */
> >        protectpa = __virt_to_phys(piommu->protect_va);
> >
> >      We don’t care the content of this buffer, It is ok even though its
> > data is dirty.
> >     It seem to be a the protect memory. While a translation fault
> > happened, The iommu HW will overwrite here instead of writing to the
> > fault physical address which may be 0 or some random address.
> >
> 
> Do you mean that it's just a dummy page for hardware behind the IOMMU
> to access when the mapping is not available? How would that work with
> potential on demand paging when the hardware needs to be blocked until
> the mapping is created?
> 
> Best regards,
> Tomasz
 1. YES
 2. Sorry. Our IOMMU HW cannot support this right now. The HW cannot
be blocked until the mapping is created.
    If the page is not ready, we cannot get the physical address, so
how can we fill the page table for that memory? I think the DMA & IOMMU
layers may guarantee it?
Tomasz Figa April 15, 2015, 7:41 a.m. UTC | #23
On Wed, Apr 15, 2015 at 4:06 PM, Yong Wu <yong.wu@mediatek.com> wrote:
> On Wed, 2015-04-15 at 11:20 +0900, Tomasz Figa wrote:
>> On Tue, Apr 14, 2015 at 3:31 PM, Yong Wu <yong.wu@mediatek.com> wrote:
>> >> >>
>> >> >> > +
>> >> >> > +       piommu->protect_va = devm_kmalloc(piommu->dev, MTK_PROTECT_PA_ALIGN*2,
>> >> >>
>> >> >> style: Operators like * should have space on both sides.
>> >> >>
>> >> >> > +                                         GFP_KERNEL);
>> >> >>
>> >> >> Shouldn't dma_alloc_coherent() be used for this?
>> >> >      We don't care the data in it. I think they are the same. Could you
>> >> > help tell me why dma_alloc_coherent may be better.
>> >>
>> >> Can you guarantee that at the time you allocate the memory using
>> >> devm_kmalloc() the memory is not dirty (i.e. some write back data are
>> >> stored in CPU cache) and is not going to be written back in some time,
>> >> overwriting data put there by IOMMU hardware?
>> >>
>> > As I noted in the function "mtk_iommu_hw_init":
>> >
>> >        /* protect memory,HW will write here while translation fault */
>> >        protectpa = __virt_to_phys(piommu->protect_va);
>> >
>> >      We don’t care the content of this buffer, It is ok even though its
>> > data is dirty.
>> >     It seem to be a the protect memory. While a translation fault
>> > happened, The iommu HW will overwrite here instead of writing to the
>> > fault physical address which may be 0 or some random address.
>> >
>>
>> Do you mean that it's just a dummy page for hardware behind the IOMMU
>> to access when the mapping is not available? How would that work with
>> potential on demand paging when the hardware needs to be blocked until
>> the mapping is created?
>>
>> Best regards,
>> Tomasz
>  1. YES
>  2. Sorry. Our iommu HW can not support this right now. The HW can not
> be blocked until the mapping is created.

OK, that explains it. Well, then I guess this is necessary and
contents of that memory don't matter that much. (Although, this might
be a minor security issue, because the faulting hardware would get
access to some data previously stored by kernel code. Not sure how
much of a threat would that be, though.)

>     If the page is not ready, we can not get the physical address, then
> How to fill the pagetable for that memory. I think the dma&iommu may
> guaranty it?

If your hardware can't block until the mapping is created then what
you do currently seems to be the only option. (+/- the missing cache
maintenance at initialization)

Best regards,
Tomasz
Yong Wu (吴勇) April 29, 2015, 6:23 a.m. UTC | #24
Dear Tomasz,
     About a hardcode your comment, please help check below.

Dear Mark,
      I would like to add a item in the dtsi of mtk-iommu. Please also
help have a look.


> > > +static const struct mtk_iommu_port mtk_iommu_mt8173_port[] = {
> > > +       /* port name                m4uid slaveid larbid portid
tfid */
> > > +       /* larb0 */
> > > +       {"M4U_PORT_DISP_OVL0",          0,  0,    0,  0,
MTK_TFID(0, 0)},
> > > +       {"M4U_PORT_DISP_RDMA0",         0,  0,    0,  1,
MTK_TFID(0, 1)},
> > > +       {"M4U_PORT_DISP_WDMA0",         0,  0,    0,  2,
MTK_TFID(0, 2)},
> > > +       {"M4U_PORT_DISP_OD_R",          0,  0,    0,  3,
MTK_TFID(0, 3)},
> > > +       {"M4U_PORT_DISP_OD_W",          0,  0,    0,  4,
MTK_TFID(0, 4)},
> > > +       {"M4U_PORT_MDP_RDMA0",          0,  0,    0,  5,
MTK_TFID(0, 5)},
> > > +       {"M4U_PORT_MDP_WDMA",           0,  0,    0,  6,
MTK_TFID(0, 6)},
> > > +       {"M4U_PORT_MDP_WROT0",          0,  0,    0,  7,
MTK_TFID(0, 7)},
[...]
> 
> > > +};
> > > +
> > 
> > Anyway, is it really necessary to hardcode the SoC specific topology
> > data in this driver? Is there really any use besides of printing
port
> > name? If not, you could just print the values in a way letting you
> > quickly look up in the datasheet, without hardcoding this. Or even
> > better, you could print which devices are attached to the port.
> > 
> a) Printing the port name is for debug. We could not request every
iommu
> user to understand smi&local arbiter. When there is irq, they have to
> look up the iommu's datasheet to find out which port error. if we
print
> it directly, It may be more easily to debug.
> 
> b) In mtk_iommu_config_port, according to this hardcode we can be
easily
> to get out which local arbiter and which port we prepare to config.
> 
> c) If we support different SOCs, we could change this arrays easily.
> 
> > 
     There is no similar code in the other IOMMU drivers, so I plan to
delete it. But we really need to know which local arbiter and which port we
are going to configure (i.e. which port will enable the IOMMU),
        so we plan to add an item in the dtsi like this:

	iommu: mmsys_iommu@10205000 {
			compatible = "mediatek,mt8173-iommu";
			<...>
+			larb-portes-nr = <M4U_LARB0_PORT_NR
+					 M4U_LARB1_PORT_NR
+					 M4U_LARB2_PORT_NR
+					 M4U_LARB3_PORT_NR
+					 M4U_LARB4_PORT_NR
+					 M4U_LARB5_PORT_NR>;
			larb = <&larb0 &larb1 &larb2 &larb3 &larb4 &larb5>;
			#iommu-cells = <1>;
		};
	larb-portes-nr : the number of ports in each local arbiter.
        
	If we have this item, we can get which larb and which port from the
portid in the dtsi of the iommu user.

        And when an ISR occurs, I will print the larb-id and the port-id
instead of the port name string.

	The M4U_LARB0_PORT_NR/... will be added in
dt-bindings/iommu/mt8173-iommu-port.h[0]

Dear Mark, 
      As above, if I add this item in the dtsi of iommu, is it ok?


[0]:http://lists.linuxfoundation.org/pipermail/iommu/2015-March/012450.html
diff mbox

Patch

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 19027bb..e63f5b6 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -326,4 +326,15 @@  config ARM_SMMU
 	  Say Y here if your SoC includes an IOMMU device implementing
 	  the ARM SMMU architecture.
 
+config MTK_IOMMU
+	bool "MTK IOMMU Support"
+	select IOMMU_API
+	select IOMMU_DMA
+	select MTK_SMI
+	help
+	  Support for the IOMMUs on certain Mediatek SOCs.
+	  These IOMMUs allow the multimedia hardware access discontinuous memory.
+
+	  If unsure, say N here.
+
 endif # IOMMU_SUPPORT
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 37bfc4e..f2a8027 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -4,6 +4,7 @@  obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
 obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
 obj-$(CONFIG_IOMMU_IOVA) += iova.o
 obj-$(CONFIG_OF_IOMMU)	+= of_iommu.o
+obj-$(CONFIG_MTK_IOMMU) += mtk_iommu.o mtk_iommu_pagetable.o
 obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o msm_iommu_dev.o
 obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o
 obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o
diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c
new file mode 100644
index 0000000..d62d4ab
--- /dev/null
+++ b/drivers/iommu/mtk_iommu.c
@@ -0,0 +1,754 @@ 
+/*
+ * Copyright (c) 2014-2015 MediaTek Inc.
+ * Author: Yong Wu <yong.wu@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#define pr_fmt(fmt) "m4u:"fmt
+
+#include <linux/io.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/iommu.h>
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <linux/memblock.h>
+#include <linux/dma-mapping.h>
+#include <linux/dma-iommu.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/of_platform.h>
+#include <linux/mtk-smi.h>
+#include <asm/cacheflush.h>
+
+#include "mtk_iommu.h"
+
+#define REG_MMUG_PT_BASE	 0x0
+
+#define REG_MMU_INVLD		 0x20
+#define F_MMU_INV_ALL		 0x2
+#define F_MMU_INV_RANGE		 0x1
+
+#define REG_MMU_INVLD_SA	 0x24
+#define REG_MMU_INVLD_EA         0x28
+
+#define REG_MMU_INVLD_SEC         0x2c
+#define F_MMU_INV_SEC_ALL          0x2
+#define F_MMU_INV_SEC_RANGE        0x1
+
+#define REG_INVLID_SEL	         0x38
+#define F_MMU_INV_EN_L1		 BIT(0)
+#define F_MMU_INV_EN_L2		 BIT(1)
+
+#define REG_MMU_STANDARD_AXI_MODE   0x48
+#define REG_MMU_DCM_DIS             0x50
+#define REG_MMU_LEGACY_4KB_MODE     0x60
+
+#define REG_MMU_CTRL_REG                 0x110
+#define F_MMU_CTRL_REROUTE_PFQ_TO_MQ_EN  BIT(4)
+#define F_MMU_CTRL_TF_PROT_VAL(prot)      (((prot) & 0x3)<<5)
+#define F_MMU_CTRL_COHERE_EN             BIT(8)
+
+#define REG_MMU_IVRP_PADDR               0x114
+#define F_MMU_IVRP_PA_SET(PA)            (PA>>1)
+
+#define REG_MMU_INT_L2_CONTROL      0x120
+#define F_INT_L2_CLR_BIT            BIT(12)
+
+#define REG_MMU_INT_MAIN_CONTROL    0x124
+#define F_INT_TRANSLATION_FAULT(MMU)           (1<<(0+(((MMU)<<1)|((MMU)<<2))))
+#define F_INT_MAIN_MULTI_HIT_FAULT(MMU)        (1<<(1+(((MMU)<<1)|((MMU)<<2))))
+#define F_INT_INVALID_PA_FAULT(MMU)            (1<<(2+(((MMU)<<1)|((MMU)<<2))))
+#define F_INT_ENTRY_REPLACEMENT_FAULT(MMU)     (1<<(3+(((MMU)<<1)|((MMU)<<2))))
+#define F_INT_TLB_MISS_FAULT(MMU)              (1<<(4+(((MMU)<<1)|((MMU)<<2))))
+#define F_INT_PFH_FIFO_ERR(MMU)                (1<<(6+(((MMU)<<1)|((MMU)<<2))))
+
+#define REG_MMU_CPE_DONE              0x12C
+
+#define REG_MMU_MAIN_FAULT_ST         0x134
+
+#define REG_MMU_FAULT_VA(mmu)         (0x13c+((mmu)<<3))
+#define F_MMU_FAULT_VA_MSK            ((~0x0)<<12)
+#define F_MMU_FAULT_VA_WRITE_BIT       BIT(1)
+#define F_MMU_FAULT_VA_LAYER_BIT       BIT(0)
+
+#define REG_MMU_INVLD_PA(mmu)         (0x140+((mmu)<<3))
+#define REG_MMU_INT_ID(mmu)           (0x150+((mmu)<<2))
+#define F_MMU0_INT_ID_TF_MSK          (~0x3)	/* only for MM iommu. */
+
+#define MTK_TFID(larbid, portid) ((larbid << 7) | (portid << 2))
+
+static const struct mtk_iommu_port mtk_iommu_mt8173_port[] = {
+	/* port name                m4uid slaveid larbid portid tfid */
+	/* larb0 */
+	{"M4U_PORT_DISP_OVL0",          0,  0,    0,  0, MTK_TFID(0, 0)},
+	{"M4U_PORT_DISP_RDMA0",         0,  0,    0,  1, MTK_TFID(0, 1)},
+	{"M4U_PORT_DISP_WDMA0",         0,  0,    0,  2, MTK_TFID(0, 2)},
+	{"M4U_PORT_DISP_OD_R",          0,  0,    0,  3, MTK_TFID(0, 3)},
+	{"M4U_PORT_DISP_OD_W",          0,  0,    0,  4, MTK_TFID(0, 4)},
+	{"M4U_PORT_MDP_RDMA0",          0,  0,    0,  5, MTK_TFID(0, 5)},
+	{"M4U_PORT_MDP_WDMA",           0,  0,    0,  6, MTK_TFID(0, 6)},
+	{"M4U_PORT_MDP_WROT0",          0,  0,    0,  7, MTK_TFID(0, 7)},
+
+	/* larb1 */
+	{"M4U_PORT_HW_VDEC_MC_EXT",      0,  0,   1,  0, MTK_TFID(1, 0)},
+	{"M4U_PORT_HW_VDEC_PP_EXT",      0,  0,   1,  1, MTK_TFID(1, 1)},
+	{"M4U_PORT_HW_VDEC_UFO_EXT",     0,  0,   1,  2, MTK_TFID(1, 2)},
+	{"M4U_PORT_HW_VDEC_VLD_EXT",     0,  0,   1,  3, MTK_TFID(1, 3)},
+	{"M4U_PORT_HW_VDEC_VLD2_EXT",    0,  0,   1,  4, MTK_TFID(1, 4)},
+	{"M4U_PORT_HW_VDEC_AVC_MV_EXT",  0,  0,   1,  5, MTK_TFID(1, 5)},
+	{"M4U_PORT_HW_VDEC_PRED_RD_EXT", 0,  0,   1,  6, MTK_TFID(1, 6)},
+	{"M4U_PORT_HW_VDEC_PRED_WR_EXT", 0,  0,   1,  7, MTK_TFID(1, 7)},
+	{"M4U_PORT_HW_VDEC_PPWRAP_EXT",  0,  0,   1,  8, MTK_TFID(1, 8)},
+
+	/* larb2 */
+	{"M4U_PORT_IMGO",                0,  0,    2,  0, MTK_TFID(2, 0)},
+	{"M4U_PORT_RRZO",                0,  0,    2,  1, MTK_TFID(2, 1)},
+	{"M4U_PORT_AAO",                 0,  0,    2,  2, MTK_TFID(2, 2)},
+	{"M4U_PORT_LCSO",                0,  0,    2,  3, MTK_TFID(2, 3)},
+	{"M4U_PORT_ESFKO",               0,  0,    2,  4, MTK_TFID(2, 4)},
+	{"M4U_PORT_IMGO_D",              0,  0,    2,  5, MTK_TFID(2, 5)},
+	{"M4U_PORT_LSCI",                0,  0,    2,  6, MTK_TFID(2, 6)},
+	{"M4U_PORT_LSCI_D",              0,  0,    2,  7, MTK_TFID(2, 7)},
+	{"M4U_PORT_BPCI",                0,  0,    2,  8, MTK_TFID(2, 8)},
+	{"M4U_PORT_BPCI_D",              0,  0,    2,  9, MTK_TFID(2, 9)},
+	{"M4U_PORT_UFDI",                0,  0,    2,  10, MTK_TFID(2, 10)},
+	{"M4U_PORT_IMGI",                0,  0,    2,  11, MTK_TFID(2, 11)},
+	{"M4U_PORT_IMG2O",               0,  0,    2,  12, MTK_TFID(2, 12)},
+	{"M4U_PORT_IMG3O",               0,  0,    2,  13, MTK_TFID(2, 13)},
+	{"M4U_PORT_VIPI",                0,  0,    2,  14, MTK_TFID(2, 14)},
+	{"M4U_PORT_VIP2I",               0,  0,    2,  15, MTK_TFID(2, 15)},
+	{"M4U_PORT_VIP3I",               0,  0,    2,  16, MTK_TFID(2, 16)},
+	{"M4U_PORT_LCEI",                0,  0,    2,  17, MTK_TFID(2, 17)},
+	{"M4U_PORT_RB",                  0,  0,    2,  18, MTK_TFID(2, 18)},
+	{"M4U_PORT_RP",                  0,  0,    2,  19, MTK_TFID(2, 19)},
+	{"M4U_PORT_WR",                  0,  0,    2,  20, MTK_TFID(2, 20)},
+
+	/* larb3 */
+	{"M4U_PORT_VENC_RCPU",            0,  0,   3,  0, MTK_TFID(3, 0)},
+	{"M4U_PORT_VENC_REC",             0,  0,   3,  1, MTK_TFID(3, 1)},
+	{"M4U_PORT_VENC_BSDMA",           0,  0,   3,  2, MTK_TFID(3, 2)},
+	{"M4U_PORT_VENC_SV_COMV",         0,  0,   3,  3, MTK_TFID(3, 3)},
+	{"M4U_PORT_VENC_RD_COMV",         0,  0,   3,  4, MTK_TFID(3, 4)},
+	{"M4U_PORT_JPGENC_RDMA",          0,  0,   3,  5, MTK_TFID(3, 5)},
+	{"M4U_PORT_JPGENC_BSDMA",         0,  0,   3,  6, MTK_TFID(3, 6)},
+	{"M4U_PORT_JPGDEC_WDMA",          0,  0,   3,  7, MTK_TFID(3, 7)},
+	{"M4U_PORT_JPGDEC_BSDMA",         0,  0,   3,  8, MTK_TFID(3, 8)},
+	{"M4U_PORT_VENC_CUR_LUMA",        0,  0,   3,  9, MTK_TFID(3, 9)},
+	{"M4U_PORT_VENC_CUR_CHROMA",      0,  0,   3,  10, MTK_TFID(3, 10)},
+	{"M4U_PORT_VENC_REF_LUMA",        0,  0,   3,  11, MTK_TFID(3, 11)},
+	{"M4U_PORT_VENC_REF_CHROMA",      0,  0,   3,  12, MTK_TFID(3, 12)},
+	{"M4U_PORT_VENC_NBM_RDMA",        0,  0,   3,  13, MTK_TFID(3, 13)},
+	{"M4U_PORT_VENC_NBM_WDMA",        0,  0,   3,  14, MTK_TFID(3, 14)},
+
+	/* larb4 */
+	{"M4U_PORT_DISP_OVL1",             0,  0,   4,  0, MTK_TFID(4, 0)},
+	{"M4U_PORT_DISP_RDMA1",            0,  0,   4,  1, MTK_TFID(4, 1)},
+	{"M4U_PORT_DISP_RDMA2",            0,  0,   4,  2, MTK_TFID(4, 2)},
+	{"M4U_PORT_DISP_WDMA1",            0,  0,   4,  3, MTK_TFID(4, 3)},
+	{"M4U_PORT_MDP_RDMA1",             0,  0,   4,  4, MTK_TFID(4, 4)},
+	{"M4U_PORT_MDP_WROT1",             0,  0,   4,  5, MTK_TFID(4, 5)},
+
+	/* larb5 */
+	{"M4U_PORT_VENC_RCPU_SET2",        0,  0,    5,  0, MTK_TFID(5, 0)},
+	{"M4U_PORT_VENC_REC_FRM_SET2",     0,  0,    5,  1, MTK_TFID(5, 1)},
+	{"M4U_PORT_VENC_REF_LUMA_SET2",    0,  0,    5,  2, MTK_TFID(5, 2)},
+	{"M4U_PORT_VENC_REC_CHROMA_SET2",  0,  0,    5,  3, MTK_TFID(5, 3)},
+	{"M4U_PORT_VENC_BSDMA_SET2",       0,  0,    5,  4, MTK_TFID(5, 4)},
+	{"M4U_PORT_VENC_CUR_LUMA_SET2",    0,  0,    5,  5, MTK_TFID(5, 5)},
+	{"M4U_PORT_VENC_CUR_CHROMA_SET2",  0,  0,    5,  6, MTK_TFID(5, 6)},
+	{"M4U_PORT_VENC_RD_COMA_SET2",     0,  0,    5,  7, MTK_TFID(5, 7)},
+	{"M4U_PORT_VENC_SV_COMA_SET2",     0,  0,    5,  8, MTK_TFID(5, 8)},
+
+	/* perisys iommu */
+	{"M4U_PORT_RESERVE",               1,  0,    6,  0, 0xff},
+	{"M4U_PORT_SPM",                   1,  0,    6,  1, 0x50},
+	{"M4U_PORT_MD32",                  1,  0,    6,  2, 0x90},
+	{"M4U_PORT_PTP_THERM",             1,  0,    6,  4, 0xd0},
+	{"M4U_PORT_PWM",                   1,  0,    6,  5, 0x1},
+	{"M4U_PORT_MSDC1",                 1,  0,    6,  6, 0x21},
+	{"M4U_PORT_MSDC2",                 1,  0,    6,  7, 0x41},
+	{"M4U_PORT_NFI",                   1,  0,    6,  8, 0x8},
+	{"M4U_PORT_AUDIO",                 1,  0,    6,  9, 0x48},
+	{"M4U_PORT_RESERVED2",             1,  0,    6,  10, 0xfe},
+	{"M4U_PORT_HSIC_XHCI",             1,  0,    6,  11, 0x9},
+
+	{"M4U_PORT_HSIC_MAS",              1,  0,    6,  12, 0x11},
+	{"M4U_PORT_HSIC_DEV",              1,  0,    6,  13, 0x19},
+	{"M4U_PORT_AP_DMA",                1,  0,    6,  14, 0x18},
+	{"M4U_PORT_HSIC_DMA",              1,  0,    6,  15, 0xc8},
+	{"M4U_PORT_MSDC0",                 1,  0,    6,  16, 0x0},
+	{"M4U_PORT_MSDC3",                 1,  0,    6,  17, 0x20},
+	{"M4U_PORT_UNKNOWN",               1,  0,    6,  18, 0xf},
+};
+
+/* mt8173 SoC description: 6 local arbiters and the port table above. */
+static const struct mtk_iommu_cfg mtk_iommu_mt8173_cfg = {
+	.larb_nr = 6,
+	.m4u_port_nr = ARRAY_SIZE(mtk_iommu_mt8173_port),
+	.pport = mtk_iommu_mt8173_port,
+};
+
+/*
+ * mtk_iommu_get_port_name - look up the human-readable name of a port.
+ * @piommu: iommu instance holding the SoC port configuration table
+ * @portid: global index into the port table
+ *
+ * Returns the table entry's name, or "UNKNOWN_PORT" when @portid is out
+ * of range. Used only for fault reporting.
+ */
+static const char *mtk_iommu_get_port_name(const struct mtk_iommu_info *piommu,
+					   unsigned int portid)
+{
+	/* Validate the index before touching the table entry. */
+	if (portid >= piommu->imucfg->m4u_port_nr)
+		return "UNKNOWN_PORT";
+
+	return piommu->imucfg->pport[portid].port_name;
+}
+
+/*
+ * mtk_iommu_get_port_by_tfid - map a translation-fault id to a port index.
+ * @pimu: iommu instance holding the port table
+ * @tf_id: hardware-reported translation fault id
+ *
+ * Returns the matching index in the port table, or the table size when
+ * nothing matches (an error is logged in that case).
+ */
+static int mtk_iommu_get_port_by_tfid(const struct mtk_iommu_info *pimu,
+				      int tf_id)
+{
+	const struct mtk_iommu_cfg *cfg = pimu->imucfg;
+	unsigned int idx;
+
+	for (idx = 0; idx < cfg->m4u_port_nr; idx++)
+		if (cfg->pport[idx].tf_id == tf_id)
+			return idx;
+
+	dev_err(pimu->dev, "tf_id find fail, tfid %d\n", tf_id);
+	return cfg->m4u_port_nr;
+}
+
+/*
+ * mtk_iommu_isr - top-level interrupt handler for the m4u.
+ *
+ * Delegates all register decoding to the handler installed via
+ * iommu_set_fault_handler() by calling report_iommu_fault(); the fault
+ * address is read back from the registers there, so 0 is passed here.
+ */
+static irqreturn_t mtk_iommu_isr(int irq, void *dev_id)
+{
+	struct iommu_domain *domain = dev_id;
+	struct mtk_iommu_domain *mtkdomain = domain->priv;
+	struct mtk_iommu_info *piommu = mtkdomain->piommuinfo;
+
+	if (irq == piommu->irq)
+		report_iommu_fault(domain, piommu->dev, 0, 0);
+	else
+		dev_err(piommu->dev, "irq number:%d\n", irq);
+
+	return IRQ_HANDLED;
+}
+
+/* Ack the level-2 interrupt by setting its clear bit in the control reg. */
+static inline void mtk_iommu_clear_intr(void __iomem *m4u_base)
+{
+	u32 regval = readl(m4u_base + REG_MMU_INT_L2_CONTROL);
+
+	writel(regval | F_INT_L2_CLR_BIT, m4u_base + REG_MMU_INT_L2_CONTROL);
+}
+
+/*
+ * mtk_iommu_invalidate_tlb - flush the M4U TLB.
+ * @piommu: iommu instance
+ * @isinvall: non-zero to invalidate the whole TLB, else only a range
+ * @iova_start: first io-virtual address of the range (rounded down to 4K)
+ * @iova_end: last io-virtual address of the range (rounded up to 4K)
+ *
+ * A range invalidation polls REG_MMU_CPE_DONE for completion; if the
+ * hardware does not signal done within ~100ms we fall back to a full
+ * invalidation and stop waiting instead of warning in a tight loop.
+ */
+static int mtk_iommu_invalidate_tlb(const struct mtk_iommu_info *piommu,
+				    int isinvall, unsigned int iova_start,
+				    unsigned int iova_end)
+{
+	void __iomem *m4u_base = piommu->m4u_base;
+	u64 start, end;
+
+	start = sched_clock();
+
+	/* Invalidate both the L1 and L2 TLBs. */
+	writel(F_MMU_INV_EN_L2 | F_MMU_INV_EN_L1, m4u_base + REG_INVLID_SEL);
+
+	if (isinvall) {
+		writel(F_MMU_INV_ALL, m4u_base + REG_MMU_INVLD);
+		return 0;
+	}
+
+	iova_start = round_down(iova_start, SZ_4K);
+	iova_end = round_up(iova_end, SZ_4K);
+
+	writel(iova_start, m4u_base + REG_MMU_INVLD_SA);
+	writel(iova_end, m4u_base + REG_MMU_INVLD_EA);
+	writel(F_MMU_INV_RANGE, m4u_base + REG_MMU_INVLD);
+
+	while (!readl(m4u_base + REG_MMU_CPE_DONE)) {
+		end = sched_clock();
+		if (end - start >= 100000000ULL) {
+			/* Timed out: flush everything once and give up. */
+			dev_warn(piommu->dev, "invalidate timeout, flush all\n");
+			writel(F_MMU_INV_ALL, m4u_base + REG_MMU_INVLD);
+			break;
+		}
+	}
+	writel(0, m4u_base + REG_MMU_CPE_DONE);
+
+	return 0;
+}
+
+/*
+ * mtk_iommu_fault_handler - decode and report an m4u translation fault.
+ * @imudomain: faulting iommu domain
+ * @dev: device used for the rate-limited log messages
+ * @iova: unused; the fault address is read back from the registers
+ * @m4uindex: unused
+ * @pimu: the struct mtk_iommu_info passed to iommu_set_fault_handler()
+ *
+ * Reads the fault status, address and id registers, prints one message
+ * per asserted fault bit, then invalidates the whole TLB and acks the
+ * interrupt. Always returns 0. Only slave 0 is decoded (m4u_slave_id).
+ */
+static int mtk_iommu_fault_handler(struct iommu_domain *imudomain,
+				   struct device *dev, unsigned long iova,
+				   int m4uindex, void *pimu)
+{
+	void __iomem *m4u_base;
+	u32 int_state, regval;
+	int m4u_slave_id = 0;
+	unsigned int layer, write, m4u_port;
+	unsigned int fault_mva, fault_pa;
+	struct mtk_iommu_info *piommu = pimu;
+	struct mtk_iommu_domain *mtkdomain = imudomain->priv;
+
+	m4u_base = piommu->m4u_base;
+	int_state = readl(m4u_base + REG_MMU_MAIN_FAULT_ST);
+
+	/* read error info from registers */
+	fault_mva = readl(m4u_base + REG_MMU_FAULT_VA(m4u_slave_id));
+	layer = !!(fault_mva & F_MMU_FAULT_VA_LAYER_BIT);
+	write = !!(fault_mva & F_MMU_FAULT_VA_WRITE_BIT);
+	fault_mva &= F_MMU_FAULT_VA_MSK;
+	fault_pa = readl(m4u_base + REG_MMU_INVLD_PA(m4u_slave_id));
+	regval = readl(m4u_base + REG_MMU_INT_ID(m4u_slave_id));
+	regval &= F_MMU0_INT_ID_TF_MSK;
+	m4u_port = mtk_iommu_get_port_by_tfid(piommu, regval);
+
+	if (int_state & F_INT_TRANSLATION_FAULT(m4u_slave_id)) {
+		struct m4u_pte_info_t pte;
+		unsigned long flags;
+
+		/*
+		 * Walk the pagetable under the lock so the dumped pgd/pte
+		 * values are consistent with each other.
+		 */
+		spin_lock_irqsave(&mtkdomain->pgtlock, flags);
+		m4u_get_pte_info(mtkdomain, fault_mva, &pte);
+		spin_unlock_irqrestore(&mtkdomain->pgtlock, flags);
+
+		if (pte.size == MMU_SMALL_PAGE_SIZE ||
+		    pte.size == MMU_LARGE_PAGE_SIZE) {
+			dev_err_ratelimited(
+				dev,
+				"fault:port=%s iova=0x%x pa=0x%x layer=%d %s;"
+				"pgd(0x%x)->pte(0x%x)->pa(%pad)sz(0x%x)Valid(%d)\n",
+				mtk_iommu_get_port_name(piommu, m4u_port),
+				fault_mva, fault_pa, layer,
+				write ? "write" : "read",
+				imu_pgd_val(*pte.pgd), imu_pte_val(*pte.pte),
+				&pte.pa, pte.size, pte.valid);
+		} else {
+			dev_err_ratelimited(
+				dev,
+				"fault:port=%s iova=0x%x pa=0x%x layer=%d %s;"
+				"pgd(0x%x)->pa(%pad)sz(0x%x)Valid(%d)\n",
+				mtk_iommu_get_port_name(piommu, m4u_port),
+				fault_mva, fault_pa, layer,
+				write ? "write" : "read",
+				imu_pgd_val(*pte.pgd),
+				&pte.pa, pte.size, pte.valid);
+		}
+	}
+
+	if (int_state & F_INT_MAIN_MULTI_HIT_FAULT(m4u_slave_id))
+		dev_err_ratelimited(dev, "multi-hit!port=%s iova=0x%x\n",
+				    mtk_iommu_get_port_name(piommu, m4u_port),
+				    fault_mva);
+
+	if (int_state & F_INT_INVALID_PA_FAULT(m4u_slave_id)) {
+		if (!(int_state & F_INT_TRANSLATION_FAULT(m4u_slave_id)))
+			dev_err_ratelimited(dev, "invalid pa!port=%s iova=0x%x\n",
+					    mtk_iommu_get_port_name(piommu,
+								    m4u_port),
+					    fault_mva);
+	}
+	if (int_state & F_INT_ENTRY_REPLACEMENT_FAULT(m4u_slave_id))
+		dev_err_ratelimited(dev, "replace-fault!port=%s iova=0x%x\n",
+				    mtk_iommu_get_port_name(piommu, m4u_port),
+				    fault_mva);
+
+	if (int_state & F_INT_TLB_MISS_FAULT(m4u_slave_id))
+		dev_err_ratelimited(dev, "tlb miss-fault!port=%s iova=0x%x\n",
+				    mtk_iommu_get_port_name(piommu, m4u_port),
+				    fault_mva);
+
+	/* Flush the whole TLB, then ack the interrupt before returning. */
+	mtk_iommu_invalidate_tlb(piommu, 1, 0, 0);
+
+	mtk_iommu_clear_intr(m4u_base);
+
+	return 0;
+}
+
+/*
+ * mtk_iommu_parse_dt - gather the iommu's resources from the device tree.
+ * @pdev: the iommu platform device
+ * @piommu: instance to fill in
+ *
+ * Maps the register base, fetches the interrupt and the infra clock,
+ * validates #iommu-cells, and resolves every local arbiter ("larb")
+ * phandle to its platform device. Returns 0 or a negative errno.
+ */
+static int mtk_iommu_parse_dt(struct platform_device *pdev,
+			      struct mtk_iommu_info *piommu)
+{
+	struct device *piommudev = &pdev->dev;
+	struct device_node *ofnode = piommudev->of_node;
+	struct resource *res;
+	unsigned int mtk_iommu_cell = 0;
+	unsigned int i;
+	int irq;
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	piommu->m4u_base = devm_ioremap_resource(&pdev->dev, res);
+	if (IS_ERR(piommu->m4u_base)) {
+		dev_err(piommudev, "m4u_base map failed\n");
+		return PTR_ERR(piommu->m4u_base);
+	}
+
+	/*
+	 * Check the irq in a signed local first: piommu->irq is an
+	 * unsigned int, so testing it against 0 directly would never
+	 * detect a failure from platform_get_irq().
+	 */
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0) {
+		dev_err(piommudev, "irq err %d\n", irq);
+		return irq;
+	}
+	piommu->irq = irq;
+
+	piommu->m4u_infra_clk = devm_clk_get(piommudev, "infra_m4u");
+	if (IS_ERR(piommu->m4u_infra_clk)) {
+		dev_err(piommudev, "infra_m4u clk get failed\n");
+		return PTR_ERR(piommu->m4u_infra_clk);
+	}
+
+	of_property_read_u32(ofnode, "#iommu-cells", &mtk_iommu_cell);
+	if (mtk_iommu_cell != 1) {
+		dev_err(piommudev, "iommu-cell fail:%d\n", mtk_iommu_cell);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < piommu->imucfg->larb_nr; i++) {
+		struct device_node *larbnode;
+
+		larbnode = of_parse_phandle(ofnode, "larb", i);
+		piommu->larbpdev[i] = of_find_device_by_node(larbnode);
+		of_node_put(larbnode);
+		if (!piommu->larbpdev[i]) {
+			dev_err(piommudev, "larb pdev fail@larb%d\n", i);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * mtk_iommu_hw_init - enable the m4u clock and program its registers.
+ *
+ * Installs the level-1 pagetable base, configures the control register,
+ * enables interrupts and points the fault-redirect window at the protect
+ * buffer. Returns 0 on success or -ENODEV if the clock cannot be enabled.
+ */
+static int mtk_iommu_hw_init(const struct mtk_iommu_domain *mtkdomain)
+{
+	struct mtk_iommu_info *piommu = mtkdomain->piommuinfo;
+	void __iomem *gm4ubaseaddr = piommu->m4u_base;
+	phys_addr_t protectpa;
+	u32 regval, protectreg;
+	int ret = 0;
+
+	ret = clk_prepare_enable(piommu->m4u_infra_clk);
+	if (ret) {
+		dev_err(piommu->dev, "m4u clk enable error\n");
+		return -ENODEV;
+	}
+
+	writel((u32)mtkdomain->pgd_pa, gm4ubaseaddr + REG_MMUG_PT_BASE);
+
+	regval = F_MMU_CTRL_REROUTE_PFQ_TO_MQ_EN |
+		F_MMU_CTRL_TF_PROT_VAL(2) |
+		F_MMU_CTRL_COHERE_EN;
+	writel(regval, gm4ubaseaddr + REG_MMU_CTRL_REG);
+
+	/* 0x6f / all-ones: interrupt enable masks — magic values,
+	 * presumably from the datasheet; TODO confirm. */
+	writel(0x6f, gm4ubaseaddr + REG_MMU_INT_L2_CONTROL);
+	writel(0xffffffff, gm4ubaseaddr + REG_MMU_INT_MAIN_CONTROL);
+
+	/* Protect memory: the HW writes here on a translation fault. */
+	protectpa = __virt_to_phys(piommu->protect_va);
+	protectpa = ALIGN(protectpa, MTK_PROTECT_PA_ALIGN);
+	protectreg = (u32)F_MMU_IVRP_PA_SET(protectpa);
+	writel(protectreg, gm4ubaseaddr + REG_MMU_IVRP_PADDR);
+
+	writel(0, gm4ubaseaddr + REG_MMU_DCM_DIS);
+	writel(0, gm4ubaseaddr + REG_MMU_STANDARD_AXI_MODE);
+
+	return 0;
+}
+
+/* Route one multimedia port through the m4u via its local arbiter (larb). */
+static inline void mtk_iommu_config_port(struct mtk_iommu_info *piommu,
+					 int portid)
+{
+	const struct mtk_iommu_port *port = &piommu->imucfg->pport[portid];
+
+	mtk_smi_config_port(piommu->larbpdev[port->larb_id], port->port_id);
+}
+
+/*
+ * pimudev is a global var for dma_alloc_coherent.
+ * It is not accepatable, we will delete it if "domain_alloc" is enabled
+ */
+static struct device *pimudev;
+
+/*
+ * mtk_iommu_domain_init - allocate a domain and its level-1 pagetable.
+ *
+ * The 16KB pagetable is allocated with dma_alloc_coherent() so the HW
+ * always sees a consistent table. Returns 0 or -ENOMEM.
+ */
+static int mtk_iommu_domain_init(struct iommu_domain *domain)
+{
+	struct mtk_iommu_domain *priv;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->pgd = dma_alloc_coherent(pimudev, M4U_PGD_SIZE, &priv->pgd_pa,
+				       GFP_KERNEL);
+	if (!priv->pgd) {
+		pr_err("dma_alloc_coherent pagetable fail\n");
+		goto err_pgtable;
+	}
+
+	/*
+	 * NOTE(review): dma_alloc_coherent() does not guarantee 16KB
+	 * alignment; this only detects — not fixes — a misaligned table.
+	 */
+	if (!IS_ALIGNED(priv->pgd_pa, M4U_PGD_SIZE)) {
+		pr_err("pagetable not aligned pa 0x%pad-0x%p align 0x%x\n",
+		       &priv->pgd_pa, priv->pgd, M4U_PGD_SIZE);
+		goto err_pgtable;
+	}
+
+	memset(priv->pgd, 0, M4U_PGD_SIZE);
+
+	spin_lock_init(&priv->pgtlock);
+	spin_lock_init(&priv->portlock);
+	domain->priv = priv;
+
+	/* The m4u translates the full 32-bit io-virtual address space. */
+	domain->geometry.aperture_start = 0;
+	domain->geometry.aperture_end   = (unsigned int)~0;
+	domain->geometry.force_aperture = true;
+
+	return 0;
+
+err_pgtable:
+	if (priv->pgd)
+		dma_free_coherent(pimudev, M4U_PGD_SIZE, priv->pgd,
+				  priv->pgd_pa);
+	kfree(priv);
+	return -ENOMEM;
+}
+
+/* Free the level-1 pagetable and the domain's private data. */
+static void mtk_iommu_domain_destroy(struct iommu_domain *domain)
+{
+	struct mtk_iommu_domain *mtkdomain = domain->priv;
+
+	dma_free_coherent(mtkdomain->piommuinfo->dev, M4U_PGD_SIZE,
+			  mtkdomain->pgd, mtkdomain->pgd_pa);
+	domain->priv = NULL;
+	kfree(mtkdomain);
+}
+
+/*
+ * mtk_iommu_attach_device - route a master's ports through the m4u.
+ * @domain: iommu domain being attached to
+ * @dev: master device; its "iommus" property lists the port ids
+ *
+ * For every "iommus" phandle+arg on @dev, configure that port in its
+ * local arbiter. The first entry also inherits the iommu's dma_ops.
+ * Returns 0 even when the domain has no iommu instance bound yet.
+ */
+static int mtk_iommu_attach_device(struct iommu_domain *domain,
+				   struct device *dev)
+{
+	unsigned long flags;
+	struct mtk_iommu_domain *priv = domain->priv;
+	struct mtk_iommu_info *piommu = priv->piommuinfo;
+	struct of_phandle_args out_args = {0};
+	unsigned int i = 0;
+
+	/* Nothing to configure until probe has bound an iommu instance. */
+	if (!piommu)
+		return 0;
+
+	spin_lock_irqsave(&priv->portlock, flags);
+
+	/*
+	 * NOTE(review): of_parse_phandle_with_args() may allocate; confirm
+	 * it is safe to call under a spinlock with interrupts disabled.
+	 */
+	while (!of_parse_phandle_with_args(dev->of_node, "iommus",
+					   "#iommu-cells", i, &out_args)) {
+		if (out_args.args_count == 1) {
+			unsigned int portid = out_args.args[0];
+
+			dev_dbg(dev, "iommu add port:%d\n", portid);
+
+			mtk_iommu_config_port(piommu, portid);
+
+			if (i == 0)
+				dev->archdata.dma_ops =
+					piommu->dev->archdata.dma_ops;
+		}
+		i++;
+	}
+
+	spin_unlock_irqrestore(&priv->portlock, flags);
+
+	return 0;
+}
+
+/* Currently a no-op; port configuration is not undone on detach. */
+static void mtk_iommu_detach_device(struct iommu_domain *domain,
+				    struct device *dev)
+{
+}
+
+/* Map @size bytes at @iova to @paddr, then flush the affected TLB range. */
+static int mtk_iommu_map(struct iommu_domain *domain, unsigned long iova,
+			 phys_addr_t paddr, size_t size, int prot)
+{
+	struct mtk_iommu_domain *mtkdomain = domain->priv;
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&mtkdomain->pgtlock, flags);
+	ret = m4u_map(mtkdomain, (unsigned int)iova, paddr, size, prot);
+	mtk_iommu_invalidate_tlb(mtkdomain->piommuinfo, 0,
+				 iova, iova + size - 1);
+	spin_unlock_irqrestore(&mtkdomain->pgtlock, flags);
+
+	return ret;
+}
+
+/* Unmap @size bytes at @iova, then flush the affected TLB range. */
+static size_t mtk_iommu_unmap(struct iommu_domain *domain,
+			      unsigned long iova, size_t size)
+{
+	struct mtk_iommu_domain *mtkdomain = domain->priv;
+	unsigned long flags;
+
+	spin_lock_irqsave(&mtkdomain->pgtlock, flags);
+	m4u_unmap(mtkdomain, (unsigned int)iova, size);
+	mtk_iommu_invalidate_tlb(mtkdomain->piommuinfo, 0,
+				 iova, iova + size - 1);
+	spin_unlock_irqrestore(&mtkdomain->pgtlock, flags);
+
+	return size;
+}
+
+/* Walk the pagetable and return the physical address backing @iova. */
+static phys_addr_t mtk_iommu_iova_to_phys(struct iommu_domain *domain,
+					  dma_addr_t iova)
+{
+	struct mtk_iommu_domain *mtkdomain = domain->priv;
+	struct m4u_pte_info_t info;
+	unsigned long flags;
+
+	spin_lock_irqsave(&mtkdomain->pgtlock, flags);
+	m4u_get_pte_info(mtkdomain, (unsigned int)iova, &info);
+	spin_unlock_irqrestore(&mtkdomain->pgtlock, flags);
+
+	return info.pa;
+}
+
+/*
+ * iommu_ops for the m4u. Supported page sizes: 4KB, 64KB, 1MB and 16MB.
+ * map_sg uses the generic per-page implementation.
+ */
+static struct iommu_ops mtk_iommu_ops = {
+	.domain_init = mtk_iommu_domain_init,
+	.domain_destroy = mtk_iommu_domain_destroy,
+	.attach_dev = mtk_iommu_attach_device,
+	.detach_dev = mtk_iommu_detach_device,
+	.map = mtk_iommu_map,
+	.unmap = mtk_iommu_unmap,
+	.map_sg = default_iommu_map_sg,
+	.iova_to_phys = mtk_iommu_iova_to_phys,
+	.pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
+};
+
+static const struct of_device_id mtk_iommu_of_ids[] = {
+	{ .compatible = "mediatek,mt8173-iommu",
+	  .data = &mtk_iommu_mt8173_cfg,
+	},
+	{}
+};
+
+/*
+ * mtk_iommu_probe - set up one m4u instance.
+ *
+ * Allocates the per-instance state, parses the DT resources, creates the
+ * level-2 pte cache, installs the iommu dma_ops, initializes the HW and
+ * requests the fault interrupt. Returns 0 on success or a negative errno
+ * (the original code returned 0 on every error path, which made the core
+ * treat a failed probe as successful).
+ */
+static int mtk_iommu_probe(struct platform_device *pdev)
+{
+	int ret;
+	struct iommu_domain *domain;
+	struct mtk_iommu_domain *mtk_domain;
+	struct mtk_iommu_info *piommu;
+	struct iommu_dma_domain  *dom;
+	const struct of_device_id *of_id;
+
+	piommu = devm_kzalloc(&pdev->dev, sizeof(struct mtk_iommu_info),
+			      GFP_KERNEL);
+	if (!piommu)
+		return -ENOMEM;
+
+	pimudev = &pdev->dev;
+	piommu->dev = &pdev->dev;
+
+	of_id = of_match_node(mtk_iommu_of_ids, pdev->dev.of_node);
+	if (!of_id)
+		return -ENODEV;
+
+	/*
+	 * Buffer the HW writes to on a translation fault; over-allocate so
+	 * the 128-byte-aligned window always fits inside it.
+	 */
+	piommu->protect_va = devm_kmalloc(piommu->dev, MTK_PROTECT_PA_ALIGN*2,
+					  GFP_KERNEL);
+	if (!piommu->protect_va)
+		return -ENOMEM;
+	memset(piommu->protect_va, 0x55, MTK_PROTECT_PA_ALIGN*2);
+
+	piommu->imucfg = (const struct mtk_iommu_cfg *)of_id->data;
+
+	ret = mtk_iommu_parse_dt(pdev, piommu);
+	if (ret) {
+		dev_err(piommu->dev, "iommu dt parse fail\n");
+		return ret;
+	}
+
+	/* alloc memcache for level-2 pgt */
+	piommu->m4u_pte_kmem = kmem_cache_create("m4u_pte", IMU_BYTES_PER_PTE,
+						 IMU_BYTES_PER_PTE, 0, NULL);
+	/* kmem_cache_create() returns NULL on failure, never an ERR_PTR. */
+	if (!piommu->m4u_pte_kmem) {
+		dev_err(piommu->dev, "pte cache create fail\n");
+		return -ENOMEM;
+	}
+
+	arch_setup_dma_ops(piommu->dev, 0, (1ULL<<32) - 1, &mtk_iommu_ops, 0);
+
+	dom = get_dma_domain(piommu->dev);
+	domain = iommu_dma_raw_domain(dom);
+	if (!domain) {	/* must be checked before the ->priv dereference */
+		ret = -EINVAL;
+		goto teardown;
+	}
+
+	mtk_domain = domain->priv;
+	mtk_domain->piommuinfo = piommu;
+
+	ret = mtk_iommu_hw_init(mtk_domain);
+	if (ret < 0)
+		goto teardown;
+
+	ret = devm_request_irq(piommu->dev, piommu->irq,
+			       mtk_iommu_isr, IRQF_TRIGGER_NONE,
+			       "mtkiommu", (void *)domain);
+	if (ret) {
+		dev_err(piommu->dev, "IRQ request %d failed\n",
+			piommu->irq);
+		goto teardown;
+	}
+
+	iommu_set_fault_handler(domain, mtk_iommu_fault_handler, piommu);
+
+	dev_set_drvdata(piommu->dev, piommu);
+
+	return 0;
+
+teardown:
+	arch_teardown_dma_ops(piommu->dev);
+	kmem_cache_destroy(piommu->m4u_pte_kmem);
+	dev_err(piommu->dev, "probe error %d\n", ret);
+	return ret;
+}
+
+/* Tear down the dma ops and release the level-2 pte cache. */
+static int mtk_iommu_remove(struct platform_device *pdev)
+{
+	struct mtk_iommu_info *piommu = platform_get_drvdata(pdev);
+
+	arch_teardown_dma_ops(&pdev->dev);
+	kmem_cache_destroy(piommu->m4u_pte_kmem);
+
+	return 0;
+}
+
+static struct platform_driver mtk_iommu_driver = {
+	.probe	= mtk_iommu_probe,
+	.remove	= mtk_iommu_remove,
+	.driver	= {
+		.name = "mtkiommu",
+		.of_match_table = mtk_iommu_of_ids,
+	}
+};
+
+static int __init mtk_iommu_init(void)
+{
+	return platform_driver_register(&mtk_iommu_driver);
+}
+
+/* Register early (subsys_initcall) so the iommu exists before its users. */
+subsys_initcall(mtk_iommu_init);
+
+
diff --git a/drivers/iommu/mtk_iommu.h b/drivers/iommu/mtk_iommu.h
new file mode 100644
index 0000000..239471f
--- /dev/null
+++ b/drivers/iommu/mtk_iommu.h
@@ -0,0 +1,73 @@ 
+/*
+ * Copyright (c) 2014-2015 MediaTek Inc.
+ * Author: Yong Wu <yong.wu@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef MTK_IOMMU_PLATFORM_H
+#define MTK_IOMMU_PLATFORM_H
+
+#include <linux/io.h>
+#include <linux/clk.h>
+#include <linux/err.h>
+#include <linux/mm.h>
+#include <linux/platform_device.h>
+
+#include "mtk_iommu_pagetable.h"
+
+#define M4U_PGD_SIZE       SZ_16K   /* pagetable size,mt8173 */
+
+#define MTK_PROTECT_PA_ALIGN  128
+
+#define MTK_IOMMU_LARB_MAX_NR 8
+#define MTK_IOMMU_PORT_MAX_NR 100
+
+struct mtk_iommu_port {
+	const char *port_name;
+	unsigned int m4u_id:2;
+	unsigned int m4u_slave:2;/* main tlb index in mm iommu */
+	unsigned int larb_id:4;
+	unsigned int port_id:8;/* port id in larb */
+	unsigned int tf_id:16; /* translation fault id */
+};
+
+struct mtk_iommu_cfg {
+	unsigned int larb_nr;
+	unsigned int m4u_port_nr;
+	const struct mtk_iommu_port *pport;
+};
+
+struct mtk_iommu_info {
+	void __iomem *m4u_base;
+	unsigned int  irq;
+	struct platform_device *larbpdev[MTK_IOMMU_LARB_MAX_NR];
+	struct clk *m4u_infra_clk;
+	void __iomem *protect_va;
+	struct device *dev;
+	struct kmem_cache *m4u_pte_kmem;
+	const struct mtk_iommu_cfg *imucfg;
+};
+
+struct mtk_iommu_domain {
+	struct imu_pgd_t *pgd;
+	dma_addr_t pgd_pa;
+	spinlock_t pgtlock;	/* lock for modifying page table */
+	spinlock_t portlock;    /* lock for config port */
+	struct mtk_iommu_info *piommuinfo;
+};
+
+int m4u_map(struct mtk_iommu_domain *m4u_domain, unsigned int iova,
+	    phys_addr_t paddr, unsigned int size, unsigned int prot);
+int m4u_unmap(struct mtk_iommu_domain *domain, unsigned int iova,
+	      unsigned int size);
+int m4u_get_pte_info(const struct mtk_iommu_domain *domain,
+		     unsigned int iova, struct m4u_pte_info_t *pte_info);
+
+#endif
diff --git a/drivers/iommu/mtk_iommu_pagetable.c b/drivers/iommu/mtk_iommu_pagetable.c
new file mode 100644
index 0000000..5fe9640
--- /dev/null
+++ b/drivers/iommu/mtk_iommu_pagetable.c
@@ -0,0 +1,439 @@ 
+/*
+ * Copyright (c) 2014-2015 MediaTek Inc.
+ * Author: Yong Wu <yong.wu@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/iommu.h>
+#include <linux/mm.h>
+#include <asm/cacheflush.h>
+
+#include "mtk_iommu.h"
+#include "mtk_iommu_pagetable.h"
+
+/* 2 level pagetable: pgd -> pte */
+#define F_PTE_TYPE_GET(regval)  (regval & 0x3)
+#define F_PTE_TYPE_LARGE         BIT(0)
+#define F_PTE_TYPE_SMALL         BIT(1)
+#define F_PTE_B_BIT              BIT(2)
+#define F_PTE_C_BIT              BIT(3)
+#define F_PTE_BIT32_BIT          BIT(9)
+#define F_PTE_S_BIT              BIT(10)
+#define F_PTE_NG_BIT             BIT(11)
+#define F_PTE_PA_LARGE_MSK            (~0UL << 16)
+#define F_PTE_PA_LARGE_GET(regval)    ((regval >> 16) & 0xffff)
+#define F_PTE_PA_SMALL_MSK            (~0UL << 12)
+#define F_PTE_PA_SMALL_GET(regval)    ((regval >> 12) & (~0))
+#define F_PTE_TYPE_IS_LARGE_PAGE(pte) ((imu_pte_val(pte) & 0x3) == \
+					F_PTE_TYPE_LARGE)
+#define F_PTE_TYPE_IS_SMALL_PAGE(pte) ((imu_pte_val(pte) & 0x3) == \
+					F_PTE_TYPE_SMALL)
+
+#define F_PGD_TYPE_PAGE         (0x1)
+#define F_PGD_TYPE_PAGE_MSK     (0x3)
+#define F_PGD_TYPE_SECTION      (0x2)
+#define F_PGD_TYPE_SUPERSECTION   (0x2 | (1 << 18))
+#define F_PGD_TYPE_SECTION_MSK    (0x3 | (1 << 18))
+#define F_PGD_TYPE_IS_PAGE(pgd)   ((imu_pgd_val(pgd)&3) == 1)
+#define F_PGD_TYPE_IS_SECTION(pgd) \
+	(F_PGD_TYPE_IS_PAGE(pgd) ? 0 : \
+		((imu_pgd_val(pgd) & F_PGD_TYPE_SECTION_MSK) == \
+			F_PGD_TYPE_SECTION))
+#define F_PGD_TYPE_IS_SUPERSECTION(pgd) \
+	(F_PGD_TYPE_IS_PAGE(pgd) ? 0 : \
+		((imu_pgd_val(pgd) & F_PGD_TYPE_SECTION_MSK) ==\
+			F_PGD_TYPE_SUPERSECTION))
+
+#define F_PGD_B_BIT                     BIT(2)
+#define F_PGD_C_BIT                     BIT(3)
+#define F_PGD_BIT32_BIT                 BIT(9)
+#define F_PGD_S_BIT                     BIT(16)
+#define F_PGD_NG_BIT                    BIT(17)
+#define F_PGD_NS_BIT_PAGE(ns)           (ns << 3)
+#define F_PGD_NS_BIT_SECTION(ns)        (ns << 19)
+#define F_PGD_NS_BIT_SUPERSECTION(ns)   (ns << 19)
+
+#define imu_pgd_index(addr)		((addr) >> IMU_PGDIR_SHIFT)
+#define imu_pgd_offset(domain, addr) ((domain)->pgd + imu_pgd_index(addr))
+
+#define imu_pte_index(addr)    (((addr)>>IMU_PAGE_SHIFT)&(IMU_PTRS_PER_PTE - 1))
+#define imu_pte_offset_map(pgd, addr) (imu_pte_map(pgd) + imu_pte_index(addr))
+
+/*
+ * Use ~0U: the plain "~0" is a signed int, and left-shifting a negative
+ * value is undefined behaviour in C (CERT INT34-C).  The masks are
+ * applied to 32-bit descriptor values, so unsigned is also the right type.
+ */
+#define F_PGD_PA_PAGETABLE_MSK            (~0U << 10)
+#define F_PGD_PA_SECTION_MSK              (~0U << 20)
+#define F_PGD_PA_SUPERSECTION_MSK         (~0U << 24)
+
+/* Return the kernel VA of the level-2 pte table referenced by @pgd. */
+static inline struct imu_pte_t *imu_pte_map(struct imu_pgd_t *pgd)
+{
+	unsigned int pte_pa = imu_pgd_val(*pgd);
+
+	/* the descriptor's low bits carry attributes; strip them before __va */
+	return (struct imu_pte_t *)(__va(pte_pa
+					& F_PGD_PA_PAGETABLE_MSK));
+}
+
+/*
+ * A supersection occupies 16 consecutive pgd entries (16 * 4 bytes);
+ * round @pgd down to the first entry of its group.
+ */
+static inline struct imu_pgd_t *imu_supersection_start(struct imu_pgd_t *pgd)
+{
+	return (struct imu_pgd_t *)(round_down((unsigned long)pgd, (16 * 4)));
+}
+
+/* Write one level-1 (pgd) descriptor. */
+static inline void m4u_set_pgd_val(struct imu_pgd_t *pgd, unsigned int val)
+{
+	imu_pgd_val(*pgd) = val;
+}
+
+/*
+ * Build the attribute bits of a section/supersection level-1 descriptor
+ * from IOMMU_* @prot flags.  @imu4gmode marks a physical address above
+ * 4G, which needs the extra bit32 flag in the descriptor.
+ */
+static inline unsigned int __m4u_get_pgd_attr(unsigned int prot,
+					      bool super, bool imu4gmode)
+{
+	unsigned int pgprot;
+
+	/* always non-secure and shared */
+	pgprot = F_PGD_NS_BIT_SECTION(1) | F_PGD_S_BIT;
+	pgprot |= super ? F_PGD_TYPE_SUPERSECTION : F_PGD_TYPE_SECTION;
+	pgprot |= (prot & IOMMU_CACHE) ? (F_PGD_C_BIT | F_PGD_B_BIT) : 0;
+	pgprot |= imu4gmode ? F_PGD_BIT32_BIT : 0;
+
+	return pgprot;
+}
+
+/*
+ * Build the attribute bits of a large/small page (level-2) descriptor
+ * from IOMMU_* @prot flags.  Use the F_PTE_* cacheability macros here:
+ * the original used F_PGD_C_BIT/F_PGD_B_BIT, which happen to share the
+ * same bit values but belong to the level-1 descriptor layout.
+ */
+static inline unsigned int __m4u_get_pte_attr(unsigned int prot,
+					      bool large, bool imu4gmode)
+{
+	unsigned int pgprot;
+
+	pgprot = F_PTE_S_BIT;
+	pgprot |= large ? F_PTE_TYPE_LARGE : F_PTE_TYPE_SMALL;
+	pgprot |= (prot & IOMMU_CACHE) ? (F_PTE_C_BIT | F_PTE_B_BIT) : 0;
+	pgprot |= imu4gmode ? F_PTE_BIT32_BIT : 0;
+
+	return pgprot;
+}
+
+/* Flush the CPU dcache so the m4u HW sees freshly written descriptors. */
+static inline void m4u_pgtable_flush(void *vastart, void *vaend)
+{
+	/*
+	 * this function is not acceptable, we will use dma_map_single
+	 * or use dma_pool_create for the level2 pagetable.
+	 */
+	__dma_flush_range(vastart, vaend);
+}
+
+/* Allocate a level-2 pte table and install it in @pgd with @pgprot.
+ *
+ * @return   0 -- pte is allocated
+ *     1 -- pte is not allocated, because it's allocated by others
+ *     <0 -- error
+ *
+ * NOTE(review): the test-then-set of *pgd is not atomic; this appears
+ * to rely on the caller holding the domain's pgtlock -- confirm.
+ */
+static int m4u_alloc_pte(struct mtk_iommu_domain *domain, struct imu_pgd_t *pgd,
+			 unsigned int pgprot)
+{
+	void *pte_new_va;
+	phys_addr_t pte_new;
+	struct kmem_cache *pte_kmem = domain->piommuinfo->m4u_pte_kmem;
+	struct device *dev = domain->piommuinfo->dev;
+	int ret;	/* was 'unsigned int'; the function returns int */
+
+	pte_new_va = kmem_cache_zalloc(pte_kmem, GFP_KERNEL);
+	if (unlikely(!pte_new_va)) {
+		dev_err(dev, "%s:fail, no memory\n", __func__);
+		return -ENOMEM;
+	}
+	pte_new = __virt_to_phys(pte_new_va);
+
+	/* check pte alignment -- must 1K align */
+	if (unlikely(pte_new & (IMU_BYTES_PER_PTE - 1))) {
+		dev_err(dev, "%s:fail, not align pa=0x%pa, va=0x%p\n",
+			__func__, &pte_new, pte_new_va);
+		kmem_cache_free(pte_kmem, pte_new_va);
+		return -ENOMEM;
+	}
+
+	/* because someone else may have allocated for this pgd first */
+	if (likely(!imu_pgd_val(*pgd))) {
+		m4u_set_pgd_val(pgd, (unsigned int)(pte_new) | pgprot);
+		dev_dbg(dev, "%s:pgd:0x%p,pte_va:0x%p,pte_pa:%pa,value:0x%x\n",
+			__func__, pgd, pte_new_va,
+			&pte_new, (unsigned int)(pte_new) | pgprot);
+		ret = 0;
+	} else {
+		/* allocated by other thread */
+		dev_dbg(dev, "m4u pte allocated by others: pgd=0x%p\n", pgd);
+		kmem_cache_free(pte_kmem, pte_new_va);
+		ret = 1;
+	}
+	return ret;
+}
+
+/*
+ * Detach the level-2 table from @pgd and release it back to the slab.
+ * The caller is responsible for knowing the table holds no mappings.
+ */
+static int m4u_free_pte(struct mtk_iommu_domain *domain, struct imu_pgd_t *pgd)
+{
+	struct imu_pte_t *pte_old;
+	struct kmem_cache *pte_kmem = domain->piommuinfo->m4u_pte_kmem;
+
+	/* resolve the table's VA before the descriptor is cleared */
+	pte_old = imu_pte_map(pgd);
+	m4u_set_pgd_val(pgd, 0);
+
+	kmem_cache_free(pte_kmem, pte_old);
+
+	return 0;
+}
+
+/*
+ * Map one small (4K) or large (64K) page at @iova -> @pa.  A large page
+ * is written as 16 identical pte entries; the level-2 table is
+ * allocated on demand.  Returns 0 on success, -EINVAL on misaligned
+ * iova/pa, -ENOMEM, or -EEXIST when a target pte is already in use
+ * (entries written by this call are rolled back).
+ */
+static int m4u_map_page(struct mtk_iommu_domain *m4u_domain, unsigned int iova,
+			phys_addr_t pa, unsigned int prot, bool largepage)
+{
+	int ret;
+	struct imu_pgd_t *pgd;
+	struct imu_pte_t *pte;
+	unsigned int pgprot;
+	unsigned int padscpt;
+	struct device *dev = m4u_domain->piommuinfo->dev;
+	unsigned int mask = largepage ?
+				F_PTE_PA_LARGE_MSK : F_PTE_PA_SMALL_MSK;
+	unsigned int i, ptenum = largepage ? 16 : 1;
+	bool imu4gmode = (pa > 0xffffffffL) ? true : false;
+
+	/* iova and pa must agree in the bits below the page size */
+	if ((iova & (~mask)) != ((unsigned int)pa & (~mask))) {
+		dev_err(dev, "error to mk_pte: iova=0x%x, pa=0x%pa, type=%s\n",
+			iova, &pa, largepage ? "large page" : "small page");
+		return -EINVAL;
+	}
+
+	iova &= mask;
+	padscpt = (unsigned int)pa & mask;
+
+	pgprot = F_PGD_TYPE_PAGE | F_PGD_NS_BIT_PAGE(1);
+	pgd = imu_pgd_offset(m4u_domain, iova);
+	if (!imu_pgd_val(*pgd)) {
+		/* no level-2 table yet: allocate one (may lose the race) */
+		ret = m4u_alloc_pte(m4u_domain, pgd, pgprot);
+		if (ret < 0)
+			return ret;
+	} else if ((imu_pgd_val(*pgd) & (~F_PGD_PA_PAGETABLE_MSK)) != pgprot) {
+		/* existing pgd entry is not a page-table descriptor */
+		dev_err(dev, "%s: iova=0x%x, pgd=0x%x, pgprot=0x%x\n",
+			__func__, iova, imu_pgd_val(*pgd), pgprot);
+		return -1;
+	}
+
+	pgprot = __m4u_get_pte_attr(prot, largepage, imu4gmode);
+	pte = imu_pte_offset_map(pgd, iova);
+
+	dev_dbg(dev, "%s:iova:0x%x,pte:0x%p(0x%p+0x%x),pa:%pa,value:0x%x-%s\n",
+		__func__, iova, &imu_pte_val(*pte), imu_pte_map(pgd),
+		imu_pte_index(iova), &pa, padscpt | pgprot,
+		largepage ? "large page" : "small page");
+
+	for (i = 0; i < ptenum; i++) {
+		if (imu_pte_val(pte[i])) {
+			dev_err(dev, "%s: pte=0x%x, i=%d\n", __func__,
+				imu_pte_val(pte[i]), i);
+			goto err_out;
+		}
+		imu_pte_val(pte[i]) = padscpt | pgprot;
+	}
+
+	m4u_pgtable_flush(pte, pte + ptenum);
+
+	return 0;
+
+ err_out:
+	/*
+	 * Roll back the ptes written by this call.  'i' is unsigned, so
+	 * the original "for (i--; i >= 0; i--)" never terminated: once i
+	 * wrapped past zero the loop zeroed memory far outside the table.
+	 */
+	while (i--)
+		imu_pte_val(pte[i]) = 0;
+	return -EEXIST;
+}
+
+/*
+ * Map one section (1M) or supersection (16M) at @iova -> @pa directly
+ * in the level-1 table.  A supersection is written as 16 identical pgd
+ * entries.  Returns 0 on success, -EINVAL on misaligned iova/pa, or
+ * -EEXIST when a target entry is in use (written entries are undone).
+ */
+static int m4u_map_section(struct mtk_iommu_domain *m4u_domain,
+			   unsigned int iova, phys_addr_t pa,
+			   unsigned int prot, bool supersection)
+{
+	int i;
+	struct imu_pgd_t *pgd;
+	unsigned int pgprot;
+	unsigned int padscpt;
+	struct device *dev = m4u_domain->piommuinfo->dev;
+	unsigned int mask = supersection ?
+			F_PGD_PA_SUPERSECTION_MSK : F_PGD_PA_SECTION_MSK;
+	unsigned int pgdnum = supersection ? 16 : 1;
+	bool imu4gmode = (pa > 0xffffffffL) ? true : false;
+
+	/* iova and pa must agree in the bits below the section size */
+	if ((iova & (~mask)) != ((unsigned int)pa & (~mask))) {
+		dev_err(dev, "error to mk_pte: iova=0x%x, pa=0x%pa,type=%s\n",
+			iova, &pa, supersection ? "supersection" : "section");
+		return -EINVAL;
+	}
+
+	iova &= mask;
+	padscpt = (unsigned int)pa & mask;
+
+	pgprot = __m4u_get_pgd_attr(prot, supersection, imu4gmode);
+	pgd = imu_pgd_offset(m4u_domain, iova);
+
+	dev_dbg(dev, "%s:iova:0x%x,pgd:0x%p(0x%p+0x%x),pa:%pa,value:0x%x-%s\n",
+		__func__, iova, pgd, (m4u_domain)->pgd, imu_pgd_index(iova),
+		&pa, padscpt | pgprot,
+		supersection ? "supersection" : "section");
+
+	for (i = 0; i < pgdnum; i++) {
+		if (unlikely(imu_pgd_val(*pgd))) {
+			dev_err(dev, "%s:iova=0x%x, pgd=0x%x, i=%d\n", __func__,
+				iova, imu_pgd_val(*pgd), i);
+			goto err_out;
+		}
+		m4u_set_pgd_val(pgd, padscpt | pgprot);
+		pgd++;
+	}
+	return 0;
+
+ err_out:
+	/* undo the i entries written before the collision */
+	for (pgd--; i > 0; i--) {
+		m4u_set_pgd_val(pgd, 0);
+		pgd--;
+	}
+	return -EEXIST;
+}
+
+/*
+ * Map @size bytes at @iova -> @paddr.  Only the four granules the m4u
+ * hardware supports are accepted: 4K/64K pages and 1M/16M sections.
+ */
+int m4u_map(struct mtk_iommu_domain *m4u_domain, unsigned int iova,
+	    phys_addr_t paddr, unsigned int size, unsigned int prot)
+{
+	if (size == SZ_4K) {/* most case */
+		return m4u_map_page(m4u_domain, iova, paddr, prot, false);
+	} else if (size == SZ_64K) {
+		return m4u_map_page(m4u_domain, iova, paddr, prot, true);
+	} else if (size == SZ_1M) {
+		return m4u_map_section(m4u_domain, iova, paddr, prot, false);
+	} else if (size == SZ_16M) {
+		return m4u_map_section(m4u_domain, iova, paddr, prot, true);
+	} else {
+		return -EINVAL;
+	}
+}
+
+/*
+ * Free the level-2 table under @pgd if it no longer holds any mapping.
+ * @return 1 -- table still in use, kept;  0 -- table was empty, freed.
+ */
+static int m4u_check_free_pte(struct mtk_iommu_domain *domain,
+			      struct imu_pgd_t *pgd)
+{
+	struct imu_pte_t *pte;
+	int i;
+
+	/* scan every entry; bail out on the first live one */
+	pte = imu_pte_map(pgd);
+	for (i = 0; i < IMU_PTRS_PER_PTE; i++, pte++) {
+		if (imu_pte_val(*pte) != 0)
+			return 1;
+	}
+
+	m4u_free_pte(domain, pgd);
+	return 0;
+}
+
+/*
+ * Unmap [iova, iova + size).  Walks the level-1 table and clears
+ * whatever descriptor type is found at each step; level-2 tables that
+ * become empty are freed along the way.  Always returns 0.
+ */
+int m4u_unmap(struct mtk_iommu_domain *domain, unsigned int iova,
+	      unsigned int size)
+{
+	struct imu_pgd_t *pgd;
+	int i, ret;
+	unsigned long end_plus_1 = (unsigned long)iova + size;
+
+	do {
+		pgd = imu_pgd_offset(domain, iova);
+
+		if (F_PGD_TYPE_IS_PAGE(*pgd)) {
+			struct imu_pte_t *pte;
+			unsigned int pte_offset;
+			unsigned int num_to_clean;
+
+			/* clear at most up to the end of this level-2 table */
+			pte_offset = imu_pte_index(iova);
+			num_to_clean =
+			    min((unsigned int)((end_plus_1 - iova) / PAGE_SIZE),
+				(unsigned int)(IMU_PTRS_PER_PTE - pte_offset));
+
+			pte = imu_pte_offset_map(pgd, iova);
+
+			/* each pte entry is 4 bytes, hence "<< 2" */
+			memset(pte, 0, num_to_clean << 2);
+
+			ret = m4u_check_free_pte(domain, pgd);
+			if (ret == 1)/* pte is not freed, need to flush pte */
+				m4u_pgtable_flush(pte, pte + num_to_clean);
+
+			iova += num_to_clean << PAGE_SHIFT;
+		} else if (F_PGD_TYPE_IS_SECTION(*pgd)) {
+			m4u_set_pgd_val(pgd, 0);
+			iova += MMU_SECTION_SIZE;
+		} else if (F_PGD_TYPE_IS_SUPERSECTION(*pgd)) {
+			struct imu_pgd_t *start = imu_supersection_start(pgd);
+
+			if (unlikely(start != pgd))
+				dev_err(domain->piommuinfo->dev,
+					"%s:supper not align,iova=0x%x,pgd=0x%x\n",
+					__func__, iova, imu_pgd_val(*pgd));
+
+			/* a supersection spans 16 pgd entries: clear them all */
+			for (i = 0; i < 16; i++)
+				m4u_set_pgd_val((start+i), 0);
+
+			/* jump to the next 16M-aligned boundary */
+			iova = (iova + MMU_SUPERSECTION_SIZE) &
+				(~(MMU_SUPERSECTION_SIZE - 1));
+		} else {
+			/* nothing mapped here; skip one section */
+			iova += MMU_SECTION_SIZE;
+		}
+	} while (iova < end_plus_1 && iova); /* "&& iova" guards 4G wrap */
+
+	return 0;
+}
+
+/*
+ * Walk the pagetable for @iova and fill @pte_info with the level-1/2
+ * descriptor pointers, the translated PA, the mapping granule size and
+ * whether the address is actually mapped.  Always returns 0.
+ */
+int m4u_get_pte_info(const struct mtk_iommu_domain *domain, unsigned int iova,
+		     struct m4u_pte_info_t *pte_info)
+{
+	struct imu_pgd_t *pgd;
+	struct imu_pte_t *pte;
+	unsigned int pa = 0;
+	unsigned int size;
+	int valid = 1;
+
+	pgd = imu_pgd_offset(domain, iova);
+
+	if (F_PGD_TYPE_IS_PAGE(*pgd)) {
+		/* level-2 walk: large (64K) or small (4K) page */
+		pte = imu_pte_offset_map(pgd, iova);
+		if (F_PTE_TYPE_GET(imu_pte_val(*pte)) == F_PTE_TYPE_LARGE) {
+			pa = imu_pte_val(*pte) & F_PTE_PA_LARGE_MSK;
+			pa |= iova & (~F_PTE_PA_LARGE_MSK);
+			size = MMU_LARGE_PAGE_SIZE;
+		} else if (F_PTE_TYPE_GET(imu_pte_val(*pte))
+			   == F_PTE_TYPE_SMALL) {
+			pa = imu_pte_val(*pte) & F_PTE_PA_SMALL_MSK;
+			pa |= iova & (~F_PTE_PA_SMALL_MSK);
+			size = MMU_SMALL_PAGE_SIZE;
+		} else {
+			/* table exists but this entry holds no mapping */
+			valid = 0;
+			size = MMU_SMALL_PAGE_SIZE;
+		}
+	} else {
+		/* no level-2 table: section, supersection, or unmapped */
+		pte = NULL;
+		if (F_PGD_TYPE_IS_SECTION(*pgd)) {
+			pa = imu_pgd_val(*pgd) & F_PGD_PA_SECTION_MSK;
+			pa |= iova & (~F_PGD_PA_SECTION_MSK);
+			size = MMU_SECTION_SIZE;
+		} else if (F_PGD_TYPE_IS_SUPERSECTION(*pgd)) {
+			pa = imu_pgd_val(*pgd) & F_PGD_PA_SUPERSECTION_MSK;
+			pa |= iova & (~F_PGD_PA_SUPERSECTION_MSK);
+			size = MMU_SUPERSECTION_SIZE;
+		} else {
+			valid = 0;
+			size = MMU_SECTION_SIZE;
+		}
+	}
+
+	pte_info->pgd = pgd;
+	pte_info->pte = pte;
+	pte_info->iova = iova;
+	pte_info->pa = pa;
+	pte_info->size = size;
+	pte_info->valid = valid;
+	return 0;
+}
+
diff --git a/drivers/iommu/mtk_iommu_pagetable.h b/drivers/iommu/mtk_iommu_pagetable.h
new file mode 100644
index 0000000..ebdfc6c
--- /dev/null
+++ b/drivers/iommu/mtk_iommu_pagetable.h
@@ -0,0 +1,49 @@ 
+/*
+ * Copyright (c) 2014-2015 MediaTek Inc.
+ * Author: Yong Wu <yong.wu@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef MTK_IOMMU_PAGETABLE_H
+#define MTK_IOMMU_PAGETABLE_H
+
+/* mapping granules supported by the mt8173 m4u hardware */
+#define MMU_SMALL_PAGE_SIZE     (SZ_4K)
+#define MMU_LARGE_PAGE_SIZE     (SZ_64K)
+#define MMU_SECTION_SIZE        (SZ_1M)
+#define MMU_SUPERSECTION_SIZE   (SZ_16M)
+
+#define IMU_PGDIR_SHIFT   20   /* each level-1 entry covers 1M */
+#define IMU_PAGE_SHIFT    12   /* each level-2 entry covers 4K */
+#define IMU_PTRS_PER_PGD  4096 /* entries in the level-1 table */
+#define IMU_PTRS_PER_PTE  256  /* entries in one level-2 table */
+#define IMU_BYTES_PER_PTE (IMU_PTRS_PER_PTE*sizeof(unsigned int))
+
+/* one level-2 (pte) descriptor */
+struct imu_pte_t {
+	unsigned int imu_pte;
+};
+
+/* one level-1 (pgd) descriptor */
+struct imu_pgd_t {
+	unsigned int imu_pgd;
+};
+
+#define imu_pte_val(x)      ((x).imu_pte)
+#define imu_pgd_val(x)      ((x).imu_pgd)
+
+/* result of a pagetable walk, filled in by m4u_get_pte_info() */
+struct m4u_pte_info_t {
+	struct imu_pgd_t *pgd; /* level-1 entry covering iova */
+	struct imu_pte_t *pte; /* level-2 entry, NULL for (super)sections */
+	unsigned int iova;
+	phys_addr_t pa;        /* translated physical address, if valid */
+	unsigned int size;     /* mapping granule covering iova */
+	int valid;             /* non-zero if iova is actually mapped */
+};
+
+#endif
+