diff mbox series

[2/3] iommu: Add Allwinner H6 IOMMU driver

Message ID fe383c3f6fa0db772c87d9d9080add97efe9ba1a.1579696927.git-series.maxime@cerno.tech (mailing list archive)
State New, archived
Headers show
Series iommu: Add Allwinner H6 IOMMU driver | expand

Commit Message

Maxime Ripard Jan. 22, 2020, 12:44 p.m. UTC
The Allwinner H6 has introduced an IOMMU for a few DMA controllers, mostly
video related: the display engine, the video decoders / encoders, the
camera capture controller, etc.

The design is pretty simple compared to other IOMMUs found in SoCs: there's
a single instance, controlling all the masters, with a single address
space.

It also features a performance monitoring unit that allows one to retrieve
various information (per-master and global TLB accesses, hits and misses,
access latency, etc.), which isn't supported at the moment.

Signed-off-by: Maxime Ripard <maxime@cerno.tech>
---
 drivers/iommu/Kconfig        |   10 +-
 drivers/iommu/Makefile       |    1 +-
 drivers/iommu/sun50i-iommu.c | 1126 +++++++++++++++++++++++++++++++++++-
 3 files changed, 1137 insertions(+)
 create mode 100644 drivers/iommu/sun50i-iommu.c

Comments

Robin Murphy Jan. 23, 2020, 6:47 p.m. UTC | #1
Hi Maxime,

This looks like a pretty decent first cut of a driver, thanks! Plenty of 
comments, but overall it looks to be in mostly the right shape to begin 
with.

On 22/01/2020 12:44 pm, Maxime Ripard wrote:
> The Allwinner H6 has introduced an IOMMU for a few DMA controllers, mostly
> video related: the display engine, the video decoders / encoders, the
> camera capture controller, etc.
> 
> The design is pretty simple compared to other IOMMUs found in SoCs: there's
> a single instance, controlling all the masters, with a single address
> space.
> 
> It also features a performance monitoring unit that allows to retrieve
> various informations (per-master and global TLB accesses, hits and misses,
> access latency, etc) that isn't supported at the moment.

Yes, it sounds like that might ideally be a separate perf_events driver, 
but there's no need to get into the details of that until the basic 
IOMMU driver is baked.

> Signed-off-by: Maxime Ripard <maxime@cerno.tech>
> ---
>   drivers/iommu/Kconfig        |   10 +-
>   drivers/iommu/Makefile       |    1 +-
>   drivers/iommu/sun50i-iommu.c | 1126 +++++++++++++++++++++++++++++++++++-
>   3 files changed, 1137 insertions(+)
>   create mode 100644 drivers/iommu/sun50i-iommu.c
> 
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 0b9d78a0f3ac..5cbfa6f282e2 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -289,6 +289,16 @@ config ROCKCHIP_IOMMU
>   	  Say Y here if you are using a Rockchip SoC that includes an IOMMU
>   	  device.
>   
> +config SUN50I_IOMMU
> +	bool "Allwinner H6 IOMMU Support"
> +	depends on ARM || ARM64
> +	depends on ARCH_SUNXI

Surely ARCH_SUNXI implies "ARM || ARM64" anyway? Ideally there shouldn't 
be anything CPU-specific here, so supporting COMPILE_TEST as well would 
be nice.

> +	select ARM_DMA_USE_IOMMU
> +	select IOMMU_API
> +	select IOMMU_DMA
> +	help
> +	  Support for the IOMMU introduced in the Allwinner H6 SoCs.
> +
>   config TEGRA_IOMMU_GART
>   	bool "Tegra GART IOMMU Support"
>   	depends on ARCH_TEGRA_2x_SOC
> diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
> index 97814cc861ea..43740a755786 100644
> --- a/drivers/iommu/Makefile
> +++ b/drivers/iommu/Makefile
> @@ -28,6 +28,7 @@ obj-$(CONFIG_MTK_IOMMU_V1) += mtk_iommu_v1.o
>   obj-$(CONFIG_OMAP_IOMMU) += omap-iommu.o
>   obj-$(CONFIG_OMAP_IOMMU_DEBUG) += omap-iommu-debug.o
>   obj-$(CONFIG_ROCKCHIP_IOMMU) += rockchip-iommu.o
> +obj-$(CONFIG_SUN50I_IOMMU) += sun50i-iommu.o
>   obj-$(CONFIG_TEGRA_IOMMU_GART) += tegra-gart.o
>   obj-$(CONFIG_TEGRA_IOMMU_SMMU) += tegra-smmu.o
>   obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o
> diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c
> new file mode 100644
> index 000000000000..ffca92628006
> --- /dev/null
> +++ b/drivers/iommu/sun50i-iommu.c
> @@ -0,0 +1,1126 @@
> +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
> +// Copyright (C) 2016-2018, Allwinner Technology CO., LTD.
> +// Copyright (C) 2019-2020, Cerno
> +
> +#include <linux/bug.h>
> +#include <linux/clk.h>
> +#include <linux/device.h>
> +#include <linux/dma-direction.h>
> +#include <linux/dma-iommu.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/err.h>
> +#include <linux/errno.h>
> +#include <linux/interrupt.h>
> +#include <linux/iommu.h>
> +#include <linux/iopoll.h>
> +#include <linux/ioport.h>
> +#include <linux/log2.h>
> +#include <linux/module.h>
> +#include <linux/of_platform.h>
> +#include <linux/platform_device.h>
> +#include <linux/pm.h>
> +#include <linux/pm_runtime.h>
> +#include <linux/reset.h>
> +#include <linux/sizes.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/types.h>
> +
> +#define IOMMU_RESET_REG			0x010
> +#define IOMMU_ENABLE_REG		0x020
> +#define IOMMU_ENABLE_ENABLE			BIT(0)
> +
> +#define IOMMU_BYPASS_REG		0x030
> +#define IOMMU_AUTO_GATING_REG		0x040
> +#define IOMMU_AUTO_GATING_ENABLE		BIT(0)
> +
> +#define IOMMU_WBUF_CTRL_REG		0x044
> +#define IOMMU_OOO_CTRL_REG		0x048
> +#define IOMMU_4KB_BDY_PRT_CTRL_REG	0x04c
> +#define IOMMU_TTB_REG			0x050
> +#define IOMMU_TLB_ENABLE_REG		0x060
> +#define IOMMU_TLB_PREFETCH_REG		0x070
> +#define IOMMU_TLB_PREFETCH_MASTER_ENABLE(m)	BIT(m)
> +
> +#define IOMMU_TLB_FLUSH_REG		0x080
> +#define IOMMU_TLB_FLUSH_PTW_CACHE		BIT(17)
> +#define IOMMU_TLB_FLUSH_MACRO_TLB		BIT(16)
> +#define IOMMU_TLB_FLUSH_MICRO_TLB(i)		(BIT(i) & GENMASK(5, 0))
> +
> +#define IOMMU_TLB_IVLD_ADDR_REG		0x090
> +#define IOMMU_TLB_IVLD_ADDR_MASK_REG	0x094
> +#define IOMMU_TLB_IVLD_ENABLE_REG	0x098
> +#define IOMMU_TLB_IVLD_ENABLE_ENABLE		BIT(0)
> +
> +#define IOMMU_PC_IVLD_ADDR_REG		0x0a0
> +#define IOMMU_PC_IVLD_ENABLE_REG	0x0a8
> +#define IOMMU_PC_IVLD_ENABLE_ENABLE		BIT(0)
> +
> +#define IOMMU_DM_AUT_CTRL_REG(d)	(0x0b0 + ((d) / 2) * 4)
> +#define IOMMU_DM_AUT_CTRL_RD_UNAVAIL(d, m)	(1 << (((d & 1) * 16) + ((m) * 2)))
> +#define IOMMU_DM_AUT_CTRL_RD_AVAIL(d, m)	(0 << (((d & 1) * 16) + ((m) * 2)))

It might just be personal opinion, but I think overly-complex ways of 
expressing 0 are usually better left entirely unsaid. Plus this makes 
sun50i_iommu_resume() appear to be doing a lot more than it really is - 
it seems sufficiently straightforward to me for the mere absence of 
"available" to imply "unavailable".

> +#define IOMMU_DM_AUT_CTRL_WR_UNAVAIL(d, m)	(1 << (((d & 1) * 16) + ((m) * 2) + 1))
> +#define IOMMU_DM_AUT_CTRL_WR_AVAIL(d, m)	(0 << (((d & 1) * 16) + ((m) * 2) + 1))
> +
> +#define IOMMU_DM_AUT_OVWT_REG		0x0d0
> +#define IOMMU_INT_ENABLE_REG		0x100
> +#define IOMMU_INT_CLR_REG		0x104
> +#define IOMMU_INT_STA_REG		0x108
> +#define IOMMU_INT_ERR_ADDR_REG(i)	(0x110 + (i) * 4)
> +#define IOMMU_INT_ERR_ADDR_L1_REG	0x130
> +#define IOMMU_INT_ERR_ADDR_L2_REG	0x134
> +#define IOMMU_INT_ERR_DATA_REG(i)	(0x150 + (i) * 4)
> +#define IOMMU_L1PG_INT_REG		0x0180
> +#define IOMMU_L2PG_INT_REG		0x0184
> +
> +#define IOMMU_INT_INVALID_L2PG			BIT(17)
> +#define IOMMU_INT_INVALID_L1PG			BIT(16)
> +#define IOMMU_INT_MASTER_PERMISSION(m)		BIT(m)
> +#define IOMMU_INT_MASTER_MASK			(IOMMU_INT_MASTER_PERMISSION(0) | \
> +						 IOMMU_INT_MASTER_PERMISSION(1) | \
> +						 IOMMU_INT_MASTER_PERMISSION(2) | \
> +						 IOMMU_INT_MASTER_PERMISSION(3) | \
> +						 IOMMU_INT_MASTER_PERMISSION(4) | \
> +						 IOMMU_INT_MASTER_PERMISSION(5))
> +#define IOMMU_INT_MASK				(IOMMU_INT_INVALID_L1PG | \
> +						 IOMMU_INT_INVALID_L2PG | \
> +						 IOMMU_INT_MASTER_MASK)
> +
> +#define PT_ENTRY_SIZE			sizeof(u32)
> +
> +#define NUM_DT_ENTRIES			4096
> +#define DT_SIZE				(NUM_DT_ENTRIES * PT_ENTRY_SIZE)
> +
> +#define NUM_PT_ENTRIES			256
> +#define PT_SIZE				(NUM_PT_ENTRIES * PT_ENTRY_SIZE)

Sigh, legacy ARM format strikes again... oh for nice logical 1024*1024 
instead of wonky 4096*256 :P

> +struct sun50i_iommu {
> +	struct iommu_device iommu;
> +
> +	/* Lock to modify the IOMMU registers */
> +	spinlock_t iommu_lock;
> +
> +	struct device *dev;
> +	void __iomem *base;
> +	struct reset_control *reset;
> +	struct clk *clk;
> +
> +	struct iommu_domain *domain;
> +	struct iommu_group *group;
> +	struct kmem_cache *pt_pool;
> +};
> +
> +struct sun50i_iommu_domain {
> +	struct iommu_domain domain;
> +
> +	/* Lock to modify the Directory Table */
> +	spinlock_t dt_lock;
> +
> +	/* L1 Page Table */
> +	u32 *dt;
> +	dma_addr_t dt_dma;

AFAICS you never use this. More on that later...

> +	struct sun50i_iommu *iommu;
> +};
> +
> +static struct sun50i_iommu_domain *to_sun50i_domain(struct iommu_domain *domain)
> +{
> +	return container_of(domain, struct sun50i_iommu_domain, domain);
> +}
> +
> +static struct sun50i_iommu *sun50i_iommu_from_dev(struct device *dev)
> +{
> +	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> +
> +	if (!fwspec)
> +		return NULL;

Strictly you should only need that check in .add_device and .attach_dev 
- if the other callbacks end up getting passed the wrong device then 
things have arguably gone crash-worthily wrong already.

> +	return fwspec->iommu_priv;
> +}
> +
> +static inline u32 iommu_read(struct sun50i_iommu *iommu, u32 offset)
> +{
> +	return readl(iommu->base + offset);
> +}
> +
> +static inline void iommu_write(struct sun50i_iommu *iommu,
> +			       u32 offset, u32 value)
> +{
> +	writel(value, iommu->base + offset);
> +}
> +
> +/*
> + * The Allwinner H6 IOMMU uses a 2-level page table.
> + *
> + * The first level is the usual Directory Table (DT), that consists of
> + * 4096 4-bytes Directory Table Entries (DTE), each pointing to a Page
> + * Table (PT).
> + *
> + * Each PT consits of 256 4-bytes Page Table Entries (PTE), each
> + * pointing to a 4kB page of physical memory.
> + *
> + * The IOMMU supports a single DT, pointed by the IOMMU_TTB_REG
> + * register that contains its physical address.
> + */
> +
> +#define SUN50I_IOVA_DTE_MASK	GENMASK(31, 20)
> +#define SUN50I_IOVA_DTE_SHIFT	20
> +#define SUN50I_IOVA_PTE_MASK	GENMASK(19, 12)
> +#define SUN50I_IOVA_PTE_SHIFT	12
> +#define SUN50I_IOVA_PAGE_MASK	GENMASK(11, 0)
> +#define SUN50I_IOVA_PAGE_SHIFT	0
> +
> +static u32 sun50i_iova_dte_index(dma_addr_t iova)
> +{
> +	return (u32)(iova & SUN50I_IOVA_DTE_MASK) >> SUN50I_IOVA_DTE_SHIFT;
> +}
> +
> +static u32 sun50i_iova_pte_index(dma_addr_t iova)
> +{
> +	return (u32)(iova & SUN50I_IOVA_PTE_MASK) >> SUN50I_IOVA_PTE_SHIFT;
> +}
> +
> +static u32 sun50i_iova_page_offset(dma_addr_t iova)
> +{
> +	return (u32)(iova & SUN50I_IOVA_PAGE_MASK) >> SUN50I_IOVA_PAGE_SHIFT;

A shift in a "page offset" calculation looks weird and surprising at the 
best of times, so a redundant one is *actively* confounding.

> +}
> +
> +/*
> + * Each Directory Table Entry has a Page Table address and a valid
> + * bit:
> +
> + * +---------------------+-----------+-+
> + * | PT address          | Reserved  |V|
> + * +---------------------+-----------+-+
> + *  31:10 - Page Table address
> + *   9:2  - Reserved
> + *   1:0  - 1 if the entry is valid
> + */
> +
> +#define SUN50I_DTE_PT_ADDRESS_MASK	GENMASK(31, 10)
> +#define SUN50I_DTE_PT_ATTRS		GENMASK(1, 0)
> +#define SUN50I_DTE_PT_VALID		BIT(0)
> +
> +static inline phys_addr_t sun50i_dte_pt_address(u32 dte)
> +{
> +	return (phys_addr_t)dte & SUN50I_DTE_PT_ADDRESS_MASK;
> +}
> +
> +static inline bool sun50i_dte_is_pt_valid(u32 dte)
> +{
> +	return (dte & SUN50I_DTE_PT_ATTRS) == SUN50I_DTE_PT_VALID;

If SUN50I_DTE_PT_ATTRS was being used consistently with an expectation 
that it might possibly grow in future, then it might deserve to stay, 
but right now I'd say just "return dte & SUN50I_DTE_PT_VALID" here to 
match the use below.

> +}
> +
> +static inline u32 sun50i_mk_dte(dma_addr_t pt_dma)
> +{
> +	return (pt_dma & SUN50I_DTE_PT_ADDRESS_MASK) | SUN50I_DTE_PT_VALID;
> +}
> +
> +/*
> + * Each PTE has a Page address, an authority index and a valid bit:
> + *
> + * +----------------+-----+-----+-----+---+-----+
> + * | Page address   | Rsv | ACI | Rsv | V | Rsv |
> + * +----------------+-----+-----+-----+---+-----+
> + *  31:12 - Page address
> + *  11:8  - Reserved
> + *   7:4  - Authority Control Index
> + *   3:2  - Reserved
> + *     1  - 1 if the entry is valid
> + *     0  - Reserved
> + *
> + * The way permissions work is that the IOMMU has 16 "domains" that
> + * can be configured to give each masters either read or write
> + * permissions through the IOMMU_DM_AUT_CTRL_REG registers. The domain
> + * 0 seems like the default domain, and its permissions in the
> + * IOMMU_DM_AUT_CTRL_REG are only read-only, so it's not really
> + * useful to enforce any particular permission.
> + *
> + * Each page entry willthen have a reference to the domain they are

Nit: missing space

> + * affected to, so that we can actually enforce them on a per-page
> + * basis.
> + *
> + * In order to make it work with the IOMMU framework, we will be using
> + * 4 different domains, starting at 1: RD_WR, RD, WR and NONE
> + * depending on the permission we want to enforce. Each domain will
> + * have each master setup in the same way, since the IOMMU framework
> + * doesn't seem to restrict page access on a per-device basis. And
> + * then we will use the relevant domain index when generating the page
> + * table entry depending on the permissions we want to be enforced.
> + */
> +
> +enum iommu_aci {
> +	IOMMU_ACI_DO_NOT_USE = 0,
> +	IOMMU_ACI_NONE,
> +	IOMMU_ACI_RD,
> +	IOMMU_ACI_WR,
> +	IOMMU_ACI_RD_WR,
> +};

The "iommu" namespace is already somewhat overloaded between the core 
API and amd-iommu - "sun50i_aci" might be nicer.

> +#define SUN50I_PTE_PAGE_ADDRESS_MASK	GENMASK(31, 12)
> +#define SUN50I_PTE_ACI_MASK		GENMASK(7, 4)
> +#define SUN50I_PTE_ACI_SHIFT		4
> +#define SUN50I_PTE_ACI(aci)		((aci) << SUN50I_PTE_ACI_SHIFT)
> +#define SUN50I_PTE_PAGE_VALID		BIT(1)
> +
> +static inline phys_addr_t sun50i_pte_page_address(u32 pte)
> +{
> +	return (phys_addr_t)pte & SUN50I_PTE_PAGE_ADDRESS_MASK;
> +}
> +
> +static inline enum iommu_aci sun50i_pte_aci(u32 pte)

Eww, "lowercase name does opposite of uppercase name" is a confusingly 
unfamiliar paradigm - can we have a clearer distinction between "get" 
and "set" operations please? Plus a bit more consistency in terms of 
macros vs. helper functions (maybe consider bitfield.h helpers too).

> +{
> +	return (pte & SUN50I_PTE_ACI_MASK) >> SUN50I_PTE_ACI_SHIFT;
> +}
> +
> +static inline bool sun50i_pte_is_page_valid(u32 pte)
> +{
> +	return pte & SUN50I_PTE_PAGE_VALID;
> +}

Furthermore this isn't a header so there's no need for explicit "inline" 
declarations - we can trust the compiler not to be a complete idiot.

> +
> +static u32 sun50i_mk_pte(phys_addr_t page, int prot)
> +{
> +	enum iommu_aci aci;
> +	u32 flags = 0;
> +
> +	if (prot & (IOMMU_READ | IOMMU_WRITE))
> +		aci = IOMMU_ACI_RD_WR;
> +	else if (prot & IOMMU_READ)
> +		aci = IOMMU_ACI_RD;
> +	else if (prot & IOMMU_WRITE)
> +		aci = IOMMU_ACI_WR;
> +	else
> +		aci = IOMMU_ACI_NONE;
> +
> +	flags |= SUN50I_PTE_ACI(aci);
> +	page &= SUN50I_PTE_PAGE_ADDRESS_MASK;
> +	return page | flags | SUN50I_PTE_PAGE_VALID;
> +}
> +
> +static inline void sun50i_table_flush(struct sun50i_iommu_domain *sun50i_domain,
> +				      dma_addr_t dma, unsigned int count)
> +{
> +	struct sun50i_iommu *iommu = sun50i_domain->iommu;
> +	size_t size = count * PT_ENTRY_SIZE;

All but one callsite passes "virt_to_phys(...)" for the @dma argument 
here, so it's probably more sensible to fold that in here and take a PTE 
pointer as the argument.

> +
> +	dma_sync_single_for_device(iommu->dev, dma, size, DMA_TO_DEVICE);
> +}
> +
> +static int sun50i_iommu_flush_all_tlb(struct sun50i_iommu *iommu)
> +{
> +	u32 reg;
> +	int ret;
> +
> +	assert_spin_locked(&iommu->iommu_lock);
> +
> +	iommu_write(iommu,
> +		    IOMMU_TLB_FLUSH_REG,
> +		    IOMMU_TLB_FLUSH_PTW_CACHE |
> +		    IOMMU_TLB_FLUSH_MACRO_TLB |
> +		    IOMMU_TLB_FLUSH_MICRO_TLB(5) |
> +		    IOMMU_TLB_FLUSH_MICRO_TLB(4) |
> +		    IOMMU_TLB_FLUSH_MICRO_TLB(3) |
> +		    IOMMU_TLB_FLUSH_MICRO_TLB(2) |
> +		    IOMMU_TLB_FLUSH_MICRO_TLB(1) |
> +		    IOMMU_TLB_FLUSH_MICRO_TLB(0));
> +
> +	ret = readl_poll_timeout(iommu->base + IOMMU_TLB_FLUSH_REG,
> +				 reg, !reg,
> +				 1, 2000);
> +	if (ret)
> +		dev_err(iommu->dev, "Enable flush all request timed out\n");
> +
> +	return ret;
> +}
> +
> +static int sun50i_iommu_tlb_invalidate(struct sun50i_iommu *iommu,
> +				       dma_addr_t iova)
> +{
> +	int ret;
> +	u32 reg;
> +
> +	assert_spin_locked(&iommu->iommu_lock);
> +
> +	iommu_write(iommu, IOMMU_TLB_IVLD_ADDR_REG, iova);
> +	iommu_write(iommu, IOMMU_TLB_IVLD_ADDR_MASK_REG,
> +		    SUN50I_PTE_PAGE_ADDRESS_MASK);

Does this imply that the hardware might support power-of-two 
invalidations larger than a single page? No need to worry about it for 
the initial driver, but it could be worth investigating for future 
optimisation.

> +	iommu_write(iommu, IOMMU_TLB_IVLD_ENABLE_REG,
> +		    IOMMU_TLB_IVLD_ENABLE_ENABLE);
> +
> +	ret = readl_poll_timeout(iommu->base + IOMMU_TLB_IVLD_ENABLE_REG,
> +				 reg, !(reg & IOMMU_TLB_IVLD_ENABLE_ENABLE),
> +				 1, 2000);
> +	if (ret)
> +		dev_err(iommu->dev, "TLB Invalid timed out\n");
> +
> +	return ret;
> +}
> +
> +static int sun50i_iommu_ptw_invalidate(struct sun50i_iommu *iommu,
> +				       dma_addr_t iova)
> +{
> +	int ret;
> +	u32 reg;
> +
> +	assert_spin_locked(&iommu->iommu_lock);
> +
> +	iommu_write(iommu, IOMMU_PC_IVLD_ADDR_REG, iova);
> +	iommu_write(iommu, IOMMU_PC_IVLD_ENABLE_REG,
> +		    IOMMU_PC_IVLD_ENABLE_ENABLE);
> +
> +	ret = readl_poll_timeout(iommu->base + IOMMU_PC_IVLD_ENABLE_REG,
> +				 reg, !(reg & IOMMU_PC_IVLD_ENABLE_ENABLE),
> +				 1, 2000);
> +	if (ret)
> +		dev_err(iommu->dev, "PTW cache invalid timed out\n");
> +
> +	return ret;
> +}
> +
> +static u32 *sun50i_dte_get_page_table(struct sun50i_iommu_domain *sun50i_domain,
> +				      dma_addr_t iova)
> +{
> +	struct sun50i_iommu *iommu = sun50i_domain->iommu;
> +	u32 *page_table, *dte_addr;
> +	phys_addr_t pt_phys;
> +	dma_addr_t pt_dma;
> +	u32 dte_index, dte;
> +
> +	assert_spin_locked(&sun50i_domain->dt_lock);
> +
> +	dte_index = sun50i_iova_dte_index(iova);
> +	dte_addr = &sun50i_domain->dt[dte_index];
> +	dte = *dte_addr;
> +	if (sun50i_dte_is_pt_valid(dte))
> +		goto done;
> +
> +	page_table = kmem_cache_zalloc(iommu->pt_pool, GFP_ATOMIC);

Can you respect the new gfp argument from ->map() here, or is that one 
of those things that kmem_caches can be overly picky about?

> +	if (!page_table)
> +		return ERR_PTR(-ENOMEM);
> +
> +	pt_dma = dma_map_single(iommu->dev, page_table, PT_SIZE, DMA_TO_DEVICE);
> +	if (dma_mapping_error(iommu->dev, pt_dma)) {
> +		dev_err(iommu->dev, "Couldn't map L2 Page Table\n");
> +		kmem_cache_free(iommu->pt_pool, page_table);
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	/* We rely on the physical address and DMA address being the same */
> +	WARN_ON(pt_dma != virt_to_phys(page_table));
> +
> +	dte = sun50i_mk_dte(pt_dma);
> +	*dte_addr = dte;
> +	sun50i_table_flush(sun50i_domain, pt_dma, PT_SIZE);
> +	sun50i_table_flush(sun50i_domain, virt_to_phys(dte_addr), 1);
> +
> +done:
> +	pt_phys = sun50i_dte_pt_address(dte);
> +	return (u32 *)phys_to_virt(pt_phys);

May as well just put the phys/virt dance in the early-out path and 
return page_table directly here.

> +}
> +
> +static int sun50i_iommu_map(struct iommu_domain *domain, unsigned long iova,
> +			    phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
> +{
> +	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
> +	struct sun50i_iommu *iommu = sun50i_domain->iommu;
> +	u32 pte_index;
> +	u32 *page_table, *pte_addr;
> +	unsigned long flags;
> +	int ret = 0;
> +
> +	spin_lock_irqsave(&sun50i_domain->dt_lock, flags);
> +	page_table = sun50i_dte_get_page_table(sun50i_domain, iova);
> +	if (IS_ERR(page_table)) {
> +		ret = PTR_ERR(page_table);
> +		goto out;
> +	}
> +
> +	pte_index = sun50i_iova_pte_index(iova);
> +	pte_addr = &page_table[pte_index];
> +	if (sun50i_pte_is_page_valid(*pte_addr)) {
> +		phys_addr_t page_phys = sun50i_pte_page_address(*pte_addr);
> +		dev_err(iommu->dev,
> +			"iova %pad already mapped to %pa cannot remap to %pa prot: %#x\n",
> +			&iova, &page_phys, &paddr, prot);
> +		ret = -EADDRINUSE;

I think that strictly refers to a *network* address - I'd be inclined to 
stick with boring old EBUSY.

> +		goto out;
> +	}
> +
> +	*pte_addr = sun50i_mk_pte(paddr, prot);
> +	sun50i_table_flush(sun50i_domain, virt_to_phys(pte_addr), 1);
> +
> +	spin_lock_irqsave(&iommu->iommu_lock, flags);
> +
> +	sun50i_iommu_tlb_invalidate(iommu, iova);
> +	sun50i_iommu_ptw_invalidate(iommu, iova);

Does the walk cache actually cache PTEs as well as DTEs? If it's really 
only the latter, the maintenance could perhaps be folded into pagetable 
allocation (being the only place that ever changes them).

> +
> +	spin_unlock_irqrestore(&iommu->iommu_lock, flags);
> +
> +out:
> +	spin_unlock_irqrestore(&sun50i_domain->dt_lock, flags);
> +	return ret;
> +}
> +
> +static size_t sun50i_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
> +				 size_t size, struct iommu_iotlb_gather *gather)
> +{
> +	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
> +	struct sun50i_iommu *iommu = sun50i_domain->iommu;
> +	unsigned long flags;
> +	phys_addr_t pt_phys;
> +	dma_addr_t pte_dma;
> +	u32 *pte_addr;
> +	u32 dte;
> +
> +	spin_lock_irqsave(&sun50i_domain->dt_lock, flags);
> +
> +	dte = sun50i_domain->dt[sun50i_iova_dte_index(iova)];
> +	if (!sun50i_dte_is_pt_valid(dte)) {
> +		spin_unlock_irqrestore(&sun50i_domain->dt_lock, flags);
> +		return 0;
> +	}
> +
> +	pt_phys = sun50i_dte_pt_address(dte);
> +	pte_addr = (u32 *)phys_to_virt(pt_phys) + sun50i_iova_pte_index(iova);
> +	pte_dma = pt_phys + sun50i_iova_pte_index(iova) * PT_ENTRY_SIZE;
> +
> +	if (!sun50i_pte_is_page_valid(*pte_addr)) {
> +		spin_unlock_irqrestore(&sun50i_domain->dt_lock, flags);
> +		return 0;
> +	}
> +
> +	memset(pte_addr, 0, sizeof(*pte_addr));
> +	sun50i_table_flush(sun50i_domain, virt_to_phys(pte_addr), 1);
> +
> +	spin_lock(&iommu->iommu_lock);
> +	sun50i_iommu_tlb_invalidate(iommu, iova);
> +	sun50i_iommu_ptw_invalidate(iommu, iova);
> +	spin_unlock(&iommu->iommu_lock);
> +
> +	spin_unlock_irqrestore(&sun50i_domain->dt_lock, flags);
> +
> +	return SZ_4K;
> +}
> +
> +static phys_addr_t sun50i_iommu_iova_to_phys(struct iommu_domain *domain,
> +					     dma_addr_t iova)
> +{
> +	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
> +	phys_addr_t pt_phys, phys = 0;
> +	unsigned long flags;
> +	u32 *page_table;
> +	u32 dte, pte;
> +
> +	spin_lock_irqsave(&sun50i_domain->dt_lock, flags);
> +
> +	dte = sun50i_domain->dt[sun50i_iova_dte_index(iova)];
> +	if (!sun50i_dte_is_pt_valid(dte))
> +		goto out;
> +
> +	pt_phys = sun50i_dte_pt_address(dte);
> +	page_table = (u32 *)phys_to_virt(pt_phys);
> +	pte = page_table[sun50i_iova_pte_index(iova)];
> +	if (!sun50i_pte_is_page_valid(pte))
> +		goto out;
> +
> +	phys = sun50i_pte_page_address(pte) + sun50i_iova_page_offset(iova);
> +
> +out:
> +	spin_unlock_irqrestore(&sun50i_domain->dt_lock, flags);
> +	return phys;
> +}
> +
> +static struct iommu_domain *sun50i_iommu_domain_alloc(unsigned type)
> +{
> +	struct sun50i_iommu_domain *sun50i_domain;
> +
> +	if (type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_UNMANAGED)
> +		return NULL;
> +
> +	sun50i_domain = kzalloc(sizeof(*sun50i_domain), GFP_KERNEL);
> +	if (!sun50i_domain)
> +		return NULL;
> +
> +	if (type == IOMMU_DOMAIN_DMA &&
> +	    iommu_get_dma_cookie(&sun50i_domain->domain))
> +		goto err_free_domain;
> +
> +	sun50i_domain->dt = (u32 *)__get_free_pages(GFP_KERNEL,
> +						    get_order(DT_SIZE));
> +	if (!sun50i_domain->dt)
> +		goto err_put_cookie;
> +	memset(sun50i_domain->dt, 0, DT_SIZE);
> +
> +	spin_lock_init(&sun50i_domain->dt_lock);
> +
> +	sun50i_domain->domain.geometry.aperture_start = 0;
> +	sun50i_domain->domain.geometry.aperture_end = DMA_BIT_MASK(32);
> +	sun50i_domain->domain.geometry.force_aperture = true;
> +
> +	return &sun50i_domain->domain;
> +
> +err_put_cookie:
> +	if (type == IOMMU_DOMAIN_DMA)
> +		iommu_put_dma_cookie(&sun50i_domain->domain);
> +
> +err_free_domain:
> +	kfree(sun50i_domain);
> +
> +	return NULL;
> +}
> +
> +static void sun50i_iommu_domain_free(struct iommu_domain *domain)
> +{
> +	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
> +	struct sun50i_iommu *iommu = sun50i_domain->iommu;
> +	unsigned long flags;
> +	int i;
> +
> +	spin_lock_irqsave(&sun50i_domain->dt_lock, flags);
> +
> +	for (i = 0; i < NUM_DT_ENTRIES; i++) {
> +		phys_addr_t pt_phys;
> +		u32 *page_table;
> +		u32 *dte_addr;
> +		u32 dte;
> +
> +		dte_addr = &sun50i_domain->dt[i];
> +		dte = *dte_addr;
> +		if (!sun50i_dte_is_pt_valid(dte))
> +			continue;
> +
> +		memset(dte_addr, 0, sizeof(*dte_addr));
> +		sun50i_table_flush(sun50i_domain, virt_to_phys(dte_addr), 1);

This shouldn't be necessary - freeing a domain while it's still live is 
an incredibly very wrong thing to do, so the hardware should have 
already been programmed to no longer walk this table by this point.

> +
> +		pt_phys = sun50i_dte_pt_address(dte);
> +		dma_unmap_single(iommu->dev, pt_phys, PT_SIZE, DMA_TO_DEVICE);
> +
> +		page_table = phys_to_virt(pt_phys);
> +		kmem_cache_free(iommu->pt_pool, page_table);
> +	}
> +
> +	free_pages((unsigned long)sun50i_domain->dt, get_order(DT_SIZE));
> +	sun50i_domain->dt = NULL;
> +
> +	spin_lock(&iommu->iommu_lock);
> +	sun50i_iommu_flush_all_tlb(iommu);
> +	spin_unlock(&iommu->iommu_lock);

And this looks very out-of-place. I'd expect any invalidation to have 
happened at the point that translation was disabled, or be deferred 
until it is next reenabled. Consider that users are within their rights 
to call iommu_domain_alloc() 17 times in a row and immediately free them 
all again without ever attaching any devices; there should be no reason 
to ever touch the hardware in such a case.

> +
> +	spin_unlock_irqrestore(&sun50i_domain->dt_lock, flags);
> +	iommu_put_dma_cookie(domain);
> +
> +	kfree(sun50i_domain);
> +}
> +
> +static void sun50i_iommu_detach_device(struct iommu_domain *domain,
> +				       struct device *dev)
> +{
> +	struct sun50i_iommu *iommu;
> +
> +	iommu = sun50i_iommu_from_dev(dev);
> +	if (!iommu)
> +		return;
> +
> +	dev_info(dev, "Detaching from IOMMU domain\n");

No.

> +
> +	if (iommu->domain != domain)
> +		return;
> +
> +	pm_runtime_put_sync(iommu->dev);
> +
> +	iommu->domain = NULL;
> +}
> +
> +static int sun50i_iommu_attach_device(struct iommu_domain *domain,
> +				      struct device *dev)
> +{
> +	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
> +	struct sun50i_iommu *iommu;
> +
> +	iommu = sun50i_iommu_from_dev(dev);
> +	if (!iommu)
> +		return -ENODEV;
> +
> +	dev_info(dev, "Attaching to IOMMU domain\n");

Also no.

> +
> +	if (iommu->domain == domain)
> +		return 0;
> +
> +	if (iommu->domain)
> +		sun50i_iommu_detach_device(iommu->domain, dev);
> +
> +	iommu->domain = domain;
> +	sun50i_domain->iommu = iommu;
> +
> +	return pm_runtime_get_sync(iommu->dev);

Deferring all the actual hardware programming to the suspend/resume hooks 
is a fiendishly clever idea that took me more than a moment to make 
sense of, but how well does it work when RPM is compiled out or 
runtime-inhibited?

Furthermore, basing RPM on having a domain attached means that you'll 
effectively never turn the IOMMU off, even when all the clients are 
idle. It would make more sense to use device links like most other 
drivers do to properly model the producer/consumer relationship.

> +}
> +
> +static int sun50i_iommu_add_device(struct device *dev)
> +{
> +	struct sun50i_iommu *iommu;
> +	struct iommu_group *group;
> +
> +	iommu = sun50i_iommu_from_dev(dev);
> +	if (!iommu)
> +		return -ENODEV;
> +
> +	group = iommu_group_get_for_dev(dev);
> +	if (IS_ERR(group))
> +		return PTR_ERR(group);
> +
> +	iommu_group_put(group);
> +
> +	return 0;
> +}
> +
> +static void sun50i_iommu_remove_device(struct device *dev)
> +{
> +	struct sun50i_iommu *iommu;
> +
> +	iommu = sun50i_iommu_from_dev(dev);
> +	if (!iommu)

By construction, that should be impossible - getting here requires 
dev->iommu_group to be non-NULL, which means the check in 
sun50i_iommu_add_device() passed.

> +		return;
> +
> +	iommu_group_remove_device(dev);
> +}
> +
> +static struct iommu_group *sun50i_iommu_device_group(struct device *dev)
> +{
> +	struct sun50i_iommu *iommu;
> +
> +	iommu = sun50i_iommu_from_dev(dev);
> +	if (!iommu)

Ditto - we should only ever be here from *within* sun50i_iommu_add_device().

> +		return NULL;
> +
> +	return iommu_group_ref_get(iommu->group);
> +}
> +
> +static int sun50i_iommu_of_xlate(struct device *dev,
> +				 struct of_phandle_args *args)
> +{
> +	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> +	struct platform_device *iommu_pdev;
> +	unsigned id = args->args[0];
> +
> +	iommu_pdev = of_find_device_by_node(args->np);
> +	if (WARN_ON(!iommu_pdev))

Similarly, to be able to get here at all means that sun50i_iommu_probe() 
has called iommu_device_register() and ultimately succeeded, so a device 
must exist.

> +		return -EINVAL;
> +
> +	fwspec->iommu_priv = platform_get_drvdata(iommu_pdev);
> +
> +	return iommu_fwspec_add_ids(dev, &id, 1);
> +}
> +
> +static struct iommu_ops sun50i_iommu_ops = {
> +	.pgsize_bitmap = SZ_4K,
> +	.map  = sun50i_iommu_map,
> +	.unmap = sun50i_iommu_unmap,
> +	.domain_alloc = sun50i_iommu_domain_alloc,
> +	.domain_free = sun50i_iommu_domain_free,
> +	.attach_dev = sun50i_iommu_attach_device,
> +	.detach_dev = sun50i_iommu_detach_device,
> +	.add_device = sun50i_iommu_add_device,
> +	.remove_device = sun50i_iommu_remove_device,
> +	.device_group	= sun50i_iommu_device_group,
> +	.of_xlate = sun50i_iommu_of_xlate,
> +	.iova_to_phys = sun50i_iommu_iova_to_phys,
> +};
> +
> +static void sun50i_iommu_report_fault(struct sun50i_iommu *iommu,
> +				      unsigned master, phys_addr_t iova,
> +				      unsigned prot)
> +{
> +	dev_err(iommu->dev, "Page fault for %pad (master %d, dir %s)\n",
> +		&iova, master, (prot == IOMMU_FAULT_WRITE) ? "wr" : "rd");

If you're expecting an external handler to be able to do something 
useful with faults, then save the log spam for the case where it's 
actually failed.

> +
> +	if (iommu->domain)
> +		report_iommu_fault(iommu->domain, iommu->dev, iova, prot);
> +	else
> +		dev_err(iommu->dev, "Page fault while iommu not attached to any domain?\n");
> +}
> +
> +static phys_addr_t sun50i_iommu_handle_pt_irq(struct sun50i_iommu *iommu,
> +					      unsigned addr_reg,
> +					      unsigned blame_reg)
> +{
> +	phys_addr_t iova;
> +	unsigned master;
> +	u32 blame;
> +
> +	assert_spin_locked(&iommu->iommu_lock);
> +
> +	iova = iommu_read(iommu, addr_reg);
> +	blame = iommu_read(iommu, blame_reg);
> +	master = ilog2(blame & IOMMU_INT_MASTER_MASK);
> +
> +	/*
> +	 * If the address is not in the page table, we can't get what
> +	 * operation triggered the fault. Assume it's a read
> +	 * operation.
> +	 */
> +	sun50i_iommu_report_fault(iommu, master, iova, IOMMU_FAULT_READ);
> +
> +	return iova;
> +}
> +
> +static phys_addr_t sun50i_iommu_handle_perm_irq(struct sun50i_iommu *iommu)
> +{
> +	enum iommu_aci aci;
> +	phys_addr_t iova;
> +	unsigned master;
> +	unsigned dir;
> +	u32 blame;
> +
> +	assert_spin_locked(&iommu->iommu_lock);
> +
> +	blame = iommu_read(iommu, IOMMU_INT_STA_REG);
> +	master = ilog2(blame & IOMMU_INT_MASTER_MASK);
> +	iova = iommu_read(iommu, IOMMU_INT_ERR_ADDR_REG(master));
> +	aci = sun50i_pte_aci(iommu_read(iommu, IOMMU_INT_ERR_DATA_REG(master)));
> +
> +	switch (aci) {
> +		/*
> +		 * If we are in the read-only domain, then it means we
> +		 * tried to write.
> +		 */
> +	case IOMMU_ACI_RD:
> +		dir = IOMMU_FAULT_WRITE;
> +		break;
> +
> +		/*
> +		 * If we are in the write-only domain, then it means
> +		 * we tried to read.
> +		 */
> +	case IOMMU_ACI_WR:
> +
> +		/*
> +		 * If we are in the domain without any permission, we
> +		 * can't really tell. Let's default to a read
> +		 * operation.
> +		 */
> +	case IOMMU_ACI_NONE:
> +
> +		/* WTF? */
> +	case IOMMU_ACI_RD_WR:
> +	default:
> +		dir = IOMMU_FAULT_READ;
> +		break;
> +	}
> +
> +	/*
> +	 * If the address is not in the page table, we can't get what
> +	 * operation triggered the fault. Assume it's a read
> +	 * operation.
> +	 */
> +	sun50i_iommu_report_fault(iommu, master, iova, dir);
> +
> +	return iova;
> +}
> +
> +static irqreturn_t sun50i_iommu_irq(int irq, void *dev_id)
> +{
> +	struct sun50i_iommu *iommu = dev_id;
> +	phys_addr_t iova;
> +	u32 status;
> +
> +	spin_lock(&iommu->iommu_lock);
> +
> +	status = iommu_read(iommu, IOMMU_INT_STA_REG);
> +	if (!(status & IOMMU_INT_MASK)) {
> +		spin_unlock(&iommu->iommu_lock);
> +		return IRQ_NONE;
> +	}
> +
> +	if (status & IOMMU_INT_INVALID_L2PG)
> +		iova = sun50i_iommu_handle_pt_irq(iommu,
> +						  IOMMU_INT_ERR_ADDR_L2_REG,
> +						  IOMMU_L2PG_INT_REG);
> +	else if (status & IOMMU_INT_INVALID_L1PG)
> +		iova = sun50i_iommu_handle_pt_irq(iommu,
> +						  IOMMU_INT_ERR_ADDR_L1_REG,
> +						  IOMMU_L1PG_INT_REG);
> +	else
> +		iova = sun50i_iommu_handle_perm_irq(iommu);
> +
> +	sun50i_iommu_tlb_invalidate(iommu, iova);
> +	sun50i_iommu_ptw_invalidate(iommu, iova);

Yikes, does the hardware really somehow require that?

> +
> +	iommu_write(iommu, IOMMU_INT_CLR_REG, status);
> +
> +	iommu_write(iommu, IOMMU_RESET_REG, ~status);
> +	iommu_write(iommu, IOMMU_RESET_REG, status);
> +
> +	spin_unlock(&iommu->iommu_lock);
> +
> +	return IRQ_HANDLED;
> +}
> +
> +static int sun50i_iommu_probe(struct platform_device *pdev)
> +{
> +	struct sun50i_iommu *iommu;
> +	struct resource *res;
> +	int ret, irq;
> +
> +	iommu = devm_kzalloc(&pdev->dev, sizeof(*iommu), GFP_KERNEL);
> +	if (!iommu)
> +		return -ENOMEM;
> +	spin_lock_init(&iommu->iommu_lock);
> +	platform_set_drvdata(pdev, iommu);
> +	iommu->dev = &pdev->dev;
> +
> +	iommu->pt_pool = kmem_cache_create(dev_name(&pdev->dev),
> +					   PT_SIZE, PT_SIZE,
> +					   SLAB_HWCACHE_ALIGN,
> +					   NULL);
> +	if (!iommu->pt_pool)
> +		return -ENOMEM;
> +
> +	iommu->group = iommu_group_alloc();
> +	if (IS_ERR(iommu->group)) {
> +		ret = PTR_ERR(iommu->group);
> +		goto err_free_cache;
> +	}
> +
> +	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
> +	iommu->base = devm_ioremap_resource(&pdev->dev, res);

Watch out for the devm_platform_ioremap_resource() police :)

> +	if (!iommu->base) {
> +		ret = PTR_ERR(iommu->base);
> +		goto err_free_group;
> +	}
> +
> +	irq = platform_get_irq(pdev, 0);
> +	if (irq < 0) {
> +		ret = irq;
> +		goto err_free_group;
> +	}
> +
> +	iommu->clk = devm_clk_get(&pdev->dev, NULL);
> +	if (IS_ERR(iommu->clk)) {
> +		dev_err(&pdev->dev, "Couldn't get our clock.\n");
> +		ret = PTR_ERR(iommu->clk);
> +		goto err_free_group;
> +	}
> +
> +	iommu->reset = devm_reset_control_get(&pdev->dev, NULL);
> +	if (IS_ERR(iommu->reset)) {
> +		dev_err(&pdev->dev, "Couldn't get our reset line.\n");
> +		ret = PTR_ERR(iommu->reset);
> +		goto err_free_group;
> +	}
> +
> +	ret = iommu_device_sysfs_add(&iommu->iommu, &pdev->dev,
> +				     NULL, dev_name(&pdev->dev));
> +	if (ret)
> +		goto err_free_group;
> +
> +	iommu_device_set_ops(&iommu->iommu, &sun50i_iommu_ops);
> +	iommu_device_set_fwnode(&iommu->iommu, &pdev->dev.of_node->fwnode);
> +
> +	ret = iommu_device_register(&iommu->iommu);
> +	if (ret)
> +		goto err_remove_sysfs;
> +
> +	pm_runtime_enable(&pdev->dev);
> +	ret = devm_request_irq(&pdev->dev, irq, sun50i_iommu_irq, 0,
> +			       dev_name(&pdev->dev), iommu);
> +	if (ret < 0)
> +		goto err_unregister;
> +
> +	bus_set_iommu(&platform_bus_type, &sun50i_iommu_ops);
> +
> +	return 0;
> +
> +err_unregister:
> +	iommu_device_unregister(&iommu->iommu);
> +
> +err_remove_sysfs:
> +	iommu_device_sysfs_remove(&iommu->iommu);
> +
> +err_free_group:
> +	iommu_group_put(iommu->group);
> +
> +err_free_cache:
> +	kmem_cache_destroy(iommu->pt_pool);
> +
> +	return ret;
> +}
> +
> +static int __maybe_unused sun50i_iommu_suspend(struct device *dev)
> +{
> +	struct sun50i_iommu_domain *sun50i_domain;
> +	struct sun50i_iommu *iommu;
> +	unsigned long flags;
> +
> +	iommu = dev_get_drvdata(dev);
> +	if (!iommu->domain)
> +		return 0;
> +
> +	spin_lock_irqsave(&iommu->iommu_lock, flags);
> +
> +	iommu_write(iommu, IOMMU_ENABLE_REG, 0);
> +	iommu_write(iommu, IOMMU_TTB_REG, 0);
> +
> +	spin_unlock_irqrestore(&iommu->iommu_lock, flags);
> +
> +	clk_disable_unprepare(iommu->clk);
> +	reset_control_assert(iommu->reset);
> +
> +	sun50i_domain = to_sun50i_domain(iommu->domain);
> +	dma_unmap_single(dev, virt_to_phys(sun50i_domain->dt), DT_SIZE,
> +			 DMA_TO_DEVICE);

Why? The only time you should need to do this is when freeing the table.

> +
> +	return 0;
> +}
> +
> +static int __maybe_unused sun50i_iommu_resume(struct device *dev)
> +{
> +	struct sun50i_iommu_domain *sun50i_domain;
> +	struct sun50i_iommu *iommu;
> +	unsigned long flags;
> +	dma_addr_t dt_dma;
> +	int ret;
> +
> +	iommu = dev_get_drvdata(dev);
> +	if (!iommu->domain)
> +		return 0;
> +
> +	sun50i_domain = to_sun50i_domain(iommu->domain);
> +	dt_dma = dma_map_single(dev, sun50i_domain->dt, DT_SIZE, DMA_TO_DEVICE);

As above. The power state of the IOMMU should be entirely irrelevant to 
the contents of RAM.

> +	if (dma_mapping_error(dev, dt_dma)) {
> +		dev_err(dev, "Couldn't map L1 Page Table\n");
> +		return -ENOMEM;
> +	}
> +
> +	ret = reset_control_deassert(iommu->reset);
> +	if (ret)
> +		goto err_unmap;
> +
> +	ret = clk_prepare_enable(iommu->clk);
> +	if (ret)
> +		goto err_reset_assert;
> +
> +	/* We rely on the physical address and DMA address being the same */
> +	WARN_ON(dt_dma != virt_to_phys(sun50i_domain->dt));

If you made proper use of both iommu->dt and iommu->dt_dma you wouldn't 
actually need that invariant at this level, which I think would be a 
good thing. The phys_to_virt() dance is somewhat justifiable for the 
second-level tables to save maintaining an additional dma_addr_t[4096] 
array; far less so for the single top-level address.

> +
> +	spin_lock_irqsave(&iommu->iommu_lock, flags);
> +
> +	iommu_write(iommu, IOMMU_TTB_REG, dt_dma);
> +	iommu_write(iommu, IOMMU_TLB_PREFETCH_REG,
> +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(0) |
> +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(1) |
> +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(2) |
> +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(3) |
> +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(4) |
> +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(5));
> +	iommu_write(iommu, IOMMU_INT_ENABLE_REG, IOMMU_INT_MASK);
> +	iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(IOMMU_ACI_NONE),
> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 0) |
> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 0) |
> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 1) |
> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 1) |
> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 2) |
> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 2) |
> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 3) |
> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 3) |
> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 4) |
> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 4) |
> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 5) |
> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 5));
> +
> +	iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(IOMMU_ACI_RD),
> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 0) |
> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 1) |
> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 2) |
> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 3) |
> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 4) |
> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 5) |
> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 0) |
> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 1) |
> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 2) |
> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 3) |
> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 4) |
> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 5));
> +
> +	iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(IOMMU_ACI_WR),
> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 0) |
> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 1) |
> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 2) |
> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 3) |
> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 4) |
> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 5) |
> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 0) |
> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 1) |
> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 2) |
> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 3) |
> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 4) |
> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 5));
> +
> +	iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(IOMMU_ACI_RD_WR),
> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 0) |
> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 0) |
> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 1) |
> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 1) |
> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 2) |
> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 2) |
> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 3) |
> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 3) |
> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 4) |
> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 4) |
> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 5) |
> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 5));
> +
> +	ret = sun50i_iommu_flush_all_tlb(iommu);
> +	if (ret) {
> +		spin_unlock_irqrestore(&iommu->iommu_lock, flags);
> +		goto err_clk_disable;
> +	}
> +
> +	iommu_write(iommu, IOMMU_AUTO_GATING_REG, IOMMU_AUTO_GATING_ENABLE);
> +	iommu_write(iommu, IOMMU_ENABLE_REG, IOMMU_ENABLE_ENABLE);

As mentioned, although you obviously do need to reprogram the hardware 
on resume, the fact that the *only* references to most of the control 
registers are from "__maybe_unused" functions is a big red flag.

> +
> +	spin_unlock_irqrestore(&iommu->iommu_lock, flags);
> +
> +	return 0;
> +
> +err_clk_disable:
> +	clk_disable_unprepare(iommu->clk);
> +
> +err_reset_assert:
> +	reset_control_assert(iommu->reset);
> +
> +err_unmap:
> +	sun50i_domain = to_sun50i_domain(iommu->domain);
> +	dma_unmap_single(dev, dt_dma, DT_SIZE, DMA_TO_DEVICE);
> +	return ret;
> +}
> +
> +static const struct dev_pm_ops sun50i_iommu_pm_ops = {
> +	SET_RUNTIME_PM_OPS(sun50i_iommu_suspend, sun50i_iommu_resume, NULL)

In principle you should be able to support system PM with the same 
resume callback.

> +};
> +
> +static const struct of_device_id sun50i_iommu_dt[] = {
> +	{ .compatible = "allwinner,sun50i-h6-iommu", },
> +	{ /* sentinel */ },
> +};
> +MODULE_DEVICE_TABLE(of, sun50i_iommu_dt);
> +
> +static struct platform_driver sun50i_iommu_driver = {
> +	.probe		= sun50i_iommu_probe,
> +	.driver		= {
> +		.name			= "sun50i-iommu",
> +		.of_match_table 	= sun50i_iommu_dt,
> +		.pm			= &sun50i_iommu_pm_ops,
> +		.suppress_bind_attrs	= true,
> +	}
> +};
> +
> +static int __init sun50i_iommu_init(void)
> +{
> +	return platform_driver_register(&sun50i_iommu_driver);
> +}
> +subsys_initcall(sun50i_iommu_init);

Regular device_initcall() (or just a *_platform_driver() helper) should 
be sufficient these days.

Robin.

> +MODULE_DESCRIPTION("Allwinner H6 IOMMU driver");
> +MODULE_AUTHOR("Maxime Ripard <maxime@cerno.tech>");
> +MODULE_AUTHOR("zhuxianbin <zhuxianbin@allwinnertech.com>");
> +MODULE_LICENSE("Dual BSD/GPL");
>
Maxime Ripard Jan. 27, 2020, 2:22 p.m. UTC | #2
Hi Robin,

Thanks for the extensive review :)

On Thu, Jan 23, 2020 at 06:47:14PM +0000, Robin Murphy wrote:
> This looks like a pretty decent first cut of a driver, thanks! Plenty of
> comments, but overall it looks to be in mostly the right shape to begin
> with.
>
> On 22/01/2020 12:44 pm, Maxime Ripard wrote:
> > The Allwinner H6 has introduced an IOMMU for a few DMA controllers, mostly
> > video related: the display engine, the video decoders / encoders, the
> > camera capture controller, etc.
> >
> > The design is pretty simple compared to other IOMMUs found in SoCs: there's
> > a single instance, controlling all the masters, with a single address
> > space.
> >
> > It also features a performance monitoring unit that allows to retrieve
> > various informations (per-master and global TLB accesses, hits and misses,
> > access latency, etc) that isn't supported at the moment.
>
> Yes, it sounds like that might ideally be a separate perf_events driver, but
> there's no need to get into the details of that until the basic IOMMU driver
> is baked.
>
> > Signed-off-by: Maxime Ripard <maxime@cerno.tech>
> > ---
> >   drivers/iommu/Kconfig        |   10 +-
> >   drivers/iommu/Makefile       |    1 +-
> >   drivers/iommu/sun50i-iommu.c | 1126 +++++++++++++++++++++++++++++++++++-
> >   3 files changed, 1137 insertions(+)
> >   create mode 100644 drivers/iommu/sun50i-iommu.c
> >
> > diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> > index 0b9d78a0f3ac..5cbfa6f282e2 100644
> > --- a/drivers/iommu/Kconfig
> > +++ b/drivers/iommu/Kconfig
> > @@ -289,6 +289,16 @@ config ROCKCHIP_IOMMU
> >   	  Say Y here if you are using a Rockchip SoC that includes an IOMMU
> >   	  device.
> > +config SUN50I_IOMMU
> > +	bool "Allwinner H6 IOMMU Support"
> > +	depends on ARM || ARM64
> > +	depends on ARCH_SUNXI
>
> Surely ARCH_SUNXI implies "ARM || ARM64" anyway? Ideally there shouldn't be
> anything CPU-specific here, so supporting COMPILE_TEST as well would be
> nice.

I guess, yes, I'll change that.

...

> > +#define IOMMU_DM_AUT_CTRL_REG(d)	(0x0b0 + ((d) / 2) * 4)
> > +#define IOMMU_DM_AUT_CTRL_RD_UNAVAIL(d, m)	(1 << (((d & 1) * 16) + ((m) * 2)))
> > +#define IOMMU_DM_AUT_CTRL_RD_AVAIL(d, m)	(0 << (((d & 1) * 16) + ((m) * 2)))
>
> It might just be personal opinion, but I think overly-complex ways
> of expressing 0 are usually better left entirely unsaid. Plus this
> makes sun50i_iommu_resume() appear to be doing a lot more than it
> really is - it seems sufficiently straightforward to me for the mere
> absence of "available" to imply "unavailable".

The polarity is inverted though. By default (or with the value 0), the
operation is allowed, while 1 disables it, so the absence of available
actually sets it available (at least if you do the usual OR between
the values).

I guess you could say that the absence of unavailable is available
though. I'll remove those.

> > +static struct sun50i_iommu *sun50i_iommu_from_dev(struct device *dev)
> > +{
> > +	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
> > +
> > +	if (!fwspec)
> > +		return NULL;
>
> Strictly you should only need that check in .add_device and .attach_dev - if
> the other callbacks end up getting passed the wrong device then things have
> arguably gone crash-worthily wrong already.

That helper is being used in both add_device / attach_dev and other
callbacks, so I'm not sure what your suggestion is here? Should I
remove the helper entirely, or remove that check?

> > +	return fwspec->iommu_priv;
> > +}
> > +

...

> > +/*
> > + * The Allwinner H6 IOMMU uses a 2-level page table.
> > + *
> > + * The first level is the usual Directory Table (DT), that consists of
> > + * 4096 4-bytes Directory Table Entries (DTE), each pointing to a Page
> > + * Table (PT).
> > + *
> > + * Each PT consits of 256 4-bytes Page Table Entries (PTE), each
> > + * pointing to a 4kB page of physical memory.
> > + *
> > + * The IOMMU supports a single DT, pointed by the IOMMU_TTB_REG
> > + * register that contains its physical address.
> > + */
> > +
> > +#define SUN50I_IOVA_DTE_MASK	GENMASK(31, 20)
> > +#define SUN50I_IOVA_DTE_SHIFT	20
> > +#define SUN50I_IOVA_PTE_MASK	GENMASK(19, 12)
> > +#define SUN50I_IOVA_PTE_SHIFT	12
> > +#define SUN50I_IOVA_PAGE_MASK	GENMASK(11, 0)
> > +#define SUN50I_IOVA_PAGE_SHIFT	0
> > +
> > +static u32 sun50i_iova_dte_index(dma_addr_t iova)
> > +{
> > +	return (u32)(iova & SUN50I_IOVA_DTE_MASK) >> SUN50I_IOVA_DTE_SHIFT;
> > +}
> > +
> > +static u32 sun50i_iova_pte_index(dma_addr_t iova)
> > +{
> > +	return (u32)(iova & SUN50I_IOVA_PTE_MASK) >> SUN50I_IOVA_PTE_SHIFT;
> > +}
> > +
> > +static u32 sun50i_iova_page_offset(dma_addr_t iova)
> > +{
> > +	return (u32)(iova & SUN50I_IOVA_PAGE_MASK) >> SUN50I_IOVA_PAGE_SHIFT;
>
> A shift in a "page offset" calculation looks weird and surprising at the
> best of times, so a redundant one is *actively* confounding.

It's probably a poorly written name then. It's supposed to return the
offset of a given address within the page. What name / rework
would make it less confusing?

> > +}
> > +
> > +/*
> > + * Each Directory Table Entry has a Page Table address and a valid
> > + * bit:
> > +
> > + * +---------------------+-----------+-+
> > + * | PT address          | Reserved  |V|
> > + * +---------------------+-----------+-+
> > + *  31:10 - Page Table address
> > + *   9:2  - Reserved
> > + *   1:0  - 1 if the entry is valid
> > + */
> > +
> > +#define SUN50I_DTE_PT_ADDRESS_MASK	GENMASK(31, 10)
> > +#define SUN50I_DTE_PT_ATTRS		GENMASK(1, 0)
> > +#define SUN50I_DTE_PT_VALID		BIT(0)
> > +
> > +static inline phys_addr_t sun50i_dte_pt_address(u32 dte)
> > +{
> > +	return (phys_addr_t)dte & SUN50I_DTE_PT_ADDRESS_MASK;
> > +}
> > +
> > +static inline bool sun50i_dte_is_pt_valid(u32 dte)
> > +{
> > +	return (dte & SUN50I_DTE_PT_ATTRS) == SUN50I_DTE_PT_VALID;
>
> If SUN50I_DTE_PT_ATTRS was being used consistently with an expectation that
> it might possibly grow in future, then it might deserve to stay, but right
> now I'd say just "return dte & SUN50I_DTE_PT_VALID" here to match the use
> below.

You can't really do that though :/

The two LSBs are meant to hold whether the page is valid, and only
0b01 is the valid value. With an AND operator only, if the two LSBs
are 0b11, the helper would return true, while the IOMMU would consider
it invalid.

> > +#define SUN50I_PTE_PAGE_ADDRESS_MASK	GENMASK(31, 12)
> > +#define SUN50I_PTE_ACI_MASK		GENMASK(7, 4)
> > +#define SUN50I_PTE_ACI_SHIFT		4
> > +#define SUN50I_PTE_ACI(aci)		((aci) << SUN50I_PTE_ACI_SHIFT)
> > +#define SUN50I_PTE_PAGE_VALID		BIT(1)
> > +
> > +static inline phys_addr_t sun50i_pte_page_address(u32 pte)
> > +{
> > +	return (phys_addr_t)pte & SUN50I_PTE_PAGE_ADDRESS_MASK;
> > +}
> > +
> > +static inline enum iommu_aci sun50i_pte_aci(u32 pte)
>
> Eww, "lowercase name does opposite of uppercase name" is a confusingly
> unfamiliar paradigm - can we have a clearer distinction between "get" and
> "set" operations please?

ACK.

> Plus a bit more consistency in terms of macros vs. helper functions
> (maybe consider bitfield.h helpers too).

What did you find inconsistent? (Also, I didn't know FIELD_GET was a
thing. I'll use it).

...

> > +static int sun50i_iommu_tlb_invalidate(struct sun50i_iommu *iommu,
> > +				       dma_addr_t iova)
> > +{
> > +	int ret;
> > +	u32 reg;
> > +
> > +	assert_spin_locked(&iommu->iommu_lock);
> > +
> > +	iommu_write(iommu, IOMMU_TLB_IVLD_ADDR_REG, iova);
> > +	iommu_write(iommu, IOMMU_TLB_IVLD_ADDR_MASK_REG,
> > +		    SUN50I_PTE_PAGE_ADDRESS_MASK);
>
> Does this imply that the hardware might support power-of-two invalidations
> larger than a single page? No need to worry about it for the initial driver,
> but it could be worth investigating for future optimisation.

The documentation is pretty sparse (and I'm not really fluent in
IOMMUs yet) but it's my understanding yes.

I'm not quite sure how that would translate into the IOMMU
framework. As far as I could experience, map and unmap is only called
a single page at a time, so I'm not sure what we could optimize here?

...

> > +static u32 *sun50i_dte_get_page_table(struct sun50i_iommu_domain *sun50i_domain,
> > +				      dma_addr_t iova)
> > +{
> > +	struct sun50i_iommu *iommu = sun50i_domain->iommu;
> > +	u32 *page_table, *dte_addr;
> > +	phys_addr_t pt_phys;
> > +	dma_addr_t pt_dma;
> > +	u32 dte_index, dte;
> > +
> > +	assert_spin_locked(&sun50i_domain->dt_lock);
> > +
> > +	dte_index = sun50i_iova_dte_index(iova);
> > +	dte_addr = &sun50i_domain->dt[dte_index];
> > +	dte = *dte_addr;
> > +	if (sun50i_dte_is_pt_valid(dte))
> > +		goto done;
> > +
> > +	page_table = kmem_cache_zalloc(iommu->pt_pool, GFP_ATOMIC);
>
> Can you respect the new gfp argument from ->map() here, or is that one of
> those things that kmem_caches can be overly picky about?

That's a good question, I'll try :)

> > +}
> > +
> > +static int sun50i_iommu_map(struct iommu_domain *domain, unsigned long iova,
> > +			    phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
> > +{
> > +	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
> > +	struct sun50i_iommu *iommu = sun50i_domain->iommu;
> > +	u32 pte_index;
> > +	u32 *page_table, *pte_addr;
> > +	unsigned long flags;
> > +	int ret = 0;
> > +
> > +	spin_lock_irqsave(&sun50i_domain->dt_lock, flags);
> > +	page_table = sun50i_dte_get_page_table(sun50i_domain, iova);
> > +	if (IS_ERR(page_table)) {
> > +		ret = PTR_ERR(page_table);
> > +		goto out;
> > +	}
> > +
> > +	pte_index = sun50i_iova_pte_index(iova);
> > +	pte_addr = &page_table[pte_index];
> > +	if (sun50i_pte_is_page_valid(*pte_addr)) {
> > +		phys_addr_t page_phys = sun50i_pte_page_address(*pte_addr);
> > +		dev_err(iommu->dev,
> > +			"iova %pad already mapped to %pa cannot remap to %pa prot: %#x\n",
> > +			&iova, &page_phys, &paddr, prot);
> > +		ret = -EADDRINUSE;
>
> I think that strictly refers to a *network* address - I'd be inclined to
> stick with boring old EBUSY.

ACK

> > +		goto out;
> > +	}
> > +
> > +	*pte_addr = sun50i_mk_pte(paddr, prot);
> > +	sun50i_table_flush(sun50i_domain, virt_to_phys(pte_addr), 1);
> > +
> > +	spin_lock_irqsave(&iommu->iommu_lock, flags);
> > +
> > +	sun50i_iommu_tlb_invalidate(iommu, iova);
> > +	sun50i_iommu_ptw_invalidate(iommu, iova);
>
> Does the walk cache actually cache PTEs as well as DTEs? If it's really only
> the latter, the maintenance could perhaps be folded into pagetable
> allocation (being the only place that ever changes them).

So the documentation is also sparse on this, but there's definitely a
cache for the table walk, and you feed the invalidate register an
address aligned on 1MB, which is (1 << 20), and the index of the DTE is
stored in the bits 20 to 31 in the iova.

So my best guess is that, yeah, it's the DTE -> PT association. But
it's not like something confirms it :)

But you're right that we shouldn't need this here, I'll move it to the
page table allocation function.

> > +static void sun50i_iommu_domain_free(struct iommu_domain *domain)
> > +{
> > +	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
> > +	struct sun50i_iommu *iommu = sun50i_domain->iommu;
> > +	unsigned long flags;
> > +	int i;
> > +
> > +	spin_lock_irqsave(&sun50i_domain->dt_lock, flags);
> > +
> > +	for (i = 0; i < NUM_DT_ENTRIES; i++) {
> > +		phys_addr_t pt_phys;
> > +		u32 *page_table;
> > +		u32 *dte_addr;
> > +		u32 dte;
> > +
> > +		dte_addr = &sun50i_domain->dt[i];
> > +		dte = *dte_addr;
> > +		if (!sun50i_dte_is_pt_valid(dte))
> > +			continue;
> > +
> > +		memset(dte_addr, 0, sizeof(*dte_addr));
> > +		sun50i_table_flush(sun50i_domain, virt_to_phys(dte_addr), 1);
>
> This shouldn't be necessary - freeing a domain while it's still live is an
> incredibly very wrong thing to do, so the hardware should have already been
> programmed to no longer walk this table by this point.

We never "garbage collect" and remove the dte for the page table we
don't use anymore elsewhere though, so couldn't we end up in a
situation where we don't have a page table (because it has been freed)
at the other end of our dte, but the IOMMU doesn't know about it since
we never flushed?

> > +
> > +		pt_phys = sun50i_dte_pt_address(dte);
> > +		dma_unmap_single(iommu->dev, pt_phys, PT_SIZE, DMA_TO_DEVICE);
> > +
> > +		page_table = phys_to_virt(pt_phys);
> > +		kmem_cache_free(iommu->pt_pool, page_table);
> > +	}
> > +
> > +	free_pages((unsigned long)sun50i_domain->dt, get_order(DT_SIZE));
> > +	sun50i_domain->dt = NULL;
> > +
> > +	spin_lock(&iommu->iommu_lock);
> > +	sun50i_iommu_flush_all_tlb(iommu);
> > +	spin_unlock(&iommu->iommu_lock);
>
> And this looks very out-of-place. I'd expect any invalidation to have
> happened at the point that translation was disabled, or be deferred until it
> is next reenabled. Consider that users are within their rights to call
> iommu_domain_alloc() 17 times in a row and immediately free them all again
> without ever attaching any devices; there should be no reason to ever touch
> the hardware in such a case.

So I guess we shouldn't remove the DTEs here then, but in unmap then?

> > +static int sun50i_iommu_attach_device(struct iommu_domain *domain,
> > +				      struct device *dev)
> > +{
> > +	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
> > +	struct sun50i_iommu *iommu;
> > +
> > +	iommu = sun50i_iommu_from_dev(dev);
> > +	if (!iommu)
> > +		return -ENODEV;
> > +
> > +	dev_info(dev, "Attaching to IOMMU domain\n");
>
> Also no.

ACK

> > +
> > +	if (iommu->domain == domain)
> > +		return 0;
> > +
> > +	if (iommu->domain)
> > +		sun50i_iommu_detach_device(iommu->domain, dev);
> > +
> > +	iommu->domain = domain;
> > +	sun50i_domain->iommu = iommu;
> > +
> > +	return pm_runtime_get_sync(iommu->dev);
>
> Deferring all the actual hardware pogramming to the suspend/resume hooks is
> a fiendishly clever idea that took me more than a moment to make sense of,
> but how well does it work when RPM is compiled out or runtime-inhibited?

We have a bunch of other controllers that require runtime_pm already,
so it's going to be enabled. But that should be expressed in Kconfig.

> Furthermore, basing RPM on having a domain attached means that
> you'll effectively never turn the IOMMU off, even when all the
> clients are idle. It would make more sene to use device links like
> most other drivers do to properly model the producer/consumer
> relationship.

I'm not familiar with device links for runtime_pm, I thought this was
only useful for system-wide resume and suspend?

...

> > +static void sun50i_iommu_report_fault(struct sun50i_iommu *iommu,
> > +				      unsigned master, phys_addr_t iova,
> > +				      unsigned prot)
> > +{
> > +	dev_err(iommu->dev, "Page fault for %pad (master %d, dir %s)\n",
> > +		&iova, master, (prot == IOMMU_FAULT_WRITE) ? "wr" : "rd");
>
> If you're expecting an external handler to be able to do something useful
> with faults, then save the log spam for the case where it's actually failed.

I'm sorry, I'm not quite sure what you mean here. Where what actually failed?

> > +
> > +	if (iommu->domain)
> > +		report_iommu_fault(iommu->domain, iommu->dev, iova, prot);
> > +	else
> > +		dev_err(iommu->dev, "Page fault while iommu not attached to any domain?\n");
> > +}
> > +
> > +static phys_addr_t sun50i_iommu_handle_pt_irq(struct sun50i_iommu *iommu,
> > +					      unsigned addr_reg,
> > +					      unsigned blame_reg)
> > +{
> > +	phys_addr_t iova;
> > +	unsigned master;
> > +	u32 blame;
> > +
> > +	assert_spin_locked(&iommu->iommu_lock);
> > +
> > +	iova = iommu_read(iommu, addr_reg);
> > +	blame = iommu_read(iommu, blame_reg);
> > +	master = ilog2(blame & IOMMU_INT_MASTER_MASK);
> > +
> > +	/*
> > +	 * If the address is not in the page table, we can't get what
> > +	 * operation triggered the fault. Assume it's a read
> > +	 * operation.
> > +	 */
> > +	sun50i_iommu_report_fault(iommu, master, iova, IOMMU_FAULT_READ);
> > +
> > +	return iova;
> > +}
> > +
> > +static phys_addr_t sun50i_iommu_handle_perm_irq(struct sun50i_iommu *iommu)
> > +{
> > +	enum iommu_aci aci;
> > +	phys_addr_t iova;
> > +	unsigned master;
> > +	unsigned dir;
> > +	u32 blame;
> > +
> > +	assert_spin_locked(&iommu->iommu_lock);
> > +
> > +	blame = iommu_read(iommu, IOMMU_INT_STA_REG);
> > +	master = ilog2(blame & IOMMU_INT_MASTER_MASK);
> > +	iova = iommu_read(iommu, IOMMU_INT_ERR_ADDR_REG(master));
> > +	aci = sun50i_pte_aci(iommu_read(iommu, IOMMU_INT_ERR_DATA_REG(master)));
> > +
> > +	switch (aci) {
> > +		/*
> > +		 * If we are in the read-only domain, then it means we
> > +		 * tried to write.
> > +		 */
> > +	case IOMMU_ACI_RD:
> > +		dir = IOMMU_FAULT_WRITE;
> > +		break;
> > +
> > +		/*
> > +		 * If we are in the write-only domain, then it means
> > +		 * we tried to read.
> > +		 */
> > +	case IOMMU_ACI_WR:
> > +
> > +		/*
> > +		 * If we are in the domain without any permission, we
> > +		 * can't really tell. Let's default to a read
> > +		 * operation.
> > +		 */
> > +	case IOMMU_ACI_NONE:
> > +
> > +		/* WTF? */
> > +	case IOMMU_ACI_RD_WR:
> > +	default:
> > +		dir = IOMMU_FAULT_READ;
> > +		break;
> > +	}
> > +
> > +	/*
> > +	 * If the address is not in the page table, we can't get what
> > +	 * operation triggered the fault. Assume it's a read
> > +	 * operation.
> > +	 */
> > +	sun50i_iommu_report_fault(iommu, master, iova, dir);
> > +
> > +	return iova;
> > +}
> > +
> > +static irqreturn_t sun50i_iommu_irq(int irq, void *dev_id)
> > +{
> > +	struct sun50i_iommu *iommu = dev_id;
> > +	phys_addr_t iova;
> > +	u32 status;
> > +
> > +	spin_lock(&iommu->iommu_lock);
> > +
> > +	status = iommu_read(iommu, IOMMU_INT_STA_REG);
> > +	if (!(status & IOMMU_INT_MASK)) {
> > +		spin_unlock(&iommu->iommu_lock);
> > +		return IRQ_NONE;
> > +	}
> > +
> > +	if (status & IOMMU_INT_INVALID_L2PG)
> > +		iova = sun50i_iommu_handle_pt_irq(iommu,
> > +						  IOMMU_INT_ERR_ADDR_L2_REG,
> > +						  IOMMU_L2PG_INT_REG);
> > +	else if (status & IOMMU_INT_INVALID_L1PG)
> > +		iova = sun50i_iommu_handle_pt_irq(iommu,
> > +						  IOMMU_INT_ERR_ADDR_L1_REG,
> > +						  IOMMU_L1PG_INT_REG);
> > +	else
> > +		iova = sun50i_iommu_handle_perm_irq(iommu);
> > +
> > +	sun50i_iommu_tlb_invalidate(iommu, iova);
> > +	sun50i_iommu_ptw_invalidate(iommu, iova);
>
> Yikes, does the hardware really somehow require that?

Again the documentation is sparse, so I don't really know. The
original driver had that though (and not just one):
https://github.com/Allwinner-Homlet/H6-BSP4.9-linux/blob/master/drivers/iommu/sunxi-iommu.c#L952

I guess I can just remove it and see if it works. I'm not really an
expert on cache maintenance, so I wasn't really sure whether that was
needed or not.

> > +static int __maybe_unused sun50i_iommu_suspend(struct device *dev)
> > +{
> > +	struct sun50i_iommu_domain *sun50i_domain;
> > +	struct sun50i_iommu *iommu;
> > +	unsigned long flags;
> > +
> > +	iommu = dev_get_drvdata(dev);
> > +	if (!iommu->domain)
> > +		return 0;
> > +
> > +	spin_lock_irqsave(&iommu->iommu_lock, flags);
> > +
> > +	iommu_write(iommu, IOMMU_ENABLE_REG, 0);
> > +	iommu_write(iommu, IOMMU_TTB_REG, 0);
> > +
> > +	spin_unlock_irqrestore(&iommu->iommu_lock, flags);
> > +
> > +	clk_disable_unprepare(iommu->clk);
> > +	reset_control_assert(iommu->reset);
> > +
> > +	sun50i_domain = to_sun50i_domain(iommu->domain);
> > +	dma_unmap_single(dev, virt_to_phys(sun50i_domain->dt), DT_SIZE,
> > +			 DMA_TO_DEVICE);
>
> Why? The only time you should need to do this is when freeing the table.
>
> > +
> > +	return 0;
> > +}
> > +
> > +static int __maybe_unused sun50i_iommu_resume(struct device *dev)
> > +{
> > +	struct sun50i_iommu_domain *sun50i_domain;
> > +	struct sun50i_iommu *iommu;
> > +	unsigned long flags;
> > +	dma_addr_t dt_dma;
> > +	int ret;
> > +
> > +	iommu = dev_get_drvdata(dev);
> > +	if (!iommu->domain)
> > +		return 0;
> > +
> > +	sun50i_domain = to_sun50i_domain(iommu->domain);
> > +	dt_dma = dma_map_single(dev, sun50i_domain->dt, DT_SIZE, DMA_TO_DEVICE);
>
> As above. The power state of the IOMMU should be enitrely irrelevant to the
> contents of RAM.

Sorry, I should have put a comment here.

I'm not quite sure what the difference between a group and domain in
the IOMMU framework is, but since this IOMMU can only deal with a
single address space, my understanding was that we'd need to allocate
a single domain and group, and that the domain was the abstraction
tied to an address space (since it's what is passed as an argument to
map).

So, given this, what made sense was to allocate the directory table
buffer at domain_alloc time and map it. But then, domain_alloc seems
to not have any pointer back to the iommu we registered for some
reason (I guess that a domain could be shared across multiple
IOMMUs?), and so we don't have access to our IOMMU's struct device.

It's not really clear in my mind, so there's probably a better way.

> > +	if (dma_mapping_error(dev, dt_dma)) {
> > +		dev_err(dev, "Couldn't map L1 Page Table\n");
> > +		return -ENOMEM;
> > +	}
> > +
> > +	ret = reset_control_deassert(iommu->reset);
> > +	if (ret)
> > +		goto err_unmap;
> > +
> > +	ret = clk_prepare_enable(iommu->clk);
> > +	if (ret)
> > +		goto err_reset_assert;
> > +
> > +	/* We rely on the physical address and DMA address being the same */
> > +	WARN_ON(dt_dma != virt_to_phys(sun50i_domain->dt));
>
> If you made proper use of both iommu->dt and iommu->dt_dma you wouldn't
> actually need that invariant at this level, which I think would be a good
> thing. The phys_to_virt() dance is somewhat justifiable for the second-level
> tables to save maintaining an additional dma_addr_t[4096] array; far less so
> for the single top-level address.

This kind of conflics with your earlier comment about moving the
virt_to_phys call for the flush function into the function and remove
it from the caller. Judging from that last comment, I guess keeping
the dma address as the argument makes more sense?

Also, a more general question. One of the cleanups I wanted to do was
to remove the kmem_cache in favour of a dma_pool, which triggered that
test. It looks like with a dma_pool, the physical address and dma
address are not the same, even though the IOMMU is directly connected
to the RAM so there should be no intermediate mapping. Do you know
why?

> > +
> > +	spin_lock_irqsave(&iommu->iommu_lock, flags);
> > +
> > +	iommu_write(iommu, IOMMU_TTB_REG, dt_dma);
> > +	iommu_write(iommu, IOMMU_TLB_PREFETCH_REG,
> > +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(0) |
> > +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(1) |
> > +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(2) |
> > +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(3) |
> > +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(4) |
> > +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(5));
> > +	iommu_write(iommu, IOMMU_INT_ENABLE_REG, IOMMU_INT_MASK);
> > +	iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(IOMMU_ACI_NONE),
> > +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 0) |
> > +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 0) |
> > +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 1) |
> > +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 1) |
> > +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 2) |
> > +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 2) |
> > +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 3) |
> > +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 3) |
> > +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 4) |
> > +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 4) |
> > +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 5) |
> > +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 5));
> > +
> > +	iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(IOMMU_ACI_RD),
> > +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 0) |
> > +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 1) |
> > +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 2) |
> > +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 3) |
> > +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 4) |
> > +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 5) |
> > +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 0) |
> > +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 1) |
> > +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 2) |
> > +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 3) |
> > +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 4) |
> > +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 5));
> > +
> > +	iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(IOMMU_ACI_WR),
> > +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 0) |
> > +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 1) |
> > +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 2) |
> > +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 3) |
> > +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 4) |
> > +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 5) |
> > +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 0) |
> > +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 1) |
> > +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 2) |
> > +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 3) |
> > +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 4) |
> > +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 5));
> > +
> > +	iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(IOMMU_ACI_RD_WR),
> > +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 0) |
> > +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 0) |
> > +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 1) |
> > +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 1) |
> > +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 2) |
> > +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 2) |
> > +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 3) |
> > +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 3) |
> > +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 4) |
> > +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 4) |
> > +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 5) |
> > +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 5));
> > +
> > +	ret = sun50i_iommu_flush_all_tlb(iommu);
> > +	if (ret) {
> > +		spin_unlock_irqrestore(&iommu->iommu_lock, flags);
> > +		goto err_clk_disable;
> > +	}
> > +
> > +	iommu_write(iommu, IOMMU_AUTO_GATING_REG, IOMMU_AUTO_GATING_ENABLE);
> > +	iommu_write(iommu, IOMMU_ENABLE_REG, IOMMU_ENABLE_ENABLE);
>
> As mentioned, although you obviously do need to reprogram the hardware on
> resume, the fact that the *only* references to most of the control registers
> are from "__maybe_unused" functions is a big red flag.
>
> > +
> > +	spin_unlock_irqrestore(&iommu->iommu_lock, flags);
> > +
> > +	return 0;
> > +
> > +err_clk_disable:
> > +	clk_disable_unprepare(iommu->clk);
> > +
> > +err_reset_assert:
> > +	reset_control_assert(iommu->reset);
> > +
> > +err_unmap:
> > +	sun50i_domain = to_sun50i_domain(iommu->domain);
> > +	dma_unmap_single(dev, dt_dma, DT_SIZE, DMA_TO_DEVICE);
> > +	return ret;
> > +}
> > +
> > +static const struct dev_pm_ops sun50i_iommu_pm_ops = {
> > +	SET_RUNTIME_PM_OPS(sun50i_iommu_suspend, sun50i_iommu_resume, NULL)
>
> In principle you should be able to support system PM with the same resume
> callback.

We don't have system-wide support, so I couldn't test it. I'd prefer
to leave it out until we have, if that's alright?

> > +};
> > +
> > +static const struct of_device_id sun50i_iommu_dt[] = {
> > +	{ .compatible = "allwinner,sun50i-h6-iommu", },
> > +	{ /* sentinel */ },
> > +};
> > +MODULE_DEVICE_TABLE(of, sun50i_iommu_dt);
> > +
> > +static struct platform_driver sun50i_iommu_driver = {
> > +	.probe		= sun50i_iommu_probe,
> > +	.driver		= {
> > +		.name			= "sun50i-iommu",
> > +		.of_match_table 	= sun50i_iommu_dt,
> > +		.pm			= &sun50i_iommu_pm_ops,
> > +		.suppress_bind_attrs	= true,
> > +	}
> > +};
> > +
> > +static int __init sun50i_iommu_init(void)
> > +{
> > +	return platform_driver_register(&sun50i_iommu_driver);
> > +}
> > +subsys_initcall(sun50i_iommu_init);
>
> Regular device_initcall() (or just a *_platform_driver() helper) should be
> sufficient these days.

Ack. Thanks again!
Maxime
Robin Murphy Jan. 27, 2020, 7:01 p.m. UTC | #3
On 27/01/2020 2:22 pm, Maxime Ripard wrote:
> Hi Robin,
> 
> Thanks for the extensive review :)
> 
> On Thu, Jan 23, 2020 at 06:47:14PM +0000, Robin Murphy wrote:
>> This looks like a pretty decent first cut of a driver, thanks! Plenty of
>> comments, but overall it looks to be in mostly the right shape to begin
>> with.
>>
>> On 22/01/2020 12:44 pm, Maxime Ripard wrote:
>>> The Allwinner H6 has introduced an IOMMU for a few DMA controllers, mostly
>>> video related: the display engine, the video decoders / encoders, the
>>> camera capture controller, etc.
>>>
>>> The design is pretty simple compared to other IOMMUs found in SoCs: there's
>>> a single instance, controlling all the masters, with a single address
>>> space.
>>>
>>> It also features a performance monitoring unit that allows to retrieve
>>> various informations (per-master and global TLB accesses, hits and misses,
>>> access latency, etc) that isn't supported at the moment.
>>
>> Yes, it sounds like that might ideally be a separate perf_events driver, but
>> there's no need to get into the details of that until the basic IOMMU driver
>> is baked.
>>
>>> Signed-off-by: Maxime Ripard <maxime@cerno.tech>
>>> ---
>>>    drivers/iommu/Kconfig        |   10 +-
>>>    drivers/iommu/Makefile       |    1 +-
>>>    drivers/iommu/sun50i-iommu.c | 1126 +++++++++++++++++++++++++++++++++++-
>>>    3 files changed, 1137 insertions(+)
>>>    create mode 100644 drivers/iommu/sun50i-iommu.c
>>>
>>> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
>>> index 0b9d78a0f3ac..5cbfa6f282e2 100644
>>> --- a/drivers/iommu/Kconfig
>>> +++ b/drivers/iommu/Kconfig
>>> @@ -289,6 +289,16 @@ config ROCKCHIP_IOMMU
>>>    	  Say Y here if you are using a Rockchip SoC that includes an IOMMU
>>>    	  device.
>>> +config SUN50I_IOMMU
>>> +	bool "Allwinner H6 IOMMU Support"
>>> +	depends on ARM || ARM64
>>> +	depends on ARCH_SUNXI
>>
>> Surely ARCH_SUNXI implies "ARM || ARM64" anyway? Ideally there shouldn't be
>> anything CPU-specific here, so supporting COMPILE_TEST as well would be
>> nice.
> 
> I guess, yes, I'll change that.
> 
> ...
> 
>>> +#define IOMMU_DM_AUT_CTRL_REG(d)	(0x0b0 + ((d) / 2) * 4)
>>> +#define IOMMU_DM_AUT_CTRL_RD_UNAVAIL(d, m)	(1 << (((d & 1) * 16) + ((m) * 2)))
>>> +#define IOMMU_DM_AUT_CTRL_RD_AVAIL(d, m)	(0 << (((d & 1) * 16) + ((m) * 2)))
>>
>> It might just be personal opinion, but I think overly-complex ways
>> of expressing 0 are usually better left entirely unsaid. Plus this
>> makes sun50i_iommu_resume() appear to be doing a lot more than it
>> really is - it seems sufficiently straightforward to me for the mere
>> absence of "available" to imply "unavailable".
> 
> The polarity is inverted though. By default (or with the value 0), the
> operation is allowed, while 1 disables it, so the absence of available
> actually sets it available (at least if you do the usual OR between
> the values).
> 
> I guess you could say that the absence of unavailable is available
> though. I'll remove those.

Oops, sorry, I did indeed manage to read it the wrong way round. Still, 
I think it would be enough to document the behaviour at point-of-use 
with a comment like "all domains have permission by default" then 
explicitly setting just the non-default values. That way when the next 
iteration of the hardware comes along supporting a different number of 
'authority domains' we don't potentially end up with a mix of both 
explicit and implicit zero values being significant or not.

>>> +static struct sun50i_iommu *sun50i_iommu_from_dev(struct device *dev)
>>> +{
>>> +	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
>>> +
>>> +	if (!fwspec)
>>> +		return NULL;
>>
>> Strictly you should only need that check in .add_device and .attach_dev - if
>> the other callbacks end up getting passed the wrong device then things have
>> arguably gone crash-worthily wrong already.
> 
> That helper is being used in both add_device / attach_dev and other
> callbacks, so I'm not sure what your suggestion is here? Should I
> remove the helper entirely, or remove that check?

I was implying that you could make the fwspec check explicit at the 
points where you have a real "is this a device I care about?" condition, 
rather than every time you need to retrieve priv for any reason. It's 
only a matter of conceptual cleanliness, though, so hardly critical.

>>> +	return fwspec->iommu_priv;
>>> +}
>>> +
> 
> ...
> 
>>> +/*
>>> + * The Allwinner H6 IOMMU uses a 2-level page table.
>>> + *
>>> + * The first level is the usual Directory Table (DT), that consists of
>>> + * 4096 4-bytes Directory Table Entries (DTE), each pointing to a Page
>>> + * Table (PT).
>>> + *
>>> + * Each PT consits of 256 4-bytes Page Table Entries (PTE), each
>>> + * pointing to a 4kB page of physical memory.
>>> + *
>>> + * The IOMMU supports a single DT, pointed by the IOMMU_TTB_REG
>>> + * register that contains its physical address.
>>> + */
>>> +
>>> +#define SUN50I_IOVA_DTE_MASK	GENMASK(31, 20)
>>> +#define SUN50I_IOVA_DTE_SHIFT	20
>>> +#define SUN50I_IOVA_PTE_MASK	GENMASK(19, 12)
>>> +#define SUN50I_IOVA_PTE_SHIFT	12
>>> +#define SUN50I_IOVA_PAGE_MASK	GENMASK(11, 0)
>>> +#define SUN50I_IOVA_PAGE_SHIFT	0
>>> +
>>> +static u32 sun50i_iova_dte_index(dma_addr_t iova)
>>> +{
>>> +	return (u32)(iova & SUN50I_IOVA_DTE_MASK) >> SUN50I_IOVA_DTE_SHIFT;
>>> +}
>>> +
>>> +static u32 sun50i_iova_pte_index(dma_addr_t iova)
>>> +{
>>> +	return (u32)(iova & SUN50I_IOVA_PTE_MASK) >> SUN50I_IOVA_PTE_SHIFT;
>>> +}
>>> +
>>> +static u32 sun50i_iova_page_offset(dma_addr_t iova)
>>> +{
>>> +	return (u32)(iova & SUN50I_IOVA_PAGE_MASK) >> SUN50I_IOVA_PAGE_SHIFT;
>>
>> A shift in a "page offset" calculation looks weird and surprising at the
>> best of times, so a redundant one is *actively* confounding.
> 
> It's probably a poorly written name then. It's supposed to return the
> offset of a given address within the page. What name / rework
> would make it less confusing?

I mean just get rid of SUN50I_IOVA_PAGE_SHIFT, since it serves no 
purpose other than to look misleading.

>>> +}
>>> +
>>> +/*
>>> + * Each Directory Table Entry has a Page Table address and a valid
>>> + * bit:
>>> +
>>> + * +---------------------+-----------+-+
>>> + * | PT address          | Reserved  |V|
>>> + * +---------------------+-----------+-+
>>> + *  31:10 - Page Table address
>>> + *   9:2  - Reserved
>>> + *   1:0  - 1 if the entry is valid
>>> + */
>>> +
>>> +#define SUN50I_DTE_PT_ADDRESS_MASK	GENMASK(31, 10)
>>> +#define SUN50I_DTE_PT_ATTRS		GENMASK(1, 0)
>>> +#define SUN50I_DTE_PT_VALID		BIT(0)
>>> +
>>> +static inline phys_addr_t sun50i_dte_pt_address(u32 dte)
>>> +{
>>> +	return (phys_addr_t)dte & SUN50I_DTE_PT_ADDRESS_MASK;
>>> +}
>>> +
>>> +static inline bool sun50i_dte_is_pt_valid(u32 dte)
>>> +{
>>> +	return (dte & SUN50I_DTE_PT_ATTRS) == SUN50I_DTE_PT_VALID;
>>
>> If SUN50I_DTE_PT_ATTRS was being used consistently with an expectation that
>> it might possibly grow in future, then it might deserve to stay, but right
>> now I'd say just "return dte & SUN50I_DTE_PT_VALID" here to match the use
>> below.
> 
> You can't really do that though :/
> 
> The two LSBs are meant to hold whether the page is valid, and only
> 0b01 is the valid value. With an AND operator only, if the two LSBs
> are 0b11, the helper would return true, while the IOMMU would consider
> it invalid.

Oops, my bad - between skimming the deceptively-almost-to-scale ASCII 
diagram and the "BIT(0)" definition I misread it as being a single-bit 
field. In that case the masking logic is fine, but please define the 
value as "1" since it's still conceptually a 2-bit integer.

>>> +#define SUN50I_PTE_PAGE_ADDRESS_MASK	GENMASK(31, 12)
>>> +#define SUN50I_PTE_ACI_MASK		GENMASK(7, 4)
>>> +#define SUN50I_PTE_ACI_SHIFT		4
>>> +#define SUN50I_PTE_ACI(aci)		((aci) << SUN50I_PTE_ACI_SHIFT)
>>> +#define SUN50I_PTE_PAGE_VALID		BIT(1)
>>> +
>>> +static inline phys_addr_t sun50i_pte_page_address(u32 pte)
>>> +{
>>> +	return (phys_addr_t)pte & SUN50I_PTE_PAGE_ADDRESS_MASK;
>>> +}
>>> +
>>> +static inline enum iommu_aci sun50i_pte_aci(u32 pte)
>>
>> Eww, "lowercase name does opposite of uppercase name" is a confusingly
>> unfamiliar paradigm - can we have a clearer distinction between "get" and
>> "set" operations please?
> 
> ACK.
> 
>> Plus a bit more consistency in terms of macros vs. helper functions
>> (maybe consider bitfield.h helpers too).
> 
> What did you find inconsistent? (Also, I didn't know FIELD_GET was a
> thing. I'll use it).

I mean having a function to convert one way but a macro to convert the 
other - I'd be inclined to use functions everywhere for the sake of type 
safety.

> 
> ...
> 
>>> +static int sun50i_iommu_tlb_invalidate(struct sun50i_iommu *iommu,
>>> +				       dma_addr_t iova)
>>> +{
>>> +	int ret;
>>> +	u32 reg;
>>> +
>>> +	assert_spin_locked(&iommu->iommu_lock);
>>> +
>>> +	iommu_write(iommu, IOMMU_TLB_IVLD_ADDR_REG, iova);
>>> +	iommu_write(iommu, IOMMU_TLB_IVLD_ADDR_MASK_REG,
>>> +		    SUN50I_PTE_PAGE_ADDRESS_MASK);
>>
>> Does this imply that the hardware might support power-of-two invalidations
>> larger than a single page? No need to worry about it for the initial driver,
>> but it could be worth investigating for future optimisation.
> 
> The documentation is pretty sparse (and I'm not really fluent in
> IOMMUs yet) but it's my understanding yes.
> 
> I'm not quite sure how that would translate into the IOMMU
> framework. As far as I could experience, map and unmap is only called
> a single page at a time, so I'm not sure what we could optimize here?

For unmaps, invalidation can now be somewhat decoupled via the 
iotlb_gather concept. Invalidate-on-map is considered more of a weird 
quirk so isn't given quite so much accommodation in the high-level API, 
but with the iotlb_sync_map hook and a bit of state-tracking in the 
domain, even that should be similarly optimisable if you wanted to.

> ...
> 
>>> +static u32 *sun50i_dte_get_page_table(struct sun50i_iommu_domain *sun50i_domain,
>>> +				      dma_addr_t iova)
>>> +{
>>> +	struct sun50i_iommu *iommu = sun50i_domain->iommu;
>>> +	u32 *page_table, *dte_addr;
>>> +	phys_addr_t pt_phys;
>>> +	dma_addr_t pt_dma;
>>> +	u32 dte_index, dte;
>>> +
>>> +	assert_spin_locked(&sun50i_domain->dt_lock);
>>> +
>>> +	dte_index = sun50i_iova_dte_index(iova);
>>> +	dte_addr = &sun50i_domain->dt[dte_index];
>>> +	dte = *dte_addr;
>>> +	if (sun50i_dte_is_pt_valid(dte))
>>> +		goto done;
>>> +
>>> +	page_table = kmem_cache_zalloc(iommu->pt_pool, GFP_ATOMIC);
>>
>> Can you respect the new gfp argument from ->map() here, or is that one of
>> those things that kmem_caches can be overly picky about?
> 
> That's a good question, I'll try :)
> 
>>> +}
>>> +
>>> +static int sun50i_iommu_map(struct iommu_domain *domain, unsigned long iova,
>>> +			    phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
>>> +{
>>> +	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
>>> +	struct sun50i_iommu *iommu = sun50i_domain->iommu;
>>> +	u32 pte_index;
>>> +	u32 *page_table, *pte_addr;
>>> +	unsigned long flags;
>>> +	int ret = 0;
>>> +
>>> +	spin_lock_irqsave(&sun50i_domain->dt_lock, flags);
>>> +	page_table = sun50i_dte_get_page_table(sun50i_domain, iova);
>>> +	if (IS_ERR(page_table)) {
>>> +		ret = PTR_ERR(page_table);
>>> +		goto out;
>>> +	}
>>> +
>>> +	pte_index = sun50i_iova_pte_index(iova);
>>> +	pte_addr = &page_table[pte_index];
>>> +	if (sun50i_pte_is_page_valid(*pte_addr)) {
>>> +		phys_addr_t page_phys = sun50i_pte_page_address(*pte_addr);
>>> +		dev_err(iommu->dev,
>>> +			"iova %pad already mapped to %pa cannot remap to %pa prot: %#x\n",
>>> +			&iova, &page_phys, &paddr, prot);
>>> +		ret = -EADDRINUSE;
>>
>> I think that strictly refers to a *network* address - I'd be inclined to
>> stick with boring old EBUSY.
> 
> ACK
> 
>>> +		goto out;
>>> +	}
>>> +
>>> +	*pte_addr = sun50i_mk_pte(paddr, prot);
>>> +	sun50i_table_flush(sun50i_domain, virt_to_phys(pte_addr), 1);
>>> +
>>> +	spin_lock_irqsave(&iommu->iommu_lock, flags);
>>> +
>>> +	sun50i_iommu_tlb_invalidate(iommu, iova);
>>> +	sun50i_iommu_ptw_invalidate(iommu, iova);
>>
>> Does the walk cache actually cache PTEs as well as DTEs? If it's really only
>> the latter, the maintenance could perhaps be folded into pagetable
>> allocation (being the only place that ever changes them).
> 
> So the documentation is also sparse on this, but there's definitely a
> cache for the table walk, and you feed the invalidate register an
> address aligned on 1MB, which is (1 << 20), and the index of the DTE is
> stored in the bits 20 to 31 in the iova.
> 
> So my best guess is that, yeah, it's the DTE -> PT association. But
> it's not like something confirms it :)
> 
> But you're right that we shouldn't need this here, I'll move it to the
> page table allocation function.

If you've only got a vendor driver to go on, then feel free to play 
safe. It's a logical hunch that ptw_invalidate is *probably* only needed 
if a DTE changes, but you might still prefer potentially 
over-invalidating to debugging weird TLB issues down the line (which is 
never fun).

>>> +static void sun50i_iommu_domain_free(struct iommu_domain *domain)
>>> +{
>>> +	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
>>> +	struct sun50i_iommu *iommu = sun50i_domain->iommu;
>>> +	unsigned long flags;
>>> +	int i;
>>> +
>>> +	spin_lock_irqsave(&sun50i_domain->dt_lock, flags);
>>> +
>>> +	for (i = 0; i < NUM_DT_ENTRIES; i++) {
>>> +		phys_addr_t pt_phys;
>>> +		u32 *page_table;
>>> +		u32 *dte_addr;
>>> +		u32 dte;
>>> +
>>> +		dte_addr = &sun50i_domain->dt[i];
>>> +		dte = *dte_addr;
>>> +		if (!sun50i_dte_is_pt_valid(dte))
>>> +			continue;
>>> +
>>> +		memset(dte_addr, 0, sizeof(*dte_addr));
>>> +		sun50i_table_flush(sun50i_domain, virt_to_phys(dte_addr), 1);
>>
>> This shouldn't be necessary - freeing a domain while it's still live is an
>> incredibly very wrong thing to do, so the hardware should have already been
>> programmed to no longer walk this table by this point.
> 
> We never "garbage collect" and remove the dte for the page table we
> don't use anymore elsewhere though, so couldn't we end up in a
> situation where we don't have a page table (because it has been freed)
> at the other end of our dte, but the IOMMU doesn't know about it since
> we never flushed?

Let me reiterate: at the point of freeing, the assumption is that this 
domain should be long dissociated from the hardware. Any actual 
invalidation should have already happened at the point the TTB was 
disabled or pointed to some other domain, therefore fiddling with 
pagetable pages just before you free them back to the kernel is just 
pointless busywork.

If the TTB *was* still live here, then as soon as you call free_pages() 
below the DT memory could get reallocated by someone else and filled 
with data that happens to look like valid pagetables, so even if you 
invalidate the TLBs the hardware could just immediately go walk that 
data and refill them with nonsense, thus any pretence at invalidation is 
in vain anyway.

The fly in the soup, however, is that default domains appear to be 
lacking proper detach notifications (I hadn't consciously noticed that 
before), so if you've been looking at the iommu_group_release() path it 
might have given the wrong impression. So what might be justifiable here 
is to check if the domain being freed is the one currently active in 
hardware, and if so perform a detach (i.e. disable the TTB and 
invalidate everything) first, then free everything as normal. Or just 
handwave that we essentially never free a default domain anyway so it's 
OK to just assume that we're not freeing anything live.

>>> +
>>> +		pt_phys = sun50i_dte_pt_address(dte);
>>> +		dma_unmap_single(iommu->dev, pt_phys, PT_SIZE, DMA_TO_DEVICE);
>>> +
>>> +		page_table = phys_to_virt(pt_phys);
>>> +		kmem_cache_free(iommu->pt_pool, page_table);
>>> +	}
>>> +
>>> +	free_pages((unsigned long)sun50i_domain->dt, get_order(DT_SIZE));
>>> +	sun50i_domain->dt = NULL;
>>> +
>>> +	spin_lock(&iommu->iommu_lock);
>>> +	sun50i_iommu_flush_all_tlb(iommu);
>>> +	spin_unlock(&iommu->iommu_lock);
>>
>> And this looks very out-of-place. I'd expect any invalidation to have
>> happened at the point that translation was disabled, or be deferred until it
>> is next reenabled. Consider that users are within their rights to call
>> iommu_domain_alloc() 17 times in a row and immediately free them all again
>> without ever attaching any devices; there should be no reason to ever touch
>> the hardware in such a case.
> 
> So I guess we shouldn't remove the DTEs here then, but in unmap then?

It's fine to lazily free the tables themselves - you don't really want 
to have to scan all 256 PTEs on every unmap just to work out if a table 
is still used or not - it's just that, as above, fiddling with the 
actual DTE values is pretty futile even then.

>>> +static int sun50i_iommu_attach_device(struct iommu_domain *domain,
>>> +				      struct device *dev)
>>> +{
>>> +	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
>>> +	struct sun50i_iommu *iommu;
>>> +
>>> +	iommu = sun50i_iommu_from_dev(dev);
>>> +	if (!iommu)
>>> +		return -ENODEV;
>>> +
>>> +	dev_info(dev, "Attaching to IOMMU domain\n");
>>
>> Also no.
> 
> ACK
> 
>>> +
>>> +	if (iommu->domain == domain)
>>> +		return 0;
>>> +
>>> +	if (iommu->domain)
>>> +		sun50i_iommu_detach_device(iommu->domain, dev);
>>> +
>>> +	iommu->domain = domain;
>>> +	sun50i_domain->iommu = iommu;
>>> +
>>> +	return pm_runtime_get_sync(iommu->dev);
>>
>> Deferring all the actual hardware pogramming to the suspend/resume hooks is
>> a fiendishly clever idea that took me more than a moment to make sense of,
>> but how well does it work when RPM is compiled out or runtime-inhibited?
> 
> We have a bunch of other controllers that require runtime_pm already,
> so it's going to be enabled. But that should be expressed in Kconfig.

But it can still be inhibited from sysfs, right? We don't want driver 
behaviour to be *unnecessarily* fragile to user actions, however silly 
they may be.

>> Furthermore, basing RPM on having a domain attached means that
>> you'll effectively never turn the IOMMU off, even when all the
>> clients are idle. It would make more sene to use device links like
>> most other drivers do to properly model the producer/consumer
>> relationship.
> 
> I'm not familiar with device links for runtime_pm, I thought this was
> only useful for system-wide resume and suspend?

See DL_FLAG_PM_RUNTIME - we already have several IOMMU drivers taking 
full advantage of this.

> 
> ...
> 
>>> +static void sun50i_iommu_report_fault(struct sun50i_iommu *iommu,
>>> +				      unsigned master, phys_addr_t iova,
>>> +				      unsigned prot)
>>> +{
>>> +	dev_err(iommu->dev, "Page fault for %pad (master %d, dir %s)\n",
>>> +		&iova, master, (prot == IOMMU_FAULT_WRITE) ? "wr" : "rd");
>>
>> If you're expecting an external handler to be able to do something useful
>> with faults, then save the log spam for the case where it's actually failed.
> 
> I'm sorry, I'm not quite sure what you mean here. Where what actually failed?

The (potential) handler behind report_iommu_fault(). If that returns 0 
then you probably want to stay quiet since the fault was presumably 
expected.

>>> +
>>> +	if (iommu->domain)
>>> +		report_iommu_fault(iommu->domain, iommu->dev, iova, prot);
>>> +	else
>>> +		dev_err(iommu->dev, "Page fault while iommu not attached to any domain?\n");
>>> +}
>>> +
>>> +static phys_addr_t sun50i_iommu_handle_pt_irq(struct sun50i_iommu *iommu,
>>> +					      unsigned addr_reg,
>>> +					      unsigned blame_reg)
>>> +{
>>> +	phys_addr_t iova;
>>> +	unsigned master;
>>> +	u32 blame;
>>> +
>>> +	assert_spin_locked(&iommu->iommu_lock);
>>> +
>>> +	iova = iommu_read(iommu, addr_reg);
>>> +	blame = iommu_read(iommu, blame_reg);
>>> +	master = ilog2(blame & IOMMU_INT_MASTER_MASK);
>>> +
>>> +	/*
>>> +	 * If the address is not in the page table, we can't get what
>>> +	 * operation triggered the fault. Assume it's a read
>>> +	 * operation.
>>> +	 */
>>> +	sun50i_iommu_report_fault(iommu, master, iova, IOMMU_FAULT_READ);
>>> +
>>> +	return iova;
>>> +}
>>> +
>>> +static phys_addr_t sun50i_iommu_handle_perm_irq(struct sun50i_iommu *iommu)
>>> +{
>>> +	enum iommu_aci aci;
>>> +	phys_addr_t iova;
>>> +	unsigned master;
>>> +	unsigned dir;
>>> +	u32 blame;
>>> +
>>> +	assert_spin_locked(&iommu->iommu_lock);
>>> +
>>> +	blame = iommu_read(iommu, IOMMU_INT_STA_REG);
>>> +	master = ilog2(blame & IOMMU_INT_MASTER_MASK);
>>> +	iova = iommu_read(iommu, IOMMU_INT_ERR_ADDR_REG(master));
>>> +	aci = sun50i_pte_aci(iommu_read(iommu, IOMMU_INT_ERR_DATA_REG(master)));
>>> +
>>> +	switch (aci) {
>>> +		/*
>>> +		 * If we are in the read-only domain, then it means we
>>> +		 * tried to write.
>>> +		 */
>>> +	case IOMMU_ACI_RD:
>>> +		dir = IOMMU_FAULT_WRITE;
>>> +		break;
>>> +
>>> +		/*
>>> +		 * If we are in the write-only domain, then it means
>>> +		 * we tried to read.
>>> +		 */
>>> +	case IOMMU_ACI_WR:
>>> +
>>> +		/*
>>> +		 * If we are in the domain without any permission, we
>>> +		 * can't really tell. Let's default to a read
>>> +		 * operation.
>>> +		 */
>>> +	case IOMMU_ACI_NONE:
>>> +
>>> +		/* WTF? */
>>> +	case IOMMU_ACI_RD_WR:
>>> +	default:
>>> +		dir = IOMMU_FAULT_READ;
>>> +		break;
>>> +	}
>>> +
>>> +	/*
>>> +	 * If the address is not in the page table, we can't get what
>>> +	 * operation triggered the fault. Assume it's a read
>>> +	 * operation.
>>> +	 */
>>> +	sun50i_iommu_report_fault(iommu, master, iova, dir);
>>> +
>>> +	return iova;
>>> +}
>>> +
>>> +static irqreturn_t sun50i_iommu_irq(int irq, void *dev_id)
>>> +{
>>> +	struct sun50i_iommu *iommu = dev_id;
>>> +	phys_addr_t iova;
>>> +	u32 status;
>>> +
>>> +	spin_lock(&iommu->iommu_lock);
>>> +
>>> +	status = iommu_read(iommu, IOMMU_INT_STA_REG);
>>> +	if (!(status & IOMMU_INT_MASK)) {
>>> +		spin_unlock(&iommu->iommu_lock);
>>> +		return IRQ_NONE;
>>> +	}
>>> +
>>> +	if (status & IOMMU_INT_INVALID_L2PG)
>>> +		iova = sun50i_iommu_handle_pt_irq(iommu,
>>> +						  IOMMU_INT_ERR_ADDR_L2_REG,
>>> +						  IOMMU_L2PG_INT_REG);
>>> +	else if (status & IOMMU_INT_INVALID_L1PG)
>>> +		iova = sun50i_iommu_handle_pt_irq(iommu,
>>> +						  IOMMU_INT_ERR_ADDR_L1_REG,
>>> +						  IOMMU_L1PG_INT_REG);
>>> +	else
>>> +		iova = sun50i_iommu_handle_perm_irq(iommu);
>>> +
>>> +	sun50i_iommu_tlb_invalidate(iommu, iova);
>>> +	sun50i_iommu_ptw_invalidate(iommu, iova);
>>
>> Yikes, does the hardware really somehow require that?
> 
> Again the documentation is sparse, so I don't really know. The
> original driver had that though (and not just one):
> https://github.com/Allwinner-Homlet/H6-BSP4.9-linux/blob/master/drivers/iommu/sunxi-iommu.c#L952
> 
> I guess I can just remove it and see if it works. I'm not really an
> expert on cache maintenance, so I wasn't really sure whether that was
> needed or not.

No, I'm sure it's more than possible for sufficiently wacky hardware to 
genuinely require TLB invalidation on a faulting address. It's just one 
of those things that can't help looking like it could have been a bodge to 'fix' 
a fault that was caused by improper TLB maintenance in the first place. 
If in any doubt, leave it in.

>>> +static int __maybe_unused sun50i_iommu_suspend(struct device *dev)
>>> +{
>>> +	struct sun50i_iommu_domain *sun50i_domain;
>>> +	struct sun50i_iommu *iommu;
>>> +	unsigned long flags;
>>> +
>>> +	iommu = dev_get_drvdata(dev);
>>> +	if (!iommu->domain)
>>> +		return 0;
>>> +
>>> +	spin_lock_irqsave(&iommu->iommu_lock, flags);
>>> +
>>> +	iommu_write(iommu, IOMMU_ENABLE_REG, 0);
>>> +	iommu_write(iommu, IOMMU_TTB_REG, 0);
>>> +
>>> +	spin_unlock_irqrestore(&iommu->iommu_lock, flags);
>>> +
>>> +	clk_disable_unprepare(iommu->clk);
>>> +	reset_control_assert(iommu->reset);
>>> +
>>> +	sun50i_domain = to_sun50i_domain(iommu->domain);
>>> +	dma_unmap_single(dev, virt_to_phys(sun50i_domain->dt), DT_SIZE,
>>> +			 DMA_TO_DEVICE);
>>
>> Why? The only time you should need to do this is when freeing the table.
>>
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +static int __maybe_unused sun50i_iommu_resume(struct device *dev)
>>> +{
>>> +	struct sun50i_iommu_domain *sun50i_domain;
>>> +	struct sun50i_iommu *iommu;
>>> +	unsigned long flags;
>>> +	dma_addr_t dt_dma;
>>> +	int ret;
>>> +
>>> +	iommu = dev_get_drvdata(dev);
>>> +	if (!iommu->domain)
>>> +		return 0;
>>> +
>>> +	sun50i_domain = to_sun50i_domain(iommu->domain);
>>> +	dt_dma = dma_map_single(dev, sun50i_domain->dt, DT_SIZE, DMA_TO_DEVICE);
>>
>> As above. The power state of the IOMMU should be enitrely irrelevant to the
>> contents of RAM.
> 
> Sorry, I should have put a comment here.
> 
> I'm not quite sure what the difference between a group and domain in
> the IOMMU framework is, but since this IOMMU can only deal with a
> single address space, my understanding was that we'd need to allocate
> a single domain and group, and that the domain was the abstraction
> tied to an address space (since it's what is passed as an argument to
> map).

That's correct, a domain is effectively an address space, while groups 
represents sets of devices that the IOMMU can isolate from each other. 
IOMMUs like this one (and the MediaTek M4U in mtk_iommu.c) are a little 
hard-done-by in that they do actually have some finer-grained isolation 
on a basic allow/deny level, but the API really assumes that isolation 
happens at the address space level, so it's easier to ignore it and just 
use the single-group model anyway.

The really neat advantage of having a guaranteed single group, though, 
is that you then no longer need to care about address spaces: since the 
group can only ever be attached to one domain at a time, you can have as 
many domains as you like, and handle it by having the first attach_dev 
call on a given domain context-switch that pagetable into the hardware. 
That's more or less what you've done already, which is good, it would 
just benefit from that context-switching being done in a more robust and 
obvious manner :)

> So, given this, what made since was to allocate the directory table
> buffer at domain_alloc time and map it. But then, domain_alloc seems
> to not have any pointer back to the iommu we registered for some
> reason (I guess that a domain could be shared across multiple
> IOMMUs?), and so we don't have access to our IOMMU's struct device.

I'll spare you the unproductive "Robin complains bitterly about the 
iommu_domain_alloc() interface being terrible, episode #27"...

You'll see two main ways that existing drivers work around that - if 
you're happy to assume that you'll only ever have one IOMMU instance, or 
that all instances will always be functionally equal, then you can 
simply keep track of any old IOMMU device handle for DMA purposes (e.g. 
exynos_iommu); otherwise if you might need to cope with multiple IOMMU 
instances having different DMA capabilities then deferring 
instance-specific setup to the first device attach is the de-facto 
standard (e.g. arm-smmu).

> It's not really clear in my mind, so there's probably a better way.
> 
>>> +	if (dma_mapping_error(dev, dt_dma)) {
>>> +		dev_err(dev, "Couldn't map L1 Page Table\n");
>>> +		return -ENOMEM;
>>> +	}
>>> +
>>> +	ret = reset_control_deassert(iommu->reset);
>>> +	if (ret)
>>> +		goto err_unmap;
>>> +
>>> +	ret = clk_prepare_enable(iommu->clk);
>>> +	if (ret)
>>> +		goto err_reset_assert;
>>> +
>>> +	/* We rely on the physical address and DMA address being the same */
>>> +	WARN_ON(dt_dma != virt_to_phys(sun50i_domain->dt));
>>
>> If you made proper use of both iommu->dt and iommu->dt_dma you wouldn't
>> actually need that invariant at this level, which I think would be a good
>> thing. The phys_to_virt() dance is somewhat justifiable for the second-level
>> tables to save maintaining an additional dma_addr_t[4096] array; far less so
>> for the single top-level address.
> 
> This kind of conflics with your earlier comment about moving the
> virt_to_phys call for the flush function into the function and remove
> it from the caller. Judging from that last comment, I guess keeping
> the dma address as the argument makes more sense?

Yeah, I try to avoid backtracking too much in big reviews these days :)

It's an either/or choice of alternative ideas, I guess - refining the 
slightly restrictive approach for self-consistency vs. separating the 
handling for each level of table entirely so that one can be a little 
more robust.

> Also, a more general question. One of the cleanups I wanted to do was
> to remove the kmem_cache in favour of a dma_pool, which triggered that
> test. It looks like with a dma_pool, the physical address and dma
> address are not the same, even though the IOMMU is directly connected
> to the RAM so there should be no intermediate mapping. Do you know
> why?

DMA pools are backed by dma_alloc_coherent, so (at least on arm64) the 
virtual address you get will be a non-cacheable remap (assuming a 
non-coherent device), and thus calling virt_to_phys() on it is bogus and 
will give you nonsense.

Although using coherent allocations (correctly) for pagetables would be 
perfectly acceptable, the nature of both IOMMU and CPU accesses makes it 
generally more efficient to do this trick with long-lived streaming 
mappings plus occasional syncs. Mixing both methods for different levels 
of table wouldn't be very nice, though.

>>> +
>>> +	spin_lock_irqsave(&iommu->iommu_lock, flags);
>>> +
>>> +	iommu_write(iommu, IOMMU_TTB_REG, dt_dma);
>>> +	iommu_write(iommu, IOMMU_TLB_PREFETCH_REG,
>>> +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(0) |
>>> +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(1) |
>>> +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(2) |
>>> +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(3) |
>>> +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(4) |
>>> +		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(5));
>>> +	iommu_write(iommu, IOMMU_INT_ENABLE_REG, IOMMU_INT_MASK);
>>> +	iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(IOMMU_ACI_NONE),
>>> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 0) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 0) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 1) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 1) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 2) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 2) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 3) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 3) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 4) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 4) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 5) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 5));
>>> +
>>> +	iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(IOMMU_ACI_RD),
>>> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 0) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 1) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 2) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 3) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 4) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 5) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 0) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 1) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 2) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 3) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 4) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 5));
>>> +
>>> +	iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(IOMMU_ACI_WR),
>>> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 0) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 1) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 2) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 3) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 4) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 5) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 0) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 1) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 2) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 3) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 4) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 5));
>>> +
>>> +	iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(IOMMU_ACI_RD_WR),
>>> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 0) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 0) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 1) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 1) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 2) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 2) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 3) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 3) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 4) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 4) |
>>> +		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 5) |
>>> +		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 5));
>>> +
>>> +	ret = sun50i_iommu_flush_all_tlb(iommu);
>>> +	if (ret) {
>>> +		spin_unlock_irqrestore(&iommu->iommu_lock, flags);
>>> +		goto err_clk_disable;
>>> +	}
>>> +
>>> +	iommu_write(iommu, IOMMU_AUTO_GATING_REG, IOMMU_AUTO_GATING_ENABLE);
>>> +	iommu_write(iommu, IOMMU_ENABLE_REG, IOMMU_ENABLE_ENABLE);
>>
>> As mentioned, although you obviously do need to reprogram the hardware on
>> resume, the fact that the *only* references to most of the control registers
>> are from "__maybe_unused" functions is a big red flag.
>>
>>> +
>>> +	spin_unlock_irqrestore(&iommu->iommu_lock, flags);
>>> +
>>> +	return 0;
>>> +
>>> +err_clk_disable:
>>> +	clk_disable_unprepare(iommu->clk);
>>> +
>>> +err_reset_assert:
>>> +	reset_control_assert(iommu->reset);
>>> +
>>> +err_unmap:
>>> +	sun50i_domain = to_sun50i_domain(iommu->domain);
>>> +	dma_unmap_single(dev, dt_dma, DT_SIZE, DMA_TO_DEVICE);
>>> +	return ret;
>>> +}
>>> +
>>> +static const struct dev_pm_ops sun50i_iommu_pm_ops = {
>>> +	SET_RUNTIME_PM_OPS(sun50i_iommu_suspend, sun50i_iommu_resume, NULL)
>>
>> In principle you should be able to support system PM with the same resume
>> callback.
> 
> We don't have system-wide support, so I couldn't test it. I'd prefer
> to leave it out until we have, if that's alright?

Oh, for sure, that was more of a general observation in case it had just 
been overlooked. FWIW if you do get to it, hibernate is a good way to 
exercise system suspend/resume ops too, especially for IOMMUs where the 
symptoms of failing to restore from whatever state the resume kernel 
left things in are typically very obvious.

Robin.

>>> +};
>>> +
>>> +static const struct of_device_id sun50i_iommu_dt[] = {
>>> +	{ .compatible = "allwinner,sun50i-h6-iommu", },
>>> +	{ /* sentinel */ },
>>> +};
>>> +MODULE_DEVICE_TABLE(of, sun50i_iommu_dt);
>>> +
>>> +static struct platform_driver sun50i_iommu_driver = {
>>> +	.probe		= sun50i_iommu_probe,
>>> +	.driver		= {
>>> +		.name			= "sun50i-iommu",
>>> +		.of_match_table 	= sun50i_iommu_dt,
>>> +		.pm			= &sun50i_iommu_pm_ops,
>>> +		.suppress_bind_attrs	= true,
>>> +	}
>>> +};
>>> +
>>> +static int __init sun50i_iommu_init(void)
>>> +{
>>> +	return platform_driver_register(&sun50i_iommu_driver);
>>> +}
>>> +subsys_initcall(sun50i_iommu_init);
>>
>> Regular device_initcall() (or just a *_platform_driver() helper) should be
>> sufficient these days.
> 
> Ack. Thanks again!
> Maxime
>
Maxime Ripard Feb. 11, 2020, 12:39 p.m. UTC | #4
Hi Robin,

On Mon, Jan 27, 2020 at 07:01:02PM +0000, Robin Murphy wrote:
> > > > +static void sun50i_iommu_domain_free(struct iommu_domain *domain)
> > > > +{
> > > > +	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
> > > > +	struct sun50i_iommu *iommu = sun50i_domain->iommu;
> > > > +	unsigned long flags;
> > > > +	int i;
> > > > +
> > > > +	spin_lock_irqsave(&sun50i_domain->dt_lock, flags);
> > > > +
> > > > +	for (i = 0; i < NUM_DT_ENTRIES; i++) {
> > > > +		phys_addr_t pt_phys;
> > > > +		u32 *page_table;
> > > > +		u32 *dte_addr;
> > > > +		u32 dte;
> > > > +
> > > > +		dte_addr = &sun50i_domain->dt[i];
> > > > +		dte = *dte_addr;
> > > > +		if (!sun50i_dte_is_pt_valid(dte))
> > > > +			continue;
> > > > +
> > > > +		memset(dte_addr, 0, sizeof(*dte_addr));
> > > > +		sun50i_table_flush(sun50i_domain, virt_to_phys(dte_addr), 1);
> > >
> > > This shouldn't be necessary - freeing a domain while it's still live is an
> > > incredibly very wrong thing to do, so the hardware should have already been
> > > programmed to no longer walk this table by this point.
> >
> > We never "garbage collect" and remove the dte for the page table we
> > don't use anymore elsewhere though, so couldn't we end up in a
> > situation where we don't have a page table (because it has been freed)
> > at the other end of our dte, but the IOMMU doesn't know about it since
> > we never flushed?
>
> Let me reiterate: at the point of freeing, the assumption is that this
> domain should be long dissociated from the hardware. Any actual invalidation
> should have already happened at the point the TTB was disabled or pointed to
> some other domain, therefore fiddling with pagetable pages just before you
> free them back to the kernel is just pointless busywork.
>
> If the TTB *was* still live here, then as soon as you call free_pages()
> below the DT memory could get reallocated by someone else and filled with
> data that happens to look like valid pagetables, so even if you invalidate
> the TLBs the hardware could just immediately go walk that data and refill
> them with nonsense, thus any pretence at invalidation is in vain anyway.

Thanks, that makes a lot of sense.

> The fly in the soup, however, is that default domains appear to be lacking
> proper detach notifications (I hadn't consciously noticed that before), so
> if you've been looking at the iommu_group_release() path it might have given
> the wrong impression. So what might be justifiable here is to check if the
> domain being freed is the one currently active in hardware, and if so
> perform a detach (i.e. disable the TTB and invalidate everything) first,
> then free everything as normal. Or just handwave that we essentially never
> free a default domain anyway so it's OK to just assume that we're not
> freeing anything live.

I'm still a bit unsure as of what to do exactly here. I haven't found
a hook that would be called when a given domain doesn't have any
devices attached to it. Or did you mean that I should just create a
separate function, not part of the IOMMU ops?

> > > > +
> > > > +	if (iommu->domain == domain)
> > > > +		return 0;
> > > > +
> > > > +	if (iommu->domain)
> > > > +		sun50i_iommu_detach_device(iommu->domain, dev);
> > > > +
> > > > +	iommu->domain = domain;
> > > > +	sun50i_domain->iommu = iommu;
> > > > +
> > > > +	return pm_runtime_get_sync(iommu->dev);
> > >
> > > Deferring all the actual hardware pogramming to the suspend/resume hooks is
> > > a fiendishly clever idea that took me more than a moment to make sense of,
> > > but how well does it work when RPM is compiled out or runtime-inhibited?
> >
> > We have a bunch of other controllers that require runtime_pm already,
> > so it's going to be enabled. But that should be expressed in Kconfig.
>
> But it can still be inhibited from sysfs, right? We don't want driver
> behaviour to be *unnecessarily* fragile to user actions, however silly they
> may be.

That's a good point :)

> > > Furthermore, basing RPM on having a domain attached means that
> > > you'll effectively never turn the IOMMU off, even when all the
> > > clients are idle. It would make more sene to use device links like
> > > most other drivers do to properly model the producer/consumer
> > > relationship.
> >
> > I'm not familiar with device links for runtime_pm, I thought this was
> > only useful for system-wide resume and suspend?
>
> See DL_FLAG_PM_RUNTIME - we already have several IOMMU drivers taking full
> advantage of this.

I'll look into it, thanks!

> > > > +static int __maybe_unused sun50i_iommu_resume(struct device *dev)
> > > > +{
> > > > +	struct sun50i_iommu_domain *sun50i_domain;
> > > > +	struct sun50i_iommu *iommu;
> > > > +	unsigned long flags;
> > > > +	dma_addr_t dt_dma;
> > > > +	int ret;
> > > > +
> > > > +	iommu = dev_get_drvdata(dev);
> > > > +	if (!iommu->domain)
> > > > +		return 0;
> > > > +
> > > > +	sun50i_domain = to_sun50i_domain(iommu->domain);
> > > > +	dt_dma = dma_map_single(dev, sun50i_domain->dt, DT_SIZE, DMA_TO_DEVICE);
> > >
> > > As above. The power state of the IOMMU should be enitrely irrelevant to the
> > > contents of RAM.
> >
> > Sorry, I should have put a comment here.
> >
> > I'm not quite sure what the difference between a group and domain in
> > the IOMMU framework is, but since this IOMMU can only deal with a
> > single address space, my understanding was that we'd need to allocate
> > a single domain and group, and that the domain was the abstraction
> > tied to an address space (since it's what is passed as an argument to
> > map).
>
> That's correct, a domain is effectively an address space, while groups
> represents sets of devices that the IOMMU can isolate from each other.
> IOMMUs like this one (and the MediaTek M4U in mtk_iommu.c) are a little
> hard-done-by in that they do actually have some finer-grained isolation on a
> basic allow/deny level, but the API really assumes that isolation happens at
> the address space level, so it's easier to ignore it and just use the
> single-group model anyway.
>
> The really neat advantage of having a guaranteed single group, though, is
> that you then no longer need to care about address spaces: since the group
> can only ever be attached to one domain at a time, you can have as many
> domains as you like, and handle it by having the first attach_dev call on a
> given domain context-switch that pagetable into the hardware. That's more or
> less what you've done already, which is good, it would just benefit from
> that context-switching being done in a more robust and obvious manner :)

Got it, thanks :)

> > So, given this, what made since was to allocate the directory table
> > buffer at domain_alloc time and map it. But then, domain_alloc seems
> > to not have any pointer back to the iommu we registered for some
> > reason (I guess that a domain could be shared across multiple
> > IOMMUs?), and so we don't have access to our IOMMU's struct device.
>
> I'll spare you the unproductive "Robin complains bitterly about the
> iommu_domain_alloc() interface being terrible, episode #27"...
>
> You'll see two main ways that existing drivers work around that - if you're
> happy to assume that you'll only ever have one IOMMU instance, or that all
> instances will always be functionally equal, then you can simply keep track
> of any old IOMMU device handle for DMA purposes (e.g. exynos_iommu);
> otherwise if you might need to cope with multiple IOMMU instances having
> different DMA capabilities then deferring instance-specific setup to the
> first device attach is the de-facto standard (e.g. arm-smmu).

I don't have any idea on how it's going to evolve, and the latter
seems cleaner, I'll work on that.

> > Also, a more general question. One of the cleanups I wanted to do was
> > to remove the kmem_cache in favour of a dma_pool, which triggered that
> > test. It looks like with a dma_pool, the physical address and dma
> > address are not the same, even though the IOMMU is directly connected
> > to the RAM so there should be no intermediate mapping. Do you know
> > why?
>
> DMA pools are backed by dma_alloc_coherent, so (at least on arm64) the
> virtual address you get will be a non-cacheable remap (assuming a
> non-coherent device), and thus calling virt_to_phys() on it is bogus and
> will give you nonsense.

Going further off-topic, why do we need a remap instead of a regular
physical address?

Thanks!
Maxime
diff mbox series

Patch

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 0b9d78a0f3ac..5cbfa6f282e2 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -289,6 +289,16 @@  config ROCKCHIP_IOMMU
 	  Say Y here if you are using a Rockchip SoC that includes an IOMMU
 	  device.
 
+config SUN50I_IOMMU
+	bool "Allwinner H6 IOMMU Support"
+	depends on ARM || ARM64
+	depends on ARCH_SUNXI
+	select ARM_DMA_USE_IOMMU
+	select IOMMU_API
+	select IOMMU_DMA
+	help
+	  Support for the IOMMU introduced in the Allwinner H6 SoCs.
+
 config TEGRA_IOMMU_GART
 	bool "Tegra GART IOMMU Support"
 	depends on ARCH_TEGRA_2x_SOC
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 97814cc861ea..43740a755786 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -28,6 +28,7 @@  obj-$(CONFIG_MTK_IOMMU_V1) += mtk_iommu_v1.o
 obj-$(CONFIG_OMAP_IOMMU) += omap-iommu.o
 obj-$(CONFIG_OMAP_IOMMU_DEBUG) += omap-iommu-debug.o
 obj-$(CONFIG_ROCKCHIP_IOMMU) += rockchip-iommu.o
+obj-$(CONFIG_SUN50I_IOMMU) += sun50i-iommu.o
 obj-$(CONFIG_TEGRA_IOMMU_GART) += tegra-gart.o
 obj-$(CONFIG_TEGRA_IOMMU_SMMU) += tegra-smmu.o
 obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o
diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c
new file mode 100644
index 000000000000..ffca92628006
--- /dev/null
+++ b/drivers/iommu/sun50i-iommu.c
@@ -0,0 +1,1126 @@ 
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+// Copyright (C) 2016-2018, Allwinner Technology CO., LTD.
+// Copyright (C) 2019-2020, Cerno
+
+#include <linux/bug.h>
+#include <linux/clk.h>
+#include <linux/device.h>
+#include <linux/dma-direction.h>
+#include <linux/dma-iommu.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/iommu.h>
+#include <linux/iopoll.h>
+#include <linux/ioport.h>
+#include <linux/log2.h>
+#include <linux/module.h>
+#include <linux/of_platform.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/pm_runtime.h>
+#include <linux/reset.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#define IOMMU_RESET_REG			0x010
+#define IOMMU_ENABLE_REG		0x020
+#define IOMMU_ENABLE_ENABLE			BIT(0)
+
+#define IOMMU_BYPASS_REG		0x030
+#define IOMMU_AUTO_GATING_REG		0x040
+#define IOMMU_AUTO_GATING_ENABLE		BIT(0)
+
+#define IOMMU_WBUF_CTRL_REG		0x044
+#define IOMMU_OOO_CTRL_REG		0x048
+#define IOMMU_4KB_BDY_PRT_CTRL_REG	0x04c
+#define IOMMU_TTB_REG			0x050
+#define IOMMU_TLB_ENABLE_REG		0x060
+#define IOMMU_TLB_PREFETCH_REG		0x070
+#define IOMMU_TLB_PREFETCH_MASTER_ENABLE(m)	BIT(m)
+
+#define IOMMU_TLB_FLUSH_REG		0x080
+#define IOMMU_TLB_FLUSH_PTW_CACHE		BIT(17)
+#define IOMMU_TLB_FLUSH_MACRO_TLB		BIT(16)
+#define IOMMU_TLB_FLUSH_MICRO_TLB(i)		(BIT(i) & GENMASK(5, 0))
+
+#define IOMMU_TLB_IVLD_ADDR_REG		0x090
+#define IOMMU_TLB_IVLD_ADDR_MASK_REG	0x094
+#define IOMMU_TLB_IVLD_ENABLE_REG	0x098
+#define IOMMU_TLB_IVLD_ENABLE_ENABLE		BIT(0)
+
+#define IOMMU_PC_IVLD_ADDR_REG		0x0a0
+#define IOMMU_PC_IVLD_ENABLE_REG	0x0a8
+#define IOMMU_PC_IVLD_ENABLE_ENABLE		BIT(0)
+
+#define IOMMU_DM_AUT_CTRL_REG(d)	(0x0b0 + ((d) / 2) * 4)
+#define IOMMU_DM_AUT_CTRL_RD_UNAVAIL(d, m)	(1 << (((d & 1) * 16) + ((m) * 2)))
+#define IOMMU_DM_AUT_CTRL_RD_AVAIL(d, m)	(0 << (((d & 1) * 16) + ((m) * 2)))
+#define IOMMU_DM_AUT_CTRL_WR_UNAVAIL(d, m)	(1 << (((d & 1) * 16) + ((m) * 2) + 1))
+#define IOMMU_DM_AUT_CTRL_WR_AVAIL(d, m)	(0 << (((d & 1) * 16) + ((m) * 2) + 1))
+
+#define IOMMU_DM_AUT_OVWT_REG		0x0d0
+#define IOMMU_INT_ENABLE_REG		0x100
+#define IOMMU_INT_CLR_REG		0x104
+#define IOMMU_INT_STA_REG		0x108
+#define IOMMU_INT_ERR_ADDR_REG(i)	(0x110 + (i) * 4)
+#define IOMMU_INT_ERR_ADDR_L1_REG	0x130
+#define IOMMU_INT_ERR_ADDR_L2_REG	0x134
+#define IOMMU_INT_ERR_DATA_REG(i)	(0x150 + (i) * 4)
+#define IOMMU_L1PG_INT_REG		0x0180
+#define IOMMU_L2PG_INT_REG		0x0184
+
+#define IOMMU_INT_INVALID_L2PG			BIT(17)
+#define IOMMU_INT_INVALID_L1PG			BIT(16)
+#define IOMMU_INT_MASTER_PERMISSION(m)		BIT(m)
+#define IOMMU_INT_MASTER_MASK			(IOMMU_INT_MASTER_PERMISSION(0) | \
+						 IOMMU_INT_MASTER_PERMISSION(1) | \
+						 IOMMU_INT_MASTER_PERMISSION(2) | \
+						 IOMMU_INT_MASTER_PERMISSION(3) | \
+						 IOMMU_INT_MASTER_PERMISSION(4) | \
+						 IOMMU_INT_MASTER_PERMISSION(5))
+#define IOMMU_INT_MASK				(IOMMU_INT_INVALID_L1PG | \
+						 IOMMU_INT_INVALID_L2PG | \
+						 IOMMU_INT_MASTER_MASK)
+
+#define PT_ENTRY_SIZE			sizeof(u32)
+
+#define NUM_DT_ENTRIES			4096
+#define DT_SIZE				(NUM_DT_ENTRIES * PT_ENTRY_SIZE)
+
+#define NUM_PT_ENTRIES			256
+#define PT_SIZE				(NUM_PT_ENTRIES * PT_ENTRY_SIZE)
+
+/* Driver state for one IOMMU instance */
+struct sun50i_iommu {
+	struct iommu_device iommu;
+
+	/* Lock to modify the IOMMU registers */
+	spinlock_t iommu_lock;
+
+	struct device *dev;		/* device backing this IOMMU */
+	void __iomem *base;		/* MMIO register window */
+	struct reset_control *reset;
+	struct clk *clk;
+
+	struct iommu_domain *domain;	/* domain currently attached (set in attach_dev) */
+	struct iommu_group *group;	/* single group for all masters */
+	struct kmem_cache *pt_pool;	/* allocator for PT_SIZE L2 page tables */
+};
+
+/* One address space: wraps the generic iommu_domain and owns the L1 table */
+struct sun50i_iommu_domain {
+	struct iommu_domain domain;
+
+	/* Lock to modify the Directory Table */
+	spinlock_t dt_lock;
+
+	/* L1 Page Table */
+	u32 *dt;
+	dma_addr_t dt_dma;	/* DMA address of @dt as seen by the IOMMU */
+
+	struct sun50i_iommu *iommu;	/* back-pointer, set at attach time */
+};
+
+/* Convert a generic iommu_domain into our private domain structure */
+static struct sun50i_iommu_domain *to_sun50i_domain(struct iommu_domain *domain)
+{
+	return container_of(domain, struct sun50i_iommu_domain, domain);
+}
+
+/* Return the IOMMU instance a master device was bound to, or NULL */
+static struct sun50i_iommu *sun50i_iommu_from_dev(struct device *dev)
+{
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+
+	return fwspec ? fwspec->iommu_priv : NULL;
+}
+
+/* Read a 32-bit IOMMU register at @offset */
+static inline u32 iommu_read(struct sun50i_iommu *iommu, u32 offset)
+{
+	return readl(iommu->base + offset);
+}
+
+/* Write @value to the 32-bit IOMMU register at @offset */
+static inline void iommu_write(struct sun50i_iommu *iommu,
+			       u32 offset, u32 value)
+{
+	writel(value, iommu->base + offset);
+}
+
+/*
+ * The Allwinner H6 IOMMU uses a 2-level page table.
+ *
+ * The first level is the usual Directory Table (DT), that consists of
+ * 4096 4-byte Directory Table Entries (DTE), each pointing to a Page
+ * Table (PT).
+ *
+ * Each PT consists of 256 4-byte Page Table Entries (PTE), each
+ * pointing to a 4kB page of physical memory.
+ *
+ * The IOMMU supports a single DT, pointed by the IOMMU_TTB_REG
+ * register that contains its physical address.
+ */
+
+#define SUN50I_IOVA_DTE_MASK	GENMASK(31, 20)
+#define SUN50I_IOVA_DTE_SHIFT	20
+#define SUN50I_IOVA_PTE_MASK	GENMASK(19, 12)
+#define SUN50I_IOVA_PTE_SHIFT	12
+#define SUN50I_IOVA_PAGE_MASK	GENMASK(11, 0)
+#define SUN50I_IOVA_PAGE_SHIFT	0
+
+/* Bits 31:20 of the IOVA select one of the 4096 DT entries */
+static u32 sun50i_iova_dte_index(dma_addr_t iova)
+{
+	return (u32)(iova & SUN50I_IOVA_DTE_MASK) >> SUN50I_IOVA_DTE_SHIFT;
+}
+
+/* Bits 19:12 of the IOVA select one of the 256 PT entries */
+static u32 sun50i_iova_pte_index(dma_addr_t iova)
+{
+	return (u32)(iova & SUN50I_IOVA_PTE_MASK) >> SUN50I_IOVA_PTE_SHIFT;
+}
+
+/* Bits 11:0 of the IOVA are the byte offset within the 4kB page */
+static u32 sun50i_iova_page_offset(dma_addr_t iova)
+{
+	return (u32)(iova & SUN50I_IOVA_PAGE_MASK) >> SUN50I_IOVA_PAGE_SHIFT;
+}
+
+/*
+ * Each Directory Table Entry has a Page Table address and a valid
+ * bit:
+
+ * +---------------------+-----------+-+
+ * | PT address          | Reserved  |V|
+ * +---------------------+-----------+-+
+ *  31:10 - Page Table address
+ *   9:2  - Reserved
+ *   1:0  - 1 if the entry is valid
+ */
+
+#define SUN50I_DTE_PT_ADDRESS_MASK	GENMASK(31, 10)
+#define SUN50I_DTE_PT_ATTRS		GENMASK(1, 0)
+#define SUN50I_DTE_PT_VALID		BIT(0)
+
+/* Extract the physical Page Table address (bits 31:10) from a DTE */
+static inline phys_addr_t sun50i_dte_pt_address(u32 dte)
+{
+	return (phys_addr_t)dte & SUN50I_DTE_PT_ADDRESS_MASK;
+}
+
+/* A DTE is valid when its attribute bits (1:0) are exactly 0b01 */
+static inline bool sun50i_dte_is_pt_valid(u32 dte)
+{
+	return (dte & SUN50I_DTE_PT_ATTRS) == SUN50I_DTE_PT_VALID;
+}
+
+/* Build a valid DTE from the DMA address of an L2 page table */
+static inline u32 sun50i_mk_dte(dma_addr_t pt_dma)
+{
+	return (pt_dma & SUN50I_DTE_PT_ADDRESS_MASK) | SUN50I_DTE_PT_VALID;
+}
+
+/*
+ * Each PTE has a Page address, an authority index and a valid bit:
+ *
+ * +----------------+-----+-----+-----+---+-----+
+ * | Page address   | Rsv | ACI | Rsv | V | Rsv |
+ * +----------------+-----+-----+-----+---+-----+
+ *  31:12 - Page address
+ *  11:8  - Reserved
+ *   7:4  - Authority Control Index
+ *   3:2  - Reserved
+ *     1  - 1 if the entry is valid
+ *     0  - Reserved
+ *
+ * The way permissions work is that the IOMMU has 16 "domains" that
+ * can be configured to give each masters either read or write
+ * permissions through the IOMMU_DM_AUT_CTRL_REG registers. The domain
+ * 0 seems like the default domain, and its permissions in the
+ * IOMMU_DM_AUT_CTRL_REG are only read-only, so it's not really
+ * useful to enforce any particular permission.
+ *
+ * Each page entry will then have a reference to the domain it is
+ * assigned to, so that we can actually enforce them on a per-page
+ * basis.
+ *
+ * In order to make it work with the IOMMU framework, we will be using
+ * 4 different domains, starting at 1: RD_WR, RD, WR and NONE
+ * depending on the permission we want to enforce. Each domain will
+ * have each master setup in the same way, since the IOMMU framework
+ * doesn't seem to restrict page access on a per-device basis. And
+ * then we will use the relevant domain index when generating the page
+ * table entry depending on the permissions we want to be enforced.
+ */
+
+enum iommu_aci {
+	IOMMU_ACI_DO_NOT_USE = 0,
+	IOMMU_ACI_NONE,
+	IOMMU_ACI_RD,
+	IOMMU_ACI_WR,
+	IOMMU_ACI_RD_WR,
+};
+
+#define SUN50I_PTE_PAGE_ADDRESS_MASK	GENMASK(31, 12)
+#define SUN50I_PTE_ACI_MASK		GENMASK(7, 4)
+#define SUN50I_PTE_ACI_SHIFT		4
+#define SUN50I_PTE_ACI(aci)		((aci) << SUN50I_PTE_ACI_SHIFT)
+#define SUN50I_PTE_PAGE_VALID		BIT(1)
+
+/* Extract the physical page address (bits 31:12) from a PTE */
+static inline phys_addr_t sun50i_pte_page_address(u32 pte)
+{
+	return (phys_addr_t)pte & SUN50I_PTE_PAGE_ADDRESS_MASK;
+}
+
+/* Extract the Authority Control Index (bits 7:4) from a PTE */
+static inline enum iommu_aci sun50i_pte_aci(u32 pte)
+{
+	return (pte & SUN50I_PTE_ACI_MASK) >> SUN50I_PTE_ACI_SHIFT;
+}
+
+/* A PTE is valid when its valid bit (bit 1) is set */
+static inline bool sun50i_pte_is_page_valid(u32 pte)
+{
+	return pte & SUN50I_PTE_PAGE_VALID;
+}
+
+/*
+ * Build a PTE for physical @page with IOMMU_* permission flags @prot,
+ * encoding the permissions as one of the four ACI domain indices.
+ */
+static u32 sun50i_mk_pte(phys_addr_t page, int prot)
+{
+	enum iommu_aci aci;
+	u32 flags = 0;
+
+	/*
+	 * IOMMU_ACI_RD_WR must only be selected when *both* read and
+	 * write are requested: `prot & (IOMMU_READ | IOMMU_WRITE)` is
+	 * true for either bit alone, which made the read-only and
+	 * write-only branches below unreachable.
+	 */
+	if ((prot & (IOMMU_READ | IOMMU_WRITE)) == (IOMMU_READ | IOMMU_WRITE))
+		aci = IOMMU_ACI_RD_WR;
+	else if (prot & IOMMU_READ)
+		aci = IOMMU_ACI_RD;
+	else if (prot & IOMMU_WRITE)
+		aci = IOMMU_ACI_WR;
+	else
+		aci = IOMMU_ACI_NONE;
+
+	flags |= SUN50I_PTE_ACI(aci);
+	page &= SUN50I_PTE_PAGE_ADDRESS_MASK;
+	return page | flags | SUN50I_PTE_PAGE_VALID;
+}
+
+/*
+ * Sync @count page-table *entries* (not bytes) starting at DMA address
+ * @dma so the IOMMU sees the CPU's latest writes; the tables use
+ * long-lived streaming DMA mappings rather than coherent allocations.
+ */
+static inline void sun50i_table_flush(struct sun50i_iommu_domain *sun50i_domain,
+				      dma_addr_t dma, unsigned int count)
+{
+	struct sun50i_iommu *iommu = sun50i_domain->iommu;
+	size_t size = count * PT_ENTRY_SIZE;
+
+	dma_sync_single_for_device(iommu->dev, dma, size, DMA_TO_DEVICE);
+}
+
+/*
+ * Flush the page-table-walker cache, the macro TLB and all six
+ * per-master micro TLBs, then poll (up to 2ms) for the hardware to
+ * clear the flush register.  Caller must hold iommu->iommu_lock.
+ */
+static int sun50i_iommu_flush_all_tlb(struct sun50i_iommu *iommu)
+{
+	u32 reg;
+	int ret;
+
+	assert_spin_locked(&iommu->iommu_lock);
+
+	iommu_write(iommu,
+		    IOMMU_TLB_FLUSH_REG,
+		    IOMMU_TLB_FLUSH_PTW_CACHE |
+		    IOMMU_TLB_FLUSH_MACRO_TLB |
+		    IOMMU_TLB_FLUSH_MICRO_TLB(5) |
+		    IOMMU_TLB_FLUSH_MICRO_TLB(4) |
+		    IOMMU_TLB_FLUSH_MICRO_TLB(3) |
+		    IOMMU_TLB_FLUSH_MICRO_TLB(2) |
+		    IOMMU_TLB_FLUSH_MICRO_TLB(1) |
+		    IOMMU_TLB_FLUSH_MICRO_TLB(0));
+
+	ret = readl_poll_timeout(iommu->base + IOMMU_TLB_FLUSH_REG,
+				 reg, !reg,
+				 1, 2000);
+	if (ret)
+		dev_err(iommu->dev, "Enable flush all request timed out\n");
+
+	return ret;
+}
+
+/*
+ * Invalidate the TLB entry covering @iova (page-granular address mask)
+ * and poll (up to 2ms) for the hardware to clear the enable bit.
+ * Caller must hold iommu->iommu_lock.
+ */
+static int sun50i_iommu_tlb_invalidate(struct sun50i_iommu *iommu,
+				       dma_addr_t iova)
+{
+	int ret;
+	u32 reg;
+
+	assert_spin_locked(&iommu->iommu_lock);
+
+	iommu_write(iommu, IOMMU_TLB_IVLD_ADDR_REG, iova);
+	iommu_write(iommu, IOMMU_TLB_IVLD_ADDR_MASK_REG,
+		    SUN50I_PTE_PAGE_ADDRESS_MASK);
+	iommu_write(iommu, IOMMU_TLB_IVLD_ENABLE_REG,
+		    IOMMU_TLB_IVLD_ENABLE_ENABLE);
+
+	ret = readl_poll_timeout(iommu->base + IOMMU_TLB_IVLD_ENABLE_REG,
+				 reg, !(reg & IOMMU_TLB_IVLD_ENABLE_ENABLE),
+				 1, 2000);
+	if (ret)
+		dev_err(iommu->dev, "TLB Invalid timed out\n");
+
+	return ret;
+}
+
+/*
+ * Invalidate the page-table-walker cache entry for @iova and poll
+ * (up to 2ms) for completion.  Caller must hold iommu->iommu_lock.
+ */
+static int sun50i_iommu_ptw_invalidate(struct sun50i_iommu *iommu,
+				       dma_addr_t iova)
+{
+	int ret;
+	u32 reg;
+
+	assert_spin_locked(&iommu->iommu_lock);
+
+	iommu_write(iommu, IOMMU_PC_IVLD_ADDR_REG, iova);
+	iommu_write(iommu, IOMMU_PC_IVLD_ENABLE_REG,
+		    IOMMU_PC_IVLD_ENABLE_ENABLE);
+
+	ret = readl_poll_timeout(iommu->base + IOMMU_PC_IVLD_ENABLE_REG,
+				 reg, !(reg & IOMMU_PC_IVLD_ENABLE_ENABLE),
+				 1, 2000);
+	if (ret)
+		dev_err(iommu->dev, "PTW cache invalid timed out\n");
+
+	return ret;
+}
+
+/*
+ * Return the CPU virtual address of the L2 page table covering @iova,
+ * allocating it, DMA-mapping it and hooking it into the DT if it
+ * doesn't exist yet.  Returns an ERR_PTR on allocation or mapping
+ * failure.  Caller must hold sun50i_domain->dt_lock.
+ */
+static u32 *sun50i_dte_get_page_table(struct sun50i_iommu_domain *sun50i_domain,
+				      dma_addr_t iova)
+{
+	struct sun50i_iommu *iommu = sun50i_domain->iommu;
+	u32 *page_table, *dte_addr;
+	phys_addr_t pt_phys;
+	dma_addr_t pt_dma;
+	u32 dte_index, dte;
+
+	assert_spin_locked(&sun50i_domain->dt_lock);
+
+	dte_index = sun50i_iova_dte_index(iova);
+	dte_addr = &sun50i_domain->dt[dte_index];
+	dte = *dte_addr;
+	if (sun50i_dte_is_pt_valid(dte))
+		goto done;
+
+	/* GFP_ATOMIC: we may be called under the dt_lock from map() */
+	page_table = kmem_cache_zalloc(iommu->pt_pool, GFP_ATOMIC);
+	if (!page_table)
+		return ERR_PTR(-ENOMEM);
+
+	pt_dma = dma_map_single(iommu->dev, page_table, PT_SIZE, DMA_TO_DEVICE);
+	if (dma_mapping_error(iommu->dev, pt_dma)) {
+		dev_err(iommu->dev, "Couldn't map L2 Page Table\n");
+		kmem_cache_free(iommu->pt_pool, page_table);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* We rely on the physical address and DMA address being the same */
+	WARN_ON(pt_dma != virt_to_phys(page_table));
+
+	dte = sun50i_mk_dte(pt_dma);
+	*dte_addr = dte;
+	/*
+	 * sun50i_table_flush() takes a number of entries, not bytes:
+	 * passing PT_SIZE here synced PT_SIZE * PT_ENTRY_SIZE bytes,
+	 * four times the table and past the end of the streaming
+	 * mapping.  Sync the whole new table and the DTE pointing to it.
+	 */
+	sun50i_table_flush(sun50i_domain, pt_dma, NUM_PT_ENTRIES);
+	sun50i_table_flush(sun50i_domain, virt_to_phys(dte_addr), 1);
+
+done:
+	pt_phys = sun50i_dte_pt_address(dte);
+	return (u32 *)phys_to_virt(pt_phys);
+}
+
+/*
+ * Map one 4K page at @iova to @paddr with @prot permissions.
+ * Returns 0, -ENOMEM if the L2 table can't be allocated, or
+ * -EADDRINUSE if the PTE is already populated.
+ */
+static int sun50i_iommu_map(struct iommu_domain *domain, unsigned long iova,
+			    phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
+{
+	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
+	struct sun50i_iommu *iommu = sun50i_domain->iommu;
+	u32 pte_index;
+	u32 *page_table, *pte_addr;
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(&sun50i_domain->dt_lock, flags);
+	page_table = sun50i_dte_get_page_table(sun50i_domain, iova);
+	if (IS_ERR(page_table)) {
+		ret = PTR_ERR(page_table);
+		goto out;
+	}
+
+	pte_index = sun50i_iova_pte_index(iova);
+	pte_addr = &page_table[pte_index];
+	if (sun50i_pte_is_page_valid(*pte_addr)) {
+		phys_addr_t page_phys = sun50i_pte_page_address(*pte_addr);
+		dev_err(iommu->dev,
+			"iova %pad already mapped to %pa cannot remap to %pa prot: %#x\n",
+			&iova, &page_phys, &paddr, prot);
+		ret = -EADDRINUSE;
+		goto out;
+	}
+
+	*pte_addr = sun50i_mk_pte(paddr, prot);
+	sun50i_table_flush(sun50i_domain, virt_to_phys(pte_addr), 1);
+
+	/*
+	 * IRQs are already disabled by the dt_lock irqsave above, and
+	 * reusing @flags for a nested irqsave would clobber the saved
+	 * state restored at "out:".  Take the hardware lock with a plain
+	 * spin_lock(), as sun50i_iommu_unmap() does.
+	 */
+	spin_lock(&iommu->iommu_lock);
+	sun50i_iommu_tlb_invalidate(iommu, iova);
+	sun50i_iommu_ptw_invalidate(iommu, iova);
+	spin_unlock(&iommu->iommu_lock);
+
+out:
+	spin_unlock_irqrestore(&sun50i_domain->dt_lock, flags);
+	return ret;
+}
+
+/*
+ * Unmap the single 4K page at @iova.  Returns the number of bytes
+ * unmapped (SZ_4K), or 0 if nothing was mapped there.  @size is
+ * effectively ignored: pgsize_bitmap restricts callers to 4K chunks.
+ */
+static size_t sun50i_iommu_unmap(struct iommu_domain *domain, unsigned long iova,
+				 size_t size, struct iommu_iotlb_gather *gather)
+{
+	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
+	struct sun50i_iommu *iommu = sun50i_domain->iommu;
+	unsigned long flags;
+	phys_addr_t pt_phys;
+	dma_addr_t pte_dma;
+	u32 *pte_addr;
+	u32 dte;
+
+	spin_lock_irqsave(&sun50i_domain->dt_lock, flags);
+
+	/* No L2 table for this IOVA: nothing was ever mapped here. */
+	dte = sun50i_domain->dt[sun50i_iova_dte_index(iova)];
+	if (!sun50i_dte_is_pt_valid(dte)) {
+		spin_unlock_irqrestore(&sun50i_domain->dt_lock, flags);
+		return 0;
+	}
+
+	pt_phys = sun50i_dte_pt_address(dte);
+	pte_addr = (u32 *)phys_to_virt(pt_phys) + sun50i_iova_pte_index(iova);
+	pte_dma = pt_phys + sun50i_iova_pte_index(iova) * PT_ENTRY_SIZE;
+
+	if (!sun50i_pte_is_page_valid(*pte_addr)) {
+		spin_unlock_irqrestore(&sun50i_domain->dt_lock, flags);
+		return 0;
+	}
+
+	/* Clear the PTE and push it out to the hardware walker. */
+	memset(pte_addr, 0, sizeof(*pte_addr));
+	sun50i_table_flush(sun50i_domain, virt_to_phys(pte_addr), 1);
+
+	/* IRQs are already off (dt_lock irqsave), plain lock suffices. */
+	spin_lock(&iommu->iommu_lock);
+	sun50i_iommu_tlb_invalidate(iommu, iova);
+	sun50i_iommu_ptw_invalidate(iommu, iova);
+	spin_unlock(&iommu->iommu_lock);
+
+	spin_unlock_irqrestore(&sun50i_domain->dt_lock, flags);
+
+	return SZ_4K;
+}
+
+/* Walk the two-level tables and translate @iova; 0 if unmapped. */
+static phys_addr_t sun50i_iommu_iova_to_phys(struct iommu_domain *domain,
+					     dma_addr_t iova)
+{
+	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
+	phys_addr_t phys = 0;
+	unsigned long flags;
+	u32 dte, pte;
+
+	spin_lock_irqsave(&sun50i_domain->dt_lock, flags);
+
+	dte = sun50i_domain->dt[sun50i_iova_dte_index(iova)];
+	if (sun50i_dte_is_pt_valid(dte)) {
+		u32 *pt = (u32 *)phys_to_virt(sun50i_dte_pt_address(dte));
+
+		pte = pt[sun50i_iova_pte_index(iova)];
+		if (sun50i_pte_is_page_valid(pte))
+			phys = sun50i_pte_page_address(pte) +
+			       sun50i_iova_page_offset(iova);
+	}
+
+	spin_unlock_irqrestore(&sun50i_domain->dt_lock, flags);
+
+	return phys;
+}
+
+/*
+ * Allocate a DMA or unmanaged domain along with its L1 directory
+ * table.  Returns NULL on unsupported type or allocation failure.
+ */
+static struct iommu_domain *sun50i_iommu_domain_alloc(unsigned type)
+{
+	struct sun50i_iommu_domain *sun50i_domain;
+
+	if (type != IOMMU_DOMAIN_DMA && type != IOMMU_DOMAIN_UNMANAGED)
+		return NULL;
+
+	sun50i_domain = kzalloc(sizeof(*sun50i_domain), GFP_KERNEL);
+	if (!sun50i_domain)
+		return NULL;
+
+	if (type == IOMMU_DOMAIN_DMA &&
+	    iommu_get_dma_cookie(&sun50i_domain->domain))
+		goto err_free_domain;
+
+	/*
+	 * Ask the allocator for zeroed pages directly instead of a
+	 * separate memset(): every DTE must start out invalid.
+	 */
+	sun50i_domain->dt = (u32 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+						    get_order(DT_SIZE));
+	if (!sun50i_domain->dt)
+		goto err_put_cookie;
+
+	spin_lock_init(&sun50i_domain->dt_lock);
+
+	/* The hardware translates the full 32-bit IOVA space. */
+	sun50i_domain->domain.geometry.aperture_start = 0;
+	sun50i_domain->domain.geometry.aperture_end = DMA_BIT_MASK(32);
+	sun50i_domain->domain.geometry.force_aperture = true;
+
+	return &sun50i_domain->domain;
+
+err_put_cookie:
+	if (type == IOMMU_DOMAIN_DMA)
+		iommu_put_dma_cookie(&sun50i_domain->domain);
+
+err_free_domain:
+	kfree(sun50i_domain);
+
+	return NULL;
+}
+
+/*
+ * Tear down a domain: free every L2 table, the L1 directory, and the
+ * DMA cookie, then drop any stale hardware TLB state.
+ * NOTE(review): sun50i_domain->iommu is only assigned at attach time,
+ * so freeing a domain that was never attached would dereference NULL
+ * below -- confirm this cannot happen or guard against it.
+ */
+static void sun50i_iommu_domain_free(struct iommu_domain *domain)
+{
+	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
+	struct sun50i_iommu *iommu = sun50i_domain->iommu;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&sun50i_domain->dt_lock, flags);
+
+	for (i = 0; i < NUM_DT_ENTRIES; i++) {
+		phys_addr_t pt_phys;
+		u32 *page_table;
+		u32 *dte_addr;
+		u32 dte;
+
+		dte_addr = &sun50i_domain->dt[i];
+		dte = *dte_addr;
+		if (!sun50i_dte_is_pt_valid(dte))
+			continue;
+
+		/* Invalidate the DTE before releasing the table it names. */
+		memset(dte_addr, 0, sizeof(*dte_addr));
+		sun50i_table_flush(sun50i_domain, virt_to_phys(dte_addr), 1);
+
+		pt_phys = sun50i_dte_pt_address(dte);
+		dma_unmap_single(iommu->dev, pt_phys, PT_SIZE, DMA_TO_DEVICE);
+
+		page_table = phys_to_virt(pt_phys);
+		kmem_cache_free(iommu->pt_pool, page_table);
+	}
+
+	free_pages((unsigned long)sun50i_domain->dt, get_order(DT_SIZE));
+	sun50i_domain->dt = NULL;
+
+	/* Drop any TLB entries that still reference the old tables. */
+	spin_lock(&iommu->iommu_lock);
+	sun50i_iommu_flush_all_tlb(iommu);
+	spin_unlock(&iommu->iommu_lock);
+
+	spin_unlock_irqrestore(&sun50i_domain->dt_lock, flags);
+	iommu_put_dma_cookie(domain);
+
+	kfree(sun50i_domain);
+}
+
+/*
+ * Detach @dev from @domain.  Only acts (and logs) when @domain is the
+ * domain currently live on the IOMMU; releases the runtime PM
+ * reference taken at attach time.
+ */
+static void sun50i_iommu_detach_device(struct iommu_domain *domain,
+				       struct device *dev)
+{
+	struct sun50i_iommu *iommu;
+
+	iommu = sun50i_iommu_from_dev(dev);
+	if (!iommu)
+		return;
+
+	/*
+	 * Check first, log second: the original printed "Detaching" even
+	 * when @domain wasn't the live one and nothing happened.
+	 */
+	if (iommu->domain != domain)
+		return;
+
+	dev_info(dev, "Detaching from IOMMU domain\n");
+
+	pm_runtime_put_sync(iommu->dev);
+
+	iommu->domain = NULL;
+}
+
+/*
+ * Attach @dev to @domain, powering the IOMMU up via runtime PM.
+ * Returns 0 on success or a negative errno.
+ */
+static int sun50i_iommu_attach_device(struct iommu_domain *domain,
+				      struct device *dev)
+{
+	struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain);
+	struct sun50i_iommu *iommu;
+	int ret;
+
+	iommu = sun50i_iommu_from_dev(dev);
+	if (!iommu)
+		return -ENODEV;
+
+	dev_info(dev, "Attaching to IOMMU domain\n");
+
+	if (iommu->domain == domain)
+		return 0;
+
+	if (iommu->domain)
+		sun50i_iommu_detach_device(iommu->domain, dev);
+
+	/* Set before resuming: the resume callback programs iommu->domain. */
+	iommu->domain = domain;
+	sun50i_domain->iommu = iommu;
+
+	/*
+	 * pm_runtime_get_sync() returns 1 when the device was already
+	 * active; returning that to the core would be taken as a
+	 * failure.  Only propagate negative errors, and balance the
+	 * usage counter (which is bumped even on failure) when we do.
+	 */
+	ret = pm_runtime_get_sync(iommu->dev);
+	if (ret < 0) {
+		pm_runtime_put_noidle(iommu->dev);
+		iommu->domain = NULL;
+		return ret;
+	}
+
+	return 0;
+}
+
+/* Register a master device with the (single, shared) IOMMU group. */
+static int sun50i_iommu_add_device(struct device *dev)
+{
+	struct iommu_group *group;
+
+	/* Ignore devices that don't sit behind this IOMMU. */
+	if (!sun50i_iommu_from_dev(dev))
+		return -ENODEV;
+
+	group = iommu_group_get_for_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	/* We only needed membership, not the group reference itself. */
+	iommu_group_put(group);
+
+	return 0;
+}
+
+/* Unregister a master device from its IOMMU group. */
+static void sun50i_iommu_remove_device(struct device *dev)
+{
+	/* Only devices behind this IOMMU were ever added. */
+	if (!sun50i_iommu_from_dev(dev))
+		return;
+
+	iommu_group_remove_device(dev);
+}
+
+/*
+ * All masters share one translation context, so every device gets a
+ * reference to the same group.
+ */
+static struct iommu_group *sun50i_iommu_device_group(struct device *dev)
+{
+	struct sun50i_iommu *iommu = sun50i_iommu_from_dev(dev);
+
+	return iommu ? iommu_group_ref_get(iommu->group) : NULL;
+}
+
+/*
+ * Translate a DT "iommus" phandle into the per-device fwspec: stash
+ * the IOMMU's driver data and record the master ID.
+ */
+static int sun50i_iommu_of_xlate(struct device *dev,
+				 struct of_phandle_args *args)
+{
+	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+	struct platform_device *iommu_pdev;
+	unsigned id = args->args[0];
+
+	iommu_pdev = of_find_device_by_node(args->np);
+	if (WARN_ON(!iommu_pdev))
+		return -EINVAL;
+
+	fwspec->iommu_priv = platform_get_drvdata(iommu_pdev);
+
+	/*
+	 * of_find_device_by_node() takes a reference on the device; the
+	 * original never dropped it, leaking a refcount per master.  The
+	 * drvdata stays valid since the driver suppresses unbinding.
+	 * NOTE(review): iommu_priv may be NULL if the IOMMU hasn't
+	 * probed yet -- consider returning -EPROBE_DEFER in that case.
+	 */
+	put_device(&iommu_pdev->dev);
+
+	return iommu_fwspec_add_ids(dev, &id, 1);
+}
+
+/*
+ * Callbacks shared by every master behind the IOMMU.  The hardware
+ * page tables only support 4K pages, hence the pgsize_bitmap.  The
+ * table is never written at runtime, so make it const (both
+ * iommu_device_set_ops() and bus_set_iommu() take const pointers).
+ */
+static const struct iommu_ops sun50i_iommu_ops = {
+	.pgsize_bitmap	= SZ_4K,
+	.map		= sun50i_iommu_map,
+	.unmap		= sun50i_iommu_unmap,
+	.domain_alloc	= sun50i_iommu_domain_alloc,
+	.domain_free	= sun50i_iommu_domain_free,
+	.attach_dev	= sun50i_iommu_attach_device,
+	.detach_dev	= sun50i_iommu_detach_device,
+	.add_device	= sun50i_iommu_add_device,
+	.remove_device	= sun50i_iommu_remove_device,
+	.device_group	= sun50i_iommu_device_group,
+	.of_xlate	= sun50i_iommu_of_xlate,
+	.iova_to_phys	= sun50i_iommu_iova_to_phys,
+};
+
+/*
+ * Log a page fault and forward it to the attached domain's fault
+ * handler, if any.
+ */
+static void sun50i_iommu_report_fault(struct sun50i_iommu *iommu,
+				      unsigned master, phys_addr_t iova,
+				      unsigned prot)
+{
+	/* 'master' is unsigned: print it with %u, not %d. */
+	dev_err(iommu->dev, "Page fault for %pad (master %u, dir %s)\n",
+		&iova, master, (prot == IOMMU_FAULT_WRITE) ? "wr" : "rd");
+
+	if (iommu->domain)
+		report_iommu_fault(iommu->domain, iommu->dev, iova, prot);
+	else
+		dev_err(iommu->dev, "Page fault while iommu not attached to any domain?\n");
+}
+
+/*
+ * Handle an invalid L1/L2 page table access: report the fault and
+ * return the faulting IOVA so the caller can invalidate caches.
+ *
+ * @addr_reg:  register holding the faulting address
+ * @blame_reg: register holding the at-fault master bits
+ */
+static phys_addr_t sun50i_iommu_handle_pt_irq(struct sun50i_iommu *iommu,
+					      unsigned addr_reg,
+					      unsigned blame_reg)
+{
+	phys_addr_t iova;
+	unsigned master;
+	u32 blame;
+
+	assert_spin_locked(&iommu->iommu_lock);
+
+	iova = iommu_read(iommu, addr_reg);
+	blame = iommu_read(iommu, blame_reg);
+	/*
+	 * NOTE(review): ilog2(0) is undefined if no master bit is set --
+	 * confirm the hardware always reports exactly one at-fault master.
+	 */
+	master = ilog2(blame & IOMMU_INT_MASTER_MASK);
+
+	/*
+	 * If the address is not in the page table, we can't get what
+	 * operation triggered the fault. Assume it's a read
+	 * operation.
+	 */
+	sun50i_iommu_report_fault(iommu, master, iova, IOMMU_FAULT_READ);
+
+	return iova;
+}
+
+/*
+ * Handle a permission fault: recover the faulting master, IOVA and
+ * likely access direction from the interrupt registers, report the
+ * fault, and return the IOVA for cache invalidation.  The direction
+ * is inferred from the access-control domain (ACI) of the faulting
+ * PTE, so it is a best guess for the write-only/none/rd-wr domains.
+ */
+static phys_addr_t sun50i_iommu_handle_perm_irq(struct sun50i_iommu *iommu)
+{
+	enum iommu_aci aci;
+	phys_addr_t iova;
+	unsigned master;
+	unsigned dir;
+	u32 blame;
+
+	assert_spin_locked(&iommu->iommu_lock);
+
+	blame = iommu_read(iommu, IOMMU_INT_STA_REG);
+	master = ilog2(blame & IOMMU_INT_MASTER_MASK);
+	iova = iommu_read(iommu, IOMMU_INT_ERR_ADDR_REG(master));
+	aci = sun50i_pte_aci(iommu_read(iommu, IOMMU_INT_ERR_DATA_REG(master)));
+
+	switch (aci) {
+		/*
+		 * If we are in the read-only domain, then it means we
+		 * tried to write.
+		 */
+	case IOMMU_ACI_RD:
+		dir = IOMMU_FAULT_WRITE;
+		break;
+
+		/*
+		 * If we are in the write-only domain, then it means
+		 * we tried to read.
+		 */
+	case IOMMU_ACI_WR:
+
+		/*
+		 * If we are in the domain without any permission, we
+		 * can't really tell. Let's default to a read
+		 * operation.
+		 */
+	case IOMMU_ACI_NONE:
+
+		/* WTF? */
+	case IOMMU_ACI_RD_WR:
+	default:
+		/* The three cases above intentionally share this default. */
+		dir = IOMMU_FAULT_READ;
+		break;
+	}
+
+	/*
+	 * If the address is not in the page table, we can't get what
+	 * operation triggered the fault. Assume it's a read
+	 * operation.
+	 */
+	sun50i_iommu_report_fault(iommu, master, iova, dir);
+
+	return iova;
+}
+
+/*
+ * Top-level interrupt handler: classify the fault (invalid L2 / L1
+ * page table entry, otherwise a permission fault), report it, flush
+ * the caches for the faulting IOVA, and acknowledge the interrupt.
+ */
+static irqreturn_t sun50i_iommu_irq(int irq, void *dev_id)
+{
+	struct sun50i_iommu *iommu = dev_id;
+	phys_addr_t iova;
+	u32 status;
+
+	spin_lock(&iommu->iommu_lock);
+
+	/* Not our interrupt if none of our status bits are set. */
+	status = iommu_read(iommu, IOMMU_INT_STA_REG);
+	if (!(status & IOMMU_INT_MASK)) {
+		spin_unlock(&iommu->iommu_lock);
+		return IRQ_NONE;
+	}
+
+	if (status & IOMMU_INT_INVALID_L2PG)
+		iova = sun50i_iommu_handle_pt_irq(iommu,
+						  IOMMU_INT_ERR_ADDR_L2_REG,
+						  IOMMU_L2PG_INT_REG);
+	else if (status & IOMMU_INT_INVALID_L1PG)
+		iova = sun50i_iommu_handle_pt_irq(iommu,
+						  IOMMU_INT_ERR_ADDR_L1_REG,
+						  IOMMU_L1PG_INT_REG);
+	else
+		iova = sun50i_iommu_handle_perm_irq(iommu);
+
+	/* Drop any cached translation for the faulting address. */
+	sun50i_iommu_tlb_invalidate(iommu, iova);
+	sun50i_iommu_ptw_invalidate(iommu, iova);
+
+	iommu_write(iommu, IOMMU_INT_CLR_REG, status);
+
+	/*
+	 * NOTE(review): the reset toggle (~status then status) follows
+	 * the vendor sequence -- confirm its exact semantics against the
+	 * H6 manual.
+	 */
+	iommu_write(iommu, IOMMU_RESET_REG, ~status);
+	iommu_write(iommu, IOMMU_RESET_REG, status);
+
+	spin_unlock(&iommu->iommu_lock);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Probe: allocate the page-table cache and group, map registers, grab
+ * clock/reset/IRQ resources, and register with the IOMMU core.  The
+ * hardware itself is only programmed on runtime resume.
+ */
+static int sun50i_iommu_probe(struct platform_device *pdev)
+{
+	struct sun50i_iommu *iommu;
+	struct resource *res;
+	int ret, irq;
+
+	iommu = devm_kzalloc(&pdev->dev, sizeof(*iommu), GFP_KERNEL);
+	if (!iommu)
+		return -ENOMEM;
+	spin_lock_init(&iommu->iommu_lock);
+	platform_set_drvdata(pdev, iommu);
+	iommu->dev = &pdev->dev;
+
+	/* L2 tables must be naturally aligned for the hardware walker. */
+	iommu->pt_pool = kmem_cache_create(dev_name(&pdev->dev),
+					   PT_SIZE, PT_SIZE,
+					   SLAB_HWCACHE_ALIGN,
+					   NULL);
+	if (!iommu->pt_pool)
+		return -ENOMEM;
+
+	iommu->group = iommu_group_alloc();
+	if (IS_ERR(iommu->group)) {
+		ret = PTR_ERR(iommu->group);
+		goto err_free_cache;
+	}
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	iommu->base = devm_ioremap_resource(&pdev->dev, res);
+	/*
+	 * devm_ioremap_resource() returns an ERR_PTR() on failure, never
+	 * NULL, so the error must be detected with IS_ERR() -- the
+	 * original NULL check could never fire.
+	 */
+	if (IS_ERR(iommu->base)) {
+		ret = PTR_ERR(iommu->base);
+		goto err_free_group;
+	}
+
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0) {
+		ret = irq;
+		goto err_free_group;
+	}
+
+	iommu->clk = devm_clk_get(&pdev->dev, NULL);
+	if (IS_ERR(iommu->clk)) {
+		dev_err(&pdev->dev, "Couldn't get our clock.\n");
+		ret = PTR_ERR(iommu->clk);
+		goto err_free_group;
+	}
+
+	iommu->reset = devm_reset_control_get(&pdev->dev, NULL);
+	if (IS_ERR(iommu->reset)) {
+		dev_err(&pdev->dev, "Couldn't get our reset line.\n");
+		ret = PTR_ERR(iommu->reset);
+		goto err_free_group;
+	}
+
+	ret = iommu_device_sysfs_add(&iommu->iommu, &pdev->dev,
+				     NULL, dev_name(&pdev->dev));
+	if (ret)
+		goto err_free_group;
+
+	iommu_device_set_ops(&iommu->iommu, &sun50i_iommu_ops);
+	iommu_device_set_fwnode(&iommu->iommu, &pdev->dev.of_node->fwnode);
+
+	ret = iommu_device_register(&iommu->iommu);
+	if (ret)
+		goto err_remove_sysfs;
+
+	pm_runtime_enable(&pdev->dev);
+	ret = devm_request_irq(&pdev->dev, irq, sun50i_iommu_irq, 0,
+			       dev_name(&pdev->dev), iommu);
+	if (ret < 0)
+		goto err_pm_disable;
+
+	bus_set_iommu(&platform_bus_type, &sun50i_iommu_ops);
+
+	return 0;
+
+err_pm_disable:
+	/* Balance pm_runtime_enable() -- the original leaked it here. */
+	pm_runtime_disable(&pdev->dev);
+	iommu_device_unregister(&iommu->iommu);
+
+err_remove_sysfs:
+	iommu_device_sysfs_remove(&iommu->iommu);
+
+err_free_group:
+	iommu_group_put(iommu->group);
+
+err_free_cache:
+	kmem_cache_destroy(iommu->pt_pool);
+
+	return ret;
+}
+
+/*
+ * Runtime suspend: quiesce translation, gate the clock, assert reset
+ * and unmap the L1 directory table.  A no-op while no domain is
+ * attached, since the hardware was never programmed in that case.
+ */
+static int __maybe_unused sun50i_iommu_suspend(struct device *dev)
+{
+	struct sun50i_iommu_domain *sun50i_domain;
+	struct sun50i_iommu *iommu;
+	unsigned long flags;
+
+	iommu = dev_get_drvdata(dev);
+	if (!iommu->domain)
+		return 0;
+
+	spin_lock_irqsave(&iommu->iommu_lock, flags);
+
+	/* Disable translation and drop the table base pointer. */
+	iommu_write(iommu, IOMMU_ENABLE_REG, 0);
+	iommu_write(iommu, IOMMU_TTB_REG, 0);
+
+	spin_unlock_irqrestore(&iommu->iommu_lock, flags);
+
+	clk_disable_unprepare(iommu->clk);
+	reset_control_assert(iommu->reset);
+
+	/* Resume will re-map the directory table before re-enabling. */
+	sun50i_domain = to_sun50i_domain(iommu->domain);
+	dma_unmap_single(dev, virt_to_phys(sun50i_domain->dt), DT_SIZE,
+			 DMA_TO_DEVICE);
+
+	return 0;
+}
+
+/*
+ * Runtime resume: bring the IOMMU out of reset, re-map the attached
+ * domain's L1 directory table, and reprogram the whole register state
+ * (TTB, prefetch, interrupts, the four ACI permission domains), then
+ * flush the TLB and enable translation.  A no-op while no domain is
+ * attached.  The register sequence is order-sensitive; do not reorder.
+ */
+static int __maybe_unused sun50i_iommu_resume(struct device *dev)
+{
+	struct sun50i_iommu_domain *sun50i_domain;
+	struct sun50i_iommu *iommu;
+	unsigned long flags;
+	dma_addr_t dt_dma;
+	int ret;
+
+	iommu = dev_get_drvdata(dev);
+	if (!iommu->domain)
+		return 0;
+
+	sun50i_domain = to_sun50i_domain(iommu->domain);
+	dt_dma = dma_map_single(dev, sun50i_domain->dt, DT_SIZE, DMA_TO_DEVICE);
+	if (dma_mapping_error(dev, dt_dma)) {
+		dev_err(dev, "Couldn't map L1 Page Table\n");
+		return -ENOMEM;
+	}
+
+	ret = reset_control_deassert(iommu->reset);
+	if (ret)
+		goto err_unmap;
+
+	ret = clk_prepare_enable(iommu->clk);
+	if (ret)
+		goto err_reset_assert;
+
+	/* We rely on the physical address and DMA address being the same */
+	WARN_ON(dt_dma != virt_to_phys(sun50i_domain->dt));
+
+	spin_lock_irqsave(&iommu->iommu_lock, flags);
+
+	/* Table base, prefetch for all six masters, fault interrupts. */
+	iommu_write(iommu, IOMMU_TTB_REG, dt_dma);
+	iommu_write(iommu, IOMMU_TLB_PREFETCH_REG,
+		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(0) |
+		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(1) |
+		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(2) |
+		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(3) |
+		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(4) |
+		    IOMMU_TLB_PREFETCH_MASTER_ENABLE(5));
+	iommu_write(iommu, IOMMU_INT_ENABLE_REG, IOMMU_INT_MASK);
+	/* ACI_NONE: no master may read or write. */
+	iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(IOMMU_ACI_NONE),
+		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 0) |
+		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 0) |
+		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 1) |
+		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 1) |
+		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 2) |
+		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 2) |
+		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 3) |
+		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 3) |
+		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 4) |
+		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 4) |
+		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_NONE, 5) |
+		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_NONE, 5));
+
+	/* ACI_RD: read-only for every master. */
+	iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(IOMMU_ACI_RD),
+		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 0) |
+		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 1) |
+		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 2) |
+		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 3) |
+		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 4) |
+		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD, 5) |
+		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 0) |
+		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 1) |
+		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 2) |
+		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 3) |
+		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 4) |
+		    IOMMU_DM_AUT_CTRL_WR_UNAVAIL(IOMMU_ACI_RD, 5));
+
+	/* ACI_WR: write-only for every master. */
+	iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(IOMMU_ACI_WR),
+		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 0) |
+		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 1) |
+		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 2) |
+		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 3) |
+		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 4) |
+		    IOMMU_DM_AUT_CTRL_RD_UNAVAIL(IOMMU_ACI_WR, 5) |
+		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 0) |
+		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 1) |
+		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 2) |
+		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 3) |
+		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 4) |
+		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_WR, 5));
+
+	/* ACI_RD_WR: full access for every master. */
+	iommu_write(iommu, IOMMU_DM_AUT_CTRL_REG(IOMMU_ACI_RD_WR),
+		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 0) |
+		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 0) |
+		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 1) |
+		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 1) |
+		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 2) |
+		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 2) |
+		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 3) |
+		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 3) |
+		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 4) |
+		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 4) |
+		    IOMMU_DM_AUT_CTRL_RD_AVAIL(IOMMU_ACI_RD_WR, 5) |
+		    IOMMU_DM_AUT_CTRL_WR_AVAIL(IOMMU_ACI_RD_WR, 5));
+
+	/* Start from a clean TLB before enabling translation. */
+	ret = sun50i_iommu_flush_all_tlb(iommu);
+	if (ret) {
+		spin_unlock_irqrestore(&iommu->iommu_lock, flags);
+		goto err_clk_disable;
+	}
+
+	iommu_write(iommu, IOMMU_AUTO_GATING_REG, IOMMU_AUTO_GATING_ENABLE);
+	iommu_write(iommu, IOMMU_ENABLE_REG, IOMMU_ENABLE_ENABLE);
+
+	spin_unlock_irqrestore(&iommu->iommu_lock, flags);
+
+	return 0;
+
+err_clk_disable:
+	clk_disable_unprepare(iommu->clk);
+
+err_reset_assert:
+	reset_control_assert(iommu->reset);
+
+err_unmap:
+	sun50i_domain = to_sun50i_domain(iommu->domain);
+	dma_unmap_single(dev, dt_dma, DT_SIZE, DMA_TO_DEVICE);
+	return ret;
+}
+
+/* Runtime PM only: the hardware is powered/programmed on first attach. */
+static const struct dev_pm_ops sun50i_iommu_pm_ops = {
+	SET_RUNTIME_PM_OPS(sun50i_iommu_suspend, sun50i_iommu_resume, NULL)
+};
+
+/* Only the H6 incarnation of the IOMMU is supported for now. */
+static const struct of_device_id sun50i_iommu_dt[] = {
+	{ .compatible = "allwinner,sun50i-h6-iommu", },
+	{ /* sentinel */ },
+};
+MODULE_DEVICE_TABLE(of, sun50i_iommu_dt);
+
+static struct platform_driver sun50i_iommu_driver = {
+	.probe		= sun50i_iommu_probe,
+	.driver		= {
+		.name			= "sun50i-iommu",
+		.of_match_table 	= sun50i_iommu_dt,
+		.pm			= &sun50i_iommu_pm_ops,
+		/* Masters may hold references; never unbind at runtime. */
+		.suppress_bind_attrs	= true,
+	}
+};
+
+static int __init sun50i_iommu_init(void)
+{
+	return platform_driver_register(&sun50i_iommu_driver);
+}
+/*
+ * Registered at subsys_initcall time -- presumably so the IOMMU is up
+ * before the master devices behind it probe; confirm against consumer
+ * probe ordering.
+ */
+subsys_initcall(sun50i_iommu_init);
+
+MODULE_DESCRIPTION("Allwinner H6 IOMMU driver");
+MODULE_AUTHOR("Maxime Ripard <maxime@cerno.tech>");
+MODULE_AUTHOR("zhuxianbin <zhuxianbin@allwinnertech.com>");
+MODULE_LICENSE("Dual BSD/GPL");