
[v4,3/6] iommu: add ARM short descriptor page table allocator.

Message ID 1438597279-2937-4-git-send-email-yong.wu@mediatek.com (mailing list archive)
State New, archived

Commit Message

Yong Wu (吴勇) Aug. 3, 2015, 10:21 a.m. UTC
This patch is for ARM Short Descriptor Format.

Signed-off-by: Yong Wu <yong.wu@mediatek.com>
---
 drivers/iommu/Kconfig                |  18 +
 drivers/iommu/Makefile               |   1 +
 drivers/iommu/io-pgtable-arm-short.c | 813 +++++++++++++++++++++++++++++++++++
 drivers/iommu/io-pgtable-arm.c       |   3 -
 drivers/iommu/io-pgtable.c           |   4 +
 drivers/iommu/io-pgtable.h           |  14 +
 6 files changed, 850 insertions(+), 3 deletions(-)
 create mode 100644 drivers/iommu/io-pgtable-arm-short.c

Comments

Will Deacon Sept. 16, 2015, 3:58 p.m. UTC | #1
On Mon, Aug 03, 2015 at 11:21:16AM +0100, Yong Wu wrote:
> This patch is for ARM Short Descriptor Format.
> 
> Signed-off-by: Yong Wu <yong.wu@mediatek.com>
> ---
>  drivers/iommu/Kconfig                |  18 +
>  drivers/iommu/Makefile               |   1 +
>  drivers/iommu/io-pgtable-arm-short.c | 813 +++++++++++++++++++++++++++++++++++
>  drivers/iommu/io-pgtable-arm.c       |   3 -
>  drivers/iommu/io-pgtable.c           |   4 +
>  drivers/iommu/io-pgtable.h           |  14 +
>  6 files changed, 850 insertions(+), 3 deletions(-)
>  create mode 100644 drivers/iommu/io-pgtable-arm-short.c
> 
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index f1fb1d3..3abd066 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -39,6 +39,24 @@ config IOMMU_IO_PGTABLE_LPAE_SELFTEST
> 
>           If unsure, say N here.
> 
> +config IOMMU_IO_PGTABLE_SHORT
> +       bool "ARMv7/v8 Short Descriptor Format"
> +       select IOMMU_IO_PGTABLE
> +       depends on ARM || ARM64 || COMPILE_TEST
> +       help
> +         Enable support for the ARM Short-descriptor pagetable format.
> +         This allocator supports 2 levels translation tables which supports

Some minor rewording here:

"...2 levels of translation tables, which enables a 32-bit memory map based
 on..."

> +         a memory map based on memory sections or pages.
> +
> +config IOMMU_IO_PGTABLE_SHORT_SELFTEST
> +       bool "Short Descriptor selftests"
> +       depends on IOMMU_IO_PGTABLE_SHORT
> +       help
> +         Enable self-tests for Short-descriptor page table allocator.
> +         This performs a series of page-table consistency checks during boot.
> +
> +         If unsure, say N here.
> +
>  endmenu
> 
>  config IOMMU_IOVA

[...]

> +#define ARM_SHORT_PGDIR_SHIFT                  20
> +#define ARM_SHORT_PAGE_SHIFT                   12
> +#define ARM_SHORT_PTRS_PER_PTE                 \
> +       (1 << (ARM_SHORT_PGDIR_SHIFT - ARM_SHORT_PAGE_SHIFT))
> +#define ARM_SHORT_BYTES_PER_PTE                        \
> +       (ARM_SHORT_PTRS_PER_PTE * sizeof(arm_short_iopte))
> +
> +/* level 1 pagetable */
> +#define ARM_SHORT_PGD_TYPE_PGTABLE             BIT(0)
> +#define ARM_SHORT_PGD_TYPE_SECTION             BIT(1)
> +#define ARM_SHORT_PGD_B                                BIT(2)
> +#define ARM_SHORT_PGD_C                                BIT(3)
> +#define ARM_SHORT_PGD_PGTABLE_NS               BIT(3)
> +#define ARM_SHORT_PGD_SECTION_XN               BIT(4)
> +#define ARM_SHORT_PGD_IMPLE                    BIT(9)
> +#define ARM_SHORT_PGD_RD_WR                    (3 << 10)
> +#define ARM_SHORT_PGD_RDONLY                   BIT(15)
> +#define ARM_SHORT_PGD_S                                BIT(16)
> +#define ARM_SHORT_PGD_nG                       BIT(17)
> +#define ARM_SHORT_PGD_SUPERSECTION             BIT(18)
> +#define ARM_SHORT_PGD_SECTION_NS               BIT(19)
> +
> +#define ARM_SHORT_PGD_TYPE_SUPERSECTION                \
> +       (ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_SUPERSECTION)
> +#define ARM_SHORT_PGD_SECTION_TYPE_MSK         \
> +       (ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_SUPERSECTION)
> +#define ARM_SHORT_PGD_PGTABLE_TYPE_MSK         \
> +       (ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_TYPE_PGTABLE)
> +#define ARM_SHORT_PGD_TYPE_IS_PGTABLE(pgd)     \
> +       (((pgd) & ARM_SHORT_PGD_PGTABLE_TYPE_MSK) == ARM_SHORT_PGD_TYPE_PGTABLE)
> +#define ARM_SHORT_PGD_TYPE_IS_SECTION(pgd)     \
> +       (((pgd) & ARM_SHORT_PGD_SECTION_TYPE_MSK) == ARM_SHORT_PGD_TYPE_SECTION)
> +#define ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(pgd)        \
> +       (((pgd) & ARM_SHORT_PGD_SECTION_TYPE_MSK) == \
> +       ARM_SHORT_PGD_TYPE_SUPERSECTION)
> +#define ARM_SHORT_PGD_PGTABLE_MSK              0xfffffc00

You could use (~(ARM_SHORT_BYTES_PER_PTE - 1)), I think.

> +#define ARM_SHORT_PGD_SECTION_MSK              (~(SZ_1M - 1))
> +#define ARM_SHORT_PGD_SUPERSECTION_MSK         (~(SZ_16M - 1))
> +
> +/* level 2 pagetable */
> +#define ARM_SHORT_PTE_TYPE_LARGE               BIT(0)
> +#define ARM_SHORT_PTE_SMALL_XN                 BIT(0)
> +#define ARM_SHORT_PTE_TYPE_SMALL               BIT(1)
> +#define ARM_SHORT_PTE_B                                BIT(2)
> +#define ARM_SHORT_PTE_C                                BIT(3)
> +#define ARM_SHORT_PTE_RD_WR                    (3 << 4)
> +#define ARM_SHORT_PTE_RDONLY                   BIT(9)
> +#define ARM_SHORT_PTE_S                                BIT(10)
> +#define ARM_SHORT_PTE_nG                       BIT(11)
> +#define ARM_SHORT_PTE_LARGE_XN                 BIT(15)
> +#define ARM_SHORT_PTE_LARGE_MSK                        (~(SZ_64K - 1))
> +#define ARM_SHORT_PTE_SMALL_MSK                        (~(SZ_4K - 1))
> +#define ARM_SHORT_PTE_TYPE_MSK                 \
> +       (ARM_SHORT_PTE_TYPE_LARGE | ARM_SHORT_PTE_TYPE_SMALL)
> +#define ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(pte)   \
> +       (((pte) & ARM_SHORT_PTE_TYPE_SMALL) == ARM_SHORT_PTE_TYPE_SMALL)

Maybe a comment here, because it's confusing that you don't AND with the
mask, due to XN.

> +#define ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(pte)   \
> +       (((pte) & ARM_SHORT_PTE_TYPE_MSK) == ARM_SHORT_PTE_TYPE_LARGE)
> +
> +#define ARM_SHORT_PGD_IDX(a)                   ((a) >> ARM_SHORT_PGDIR_SHIFT)
> +#define ARM_SHORT_PTE_IDX(a)                   \
> +       (((a) >> ARM_SHORT_PAGE_SHIFT) & (ARM_SHORT_PTRS_PER_PTE - 1))
> +
> +#define ARM_SHORT_GET_PGTABLE_VA(pgd)          \
> +       (phys_to_virt((unsigned long)pgd & ARM_SHORT_PGD_PGTABLE_MSK))
> +
> +#define ARM_SHORT_PTE_LARGE_GET_PROT(pte)      \
> +       (((pte) & (~ARM_SHORT_PTE_LARGE_MSK)) & ~ARM_SHORT_PTE_TYPE_MSK)

AFAICT, the only user of this also does an '& ~ARM_SHORT_PTE_SMALL_MSK'.
Wouldn't it be better to define ARM_SHORT_PTE_GET_PROT, which just returns
the AP bits? That said, what are you going to do about XN? I know you
don't support it in your hardware, but this code should still do
the right thing.

> +static int
> +__arm_short_set_pte(arm_short_iopte *ptep, arm_short_iopte pte,
> +                   unsigned int ptenr, struct io_pgtable_cfg *cfg)
> +{
> +       struct device *dev = cfg->iommu_dev;
> +       int i;
> +
> +       for (i = 0; i < ptenr; i++) {
> +               if (ptep[i] && pte) {
> +                       /* Someone else may have allocated for this pte */
> +                       WARN_ON(!selftest_running);
> +                       goto err_exist_pte;
> +               }
> +               ptep[i] = pte;
> +       }
> +
> +       if (selftest_running)
> +               return 0;
> +
> +       dma_sync_single_for_device(dev, __arm_short_dma_addr(dev, ptep),
> +                                  sizeof(*ptep) * ptenr, DMA_TO_DEVICE);
> +       return 0;
> +
> +err_exist_pte:
> +       while (i--)
> +               ptep[i] = 0;

What about a dma_sync for the failure case?

> +       return -EEXIST;
> +}
> +
> +static void *
> +__arm_short_alloc_pgtable(size_t size, gfp_t gfp, bool pgd,
> +                         struct io_pgtable_cfg *cfg)
> +{
> +       struct arm_short_io_pgtable *data;
> +       struct device *dev = cfg->iommu_dev;
> +       dma_addr_t dma;
> +       void *va;
> +
> +       if (pgd) {/* lvl1 pagetable */
> +               va = alloc_pages_exact(size, gfp);
> +       } else {  /* lvl2 pagetable */
> +               data = io_pgtable_cfg_to_data(cfg);
> +               va = kmem_cache_zalloc(data->pgtable_cached, gfp);
> +       }
> +
> +       if (!va)
> +               return NULL;
> +
> +       if (selftest_running)
> +               return va;
> +
> +       dma = dma_map_single(dev, va, size, DMA_TO_DEVICE);
> +       if (dma_mapping_error(dev, dma))
> +               goto out_free;
> +
> +       if (dma != __arm_short_dma_addr(dev, va))
> +               goto out_unmap;
> +
> +       if (!pgd) {
> +               kmemleak_ignore(va);
> +               dma_sync_single_for_device(dev, __arm_short_dma_addr(dev, va),
> +                                          size, DMA_TO_DEVICE);

Why do you need to do this as well as the dma_map_single above?

> +       }
> +
> +       return va;
> +
> +out_unmap:
> +       dev_err_ratelimited(dev, "Cannot accommodate DMA translation for IOMMU page tables\n");
> +       dma_unmap_single(dev, dma, size, DMA_TO_DEVICE);
> +out_free:
> +       if (pgd)
> +               free_pages_exact(va, size);
> +       else
> +               kmem_cache_free(data->pgtable_cached, va);
> +       return NULL;
> +}
> +
> +static void
> +__arm_short_free_pgtable(void *va, size_t size, bool pgd,
> +                        struct io_pgtable_cfg *cfg)
> +{
> +       struct arm_short_io_pgtable *data = io_pgtable_cfg_to_data(cfg);
> +       struct device *dev = cfg->iommu_dev;
> +
> +       if (!selftest_running)
> +               dma_unmap_single(dev, __arm_short_dma_addr(dev, va),
> +                                size, DMA_TO_DEVICE);
> +
> +       if (pgd)
> +               free_pages_exact(va, size);
> +       else
> +               kmem_cache_free(data->pgtable_cached, va);
> +}
> +
> +static arm_short_iopte
> +__arm_short_pte_prot(struct arm_short_io_pgtable *data, int prot, bool large)
> +{
> +       arm_short_iopte pteprot;
> +       int quirk = data->iop.cfg.quirks;
> +
> +       pteprot = ARM_SHORT_PTE_S | ARM_SHORT_PTE_nG;
> +       pteprot |= large ? ARM_SHORT_PTE_TYPE_LARGE :
> +                               ARM_SHORT_PTE_TYPE_SMALL;
> +       if (prot & IOMMU_CACHE)
> +               pteprot |=  ARM_SHORT_PTE_B | ARM_SHORT_PTE_C;
> +       if (!(quirk & IO_PGTABLE_QUIRK_SHORT_NO_XN) && (prot & IOMMU_NOEXEC)) {
> +                       pteprot |= large ? ARM_SHORT_PTE_LARGE_XN :
> +                               ARM_SHORT_PTE_SMALL_XN;

Weird indentation, man. Also, see my later comment about combining NO_XN
with NO_PERMS (the latter subsumes the first)

> +       }
> +       if (!(quirk & IO_PGTABLE_QUIRK_SHORT_NO_PERMS)) {
> +               pteprot |= ARM_SHORT_PTE_RD_WR;
> +               if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
> +                       pteprot |= ARM_SHORT_PTE_RDONLY;
> +       }
> +       return pteprot;
> +}
> +
> +static arm_short_iopte
> +__arm_short_pgd_prot(struct arm_short_io_pgtable *data, int prot, bool super)
> +{
> +       arm_short_iopte pgdprot;
> +       int quirk = data->iop.cfg.quirks;
> +
> +       pgdprot = ARM_SHORT_PGD_S | ARM_SHORT_PGD_nG;
> +       pgdprot |= super ? ARM_SHORT_PGD_TYPE_SUPERSECTION :
> +                               ARM_SHORT_PGD_TYPE_SECTION;
> +       if (prot & IOMMU_CACHE)
> +               pgdprot |= ARM_SHORT_PGD_C | ARM_SHORT_PGD_B;
> +       if (quirk & IO_PGTABLE_QUIRK_ARM_NS)
> +               pgdprot |= ARM_SHORT_PGD_SECTION_NS;
> +
> +       if (!(quirk & IO_PGTABLE_QUIRK_SHORT_NO_XN) && (prot & IOMMU_NOEXEC))
> +                       pgdprot |= ARM_SHORT_PGD_SECTION_XN;
> +
> +       if (!(quirk & IO_PGTABLE_QUIRK_SHORT_NO_PERMS)) {

Same comments here.

> +               pgdprot |= ARM_SHORT_PGD_RD_WR;
> +               if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
> +                       pgdprot |= ARM_SHORT_PGD_RDONLY;
> +       }
> +       return pgdprot;
> +}
> +
> +static arm_short_iopte
> +__arm_short_pte_prot_split(struct arm_short_io_pgtable *data,
> +                          arm_short_iopte pgdprot,
> +                          arm_short_iopte pteprot_large,
> +                          bool large)
> +{
> +       arm_short_iopte pteprot = 0;
> +
> +       pteprot = ARM_SHORT_PTE_S | ARM_SHORT_PTE_nG | ARM_SHORT_PTE_RD_WR;
> +       pteprot |= large ? ARM_SHORT_PTE_TYPE_LARGE :
> +                               ARM_SHORT_PTE_TYPE_SMALL;
> +
> +       /* large page to small page pte prot. Only large page may split */
> +       if (!pgdprot && !large) {

It's slightly complicated having these two variables controlling the
behaviour of the split. In reality, we're either splitting a section or
a large page, so there are three valid combinations.

It might be simpler to operate on IOMMU_{READ,WRITE,NOEXEC,CACHE} as
much as possible, and then have some simple functions to encode/decode
these into section/large/small page prot bits. We could then just pass
the IOMMU_* prot around along with the map size. What do you think?

> +               pteprot |= pteprot_large & ~ARM_SHORT_PTE_SMALL_MSK;
> +               if (pteprot_large & ARM_SHORT_PTE_LARGE_XN)
> +                       pteprot |= ARM_SHORT_PTE_SMALL_XN;
> +       }
> +
> +       /* section to pte prot */
> +       if (pgdprot & ARM_SHORT_PGD_C)
> +               pteprot |= ARM_SHORT_PTE_C;
> +       if (pgdprot & ARM_SHORT_PGD_B)
> +               pteprot |= ARM_SHORT_PTE_B;
> +       if (pgdprot & ARM_SHORT_PGD_nG)
> +               pteprot |= ARM_SHORT_PTE_nG;
> +       if (pgdprot & ARM_SHORT_PGD_SECTION_XN)
> +               pteprot |= large ? ARM_SHORT_PTE_LARGE_XN :
> +                               ARM_SHORT_PTE_SMALL_XN;
> +       if (pgdprot & ARM_SHORT_PGD_RD_WR)
> +               pteprot |= ARM_SHORT_PTE_RD_WR;
> +       if (pgdprot & ARM_SHORT_PGD_RDONLY)
> +               pteprot |= ARM_SHORT_PTE_RDONLY;
> +
> +       return pteprot;
> +}
> +
> +static arm_short_iopte
> +__arm_short_pgtable_prot(struct arm_short_io_pgtable *data)
> +{
> +       arm_short_iopte pgdprot = 0;
> +
> +       pgdprot = ARM_SHORT_PGD_TYPE_PGTABLE;
> +       if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
> +               pgdprot |= ARM_SHORT_PGD_PGTABLE_NS;
> +       return pgdprot;
> +}
> +
> +static int
> +_arm_short_map(struct arm_short_io_pgtable *data,
> +              unsigned int iova, phys_addr_t paddr,
> +              arm_short_iopte pgdprot, arm_short_iopte pteprot,
> +              bool large)
> +{
> +       struct io_pgtable_cfg *cfg = &data->iop.cfg;
> +       arm_short_iopte *pgd = data->pgd, *pte;
> +       void *pte_new = NULL;
> +       int ret;
> +
> +       pgd += ARM_SHORT_PGD_IDX(iova);
> +
> +       if (!pteprot) { /* section or supersection */
> +               pte = pgd;
> +               pteprot = pgdprot;
> +       } else {        /* page or largepage */
> +               if (!(*pgd)) {
> +                       pte_new = __arm_short_alloc_pgtable(
> +                                       ARM_SHORT_BYTES_PER_PTE,
> +                                       GFP_ATOMIC, false, cfg);
> +                       if (unlikely(!pte_new))
> +                               return -ENOMEM;
> +
> +                       pgdprot |= virt_to_phys(pte_new);
> +                       __arm_short_set_pte(pgd, pgdprot, 1, cfg);
> +               }
> +               pte = arm_short_get_pte_in_pgd(*pgd, iova);
> +       }
> +
> +       pteprot |= (arm_short_iopte)paddr;
> +       ret = __arm_short_set_pte(pte, pteprot, large ? 16 : 1, cfg);
> +       if (ret && pte_new)
> +               __arm_short_free_pgtable(pte_new, ARM_SHORT_BYTES_PER_PTE,
> +                                        false, cfg);

Don't you need to kill the pgd entry before freeing this? Please see my
previous comments about safely freeing page tables:

  http://lists.infradead.org/pipermail/linux-arm-kernel/2015-July/358268.html

(at the end of the post)

> +       return ret;
> +}
> +
> +static int arm_short_map(struct io_pgtable_ops *ops, unsigned long iova,
> +                        phys_addr_t paddr, size_t size, int prot)
> +{
> +       struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
> +       arm_short_iopte pgdprot = 0, pteprot = 0;
> +       bool large;
> +
> +       /* If no access, then nothing to do */
> +       if (!(prot & (IOMMU_READ | IOMMU_WRITE)))
> +               return 0;
> +
> +       if (WARN_ON((iova | paddr) & (size - 1)))
> +               return -EINVAL;
> +
> +       switch (size) {
> +       case SZ_4K:
> +       case SZ_64K:
> +               large = (size == SZ_64K) ? true : false;
> +               pteprot = __arm_short_pte_prot(data, prot, large);
> +               pgdprot = __arm_short_pgtable_prot(data);
> +               break;
> +
> +       case SZ_1M:
> +       case SZ_16M:
> +               large = (size == SZ_16M) ? true : false;
> +               pgdprot = __arm_short_pgd_prot(data, prot, large);
> +               break;
> +       default:
> +               return -EINVAL;
> +       }
> +
> +       return _arm_short_map(data, iova, paddr, pgdprot, pteprot, large);
> +}
> +
> +static phys_addr_t arm_short_iova_to_phys(struct io_pgtable_ops *ops,
> +                                         unsigned long iova)
> +{
> +       struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
> +       arm_short_iopte *pte, *pgd = data->pgd;
> +       phys_addr_t pa = 0;
> +
> +       pgd += ARM_SHORT_PGD_IDX(iova);
> +
> +       if (ARM_SHORT_PGD_TYPE_IS_PGTABLE(*pgd)) {
> +               pte = arm_short_get_pte_in_pgd(*pgd, iova);
> +
> +               if (ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(*pte)) {
> +                       pa = (*pte) & ARM_SHORT_PTE_LARGE_MSK;
> +                       pa |= iova & ~ARM_SHORT_PTE_LARGE_MSK;
> +               } else if (ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(*pte)) {
> +                       pa = (*pte) & ARM_SHORT_PTE_SMALL_MSK;
> +                       pa |= iova & ~ARM_SHORT_PTE_SMALL_MSK;
> +               }
> +       } else if (ARM_SHORT_PGD_TYPE_IS_SECTION(*pgd)) {
> +               pa = (*pgd) & ARM_SHORT_PGD_SECTION_MSK;
> +               pa |= iova & ~ARM_SHORT_PGD_SECTION_MSK;
> +       } else if (ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(*pgd)) {
> +               pa = (*pgd) & ARM_SHORT_PGD_SUPERSECTION_MSK;
> +               pa |= iova & ~ARM_SHORT_PGD_SUPERSECTION_MSK;
> +       }
> +
> +       return pa;
> +}
> +
> +static bool _arm_short_whether_free_pgtable(arm_short_iopte *pgd)
> +{

_arm_short_pgtable_empty might be a better name.

> +       arm_short_iopte *pte;
> +       int i;
> +
> +       pte = ARM_SHORT_GET_PGTABLE_VA(*pgd);
> +       for (i = 0; i < ARM_SHORT_PTRS_PER_PTE; i++) {
> +               if (pte[i] != 0)
> +                       return false;
> +       }
> +
> +       return true;
> +}
> +
> +static int
> +arm_short_split_blk_unmap(struct io_pgtable_ops *ops, unsigned int iova,
> +                         phys_addr_t paddr, size_t size,
> +                         arm_short_iopte pgdprotup, arm_short_iopte pteprotup,
> +                         size_t blk_size)
> +{
> +       struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
> +       const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
> +       struct io_pgtable_cfg *cfg = &data->iop.cfg;
> +       unsigned long *pgbitmap = &cfg->pgsize_bitmap;
> +       unsigned int blk_base, blk_start, blk_end, i;
> +       arm_short_iopte pgdprot, pteprot;
> +       phys_addr_t blk_paddr;
> +       size_t mapsize = 0, nextmapsize;
> +       int ret;
> +
> +       /* find the nearest mapsize */
> +       for (i = find_first_bit(pgbitmap, BITS_PER_LONG);
> +            i < BITS_PER_LONG && ((1 << i) < blk_size) &&
> +            IS_ALIGNED(size, 1 << i);
> +            i = find_next_bit(pgbitmap, BITS_PER_LONG, i + 1))
> +               mapsize = 1 << i;
> +
> +       if (WARN_ON(!mapsize))
> +               return 0; /* Bytes unmapped */
> +       nextmapsize = 1 << i;
> +
> +       blk_base = iova & ~(blk_size - 1);
> +       blk_start = blk_base;
> +       blk_end = blk_start + blk_size;
> +       blk_paddr = paddr;
> +
> +       for (; blk_start < blk_end;
> +            blk_start += mapsize, blk_paddr += mapsize) {
> +               /* Unmap! */
> +               if (blk_start == iova)
> +                       continue;
> +
> +               /* Try to upper map */
> +               if (blk_base != blk_start &&
> +                   IS_ALIGNED(blk_start | blk_paddr, nextmapsize) &&
> +                   mapsize != nextmapsize) {
> +                       mapsize = nextmapsize;
> +                       i = find_next_bit(pgbitmap, BITS_PER_LONG, i + 1);
> +                       if (i < BITS_PER_LONG)
> +                               nextmapsize = 1 << i;
> +               }
> +
> +               if (mapsize == SZ_1M) {

How do we get here with a mapsize of 1M?

> +                       pgdprot = pgdprotup;
> +                       pgdprot |= __arm_short_pgd_prot(data, 0, false);
> +                       pteprot = 0;
> +               } else { /* small or large page */
> +                       pgdprot = (blk_size == SZ_64K) ? 0 : pgdprotup;
> +                       pteprot = __arm_short_pte_prot_split(
> +                                       data, pgdprot, pteprotup,
> +                                       mapsize == SZ_64K);
> +                       pgdprot = __arm_short_pgtable_prot(data);
> +               }
> +
> +               ret = _arm_short_map(data, blk_start, blk_paddr, pgdprot,
> +                                    pteprot, mapsize == SZ_64K);
> +               if (ret < 0) {
> +                       /* Free the table we allocated */
> +                       arm_short_iopte *pgd = data->pgd, *pte;
> +
> +                       pgd += ARM_SHORT_PGD_IDX(blk_base);
> +                       if (*pgd) {
> +                               pte = ARM_SHORT_GET_PGTABLE_VA(*pgd);
> +                               __arm_short_set_pte(pgd, 0, 1, cfg);
> +                               tlb->tlb_add_flush(blk_base, blk_size, true,
> +                                                  data->iop.cookie);
> +                               tlb->tlb_sync(data->iop.cookie);
> +                               __arm_short_free_pgtable(
> +                                       pte, ARM_SHORT_BYTES_PER_PTE,
> +                                       false, cfg);

This looks wrong. _arm_short_map cleans up if it returns non-zero already.

> +                       }
> +                       return 0;/* Bytes unmapped */
> +               }
> +       }
> +
> +       tlb->tlb_add_flush(blk_base, blk_size, true, data->iop.cookie);
> +       tlb->tlb_sync(data->iop.cookie);

Why are you syncing here? You can postpone this to the caller, if it turns
out the unmap was a success.

> +       return size;
> +}
> +
> +static int arm_short_unmap(struct io_pgtable_ops *ops,
> +                          unsigned long iova,
> +                          size_t size)
> +{
> +       struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
> +       struct io_pgtable_cfg *cfg = &data->iop.cfg;
> +       arm_short_iopte *pgd, *pte = NULL;
> +       arm_short_iopte curpgd, curpte = 0;
> +       phys_addr_t paddr;
> +       unsigned int iova_base, blk_size = 0;
> +       void *cookie = data->iop.cookie;
> +       bool pgtablefree = false;
> +
> +       pgd = (arm_short_iopte *)data->pgd + ARM_SHORT_PGD_IDX(iova);
> +
> +       /* Get block size */
> +       if (ARM_SHORT_PGD_TYPE_IS_PGTABLE(*pgd)) {
> +               pte = arm_short_get_pte_in_pgd(*pgd, iova);
> +
> +               if (ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(*pte))
> +                       blk_size = SZ_4K;
> +               else if (ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(*pte))
> +                       blk_size = SZ_64K;
> +               else
> +                       WARN_ON(1);
> +       } else if (ARM_SHORT_PGD_TYPE_IS_SECTION(*pgd)) {
> +               blk_size = SZ_1M;
> +       } else if (ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(*pgd)) {
> +               blk_size = SZ_16M;
> +       } else {
> +               WARN_ON(1);

Maybe return 0 or something instead of falling through with blk_size == 0?

> +       }
> +
> +       iova_base = iova & ~(blk_size - 1);
> +       pgd = (arm_short_iopte *)data->pgd + ARM_SHORT_PGD_IDX(iova_base);
> +       paddr = arm_short_iova_to_phys(ops, iova_base);
> +       curpgd = *pgd;
> +
> +       if (blk_size == SZ_4K || blk_size == SZ_64K) {
> +               pte = arm_short_get_pte_in_pgd(*pgd, iova_base);
> +               curpte = *pte;
> +               __arm_short_set_pte(pte, 0, blk_size / SZ_4K, cfg);
> +
> +               pgtablefree = _arm_short_whether_free_pgtable(pgd);
> +               if (pgtablefree)
> +                       __arm_short_set_pte(pgd, 0, 1, cfg);
> +       } else if (blk_size == SZ_1M || blk_size == SZ_16M) {
> +               __arm_short_set_pte(pgd, 0, blk_size / SZ_1M, cfg);
> +       }
> +
> +       cfg->tlb->tlb_add_flush(iova_base, blk_size, true, cookie);
> +       cfg->tlb->tlb_sync(cookie);
> +
> +       if (pgtablefree)/* Free pgtable after tlb-flush */
> +               __arm_short_free_pgtable(ARM_SHORT_GET_PGTABLE_VA(curpgd),
> +                                        ARM_SHORT_BYTES_PER_PTE, false, cfg);

Curious, but why do you care about freeing this on unmap? It will get
freed when the page table itself is freed anyway (via the ->free callback).

> +
> +       if (blk_size > size) { /* Split the block */
> +               return arm_short_split_blk_unmap(
> +                               ops, iova, paddr, size,
> +                               ARM_SHORT_PGD_GET_PROT(curpgd),
> +                               ARM_SHORT_PTE_LARGE_GET_PROT(curpte),
> +                               blk_size);
> +       } else if (blk_size < size) {
> +               /* Unmap the block while remap partial again after split */
> +               return blk_size +
> +                       arm_short_unmap(ops, iova + blk_size, size - blk_size);
> +       }
> +
> +       return size;
> +}
> +
> +static struct io_pgtable *
> +arm_short_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
> +{
> +       struct arm_short_io_pgtable *data;
> +
> +       if (cfg->ias > 32 || cfg->oas > 32)
> +               return NULL;
> +
> +       cfg->pgsize_bitmap &=
> +               (cfg->quirks & IO_PGTABLE_QUIRK_SHORT_SUPERSECTION) ?
> +               (SZ_4K | SZ_64K | SZ_1M | SZ_16M) : (SZ_4K | SZ_64K | SZ_1M);
> +
> +       data = kzalloc(sizeof(*data), GFP_KERNEL);
> +       if (!data)
> +               return NULL;
> +
> +       data->pgd_size = SZ_16K;
> +       data->pgd = __arm_short_alloc_pgtable(
> +                                       data->pgd_size,
> +                                       GFP_KERNEL | __GFP_ZERO | __GFP_DMA,
> +                                       true, cfg);
> +       if (!data->pgd)
> +               goto out_free_data;
> +       wmb();/* Ensure the empty pgd is visible before any actual TTBR write */
> +
> +       data->pgtable_cached = kmem_cache_create(
> +                                       "io-pgtable-arm-short",
> +                                        ARM_SHORT_BYTES_PER_PTE,
> +                                        ARM_SHORT_BYTES_PER_PTE,
> +                                        0, NULL);
> +       if (!data->pgtable_cached)
> +               goto out_free_pgd;
> +
> +       /* TTBRs */
> +       cfg->arm_short_cfg.ttbr[0] = virt_to_phys(data->pgd);
> +       cfg->arm_short_cfg.ttbr[1] = 0;
> +       cfg->arm_short_cfg.tcr = 0;
> +       cfg->arm_short_cfg.nmrr = 0;
> +       cfg->arm_short_cfg.prrr = 0;
> +
> +       data->iop.ops = (struct io_pgtable_ops) {
> +               .map            = arm_short_map,
> +               .unmap          = arm_short_unmap,
> +               .iova_to_phys   = arm_short_iova_to_phys,
> +       };
> +
> +       return &data->iop;
> +
> +out_free_pgd:
> +       __arm_short_free_pgtable(data->pgd, data->pgd_size, true, cfg);
> +out_free_data:
> +       kfree(data);
> +       return NULL;
> +}
> +
> +static void arm_short_free_pgtable(struct io_pgtable *iop)
> +{
> +       struct arm_short_io_pgtable *data = io_pgtable_to_data(iop);
> +
> +       kmem_cache_destroy(data->pgtable_cached);
> +       __arm_short_free_pgtable(data->pgd, data->pgd_size,
> +                                true, &data->iop.cfg);
> +       kfree(data);
> +}
> +
> +struct io_pgtable_init_fns io_pgtable_arm_short_init_fns = {
> +       .alloc  = arm_short_alloc_pgtable,
> +       .free   = arm_short_free_pgtable,
> +};
> +

[...]

> diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
> index 6436fe2..14a9b3a 100644
> --- a/drivers/iommu/io-pgtable.c
> +++ b/drivers/iommu/io-pgtable.c
> @@ -28,6 +28,7 @@ extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns;
>  extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns;
>  extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns;
>  extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
> +extern struct io_pgtable_init_fns io_pgtable_arm_short_init_fns;
> 
>  static const struct io_pgtable_init_fns *
>  io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] =
> @@ -38,6 +39,9 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] =
>         [ARM_64_LPAE_S1] = &io_pgtable_arm_64_lpae_s1_init_fns,
>         [ARM_64_LPAE_S2] = &io_pgtable_arm_64_lpae_s2_init_fns,
>  #endif
> +#ifdef CONFIG_IOMMU_IO_PGTABLE_SHORT
> +       [ARM_SHORT_DESC] = &io_pgtable_arm_short_init_fns,
> +#endif
>  };
> 
>  struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
> diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h
> index 68c63d9..0f45e60 100644
> --- a/drivers/iommu/io-pgtable.h
> +++ b/drivers/iommu/io-pgtable.h
> @@ -9,6 +9,7 @@ enum io_pgtable_fmt {
>         ARM_32_LPAE_S2,
>         ARM_64_LPAE_S1,
>         ARM_64_LPAE_S2,
> +       ARM_SHORT_DESC,
>         IO_PGTABLE_NUM_FMTS,
>  };
> 
> @@ -45,6 +46,9 @@ struct iommu_gather_ops {
>   */
>  struct io_pgtable_cfg {
>         #define IO_PGTABLE_QUIRK_ARM_NS (1 << 0)        /* Set NS bit in PTEs */
> +       #define IO_PGTABLE_QUIRK_SHORT_SUPERSECTION     BIT(1)
> +       #define IO_PGTABLE_QUIRK_SHORT_NO_XN            BIT(2) /* No XN bit */
> +       #define IO_PGTABLE_QUIRK_SHORT_NO_PERMS         BIT(3) /* No AP bit */

Why have two quirks for this? I suggested included NO_XN in NO_PERMS:

  http://lists.infradead.org/pipermail/linux-arm-kernel/2015-July/361160.html

>         int                             quirks;
>         unsigned long                   pgsize_bitmap;
>         unsigned int                    ias;
> @@ -64,6 +68,13 @@ struct io_pgtable_cfg {
>                         u64     vttbr;
>                         u64     vtcr;
>                 } arm_lpae_s2_cfg;
> +
> +               struct {
> +                       u32     ttbr[2];
> +                       u32     tcr;
> +                       u32     nmrr;
> +                       u32     prrr;
> +               } arm_short_cfg;

We don't return an SCTLR value here, so a comment somewhere saying that
access flag is not supported would be helpful (so that drivers can ensure
that they configure things for the AP[2:0] permission model).

Will
Yong Wu (吴勇) Sept. 17, 2015, 2:54 p.m. UTC | #2
On Wed, 2015-09-16 at 16:58 +0100, Will Deacon wrote:
> On Mon, Aug 03, 2015 at 11:21:16AM +0100, Yong Wu wrote:
> > This patch is for ARM Short Descriptor Format.
> > 
> > Signed-off-by: Yong Wu <yong.wu@mediatek.com>
> > ---
> >  drivers/iommu/Kconfig                |  18 +
> >  drivers/iommu/Makefile               |   1 +
> >  drivers/iommu/io-pgtable-arm-short.c | 813 +++++++++++++++++++++++++++++++++++
> >  drivers/iommu/io-pgtable-arm.c       |   3 -
> >  drivers/iommu/io-pgtable.c           |   4 +
> >  drivers/iommu/io-pgtable.h           |  14 +
> >  6 files changed, 850 insertions(+), 3 deletions(-)
> >  create mode 100644 drivers/iommu/io-pgtable-arm-short.c
> > 
> > diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> > index f1fb1d3..3abd066 100644
> > --- a/drivers/iommu/Kconfig
> > +++ b/drivers/iommu/Kconfig
> > @@ -39,6 +39,24 @@ config IOMMU_IO_PGTABLE_LPAE_SELFTEST
> > 
> >           If unsure, say N here.
> > 
> > +config IOMMU_IO_PGTABLE_SHORT
> > +       bool "ARMv7/v8 Short Descriptor Format"
> > +       select IOMMU_IO_PGTABLE
> > +       depends on ARM || ARM64 || COMPILE_TEST
> > +       help
> > +         Enable support for the ARM Short-descriptor pagetable format.
> > +         This allocator supports 2 levels translation tables which supports
> 
> Some minor rewording here:
> 
> "...2 levels of translation tables, which enables a 32-bit memory map based
>  on..."

Hi Will,
    OK. Thanks very much for your detailed review every time.

> 
> > +         a memory map based on memory sections or pages.
> > +
> > +config IOMMU_IO_PGTABLE_SHORT_SELFTEST
> > +       bool "Short Descriptor selftests"
> > +       depends on IOMMU_IO_PGTABLE_SHORT
> > +       help
> > +         Enable self-tests for Short-descriptor page table allocator.
> > +         This performs a series of page-table consistency checks during boot.
> > +
> > +         If unsure, say N here.
> > +
> >  endmenu
> > 
> >  config IOMMU_IOVA
> 
> [...]
> > +#define ARM_SHORT_PGD_PGTABLE_MSK              0xfffffc00
> 
> You could use (~(ARM_SHORT_BYTES_PER_PTE - 1)), I think.

Yes. Thanks.
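For the next version, that define would become something like this (untested sketch):
//=====
/* The level-2 table is ARM_SHORT_BYTES_PER_PTE (1KB) aligned */
#define ARM_SHORT_PGD_PGTABLE_MSK	(~(ARM_SHORT_BYTES_PER_PTE - 1))
//=====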

> > +/* level 2 pagetable */
> > +#define ARM_SHORT_PTE_TYPE_LARGE               BIT(0)
> > +#define ARM_SHORT_PTE_SMALL_XN                 BIT(0)
> > +#define ARM_SHORT_PTE_TYPE_SMALL               BIT(1)
> > +#define ARM_SHORT_PTE_B                                BIT(2)
> > +#define ARM_SHORT_PTE_C                                BIT(3)
> > +#define ARM_SHORT_PTE_RD_WR                    (3 << 4)
> > +#define ARM_SHORT_PTE_RDONLY                   BIT(9)
> > +#define ARM_SHORT_PTE_S                                BIT(10)
> > +#define ARM_SHORT_PTE_nG                       BIT(11)
> > +#define ARM_SHORT_PTE_LARGE_XN                 BIT(15)
> > +#define ARM_SHORT_PTE_LARGE_MSK                        (~(SZ_64K - 1))
> > +#define ARM_SHORT_PTE_SMALL_MSK                        (~(SZ_4K - 1))
> > +#define ARM_SHORT_PTE_TYPE_MSK                 \
> > +       (ARM_SHORT_PTE_TYPE_LARGE | ARM_SHORT_PTE_TYPE_SMALL)
> > +#define ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(pte)   \
> > +       (((pte) & ARM_SHORT_PTE_TYPE_SMALL) == ARM_SHORT_PTE_TYPE_SMALL)
> 
> Maybe a comment here, because it's confusing that you don't AND with the
> mask, due to XN.

I will add a comment like:
/* Bit 0 in the small page is XN */
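In context, the check would then read roughly like this (sketch):
//=====
/* Bit 0 of a small page descriptor is XN, so only bit 1 identifies the type;
 * comparing against ARM_SHORT_PTE_TYPE_MSK would mis-detect XN small pages.
 */
#define ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(pte)	\
	(((pte) & ARM_SHORT_PTE_TYPE_SMALL) == ARM_SHORT_PTE_TYPE_SMALL)
//=====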

> > +#define ARM_SHORT_PTE_LARGE_GET_PROT(pte)      \
> > +       (((pte) & (~ARM_SHORT_PTE_LARGE_MSK)) & ~ARM_SHORT_PTE_TYPE_MSK)
> 
> AFAICT, the only user of this also does an '& ~ARM_SHORT_PTE_SMALL_MSK'.
> Wouldn't it be better to define ARM_SHORT_PTE_GET_PROT, which just returns
> the AP bits? That said, what are you going to do about XN? I know you
> don't support it in your hardware, but this code should still do
> the right thing.

I'm a little confused here: do you mean renaming it to ARM_SHORT_PTE_GET_PROT so
that it just returns the AP bits? Like this:
//=====
#define ARM_SHORT_PTE_GET_PROT(pte) \
	(((pte) & (~ARM_SHORT_PTE_SMALL_MSK)) & ~ARM_SHORT_PTE_TYPE_MSK)
//=====

This macro is only used to get the prot of a large page while splitting.

If it only returned the AP bits, what about XN and TEX[2:0] in the large page?
(we need to transform XN in the large page into XN in the small page while
splitting)

How about adding a comment like below:
//=====
/* Get the prot of a large page for split */
#define ARM_SHORT_PTE_LARGE_GET_PROT(pte)      \
	(((pte) & (~ARM_SHORT_PTE_LARGE_MSK)) & ~ARM_SHORT_PTE_TYPE_MSK)
//=====
or renaming it to ARM_SHORT_PTE_GET_PROT_SPLIT?

> 
> > +static int
> > +__arm_short_set_pte(arm_short_iopte *ptep, arm_short_iopte pte,
> > +                   unsigned int ptenr, struct io_pgtable_cfg *cfg)
> > +{
> > +       struct device *dev = cfg->iommu_dev;
> > +       int i;
> > +
> > +       for (i = 0; i < ptenr; i++) {
> > +               if (ptep[i] && pte) {
> > +                       /* Someone else may have allocated for this pte */
> > +                       WARN_ON(!selftest_running);
> > +                       goto err_exist_pte;
> > +               }
> > +               ptep[i] = pte;
> > +       }
> > +
> > +       if (selftest_running)
> > +               return 0;
> > +
> > +       dma_sync_single_for_device(dev, __arm_short_dma_addr(dev, ptep),
> > +                                  sizeof(*ptep) * ptenr, DMA_TO_DEVICE);
> > +       return 0;
> > +
> > +err_exist_pte:
> > +       while (i--)
> > +               ptep[i] = 0;
> 
> What about a dma_sync for the failure case?

I will add it.
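Roughly like this (sketch, reusing the existing helpers):
//=====
err_exist_pte:
	while (i--)
		ptep[i] = 0;

	/* Flush the entries we zeroed back before returning the error */
	if (!selftest_running)
		dma_sync_single_for_device(dev, __arm_short_dma_addr(dev, ptep),
					   sizeof(*ptep) * ptenr, DMA_TO_DEVICE);
	return -EEXIST;
//=====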

> 
> > +       return -EEXIST;
> > +}
> > +
> > +static void *
> > +__arm_short_alloc_pgtable(size_t size, gfp_t gfp, bool pgd,
> > +                         struct io_pgtable_cfg *cfg)
> > +{
> > +       struct arm_short_io_pgtable *data;
> > +       struct device *dev = cfg->iommu_dev;
> > +       dma_addr_t dma;
> > +       void *va;
> > +
> > +       if (pgd) {/* lvl1 pagetable */
> > +               va = alloc_pages_exact(size, gfp);
> > +       } else {  /* lvl2 pagetable */
> > +               data = io_pgtable_cfg_to_data(cfg);
> > +               va = kmem_cache_zalloc(data->pgtable_cached, gfp);
> > +       }
> > +
> > +       if (!va)
> > +               return NULL;
> > +
> > +       if (selftest_running)
> > +               return va;
> > +
> > +       dma = dma_map_single(dev, va, size, DMA_TO_DEVICE);
> > +       if (dma_mapping_error(dev, dma))
> > +               goto out_free;
> > +
> > +       if (dma != __arm_short_dma_addr(dev, va))
> > +               goto out_unmap;
> > +
> > +       if (!pgd) {
> > +               kmemleak_ignore(va);
> > +               dma_sync_single_for_device(dev, __arm_short_dma_addr(dev, va),
> > +                                          size, DMA_TO_DEVICE);
> 
> Why do you need to do this as well as the dma_map_single above?

It's redundant; I will delete it.

> 
> > +       }
> > +
> > +       return va;
> > +
> > +out_unmap:
> > +       dev_err_ratelimited(dev, "Cannot accommodate DMA translation for IOMMU page tables\n");
> > +       dma_unmap_single(dev, dma, size, DMA_TO_DEVICE);
> > +out_free:
> > +       if (pgd)
> > +               free_pages_exact(va, size);
> > +       else
> > +               kmem_cache_free(data->pgtable_cached, va);
> > +       return NULL;
> > +}
> > +
> > +static void
> > +__arm_short_free_pgtable(void *va, size_t size, bool pgd,
> > +                        struct io_pgtable_cfg *cfg)
> > +{
> > +       struct arm_short_io_pgtable *data = io_pgtable_cfg_to_data(cfg);
> > +       struct device *dev = cfg->iommu_dev;
> > +
> > +       if (!selftest_running)
> > +               dma_unmap_single(dev, __arm_short_dma_addr(dev, va),
> > +                                size, DMA_TO_DEVICE);
> > +
> > +       if (pgd)
> > +               free_pages_exact(va, size);
> > +       else
> > +               kmem_cache_free(data->pgtable_cached, va);
> > +}
> > +
> > +static arm_short_iopte
> > +__arm_short_pte_prot(struct arm_short_io_pgtable *data, int prot, bool large)
> > +{
> > +       arm_short_iopte pteprot;
> > +       int quirk = data->iop.cfg.quirks;
> > +
> > +       pteprot = ARM_SHORT_PTE_S | ARM_SHORT_PTE_nG;
> > +       pteprot |= large ? ARM_SHORT_PTE_TYPE_LARGE :
> > +                               ARM_SHORT_PTE_TYPE_SMALL;
> > +       if (prot & IOMMU_CACHE)
> > +               pteprot |=  ARM_SHORT_PTE_B | ARM_SHORT_PTE_C;
> > +       if (!(quirk & IO_PGTABLE_QUIRK_SHORT_NO_XN) && (prot & IOMMU_NOEXEC)) {
> > +                       pteprot |= large ? ARM_SHORT_PTE_LARGE_XN :
> > +                               ARM_SHORT_PTE_SMALL_XN;
> 
> Weird indentation, man. Also, see my later comment about combining NO_XN
> with NO_PERMS (the latter subsumes the first)

Sorry, I misunderstood the quirk. I will use NO_PERMS, which
contains NO_XN.

> 
> > +       }
> > +       if (!(quirk & IO_PGTABLE_QUIRK_SHORT_NO_PERMS)) {
> > +               pteprot |= ARM_SHORT_PTE_RD_WR;
> > +               if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
> > +                       pteprot |= ARM_SHORT_PTE_RDONLY;
> > +       }
> > +       return pteprot;
> > +}
> > +
[...]
> > +static arm_short_iopte
> > +__arm_short_pte_prot_split(struct arm_short_io_pgtable *data,
> > +                          arm_short_iopte pgdprot,
> > +                          arm_short_iopte pteprot_large,
> > +                          bool large)
> > +{
> > +       arm_short_iopte pteprot = 0;
> > +
> > +       pteprot = ARM_SHORT_PTE_S | ARM_SHORT_PTE_nG | ARM_SHORT_PTE_RD_WR;
> > +       pteprot |= large ? ARM_SHORT_PTE_TYPE_LARGE :
> > +                               ARM_SHORT_PTE_TYPE_SMALL;
> > +
> > +       /* large page to small page pte prot. Only large page may split */
> > +       if (!pgdprot && !large) {
> 
> It's slightly complicated having these two variables controlling the
> behaviour of the split. In reality, we're either splitting a section or
> a large page, so there are three valid combinations.
> 
> It might be simpler to operate on IOMMU_{READ,WRITE,NOEXEC,CACHE} as
> much as possible, and then have some simple functions to encode/decode
> these into section/large/small page prot bits. We could then just pass
> the IOMMU_* prot around along with the map size. What do you think?

It would be simpler if the IOMMU_{READ,WRITE,NOEXEC,CACHE} prot could be
used here, but we can't get the IOMMU_* prot while splitting in unmap,
right?
We can only read the prot back from the pagetable and then reconstruct the
new prot after the split.
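If we did go that way, we would first need something like this hypothetical
decode helper (section case only; the name and structure are just
illustrative), and __arm_short_pte_prot() could then re-encode the result for
the new mapping size:
//=====
static int __arm_short_section_decode_prot(arm_short_iopte pgdprot)
{
	int prot = IOMMU_READ;

	if (!(pgdprot & ARM_SHORT_PGD_RDONLY))
		prot |= IOMMU_WRITE;
	if ((pgdprot & (ARM_SHORT_PGD_B | ARM_SHORT_PGD_C)) ==
	    (ARM_SHORT_PGD_B | ARM_SHORT_PGD_C))
		prot |= IOMMU_CACHE;
	if (pgdprot & ARM_SHORT_PGD_SECTION_XN)
		prot |= IOMMU_NOEXEC;
	return prot;
}
//=====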

> 
> > +               pteprot |= pteprot_large & ~ARM_SHORT_PTE_SMALL_MSK;
> > +               if (pteprot_large & ARM_SHORT_PTE_LARGE_XN)
> > +                       pteprot |= ARM_SHORT_PTE_SMALL_XN;
> > +       }
> > +
> > +       /* section to pte prot */
> > +       if (pgdprot & ARM_SHORT_PGD_C)
> > +               pteprot |= ARM_SHORT_PTE_C;
> > +       if (pgdprot & ARM_SHORT_PGD_B)
> > +               pteprot |= ARM_SHORT_PTE_B;
> > +       if (pgdprot & ARM_SHORT_PGD_nG)
> > +               pteprot |= ARM_SHORT_PTE_nG;
> > +       if (pgdprot & ARM_SHORT_PGD_SECTION_XN)
> > +               pteprot |= large ? ARM_SHORT_PTE_LARGE_XN :
> > +                               ARM_SHORT_PTE_SMALL_XN;
> > +       if (pgdprot & ARM_SHORT_PGD_RD_WR)
> > +               pteprot |= ARM_SHORT_PTE_RD_WR;
> > +       if (pgdprot & ARM_SHORT_PGD_RDONLY)
> > +               pteprot |= ARM_SHORT_PTE_RDONLY;
> > +
> > +       return pteprot;
> > +}
> > +
> > +static int
> > +_arm_short_map(struct arm_short_io_pgtable *data,
> > +              unsigned int iova, phys_addr_t paddr,
> > +              arm_short_iopte pgdprot, arm_short_iopte pteprot,
> > +              bool large)
> > +{
> > +       struct io_pgtable_cfg *cfg = &data->iop.cfg;
> > +       arm_short_iopte *pgd = data->pgd, *pte;
> > +       void *pte_new = NULL;
> > +       int ret;
> > +
> > +       pgd += ARM_SHORT_PGD_IDX(iova);
> > +
> > +       if (!pteprot) { /* section or supersection */
> > +               pte = pgd;
> > +               pteprot = pgdprot;
> > +       } else {        /* page or largepage */
> > +               if (!(*pgd)) {
> > +                       pte_new = __arm_short_alloc_pgtable(
> > +                                       ARM_SHORT_BYTES_PER_PTE,
> > +                                       GFP_ATOMIC, false, cfg);
> > +                       if (unlikely(!pte_new))
> > +                               return -ENOMEM;
> > +
> > +                       pgdprot |= virt_to_phys(pte_new);
> > +                       __arm_short_set_pte(pgd, pgdprot, 1, cfg);
> > +               }
> > +               pte = arm_short_get_pte_in_pgd(*pgd, iova);
> > +       }
> > +
> > +       pteprot |= (arm_short_iopte)paddr;
> > +       ret = __arm_short_set_pte(pte, pteprot, large ? 16 : 1, cfg);
> > +       if (ret && pte_new)
> > +               __arm_short_free_pgtable(pte_new, ARM_SHORT_BYTES_PER_PTE,
> > +                                        false, cfg);
> 
> Don't you need to kill the pgd entry before freeing this? Please see my
> previous comments about safely freeing page tables:
> 
>   http://lists.infradead.org/pipermail/linux-arm-kernel/2015-July/358268.html
> 
> (at the end of the post)


 I will add something like this:
//======================
	if (ret && pte_new)
		goto err_unmap_pgd;

	if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_TLBI_ON_MAP) {
		tlb->tlb_add_flush(iova, size, true, data->iop.cookie);
		tlb->tlb_sync(data->iop.cookie);
	}
	return ret;

err_unmap_pgd:
	__arm_short_set_pte(pgd, 0, 1, cfg);
	/* The size is 1M, i.e. the whole pgd entry */
	tlb->tlb_add_flush(iova, SZ_1M, true, data->iop.cookie);
	tlb->tlb_sync(data->iop.cookie);
	__arm_short_free_pgtable(pte_new, ARM_SHORT_BYTES_PER_PTE,
				 false, cfg);
	return ret;
//======================

Here I move the TLBI_ON_MAP quirk into _arm_short_map, so a map done from
the split path can also flush the TLB when necessary.

> > +       return ret;
> > +}
> > +
[...]
> > +static bool _arm_short_whether_free_pgtable(arm_short_iopte *pgd)
> > +{
> 
> _arm_short_pgtable_empty might be a better name.

Thanks.

> 
> > +       arm_short_iopte *pte;
> > +       int i;
> > +
> > +       pte = ARM_SHORT_GET_PGTABLE_VA(*pgd);
> > +       for (i = 0; i < ARM_SHORT_PTRS_PER_PTE; i++) {
> > +               if (pte[i] != 0)
> > +                       return false;
> > +       }
> > +
> > +       return true;
> > +}
> > +
> > +static int
> > +arm_short_split_blk_unmap(struct io_pgtable_ops *ops, unsigned int iova,
> > +                         phys_addr_t paddr, size_t size,
> > +                         arm_short_iopte pgdprotup, arm_short_iopte pteprotup,
> > +                         size_t blk_size)
> > +{
> > +       struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
> > +       const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
> > +       struct io_pgtable_cfg *cfg = &data->iop.cfg;
> > +       unsigned long *pgbitmap = &cfg->pgsize_bitmap;
> > +       unsigned int blk_base, blk_start, blk_end, i;
> > +       arm_short_iopte pgdprot, pteprot;
> > +       phys_addr_t blk_paddr;
> > +       size_t mapsize = 0, nextmapsize;
> > +       int ret;
> > +
> > +       /* find the nearest mapsize */
> > +       for (i = find_first_bit(pgbitmap, BITS_PER_LONG);
> > +            i < BITS_PER_LONG && ((1 << i) < blk_size) &&
> > +            IS_ALIGNED(size, 1 << i);
> > +            i = find_next_bit(pgbitmap, BITS_PER_LONG, i + 1))
> > +               mapsize = 1 << i;
> > +
> > +       if (WARN_ON(!mapsize))
> > +               return 0; /* Bytes unmapped */
> > +       nextmapsize = 1 << i;
> > +
> > +       blk_base = iova & ~(blk_size - 1);
> > +       blk_start = blk_base;
> > +       blk_end = blk_start + blk_size;
> > +       blk_paddr = paddr;
> > +
> > +       for (; blk_start < blk_end;
> > +            blk_start += mapsize, blk_paddr += mapsize) {
> > +               /* Unmap! */
> > +               if (blk_start == iova)
> > +                       continue;
> > +
> > +               /* Try to upper map */
> > +               if (blk_base != blk_start &&
> > +                   IS_ALIGNED(blk_start | blk_paddr, nextmapsize) &&
> > +                   mapsize != nextmapsize) {
> > +                       mapsize = nextmapsize;
> > +                       i = find_next_bit(pgbitmap, BITS_PER_LONG, i + 1);
> > +                       if (i < BITS_PER_LONG)
> > +                               nextmapsize = 1 << i;
> > +               }
> > +
> > +               if (mapsize == SZ_1M) {
> 
> How do we get here with a mapsize of 1M?

About the split, there are several cases:

A supersection may split into sections, large pages, or small pages.
A section may split into large pages or small pages.
A large page may split into small pages.

How do we get here with a mapsize of 1M?
-> The mapsize will be 1M when a supersection is split into sections.
   Running the self-test shows how the mapsize changes.

> 
> > +                       pgdprot = pgdprotup;
> > +                       pgdprot |= __arm_short_pgd_prot(data, 0, false);

    Here I can't get IOMMU_{READ,WRITE,NOEXEC,CACHE}, so I have to pass 0
as the second parameter (some bits like PGD_B/PGD_C have already been
recorded in pgdprotup).

> > +                       pteprot = 0;
> > +               } else { /* small or large page */
> > +                       pgdprot = (blk_size == SZ_64K) ? 0 : pgdprotup;
> > +                       pteprot = __arm_short_pte_prot_split(
> > +                                       data, pgdprot, pteprotup,
> > +                                       mapsize == SZ_64K);
> > +                       pgdprot = __arm_short_pgtable_prot(data);
> > +               }
> > +
> > +               ret = _arm_short_map(data, blk_start, blk_paddr, pgdprot,
> > +                                    pteprot, mapsize == SZ_64K);
> > +               if (ret < 0) {
> > +                       /* Free the table we allocated */
> > +                       arm_short_iopte *pgd = data->pgd, *pte;
> > +
> > +                       pgd += ARM_SHORT_PGD_IDX(blk_base);
> > +                       if (*pgd) {
> > +                               pte = ARM_SHORT_GET_PGTABLE_VA(*pgd);
> > +                               __arm_short_set_pte(pgd, 0, 1, cfg);
> > +                               tlb->tlb_add_flush(blk_base, blk_size, true,
> > +                                                  data->iop.cookie);
> > +                               tlb->tlb_sync(data->iop.cookie);
> > +                               __arm_short_free_pgtable(
> > +                                       pte, ARM_SHORT_BYTES_PER_PTE,
> > +                                       false, cfg);
> 
> This looks wrong. _arm_short_map cleans up if it returns non-zero already.

...
Yes. It seems that I can delete that "if" block and handle the error case simply as:

if (ret < 0)
	return 0;

> 
> > +                       }
> > +                       return 0;/* Bytes unmapped */
> > +               }
> > +       }
> > +
> > +       tlb->tlb_add_flush(blk_base, blk_size, true, data->iop.cookie);
> > +       tlb->tlb_sync(data->iop.cookie);
> 
> Why are you syncing here? You can postpone this to the caller, if it turns
> out the unmap was a success.

I only added it here because there is a tlb_add_flush in
arm_lpae_split_blk_unmap. I think we can delete the tlb-flush here; see
below.

> 
> > +       return size;
> > +}
> > +
> > +static int arm_short_unmap(struct io_pgtable_ops *ops,
> > +                          unsigned long iova,
> > +                          size_t size)
> > +{
> > +       struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
> > +       struct io_pgtable_cfg *cfg = &data->iop.cfg;
> > +       arm_short_iopte *pgd, *pte = NULL;
> > +       arm_short_iopte curpgd, curpte = 0;
> > +       phys_addr_t paddr;
> > +       unsigned int iova_base, blk_size = 0;
> > +       void *cookie = data->iop.cookie;
> > +       bool pgtablefree = false;
> > +
> > +       pgd = (arm_short_iopte *)data->pgd + ARM_SHORT_PGD_IDX(iova);
> > +
> > +       /* Get block size */
> > +       if (ARM_SHORT_PGD_TYPE_IS_PGTABLE(*pgd)) {
> > +               pte = arm_short_get_pte_in_pgd(*pgd, iova);
> > +
> > +               if (ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(*pte))
> > +                       blk_size = SZ_4K;
> > +               else if (ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(*pte))
> > +                       blk_size = SZ_64K;
> > +               else
> > +                       WARN_ON(1);

> > +       } else if (ARM_SHORT_PGD_TYPE_IS_SECTION(*pgd)) {
> > +               blk_size = SZ_1M;
> > +       } else if (ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(*pgd)) {
> > +               blk_size = SZ_16M;
> > +       } else {
> > +               WARN_ON(1);
> 
> Maybe return 0 or something instead of falling through with blk_size == 0?

How about:
//=====
	if (WARN_ON(blk_size == 0))
		return 0;
//=====
i.e. return 0 and report the error.

> 
> > +       }
> > +
> > +       iova_base = iova & ~(blk_size - 1);
> > +       pgd = (arm_short_iopte *)data->pgd + ARM_SHORT_PGD_IDX(iova_base);
> > +       paddr = arm_short_iova_to_phys(ops, iova_base);
> > +       curpgd = *pgd;
> > +
> > +       if (blk_size == SZ_4K || blk_size == SZ_64K) {
> > +               pte = arm_short_get_pte_in_pgd(*pgd, iova_base);
> > +               curpte = *pte;
> > +               __arm_short_set_pte(pte, 0, blk_size / SZ_4K, cfg);
> > +
> > +            pgtablefree = _arm_short_whether_free_pgtable(pgd);
> > +            if (pgtablefree)
> > +(1)(2)                 __arm_short_set_pte(pgd, 0, 1, cfg);
> > +       } else if (blk_size == SZ_1M || blk_size == SZ_16M) {
> > +               __arm_short_set_pte(pgd, 0, blk_size / SZ_1M, cfg);
> > +       }
> > +
> > +(3)    cfg->tlb->tlb_add_flush(iova_base, blk_size, true, cookie);
> > +(4)    cfg->tlb->tlb_sync(cookie);
> > +
> > +       if (pgtablefree)/* Free pgtable after tlb-flush */
> > +(5)              __arm_short_free_pgtable(ARM_SHORT_GET_PGTABLE_VA(curpgd),
> > +                                        ARM_SHORT_BYTES_PER_PTE, false, cfg);
> 
> Curious, but why do you care about freeing this on unmap? It will get
> freed when the page table itself is freed anyway (via the ->free callback).

This frees the level-2 pagetable when there are no entries left in it; it
isn't the level-1 pagetable (that one goes via the ->free callback).

The pagetable-freeing flow follows your suggested five steps, which I marked
above as (1)..(5), so I have to move the free_pgtable after (4) tlb_sync and
add a comment /* Free pgtable after tlb-flush */. Perhaps the comment should
be changed to: /* Free level2 pgtable after tlb-flush */

> > +
> > +       if (blk_size > size) { /* Split the block */
> > +               return arm_short_split_blk_unmap(
> > +                               ops, iova, paddr, size,
> > +                               ARM_SHORT_PGD_GET_PROT(curpgd),
> > +                               ARM_SHORT_PTE_LARGE_GET_PROT(curpte),
> > +                               blk_size);

    About adding a TLB flush after the split: there is already a flush
beforehand, and the map done during the split goes from invalid to valid,
so it doesn't need another TLB flush.

> > +       } else if (blk_size < size) {
> > +               /* Unmap the block while remap partial again after split */
> > +               return blk_size +
> > +                       arm_short_unmap(ops, iova + blk_size, size - blk_size);
> > +       }
> > +
> > +       return size;
> > +}
> > +
> > +static struct io_pgtable *
> > +arm_short_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
> > +{
> > +       struct arm_short_io_pgtable *data;
> > +
> > +       if (cfg->ias > 32 || cfg->oas > 32)
> > +               return NULL;
> > +
> > +       cfg->pgsize_bitmap &=
> > +               (cfg->quirks & IO_PGTABLE_QUIRK_SHORT_SUPERSECTION) ?
> > +               (SZ_4K | SZ_64K | SZ_1M | SZ_16M) : (SZ_4K | SZ_64K | SZ_1M);
> > +
> > +       data = kzalloc(sizeof(*data), GFP_KERNEL);
> > +       if (!data)
> > +               return NULL;
> > +
> > +       data->pgd_size = SZ_16K;
> > +       data->pgd = __arm_short_alloc_pgtable(
> > +                                       data->pgd_size,
> > +                                       GFP_KERNEL | __GFP_ZERO | __GFP_DMA,
> > +                                       true, cfg);
> > +       if (!data->pgd)
> > +               goto out_free_data;
> > +       wmb();/* Ensure the empty pgd is visible before any actual TTBR write */
> > +
> > +       data->pgtable_cached = kmem_cache_create(
> > +                                       "io-pgtable-arm-short",
> > +                                        ARM_SHORT_BYTES_PER_PTE,
> > +                                        ARM_SHORT_BYTES_PER_PTE,
> > +                                        0, NULL);

I plan to add SLAB_CACHE_DMA here to guarantee that the level-2 pagetable
base address (PA) is always 32-bit (not above 4GB).
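That is, roughly (sketch):
//=====
	data->pgtable_cached = kmem_cache_create("io-pgtable-arm-short",
						 ARM_SHORT_BYTES_PER_PTE,
						 ARM_SHORT_BYTES_PER_PTE,
						 SLAB_CACHE_DMA, NULL);
//=====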

> > +       if (!data->pgtable_cached)
> > +               goto out_free_pgd;
> > +
> > +       /* TTBRs */
> > +       cfg->arm_short_cfg.ttbr[0] = virt_to_phys(data->pgd);
> > +       cfg->arm_short_cfg.ttbr[1] = 0;
> > +       cfg->arm_short_cfg.tcr = 0;
> > +       cfg->arm_short_cfg.nmrr = 0;
> > +       cfg->arm_short_cfg.prrr = 0;

           About SCTLR, how about we add it here:
          //===========
          cfg->arm_short_cfg.sctlr = 0; /* The IOMMU user should configure IOMMU_{READ/WRITE} */
          //===========
           Is that comment OK?
> > +
> > +       data->iop.ops = (struct io_pgtable_ops) {
> > +               .map            = arm_short_map,
> > +               .unmap          = arm_short_unmap,
> > +               .iova_to_phys   = arm_short_iova_to_phys,
> > +       };
> > +
> > +       return &data->iop;
> > +
> > +out_free_pgd:
> > +       __arm_short_free_pgtable(data->pgd, data->pgd_size, true, cfg);
> > +out_free_data:
> > +       kfree(data);
> > +       return NULL;
> > +}
> > +
> [...]
> 
> > diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
> > index 6436fe2..14a9b3a 100644
> > --- a/drivers/iommu/io-pgtable.c
> > +++ b/drivers/iommu/io-pgtable.c
> > @@ -28,6 +28,7 @@ extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns;
> >  extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns;
> >  extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns;
> >  extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
> > +extern struct io_pgtable_init_fns io_pgtable_arm_short_init_fns;
> > 
> >  static const struct io_pgtable_init_fns *
> >  io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] =
> > @@ -38,6 +39,9 @@ io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] =
> >         [ARM_64_LPAE_S1] = &io_pgtable_arm_64_lpae_s1_init_fns,
> >         [ARM_64_LPAE_S2] = &io_pgtable_arm_64_lpae_s2_init_fns,
> >  #endif
> > +#ifdef CONFIG_IOMMU_IO_PGTABLE_SHORT
> > +       [ARM_SHORT_DESC] = &io_pgtable_arm_short_init_fns,
> > +#endif
> >  };
> > 
> >  struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
> > diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h
> > index 68c63d9..0f45e60 100644
> > --- a/drivers/iommu/io-pgtable.h
> > +++ b/drivers/iommu/io-pgtable.h
> > @@ -9,6 +9,7 @@ enum io_pgtable_fmt {
> >         ARM_32_LPAE_S2,
> >         ARM_64_LPAE_S1,
> >         ARM_64_LPAE_S2,
> > +       ARM_SHORT_DESC,
> >         IO_PGTABLE_NUM_FMTS,
> >  };
> > 
> > @@ -45,6 +46,9 @@ struct iommu_gather_ops {
> >   */
> >  struct io_pgtable_cfg {
> >         #define IO_PGTABLE_QUIRK_ARM_NS (1 << 0)        /* Set NS bit in PTEs */
> > +       #define IO_PGTABLE_QUIRK_SHORT_SUPERSECTION     BIT(1)
> > +       #define IO_PGTABLE_QUIRK_SHORT_NO_XN            BIT(2) /* No XN bit */
> > +       #define IO_PGTABLE_QUIRK_SHORT_NO_PERMS         BIT(3) /* No AP bit */
> 
> Why have two quirks for this? I suggested included NO_XN in NO_PERMS:
> 
>   http://lists.infradead.org/pipermail/linux-arm-kernel/2015-July/361160.html

Sorry. I will change it like this next time:

#define IO_PGTABLE_QUIRK_NO_PERMS            BIT(1) /* No XN/AP bits */
#define IO_PGTABLE_QUIRK_TLBI_ON_MAP         BIT(2) /* TLB-Flush after
map */
#define IO_PGTABLE_QUIRK_SHORT_SUPERSECTION  BIT(3)

Do I need to change (1 << 0) to BIT(0) in ARM_NS?

> 
> >         int                             quirks;
> >         unsigned long                   pgsize_bitmap;
> >         unsigned int                    ias;
> > @@ -64,6 +68,13 @@ struct io_pgtable_cfg {
> >                         u64     vttbr;
> >                         u64     vtcr;
> >                 } arm_lpae_s2_cfg;
> > +
> > +               struct {
> > +                       u32     ttbr[2];
> > +                       u32     tcr;
> > +                       u32     nmrr;
> > +                       u32     prrr;
> > +               } arm_short_cfg;
> 
> We don't return an SCTLR value here, so a comment somewhere saying that
> access flag is not supported would be helpful (so that drivers can ensure
> that they configure things for the AP[2:0] permission model).

Do we need to add SCTLR? Like:
         struct {
                 u32     ttbr[2];
                 u32     tcr;
                 u32     nmrr;
                 u32     prrr;
+                u32     sctlr;
          } arm_short_cfg;

  Or should we only add a comment in the place where ttbr/tcr are set?
  /* The IOMMU user should configure IOMMU_{READ/WRITE} since SCTLR
isn't implemented */
> 
> Will
Yong Wu (吴勇) Sept. 22, 2015, 2:12 p.m. UTC | #3
> > > +static int arm_short_unmap(struct io_pgtable_ops *ops,
> > > +                          unsigned long iova,
> > > +                          size_t size)
> > > +{
> > > +       struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
> > > +       struct io_pgtable_cfg *cfg = &data->iop.cfg;
> > > +       arm_short_iopte *pgd, *pte = NULL;
> > > +       arm_short_iopte curpgd, curpte = 0;
> > > +       phys_addr_t paddr;
> > > +       unsigned int iova_base, blk_size = 0;
> > > +       void *cookie = data->iop.cookie;
> > > +       bool pgtablefree = false;
> > > +
> > > +       pgd = (arm_short_iopte *)data->pgd + ARM_SHORT_PGD_IDX(iova);
> > > +
> > > +       /* Get block size */
> > > +       if (ARM_SHORT_PGD_TYPE_IS_PGTABLE(*pgd)) {
> > > +               pte = arm_short_get_pte_in_pgd(*pgd, iova);
> > > +
> > > +               if (ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(*pte))
> > > +                       blk_size = SZ_4K;
> > > +               else if (ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(*pte))
> > > +                       blk_size = SZ_64K;
> > > +               else
> > > +                       WARN_ON(1);
> 
> > > +       } else if (ARM_SHORT_PGD_TYPE_IS_SECTION(*pgd)) {
> > > +               blk_size = SZ_1M;
> > > +       } else if (ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(*pgd)) {
> > > +               blk_size = SZ_16M;
> > > +       } else {
> > > +               WARN_ON(1); 
> > > +       }
> > > +
> > > +       iova_base = iova & ~(blk_size - 1);
> > > +       pgd = (arm_short_iopte *)data->pgd + ARM_SHORT_PGD_IDX(iova_base);
> > > +       paddr = arm_short_iova_to_phys(ops, iova_base);
> > > +       curpgd = *pgd;
> > > +
> > > +       if (blk_size == SZ_4K || blk_size == SZ_64K) {
> > > +               pte = arm_short_get_pte_in_pgd(*pgd, iova_base);
> > > +               curpte = *pte;
> > > +               __arm_short_set_pte(pte, 0, blk_size / SZ_4K, cfg);
> > > +
> > > +               pgtablefree = _arm_short_whether_free_pgtable(pgd);
> > > +               if (pgtablefree)
> > > +                       __arm_short_set_pte(pgd, 0, 1, cfg);
> > > +       } else if (blk_size == SZ_1M || blk_size == SZ_16M) {
> > > +               __arm_short_set_pte(pgd, 0, blk_size / SZ_1M, cfg);
> > > +       }
> > > +
> > > +       cfg->tlb->tlb_add_flush(iova_base, blk_size, true, cookie);
> > > +       cfg->tlb->tlb_sync(cookie);
> > > +
> > > +       if (pgtablefree)/* Free pgtable after tlb-flush */
> > > +              __arm_short_free_pgtable(ARM_SHORT_GET_PGTABLE_VA(curpgd),
> > > +                                        ARM_SHORT_BYTES_PER_PTE, false, cfg);
> > > +
> > > +       if (blk_size > size) { /* Split the block */
> > > +               return arm_short_split_blk_unmap(
> > > +                               ops, iova, paddr, size,
> > > +                               ARM_SHORT_PGD_GET_PROT(curpgd),
> > > +                               ARM_SHORT_PTE_LARGE_GET_PROT(curpte),
> > > +                               blk_size);
> > > +       } else if (blk_size < size) {
> > > +               /* Unmap the block while remap partial again after split */
> > > +               return blk_size +
> > > +                       arm_short_unmap(ops, iova + blk_size, size - blk_size);

Hi Will, Robin,
     I would like to show you a problem I met: the recursion here may
lead to a stack overflow while we test FHD video decode.

    From the log, the internal variables in the error case are: "size"
is 0x100000 and "iova" is 0xfea00000, but at that point "blk_size" is
0x1000 because that region was mapped with small pages, so it enters the
recursion here.

    After checking the unmap flow, there is only one iommu_unmap call in
__iommu_dma_unmap, and iommu_unmap doesn't check the physical address
alignment, so if the iova and size both happen to be SZ_16M or SZ_1M
aligned, arm_short_unmap is entered with that size even though the
region was mapped with small pages.

    So:
    a) Do we need to unmap each sg item in __iommu_dma_unmap? Like below:

//===============
static void __iommu_dma_unmap(struct iommu_domain *domain, dma_addr_t
dma_addr)
{
	/* ...and if we can't, then something is horribly, horribly wrong */
+       for_each_sg(sg, s, nents, i)
		BUG_ON(iommu_unmap(domain, pfn << shift, size) < size);
	__free_iova(iovad, iova);
}
//===============
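
To make the idea clearer, here is a hypothetical helper (the function
name is made up, and it assumes the sg list and nents are still
available at unmap time, which the current __iommu_dma_unmap() does not
have; it is only to illustrate unmapping per-segment):

//===============
static void __iommu_dma_unmap_each_sg(struct iommu_domain *domain,
				      struct scatterlist *sgl, int nents)
{
	struct scatterlist *sg;
	int i;

	/* Unmap each segment with exactly the iova/size it was mapped with */
	for_each_sg(sgl, sg, nents, i)
		BUG_ON(iommu_unmap(domain, sg_dma_address(sg),
				   sg_dma_len(sg)) < sg_dma_len(sg));
}
//===============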
 
    b) I need to add back the do-while loop in arm_short_unmap, which [1]
suggested deleting, to handle this case.

     After testing locally, I plan to add the do-while like below:
     
//==============================
static int arm_short_unmap(struct io_pgtable_ops *ops,
			   unsigned long iova,
			   size_t size)
{
	struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
	struct io_pgtable_cfg *cfg = &data->iop.cfg;
	arm_short_iopte *pgd, *pte = NULL;
	arm_short_iopte curpgd, curpte = 0;
	unsigned int blk_base, blk_size;
	int unmap_size = 0;
	bool pgtempty;

	do {
		pgd = (arm_short_iopte *)data->pgd + ARM_SHORT_PGD_IDX(iova);
		blk_size = 0;
		pgtempty = false;

		/* Get block size */
		if (ARM_SHORT_PGD_TYPE_IS_PGTABLE(*pgd)) {
			pte = arm_short_get_pte_in_pgd(*pgd, iova);

			if (ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(*pte))
				blk_size = SZ_4K;
			else if (ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(*pte))
				blk_size = SZ_64K;
		} else if (ARM_SHORT_PGD_TYPE_IS_SECTION(*pgd)) {
			blk_size = SZ_1M;
		} else if (ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(*pgd)) {
			blk_size = SZ_16M;
		}

		if (WARN_ON(!blk_size))
			return 0;

		blk_base = iova & ~(blk_size - 1);
		pgd = (arm_short_iopte *)data->pgd + ARM_SHORT_PGD_IDX(blk_base);
		curpgd = *pgd;

		if (blk_size == SZ_4K || blk_size == SZ_64K) {
			pte = arm_short_get_pte_in_pgd(*pgd, blk_base);
			curpte = *pte;
			__arm_short_set_pte(pte, 0, blk_size / SZ_4K, cfg);

			pgtempty = __arm_short_pgtable_empty(pgd);
			if (pgtempty)
				__arm_short_set_pte(pgd, 0, 1, cfg);
		} else if (blk_size == SZ_1M || blk_size == SZ_16M) {
			__arm_short_set_pte(pgd, 0, blk_size / SZ_1M, cfg);
		}

		cfg->tlb->tlb_add_flush(blk_base, blk_size, true, data->iop.cookie);
		cfg->tlb->tlb_sync(data->iop.cookie);

		if (pgtempty)/* Free lvl2 pgtable after tlb-flush */
			__arm_short_free_pgtable(
					ARM_SHORT_GET_PGTABLE_VA(curpgd),
					ARM_SHORT_BYTES_PER_PTE, false, cfg);

		/*
		 * If the size to unmap (which comes from the pgsize_bitmap)
		 * is larger than the current blk_size, keep unmapping the
		 * remainder.
		 */
		if (blk_size <= size) {
			iova += blk_size;
			size -= blk_size;
			unmap_size += blk_size;
			continue;
		} else { /* Split this block */
			return arm_short_split_blk_unmap(
					ops, iova, size, blk_size,
					ARM_SHORT_PGD_GET_PROT(curpgd),
					ARM_SHORT_PTE_GET_PROT_LARGE(curpte));
		}
	} while (size);

	return unmap_size;
}
//=============================

    Are there any other suggestions?
    Thanks very much.


[1]:http://lists.linuxfoundation.org/pipermail/iommu/2015-June/013322.html

> > > +       }
> > > +
> > > +       return size;
> > > +}
> > > +
> _______________________________________________
> iommu mailing list
> iommu@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu
Will Deacon Oct. 9, 2015, 3:57 p.m. UTC | #4
On Tue, Sep 22, 2015 at 03:12:47PM +0100, Yong Wu wrote:
>      I would like to show you a problem I met, The recursion here may
> lead to stack overflow while we test FHD video decode.
> 
>     From the log, I get the internal variable in the error case: the
> "size" is 0x100000, the "iova" is 0xfea00000, but at that time the
> "blk_size" is 0x1000 as it was the map of small-page. so it enter the
> recursion here.
>     
>     After check the unmap flow, there is only a iommu_unmap in
> __iommu_dma_unmap, and it won't check the physical address align in
> iommu_unmap.

That sounds like a bug in __iommu_dma_unmap. Robin?

Will
Robin Murphy Oct. 9, 2015, 5:41 p.m. UTC | #5
On 09/10/15 16:57, Will Deacon wrote:
> On Tue, Sep 22, 2015 at 03:12:47PM +0100, Yong Wu wrote:
>>       I would like to show you a problem I met, The recursion here may
>> lead to stack overflow while we test FHD video decode.
>>
>>      From the log, I get the internal variable in the error case: the
>> "size" is 0x100000, the "iova" is 0xfea00000, but at that time the
>> "blk_size" is 0x1000 as it was the map of small-page. so it enter the
>> recursion here.
>>
>>      After check the unmap flow, there is only a iommu_unmap in
>> __iommu_dma_unmap, and it won't check the physical address align in
>> iommu_unmap.
>
> That sounds like a bug in __iommu_dma_unmap. Robin?

Isn't it just cf27ec930be9 again wearing different trousers? All I do is 
call iommu_unmap with the same total size that was mapped originally.

Robin.
Will Deacon Oct. 9, 2015, 6:19 p.m. UTC | #6
On Fri, Oct 09, 2015 at 06:41:51PM +0100, Robin Murphy wrote:
> On 09/10/15 16:57, Will Deacon wrote:
> >On Tue, Sep 22, 2015 at 03:12:47PM +0100, Yong Wu wrote:
> >>      I would like to show you a problem I met, The recursion here may
> >>lead to stack overflow while we test FHD video decode.
> >>
> >>     From the log, I get the internal variable in the error case: the
> >>"size" is 0x100000, the "iova" is 0xfea00000, but at that time the
> >>"blk_size" is 0x1000 as it was the map of small-page. so it enter the
> >>recursion here.
> >>
> >>     After check the unmap flow, there is only a iommu_unmap in
> >>__iommu_dma_unmap, and it won't check the physical address align in
> >>iommu_unmap.
> >
> >That sounds like a bug in __iommu_dma_unmap. Robin?
> 
> Isn't it just cf27ec930be9 again wearing different trousers? All I do is
> call iommu_unmap with the same total size that was mapped originally.

I don't think it's the same as that issue, which was to do with installing
block mappings over the top of an existing table entry. The problem here
seems to be that we don't walk the page table properly on unmap.

The long descriptor code has:

	/* If the size matches this level, we're in the right place */
	if (size == blk_size) {
		__arm_lpae_set_pte(ptep, 0, &data->iop.cfg);

		if (!iopte_leaf(pte, lvl)) {
			/* Also flush any partial walks */
			tlb->tlb_add_flush(iova, size, false, cookie);
			tlb->tlb_sync(cookie);
			ptep = iopte_deref(pte, data);
			__arm_lpae_free_pgtable(data, lvl + 1, ptep);
		} else {
			tlb->tlb_add_flush(iova, size, true, cookie);
		}

		return size;
	} else if (iopte_leaf(pte, lvl)) {
		/*
		 * Insert a table at the next level to map the old region,
		 * minus the part we want to unmap
		 */
		return arm_lpae_split_blk_unmap(data, iova, size,
						iopte_prot(pte), lvl, ptep,
						blk_size);
	}

why doesn't something similar work for short descriptors?

Will
Yong Wu (吴勇) Oct. 21, 2015, 10:34 a.m. UTC | #7
On Fri, 2015-10-09 at 19:19 +0100, Will Deacon wrote:
> On Fri, Oct 09, 2015 at 06:41:51PM +0100, Robin Murphy wrote:
> > On 09/10/15 16:57, Will Deacon wrote:
> > >On Tue, Sep 22, 2015 at 03:12:47PM +0100, Yong Wu wrote:
> > >>      I would like to show you a problem I met, The recursion here may
> > >>lead to stack overflow while we test FHD video decode.
> > >>
> > >>     From the log, I get the internal variable in the error case: the
> > >>"size" is 0x100000, the "iova" is 0xfea00000, but at that time the
> > >>"blk_size" is 0x1000 as it was the map of small-page. so it enter the
> > >>recursion here.
> > >>
> > >>     After check the unmap flow, there is only a iommu_unmap in
> > >>__iommu_dma_unmap, and it won't check the physical address align in
> > >>iommu_unmap.
> > >
> > >That sounds like a bug in __iommu_dma_unmap. Robin?
> > 
> > Isn't it just cf27ec930be9 again wearing different trousers? All I do is
> > call iommu_unmap with the same total size that was mapped originally.
> 
> I don't think it's the same as that issue, which was to do with installing
> block mappings over the top of an existing table entry. The problem here
> seems to be that we don't walk the page table properly on unmap.
> 
> The long descriptor code has:
> 
> 	/* If the size matches this level, we're in the right place */
> 	if (size == blk_size) {
> 		__arm_lpae_set_pte(ptep, 0, &data->iop.cfg);
> 
> 		if (!iopte_leaf(pte, lvl)) {
> 			/* Also flush any partial walks */
> 			tlb->tlb_add_flush(iova, size, false, cookie);
> 			tlb->tlb_sync(cookie);
> 			ptep = iopte_deref(pte, data);
> 			__arm_lpae_free_pgtable(data, lvl + 1, ptep);
> 		} else {
> 			tlb->tlb_add_flush(iova, size, true, cookie);
> 		}
> 
> 		return size;
> 	} else if (iopte_leaf(pte, lvl)) {
> 		/*
> 		 * Insert a table at the next level to map the old region,
> 		 * minus the part we want to unmap
> 		 */
> 		return arm_lpae_split_blk_unmap(data, iova, size,
> 						iopte_prot(pte), lvl, ptep,
> 						blk_size);
> 	}
> 
> why doesn't something similar work for short descriptors?
> 
> Will

Hi Will,

   There are some differences between the long and short descriptors, so
I cannot use that code directly.

1. The long descriptor controls the blk_size easily with 3 levels of
block size (4KB, 2MB and 1GB) in stage 1. It has 3 levels of pagetable,
so using 3 levels of block_size there works fine.

But I don't use the "level" concept in the short descriptor. At the
beginning of designing it, I planned to use 4 levels of block size (4KB,
64KB, 1MB and 16MB) in the short descriptor, so the code would be more
similar to the long descriptor. But there are only 2 levels of pagetable
in the short descriptor; using 4 levels here may lead to
misunderstanding, so I don't use "level" and instead list the four cases
in map and unmap.
(If you think the short descriptor could use 4 levels like above, I can
try it.)

2. Following the long-descriptor unmap, if the entry is not a leaf we
free the pagetable, and then we can delete the do-while. I have tested
this:
 
//===========================
static int arm_short_unmap(struct io_pgtable_ops *ops,
			   unsigned long iova,
			   size_t size)
{
	struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
	struct io_pgtable_cfg *cfg = &data->iop.cfg;
	void *cookie = data->iop.cookie;
	arm_short_iopte *pgd, *pte = NULL;
	arm_short_iopte pgd_tmp, pte_tmp = 0;
	unsigned int blk_size = 0, blk_base;
	bool empty = false, split = false;
	int i;

	blk_size = arm_short_iova_to_blk_size(ops, iova);
	if (WARN_ON(!blk_size))
		return 0;

	blk_base = iova & ~(blk_size - 1);
	pgd = (arm_short_iopte *)data->pgd + ARM_SHORT_PGD_IDX(blk_base);

	if (size == SZ_1M || size == SZ_16M) {/* section or supersection */
		for (i = 0; i < size/SZ_1M; i++, pgd++, blk_base += SZ_1M) {
			pgd_tmp = *pgd;
			__arm_short_set_pte(pgd, 0, 1, cfg);

			cfg->tlb->tlb_add_flush(blk_base, SZ_1M, true, cookie);
			cfg->tlb->tlb_sync(cookie);

			/* Free the lvl2 pgtable if this pgd entry points to a pgtable */
			if (ARM_SHORT_PGD_TYPE_IS_PGTABLE(pgd_tmp))
				__arm_short_free_pgtable(
					ARM_SHORT_GET_PGTABLE_VA(pgd_tmp),
					ARM_SHORT_BYTES_PER_PTE, false, cfg);

			/* Split is needed when unmapping 1M within a supersection */
			if (size == SZ_1M && blk_size == SZ_16M)
				split = true;
		}
	} else if (size == SZ_4K || size == SZ_64K) {/* page or large page */
		pgd_tmp = *pgd;

		/* Unmap the current node */
		if (blk_size == SZ_4K || blk_size == SZ_64K) {
			pte = arm_short_get_pte_in_pgd(*pgd, blk_base);
			pte_tmp = *pte;
			__arm_short_set_pte(
				pte, 0, max_t(size_t, size, blk_size) / SZ_4K, cfg);

			empty = __arm_short_pgtable_empty(pgd);
			if (empty)
				__arm_short_set_pte(pgd, 0, 1, cfg);
		} else if (blk_size == SZ_1M || blk_size == SZ_16M) {
			__arm_short_set_pte(pgd, 0, blk_size / SZ_1M, cfg);
		}

		cfg->tlb->tlb_add_flush(blk_base, size, true, cookie);
		cfg->tlb->tlb_sync(cookie);

		if (empty)/* Free lvl2 pgtable */
			__arm_short_free_pgtable(
					ARM_SHORT_GET_PGTABLE_VA(pgd_tmp),
					ARM_SHORT_BYTES_PER_PTE, false, cfg);

		if (blk_size > size)
			split = true;

	} else {
		return 0;/* Unmapped size */
	}

	if (split)/* Split while blk_size > size */
		return arm_short_split_blk_unmap(
				ops, iova, size, blk_size,
				ARM_SHORT_PGD_GET_PROT(pgd_tmp),
				ARM_SHORT_PTE_GET_PROT_LARGE(pte_tmp));

	return size;
}
//===========================
This doesn't look good either. The do-while in our v5 may be better than
this one. What's your opinion?

3. (Also adding Joerg) There is a line in iommu_map:
size_t pgsize = iommu_pgsize(domain, iova | paddr, size);

   And there is a line in iommu_unmap:
size_t pgsize = iommu_pgsize(domain, iova, size - unmapped);

   Is it possible to change it like this:
phys_addr_t paddr = domain->ops->iova_to_phys(domain, iova);
size_t pgsize = iommu_pgsize(domain, iova | paddr, size - unmapped);

   If we add a physical address alignment check in iommu_unmap, then the
unmap flow may become simpler.
   I think iommu_map and iommu_unmap should both be atomic map/unmap.
iommu_map checks the physical address, so why doesn't iommu_unmap check
it?
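
Roughly, the change I mean would look like this in the iommu_unmap()
loop (only a sketch against the current code; the extra iova_to_phys
lookup is added here purely to illustrate the idea):

//===============
	while (unmapped < size) {
		/* Fold the physical address into the page-size choice so
		 * that iova and paddr alignment are both honoured, the
		 * same way iommu_map() already does. */
		phys_addr_t paddr = domain->ops->iova_to_phys(domain, iova);
		size_t pgsize = iommu_pgsize(domain, iova | paddr,
					     size - unmapped);

		unmapped_page = domain->ops->unmap(domain, iova, pgsize);
		if (!unmapped_page)
			break;

		iova += unmapped_page;
		unmapped += unmapped_page;
	}
//===============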
diff mbox

Patch

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index f1fb1d3..3abd066 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -39,6 +39,24 @@  config IOMMU_IO_PGTABLE_LPAE_SELFTEST
 
 	  If unsure, say N here.
 
+config IOMMU_IO_PGTABLE_SHORT
+	bool "ARMv7/v8 Short Descriptor Format"
+	select IOMMU_IO_PGTABLE
+	depends on ARM || ARM64 || COMPILE_TEST
+	help
+	  Enable support for the ARM Short-descriptor pagetable format.
+	  This allocator supports 2 levels translation tables which supports
+	  a memory map based on memory sections or pages.
+
+config IOMMU_IO_PGTABLE_SHORT_SELFTEST
+	bool "Short Descriptor selftests"
+	depends on IOMMU_IO_PGTABLE_SHORT
+	help
+	  Enable self-tests for Short-descriptor page table allocator.
+	  This performs a series of page-table consistency checks during boot.
+
+	  If unsure, say N here.
+
 endmenu
 
 config IOMMU_IOVA
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index c6dcc51..06df3e6 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -3,6 +3,7 @@  obj-$(CONFIG_IOMMU_API) += iommu-traces.o
 obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
+obj-$(CONFIG_IOMMU_IO_PGTABLE_SHORT) += io-pgtable-arm-short.o
 obj-$(CONFIG_IOMMU_IOVA) += iova.o
 obj-$(CONFIG_OF_IOMMU)	+= of_iommu.o
 obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o msm_iommu_dev.o
diff --git a/drivers/iommu/io-pgtable-arm-short.c b/drivers/iommu/io-pgtable-arm-short.c
new file mode 100644
index 0000000..56f5480
--- /dev/null
+++ b/drivers/iommu/io-pgtable-arm-short.c
@@ -0,0 +1,813 @@ 
+/*
+ * Copyright (c) 2014-2015 MediaTek Inc.
+ * Author: Yong Wu <yong.wu@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#define pr_fmt(fmt)	"arm-short-desc io-pgtable: "fmt
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <linux/errno.h>
+#include "io-pgtable.h"
+
+typedef u32 arm_short_iopte;
+
+struct arm_short_io_pgtable {
+	struct io_pgtable	iop;
+	struct kmem_cache	*pgtable_cached;
+	size_t			pgd_size;
+	void			*pgd;
+};
+
+#define io_pgtable_to_data(x)			\
+	container_of((x), struct arm_short_io_pgtable, iop)
+
+#define io_pgtable_ops_to_data(x)		\
+	io_pgtable_to_data(io_pgtable_ops_to_pgtable(x))
+
+#define io_pgtable_cfg_to_pgtable(x)		\
+	container_of((x), struct io_pgtable, cfg)
+
+#define io_pgtable_cfg_to_data(x)		\
+	io_pgtable_to_data(io_pgtable_cfg_to_pgtable(x))
+
+#define ARM_SHORT_PGDIR_SHIFT			20
+#define ARM_SHORT_PAGE_SHIFT			12
+#define ARM_SHORT_PTRS_PER_PTE			\
+	(1 << (ARM_SHORT_PGDIR_SHIFT - ARM_SHORT_PAGE_SHIFT))
+#define ARM_SHORT_BYTES_PER_PTE			\
+	(ARM_SHORT_PTRS_PER_PTE * sizeof(arm_short_iopte))
+
+/* level 1 pagetable */
+#define ARM_SHORT_PGD_TYPE_PGTABLE		BIT(0)
+#define ARM_SHORT_PGD_TYPE_SECTION		BIT(1)
+#define ARM_SHORT_PGD_B				BIT(2)
+#define ARM_SHORT_PGD_C				BIT(3)
+#define ARM_SHORT_PGD_PGTABLE_NS		BIT(3)
+#define ARM_SHORT_PGD_SECTION_XN		BIT(4)
+#define ARM_SHORT_PGD_IMPLE			BIT(9)
+#define ARM_SHORT_PGD_RD_WR			(3 << 10)
+#define ARM_SHORT_PGD_RDONLY			BIT(15)
+#define ARM_SHORT_PGD_S				BIT(16)
+#define ARM_SHORT_PGD_nG			BIT(17)
+#define ARM_SHORT_PGD_SUPERSECTION		BIT(18)
+#define ARM_SHORT_PGD_SECTION_NS		BIT(19)
+
+#define ARM_SHORT_PGD_TYPE_SUPERSECTION		\
+	(ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_SUPERSECTION)
+#define ARM_SHORT_PGD_SECTION_TYPE_MSK		\
+	(ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_SUPERSECTION)
+#define ARM_SHORT_PGD_PGTABLE_TYPE_MSK		\
+	(ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_TYPE_PGTABLE)
+#define ARM_SHORT_PGD_TYPE_IS_PGTABLE(pgd)	\
+	(((pgd) & ARM_SHORT_PGD_PGTABLE_TYPE_MSK) == ARM_SHORT_PGD_TYPE_PGTABLE)
+#define ARM_SHORT_PGD_TYPE_IS_SECTION(pgd)	\
+	(((pgd) & ARM_SHORT_PGD_SECTION_TYPE_MSK) == ARM_SHORT_PGD_TYPE_SECTION)
+#define ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(pgd)	\
+	(((pgd) & ARM_SHORT_PGD_SECTION_TYPE_MSK) == \
+	ARM_SHORT_PGD_TYPE_SUPERSECTION)
+#define ARM_SHORT_PGD_PGTABLE_MSK		0xfffffc00
+#define ARM_SHORT_PGD_SECTION_MSK		(~(SZ_1M - 1))
+#define ARM_SHORT_PGD_SUPERSECTION_MSK		(~(SZ_16M - 1))
+
+/* level 2 pagetable */
+#define ARM_SHORT_PTE_TYPE_LARGE		BIT(0)
+#define ARM_SHORT_PTE_SMALL_XN			BIT(0)
+#define ARM_SHORT_PTE_TYPE_SMALL		BIT(1)
+#define ARM_SHORT_PTE_B				BIT(2)
+#define ARM_SHORT_PTE_C				BIT(3)
+#define ARM_SHORT_PTE_RD_WR			(3 << 4)
+#define ARM_SHORT_PTE_RDONLY			BIT(9)
+#define ARM_SHORT_PTE_S				BIT(10)
+#define ARM_SHORT_PTE_nG			BIT(11)
+#define ARM_SHORT_PTE_LARGE_XN			BIT(15)
+#define ARM_SHORT_PTE_LARGE_MSK			(~(SZ_64K - 1))
+#define ARM_SHORT_PTE_SMALL_MSK			(~(SZ_4K - 1))
+#define ARM_SHORT_PTE_TYPE_MSK			\
+	(ARM_SHORT_PTE_TYPE_LARGE | ARM_SHORT_PTE_TYPE_SMALL)
+#define ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(pte)	\
+	(((pte) & ARM_SHORT_PTE_TYPE_SMALL) == ARM_SHORT_PTE_TYPE_SMALL)
+#define ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(pte)	\
+	(((pte) & ARM_SHORT_PTE_TYPE_MSK) == ARM_SHORT_PTE_TYPE_LARGE)
+
+#define ARM_SHORT_PGD_IDX(a)			((a) >> ARM_SHORT_PGDIR_SHIFT)
+#define ARM_SHORT_PTE_IDX(a)			\
+	(((a) >> ARM_SHORT_PAGE_SHIFT) & (ARM_SHORT_PTRS_PER_PTE - 1))
+
+#define ARM_SHORT_GET_PGTABLE_VA(pgd)		\
+	(phys_to_virt((unsigned long)pgd & ARM_SHORT_PGD_PGTABLE_MSK))
+
+#define ARM_SHORT_PTE_LARGE_GET_PROT(pte)	\
+	(((pte) & (~ARM_SHORT_PTE_LARGE_MSK)) & ~ARM_SHORT_PTE_TYPE_MSK)
+
+#define ARM_SHORT_PGD_GET_PROT(pgd)		\
+	(((pgd) & (~ARM_SHORT_PGD_SECTION_MSK)) & ~ARM_SHORT_PGD_SUPERSECTION)
+
+static bool selftest_running;
+
+static arm_short_iopte *
+arm_short_get_pte_in_pgd(arm_short_iopte pgd, unsigned int iova)
+{
+	arm_short_iopte *pte;
+
+	pte = ARM_SHORT_GET_PGTABLE_VA(pgd);
+	pte += ARM_SHORT_PTE_IDX(iova);
+	return pte;
+}
+
+static dma_addr_t
+__arm_short_dma_addr(struct device *dev, void *va)
+{
+	return phys_to_dma(dev, virt_to_phys(va));
+}
+
+static int
+__arm_short_set_pte(arm_short_iopte *ptep, arm_short_iopte pte,
+		    unsigned int ptenr, struct io_pgtable_cfg *cfg)
+{
+	struct device *dev = cfg->iommu_dev;
+	int i;
+
+	for (i = 0; i < ptenr; i++) {
+		if (ptep[i] && pte) {
+			/* Someone else may have allocated for this pte */
+			WARN_ON(!selftest_running);
+			goto err_exist_pte;
+		}
+		ptep[i] = pte;
+	}
+
+	if (selftest_running)
+		return 0;
+
+	dma_sync_single_for_device(dev, __arm_short_dma_addr(dev, ptep),
+				   sizeof(*ptep) * ptenr, DMA_TO_DEVICE);
+	return 0;
+
+err_exist_pte:
+	while (i--)
+		ptep[i] = 0;
+	return -EEXIST;
+}
+
+static void *
+__arm_short_alloc_pgtable(size_t size, gfp_t gfp, bool pgd,
+			  struct io_pgtable_cfg *cfg)
+{
+	struct arm_short_io_pgtable *data;
+	struct device *dev = cfg->iommu_dev;
+	dma_addr_t dma;
+	void *va;
+
+	if (pgd) {/* lvl1 pagetable */
+		va = alloc_pages_exact(size, gfp);
+	} else {  /* lvl2 pagetable */
+		data = io_pgtable_cfg_to_data(cfg);
+		va = kmem_cache_zalloc(data->pgtable_cached, gfp);
+	}
+
+	if (!va)
+		return NULL;
+
+	if (selftest_running)
+		return va;
+
+	dma = dma_map_single(dev, va, size, DMA_TO_DEVICE);
+	if (dma_mapping_error(dev, dma))
+		goto out_free;
+
+	if (dma != __arm_short_dma_addr(dev, va))
+		goto out_unmap;
+
+	if (!pgd) {
+		kmemleak_ignore(va);
+		dma_sync_single_for_device(dev, __arm_short_dma_addr(dev, va),
+					   size, DMA_TO_DEVICE);
+	}
+
+	return va;
+
+out_unmap:
+	dev_err_ratelimited(dev, "Cannot accommodate DMA translation for IOMMU page tables\n");
+	dma_unmap_single(dev, dma, size, DMA_TO_DEVICE);
+out_free:
+	if (pgd)
+		free_pages_exact(va, size);
+	else
+		kmem_cache_free(data->pgtable_cached, va);
+	return NULL;
+}
+
+static void
+__arm_short_free_pgtable(void *va, size_t size, bool pgd,
+			 struct io_pgtable_cfg *cfg)
+{
+	struct arm_short_io_pgtable *data = io_pgtable_cfg_to_data(cfg);
+	struct device *dev = cfg->iommu_dev;
+
+	if (!selftest_running)
+		dma_unmap_single(dev, __arm_short_dma_addr(dev, va),
+				 size, DMA_TO_DEVICE);
+
+	if (pgd)
+		free_pages_exact(va, size);
+	else
+		kmem_cache_free(data->pgtable_cached, va);
+}
+
+static arm_short_iopte
+__arm_short_pte_prot(struct arm_short_io_pgtable *data, int prot, bool large)
+{
+	arm_short_iopte pteprot;
+	int quirk = data->iop.cfg.quirks;
+
+	pteprot = ARM_SHORT_PTE_S | ARM_SHORT_PTE_nG;
+	pteprot |= large ? ARM_SHORT_PTE_TYPE_LARGE :
+				ARM_SHORT_PTE_TYPE_SMALL;
+	if (prot & IOMMU_CACHE)
+		pteprot |=  ARM_SHORT_PTE_B | ARM_SHORT_PTE_C;
+	if (!(quirk & IO_PGTABLE_QUIRK_SHORT_NO_XN) && (prot & IOMMU_NOEXEC)) {
+			pteprot |= large ? ARM_SHORT_PTE_LARGE_XN :
+				ARM_SHORT_PTE_SMALL_XN;
+	}
+	if (!(quirk & IO_PGTABLE_QUIRK_SHORT_NO_PERMS)) {
+		pteprot |= ARM_SHORT_PTE_RD_WR;
+		if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
+			pteprot |= ARM_SHORT_PTE_RDONLY;
+	}
+	return pteprot;
+}
+
+static arm_short_iopte
+__arm_short_pgd_prot(struct arm_short_io_pgtable *data, int prot, bool super)
+{
+	arm_short_iopte pgdprot;
+	int quirk = data->iop.cfg.quirks;
+
+	pgdprot = ARM_SHORT_PGD_S | ARM_SHORT_PGD_nG;
+	pgdprot |= super ? ARM_SHORT_PGD_TYPE_SUPERSECTION :
+				ARM_SHORT_PGD_TYPE_SECTION;
+	if (prot & IOMMU_CACHE)
+		pgdprot |= ARM_SHORT_PGD_C | ARM_SHORT_PGD_B;
+	if (quirk & IO_PGTABLE_QUIRK_ARM_NS)
+		pgdprot |= ARM_SHORT_PGD_SECTION_NS;
+
+	if (!(quirk & IO_PGTABLE_QUIRK_SHORT_NO_XN) && (prot & IOMMU_NOEXEC))
+			pgdprot |= ARM_SHORT_PGD_SECTION_XN;
+
+	if (!(quirk & IO_PGTABLE_QUIRK_SHORT_NO_PERMS)) {
+		pgdprot |= ARM_SHORT_PGD_RD_WR;
+		if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
+			pgdprot |= ARM_SHORT_PGD_RDONLY;
+	}
+	return pgdprot;
+}
+
+static arm_short_iopte
+__arm_short_pte_prot_split(struct arm_short_io_pgtable *data,
+			   arm_short_iopte pgdprot,
+			   arm_short_iopte pteprot_large,
+			   bool large)
+{
+	arm_short_iopte pteprot = 0;
+
+	pteprot = ARM_SHORT_PTE_S | ARM_SHORT_PTE_nG | ARM_SHORT_PTE_RD_WR;
+	pteprot |= large ? ARM_SHORT_PTE_TYPE_LARGE :
+				ARM_SHORT_PTE_TYPE_SMALL;
+
+	/* large page to small page pte prot. Only large page may split */
+	if (!pgdprot && !large) {
+		pteprot |= pteprot_large & ~ARM_SHORT_PTE_SMALL_MSK;
+		if (pteprot_large & ARM_SHORT_PTE_LARGE_XN)
+			pteprot |= ARM_SHORT_PTE_SMALL_XN;
+	}
+
+	/* section to pte prot */
+	if (pgdprot & ARM_SHORT_PGD_C)
+		pteprot |= ARM_SHORT_PTE_C;
+	if (pgdprot & ARM_SHORT_PGD_B)
+		pteprot |= ARM_SHORT_PTE_B;
+	if (pgdprot & ARM_SHORT_PGD_nG)
+		pteprot |= ARM_SHORT_PTE_nG;
+	if (pgdprot & ARM_SHORT_PGD_SECTION_XN)
+		pteprot |= large ? ARM_SHORT_PTE_LARGE_XN :
+				ARM_SHORT_PTE_SMALL_XN;
+	if (pgdprot & ARM_SHORT_PGD_RD_WR)
+		pteprot |= ARM_SHORT_PTE_RD_WR;
+	if (pgdprot & ARM_SHORT_PGD_RDONLY)
+		pteprot |= ARM_SHORT_PTE_RDONLY;
+
+	return pteprot;
+}
+
+static arm_short_iopte
+__arm_short_pgtable_prot(struct arm_short_io_pgtable *data)
+{
+	arm_short_iopte pgdprot = 0;
+
+	pgdprot = ARM_SHORT_PGD_TYPE_PGTABLE;
+	if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
+		pgdprot |= ARM_SHORT_PGD_PGTABLE_NS;
+	return pgdprot;
+}
+
+static int
+_arm_short_map(struct arm_short_io_pgtable *data,
+	       unsigned int iova, phys_addr_t paddr,
+	       arm_short_iopte pgdprot, arm_short_iopte pteprot,
+	       bool large)
+{
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+	arm_short_iopte *pgd = data->pgd, *pte;
+	void *pte_new = NULL;
+	int ret;
+
+	pgd += ARM_SHORT_PGD_IDX(iova);
+
+	if (!pteprot) { /* section or supersection */
+		pte = pgd;
+		pteprot = pgdprot;
+	} else {        /* page or largepage */
+		if (!(*pgd)) {
+			pte_new = __arm_short_alloc_pgtable(
+					ARM_SHORT_BYTES_PER_PTE,
+					GFP_ATOMIC, false, cfg);
+			if (unlikely(!pte_new))
+				return -ENOMEM;
+
+			pgdprot |= virt_to_phys(pte_new);
+			__arm_short_set_pte(pgd, pgdprot, 1, cfg);
+		}
+		pte = arm_short_get_pte_in_pgd(*pgd, iova);
+	}
+
+	pteprot |= (arm_short_iopte)paddr;
+	ret = __arm_short_set_pte(pte, pteprot, large ? 16 : 1, cfg);
+	if (ret && pte_new)
+		__arm_short_free_pgtable(pte_new, ARM_SHORT_BYTES_PER_PTE,
+					 false, cfg);
+	return ret;
+}
+
+static int arm_short_map(struct io_pgtable_ops *ops, unsigned long iova,
+			 phys_addr_t paddr, size_t size, int prot)
+{
+	struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
+	arm_short_iopte pgdprot = 0, pteprot = 0;
+	bool large;
+
+	/* If no access, then nothing to do */
+	if (!(prot & (IOMMU_READ | IOMMU_WRITE)))
+		return 0;
+
+	if (WARN_ON((iova | paddr) & (size - 1)))
+		return -EINVAL;
+
+	switch (size) {
+	case SZ_4K:
+	case SZ_64K:
+		large = (size == SZ_64K) ? true : false;
+		pteprot = __arm_short_pte_prot(data, prot, large);
+		pgdprot = __arm_short_pgtable_prot(data);
+		break;
+
+	case SZ_1M:
+	case SZ_16M:
+		large = (size == SZ_16M) ? true : false;
+		pgdprot = __arm_short_pgd_prot(data, prot, large);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return _arm_short_map(data, iova, paddr, pgdprot, pteprot, large);
+}
+
+static phys_addr_t arm_short_iova_to_phys(struct io_pgtable_ops *ops,
+					  unsigned long iova)
+{
+	struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
+	arm_short_iopte *pte, *pgd = data->pgd;
+	phys_addr_t pa = 0;
+
+	pgd += ARM_SHORT_PGD_IDX(iova);
+
+	if (ARM_SHORT_PGD_TYPE_IS_PGTABLE(*pgd)) {
+		pte = arm_short_get_pte_in_pgd(*pgd, iova);
+
+		if (ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(*pte)) {
+			pa = (*pte) & ARM_SHORT_PTE_LARGE_MSK;
+			pa |= iova & ~ARM_SHORT_PTE_LARGE_MSK;
+		} else if (ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(*pte)) {
+			pa = (*pte) & ARM_SHORT_PTE_SMALL_MSK;
+			pa |= iova & ~ARM_SHORT_PTE_SMALL_MSK;
+		}
+	} else if (ARM_SHORT_PGD_TYPE_IS_SECTION(*pgd)) {
+		pa = (*pgd) & ARM_SHORT_PGD_SECTION_MSK;
+		pa |= iova & ~ARM_SHORT_PGD_SECTION_MSK;
+	} else if (ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(*pgd)) {
+		pa = (*pgd) & ARM_SHORT_PGD_SUPERSECTION_MSK;
+		pa |= iova & ~ARM_SHORT_PGD_SUPERSECTION_MSK;
+	}
+
+	return pa;
+}
+
+static bool _arm_short_whether_free_pgtable(arm_short_iopte *pgd)
+{
+	arm_short_iopte *pte;
+	int i;
+
+	pte = ARM_SHORT_GET_PGTABLE_VA(*pgd);
+	for (i = 0; i < ARM_SHORT_PTRS_PER_PTE; i++) {
+		if (pte[i] != 0)
+			return false;
+	}
+
+	return true;
+}
+
+static int
+arm_short_split_blk_unmap(struct io_pgtable_ops *ops, unsigned int iova,
+			  phys_addr_t paddr, size_t size,
+			  arm_short_iopte pgdprotup, arm_short_iopte pteprotup,
+			  size_t blk_size)
+{
+	struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
+	const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+	unsigned long *pgbitmap = &cfg->pgsize_bitmap;
+	unsigned int blk_base, blk_start, blk_end, i;
+	arm_short_iopte pgdprot, pteprot;
+	phys_addr_t blk_paddr;
+	size_t mapsize = 0, nextmapsize;
+	int ret;
+
+	/* find the nearest mapsize */
+	for (i = find_first_bit(pgbitmap, BITS_PER_LONG);
+	     i < BITS_PER_LONG && ((1 << i) < blk_size) &&
+	     IS_ALIGNED(size, 1 << i);
+	     i = find_next_bit(pgbitmap, BITS_PER_LONG, i + 1))
+		mapsize = 1 << i;
+
+	if (WARN_ON(!mapsize))
+		return 0; /* Bytes unmapped */
+	nextmapsize = 1 << i;
+
+	blk_base = iova & ~(blk_size - 1);
+	blk_start = blk_base;
+	blk_end = blk_start + blk_size;
+	blk_paddr = paddr;
+
+	for (; blk_start < blk_end;
+	     blk_start += mapsize, blk_paddr += mapsize) {
+		/* Unmap! */
+		if (blk_start == iova)
+			continue;
+
+		/* Try to upper map */
+		if (blk_base != blk_start &&
+		    IS_ALIGNED(blk_start | blk_paddr, nextmapsize) &&
+		    mapsize != nextmapsize) {
+			mapsize = nextmapsize;
+			i = find_next_bit(pgbitmap, BITS_PER_LONG, i + 1);
+			if (i < BITS_PER_LONG)
+				nextmapsize = 1 << i;
+		}
+
+		if (mapsize == SZ_1M) {
+			pgdprot = pgdprotup;
+			pgdprot |= __arm_short_pgd_prot(data, 0, false);
+			pteprot = 0;
+		} else { /* small or large page */
+			pgdprot = (blk_size == SZ_64K) ? 0 : pgdprotup;
+			pteprot = __arm_short_pte_prot_split(
+					data, pgdprot, pteprotup,
+					mapsize == SZ_64K);
+			pgdprot = __arm_short_pgtable_prot(data);
+		}
+
+		ret = _arm_short_map(data, blk_start, blk_paddr, pgdprot,
+				     pteprot, mapsize == SZ_64K);
+		if (ret < 0) {
+			/* Free the table we allocated */
+			arm_short_iopte *pgd = data->pgd, *pte;
+
+			pgd += ARM_SHORT_PGD_IDX(blk_base);
+			if (*pgd) {
+				pte = ARM_SHORT_GET_PGTABLE_VA(*pgd);
+				__arm_short_set_pte(pgd, 0, 1, cfg);
+				tlb->tlb_add_flush(blk_base, blk_size, true,
+						   data->iop.cookie);
+				tlb->tlb_sync(data->iop.cookie);
+				__arm_short_free_pgtable(
+					pte, ARM_SHORT_BYTES_PER_PTE,
+					false, cfg);
+			}
+			return 0;/* Bytes unmapped */
+		}
+	}
+
+	tlb->tlb_add_flush(blk_base, blk_size, true, data->iop.cookie);
+	tlb->tlb_sync(data->iop.cookie);
+	return size;
+}
+
+static int arm_short_unmap(struct io_pgtable_ops *ops,
+			   unsigned long iova,
+			   size_t size)
+{
+	struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+	arm_short_iopte *pgd, *pte = NULL;
+	arm_short_iopte curpgd, curpte = 0;
+	phys_addr_t paddr;
+	unsigned int iova_base, blk_size = 0;
+	void *cookie = data->iop.cookie;
+	bool pgtablefree = false;
+
+	pgd = (arm_short_iopte *)data->pgd + ARM_SHORT_PGD_IDX(iova);
+
+	/* Get block size */
+	if (ARM_SHORT_PGD_TYPE_IS_PGTABLE(*pgd)) {
+		pte = arm_short_get_pte_in_pgd(*pgd, iova);
+
+		if (ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(*pte))
+			blk_size = SZ_4K;
+		else if (ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(*pte))
+			blk_size = SZ_64K;
+		else
+			WARN_ON(1);
+	} else if (ARM_SHORT_PGD_TYPE_IS_SECTION(*pgd)) {
+		blk_size = SZ_1M;
+	} else if (ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(*pgd)) {
+		blk_size = SZ_16M;
+	} else {
+		WARN_ON(1);
+	}
+
+	iova_base = iova & ~(blk_size - 1);
+	pgd = (arm_short_iopte *)data->pgd + ARM_SHORT_PGD_IDX(iova_base);
+	paddr = arm_short_iova_to_phys(ops, iova_base);
+	curpgd = *pgd;
+
+	if (blk_size == SZ_4K || blk_size == SZ_64K) {
+		pte = arm_short_get_pte_in_pgd(*pgd, iova_base);
+		curpte = *pte;
+		__arm_short_set_pte(pte, 0, blk_size / SZ_4K, cfg);
+
+		pgtablefree = _arm_short_whether_free_pgtable(pgd);
+		if (pgtablefree)
+			__arm_short_set_pte(pgd, 0, 1, cfg);
+	} else if (blk_size == SZ_1M || blk_size == SZ_16M) {
+		__arm_short_set_pte(pgd, 0, blk_size / SZ_1M, cfg);
+	}
+
+	cfg->tlb->tlb_add_flush(iova_base, blk_size, true, cookie);
+	cfg->tlb->tlb_sync(cookie);
+
+	if (pgtablefree)/* Free pgtable after tlb-flush */
+		__arm_short_free_pgtable(ARM_SHORT_GET_PGTABLE_VA(curpgd),
+					 ARM_SHORT_BYTES_PER_PTE, false, cfg);
+
+	if (blk_size > size) { /* Split the block */
+		return arm_short_split_blk_unmap(
+				ops, iova, paddr, size,
+				ARM_SHORT_PGD_GET_PROT(curpgd),
+				ARM_SHORT_PTE_LARGE_GET_PROT(curpte),
+				blk_size);
+	} else if (blk_size < size) {
+		/* Unmap the block while remap partial again after split */
+		return blk_size +
+			arm_short_unmap(ops, iova + blk_size, size - blk_size);
+	}
+
+	return size;
+}
+
+static struct io_pgtable *
+arm_short_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
+{
+	struct arm_short_io_pgtable *data;
+
+	if (cfg->ias > 32 || cfg->oas > 32)
+		return NULL;
+
+	cfg->pgsize_bitmap &=
+		(cfg->quirks & IO_PGTABLE_QUIRK_SHORT_SUPERSECTION) ?
+		(SZ_4K | SZ_64K | SZ_1M | SZ_16M) : (SZ_4K | SZ_64K | SZ_1M);
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return NULL;
+
+	data->pgd_size = SZ_16K;
+	data->pgd = __arm_short_alloc_pgtable(
+					data->pgd_size,
+					GFP_KERNEL | __GFP_ZERO | __GFP_DMA,
+					true, cfg);
+	if (!data->pgd)
+		goto out_free_data;
+	wmb();/* Ensure the empty pgd is visible before any actual TTBR write */
+
+	data->pgtable_cached = kmem_cache_create(
+					"io-pgtable-arm-short",
+					 ARM_SHORT_BYTES_PER_PTE,
+					 ARM_SHORT_BYTES_PER_PTE,
+					 0, NULL);
+	if (!data->pgtable_cached)
+		goto out_free_pgd;
+
+	/* TTBRs */
+	cfg->arm_short_cfg.ttbr[0] = virt_to_phys(data->pgd);
+	cfg->arm_short_cfg.ttbr[1] = 0;
+	cfg->arm_short_cfg.tcr = 0;
+	cfg->arm_short_cfg.nmrr = 0;
+	cfg->arm_short_cfg.prrr = 0;
+
+	data->iop.ops = (struct io_pgtable_ops) {
+		.map		= arm_short_map,
+		.unmap		= arm_short_unmap,
+		.iova_to_phys	= arm_short_iova_to_phys,
+	};
+
+	return &data->iop;
+
+out_free_pgd:
+	__arm_short_free_pgtable(data->pgd, data->pgd_size, true, cfg);
+out_free_data:
+	kfree(data);
+	return NULL;
+}
+
+static void arm_short_free_pgtable(struct io_pgtable *iop)
+{
+	struct arm_short_io_pgtable *data = io_pgtable_to_data(iop);
+
+	kmem_cache_destroy(data->pgtable_cached);
+	__arm_short_free_pgtable(data->pgd, data->pgd_size,
+				 true, &data->iop.cfg);
+	kfree(data);
+}
+
+struct io_pgtable_init_fns io_pgtable_arm_short_init_fns = {
+	.alloc	= arm_short_alloc_pgtable,
+	.free	= arm_short_free_pgtable,
+};
+
+#ifdef CONFIG_IOMMU_IO_PGTABLE_SHORT_SELFTEST
+
+static struct io_pgtable_cfg *cfg_cookie;
+
+static void dummy_tlb_flush_all(void *cookie)
+{
+	WARN_ON(cookie != cfg_cookie);
+}
+
+static void dummy_tlb_add_flush(unsigned long iova, size_t size, bool leaf,
+				void *cookie)
+{
+	WARN_ON(cookie != cfg_cookie);
+	WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
+}
+
+static void dummy_tlb_sync(void *cookie)
+{
+	WARN_ON(cookie != cfg_cookie);
+}
+
+static struct iommu_gather_ops dummy_tlb_ops = {
+	.tlb_flush_all	= dummy_tlb_flush_all,
+	.tlb_add_flush	= dummy_tlb_add_flush,
+	.tlb_sync	= dummy_tlb_sync,
+};
+
+#define __FAIL(ops)	({				\
+		WARN(1, "selftest: test failed\n");	\
+		selftest_running = false;		\
+		-EFAULT;				\
+})
+
+static int __init arm_short_do_selftests(void)
+{
+	struct io_pgtable_ops *ops;
+	struct io_pgtable_cfg cfg = {
+		.tlb = &dummy_tlb_ops,
+		.oas = 32,
+		.ias = 32,
+		.quirks = IO_PGTABLE_QUIRK_ARM_NS |
+			IO_PGTABLE_QUIRK_SHORT_SUPERSECTION,
+		.pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
+	};
+	unsigned int iova, size, iova_start;
+	unsigned int i, loopnr = 0;
+
+	selftest_running = true;
+
+	cfg_cookie = &cfg;
+
+	ops = alloc_io_pgtable_ops(ARM_SHORT_DESC, &cfg, &cfg);
+	if (!ops) {
+		pr_err("Failed to alloc short desc io pgtable\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Initial sanity checks.
+	 * Empty page tables shouldn't provide any translations.
+	 */
+	if (ops->iova_to_phys(ops, 42))
+		return __FAIL(ops);
+
+	if (ops->iova_to_phys(ops, SZ_1G + 42))
+		return __FAIL(ops);
+
+	if (ops->iova_to_phys(ops, SZ_2G + 42))
+		return __FAIL(ops);
+
+	/*
+	 * Distinct mappings of different granule sizes.
+	 */
+	iova = 0;
+	i = find_first_bit(&cfg.pgsize_bitmap, BITS_PER_LONG);
+	while (i != BITS_PER_LONG) {
+		size = 1UL << i;
+		if (ops->map(ops, iova, iova, size, IOMMU_READ |
+						    IOMMU_WRITE |
+						    IOMMU_NOEXEC |
+						    IOMMU_CACHE))
+			return __FAIL(ops);
+
+		/* Overlapping mappings */
+		if (!ops->map(ops, iova, iova + size, size,
+			      IOMMU_READ | IOMMU_NOEXEC))
+			return __FAIL(ops);
+
+		if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+			return __FAIL(ops);
+
+		iova += SZ_16M;
+		i++;
+		i = find_next_bit(&cfg.pgsize_bitmap, BITS_PER_LONG, i);
+		loopnr++;
+	}
+
+	/* Partial unmap */
+	i = 1;
+	size = 1UL << __ffs(cfg.pgsize_bitmap);
+	while (i < loopnr) {
+		iova_start = i * SZ_16M;
+		if (ops->unmap(ops, iova_start + size, size) != size)
+			return __FAIL(ops);
+
+		/* Remap of partial unmap */
+		if (ops->map(ops, iova_start + size, size, size, IOMMU_READ))
+			return __FAIL(ops);
+
+		if (ops->iova_to_phys(ops, iova_start + size + 42)
+		    != (size + 42))
+			return __FAIL(ops);
+		i++;
+	}
+
+	/* Full unmap */
+	iova = 0;
+	i = find_first_bit(&cfg.pgsize_bitmap, BITS_PER_LONG);
+	while (i != BITS_PER_LONG) {
+		size = 1UL << i;
+
+		if (ops->unmap(ops, iova, size) != size)
+			return __FAIL(ops);
+
+		if (ops->iova_to_phys(ops, iova + 42))
+			return __FAIL(ops);
+
+		/* Remap full block */
+		if (ops->map(ops, iova, iova, size, IOMMU_WRITE))
+			return __FAIL(ops);
+
+		if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+			return __FAIL(ops);
+
+		iova += SZ_16M;
+		i++;
+		i = find_next_bit(&cfg.pgsize_bitmap, BITS_PER_LONG, i);
+	}
+
+	free_io_pgtable_ops(ops);
+
+	selftest_running = false;
+	return 0;
+}
+
+subsys_initcall(arm_short_do_selftests);
+#endif
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index e4bc2b2..9978eca 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -38,9 +38,6 @@ 
 #define io_pgtable_to_data(x)						\
 	container_of((x), struct arm_lpae_io_pgtable, iop)
 
-#define io_pgtable_ops_to_pgtable(x)					\
-	container_of((x), struct io_pgtable, ops)
-
 #define io_pgtable_ops_to_data(x)					\
 	io_pgtable_to_data(io_pgtable_ops_to_pgtable(x))
 
diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
index 6436fe2..14a9b3a 100644
--- a/drivers/iommu/io-pgtable.c
+++ b/drivers/iommu/io-pgtable.c
@@ -28,6 +28,7 @@  extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns;
 extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns;
 extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns;
 extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_short_init_fns;
 
 static const struct io_pgtable_init_fns *
 io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] =
@@ -38,6 +39,9 @@  io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] =
 	[ARM_64_LPAE_S1] = &io_pgtable_arm_64_lpae_s1_init_fns,
 	[ARM_64_LPAE_S2] = &io_pgtable_arm_64_lpae_s2_init_fns,
 #endif
+#ifdef CONFIG_IOMMU_IO_PGTABLE_SHORT
+	[ARM_SHORT_DESC] = &io_pgtable_arm_short_init_fns,
+#endif
 };
 
 struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h
index 68c63d9..0f45e60 100644
--- a/drivers/iommu/io-pgtable.h
+++ b/drivers/iommu/io-pgtable.h
@@ -9,6 +9,7 @@  enum io_pgtable_fmt {
 	ARM_32_LPAE_S2,
 	ARM_64_LPAE_S1,
 	ARM_64_LPAE_S2,
+	ARM_SHORT_DESC,
 	IO_PGTABLE_NUM_FMTS,
 };
 
@@ -45,6 +46,9 @@  struct iommu_gather_ops {
  */
 struct io_pgtable_cfg {
 	#define IO_PGTABLE_QUIRK_ARM_NS	(1 << 0)	/* Set NS bit in PTEs */
+	#define IO_PGTABLE_QUIRK_SHORT_SUPERSECTION     BIT(1)
+	#define IO_PGTABLE_QUIRK_SHORT_NO_XN		BIT(2) /* No XN bit */
+	#define IO_PGTABLE_QUIRK_SHORT_NO_PERMS		BIT(3) /* No AP bit */
 	int				quirks;
 	unsigned long			pgsize_bitmap;
 	unsigned int			ias;
@@ -64,6 +68,13 @@  struct io_pgtable_cfg {
 			u64	vttbr;
 			u64	vtcr;
 		} arm_lpae_s2_cfg;
+
+		struct {
+			u32	ttbr[2];
+			u32	tcr;
+			u32	nmrr;
+			u32	prrr;
+		} arm_short_cfg;
 	};
 };
 
@@ -130,6 +141,9 @@  struct io_pgtable {
 	struct io_pgtable_ops	ops;
 };
 
+#define io_pgtable_ops_to_pgtable(x)		\
+	container_of((x), struct io_pgtable, ops)
+
 /**
  * struct io_pgtable_init_fns - Alloc/free a set of page tables for a
  *                              particular format.