[v3,3/6] iommu: add ARM short descriptor page table allocator.

Message ID: 1437037475-9065-4-git-send-email-yong.wu@mediatek.com
State: New, archived

Commit Message

Yong Wu (吴勇) July 16, 2015, 9:04 a.m. UTC
This patch is for ARM Short Descriptor Format.

Signed-off-by: Yong Wu <yong.wu@mediatek.com>
---
 drivers/iommu/Kconfig                |   18 +
 drivers/iommu/Makefile               |    1 +
 drivers/iommu/io-pgtable-arm-short.c |  742 ++++++++++++++++++++++++++++++++++
 drivers/iommu/io-pgtable-arm.c       |    3 -
 drivers/iommu/io-pgtable.c           |    4 +
 drivers/iommu/io-pgtable.h           |   13 +
 6 files changed, 778 insertions(+), 3 deletions(-)
 create mode 100644 drivers/iommu/io-pgtable-arm-short.c

Comments

Will Deacon July 21, 2015, 5:11 p.m. UTC | #1
Hello,

This is looking better, but I still have some concerns.

On Thu, Jul 16, 2015 at 10:04:32AM +0100, Yong Wu wrote:
> This patch is for ARM Short Descriptor Format.
> 
> Signed-off-by: Yong Wu <yong.wu@mediatek.com>
> ---
>  drivers/iommu/Kconfig                |   18 +
>  drivers/iommu/Makefile               |    1 +
>  drivers/iommu/io-pgtable-arm-short.c |  742 ++++++++++++++++++++++++++++++++++
>  drivers/iommu/io-pgtable-arm.c       |    3 -
>  drivers/iommu/io-pgtable.c           |    4 +
>  drivers/iommu/io-pgtable.h           |   13 +
>  6 files changed, 778 insertions(+), 3 deletions(-)
>  create mode 100644 drivers/iommu/io-pgtable-arm-short.c

[...]

> +#define ARM_SHORT_PGDIR_SHIFT                  20
> +#define ARM_SHORT_PAGE_SHIFT                   12
> +#define ARM_SHORT_PTRS_PER_PTE                 \
> +       (1 << (ARM_SHORT_PGDIR_SHIFT - ARM_SHORT_PAGE_SHIFT))
> +#define ARM_SHORT_BYTES_PER_PTE                        \
> +       (ARM_SHORT_PTRS_PER_PTE * sizeof(arm_short_iopte))
> +
> +/* level 1 pagetable */
> +#define ARM_SHORT_PGD_TYPE_PGTABLE             BIT(0)
> +#define ARM_SHORT_PGD_SECTION_XN               (BIT(0) | BIT(4))
> +#define ARM_SHORT_PGD_TYPE_SECTION             BIT(1)
> +#define ARM_SHORT_PGD_PGTABLE_XN               BIT(2)

This should be PXN, but I'm not even sure why we care for a table at
level 1.

> +#define ARM_SHORT_PGD_B                                BIT(2)
> +#define ARM_SHORT_PGD_C                                BIT(3)
> +#define ARM_SHORT_PGD_PGTABLE_NS               BIT(3)
> +#define ARM_SHORT_PGD_IMPLE                    BIT(9)
> +#define ARM_SHORT_PGD_TEX0                     BIT(12)
> +#define ARM_SHORT_PGD_S                                BIT(16)
> +#define ARM_SHORT_PGD_nG                       BIT(17)
> +#define ARM_SHORT_PGD_SUPERSECTION             BIT(18)
> +#define ARM_SHORT_PGD_SECTION_NS               BIT(19)
> +
> +#define ARM_SHORT_PGD_TYPE_SUPERSECTION                \
> +       (ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_SUPERSECTION)
> +#define ARM_SHORT_PGD_SECTION_TYPE_MSK         \
> +       (ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_SUPERSECTION)
> +#define ARM_SHORT_PGD_PGTABLE_TYPE_MSK         \
> +       (ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_TYPE_PGTABLE)
> +#define ARM_SHORT_PGD_TYPE_IS_PGTABLE(pgd)     \
> +       (((pgd) & ARM_SHORT_PGD_PGTABLE_TYPE_MSK) == ARM_SHORT_PGD_TYPE_PGTABLE)
> +#define ARM_SHORT_PGD_TYPE_IS_SECTION(pgd)     \
> +       (((pgd) & ARM_SHORT_PGD_SECTION_TYPE_MSK) == ARM_SHORT_PGD_TYPE_SECTION)
> +#define ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(pgd)        \
> +       (((pgd) & ARM_SHORT_PGD_SECTION_TYPE_MSK) == \
> +       ARM_SHORT_PGD_TYPE_SUPERSECTION)
> +#define ARM_SHORT_PGD_PGTABLE_MSK              0xfffffc00
> +#define ARM_SHORT_PGD_SECTION_MSK              (~(SZ_1M - 1))
> +#define ARM_SHORT_PGD_SUPERSECTION_MSK         (~(SZ_16M - 1))
> +
> +/* level 2 pagetable */
> +#define ARM_SHORT_PTE_TYPE_LARGE               BIT(0)
> +#define ARM_SHORT_PTE_SMALL_XN                 BIT(0)
> +#define ARM_SHORT_PTE_TYPE_SMALL               BIT(1)
> +#define ARM_SHORT_PTE_B                                BIT(2)
> +#define ARM_SHORT_PTE_C                                BIT(3)
> +#define ARM_SHORT_PTE_SMALL_TEX0               BIT(6)
> +#define ARM_SHORT_PTE_IMPLE                    BIT(9)

This is AP[2] for small pages.

> +#define ARM_SHORT_PTE_S                                BIT(10)
> +#define ARM_SHORT_PTE_nG                       BIT(11)
> +#define ARM_SHORT_PTE_LARGE_TEX0               BIT(12)
> +#define ARM_SHORT_PTE_LARGE_XN                 BIT(15)
> +#define ARM_SHORT_PTE_LARGE_MSK                        (~(SZ_64K - 1))
> +#define ARM_SHORT_PTE_SMALL_MSK                        (~(SZ_4K - 1))
> +#define ARM_SHORT_PTE_TYPE_MSK                 \
> +       (ARM_SHORT_PTE_TYPE_LARGE | ARM_SHORT_PTE_TYPE_SMALL)
> +#define ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(pte)   \
> +       (((((pte) & ARM_SHORT_PTE_TYPE_MSK) >> 1) << 1)\
> +       == ARM_SHORT_PTE_TYPE_SMALL)

I see what you're trying to do here, but the shifting is confusing. I
think it's better doing something like:

	((pte) & ARM_SHORT_PTE_TYPE_SMALL) == ARM_SHORT_PTE_TYPE_SMALL

or even just:

	(pte) & ARM_SHORT_PTE_TYPE_SMALL

> +#define ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(pte)   \
> +       (((pte) & ARM_SHORT_PTE_TYPE_MSK) == ARM_SHORT_PTE_TYPE_LARGE)
> +
> +#define ARM_SHORT_PGD_IDX(a)                   ((a) >> ARM_SHORT_PGDIR_SHIFT)
> +#define ARM_SHORT_PTE_IDX(a)                   \
> +       (((a) >> ARM_SHORT_PAGE_SHIFT) & (ARM_SHORT_PTRS_PER_PTE - 1))
> +
> +#define ARM_SHORT_GET_PTE_VA(pgd)              \
> +       (phys_to_virt((unsigned long)pgd & ARM_SHORT_PGD_PGTABLE_MSK))

Maybe better named as ARM_SHORT_GET_PGTABLE_VA ?

> +
> +#define ARM_SHORT_PTE_LARGE_GET_PROT(pte)      \
> +       (((pte) & (~ARM_SHORT_PTE_LARGE_MSK)) & ~ARM_SHORT_PTE_TYPE_MSK)
> +
> +#define ARM_SHORT_PGD_GET_PROT(pgd)            \
> +       (((pgd) & (~ARM_SHORT_PGD_SECTION_MSK)) & ~ARM_SHORT_PGD_SUPERSECTION)
> +
> +static bool selftest_running;
> +
> +static arm_short_iopte *
> +arm_short_get_pte_in_pgd(arm_short_iopte pgd, unsigned int iova)
> +{
> +       arm_short_iopte *pte;
> +
> +       pte = ARM_SHORT_GET_PTE_VA(pgd);
> +       pte += ARM_SHORT_PTE_IDX(iova);
> +       return pte;
> +}
> +
> +static void _arm_short_free_pgtable(struct arm_short_io_pgtable *data,
> +                                   arm_short_iopte *pgd)
> +{
> +       const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
> +       arm_short_iopte *pte;
> +       int i;
> +
> +       pte = ARM_SHORT_GET_PTE_VA(*pgd);
> +       for (i = 0; i < ARM_SHORT_PTRS_PER_PTE; i++) {
> +               if (pte[i] != 0)
> +                       return;
> +       }

Do you actually need this loop if you're not warning or returning an error?

> +
> +       /* Free whole pte and set pgd to zero while all pte is unmap */
> +       kmem_cache_free(data->ptekmem, pte);
> +       *pgd = 0;

I still don't think this is safe. What stops the page table walker from
speculatively walking freed memory?

> +       tlb->flush_pgtable(pgd, sizeof(*pgd), data->iop.cookie);
> +}
> +
> +static arm_short_iopte
> +__arm_short_pte_prot(struct arm_short_io_pgtable *data, int prot, bool large)
> +{
> +       arm_short_iopte pteprot;
> +
> +       pteprot = ARM_SHORT_PTE_S | ARM_SHORT_PTE_nG;
> +       pteprot |= large ? ARM_SHORT_PTE_TYPE_LARGE :
> +                               ARM_SHORT_PTE_TYPE_SMALL;
> +       if (prot & IOMMU_CACHE)
> +               pteprot |=  ARM_SHORT_PTE_B | ARM_SHORT_PTE_C;
> +       if (prot & IOMMU_WRITE)
> +               pteprot |= large ? ARM_SHORT_PTE_LARGE_TEX0 :
> +                               ARM_SHORT_PTE_SMALL_TEX0;

This doesn't make any sense. TEX[2:0] is all about memory attributes, not
permissions, so you're making the mapping write-back, write-allocate but
that's not what the IOMMU_* values are about.

> +       if (prot & IOMMU_NOEXEC)
> +               pteprot |= large ? ARM_SHORT_PTE_LARGE_XN :
> +                       ARM_SHORT_PTE_SMALL_XN;
> +       return pteprot;
> +}
> +
> +static arm_short_iopte
> +__arm_short_pgd_prot(struct arm_short_io_pgtable *data, int prot, bool super)
> +{
> +       arm_short_iopte pgdprot;
> +
> +       pgdprot = ARM_SHORT_PGD_S | ARM_SHORT_PGD_nG;
> +       pgdprot |= super ? ARM_SHORT_PGD_TYPE_SUPERSECTION :
> +                               ARM_SHORT_PGD_TYPE_SECTION;
> +       if (prot & IOMMU_CACHE)
> +               pgdprot |= ARM_SHORT_PGD_C | ARM_SHORT_PGD_B;
> +       if (prot & IOMMU_WRITE)
> +               pgdprot |= ARM_SHORT_PGD_TEX0;

Likewise.

> +       if (prot & IOMMU_NOEXEC)
> +               pgdprot |= ARM_SHORT_PGD_SECTION_XN;
> +       if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
> +               pgdprot |= ARM_SHORT_PGD_SECTION_NS;
> +       return pgdprot;
> +}
> +
> +static arm_short_iopte
> +__arm_short_pte_prot_split(struct arm_short_io_pgtable *data,
> +                          arm_short_iopte pgdprot,
> +                          arm_short_iopte pteprot_large,
> +                          bool large)
> +{
> +       arm_short_iopte pteprot = 0;
> +
> +       pteprot = ARM_SHORT_PTE_S | ARM_SHORT_PTE_nG;
> +       pteprot |= large ? ARM_SHORT_PTE_TYPE_LARGE :
> +                               ARM_SHORT_PTE_TYPE_SMALL;
> +       /* section to pte prot */
> +       if (pgdprot & ARM_SHORT_PGD_C)
> +               pteprot |= ARM_SHORT_PTE_C;
> +       if (pgdprot & ARM_SHORT_PGD_B)
> +               pteprot |= ARM_SHORT_PTE_B;
> +       if (pgdprot & ARM_SHORT_PGD_TEX0)
> +               pteprot |= large ? ARM_SHORT_PTE_LARGE_TEX0 :
> +                               ARM_SHORT_PTE_SMALL_TEX0;
> +       if (pgdprot & ARM_SHORT_PGD_nG)
> +               pteprot |= ARM_SHORT_PTE_nG;
> +       if (pgdprot & ARM_SHORT_PGD_SECTION_XN)
> +               pteprot |= large ? ARM_SHORT_PTE_LARGE_XN :
> +                               ARM_SHORT_PTE_SMALL_XN;
> +
> +       /* large page to small page pte prot. Only large page may split */
> +       if (pteprot_large && !large) {
> +               if (pteprot_large & ARM_SHORT_PTE_LARGE_TEX0)
> +                       pteprot |= ARM_SHORT_PTE_SMALL_TEX0;
> +               if (pteprot_large & ARM_SHORT_PTE_LARGE_XN)
> +                       pteprot |= ARM_SHORT_PTE_SMALL_XN;
> +       }
> +       return pteprot;
> +}
> +
> +static arm_short_iopte
> +__arm_short_pgtable_prot(struct arm_short_io_pgtable *data, bool noexec)
> +{
> +       arm_short_iopte pgdprot = 0;
> +
> +       pgdprot = ARM_SHORT_PGD_TYPE_PGTABLE;
> +       if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
> +               pgdprot |= ARM_SHORT_PGD_PGTABLE_NS;
> +       if (noexec)
> +               pgdprot |= ARM_SHORT_PGD_PGTABLE_XN;

I don't think you need to worry about XN bits for PGTABLEs.

> +       return pgdprot;
> +}
> +
> +static int
> +_arm_short_map(struct arm_short_io_pgtable *data,
> +              unsigned int iova, phys_addr_t paddr,
> +              arm_short_iopte pgdprot, arm_short_iopte pteprot,
> +              bool large)
> +{
> +       const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
> +       arm_short_iopte *pgd = data->pgd, *pte;
> +       void *cookie = data->iop.cookie, *pte_va;
> +       unsigned int ptenr = large ? 16 : 1;
> +       int i, quirk = data->iop.cfg.quirks;
> +       bool ptenew = false;
> +
> +       pgd += ARM_SHORT_PGD_IDX(iova);
> +
> +       if (!pteprot) { /* section or supersection */
> +               if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK)
> +                       pgdprot &= ~ARM_SHORT_PGD_SECTION_XN;
> +               pte = pgd;
> +               pteprot = pgdprot;
> +       } else {        /* page or largepage */
> +               if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK) {
> +                       if (large) { /* special Bit */

This definitely needs a better comment! What exactly are you doing here
and what is that quirk all about?

> +                               if (pteprot & ARM_SHORT_PTE_LARGE_TEX0) {
> +                                       pteprot &= ~ARM_SHORT_PTE_LARGE_TEX0;
> +                                       pteprot |= ARM_SHORT_PTE_SMALL_TEX0;
> +                               }
> +                               pteprot &= ~ARM_SHORT_PTE_LARGE_XN;
> +                       } else {
> +                               pteprot &= ~ARM_SHORT_PTE_SMALL_XN;
> +                       }
> +               }
> +
> +               if (!(*pgd)) {
> +                       pte_va = kmem_cache_zalloc(data->ptekmem, GFP_ATOMIC);
> +                       if (unlikely(!pte_va))
> +                               return -ENOMEM;
> +                       ptenew = true;
> +                       *pgd = virt_to_phys(pte_va) | pgdprot;
> +                       kmemleak_ignore(pte_va);
> +                       tlb->flush_pgtable(pgd, sizeof(*pgd), cookie);

I think you need to flush this before it becomes visible to the walker.

> +               }
> +               pte = arm_short_get_pte_in_pgd(*pgd, iova);
> +       }
> +
> +       pteprot |= (arm_short_iopte)paddr;
> +       for (i = 0; i < ptenr; i++) {
> +               if (pte[i]) {/* Someone else may have allocated for this pte */
> +                       WARN_ON(!selftest_running);
> +                       goto err_exist_pte;
> +               }
> +               pte[i] = pteprot;
> +       }
> +       tlb->flush_pgtable(pte, ptenr * sizeof(*pte), cookie);
> +
> +       return 0;
> +
> +err_exist_pte:
> +       while (i--)
> +               pte[i] = 0;
> +       if (ptenew)
> +               kmem_cache_free(data->ptekmem, pte_va);
> +       return -EEXIST;
> +}
> +
> +static int arm_short_map(struct io_pgtable_ops *ops, unsigned long iova,
> +                        phys_addr_t paddr, size_t size, int prot)
> +{
> +       struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
> +       const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
> +       int ret;
> +       arm_short_iopte pgdprot = 0, pteprot = 0;
> +       bool large;
> +
> +       /* If no access, then nothing to do */
> +       if (!(prot & (IOMMU_READ | IOMMU_WRITE)))
> +               return 0;
> +
> +       switch (size) {
> +       case SZ_4K:
> +       case SZ_64K:
> +               large = (size == SZ_64K) ? true : false;
> +               pteprot = __arm_short_pte_prot(data, prot, large);
> +               pgdprot = __arm_short_pgtable_prot(data, prot & IOMMU_NOEXEC);
> +               break;
> +
> +       case SZ_1M:
> +       case SZ_16M:
> +               large = (size == SZ_16M) ? true : false;
> +               pgdprot = __arm_short_pgd_prot(data, prot, large);
> +               break;
> +       default:
> +               return -EINVAL;
> +       }
> +
> +       if (WARN_ON((iova | paddr) & (size - 1)))
> +               return -EINVAL;
> +
> +       ret = _arm_short_map(data, iova, paddr, pgdprot, pteprot, large);
> +
> +       tlb->tlb_add_flush(iova, size, true, data->iop.cookie);
> +       tlb->tlb_sync(data->iop.cookie);

In _arm_short_map, it looks like you can only go from invalid -> valid,
so why do you need to flush the TLB here?

> +       return ret;
> +}
> +
> +static phys_addr_t arm_short_iova_to_phys(struct io_pgtable_ops *ops,
> +                                         unsigned long iova)
> +{
> +       struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
> +       arm_short_iopte *pte, *pgd = data->pgd;
> +       phys_addr_t pa = 0;
> +
> +       pgd += ARM_SHORT_PGD_IDX(iova);
> +
> +       if (ARM_SHORT_PGD_TYPE_IS_PGTABLE(*pgd)) {
> +               pte = arm_short_get_pte_in_pgd(*pgd, iova);
> +
> +               if (ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(*pte)) {
> +                       pa = (*pte) & ARM_SHORT_PTE_LARGE_MSK;
> +                       pa |= iova & ~ARM_SHORT_PTE_LARGE_MSK;
> +               } else if (ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(*pte)) {
> +                       pa = (*pte) & ARM_SHORT_PTE_SMALL_MSK;
> +                       pa |= iova & ~ARM_SHORT_PTE_SMALL_MSK;
> +               }
> +       } else if (ARM_SHORT_PGD_TYPE_IS_SECTION(*pgd)) {
> +               pa = (*pgd) & ARM_SHORT_PGD_SECTION_MSK;
> +               pa |= iova & ~ARM_SHORT_PGD_SECTION_MSK;
> +       } else if (ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(*pgd)) {
> +               pa = (*pgd) & ARM_SHORT_PGD_SUPERSECTION_MSK;
> +               pa |= iova & ~ARM_SHORT_PGD_SUPERSECTION_MSK;
> +       }
> +
> +       return pa;
> +}
> +
> +static int
> +arm_short_split_blk_unmap(struct io_pgtable_ops *ops, unsigned int iova,
> +                         phys_addr_t paddr, size_t size,
> +                         arm_short_iopte pgdprotup, arm_short_iopte pteprotup,
> +                         size_t blk_size)
> +{
> +       struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
> +       const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
> +       struct io_pgtable_cfg *cfg = &data->iop.cfg;
> +       unsigned long *pgbitmap = &cfg->pgsize_bitmap;
> +       unsigned int blk_base, blk_start, blk_end;
> +       arm_short_iopte pgdprot, pteprot;
> +       size_t mapsize = 0, nextmapsize;
> +       phys_addr_t blk_paddr;
> +       int ret;
> +       unsigned int i;
> +
> +       /* find the nearest mapsize */
> +       for (i = find_first_bit(pgbitmap, BITS_PER_LONG);
> +            i < BITS_PER_LONG && ((1 << i) < blk_size) &&
> +            IS_ALIGNED(size, 1 << i);
> +            i = find_next_bit(pgbitmap, BITS_PER_LONG, i + 1))
> +               mapsize = 1 << i;
> +
> +       if (WARN_ON(!mapsize))
> +               return 0; /* Bytes unmapped */
> +       nextmapsize = 1 << i;
> +
> +       blk_base = iova & ~(blk_size - 1);
> +       blk_start = blk_base;
> +       blk_end = blk_start + blk_size;
> +       blk_paddr = paddr;
> +
> +       for (; blk_start < blk_end;
> +            blk_start += mapsize, blk_paddr += mapsize) {
> +               /* Unmap! */
> +               if (blk_start == iova)
> +                       continue;
> +
> +               /* Try to upper map */
> +               if (blk_base != blk_start &&
> +                   IS_ALIGNED(blk_start | blk_paddr, nextmapsize) &&
> +                   mapsize != nextmapsize) {
> +                       mapsize = nextmapsize;
> +                       i = find_next_bit(pgbitmap, BITS_PER_LONG, i + 1);
> +                       if (i < BITS_PER_LONG)
> +                               nextmapsize = 1 << i;
> +               }
> +
> +               if (mapsize == SZ_1M) {
> +                       pgdprot = pgdprotup;
> +                       pgdprot |= __arm_short_pgd_prot(data, 0, false);
> +                       pteprot = 0;
> +               } else { /* small or large page */
> +                       bool noexec = (blk_size == SZ_64K) ?
> +                               (pteprotup & ARM_SHORT_PTE_LARGE_XN) :
> +                               (pgdprotup & ARM_SHORT_PGD_SECTION_XN);
> +
> +                       pteprot = __arm_short_pte_prot_split(
> +                                               data, pgdprotup, pteprotup,
> +                                               mapsize == SZ_64K);
> +                       pgdprot = __arm_short_pgtable_prot(data, noexec);
> +               }
> +
> +               ret = _arm_short_map(data, blk_start, blk_paddr, pgdprot,
> +                                    pteprot, mapsize == SZ_64K);
> +               if (ret < 0) {
> +                       /* Free the table we allocated */
> +                       arm_short_iopte *pgd = data->pgd, *pte;
> +
> +                       pgd += ARM_SHORT_PGD_IDX(blk_base);
> +                       if (*pgd) {
> +                               pte = ARM_SHORT_GET_PTE_VA(*pgd);
> +                               kmem_cache_free(data->ptekmem, pte);
> +                               *pgd = 0;
> +                               tlb->flush_pgtable(pgd, sizeof(*pgd),
> +                                                  data->iop.cookie);

Again, the ordering looks totally wrong here. You need to:

  (1) Zero the pgd pointer
  (2) Flush the pointer, so the walker can no longer follow the page table
  (3) Invalidate the TLB, so we don't have any cached intermediate walks
  (4) Sync the TLB
  (5) Free the memory
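
Concretely, with the hooks used in this patch, that sequence might look
something like the sketch below (identifiers are taken from the patch;
the exact call site is illustrative only):

	/* (1) Make the level-2 table unreachable from the pgd */
	*pgd = 0;
	/* (2) Flush the updated pgd entry so the walker sees the change */
	tlb->flush_pgtable(pgd, sizeof(*pgd), data->iop.cookie);
	/* (3) and (4) Invalidate, then sync, any cached intermediate walks */
	tlb->tlb_add_flush(blk_base, blk_size, true, data->iop.cookie);
	tlb->tlb_sync(data->iop.cookie);
	/* (5) Only now is the table memory safe to free */
	kmem_cache_free(data->ptekmem, pte);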

That said, I don't fully follow the logic here.

> +                       }
> +                       return 0;/* Bytes unmapped */
> +               }
> +               tlb->tlb_add_flush(blk_start, mapsize, true, data->iop.cookie);
> +               tlb->tlb_sync(data->iop.cookie);

Why are you doing this here for every iteration?

Will
Yong Wu (吴勇) July 24, 2015, 5:24 a.m. UTC | #2
Hi Will,
    Thanks for your detailed review.
    When you are free, please check whether the changes below are OK.
    Thanks very much.

On Tue, 2015-07-21 at 18:11 +0100, Will Deacon wrote:
> Hello,
> 
> This is looking better, but I still have some concerns.
> 
> On Thu, Jul 16, 2015 at 10:04:32AM +0100, Yong Wu wrote:
> > This patch is for ARM Short Descriptor Format.
> > 
> > Signed-off-by: Yong Wu <yong.wu@mediatek.com>
> > ---
> >  drivers/iommu/Kconfig                |   18 +
> >  drivers/iommu/Makefile               |    1 +
> >  drivers/iommu/io-pgtable-arm-short.c |  742 ++++++++++++++++++++++++++++++++++
> >  drivers/iommu/io-pgtable-arm.c       |    3 -
> >  drivers/iommu/io-pgtable.c           |    4 +
> >  drivers/iommu/io-pgtable.h           |   13 +
> >  6 files changed, 778 insertions(+), 3 deletions(-)
> >  create mode 100644 drivers/iommu/io-pgtable-arm-short.c
> 
> [...]
> 
> > +#define ARM_SHORT_PGDIR_SHIFT                  20
> > +#define ARM_SHORT_PAGE_SHIFT                   12
> > +#define ARM_SHORT_PTRS_PER_PTE                 \
> > +       (1 << (ARM_SHORT_PGDIR_SHIFT - ARM_SHORT_PAGE_SHIFT))
> > +#define ARM_SHORT_BYTES_PER_PTE                        \
> > +       (ARM_SHORT_PTRS_PER_PTE * sizeof(arm_short_iopte))
> > +
> > +/* level 1 pagetable */
> > +#define ARM_SHORT_PGD_TYPE_PGTABLE             BIT(0)
> > +#define ARM_SHORT_PGD_SECTION_XN               (BIT(0) | BIT(4))
> > +#define ARM_SHORT_PGD_TYPE_SECTION             BIT(1)
> > +#define ARM_SHORT_PGD_PGTABLE_XN               BIT(2)
> 
> This should be PXN, but I'm not even sure why we care for a table at
> level 1.

I will delete it.

> 
> > +#define ARM_SHORT_PGD_B                                BIT(2)
> > +#define ARM_SHORT_PGD_C                                BIT(3)
> > +#define ARM_SHORT_PGD_PGTABLE_NS               BIT(3)
> > +#define ARM_SHORT_PGD_IMPLE                    BIT(9)
> > +#define ARM_SHORT_PGD_TEX0                     BIT(12)
> > +#define ARM_SHORT_PGD_S                                BIT(16)
> > +#define ARM_SHORT_PGD_nG                       BIT(17)
> > +#define ARM_SHORT_PGD_SUPERSECTION             BIT(18)
> > +#define ARM_SHORT_PGD_SECTION_NS               BIT(19)
> > +
> > +#define ARM_SHORT_PGD_TYPE_SUPERSECTION                \
> > +       (ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_SUPERSECTION)
> > +#define ARM_SHORT_PGD_SECTION_TYPE_MSK         \
> > +       (ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_SUPERSECTION)
> > +#define ARM_SHORT_PGD_PGTABLE_TYPE_MSK         \
> > +       (ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_TYPE_PGTABLE)
> > +#define ARM_SHORT_PGD_TYPE_IS_PGTABLE(pgd)     \
> > +       (((pgd) & ARM_SHORT_PGD_PGTABLE_TYPE_MSK) == ARM_SHORT_PGD_TYPE_PGTABLE)
> > +#define ARM_SHORT_PGD_TYPE_IS_SECTION(pgd)     \
> > +       (((pgd) & ARM_SHORT_PGD_SECTION_TYPE_MSK) == ARM_SHORT_PGD_TYPE_SECTION)
> > +#define ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(pgd)        \
> > +       (((pgd) & ARM_SHORT_PGD_SECTION_TYPE_MSK) == \
> > +       ARM_SHORT_PGD_TYPE_SUPERSECTION)
> > +#define ARM_SHORT_PGD_PGTABLE_MSK              0xfffffc00
> > +#define ARM_SHORT_PGD_SECTION_MSK              (~(SZ_1M - 1))
> > +#define ARM_SHORT_PGD_SUPERSECTION_MSK         (~(SZ_16M - 1))
> > +
> > +/* level 2 pagetable */
> > +#define ARM_SHORT_PTE_TYPE_LARGE               BIT(0)
> > +#define ARM_SHORT_PTE_SMALL_XN                 BIT(0)
> > +#define ARM_SHORT_PTE_TYPE_SMALL               BIT(1)
> > +#define ARM_SHORT_PTE_B                                BIT(2)
> > +#define ARM_SHORT_PTE_C                                BIT(3)
> > +#define ARM_SHORT_PTE_SMALL_TEX0               BIT(6)
> > +#define ARM_SHORT_PTE_IMPLE                    BIT(9)
> 
> This is AP[2] for small pages.

Sorry. In our page table, bit 9 in both the PGD and the PTE is PA[32],
which is for DRAM sizes over 4GB. I didn't notice that it differs from
the PTE in the standard spec.
And we don't use AP[2] currently, so I will just delete this line next
time.

> 
> > +#define ARM_SHORT_PTE_S                                BIT(10)
> > +#define ARM_SHORT_PTE_nG                       BIT(11)
> > +#define ARM_SHORT_PTE_LARGE_TEX0               BIT(12)
> > +#define ARM_SHORT_PTE_LARGE_XN                 BIT(15)
> > +#define ARM_SHORT_PTE_LARGE_MSK                        (~(SZ_64K - 1))
> > +#define ARM_SHORT_PTE_SMALL_MSK                        (~(SZ_4K - 1))
> > +#define ARM_SHORT_PTE_TYPE_MSK                 \
> > +       (ARM_SHORT_PTE_TYPE_LARGE | ARM_SHORT_PTE_TYPE_SMALL)
> > +#define ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(pte)   \
> > +       (((((pte) & ARM_SHORT_PTE_TYPE_MSK) >> 1) << 1)\
> > +       == ARM_SHORT_PTE_TYPE_SMALL)
> 
> I see what you're trying to do here, but the shifting is confusing. I
> think it's better doing something like:
> 
> 	((pte) & ARM_SHORT_PTE_TYPE_SMALL) == ARM_SHORT_PTE_TYPE_SMALL
> 
> or even just:
> 
> 	(pte) & ARM_SHORT_PTE_TYPE_SMALL

Thanks. I will use this:
((pte) & ARM_SHORT_PTE_TYPE_SMALL) == ARM_SHORT_PTE_TYPE_SMALL

And I will move this line next to ARM_SHORT_PTE_TYPE_IS_LARGEPAGE below.
> 
> > +#define ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(pte)   \
> > +       (((pte) & ARM_SHORT_PTE_TYPE_MSK) == ARM_SHORT_PTE_TYPE_LARGE)
> > +
> > +#define ARM_SHORT_PGD_IDX(a)                   ((a) >> ARM_SHORT_PGDIR_SHIFT)
> > +#define ARM_SHORT_PTE_IDX(a)                   \
> > +       (((a) >> ARM_SHORT_PAGE_SHIFT) & (ARM_SHORT_PTRS_PER_PTE - 1))
> > +
> > +#define ARM_SHORT_GET_PTE_VA(pgd)              \
> > +       (phys_to_virt((unsigned long)pgd & ARM_SHORT_PGD_PGTABLE_MSK))
> 
> Maybe better named as ARM_SHORT_GET_PGTABLE_VA ?

Thanks.

> 
> > +
> > +#define ARM_SHORT_PTE_LARGE_GET_PROT(pte)      \
> > +       (((pte) & (~ARM_SHORT_PTE_LARGE_MSK)) & ~ARM_SHORT_PTE_TYPE_MSK)
> > +
> > +#define ARM_SHORT_PGD_GET_PROT(pgd)            \
> > +       (((pgd) & (~ARM_SHORT_PGD_SECTION_MSK)) & ~ARM_SHORT_PGD_SUPERSECTION)
> > +
> > +static bool selftest_running;
> > +
> > +static arm_short_iopte *
> > +arm_short_get_pte_in_pgd(arm_short_iopte pgd, unsigned int iova)
> > +{
> > +       arm_short_iopte *pte;
> > +
> > +       pte = ARM_SHORT_GET_PTE_VA(pgd);
> > +       pte += ARM_SHORT_PTE_IDX(iova);
> > +       return pte;
> > +}
> > +
> > +static void _arm_short_free_pgtable(struct arm_short_io_pgtable *data,
> > +                                   arm_short_iopte *pgd)
> > +{
> > +       const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
> > +       arm_short_iopte *pte;
> > +       int i;
> > +
> > +       pte = ARM_SHORT_GET_PTE_VA(*pgd);
> > +       for (i = 0; i < ARM_SHORT_PTRS_PER_PTE; i++) {
> > +               if (pte[i] != 0)
> > +                       return;
> > +       }
> 
> Do you actually need this loop if you're not warning or returning an error?

I can't return an error here.

Here I only walk all the pte entries in this pgd:
if any pte entry is still mapped, we do nothing, which is also an OK case;
if all the pte entries are unmapped, we have to free the memory of the
pgtable (the kmem cache).

> 
> > +
> > +       /* Free whole pte and set pgd to zero while all pte is unmap */
> > +       kmem_cache_free(data->ptekmem, pte);
> > +       *pgd = 0;
> 
> I still don't think this is safe. What stops the page table walker from
> speculatively walking freed memory?

      Sorry. I didn't take enough care over the free flow this time.

      I will change it like below:
static bool _arm_short_free_pgtable(struct arm_short_io_pgtable *data,
				    arm_short_iopte *pgd)
{
	arm_short_iopte *pte = ARM_SHORT_GET_PGTABLE_VA(*pgd);
	int i;

	/* Return true only if every pte entry of this pgd is unmapped;
	 * otherwise return false.
	 */
	for (i = 0; i < ARM_SHORT_PTRS_PER_PTE; i++)
		if (pte[i])
			return false;
	return true;
}
   In arm_short_unmap, I will check the return value and follow the
five free steps you suggest below.

> 
> > +       tlb->flush_pgtable(pgd, sizeof(*pgd), data->iop.cookie);
> > +}
> > +
> > +static arm_short_iopte
> > +__arm_short_pte_prot(struct arm_short_io_pgtable *data, int prot, bool large)
> > +{
> > +       arm_short_iopte pteprot;
> > +
> > +       pteprot = ARM_SHORT_PTE_S | ARM_SHORT_PTE_nG;
> > +       pteprot |= large ? ARM_SHORT_PTE_TYPE_LARGE :
> > +                               ARM_SHORT_PTE_TYPE_SMALL;
> > +       if (prot & IOMMU_CACHE)
> > +               pteprot |=  ARM_SHORT_PTE_B | ARM_SHORT_PTE_C;
> > +       if (prot & IOMMU_WRITE)
> > +               pteprot |= large ? ARM_SHORT_PTE_LARGE_TEX0 :
> > +                               ARM_SHORT_PTE_SMALL_TEX0;
> 
> This doesn't make any sense. TEX[2:0] is all about memory attributes, not
> permissions, so you're making the mapping write-back, write-allocate but
> that's not what the IOMMU_* values are about.

     I will delete it.

> 
> > +       if (prot & IOMMU_NOEXEC)
> > +               pteprot |= large ? ARM_SHORT_PTE_LARGE_XN :
> > +                       ARM_SHORT_PTE_SMALL_XN;
> > +       return pteprot;
> > +}
> > +
> > +static arm_short_iopte
> > +__arm_short_pgd_prot(struct arm_short_io_pgtable *data, int prot, bool super)
> > +{
> > +       arm_short_iopte pgdprot;
> > +
> > +       pgdprot = ARM_SHORT_PGD_S | ARM_SHORT_PGD_nG;
> > +       pgdprot |= super ? ARM_SHORT_PGD_TYPE_SUPERSECTION :
> > +                               ARM_SHORT_PGD_TYPE_SECTION;
> > +       if (prot & IOMMU_CACHE)
> > +               pgdprot |= ARM_SHORT_PGD_C | ARM_SHORT_PGD_B;
> > +       if (prot & IOMMU_WRITE)
> > +               pgdprot |= ARM_SHORT_PGD_TEX0;
> 
> Likewise.

I will delete it.

> 
> > +       if (prot & IOMMU_NOEXEC)
> > +               pgdprot |= ARM_SHORT_PGD_SECTION_XN;
> > +       if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
> > +               pgdprot |= ARM_SHORT_PGD_SECTION_NS;
> > +       return pgdprot;
> > +}
> > +
> > +static arm_short_iopte
> > +__arm_short_pte_prot_split(struct arm_short_io_pgtable *data,
> > +                          arm_short_iopte pgdprot,
> > +                          arm_short_iopte pteprot_large,
> > +                          bool large)
> > +{
> > +       arm_short_iopte pteprot = 0;
> > +
> > +       pteprot = ARM_SHORT_PTE_S | ARM_SHORT_PTE_nG;
> > +       pteprot |= large ? ARM_SHORT_PTE_TYPE_LARGE :
> > +                               ARM_SHORT_PTE_TYPE_SMALL;
> > +       /* section to pte prot */
> > +       if (pgdprot & ARM_SHORT_PGD_C)
> > +               pteprot |= ARM_SHORT_PTE_C;
> > +       if (pgdprot & ARM_SHORT_PGD_B)
> > +               pteprot |= ARM_SHORT_PTE_B;
> > +       if (pgdprot & ARM_SHORT_PGD_TEX0)
> > +               pteprot |= large ? ARM_SHORT_PTE_LARGE_TEX0 :
> > +                               ARM_SHORT_PTE_SMALL_TEX0;

I will also delete it here.

> > +       if (pgdprot & ARM_SHORT_PGD_nG)
> > +               pteprot |= ARM_SHORT_PTE_nG;
> > +       if (pgdprot & ARM_SHORT_PGD_SECTION_XN)
> > +               pteprot |= large ? ARM_SHORT_PTE_LARGE_XN :
> > +                               ARM_SHORT_PTE_SMALL_XN;
> > +
> > +       /* large page to small page pte prot. Only large page may split */
> > +       if (pteprot_large && !large) {
> > +               if (pteprot_large & ARM_SHORT_PTE_LARGE_TEX0)
> > +                       pteprot |= ARM_SHORT_PTE_SMALL_TEX0;

I will delete this too.

> > +               if (pteprot_large & ARM_SHORT_PTE_LARGE_XN)
> > +                       pteprot |= ARM_SHORT_PTE_SMALL_XN;
> > +       }
> > +       return pteprot;
> > +}
> > +
> > +static arm_short_iopte
> > +__arm_short_pgtable_prot(struct arm_short_io_pgtable *data, bool noexec)
> > +{
> > +       arm_short_iopte pgdprot = 0;
> > +
> > +       pgdprot = ARM_SHORT_PGD_TYPE_PGTABLE;
> > +       if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
> > +               pgdprot |= ARM_SHORT_PGD_PGTABLE_NS;
> > +       if (noexec)
> > +               pgdprot |= ARM_SHORT_PGD_PGTABLE_XN;
> 
> I don't think you need to worry about XN bits for PGTABLEs.

I will delete it. 
MTK hardware doesn't have an XN bit for PGTABLE descriptors.
I had planned to add all the bits according to the standard spec, and
use the MTK quirk to clear the bits our special hardware doesn't have.

> 
> > +       return pgdprot;
> > +}
> > +
> > +static int
> > +_arm_short_map(struct arm_short_io_pgtable *data,
> > +              unsigned int iova, phys_addr_t paddr,
> > +              arm_short_iopte pgdprot, arm_short_iopte pteprot,
> > +              bool large)
> > +{
> > +       const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
> > +       arm_short_iopte *pgd = data->pgd, *pte;
> > +       void *cookie = data->iop.cookie, *pte_va;
> > +       unsigned int ptenr = large ? 16 : 1;
> > +       int i, quirk = data->iop.cfg.quirks;
> > +       bool ptenew = false;
> > +
> > +       pgd += ARM_SHORT_PGD_IDX(iova);
> > +
> > +       if (!pteprot) { /* section or supersection */
> > +               if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK)
> > +                       pgdprot &= ~ARM_SHORT_PGD_SECTION_XN;
> > +               pte = pgd;
> > +               pteprot = pgdprot;
> > +       } else {        /* page or largepage */
> > +               if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK) {
> > +                       if (large) { /* special Bit */
> 
> This definitely needs a better comment! What exactly are you doing here
> and what is that quirk all about?

I use this quirk for the MTK special bits, as we don't have the XN bit
in the page table.

And the TEX0 handling will be deleted. Then it will look like this:

//==========
    if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK) {
            if (large)
                  pteprot &= ~ARM_SHORT_PTE_LARGE_XN;
            else
                  pteprot &= ~ARM_SHORT_PTE_SMALL_XN;
    }
//=========
And I will move the comment close to the definition of the quirk,

like this:
#define IO_PGTABLE_QUIRK_SHORT_MTK	BIT(2) /* MTK special bit */
> 
> > +                               if (pteprot & ARM_SHORT_PTE_LARGE_TEX0) {
> > +                                       pteprot &= ~ARM_SHORT_PTE_LARGE_TEX0;
> > +                                       pteprot |= ARM_SHORT_PTE_SMALL_TEX0;
> > +                               }
> > +                               pteprot &= ~ARM_SHORT_PTE_LARGE_XN;
> > +                       } else {
> > +                               pteprot &= ~ARM_SHORT_PTE_SMALL_XN;
> > +                       }
> > +               }
> > +
> > +               if (!(*pgd)) {
> > +                       pte_va = kmem_cache_zalloc(data->ptekmem, GFP_ATOMIC);
> > +                       if (unlikely(!pte_va))
> > +                               return -ENOMEM;
> > +                       ptenew = true;
> > +                       *pgd = virt_to_phys(pte_va) | pgdprot;
> > +                       kmemleak_ignore(pte_va);
> > +                       tlb->flush_pgtable(pgd, sizeof(*pgd), cookie);
> 
> I think you need to flush this before it becomes visible to the walker.

I have flushed the page table here. Do you mean flushing the TLB here?

From the comment below, you don't think a TLB flush is necessary while
the new pte entry goes from invalid -> valid.

> 
> > +               }
> > +               pte = arm_short_get_pte_in_pgd(*pgd, iova);
> > +       }
> > +
> > +       pteprot |= (arm_short_iopte)paddr;
> > +       for (i = 0; i < ptenr; i++) {
> > +               if (pte[i]) {/* Someone else may have allocated for this pte */
> > +                       WARN_ON(!selftest_running);
> > +                       goto err_exist_pte;
> > +               }
> > +               pte[i] = pteprot;
> > +       }
> > +       tlb->flush_pgtable(pte, ptenr * sizeof(*pte), cookie);
> > +
> > +       return 0;
> > +
> > +err_exist_pte:
> > +       while (i--)
> > +               pte[i] = 0;
> > +       if (ptenew)
> > +               kmem_cache_free(data->ptekmem, pte_va);
> > +       return -EEXIST;
> > +}
> > +
> > +static int arm_short_map(struct io_pgtable_ops *ops, unsigned long iova,
> > +                        phys_addr_t paddr, size_t size, int prot)
> > +{
> > +       struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
> > +       const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
> > +       int ret;
> > +       arm_short_iopte pgdprot = 0, pteprot = 0;
> > +       bool large;
> > +
> > +       /* If no access, then nothing to do */
> > +       if (!(prot & (IOMMU_READ | IOMMU_WRITE)))
> > +               return 0;
> > +
> > +       switch (size) {
> > +       case SZ_4K:
> > +       case SZ_64K:
> > +               large = (size == SZ_64K) ? true : false;
> > +               pteprot = __arm_short_pte_prot(data, prot, large);
> > +               pgdprot = __arm_short_pgtable_prot(data, prot & IOMMU_NOEXEC);
> > +               break;
> > +
> > +       case SZ_1M:
> > +       case SZ_16M:
> > +               large = (size == SZ_16M) ? true : false;
> > +               pgdprot = __arm_short_pgd_prot(data, prot, large);
> > +               break;
> > +       default:
> > +               return -EINVAL;
> > +       }
> > +
> > +       if (WARN_ON((iova | paddr) & (size - 1)))
> > +               return -EINVAL;
> > +
> > +       ret = _arm_short_map(data, iova, paddr, pgdprot, pteprot, large);
> > +
> > +       tlb->tlb_add_flush(iova, size, true, data->iop.cookie);
> > +       tlb->tlb_sync(data->iop.cookie);
> 
> In _arm_short_map, it looks like you can only go from invalid -> valid,
> so why do you need to flush the TLB here?

I will delete the TLB flush here.
Then the TLB flush is only called after unmap and split.

> 
> > +       return ret;
> > +}
> > +
> > +static phys_addr_t arm_short_iova_to_phys(struct io_pgtable_ops *ops,
> > +                                         unsigned long iova)
> > +{
> > +       struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
> > +       arm_short_iopte *pte, *pgd = data->pgd;
> > +       phys_addr_t pa = 0;
> > +
> > +       pgd += ARM_SHORT_PGD_IDX(iova);
> > +
> > +       if (ARM_SHORT_PGD_TYPE_IS_PGTABLE(*pgd)) {
> > +               pte = arm_short_get_pte_in_pgd(*pgd, iova);
> > +
> > +               if (ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(*pte)) {
> > +                       pa = (*pte) & ARM_SHORT_PTE_LARGE_MSK;
> > +                       pa |= iova & ~ARM_SHORT_PTE_LARGE_MSK;
> > +               } else if (ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(*pte)) {
> > +                       pa = (*pte) & ARM_SHORT_PTE_SMALL_MSK;
> > +                       pa |= iova & ~ARM_SHORT_PTE_SMALL_MSK;
> > +               }
> > +       } else if (ARM_SHORT_PGD_TYPE_IS_SECTION(*pgd)) {
> > +               pa = (*pgd) & ARM_SHORT_PGD_SECTION_MSK;
> > +               pa |= iova & ~ARM_SHORT_PGD_SECTION_MSK;
> > +       } else if (ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(*pgd)) {
> > +               pa = (*pgd) & ARM_SHORT_PGD_SUPERSECTION_MSK;
> > +               pa |= iova & ~ARM_SHORT_PGD_SUPERSECTION_MSK;
> > +       }
> > +
> > +       return pa;
> > +}
> > +
> > +static int
> > +arm_short_split_blk_unmap(struct io_pgtable_ops *ops, unsigned int iova,
> > +                         phys_addr_t paddr, size_t size,
> > +                         arm_short_iopte pgdprotup, arm_short_iopte pteprotup,
> > +                         size_t blk_size)
> > +{
> > +       struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
> > +       const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
> > +       struct io_pgtable_cfg *cfg = &data->iop.cfg;
> > +       unsigned long *pgbitmap = &cfg->pgsize_bitmap;
> > +       unsigned int blk_base, blk_start, blk_end;
> > +       arm_short_iopte pgdprot, pteprot;
> > +       size_t mapsize = 0, nextmapsize;
> > +       phys_addr_t blk_paddr;
> > +       int ret;
> > +       unsigned int i;
> > +
> > +       /* find the nearest mapsize */
> > +       for (i = find_first_bit(pgbitmap, BITS_PER_LONG);
> > +            i < BITS_PER_LONG && ((1 << i) < blk_size) &&
> > +            IS_ALIGNED(size, 1 << i);
> > +            i = find_next_bit(pgbitmap, BITS_PER_LONG, i + 1))
> > +               mapsize = 1 << i;
> > +
> > +       if (WARN_ON(!mapsize))
> > +               return 0; /* Bytes unmapped */
> > +       nextmapsize = 1 << i;
> > +
> > +       blk_base = iova & ~(blk_size - 1);
> > +       blk_start = blk_base;
> > +       blk_end = blk_start + blk_size;
> > +       blk_paddr = paddr;
> > +
> > +       for (; blk_start < blk_end;
> > +            blk_start += mapsize, blk_paddr += mapsize) {
> > +               /* Unmap! */
> > +               if (blk_start == iova)
> > +                       continue;
> > +
> > +               /* Try to upper map */
> > +               if (blk_base != blk_start &&
> > +                   IS_ALIGNED(blk_start | blk_paddr, nextmapsize) &&
> > +                   mapsize != nextmapsize) {
> > +                       mapsize = nextmapsize;
> > +                       i = find_next_bit(pgbitmap, BITS_PER_LONG, i + 1);
> > +                       if (i < BITS_PER_LONG)
> > +                               nextmapsize = 1 << i;
> > +               }
> > +
> > +               if (mapsize == SZ_1M) {
> > +                       pgdprot = pgdprotup;
> > +                       pgdprot |= __arm_short_pgd_prot(data, 0, false);
> > +                       pteprot = 0;
> > +               } else { /* small or large page */
> > +                       bool noexec = (blk_size == SZ_64K) ?
> > +                               (pteprotup & ARM_SHORT_PTE_LARGE_XN) :
> > +                               (pgdprotup & ARM_SHORT_PGD_SECTION_XN);
> > +
> > +                       pteprot = __arm_short_pte_prot_split(
> > +                                               data, pgdprotup, pteprotup,
> > +                                               mapsize == SZ_64K);
> > +                       pgdprot = __arm_short_pgtable_prot(data, noexec);
> > +               }
> > +
> > +               ret = _arm_short_map(data, blk_start, blk_paddr, pgdprot,
> > +                                    pteprot, mapsize == SZ_64K);
> > +               if (ret < 0) {
> > +                       /* Free the table we allocated */
> > +                       arm_short_iopte *pgd = data->pgd, *pte;
> > +
> > +                       pgd += ARM_SHORT_PGD_IDX(blk_base);
> > +                       if (*pgd) {
> > +                               pte = ARM_SHORT_GET_PTE_VA(*pgd);
> > +                               kmem_cache_free(data->ptekmem, pte);
> > +                               *pgd = 0;
> > +                               tlb->flush_pgtable(pgd, sizeof(*pgd),
> > +                                                  data->iop.cookie);
> 
> Again, the ordering looks totally wrong here. You need to:
> 
>   (1) Zero the pgd pointer
>   (2) Flush the pointer, so the walker can no longer follow the page table
>   (3) Invalidate the TLB, so we don't have any cached intermediate walks
>   (4) Sync the TLB
>   (5) Free the memory
> 
> That said, I don't fully follow the logic here.

Sorry. I didn't pay special attention to the free flow. I will change
it like below:
//=======
       *pgd = 0;
       tlb->flush_pgtable(pgd, sizeof(*pgd), data->iop.cookie);
       tlb->tlb_add_flush(blk_base, blk_size, true, data->iop.cookie);
       tlb->tlb_sync(data->iop.cookie);
       kmem_cache_free(data->ptekmem, pte);
//============

> 
> > +                       }
> > +                       return 0;/* Bytes unmapped */
> > +               }
> > +               tlb->tlb_add_flush(blk_start, mapsize, true, data->iop.cookie);
> > +               tlb->tlb_sync(data->iop.cookie);
> 
> Why are you doing this here for every iteration?

I will move it out of the loop, as sketched below.
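
A sketch of what that change might look like (identifiers from the
patch; illustrative only):

	for (; blk_start < blk_end;
	     blk_start += mapsize, blk_paddr += mapsize) {
		/* ... split and remap each block as before ... */
	}
	/* Flush once for the whole split instead of per iteration */
	tlb->tlb_add_flush(blk_base, blk_size, true, data->iop.cookie);
	tlb->tlb_sync(data->iop.cookie);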

> Will
Will Deacon July 24, 2015, 4:53 p.m. UTC | #3
On Fri, Jul 24, 2015 at 06:24:26AM +0100, Yong Wu wrote:
> On Tue, 2015-07-21 at 18:11 +0100, Will Deacon wrote:
> > On Thu, Jul 16, 2015 at 10:04:32AM +0100, Yong Wu wrote:
> > > +/* level 2 pagetable */
> > > +#define ARM_SHORT_PTE_TYPE_LARGE               BIT(0)
> > > +#define ARM_SHORT_PTE_SMALL_XN                 BIT(0)
> > > +#define ARM_SHORT_PTE_TYPE_SMALL               BIT(1)
> > > +#define ARM_SHORT_PTE_B                                BIT(2)
> > > +#define ARM_SHORT_PTE_C                                BIT(3)
> > > +#define ARM_SHORT_PTE_SMALL_TEX0               BIT(6)
> > > +#define ARM_SHORT_PTE_IMPLE                    BIT(9)
> >
> > This is AP[2] for small pages.
> 
> Sorry. In our page table, bit 9 in both the PGD and the PTE is PA[32],
> which is for DRAM sizes over 4GB. I didn't notice that it differs from
> the PTE in the standard spec.
> And we don't use AP[2] currently, so I will just delete this line next
> time.

Is this related to the "special bit"? What would be good is a comment
next to the #define for the quirk describing *exactly* what differs in
your implementation. Without that, it's very difficult to know what is
intentional and what is actually broken.

> > > +static arm_short_iopte
> > > +__arm_short_pte_prot(struct arm_short_io_pgtable *data, int prot, bool large)
> > > +{
> > > +       arm_short_iopte pteprot;
> > > +
> > > +       pteprot = ARM_SHORT_PTE_S | ARM_SHORT_PTE_nG;
> > > +       pteprot |= large ? ARM_SHORT_PTE_TYPE_LARGE :
> > > +                               ARM_SHORT_PTE_TYPE_SMALL;
> > > +       if (prot & IOMMU_CACHE)
> > > +               pteprot |=  ARM_SHORT_PTE_B | ARM_SHORT_PTE_C;
> > > +       if (prot & IOMMU_WRITE)
> > > +               pteprot |= large ? ARM_SHORT_PTE_LARGE_TEX0 :
> > > +                               ARM_SHORT_PTE_SMALL_TEX0;
> >
> > This doesn't make any sense. TEX[2:0] is all about memory attributes, not
> > permissions, so you're making the mapping write-back, write-allocate but
> > that's not what the IOMMU_* values are about.
> 
>      I will delete it.

Well, can you not control mapping permissions with the AP bits? The idea
of the IOMMU flags are:

  IOMMU_CACHE : Install a normal, cacheable mapping (you've got this right)
  IOMMU_READ : Allow read access for the device
  IOMMU_WRITE : Allow write access for the device
  IOMMU_NOEXEC : Disallow execute access for the device

so the caller to iommu_map passes in a bitmap of these, which you need to
encode in the page-table entry.

> > > +static int
> > > +_arm_short_map(struct arm_short_io_pgtable *data,
> > > +              unsigned int iova, phys_addr_t paddr,
> > > +              arm_short_iopte pgdprot, arm_short_iopte pteprot,
> > > +              bool large)
> > > +{
> > > +       const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
> > > +       arm_short_iopte *pgd = data->pgd, *pte;
> > > +       void *cookie = data->iop.cookie, *pte_va;
> > > +       unsigned int ptenr = large ? 16 : 1;
> > > +       int i, quirk = data->iop.cfg.quirks;
> > > +       bool ptenew = false;
> > > +
> > > +       pgd += ARM_SHORT_PGD_IDX(iova);
> > > +
> > > +       if (!pteprot) { /* section or supersection */
> > > +               if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK)
> > > +                       pgdprot &= ~ARM_SHORT_PGD_SECTION_XN;
> > > +               pte = pgd;
> > > +               pteprot = pgdprot;
> > > +       } else {        /* page or largepage */
> > > +               if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK) {
> > > +                       if (large) { /* special Bit */
> >
> > This definitely needs a better comment! What exactly are you doing here
> > and what is that quirk all about?
> 
> I use this quirk for the MTK special bits, as we don't have the XN bit
> in the page table.

I'm still not really clear about what this is.

> > > +               if (!(*pgd)) {
> > > +                       pte_va = kmem_cache_zalloc(data->ptekmem, GFP_ATOMIC);
> > > +                       if (unlikely(!pte_va))
> > > +                               return -ENOMEM;
> > > +                       ptenew = true;
> > > +                       *pgd = virt_to_phys(pte_va) | pgdprot;
> > > +                       kmemleak_ignore(pte_va);
> > > +                       tlb->flush_pgtable(pgd, sizeof(*pgd), cookie);
> >
> > I think you need to flush this before it becomes visible to the walker.
> 
> I have flushed the page table here. Do you mean flushing the TLB here?

No. afaict, you allocate the pte table using kmem_cache_zalloc but you never
flush it. However, you update the pgd to point at this table, so the walker
can potentially see garbage instead of the zeroed entries.

Will
Yong Wu (吴勇) July 27, 2015, 4:21 a.m. UTC | #4
On Fri, 2015-07-24 at 17:53 +0100, Will Deacon wrote:
> On Fri, Jul 24, 2015 at 06:24:26AM +0100, Yong Wu wrote:
> > On Tue, 2015-07-21 at 18:11 +0100, Will Deacon wrote:
> > > On Thu, Jul 16, 2015 at 10:04:32AM +0100, Yong Wu wrote:
> > > > +/* level 2 pagetable */
> > > > +#define ARM_SHORT_PTE_TYPE_LARGE               BIT(0)
> > > > +#define ARM_SHORT_PTE_SMALL_XN                 BIT(0)
> > > > +#define ARM_SHORT_PTE_TYPE_SMALL               BIT(1)
> > > > +#define ARM_SHORT_PTE_B                                BIT(2)
> > > > +#define ARM_SHORT_PTE_C                                BIT(3)
> > > > +#define ARM_SHORT_PTE_SMALL_TEX0               BIT(6)
> > > > +#define ARM_SHORT_PTE_IMPLE                    BIT(9)
> > >
> > > This is AP[2] for small pages.
> > 
> > Sorry. In our page table, bit 9 in both the PGD and the PTE is PA[32],
> > which is for DRAM sizes over 4GB. I didn't notice that it differs from
> > the PTE in the standard spec.
> > And we don't use AP[2] currently, so I will just delete this line next
> > time.
> 
> Is this related to the "special bit"? What would be good is a comment
> next to the #define for the quirk describing *exactly* what differs in
> your implementation. Without that, it's very difficult to know what is
> intentional and what is actually broken.

I will add the comment alongside the #define.
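
As a hypothetical illustration of the kind of comment being asked for
(the wording below is illustrative, not from the patch):

	/*
	 * IO_PGTABLE_QUIRK_SHORT_MTK: the MTK IOMMU does not implement
	 * the XN and AP permission bits, and bit 9 of each descriptor
	 * is repurposed as PA[32] to reach DRAM above 4GB.
	 */
	#define IO_PGTABLE_QUIRK_SHORT_MTK	BIT(2)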

> 
> > > > +static arm_short_iopte
> > > > +__arm_short_pte_prot(struct arm_short_io_pgtable *data, int prot, bool large)
> > > > +{
> > > > +       arm_short_iopte pteprot;
> > > > +
> > > > +       pteprot = ARM_SHORT_PTE_S | ARM_SHORT_PTE_nG;
> > > > +       pteprot |= large ? ARM_SHORT_PTE_TYPE_LARGE :
> > > > +                               ARM_SHORT_PTE_TYPE_SMALL;
> > > > +       if (prot & IOMMU_CACHE)
> > > > +               pteprot |=  ARM_SHORT_PTE_B | ARM_SHORT_PTE_C;
> > > > +       if (prot & IOMMU_WRITE)
> > > > +               pteprot |= large ? ARM_SHORT_PTE_LARGE_TEX0 :
> > > > +                               ARM_SHORT_PTE_SMALL_TEX0;
> > >
> > > This doesn't make any sense. TEX[2:0] is all about memory attributes, not
> > > permissions, so you're making the mapping write-back, write-allocate but
> > > that's not what the IOMMU_* values are about.
> > 
> >      I will delete it.
> 
> Well, can you not control mapping permissions with the AP bits? The idea
> of the IOMMU flags are:
> 
>   IOMMU_CACHE : Install a normal, cacheable mapping (you've got this right)
>   IOMMU_READ : Allow read access for the device
>   IOMMU_WRITE : Allow write access for the device
>   IOMMU_NOEXEC : Disallow execute access for the device
> 
> so the caller to iommu_map passes in a bitmap of these, which you need to
> encode in the page-table entry.

From the spec, AP[2] differentiates read/write from read-only.
How about this?:
//===============
  #define ARM_SHORT_PGD_FULL_ACCESS  (3 << 10)
  #define ARM_SHORT_PGD_RDONLY       BIT(15)

  pgdprot |= ARM_SHORT_PGD_FULL_ACCESS; /* or other names? */
  if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
          pgdprot |= ARM_SHORT_PGD_RDONLY;
//===============
The pte handling is the same; a sketch is below.
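As a minimal sketch of the pte flavour, assuming the architectural AP
bit positions for level-2 descriptors (AP[1:0] at bits 5:4, AP[2] at
bit 9; the macro names are illustrative, not from the patch):

  #define ARM_SHORT_PTE_RD_WR     (3 << 4)   /* AP[1:0]: full access */
  #define ARM_SHORT_PTE_RDONLY    BIT(9)     /* AP[2]: write-disable */

  pteprot |= ARM_SHORT_PTE_RD_WR;
  if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
          pteprot |= ARM_SHORT_PTE_RDONLY;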

Sorry, our HW doesn't fully meet the standard spec: it doesn't implement
the AP bits.

> 
> > > > +static int
> > > > +_arm_short_map(struct arm_short_io_pgtable *data,
> > > > +              unsigned int iova, phys_addr_t paddr,
> > > > +              arm_short_iopte pgdprot, arm_short_iopte pteprot,
> > > > +              bool large)
> > > > +{
> > > > +       const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
> > > > +       arm_short_iopte *pgd = data->pgd, *pte;
> > > > +       void *cookie = data->iop.cookie, *pte_va;
> > > > +       unsigned int ptenr = large ? 16 : 1;
> > > > +       int i, quirk = data->iop.cfg.quirks;
> > > > +       bool ptenew = false;
> > > > +
> > > > +       pgd += ARM_SHORT_PGD_IDX(iova);
> > > > +
> > > > +       if (!pteprot) { /* section or supersection */
> > > > +               if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK)
> > > > +                       pgdprot &= ~ARM_SHORT_PGD_SECTION_XN;
> > > > +               pte = pgd;
> > > > +               pteprot = pgdprot;
> > > > +       } else {        /* page or largepage */
> > > > +               if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK) {
> > > > +                       if (large) { /* special Bit */
> > >
> > > This definitely needs a better comment! What exactly are you doing here
> > > and what is that quirk all about?
> > 
> > I use this quirk for the MTK special bits, as we don't have the XN bit
> > in the page table.
> 
> I'm still not really clear about what this is.

There are some differences between the standard spec and the MTK HW:
our HW doesn't implement some bits, like XN and AP.
So I added a quirk for the MTK special case.

> 
> > > > +               if (!(*pgd)) {
> > > > +                       pte_va = kmem_cache_zalloc(data->ptekmem, GFP_ATOMIC);
> > > > +                       if (unlikely(!pte_va))
> > > > +                               return -ENOMEM;
> > > > +                       ptenew = true;
> > > > +                       *pgd = virt_to_phys(pte_va) | pgdprot;
> > > > +                       kmemleak_ignore(pte_va);
> > > > +                       tlb->flush_pgtable(pgd, sizeof(*pgd), cookie);
> > >
> > > I think you need to flush this before it becomes visible to the walker.
> > 
> > I have flushed the page table here. Do you mean flushing the TLB here?
> 
> No. afaict, you allocate the pte table using kmem_cache_zalloc but you never
> flush it. However, you update the pgd to point at this table, so the walker
> can potentially see garbage instead of the zeroed entries.

Thanks. I will add:
tlb->flush_pgtable(pte_va, ARM_SHORT_BYTES_PER_PTE, cookie);
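
To make the ordering explicit, a sketch of how the corrected allocation
path might look (identifiers as in the patch; the key point is flushing
the zeroed table before publishing it via the pgd):

	if (!(*pgd)) {
		pte_va = kmem_cache_zalloc(data->ptekmem, GFP_ATOMIC);
		if (unlikely(!pte_va))
			return -ENOMEM;
		ptenew = true;
		/* Flush the zeroed table before the walker can see it */
		tlb->flush_pgtable(pte_va, ARM_SHORT_BYTES_PER_PTE, cookie);
		*pgd = virt_to_phys(pte_va) | pgdprot;
		kmemleak_ignore(pte_va);
		tlb->flush_pgtable(pgd, sizeof(*pgd), cookie);
	}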

> 
> Will
Robin Murphy July 27, 2015, 2:05 p.m. UTC | #5
On 27/07/15 05:21, Yong Wu wrote:
[...]
>>>>> +static arm_short_iopte
>>>>> +__arm_short_pte_prot(struct arm_short_io_pgtable *data, int prot, bool large)
>>>>> +{
>>>>> +       arm_short_iopte pteprot;
>>>>> +
>>>>> +       pteprot = ARM_SHORT_PTE_S | ARM_SHORT_PTE_nG;
>>>>> +       pteprot |= large ? ARM_SHORT_PTE_TYPE_LARGE :
>>>>> +                               ARM_SHORT_PTE_TYPE_SMALL;
>>>>> +       if (prot & IOMMU_CACHE)
>>>>> +               pteprot |=  ARM_SHORT_PTE_B | ARM_SHORT_PTE_C;
>>>>> +       if (prot & IOMMU_WRITE)
>>>>> +               pteprot |= large ? ARM_SHORT_PTE_LARGE_TEX0 :
>>>>> +                               ARM_SHORT_PTE_SMALL_TEX0;
>>>>
>>>> This doesn't make any sense. TEX[2:0] is all about memory attributes, not
>>>> permissions, so you're making the mapping write-back, write-allocate but
>>>> that's not what the IOMMU_* values are about.
>>>
>>>       I will delete it.
>>
>> Well, can you not control mapping permissions with the AP bits? The idea
>> of the IOMMU flags are:
>>
>>    IOMMU_CACHE : Install a normal, cacheable mapping (you've got this right)
>>    IOMMU_READ : Allow read access for the device
>>    IOMMU_WRITE : Allow write access for the device
>>    IOMMU_NOEXEC : Disallow execute access for the device
>>
>> so the caller to iommu_map passes in a bitmap of these, which you need to
>> encode in the page-table entry.
>
>  From the spec, AP[2] differentiates read/write from read-only.
> How about this?:
> //===============
>    #define ARM_SHORT_PGD_FULL_ACCESS  (3 << 10)
>    #define ARM_SHORT_PGD_RDONLY       BIT(15)
>
>    pgdprot |= ARM_SHORT_PGD_FULL_ACCESS; /* or other names? */
>    if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
>            pgdprot |= ARM_SHORT_PGD_RDONLY;
> //===============
> The pte handling is the same.
>
> Sorry, our HW doesn't fully meet the standard spec: it doesn't implement
> the AP bits.
>
>>
>>>>> +static int
>>>>> +_arm_short_map(struct arm_short_io_pgtable *data,
>>>>> +              unsigned int iova, phys_addr_t paddr,
>>>>> +              arm_short_iopte pgdprot, arm_short_iopte pteprot,
>>>>> +              bool large)
>>>>> +{
>>>>> +       const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
>>>>> +       arm_short_iopte *pgd = data->pgd, *pte;
>>>>> +       void *cookie = data->iop.cookie, *pte_va;
>>>>> +       unsigned int ptenr = large ? 16 : 1;
>>>>> +       int i, quirk = data->iop.cfg.quirks;
>>>>> +       bool ptenew = false;
>>>>> +
>>>>> +       pgd += ARM_SHORT_PGD_IDX(iova);
>>>>> +
>>>>> +       if (!pteprot) { /* section or supersection */
>>>>> +               if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK)
>>>>> +                       pgdprot &= ~ARM_SHORT_PGD_SECTION_XN;
>>>>> +               pte = pgd;
>>>>> +               pteprot = pgdprot;
>>>>> +       } else {        /* page or largepage */
>>>>> +               if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK) {
>>>>> +                       if (large) { /* special Bit */
>>>>
>>>> This definitely needs a better comment! What exactly are you doing here
>>>> and what is that quirk all about?
>>>
>>> I use this quirk is for MTK Special Bit as we don't have the XN bit in
>>> pagetable.
>>
>> I'm still not really clear about what this is.
>
> There is some difference between the standard spec and MTK HW,
> Our hw don't implement some bits, like XN and AP.
> So I add a quirk for MTK special.

When you say it doesn't implement these bits, do you mean that having 
them set will lead to Bad Things happening in the hardware, or that it 
will simply ignore them and not enforce any of the protections they 
imply? The former case would definitely want clearly documenting 
somewhere, whereas for the latter case I'm not sure it's even worth the 
complication of having a quirk - if the value doesn't matter there seems 
little point in doing a special dance just for the sake of semantic 
correctness of the in-memory PTEs, in my opinion.

Robin.
Will Deacon July 27, 2015, 2:11 p.m. UTC | #6
On Mon, Jul 27, 2015 at 03:05:38PM +0100, Robin Murphy wrote:
> On 27/07/15 05:21, Yong Wu wrote:
> >>>>> +       } else {        /* page or largepage */
> >>>>> +               if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK) {
> >>>>> +                       if (large) { /* special Bit */
> >>>>
> >>>> This definitely needs a better comment! What exactly are you doing here
> >>>> and what is that quirk all about?
> >>>
> >>> I use this quirk is for MTK Special Bit as we don't have the XN bit in
> >>> pagetable.
> >>
> >> I'm still not really clear about what this is.
> >
> > There is some difference between the standard spec and MTK HW,
> > Our hw don't implement some bits, like XN and AP.
> > So I add a quirk for MTK special.
> 
> When you say it doesn't implement these bits, do you mean that having 
> them set will lead to Bad Things happening in the hardware, or that it 
> will simply ignore them and not enforce any of the protections they 
> imply? The former case would definitely want clearly documenting 
> somewhere, whereas for the latter case I'm not sure it's even worth the 
> complication of having a quirk - if the value doesn't matter there seems 
> little point in doing a special dance just for the sake of semantic 
> correctness of the in-memory PTEs, in my opinion.

Agreed. We should only use quirks if the current (architecturally
compliant) code causes real issues with the hardware. Then the quirk can
be used to either avoid the problematic routines or to take extra steps
to make things work as the architecture intended.

I've asked how this IOMMU differs from the architecture on a number of
occasions, but I'm still yet to receive a response other than "it's special".

Will
Yong Wu (吴勇) July 28, 2015, 5:08 a.m. UTC | #7
On Mon, 2015-07-27 at 15:11 +0100, Will Deacon wrote:
> On Mon, Jul 27, 2015 at 03:05:38PM +0100, Robin Murphy wrote:
> > On 27/07/15 05:21, Yong Wu wrote:
> > >>>>> +       } else {        /* page or largepage */
> > >>>>> +               if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK) {
> > >>>>> +                       if (large) { /* special Bit */
> > >>>>
> > >>>> This definitely needs a better comment! What exactly are you doing here
> > >>>> and what is that quirk all about?
> > >>>
> > >>> I use this quirk is for MTK Special Bit as we don't have the XN bit in
> > >>> pagetable.
> > >>
> > >> I'm still not really clear about what this is.
> > >
> > > There is some difference between the standard spec and MTK HW,
> > > Our hw don't implement some bits, like XN and AP.
> > > So I add a quirk for MTK special.
> > 
> > When you say it doesn't implement these bits, do you mean that having 
> > them set will lead to Bad Things happening in the hardware, or that it 
> > will simply ignore them and not enforce any of the protections they 
> > imply? The former case would definitely want clearly documenting 
> > somewhere, whereas for the latter case I'm not sure it's even worth the 
> > complication of having a quirk - if the value doesn't matter there seems 
> > little point in doing a special dance just for the sake of semantic 
> > correctness of the in-memory PTEs, in my opinion.
> 
> Agreed. We should only use quirks if the current (architecturally
> compliant) code causes real issues with the hardware. Then the quirk can
> be used to either avoid the problematic routines or to take extra steps
> to make things work as the architecture intended.
> 
> I've asked how this IOMMU differs from the architecture on a number of
> occasions, but I'm still yet to receive a response other than "it's special".
> 

After checking further with our DE, our pagetable follows the ARMv7
short-descriptor format, which is a little different from ARMv8, e.g.
bit0 (PXN) in sections and supersections. I hadn't read the ARMv7 spec
before, so I added a MTK quirk to disable the PXN bit in sections and
supersections. (If the PXN bit is written, the HW will page fault.)

Then should I write this code according to the ARMv8 spec by default,
and add an ARMv7 quirk?

And there is one small difference between the ARMv7 spec and the MTK
pagetable: the XN bit (bit0) in small pages. MTK doesn't implement the
XN bit, so bits[1:0] of an MTK small page must be 2'b10; if they are
2'b11, the HW will page fault.
(MTK doesn't implement the AP bits either, but the HW doesn't use them,
so it is OK even if the AP bits are written.)

In the end, I will add two quirks like this; is that OK?

//===========
#define ARM_PGTABLE_QUIRK_SHORT_ARM_V7	BIT(2)	/* ARMv7; default is the ARMv8 spec */
#define ARM_PGTABLE_QUIRK_SHORT_MTK	BIT(3)	/* MTK special */
//===========

In the ARM_V7 quirk, I only disable the PXN bit in sections and
supersections; in the MTK quirk, I only disable XN in small pages.

The other bits seem to be the same. I'm not sure I have explained this
clearly, and it seems we cannot paste an image of the MTK pagetable
here. If you have any questions, please let me know.
Thanks very much.

> Will
Will Deacon July 28, 2015, 11 a.m. UTC | #8
On Tue, Jul 28, 2015 at 06:08:14AM +0100, Yong Wu wrote:
> On Mon, 2015-07-27 at 15:11 +0100, Will Deacon wrote:
> > On Mon, Jul 27, 2015 at 03:05:38PM +0100, Robin Murphy wrote:
> > > On 27/07/15 05:21, Yong Wu wrote:
> > > >>>>> +       } else {        /* page or largepage */
> > > >>>>> +               if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK) {
> > > >>>>> +                       if (large) { /* special Bit */
> > > >>>>
> > > >>>> This definitely needs a better comment! What exactly are you doing here
> > > >>>> and what is that quirk all about?
> > > >>>
> > > >>> I use this quirk is for MTK Special Bit as we don't have the XN bit in
> > > >>> pagetable.
> > > >>
> > > >> I'm still not really clear about what this is.
> > > >
> > > > There is some difference between the standard spec and MTK HW,
> > > > Our hw don't implement some bits, like XN and AP.
> > > > So I add a quirk for MTK special.
> > > 
> > > When you say it doesn't implement these bits, do you mean that having 
> > > them set will lead to Bad Things happening in the hardware, or that it 
> > > will simply ignore them and not enforce any of the protections they 
> > > imply? The former case would definitely want clearly documenting 
> > > somewhere, whereas for the latter case I'm not sure it's even worth the 
> > > complication of having a quirk - if the value doesn't matter there seems 
> > > little point in doing a special dance just for the sake of semantic 
> > > correctness of the in-memory PTEs, in my opinion.
> > 
> > Agreed. We should only use quirks if the current (architecturally
> > compliant) code causes real issues with the hardware. Then the quirk can
> > be used to either avoid the problematic routines or to take extra steps
> > to make things work as the architecture intended.
> > 
> > I've asked how this IOMMU differs from the architecture on a number of
> > occasions, but I'm still yet to receive a response other than "it's special".
> > 
> 
> After check further with DE, Our pagetable is refer to ARM-v7's
> short-descriptor which is a little different from ARM-v8. like bit0(PXN)
> in section and supersection, I didn't read ARM-v7 spec before, so I add
> a MTK quirk to disable PXN bit in section and supersection.(if the PXN
> bit is wrote in ARM-v7 spec, HW will page fault.)

I've been reviewing this using the ARMv7 ARM (Rev.C of DDI0406C) the whole
time. PXN is there as an optional field in non-LPAE implementations. That's
fine and doesn't require any quirks.

> Then I write this code according to ARM-v8 spec defaultly, and add a
> ARM-v7 quirk?

No, I don't think you need this, as the v8 and v7 short-descriptor formats
look compatible to me. You should only need a quirk if architecturally
compliant code cannot work on your hardware.

> And there is a little different between ARM-v7 spec and MTK pagetable.
> It's the XN(bit0) in small page. MTK don't implement XN bit. 
> The bit[1:0] in MTK's small page should be 2'b10, if it's 2'b11, HW will
> page fault.

Aha, thanks! *That* is worthy of a quirk. Something like:

  IO_PGTABLE_QUIRK_ARM_NO_XN

> (MTK don't implement AP bits too, but HW don't use them, it is ok even
> though AP bits is wrote)

Yeah, I think that's fine. The pgtable code will honour the request but
the h/w will ignore it.

> In the end, I will add two quirk like this, is it OK?

I think you only need the one I mentioned above. I don't see the need
for PXN at all (as I said in the last review).

Will
Yong Wu (吴勇) July 28, 2015, 1:37 p.m. UTC | #9
On Tue, 2015-07-28 at 12:00 +0100, Will Deacon wrote:
> On Tue, Jul 28, 2015 at 06:08:14AM +0100, Yong Wu wrote:
> > On Mon, 2015-07-27 at 15:11 +0100, Will Deacon wrote:
> > > On Mon, Jul 27, 2015 at 03:05:38PM +0100, Robin Murphy wrote:
> > > > On 27/07/15 05:21, Yong Wu wrote:
> > > > >>>>> +       } else {        /* page or largepage */
> > > > >>>>> +               if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK) {
> > > > >>>>> +                       if (large) { /* special Bit */
> > > > >>>>
> > > > >>>> This definitely needs a better comment! What exactly are you doing here
> > > > >>>> and what is that quirk all about?
> > > > >>>
> > > > >>> I use this quirk is for MTK Special Bit as we don't have the XN bit in
> > > > >>> pagetable.
> > > > >>
> > > > >> I'm still not really clear about what this is.
> > > > >
> > > > > There is some difference between the standard spec and MTK HW,
> > > > > Our hw don't implement some bits, like XN and AP.
> > > > > So I add a quirk for MTK special.
> > > > 
> > > > When you say it doesn't implement these bits, do you mean that having 
> > > > them set will lead to Bad Things happening in the hardware, or that it 
> > > > will simply ignore them and not enforce any of the protections they 
> > > > imply? The former case would definitely want clearly documenting 
> > > > somewhere, whereas for the latter case I'm not sure it's even worth the 
> > > > complication of having a quirk - if the value doesn't matter there seems 
> > > > little point in doing a special dance just for the sake of semantic 
> > > > correctness of the in-memory PTEs, in my opinion.
> > > 
> > > Agreed. We should only use quirks if the current (architecturally
> > > compliant) code causes real issues with the hardware. Then the quirk can
> > > be used to either avoid the problematic routines or to take extra steps
> > > to make things work as the architecture intended.
> > > 
> > > I've asked how this IOMMU differs from the architecture on a number of
> > > occasions, but I'm still yet to receive a response other than "it's special".
> > > 
> > 
> > After check further with DE, Our pagetable is refer to ARM-v7's
> > short-descriptor which is a little different from ARM-v8. like bit0(PXN)
> > in section and supersection, I didn't read ARM-v7 spec before, so I add
> > a MTK quirk to disable PXN bit in section and supersection.(if the PXN
> > bit is wrote in ARM-v7 spec, HW will page fault.)
> 
> I've been reviewing this using the ARMv7 ARM (Rev.C of DDI0406C) the whole
> time. PXN is there as an optional field in non-LPAE implementations. That's
> fine and doesn't require any quirks.

Thanks for the confirmation.
Then should I delete all the PXN bits here?

For example,
#define ARM_SHORT_PGD_SECTION_XN		(BIT(0) | BIT(4))
I will change it to "BIT(4)".

> 
> > Then I write this code according to ARM-v8 spec defaultly, and add a
> > ARM-v7 quirk?
> 
> No, I don't think you need this, as the v8 and v7 short-descriptor formats
> look compatible to me. You should only need a quirk if architecturally
> compliant code cannot work on your hardware.
> 
> > And there is a little different between ARM-v7 spec and MTK pagetable.
> > It's the XN(bit0) in small page. MTK don't implement XN bit. 
> > The bit[1:0] in MTK's small page should be 2'b10, if it's 2'b11, HW will
> > page fault.
> 
> Aha, thanks! *That* is worthy of a quirk. Something like:
> 
>   IO_PGTABLE_QUIRK_ARM_NO_XN

Thanks, I will add it.
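Roughly like this in __arm_short_pte_prot (just a sketch; the exact
hook point may still change):
//===============
/* with IO_PGTABLE_QUIRK_ARM_NO_XN, never set the XN bits: bits[1:0] =
 * 2'b11 in a small page would fault on our HW.
 */
if ((prot & IOMMU_NOEXEC) &&
    !(data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NO_XN))
	pteprot |= large ? ARM_SHORT_PTE_LARGE_XN :
			ARM_SHORT_PTE_SMALL_XN;
//===============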

> 
> > (MTK don't implement AP bits too, but HW don't use them, it is ok even
> > though AP bits is wrote)
> 
> Yeah, I think that's fine. The pgtable code will honour the request but
> the h/w will ignore it.
> 
> > In the end, I will add two quirk like this, is it OK?
> 
> I think you only need the one I mentioned above. I don't see the need
> for PXN at all (as I said in the last review).
> 
> Will
Will Deacon July 28, 2015, 1:47 p.m. UTC | #10
On Tue, Jul 28, 2015 at 02:37:43PM +0100, Yong Wu wrote:
> On Tue, 2015-07-28 at 12:00 +0100, Will Deacon wrote:
> > On Tue, Jul 28, 2015 at 06:08:14AM +0100, Yong Wu wrote:
> > > On Mon, 2015-07-27 at 15:11 +0100, Will Deacon wrote:
> > > > On Mon, Jul 27, 2015 at 03:05:38PM +0100, Robin Murphy wrote:
> > > > > On 27/07/15 05:21, Yong Wu wrote:
> > > > > >>>>> +       } else {        /* page or largepage */
> > > > > >>>>> +               if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK) {
> > > > > >>>>> +                       if (large) { /* special Bit */
> > > > > >>>>
> > > > > >>>> This definitely needs a better comment! What exactly are you doing here
> > > > > >>>> and what is that quirk all about?
> > > > > >>>
> > > > > >>> I use this quirk is for MTK Special Bit as we don't have the XN bit in
> > > > > >>> pagetable.
> > > > > >>
> > > > > >> I'm still not really clear about what this is.
> > > > > >
> > > > > > There is some difference between the standard spec and MTK HW,
> > > > > > Our hw don't implement some bits, like XN and AP.
> > > > > > So I add a quirk for MTK special.
> > > > > 
> > > > > When you say it doesn't implement these bits, do you mean that having 
> > > > > them set will lead to Bad Things happening in the hardware, or that it 
> > > > > will simply ignore them and not enforce any of the protections they 
> > > > > imply? The former case would definitely want clearly documenting 
> > > > > somewhere, whereas for the latter case I'm not sure it's even worth the 
> > > > > complication of having a quirk - if the value doesn't matter there seems 
> > > > > little point in doing a special dance just for the sake of semantic 
> > > > > correctness of the in-memory PTEs, in my opinion.
> > > > 
> > > > Agreed. We should only use quirks if the current (architecturally
> > > > compliant) code causes real issues with the hardware. Then the quirk can
> > > > be used to either avoid the problematic routines or to take extra steps
> > > > to make things work as the architecture intended.
> > > > 
> > > > I've asked how this IOMMU differs from the architecture on a number of
> > > > occasions, but I'm still yet to receive a response other than "it's special".
> > > > 
> > > 
> > > After check further with DE, Our pagetable is refer to ARM-v7's
> > > short-descriptor which is a little different from ARM-v8. like bit0(PXN)
> > > in section and supersection, I didn't read ARM-v7 spec before, so I add
> > > a MTK quirk to disable PXN bit in section and supersection.(if the PXN
> > > bit is wrote in ARM-v7 spec, HW will page fault.)
> > 
> > I've been reviewing this using the ARMv7 ARM (Rev.C of DDI0406C) the whole
> > time. PXN is there as an optional field in non-LPAE implementations. That's
> > fine and doesn't require any quirks.
> 
> Thanks for your confirm.
> Then I delete all the PXN bit in here?
> 
> Take a example, 
> #define ARM_SHORT_PGD_SECTION_XN		(BIT(0) | BIT(4))
> I will change it to "BIT(4)".

Yes. Then the PXN bit can be added later as a quirk when we have an
implementation that supports it.

Will
Will Deacon July 31, 2015, 11:32 a.m. UTC | #11
On Fri, Jul 31, 2015 at 08:55:37AM +0100, Yong Wu wrote:
>     About the AP bits, I may have to add a new quirk for it...
> 
>   Current I add AP in pte like this:
> #define ARM_SHORT_PTE_RD_WR        (3 << 4)
> #define ARM_SHORT_PTE_RDONLY       BIT(9)
> 
> pteprot |=  ARM_SHORT_PTE_RD_WR;
> 
> 
>  If(!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
> 
> 
>       pteprot |= ARM_SHORT_PTE_RDONLY;
> 
> The problem is that the BIT(9) in the level1 and level2 pagetable of our
> HW has been used for PA[32] that is for the dram size over 4G.

Aha, now *that's* a case of page-table abuse!

> so I had to add a quirk to disable bit9 while RDONLY case.
> (If BIT9 isn't disabled, the HW treat it as the PA[32] case then it will
> translation fault..)
> 
> like: IO_PGTABLE_QUIRK_SHORT_MTK ?

Given that you don't have XN either, maybe IO_PGTABLE_QUIRK_NO_PERMS?
When set, IOMMU_READ/WRITE/EXEC are ignored and the mapping will never
generate a permission fault.
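i.e. something like this in the prot encoding (completely untested, and
using the AP names you quoted above):
//===============
if (!(data->iop.cfg.quirks & IO_PGTABLE_QUIRK_NO_PERMS)) {
	pteprot |= ARM_SHORT_PTE_RD_WR;
	if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
		pteprot |= ARM_SHORT_PTE_RDONLY;
}
//===============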

Will
Yong Wu (吴勇) Sept. 14, 2015, 12:25 p.m. UTC | #12
On Tue, 2015-07-21 at 18:11 +0100, Will Deacon wrote:
[...]
> > +static int arm_short_map(struct io_pgtable_ops *ops, unsigned long iova,
> > +                        phys_addr_t paddr, size_t size, int prot)
> > +{
> > +       struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
> > +       const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
> > +       int ret;
> > +       arm_short_iopte pgdprot = 0, pteprot = 0;
> > +       bool large;
> > +
> > +       /* If no access, then nothing to do */
> > +       if (!(prot & (IOMMU_READ | IOMMU_WRITE)))
> > +               return 0;
> > +
> > +       switch (size) {
> > +       case SZ_4K:
> > +       case SZ_64K:
> > +               large = (size == SZ_64K) ? true : false;
> > +               pteprot = __arm_short_pte_prot(data, prot, large);
> > +               pgdprot = __arm_short_pgtable_prot(data, prot & IOMMU_NOEXEC);
> > +               break;
> > +
> > +       case SZ_1M:
> > +       case SZ_16M:
> > +               large = (size == SZ_16M) ? true : false;
> > +               pgdprot = __arm_short_pgd_prot(data, prot, large);
> > +               break;
> > +       default:
> > +               return -EINVAL;
> > +       }
> > +
> > +       if (WARN_ON((iova | paddr) & (size - 1)))
> > +               return -EINVAL;
> > +
> > +       ret = _arm_short_map(data, iova, paddr, pgdprot, pteprot, large);
> > +
> > +       tlb->tlb_add_flush(iova, size, true, data->iop.cookie);
> > +       tlb->tlb_sync(data->iop.cookie);
> 
> In _arm_short_map, it looks like you can only go from invalid -> valid,
> so why do you need to flush the TLB here?

Hi Will,
   This is about the TLB flush after mapping an iova: I removed it in
v4 following this suggestion, but we have hit a problem with that.

Take JPEG as an example; the test steps are:
a) The JPEG HW decodes a picture with a source iova, e.g. 0xfd780000.
b) When the decode is done, it unmaps the iova (writes 0 into the
pagetable and flushes the TLB).
c) The JPEG HW decodes a second picture whose source iova is also
0xfd780000.
   Our HW may then fail because it prefetches automatically: it may
prefetch between steps b) and c), fetch the pagetable content that was
unmapped in step b), read the iova's physical address as 0, and raise a
translation fault!

    So I think our HW needs a TLB flush after mapping an iova. Could we
add a quirk like "IO_PGTABLE_QUIRK_AUTO_PREFETCH_ENABLE" for it?
If that's not acceptable, we will have to do this in our internal
function mtk_iommu_map in mtk_iommu.c.
Thanks.

> 
> > +       return ret;
> > +}
> > +
[...]
Will Deacon Sept. 16, 2015, 12:55 p.m. UTC | #13
Hello Yong,

On Mon, Sep 14, 2015 at 01:25:00PM +0100, Yong Wu wrote:
> On Tue, 2015-07-21 at 18:11 +0100, Will Deacon wrote:
> > > +       ret = _arm_short_map(data, iova, paddr, pgdprot, pteprot, large);
> > > +
> > > +       tlb->tlb_add_flush(iova, size, true, data->iop.cookie);
> > > +       tlb->tlb_sync(data->iop.cookie);
> > 
> > In _arm_short_map, it looks like you can only go from invalid -> valid,
> > so why do you need to flush the TLB here?
> 
> Hi Will,
>    Here is about flush-tlb after map iova, I have deleted it in v4
> following this suggestion. But We meet a problem about it.

Ok.

> Take a example with JPEG. the test steps is:
> a).JPEG HW decode a picture with the source iova,like 0xfd780000.
> b).JPEG HW decode done, It will unmap the iova(write 0 in pagetable and
> flush tlb).
> c).JPEG HW decode the second picture, whose source iova is also
> 0xfd780000.
>    Then our HW maybe fail due to it will auto prefetch, It may prefecth
> between the step b) and c). then the HW may fetch the pagetable content
> which has been unmapped in step b). then the HW will get the iova's
> physical address is 0, It will translation fault!

Oh no! So-called "negative caching" is certainly prohibited by the ARM
architecture, but if you've built it then we can probably work around it
as an additional quirk. I assume the prefetcher stops prefetching when
it sees an invalid descriptor?

>     So I think our HW need flush-tlb after map iova. Could we add a
> QUIRK like "IO_PGTABLE_QUIRK_AUTO_PREFETCH_ENABLE" for it?
> If it's not allowed, we will have to add this in our internal function
> mtk_iommu_map of mtk_iommu.c.

Actually, this type of quirk is ringing bells with me (I think another
IOMMU needed something similar in the past), so maybe just add
IO_PGTABLE_QUIRK_TLBI_ON_MAP?

Will
Yong Wu (吴勇) Sept. 17, 2015, 2:38 a.m. UTC | #14
On Wed, 2015-09-16 at 13:55 +0100, Will Deacon wrote:
> Hello Yong,
> 
> On Mon, Sep 14, 2015 at 01:25:00PM +0100, Yong Wu wrote:
> > On Tue, 2015-07-21 at 18:11 +0100, Will Deacon wrote:
> > > > +       ret = _arm_short_map(data, iova, paddr, pgdprot, pteprot, large);
> > > > +
> > > > +       tlb->tlb_add_flush(iova, size, true, data->iop.cookie);
> > > > +       tlb->tlb_sync(data->iop.cookie);
> > > 
> > > In _arm_short_map, it looks like you can only go from invalid -> valid,
> > > so why do you need to flush the TLB here?
> > 
> > Hi Will,
> >    Here is about flush-tlb after map iova, I have deleted it in v4
> > following this suggestion. But We meet a problem about it.
> 
> Ok.
> 
> > Take a example with JPEG. the test steps is:
> > a).JPEG HW decode a picture with the source iova,like 0xfd780000.
> > b).JPEG HW decode done, It will unmap the iova(write 0 in pagetable and
> > flush tlb).
> > c).JPEG HW decode the second picture, whose source iova is also
> > 0xfd780000.
> >    Then our HW maybe fail due to it will auto prefetch, It may prefecth
> > between the step b) and c). then the HW may fetch the pagetable content
> > which has been unmapped in step b). then the HW will get the iova's
> > physical address is 0, It will translation fault!
> 
> Oh no! So-called "negative caching" is certainly prohibited by the ARM
> architecture, but if you've built it then we can probably work around it
> as an additional quirk. I assume the prefetcher stops prefetching when
> it sees an invalid descriptor?

Yes. If it sees an invalid descriptor, the HW will stop prefetching.

> 
> >     So I think our HW need flush-tlb after map iova. Could we add a
> > QUIRK like "IO_PGTABLE_QUIRK_AUTO_PREFETCH_ENABLE" for it?
> > If it's not allowed, we will have to add this in our internal function
> > mtk_iommu_map of mtk_iommu.c.
> 
> Actually, this type of quirk is ringing bells with me (I think another
> IOMMU needed something similar in the past), so maybe just add
> IO_PGTABLE_QUIRK_TLBI_ON_MAP?

Thanks. I will add it like this:
//=====================
ret = _arm_short_map(data, iova, paddr, pgdprot, pteprot, large);

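/* flush any entries the HW may have prefetched while this iova was unmapped */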
if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_TLBI_ON_MAP) {
	tlb->tlb_add_flush(iova, size, true, data->iop.cookie);
	tlb->tlb_sync(data->iop.cookie);
}
//======================
This flushes the TLB after every map of an iova, so the HW will then
fetch the new PA from DRAM.

> 
> Will
diff mbox

Patch

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index f1fb1d3..f50dbf3 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -39,6 +39,24 @@  config IOMMU_IO_PGTABLE_LPAE_SELFTEST
 
 	  If unsure, say N here.
 
+config IOMMU_IO_PGTABLE_SHORT
+	bool "ARMv7/v8 Short Descriptor Format"
+	select IOMMU_IO_PGTABLE
+	depends on ARM || ARM64 || COMPILE_TEST
+	help
+	  Enable support for the ARM Short descriptor pagetable format.
+	  This allocator supports 2-level translation tables, which map
+	  memory using sections or pages.
+
+config IOMMU_IO_PGTABLE_SHORT_SELFTEST
+	bool "Short Descriptor selftests"
+	depends on IOMMU_IO_PGTABLE_SHORT
+	help
+	  Enable self-tests for Short Descriptor page table allocator.
+	  This performs a series of page-table consistency checks during boot.
+
+	  If unsure, say N here.
+
 endmenu
 
 config IOMMU_IOVA
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index c6dcc51..06df3e6 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -3,6 +3,7 @@  obj-$(CONFIG_IOMMU_API) += iommu-traces.o
 obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
+obj-$(CONFIG_IOMMU_IO_PGTABLE_SHORT) += io-pgtable-arm-short.o
 obj-$(CONFIG_IOMMU_IOVA) += iova.o
 obj-$(CONFIG_OF_IOMMU)	+= of_iommu.o
 obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o msm_iommu_dev.o
diff --git a/drivers/iommu/io-pgtable-arm-short.c b/drivers/iommu/io-pgtable-arm-short.c
new file mode 100644
index 0000000..340d590
--- /dev/null
+++ b/drivers/iommu/io-pgtable-arm-short.c
@@ -0,0 +1,742 @@ 
+/*
+ * Copyright (c) 2014-2015 MediaTek Inc.
+ * Author: Yong Wu <yong.wu@mediatek.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#define pr_fmt(fmt)	"arm-short-desc io-pgtable: "fmt
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/iommu.h>
+#include <linux/errno.h>
+
+#include "io-pgtable.h"
+
+typedef u32 arm_short_iopte;
+
+struct arm_short_io_pgtable {
+	struct io_pgtable	iop;
+	struct kmem_cache	*ptekmem;
+	size_t			pgd_size;
+	void			*pgd;
+};
+
+#define io_pgtable_to_data(x)			\
+	container_of((x), struct arm_short_io_pgtable, iop)
+
+#define io_pgtable_ops_to_data(x)		\
+	io_pgtable_to_data(io_pgtable_ops_to_pgtable(x))
+
+#define ARM_SHORT_PGDIR_SHIFT			20
+#define ARM_SHORT_PAGE_SHIFT			12
+#define ARM_SHORT_PTRS_PER_PTE			\
+	(1 << (ARM_SHORT_PGDIR_SHIFT - ARM_SHORT_PAGE_SHIFT))
+#define ARM_SHORT_BYTES_PER_PTE			\
+	(ARM_SHORT_PTRS_PER_PTE * sizeof(arm_short_iopte))
+
+/* level 1 pagetable */
+#define ARM_SHORT_PGD_TYPE_PGTABLE		BIT(0)
+#define ARM_SHORT_PGD_SECTION_XN		(BIT(0) | BIT(4))
+#define ARM_SHORT_PGD_TYPE_SECTION		BIT(1)
+#define ARM_SHORT_PGD_PGTABLE_XN		BIT(2)
+#define ARM_SHORT_PGD_B				BIT(2)
+#define ARM_SHORT_PGD_C				BIT(3)
+#define ARM_SHORT_PGD_PGTABLE_NS		BIT(3)
+#define ARM_SHORT_PGD_IMPLE			BIT(9)
+#define ARM_SHORT_PGD_TEX0			BIT(12)
+#define ARM_SHORT_PGD_S				BIT(16)
+#define ARM_SHORT_PGD_nG			BIT(17)
+#define ARM_SHORT_PGD_SUPERSECTION		BIT(18)
+#define ARM_SHORT_PGD_SECTION_NS		BIT(19)
+
+#define ARM_SHORT_PGD_TYPE_SUPERSECTION		\
+	(ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_SUPERSECTION)
+#define ARM_SHORT_PGD_SECTION_TYPE_MSK		\
+	(ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_SUPERSECTION)
+#define ARM_SHORT_PGD_PGTABLE_TYPE_MSK		\
+	(ARM_SHORT_PGD_TYPE_SECTION | ARM_SHORT_PGD_TYPE_PGTABLE)
+#define ARM_SHORT_PGD_TYPE_IS_PGTABLE(pgd)	\
+	(((pgd) & ARM_SHORT_PGD_PGTABLE_TYPE_MSK) == ARM_SHORT_PGD_TYPE_PGTABLE)
+#define ARM_SHORT_PGD_TYPE_IS_SECTION(pgd)	\
+	(((pgd) & ARM_SHORT_PGD_SECTION_TYPE_MSK) == ARM_SHORT_PGD_TYPE_SECTION)
+#define ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(pgd)	\
+	(((pgd) & ARM_SHORT_PGD_SECTION_TYPE_MSK) == \
+	ARM_SHORT_PGD_TYPE_SUPERSECTION)
+#define ARM_SHORT_PGD_PGTABLE_MSK		0xfffffc00
+#define ARM_SHORT_PGD_SECTION_MSK		(~(SZ_1M - 1))
+#define ARM_SHORT_PGD_SUPERSECTION_MSK		(~(SZ_16M - 1))
+
+/* level 2 pagetable */
+#define ARM_SHORT_PTE_TYPE_LARGE		BIT(0)
+#define ARM_SHORT_PTE_SMALL_XN			BIT(0)
+#define ARM_SHORT_PTE_TYPE_SMALL		BIT(1)
+#define ARM_SHORT_PTE_B				BIT(2)
+#define ARM_SHORT_PTE_C				BIT(3)
+#define ARM_SHORT_PTE_SMALL_TEX0		BIT(6)
+#define ARM_SHORT_PTE_IMPLE			BIT(9)
+#define ARM_SHORT_PTE_S				BIT(10)
+#define ARM_SHORT_PTE_nG			BIT(11)
+#define ARM_SHORT_PTE_LARGE_TEX0		BIT(12)
+#define ARM_SHORT_PTE_LARGE_XN			BIT(15)
+#define ARM_SHORT_PTE_LARGE_MSK			(~(SZ_64K - 1))
+#define ARM_SHORT_PTE_SMALL_MSK			(~(SZ_4K - 1))
+#define ARM_SHORT_PTE_TYPE_MSK			\
+	(ARM_SHORT_PTE_TYPE_LARGE | ARM_SHORT_PTE_TYPE_SMALL)
+#define ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(pte)	\
+	(((((pte) & ARM_SHORT_PTE_TYPE_MSK) >> 1) << 1)\
+	== ARM_SHORT_PTE_TYPE_SMALL)
+#define ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(pte)	\
+	(((pte) & ARM_SHORT_PTE_TYPE_MSK) == ARM_SHORT_PTE_TYPE_LARGE)
+
+#define ARM_SHORT_PGD_IDX(a)			((a) >> ARM_SHORT_PGDIR_SHIFT)
+#define ARM_SHORT_PTE_IDX(a)			\
+	(((a) >> ARM_SHORT_PAGE_SHIFT) & (ARM_SHORT_PTRS_PER_PTE - 1))
+
+#define ARM_SHORT_GET_PTE_VA(pgd)		\
+	(phys_to_virt((unsigned long)pgd & ARM_SHORT_PGD_PGTABLE_MSK))
+
+#define ARM_SHORT_PTE_LARGE_GET_PROT(pte)	\
+	(((pte) & (~ARM_SHORT_PTE_LARGE_MSK)) & ~ARM_SHORT_PTE_TYPE_MSK)
+
+#define ARM_SHORT_PGD_GET_PROT(pgd)		\
+	(((pgd) & (~ARM_SHORT_PGD_SECTION_MSK)) & ~ARM_SHORT_PGD_SUPERSECTION)
+
+static bool selftest_running;
+
+static arm_short_iopte *
+arm_short_get_pte_in_pgd(arm_short_iopte pgd, unsigned int iova)
+{
+	arm_short_iopte *pte;
+
+	pte = ARM_SHORT_GET_PTE_VA(pgd);
+	pte += ARM_SHORT_PTE_IDX(iova);
+	return pte;
+}
+
+static void _arm_short_free_pgtable(struct arm_short_io_pgtable *data,
+				    arm_short_iopte *pgd)
+{
+	const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
+	arm_short_iopte *pte;
+	int i;
+
+	pte = ARM_SHORT_GET_PTE_VA(*pgd);
+	for (i = 0; i < ARM_SHORT_PTRS_PER_PTE; i++) {
+		if (pte[i] != 0)
+			return;
+	}
+
+	/* Free the whole pte table and clear the pgd once all ptes are unmapped */
+	kmem_cache_free(data->ptekmem, pte);
+	*pgd = 0;
+	tlb->flush_pgtable(pgd, sizeof(*pgd), data->iop.cookie);
+}
+
+static arm_short_iopte
+__arm_short_pte_prot(struct arm_short_io_pgtable *data, int prot, bool large)
+{
+	arm_short_iopte pteprot;
+
+	pteprot = ARM_SHORT_PTE_S | ARM_SHORT_PTE_nG;
+	pteprot |= large ? ARM_SHORT_PTE_TYPE_LARGE :
+				ARM_SHORT_PTE_TYPE_SMALL;
+	if (prot & IOMMU_CACHE)
+		pteprot |=  ARM_SHORT_PTE_B | ARM_SHORT_PTE_C;
+	if (prot & IOMMU_WRITE)
+		pteprot |= large ? ARM_SHORT_PTE_LARGE_TEX0 :
+				ARM_SHORT_PTE_SMALL_TEX0;
+	if (prot & IOMMU_NOEXEC)
+		pteprot |= large ? ARM_SHORT_PTE_LARGE_XN :
+			ARM_SHORT_PTE_SMALL_XN;
+	return pteprot;
+}
+
+static arm_short_iopte
+__arm_short_pgd_prot(struct arm_short_io_pgtable *data, int prot, bool super)
+{
+	arm_short_iopte pgdprot;
+
+	pgdprot = ARM_SHORT_PGD_S | ARM_SHORT_PGD_nG;
+	pgdprot |= super ? ARM_SHORT_PGD_TYPE_SUPERSECTION :
+				ARM_SHORT_PGD_TYPE_SECTION;
+	if (prot & IOMMU_CACHE)
+		pgdprot |= ARM_SHORT_PGD_C | ARM_SHORT_PGD_B;
+	if (prot & IOMMU_WRITE)
+		pgdprot |= ARM_SHORT_PGD_TEX0;
+	if (prot & IOMMU_NOEXEC)
+		pgdprot |= ARM_SHORT_PGD_SECTION_XN;
+	if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
+		pgdprot |= ARM_SHORT_PGD_SECTION_NS;
+	return pgdprot;
+}
+
+static arm_short_iopte
+__arm_short_pte_prot_split(struct arm_short_io_pgtable *data,
+			   arm_short_iopte pgdprot,
+			   arm_short_iopte pteprot_large,
+			   bool large)
+{
+	arm_short_iopte pteprot = 0;
+
+	pteprot = ARM_SHORT_PTE_S | ARM_SHORT_PTE_nG;
+	pteprot |= large ? ARM_SHORT_PTE_TYPE_LARGE :
+				ARM_SHORT_PTE_TYPE_SMALL;
+	/* section to pte prot */
+	if (pgdprot & ARM_SHORT_PGD_C)
+		pteprot |= ARM_SHORT_PTE_C;
+	if (pgdprot & ARM_SHORT_PGD_B)
+		pteprot |= ARM_SHORT_PTE_B;
+	if (pgdprot & ARM_SHORT_PGD_TEX0)
+		pteprot |= large ? ARM_SHORT_PTE_LARGE_TEX0 :
+				ARM_SHORT_PTE_SMALL_TEX0;
+	if (pgdprot & ARM_SHORT_PGD_nG)
+		pteprot |= ARM_SHORT_PTE_nG;
+	if (pgdprot & ARM_SHORT_PGD_SECTION_XN)
+		pteprot |= large ? ARM_SHORT_PTE_LARGE_XN :
+				ARM_SHORT_PTE_SMALL_XN;
+
+	/* large page to small page pte prot. Only large page may split */
+	if (pteprot_large && !large) {
+		if (pteprot_large & ARM_SHORT_PTE_LARGE_TEX0)
+			pteprot |= ARM_SHORT_PTE_SMALL_TEX0;
+		if (pteprot_large & ARM_SHORT_PTE_LARGE_XN)
+			pteprot |= ARM_SHORT_PTE_SMALL_XN;
+	}
+	return pteprot;
+}
+
+static arm_short_iopte
+__arm_short_pgtable_prot(struct arm_short_io_pgtable *data, bool noexec)
+{
+	arm_short_iopte pgdprot = 0;
+
+	pgdprot = ARM_SHORT_PGD_TYPE_PGTABLE;
+	if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_NS)
+		pgdprot |= ARM_SHORT_PGD_PGTABLE_NS;
+	if (noexec)
+		pgdprot |= ARM_SHORT_PGD_PGTABLE_XN;
+	return pgdprot;
+}
+
+static int
+_arm_short_map(struct arm_short_io_pgtable *data,
+	       unsigned int iova, phys_addr_t paddr,
+	       arm_short_iopte pgdprot, arm_short_iopte pteprot,
+	       bool large)
+{
+	const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
+	arm_short_iopte *pgd = data->pgd, *pte;
+	void *cookie = data->iop.cookie, *pte_va;
+	unsigned int ptenr = large ? 16 : 1;
+	int i, quirk = data->iop.cfg.quirks;
+	bool ptenew = false;
+
+	pgd += ARM_SHORT_PGD_IDX(iova);
+
+	if (!pteprot) { /* section or supersection */
+		if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK)
+			pgdprot &= ~ARM_SHORT_PGD_SECTION_XN;
+		pte = pgd;
+		pteprot = pgdprot;
+	} else {        /* page or largepage */
+		if (quirk & IO_PGTABLE_QUIRK_SHORT_MTK) {
+			if (large) { /* special Bit */
+				if (pteprot & ARM_SHORT_PTE_LARGE_TEX0) {
+					pteprot &= ~ARM_SHORT_PTE_LARGE_TEX0;
+					pteprot |= ARM_SHORT_PTE_SMALL_TEX0;
+				}
+				pteprot &= ~ARM_SHORT_PTE_LARGE_XN;
+			} else {
+				pteprot &= ~ARM_SHORT_PTE_SMALL_XN;
+			}
+		}
+
+		if (!(*pgd)) {
+			pte_va = kmem_cache_zalloc(data->ptekmem, GFP_ATOMIC);
+			if (unlikely(!pte_va))
+				return -ENOMEM;
+			ptenew = true;
+			*pgd = virt_to_phys(pte_va) | pgdprot;
+			kmemleak_ignore(pte_va);
+			tlb->flush_pgtable(pgd, sizeof(*pgd), cookie);
+		}
+		pte = arm_short_get_pte_in_pgd(*pgd, iova);
+	}
+
+	pteprot |= (arm_short_iopte)paddr;
+	for (i = 0; i < ptenr; i++) {
+		if (pte[i]) {/* Someone else may have allocated for this pte */
+			WARN_ON(!selftest_running);
+			goto err_exist_pte;
+		}
+		pte[i] = pteprot;
+	}
+	tlb->flush_pgtable(pte, ptenr * sizeof(*pte), cookie);
+
+	return 0;
+
+err_exist_pte:
+	while (i--)
+		pte[i] = 0;
+	if (ptenew)
+		kmem_cache_free(data->ptekmem, pte_va);
+	return -EEXIST;
+}
+
+static int arm_short_map(struct io_pgtable_ops *ops, unsigned long iova,
+			 phys_addr_t paddr, size_t size, int prot)
+{
+	struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
+	const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
+	int ret;
+	arm_short_iopte pgdprot = 0, pteprot = 0;
+	bool large;
+
+	/* If no access, then nothing to do */
+	if (!(prot & (IOMMU_READ | IOMMU_WRITE)))
+		return 0;
+
+	switch (size) {
+	case SZ_4K:
+	case SZ_64K:
+		large = (size == SZ_64K) ? true : false;
+		pteprot = __arm_short_pte_prot(data, prot, large);
+		pgdprot = __arm_short_pgtable_prot(data, prot & IOMMU_NOEXEC);
+		break;
+
+	case SZ_1M:
+	case SZ_16M:
+		large = (size == SZ_16M) ? true : false;
+		pgdprot = __arm_short_pgd_prot(data, prot, large);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (WARN_ON((iova | paddr) & (size - 1)))
+		return -EINVAL;
+
+	ret = _arm_short_map(data, iova, paddr, pgdprot, pteprot, large);
+
+	tlb->tlb_add_flush(iova, size, true, data->iop.cookie);
+	tlb->tlb_sync(data->iop.cookie);
+	return ret;
+}
+
+static phys_addr_t arm_short_iova_to_phys(struct io_pgtable_ops *ops,
+					  unsigned long iova)
+{
+	struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
+	arm_short_iopte *pte, *pgd = data->pgd;
+	phys_addr_t pa = 0;
+
+	pgd += ARM_SHORT_PGD_IDX(iova);
+
+	if (ARM_SHORT_PGD_TYPE_IS_PGTABLE(*pgd)) {
+		pte = arm_short_get_pte_in_pgd(*pgd, iova);
+
+		if (ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(*pte)) {
+			pa = (*pte) & ARM_SHORT_PTE_LARGE_MSK;
+			pa |= iova & ~ARM_SHORT_PTE_LARGE_MSK;
+		} else if (ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(*pte)) {
+			pa = (*pte) & ARM_SHORT_PTE_SMALL_MSK;
+			pa |= iova & ~ARM_SHORT_PTE_SMALL_MSK;
+		}
+	} else if (ARM_SHORT_PGD_TYPE_IS_SECTION(*pgd)) {
+		pa = (*pgd) & ARM_SHORT_PGD_SECTION_MSK;
+		pa |= iova & ~ARM_SHORT_PGD_SECTION_MSK;
+	} else if (ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(*pgd)) {
+		pa = (*pgd) & ARM_SHORT_PGD_SUPERSECTION_MSK;
+		pa |= iova & ~ARM_SHORT_PGD_SUPERSECTION_MSK;
+	}
+
+	return pa;
+}
+
+static int
+arm_short_split_blk_unmap(struct io_pgtable_ops *ops, unsigned int iova,
+			  phys_addr_t paddr, size_t size,
+			  arm_short_iopte pgdprotup, arm_short_iopte pteprotup,
+			  size_t blk_size)
+{
+	struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
+	const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
+	struct io_pgtable_cfg *cfg = &data->iop.cfg;
+	unsigned long *pgbitmap = &cfg->pgsize_bitmap;
+	unsigned int blk_base, blk_start, blk_end;
+	arm_short_iopte pgdprot, pteprot;
+	size_t mapsize = 0, nextmapsize;
+	phys_addr_t blk_paddr;
+	int ret;
+	unsigned int i;
+
+	/* find the nearest mapsize */
+	for (i = find_first_bit(pgbitmap, BITS_PER_LONG);
+	     i < BITS_PER_LONG && ((1 << i) < blk_size) &&
+	     IS_ALIGNED(size, 1 << i);
+	     i = find_next_bit(pgbitmap, BITS_PER_LONG, i + 1))
+		mapsize = 1 << i;
+
+	if (WARN_ON(!mapsize))
+		return 0; /* Bytes unmapped */
+	nextmapsize = 1 << i;
+
+	blk_base = iova & ~(blk_size - 1);
+	blk_start = blk_base;
+	blk_end = blk_start + blk_size;
+	blk_paddr = paddr;
+
+	for (; blk_start < blk_end;
+	     blk_start += mapsize, blk_paddr += mapsize) {
+		/* Unmap! */
+		if (blk_start == iova)
+			continue;
+
+		/* Try to upper map */
+		if (blk_base != blk_start &&
+		    IS_ALIGNED(blk_start | blk_paddr, nextmapsize) &&
+		    mapsize != nextmapsize) {
+			mapsize = nextmapsize;
+			i = find_next_bit(pgbitmap, BITS_PER_LONG, i + 1);
+			if (i < BITS_PER_LONG)
+				nextmapsize = 1 << i;
+		}
+
+		if (mapsize == SZ_1M) {
+			pgdprot = pgdprotup;
+			pgdprot |= __arm_short_pgd_prot(data, 0, false);
+			pteprot = 0;
+		} else { /* small or large page */
+			bool noexec = (blk_size == SZ_64K) ?
+				(pteprotup & ARM_SHORT_PTE_LARGE_XN) :
+				(pgdprotup & ARM_SHORT_PGD_SECTION_XN);
+
+			pteprot = __arm_short_pte_prot_split(
+						data, pgdprotup, pteprotup,
+						mapsize == SZ_64K);
+			pgdprot = __arm_short_pgtable_prot(data, noexec);
+		}
+
+		ret = _arm_short_map(data, blk_start, blk_paddr, pgdprot,
+				     pteprot, mapsize == SZ_64K);
+		if (ret < 0) {
+			/* Free the table we allocated */
+			arm_short_iopte *pgd = data->pgd, *pte;
+
+			pgd += ARM_SHORT_PGD_IDX(blk_base);
+			if (*pgd) {
+				pte = ARM_SHORT_GET_PTE_VA(*pgd);
+				kmem_cache_free(data->ptekmem, pte);
+				*pgd = 0;
+				tlb->flush_pgtable(pgd, sizeof(*pgd),
+						   data->iop.cookie);
+			}
+			return 0;/* Bytes unmapped */
+		}
+		tlb->tlb_add_flush(blk_start, mapsize, true, data->iop.cookie);
+		tlb->tlb_sync(data->iop.cookie);
+	}
+
+	return size;
+}
+
+static int arm_short_unmap(struct io_pgtable_ops *ops,
+			   unsigned long iova,
+			   size_t size)
+{
+	struct arm_short_io_pgtable *data = io_pgtable_ops_to_data(ops);
+	const struct iommu_gather_ops *tlb = data->iop.cfg.tlb;
+	void *cookie = data->iop.cookie;
+	arm_short_iopte *pgd, *pte = NULL;
+	arm_short_iopte pgdprot, pteprot = 0;
+	phys_addr_t paddr;
+	unsigned int nrtoclean, iova_base, blk_size = 0;
+
+	pgd = (arm_short_iopte *)data->pgd + ARM_SHORT_PGD_IDX(iova);
+
+	/* get block size */
+	if (ARM_SHORT_PGD_TYPE_IS_PGTABLE(*pgd)) {
+		pte = arm_short_get_pte_in_pgd(*pgd, iova);
+
+		if (ARM_SHORT_PTE_TYPE_IS_SMALLPAGE(*pte))
+			blk_size = SZ_4K;
+		else if (ARM_SHORT_PTE_TYPE_IS_LARGEPAGE(*pte))
+			blk_size = SZ_64K;
+		else
+			WARN_ON(1);
+	} else if (ARM_SHORT_PGD_TYPE_IS_SECTION(*pgd)) {
+		blk_size = SZ_1M;
+	} else if (ARM_SHORT_PGD_TYPE_IS_SUPERSECTION(*pgd)) {
+		blk_size = SZ_16M;
+	} else {
+		WARN_ON(1);
+	}
+
+	iova_base = iova & ~(blk_size - 1);
+	pgd = (arm_short_iopte *)data->pgd + ARM_SHORT_PGD_IDX(iova_base);
+	paddr = arm_short_iova_to_phys(ops, iova_base);
+	pgdprot = *pgd;
+
+	if (blk_size == SZ_4K || blk_size == SZ_64K) {
+		pte = arm_short_get_pte_in_pgd(*pgd, iova_base);
+		pteprot = *pte;
+		nrtoclean = blk_size / SZ_4K;
+		memset(pte, 0, nrtoclean * sizeof(*pte));
+		tlb->flush_pgtable(pte, nrtoclean * sizeof(*pte), cookie);
+
+		_arm_short_free_pgtable(data, pgd);
+	} else if (blk_size == SZ_1M || blk_size == SZ_16M) {
+		nrtoclean = blk_size / SZ_1M;
+		memset(pgd, 0, nrtoclean * sizeof(*pgd));
+		tlb->flush_pgtable(pgd, nrtoclean * sizeof(*pgd), cookie);
+	}
+
+	tlb->tlb_add_flush(iova, blk_size, true, cookie);
+	tlb->tlb_sync(cookie);
+
+	if (blk_size > size) { /* Split the block */
+		return arm_short_split_blk_unmap(
+				ops, iova, paddr, size,
+				ARM_SHORT_PGD_GET_PROT(pgdprot),
+				ARM_SHORT_PTE_LARGE_GET_PROT(pteprot),
+				blk_size);
+	} else if (blk_size < size) {
+		/* The block was unmapped above; unmap the remainder too */
+		return blk_size +
+			arm_short_unmap(ops, iova + blk_size, size - blk_size);
+	}
+
+	return size;
+}
+
+static struct io_pgtable *
+arm_short_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
+{
+	struct arm_short_io_pgtable *data;
+
+	if (cfg->ias > 32)
+		return NULL;
+
+	if (cfg->oas > 32)
+		return NULL;
+
+	cfg->pgsize_bitmap &=
+		(cfg->quirks & IO_PGTABLE_QUIRK_SHORT_SUPERSECTION) ?
+		(SZ_4K | SZ_64K | SZ_1M | SZ_16M) : (SZ_4K | SZ_64K | SZ_1M);
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return NULL;
+
+	data->pgd_size = SZ_16K;
+	data->pgd = alloc_pages_exact(data->pgd_size,
+				      GFP_KERNEL | __GFP_ZERO | __GFP_DMA);
+	if (!data->pgd)
+		goto out_free_data;
+
+	cfg->tlb->flush_pgtable(data->pgd, data->pgd_size, cookie);
+
+	data->ptekmem = kmem_cache_create("io-pgtable-arm-short",
+					  ARM_SHORT_BYTES_PER_PTE,
+					  ARM_SHORT_BYTES_PER_PTE,
+					  0, NULL);
+	if (!data->ptekmem)
+		goto out_free_pte;
+
+	/* TTBRs */
+	cfg->arm_short_cfg.ttbr[0] = virt_to_phys(data->pgd);
+	cfg->arm_short_cfg.ttbr[1] = 0;
+
+	cfg->arm_short_cfg.tcr = 0;
+	cfg->arm_short_cfg.nmrr = 0;
+	cfg->arm_short_cfg.prrr = 0;
+
+	data->iop.ops = (struct io_pgtable_ops) {
+		.map		= arm_short_map,
+		.unmap		= arm_short_unmap,
+		.iova_to_phys	= arm_short_iova_to_phys,
+	};
+
+	return &data->iop;
+
+out_free_pte:
+	free_pages_exact(data->pgd, data->pgd_size);
+out_free_data:
+	kfree(data);
+	return NULL;
+}
+
+static void arm_short_free_pgtable(struct io_pgtable *iop)
+{
+	struct arm_short_io_pgtable *data = io_pgtable_to_data(iop);
+
+	kmem_cache_destroy(data->ptekmem);
+	free_pages_exact(data->pgd, data->pgd_size);
+	kfree(data);
+}
+
+struct io_pgtable_init_fns io_pgtable_arm_short_init_fns = {
+	.alloc	= arm_short_alloc_pgtable,
+	.free	= arm_short_free_pgtable,
+};
+
+#ifdef CONFIG_IOMMU_IO_PGTABLE_SHORT_SELFTEST
+
+static struct io_pgtable_cfg *cfg_cookie;
+
+static void dummy_tlb_flush_all(void *cookie)
+{
+	WARN_ON(cookie != cfg_cookie);
+}
+
+static void dummy_tlb_add_flush(unsigned long iova, size_t size, bool leaf,
+				void *cookie)
+{
+	WARN_ON(cookie != cfg_cookie);
+	WARN_ON(!(size & cfg_cookie->pgsize_bitmap));
+}
+
+static void dummy_tlb_sync(void *cookie)
+{
+	WARN_ON(cookie != cfg_cookie);
+}
+
+static void dummy_flush_pgtable(void *ptr, size_t size, void *cookie)
+{
+	WARN_ON(cookie != cfg_cookie);
+}
+
+static struct iommu_gather_ops dummy_tlb_ops = {
+	.tlb_flush_all	= dummy_tlb_flush_all,
+	.tlb_add_flush	= dummy_tlb_add_flush,
+	.tlb_sync	= dummy_tlb_sync,
+	.flush_pgtable	= dummy_flush_pgtable,
+};
+
+#define __FAIL(ops)	({						\
+		WARN(1, "selftest: test failed\n");	\
+		selftest_running = false;				\
+		-EFAULT;						\
+})
+
+static int __init arm_short_do_selftests(void)
+{
+	struct io_pgtable_ops *ops;
+	struct io_pgtable_cfg cfg = {
+		.tlb = &dummy_tlb_ops,
+		.oas = 32,
+		.ias = 32,
+		.quirks = IO_PGTABLE_QUIRK_ARM_NS |
+			IO_PGTABLE_QUIRK_SHORT_SUPERSECTION,
+		.pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
+	};
+	unsigned int iova, size, iova_start;
+	unsigned int i, loopnr = 0;
+
+	selftest_running = true;
+
+	cfg_cookie = &cfg;
+
+	ops = alloc_io_pgtable_ops(ARM_SHORT_DESC, &cfg, &cfg);
+	if (!ops) {
+		pr_err("Failed to alloc short desc io pgtable\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Initial sanity checks.
+	 * Empty page tables shouldn't provide any translations.
+	 */
+	if (ops->iova_to_phys(ops, 42))
+		return __FAIL(ops);
+
+	if (ops->iova_to_phys(ops, SZ_1G + 42))
+		return __FAIL(ops);
+
+	if (ops->iova_to_phys(ops, SZ_2G + 42))
+		return __FAIL(ops);
+
+	/*
+	 * Distinct mappings of different granule sizes.
+	 */
+	iova = 0;
+	i = find_first_bit(&cfg.pgsize_bitmap, BITS_PER_LONG);
+	while (i != BITS_PER_LONG) {
+		size = 1UL << i;
+		if (ops->map(ops, iova, iova, size, IOMMU_READ |
+						    IOMMU_WRITE |
+						    IOMMU_NOEXEC |
+						    IOMMU_CACHE))
+			return __FAIL(ops);
+
+		/* Overlapping mappings */
+		if (!ops->map(ops, iova, iova + size, size,
+			      IOMMU_READ | IOMMU_NOEXEC))
+			return __FAIL(ops);
+
+		if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+			return __FAIL(ops);
+
+		iova += SZ_16M;
+		i++;
+		i = find_next_bit(&cfg.pgsize_bitmap, BITS_PER_LONG, i);
+		loopnr++;
+	}
+
+	/* Partial unmap */
+	i = 1;
+	size = 1UL << __ffs(cfg.pgsize_bitmap);
+	while (i < loopnr) {
+		iova_start = i * SZ_16M;
+		if (ops->unmap(ops, iova_start + size, size) != size)
+			return __FAIL(ops);
+
+		/* Remap of partial unmap */
+		if (ops->map(ops, iova_start + size, size, size, IOMMU_READ))
+			return __FAIL(ops);
+
+		if (ops->iova_to_phys(ops, iova_start + size + 42)
+		    != (size + 42))
+			return __FAIL(ops);
+		i++;
+	}
+
+	/* Full unmap */
+	iova = 0;
+	i = find_first_bit(&cfg.pgsize_bitmap, BITS_PER_LONG);
+	while (i != BITS_PER_LONG) {
+		size = 1UL << i;
+
+		if (ops->unmap(ops, iova, size) != size)
+			return __FAIL(ops);
+
+		if (ops->iova_to_phys(ops, iova + 42))
+			return __FAIL(ops);
+
+		/* Remap full block */
+		if (ops->map(ops, iova, iova, size, IOMMU_WRITE))
+			return __FAIL(ops);
+
+		if (ops->iova_to_phys(ops, iova + 42) != (iova + 42))
+			return __FAIL(ops);
+
+		iova += SZ_16M;
+		i++;
+		i = find_next_bit(&cfg.pgsize_bitmap, BITS_PER_LONG, i);
+	}
+
+	free_io_pgtable_ops(ops);
+
+	selftest_running = false;
+	return 0;
+}
+subsys_initcall(arm_short_do_selftests);
+#endif
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 4e46021..13aad17 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -36,9 +36,6 @@ 
 #define io_pgtable_to_data(x)						\
 	container_of((x), struct arm_lpae_io_pgtable, iop)
 
-#define io_pgtable_ops_to_pgtable(x)					\
-	container_of((x), struct io_pgtable, ops)
-
 #define io_pgtable_ops_to_data(x)					\
 	io_pgtable_to_data(io_pgtable_ops_to_pgtable(x))
 
diff --git a/drivers/iommu/io-pgtable.c b/drivers/iommu/io-pgtable.c
index 6436fe2..14a9b3a 100644
--- a/drivers/iommu/io-pgtable.c
+++ b/drivers/iommu/io-pgtable.c
@@ -28,6 +28,7 @@  extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s1_init_fns;
 extern struct io_pgtable_init_fns io_pgtable_arm_32_lpae_s2_init_fns;
 extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns;
 extern struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s2_init_fns;
+extern struct io_pgtable_init_fns io_pgtable_arm_short_init_fns;
 
 static const struct io_pgtable_init_fns *
 io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] =
@@ -38,6 +39,9 @@  io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] =
 	[ARM_64_LPAE_S1] = &io_pgtable_arm_64_lpae_s1_init_fns,
 	[ARM_64_LPAE_S2] = &io_pgtable_arm_64_lpae_s2_init_fns,
 #endif
+#ifdef CONFIG_IOMMU_IO_PGTABLE_SHORT
+	[ARM_SHORT_DESC] = &io_pgtable_arm_short_init_fns,
+#endif
 };
 
 struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h
index 10e32f6..7261ada 100644
--- a/drivers/iommu/io-pgtable.h
+++ b/drivers/iommu/io-pgtable.h
@@ -9,6 +9,7 @@  enum io_pgtable_fmt {
 	ARM_32_LPAE_S2,
 	ARM_64_LPAE_S1,
 	ARM_64_LPAE_S2,
+	ARM_SHORT_DESC,
 	IO_PGTABLE_NUM_FMTS,
 };
 
@@ -44,6 +45,8 @@  struct iommu_gather_ops {
  */
 struct io_pgtable_cfg {
 	#define IO_PGTABLE_QUIRK_ARM_NS	(1 << 0)	/* Set NS bit in PTEs */
+	#define IO_PGTABLE_QUIRK_SHORT_SUPERSECTION     BIT(1)
+	#define IO_PGTABLE_QUIRK_SHORT_MTK		BIT(2)
 	int				quirks;
 	unsigned long			pgsize_bitmap;
 	unsigned int			ias;
@@ -62,6 +65,13 @@  struct io_pgtable_cfg {
 			u64	vttbr;
 			u64	vtcr;
 		} arm_lpae_s2_cfg;
+
+		struct {
+			u32	ttbr[2];
+			u32	tcr;
+			u32	nmrr;
+			u32	prrr;
+		} arm_short_cfg;
 	};
 };
 
@@ -128,6 +138,9 @@  struct io_pgtable {
 	struct io_pgtable_ops	ops;
 };
 
+#define io_pgtable_ops_to_pgtable(x)		\
+	container_of((x), struct io_pgtable, ops)
+
 /**
  * struct io_pgtable_init_fns - Alloc/free a set of page tables for a
  *                              particular format.