Message ID: 1465392392-2003-7-git-send-email-zhengsq@rock-chips.com (mailing list archive)
State: New, archived
Hi,

On Wed, Jun 8, 2016 at 10:26 PM, Shunqian Zheng <zhengsq@rock-chips.com> wrote:
> Use the DMA API instead of architecture-internal functions like
> __cpuc_flush_dcache_area() etc.
>
> To support virtual devices like DRM, the previous patch added a
> virtual slave iommu; by attaching to it, DRM can use its own
> domain->dev for dma_map_*() and dma_sync_*() even when VOP is
> disabled.
>
> With this patch, this driver is available for ARM64 SoCs like RK3399.
>

Could we instead simply allocate coherent memory for page tables using
dma_alloc_coherent() and skip any flushing on the CPU side completely?
If I'm looking correctly, the driver only reads back the page directory
when checking if there is a need to allocate a new page table, so there
shouldn't be any significant penalty for disabling the cache.

Other than that, please see some comments inline.

> Signed-off-by: Shunqian Zheng <zhengsq@rock-chips.com>
> ---
>  drivers/iommu/rockchip-iommu.c | 113 ++++++++++++++++++++++++++---------------
>  1 file changed, 71 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
> index d6c3051..aafea6e 100644
> --- a/drivers/iommu/rockchip-iommu.c
> +++ b/drivers/iommu/rockchip-iommu.c
> @@ -4,8 +4,6 @@
>   * published by the Free Software Foundation.
>   */
>
> -#include <asm/cacheflush.h>
> -#include <asm/pgtable.h>
>  #include <linux/compiler.h>
>  #include <linux/delay.h>
>  #include <linux/device.h>
> @@ -61,8 +59,7 @@
>  #define RK_MMU_IRQ_BUS_ERROR     0x02  /* bus read error */
>  #define RK_MMU_IRQ_MASK          (RK_MMU_IRQ_PAGE_FAULT | RK_MMU_IRQ_BUS_ERROR)
>
> -#define NUM_DT_ENTRIES 1024
> -#define NUM_PT_ENTRIES 1024
> +#define NUM_TLB_ENTRIES 1024 /* for both DT and PT */

Is it necessary to change this in this patch? In general, it's not a
good idea to mix multiple logical changes together.

>
>  #define SPAGE_ORDER 12
>  #define SPAGE_SIZE (1 << SPAGE_ORDER)
> @@ -82,7 +79,9 @@
>
>  struct rk_iommu_domain {
>         struct list_head iommus;
> +       struct device *dev;
>         u32 *dt; /* page directory table */
> +       dma_addr_t dt_dma;
>         spinlock_t iommus_lock; /* lock for iommus list */
>         spinlock_t dt_lock; /* lock for modifying page directory table */
>
> @@ -98,14 +97,12 @@ struct rk_iommu {
>         struct iommu_domain *domain; /* domain to which iommu is attached */
>  };
>
> -static inline void rk_table_flush(u32 *va, unsigned int count)
> +static inline void rk_table_flush(struct device *dev, dma_addr_t dma,
> +                                 unsigned int count)
>  {
> -       phys_addr_t pa_start = virt_to_phys(va);
> -       phys_addr_t pa_end = virt_to_phys(va + count);
> -       size_t size = pa_end - pa_start;
> +       size_t size = count * 4;

It would be a good idea to specify what "count" is. I'm a bit confused:
did it mean bytes before, and now some multiple of 4?

Best regards,
Tomasz
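For illustration, a minimal sketch of the coherent-allocation idea
suggested above, as it might replace the driver's get_zeroed_page() +
rk_table_flush() pairing; the function name, the GFP flags and the
explicit memset are assumptions for illustration, not code from the
patch:

/*
 * Sketch: allocate one page table (1024 u32 entries, SPAGE_SIZE bytes)
 * from coherent memory so that no explicit CPU-side cache maintenance
 * is needed afterwards. GFP_ATOMIC is assumed because the existing
 * callers hold the dt_lock spinlock.
 */
static u32 *rk_alloc_pt_coherent(struct device *iommu_dev, dma_addr_t *pt_dma)
{
	u32 *pt = dma_alloc_coherent(iommu_dev, SPAGE_SIZE, pt_dma,
				     GFP_ATOMIC);

	if (!pt)
		return NULL;

	/* Make sure all 1024 entries start out invalid. */
	memset(pt, 0, SPAGE_SIZE);

	/*
	 * The mapping is coherent: later entry updates become visible to
	 * the IOMMU without any rk_table_flush()/dma_sync_*() call.
	 */
	return pt;
}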
Hi,

On June 10, 2016, 17:10, Tomasz Figa wrote:
> Hi,
>
> On Wed, Jun 8, 2016 at 10:26 PM, Shunqian Zheng <zhengsq@rock-chips.com> wrote:
>> Use the DMA API instead of architecture-internal functions like
>> __cpuc_flush_dcache_area() etc.
>>
>> To support virtual devices like DRM, the previous patch added a
>> virtual slave iommu; by attaching to it, DRM can use its own
>> domain->dev for dma_map_*() and dma_sync_*() even when VOP is
>> disabled.
>>
>> With this patch, this driver is available for ARM64 SoCs like RK3399.
>>
> Could we instead simply allocate coherent memory for page tables using
> dma_alloc_coherent() and skip any flushing on the CPU side completely?
> If I'm looking correctly, the driver only reads back the page directory
> when checking if there is a need to allocate a new page table, so there
> shouldn't be any significant penalty for disabling the cache.

I tried using dma_alloc_coherent() to replace dma_map_single(), but it
doesn't work properly for me. The DRM device uses iommu_dma_ops instead
of swiotlb_dma_ops after attaching to the iommu, so when the iommu
domain needs to allocate a new page in rk_iommu_map(), it would call:

    rk_iommu_map() --> dma_alloc_coherent() --> ops->alloc()
        --> iommu_map() --> rk_iommu_map()

Then I tried reserving memory for coherent allocations, so that
dma_alloc_coherent() calls dma_alloc_from_coherent() instead of
ops->alloc(). But that doesn't work either, because when DRM requests a
buffer it never uses the iommu.

>
> Other than that, please see some comments inline.
>
>> Signed-off-by: Shunqian Zheng <zhengsq@rock-chips.com>
>> ---
>>  drivers/iommu/rockchip-iommu.c | 113 ++++++++++++++++++++++++++---------------
>>  1 file changed, 71 insertions(+), 42 deletions(-)
>>
>> diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
>> index d6c3051..aafea6e 100644
>> --- a/drivers/iommu/rockchip-iommu.c
>> +++ b/drivers/iommu/rockchip-iommu.c
>> @@ -4,8 +4,6 @@
>>   * published by the Free Software Foundation.
>>   */
>>
>> -#include <asm/cacheflush.h>
>> -#include <asm/pgtable.h>
>>  #include <linux/compiler.h>
>>  #include <linux/delay.h>
>>  #include <linux/device.h>
>> @@ -61,8 +59,7 @@
>>  #define RK_MMU_IRQ_BUS_ERROR     0x02  /* bus read error */
>>  #define RK_MMU_IRQ_MASK          (RK_MMU_IRQ_PAGE_FAULT | RK_MMU_IRQ_BUS_ERROR)
>>
>> -#define NUM_DT_ENTRIES 1024
>> -#define NUM_PT_ENTRIES 1024
>> +#define NUM_TLB_ENTRIES 1024 /* for both DT and PT */
> Is it necessary to change this in this patch? In general, it's not a
> good idea to mix multiple logical changes together.

Sure, will restore these changes in v3.

>
>>  #define SPAGE_ORDER 12
>>  #define SPAGE_SIZE (1 << SPAGE_ORDER)
>> @@ -82,7 +79,9 @@
>>
>>  struct rk_iommu_domain {
>>         struct list_head iommus;
>> +       struct device *dev;
>>         u32 *dt; /* page directory table */
>> +       dma_addr_t dt_dma;
>>         spinlock_t iommus_lock; /* lock for iommus list */
>>         spinlock_t dt_lock; /* lock for modifying page directory table */
>>
>> @@ -98,14 +97,12 @@ struct rk_iommu {
>>         struct iommu_domain *domain; /* domain to which iommu is attached */
>>  };
>>
>> -static inline void rk_table_flush(u32 *va, unsigned int count)
>> +static inline void rk_table_flush(struct device *dev, dma_addr_t dma,
>> +                                 unsigned int count)
>>  {
>> -       phys_addr_t pa_start = virt_to_phys(va);
>> -       phys_addr_t pa_end = virt_to_phys(va + count);
>> -       size_t size = pa_end - pa_start;
>> +       size_t size = count * 4;
> It would be a good idea to specify what "count" is. I'm a bit confused:
> did it mean bytes before, and now some multiple of 4?
"count" means PT/DT entry count to flush here. I would add some more comment on it. Thank you very much, Shunqian > > Best regards, > Tomasz > > _______________________________________________ > Linux-rockchip mailing list > Linux-rockchip@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/linux-rockchip
On Mon, Jun 13, 2016 at 6:56 PM, Shunqian Zheng <shunqian.zheng@gmail.com> wrote:
> Hi
>
> On June 10, 2016, 17:10, Tomasz Figa wrote:
>> Hi,
>>
>> On Wed, Jun 8, 2016 at 10:26 PM, Shunqian Zheng <zhengsq@rock-chips.com>
>> wrote:
>>> Use the DMA API instead of architecture-internal functions like
>>> __cpuc_flush_dcache_area() etc.
>>>
>>> To support virtual devices like DRM, the previous patch added a
>>> virtual slave iommu; by attaching to it, DRM can use its own
>>> domain->dev for dma_map_*() and dma_sync_*() even when VOP is
>>> disabled.
>>>
>>> With this patch, this driver is available for ARM64 SoCs like RK3399.
>>>
>> Could we instead simply allocate coherent memory for page tables using
>> dma_alloc_coherent() and skip any flushing on the CPU side completely?
>> If I'm looking correctly, the driver only reads back the page directory
>> when checking if there is a need to allocate a new page table, so there
>> shouldn't be any significant penalty for disabling the cache.
>
> I tried using dma_alloc_coherent() to replace dma_map_single(), but it
> doesn't work properly for me. The DRM device uses iommu_dma_ops instead
> of swiotlb_dma_ops after attaching to the iommu, so when the iommu
> domain needs to allocate a new page in rk_iommu_map(), it would call:
>
>     rk_iommu_map() --> dma_alloc_coherent() --> ops->alloc()
>         --> iommu_map() --> rk_iommu_map()

It shouldn't call iommu_map(), because the IOMMU is not behind another
IOMMU. Are you sure you called dma_alloc_coherent() on behalf of the
IOMMU struct device and not the DRM device?

Best regards,
Tomasz
Hi,

On June 13, 2016, 18:21, Tomasz Figa wrote:
> On Mon, Jun 13, 2016 at 6:56 PM, Shunqian Zheng
> <shunqian.zheng@gmail.com> wrote:
>> Hi
>>
>> On June 10, 2016, 17:10, Tomasz Figa wrote:
>>> Hi,
>>>
>>> On Wed, Jun 8, 2016 at 10:26 PM, Shunqian Zheng <zhengsq@rock-chips.com>
>>> wrote:
>>>> Use the DMA API instead of architecture-internal functions like
>>>> __cpuc_flush_dcache_area() etc.
>>>>
>>>> To support virtual devices like DRM, the previous patch added a
>>>> virtual slave iommu; by attaching to it, DRM can use its own
>>>> domain->dev for dma_map_*() and dma_sync_*() even when VOP is
>>>> disabled.
>>>>
>>>> With this patch, this driver is available for ARM64 SoCs like RK3399.
>>>>
>>> Could we instead simply allocate coherent memory for page tables using
>>> dma_alloc_coherent() and skip any flushing on the CPU side completely?
>>> If I'm looking correctly, the driver only reads back the page directory
>>> when checking if there is a need to allocate a new page table, so there
>>> shouldn't be any significant penalty for disabling the cache.
>>
>> I tried using dma_alloc_coherent() to replace dma_map_single(), but it
>> doesn't work properly for me. The DRM device uses iommu_dma_ops instead
>> of swiotlb_dma_ops after attaching to the iommu, so when the iommu
>> domain needs to allocate a new page in rk_iommu_map(), it would call:
>>
>>     rk_iommu_map() --> dma_alloc_coherent() --> ops->alloc()
>>         --> iommu_map() --> rk_iommu_map()
>
> It shouldn't call iommu_map(), because the IOMMU is not behind another
> IOMMU. Are you sure you called dma_alloc_coherent() on behalf of the
> IOMMU struct device and not the DRM device?

I called dma_alloc_coherent() with the DRM device, not the IOMMU device,
because DRM doesn't attach to any real iommu. Even if I allocate a
virtual one when attaching, iommu->dev is still the DRM device.
Am I right here?

Thank you very much,
Shunqian

> Best regards,
> Tomasz
On 13/06/16 10:56, Shunqian Zheng wrote:
> Hi
>
> On June 10, 2016, 17:10, Tomasz Figa wrote:
>> Hi,
>>
>> On Wed, Jun 8, 2016 at 10:26 PM, Shunqian Zheng
>> <zhengsq@rock-chips.com> wrote:
>>> Use the DMA API instead of architecture-internal functions like
>>> __cpuc_flush_dcache_area() etc.
>>>
>>> To support virtual devices like DRM, the previous patch added a
>>> virtual slave iommu; by attaching to it, DRM can use its own
>>> domain->dev for dma_map_*() and dma_sync_*() even when VOP is
>>> disabled.
>>>
>>> With this patch, this driver is available for ARM64 SoCs like RK3399.
>>>
>> Could we instead simply allocate coherent memory for page tables using
>> dma_alloc_coherent() and skip any flushing on the CPU side completely?
>> If I'm looking correctly, the driver only reads back the page directory
>> when checking if there is a need to allocate a new page table, so there
>> shouldn't be any significant penalty for disabling the cache.
> I tried using dma_alloc_coherent() to replace dma_map_single(), but it
> doesn't work properly for me. The DRM device uses iommu_dma_ops instead
> of swiotlb_dma_ops after attaching to the iommu, so when the iommu
> domain needs to allocate a new page in rk_iommu_map(), it would call:
>
>     rk_iommu_map() --> dma_alloc_coherent() --> ops->alloc()
>         --> iommu_map() --> rk_iommu_map()

That sounds more like you're passing the wrong device around somewhere,
since this approach is working fine for other IOMMUs; specifically, the
flow goes:

dma_alloc_coherent(DRM dev)                   // for buffer
--> ops->alloc(DRM dev)
    --> iommu_dma_alloc(DRM dev)
        --> iommu_map()
            --> dma_alloc_coherent(IOMMU dev) // for pagetable
                --> ops->alloc(IOMMU dev)
                    --> swiotlb_alloc(IOMMU dev)

There shouldn't be any need for this "virtual IOMMU" at all. I think
the Exynos DRM driver is in a similar situation of having multiple
devices (involving multiple IOMMUs) backing the virtual DRM device,
and that doesn't seem to be doing anything this crazy, so it's probably
worth taking a look at.

Robin.

> Then I tried reserving memory for coherent allocations, so that
> dma_alloc_coherent() calls dma_alloc_from_coherent() instead of
> ops->alloc(). But that doesn't work either, because when DRM requests
> a buffer it never uses the iommu.
>>
>> Other than that, please see some comments inline.
>>
>>> Signed-off-by: Shunqian Zheng <zhengsq@rock-chips.com>
>>> ---
>>>   drivers/iommu/rockchip-iommu.c | 113 ++++++++++++++++++++++++++---------------
>>>   1 file changed, 71 insertions(+), 42 deletions(-)
>>>
>>> diff --git a/drivers/iommu/rockchip-iommu.c
>>> b/drivers/iommu/rockchip-iommu.c
>>> index d6c3051..aafea6e 100644
>>> --- a/drivers/iommu/rockchip-iommu.c
>>> +++ b/drivers/iommu/rockchip-iommu.c
>>> @@ -4,8 +4,6 @@
>>>    * published by the Free Software Foundation.
>>>    */
>>>
>>> -#include <asm/cacheflush.h>
>>> -#include <asm/pgtable.h>
>>>   #include <linux/compiler.h>
>>>   #include <linux/delay.h>
>>>   #include <linux/device.h>
>>> @@ -61,8 +59,7 @@
>>>   #define RK_MMU_IRQ_BUS_ERROR     0x02  /* bus read error */
>>>   #define RK_MMU_IRQ_MASK          (RK_MMU_IRQ_PAGE_FAULT | RK_MMU_IRQ_BUS_ERROR)
>>>
>>> -#define NUM_DT_ENTRIES 1024
>>> -#define NUM_PT_ENTRIES 1024
>>> +#define NUM_TLB_ENTRIES 1024 /* for both DT and PT */
>> Is it necessary to change this in this patch? In general, it's not a
>> good idea to mix multiple logical changes together.
> Sure, will restore these changes in v3.
>>
>>>   #define SPAGE_ORDER 12
>>>   #define SPAGE_SIZE (1 << SPAGE_ORDER)
>>> @@ -82,7 +79,9 @@
>>>
>>>   struct rk_iommu_domain {
>>>          struct list_head iommus;
>>> +       struct device *dev;
>>>          u32 *dt; /* page directory table */
>>> +       dma_addr_t dt_dma;
>>>          spinlock_t iommus_lock; /* lock for iommus list */
>>>          spinlock_t dt_lock; /* lock for modifying page directory table */
>>>
>>> @@ -98,14 +97,12 @@ struct rk_iommu {
>>>          struct iommu_domain *domain; /* domain to which iommu is attached */
>>>   };
>>>
>>> -static inline void rk_table_flush(u32 *va, unsigned int count)
>>> +static inline void rk_table_flush(struct device *dev, dma_addr_t dma,
>>> +                                 unsigned int count)
>>>   {
>>> -       phys_addr_t pa_start = virt_to_phys(va);
>>> -       phys_addr_t pa_end = virt_to_phys(va + count);
>>> -       size_t size = pa_end - pa_start;
>>> +       size_t size = count * 4;
>> It would be a good idea to specify what "count" is. I'm a bit confused:
>> did it mean bytes before, and now some multiple of 4?
> "count" means the number of PT/DT entries to flush here. I will add
> some more comments on it.
>
> Thank you very much,
> Shunqian
>>
>> Best regards,
>> Tomasz
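To make Robin's call flow concrete, a hypothetical sketch of the two
distinct allocation sites; drm_dev and iommu_dev are illustrative names
for the DRM master's and the IOMMU's struct device, and the whole
function is an assumption for illustration, not code from the driver:

static int pagetable_alloc_example(struct device *drm_dev,
				   struct device *iommu_dev)
{
	dma_addr_t buf_dma, pt_dma;
	void *buf;
	u32 *pt;

	/*
	 * Buffer allocated on behalf of the DRM master: once the master
	 * is attached, this is routed through iommu_dma_ops and ends up
	 * in iommu_map(), exactly as in the flow above.
	 */
	buf = dma_alloc_coherent(drm_dev, SZ_64K, &buf_dma, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/*
	 * Page table allocated on behalf of the IOMMU itself: the IOMMU
	 * is not behind another IOMMU, so this takes the plain
	 * (e.g. swiotlb) path and cannot recurse back into iommu_map().
	 */
	pt = dma_alloc_coherent(iommu_dev, SPAGE_SIZE, &pt_dma, GFP_ATOMIC);
	if (!pt) {
		dma_free_coherent(drm_dev, SZ_64K, buf, buf_dma);
		return -ENOMEM;
	}

	dma_free_coherent(iommu_dev, SPAGE_SIZE, pt, pt_dma);
	dma_free_coherent(drm_dev, SZ_64K, buf, buf_dma);
	return 0;
}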
On Mon, Jun 13, 2016 at 7:31 PM, Shunqian Zheng <shunqian.zheng@gmail.com> wrote:
> Hi,
>
> On June 13, 2016, 18:21, Tomasz Figa wrote:
>> On Mon, Jun 13, 2016 at 6:56 PM, Shunqian Zheng
>> <shunqian.zheng@gmail.com> wrote:
>>> Hi
>>>
>>> On June 10, 2016, 17:10, Tomasz Figa wrote:
>>>> Hi,
>>>>
>>>> On Wed, Jun 8, 2016 at 10:26 PM, Shunqian Zheng <zhengsq@rock-chips.com>
>>>> wrote:
>>>>> Use the DMA API instead of architecture-internal functions like
>>>>> __cpuc_flush_dcache_area() etc.
>>>>>
>>>>> To support virtual devices like DRM, the previous patch added a
>>>>> virtual slave iommu; by attaching to it, DRM can use its own
>>>>> domain->dev for dma_map_*() and dma_sync_*() even when VOP is
>>>>> disabled.
>>>>>
>>>>> With this patch, this driver is available for ARM64 SoCs like RK3399.
>>>>>
>>>> Could we instead simply allocate coherent memory for page tables using
>>>> dma_alloc_coherent() and skip any flushing on the CPU side completely?
>>>> If I'm looking correctly, the driver only reads back the page directory
>>>> when checking if there is a need to allocate a new page table, so there
>>>> shouldn't be any significant penalty for disabling the cache.
>>>
>>> I tried using dma_alloc_coherent() to replace dma_map_single(), but it
>>> doesn't work properly for me. The DRM device uses iommu_dma_ops instead
>>> of swiotlb_dma_ops after attaching to the iommu, so when the iommu
>>> domain needs to allocate a new page in rk_iommu_map(), it would call:
>>>
>>>     rk_iommu_map() --> dma_alloc_coherent() --> ops->alloc()
>>>         --> iommu_map() --> rk_iommu_map()
>>
>> It shouldn't call iommu_map(), because the IOMMU is not behind another
>> IOMMU. Are you sure you called dma_alloc_coherent() on behalf of the
>> IOMMU struct device and not the DRM device?
>
> I called dma_alloc_coherent() with the DRM device, not the IOMMU device,
> because DRM doesn't attach to any real iommu. Even if I allocate a
> virtual one when attaching, iommu->dev is still the DRM device.
> Am I right here?

What I meant is that even though rk_iommu_map() is called for the DRM
device, the page table allocation happening inside should be done on
behalf of the IOMMU device itself, because it is the consumer of these
page tables.

Best regards,
Tomasz
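A sketch of that suggestion applied to rk_dte_get_page_table();
rk_domain->iommu_dev is a hypothetical field holding the real IOMMU's
struct device (this patch only has rk_domain->dev, which may be the DRM
master), and the GFP flags are assumed:

	/*
	 * Sketch: map the new page table for the IOMMU device itself,
	 * the consumer of the table, instead of for whatever master
	 * happened to attach to the domain.
	 */
	page_table = (u32 *)get_zeroed_page(GFP_ATOMIC | GFP_DMA32);
	if (!page_table)
		return ERR_PTR(-ENOMEM);

	pt_dma = dma_map_single(rk_domain->iommu_dev, page_table,
				SPAGE_SIZE, DMA_TO_DEVICE);
	if (dma_mapping_error(rk_domain->iommu_dev, pt_dma)) {
		free_page((unsigned long)page_table);
		return ERR_PTR(-ENOMEM);
	}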
diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
index d6c3051..aafea6e 100644
--- a/drivers/iommu/rockchip-iommu.c
+++ b/drivers/iommu/rockchip-iommu.c
@@ -4,8 +4,6 @@
  * published by the Free Software Foundation.
  */
 
-#include <asm/cacheflush.h>
-#include <asm/pgtable.h>
 #include <linux/compiler.h>
 #include <linux/delay.h>
 #include <linux/device.h>
@@ -61,8 +59,7 @@
 #define RK_MMU_IRQ_BUS_ERROR	 0x02  /* bus read error */
 #define RK_MMU_IRQ_MASK		 (RK_MMU_IRQ_PAGE_FAULT | RK_MMU_IRQ_BUS_ERROR)
 
-#define NUM_DT_ENTRIES 1024
-#define NUM_PT_ENTRIES 1024
+#define NUM_TLB_ENTRIES 1024 /* for both DT and PT */
 
 #define SPAGE_ORDER 12
 #define SPAGE_SIZE (1 << SPAGE_ORDER)
@@ -82,7 +79,9 @@
 
 struct rk_iommu_domain {
 	struct list_head iommus;
+	struct device *dev;
 	u32 *dt; /* page directory table */
+	dma_addr_t dt_dma;
 	spinlock_t iommus_lock; /* lock for iommus list */
 	spinlock_t dt_lock; /* lock for modifying page directory table */
 
@@ -98,14 +97,12 @@ struct rk_iommu {
 	struct iommu_domain *domain; /* domain to which iommu is attached */
 };
 
-static inline void rk_table_flush(u32 *va, unsigned int count)
+static inline void rk_table_flush(struct device *dev, dma_addr_t dma,
+				  unsigned int count)
 {
-	phys_addr_t pa_start = virt_to_phys(va);
-	phys_addr_t pa_end = virt_to_phys(va + count);
-	size_t size = pa_end - pa_start;
+	size_t size = count * 4;
 
-	__cpuc_flush_dcache_area(va, size);
-	outer_flush_range(pa_start, pa_end);
+	dma_sync_single_range_for_device(dev, dma, 0, size, DMA_TO_DEVICE);
 }
 
 static struct rk_iommu_domain *to_rk_domain(struct iommu_domain *dom)
@@ -188,10 +185,9 @@ static inline bool rk_dte_is_pt_valid(u32 dte)
 	return dte & RK_DTE_PT_VALID;
 }
 
-static u32 rk_mk_dte(u32 *pt)
+static inline u32 rk_mk_dte(dma_addr_t pt_dma)
 {
-	phys_addr_t pt_phys = virt_to_phys(pt);
-	return (pt_phys & RK_DTE_PT_ADDRESS_MASK) | RK_DTE_PT_VALID;
+	return (pt_dma & RK_DTE_PT_ADDRESS_MASK) | RK_DTE_PT_VALID;
 }
 
 /*
@@ -609,12 +605,14 @@ static u32 *rk_dte_get_page_table(struct rk_iommu_domain *rk_domain,
 				  dma_addr_t iova)
 {
 	u32 *page_table, *dte_addr;
+	u32 dte_index = rk_iova_dte_index(iova);
 	u32 dte;
 	phys_addr_t pt_phys;
+	dma_addr_t pt_dma;
 
 	assert_spin_locked(&rk_domain->dt_lock);
 
-	dte_addr = &rk_domain->dt[rk_iova_dte_index(iova)];
+	dte_addr = &rk_domain->dt[dte_index];
 	dte = *dte_addr;
 	if (rk_dte_is_pt_valid(dte))
 		goto done;
@@ -623,19 +621,27 @@ static u32 *rk_dte_get_page_table(struct rk_iommu_domain *rk_domain,
 	if (!page_table)
 		return ERR_PTR(-ENOMEM);
 
-	dte = rk_mk_dte(page_table);
-	*dte_addr = dte;
+	pt_dma = dma_map_single(rk_domain->dev, page_table,
+				SPAGE_SIZE, DMA_TO_DEVICE);
+	if (dma_mapping_error(rk_domain->dev, pt_dma)) {
+		dev_err(rk_domain->dev, "dma mapping error\n");
+		free_page((unsigned long)page_table);
+		return ERR_PTR(-ENOMEM);
+	}
 
-	rk_table_flush(page_table, NUM_PT_ENTRIES);
-	rk_table_flush(dte_addr, 1);
+	dte = rk_mk_dte(pt_dma);
+	*dte_addr = dte;
 
+	rk_table_flush(rk_domain->dev, pt_dma, NUM_TLB_ENTRIES);
+	rk_table_flush(rk_domain->dev, rk_domain->dt_dma + dte_index * 4, 1);
 done:
 	pt_phys = rk_dte_pt_address(dte);
 	return (u32 *)phys_to_virt(pt_phys);
 }
 
 static size_t rk_iommu_unmap_iova(struct rk_iommu_domain *rk_domain,
-				  u32 *pte_addr, dma_addr_t iova, size_t size)
+				  u32 *pte_addr, dma_addr_t pte_dma,
+				  size_t size)
 {
 	unsigned int pte_count;
 	unsigned int pte_total = size / SPAGE_SIZE;
@@ -650,14 +656,14 @@ static size_t rk_iommu_unmap_iova(struct rk_iommu_domain *rk_domain,
 		pte_addr[pte_count] = rk_mk_pte_invalid(pte);
 	}
 
-	rk_table_flush(pte_addr, pte_count);
+	rk_table_flush(rk_domain->dev, pte_dma, pte_count);
 
 	return pte_count * SPAGE_SIZE;
 }
 
 static int rk_iommu_map_iova(struct rk_iommu_domain *rk_domain, u32 *pte_addr,
-			     dma_addr_t iova, phys_addr_t paddr, size_t size,
-			     int prot)
+			     dma_addr_t pte_dma, dma_addr_t iova,
+			     phys_addr_t paddr, size_t size, int prot)
 {
 	unsigned int pte_count;
 	unsigned int pte_total = size / SPAGE_SIZE;
@@ -676,7 +682,7 @@ static int rk_iommu_map_iova(struct rk_iommu_domain *rk_domain, u32 *pte_addr,
 		paddr += SPAGE_SIZE;
 	}
 
-	rk_table_flush(pte_addr, pte_count);
+	rk_table_flush(rk_domain->dev, pte_dma, pte_total);
 
 	/*
 	 * Zap the first and last iova to evict from iotlb any previously
@@ -689,7 +695,8 @@ static int rk_iommu_map_iova(struct rk_iommu_domain *rk_domain, u32 *pte_addr,
 	return 0;
 unwind:
 	/* Unmap the range of iovas that we just mapped */
-	rk_iommu_unmap_iova(rk_domain, pte_addr, iova, pte_count * SPAGE_SIZE);
+	rk_iommu_unmap_iova(rk_domain, pte_addr, pte_dma,
+			    pte_count * SPAGE_SIZE);
 
 	iova += pte_count * SPAGE_SIZE;
 	page_phys = rk_pte_page_address(pte_addr[pte_count]);
@@ -704,8 +711,9 @@ static int rk_iommu_map(struct iommu_domain *domain, unsigned long _iova,
 {
 	struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
 	unsigned long flags;
-	dma_addr_t iova = (dma_addr_t)_iova;
+	dma_addr_t pte_dma, iova = (dma_addr_t)_iova;
 	u32 *page_table, *pte_addr;
+	u32 dte_index, pte_index;
 	int ret;
 
 	spin_lock_irqsave(&rk_domain->dt_lock, flags);
@@ -723,8 +731,13 @@ static int rk_iommu_map(struct iommu_domain *domain, unsigned long _iova,
 		return PTR_ERR(page_table);
 	}
 
-	pte_addr = &page_table[rk_iova_pte_index(iova)];
-	ret = rk_iommu_map_iova(rk_domain, pte_addr, iova, paddr, size, prot);
+	dte_index = rk_domain->dt[rk_iova_dte_index(iova)];
+	pte_index = rk_iova_pte_index(iova);
+	pte_addr = &page_table[pte_index];
+	pte_dma = rk_dte_pt_address(dte_index) + pte_index * 4;
+	ret = rk_iommu_map_iova(rk_domain, pte_addr, pte_dma, iova,
+				paddr, size, prot);
+
 	spin_unlock_irqrestore(&rk_domain->dt_lock, flags);
 
 	return ret;
@@ -735,7 +748,7 @@ static size_t rk_iommu_unmap(struct iommu_domain *domain, unsigned long _iova,
 {
 	struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
 	unsigned long flags;
-	dma_addr_t iova = (dma_addr_t)_iova;
+	dma_addr_t pte_dma, iova = (dma_addr_t)_iova;
 	phys_addr_t pt_phys;
 	u32 dte;
 	u32 *pte_addr;
@@ -759,7 +772,8 @@ static size_t rk_iommu_unmap(struct iommu_domain *domain, unsigned long _iova,
 
 	pt_phys = rk_dte_pt_address(dte);
 	pte_addr = (u32 *)phys_to_virt(pt_phys) + rk_iova_pte_index(iova);
-	unmap_size = rk_iommu_unmap_iova(rk_domain, pte_addr, iova, size);
+	pte_dma = pt_phys + rk_iova_pte_index(iova) * 4;
+	unmap_size = rk_iommu_unmap_iova(rk_domain, pte_addr, pte_dma, size);
 
 	spin_unlock_irqrestore(&rk_domain->dt_lock, flags);
 
@@ -776,8 +790,6 @@ static struct rk_iommu *rk_iommu_from_dev(struct device *dev)
 	struct rk_iommu *rk_iommu;
 
 	group = iommu_group_get(dev);
-	if (!group)
-		return NULL;
 	iommu_dev = iommu_group_get_iommudata(group);
 	rk_iommu = dev_get_drvdata(iommu_dev);
 	iommu_group_put(group);
@@ -792,9 +804,21 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
 	struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
 	unsigned long flags;
 	int ret, i;
-	phys_addr_t dte_addr;
 
 	iommu = rk_iommu_from_dev(dev);
+	/* Set the domain dev to virtual one if exist */
+	if (RK_IOMMU_IS_VIRTUAL(iommu) || !rk_domain->dev)
+		rk_domain->dev = iommu->dev;
+
+	if (!rk_domain->dt_dma) {
+		rk_domain->dt_dma = dma_map_single(rk_domain->dev, rk_domain->dt,
+						   SPAGE_SIZE, DMA_TO_DEVICE);
+		if (dma_mapping_error(rk_domain->dev, rk_domain->dt_dma))
+			return -ENOMEM;
+
+		rk_table_flush(rk_domain->dev, rk_domain->dt_dma,
+			       NUM_TLB_ENTRIES);
+	}
 
 	iommu->domain = domain;
 
 	if (RK_IOMMU_IS_VIRTUAL(iommu)) {
@@ -804,28 +828,27 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
 
 	ret = rk_iommu_enable_stall(iommu);
 	if (ret)
-		return ret;
+		goto unmap;
 
 	ret = rk_iommu_force_reset(iommu);
 	if (ret)
-		return ret;
+		goto unmap;
 
 	ret = devm_request_irq(iommu->dev, iommu->irq, rk_iommu_irq,
 			       IRQF_SHARED, dev_name(dev), iommu);
 	if (ret)
-		return ret;
+		goto unmap;
 
-	dte_addr = virt_to_phys(rk_domain->dt);
 	for (i = 0; i < iommu->num_mmu; i++) {
-		rk_iommu_write(iommu->bases[i], RK_MMU_DTE_ADDR, dte_addr);
+		rk_iommu_write(iommu->bases[i], RK_MMU_DTE_ADDR, rk_domain->dt_dma);
 		rk_iommu_command(iommu->bases[i], RK_MMU_CMD_ZAP_CACHE);
 		rk_iommu_write(iommu->bases[i], RK_MMU_INT_MASK, RK_MMU_IRQ_MASK);
 	}
 
 	ret = rk_iommu_enable_paging(iommu);
 	if (ret)
-		return ret;
+		goto unmap;
 
 	spin_lock_irqsave(&rk_domain->iommus_lock, flags);
 	list_add_tail(&iommu->node, &rk_domain->iommus);
@@ -836,6 +859,10 @@ static int rk_iommu_attach_device(struct iommu_domain *domain,
 	rk_iommu_disable_stall(iommu);
 
 	return 0;
+unmap:
+	dma_unmap_single(rk_domain->dev, rk_domain->dt_dma, SPAGE_SIZE,
+			 DMA_TO_DEVICE);
+	return ret;
 }
 
 static void rk_iommu_detach_device(struct iommu_domain *domain,
@@ -846,8 +873,10 @@ static void rk_iommu_detach_device(struct iommu_domain *domain,
 	unsigned long flags;
 	int i;
 
-	iommu = rk_iommu_from_dev(dev);
+	rk_domain->dev = NULL;
 
+	/* Allow 'virtual devices' (eg drm) to detach from domain */
+	iommu = rk_iommu_from_dev(dev);
 	iommu->domain = NULL;
 
 	if (RK_IOMMU_IS_VIRTUAL(iommu)) {
 		dev_dbg(dev, "Master with virtual iommu detached from domain\n");
@@ -883,6 +912,8 @@ static struct iommu_domain *rk_iommu_domain_alloc(unsigned type)
 	if (!rk_domain)
 		return NULL;
 
+	rk_domain->dev = NULL;
+	rk_domain->dt_dma = 0;
 	/*
 	 * rk32xx iommus use a 2 level pagetable.
 	 * Each level1 (dt) and level2 (pt) table has 1024 4-byte entries.
@@ -892,8 +923,6 @@ static struct iommu_domain *rk_iommu_domain_alloc(unsigned type)
 	if (!rk_domain->dt)
 		goto err_dt;
 
-	rk_table_flush(rk_domain->dt, NUM_DT_ENTRIES);
-
 	spin_lock_init(&rk_domain->iommus_lock);
 	spin_lock_init(&rk_domain->dt_lock);
 	INIT_LIST_HEAD(&rk_domain->iommus);
@@ -912,7 +941,7 @@ static void rk_iommu_domain_free(struct iommu_domain *domain)
 
 	WARN_ON(!list_empty(&rk_domain->iommus));
 
-	for (i = 0; i < NUM_DT_ENTRIES; i++) {
+	for (i = 0; i < NUM_TLB_ENTRIES; i++) {
 		u32 dte = rk_domain->dt[i];
 		if (rk_dte_is_pt_valid(dte)) {
 			phys_addr_t pt_phys = rk_dte_pt_address(dte);
Use the DMA API instead of architecture-internal functions like
__cpuc_flush_dcache_area() etc.

To support virtual devices like DRM, the previous patch added a virtual
slave iommu; by attaching to it, DRM can use its own domain->dev for
dma_map_*() and dma_sync_*() even when VOP is disabled.

With this patch, this driver is available for ARM64 SoCs like RK3399.

Signed-off-by: Shunqian Zheng <zhengsq@rock-chips.com>
---
 drivers/iommu/rockchip-iommu.c | 113 ++++++++++++++++++++++++++---------------
 1 file changed, 71 insertions(+), 42 deletions(-)