diff mbox

[v2,6/7] iommu/rockchip: use DMA API to map, to flush cache

Message ID 1465392392-2003-7-git-send-email-zhengsq@rock-chips.com (mailing list archive)
State New, archived
Headers show

Commit Message

Shunqian Zheng June 8, 2016, 1:26 p.m. UTC
Use DMA API instead of architecture internal functions like
__cpuc_flush_dcache_area() etc.

To support the virtual device like DRM the virtual slave iommu
added in the previous patch, attaching to which the DRM can use
it own domain->dev for dma_map_*(), dma_sync_*() even VOP is disabled.

With this patch, this driver is available for ARM64 like RK3399.

Signed-off-by: Shunqian Zheng <zhengsq@rock-chips.com>
---
 drivers/iommu/rockchip-iommu.c | 113 ++++++++++++++++++++++++++---------------
 1 file changed, 71 insertions(+), 42 deletions(-)

Comments

Tomasz Figa June 10, 2016, 9:10 a.m. UTC | #1
Hi,

On Wed, Jun 8, 2016 at 10:26 PM, Shunqian Zheng <zhengsq@rock-chips.com> wrote:
> Use DMA API instead of architecture internal functions like
> __cpuc_flush_dcache_area() etc.
>
> To support the virtual device like DRM the virtual slave iommu
> added in the previous patch, attaching to which the DRM can use
> it own domain->dev for dma_map_*(), dma_sync_*() even VOP is disabled.
>
> With this patch, this driver is available for ARM64 like RK3399.
>

Could we instead simply allocate coherent memory for page tables using
dma_alloc_coherent() and skip any flushing on CPU side completely? If
I'm looking correctly, the driver only reads back the page directory
when checking if there is a need to allocate new page table, so there
shouldn't be any significant penalty for disabling the cache.

Other than that, please see some comments inline.

> Signed-off-by: Shunqian Zheng <zhengsq@rock-chips.com>
> ---
>  drivers/iommu/rockchip-iommu.c | 113 ++++++++++++++++++++++++++---------------
>  1 file changed, 71 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
> index d6c3051..aafea6e 100644
> --- a/drivers/iommu/rockchip-iommu.c
> +++ b/drivers/iommu/rockchip-iommu.c
> @@ -4,8 +4,6 @@
>   * published by the Free Software Foundation.
>   */
>
> -#include <asm/cacheflush.h>
> -#include <asm/pgtable.h>
>  #include <linux/compiler.h>
>  #include <linux/delay.h>
>  #include <linux/device.h>
> @@ -61,8 +59,7 @@
>  #define RK_MMU_IRQ_BUS_ERROR     0x02  /* bus read error */
>  #define RK_MMU_IRQ_MASK          (RK_MMU_IRQ_PAGE_FAULT | RK_MMU_IRQ_BUS_ERROR)
>
> -#define NUM_DT_ENTRIES 1024
> -#define NUM_PT_ENTRIES 1024
> +#define NUM_TLB_ENTRIES 1024 /* for both DT and PT */

Is it necessary to change this in this patch? In general, it's not a
good idea to mix multiple logical changes together.

>
>  #define SPAGE_ORDER 12
>  #define SPAGE_SIZE (1 << SPAGE_ORDER)
> @@ -82,7 +79,9 @@
>
>  struct rk_iommu_domain {
>         struct list_head iommus;
> +       struct device *dev;
>         u32 *dt; /* page directory table */
> +       dma_addr_t dt_dma;
>         spinlock_t iommus_lock; /* lock for iommus list */
>         spinlock_t dt_lock; /* lock for modifying page directory table */
>
> @@ -98,14 +97,12 @@ struct rk_iommu {
>         struct iommu_domain *domain; /* domain to which iommu is attached */
>  };
>
> -static inline void rk_table_flush(u32 *va, unsigned int count)
> +static inline void rk_table_flush(struct device *dev, dma_addr_t dma,
> +                                 unsigned int count)
>  {
> -       phys_addr_t pa_start = virt_to_phys(va);
> -       phys_addr_t pa_end = virt_to_phys(va + count);
> -       size_t size = pa_end - pa_start;
> +       size_t size = count * 4;

It would be a good idea to specify what "count" is. I'm a bit confused
that before it meant bytes and now some multiple of 4?

Best regards,
Tomasz
Shunqian Zheng June 13, 2016, 9:56 a.m. UTC | #2
Hi

On 2016年06月10日 17:10, Tomasz Figa wrote:
> Hi,
>
> On Wed, Jun 8, 2016 at 10:26 PM, Shunqian Zheng <zhengsq@rock-chips.com> wrote:
>> Use DMA API instead of architecture internal functions like
>> __cpuc_flush_dcache_area() etc.
>>
>> To support the virtual device like DRM the virtual slave iommu
>> added in the previous patch, attaching to which the DRM can use
>> it own domain->dev for dma_map_*(), dma_sync_*() even VOP is disabled.
>>
>> With this patch, this driver is available for ARM64 like RK3399.
>>
> Could we instead simply allocate coherent memory for page tables using
> dma_alloc_coherent() and skip any flushing on CPU side completely? If
> I'm looking correctly, the driver only reads back the page directory
> when checking if there is a need to allocate new page table, so there
> shouldn't be any significant penalty for disabling the cache.
I try to use dma_alloc_coherent() to replace the dma_map_single(),
but it doesn't work for me properly.
Because the DRM uses the iommu_dma_ops instead the swiotlb_dma_ops after 
attaching
to iommu, so when the iommu domain need to alloc a new page in 
rk_iommu_map(),
it would call:
rk_iommu_map()  --> dma_alloc_coherent()  --> ops->alloc()  --> 
iommu_map() --> rk_iommu_map()

Then I try to reserve memory for coherent so that, dma_alloc_coherent()  
calls dma_alloc_from_coherent()
but not ops->alloc(). But it doesn't work too because when DRM request 
buffer it never uses iommu.
>
> Other than that, please see some comments inline.
>
>> Signed-off-by: Shunqian Zheng <zhengsq@rock-chips.com>
>> ---
>>   drivers/iommu/rockchip-iommu.c | 113 ++++++++++++++++++++++++++---------------
>>   1 file changed, 71 insertions(+), 42 deletions(-)
>>
>> diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
>> index d6c3051..aafea6e 100644
>> --- a/drivers/iommu/rockchip-iommu.c
>> +++ b/drivers/iommu/rockchip-iommu.c
>> @@ -4,8 +4,6 @@
>>    * published by the Free Software Foundation.
>>    */
>>
>> -#include <asm/cacheflush.h>
>> -#include <asm/pgtable.h>
>>   #include <linux/compiler.h>
>>   #include <linux/delay.h>
>>   #include <linux/device.h>
>> @@ -61,8 +59,7 @@
>>   #define RK_MMU_IRQ_BUS_ERROR     0x02  /* bus read error */
>>   #define RK_MMU_IRQ_MASK          (RK_MMU_IRQ_PAGE_FAULT | RK_MMU_IRQ_BUS_ERROR)
>>
>> -#define NUM_DT_ENTRIES 1024
>> -#define NUM_PT_ENTRIES 1024
>> +#define NUM_TLB_ENTRIES 1024 /* for both DT and PT */
> Is it necessary to change this in this patch? In general, it's not a
> good idea to mix multiple logical changes together.
Sure, will restore changes in v3.
>
>>   #define SPAGE_ORDER 12
>>   #define SPAGE_SIZE (1 << SPAGE_ORDER)
>> @@ -82,7 +79,9 @@
>>
>>   struct rk_iommu_domain {
>>          struct list_head iommus;
>> +       struct device *dev;
>>          u32 *dt; /* page directory table */
>> +       dma_addr_t dt_dma;
>>          spinlock_t iommus_lock; /* lock for iommus list */
>>          spinlock_t dt_lock; /* lock for modifying page directory table */
>>
>> @@ -98,14 +97,12 @@ struct rk_iommu {
>>          struct iommu_domain *domain; /* domain to which iommu is attached */
>>   };
>>
>> -static inline void rk_table_flush(u32 *va, unsigned int count)
>> +static inline void rk_table_flush(struct device *dev, dma_addr_t dma,
>> +                                 unsigned int count)
>>   {
>> -       phys_addr_t pa_start = virt_to_phys(va);
>> -       phys_addr_t pa_end = virt_to_phys(va + count);
>> -       size_t size = pa_end - pa_start;
>> +       size_t size = count * 4;
> It would be a good idea to specify what "count" is. I'm a bit confused
> that before it meant bytes and now some multiple of 4?
"count" means PT/DT entry count to flush here. I would add some more 
comment on it.

Thank you very much,
Shunqian
>
> Best regards,
> Tomasz
>
> _______________________________________________
> Linux-rockchip mailing list
> Linux-rockchip@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-rockchip
Tomasz Figa June 13, 2016, 10:21 a.m. UTC | #3
On Mon, Jun 13, 2016 at 6:56 PM, Shunqian Zheng
<shunqian.zheng@gmail.com> wrote:
> Hi
>
> On 2016年06月10日 17:10, Tomasz Figa wrote:
>>
>> Hi,
>>
>> On Wed, Jun 8, 2016 at 10:26 PM, Shunqian Zheng <zhengsq@rock-chips.com>
>> wrote:
>>>
>>> Use DMA API instead of architecture internal functions like
>>> __cpuc_flush_dcache_area() etc.
>>>
>>> To support the virtual device like DRM the virtual slave iommu
>>> added in the previous patch, attaching to which the DRM can use
>>> it own domain->dev for dma_map_*(), dma_sync_*() even VOP is disabled.
>>>
>>> With this patch, this driver is available for ARM64 like RK3399.
>>>
>> Could we instead simply allocate coherent memory for page tables using
>> dma_alloc_coherent() and skip any flushing on CPU side completely? If
>> I'm looking correctly, the driver only reads back the page directory
>> when checking if there is a need to allocate new page table, so there
>> shouldn't be any significant penalty for disabling the cache.
>
> I try to use dma_alloc_coherent() to replace the dma_map_single(),
> but it doesn't work for me properly.
> Because the DRM uses the iommu_dma_ops instead the swiotlb_dma_ops after
> attaching
> to iommu, so when the iommu domain need to alloc a new page in
> rk_iommu_map(),
> it would call:
> rk_iommu_map()  --> dma_alloc_coherent()  --> ops->alloc()  --> iommu_map()
> --> rk_iommu_map()

It shouldn't call iommu_map(), because the IOMMU is not behind another
IOMMU. Are you sure you called dma_alloc_coherent() on behalf of the
IOMMU struct device and not the DRM device?

Best regards,
Tomasz
Shunqian Zheng June 13, 2016, 10:31 a.m. UTC | #4
HI,

On 2016年06月13日 18:21, Tomasz Figa wrote:
> On Mon, Jun 13, 2016 at 6:56 PM, Shunqian Zheng
> <shunqian.zheng@gmail.com> wrote:
>> Hi
>>
>> On 2016年06月10日 17:10, Tomasz Figa wrote:
>>> Hi,
>>>
>>> On Wed, Jun 8, 2016 at 10:26 PM, Shunqian Zheng <zhengsq@rock-chips.com>
>>> wrote:
>>>> Use DMA API instead of architecture internal functions like
>>>> __cpuc_flush_dcache_area() etc.
>>>>
>>>> To support the virtual device like DRM the virtual slave iommu
>>>> added in the previous patch, attaching to which the DRM can use
>>>> it own domain->dev for dma_map_*(), dma_sync_*() even VOP is disabled.
>>>>
>>>> With this patch, this driver is available for ARM64 like RK3399.
>>>>
>>> Could we instead simply allocate coherent memory for page tables using
>>> dma_alloc_coherent() and skip any flushing on CPU side completely? If
>>> I'm looking correctly, the driver only reads back the page directory
>>> when checking if there is a need to allocate new page table, so there
>>> shouldn't be any significant penalty for disabling the cache.
>> I try to use dma_alloc_coherent() to replace the dma_map_single(),
>> but it doesn't work for me properly.
>> Because the DRM uses the iommu_dma_ops instead the swiotlb_dma_ops after
>> attaching
>> to iommu, so when the iommu domain need to alloc a new page in
>> rk_iommu_map(),
>> it would call:
>> rk_iommu_map()  --> dma_alloc_coherent()  --> ops->alloc()  --> iommu_map()
>> --> rk_iommu_map()
> It shouldn't call iommu_map(), because the IOMMU is not behind another
> IOMMU. Are you sure you called dma_alloc_coherent() on behalf of the
> IOMMU struct device and not the DRM device?
I called dma_alloc_coherent() with DRM device but not the IOMMU device,
because DRM didn't attach to any iommu. Even allocating an virtual one 
when attaching, the iommu->dev
is DRM device though.
Am I right here?

Thank you very much,
Shunqian
>
> Best regards,
> Tomasz
Robin Murphy June 13, 2016, 10:39 a.m. UTC | #5
On 13/06/16 10:56, Shunqian Zheng wrote:
> Hi
>
> On 2016年06月10日 17:10, Tomasz Figa wrote:
>> Hi,
>>
>> On Wed, Jun 8, 2016 at 10:26 PM, Shunqian Zheng
>> <zhengsq@rock-chips.com> wrote:
>>> Use DMA API instead of architecture internal functions like
>>> __cpuc_flush_dcache_area() etc.
>>>
>>> To support the virtual device like DRM the virtual slave iommu
>>> added in the previous patch, attaching to which the DRM can use
>>> it own domain->dev for dma_map_*(), dma_sync_*() even VOP is disabled.
>>>
>>> With this patch, this driver is available for ARM64 like RK3399.
>>>
>> Could we instead simply allocate coherent memory for page tables using
>> dma_alloc_coherent() and skip any flushing on CPU side completely? If
>> I'm looking correctly, the driver only reads back the page directory
>> when checking if there is a need to allocate new page table, so there
>> shouldn't be any significant penalty for disabling the cache.
> I try to use dma_alloc_coherent() to replace the dma_map_single(),
> but it doesn't work for me properly.
> Because the DRM uses the iommu_dma_ops instead the swiotlb_dma_ops after
> attaching
> to iommu, so when the iommu domain need to alloc a new page in
> rk_iommu_map(),
> it would call:
> rk_iommu_map()  --> dma_alloc_coherent()  --> ops->alloc()  -->
> iommu_map() --> rk_iommu_map()

That sounds more like you're passing the wrong device around somewhere, 
since this approach is working fine for other IOMMUs; specifically, the 
flow goes:

dma_alloc_coherent(DRM dev)   // for buffer
--> ops->alloc(DRM dev)
     --> iommu_dma_alloc(DRM dev)
         --> iommu_map()
             --> dma_alloc_coherent(IOMMU dev)  // for pagetable
                 --> ops->alloc(IOMMU dev)
                     --> swiotlb_alloc(IOMMU dev)

There shouldn't be any need for this "virtual IOMMU" at all. I think the 
Exynos DRM driver is in a similar situation of having multiple devices 
(involving multiple IOMMUs) backing the virtual DRM device, and that 
doesn't seem to be doing anything this crazy so it's probably worth 
taking a look at.

Robin.

> Then I try to reserve memory for coherent so that, dma_alloc_coherent()
> calls dma_alloc_from_coherent()
> but not ops->alloc(). But it doesn't work too because when DRM request
> buffer it never uses iommu.
>>
>> Other than that, please see some comments inline.
>>
>>> Signed-off-by: Shunqian Zheng <zhengsq@rock-chips.com>
>>> ---
>>>   drivers/iommu/rockchip-iommu.c | 113
>>> ++++++++++++++++++++++++++---------------
>>>   1 file changed, 71 insertions(+), 42 deletions(-)
>>>
>>> diff --git a/drivers/iommu/rockchip-iommu.c
>>> b/drivers/iommu/rockchip-iommu.c
>>> index d6c3051..aafea6e 100644
>>> --- a/drivers/iommu/rockchip-iommu.c
>>> +++ b/drivers/iommu/rockchip-iommu.c
>>> @@ -4,8 +4,6 @@
>>>    * published by the Free Software Foundation.
>>>    */
>>>
>>> -#include <asm/cacheflush.h>
>>> -#include <asm/pgtable.h>
>>>   #include <linux/compiler.h>
>>>   #include <linux/delay.h>
>>>   #include <linux/device.h>
>>> @@ -61,8 +59,7 @@
>>>   #define RK_MMU_IRQ_BUS_ERROR     0x02  /* bus read error */
>>>   #define RK_MMU_IRQ_MASK          (RK_MMU_IRQ_PAGE_FAULT |
>>> RK_MMU_IRQ_BUS_ERROR)
>>>
>>> -#define NUM_DT_ENTRIES 1024
>>> -#define NUM_PT_ENTRIES 1024
>>> +#define NUM_TLB_ENTRIES 1024 /* for both DT and PT */
>> Is it necessary to change this in this patch? In general, it's not a
>> good idea to mix multiple logical changes together.
> Sure, will restore changes in v3.
>>
>>>   #define SPAGE_ORDER 12
>>>   #define SPAGE_SIZE (1 << SPAGE_ORDER)
>>> @@ -82,7 +79,9 @@
>>>
>>>   struct rk_iommu_domain {
>>>          struct list_head iommus;
>>> +       struct device *dev;
>>>          u32 *dt; /* page directory table */
>>> +       dma_addr_t dt_dma;
>>>          spinlock_t iommus_lock; /* lock for iommus list */
>>>          spinlock_t dt_lock; /* lock for modifying page directory
>>> table */
>>>
>>> @@ -98,14 +97,12 @@ struct rk_iommu {
>>>          struct iommu_domain *domain; /* domain to which iommu is
>>> attached */
>>>   };
>>>
>>> -static inline void rk_table_flush(u32 *va, unsigned int count)
>>> +static inline void rk_table_flush(struct device *dev, dma_addr_t dma,
>>> +                                 unsigned int count)
>>>   {
>>> -       phys_addr_t pa_start = virt_to_phys(va);
>>> -       phys_addr_t pa_end = virt_to_phys(va + count);
>>> -       size_t size = pa_end - pa_start;
>>> +       size_t size = count * 4;
>> It would be a good idea to specify what "count" is. I'm a bit confused
>> that before it meant bytes and now some multiple of 4?
> "count" means PT/DT entry count to flush here. I would add some more
> comment on it.
>
> Thank you very much,
> Shunqian
>>
>> Best regards,
>> Tomasz
>>
>> _______________________________________________
>> Linux-rockchip mailing list
>> Linux-rockchip@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/linux-rockchip
>
> _______________________________________________
> iommu mailing list
> iommu@lists.linux-foundation.org
> https://lists.linuxfoundation.org/mailman/listinfo/iommu
Tomasz Figa June 13, 2016, 10:41 a.m. UTC | #6
On Mon, Jun 13, 2016 at 7:31 PM, Shunqian Zheng
<shunqian.zheng@gmail.com> wrote:
> HI,
>
>
> On 2016年06月13日 18:21, Tomasz Figa wrote:
>>
>> On Mon, Jun 13, 2016 at 6:56 PM, Shunqian Zheng
>> <shunqian.zheng@gmail.com> wrote:
>>>
>>> Hi
>>>
>>> On 2016年06月10日 17:10, Tomasz Figa wrote:
>>>>
>>>> Hi,
>>>>
>>>> On Wed, Jun 8, 2016 at 10:26 PM, Shunqian Zheng <zhengsq@rock-chips.com>
>>>> wrote:
>>>>>
>>>>> Use DMA API instead of architecture internal functions like
>>>>> __cpuc_flush_dcache_area() etc.
>>>>>
>>>>> To support the virtual device like DRM the virtual slave iommu
>>>>> added in the previous patch, attaching to which the DRM can use
>>>>> it own domain->dev for dma_map_*(), dma_sync_*() even VOP is disabled.
>>>>>
>>>>> With this patch, this driver is available for ARM64 like RK3399.
>>>>>
>>>> Could we instead simply allocate coherent memory for page tables using
>>>> dma_alloc_coherent() and skip any flushing on CPU side completely? If
>>>> I'm looking correctly, the driver only reads back the page directory
>>>> when checking if there is a need to allocate new page table, so there
>>>> shouldn't be any significant penalty for disabling the cache.
>>>
>>> I try to use dma_alloc_coherent() to replace the dma_map_single(),
>>> but it doesn't work for me properly.
>>> Because the DRM uses the iommu_dma_ops instead the swiotlb_dma_ops after
>>> attaching
>>> to iommu, so when the iommu domain need to alloc a new page in
>>> rk_iommu_map(),
>>> it would call:
>>> rk_iommu_map()  --> dma_alloc_coherent()  --> ops->alloc()  -->
>>> iommu_map()
>>> --> rk_iommu_map()
>>
>> It shouldn't call iommu_map(), because the IOMMU is not behind another
>> IOMMU. Are you sure you called dma_alloc_coherent() on behalf of the
>> IOMMU struct device and not the DRM device?
>
> I called dma_alloc_coherent() with DRM device but not the IOMMU device,
> because DRM didn't attach to any iommu. Even allocating an virtual one when
> attaching, the iommu->dev
> is DRM device though.
> Am I right here?

What I meant, is that even though rk_iommu_map() is called for DRM
device, the page table allocation happening inside should be called
for the IOMMU device itself, because it's the consumer of these page
tables.

Best regards,
Tomasz
diff mbox

Patch

diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c
index d6c3051..aafea6e 100644
--- a/drivers/iommu/rockchip-iommu.c
+++ b/drivers/iommu/rockchip-iommu.c
@@ -4,8 +4,6 @@ 
  * published by the Free Software Foundation.
  */
 
-#include <asm/cacheflush.h>
-#include <asm/pgtable.h>
 #include <linux/compiler.h>
 #include <linux/delay.h>
 #include <linux/device.h>
@@ -61,8 +59,7 @@ 
 #define RK_MMU_IRQ_BUS_ERROR     0x02  /* bus read error */
 #define RK_MMU_IRQ_MASK          (RK_MMU_IRQ_PAGE_FAULT | RK_MMU_IRQ_BUS_ERROR)
 
-#define NUM_DT_ENTRIES 1024
-#define NUM_PT_ENTRIES 1024
+#define NUM_TLB_ENTRIES 1024 /* for both DT and PT */
 
 #define SPAGE_ORDER 12
 #define SPAGE_SIZE (1 << SPAGE_ORDER)
@@ -82,7 +79,9 @@ 
 
 struct rk_iommu_domain {
 	struct list_head iommus;
+	struct device *dev;
 	u32 *dt; /* page directory table */
+	dma_addr_t dt_dma;
 	spinlock_t iommus_lock; /* lock for iommus list */
 	spinlock_t dt_lock; /* lock for modifying page directory table */
 
@@ -98,14 +97,12 @@  struct rk_iommu {
 	struct iommu_domain *domain; /* domain to which iommu is attached */
 };
 
-static inline void rk_table_flush(u32 *va, unsigned int count)
+static inline void rk_table_flush(struct device *dev, dma_addr_t dma,
+				  unsigned int count)
 {
-	phys_addr_t pa_start = virt_to_phys(va);
-	phys_addr_t pa_end = virt_to_phys(va + count);
-	size_t size = pa_end - pa_start;
+	size_t size = count * 4;
 
-	__cpuc_flush_dcache_area(va, size);
-	outer_flush_range(pa_start, pa_end);
+	dma_sync_single_range_for_device(dev, dma, 0, size, DMA_TO_DEVICE);
 }
 
 static struct rk_iommu_domain *to_rk_domain(struct iommu_domain *dom)
@@ -188,10 +185,9 @@  static inline bool rk_dte_is_pt_valid(u32 dte)
 	return dte & RK_DTE_PT_VALID;
 }
 
-static u32 rk_mk_dte(u32 *pt)
+static inline u32 rk_mk_dte(dma_addr_t pt_dma)
 {
-	phys_addr_t pt_phys = virt_to_phys(pt);
-	return (pt_phys & RK_DTE_PT_ADDRESS_MASK) | RK_DTE_PT_VALID;
+	return (pt_dma & RK_DTE_PT_ADDRESS_MASK) | RK_DTE_PT_VALID;
 }
 
 /*
@@ -609,12 +605,14 @@  static u32 *rk_dte_get_page_table(struct rk_iommu_domain *rk_domain,
 				  dma_addr_t iova)
 {
 	u32 *page_table, *dte_addr;
+	u32 dte_index = rk_iova_dte_index(iova);
 	u32 dte;
 	phys_addr_t pt_phys;
+	dma_addr_t pt_dma;
 
 	assert_spin_locked(&rk_domain->dt_lock);
 
-	dte_addr = &rk_domain->dt[rk_iova_dte_index(iova)];
+	dte_addr = &rk_domain->dt[dte_index];
 	dte = *dte_addr;
 	if (rk_dte_is_pt_valid(dte))
 		goto done;
@@ -623,19 +621,27 @@  static u32 *rk_dte_get_page_table(struct rk_iommu_domain *rk_domain,
 	if (!page_table)
 		return ERR_PTR(-ENOMEM);
 
-	dte = rk_mk_dte(page_table);
-	*dte_addr = dte;
+	pt_dma = dma_map_single(rk_domain->dev, page_table,
+				SPAGE_SIZE, DMA_TO_DEVICE);
+	if (dma_mapping_error(rk_domain->dev, pt_dma)) {
+		dev_err(rk_domain->dev, "dma mapping error\n");
+		free_page((unsigned long)page_table);
+		return ERR_PTR(-ENOMEM);
+	}
 
-	rk_table_flush(page_table, NUM_PT_ENTRIES);
-	rk_table_flush(dte_addr, 1);
+	dte = rk_mk_dte(pt_dma);
+	*dte_addr = dte;
 
+	rk_table_flush(rk_domain->dev, pt_dma, NUM_TLB_ENTRIES);
+	rk_table_flush(rk_domain->dev, rk_domain->dt_dma + dte_index * 4, 1);
 done:
 	pt_phys = rk_dte_pt_address(dte);
 	return (u32 *)phys_to_virt(pt_phys);
 }
 
 static size_t rk_iommu_unmap_iova(struct rk_iommu_domain *rk_domain,
-				  u32 *pte_addr, dma_addr_t iova, size_t size)
+				  u32 *pte_addr, dma_addr_t pte_dma,
+				  size_t size)
 {
 	unsigned int pte_count;
 	unsigned int pte_total = size / SPAGE_SIZE;
@@ -650,14 +656,14 @@  static size_t rk_iommu_unmap_iova(struct rk_iommu_domain *rk_domain,
 		pte_addr[pte_count] = rk_mk_pte_invalid(pte);
 	}
 
-	rk_table_flush(pte_addr, pte_count);
+	rk_table_flush(rk_domain->dev, pte_dma, pte_count);
 
 	return pte_count * SPAGE_SIZE;
 }
 
 static int rk_iommu_map_iova(struct rk_iommu_domain *rk_domain, u32 *pte_addr,
-			     dma_addr_t iova, phys_addr_t paddr, size_t size,
-			     int prot)
+			     dma_addr_t pte_dma, dma_addr_t iova,
+			     phys_addr_t paddr, size_t size, int prot)
 {
 	unsigned int pte_count;
 	unsigned int pte_total = size / SPAGE_SIZE;
@@ -676,7 +682,7 @@  static int rk_iommu_map_iova(struct rk_iommu_domain *rk_domain, u32 *pte_addr,
 		paddr += SPAGE_SIZE;
 	}
 
-	rk_table_flush(pte_addr, pte_count);
+	rk_table_flush(rk_domain->dev, pte_dma, pte_total);
 
 	/*
 	 * Zap the first and last iova to evict from iotlb any previously
@@ -689,7 +695,8 @@  static int rk_iommu_map_iova(struct rk_iommu_domain *rk_domain, u32 *pte_addr,
 	return 0;
 unwind:
 	/* Unmap the range of iovas that we just mapped */
-	rk_iommu_unmap_iova(rk_domain, pte_addr, iova, pte_count * SPAGE_SIZE);
+	rk_iommu_unmap_iova(rk_domain, pte_addr, pte_dma,
+			    pte_count * SPAGE_SIZE);
 
 	iova += pte_count * SPAGE_SIZE;
 	page_phys = rk_pte_page_address(pte_addr[pte_count]);
@@ -704,8 +711,9 @@  static int rk_iommu_map(struct iommu_domain *domain, unsigned long _iova,
 {
 	struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
 	unsigned long flags;
-	dma_addr_t iova = (dma_addr_t)_iova;
+	dma_addr_t pte_dma, iova = (dma_addr_t)_iova;
 	u32 *page_table, *pte_addr;
+	u32 dte_index, pte_index;
 	int ret;
 
 	spin_lock_irqsave(&rk_domain->dt_lock, flags);
@@ -723,8 +731,13 @@  static int rk_iommu_map(struct iommu_domain *domain, unsigned long _iova,
 		return PTR_ERR(page_table);
 	}
 
-	pte_addr = &page_table[rk_iova_pte_index(iova)];
-	ret = rk_iommu_map_iova(rk_domain, pte_addr, iova, paddr, size, prot);
+	dte_index = rk_domain->dt[rk_iova_dte_index(iova)];
+	pte_index = rk_iova_pte_index(iova);
+	pte_addr = &page_table[pte_index];
+	pte_dma = rk_dte_pt_address(dte_index) + pte_index * 4;
+	ret = rk_iommu_map_iova(rk_domain, pte_addr, pte_dma, iova,
+				paddr, size, prot);
+
 	spin_unlock_irqrestore(&rk_domain->dt_lock, flags);
 
 	return ret;
@@ -735,7 +748,7 @@  static size_t rk_iommu_unmap(struct iommu_domain *domain, unsigned long _iova,
 {
 	struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
 	unsigned long flags;
-	dma_addr_t iova = (dma_addr_t)_iova;
+	dma_addr_t pte_dma, iova = (dma_addr_t)_iova;
 	phys_addr_t pt_phys;
 	u32 dte;
 	u32 *pte_addr;
@@ -759,7 +772,8 @@  static size_t rk_iommu_unmap(struct iommu_domain *domain, unsigned long _iova,
 
 	pt_phys = rk_dte_pt_address(dte);
 	pte_addr = (u32 *)phys_to_virt(pt_phys) + rk_iova_pte_index(iova);
-	unmap_size = rk_iommu_unmap_iova(rk_domain, pte_addr, iova, size);
+	pte_dma = pt_phys + rk_iova_pte_index(iova) * 4;
+	unmap_size = rk_iommu_unmap_iova(rk_domain, pte_addr, pte_dma, size);
 
 	spin_unlock_irqrestore(&rk_domain->dt_lock, flags);
 
@@ -776,8 +790,6 @@  static struct rk_iommu *rk_iommu_from_dev(struct device *dev)
 	struct rk_iommu *rk_iommu;
 
 	group = iommu_group_get(dev);
-	if (!group)
-		return NULL;
 	iommu_dev = iommu_group_get_iommudata(group);
 	rk_iommu = dev_get_drvdata(iommu_dev);
 	iommu_group_put(group);
@@ -792,9 +804,21 @@  static int rk_iommu_attach_device(struct iommu_domain *domain,
 	struct rk_iommu_domain *rk_domain = to_rk_domain(domain);
 	unsigned long flags;
 	int ret, i;
-	phys_addr_t dte_addr;
 
 	iommu = rk_iommu_from_dev(dev);
+	/* Set the domain dev to virtual one if exist */
+	if (RK_IOMMU_IS_VIRTUAL(iommu) || !rk_domain->dev)
+		rk_domain->dev = iommu->dev;
+
+	if (!rk_domain->dt_dma) {
+		rk_domain->dt_dma = dma_map_single(rk_domain->dev, rk_domain->dt,
+						   SPAGE_SIZE, DMA_TO_DEVICE);
+		if (dma_mapping_error(rk_domain->dev, rk_domain->dt_dma))
+			return -ENOMEM;
+
+		rk_table_flush(rk_domain->dev, rk_domain->dt_dma,
+			       NUM_TLB_ENTRIES);
+	}
 
 	iommu->domain = domain;
 	if (RK_IOMMU_IS_VIRTUAL(iommu)) {
@@ -804,28 +828,27 @@  static int rk_iommu_attach_device(struct iommu_domain *domain,
 
 	ret = rk_iommu_enable_stall(iommu);
 	if (ret)
-		return ret;
+		goto unmap;
 
 	ret = rk_iommu_force_reset(iommu);
 	if (ret)
-		return ret;
+		goto unmap;
 
 
 	ret = devm_request_irq(iommu->dev, iommu->irq, rk_iommu_irq,
 			       IRQF_SHARED, dev_name(dev), iommu);
 	if (ret)
-		return ret;
+		goto unmap;
 
-	dte_addr = virt_to_phys(rk_domain->dt);
 	for (i = 0; i < iommu->num_mmu; i++) {
-		rk_iommu_write(iommu->bases[i], RK_MMU_DTE_ADDR, dte_addr);
+		rk_iommu_write(iommu->bases[i], RK_MMU_DTE_ADDR, rk_domain->dt_dma);
 		rk_iommu_command(iommu->bases[i], RK_MMU_CMD_ZAP_CACHE);
 		rk_iommu_write(iommu->bases[i], RK_MMU_INT_MASK, RK_MMU_IRQ_MASK);
 	}
 
 	ret = rk_iommu_enable_paging(iommu);
 	if (ret)
-		return ret;
+		goto unmap;
 
 	spin_lock_irqsave(&rk_domain->iommus_lock, flags);
 	list_add_tail(&iommu->node, &rk_domain->iommus);
@@ -836,6 +859,10 @@  static int rk_iommu_attach_device(struct iommu_domain *domain,
 	rk_iommu_disable_stall(iommu);
 
 	return 0;
+unmap:
+	dma_unmap_single(rk_domain->dev, rk_domain->dt_dma, SPAGE_SIZE,
+			 DMA_TO_DEVICE);
+	return ret;
 }
 
 static void rk_iommu_detach_device(struct iommu_domain *domain,
@@ -846,8 +873,10 @@  static void rk_iommu_detach_device(struct iommu_domain *domain,
 	unsigned long flags;
 	int i;
 
-	iommu = rk_iommu_from_dev(dev);
+	rk_domain->dev = NULL;
 
+	/* Allow 'virtual devices' (eg drm) to detach from domain */
+	iommu = rk_iommu_from_dev(dev);
 	iommu->domain = NULL;
 	if (RK_IOMMU_IS_VIRTUAL(iommu)) {
 		dev_dbg(dev, "Master with virtual iommu detached from domain\n");
@@ -883,6 +912,8 @@  static struct iommu_domain *rk_iommu_domain_alloc(unsigned type)
 	if (!rk_domain)
 		return NULL;
 
+	rk_domain->dev = NULL;
+	rk_domain->dt_dma = 0;
 	/*
 	 * rk32xx iommus use a 2 level pagetable.
 	 * Each level1 (dt) and level2 (pt) table has 1024 4-byte entries.
@@ -892,8 +923,6 @@  static struct iommu_domain *rk_iommu_domain_alloc(unsigned type)
 	if (!rk_domain->dt)
 		goto err_dt;
 
-	rk_table_flush(rk_domain->dt, NUM_DT_ENTRIES);
-
 	spin_lock_init(&rk_domain->iommus_lock);
 	spin_lock_init(&rk_domain->dt_lock);
 	INIT_LIST_HEAD(&rk_domain->iommus);
@@ -912,7 +941,7 @@  static void rk_iommu_domain_free(struct iommu_domain *domain)
 
 	WARN_ON(!list_empty(&rk_domain->iommus));
 
-	for (i = 0; i < NUM_DT_ENTRIES; i++) {
+	for (i = 0; i < NUM_TLB_ENTRIES; i++) {
 		u32 dte = rk_domain->dt[i];
 		if (rk_dte_is_pt_valid(dte)) {
 			phys_addr_t pt_phys = rk_dte_pt_address(dte);