
[V4,3/6] iommu/arm-smmu: Invoke pm_runtime during probe, add/remove device

Message ID 1499333825-7658-4-git-send-email-vivek.gautam@codeaurora.org (mailing list archive)
State New, archived

Commit Message

Vivek Gautam July 6, 2017, 9:37 a.m. UTC
From: Sricharan R <sricharan@codeaurora.org>

The SMMU device probe/remove and the add/remove master device callbacks
get called when the SMMU is not linked to its master, that is, without
the context of the master device. So call the runtime PM APIs explicitly
in those code paths.

Signed-off-by: Sricharan R <sricharan@codeaurora.org>
[stanimir: added runtime pm in .unmap iommu op]
Signed-off-by: Stanimir Varbanov <stanimir.varbanov@linaro.org>
[vivek: Cleanup pm runtime calls]
Signed-off-by: Vivek Gautam <vivek.gautam@codeaurora.org>
---
 drivers/iommu/arm-smmu.c | 54 ++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 48 insertions(+), 6 deletions(-)
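
For reference, the arm_smmu_unmap() hunk discussed throughout the thread
below reconstructs to roughly the following (context elided; this reflects
the patch as posted, not necessarily what was finally merged):

static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
			     size_t size)
{
	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
	struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
	size_t ret;

	if (!ops)
		return 0;

	/* Wake the SMMU so the TLB maintenance triggered by the unmap can
	 * safely touch its registers. */
	pm_runtime_get_sync(smmu_domain->smmu->dev);
	ret = ops->unmap(ops, iova, size);
	pm_runtime_put_sync(smmu_domain->smmu->dev);

	return ret;
}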

Comments

Stephen Boyd July 12, 2017, 10:54 p.m. UTC | #1
On 07/06, Vivek Gautam wrote:
> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
>  static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>  			     size_t size)
>  {
> -	struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
> +	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
> +	struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
> +	size_t ret;
>  
>  	if (!ops)
>  		return 0;
>  
> -	return ops->unmap(ops, iova, size);
> +	pm_runtime_get_sync(smmu_domain->smmu->dev);

Can these map/unmap ops be called from an atomic context? I seem
to recall that being a problem before.


> +	ret = ops->unmap(ops, iova, size);
> +	pm_runtime_put_sync(smmu_domain->smmu->dev);
> +
> +	return ret;
>  }
>  
>  static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
Vivek Gautam July 13, 2017, 5:13 a.m. UTC | #2
Hi Stephen,


On 07/13/2017 04:24 AM, Stephen Boyd wrote:
> On 07/06, Vivek Gautam wrote:
>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
>>   static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>>   			     size_t size)
>>   {
>> -	struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>> +	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>> +	struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>> +	size_t ret;
>>   
>>   	if (!ops)
>>   		return 0;
>>   
>> -	return ops->unmap(ops, iova, size);
>> +	pm_runtime_get_sync(smmu_domain->smmu->dev);
> Can these map/unmap ops be called from an atomic context? I seem
> to recall that being a problem before.

That's something which was dropped in the following patch merged in master:
523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock

Looks like we don't need locks here anymore?

Best Regards
Vivek

>
>
>> +	ret = ops->unmap(ops, iova, size);
>> +	pm_runtime_put_sync(smmu_domain->smmu->dev);
>> +
>> +	return ret;
>>   }
>>   
>>   static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
Sricharan Ramabadhran July 13, 2017, 5:35 a.m. UTC | #3
Hi Vivek,

On 7/13/2017 10:43 AM, Vivek Gautam wrote:
> Hi Stephen,
> 
> 
> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
>> On 07/06, Vivek Gautam wrote:
>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
>>>   static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>>>                    size_t size)
>>>   {
>>> -    struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>>> +    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>>> +    size_t ret;
>>>         if (!ops)
>>>           return 0;
>>>   -    return ops->unmap(ops, iova, size);
>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
>> Can these map/unmap ops be called from an atomic context? I seem
>> to recall that being a problem before.
> 
> That's something which was dropped in the following patch merged in master:
> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
> 
> Looks like we don't  need locks here anymore?

 Apart from the locking, I wonder why an explicit pm_runtime is needed
 in unmap. It looks like some path in the master using the mapping
 should have enabled the PM already?

Regards,
 Sricharan
Stephen Boyd July 13, 2017, 6:48 a.m. UTC | #4
On 07/13, Vivek Gautam wrote:
> Hi Stephen,
> 
> 
> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
> >On 07/06, Vivek Gautam wrote:
> >>@@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
> >>  static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
> >>  			     size_t size)
> >>  {
> >>-	struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
> >>+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
> >>+	struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
> >>+	size_t ret;
> >>  	if (!ops)
> >>  		return 0;
> >>-	return ops->unmap(ops, iova, size);
> >>+	pm_runtime_get_sync(smmu_domain->smmu->dev);
> >Can these map/unmap ops be called from an atomic context? I seem
> >to recall that being a problem before.
> 
> That's something which was dropped in the following patch merged in master:
> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
> 
> Looks like we don't  need locks here anymore?
> 

While removing the spinlock around the map/unmap path may be one
thing, I'm not sure that's all of them. Is there a path from an
atomic DMA allocation (GFP_ATOMIC sort of thing) mapped into an
IOMMU for a device that can eventually get down to here and
attempt to turn a clk on?
Robin Murphy July 13, 2017, 9:50 a.m. UTC | #5
On 13/07/17 07:48, Stephen Boyd wrote:
> On 07/13, Vivek Gautam wrote:
>> Hi Stephen,
>>
>>
>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
>>> On 07/06, Vivek Gautam wrote:
>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
>>>>  static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>>>>  			     size_t size)
>>>>  {
>>>> -	struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>>>> +	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>>>> +	struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>>>> +	size_t ret;
>>>>  	if (!ops)
>>>>  		return 0;
>>>> -	return ops->unmap(ops, iova, size);
>>>> +	pm_runtime_get_sync(smmu_domain->smmu->dev);
>>> Can these map/unmap ops be called from an atomic context? I seem
>>> to recall that being a problem before.
>>
>> That's something which was dropped in the following patch merged in master:
>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
>>
>> Looks like we don't  need locks here anymore?
>>
> 
> While removing the spinlock around the map/unmap path may be one
> thing, I'm not sure that's all of them. Is there a path from an
> atomic DMA allocation (GFP_ATOMIC sort of thing) mapped into an
> IOMMU for a device that can eventually get down to here and
> attempt to turn a clk on?

Yes, in the DMA path map/unmap will frequently be called from IRQ
handlers (think e.g. network packets). The whole point of removing the
lock was to allow multiple maps/unmaps to execute in parallel (since we
know they will be safely operating on different areas of the pagetable).
AFAICS this change is going to largely reintroduce that bottleneck via
dev->power.lock, which is anything but what we want :(

Robin.
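
For context on the bottleneck Robin describes: pm_runtime_get_sync() ends
up in __pm_runtime_resume(), which in this era of
drivers/base/power/runtime.c looks roughly like the following (simplified
sketch, not the verbatim source), so every map/unmap would serialise on
the per-device power lock even when the SMMU is already active:

int __pm_runtime_resume(struct device *dev, int rpmflags)
{
	unsigned long flags;
	int retval;

	if (rpmflags & RPM_GET_PUT)
		atomic_inc(&dev->power.usage_count);

	/* Taken unconditionally, even if the device is already RPM_ACTIVE */
	spin_lock_irqsave(&dev->power.lock, flags);
	retval = rpm_resume(dev, rpmflags);
	spin_unlock_irqrestore(&dev->power.lock, flags);

	return retval;
}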
Rob Clark July 13, 2017, 11:50 a.m. UTC | #6
On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote:
> Hi Vivek,
>
> On 7/13/2017 10:43 AM, Vivek Gautam wrote:
>> Hi Stephen,
>>
>>
>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
>>> On 07/06, Vivek Gautam wrote:
>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
>>>>   static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>>>>                    size_t size)
>>>>   {
>>>> -    struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>>>> +    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>>>> +    size_t ret;
>>>>         if (!ops)
>>>>           return 0;
>>>>   -    return ops->unmap(ops, iova, size);
>>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
>>> Can these map/unmap ops be called from an atomic context? I seem
>>> to recall that being a problem before.
>>
>> That's something which was dropped in the following patch merged in master:
>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
>>
>> Looks like we don't  need locks here anymore?
>
>  Apart from the locking, wonder why a explicit pm_runtime is needed
>  from unmap. Somehow looks like some path in the master using that
>  should have enabled the pm ?
>

Yes, there are a bunch of scenarios where unmap can happen with
disabled master (but not in atomic context).  On the gpu side we
opportunistically keep a buffer mapping until the buffer is freed
(which can happen after gpu is disabled).  Likewise, v4l2 won't unmap
an exported dmabuf while some other driver holds a reference to it
(which can be dropped when the v4l2 device is suspended).

Since unmap triggers a TLB flush, which touches IOMMU registers, the
iommu driver *definitely* needs a pm_runtime_get_sync().

BR,
-R
Rob Clark July 13, 2017, 11:53 a.m. UTC | #7
On Thu, Jul 13, 2017 at 5:50 AM, Robin Murphy <robin.murphy@arm.com> wrote:
> On 13/07/17 07:48, Stephen Boyd wrote:
>> On 07/13, Vivek Gautam wrote:
>>> Hi Stephen,
>>>
>>>
>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
>>>> On 07/06, Vivek Gautam wrote:
>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
>>>>>  static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>>>>>                         size_t size)
>>>>>  {
>>>>> -  struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>>>>> +  struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>>>>> +  struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>>>>> +  size_t ret;
>>>>>    if (!ops)
>>>>>            return 0;
>>>>> -  return ops->unmap(ops, iova, size);
>>>>> +  pm_runtime_get_sync(smmu_domain->smmu->dev);
>>>> Can these map/unmap ops be called from an atomic context? I seem
>>>> to recall that being a problem before.
>>>
>>> That's something which was dropped in the following patch merged in master:
>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
>>>
>>> Looks like we don't  need locks here anymore?
>>>
>>
>> While removing the spinlock around the map/unmap path may be one
>> thing, I'm not sure that's all of them. Is there a path from an
>> atomic DMA allocation (GFP_ATOMIC sort of thing) mapped into an
>> IOMMU for a device that can eventually get down to here and
>> attempt to turn a clk on?
>
> Yes, in the DMA path map/unmap will frequently be called from IRQ
> handlers (think e.g. network packets). The whole point of removing the
> lock was to allow multiple maps/unmaps to execute in parallel (since we
> know they will be safely operating on different areas of the pagetable).
> AFAICS this change is going to largely reintroduce that bottleneck via
> dev->power_lock, which is anything but what we want :(
>

Maybe __pm_runtime_resume() needs some sort of fast path when the
device is already enabled? Otherwise we need some sort of flag to tell
the iommu that it cannot rely on the unmapping device being resumed?

BR,
-R
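
A hypothetical shape for the fast path Rob suggests might be the
following; pm_runtime_get_fast() is not an existing kernel API, and a
real implementation would have to close the race where the device starts
suspending between the status check and the counter increment:

static int pm_runtime_get_fast(struct device *dev)
{
	/* Lockless fast path: device already active with users */
	if (dev->power.runtime_status == RPM_ACTIVE &&
	    atomic_inc_not_zero(&dev->power.usage_count))
		return 1;	/* pair with pm_runtime_put() */

	/* Slow path takes dev->power.lock as usual */
	return pm_runtime_get_sync(dev);
}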
Marek Szyprowski July 13, 2017, 12:02 p.m. UTC | #8
Hi All,

On 2017-07-13 13:50, Rob Clark wrote:
> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote:
>> On 7/13/2017 10:43 AM, Vivek Gautam wrote:
>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
>>>> On 07/06, Vivek Gautam wrote:
>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
>>>>>    static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>>>>>                     size_t size)
>>>>>    {
>>>>> -    struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>>>>> +    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>>>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>>>>> +    size_t ret;
>>>>>          if (!ops)
>>>>>            return 0;
>>>>>    -    return ops->unmap(ops, iova, size);
>>>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
>>>> Can these map/unmap ops be called from an atomic context? I seem
>>>> to recall that being a problem before.
>>> That's something which was dropped in the following patch merged in master:
>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
>>>
>>> Looks like we don't  need locks here anymore?
>>   Apart from the locking, wonder why a explicit pm_runtime is needed
>>   from unmap. Somehow looks like some path in the master using that
>>   should have enabled the pm ?
>>
> Yes, there are a bunch of scenarios where unmap can happen with
> disabled master (but not in atomic context).  On the gpu side we
> opportunistically keep a buffer mapping until the buffer is freed
> (which can happen after gpu is disabled).  Likewise, v4l2 won't unmap
> an exported dmabuf while some other driver holds a reference to it
> (which can be dropped when the v4l2 device is suspended).
>
> Since unmap triggers tbl flush which touches iommu regs, the iommu
> driver *definitely* needs a pm_runtime_get_sync().

AFAIR unmap might be called from atomic context as well, for example as
a result of dma_unmap_page(). In the exynos IOMMU driver I simply check
the runtime PM state of the IOMMU device. The TLB flush is performed
only when the IOMMU is in the active state. If it is suspended, I assume
that the IOMMU controller's context is already lost and its respective
power domain might already be turned off, so there is no point in
touching the IOMMU registers.

Best regards
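
A minimal sketch of the pattern Marek describes, with illustrative names
rather than the actual exynos-iommu code; pm_runtime_get_if_in_use()
succeeds only while the device is runtime-active with a nonzero usage
count, so a suspended IOMMU (whose TLB contents are gone anyway) is
simply skipped:

static void my_iommu_tlb_invalidate_all(struct my_iommu *data)
{
	/* If runtime-suspended, the IOMMU context and TLB are already
	 * lost, so there is nothing to flush and no register to touch. */
	if (pm_runtime_get_if_in_use(data->dev) <= 0)
		return;

	writel(1, data->base + REG_TLB_INVALIDATE_ALL);	/* illustrative register */
	pm_runtime_put(data->dev);
}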
Rob Clark July 13, 2017, 12:10 p.m. UTC | #9
On Thu, Jul 13, 2017 at 8:02 AM, Marek Szyprowski
<m.szyprowski@samsung.com> wrote:
> Hi All,
>
> On 2017-07-13 13:50, Rob Clark wrote:
>>
>> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org>
>> wrote:
>>>
>>> On 7/13/2017 10:43 AM, Vivek Gautam wrote:
>>>>
>>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
>>>>>
>>>>> On 07/06, Vivek Gautam wrote:
>>>>>>
>>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain
>>>>>> *domain, unsigned long iova,
>>>>>>    static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned
>>>>>> long iova,
>>>>>>                     size_t size)
>>>>>>    {
>>>>>> -    struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>>>>>> +    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>>>>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>>>>>> +    size_t ret;
>>>>>>          if (!ops)
>>>>>>            return 0;
>>>>>>    -    return ops->unmap(ops, iova, size);
>>>>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
>>>>>
>>>>> Can these map/unmap ops be called from an atomic context? I seem
>>>>> to recall that being a problem before.
>>>>
>>>> That's something which was dropped in the following patch merged in
>>>> master:
>>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
>>>>
>>>> Looks like we don't  need locks here anymore?
>>>
>>>   Apart from the locking, wonder why a explicit pm_runtime is needed
>>>   from unmap. Somehow looks like some path in the master using that
>>>   should have enabled the pm ?
>>>
>> Yes, there are a bunch of scenarios where unmap can happen with
>> disabled master (but not in atomic context).  On the gpu side we
>> opportunistically keep a buffer mapping until the buffer is freed
>> (which can happen after gpu is disabled).  Likewise, v4l2 won't unmap
>> an exported dmabuf while some other driver holds a reference to it
>> (which can be dropped when the v4l2 device is suspended).
>>
>> Since unmap triggers tbl flush which touches iommu regs, the iommu
>> driver *definitely* needs a pm_runtime_get_sync().
>
>
> Afair unmap might be called from atomic context as well, for example as
> a result of dma_unmap_page(). In exynos IOMMU I simply check the runtime
> PM state of IOMMU device. TLB flush is performed only when IOMMU is in
> active
> state. If it is suspended, I assume that the IOMMU controller's context
> is already lost and its respective power domain might be already turned off,
> so there is no point in touching IOMMU registers.
>

That seems like an interesting approach, although I wonder if there
can be some race with new device memory accesses once clocks are enabled
but before the TLB flush completes? That would be rather bad, since this
approach lets the backing pages of memory be freed before the TLB
flush.

BR,
-R
Marek Szyprowski July 13, 2017, 12:23 p.m. UTC | #10
Hi Rob,

On 2017-07-13 14:10, Rob Clark wrote:
> On Thu, Jul 13, 2017 at 8:02 AM, Marek Szyprowski
> <m.szyprowski@samsung.com> wrote:
>> On 2017-07-13 13:50, Rob Clark wrote:
>>> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org>
>>> wrote:
>>>> On 7/13/2017 10:43 AM, Vivek Gautam wrote:
>>>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
>>>>>> On 07/06, Vivek Gautam wrote:
>>>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain
>>>>>>> *domain, unsigned long iova,
>>>>>>>     static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned
>>>>>>> long iova,
>>>>>>>                      size_t size)
>>>>>>>     {
>>>>>>> -    struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>>>>>>> +    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>>>>>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>>>>>>> +    size_t ret;
>>>>>>>           if (!ops)
>>>>>>>             return 0;
>>>>>>>     -    return ops->unmap(ops, iova, size);
>>>>>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
>>>>>> Can these map/unmap ops be called from an atomic context? I seem
>>>>>> to recall that being a problem before.
>>>>> That's something which was dropped in the following patch merged in
>>>>> master:
>>>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
>>>>>
>>>>> Looks like we don't  need locks here anymore?
>>>>    Apart from the locking, wonder why a explicit pm_runtime is needed
>>>>    from unmap. Somehow looks like some path in the master using that
>>>>    should have enabled the pm ?
>>>>
>>> Yes, there are a bunch of scenarios where unmap can happen with
>>> disabled master (but not in atomic context).  On the gpu side we
>>> opportunistically keep a buffer mapping until the buffer is freed
>>> (which can happen after gpu is disabled).  Likewise, v4l2 won't unmap
>>> an exported dmabuf while some other driver holds a reference to it
>>> (which can be dropped when the v4l2 device is suspended).
>>>
>>> Since unmap triggers tbl flush which touches iommu regs, the iommu
>>> driver *definitely* needs a pm_runtime_get_sync().
>>
>> Afair unmap might be called from atomic context as well, for example as
>> a result of dma_unmap_page(). In exynos IOMMU I simply check the runtime
>> PM state of IOMMU device. TLB flush is performed only when IOMMU is in
>> active
>> state. If it is suspended, I assume that the IOMMU controller's context
>> is already lost and its respective power domain might be already turned off,
>> so there is no point in touching IOMMU registers.
>>
> that seems like an interesting approach.. although I wonder if there
> can be some race w/ new device memory access once clks are enabled
> before tlb flush completes?  That would be rather bad, since this
> approach is letting the backing pages of memory be freed before tlb
> flush.

Exynos IOMMU has a spinlock ensuring that there is no race between PM
runtime suspend and unmap/TLB flush.

Best regards
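
A sketch of that guard, again with illustrative names rather than the
real exynos-iommu code: the spinlock makes the "is the IOMMU still
powered?" check atomic with respect to runtime suspend, so a flush
cannot race with the power-down:

static void my_iommu_tlb_invalidate(struct my_iommu *data, dma_addr_t iova)
{
	unsigned long flags;

	spin_lock_irqsave(&data->lock, flags);
	if (data->active)	/* cleared under the same lock by suspend */
		writel(iova, data->base + REG_TLB_INVAL_ENTRY);
	spin_unlock_irqrestore(&data->lock, flags);
}

static int my_iommu_runtime_suspend(struct device *dev)
{
	struct my_iommu *data = dev_get_drvdata(dev);
	unsigned long flags;

	spin_lock_irqsave(&data->lock, flags);
	data->active = false;	/* context will be lost; flushes become no-ops */
	spin_unlock_irqrestore(&data->lock, flags);

	return 0;
}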
Sricharan Ramabadhran July 13, 2017, 1:53 p.m. UTC | #11
Hi,

On 7/13/2017 5:20 PM, Rob Clark wrote:
> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote:
>> Hi Vivek,
>>
>> On 7/13/2017 10:43 AM, Vivek Gautam wrote:
>>> Hi Stephen,
>>>
>>>
>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
>>>> On 07/06, Vivek Gautam wrote:
>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
>>>>>   static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>>>>>                    size_t size)
>>>>>   {
>>>>> -    struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>>>>> +    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>>>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>>>>> +    size_t ret;
>>>>>         if (!ops)
>>>>>           return 0;
>>>>>   -    return ops->unmap(ops, iova, size);
>>>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
>>>> Can these map/unmap ops be called from an atomic context? I seem
>>>> to recall that being a problem before.
>>>
>>> That's something which was dropped in the following patch merged in master:
>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
>>>
>>> Looks like we don't  need locks here anymore?
>>
>>  Apart from the locking, wonder why a explicit pm_runtime is needed
>>  from unmap. Somehow looks like some path in the master using that
>>  should have enabled the pm ?
>>
> 
> Yes, there are a bunch of scenarios where unmap can happen with
> disabled master (but not in atomic context).  On the gpu side we
> opportunistically keep a buffer mapping until the buffer is freed
> (which can happen after gpu is disabled).  Likewise, v4l2 won't unmap
> an exported dmabuf while some other driver holds a reference to it
> (which can be dropped when the v4l2 device is suspended).
> 
> Since unmap triggers tbl flush which touches iommu regs, the iommu
> driver *definitely* needs a pm_runtime_get_sync().

 Ok, with that being the case, there are two things here,

 1) If the device links are still intact at these places where unmap is called,
    then pm_runtime from the master would set up all the clocks. That would
    avoid reintroducing the locking indirectly here.

 2) If not, then doing it here is the only way. But in both cases, since
    unmap can be called from atomic context, the resume handler here should
    avoid doing clk_prepare_enable(); instead, move the clk_prepare() to init
    (see the sketch below).
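
 A sketch of point 2, assuming clock handles along the lines the series
 adds (smmu->clks/num_clks are illustrative here): clk_prepare() may
 sleep, so it is done once at probe, and the runtime-resume handler keeps
 to the atomic-safe clk_enable(); probe would also call
 pm_runtime_irq_safe() so that pm_runtime_get_sync() is legal from atomic
 context:

static int arm_smmu_runtime_resume(struct device *dev)
{
	struct arm_smmu_device *smmu = dev_get_drvdata(dev);
	int i, ret;

	for (i = 0; i < smmu->num_clks; i++) {
		/* clk_prepare() already done at probe; clk_enable() may
		 * be called from atomic context. */
		ret = clk_enable(smmu->clks[i]);
		if (ret) {
			while (--i >= 0)
				clk_disable(smmu->clks[i]);
			return ret;
		}
	}

	return 0;
}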

Regards,
 Sricharan
Vivek Gautam July 13, 2017, 1:57 p.m. UTC | #12
On Thu, Jul 13, 2017 at 11:05 AM, Sricharan R <sricharan@codeaurora.org> wrote:
> Hi Vivek,
>
> On 7/13/2017 10:43 AM, Vivek Gautam wrote:
>> Hi Stephen,
>>
>>
>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
>>> On 07/06, Vivek Gautam wrote:
>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
>>>>   static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>>>>                    size_t size)
>>>>   {
>>>> -    struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>>>> +    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>>>> +    size_t ret;
>>>>         if (!ops)
>>>>           return 0;
>>>>   -    return ops->unmap(ops, iova, size);
>>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
>>> Can these map/unmap ops be called from an atomic context? I seem
>>> to recall that being a problem before.
>>
>> That's something which was dropped in the following patch merged in master:
>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
>>
>> Looks like we don't  need locks here anymore?
>
>  Apart from the locking, wonder why a explicit pm_runtime is needed
>  from unmap. Somehow looks like some path in the master using that
>  should have enabled the pm ?

Right, the master should have done a runtime_get(), and with
device links the IOMMU will also resume.

The master will call unmap while it is attached to the IOMMU,
and therefore the IOMMU should be in the resumed state.
We shouldn't have an unmap without the master attached anyway.
I will investigate further whether we need the pm_runtime() calls
around unmap or not.

Best regards
Vivek

>
> Regards,
>  Sricharan
>
Vivek Gautam July 13, 2017, 2:01 p.m. UTC | #13
On Thu, Jul 13, 2017 at 7:27 PM, Vivek Gautam
<vivek.gautam@codeaurora.org> wrote:
> On Thu, Jul 13, 2017 at 11:05 AM, Sricharan R <sricharan@codeaurora.org> wrote:
>> Hi Vivek,
>>
>> On 7/13/2017 10:43 AM, Vivek Gautam wrote:
>>> Hi Stephen,
>>>
>>>
>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
>>>> On 07/06, Vivek Gautam wrote:
>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
>>>>>   static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>>>>>                    size_t size)
>>>>>   {
>>>>> -    struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>>>>> +    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>>>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>>>>> +    size_t ret;
>>>>>         if (!ops)
>>>>>           return 0;
>>>>>   -    return ops->unmap(ops, iova, size);
>>>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
>>>> Can these map/unmap ops be called from an atomic context? I seem
>>>> to recall that being a problem before.
>>>
>>> That's something which was dropped in the following patch merged in master:
>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
>>>
>>> Looks like we don't  need locks here anymore?
>>
>>  Apart from the locking, wonder why a explicit pm_runtime is needed
>>  from unmap. Somehow looks like some path in the master using that
>>  should have enabled the pm ?
>
> Right, the master should have done a runtime_get(), and with
> device links the iommu will also resume.
>
> The master will call the unmap when it is attached to the iommu
> and therefore the iommu should be in resume state.
> We shouldn't have an unmap without the master attached anyways.
> Will investigate this further if we need the pm_runtime() calls
> around unmap or not.

My apologies. My email client didn't update the thread. So please ignore
this comment.

>
> Best regards
> Vivek
>
>>
>> Regards,
>>  Sricharan
>>
>
>
>
> --
> Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
> a Linux Foundation Collaborative Project
Rob Clark July 13, 2017, 2:55 p.m. UTC | #14
On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote:
> Hi,
>
> On 7/13/2017 5:20 PM, Rob Clark wrote:
>> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote:
>>> Hi Vivek,
>>>
>>> On 7/13/2017 10:43 AM, Vivek Gautam wrote:
>>>> Hi Stephen,
>>>>
>>>>
>>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
>>>>> On 07/06, Vivek Gautam wrote:
>>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
>>>>>>   static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>>>>>>                    size_t size)
>>>>>>   {
>>>>>> -    struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>>>>>> +    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>>>>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>>>>>> +    size_t ret;
>>>>>>         if (!ops)
>>>>>>           return 0;
>>>>>>   -    return ops->unmap(ops, iova, size);
>>>>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
>>>>> Can these map/unmap ops be called from an atomic context? I seem
>>>>> to recall that being a problem before.
>>>>
>>>> That's something which was dropped in the following patch merged in master:
>>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
>>>>
>>>> Looks like we don't  need locks here anymore?
>>>
>>>  Apart from the locking, wonder why a explicit pm_runtime is needed
>>>  from unmap. Somehow looks like some path in the master using that
>>>  should have enabled the pm ?
>>>
>>
>> Yes, there are a bunch of scenarios where unmap can happen with
>> disabled master (but not in atomic context).  On the gpu side we
>> opportunistically keep a buffer mapping until the buffer is freed
>> (which can happen after gpu is disabled).  Likewise, v4l2 won't unmap
>> an exported dmabuf while some other driver holds a reference to it
>> (which can be dropped when the v4l2 device is suspended).
>>
>> Since unmap triggers tbl flush which touches iommu regs, the iommu
>> driver *definitely* needs a pm_runtime_get_sync().
>
>  Ok, with that being the case, there are two things here,
>
>  1) If the device links are still intact at these places where unmap is called,
>     then pm_runtime from the master would setup the all the clocks. That would
>     avoid reintroducing the locking indirectly here.
>
>  2) If not, then doing it here is the only way. But for both cases, since
>     the unmap can be called from atomic context, resume handler here should
>     avoid doing clk_prepare_enable , instead move the clk_prepare to the init.
>

I do kinda like the approach Marek suggested.. of deferring the tlb
flush until resume.  I'm wondering if we could combine that with
putting the mmu in a stalled state when we suspend (and not resume the
mmu until after the pending tlb flush)?

BR,
-R
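
One possible shape of the defer-until-resume idea (purely illustrative
names; the stall question is still open at this point in the thread):
record that a flush is pending when the SMMU is suspended, and replay it
in runtime resume before translation is re-enabled:

static void my_smmu_tlb_flush_all(struct my_smmu *smmu)
{
	if (pm_runtime_get_if_in_use(smmu->dev) <= 0) {
		smmu->flush_pending = true;	/* replay on resume */
		return;
	}

	my_smmu_do_flush_all(smmu);	/* illustrative helper */
	pm_runtime_put(smmu->dev);
}

static int my_smmu_runtime_resume(struct device *dev)
{
	struct my_smmu *smmu = dev_get_drvdata(dev);

	my_smmu_enable_clocks(smmu);		/* illustrative helper */
	if (smmu->flush_pending) {
		my_smmu_do_flush_all(smmu);	/* before re-enabling translation */
		smmu->flush_pending = false;
	}
	my_smmu_enable_translation(smmu);	/* illustrative helper */

	return 0;
}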
Will Deacon July 14, 2017, 5:07 p.m. UTC | #15
On Thu, Jul 13, 2017 at 10:55:10AM -0400, Rob Clark wrote:
> On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote:
> > Hi,
> >
> > On 7/13/2017 5:20 PM, Rob Clark wrote:
> >> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote:
> >>> Hi Vivek,
> >>>
> >>> On 7/13/2017 10:43 AM, Vivek Gautam wrote:
> >>>> Hi Stephen,
> >>>>
> >>>>
> >>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
> >>>>> On 07/06, Vivek Gautam wrote:
> >>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
> >>>>>>   static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
> >>>>>>                    size_t size)
> >>>>>>   {
> >>>>>> -    struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
> >>>>>> +    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
> >>>>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
> >>>>>> +    size_t ret;
> >>>>>>         if (!ops)
> >>>>>>           return 0;
> >>>>>>   -    return ops->unmap(ops, iova, size);
> >>>>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
> >>>>> Can these map/unmap ops be called from an atomic context? I seem
> >>>>> to recall that being a problem before.
> >>>>
> >>>> That's something which was dropped in the following patch merged in master:
> >>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
> >>>>
> >>>> Looks like we don't  need locks here anymore?
> >>>
> >>>  Apart from the locking, wonder why a explicit pm_runtime is needed
> >>>  from unmap. Somehow looks like some path in the master using that
> >>>  should have enabled the pm ?
> >>>
> >>
> >> Yes, there are a bunch of scenarios where unmap can happen with
> >> disabled master (but not in atomic context).  On the gpu side we
> >> opportunistically keep a buffer mapping until the buffer is freed
> >> (which can happen after gpu is disabled).  Likewise, v4l2 won't unmap
> >> an exported dmabuf while some other driver holds a reference to it
> >> (which can be dropped when the v4l2 device is suspended).
> >>
> >> Since unmap triggers tbl flush which touches iommu regs, the iommu
> >> driver *definitely* needs a pm_runtime_get_sync().
> >
> >  Ok, with that being the case, there are two things here,
> >
> >  1) If the device links are still intact at these places where unmap is called,
> >     then pm_runtime from the master would setup the all the clocks. That would
> >     avoid reintroducing the locking indirectly here.
> >
> >  2) If not, then doing it here is the only way. But for both cases, since
> >     the unmap can be called from atomic context, resume handler here should
> >     avoid doing clk_prepare_enable , instead move the clk_prepare to the init.
> >
> 
> I do kinda like the approach Marek suggested.. of deferring the tlb
> flush until resume.  I'm wondering if we could combine that with
> putting the mmu in a stalled state when we suspend (and not resume the
> mmu until after the pending tlb flush)?

I'm not sure that a stalled state is what we're after here, because we need
to take care to prevent any table walks if we've freed the underlying pages.
What we could try to do is disable the SMMU (put into global bypass) and
invalidate the TLB when performing a suspend operation, then we just ignore
invalidation whilst the clocks are stopped and, on resume, enable the SMMU
again.

That said, I don't think we can tolerate suspend/resume racing with
map/unmap, and it's not clear to me how we avoid that without penalising
the fastpath.

Will
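
A rough sketch of the suspend-side sequence Will describes, using
register names from the driver but otherwise illustrative and untested;
whether global bypass is acceptable (versus terminating transactions)
and how to fence this against concurrent map/unmap are exactly the open
questions below:

static int arm_smmu_suspend(struct device *dev)
{
	struct arm_smmu_device *smmu = dev_get_drvdata(dev);
	void __iomem *gr0_base = ARM_SMMU_GR0(smmu);

	/* Invalidate all TLB entries and wait for the invalidation to drain */
	writel_relaxed(0, gr0_base + ARM_SMMU_GR0_TLBIALLH);
	writel_relaxed(0, gr0_base + ARM_SMMU_GR0_TLBIALLNSNH);
	writel_relaxed(0, gr0_base + ARM_SMMU_GR0_sTLBGSYNC);
	while (readl_relaxed(gr0_base + ARM_SMMU_GR0_sTLBGSTATUS) &
	       sTLBGSTATUS_GSACTIVE)
		cpu_relax();

	/* Global bypass, as arm_smmu_device_remove() does; terminating
	 * transactions instead would avoid the bypass window discussed
	 * below. */
	writel_relaxed(sCR0_CLIENTPD, gr0_base + ARM_SMMU_GR0_sCR0);

	return 0;
}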
Rob Clark July 14, 2017, 5:42 p.m. UTC | #16
On Fri, Jul 14, 2017 at 1:07 PM, Will Deacon <will.deacon@arm.com> wrote:
> On Thu, Jul 13, 2017 at 10:55:10AM -0400, Rob Clark wrote:
>> On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote:
>> > Hi,
>> >
>> > On 7/13/2017 5:20 PM, Rob Clark wrote:
>> >> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote:
>> >>> Hi Vivek,
>> >>>
>> >>> On 7/13/2017 10:43 AM, Vivek Gautam wrote:
>> >>>> Hi Stephen,
>> >>>>
>> >>>>
>> >>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
>> >>>>> On 07/06, Vivek Gautam wrote:
>> >>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
>> >>>>>>   static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>> >>>>>>                    size_t size)
>> >>>>>>   {
>> >>>>>> -    struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>> >>>>>> +    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>> >>>>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>> >>>>>> +    size_t ret;
>> >>>>>>         if (!ops)
>> >>>>>>           return 0;
>> >>>>>>   -    return ops->unmap(ops, iova, size);
>> >>>>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
>> >>>>> Can these map/unmap ops be called from an atomic context? I seem
>> >>>>> to recall that being a problem before.
>> >>>>
>> >>>> That's something which was dropped in the following patch merged in master:
>> >>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
>> >>>>
>> >>>> Looks like we don't  need locks here anymore?
>> >>>
>> >>>  Apart from the locking, wonder why a explicit pm_runtime is needed
>> >>>  from unmap. Somehow looks like some path in the master using that
>> >>>  should have enabled the pm ?
>> >>>
>> >>
>> >> Yes, there are a bunch of scenarios where unmap can happen with
>> >> disabled master (but not in atomic context).  On the gpu side we
>> >> opportunistically keep a buffer mapping until the buffer is freed
>> >> (which can happen after gpu is disabled).  Likewise, v4l2 won't unmap
>> >> an exported dmabuf while some other driver holds a reference to it
>> >> (which can be dropped when the v4l2 device is suspended).
>> >>
>> >> Since unmap triggers tbl flush which touches iommu regs, the iommu
>> >> driver *definitely* needs a pm_runtime_get_sync().
>> >
>> >  Ok, with that being the case, there are two things here,
>> >
>> >  1) If the device links are still intact at these places where unmap is called,
>> >     then pm_runtime from the master would setup the all the clocks. That would
>> >     avoid reintroducing the locking indirectly here.
>> >
>> >  2) If not, then doing it here is the only way. But for both cases, since
>> >     the unmap can be called from atomic context, resume handler here should
>> >     avoid doing clk_prepare_enable , instead move the clk_prepare to the init.
>> >
>>
>> I do kinda like the approach Marek suggested.. of deferring the tlb
>> flush until resume.  I'm wondering if we could combine that with
>> putting the mmu in a stalled state when we suspend (and not resume the
>> mmu until after the pending tlb flush)?
>
> I'm not sure that a stalled state is what we're after here, because we need
> to take care to prevent any table walks if we've freed the underlying pages.
> What we could try to do is disable the SMMU (put into global bypass) and
> invalidate the TLB when performing a suspend operation, then we just ignore
> invalidation whilst the clocks are stopped and, on resume, enable the SMMU
> again.

Wouldn't stalled just block any memory transactions by device(s) using
the context bank? Putting it in bypass isn't really a good thing if
there is any chance the device can sneak in a memory access before
we've taken it back out of bypass (i.e. it makes the GPU a giant
userspace-controlled root hole).

BR,
-R

> That said, I don't think we can tolerate suspend/resume racing with
> map/unmap, and it's not clear to me how we avoid that without penalising
> the fastpath.
>
> Will
Will Deacon July 14, 2017, 6:06 p.m. UTC | #17
On Fri, Jul 14, 2017 at 01:42:13PM -0400, Rob Clark wrote:
> On Fri, Jul 14, 2017 at 1:07 PM, Will Deacon <will.deacon@arm.com> wrote:
> > On Thu, Jul 13, 2017 at 10:55:10AM -0400, Rob Clark wrote:
> >> On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote:
> >> > Hi,
> >> >
> >> > On 7/13/2017 5:20 PM, Rob Clark wrote:
> >> >> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote:
> >> >>> Hi Vivek,
> >> >>>
> >> >>> On 7/13/2017 10:43 AM, Vivek Gautam wrote:
> >> >>>> Hi Stephen,
> >> >>>>
> >> >>>>
> >> >>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
> >> >>>>> On 07/06, Vivek Gautam wrote:
> >> >>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
> >> >>>>>>   static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
> >> >>>>>>                    size_t size)
> >> >>>>>>   {
> >> >>>>>> -    struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
> >> >>>>>> +    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
> >> >>>>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
> >> >>>>>> +    size_t ret;
> >> >>>>>>         if (!ops)
> >> >>>>>>           return 0;
> >> >>>>>>   -    return ops->unmap(ops, iova, size);
> >> >>>>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
> >> >>>>> Can these map/unmap ops be called from an atomic context? I seem
> >> >>>>> to recall that being a problem before.
> >> >>>>
> >> >>>> That's something which was dropped in the following patch merged in master:
> >> >>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
> >> >>>>
> >> >>>> Looks like we don't  need locks here anymore?
> >> >>>
> >> >>>  Apart from the locking, wonder why a explicit pm_runtime is needed
> >> >>>  from unmap. Somehow looks like some path in the master using that
> >> >>>  should have enabled the pm ?
> >> >>>
> >> >>
> >> >> Yes, there are a bunch of scenarios where unmap can happen with
> >> >> disabled master (but not in atomic context).  On the gpu side we
> >> >> opportunistically keep a buffer mapping until the buffer is freed
> >> >> (which can happen after gpu is disabled).  Likewise, v4l2 won't unmap
> >> >> an exported dmabuf while some other driver holds a reference to it
> >> >> (which can be dropped when the v4l2 device is suspended).
> >> >>
> >> >> Since unmap triggers tbl flush which touches iommu regs, the iommu
> >> >> driver *definitely* needs a pm_runtime_get_sync().
> >> >
> >> >  Ok, with that being the case, there are two things here,
> >> >
> >> >  1) If the device links are still intact at these places where unmap is called,
> >> >     then pm_runtime from the master would setup the all the clocks. That would
> >> >     avoid reintroducing the locking indirectly here.
> >> >
> >> >  2) If not, then doing it here is the only way. But for both cases, since
> >> >     the unmap can be called from atomic context, resume handler here should
> >> >     avoid doing clk_prepare_enable , instead move the clk_prepare to the init.
> >> >
> >>
> >> I do kinda like the approach Marek suggested.. of deferring the tlb
> >> flush until resume.  I'm wondering if we could combine that with
> >> putting the mmu in a stalled state when we suspend (and not resume the
> >> mmu until after the pending tlb flush)?
> >
> > I'm not sure that a stalled state is what we're after here, because we need
> > to take care to prevent any table walks if we've freed the underlying pages.
> > What we could try to do is disable the SMMU (put into global bypass) and
> > invalidate the TLB when performing a suspend operation, then we just ignore
> > invalidation whilst the clocks are stopped and, on resume, enable the SMMU
> > again.
> 
> wouldn't stalled just block any memory transactions by device(s) using
> the context bank?  Putting it in bypass isn't really a good thing if
> there is any chance the device can sneak in a memory access before
> we've taking it back out of bypass (ie. makes gpu a giant userspace
> controlled root hole).

If it doesn't deadlock, then yes, it will stall transactions. However, that
doesn't mean it necessarily prevents page table walks. Instead of bypass, we
could configure all the streams to terminate, but this race still worries me
somewhat. I thought that the SMMU would only be suspended if all of its
masters were suspended, so if the GPU wants to come out of suspend then the
SMMU should be resumed first.

It would be helpful if somebody could figure out exactly what can race with
the suspend/resume calls here.

Will
Rob Clark July 14, 2017, 6:25 p.m. UTC | #18
On Fri, Jul 14, 2017 at 2:06 PM, Will Deacon <will.deacon@arm.com> wrote:
> On Fri, Jul 14, 2017 at 01:42:13PM -0400, Rob Clark wrote:
>> On Fri, Jul 14, 2017 at 1:07 PM, Will Deacon <will.deacon@arm.com> wrote:
>> > On Thu, Jul 13, 2017 at 10:55:10AM -0400, Rob Clark wrote:
>> >> On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote:
>> >> > Hi,
>> >> >
>> >> > On 7/13/2017 5:20 PM, Rob Clark wrote:
>> >> >> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote:
>> >> >>> Hi Vivek,
>> >> >>>
>> >> >>> On 7/13/2017 10:43 AM, Vivek Gautam wrote:
>> >> >>>> Hi Stephen,
>> >> >>>>
>> >> >>>>
>> >> >>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
>> >> >>>>> On 07/06, Vivek Gautam wrote:
>> >> >>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
>> >> >>>>>>   static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>> >> >>>>>>                    size_t size)
>> >> >>>>>>   {
>> >> >>>>>> -    struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>> >> >>>>>> +    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>> >> >>>>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>> >> >>>>>> +    size_t ret;
>> >> >>>>>>         if (!ops)
>> >> >>>>>>           return 0;
>> >> >>>>>>   -    return ops->unmap(ops, iova, size);
>> >> >>>>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
>> >> >>>>> Can these map/unmap ops be called from an atomic context? I seem
>> >> >>>>> to recall that being a problem before.
>> >> >>>>
>> >> >>>> That's something which was dropped in the following patch merged in master:
>> >> >>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
>> >> >>>>
>> >> >>>> Looks like we don't  need locks here anymore?
>> >> >>>
>> >> >>>  Apart from the locking, wonder why a explicit pm_runtime is needed
>> >> >>>  from unmap. Somehow looks like some path in the master using that
>> >> >>>  should have enabled the pm ?
>> >> >>>
>> >> >>
>> >> >> Yes, there are a bunch of scenarios where unmap can happen with
>> >> >> disabled master (but not in atomic context).  On the gpu side we
>> >> >> opportunistically keep a buffer mapping until the buffer is freed
>> >> >> (which can happen after gpu is disabled).  Likewise, v4l2 won't unmap
>> >> >> an exported dmabuf while some other driver holds a reference to it
>> >> >> (which can be dropped when the v4l2 device is suspended).
>> >> >>
>> >> >> Since unmap triggers tbl flush which touches iommu regs, the iommu
>> >> >> driver *definitely* needs a pm_runtime_get_sync().
>> >> >
>> >> >  Ok, with that being the case, there are two things here,
>> >> >
>> >> >  1) If the device links are still intact at these places where unmap is called,
>> >> >     then pm_runtime from the master would setup the all the clocks. That would
>> >> >     avoid reintroducing the locking indirectly here.
>> >> >
>> >> >  2) If not, then doing it here is the only way. But for both cases, since
>> >> >     the unmap can be called from atomic context, resume handler here should
>> >> >     avoid doing clk_prepare_enable , instead move the clk_prepare to the init.
>> >> >
>> >>
>> >> I do kinda like the approach Marek suggested.. of deferring the tlb
>> >> flush until resume.  I'm wondering if we could combine that with
>> >> putting the mmu in a stalled state when we suspend (and not resume the
>> >> mmu until after the pending tlb flush)?
>> >
>> > I'm not sure that a stalled state is what we're after here, because we need
>> > to take care to prevent any table walks if we've freed the underlying pages.
>> > What we could try to do is disable the SMMU (put into global bypass) and
>> > invalidate the TLB when performing a suspend operation, then we just ignore
>> > invalidation whilst the clocks are stopped and, on resume, enable the SMMU
>> > again.
>>
>> wouldn't stalled just block any memory transactions by device(s) using
>> the context bank?  Putting it in bypass isn't really a good thing if
>> there is any chance the device can sneak in a memory access before
>> we've taking it back out of bypass (ie. makes gpu a giant userspace
>> controlled root hole).
>
> If it doesn't deadlock, then yes, it will stall transactions. However, that
> doesn't mean it necessarily prevents page table walks.

Btw, I guess the concern about the page table walk is that the unmap
could have removed some sub-level of the page table that the walk would
hit? Would deferring the freeing of those pages help?

> Instead of bypass, we
> could configure all the streams to terminate, but this race still worries me
> somewhat. I thought that the SMMU would only be suspended if all of its
> masters were suspended, so if the GPU wants to come out of suspend then the
> SMMU should be resumed first.

I believe this should be true. On the GPU side, I'm mostly trying to
avoid having to power the GPU back on to free buffers. (On the v4l2
side, something in the core videobuf code would also need to wrap its
dma_unmap_sg() with pm_runtime_get/put().)

BR,
-R

> It would be helpful if somebody could figure out exactly what can race with
> the suspend/resume calls here.
>
> Will
Will Deacon July 14, 2017, 7:01 p.m. UTC | #19
On Fri, Jul 14, 2017 at 02:25:45PM -0400, Rob Clark wrote:
> On Fri, Jul 14, 2017 at 2:06 PM, Will Deacon <will.deacon@arm.com> wrote:
> > On Fri, Jul 14, 2017 at 01:42:13PM -0400, Rob Clark wrote:
> >> On Fri, Jul 14, 2017 at 1:07 PM, Will Deacon <will.deacon@arm.com> wrote:
> >> > On Thu, Jul 13, 2017 at 10:55:10AM -0400, Rob Clark wrote:
> >> >> On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote:
> >> >> > Hi,
> >> >> >
> >> >> > On 7/13/2017 5:20 PM, Rob Clark wrote:
> >> >> >> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote:
> >> >> >>> Hi Vivek,
> >> >> >>>
> >> >> >>> On 7/13/2017 10:43 AM, Vivek Gautam wrote:
> >> >> >>>> Hi Stephen,
> >> >> >>>>
> >> >> >>>>
> >> >> >>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
> >> >> >>>>> On 07/06, Vivek Gautam wrote:
> >> >> >>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
> >> >> >>>>>>   static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
> >> >> >>>>>>                    size_t size)
> >> >> >>>>>>   {
> >> >> >>>>>> -    struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
> >> >> >>>>>> +    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
> >> >> >>>>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
> >> >> >>>>>> +    size_t ret;
> >> >> >>>>>>         if (!ops)
> >> >> >>>>>>           return 0;
> >> >> >>>>>>   -    return ops->unmap(ops, iova, size);
> >> >> >>>>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
> >> >> >>>>> Can these map/unmap ops be called from an atomic context? I seem
> >> >> >>>>> to recall that being a problem before.
> >> >> >>>>
> >> >> >>>> That's something which was dropped in the following patch merged in master:
> >> >> >>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
> >> >> >>>>
> >> >> >>>> Looks like we don't  need locks here anymore?
> >> >> >>>
> >> >> >>>  Apart from the locking, wonder why a explicit pm_runtime is needed
> >> >> >>>  from unmap. Somehow looks like some path in the master using that
> >> >> >>>  should have enabled the pm ?
> >> >> >>>
> >> >> >>
> >> >> >> Yes, there are a bunch of scenarios where unmap can happen with
> >> >> >> disabled master (but not in atomic context).  On the gpu side we
> >> >> >> opportunistically keep a buffer mapping until the buffer is freed
> >> >> >> (which can happen after gpu is disabled).  Likewise, v4l2 won't unmap
> >> >> >> an exported dmabuf while some other driver holds a reference to it
> >> >> >> (which can be dropped when the v4l2 device is suspended).
> >> >> >>
> >> >> >> Since unmap triggers tbl flush which touches iommu regs, the iommu
> >> >> >> driver *definitely* needs a pm_runtime_get_sync().
> >> >> >
> >> >> >  Ok, with that being the case, there are two things here,
> >> >> >
> >> >> >  1) If the device links are still intact at these places where unmap is called,
> >> >> >     then pm_runtime from the master would setup the all the clocks. That would
> >> >> >     avoid reintroducing the locking indirectly here.
> >> >> >
> >> >> >  2) If not, then doing it here is the only way. But for both cases, since
> >> >> >     the unmap can be called from atomic context, resume handler here should
> >> >> >     avoid doing clk_prepare_enable , instead move the clk_prepare to the init.
> >> >> >
> >> >>
> >> >> I do kinda like the approach Marek suggested.. of deferring the tlb
> >> >> flush until resume.  I'm wondering if we could combine that with
> >> >> putting the mmu in a stalled state when we suspend (and not resume the
> >> >> mmu until after the pending tlb flush)?
> >> >
> >> > I'm not sure that a stalled state is what we're after here, because we need
> >> > to take care to prevent any table walks if we've freed the underlying pages.
> >> > What we could try to do is disable the SMMU (put into global bypass) and
> >> > invalidate the TLB when performing a suspend operation, then we just ignore
> >> > invalidation whilst the clocks are stopped and, on resume, enable the SMMU
> >> > again.
> >>
> >> wouldn't stalled just block any memory transactions by device(s) using
> >> the context bank?  Putting it in bypass isn't really a good thing if
> >> there is any chance the device can sneak in a memory access before
> >> we've taking it back out of bypass (ie. makes gpu a giant userspace
> >> controlled root hole).
> >
> > If it doesn't deadlock, then yes, it will stall transactions. However, that
> > doesn't mean it necessarily prevents page table walks.
> 
> btw, I guess the concern about pagetable walk is that the unmap could
> have removed some sub-level of the pt that the tlb walk would hit?
> Would deferring freeing those pages help?

Could do, but it sounds like a lot of complication that I think we can fix
by making the suspend operation put the SMMU into a "clean" state.

> > Instead of bypass, we
> > could configure all the streams to terminate, but this race still worries me
> > somewhat. I thought that the SMMU would only be suspended if all of its
> > masters were suspended, so if the GPU wants to come out of suspend then the
> > SMMU should be resumed first.
> 
> I believe this should be true.. on the gpu side, I'm mostly trying to
> avoid having to power the gpu back on to free buffers.  (On the v4l2
> side, somewhere in the core videobuf code would also need to be made
> to wrap it's dma_unmap_sg() with pm_runtime_get/put()..)

Right, and we shouldn't have to resume it if we suspend it in a clean state,
with the TLBs invalidated.

Will
Rob Clark July 14, 2017, 7:34 p.m. UTC | #20
On Fri, Jul 14, 2017 at 3:01 PM, Will Deacon <will.deacon@arm.com> wrote:
> On Fri, Jul 14, 2017 at 02:25:45PM -0400, Rob Clark wrote:
>> On Fri, Jul 14, 2017 at 2:06 PM, Will Deacon <will.deacon@arm.com> wrote:
>> > On Fri, Jul 14, 2017 at 01:42:13PM -0400, Rob Clark wrote:
>> >> On Fri, Jul 14, 2017 at 1:07 PM, Will Deacon <will.deacon@arm.com> wrote:
>> >> > On Thu, Jul 13, 2017 at 10:55:10AM -0400, Rob Clark wrote:
>> >> >> On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote:
>> >> >> > Hi,
>> >> >> >
>> >> >> > On 7/13/2017 5:20 PM, Rob Clark wrote:
>> >> >> >> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote:
>> >> >> >>> Hi Vivek,
>> >> >> >>>
>> >> >> >>> On 7/13/2017 10:43 AM, Vivek Gautam wrote:
>> >> >> >>>> Hi Stephen,
>> >> >> >>>>
>> >> >> >>>>
>> >> >> >>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
>> >> >> >>>>> On 07/06, Vivek Gautam wrote:
>> >> >> >>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
>> >> >> >>>>>>   static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>> >> >> >>>>>>                    size_t size)
>> >> >> >>>>>>   {
>> >> >> >>>>>> -    struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>> >> >> >>>>>> +    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>> >> >> >>>>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>> >> >> >>>>>> +    size_t ret;
>> >> >> >>>>>>         if (!ops)
>> >> >> >>>>>>           return 0;
>> >> >> >>>>>>   -    return ops->unmap(ops, iova, size);
>> >> >> >>>>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
>> >> >> >>>>> Can these map/unmap ops be called from an atomic context? I seem
>> >> >> >>>>> to recall that being a problem before.
>> >> >> >>>>
>> >> >> >>>> That's something which was dropped in the following patch merged in master:
>> >> >> >>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
>> >> >> >>>>
>> >> >> >>>> Looks like we don't  need locks here anymore?
>> >> >> >>>
>> >> >> >>>  Apart from the locking, wonder why a explicit pm_runtime is needed
>> >> >> >>>  from unmap. Somehow looks like some path in the master using that
>> >> >> >>>  should have enabled the pm ?
>> >> >> >>>
>> >> >> >>
>> >> >> >> Yes, there are a bunch of scenarios where unmap can happen with
>> >> >> >> disabled master (but not in atomic context).  On the gpu side we
>> >> >> >> opportunistically keep a buffer mapping until the buffer is freed
>> >> >> >> (which can happen after gpu is disabled).  Likewise, v4l2 won't unmap
>> >> >> >> an exported dmabuf while some other driver holds a reference to it
>> >> >> >> (which can be dropped when the v4l2 device is suspended).
>> >> >> >>
>> >> >> >> Since unmap triggers tbl flush which touches iommu regs, the iommu
>> >> >> >> driver *definitely* needs a pm_runtime_get_sync().
>> >> >> >
>> >> >> >  Ok, with that being the case, there are two things here,
>> >> >> >
>> >> >> >  1) If the device links are still intact at these places where unmap is called,
>> >> >> >     then pm_runtime from the master would setup the all the clocks. That would
>> >> >> >     avoid reintroducing the locking indirectly here.
>> >> >> >
>> >> >> >  2) If not, then doing it here is the only way. But for both cases, since
>> >> >> >     the unmap can be called from atomic context, resume handler here should
>> >> >> >     avoid doing clk_prepare_enable , instead move the clk_prepare to the init.
>> >> >> >
>> >> >>
>> >> >> I do kinda like the approach Marek suggested.. of deferring the tlb
>> >> >> flush until resume.  I'm wondering if we could combine that with
>> >> >> putting the mmu in a stalled state when we suspend (and not resume the
>> >> >> mmu until after the pending tlb flush)?
>> >> >
>> >> > I'm not sure that a stalled state is what we're after here, because we need
>> >> > to take care to prevent any table walks if we've freed the underlying pages.
>> >> > What we could try to do is disable the SMMU (put into global bypass) and
>> >> > invalidate the TLB when performing a suspend operation, then we just ignore
>> >> > invalidation whilst the clocks are stopped and, on resume, enable the SMMU
>> >> > again.
>> >>
>> >> wouldn't stalled just block any memory transactions by device(s) using
>> >> the context bank?  Putting it in bypass isn't really a good thing if
>> >> there is any chance the device can sneak in a memory access before
>> >> >> we've taken it back out of bypass (i.e. it makes the gpu a giant
>> >> >> userspace-controlled root hole).
>> >
>> > If it doesn't deadlock, then yes, it will stall transactions. However, that
>> > doesn't mean it necessarily prevents page table walks.
>>
>> btw, I guess the concern about pagetable walk is that the unmap could
>> have removed some sub-level of the pt that the tlb walk would hit?
>> Would deferring freeing those pages help?
>
> Could do, but it sounds like a lot of complication that I think we can fix
> by making the suspend operation put the SMMU into a "clean" state.
>
>> > Instead of bypass, we
>> > could configure all the streams to terminate, but this race still worries me
>> > somewhat. I thought that the SMMU would only be suspended if all of its
>> > masters were suspended, so if the GPU wants to come out of suspend then the
>> > SMMU should be resumed first.
>>
>> I believe this should be true.. on the gpu side, I'm mostly trying to
>> avoid having to power the gpu back on to free buffers.  (On the v4l2
>> side, somewhere in the core videobuf code would also need to be made
>> >> to wrap its dma_unmap_sg() with pm_runtime_get/put()..)
>
> Right, and we shouldn't have to resume it if we suspend it in a clean state,
> with the TLBs invalidated.
>

I guess if the device_link() stuff ensured the attached device
(gpu/etc) was suspended before suspending the iommu, then I can't see
how temporarily putting the iommu in bypass would be a problem.  I
haven't looked at the device_link() stuff too closely, but the iommu
being resumed first and suspended last seems like the only thing that
would make sense.  I'm mostly just nervous about the iommu in bypass
vs the gpu, since userspace has so much control over what addresses the
gpu writes to / reads from, so getting it wrong w/ the iommu would be a
rather bad thing ;-)
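
To make the "clean state" idea concrete, a rough sketch of what such a
suspend hook could do, using register names from the arm-smmu driver of
this era -- this is only an illustration of Will's suggestion, not code
from the series, and it assumes a global TLB sync helper along the
lines of the driver's __arm_smmu_tlb_sync():

/*
 * Sketch only, not the posted patch: leave the SMMU "clean" on
 * suspend -- invalidate the TLBs, then stop translating.
 */
static void arm_smmu_suspend_clean(struct arm_smmu_device *smmu)
{
        void __iomem *gr0_base = ARM_SMMU_GR0(smmu);
        u32 reg;

        /* Invalidate all non-secure, non-hyp TLB entries and sync */
        writel_relaxed(0, gr0_base + ARM_SMMU_GR0_TLBIALLNSNH);
        __arm_smmu_tlb_sync(smmu);

        /*
         * Global bypass (sCR0.CLIENTPD) is the cheap option, but it is
         * exactly the race discussed above.  The "terminate" variant
         * would instead leave the SMMU enabled and point the S2CRs at
         * S2CR_TYPE_FAULT.
         */
        reg = readl_relaxed(gr0_base + ARM_SMMU_GR0_sCR0);
        reg |= sCR0_CLIENTPD;
        writel_relaxed(reg, gr0_base + ARM_SMMU_GR0_sCR0);
}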

BR,
-R
Will Deacon July 14, 2017, 7:36 p.m. UTC | #21
On Fri, Jul 14, 2017 at 03:34:42PM -0400, Rob Clark wrote:
> [...]
> 
> I guess if the device_link() stuff ensured the attached device
> (gpu/etc) was suspended before suspending the iommu, then I can't see
> how temporarily putting the iommu in bypass would be a problem.  I
> haven't looked at the device_link() stuff too closely, but the iommu
> being resumed first and suspended last seems like the only thing that
> would make sense.  I'm mostly just nervous about the iommu in bypass
> vs the gpu, since userspace has so much control over what addresses the
> gpu writes to / reads from, so getting it wrong w/ the iommu would be a
> rather bad thing ;-)

Right, but we can also configure it to terminate if you don't want bypass.

Will
Rob Clark July 14, 2017, 7:39 p.m. UTC | #22
On Fri, Jul 14, 2017 at 3:36 PM, Will Deacon <will.deacon@arm.com> wrote:
> On Fri, Jul 14, 2017 at 03:34:42PM -0400, Rob Clark wrote:
>> [...]
>
> Right, but we can also configure it to terminate if you don't want bypass.
>

ok, terminate wfm (works for me)

BR,
-R
Sricharan Ramabadhran July 17, 2017, 11:46 a.m. UTC | #23
Hi,

On 7/15/2017 1:09 AM, Rob Clark wrote:
>> [...]
>> Right, but we can also configure it to terminate if you don't want bypass.
>>
> 

 But one thing here is, with device links in the picture, iommu suspend/resume
 is called along with the master's. That means we can end up invalidating even
 active entries on the suspend path, if suspend is going to put the smmu into
 a clean state every time. So having the masters follow the pm_runtime
 sequence before a dma_map/unmap operation seems better.
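
 For reference, the device link in question is the one this series sets
 up between the master and the smmu; a minimal sketch of that pairing
 (assuming the device_link_add() API of this era) looks like:

/*
 * Sketch, not the posted patch: tie the master's runtime PM to the
 * SMMU's.  With DL_FLAG_PM_RUNTIME, the PM core resumes the supplier
 * (SMMU) before the consumer (master) and suspends it after.
 */
static int arm_smmu_link_master(struct device *master_dev,
                                struct arm_smmu_device *smmu)
{
        struct device_link *link;

        link = device_link_add(master_dev, smmu->dev, DL_FLAG_PM_RUNTIME);
        if (!link)
                return -ENODEV;

        return 0;
}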

Regards,
 Sricharan
Sricharan Ramabadhran July 17, 2017, 12:28 p.m. UTC | #24
Hi,

On 7/17/2017 5:16 PM, Sricharan R wrote:
> [...]

 Also, for the use case of unmap being done from masters like the GPU while it
 is already suspended, following Marek's approach of checking the smmu state in
 unmap and deferring the TLB flush till resume seems the correct way. All of
 the above is true if we want to use device_link.

Regards,
 Sricharan
Vivek Gautam July 24, 2017, 3:31 p.m. UTC | #25
Hi,

On Mon, Jul 17, 2017 at 5:58 PM, Sricharan R <sricharan@codeaurora.org> wrote:
> [...]
>
>  Also, for the use case of unmap being done from masters like the GPU while it
>  is already suspended, following Marek's approach of checking the smmu state in
>  unmap and deferring the TLB flush till resume seems the correct way. All of
>  the above is true if we want to use device_link.

This sounds like a plan.
I have a WIP version in which we simply check in 'unmap' whether the mmu is
already suspended. If it is, we save the unmap request (domain, iova) and
defer it.
On resume, we check the pending TLB flush requests and replay the deferred
unmaps.
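
Roughly something like the below -- all names are hypothetical, this is
just a sketch of the WIP, with the deferred list and lock as new fields
on arm_smmu_device:

/* All names hypothetical -- a sketch of the WIP, not the series. */
struct arm_smmu_deferred_unmap {
        struct list_head        node;
        struct iommu_domain     *domain;
        unsigned long           iova;
        size_t                  size;
};

static size_t arm_smmu_unmap_or_defer(struct arm_smmu_device *smmu,
                                      struct iommu_domain *domain,
                                      unsigned long iova, size_t size)
{
        struct arm_smmu_deferred_unmap *d;
        unsigned long flags;

        /*
         * Racy against a concurrent resume; a real implementation
         * would need to close that hole (e.g. under deferred_lock).
         */
        if (!pm_runtime_suspended(smmu->dev))
                return arm_smmu_unmap(domain, iova, size);

        d = kzalloc(sizeof(*d), GFP_ATOMIC);
        if (!d)
                return 0;

        d->domain = domain;
        d->iova = iova;
        d->size = size;

        spin_lock_irqsave(&smmu->deferred_lock, flags);
        list_add_tail(&d->node, &smmu->deferred_unmaps);
        spin_unlock_irqrestore(&smmu->deferred_lock, flags);

        return size;
}

/* Called from the runtime-resume handler once the clocks are back on */
static void arm_smmu_replay_deferred(struct arm_smmu_device *smmu)
{
        struct arm_smmu_deferred_unmap *d, *tmp;
        unsigned long flags;
        LIST_HEAD(list);

        spin_lock_irqsave(&smmu->deferred_lock, flags);
        list_splice_init(&smmu->deferred_unmaps, &list);
        spin_unlock_irqrestore(&smmu->deferred_lock, flags);

        list_for_each_entry_safe(d, tmp, &list, node) {
                arm_smmu_unmap(d->domain, d->iova, d->size);
                list_del(&d->node);
                kfree(d);
        }
}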

I will give this a try for the venus use case.

Regards
Vivek

Vivek Gautam Aug. 7, 2017, 8:27 a.m. UTC | #26
On Thu, Jul 13, 2017 at 5:20 PM, Rob Clark <robdclark@gmail.com> wrote:
> [...]
>
> Yes, there are a bunch of scenarios where unmap can happen with
> disabled master (but not in atomic context).

I would like to understand whether there is a situation where an unmap is
called in atomic context without an enabled master.

Let's say all the unmap calls in atomic context happen only from the
master's context (in which case the device link should take care of the
pm state of the smmu), and the only unmaps that happen in non-atomic
context are the ones with the master disabled. In such a case, doesn't it
make sense to distinguish the atomic/non-atomic context and add
pm_runtime_get_sync()/put_sync() only for the non-atomic context, since
that would be the one with the master disabled?


Thanks
Vivek

Rob Clark Aug. 7, 2017, 12:29 p.m. UTC | #27
On Mon, Aug 7, 2017 at 4:27 AM, Vivek Gautam
<vivek.gautam@codeaurora.org> wrote:
>> [...]
>> Yes, there are a bunch of scenarios where unmap can happen with
>> disabled master (but not in atomic context).
>
> I would like to understand whether there is a situation where an unmap is
> called in atomic context without an enabled master.
>
> Let's say all the unmap calls in atomic context happen only from the
> master's context (in which case the device link should take care of the
> pm state of the smmu), and the only unmaps that happen in non-atomic
> context are the ones with the master disabled. In such a case, doesn't it
> make sense to distinguish the atomic/non-atomic context and add
> pm_runtime_get_sync()/put_sync() only for the non-atomic context, since
> that would be the one with the master disabled?
>

At least drm/msm needs to hold obj->lock (a mutex) in unmap, so it
won't unmap anything in atomic ctx (but it can unmap w/ master
disabled).  I can't really comment about other non-gpu drivers.  It
seems like a reasonable constraint that either the master is enabled
or we are not in atomic ctx.

Currently we actually wrap unmap w/ pm_runtime_get/put_sync(), but I'd
like to drop that to avoid powering up the gpu.
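
(That wrapping amounts to something like the following sketch -- the
helper is hypothetical, the pm_runtime/iommu calls are the real ones:)

/* Sketch of the current drm/msm behaviour described above: powering
 * the GPU powers the SMMU too (via the device link), which is the
 * power-up we would like to avoid for a plain unmap.
 */
static size_t unmap_with_master_powered(struct device *master_dev,
                                        struct iommu_domain *domain,
                                        unsigned long iova, size_t size)
{
        size_t unmapped;

        pm_runtime_get_sync(master_dev);
        unmapped = iommu_unmap(domain, iova, size);
        pm_runtime_put_sync(master_dev);

        return unmapped;
}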

BR,
-R
Vivek Gautam Nov. 14, 2017, 6:30 p.m. UTC | #28
Hi,


On Mon, Aug 7, 2017 at 5:59 PM, Rob Clark <robdclark@gmail.com> wrote:
> [...]
>
> At least drm/msm needs to hold obj->lock (a mutex) in unmap, so it
> won't unmap anything in atomic ctx (but it can unmap w/ master
> disabled).  I can't really comment about other non-gpu drivers.  It
> seems like a reasonable constraint that either the master is enabled
> or we are not in atomic ctx.
>
> Currently we actually wrap unmap w/ pm_runtime_get/put_sync(), but I'd
> like to drop that to avoid powering up the gpu.

Since deferring the TLB maintenance doesn't look like the best approach [1],
how about powering up only the smmu from the client devices, such as the
GPU, in the unmap path? Then we won't need to add pm_runtime_get/put() calls
in arm_smmu_unmap().

The client device can use something like pm_runtime_get_suppliers(), since
we already have the device link in place with this patch series. This should
power on the supplier (which is the smmu) without turning on the consumer
(such as the GPU).

pm_runtime_get_suppliers(), however, is not exported at the moment.
Would it be useful to export this API and use it in the drivers?

Adding Rafael J. Wysocki for suggestions on the pm_runtime_get_suppliers() API.
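
A sketch of the proposal, assuming pm_runtime_get_suppliers() and
pm_runtime_put_suppliers() were exported (which, as noted, they are not
today):

/* Sketch only: power just the supplier (the SMMU) around an unmap,
 * leaving the consumer (the GPU) suspended.  Depends on exporting
 * pm_runtime_get_suppliers()/pm_runtime_put_suppliers().
 */
static size_t master_unmap_powering_smmu(struct device *master_dev,
                                         struct iommu_domain *domain,
                                         unsigned long iova, size_t size)
{
        size_t unmapped;

        pm_runtime_get_suppliers(master_dev);
        unmapped = iommu_unmap(domain, iova, size);
        pm_runtime_put_suppliers(master_dev);

        return unmapped;
}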


[1] https://patchwork.kernel.org/patch/9876489/


Best regards
Vivek

Stephen Boyd Nov. 27, 2017, 10:22 p.m. UTC | #29
On 11/15, Vivek Gautam wrote:
> [...]
>
> Since deferring the TLB maintenance doesn't look like the best approach [1],
> how about powering up only the smmu from the client devices, such as the
> GPU, in the unmap path? Then we won't need to add pm_runtime_get/put() calls
> in arm_smmu_unmap().
>
> The client device can use something like pm_runtime_get_suppliers(), since
> we already have the device link in place with this patch series. This should
> power on the supplier (which is the smmu) without turning on the consumer
> (such as the GPU).
>
> pm_runtime_get_suppliers(), however, is not exported at the moment.
> Would it be useful to export this API and use it in the drivers?
> 

I'm not sure pm_runtime_get_suppliers() is correct either. That
feels like we're relying on the GPU driver knowing the internal
details of how the device links are configured.

Is there some way to have the GPU driver know in its runtime PM
resume hook that it doesn't need to be powered on because it
isn't actively drawing anything or processing commands? I'm
thinking of the code calling pm_runtime_get(), as proposed, around
the IOMMU unmap path in the GPU driver, and then having the
runtime PM resume hook in the GPU driver return some special
value to indicate that it didn't really resume because it didn't
need to, treating the device as runtime suspended but without
returning an error. The runtime PM core could then keep track of
that and try to power the GPU on again when another
pm_runtime_get() is called on the GPU device.

This keeps the consumer API the same, always pm_runtime_get(),
but leaves the device driver logic of what to do when the GPU
doesn't need to power on in the runtime PM hook, where the driver
has all the information.
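
A rough sketch of that idea -- note the special return value is purely
hypothetical, the runtime PM core has no such convention today:

/* Purely hypothetical: RPM_RESUME_DEFERRED does not exist in the
 * runtime PM core; it stands in for the "resumed without powering
 * on, still treat as suspended" convention proposed above.
 */
#define RPM_RESUME_DEFERRED     1

static int gpu_runtime_resume(struct device *dev)
{
        struct gpu_device *gpu = dev_get_drvdata(dev);  /* hypothetical type */

        /*
         * Nothing to draw and no commands queued: the device link has
         * already resumed the SMMU, which is all an unmap needs, so
         * skip powering the GPU itself.
         */
        if (!gpu_has_pending_work(gpu))         /* hypothetical helper */
                return RPM_RESUME_DEFERRED;

        return gpu_power_on(gpu);               /* hypothetical helper */
}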
Rob Clark Nov. 27, 2017, 11:43 p.m. UTC | #30
On Mon, Nov 27, 2017 at 5:22 PM, Stephen Boyd <sboyd@codeaurora.org> wrote:
> On 11/15, Vivek Gautam wrote:
>> Hi,
>>
>>
>> On Mon, Aug 7, 2017 at 5:59 PM, Rob Clark <robdclark@gmail.com> wrote:
>> > On Mon, Aug 7, 2017 at 4:27 AM, Vivek Gautam
>> > <vivek.gautam@codeaurora.org> wrote:
>> >> On Thu, Jul 13, 2017 at 5:20 PM, Rob Clark <robdclark@gmail.com> wrote:
>> >>> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote:
>> >>>> Hi Vivek,
>> >>>>
>> >>>> On 7/13/2017 10:43 AM, Vivek Gautam wrote:
>> >>>>> Hi Stephen,
>> >>>>>
>> >>>>>
>> >>>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
>> >>>>>> On 07/06, Vivek Gautam wrote:
>> >>>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
>> >>>>>>>   static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>> >>>>>>>                    size_t size)
>> >>>>>>>   {
>> >>>>>>> -    struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>> >>>>>>> +    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>> >>>>>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>> >>>>>>> +    size_t ret;
>> >>>>>>>         if (!ops)
>> >>>>>>>           return 0;
>> >>>>>>>   -    return ops->unmap(ops, iova, size);
>> >>>>>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
>> >>>>>> Can these map/unmap ops be called from an atomic context? I seem
>> >>>>>> to recall that being a problem before.
>> >>>>>
>> >>>>> That's something which was dropped in the following patch merged in master:
>> >>>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
>> >>>>>
>> >>>>> Looks like we don't  need locks here anymore?
>> >>>>
>> >>>>  Apart from the locking, wonder why a explicit pm_runtime is needed
>> >>>>  from unmap. Somehow looks like some path in the master using that
>> >>>>  should have enabled the pm ?
>> >>>>
>> >>>
>> >>> Yes, there are a bunch of scenarios where unmap can happen with
>> >>> disabled master (but not in atomic context).
>> >>
>> >> I would like to understand whether there is a situation where an unmap is
>> >> called in atomic context without an enabled master?
>> >>
>> >> Let's say we have the case where all the unmap calls in atomic context happen
>> >> only from the master's context (in which case the device link should
>> >> take care of
>> >> the pm state of smmu), and the only unmap that happen in non-atomic context
>> >> is the one with master disabled. In such a case doesn it make sense to
>> >> distinguish
>> >> the atomic/non-atomic context and add pm_runtime_get_sync()/put_sync() only
>> >> for the non-atomic context since that would be the one with master disabled.
>> >>
>> >
>> > At least drm/msm needs to hold obj->lock (a mutex) in unmap, so it
>> > won't unmap anything in atomic ctx (but it can unmap w/ master
>> > disabled).  I can't really comment about other non-gpu drivers.  It
>> > seems like a reasonable constraint that either master is enabled or
>> > not in atomic ctx.
>> >
>> > Currently we actually wrap unmap w/ pm_runtime_get/put_sync(), but I'd
>> > like to drop that to avoid powering up the gpu.
>>
>> Since deferring the TLB maintenance doesn't look like the best approach [1],
>> how about if we try to power up only the smmu from the different client
>> devices, such as the GPU, in the unmap path? Then we won't need to add
>> pm_runtime_get/put() calls in arm_smmu_unmap().
>>
>> The client device can use something like - pm_runtime_get_supplier() since
>> we already have the device link in place with this patch series. This should
>> power-on the supplier (which is smmu) without turning on the consumer
>> (such as GPU).
>>
>> pm_runtime_get_supplier() however is not exported at this moment.
>> Would it be useful to export this API and use it in the drivers?
>>
>
> I'm not sure pm_runtime_get_supplier() is correct either. That
> feels like we're relying on the GPU driver knowing the internal
> details of how the device links are configured.
>

what does pm_runtime_get_supplier() do if the IOMMU driver hasn't set up
the device link?  If it is a no-op, then I guess the GPU driver calling
pm_runtime_get_supplier() seems reasonable, and less annoying than
having special cases in pm_resume path.. I don't feel too bad about
having "just in case" get/put_supplier() calls in the unmap path.

Also, presumably we still want to avoid powering up the GPU even if we
short circuit the firmware loading and the rest of "booting up the GPU"..
since presumably the GPU draws somewhat more power than the IOMMU..
having the pm_resume/suspend path know about the diff between waking
up / suspending the iommu and itself doesn't really feel less-bad than
just doing "just in case" get/put_supplier() calls.

BR,
-R

> Is there some way to have the GPU driver know in its runtime PM
> resume hook that it doesn't need to be powered on because it
> isn't actively drawing anything or processing commands? I'm
> thinking of the code calling pm_runtime_get() as proposed around
> the IOMMU unmap path in the GPU driver and then having the
> runtime PM resume hook in the GPU driver return some special
> value to indicate that it didn't really resume because it didn't
> need to and to treat the device as runtime suspended but not
> return an error. Then the runtime PM core can keep track of that
> and try to power the GPU on again when another pm_runtime_get()
> is called on the GPU device.
>
> This keeps the consumer API the same, always pm_runtime_get(),
> but leaves the device driver logic of what to do when the GPU
> doesn't need to power on to the runtime PM hook where the driver
> has all the information.
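
For reference, the device link the series puts in place is the driver
core kind created with device_link_add(); a minimal sketch of such a
link being set up from the SMMU's add-device path, with the helper name
and error handling made up for illustration:

#include <linux/device.h>

/* Sketch: make the master a consumer of the SMMU so runtime-PM state
 * propagates from consumer to supplier; DL_FLAG_PM_RUNTIME ties the
 * link into runtime PM.
 */
static int arm_smmu_link_master(struct device *master,
				struct arm_smmu_device *smmu)
{
	struct device_link *link;

	link = device_link_add(master, smmu->dev, DL_FLAG_PM_RUNTIME);
	if (!link) {
		dev_err(smmu->dev, "unable to link to master %s\n",
			dev_name(master));
		return -ENODEV;
	}

	return 0;
}

With such a link, runtime-resuming the master resumes the SMMU first;
the open question in this thread is how to power only the supplier side.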
Vivek Gautam Nov. 28, 2017, 1:43 p.m. UTC | #31
On 11/28/2017 05:13 AM, Rob Clark wrote:
> On Mon, Nov 27, 2017 at 5:22 PM, Stephen Boyd <sboyd@codeaurora.org> wrote:
>> On 11/15, Vivek Gautam wrote:
>>> Hi,
>>>
>>>
>>> On Mon, Aug 7, 2017 at 5:59 PM, Rob Clark <robdclark@gmail.com> wrote:
>>>> On Mon, Aug 7, 2017 at 4:27 AM, Vivek Gautam
>>>> <vivek.gautam@codeaurora.org> wrote:
>>>>> On Thu, Jul 13, 2017 at 5:20 PM, Rob Clark <robdclark@gmail.com> wrote:
>>>>>> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote:
>>>>>>> Hi Vivek,
>>>>>>>
>>>>>>> On 7/13/2017 10:43 AM, Vivek Gautam wrote:
>>>>>>>> Hi Stephen,
>>>>>>>>
>>>>>>>>
>>>>>>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
>>>>>>>>> On 07/06, Vivek Gautam wrote:
>>>>>>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
>>>>>>>>>>    static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
>>>>>>>>>>                     size_t size)
>>>>>>>>>>    {
>>>>>>>>>> -    struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
>>>>>>>>>> +    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
>>>>>>>>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>>>>>>>>>> +    size_t ret;
>>>>>>>>>>          if (!ops)
>>>>>>>>>>            return 0;
>>>>>>>>>>    -    return ops->unmap(ops, iova, size);
>>>>>>>>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
>>>>>>>>> Can these map/unmap ops be called from an atomic context? I seem
>>>>>>>>> to recall that being a problem before.
>>>>>>>> That's something which was dropped in the following patch merged in master:
>>>>>>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
>>>>>>>>
>>>>>>>> Looks like we don't need locks here anymore?
>>>>>>>   Apart from the locking, wonder why an explicit pm_runtime is needed
>>>>>>>   from unmap. Somehow it looks like some path in the master using that
>>>>>>>   should have enabled the pm?
>>>>>>>
>>>>>> Yes, there are a bunch of scenarios where unmap can happen with
>>>>>> disabled master (but not in atomic context).
>>>>> I would like to understand whether there is a situation where an unmap is
>>>>> called in atomic context without an enabled master?
>>>>>
>>>>> Let's say we have the case where all the unmap calls in atomic context happen
>>>>> only from the master's context (in which case the device link should
>>>>> take care of
>>>>> the pm state of smmu), and the only unmap that happens in non-atomic context
>>>>> is the one with master disabled. In such a case doesn't it make sense to
>>>>> distinguish
>>>>> the atomic/non-atomic context and add pm_runtime_get_sync()/put_sync() only
>>>>> for the non-atomic context since that would be the one with master disabled.
>>>>>
>>>> At least drm/msm needs to hold obj->lock (a mutex) in unmap, so it
>>>> won't unmap anything in atomic ctx (but it can unmap w/ master
>>>> disabled).  I can't really comment about other non-gpu drivers.  It
>>>> seems like a reasonable constraint that either master is enabled or
>>>> not in atomic ctx.
>>>>
>>>> Currently we actually wrap unmap w/ pm_runtime_get/put_sync(), but I'd
>>>> like to drop that to avoid powering up the gpu.
>>> Since deferring the TLB maintenance doesn't look like the best approach [1],
>>> how about if we try to power up only the smmu from the different client
>>> devices, such as the GPU, in the unmap path? Then we won't need to add
>>> pm_runtime_get/put() calls in arm_smmu_unmap().
>>>
>>> The client device can use something like - pm_runtime_get_supplier() since
>>> we already have the device link in place with this patch series. This should
>>> power-on the supplier (which is smmu) without turning on the consumer
>>> (such as GPU).
>>>
>>> pm_runtime_get_supplier() however is not exported at this moment.
>>> Would it be useful to export this API and use it in the drivers?
>>>
>> I'm not sure pm_runtime_get_supplier() is correct either. That
>> feels like we're relying on the GPU driver knowing the internal
>> details of how the device links are configured.
>>
> what does pm_runtime_get_supplier() do if the IOMMU driver hasn't set up
> the device link?

It will be a no-op.

> If it is a no-op, then I guess the GPU driver calling
> pm_runtime_get_supplier() seems reasonable, and less annoying than
> having special cases in pm_resume path.. I don't feel too bad about
> having "just in case" get/put_supplier() calls in the unmap path.
>
> Also, presumably we still want to avoid powering up the GPU even if we
> short circuit the firmware loading and the rest of "booting up the GPU"..
> since presumably the GPU draws somewhat more power than the IOMMU..
> having the pm_resume/suspend path know about the diff between waking
> up / suspending the iommu and itself doesn't really feel less-bad than
> just doing "just in case" get/put_supplier() calls.

If it sounds okay, then I can send a patch that exports the
pm_runtime_get/put_suppliers() APIs.


Best regards
Vivek

> BR,
> -R
>
>> Is there some way to have the GPU driver know in its runtime PM
>> resume hook that it doesn't need to be powered on because it
>> isn't actively drawing anything or processing commands? I'm
>> thinking of the code calling pm_runtime_get() as proposed around
>> the IOMMU unmap path in the GPU driver and then having the
>> runtime PM resume hook in the GPU driver return some special
>> value to indicate that it didn't really resume because it didn't
>> need to and to treat the device as runtime suspended but not
>> return an error. Then the runtime PM core can keep track of that
>> and try to power the GPU on again when another pm_runtime_get()
>> is called on the GPU device.
>>
>> This keeps the consumer API the same, always pm_runtime_get(),
>> but leaves the device driver logic of what to do when the GPU
>> doesn't need to power on to the runtime PM hook where the driver
>> has all the information.
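
A rough sketch of how a client driver could then bracket its unmap path
with the exported helpers, waking only the supplier (the SMMU) rather
than the GPU itself; the wrapper and its arguments below are
illustrative, and pm_runtime_get/put_suppliers() are no-ops when no
device link exists:

#include <linux/iommu.h>
#include <linux/pm_runtime.h>

/* Sketch: power the SMMU via the device link's supplier side without
 * runtime-resuming the GPU.
 */
static size_t gpu_unmap_range(struct device *gpu_dev,
			      struct iommu_domain *domain,
			      unsigned long iova, size_t size)
{
	size_t unmapped;

	pm_runtime_get_suppliers(gpu_dev);
	unmapped = iommu_unmap(domain, iova, size);
	pm_runtime_put_suppliers(gpu_dev);

	return unmapped;
}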
Rob Clark Nov. 28, 2017, 8:05 p.m. UTC | #32
On Tue, Nov 28, 2017 at 8:43 AM, Vivek Gautam
<vivek.gautam@codeaurora.org> wrote:
>
>
> On 11/28/2017 05:13 AM, Rob Clark wrote:
>>
>> On Mon, Nov 27, 2017 at 5:22 PM, Stephen Boyd <sboyd@codeaurora.org>
>> wrote:
>>>
>>> On 11/15, Vivek Gautam wrote:
>>>>
>>>> Hi,
>>>>
>>>>
>>>> On Mon, Aug 7, 2017 at 5:59 PM, Rob Clark <robdclark@gmail.com> wrote:
>>>>>
>>>>> On Mon, Aug 7, 2017 at 4:27 AM, Vivek Gautam
>>>>> <vivek.gautam@codeaurora.org> wrote:
>>>>>>
>>>>>> On Thu, Jul 13, 2017 at 5:20 PM, Rob Clark <robdclark@gmail.com>
>>>>>> wrote:
>>>>>>>
>>>>>>> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan
>>>>>>> R <sricharan@codeaurora.org> wrote:
>>>>>>>>
>>>>>>>> Hi Vivek,
>>>>>>>>
>>>>>>>> On 7/13/2017 10:43 AM, Vivek Gautam wrote:
>>>>>>>>>
>>>>>>>>> Hi Stephen,
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote:
>>>>>>>>>>
>>>>>>>>>> On 07/06, Vivek Gautam wrote:
>>>>>>>>>>>
>>>>>>>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct
>>>>>>>>>>> iommu_domain *domain, unsigned long iova,
>>>>>>>>>>>    static size_t arm_smmu_unmap(struct iommu_domain *domain,
>>>>>>>>>>> unsigned long iova,
>>>>>>>>>>>                     size_t size)
>>>>>>>>>>>    {
>>>>>>>>>>> -    struct io_pgtable_ops *ops =
>>>>>>>>>>> to_smmu_domain(domain)->pgtbl_ops;
>>>>>>>>>>> +    struct arm_smmu_domain *smmu_domain =
>>>>>>>>>>> to_smmu_domain(domain);
>>>>>>>>>>> +    struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
>>>>>>>>>>> +    size_t ret;
>>>>>>>>>>>          if (!ops)
>>>>>>>>>>>            return 0;
>>>>>>>>>>>    -    return ops->unmap(ops, iova, size);
>>>>>>>>>>> +    pm_runtime_get_sync(smmu_domain->smmu->dev);
>>>>>>>>>>
>>>>>>>>>> Can these map/unmap ops be called from an atomic context? I seem
>>>>>>>>>> to recall that being a problem before.
>>>>>>>>>
>>>>>>>>> That's something which was dropped in the following patch merged in
>>>>>>>>> master:
>>>>>>>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock
>>>>>>>>>
>>>>>>>>> Looks like we don't need locks here anymore?
>>>>>>>>
>>>>>>>>   Apart from the locking, wonder why an explicit pm_runtime is needed
>>>>>>>>   from unmap. Somehow it looks like some path in the master using that
>>>>>>>>   should have enabled the pm?
>>>>>>>>
>>>>>>> Yes, there are a bunch of scenarios where unmap can happen with
>>>>>>> disabled master (but not in atomic context).
>>>>>>
>>>>>> I would like to understand whether there is a situation where an unmap
>>>>>> is
>>>>>> called in atomic context without an enabled master?
>>>>>>
>>>>>> Let's say we have the case where all the unmap calls in atomic context
>>>>>> happen
>>>>>> only from the master's context (in which case the device link should
>>>>>> take care of
>>>>>> the pm state of smmu), and the only unmap that happens in non-atomic
>>>>>> context
>>>>>> is the one with master disabled. In such a case doesn't it make sense to
>>>>>> distinguish
>>>>>> the atomic/non-atomic context and add pm_runtime_get_sync()/put_sync()
>>>>>> only
>>>>>> for the non-atomic context since that would be the one with master
>>>>>> disabled.
>>>>>>
>>>>> At least drm/msm needs to hold obj->lock (a mutex) in unmap, so it
>>>>> won't unmap anything in atomic ctx (but it can unmap w/ master
>>>>> disabled).  I can't really comment about other non-gpu drivers.  It
>>>>> seems like a reasonable constraint that either master is enabled or
>>>>> not in atomic ctx.
>>>>>
>>>>> Currently we actually wrap unmap w/ pm_runtime_get/put_sync(), but I'd
>>>>> like to drop that to avoid powering up the gpu.
>>>>
>>>> Since deferring the TLB maintenance doesn't look like the best
>>>> approach [1],
>>>> how about if we try to power up only the smmu from the different
>>>> client devices, such as the GPU, in the unmap path? Then we won't
>>>> need to add pm_runtime_get/put() calls in arm_smmu_unmap().
>>>>
>>>> The client device can use something like - pm_runtime_get_supplier()
>>>> since
>>>> we already have the device link in place with this patch series. This
>>>> should
>>>> power-on the supplier (which is smmu) without turning on the consumer
>>>> (such as GPU).
>>>>
>>>> pm_runtime_get_supplier() however is not exported at this moment.
>>>> Would it be useful to export this API and use it in the drivers?
>>>>
>>> I'm not sure pm_runtime_get_supplier() is correct either. That
>>> feels like we're relying on the GPU driver knowing the internal
>>> details of how the device links are configured.
>>>
>> what does pm_runtime_get_supplier() do if the IOMMU driver hasn't set up
>> the device link?
>
>
> It will be a no-op.
>
>> If it is a no-op, then I guess the GPU driver calling
>> pm_runtime_get_supplier() seems reasonable, and less annoying than
>> having special cases in pm_resume path.. I don't feel too bad about
>> having "just in case" get/put_supplier() calls in the unmap path.
>>
>> Also, presumably we still want to avoid powering up the GPU even if we
>> short circuit the firmware loading and the rest of "booting up the GPU"..
>> since presumably the GPU draws somewhat more power than the IOMMU..
>> having the pm_resume/suspend path know about the diff between waking
>> up / suspending the iommu and itself doesn't really feel less-bad than
>> just doing "just in case" get/put_supplier() calls.
>
>
> If it sounds okay, then I can send a patch that exports the
> pm_runtime_get/put_suppliers() APIs.
>

sounds good to me

BR,
-R


>
> Best regards
> Vivek
>
>> BR,
>> -R
>>
>>> Is there some way to have the GPU driver know in its runtime PM
>>> resume hook that it doesn't need to be powered on because it
>>> isn't actively drawing anything or processing commands? I'm
>>> thinking of the code calling pm_runtime_get() as proposed around
>>> the IOMMU unmap path in the GPU driver and then having the
>>> runtime PM resume hook in the GPU driver return some special
>>> value to indicate that it didn't really resume because it didn't
>>> need to and to treat the device as runtime suspended but not
>>> return an error. Then the runtime PM core can keep track of that
>>> and try to power the GPU on again when another pm_runtime_get()
>>> is called on the GPU device.
>>>
>>> This keeps the consumer API the same, always pm_runtime_get(),
>>> but leaves the device driver logic of what to do when the GPU
>>> doesn't need to power on to the runtime PM hook where the driver
>>> has all the information.
diff mbox

Patch

diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index bfe613f8939c..ddbfa8ab69e6 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -897,11 +897,15 @@  static void arm_smmu_destroy_domain_context(struct iommu_domain *domain)
 	struct arm_smmu_device *smmu = smmu_domain->smmu;
 	struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
 	void __iomem *cb_base;
-	int irq;
+	int ret, irq;
 
 	if (!smmu || domain->type == IOMMU_DOMAIN_IDENTITY)
 		return;
 
+	ret = pm_runtime_get_sync(smmu->dev);
+	if (ret)
+		return;
+
 	/*
 	 * Disable the context bank and free the page tables before freeing
 	 * it.
@@ -916,6 +920,8 @@  static void arm_smmu_destroy_domain_context(struct iommu_domain *domain)
 
 	free_io_pgtable_ops(smmu_domain->pgtbl_ops);
 	__arm_smmu_free_bitmap(smmu->context_map, cfg->cbndx);
+
+	pm_runtime_put_sync(smmu->dev);
 }
 
 static struct iommu_domain *arm_smmu_domain_alloc(unsigned type)
@@ -1231,12 +1237,18 @@  static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
 static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
 			     size_t size)
 {
-	struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops;
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops;
+	size_t ret;
 
 	if (!ops)
 		return 0;
 
-	return ops->unmap(ops, iova, size);
+	pm_runtime_get_sync(smmu_domain->smmu->dev);
+	ret = ops->unmap(ops, iova, size);
+	pm_runtime_put_sync(smmu_domain->smmu->dev);
+
+	return ret;
 }
 
 static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
@@ -1377,12 +1389,20 @@  static int arm_smmu_add_device(struct device *dev)
 	while (i--)
 		cfg->smendx[i] = INVALID_SMENDX;
 
-	ret = arm_smmu_master_alloc_smes(dev);
+	ret = pm_runtime_get_sync(smmu->dev);
 	if (ret)
 		goto out_cfg_free;
 
+	ret = arm_smmu_master_alloc_smes(dev);
+	if (ret) {
+		pm_runtime_put_sync(smmu->dev);
+		goto out_cfg_free;
+	}
+
 	iommu_device_link(&smmu->iommu, dev);
 
+	pm_runtime_put_sync(smmu->dev);
+
 	return 0;
 
 out_cfg_free:
@@ -1397,7 +1417,7 @@  static void arm_smmu_remove_device(struct device *dev)
 	struct iommu_fwspec *fwspec = dev->iommu_fwspec;
 	struct arm_smmu_master_cfg *cfg;
 	struct arm_smmu_device *smmu;
-
+	int ret;
 
 	if (!fwspec || fwspec->ops != &arm_smmu_ops)
 		return;
@@ -1405,8 +1425,21 @@  static void arm_smmu_remove_device(struct device *dev)
 	cfg  = fwspec->iommu_priv;
 	smmu = cfg->smmu;
 
+	/*
+	 * The device link between the master device and
+	 * smmu is already purged at this point.
+	 * So enable the power to smmu explicitly.
+	 */
+
+	ret = pm_runtime_get_sync(smmu->dev);
+	if (ret)
+		return;
+
 	iommu_device_unlink(&smmu->iommu, dev);
 	arm_smmu_master_free_smes(fwspec);
+
+	pm_runtime_put_sync(smmu->dev);
+
 	iommu_group_remove_device(dev);
 	kfree(fwspec->iommu_priv);
 	iommu_fwspec_free(dev);
@@ -2103,6 +2136,13 @@  static int arm_smmu_device_probe(struct platform_device *pdev)
 	if (err)
 		return err;
 
+	platform_set_drvdata(pdev, smmu);
+	pm_runtime_enable(dev);
+
+	err = pm_runtime_get_sync(dev);
+	if (err)
+		return err;
+
 	err = arm_smmu_device_cfg_probe(smmu);
 	if (err)
 		return err;
@@ -2144,9 +2184,9 @@  static int arm_smmu_device_probe(struct platform_device *pdev)
 		return err;
 	}
 
-	platform_set_drvdata(pdev, smmu);
 	arm_smmu_device_reset(smmu);
 	arm_smmu_test_smr_masks(smmu);
+	pm_runtime_put_sync(dev);
 
 	/*
 	 * For ACPI and generic DT bindings, an SMMU will be probed before
@@ -2185,6 +2225,8 @@  static int arm_smmu_device_remove(struct platform_device *pdev)
 
 	/* Turn the thing off */
 	writel(sCR0_CLIENTPD, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0);
+	pm_runtime_force_suspend(smmu->dev);
+
 	return 0;
 }
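
One caveat when reading the error checks above: pm_runtime_get_sync()
returns 1, not 0, when the device is already runtime-active, and it
increments the usage count even on failure. A bare "if (ret)" test
therefore treats "already powered" as an error, and returning without a
put leaks a reference on real failures. A sketch of the usual pattern:

	ret = pm_runtime_get_sync(smmu->dev);
	if (ret < 0) {
		/* the usage count was bumped even though resume failed */
		pm_runtime_put_noidle(smmu->dev);
		return ret;
	}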