Message ID | 1499333825-7658-4-git-send-email-vivek.gautam@codeaurora.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On 07/06, Vivek Gautam wrote: > @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, > static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, > size_t size) > { > - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; > + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); > + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; > + size_t ret; > > if (!ops) > return 0; > > - return ops->unmap(ops, iova, size); > + pm_runtime_get_sync(smmu_domain->smmu->dev); Can these map/unmap ops be called from an atomic context? I seem to recall that being a problem before. > + ret = ops->unmap(ops, iova, size); > + pm_runtime_put_sync(smmu_domain->smmu->dev); > + > + return ret; > } > > static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
Hi Stephen, On 07/13/2017 04:24 AM, Stephen Boyd wrote: > On 07/06, Vivek Gautam wrote: >> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >> size_t size) >> { >> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >> + size_t ret; >> >> if (!ops) >> return 0; >> >> - return ops->unmap(ops, iova, size); >> + pm_runtime_get_sync(smmu_domain->smmu->dev); > Can these map/unmap ops be called from an atomic context? I seem > to recall that being a problem before. That's something which was dropped in the following patch merged in master: 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock Looks like we don't need locks here anymore? Best Regards Vivek > > >> + ret = ops->unmap(ops, iova, size); >> + pm_runtime_put_sync(smmu_domain->smmu->dev); >> + >> + return ret; >> } >> >> static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
Hi Vivek, On 7/13/2017 10:43 AM, Vivek Gautam wrote: > Hi Stephen, > > > On 07/13/2017 04:24 AM, Stephen Boyd wrote: >> On 07/06, Vivek Gautam wrote: >>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >>> size_t size) >>> { >>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>> + size_t ret; >>> if (!ops) >>> return 0; >>> - return ops->unmap(ops, iova, size); >>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >> Can these map/unmap ops be called from an atomic context? I seem >> to recall that being a problem before. > > That's something which was dropped in the following patch merged in master: > 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock > > Looks like we don't need locks here anymore? Apart from the locking, wonder why a explicit pm_runtime is needed from unmap. Somehow looks like some path in the master using that should have enabled the pm ? Regards, Sricharan
On 07/13, Vivek Gautam wrote: > Hi Stephen, > > > On 07/13/2017 04:24 AM, Stephen Boyd wrote: > >On 07/06, Vivek Gautam wrote: > >>@@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, > >> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, > >> size_t size) > >> { > >>- struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; > >>+ struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); > >>+ struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; > >>+ size_t ret; > >> if (!ops) > >> return 0; > >>- return ops->unmap(ops, iova, size); > >>+ pm_runtime_get_sync(smmu_domain->smmu->dev); > >Can these map/unmap ops be called from an atomic context? I seem > >to recall that being a problem before. > > That's something which was dropped in the following patch merged in master: > 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock > > Looks like we don't need locks here anymore? > While removing the spinlock around the map/unmap path may be one thing, I'm not sure that's all of them. Is there a path from an atomic DMA allocation (GFP_ATOMIC sort of thing) mapped into an IOMMU for a device that can eventually get down to here and attempt to turn a clk on?
On 13/07/17 07:48, Stephen Boyd wrote: > On 07/13, Vivek Gautam wrote: >> Hi Stephen, >> >> >> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>> On 07/06, Vivek Gautam wrote: >>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >>>> size_t size) >>>> { >>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>> + size_t ret; >>>> if (!ops) >>>> return 0; >>>> - return ops->unmap(ops, iova, size); >>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >>> Can these map/unmap ops be called from an atomic context? I seem >>> to recall that being a problem before. >> >> That's something which was dropped in the following patch merged in master: >> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >> >> Looks like we don't need locks here anymore? >> > > While removing the spinlock around the map/unmap path may be one > thing, I'm not sure that's all of them. Is there a path from an > atomic DMA allocation (GFP_ATOMIC sort of thing) mapped into an > IOMMU for a device that can eventually get down to here and > attempt to turn a clk on? Yes, in the DMA path map/unmap will frequently be called from IRQ handlers (think e.g. network packets). The whole point of removing the lock was to allow multiple maps/unmaps to execute in parallel (since we know they will be safely operating on different areas of the pagetable). AFAICS this change is going to largely reintroduce that bottleneck via dev->power_lock, which is anything but what we want :( Robin.
On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: > Hi Vivek, > > On 7/13/2017 10:43 AM, Vivek Gautam wrote: >> Hi Stephen, >> >> >> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>> On 07/06, Vivek Gautam wrote: >>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >>>> size_t size) >>>> { >>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>> + size_t ret; >>>> if (!ops) >>>> return 0; >>>> - return ops->unmap(ops, iova, size); >>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >>> Can these map/unmap ops be called from an atomic context? I seem >>> to recall that being a problem before. >> >> That's something which was dropped in the following patch merged in master: >> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >> >> Looks like we don't need locks here anymore? > > Apart from the locking, wonder why a explicit pm_runtime is needed > from unmap. Somehow looks like some path in the master using that > should have enabled the pm ? > Yes, there are a bunch of scenarios where unmap can happen with disabled master (but not in atomic context). On the gpu side we opportunistically keep a buffer mapping until the buffer is freed (which can happen after gpu is disabled). Likewise, v4l2 won't unmap an exported dmabuf while some other driver holds a reference to it (which can be dropped when the v4l2 device is suspended). Since unmap triggers tbl flush which touches iommu regs, the iommu driver *definitely* needs a pm_runtime_get_sync(). BR, -R
On Thu, Jul 13, 2017 at 5:50 AM, Robin Murphy <robin.murphy@arm.com> wrote: > On 13/07/17 07:48, Stephen Boyd wrote: >> On 07/13, Vivek Gautam wrote: >>> Hi Stephen, >>> >>> >>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>>> On 07/06, Vivek Gautam wrote: >>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >>>>> size_t size) >>>>> { >>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>>> + size_t ret; >>>>> if (!ops) >>>>> return 0; >>>>> - return ops->unmap(ops, iova, size); >>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >>>> Can these map/unmap ops be called from an atomic context? I seem >>>> to recall that being a problem before. >>> >>> That's something which was dropped in the following patch merged in master: >>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >>> >>> Looks like we don't need locks here anymore? >>> >> >> While removing the spinlock around the map/unmap path may be one >> thing, I'm not sure that's all of them. Is there a path from an >> atomic DMA allocation (GFP_ATOMIC sort of thing) mapped into an >> IOMMU for a device that can eventually get down to here and >> attempt to turn a clk on? > > Yes, in the DMA path map/unmap will frequently be called from IRQ > handlers (think e.g. network packets). The whole point of removing the > lock was to allow multiple maps/unmaps to execute in parallel (since we > know they will be safely operating on different areas of the pagetable). > AFAICS this change is going to largely reintroduce that bottleneck via > dev->power_lock, which is anything but what we want :( > Maybe __pm_runtime_resume() needs some sort of fast-path if already enabled? 
Or otherwise we need some sort of flag to tell the iommu that it cannot rely on the unmapping device to be resumed? BR, -R
Hi All, On 2017-07-13 13:50, Rob Clark wrote: > On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: >> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>>> On 07/06, Vivek Gautam wrote: >>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >>>>> size_t size) >>>>> { >>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>>> + size_t ret; >>>>> if (!ops) >>>>> return 0; >>>>> - return ops->unmap(ops, iova, size); >>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >>>> Can these map/unmap ops be called from an atomic context? I seem >>>> to recall that being a problem before. >>> That's something which was dropped in the following patch merged in master: >>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >>> >>> Looks like we don't need locks here anymore? >> Apart from the locking, wonder why a explicit pm_runtime is needed >> from unmap. Somehow looks like some path in the master using that >> should have enabled the pm ? >> > Yes, there are a bunch of scenarios where unmap can happen with > disabled master (but not in atomic context). On the gpu side we > opportunistically keep a buffer mapping until the buffer is freed > (which can happen after gpu is disabled). Likewise, v4l2 won't unmap > an exported dmabuf while some other driver holds a reference to it > (which can be dropped when the v4l2 device is suspended). > > Since unmap triggers tbl flush which touches iommu regs, the iommu > driver *definitely* needs a pm_runtime_get_sync(). Afair unmap might be called from atomic context as well, for example as a result of dma_unmap_page(). In exynos IOMMU I simply check the runtime PM state of IOMMU device. 
TLB flush is performed only when IOMMU is in active state. If it is suspended, I assume that the IOMMU controller's context is already lost and its respective power domain might be already turned off, so there is no point in touching IOMMU registers. Best regards
On Thu, Jul 13, 2017 at 8:02 AM, Marek Szyprowski <m.szyprowski@samsung.com> wrote: > Hi All, > > On 2017-07-13 13:50, Rob Clark wrote: >> >> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> >> wrote: >>> >>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >>>> >>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>>>> >>>>> On 07/06, Vivek Gautam wrote: >>>>>> >>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain >>>>>> *domain, unsigned long iova, >>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned >>>>>> long iova, >>>>>> size_t size) >>>>>> { >>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>>>> + size_t ret; >>>>>> if (!ops) >>>>>> return 0; >>>>>> - return ops->unmap(ops, iova, size); >>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >>>>> >>>>> Can these map/unmap ops be called from an atomic context? I seem >>>>> to recall that being a problem before. >>>> >>>> That's something which was dropped in the following patch merged in >>>> master: >>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >>>> >>>> Looks like we don't need locks here anymore? >>> >>> Apart from the locking, wonder why a explicit pm_runtime is needed >>> from unmap. Somehow looks like some path in the master using that >>> should have enabled the pm ? >>> >> Yes, there are a bunch of scenarios where unmap can happen with >> disabled master (but not in atomic context). On the gpu side we >> opportunistically keep a buffer mapping until the buffer is freed >> (which can happen after gpu is disabled). Likewise, v4l2 won't unmap >> an exported dmabuf while some other driver holds a reference to it >> (which can be dropped when the v4l2 device is suspended). 
>> >> Since unmap triggers tbl flush which touches iommu regs, the iommu >> driver *definitely* needs a pm_runtime_get_sync(). > > > Afair unmap might be called from atomic context as well, for example as > a result of dma_unmap_page(). In exynos IOMMU I simply check the runtime > PM state of IOMMU device. TLB flush is performed only when IOMMU is in > active > state. If it is suspended, I assume that the IOMMU controller's context > is already lost and its respective power domain might be already turned off, > so there is no point in touching IOMMU registers. > that seems like an interesting approach.. although I wonder if there can be some race w/ new device memory access once clks are enabled before tlb flush completes? That would be rather bad, since this approach is letting the backing pages of memory be freed before tlb flush. BR, -R
Hi Rob, On 2017-07-13 14:10, Rob Clark wrote: > On Thu, Jul 13, 2017 at 8:02 AM, Marek Szyprowski > <m.szyprowski@samsung.com> wrote: >> On 2017-07-13 13:50, Rob Clark wrote: >>> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> >>> wrote: >>>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >>>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>>>>> On 07/06, Vivek Gautam wrote: >>>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain >>>>>>> *domain, unsigned long iova, >>>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned >>>>>>> long iova, >>>>>>> size_t size) >>>>>>> { >>>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>>>>> + size_t ret; >>>>>>> if (!ops) >>>>>>> return 0; >>>>>>> - return ops->unmap(ops, iova, size); >>>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >>>>>> Can these map/unmap ops be called from an atomic context? I seem >>>>>> to recall that being a problem before. >>>>> That's something which was dropped in the following patch merged in >>>>> master: >>>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >>>>> >>>>> Looks like we don't need locks here anymore? >>>> Apart from the locking, wonder why a explicit pm_runtime is needed >>>> from unmap. Somehow looks like some path in the master using that >>>> should have enabled the pm ? >>>> >>> Yes, there are a bunch of scenarios where unmap can happen with >>> disabled master (but not in atomic context). On the gpu side we >>> opportunistically keep a buffer mapping until the buffer is freed >>> (which can happen after gpu is disabled). Likewise, v4l2 won't unmap >>> an exported dmabuf while some other driver holds a reference to it >>> (which can be dropped when the v4l2 device is suspended). 
>>> >>> Since unmap triggers tbl flush which touches iommu regs, the iommu >>> driver *definitely* needs a pm_runtime_get_sync(). >> >> Afair unmap might be called from atomic context as well, for example as >> a result of dma_unmap_page(). In exynos IOMMU I simply check the runtime >> PM state of IOMMU device. TLB flush is performed only when IOMMU is in >> active >> state. If it is suspended, I assume that the IOMMU controller's context >> is already lost and its respective power domain might be already turned off, >> so there is no point in touching IOMMU registers. >> > that seems like an interesting approach.. although I wonder if there > can be some race w/ new device memory access once clks are enabled > before tlb flush completes? That would be rather bad, since this > approach is letting the backing pages of memory be freed before tlb > flush. Exynos IOMMU has a spinlock for ensuring that there is no race between PM runtime suspend and unmap/tlb flush. Best regards
Hi, On 7/13/2017 5:20 PM, Rob Clark wrote: > On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: >> Hi Vivek, >> >> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >>> Hi Stephen, >>> >>> >>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>>> On 07/06, Vivek Gautam wrote: >>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >>>>> size_t size) >>>>> { >>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>>> + size_t ret; >>>>> if (!ops) >>>>> return 0; >>>>> - return ops->unmap(ops, iova, size); >>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >>>> Can these map/unmap ops be called from an atomic context? I seem >>>> to recall that being a problem before. >>> >>> That's something which was dropped in the following patch merged in master: >>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >>> >>> Looks like we don't need locks here anymore? >> >> Apart from the locking, wonder why a explicit pm_runtime is needed >> from unmap. Somehow looks like some path in the master using that >> should have enabled the pm ? >> > > Yes, there are a bunch of scenarios where unmap can happen with > disabled master (but not in atomic context). On the gpu side we > opportunistically keep a buffer mapping until the buffer is freed > (which can happen after gpu is disabled). Likewise, v4l2 won't unmap > an exported dmabuf while some other driver holds a reference to it > (which can be dropped when the v4l2 device is suspended). > > Since unmap triggers tbl flush which touches iommu regs, the iommu > driver *definitely* needs a pm_runtime_get_sync(). 
Ok, with that being the case, there are two things here, 1) If the device links are still intact at these places where unmap is called, then pm_runtime from the master would set up all the clocks. That would avoid reintroducing the locking indirectly here. 2) If not, then doing it here is the only way. But for both cases, since the unmap can be called from atomic context, the resume handler here should avoid doing clk_prepare_enable; instead move the clk_prepare to the init. Regards, Sricharan
On Thu, Jul 13, 2017 at 11:05 AM, Sricharan R <sricharan@codeaurora.org> wrote: > Hi Vivek, > > On 7/13/2017 10:43 AM, Vivek Gautam wrote: >> Hi Stephen, >> >> >> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>> On 07/06, Vivek Gautam wrote: >>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >>>> size_t size) >>>> { >>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>> + size_t ret; >>>> if (!ops) >>>> return 0; >>>> - return ops->unmap(ops, iova, size); >>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >>> Can these map/unmap ops be called from an atomic context? I seem >>> to recall that being a problem before. >> >> That's something which was dropped in the following patch merged in master: >> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >> >> Looks like we don't need locks here anymore? > > Apart from the locking, wonder why a explicit pm_runtime is needed > from unmap. Somehow looks like some path in the master using that > should have enabled the pm ? Right, the master should have done a runtime_get(), and with device links the iommu will also resume. The master will call the unmap when it is attached to the iommu and therefore the iommu should be in resume state. We shouldn't have an unmap without the master attached anyways. Will investigate this further if we need the pm_runtime() calls around unmap or not. Best regards Vivek > > Regards, > Sricharan > > -- > "QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation > > --- > This email has been checked for viruses by Avast antivirus software. 
> https://www.avast.com/antivirus > > -- > To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Jul 13, 2017 at 7:27 PM, Vivek Gautam <vivek.gautam@codeaurora.org> wrote: > On Thu, Jul 13, 2017 at 11:05 AM, Sricharan R <sricharan@codeaurora.org> wrote: >> Hi Vivek, >> >> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >>> Hi Stephen, >>> >>> >>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>>> On 07/06, Vivek Gautam wrote: >>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >>>>> size_t size) >>>>> { >>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>>> + size_t ret; >>>>> if (!ops) >>>>> return 0; >>>>> - return ops->unmap(ops, iova, size); >>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >>>> Can these map/unmap ops be called from an atomic context? I seem >>>> to recall that being a problem before. >>> >>> That's something which was dropped in the following patch merged in master: >>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >>> >>> Looks like we don't need locks here anymore? >> >> Apart from the locking, wonder why a explicit pm_runtime is needed >> from unmap. Somehow looks like some path in the master using that >> should have enabled the pm ? > > Right, the master should have done a runtime_get(), and with > device links the iommu will also resume. > > The master will call the unmap when it is attached to the iommu > and therefore the iommu should be in resume state. > We shouldn't have an unmap without the master attached anyways. > Will investigate this further if we need the pm_runtime() calls > around unmap or not. My apologies. My email client didn't update the thread. So please ignore this comment. > > Best regards > Vivek > >> >> Regards, >> Sricharan >> >> -- >> "QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. 
is a member of Code Aurora Forum, hosted by The Linux Foundation >> >> --- >> This email has been checked for viruses by Avast antivirus software. >> https://www.avast.com/antivirus >> >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html > > > > -- > Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, > a Linux Foundation Collaborative Project
On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote: > Hi, > > On 7/13/2017 5:20 PM, Rob Clark wrote: >> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: >>> Hi Vivek, >>> >>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >>>> Hi Stephen, >>>> >>>> >>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>>>> On 07/06, Vivek Gautam wrote: >>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >>>>>> size_t size) >>>>>> { >>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>>>> + size_t ret; >>>>>> if (!ops) >>>>>> return 0; >>>>>> - return ops->unmap(ops, iova, size); >>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >>>>> Can these map/unmap ops be called from an atomic context? I seem >>>>> to recall that being a problem before. >>>> >>>> That's something which was dropped in the following patch merged in master: >>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >>>> >>>> Looks like we don't need locks here anymore? >>> >>> Apart from the locking, wonder why a explicit pm_runtime is needed >>> from unmap. Somehow looks like some path in the master using that >>> should have enabled the pm ? >>> >> >> Yes, there are a bunch of scenarios where unmap can happen with >> disabled master (but not in atomic context). On the gpu side we >> opportunistically keep a buffer mapping until the buffer is freed >> (which can happen after gpu is disabled). Likewise, v4l2 won't unmap >> an exported dmabuf while some other driver holds a reference to it >> (which can be dropped when the v4l2 device is suspended). 
>> >> Since unmap triggers tbl flush which touches iommu regs, the iommu >> driver *definitely* needs a pm_runtime_get_sync(). > > Ok, with that being the case, there are two things here, > > 1) If the device links are still intact at these places where unmap is called, > then pm_runtime from the master would setup the all the clocks. That would > avoid reintroducing the locking indirectly here. > > 2) If not, then doing it here is the only way. But for both cases, since > the unmap can be called from atomic context, resume handler here should > avoid doing clk_prepare_enable , instead move the clk_prepare to the init. > I do kinda like the approach Marek suggested.. of deferring the tlb flush until resume. I'm wondering if we could combine that with putting the mmu in a stalled state when we suspend (and not resume the mmu until after the pending tlb flush)? BR, -R
On Thu, Jul 13, 2017 at 10:55:10AM -0400, Rob Clark wrote: > On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote: > > Hi, > > > > On 7/13/2017 5:20 PM, Rob Clark wrote: > >> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: > >>> Hi Vivek, > >>> > >>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: > >>>> Hi Stephen, > >>>> > >>>> > >>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: > >>>>> On 07/06, Vivek Gautam wrote: > >>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, > >>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, > >>>>>> size_t size) > >>>>>> { > >>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; > >>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); > >>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; > >>>>>> + size_t ret; > >>>>>> if (!ops) > >>>>>> return 0; > >>>>>> - return ops->unmap(ops, iova, size); > >>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); > >>>>> Can these map/unmap ops be called from an atomic context? I seem > >>>>> to recall that being a problem before. > >>>> > >>>> That's something which was dropped in the following patch merged in master: > >>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock > >>>> > >>>> Looks like we don't need locks here anymore? > >>> > >>> Apart from the locking, wonder why a explicit pm_runtime is needed > >>> from unmap. Somehow looks like some path in the master using that > >>> should have enabled the pm ? > >>> > >> > >> Yes, there are a bunch of scenarios where unmap can happen with > >> disabled master (but not in atomic context). On the gpu side we > >> opportunistically keep a buffer mapping until the buffer is freed > >> (which can happen after gpu is disabled). 
Likewise, v4l2 won't unmap > >> an exported dmabuf while some other driver holds a reference to it > >> (which can be dropped when the v4l2 device is suspended). > >> > >> Since unmap triggers tbl flush which touches iommu regs, the iommu > >> driver *definitely* needs a pm_runtime_get_sync(). > > > > Ok, with that being the case, there are two things here, > > > > 1) If the device links are still intact at these places where unmap is called, > > then pm_runtime from the master would setup the all the clocks. That would > > avoid reintroducing the locking indirectly here. > > > > 2) If not, then doing it here is the only way. But for both cases, since > > the unmap can be called from atomic context, resume handler here should > > avoid doing clk_prepare_enable , instead move the clk_prepare to the init. > > > > I do kinda like the approach Marek suggested.. of deferring the tlb > flush until resume. I'm wondering if we could combine that with > putting the mmu in a stalled state when we suspend (and not resume the > mmu until after the pending tlb flush)? I'm not sure that a stalled state is what we're after here, because we need to take care to prevent any table walks if we've freed the underlying pages. What we could try to do is disable the SMMU (put into global bypass) and invalidate the TLB when performing a suspend operation, then we just ignore invalidation whilst the clocks are stopped and, on resume, enable the SMMU again. That said, I don't think we can tolerate suspend/resume racing with map/unmap, and it's not clear to me how we avoid that without penalising the fastpath. Will
On Fri, Jul 14, 2017 at 1:07 PM, Will Deacon <will.deacon@arm.com> wrote: > On Thu, Jul 13, 2017 at 10:55:10AM -0400, Rob Clark wrote: >> On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote: >> > Hi, >> > >> > On 7/13/2017 5:20 PM, Rob Clark wrote: >> >> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: >> >>> Hi Vivek, >> >>> >> >>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >> >>>> Hi Stephen, >> >>>> >> >>>> >> >>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >> >>>>> On 07/06, Vivek Gautam wrote: >> >>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >> >>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >> >>>>>> size_t size) >> >>>>>> { >> >>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >> >>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >> >>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >> >>>>>> + size_t ret; >> >>>>>> if (!ops) >> >>>>>> return 0; >> >>>>>> - return ops->unmap(ops, iova, size); >> >>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >> >>>>> Can these map/unmap ops be called from an atomic context? I seem >> >>>>> to recall that being a problem before. >> >>>> >> >>>> That's something which was dropped in the following patch merged in master: >> >>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >> >>>> >> >>>> Looks like we don't need locks here anymore? >> >>> >> >>> Apart from the locking, wonder why a explicit pm_runtime is needed >> >>> from unmap. Somehow looks like some path in the master using that >> >>> should have enabled the pm ? >> >>> >> >> >> >> Yes, there are a bunch of scenarios where unmap can happen with >> >> disabled master (but not in atomic context). On the gpu side we >> >> opportunistically keep a buffer mapping until the buffer is freed >> >> (which can happen after gpu is disabled). 
Likewise, v4l2 won't unmap >> >> an exported dmabuf while some other driver holds a reference to it >> >> (which can be dropped when the v4l2 device is suspended). >> >> >> >> Since unmap triggers tbl flush which touches iommu regs, the iommu >> >> driver *definitely* needs a pm_runtime_get_sync(). >> > >> > Ok, with that being the case, there are two things here, >> > >> > 1) If the device links are still intact at these places where unmap is called, >> > then pm_runtime from the master would setup the all the clocks. That would >> > avoid reintroducing the locking indirectly here. >> > >> > 2) If not, then doing it here is the only way. But for both cases, since >> > the unmap can be called from atomic context, resume handler here should >> > avoid doing clk_prepare_enable , instead move the clk_prepare to the init. >> > >> >> I do kinda like the approach Marek suggested.. of deferring the tlb >> flush until resume. I'm wondering if we could combine that with >> putting the mmu in a stalled state when we suspend (and not resume the >> mmu until after the pending tlb flush)? > > I'm not sure that a stalled state is what we're after here, because we need > to take care to prevent any table walks if we've freed the underlying pages. > What we could try to do is disable the SMMU (put into global bypass) and > invalidate the TLB when performing a suspend operation, then we just ignore > invalidation whilst the clocks are stopped and, on resume, enable the SMMU > again. wouldn't stalled just block any memory transactions by device(s) using the context bank? Putting it in bypass isn't really a good thing if there is any chance the device can sneak in a memory access before we've taking it back out of bypass (ie. makes gpu a giant userspace controlled root hole). BR, -R > That said, I don't think we can tolerate suspend/resume racing with > map/unmap, and it's not clear to me how we avoid that without penalising > the fastpath. > > Will
On Fri, Jul 14, 2017 at 01:42:13PM -0400, Rob Clark wrote: > On Fri, Jul 14, 2017 at 1:07 PM, Will Deacon <will.deacon@arm.com> wrote: > > On Thu, Jul 13, 2017 at 10:55:10AM -0400, Rob Clark wrote: > >> On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote: > >> > Hi, > >> > > >> > On 7/13/2017 5:20 PM, Rob Clark wrote: > >> >> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: > >> >>> Hi Vivek, > >> >>> > >> >>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: > >> >>>> Hi Stephen, > >> >>>> > >> >>>> > >> >>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: > >> >>>>> On 07/06, Vivek Gautam wrote: > >> >>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, > >> >>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, > >> >>>>>> size_t size) > >> >>>>>> { > >> >>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; > >> >>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); > >> >>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; > >> >>>>>> + size_t ret; > >> >>>>>> if (!ops) > >> >>>>>> return 0; > >> >>>>>> - return ops->unmap(ops, iova, size); > >> >>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); > >> >>>>> Can these map/unmap ops be called from an atomic context? I seem > >> >>>>> to recall that being a problem before. > >> >>>> > >> >>>> That's something which was dropped in the following patch merged in master: > >> >>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock > >> >>>> > >> >>>> Looks like we don't need locks here anymore? > >> >>> > >> >>> Apart from the locking, wonder why a explicit pm_runtime is needed > >> >>> from unmap. Somehow looks like some path in the master using that > >> >>> should have enabled the pm ? > >> >>> > >> >> > >> >> Yes, there are a bunch of scenarios where unmap can happen with > >> >> disabled master (but not in atomic context). 
On the gpu side we > >> >> opportunistically keep a buffer mapping until the buffer is freed > >> >> (which can happen after gpu is disabled). Likewise, v4l2 won't unmap > >> >> an exported dmabuf while some other driver holds a reference to it > >> >> (which can be dropped when the v4l2 device is suspended). > >> >> > >> >> Since unmap triggers tbl flush which touches iommu regs, the iommu > >> >> driver *definitely* needs a pm_runtime_get_sync(). > >> > > >> > Ok, with that being the case, there are two things here, > >> > > >> > 1) If the device links are still intact at these places where unmap is called, > >> > then pm_runtime from the master would setup the all the clocks. That would > >> > avoid reintroducing the locking indirectly here. > >> > > >> > 2) If not, then doing it here is the only way. But for both cases, since > >> > the unmap can be called from atomic context, resume handler here should > >> > avoid doing clk_prepare_enable , instead move the clk_prepare to the init. > >> > > >> > >> I do kinda like the approach Marek suggested.. of deferring the tlb > >> flush until resume. I'm wondering if we could combine that with > >> putting the mmu in a stalled state when we suspend (and not resume the > >> mmu until after the pending tlb flush)? > > > > I'm not sure that a stalled state is what we're after here, because we need > > to take care to prevent any table walks if we've freed the underlying pages. > > What we could try to do is disable the SMMU (put into global bypass) and > > invalidate the TLB when performing a suspend operation, then we just ignore > > invalidation whilst the clocks are stopped and, on resume, enable the SMMU > > again. > > wouldn't stalled just block any memory transactions by device(s) using > the context bank? Putting it in bypass isn't really a good thing if > there is any chance the device can sneak in a memory access before > we've taking it back out of bypass (ie. makes gpu a giant userspace > controlled root hole). 
If it doesn't deadlock, then yes, it will stall transactions. However, that doesn't mean it necessarily prevents page table walks. Instead of bypass, we could configure all the streams to terminate, but this race still worries me somewhat. I thought that the SMMU would only be suspended if all of its masters were suspended, so if the GPU wants to come out of suspend then the SMMU should be resumed first. It would be helpful if somebody could figure out exactly what can race with the suspend/resume calls here. Will
On Fri, Jul 14, 2017 at 2:06 PM, Will Deacon <will.deacon@arm.com> wrote: > On Fri, Jul 14, 2017 at 01:42:13PM -0400, Rob Clark wrote: >> On Fri, Jul 14, 2017 at 1:07 PM, Will Deacon <will.deacon@arm.com> wrote: >> > On Thu, Jul 13, 2017 at 10:55:10AM -0400, Rob Clark wrote: >> >> On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote: >> >> > Hi, >> >> > >> >> > On 7/13/2017 5:20 PM, Rob Clark wrote: >> >> >> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: >> >> >>> Hi Vivek, >> >> >>> >> >> >>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >> >> >>>> Hi Stephen, >> >> >>>> >> >> >>>> >> >> >>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >> >> >>>>> On 07/06, Vivek Gautam wrote: >> >> >>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >> >> >>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >> >> >>>>>> size_t size) >> >> >>>>>> { >> >> >>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >> >> >>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >> >> >>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >> >> >>>>>> + size_t ret; >> >> >>>>>> if (!ops) >> >> >>>>>> return 0; >> >> >>>>>> - return ops->unmap(ops, iova, size); >> >> >>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >> >> >>>>> Can these map/unmap ops be called from an atomic context? I seem >> >> >>>>> to recall that being a problem before. >> >> >>>> >> >> >>>> That's something which was dropped in the following patch merged in master: >> >> >>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >> >> >>>> >> >> >>>> Looks like we don't need locks here anymore? >> >> >>> >> >> >>> Apart from the locking, wonder why a explicit pm_runtime is needed >> >> >>> from unmap. Somehow looks like some path in the master using that >> >> >>> should have enabled the pm ? 
>> >> >>> >> >> >> >> >> >> Yes, there are a bunch of scenarios where unmap can happen with >> >> >> disabled master (but not in atomic context). On the gpu side we >> >> >> opportunistically keep a buffer mapping until the buffer is freed >> >> >> (which can happen after gpu is disabled). Likewise, v4l2 won't unmap >> >> >> an exported dmabuf while some other driver holds a reference to it >> >> >> (which can be dropped when the v4l2 device is suspended). >> >> >> >> >> >> Since unmap triggers tbl flush which touches iommu regs, the iommu >> >> >> driver *definitely* needs a pm_runtime_get_sync(). >> >> > >> >> > Ok, with that being the case, there are two things here, >> >> > >> >> > 1) If the device links are still intact at these places where unmap is called, >> >> > then pm_runtime from the master would setup the all the clocks. That would >> >> > avoid reintroducing the locking indirectly here. >> >> > >> >> > 2) If not, then doing it here is the only way. But for both cases, since >> >> > the unmap can be called from atomic context, resume handler here should >> >> > avoid doing clk_prepare_enable , instead move the clk_prepare to the init. >> >> > >> >> >> >> I do kinda like the approach Marek suggested.. of deferring the tlb >> >> flush until resume. I'm wondering if we could combine that with >> >> putting the mmu in a stalled state when we suspend (and not resume the >> >> mmu until after the pending tlb flush)? >> > >> > I'm not sure that a stalled state is what we're after here, because we need >> > to take care to prevent any table walks if we've freed the underlying pages. >> > What we could try to do is disable the SMMU (put into global bypass) and >> > invalidate the TLB when performing a suspend operation, then we just ignore >> > invalidation whilst the clocks are stopped and, on resume, enable the SMMU >> > again. >> >> wouldn't stalled just block any memory transactions by device(s) using >> the context bank? 
Putting it in bypass isn't really a good thing if >> there is any chance the device can sneak in a memory access before >> we've taking it back out of bypass (ie. makes gpu a giant userspace >> controlled root hole). > > If it doesn't deadlock, then yes, it will stall transactions. However, that > doesn't mean it necessarily prevents page table walks. btw, I guess the concern about pagetable walk is that the unmap could have removed some sub-level of the pt that the tlb walk would hit? Would deferring freeing those pages help? > Instead of bypass, we > could configure all the streams to terminate, but this race still worries me > somewhat. I thought that the SMMU would only be suspended if all of its > masters were suspended, so if the GPU wants to come out of suspend then the > SMMU should be resumed first. I believe this should be true.. on the gpu side, I'm mostly trying to avoid having to power the gpu back on to free buffers. (On the v4l2 side, somewhere in the core videobuf code would also need to be made to wrap its dma_unmap_sg() with pm_runtime_get/put()..) BR, -R > It would be helpful if somebody could figure out exactly what can race with > the suspend/resume calls here. > > Will
On Fri, Jul 14, 2017 at 02:25:45PM -0400, Rob Clark wrote: > On Fri, Jul 14, 2017 at 2:06 PM, Will Deacon <will.deacon@arm.com> wrote: > > On Fri, Jul 14, 2017 at 01:42:13PM -0400, Rob Clark wrote: > >> On Fri, Jul 14, 2017 at 1:07 PM, Will Deacon <will.deacon@arm.com> wrote: > >> > On Thu, Jul 13, 2017 at 10:55:10AM -0400, Rob Clark wrote: > >> >> On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote: > >> >> > Hi, > >> >> > > >> >> > On 7/13/2017 5:20 PM, Rob Clark wrote: > >> >> >> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: > >> >> >>> Hi Vivek, > >> >> >>> > >> >> >>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: > >> >> >>>> Hi Stephen, > >> >> >>>> > >> >> >>>> > >> >> >>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: > >> >> >>>>> On 07/06, Vivek Gautam wrote: > >> >> >>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, > >> >> >>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, > >> >> >>>>>> size_t size) > >> >> >>>>>> { > >> >> >>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; > >> >> >>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); > >> >> >>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; > >> >> >>>>>> + size_t ret; > >> >> >>>>>> if (!ops) > >> >> >>>>>> return 0; > >> >> >>>>>> - return ops->unmap(ops, iova, size); > >> >> >>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); > >> >> >>>>> Can these map/unmap ops be called from an atomic context? I seem > >> >> >>>>> to recall that being a problem before. > >> >> >>>> > >> >> >>>> That's something which was dropped in the following patch merged in master: > >> >> >>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock > >> >> >>>> > >> >> >>>> Looks like we don't need locks here anymore? > >> >> >>> > >> >> >>> Apart from the locking, wonder why a explicit pm_runtime is needed > >> >> >>> from unmap. 
Somehow looks like some path in the master using that > >> >> >>> should have enabled the pm ? > >> >> >>> > >> >> >> > >> >> >> Yes, there are a bunch of scenarios where unmap can happen with > >> >> >> disabled master (but not in atomic context). On the gpu side we > >> >> >> opportunistically keep a buffer mapping until the buffer is freed > >> >> >> (which can happen after gpu is disabled). Likewise, v4l2 won't unmap > >> >> >> an exported dmabuf while some other driver holds a reference to it > >> >> >> (which can be dropped when the v4l2 device is suspended). > >> >> >> > >> >> >> Since unmap triggers tbl flush which touches iommu regs, the iommu > >> >> >> driver *definitely* needs a pm_runtime_get_sync(). > >> >> > > >> >> > Ok, with that being the case, there are two things here, > >> >> > > >> >> > 1) If the device links are still intact at these places where unmap is called, > >> >> > then pm_runtime from the master would setup the all the clocks. That would > >> >> > avoid reintroducing the locking indirectly here. > >> >> > > >> >> > 2) If not, then doing it here is the only way. But for both cases, since > >> >> > the unmap can be called from atomic context, resume handler here should > >> >> > avoid doing clk_prepare_enable , instead move the clk_prepare to the init. > >> >> > > >> >> > >> >> I do kinda like the approach Marek suggested.. of deferring the tlb > >> >> flush until resume. I'm wondering if we could combine that with > >> >> putting the mmu in a stalled state when we suspend (and not resume the > >> >> mmu until after the pending tlb flush)? > >> > > >> > I'm not sure that a stalled state is what we're after here, because we need > >> > to take care to prevent any table walks if we've freed the underlying pages. 
> >> > What we could try to do is disable the SMMU (put into global bypass) and > >> > invalidate the TLB when performing a suspend operation, then we just ignore > >> > invalidation whilst the clocks are stopped and, on resume, enable the SMMU > >> > again. > >> > >> wouldn't stalled just block any memory transactions by device(s) using > >> the context bank? Putting it in bypass isn't really a good thing if > >> there is any chance the device can sneak in a memory access before > >> we've taking it back out of bypass (ie. makes gpu a giant userspace > >> controlled root hole). > > > > If it doesn't deadlock, then yes, it will stall transactions. However, that > > doesn't mean it necessarily prevents page table walks. > > btw, I guess the concern about pagetable walk is that the unmap could > have removed some sub-level of the pt that the tlb walk would hit? > Would deferring freeing those pages help? Could do, but it sounds like a lot of complication that I think we can fix by making the suspend operation put the SMMU into a "clean" state. > > Instead of bypass, we > > could configure all the streams to terminate, but this race still worries me > > somewhat. I thought that the SMMU would only be suspended if all of its > > masters were suspended, so if the GPU wants to come out of suspend then the > > SMMU should be resumed first. > > I believe this should be true.. on the gpu side, I'm mostly trying to > avoid having to power the gpu back on to free buffers. (On the v4l2 > side, somewhere in the core videobuf code would also need to be made > to wrap it's dma_unmap_sg() with pm_runtime_get/put()..) Right, and we shouldn't have to resume it if we suspend it in a clean state, with the TLBs invalidated. Will
On Fri, Jul 14, 2017 at 3:01 PM, Will Deacon <will.deacon@arm.com> wrote: > On Fri, Jul 14, 2017 at 02:25:45PM -0400, Rob Clark wrote: >> On Fri, Jul 14, 2017 at 2:06 PM, Will Deacon <will.deacon@arm.com> wrote: >> > On Fri, Jul 14, 2017 at 01:42:13PM -0400, Rob Clark wrote: >> >> On Fri, Jul 14, 2017 at 1:07 PM, Will Deacon <will.deacon@arm.com> wrote: >> >> > On Thu, Jul 13, 2017 at 10:55:10AM -0400, Rob Clark wrote: >> >> >> On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote: >> >> >> > Hi, >> >> >> > >> >> >> > On 7/13/2017 5:20 PM, Rob Clark wrote: >> >> >> >> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: >> >> >> >>> Hi Vivek, >> >> >> >>> >> >> >> >>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >> >> >> >>>> Hi Stephen, >> >> >> >>>> >> >> >> >>>> >> >> >> >>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >> >> >> >>>>> On 07/06, Vivek Gautam wrote: >> >> >> >>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >> >> >> >>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >> >> >> >>>>>> size_t size) >> >> >> >>>>>> { >> >> >> >>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >> >> >> >>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >> >> >> >>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >> >> >> >>>>>> + size_t ret; >> >> >> >>>>>> if (!ops) >> >> >> >>>>>> return 0; >> >> >> >>>>>> - return ops->unmap(ops, iova, size); >> >> >> >>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >> >> >> >>>>> Can these map/unmap ops be called from an atomic context? I seem >> >> >> >>>>> to recall that being a problem before. >> >> >> >>>> >> >> >> >>>> That's something which was dropped in the following patch merged in master: >> >> >> >>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >> >> >> >>>> >> >> >> >>>> Looks like we don't need locks here anymore? 
>> >> >> >>> >> >> >> >>> Apart from the locking, wonder why a explicit pm_runtime is needed >> >> >> >>> from unmap. Somehow looks like some path in the master using that >> >> >> >>> should have enabled the pm ? >> >> >> >>> >> >> >> >> >> >> >> >> Yes, there are a bunch of scenarios where unmap can happen with >> >> >> >> disabled master (but not in atomic context). On the gpu side we >> >> >> >> opportunistically keep a buffer mapping until the buffer is freed >> >> >> >> (which can happen after gpu is disabled). Likewise, v4l2 won't unmap >> >> >> >> an exported dmabuf while some other driver holds a reference to it >> >> >> >> (which can be dropped when the v4l2 device is suspended). >> >> >> >> >> >> >> >> Since unmap triggers tbl flush which touches iommu regs, the iommu >> >> >> >> driver *definitely* needs a pm_runtime_get_sync(). >> >> >> > >> >> >> > Ok, with that being the case, there are two things here, >> >> >> > >> >> >> > 1) If the device links are still intact at these places where unmap is called, >> >> >> > then pm_runtime from the master would setup the all the clocks. That would >> >> >> > avoid reintroducing the locking indirectly here. >> >> >> > >> >> >> > 2) If not, then doing it here is the only way. But for both cases, since >> >> >> > the unmap can be called from atomic context, resume handler here should >> >> >> > avoid doing clk_prepare_enable , instead move the clk_prepare to the init. >> >> >> > >> >> >> >> >> >> I do kinda like the approach Marek suggested.. of deferring the tlb >> >> >> flush until resume. I'm wondering if we could combine that with >> >> >> putting the mmu in a stalled state when we suspend (and not resume the >> >> >> mmu until after the pending tlb flush)? >> >> > >> >> > I'm not sure that a stalled state is what we're after here, because we need >> >> > to take care to prevent any table walks if we've freed the underlying pages. 
>> >> > What we could try to do is disable the SMMU (put into global bypass) and >> >> > invalidate the TLB when performing a suspend operation, then we just ignore >> >> > invalidation whilst the clocks are stopped and, on resume, enable the SMMU >> >> > again. >> >> >> >> wouldn't stalled just block any memory transactions by device(s) using >> >> the context bank? Putting it in bypass isn't really a good thing if >> >> there is any chance the device can sneak in a memory access before >> >> we've taking it back out of bypass (ie. makes gpu a giant userspace >> >> controlled root hole). >> > >> > If it doesn't deadlock, then yes, it will stall transactions. However, that >> > doesn't mean it necessarily prevents page table walks. >> >> btw, I guess the concern about pagetable walk is that the unmap could >> have removed some sub-level of the pt that the tlb walk would hit? >> Would deferring freeing those pages help? > > Could do, but it sounds like a lot of complication that I think we can fix > by making the suspend operation put the SMMU into a "clean" state. > >> > Instead of bypass, we >> > could configure all the streams to terminate, but this race still worries me >> > somewhat. I thought that the SMMU would only be suspended if all of its >> > masters were suspended, so if the GPU wants to come out of suspend then the >> > SMMU should be resumed first. >> >> I believe this should be true.. on the gpu side, I'm mostly trying to >> avoid having to power the gpu back on to free buffers. (On the v4l2 >> side, somewhere in the core videobuf code would also need to be made >> to wrap it's dma_unmap_sg() with pm_runtime_get/put()..) > > Right, and we shouldn't have to resume it if we suspend it in a clean state, > with the TLBs invalidated. > I guess if the device_link() stuff ensured the attached device (gpu/etc) was suspended before suspending the iommu, then I guess I can't see how temporarily putting the iommu in bypass would be a problem. 
I haven't looked at the device_link() stuff too closely, but iommu being resumed first and suspended last seems like the only thing that would make sense. I'm mostly just nervous about iommu in bypass vs gpu since userspace has so much control over what addresses the gpu writes to / reads from, so getting it wrong w/ the iommu would be a rather bad thing ;-) BR, -R
On Fri, Jul 14, 2017 at 03:34:42PM -0400, Rob Clark wrote: > On Fri, Jul 14, 2017 at 3:01 PM, Will Deacon <will.deacon@arm.com> wrote: > > On Fri, Jul 14, 2017 at 02:25:45PM -0400, Rob Clark wrote: > >> On Fri, Jul 14, 2017 at 2:06 PM, Will Deacon <will.deacon@arm.com> wrote: > >> > On Fri, Jul 14, 2017 at 01:42:13PM -0400, Rob Clark wrote: > >> >> On Fri, Jul 14, 2017 at 1:07 PM, Will Deacon <will.deacon@arm.com> wrote: > >> >> > On Thu, Jul 13, 2017 at 10:55:10AM -0400, Rob Clark wrote: > >> >> >> On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote: > >> >> >> > Hi, > >> >> >> > > >> >> >> > On 7/13/2017 5:20 PM, Rob Clark wrote: > >> >> >> >> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: > >> >> >> >>> Hi Vivek, > >> >> >> >>> > >> >> >> >>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: > >> >> >> >>>> Hi Stephen, > >> >> >> >>>> > >> >> >> >>>> > >> >> >> >>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: > >> >> >> >>>>> On 07/06, Vivek Gautam wrote: > >> >> >> >>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, > >> >> >> >>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, > >> >> >> >>>>>> size_t size) > >> >> >> >>>>>> { > >> >> >> >>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; > >> >> >> >>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); > >> >> >> >>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; > >> >> >> >>>>>> + size_t ret; > >> >> >> >>>>>> if (!ops) > >> >> >> >>>>>> return 0; > >> >> >> >>>>>> - return ops->unmap(ops, iova, size); > >> >> >> >>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); > >> >> >> >>>>> Can these map/unmap ops be called from an atomic context? I seem > >> >> >> >>>>> to recall that being a problem before. 
> >> >> >> >>>> > >> >> >> >>>> That's something which was dropped in the following patch merged in master: > >> >> >> >>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock > >> >> >> >>>> > >> >> >> >>>> Looks like we don't need locks here anymore? > >> >> >> >>> > >> >> >> >>> Apart from the locking, wonder why a explicit pm_runtime is needed > >> >> >> >>> from unmap. Somehow looks like some path in the master using that > >> >> >> >>> should have enabled the pm ? > >> >> >> >>> > >> >> >> >> > >> >> >> >> Yes, there are a bunch of scenarios where unmap can happen with > >> >> >> >> disabled master (but not in atomic context). On the gpu side we > >> >> >> >> opportunistically keep a buffer mapping until the buffer is freed > >> >> >> >> (which can happen after gpu is disabled). Likewise, v4l2 won't unmap > >> >> >> >> an exported dmabuf while some other driver holds a reference to it > >> >> >> >> (which can be dropped when the v4l2 device is suspended). > >> >> >> >> > >> >> >> >> Since unmap triggers tbl flush which touches iommu regs, the iommu > >> >> >> >> driver *definitely* needs a pm_runtime_get_sync(). > >> >> >> > > >> >> >> > Ok, with that being the case, there are two things here, > >> >> >> > > >> >> >> > 1) If the device links are still intact at these places where unmap is called, > >> >> >> > then pm_runtime from the master would setup the all the clocks. That would > >> >> >> > avoid reintroducing the locking indirectly here. > >> >> >> > > >> >> >> > 2) If not, then doing it here is the only way. But for both cases, since > >> >> >> > the unmap can be called from atomic context, resume handler here should > >> >> >> > avoid doing clk_prepare_enable , instead move the clk_prepare to the init. > >> >> >> > > >> >> >> > >> >> >> I do kinda like the approach Marek suggested.. of deferring the tlb > >> >> >> flush until resume. 
I'm wondering if we could combine that with > >> >> >> putting the mmu in a stalled state when we suspend (and not resume the > >> >> >> mmu until after the pending tlb flush)? > >> >> > > >> >> > I'm not sure that a stalled state is what we're after here, because we need > >> >> > to take care to prevent any table walks if we've freed the underlying pages. > >> >> > What we could try to do is disable the SMMU (put into global bypass) and > >> >> > invalidate the TLB when performing a suspend operation, then we just ignore > >> >> > invalidation whilst the clocks are stopped and, on resume, enable the SMMU > >> >> > again. > >> >> > >> >> wouldn't stalled just block any memory transactions by device(s) using > >> >> the context bank? Putting it in bypass isn't really a good thing if > >> >> there is any chance the device can sneak in a memory access before > >> >> we've taking it back out of bypass (ie. makes gpu a giant userspace > >> >> controlled root hole). > >> > > >> > If it doesn't deadlock, then yes, it will stall transactions. However, that > >> > doesn't mean it necessarily prevents page table walks. > >> > >> btw, I guess the concern about pagetable walk is that the unmap could > >> have removed some sub-level of the pt that the tlb walk would hit? > >> Would deferring freeing those pages help? > > > > Could do, but it sounds like a lot of complication that I think we can fix > > by making the suspend operation put the SMMU into a "clean" state. > > > >> > Instead of bypass, we > >> > could configure all the streams to terminate, but this race still worries me > >> > somewhat. I thought that the SMMU would only be suspended if all of its > >> > masters were suspended, so if the GPU wants to come out of suspend then the > >> > SMMU should be resumed first. > >> > >> I believe this should be true.. on the gpu side, I'm mostly trying to > >> avoid having to power the gpu back on to free buffers. 
(On the v4l2 > >> side, somewhere in the core videobuf code would also need to be made > >> to wrap it's dma_unmap_sg() with pm_runtime_get/put()..) > > > > Right, and we shouldn't have to resume it if we suspend it in a clean state, > > with the TLBs invalidated. > > > > I guess if the device_link() stuff ensured the attached device > (gpu/etc) was suspended before suspending the iommu, then I guess I > can't see how temporarily putting the iommu in bypass would be a > problem. I haven't looked at the device_link() stuff too closely, but > iommu being resumed first and suspended last seems like the only thing > that would make sense. I'm mostly just nervous about iommu in bypass > vs gpu since userspace has so much control over what address gpu > writes to / reads from, so getting it wrong w/ the iommu would be a > rather bad thing ;-) Right, but we can also configure it to terminate if you don't want bypass. Will
On Fri, Jul 14, 2017 at 3:36 PM, Will Deacon <will.deacon@arm.com> wrote: > On Fri, Jul 14, 2017 at 03:34:42PM -0400, Rob Clark wrote: >> On Fri, Jul 14, 2017 at 3:01 PM, Will Deacon <will.deacon@arm.com> wrote: >> > On Fri, Jul 14, 2017 at 02:25:45PM -0400, Rob Clark wrote: >> >> On Fri, Jul 14, 2017 at 2:06 PM, Will Deacon <will.deacon@arm.com> wrote: >> >> > On Fri, Jul 14, 2017 at 01:42:13PM -0400, Rob Clark wrote: >> >> >> On Fri, Jul 14, 2017 at 1:07 PM, Will Deacon <will.deacon@arm.com> wrote: >> >> >> > On Thu, Jul 13, 2017 at 10:55:10AM -0400, Rob Clark wrote: >> >> >> >> On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote: >> >> >> >> > Hi, >> >> >> >> > >> >> >> >> > On 7/13/2017 5:20 PM, Rob Clark wrote: >> >> >> >> >> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: >> >> >> >> >>> Hi Vivek, >> >> >> >> >>> >> >> >> >> >>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >> >> >> >> >>>> Hi Stephen, >> >> >> >> >>>> >> >> >> >> >>>> >> >> >> >> >>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >> >> >> >> >>>>> On 07/06, Vivek Gautam wrote: >> >> >> >> >>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >> >> >> >> >>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >> >> >> >> >>>>>> size_t size) >> >> >> >> >>>>>> { >> >> >> >> >>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >> >> >> >> >>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >> >> >> >> >>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >> >> >> >> >>>>>> + size_t ret; >> >> >> >> >>>>>> if (!ops) >> >> >> >> >>>>>> return 0; >> >> >> >> >>>>>> - return ops->unmap(ops, iova, size); >> >> >> >> >>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >> >> >> >> >>>>> Can these map/unmap ops be called from an atomic context? I seem >> >> >> >> >>>>> to recall that being a problem before. 
>> >> >> >> >>>> >> >> >> >> >>>> That's something which was dropped in the following patch merged in master: >> >> >> >> >>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >> >> >> >> >>>> >> >> >> >> >>>> Looks like we don't need locks here anymore? >> >> >> >> >>> >> >> >> >> >>> Apart from the locking, wonder why a explicit pm_runtime is needed >> >> >> >> >>> from unmap. Somehow looks like some path in the master using that >> >> >> >> >>> should have enabled the pm ? >> >> >> >> >>> >> >> >> >> >> >> >> >> >> >> Yes, there are a bunch of scenarios where unmap can happen with >> >> >> >> >> disabled master (but not in atomic context). On the gpu side we >> >> >> >> >> opportunistically keep a buffer mapping until the buffer is freed >> >> >> >> >> (which can happen after gpu is disabled). Likewise, v4l2 won't unmap >> >> >> >> >> an exported dmabuf while some other driver holds a reference to it >> >> >> >> >> (which can be dropped when the v4l2 device is suspended). >> >> >> >> >> >> >> >> >> >> Since unmap triggers tbl flush which touches iommu regs, the iommu >> >> >> >> >> driver *definitely* needs a pm_runtime_get_sync(). >> >> >> >> > >> >> >> >> > Ok, with that being the case, there are two things here, >> >> >> >> > >> >> >> >> > 1) If the device links are still intact at these places where unmap is called, >> >> >> >> > then pm_runtime from the master would setup the all the clocks. That would >> >> >> >> > avoid reintroducing the locking indirectly here. >> >> >> >> > >> >> >> >> > 2) If not, then doing it here is the only way. But for both cases, since >> >> >> >> > the unmap can be called from atomic context, resume handler here should >> >> >> >> > avoid doing clk_prepare_enable , instead move the clk_prepare to the init. >> >> >> >> > >> >> >> >> >> >> >> >> I do kinda like the approach Marek suggested.. of deferring the tlb >> >> >> >> flush until resume. 
I'm wondering if we could combine that with >> >> >> >> putting the mmu in a stalled state when we suspend (and not resume the >> >> >> >> mmu until after the pending tlb flush)? >> >> >> > >> >> >> > I'm not sure that a stalled state is what we're after here, because we need >> >> >> > to take care to prevent any table walks if we've freed the underlying pages. >> >> >> > What we could try to do is disable the SMMU (put into global bypass) and >> >> >> > invalidate the TLB when performing a suspend operation, then we just ignore >> >> >> > invalidation whilst the clocks are stopped and, on resume, enable the SMMU >> >> >> > again. >> >> >> >> >> >> wouldn't stalled just block any memory transactions by device(s) using >> >> >> the context bank? Putting it in bypass isn't really a good thing if >> >> >> there is any chance the device can sneak in a memory access before >> >> >> we've taking it back out of bypass (ie. makes gpu a giant userspace >> >> >> controlled root hole). >> >> > >> >> > If it doesn't deadlock, then yes, it will stall transactions. However, that >> >> > doesn't mean it necessarily prevents page table walks. >> >> >> >> btw, I guess the concern about pagetable walk is that the unmap could >> >> have removed some sub-level of the pt that the tlb walk would hit? >> >> Would deferring freeing those pages help? >> > >> > Could do, but it sounds like a lot of complication that I think we can fix >> > by making the suspend operation put the SMMU into a "clean" state. >> > >> >> > Instead of bypass, we >> >> > could configure all the streams to terminate, but this race still worries me >> >> > somewhat. I thought that the SMMU would only be suspended if all of its >> >> > masters were suspended, so if the GPU wants to come out of suspend then the >> >> > SMMU should be resumed first. >> >> >> >> I believe this should be true.. on the gpu side, I'm mostly trying to >> >> avoid having to power the gpu back on to free buffers. 
(On the v4l2 >> >> side, somewhere in the core videobuf code would also need to be made >> >> to wrap it's dma_unmap_sg() with pm_runtime_get/put()..) >> > >> > Right, and we shouldn't have to resume it if we suspend it in a clean state, >> > with the TLBs invalidated. >> > >> >> I guess if the device_link() stuff ensured the attached device >> (gpu/etc) was suspended before suspending the iommu, then I guess I >> can't see how temporarily putting the iommu in bypass would be a >> problem. I haven't looked at the device_link() stuff too closely, but >> iommu being resumed first and suspended last seems like the only thing >> that would make sense. I'm mostly just nervous about iommu in bypass >> vs gpu since userspace has so much control over what address gpu >> writes to / reads from, so getting it wrong w/ the iommu would be a >> rather bad thing ;-) > > Right, but we can also configure it to terminate if you don't want bypass. > ok, terminate wfm BR, -R
Hi, On 7/15/2017 1:09 AM, Rob Clark wrote: > On Fri, Jul 14, 2017 at 3:36 PM, Will Deacon <will.deacon@arm.com> wrote: >> On Fri, Jul 14, 2017 at 03:34:42PM -0400, Rob Clark wrote: >>> On Fri, Jul 14, 2017 at 3:01 PM, Will Deacon <will.deacon@arm.com> wrote: >>>> On Fri, Jul 14, 2017 at 02:25:45PM -0400, Rob Clark wrote: >>>>> On Fri, Jul 14, 2017 at 2:06 PM, Will Deacon <will.deacon@arm.com> wrote: >>>>>> On Fri, Jul 14, 2017 at 01:42:13PM -0400, Rob Clark wrote: >>>>>>> On Fri, Jul 14, 2017 at 1:07 PM, Will Deacon <will.deacon@arm.com> wrote: >>>>>>>> On Thu, Jul 13, 2017 at 10:55:10AM -0400, Rob Clark wrote: >>>>>>>>> On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote: >>>>>>>>>> Hi, >>>>>>>>>> >>>>>>>>>> On 7/13/2017 5:20 PM, Rob Clark wrote: >>>>>>>>>>> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: >>>>>>>>>>>> Hi Vivek, >>>>>>>>>>>> >>>>>>>>>>>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >>>>>>>>>>>>> Hi Stephen, >>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>>>>>>>>>>>>> On 07/06, Vivek Gautam wrote: >>>>>>>>>>>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >>>>>>>>>>>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >>>>>>>>>>>>>>> size_t size) >>>>>>>>>>>>>>> { >>>>>>>>>>>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>>>>>>>>>>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>>>>>>>>>>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>>>>>>>>>>>>> + size_t ret; >>>>>>>>>>>>>>> if (!ops) >>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>> - return ops->unmap(ops, iova, size); >>>>>>>>>>>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >>>>>>>>>>>>>> Can these map/unmap ops be called from an atomic context? I seem >>>>>>>>>>>>>> to recall that being a problem before. 
>>>>>>>>>>>>> >>>>>>>>>>>>> That's something which was dropped in the following patch merged in master: >>>>>>>>>>>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >>>>>>>>>>>>> >>>>>>>>>>>>> Looks like we don't need locks here anymore? >>>>>>>>>>>> >>>>>>>>>>>> Apart from the locking, wonder why a explicit pm_runtime is needed >>>>>>>>>>>> from unmap. Somehow looks like some path in the master using that >>>>>>>>>>>> should have enabled the pm ? >>>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> Yes, there are a bunch of scenarios where unmap can happen with >>>>>>>>>>> disabled master (but not in atomic context). On the gpu side we >>>>>>>>>>> opportunistically keep a buffer mapping until the buffer is freed >>>>>>>>>>> (which can happen after gpu is disabled). Likewise, v4l2 won't unmap >>>>>>>>>>> an exported dmabuf while some other driver holds a reference to it >>>>>>>>>>> (which can be dropped when the v4l2 device is suspended). >>>>>>>>>>> >>>>>>>>>>> Since unmap triggers tbl flush which touches iommu regs, the iommu >>>>>>>>>>> driver *definitely* needs a pm_runtime_get_sync(). >>>>>>>>>> >>>>>>>>>> Ok, with that being the case, there are two things here, >>>>>>>>>> >>>>>>>>>> 1) If the device links are still intact at these places where unmap is called, >>>>>>>>>> then pm_runtime from the master would setup the all the clocks. That would >>>>>>>>>> avoid reintroducing the locking indirectly here. >>>>>>>>>> >>>>>>>>>> 2) If not, then doing it here is the only way. But for both cases, since >>>>>>>>>> the unmap can be called from atomic context, resume handler here should >>>>>>>>>> avoid doing clk_prepare_enable , instead move the clk_prepare to the init. >>>>>>>>>> >>>>>>>>> >>>>>>>>> I do kinda like the approach Marek suggested.. of deferring the tlb >>>>>>>>> flush until resume. I'm wondering if we could combine that with >>>>>>>>> putting the mmu in a stalled state when we suspend (and not resume the >>>>>>>>> mmu until after the pending tlb flush)? 
>>>>>>>> >>>>>>>> I'm not sure that a stalled state is what we're after here, because we need >>>>>>>> to take care to prevent any table walks if we've freed the underlying pages. >>>>>>>> What we could try to do is disable the SMMU (put into global bypass) and >>>>>>>> invalidate the TLB when performing a suspend operation, then we just ignore >>>>>>>> invalidation whilst the clocks are stopped and, on resume, enable the SMMU >>>>>>>> again. >>>>>>> >>>>>>> wouldn't stalled just block any memory transactions by device(s) using >>>>>>> the context bank? Putting it in bypass isn't really a good thing if >>>>>>> there is any chance the device can sneak in a memory access before >>>>>>> we've taking it back out of bypass (ie. makes gpu a giant userspace >>>>>>> controlled root hole). >>>>>> >>>>>> If it doesn't deadlock, then yes, it will stall transactions. However, that >>>>>> doesn't mean it necessarily prevents page table walks. >>>>> >>>>> btw, I guess the concern about pagetable walk is that the unmap could >>>>> have removed some sub-level of the pt that the tlb walk would hit? >>>>> Would deferring freeing those pages help? >>>> >>>> Could do, but it sounds like a lot of complication that I think we can fix >>>> by making the suspend operation put the SMMU into a "clean" state. >>>> >>>>>> Instead of bypass, we >>>>>> could configure all the streams to terminate, but this race still worries me >>>>>> somewhat. I thought that the SMMU would only be suspended if all of its >>>>>> masters were suspended, so if the GPU wants to come out of suspend then the >>>>>> SMMU should be resumed first. >>>>> >>>>> I believe this should be true.. on the gpu side, I'm mostly trying to >>>>> avoid having to power the gpu back on to free buffers. (On the v4l2 >>>>> side, somewhere in the core videobuf code would also need to be made >>>>> to wrap it's dma_unmap_sg() with pm_runtime_get/put()..) 
>>>> >>>> Right, and we shouldn't have to resume it if we suspend it in a clean state, >>>> with the TLBs invalidated. >>>> >>> >>> I guess if the device_link() stuff ensured the attached device >>> (gpu/etc) was suspended before suspending the iommu, then I guess I >>> can't see how temporarily putting the iommu in bypass would be a >>> problem. I haven't looked at the device_link() stuff too closely, but >>> iommu being resumed first and suspended last seems like the only thing >>> that would make sense. I'm mostly just nervous about iommu in bypass >>> vs gpu since userspace has so much control over what address gpu >>> writes to / reads from, so getting it wrong w/ the iommu would be a >>> rather bad thing ;-) >> >> Right, but we can also configure it to terminate if you don't want bypass. >> > But one thing here is, with device links in the picture, iommu suspend/resume is called along with the master. That means we can end up cleaning even active entries on the suspend path, if suspend is going to put the smmu into a clean state every time. So if the masters are following the pm_runtime sequence before a dma_map/unmap operation, that seems better. Regards, Sricharan
Hi, On 7/17/2017 5:16 PM, Sricharan R wrote: > Hi, > > On 7/15/2017 1:09 AM, Rob Clark wrote: >> On Fri, Jul 14, 2017 at 3:36 PM, Will Deacon <will.deacon@arm.com> wrote: >>> On Fri, Jul 14, 2017 at 03:34:42PM -0400, Rob Clark wrote: >>>> On Fri, Jul 14, 2017 at 3:01 PM, Will Deacon <will.deacon@arm.com> wrote: >>>>> On Fri, Jul 14, 2017 at 02:25:45PM -0400, Rob Clark wrote: >>>>>> On Fri, Jul 14, 2017 at 2:06 PM, Will Deacon <will.deacon@arm.com> wrote: >>>>>>> On Fri, Jul 14, 2017 at 01:42:13PM -0400, Rob Clark wrote: >>>>>>>> On Fri, Jul 14, 2017 at 1:07 PM, Will Deacon <will.deacon@arm.com> wrote: >>>>>>>>> On Thu, Jul 13, 2017 at 10:55:10AM -0400, Rob Clark wrote: >>>>>>>>>> On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote: >>>>>>>>>>> Hi, >>>>>>>>>>> >>>>>>>>>>> On 7/13/2017 5:20 PM, Rob Clark wrote: >>>>>>>>>>>> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: >>>>>>>>>>>>> Hi Vivek, >>>>>>>>>>>>> >>>>>>>>>>>>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >>>>>>>>>>>>>> Hi Stephen, >>>>>>>>>>>>>> >>>>>>>>>>>>>> >>>>>>>>>>>>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>>>>>>>>>>>>>> On 07/06, Vivek Gautam wrote: >>>>>>>>>>>>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >>>>>>>>>>>>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >>>>>>>>>>>>>>>> size_t size) >>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>>>>>>>>>>>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>>>>>>>>>>>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>>>>>>>>>>>>>> + size_t ret; >>>>>>>>>>>>>>>> if (!ops) >>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>> - return ops->unmap(ops, iova, size); >>>>>>>>>>>>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >>>>>>>>>>>>>>> Can these map/unmap ops be called from an atomic context? 
I seem >>>>>>>>>>>>>>> to recall that being a problem before. >>>>>>>>>>>>>> >>>>>>>>>>>>>> That's something which was dropped in the following patch merged in master: >>>>>>>>>>>>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >>>>>>>>>>>>>> >>>>>>>>>>>>>> Looks like we don't need locks here anymore? >>>>>>>>>>>>> >>>>>>>>>>>>> Apart from the locking, wonder why a explicit pm_runtime is needed >>>>>>>>>>>>> from unmap. Somehow looks like some path in the master using that >>>>>>>>>>>>> should have enabled the pm ? >>>>>>>>>>>>> >>>>>>>>>>>> >>>>>>>>>>>> Yes, there are a bunch of scenarios where unmap can happen with >>>>>>>>>>>> disabled master (but not in atomic context). On the gpu side we >>>>>>>>>>>> opportunistically keep a buffer mapping until the buffer is freed >>>>>>>>>>>> (which can happen after gpu is disabled). Likewise, v4l2 won't unmap >>>>>>>>>>>> an exported dmabuf while some other driver holds a reference to it >>>>>>>>>>>> (which can be dropped when the v4l2 device is suspended). >>>>>>>>>>>> >>>>>>>>>>>> Since unmap triggers tbl flush which touches iommu regs, the iommu >>>>>>>>>>>> driver *definitely* needs a pm_runtime_get_sync(). >>>>>>>>>>> >>>>>>>>>>> Ok, with that being the case, there are two things here, >>>>>>>>>>> >>>>>>>>>>> 1) If the device links are still intact at these places where unmap is called, >>>>>>>>>>> then pm_runtime from the master would setup the all the clocks. That would >>>>>>>>>>> avoid reintroducing the locking indirectly here. >>>>>>>>>>> >>>>>>>>>>> 2) If not, then doing it here is the only way. But for both cases, since >>>>>>>>>>> the unmap can be called from atomic context, resume handler here should >>>>>>>>>>> avoid doing clk_prepare_enable , instead move the clk_prepare to the init. >>>>>>>>>>> >>>>>>>>>> >>>>>>>>>> I do kinda like the approach Marek suggested.. of deferring the tlb >>>>>>>>>> flush until resume. 
I'm wondering if we could combine that with >>>>>>>>>> putting the mmu in a stalled state when we suspend (and not resume the >>>>>>>>>> mmu until after the pending tlb flush)? >>>>>>>>> >>>>>>>>> I'm not sure that a stalled state is what we're after here, because we need >>>>>>>>> to take care to prevent any table walks if we've freed the underlying pages. >>>>>>>>> What we could try to do is disable the SMMU (put into global bypass) and >>>>>>>>> invalidate the TLB when performing a suspend operation, then we just ignore >>>>>>>>> invalidation whilst the clocks are stopped and, on resume, enable the SMMU >>>>>>>>> again. >>>>>>>> >>>>>>>> wouldn't stalled just block any memory transactions by device(s) using >>>>>>>> the context bank? Putting it in bypass isn't really a good thing if >>>>>>>> there is any chance the device can sneak in a memory access before >>>>>>>> we've taking it back out of bypass (ie. makes gpu a giant userspace >>>>>>>> controlled root hole). >>>>>>> >>>>>>> If it doesn't deadlock, then yes, it will stall transactions. However, that >>>>>>> doesn't mean it necessarily prevents page table walks. >>>>>> >>>>>> btw, I guess the concern about pagetable walk is that the unmap could >>>>>> have removed some sub-level of the pt that the tlb walk would hit? >>>>>> Would deferring freeing those pages help? >>>>> >>>>> Could do, but it sounds like a lot of complication that I think we can fix >>>>> by making the suspend operation put the SMMU into a "clean" state. >>>>> >>>>>>> Instead of bypass, we >>>>>>> could configure all the streams to terminate, but this race still worries me >>>>>>> somewhat. I thought that the SMMU would only be suspended if all of its >>>>>>> masters were suspended, so if the GPU wants to come out of suspend then the >>>>>>> SMMU should be resumed first. >>>>>> >>>>>> I believe this should be true.. on the gpu side, I'm mostly trying to >>>>>> avoid having to power the gpu back on to free buffers. 
>>>>>> (On the v4l2 >>>>>> side, somewhere in the core videobuf code would also need to be made >>>>>> to wrap it's dma_unmap_sg() with pm_runtime_get/put()..) >>>>> >>>>> Right, and we shouldn't have to resume it if we suspend it in a clean state, >>>>> with the TLBs invalidated. >>>>> >>>> >>>> I guess if the device_link() stuff ensured the attached device >>>> (gpu/etc) was suspended before suspending the iommu, then I guess I >>>> can't see how temporarily putting the iommu in bypass would be a >>>> problem. I haven't looked at the device_link() stuff too closely, but >>>> iommu being resumed first and suspended last seems like the only thing >>>> that would make sense. I'm mostly just nervous about iommu in bypass >>>> vs gpu since userspace has so much control over what address gpu >>>> writes to / reads from, so getting it wrong w/ the iommu would be a >>>> rather bad thing ;-) >>> >>> Right, but we can also configure it to terminate if you don't want bypass. >>> >> > > But one thing here is, with devicelinks in picture, iommu suspend/resume > is called along with the master. That means, we can end up cleaning even > active entries on the suspend path ?, if suspend is going to > put the smmu in to a clean state every time. So if the master's are following > the pm_runtime sequence before a dma_map/unmap operation, that seems better. > Also, for the use case of unmap being done from masters like the GPU while it is already suspended, following Marek's approach of checking the smmu state in unmap and deferring the TLB flush till resume seems the correct way. All of the above is true if we want to use device_link. Regards, Sricharan
Hi, On Mon, Jul 17, 2017 at 5:58 PM, Sricharan R <sricharan@codeaurora.org> wrote: > Hi, > > On 7/17/2017 5:16 PM, Sricharan R wrote: >> Hi, >> >> On 7/15/2017 1:09 AM, Rob Clark wrote: >>> On Fri, Jul 14, 2017 at 3:36 PM, Will Deacon <will.deacon@arm.com> wrote: >>>> On Fri, Jul 14, 2017 at 03:34:42PM -0400, Rob Clark wrote: >>>>> On Fri, Jul 14, 2017 at 3:01 PM, Will Deacon <will.deacon@arm.com> wrote: >>>>>> On Fri, Jul 14, 2017 at 02:25:45PM -0400, Rob Clark wrote: >>>>>>> On Fri, Jul 14, 2017 at 2:06 PM, Will Deacon <will.deacon@arm.com> wrote: >>>>>>>> On Fri, Jul 14, 2017 at 01:42:13PM -0400, Rob Clark wrote: >>>>>>>>> On Fri, Jul 14, 2017 at 1:07 PM, Will Deacon <will.deacon@arm.com> wrote: >>>>>>>>>> On Thu, Jul 13, 2017 at 10:55:10AM -0400, Rob Clark wrote: >>>>>>>>>>> On Thu, Jul 13, 2017 at 9:53 AM, Sricharan R <sricharan@codeaurora.org> wrote: >>>>>>>>>>>> Hi, >>>>>>>>>>>> >>>>>>>>>>>> On 7/13/2017 5:20 PM, Rob Clark wrote: >>>>>>>>>>>>> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: >>>>>>>>>>>>>> Hi Vivek, >>>>>>>>>>>>>> >>>>>>>>>>>>>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >>>>>>>>>>>>>>> Hi Stephen, >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>>>>>>>>>>>>>>> On 07/06, Vivek Gautam wrote: >>>>>>>>>>>>>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >>>>>>>>>>>>>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >>>>>>>>>>>>>>>>> size_t size) >>>>>>>>>>>>>>>>> { >>>>>>>>>>>>>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>>>>>>>>>>>>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>>>>>>>>>>>>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>>>>>>>>>>>>>>> + size_t ret; >>>>>>>>>>>>>>>>> if (!ops) >>>>>>>>>>>>>>>>> return 0; >>>>>>>>>>>>>>>>> - return ops->unmap(ops, iova, size); >>>>>>>>>>>>>>>>> + 
pm_runtime_get_sync(smmu_domain->smmu->dev); >>>>>>>>>>>>>>>> Can these map/unmap ops be called from an atomic context? I seem >>>>>>>>>>>>>>>> to recall that being a problem before. >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> That's something which was dropped in the following patch merged in master: >>>>>>>>>>>>>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >>>>>>>>>>>>>>> >>>>>>>>>>>>>>> Looks like we don't need locks here anymore? >>>>>>>>>>>>>> >>>>>>>>>>>>>> Apart from the locking, wonder why a explicit pm_runtime is needed >>>>>>>>>>>>>> from unmap. Somehow looks like some path in the master using that >>>>>>>>>>>>>> should have enabled the pm ? >>>>>>>>>>>>>> >>>>>>>>>>>>> >>>>>>>>>>>>> Yes, there are a bunch of scenarios where unmap can happen with >>>>>>>>>>>>> disabled master (but not in atomic context). On the gpu side we >>>>>>>>>>>>> opportunistically keep a buffer mapping until the buffer is freed >>>>>>>>>>>>> (which can happen after gpu is disabled). Likewise, v4l2 won't unmap >>>>>>>>>>>>> an exported dmabuf while some other driver holds a reference to it >>>>>>>>>>>>> (which can be dropped when the v4l2 device is suspended). >>>>>>>>>>>>> >>>>>>>>>>>>> Since unmap triggers tbl flush which touches iommu regs, the iommu >>>>>>>>>>>>> driver *definitely* needs a pm_runtime_get_sync(). >>>>>>>>>>>> >>>>>>>>>>>> Ok, with that being the case, there are two things here, >>>>>>>>>>>> >>>>>>>>>>>> 1) If the device links are still intact at these places where unmap is called, >>>>>>>>>>>> then pm_runtime from the master would setup the all the clocks. That would >>>>>>>>>>>> avoid reintroducing the locking indirectly here. >>>>>>>>>>>> >>>>>>>>>>>> 2) If not, then doing it here is the only way. But for both cases, since >>>>>>>>>>>> the unmap can be called from atomic context, resume handler here should >>>>>>>>>>>> avoid doing clk_prepare_enable , instead move the clk_prepare to the init. 
>>>>>>>>>>>> >>>>>>>>>>> >>>>>>>>>>> I do kinda like the approach Marek suggested.. of deferring the tlb >>>>>>>>>>> flush until resume. I'm wondering if we could combine that with >>>>>>>>>>> putting the mmu in a stalled state when we suspend (and not resume the >>>>>>>>>>> mmu until after the pending tlb flush)? >>>>>>>>>> >>>>>>>>>> I'm not sure that a stalled state is what we're after here, because we need >>>>>>>>>> to take care to prevent any table walks if we've freed the underlying pages. >>>>>>>>>> What we could try to do is disable the SMMU (put into global bypass) and >>>>>>>>>> invalidate the TLB when performing a suspend operation, then we just ignore >>>>>>>>>> invalidation whilst the clocks are stopped and, on resume, enable the SMMU >>>>>>>>>> again. >>>>>>>>> >>>>>>>>> wouldn't stalled just block any memory transactions by device(s) using >>>>>>>>> the context bank? Putting it in bypass isn't really a good thing if >>>>>>>>> there is any chance the device can sneak in a memory access before >>>>>>>>> we've taking it back out of bypass (ie. makes gpu a giant userspace >>>>>>>>> controlled root hole). >>>>>>>> >>>>>>>> If it doesn't deadlock, then yes, it will stall transactions. However, that >>>>>>>> doesn't mean it necessarily prevents page table walks. >>>>>>> >>>>>>> btw, I guess the concern about pagetable walk is that the unmap could >>>>>>> have removed some sub-level of the pt that the tlb walk would hit? >>>>>>> Would deferring freeing those pages help? >>>>>> >>>>>> Could do, but it sounds like a lot of complication that I think we can fix >>>>>> by making the suspend operation put the SMMU into a "clean" state. >>>>>> >>>>>>>> Instead of bypass, we >>>>>>>> could configure all the streams to terminate, but this race still worries me >>>>>>>> somewhat. I thought that the SMMU would only be suspended if all of its >>>>>>>> masters were suspended, so if the GPU wants to come out of suspend then the >>>>>>>> SMMU should be resumed first. 
>>>>>>> >>>>>>> I believe this should be true.. on the gpu side, I'm mostly trying to >>>>>>> avoid having to power the gpu back on to free buffers. (On the v4l2 >>>>>>> side, somewhere in the core videobuf code would also need to be made >>>>>>> to wrap it's dma_unmap_sg() with pm_runtime_get/put()..) >>>>>> >>>>>> Right, and we shouldn't have to resume it if we suspend it in a clean state, >>>>>> with the TLBs invalidated. >>>>>> >>>>> >>>>> I guess if the device_link() stuff ensured the attached device >>>>> (gpu/etc) was suspended before suspending the iommu, then I guess I >>>>> can't see how temporarily putting the iommu in bypass would be a >>>>> problem. I haven't looked at the device_link() stuff too closely, but >>>>> iommu being resumed first and suspended last seems like the only thing >>>>> that would make sense. I'm mostly just nervous about iommu in bypass >>>>> vs gpu since userspace has so much control over what address gpu >>>>> writes to / reads from, so getting it wrong w/ the iommu would be a >>>>> rather bad thing ;-) >>>> >>>> Right, but we can also configure it to terminate if you don't want bypass. >>>> >>> >> >> But one thing here is, with devicelinks in picture, iommu suspend/resume >> is called along with the master. That means, we can end up cleaning even >> active entries on the suspend path ?, if suspend is going to >> put the smmu in to a clean state every time. So if the master's are following >> the pm_runtime sequence before a dma_map/unmap operation, that seems better. >> > > Also, for the usecase of unmap being done from master's like GPU while it is already > suspended, then following the Marek's approach of checking for the smmu state while > in unmap and defer the TLB flush till resume seems correct way. All of the above > true if we want to use device_link. This sounds like a plan. I have a WIP version in which we will just check in 'unmap' if the mmu is already suspended. 
If yes, then we save the unmap request (domain, iova) and defer this request. On resume, we check the pending tlb flush requests, and call unmap. I will give this a try for the venus use-case. Regards Vivek > > Regards, > Sricharan > > -- > "QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation > > --- > This email has been checked for viruses by Avast antivirus software. > https://www.avast.com/antivirus >
On Thu, Jul 13, 2017 at 5:20 PM, Rob Clark <robdclark@gmail.com> wrote: > On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: >> Hi Vivek, >> >> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >>> Hi Stephen, >>> >>> >>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>>> On 07/06, Vivek Gautam wrote: >>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >>>>> size_t size) >>>>> { >>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>>> + size_t ret; >>>>> if (!ops) >>>>> return 0; >>>>> - return ops->unmap(ops, iova, size); >>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >>>> Can these map/unmap ops be called from an atomic context? I seem >>>> to recall that being a problem before. >>> >>> That's something which was dropped in the following patch merged in master: >>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >>> >>> Looks like we don't need locks here anymore? >> >> Apart from the locking, wonder why a explicit pm_runtime is needed >> from unmap. Somehow looks like some path in the master using that >> should have enabled the pm ? >> > > Yes, there are a bunch of scenarios where unmap can happen with > disabled master (but not in atomic context). I would like to understand whether there is a situation where an unmap is called in atomic context without an enabled master? Let's say we have the case where all the unmap calls in atomic context happen only from the master's context (in which case the device link should take care of the pm state of smmu), and the only unmap that happen in non-atomic context is the one with master disabled. 
In such a case doesn't it make sense to distinguish the atomic/non-atomic context and add pm_runtime_get_sync()/put_sync() only for the non-atomic context since that would be the one with master disabled. Thanks Vivek > On the gpu side we > opportunistically keep a buffer mapping until the buffer is freed > (which can happen after gpu is disabled). Likewise, v4l2 won't unmap > an exported dmabuf while some other driver holds a reference to it > (which can be dropped when the v4l2 device is suspended). > > Since unmap triggers tbl flush which touches iommu regs, the iommu > driver *definitely* needs a pm_runtime_get_sync(). > > BR, > -R > -- > To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Aug 7, 2017 at 4:27 AM, Vivek Gautam <vivek.gautam@codeaurora.org> wrote: > On Thu, Jul 13, 2017 at 5:20 PM, Rob Clark <robdclark@gmail.com> wrote: >> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: >>> Hi Vivek, >>> >>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >>>> Hi Stephen, >>>> >>>> >>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>>>> On 07/06, Vivek Gautam wrote: >>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >>>>>> size_t size) >>>>>> { >>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>>>> + size_t ret; >>>>>> if (!ops) >>>>>> return 0; >>>>>> - return ops->unmap(ops, iova, size); >>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >>>>> Can these map/unmap ops be called from an atomic context? I seem >>>>> to recall that being a problem before. >>>> >>>> That's something which was dropped in the following patch merged in master: >>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >>>> >>>> Looks like we don't need locks here anymore? >>> >>> Apart from the locking, wonder why a explicit pm_runtime is needed >>> from unmap. Somehow looks like some path in the master using that >>> should have enabled the pm ? >>> >> >> Yes, there are a bunch of scenarios where unmap can happen with >> disabled master (but not in atomic context). > > I would like to understand whether there is a situation where an unmap is > called in atomic context without an enabled master? 
> > Let's say we have the case where all the unmap calls in atomic context happen > only from the master's context (in which case the device link should > take care of > the pm state of smmu), and the only unmap that happen in non-atomic context > is the one with master disabled. In such a case doesn it make sense to > distinguish > the atomic/non-atomic context and add pm_runtime_get_sync()/put_sync() only > for the non-atomic context since that would be the one with master disabled. > At least drm/msm needs to hold obj->lock (a mutex) in unmap, so it won't unmap anything in atomic ctx (but it can unmap w/ master disabled). I can't really comment about other non-gpu drivers. It seems like a reasonable constraint that either master is enabled or not in atomic ctx. Currently we actually wrap unmap w/ pm_runtime_get/put_sync(), but I'd like to drop that to avoid powering up the gpu. BR, -R
Hi, On Mon, Aug 7, 2017 at 5:59 PM, Rob Clark <robdclark@gmail.com> wrote: > On Mon, Aug 7, 2017 at 4:27 AM, Vivek Gautam > <vivek.gautam@codeaurora.org> wrote: >> On Thu, Jul 13, 2017 at 5:20 PM, Rob Clark <robdclark@gmail.com> wrote: >>> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: >>>> Hi Vivek, >>>> >>>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >>>>> Hi Stephen, >>>>> >>>>> >>>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>>>>> On 07/06, Vivek Gautam wrote: >>>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >>>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >>>>>>> size_t size) >>>>>>> { >>>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>>>>> + size_t ret; >>>>>>> if (!ops) >>>>>>> return 0; >>>>>>> - return ops->unmap(ops, iova, size); >>>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >>>>>> Can these map/unmap ops be called from an atomic context? I seem >>>>>> to recall that being a problem before. >>>>> >>>>> That's something which was dropped in the following patch merged in master: >>>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >>>>> >>>>> Looks like we don't need locks here anymore? >>>> >>>> Apart from the locking, wonder why a explicit pm_runtime is needed >>>> from unmap. Somehow looks like some path in the master using that >>>> should have enabled the pm ? >>>> >>> >>> Yes, there are a bunch of scenarios where unmap can happen with >>> disabled master (but not in atomic context). >> >> I would like to understand whether there is a situation where an unmap is >> called in atomic context without an enabled master? 
>> >> Let's say we have the case where all the unmap calls in atomic context happen >> only from the master's context (in which case the device link should >> take care of >> the pm state of smmu), and the only unmap that happen in non-atomic context >> is the one with master disabled. In such a case doesn it make sense to >> distinguish >> the atomic/non-atomic context and add pm_runtime_get_sync()/put_sync() only >> for the non-atomic context since that would be the one with master disabled. >> > > At least drm/msm needs to hold obj->lock (a mutex) in unmap, so it > won't unmap anything in atomic ctx (but it can unmap w/ master > disabled). I can't really comment about other non-gpu drivers. It > seems like a reasonable constraint that either master is enabled or > not in atomic ctx. > > Currently we actually wrap unmap w/ pm_runtime_get/put_sync(), but I'd > like to drop that to avoid powering up the gpu. Since the deferring the TLB maintenance doesn't look like the best approach [1], how about if we try to power-up only the smmu from different client devices such as, GPU in the unmap path. Then we won't need to add pm_runtime_get/put() calls in arm_smmu_unmap(). The client device can use something like - pm_runtime_get_supplier() since we already have the device link in place with this patch series. This should power-on the supplier (which is smmu) without turning on the consumer (such as GPU). pm_runtime_get_supplier() however is not exported at this moment. Will it be useful to export this API and use it in the drivers. Adding Rafael J. Wysocki for suggestions on pm_runtime_get_suppliers() API. [1] https://patchwork.kernel.org/patch/9876489/ Best regards Vivek > > BR, > -R > -- > To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
On 11/15, Vivek Gautam wrote: > Hi, > > > On Mon, Aug 7, 2017 at 5:59 PM, Rob Clark <robdclark@gmail.com> wrote: > > On Mon, Aug 7, 2017 at 4:27 AM, Vivek Gautam > > <vivek.gautam@codeaurora.org> wrote: > >> On Thu, Jul 13, 2017 at 5:20 PM, Rob Clark <robdclark@gmail.com> wrote: > >>> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: > >>>> Hi Vivek, > >>>> > >>>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: > >>>>> Hi Stephen, > >>>>> > >>>>> > >>>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: > >>>>>> On 07/06, Vivek Gautam wrote: > >>>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, > >>>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, > >>>>>>> size_t size) > >>>>>>> { > >>>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; > >>>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); > >>>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; > >>>>>>> + size_t ret; > >>>>>>> if (!ops) > >>>>>>> return 0; > >>>>>>> - return ops->unmap(ops, iova, size); > >>>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); > >>>>>> Can these map/unmap ops be called from an atomic context? I seem > >>>>>> to recall that being a problem before. > >>>>> > >>>>> That's something which was dropped in the following patch merged in master: > >>>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock > >>>>> > >>>>> Looks like we don't need locks here anymore? > >>>> > >>>> Apart from the locking, wonder why a explicit pm_runtime is needed > >>>> from unmap. Somehow looks like some path in the master using that > >>>> should have enabled the pm ? > >>>> > >>> > >>> Yes, there are a bunch of scenarios where unmap can happen with > >>> disabled master (but not in atomic context). > >> > >> I would like to understand whether there is a situation where an unmap is > >> called in atomic context without an enabled master? 
> >> > >> Let's say we have the case where all the unmap calls in atomic context happen > >> only from the master's context (in which case the device link should > >> take care of > >> the pm state of smmu), and the only unmap that happen in non-atomic context > >> is the one with master disabled. In such a case doesn it make sense to > >> distinguish > >> the atomic/non-atomic context and add pm_runtime_get_sync()/put_sync() only > >> for the non-atomic context since that would be the one with master disabled. > >> > > > > At least drm/msm needs to hold obj->lock (a mutex) in unmap, so it > > won't unmap anything in atomic ctx (but it can unmap w/ master > > disabled). I can't really comment about other non-gpu drivers. It > > seems like a reasonable constraint that either master is enabled or > > not in atomic ctx. > > > > Currently we actually wrap unmap w/ pm_runtime_get/put_sync(), but I'd > > like to drop that to avoid powering up the gpu. > > Since the deferring the TLB maintenance doesn't look like the best approach [1], > how about if we try to power-up only the smmu from different client > devices such as, > GPU in the unmap path. Then we won't need to add pm_runtime_get/put() calls in > arm_smmu_unmap(). > > The client device can use something like - pm_runtime_get_supplier() since > we already have the device link in place with this patch series. This should > power-on the supplier (which is smmu) without turning on the consumer > (such as GPU). > > pm_runtime_get_supplier() however is not exported at this moment. > Will it be useful to export this API and use it in the drivers. > I'm not sure pm_runtime_get_supplier() is correct either. That feels like we're relying on the GPU driver knowing the internal details of how the device links are configured. Is there some way to have the GPU driver know in its runtime PM resume hook that it doesn't need to be powered on because it isn't actively drawing anything or processing commands? 
I'm thinking of the code calling pm_runtime_get() as proposed around the IOMMU unmap path in the GPU driver and then having the runtime PM resume hook in the GPU driver return some special value to indicate that it didn't really resume because it didn't need to and to treat the device as runtime suspended but not return an error. Then the runtime PM core can keep track of that and try to power the GPU on again when another pm_runtime_get() is called on the GPU device. This keeps the consumer API the same, always pm_runtime_get(), but leaves the device driver logic of what to do when the GPU doesn't need to power on to the runtime PM hook where the driver has all the information.
On Mon, Nov 27, 2017 at 5:22 PM, Stephen Boyd <sboyd@codeaurora.org> wrote: > On 11/15, Vivek Gautam wrote: >> Hi, >> >> >> On Mon, Aug 7, 2017 at 5:59 PM, Rob Clark <robdclark@gmail.com> wrote: >> > On Mon, Aug 7, 2017 at 4:27 AM, Vivek Gautam >> > <vivek.gautam@codeaurora.org> wrote: >> >> On Thu, Jul 13, 2017 at 5:20 PM, Rob Clark <robdclark@gmail.com> wrote: >> >>> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R <sricharan@codeaurora.org> wrote: >> >>>> Hi Vivek, >> >>>> >> >>>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >> >>>>> Hi Stephen, >> >>>>> >> >>>>> >> >>>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >> >>>>>> On 07/06, Vivek Gautam wrote: >> >>>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >> >>>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >> >>>>>>> size_t size) >> >>>>>>> { >> >>>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >> >>>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >> >>>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >> >>>>>>> + size_t ret; >> >>>>>>> if (!ops) >> >>>>>>> return 0; >> >>>>>>> - return ops->unmap(ops, iova, size); >> >>>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >> >>>>>> Can these map/unmap ops be called from an atomic context? I seem >> >>>>>> to recall that being a problem before. >> >>>>> >> >>>>> That's something which was dropped in the following patch merged in master: >> >>>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >> >>>>> >> >>>>> Looks like we don't need locks here anymore? >> >>>> >> >>>> Apart from the locking, wonder why a explicit pm_runtime is needed >> >>>> from unmap. Somehow looks like some path in the master using that >> >>>> should have enabled the pm ? >> >>>> >> >>> >> >>> Yes, there are a bunch of scenarios where unmap can happen with >> >>> disabled master (but not in atomic context). 
>> >> >> >> I would like to understand whether there is a situation where an unmap is >> >> called in atomic context without an enabled master? >> >> >> >> Let's say we have the case where all the unmap calls in atomic context happen >> >> only from the master's context (in which case the device link should >> >> take care of >> >> the pm state of smmu), and the only unmap that happen in non-atomic context >> >> is the one with master disabled. In such a case doesn it make sense to >> >> distinguish >> >> the atomic/non-atomic context and add pm_runtime_get_sync()/put_sync() only >> >> for the non-atomic context since that would be the one with master disabled. >> >> >> > >> > At least drm/msm needs to hold obj->lock (a mutex) in unmap, so it >> > won't unmap anything in atomic ctx (but it can unmap w/ master >> > disabled). I can't really comment about other non-gpu drivers. It >> > seems like a reasonable constraint that either master is enabled or >> > not in atomic ctx. >> > >> > Currently we actually wrap unmap w/ pm_runtime_get/put_sync(), but I'd >> > like to drop that to avoid powering up the gpu. >> >> Since the deferring the TLB maintenance doesn't look like the best approach [1], >> how about if we try to power-up only the smmu from different client >> devices such as, >> GPU in the unmap path. Then we won't need to add pm_runtime_get/put() calls in >> arm_smmu_unmap(). >> >> The client device can use something like - pm_runtime_get_supplier() since >> we already have the device link in place with this patch series. This should >> power-on the supplier (which is smmu) without turning on the consumer >> (such as GPU). >> >> pm_runtime_get_supplier() however is not exported at this moment. >> Will it be useful to export this API and use it in the drivers. >> > > I'm not sure pm_runtime_get_supplier() is correct either. That > feels like we're relying on the GPU driver knowing the internal > details of how the device links are configured. 
> what does pm_runtime_get_supplier() do if IOMMU driver hasn't setup device-link? If it is a no-op, then I guess the GPU driver calling pm_runtime_get_supplier() seems reasonable, and less annoying than having special cases in pm_resume path.. I don't feel too bad about having "just in case" get/put_supplier() calls in the unmap path. Also, presumably we still want to avoid powering up GPU even if we short circuit the firmware loading and rest of "booting up the GPU".. since presumably the GPU draws somewhat more power than the IOMMU.. having the pm_resume/suspend path know about the diff between waking up / suspending the iommu and itself doesn't really feel less-bad than just doing "just in case" get/put_supplier() calls. BR, -R > Is there some way to have the GPU driver know in its runtime PM > resume hook that it doesn't need to be powered on because it > isn't actively drawing anything or processing commands? I'm > thinking of the code calling pm_runtime_get() as proposed around > the IOMMU unmap path in the GPU driver and then having the > runtime PM resume hook in the GPU driver return some special > value to indicate that it didn't really resume because it didn't > need to and to treat the device as runtime suspended but not > return an error. Then the runtime PM core can keep track of that > and try to power the GPU on again when another pm_runtime_get() > is called on the GPU device. > > This keeps the consumer API the same, always pm_runtime_get(), > but leaves the device driver logic of what to do when the GPU > doesn't need to power on to the runtime PM hook where the driver > has all the information. > > -- > Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, > a Linux Foundation Collaborative Project
On 11/28/2017 05:13 AM, Rob Clark wrote: > On Mon, Nov 27, 2017 at 5:22 PM, Stephen Boyd<sboyd@codeaurora.org> wrote: >> On 11/15, Vivek Gautam wrote: >>> Hi, >>> >>> >>> On Mon, Aug 7, 2017 at 5:59 PM, Rob Clark<robdclark@gmail.com> wrote: >>>> On Mon, Aug 7, 2017 at 4:27 AM, Vivek Gautam >>>> <vivek.gautam@codeaurora.org> wrote: >>>>> On Thu, Jul 13, 2017 at 5:20 PM, Rob Clark<robdclark@gmail.com> wrote: >>>>>> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan R<sricharan@codeaurora.org> wrote: >>>>>>> Hi Vivek, >>>>>>> >>>>>>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >>>>>>>> Hi Stephen, >>>>>>>> >>>>>>>> >>>>>>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>>>>>>>> On 07/06, Vivek Gautam wrote: >>>>>>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, >>>>>>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, >>>>>>>>>> size_t size) >>>>>>>>>> { >>>>>>>>>> - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; >>>>>>>>>> + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); >>>>>>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>>>>>>>> + size_t ret; >>>>>>>>>> if (!ops) >>>>>>>>>> return 0; >>>>>>>>>> - return ops->unmap(ops, iova, size); >>>>>>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >>>>>>>>> Can these map/unmap ops be called from an atomic context? I seem >>>>>>>>> to recall that being a problem before. >>>>>>>> That's something which was dropped in the following patch merged in master: >>>>>>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >>>>>>>> >>>>>>>> Looks like we don't need locks here anymore? >>>>>>> Apart from the locking, wonder why a explicit pm_runtime is needed >>>>>>> from unmap. Somehow looks like some path in the master using that >>>>>>> should have enabled the pm ? >>>>>>> >>>>>> Yes, there are a bunch of scenarios where unmap can happen with >>>>>> disabled master (but not in atomic context). 
>>>>> I would like to understand whether there is a situation where an unmap is >>>>> called in atomic context without an enabled master? >>>>> >>>>> Let's say we have the case where all the unmap calls in atomic context happen >>>>> only from the master's context (in which case the device link should >>>>> take care of >>>>> the pm state of smmu), and the only unmap that happen in non-atomic context >>>>> is the one with master disabled. In such a case doesn it make sense to >>>>> distinguish >>>>> the atomic/non-atomic context and add pm_runtime_get_sync()/put_sync() only >>>>> for the non-atomic context since that would be the one with master disabled. >>>>> >>>> At least drm/msm needs to hold obj->lock (a mutex) in unmap, so it >>>> won't unmap anything in atomic ctx (but it can unmap w/ master >>>> disabled). I can't really comment about other non-gpu drivers. It >>>> seems like a reasonable constraint that either master is enabled or >>>> not in atomic ctx. >>>> >>>> Currently we actually wrap unmap w/ pm_runtime_get/put_sync(), but I'd >>>> like to drop that to avoid powering up the gpu. >>> Since the deferring the TLB maintenance doesn't look like the best approach [1], >>> how about if we try to power-up only the smmu from different client >>> devices such as, >>> GPU in the unmap path. Then we won't need to add pm_runtime_get/put() calls in >>> arm_smmu_unmap(). >>> >>> The client device can use something like - pm_runtime_get_supplier() since >>> we already have the device link in place with this patch series. This should >>> power-on the supplier (which is smmu) without turning on the consumer >>> (such as GPU). >>> >>> pm_runtime_get_supplier() however is not exported at this moment. >>> Will it be useful to export this API and use it in the drivers. >>> >> I'm not sure pm_runtime_get_supplier() is correct either. That >> feels like we're relying on the GPU driver knowing the internal >> details of how the device links are configured. 
>> > what does pm_runtime_get_supplier() do if IOMMU driver hasn't setup > device-link? It will be a no-op. > If it is a no-op, then I guess the GPU driver calling > pm_runtime_get_supplier() seems reasonable, and less annoying than > having special cases in pm_resume path.. I don't feel too bad about > having "just in case" get/put_supplier() calls in the unmap path. > > Also, presumably we still want to avoid powering up GPU even if we > short circuit the firmware loading and rest of "booting up the GPU".. > since presumably the GPU draws somewhat more power than the IOMMU.. > having the pm_resume/suspend path know about the diff between waking > up / suspending the iommu and itself doesn't really feel less-bad than > just doing "just in case" get/put_supplier() calls. If it sounds okay, then I can send a patch that exports the pm_runtime_get/put_suppliers() APIs. Best regards Vivek > BR, > -R > >> Is there some way to have the GPU driver know in its runtime PM >> resume hook that it doesn't need to be powered on because it >> isn't actively drawing anything or processing commands? I'm >> thinking of the code calling pm_runtime_get() as proposed around >> the IOMMU unmap path in the GPU driver and then having the >> runtime PM resume hook in the GPU driver return some special >> value to indicate that it didn't really resume because it didn't >> need to and to treat the device as runtime suspended but not >> return an error. Then the runtime PM core can keep track of that >> and try to power the GPU on again when another pm_runtime_get() >> is called on the GPU device. >> >> This keeps the consumer API the same, always pm_runtime_get(), >> but leaves the device driver logic of what to do when the GPU >> doesn't need to power on to the runtime PM hook where the driver >> has all the information. >> >> -- >> Qualcomm Innovation Center, Inc. 
is a member of Code Aurora Forum, >> a Linux Foundation Collaborative Project > -- > To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Nov 28, 2017 at 8:43 AM, Vivek Gautam <vivek.gautam@codeaurora.org> wrote: > > > On 11/28/2017 05:13 AM, Rob Clark wrote: >> >> On Mon, Nov 27, 2017 at 5:22 PM, Stephen Boyd<sboyd@codeaurora.org> >> wrote: >>> >>> On 11/15, Vivek Gautam wrote: >>>> >>>> Hi, >>>> >>>> >>>> On Mon, Aug 7, 2017 at 5:59 PM, Rob Clark<robdclark@gmail.com> wrote: >>>>> >>>>> On Mon, Aug 7, 2017 at 4:27 AM, Vivek Gautam >>>>> <vivek.gautam@codeaurora.org> wrote: >>>>>> >>>>>> On Thu, Jul 13, 2017 at 5:20 PM, Rob Clark<robdclark@gmail.com> >>>>>> wrote: >>>>>>> >>>>>>> On Thu, Jul 13, 2017 at 1:35 AM, Sricharan >>>>>>> R<sricharan@codeaurora.org> wrote: >>>>>>>> >>>>>>>> Hi Vivek, >>>>>>>> >>>>>>>> On 7/13/2017 10:43 AM, Vivek Gautam wrote: >>>>>>>>> >>>>>>>>> Hi Stephen, >>>>>>>>> >>>>>>>>> >>>>>>>>> On 07/13/2017 04:24 AM, Stephen Boyd wrote: >>>>>>>>>> >>>>>>>>>> On 07/06, Vivek Gautam wrote: >>>>>>>>>>> >>>>>>>>>>> @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct >>>>>>>>>>> iommu_domain *domain, unsigned long iova, >>>>>>>>>>> static size_t arm_smmu_unmap(struct iommu_domain *domain, >>>>>>>>>>> unsigned long iova, >>>>>>>>>>> size_t size) >>>>>>>>>>> { >>>>>>>>>>> - struct io_pgtable_ops *ops = >>>>>>>>>>> to_smmu_domain(domain)->pgtbl_ops; >>>>>>>>>>> + struct arm_smmu_domain *smmu_domain = >>>>>>>>>>> to_smmu_domain(domain); >>>>>>>>>>> + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; >>>>>>>>>>> + size_t ret; >>>>>>>>>>> if (!ops) >>>>>>>>>>> return 0; >>>>>>>>>>> - return ops->unmap(ops, iova, size); >>>>>>>>>>> + pm_runtime_get_sync(smmu_domain->smmu->dev); >>>>>>>>>> >>>>>>>>>> Can these map/unmap ops be called from an atomic context? I seem >>>>>>>>>> to recall that being a problem before. >>>>>>>>> >>>>>>>>> That's something which was dropped in the following patch merged in >>>>>>>>> master: >>>>>>>>> 523d7423e21b iommu/arm-smmu: Remove io-pgtable spinlock >>>>>>>>> >>>>>>>>> Looks like we don't need locks here anymore? 
>>>>>>>> >>>>>>>> Apart from the locking, wonder why a explicit pm_runtime is needed >>>>>>>> from unmap. Somehow looks like some path in the master using that >>>>>>>> should have enabled the pm ? >>>>>>>> >>>>>>> Yes, there are a bunch of scenarios where unmap can happen with >>>>>>> disabled master (but not in atomic context). >>>>>> >>>>>> I would like to understand whether there is a situation where an unmap >>>>>> is >>>>>> called in atomic context without an enabled master? >>>>>> >>>>>> Let's say we have the case where all the unmap calls in atomic context >>>>>> happen >>>>>> only from the master's context (in which case the device link should >>>>>> take care of >>>>>> the pm state of smmu), and the only unmap that happen in non-atomic >>>>>> context >>>>>> is the one with master disabled. In such a case doesn it make sense to >>>>>> distinguish >>>>>> the atomic/non-atomic context and add pm_runtime_get_sync()/put_sync() >>>>>> only >>>>>> for the non-atomic context since that would be the one with master >>>>>> disabled. >>>>>> >>>>> At least drm/msm needs to hold obj->lock (a mutex) in unmap, so it >>>>> won't unmap anything in atomic ctx (but it can unmap w/ master >>>>> disabled). I can't really comment about other non-gpu drivers. It >>>>> seems like a reasonable constraint that either master is enabled or >>>>> not in atomic ctx. >>>>> >>>>> Currently we actually wrap unmap w/ pm_runtime_get/put_sync(), but I'd >>>>> like to drop that to avoid powering up the gpu. >>>> >>>> Since the deferring the TLB maintenance doesn't look like the best >>>> approach [1], >>>> how about if we try to power-up only the smmu from different client >>>> devices such as, >>>> GPU in the unmap path. Then we won't need to add pm_runtime_get/put() >>>> calls in >>>> arm_smmu_unmap(). >>>> >>>> The client device can use something like - pm_runtime_get_supplier() >>>> since >>>> we already have the device link in place with this patch series. 
This >>>> should >>>> power-on the supplier (which is smmu) without turning on the consumer >>>> (such as GPU). >>>> >>>> pm_runtime_get_supplier() however is not exported at this moment. >>>> Will it be useful to export this API and use it in the drivers. >>>> >>> I'm not sure pm_runtime_get_supplier() is correct either. That >>> feels like we're relying on the GPU driver knowing the internal >>> details of how the device links are configured. >>> >> what does pm_runtime_get_supplier() do if IOMMU driver hasn't setup >> device-link? > > > It will be a no-op. > >> If it is a no-op, then I guess the GPU driver calling >> pm_runtime_get_supplier() seems reasonable, and less annoying than >> having special cases in pm_resume path.. I don't feel too bad about >> having "just in case" get/put_supplier() calls in the unmap path. >> >> Also, presumably we still want to avoid powering up GPU even if we >> short circuit the firmware loading and rest of "booting up the GPU".. >> since presumably the GPU draws somewhat more power than the IOMMU.. >> having the pm_resume/suspend path know about the diff between waking >> up / suspending the iommu and itself doesn't really feel less-bad than >> just doing "just in case" get/put_supplier() calls. > > > If it sounds okay, then i can send a patch that exports the > pm_runtime_get/put_suppliers() APIs. > sounds good to me BR, -R > > Best regards > Vivek > >> BR, >> -R >> >>> Is there some way to have the GPU driver know in its runtime PM >>> resume hook that it doesn't need to be powered on because it >>> isn't actively drawing anything or processing commands? I'm >>> thinking of the code calling pm_runtime_get() as proposed around >>> the IOMMU unmap path in the GPU driver and then having the >>> runtime PM resume hook in the GPU driver return some special >>> value to indicate that it didn't really resume because it didn't >>> need to and to treat the device as runtime suspended but not >>> return an error. 
Then the runtime PM core can keep track of that >>> and try to power the GPU on again when another pm_runtime_get() >>> is called on the GPU device. >>> >>> This keeps the consumer API the same, always pm_runtime_get(), >>> but leaves the device driver logic of what to do when the GPU >>> doesn't need to power on to the runtime PM hook where the driver >>> has all the information. >>> >>> -- >>> Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, >>> a Linux Foundation Collaborative Project >> >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-arm-msm" >> in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html > > > -- > The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, > > a Linux Foundation Collaborative Project >
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c index bfe613f8939c..ddbfa8ab69e6 100644 --- a/drivers/iommu/arm-smmu.c +++ b/drivers/iommu/arm-smmu.c @@ -897,11 +897,15 @@ static void arm_smmu_destroy_domain_context(struct iommu_domain *domain) struct arm_smmu_device *smmu = smmu_domain->smmu; struct arm_smmu_cfg *cfg = &smmu_domain->cfg; void __iomem *cb_base; - int irq; + int ret, irq; if (!smmu || domain->type == IOMMU_DOMAIN_IDENTITY) return; + ret = pm_runtime_get_sync(smmu->dev); + if (ret) + return; + /* * Disable the context bank and free the page tables before freeing * it. @@ -916,6 +920,8 @@ static void arm_smmu_destroy_domain_context(struct iommu_domain *domain) free_io_pgtable_ops(smmu_domain->pgtbl_ops); __arm_smmu_free_bitmap(smmu->context_map, cfg->cbndx); + + pm_runtime_put_sync(smmu->dev); } static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) @@ -1231,12 +1237,18 @@ static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova, static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) { - struct io_pgtable_ops *ops = to_smmu_domain(domain)->pgtbl_ops; + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; + size_t ret; if (!ops) return 0; - return ops->unmap(ops, iova, size); + pm_runtime_get_sync(smmu_domain->smmu->dev); + ret = ops->unmap(ops, iova, size); + pm_runtime_put_sync(smmu_domain->smmu->dev); + + return ret; } static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain, @@ -1377,12 +1389,20 @@ static int arm_smmu_add_device(struct device *dev) while (i--) cfg->smendx[i] = INVALID_SMENDX; - ret = arm_smmu_master_alloc_smes(dev); + ret = pm_runtime_get_sync(smmu->dev); if (ret) goto out_cfg_free; + ret = arm_smmu_master_alloc_smes(dev); + if (ret) { + pm_runtime_put_sync(smmu->dev); + goto out_cfg_free; + } + iommu_device_link(&smmu->iommu, dev); + pm_runtime_put_sync(smmu->dev); + return 
0; out_cfg_free: @@ -1397,7 +1417,7 @@ static void arm_smmu_remove_device(struct device *dev) struct iommu_fwspec *fwspec = dev->iommu_fwspec; struct arm_smmu_master_cfg *cfg; struct arm_smmu_device *smmu; - + int ret; if (!fwspec || fwspec->ops != &arm_smmu_ops) return; @@ -1405,8 +1425,21 @@ static void arm_smmu_remove_device(struct device *dev) cfg = fwspec->iommu_priv; smmu = cfg->smmu; + /* + * The device link between the master device and + * smmu is already purged at this point. + * So enable the power to smmu explicitly. + */ + + ret = pm_runtime_get_sync(smmu->dev); + if (ret) + return; + iommu_device_unlink(&smmu->iommu, dev); arm_smmu_master_free_smes(fwspec); + + pm_runtime_put_sync(smmu->dev); + iommu_group_remove_device(dev); kfree(fwspec->iommu_priv); iommu_fwspec_free(dev); @@ -2103,6 +2136,13 @@ static int arm_smmu_device_probe(struct platform_device *pdev) if (err) return err; + platform_set_drvdata(pdev, smmu); + pm_runtime_enable(dev); + + err = pm_runtime_get_sync(dev); + if (err) + return err; + err = arm_smmu_device_cfg_probe(smmu); if (err) return err; @@ -2144,9 +2184,9 @@ static int arm_smmu_device_probe(struct platform_device *pdev) return err; } - platform_set_drvdata(pdev, smmu); arm_smmu_device_reset(smmu); arm_smmu_test_smr_masks(smmu); + pm_runtime_put_sync(dev); /* * For ACPI and generic DT bindings, an SMMU will be probed before @@ -2185,6 +2225,8 @@ static int arm_smmu_device_remove(struct platform_device *pdev) /* Turn the thing off */ writel(sCR0_CLIENTPD, ARM_SMMU_GR0_NS(smmu) + ARM_SMMU_GR0_sCR0); + pm_runtime_force_suspend(smmu->dev); + return 0; }